Python Deep Learning Reading Notes (Using pre-trained word embeddings to improve IMDB review classification)

Posted by pothole_89 on Tue, 27 Aug 2019 04:18:22 +0200

Chapter 6: Deep Learning for Text and Sequences

 

Deep learning models do not accept raw text as input; they only process numeric tensors. Text vectorization is the process of converting text into numeric tensors. (Text can be broken down into tokens: words, characters, or n-grams. The process of splitting text into tokens is called tokenization, after which the tokens are converted into tensors.)
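As a small sketch of this step (toy sentences invented for illustration, using the same Keras Tokenizer as the listings further down), splitting text into word tokens and mapping each token to an integer index looks like this:

from keras.preprocessing.text import Tokenizer

#Two toy sentences, for illustration only
samples=['The cat sat on the mat.','The dog ate my homework.']

tokenizer=Tokenizer(num_words=1000)   #keep at most the 1,000 most common words
tokenizer.fit_on_texts(samples)       #build the word-to-index vocabulary
sequences=tokenizer.texts_to_sequences(samples)   #each sentence becomes a list of integer indices

print(tokenizer.word_index)   #e.g. {'the': 1, 'cat': 2, ...}
print(sequences)              #e.g. [[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 9]]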

One-hot encoding associates each word with a unique integer index i and converts that index into a binary vector of length N (the vocabulary size) that is all zeros except for a 1 at position i. One-hot encoding is the most basic and most common way to turn tokens into vectors, and it produces binary, sparse vectors of very high dimensionality.
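A minimal word-level one-hot sketch (the tiny vocabulary here is invented for illustration): each word index i becomes a length-N binary vector that is 1 at position i and 0 everywhere else.

import numpy as np

#Toy vocabulary; index 0 is reserved, so the vectors have length N = vocabulary size + 1
word_index={'the':1,'cat':2,'sat':3,'on':4,'mat':5}
N=len(word_index)+1

def one_hot(word):
    vector=np.zeros((N,))
    vector[word_index[word]]=1.0   #single 1 at the word's index
    return vector

print(one_hot('cat'))   #[0. 0. 1. 0. 0. 0.]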

 

Word embedding (word vector): a dense, low-dimensional floating-point vector learned from data.

Learning method 1: learn the word embeddings jointly with the main task, starting from randomly initialized word vectors.

Learning method 2: use pre-trained word embeddings, i.e. load embeddings that were computed in advance on a different task into the model.

Learning word embeddings with the Embedding layer: a new embedding space is learned for each new task, and the embeddings are simply the weights of a layer. The Embedding layer can be understood as a dictionary that maps integer word indices to dense vectors. It takes as input a 2D integer tensor of shape (samples, sequence_length) and returns a 3D floating-point tensor of shape (samples, sequence_length, embedding_dimensionality).
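A small shape-check sketch (toy numbers, not from the book's listings): an Embedding layer with a 1,000-token dictionary and 8-dimensional vectors, fed a batch of 32 sequences of length 10, returns a tensor of shape (32, 10, 8).

import numpy as np
from keras.models import Sequential
from keras.layers import Embedding

model=Sequential()
#Dictionary of 1,000 possible tokens, each mapped to an 8-dimensional vector
model.add(Embedding(1000,8,input_length=10))

#Fake batch of 32 sequences, each made of 10 integer word indices
dummy_input=np.random.randint(0,1000,size=(32,10))
output=model.predict(dummy_input)
#(32, 10, 8) = (samples, sequence_length, embedding_dimensionality)
print(output.shape)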

Pre-trained word embeddings: the principle is the same as using a pre-trained convolutional network in image classification. The embeddings are computed in advance on a large corpus from word-occurrence statistics (GloVe, for example, is built from global word co-occurrence counts).
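A quick way to sanity-check that pre-trained vectors capture meaning is to compare cosine similarities between GloVe vectors. The sketch below assumes the embeddings_index dictionary built by the GloVe-parsing code further down; the example words ('good', 'great', 'car') are just illustrative.

import numpy as np

def cosine_similarity(a,b):
    #Cosine of the angle between two embedding vectors: values near 1 indicate similar meaning
    return np.dot(a,b)/(np.linalg.norm(a)*np.linalg.norm(b))

#Example usage once embeddings_index has been filled by the GloVe-parsing code below:
#print(cosine_similarity(embeddings_index['good'],embeddings_index['great']))   #relatively high
#print(cosine_similarity(embeddings_index['good'],embeddings_index['car']))     #noticeably lower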

 

Processing the raw text of the IMDB data
import os
imdb_dir='D:\\jupyter_code\\GloVe\\aclImdb'
train_dir=os.path.join(imdb_dir,'train')

labels=[]
texts=[]

for label_type in ['neg','pos']:
    dir_name=os.path.join(train_dir,label_type)
    for fname in os.listdir(dir_name):
        if fname[-4:]=='.txt':
            f=open(os.path.join(dir_name,fname),encoding='UTF-8')
            texts.append(f.read())
            f.close()
            if label_type=='neg':
                labels.append(0)
            else:
                labels.append(1)
                
#Tokenize the text of the raw IMDB data
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
#Cut reviews off after 100 words
maxlen=100
#Train on only 200 samples
training_samples=200
#Validate on 10,000 samples
validation_samples=10000
#Consider only the top 10,000 most common words
max_words=10000

tokenizer=Tokenizer(num_words=max_words)
#Build the word index (vocabulary) from the list of texts
tokenizer.fit_on_texts(texts)
sequences=tokenizer.texts_to_sequences(texts)

word_index=tokenizer.word_index
print('Found %s unique tokens.'%len(word_index))

data=pad_sequences(sequences,maxlen=maxlen)

labels=np.asarray(labels)
print(data.shape)
print(labels.shape)

#Shuffle the data before splitting it, since the samples are ordered (all negative first, then all positive)
indices=np.arange(data.shape[0])
np.random.shuffle(indices)
data=data[indices]
labels=labels[indices]

x_train=data[:training_samples]
y_train=labels[:training_samples]
x_val=data[training_samples:training_samples+validation_samples]
y_val=labels[training_samples:training_samples+validation_samples]

#Parse the GloVe word-embedding file
glove_dir='D:\\jupyter_code\\GloVe\\glove.6B'

embeddings_index={}
f=open(os.path.join(glove_dir,'glove.6B.100d.txt'),encoding='UTF-8')
for line in f:
    values=line.split()
    word=values[0]
    coefs=np.asarray(values[1:],dtype='float32')
    embeddings_index[word]=coefs
f.close()

print('Found %s word vectors'%len(embeddings_index))

#Prepare the GloVe word-embedding matrix
embedding_dim=100
#Words not found in the GloVe index will be all zeros in this matrix
embedding_matrix=np.zeros((max_words,embedding_dim))
for word, i in word_index.items():
    if i<max_words:
        embedding_vector=embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i]=embedding_vector

#Define Model
from keras.models import Sequential
from keras.layers import Embedding,Flatten,Dense

model=Sequential()
model.add(Embedding(max_words,embedding_dim,input_length=maxlen))
model.add(Flatten())
model.add(Dense(32,activation='relu'))
model.add(Dense(1,activation='sigmoid'))
model.summary()

#Load the pre-trained GloVe embeddings into the Embedding layer and freeze it
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable=False


#Model training and evaluation
model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['acc'])
history=model.fit(x_train,y_train,epochs=10,batch_size=32,validation_data=(x_val,y_val))
model.save_weights('pre_trained_glove_model.h5')

#Train the same model without pre-trained word embeddings
from keras.models import Sequential
from keras.layers import Embedding,Flatten,Dense

model=Sequential()
model.add(Embedding(max_words,embedding_dim,input_length=maxlen))
model.add(Flatten())
model.add(Dense(32,activation='relu'))
model.add(Dense(1,activation='sigmoid'))
model.summary()

model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['acc'])
history=model.fit(x_train,y_train,epochs=10,batch_size=32,validation_data=(x_val,y_val))

#Tokenize the data of the test set
test_dir=os.path.join(imdb_dir,'test')
labels=[]
texts=[]

for label_type in ['neg','pos']:
    dir_name=os.path.join(test_dir,label_type)
    for fname in sorted(os.listdir(dir_name)):
        if fname[-4:]=='.txt':
            f=open(os.path.join(dir_name,fname),encoding='UTF-8')
            texts.append(f.read())
            f.close()
            if label_type=='neg':
                labels.append(0)
            else:
                labels.append(1)

sequences=tokenizer.texts_to_sequences(texts)
x_test=pad_sequences(sequences,maxlen=maxlen)
y_test=np.asarray(labels)

#Load the saved weights of the GloVe-based model and evaluate it on the test set
model.load_weights('pre_trained_glove_model.h5')
model.evaluate(x_test,y_test)

 
