Chapter Six: Deep Learning for Text and Sequences
Deep learning models do not accept raw text as input; they only process numeric tensors. Text vectorization is the process of converting text into numeric tensors. Text can first be broken into tokens (words, characters, or n-grams); breaking text into tokens is called tokenization, and the tokens are then converted into tensors.
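A minimal sketch of this tokenize-then-index flow using the Keras Tokenizer (the sample sentence and the vocabulary size of 1,000 are arbitrary choices for illustration):

from keras.preprocessing.text import Tokenizer

samples = ['The cat sat on the mat.']

tokenizer = Tokenizer(num_words=1000)   # keep only the 1,000 most common words
tokenizer.fit_on_texts(samples)         # split the text into word tokens and build the index
sequences = tokenizer.texts_to_sequences(samples)

print(sequences)              # [[1, 2, 3, 4, 1, 5]] -- each word replaced by its integer index
print(tokenizer.word_index)   # the word-to-index dictionary that was learned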
One-hot encoding associates each word with a unique integer index and then converts that integer index i into a binary vector of length N (the vocabulary size) that is all zeros except for the i-th entry, which is 1. One-hot encoding is the most basic and most common way to turn tokens into vectors; the resulting vectors are binary, sparse, and very high-dimensional.
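A minimal sketch of word-level one-hot encoding (the sample sentences, the reserved index 0, and the max_length of 10 are assumptions made for this example):

import numpy as np

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Build a word-to-index mapping; index 0 is left unused
token_index = {}
for sample in samples:
    for word in sample.split():
        if word not in token_index:
            token_index[word] = len(token_index) + 1

# Each word becomes a binary vector whose length is the vocabulary size + 1
max_length = 10
results = np.zeros((len(samples), max_length, max(token_index.values()) + 1))
for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        results[i, j, token_index[word]] = 1.

The resulting tensor is mostly zeros, which illustrates why one-hot vectors are sparse and high-dimensional compared with word embeddings.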
Word vector (word embedding): a low-dimensional, dense floating-point vector learned from data.
Learning method 1: learn the word embeddings jointly with the main task. The word vectors start out random and are learned in the same way as the other weights of the network.
Learning method 2: use pretrained word embeddings, i.e. load word embeddings that were precomputed on a different task into the model.
Learning word embeddings with the Embedding layer: a new embedding space is learned for each new task, and learning it amounts to learning the weights of one layer, the Embedding layer. The Embedding layer is best understood as a dictionary that maps integer indices (which stand for specific words) to dense vectors. It takes as input a 2D integer tensor of shape (samples, sequence_length) and returns a 3D floating-point tensor of shape (samples, sequence_length, embedding_dimensionality).
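A minimal standalone sketch of these input/output shapes (the vocabulary size of 1,000, embedding dimension of 64, sequence length of 10, and the random input are arbitrary choices for illustration):

from keras.models import Sequential
from keras.layers import Embedding
import numpy as np

model = Sequential()
# A "dictionary" of 1,000 possible tokens, each mapped to a 64-dimensional vector
model.add(Embedding(1000, 64, input_length=10))

# Input: 2D integer tensor of shape (samples, sequence_length)
x = np.random.randint(0, 1000, size=(32, 10))

# Output: 3D float tensor of shape (samples, sequence_length, embedding_dimensionality)
print(model.predict(x).shape)   # (32, 10, 64)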
Pretrained word embeddings: the principle is the same as using a pretrained convolutional network in image classification. The embeddings (such as GloVe) are precomputed from word-occurrence statistics and then loaded into the model.
Processing the raw text of the IMDB data:

import os

imdb_dir = 'D:\\jupyter_code\\GloVe\\aclImdb'
train_dir = os.path.join(imdb_dir, 'train')

# Read the reviews and their labels (0 = negative, 1 = positive)
labels = []
texts = []
for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname), encoding='UTF-8')
            texts.append(f.read())
            f.close()
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)

# Tokenize the text of the raw IMDB data
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

maxlen = 100                 # cut reviews off after 100 words
training_samples = 200       # train on 200 samples
validation_samples = 10000   # validate on 10,000 samples
max_words = 10000            # consider only the top 10,000 most common words

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)   # build the vocabulary from the list of texts
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=maxlen)
labels = np.asarray(labels)
print(data.shape)
print(labels.shape)

# Shuffle the data before splitting it into training and validation sets
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples:training_samples + validation_samples]
y_val = labels[training_samples:training_samples + validation_samples]

# Parse the GloVe word-embedding file
glove_dir = 'D:\\jupyter_code\\GloVe\\glove.6B'
embeddings_index = {}
f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='UTF-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))

# Prepare the GloVe word-embedding matrix
embedding_dim = 100
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

# Define the model
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

# Load the pretrained GloVe embeddings into the Embedding layer and freeze it
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False

# Train and evaluate the model
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(x_train, y_train, epochs=10, batch_size=32,
                    validation_data=(x_val, y_val))
model.save_weights('pre_trained_glove_model.h5')

# Train the same model without pretrained word embeddings
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(x_train, y_train, epochs=10, batch_size=32,
                    validation_data=(x_val, y_val))

# Tokenize the data of the test set
test_dir = os.path.join(imdb_dir, 'test')
labels = []
texts = []
for label_type in ['neg', 'pos']:
    dir_name = os.path.join(test_dir, label_type)
    for fname in sorted(os.listdir(dir_name)):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname), encoding='UTF-8')
            texts.append(f.read())
            f.close()
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)

sequences = tokenizer.texts_to_sequences(texts)
x_test = pad_sequences(sequences, maxlen=maxlen)
y_test = np.asarray(labels)

# Evaluate the model with pretrained embeddings on the test set
model.load_weights('pre_trained_glove_model.h5')
model.evaluate(x_test, y_test)