Code summary
Text cleaning
Remove URL
import re

def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'', text)
Remove emoji
def remove_emoji(text):
    emoji_pattern = re.compile(
        '['
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
        ']+', flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)
Remove HTML tags
def remove_html(text):
    html = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return re.sub(html, '', text)
Remove punctuation
import string

# Option 1: strip all punctuation characters
def remove_punct(text):
    table = str.maketrans('', '', string.punctuation)
    return text.translate(table)

# Option 2: keep sentence-ending marks (. ! ?) and drop every other non-letter character
def remove_punct_keep_marks(s):
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = re.sub(r"\s+", r" ", s).strip()
    return s
Character encoding conversion
import unicodedata

def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )
Remove low frequency words
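A minimal sketch of one way to do this with collections.Counter, assuming a tokenized column like train['temp_list'] (the same column used in the stop-word snippet below); the function name remove_low_freq and the min_freq threshold are illustrative choices, not a fixed API:

from collections import Counter

def remove_low_freq(token_lists, min_freq=2):
    # Count how often each word appears across the whole corpus
    freq = Counter(w for tokens in token_lists for w in tokens)
    # Keep only the words that reach the frequency threshold
    return [[w for w in tokens if freq[w] >= min_freq] for tokens in token_lists]

# Example usage on a tokenized pandas column:
# train['temp_list'] = remove_low_freq(train['temp_list'].tolist())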
Remove stop words
from nltk.corpus import stopwords

def remove_stopword(x):
    return [y for y in x if y not in stopwords.words('english')]

train['temp_list'] = train['temp_list'].apply(lambda x: remove_stopword(x))
Text processing
def clean_text(text):
    '''Make text lowercase, remove text in square brackets, remove links,
    remove punctuation and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

train['text'] = train['text'].apply(lambda x: clean_text(x))
nltk
# Corpus and dictionary
from nltk.corpus import stopwords, wordnet
# Lemmatization and tokenization
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Frequency distributions and smoothed probabilities
from nltk.probability import FreqDist
Data analysis and visualization
Analyze the data
import pandas as pd
import seaborn as sns

# Show the number of training rows and summary statistics
train.describe()

# Count how many rows take each value of a variable
temp = train.groupby('sentiment').count()['text'].reset_index().sort_values(by='text', ascending=False)

# seaborn can also plot the counts directly
sns.countplot(x='sentiment', data=train)

# Iterate over the rows read by pandas
for ind, row in train.iterrows():
    ...

# Build a DataFrame to inspect the data
temp = pd.DataFrame(top.most_common(20))
temp.columns = ['Common_words', 'count']
temp.style.background_gradient(cmap='Blues')
Word frequency statistics of the dataset
from collections import Counter
import plotly.express as px

# Count word frequencies; most_common() returns (word, count) pairs sorted from most to least frequent
counter = Counter()
for tokens in train['temp_list']:
    counter.update(tokens)
most = counter.most_common()

fig = px.bar(temp, x="count", y="Common_words", title='Common Words in Selected Text',
             orientation='h', width=700, height=700, color='Common_words')
Relationship between two variables
iris.plot(kind="scatter", x="SepalLengthCm", y="SepalWidthCm")
sns.jointplot(x="SepalLengthCm", y="SepalWidthCm", data=iris, height=5)  # newer seaborn uses height instead of size
sns.boxplot(x="Species", y="PetalLengthCm", data=iris)
Pairwise relationships between variables
sns.pairplot(iris.drop("Id", axis=1), hue="Species", height=3)  # newer seaborn uses height instead of size
Process input data
Remove missing values
train.dropna(inplace=True)
Partition dataset
from torch.utils.data import TensorDataset, random_split

# e.g. a 90/10 train/validation split
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
Tokenization and id conversion
bert
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)

# Tokenize the text
tokenizer.tokenize(combined[0])

# Convert tokens to ids (does not add [CLS], [SEP], etc.)
tokenizer.convert_tokens_to_ids(tokenizer.tokenize(combined[0]))

# Encode (tokenize + convert to ids + add special tokens)
tokenizer.encode(combined[0], max_length=512)

encoded_dict = tokenizer.encode_plus(
    text,                         # Sentence to encode.
    add_special_tokens=True,      # Add '[CLS]' and '[SEP]'
    truncation='longest_first',   # Activate and control truncation
    max_length=84,                # Max length according to our text data.
    pad_to_max_length=True,       # Pad & truncate all sentences.
    return_attention_mask=True,   # Construct attention masks.
    return_tensors='pt',          # Return PyTorch tensors.
)
input_ids.append(encoded_dict['input_ids'])
attention_masks.append(encoded_dict['attention_mask'])
Create tokenizer manually
import numpy as np
import pickle

# Pad / truncate a sequence of ids to a fixed length
def pad_and_truncate(sequence, maxlen, dtype='int64', padding='post', truncating='post', value=0):
    x = (np.ones(maxlen) * value).astype(dtype)
    if truncating == 'pre':
        trunc = sequence[-maxlen:]
    else:
        trunc = sequence[:maxlen]
    trunc = np.asarray(trunc, dtype=dtype)
    if padding == 'post':
        x[:len(trunc)] = trunc
    else:
        x[-len(trunc):] = trunc
    return x


class Tokenizer(object):
    def __init__(self, max_seq_len, lower=True):
        self.lower = lower
        self.max_seq_len = max_seq_len
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 1

    def fit_on_text(self, text):
        if self.lower:
            text = text.lower()
        words = text.split()
        for word in words:
            if word not in self.word2idx:
                self.word2idx[word] = self.idx
                self.idx2word[self.idx] = word
                self.idx += 1

    def text_to_sequence(self, text, reverse=False, padding='post', truncating='post'):
        if self.lower:
            text = text.lower()
        words = text.split()
        unknownidx = len(self.word2idx) + 1
        sequence = [self.word2idx[w] if w in self.word2idx else unknownidx for w in words]
        if len(sequence) == 0:
            sequence = [0]
        if reverse:
            sequence = sequence[::-1]
        return pad_and_truncate(sequence, self.max_seq_len, padding=padding, truncating=truncating)


# Fit the tokenizer on the corresponding text
tokenizer.fit_on_text(text)
# Save the tokenizer
pickle.dump(tokenizer, open(dat_fname, 'wb'))
# Load the tokenizer
tokenizer = pickle.load(open(dat_fname, 'rb'))
Build the embedding matrix
embedding_matrix = np.zeros((len(word2idx) + 2, embed_dim))

# Load GloVe vectors (_load_word_vec reads the GloVe file into a {word: vector} dict)
fname = './glove.twitter.27B/glove.twitter.27B.' + str(embed_dim) + 'd.txt' \
    if embed_dim != 300 else './glove.42B.300d.txt'
word_vec = _load_word_vec(fname, word2idx=word2idx, embed_dim=embed_dim)

for word, i in word2idx.items():
    vec = word_vec.get(word)
    if vec is not None:
        # Words not found in the embedding index stay all-zeros.
        embedding_matrix[i] = vec

pickle.dump(embedding_matrix, open(dat_fname, 'wb'))
Pack the data into a TensorDataset
import torch

max_len = 0
input_ids = []
for x in train:
    temp_ids = tokenizer.encode(x, add_special_tokens=True)
    max_len = max(max_len, len(temp_ids))
    input_ids.append(temp_ids)

# Pad to max_len and build input_ids and attention_masks
input_ids = np.array([i + [0] * (max_len - len(i)) for i in input_ids])
attention_masks = np.where(input_ids != 0, 1, 0)

# TensorDataset expects torch tensors, not numpy arrays
dataset = TensorDataset(torch.tensor(input_ids), torch.tensor(attention_masks), torch.tensor(labels))
Wrap data in a custom Dataset
A custom Dataset needs to inherit torch.utils.data.Dataset and implement two member methods:
__getitem__(): defines how to fetch a single sample by index (0 to len(self) - 1)
__len__(): returns the total size of the dataset
from torch.utils.data import Dataset, DataLoader

class MRPCDataset(Dataset):
    def __init__(self, dataset):
        self.data = dataset

    def __getitem__(self, index):
        # Any per-sample processing goes here
        return self.data[index][0], self.data[index][1], self.data[index][2]

    def __len__(self):
        return len(self.data)

# Instantiate and hand over to a DataLoader
train_dataset = MRPCDataset(train_dataset)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)  # shuffle=True: random order
bert
# TensorDataset wraps tensors directly (a: features, b: labels)
train_ids = TensorDataset(a, b)
for x_train, y_label in train_ids:
    print(x_train, y_label)

# DataLoader batches the wrapped data
train_loader = DataLoader(dataset=train_ids, batch_size=4, shuffle=True)
for i, data in enumerate(train_loader, 1):
    # enumerate returns the batch index and the batch itself (training data plus labels)
    x_data, label = data
Define and load the model
bert
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained(
    'bert-large-uncased',        # 24-layer, 1024-hidden, 16-heads, 340M-parameter BERT model with an uncased vocab.
    num_labels=2,                # Number of output labels: 2 for binary classification; increase for multi-class tasks.
    output_attentions=False,     # Whether the model returns attention weights.
    output_hidden_states=False,  # Whether the model returns all hidden states.
)
Custom model
import torch

# Define the forward pass; backpropagation is handled automatically
class FCModel(torch.nn.Module):  # Note: inherits from torch.nn.Module
    def __init__(self):
        super(FCModel, self).__init__()  # init the parent class
        # The model's layers are defined here (many other layer types are possible)
        self.fc = torch.nn.Linear(in_features=768, out_features=1)

    def forward(self, input):
        # Apply the model's layers
        score = self.fc(input)
        result = torch.sigmoid(score)
        return result
GPU/CPU
# Get the device type
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model to the device
model = FCModel()  # instantiate the model
model = model.to(device)

# Move the data to the device
input_ids = input_ids.to(device)
GPU settings
import torch.nn as nn

# Run the model on multiple GPUs in parallel
model = nn.DataParallel(model)

# Release cached GPU memory
torch.cuda.empty_cache()
Optimizer
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(),
                  lr=6e-6,   # args.learning_rate
                  eps=1e-8   # args.adam_epsilon
                  )

# Learning-rate warmup
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,  # Default value in run_glue.py
                                            num_training_steps=total_steps)
RMSProp
Idea: for parameters whose gradients oscillate strongly, slow their updates down; for parameters whose gradients oscillate little, speed their updates up.
RMSprop divides the step by a root mean square of recent gradients, which alleviates Adagrad's rapidly shrinking learning rate and works well for RNNs.
torch.optim.RMSprop(params, lr=0.01, alpha=0.99, eps=1e-08, weight_decay=0, momentum=0, centered=False)
Advantages: alleviates Adagrad's rapidly shrinking learning rate, and the root-mean-square term damps the oscillation; it suits non-stationary objectives and works well for RNNs.
Disadvantages: it still depends on a global learning rate.
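A minimal NumPy sketch of the update rule described above (illustrative only, not the torch.optim internals); lr, alpha and eps mirror the torch.optim.RMSprop arguments:

import numpy as np

def rmsprop_step(param, grad, sq_avg, lr=0.01, alpha=0.99, eps=1e-8):
    # Exponential moving average of squared gradients (the root-mean-square term)
    sq_avg = alpha * sq_avg + (1 - alpha) * grad ** 2
    # Large recent gradients -> larger denominator -> smaller step, and vice versa
    param = param - lr * grad / (np.sqrt(sq_avg) + eps)
    return param, sq_avg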
Adam
Adam combines the Momentum and RMSProp algorithms: it accumulates gradients with momentum for faster convergence and smaller oscillations, and applies bias correction to the moment estimates.
torch.optim.Adam(params, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)
Advantages:
1. No stationarity requirement on the objective function, i.e. the loss function may change over time
2. Parameter updates are invariant to rescaling of the gradient
3. The update step size does not depend on the gradient magnitude, only on alpha (the learning rate), beta_1 and beta_2, which together set a theoretical upper bound on the step size (see the sketch after this list)
4. The update step size is confined to an approximate range (the initial learning rate)
5. Handles noisy samples well and naturally implements step-size annealing (automatically adjusts the learning rate)
6. Suitable for large-scale data and parameters, unstable objective functions, and sparse or very noisy gradients
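A minimal NumPy sketch of the Adam update (illustrative only, not the torch.optim internals); lr, beta1, beta2 and eps mirror the torch.optim.Adam arguments above, and t is the 1-based step count:

import numpy as np

def adam_step(param, grad, m, v, t, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
    # First moment (momentum) and second moment (RMSProp-style) estimates
    m = beta1 * m + (1 - beta1) * grad
    v = beta2 * v + (1 - beta2) * grad ** 2
    # Bias correction for the zero-initialized moment estimates
    m_hat = m / (1 - beta1 ** t)
    v_hat = v / (1 - beta2 ** t)
    # The step size is bounded roughly by lr, independent of the raw gradient scale
    param = param - lr * m_hat / (np.sqrt(v_hat) + eps)
    return param, m, v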
Training and evaluation
for epoch_i in range(0, epochs):
    # model.train() puts the network into training mode; model.eval() tells every layer
    # (e.g. BatchNorm, Dropout) to run in evaluation mode instead of training mode,
    # so use model.eval() during validation/testing
    model.train()
    for step, batch in enumerate(train_dataloader):
        # Unpack the batch (input ids, attention masks, labels) and move it to the device
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
        model.zero_grad()
        loss, logits = model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask,
                             labels=b_labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
Model saving and loading
general model
# Save the whole model (not recommended for large models)
torch.save(model, "./model_fc.pth")
model = torch.load("./model_fc.pth")

# Save only the parameters of each layer (recommended for large models)
torch.save(model.state_dict(), "./model_fc.pt")
model = FCModel()  # a model instance must be constructed before loading
model.load_state_dict(torch.load("./model_fc.pt"))
huggingface
# Save (the tokenizer does not need to be saved if it was not modified)
bert_model.save_pretrained('./Fine_tune_BERT/')

# Load
bert_model = TFBertModel.from_pretrained('./Fine_tune_BERT/')
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
Visualizing attention
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

def showAttention(input_sentence, output_words, attentions):
    # Set up the figure with a colorbar
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.numpy(), cmap='bone')
    fig.colorbar(cax)

    # Set up the axis labels
    ax.set_xticklabels([''] + input_sentence.split(' ') + ['<EOS>'], rotation=90)
    ax.set_yticklabels([''] + output_words)

    # Show a label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()


def evaluateAndShowAttention(input_sentence):
    output_words, attentions = evaluate(encoder1, attn_decoder1, input_sentence)
    print('input =', input_sentence)
    print('output =', ' '.join(output_words))
    showAttention(input_sentence, output_words, attentions)