NLP deep learning code summary -- PyTorch

Posted by project-nz on Mon, 20 Dec 2021 13:34:14 +0100

Code summary

Text cleaning

Remove URL

import re

def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'', text)

Remove emoji

def remove_emoji(text):
    emoji_pattern = re.compile(
        '['
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
        ']+',
        flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

Remove HTML tags

def remove_html(text):
    html = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return re.sub(html, '', text)

Remove punctuation

import string

def remove_punct(text):
    # Strip all punctuation characters
    table = str.maketrans('', '', string.punctuation)
    return text.translate(table)

# Alternative: keep . ! ? as separate tokens and replace every other non-letter character with a space
def remove_punct(s):
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = re.sub(r"\s+", r" ", s).strip()
    return s

Character encoding conversion

import unicodedata

def unicodeToAscii(s):
    # Drop combining marks (Unicode category 'Mn') after NFD normalization
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

Remove low frequency words
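
No snippet accompanies this step; a minimal sketch, assuming a column of token lists such as the train['temp_list'] used in the stop-word step below, and a hypothetical min_freq threshold:

from collections import Counter

# Count every token in the corpus, then drop words seen fewer than min_freq times
counter = Counter()
for tokens in train['temp_list']:
    counter.update(tokens)

min_freq = 3  # assumed threshold, tune for your data
train['temp_list'] = train['temp_list'].apply(
    lambda tokens: [w for w in tokens if counter[w] >= min_freq])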

Remove stop words

from nltk.corpus import stopwords

# Build the stopword set once rather than inside the list comprehension
stop_words = set(stopwords.words('english'))

def remove_stopword(x):
    return [y for y in x if y not in stop_words]

train['temp_list'] = train['temp_list'].apply(lambda x: remove_stopword(x))

Text processing

def clean_text(text):
    '''Make text lowercase, remove text in square brackets, remove links, remove punctuation,
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

train['text'] = train['text'].apply(lambda x:clean_text(x))

nltk

# Corpora and lexical resources
from nltk.corpus import stopwords, wordnet
# Tokenization and lemmatization
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Frequency distributions and smoothed probabilities
from nltk.probability import FreqDist

Data analysis and visualization

Analyze the data

# Shows the number of training rows and the number of distinct values
train.describe()

# Count how many rows take each value of a variable
temp = train.groupby('sentiment').count()['text'].reset_index().sort_values(by='text', ascending=False)
# The same counts can be visualized directly with seaborn
sns.countplot(x='sentiment', data=train)

# Iterate over the rows read by pandas
for ind, row in train.iterrows():
    ...

# Put the top words into a DataFrame for inspection
temp = pd.DataFrame(top.most_common(20))
temp.columns = ['Common_words', 'count']
temp.style.background_gradient(cmap='Blues')

Word frequency statistics of the dataset

from collections import Counter
import plotly.express as px

# Count word frequencies over the token lists (Counter keys keep insertion order)
counter = Counter()
for tokens in train['temp_list']:
    counter.update(tokens)
most = counter.most_common()

fig = px.bar(temp, x="count", y="Common_words", title='Common Words in Selected Text',
             orientation='h', width=700, height=700, color='Common_words')

Relationship between two variables

iris.plot(kind="scatter", x="SepalLengthCm", y="SepalWidthCm")

sns.jointplot(x="SepalLengthCm", y="SepalWidthCm", data=iris, height=5)

sns.boxplot(x="Species", y="PetalLengthCm", data=iris)

Pairwise relationships between variables

sns.pairplot(iris.drop("Id", axis=1), hue="Species", height=3)

Process input data

Remove missing values

train.dropna(inplace=True)

Partition dataset

from torch.utils.data import TensorDataset, random_split

# e.g. train_size = int(0.9 * len(dataset)); val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

Tokenization and id conversion

bert

tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)

# WordPiece tokenization
tokenizer.tokenize(combined[0])
# Convert tokens to ids (does not add [CLS], [SEP], etc.)
tokenizer.convert_tokens_to_ids(tokenizer.tokenize(combined[0]))
# Encode directly (tokenize, convert to ids, add special tokens)
tokenizer.encode(combined[0], max_length=512)

encoded_dict = tokenizer.encode_plus(
    text,                          # Sentence to encode.
    add_special_tokens = True,     # Add '[CLS]' and '[SEP]'
    truncation='longest_first',    # Activate and control truncation
    max_length = 84,               # Max length according to our text data.
    pad_to_max_length = True,      # Pad & truncate all sentences.
    return_attention_mask = True,  # Construct attention masks.
    return_tensors = 'pt',         # Return PyTorch tensors.
)
input_ids.append(encoded_dict['input_ids'])
attention_masks.append(encoded_dict['attention_mask'])

Create tokenizer manually

# Padding and truncation helper
import numpy as np

def pad_and_truncate(sequence, maxlen, dtype='int64', padding='post', truncating='post', value=0):
    x = (np.ones(maxlen) * value).astype(dtype)
    if truncating == 'pre':
        trunc = sequence[-maxlen:]
    else:
        trunc = sequence[:maxlen]
    trunc = np.asarray(trunc, dtype=dtype)
    if padding == 'post':
        x[:len(trunc)] = trunc
    else:
        x[-len(trunc):] = trunc
    return x


class Tokenizer(object):
    def __init__(self, max_seq_len, lower=True):
        self.lower = lower
        self.max_seq_len = max_seq_len
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 1  # index 0 is reserved for padding

    def fit_on_text(self, text):
        if self.lower:
            text = text.lower()
        words = text.split()
        for word in words:
            if word not in self.word2idx:
                self.word2idx[word] = self.idx
                self.idx2word[self.idx] = word
                self.idx += 1

    def text_to_sequence(self, text, reverse=False, padding='post', truncating='post'):
        if self.lower:
            text = text.lower()
        words = text.split()
        unknownidx = len(self.word2idx) + 1  # out-of-vocabulary index
        sequence = [self.word2idx[w] if w in self.word2idx else unknownidx for w in words]
        if len(sequence) == 0:
            sequence = [0]
        if reverse:
            sequence = sequence[::-1]
        return pad_and_truncate(sequence, self.max_seq_len, padding=padding, truncating=truncating)
import pickle

# Fit the tokenizer on the corresponding text
tokenizer.fit_on_text(text)
# Save the tokenizer
pickle.dump(tokenizer, open(dat_fname, 'wb'))
# Load the tokenizer
tokenizer = pickle.load(open(dat_fname, 'rb'))

Build the embedding matrix

embedding_matrix = np.zeros((len(word2idx) + 2, embed_dim))

# Using GloVe vectors (see the _load_word_vec sketch below)
fname = './glove.twitter.27B/glove.twitter.27B.' + str(embed_dim) + 'd.txt' \
    if embed_dim != 300 else './glove.42B.300d.txt'
word_vec = _load_word_vec(fname, word2idx=word2idx, embed_dim=embed_dim)

for word, i in word2idx.items():
    vec = word_vec.get(word)
    if vec is not None:
        # Words not found in the embedding index stay all-zeros
        embedding_matrix[i] = vec
pickle.dump(embedding_matrix, open(dat_fname, 'wb'))
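
The _load_word_vec helper used above is not shown in this summary; a minimal sketch, assuming the standard GloVe text format (each line holds a token followed by its vector components):

import numpy as np

def _load_word_vec(path, word2idx=None, embed_dim=300):
    word_vec = {}
    with open(path, 'r', encoding='utf-8', errors='ignore') as fin:
        for line in fin:
            tokens = line.rstrip().split()
            # Split from the right so multi-token keys in some GloVe files survive
            word = ' '.join(tokens[:-embed_dim])
            if word2idx is None or word in word2idx:
                word_vec[word] = np.asarray(tokens[-embed_dim:], dtype='float32')
    return word_vec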

Package data

for x in train:
    temp_ids = tokenizer.encode(x, add_special_tokens=True)
    max_len = max(max_len, len(temp_ids))
    input_ids.append(temp_ids)
# Convert to input_ids and attention_masks (pad with 0 up to max_len)
input_ids = np.array([i + [0]*(max_len-len(i)) for i in input_ids])
attention_masks = np.where(input_ids != 0, 1, 0)

# TensorDataset expects tensors, so convert the numpy arrays before wrapping them
import torch
dataset = TensorDataset(torch.tensor(input_ids), torch.tensor(attention_masks), torch.tensor(labels))

Encapsulate data

A custom Dataset needs to inherit torch.utils.data.Dataset and implement two member methods:
__getitem__() returns a single sample for an index in the range 0 to len(self) - 1
__len__() returns the total length of the dataset

from torch.utils.data import Dataset, DataLoader

class MRPCDataset(Dataset):
    def __init__(self, dataset):
        self.data = dataset
    def __getitem__(self, index):
        # Any per-sample preprocessing can go here
        return self.data[index][0], self.data[index][1], self.data[index][2]
    def __len__(self):
        return len(self.data)

# Instantiate and hand over to a DataLoader
train_dataset = MRPCDataset(train_dataset)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)  # shuffle=True for random order

bert

# TensorDataset wraps tensors
train_ids = TensorDataset(a, b)
for x_train, y_label in train_ids:
    print(x_train, y_label)

# DataLoader batches and shuffles the wrapped data
train_loader = DataLoader(dataset=train_ids, batch_size=4, shuffle=True)
# enumerate returns two values: the batch index and the data (training data plus labels)
for i, data in enumerate(train_loader, 1):
    x_data, label = data

Define and load the model

bert

model = BertForSequenceClassification.from_pretrained(
    'bert-large-uncased',         # The 24-layer, 1024-hidden, 16-head, 340M-parameter BERT model with an uncased vocab
    num_labels = 2,               # The number of output labels: 2 for binary classification; increase for multi-class tasks
    output_attentions = False,    # Whether the model returns attention weights
    output_hidden_states = False, # Whether the model returns all hidden states
)

Custom model

import torch

# Define the forward pass; autograd handles back-propagation automatically
class FCModel(torch.nn.Module):  # Note: inherits from torch.nn.Module
    def __init__(self):
        super(FCModel, self).__init__()  # Initialize the parent class
        # Model layers can be defined in many ways
        self.fc = torch.nn.Linear(in_features=768, out_features=1)
    def forward(self, input):
        # Apply the model's layer(s)
        score = self.fc(input)
        result = torch.sigmoid(score)
        return result

GPU/CPU

# Pick the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Move the model to the device
model = FCModel()  # Instantiate the model
model = model.to(device)
# Move the data to the same device
input_ids = input_ids.to(device)

GPU settings

# Multi-GPU data parallelism
model = nn.DataParallel(model)

# Release cached GPU memory
torch.cuda.empty_cache()

optimizer

optimizer = AdamW(model.parameters(),
                  lr = 6e-6,  # args.learning_rate
                  eps = 1e-8  # args.adam_epsilon
                  )

# Learning rate warm-up
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,  # Default value in run_glue.py
    num_training_steps = total_steps
)

RMSProp

Idea: for parameters whose gradients oscillate strongly, slow down their updates; for parameters whose gradients oscillate little, speed their updates up.

RMSprop divides the step by a root mean square of recent gradients, which alleviates Adagrad's rapidly shrinking learning rate and works well for RNNs.

torch.optim.RMSprop(params, lr=0.01, alpha=0.99, eps=1e-08, weight_decay=0, momentum=0, centered=False)

Advantages: it alleviates the rapidly shrinking learning rate of Adagrad, and the root-mean-square denominator reduces oscillation. It suits non-stationary objectives and works well for RNNs.

Disadvantages: it still depends on a global learning rate.
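
The per-parameter update behind torch.optim.RMSprop can be sketched as follows (illustrative toy names only, not the library internals):

import numpy as np

def rmsprop_step(param, grad, sq_avg, lr=0.01, alpha=0.99, eps=1e-8):
    # Exponential moving average of squared gradients
    sq_avg = alpha * sq_avg + (1 - alpha) * grad ** 2
    # Divide by the root mean square: dimensions with large oscillation take smaller steps
    param = param - lr * grad / (np.sqrt(sq_avg) + eps)
    return param, sq_avg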

Adam

Adam combines the Momentum and RMSProp algorithms: it uses momentum to accumulate the gradient, which speeds up convergence and reduces the amplitude of oscillation, and it corrects the bias of the moment estimates (sketched after the list below).

torch.optim.Adam(params, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)

Advantages:
1. No stationarity requirement on the objective function, i.e. the loss function may change over time
2. Parameter updates are invariant to rescaling of the gradient
3. The update step size does not depend on the gradient magnitude, only on alpha, beta_1 and beta_2, which set its theoretical upper bound
4. The update step size is confined to an approximate range (the initial learning rate)
5. It handles noisy samples well and naturally anneals the step size (automatically adjusting the learning rate)
6. It suits large-scale data and parameter settings, unstable objective functions, and sparse or very noisy gradients
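
A minimal sketch of one Adam step, showing the momentum term, the RMSProp-style term, and the bias correction (illustrative toy names, not the torch.optim internals):

import numpy as np

def adam_step(param, grad, m, v, t, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
    m = beta1 * m + (1 - beta1) * grad         # first moment (momentum)
    v = beta2 * v + (1 - beta2) * grad ** 2    # second moment (RMSProp-style)
    m_hat = m / (1 - beta1 ** t)               # bias correction, t starts at 1
    v_hat = v / (1 - beta2 ** t)
    param = param - lr * m_hat / (np.sqrt(v_hat) + eps)
    return param, m, v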

Training and evaluation

for epoch_i in range(0, epochs):
    # model.train(): put layers such as batchNorm and dropout into training mode
    model.train()
    # model.eval(): tell all layers they are in eval mode, so batchNorm and dropout behave accordingly
    # model.eval()
    for step, batch in enumerate(train_dataloader):
        # Clear old gradients (model.zero_grad() is equivalent to optimizer.zero_grad() here)
        model.zero_grad()
        loss, logits = model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask,
                             labels=b_labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

Model saving and loading

general model

# Save the entire model (not recommended for large models)
torch.save(model, "./model_fc.pth")
model = torch.load("./model_fc.pth")
# Save only the parameters of each layer (recommended for large models)
torch.save(model.state_dict(), "./model_fc.pt")
model = FCModel()  # A model instance must be constructed before loading
model.load_state_dict(torch.load("./model_fc.pt"))

huggingface

# Save (the tokenizer does not need to be saved if it was not modified)
bert_model.save_pretrained('./Fine_tune_BERT/')
# Load
bert_model = TFBertModel.from_pretrained('./Fine_tune_BERT/')
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

Attention visualization

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

def showAttention(input_sentence, output_words, attentions):
    # Set up the figure with a colorbar
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.numpy(), cmap='bone')
    fig.colorbar(cax)

    # Set coordinates
    ax.set_xticklabels([''] + input_sentence.split(' ') +
                       ['<EOS>'], rotation=90)
    ax.set_yticklabels([''] + output_words)

    # Display labels at each scale
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()

def evaluateAndShowAttention(input_sentence):
    output_words, attentions = evaluate(
        encoder1, attn_decoder1, input_sentence)
    print('input =', input_sentence)
    print('output =', ' '.join(output_words))
    showAttention(input_sentence, output_words, attentions)

[Figure: attention heatmap (https://i.loli.net/2021/08/20/L1AfQu6evprKdbM.png)]
