Enough talk, straight to the code.
Data Loading Class
from io import open
import unicodedata
import re
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

# Select the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Start-of-sentence mark
SOS_token = 0
# End-of-sentence mark
EOS_token = 1


class Lang():
    def __init__(self, name):
        self.name = name
        self.word2index = {}  # word -> index lookup dictionary
        self.index2word = {0: 'SOS', 1: 'EOS'}  # index -> word lookup dictionary
        self.n_words = 2  # next free index; 0 and 1 are already taken by SOS/EOS

    def addSentence(self, sentence):
        # Add every word of a sentence to the dictionary and give it an index.
        # Sentences are split on spaces, so each space-separated token is a word.
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        '''
        Add a word to the dictionary
        :param word: a tokenized word
        :return:
        '''
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.index2word[self.n_words] = word
            self.n_words += 1


def demo1_testlong():
    name = 'eng'
    sen = 'I am IronMan'
    dicts = Lang(name)
    dicts.addSentence(sen)
    print(dicts.word2index)
    print(dicts.index2word)
    print(dicts.n_words)


def unicodeToAscii(s):
    '''
    Convert Unicode to plain ASCII by stripping accents
    :param s:
    :return:
    '''
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )


def normalizesString(s):
    '''
    Normalize a string: remove unwanted special symbols and make the
    tokenization rules consistent (data cleaning)
    :param s:
    :return:
    '''
    # Lowercase the string, strip whitespace and accents
    s = unicodeToAscii(s.lower().strip())
    # Put a space in front of . ! ? so they become separate tokens
    s = re.sub(r"([.!?])", r" \1", s)
    # Replace anything that is not a letter or . ! ? with a space
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s


def readLangs(lang1, lang2, path):
    lines = open(path, encoding='UTF-8').read().strip().split('\n')
    list1 = [[normalizesString(s) for s in l.split('\t')] for l in lines]
    input_lang = Lang(lang1)
    output_lang = Lang(lang2)
    # print(list1)
    return input_lang, output_lang, list1


def filterPair(p, maxlenght, eng_prefixes):
    return len(p[0].split(' ')) < maxlenght and p[0].startswith(eng_prefixes) and len(p[1].split(' ')) < maxlenght


def filterPairs(pairs):
    eng_prefixes = (
        "i am ", "i m ",
        "he is", "he s ",
        "she is", "she s ",
        "you are", "you re ",
        "we are", "we re ",
        "they are", "they re "
    )
    maxlenght = 10
    return [pair for pair in pairs if filterPair(pair, maxlenght, eng_prefixes)]


def demo04_test_filterPairs(path):
    lang1 = "eng"
    lang2 = "fra"
    input_lang, output_lang, pairs_before = readLangs(lang1, lang2, path)
    print("Pairs before filtering", len(pairs_before))
    pairs_after = filterPairs(pairs_before)
    print("Pairs after filtering", len(pairs_after))
    print("Filtered pairs, top 5:", pairs_after[:5])
    print("Filtered pairs 900-905:", pairs_after[900:905])


# Text preprocessing: text --> numbers
def prepareData(lang1, lang2, path):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, path)
    print(len(pairs))
    pairs = filterPairs(pairs)
    print(len(pairs))
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
        print(f'{pair[0]}--> {pair[1]}')
    return input_lang, output_lang, pairs


# 6-2 Test the text preprocessing function: text --> numbers
def demo05_test_prepareData(path):
    input_lang, output_lang, pairs = prepareData('eng', 'fra', path)
    print("input_n_words:", input_lang.n_words)    # input_n_words: 2803
    print("output_n_words:", output_lang.n_words)  # output_n_words: 4345
    for i in range(3):
        print('Randomly chosen language pair random.choice(pairs)--->', random.choice(pairs))
        # random.choice(pairs)---> ['he s on his way .', 'il est en route .']


def tensorfromSentence(lang, sentence):
    # Look up the index of every word, append the end flag and return a column tensor
    indexs = [lang.word2index[word] for word in sentence.split(' ')]
    indexs.append(EOS_token)
    return torch.tensor(indexs, dtype=torch.long, device=device).view(-1, 1)


def tensorfromPair(input_lang, output_lang, pair):
    input_tensor = tensorfromSentence(input_lang, pair[0])
    target_tensor = tensorfromSentence(output_lang, pair[1])
    # Return the tuple (input, target)
    return input_tensor, target_tensor


if __name__ == '__main__':
    # demo1_testlong()
    # s = 'what are you doing? hhhh iam missing you!'
    # print(normalizesString(s))
    path = '../data/eng-fra.txt'
    # demo04_test_filterPairs(path)
    demo05_test_prepareData(path)
    # input_lang, output_lang, pairs = readLangs('english', 'french', '../data/eng-fra.txt')
    # print("input_lang:", input_lang)
    # print("output_lang:", output_lang)
    # print("pairs before filtering", len(pairs))
    # pairs_after = filterPairs(pairs)
    # print("pairs after filtering", len(pairs_after))
    # print("first 5 filtered pairs:", pairs_after[:5])
    # print("filtered pairs 900-905:", pairs_after[900:905])
    # print(input_lang.n_words)
    # print(output_lang.n_words)
    # tensorfromSentence(input_lang, 'asdaskj aslkjda asdlkn')
    # for i in pairs:
    #     print(tensorfromPair(input_lang, output_lang, pairs[0]))
Model Building
import random
import torch
import torch.nn as nn
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.input = input_size
        self.hidden = hidden_size
        self.embeding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input1, hidden):
        # input1 is the input tensor (one word index)
        # hidden is the initial hidden-state tensor
        # The embedding output is 2-D, but the GRU expects a 3-D tensor
        output = self.embeding(input1).view(1, 1, -1)
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHiddenTensor(self):
        # Initialize the hidden-state tensor
        return torch.zeros(1, 1, self.hidden, device=device)


def test_encoder():
    import load_data
    path = '../data/eng-fra.txt'
    input_lang, output_lang, pairs = load_data.prepareData('english', 'french', path)
    pair_tensor = load_data.tensorfromPair(input_lang, output_lang, random.choice(pairs))
    hidden_size = 25
    input_size = 20
    input = pair_tensor[0][0]
    print(pair_tensor[0].shape)
    print(input)
    # The test embedding only has input_size rows, so the word index must stay below input_size
    print(input.item())
    if input.item() >= input_size:
        input = torch.tensor([6])
    hidden = torch.zeros(1, 1, hidden_size)
    encoder = EncoderRNN(input_size, hidden_size)  # vocabulary size, hidden size
    encoder_output, hidden = encoder(input, hidden)
    print('encoder structure', encoder)
    print('encoder_output.shape---->', encoder_output.shape, encoder_output)


class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        print('embedding: ->', self.embedding)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)  # project the hidden state to the vocabulary size
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        print('output---->', output.shape)
        output = self.softmax(self.out(output[0]))  # drop the first dimension before the softmax
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)


def test_decodeRNN():
    hidden_size = 25
    output_size = 12
    input = torch.tensor([1])  # one word at a time
    # one word, 25 hidden features
    hidden = torch.zeros(1, 1, hidden_size)
    decorator = DecoderRNN(hidden_size, output_size)
    for i in range(3):
        output, hidden = decorator(input, hidden)
        print('*' * 20)
        print('decoderRnn--->', output)
        print('shape', output.shape)


class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=10):
        '''
        :param hidden_size: size of the decoder GRU input and hidden state
        :param output_size: size of the decoder output (target vocabulary)
        :param dropout_p: dropout probability
        :param max_length: maximum sentence length (number of words)
        '''
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length
        self.embedding = nn.Embedding(self.output_size, self.hidden_size)

        # Attention is computed from Q, K, V:
        # 1. Q and K produce an attention weight matrix that is then multiplied with V.
        #    Three common ways to compute the weights:
        #    - concatenate Q and K, linear layer, softmax, then multiply with V
        #    - concatenate Q and K, linear layer, tanh activation, sum, softmax, then multiply with V
        #    - dot product of Q with the transpose of K, divide by a scaling factor, softmax, then multiply with V
        #    When the weight matrix and V are both 3-D tensors whose first dimension is the batch size, use bmm.
        # 2. If step 1 used concatenation, Q has to be concatenated again with the result of step 1.
        #    With the transposed dot product or self-attention (Q = K = V) this extra concatenation is not needed,
        #    so step 2 depends on the method chosen in step 1.
        # 3. A linear layer maps the result of step 2 to the required output size, giving the final
        #    attention representation of the query, which is also the decoder output.
        # This class uses the first method from step 1.

        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        # Step 1 concatenates Q and K, so the input size is 2 * hidden_size;
        # the output size is max_length, one weight per encoder time step.
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        # Linear layer for step 3: the input is cat(cat(Q, K) * V, Q), i.e. 2 * hidden_size,
        # and the output is hidden_size, which is then fed to the GRU.
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        '''
        Attention computation, step by step
        :param input: input word tensor (index)
        :param hidden: initial hidden-state tensor
        :param encoder_outputs: output tensor of the encoder
        :return: output, hidden (hidden state of the last step), attn_weight (attention weight matrix)
        '''
        # Embed the input word and reshape from 2-D to 3-D
        embedded = self.embedding(input).view(1, 1, -1)
        # dropout to prevent overfitting
        embedded = self.dropout(embedded)

        # First attention variant: concatenate Q and K, linear layer, softmax.
        # Query is the decoder input embedding, Key is the hidden state, Value is the encoder outputs.
        attn_weight = F.softmax(
            self.attn(
                torch.cat((embedded[0], hidden[0]), 1)
            ), dim=1
        )
        # torch.cat: the first argument is the tensors, the second is the dimension to join on (0: rows, 1: columns)

        # Second half of step 1: multiply the weight matrix with V using bmm
        attn_applied = torch.bmm(attn_weight.unsqueeze(0), encoder_outputs.unsqueeze(0))
        # unsqueeze adds the batch dimension: (b, 1, m) x (b, m, h) = (b, 1, h)

        # Step 2: concatenate with Q
        output = torch.cat((embedded[0], attn_applied[0]), 1)

        # Step 3: linear layer on the concatenation, then add the batch dimension back
        output = self.attn_combine(output).unsqueeze(0)
        output = F.relu(output)

        # Feed the activated result together with hidden into the GRU
        output, hidden = self.gru(output, hidden)

        # Reduce the dimension and apply log_softmax for the final result
        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weight


def test_AttnDecoderRnn():
    hidden_size = 25
    output_size = 10
    input = torch.tensor([3])  # one word at a time
    hidden = torch.zeros(1, 1, hidden_size)
    # encoder_outputs stacks the encoder output of every time step;
    # its shape is (number of steps, hidden size)
    encoder_outputs = torch.randn(10, 25)  # the intermediate semantic tensor C
    decoder = AttnDecoderRNN(hidden_size, output_size)
    output, hidden, attnweights = decoder(input, hidden, encoder_outputs)
    print(output.shape)


if __name__ == '__main__':
    # test_AttnDecoderRnn()
    test_decodeRNN()
Model Training
import random
import seq2seq_CreateModel
import load_data
import torch
import torch.nn as nn
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, maxleght=10):
    '''
    Training function for a single sentence pair
    :param input_tensor: English input tensor
    :param target_tensor: French target tensor
    :param encoder: encoder object
    :param decoder: decoder object
    :param encoder_optimizer: optimizer of the encoder
    :param decoder_optimizer: optimizer of the decoder
    :param criterion: loss function
    :param maxleght: maximum sentence length
    :return: average loss per target word
    '''
    # Initialize the encoder hidden-state tensor
    encoder_hidden = encoder.initHiddenTensor()

    # Zero the gradients of both optimizers
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Lengths of the source and target tensors
    input_length = input_tensor.size(0)    # number of input words
    target_length = target_tensor.size(0)  # number of target words

    # Container that stores the encoder output of every time step,
    # shape: (maximum sentence length, hidden_size)
    encoder_outputs = torch.zeros(maxleght, encoder.hidden, device=device)

    loss = 0

    # Loop over the input tensor; each sentence has a different length,
    # so the loop is bounded by the length of the current input sentence
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        # Each output has shape [1, 1, hidden_size]; the container is 2-D, so drop the extra dimensions
        encoder_outputs[ei] = encoder_output[0, 0]

    # The first decoder input is the SOS token
    decoder_input = torch.tensor([[load_data.SOS_token]], device=device)
    # The decoder hidden state is initialized with the last encoder hidden state
    decoder_hidden = encoder_hidden

    # Compare a random number with teacher_forcing_ratio to decide whether to use teacher forcing
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: feed the correct target word as the next decoder input
        for di in range(target_length):
            # decoder_output: decoder output of this step
            # decoder_hidden: hidden state of the last step
            # decoder_attention: attention weight matrix
            decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            # If the first step is wrong, the following steps suffer badly,
            # so feed the correct value into the next step and let the model learn from it
            decoder_input = target_tensor[di]
    else:
        # Without teacher forcing: use the decoder's own prediction as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)  # take the best prediction
            loss += criterion(decoder_output, target_tensor[di])
            # Stop the loop as soon as the decoder predicts the end token
            if topi.squeeze().item() == load_data.EOS_token:
                break
            # Otherwise continue; detach so the next input is cut off from the computation graph
            decoder_input = topi.squeeze().detach()

    loss.backward()  # back-propagate
    encoder_optimizer.step()
    decoder_optimizer.step()

    # Return the average loss per target word
    return loss.item() / target_length


def timeSince(since):
    import time
    import math
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def trainAgent(input_lang, output_lang, pairs, encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=1e-1):
    '''
    Training driver function
    :param pairs: data set
    :param encoder: encoder object
    :param decoder: decoder object
    :param n_iters: total number of iterations
    :param print_every: log printing interval
    :param plot_every: loss-curve plotting interval
    :param learning_rate: learning rate
    :return:
    '''
    import time
    from torch import optim
    start = time.time()
    plot_losses = []       # container for the loss curve
    print_loss_total = 0   # total loss within a print interval
    plot_loss_total = 0    # total loss within a plot interval

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    # Loss function
    criterion = nn.NLLLoss()
    num = 0

    for iter in range(1, n_iters + 1):
        # Take one sentence pair from the training set for each iteration
        # pari = random.choice(pairs)
        pari, num = choise_random_pairs(num, pairs)
        training_pair = load_data.tensorfromPair(input_lang, output_lang, pari)
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        # Print a log line every print_every iterations
        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start), iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    import matplotlib.pyplot as plt
    plt.figure()
    plt.plot(plot_losses)
    plt.savefig('./s2sloss.png')
    plt.show()


def choise_random_pairs(num, pairs):
    # Currently just returns a random pair; num only counts how many pairs have been drawn
    if num == len(pairs):
        num = 0
    else:
        num += 1
    return pairs[random.randint(0, len(pairs) - 1)], num


if __name__ == '__main__':
    hidden_size = 256
    input_lang, output_lang, pairs = load_data.prepareData('english', 'french', '../data/eng-fra.txt')
    # print(len(pairs))  # number of training sentence pairs: 10599
    # print(input_lang.n_words, output_lang.n_words)  # English vocabulary 2803, French vocabulary 4345

    encoder1 = seq2seq_CreateModel.EncoderRNN(input_lang.n_words, hidden_size).to(device)
    print(encoder1)
    attn_decoder1 = seq2seq_CreateModel.AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)
    print(attn_decoder1)

    # Start training
    n_iters = 7500
    print_every = 100
    trainAgent(input_lang=input_lang, output_lang=output_lang, pairs=pairs, encoder=encoder1, decoder=attn_decoder1,
               n_iters=n_iters, print_every=print_every, learning_rate=0.01)

    torch.save(encoder1.state_dict(), './encoder4.pth')
    torch.save(attn_decoder1.state_dict(), './decoder4.pth')
Model Evaluation
import re
import random
import torch
import torch.nn
import torch.nn.functional as F
import seq2seq_CreateModel
import seq2seq_train
import load_data
from io import open
from torch import optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

maxlength = 10
# Start mark
SOS_token = 0
# End mark
EOS_token = 1


def evaluate(lang, output_lang, encoder, decoder, sentence, max_length=maxlength):
    '''
    Evaluation function
    :param lang: input-language dictionary object
    :param output_lang: output-language dictionary object
    :param encoder: encoder
    :param decoder: decoder
    :param sentence: sentence to evaluate
    :param max_length: maximum number of words in a sentence
    :return:
    '''
    with torch.no_grad():
        input_tensor = load_data.tensorfromSentence(lang, sentence)  # index tensor of the sentence
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHiddenTensor()  # initialize the hidden state
        encoder_outputs = torch.zeros(maxlength, encoder.hidden, device=device)  # cache container

        # Encoder loop
        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
            encoder_outputs[ei] = encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)
        decoder_hidden = encoder_hidden
        decoder_words = []
        decoder_attentions = torch.zeros(maxlength, maxlength)  # initialize the attention weight matrix

        # Decoder loop
        for di in range(maxlength):
            decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data  # cache the attention of this step
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoder_words.append('<Finall>')
                break
            else:
                decoder_words.append(output_lang.index2word[topi.item()])
            # Detach the prediction from the computation graph and feed it to the next step
            decoder_input = topi.squeeze().detach()

        return decoder_words, decoder_attentions[:di + 1]


def agent_evaluate(encoder, decoder, n, pairs, input_lang, output_lang):
    '''
    Test driver function
    :param encoder: encoder object
    :param decoder: decoder object
    :param n: number of tests
    :param pairs: data pairs
    :param input_lang: encoder dictionary object
    :param output_lang: decoder dictionary object
    :return:
    '''
    for i in range(n):
        pair = random.choice(pairs)
        print('input========> ', pair[0])
        print('label----> ', pair[1])
        output_words, attentions = evaluate(input_lang, output_lang, encoder, decoder, pair[0], maxlength)
        output_sentence = ' '.join(output_words)
        print('predicted value: ', output_sentence)
        print('')


def agent_evaluate_onesentence(encoder, decoder, n, sentence, input_lang, output_lang):
    '''
    Test driver function for a custom English sentence
    :param encoder: encoder object
    :param decoder: decoder object
    :param n: number of tests
    :param sentence: custom English sentence
    :param input_lang: encoder dictionary object
    :param output_lang: decoder dictionary object
    :return:
    '''
    print('input========> ', sentence)
    output_words, attentions = evaluate(input_lang, output_lang, encoder, decoder, sentence, maxlength)
    output_sentence = ' '.join(output_words)
    print('predicted value: ', output_sentence)
    print('')


if __name__ == '__main__':
    hidden = 256
    print("Please wait a minute, it's loading the data and the model......")
    input_lang, output_lang, pairs = load_data.prepareData('english', 'french', '../data/eng-fra.txt')
    print(input_lang.n_words, output_lang.n_words)

    path = '/tmp/pycharm_project_270/Pytorch/day_04_Project/transformer/encoder4.pth'
    path2 = '/tmp/pycharm_project_270/Pytorch/day_04_Project/transformer/decoder4.pth'
    encoder = seq2seq_CreateModel.EncoderRNN(input_lang.n_words, hidden).to(device)
    encoder.load_state_dict(torch.load(path))
    decoder = seq2seq_CreateModel.AttnDecoderRNN(hidden, output_lang.n_words, dropout_p=0.1).to(device)
    decoder.load_state_dict(torch.load(path2))
    print('model loaded')

    # Start testing
    agent_evaluate(encoder, decoder, 500, pairs, input_lang, output_lang)

    # Custom test
    # agent_evaluate_onesentence(encoder, decoder, 1, 'we are friend', input_lang, output_lang)
    sentence = "what re you doing ?"
    output_words, attentions = evaluate(input_lang, output_lang, encoder, decoder, sentence=sentence)
    print(output_words, len(output_words))
    print(attentions.shape)

    import matplotlib.pyplot as plt
    plt.matshow(attentions.numpy())
    plt.show()
First of all, the code above is for English-to-French translation. The process was quite painful, so let's start by looking at the data.
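For reference, here is a minimal sketch for peeking at the raw file, assuming the tab-separated english<TAB>french layout that readLangs() above expects:

from io import open

# Print the first few raw lines of the corpus (same path as in the scripts above)
with open('../data/eng-fra.txt', encoding='UTF-8') as f:
    for _ in range(3):
        parts = f.readline().strip().split('\t')  # one pair per line, separated by a tab
        print(parts[0], '-->', parts[1])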
This is the A/B control experiment: the result of training on roughly 10,000 English-French pairs without data filtering. A bit of foreshadowing here: there was a bug in every one of these models, and for four days I could not figure out why the loss wouldn't go down.
This is 7,500 training iterations, with 2,803 words in the English dictionary... nothing much to say.
With randomly sampled training pairs, the loss just doesn't go down.
So maybe it's a learning-rate problem: change 0.01 to 1e-1.
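For context, the learning rate only enters through the two SGD optimizers that trainAgent() builds, so the experiment is a one-argument change inside the training script's __main__ block. A sketch reusing the names defined there:

# Hypothetical re-run inside the training script's __main__ block:
# trainAgent() passes learning_rate straight to optim.SGD for both models.
trainAgent(input_lang=input_lang, output_lang=output_lang, pairs=pairs,
           encoder=encoder1, decoder=attn_decoder1,
           n_iters=7500, print_every=100,
           learning_rate=1e-1)   # was 0.01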
This one ran with lr = 1e-2 for 7,500 iterations, and it behaved itself.
What about 15,000 training iterations?
OK, the loss won't drop any further; it's stuck at around 3.
So is it teacher_forcing? Keep the other parameters unchanged
and change that parameter from 0.5 to 0.75.
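teacher_forcing_ratio is the module-level constant in the training script; train() compares a fresh random number against it for every sentence, so raising it simply makes the ground-truth-feeding branch fire more often. A minimal sketch of that decision, using the same names as above:

import random

teacher_forcing_ratio = 0.75   # was 0.5

# Inside train(): decide per sentence whether to feed the ground-truth word
# (teacher forcing) or the decoder's own prediction as the next input.
use_teacher_forcing = random.random() < teacher_forcing_ratio
print(use_teacher_forcing)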
It seems to help a little; the loss looks slightly more promising.
So can increasing the number of training rounds reduce loss?
It's true the loss does come down... but lowering it by only 0.5 still isn't a usable range, so what on earth is keeping my loss from dropping?
It turns out... my decoder input had been written as a fixed word. My God.
It doesn't get more absurd than that. After four days of searching, I finally found the problem.
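The post doesn't show the buggy line itself, so the snippet below is only my guess at what it looked like, based on the description above; the "fixed" branch matches the training code, while the dummy tensors are purely illustrative:

import torch

# Hypothetical reconstruction of the bug described above -- not the original diff.
SOS_token = 0
target_tensor = torch.tensor([[5], [7], [1]])   # dummy ground-truth indices
topi = torch.tensor([[9]])                      # dummy top-1 prediction of the decoder
use_teacher_forcing, di = True, 0

# Buggy version: decoder_input is assigned once and never updated inside the
# decoding loop, so every decoder step sees the same fixed word.
decoder_input = torch.tensor([[SOS_token]])

# Fixed version (matching the training code above): the next decoder input is
# either the ground-truth word (teacher forcing) or the detached prediction.
if use_teacher_forcing:
    decoder_input = target_tensor[di]
else:
    decoder_input = topi.squeeze().detach()
print(decoder_input)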
And the ridiculous part is that, fed only a single fixed word, the decoder still managed to learn replies of at least three words.
That's how powerful the attention mechanism is. Or was it only running at half its power here?
Let's compare the attention visualization after the bug fix with the attention before the bug fix.
This is the "before" picture. Because of the bug the attention is inaccurate, but you can still see that a single word attends to three or more words, which means that before a word is translated the model relates it to at least two other words.
Now let's look at lr=0.02, teacher_forcing=0.5, 7,500 iterations after fixing the bug.
Translation Verification
And since I don't know French, I can only say that, roughly judged, the results look better than before, and that's with only 7,500 iterations.
How about 75,000 iterations?
Very low
What about the attention matrix?
Very elegant. Note that the attention images here all come from translating the same sentence.
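The evaluation script above only calls plt.matshow on the raw matrix; to get word-labelled pictures like these, a small helper along the following lines should work (a sketch: show_attention and the trimming of the matrix to the input length are my additions, not part of the original script):

import matplotlib.pyplot as plt

def show_attention(input_sentence, output_words, attentions):
    # Label the attention matrix with the source words (x axis) and the
    # predicted words (y axis). `attentions` is the (out_len, max_length)
    # tensor returned by evaluate().
    in_words = input_sentence.split(' ') + ['<EOS>']
    fig, ax = plt.subplots()
    cax = ax.matshow(attentions.numpy()[:, :len(in_words)], cmap='bone')
    fig.colorbar(cax)
    ax.set_xticks(range(len(in_words)))
    ax.set_xticklabels(in_words, rotation=90)
    ax.set_yticks(range(len(output_words)))
    ax.set_yticklabels(output_words)
    plt.show()

# e.g. show_attention(sentence, output_words, attentions)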
Here are the parameters.
Summary of Training Experience
To be continued: the next article will go into detail on the attention mechanism and self-attention, QKV and how they are used in seq2seq, understanding bmm, and so on.