Seq2Seq with Attention: English Translation Code Implementation + Detailed Attention Mechanism

Without further ado, here is the code.

Data Loading Class

from io import open
import unicodedata
import re
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

# Select the compute device: CUDA when it is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Vocabulary index reserved for the start-of-sentence marker
SOS_token = 0
# Vocabulary index reserved for the end-of-sentence marker
EOS_token = 1

class Lang():
    """Vocabulary of one language: maps words to integer indices and back.

    Indices 0 and 1 are reserved for the 'SOS' and 'EOS' markers, so the first
    real word that is added receives index 2.
    """

    def __init__(self, name):
        """Create an empty vocabulary.

        :param name: Label of the language (e.g. ``'eng'``).
        """
        self.name = name
        self.word2index = {}  # word -> index lookup
        self.index2word = {0: 'SOS', 1: 'EOS'}  # index -> word lookup
        self.n_words = 2  # next free index; 0 and 1 are already taken

    def addSentence(self, sentence):
        """Add every word of a sentence to the vocabulary.

        Words are obtained by splitting the sentence on single spaces.

        :param sentence: Normalized sentence whose tokens are separated by ' '.
        """
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        """Register a word in the vocabulary if it is not known yet.

        :param word: A single token produced by splitting a sentence.
        """
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.index2word[self.n_words] = word
            self.n_words += 1

def demo1_testlong():
    """Smoke-test ``Lang``: build a vocabulary from one sentence and print it."""
    name = 'eng'
    sen = 'I am IronMan'
    dicts = Lang(name)
    # Feed the sample sentence into the vocabulary; without this the demo
    # created the object but never exercised it.
    dicts.addSentence(sen)
    print(dicts.word2index)
    print(dicts.index2word)
    print(dicts.n_words)

def unicodeToAscii(s):
    """Strip combining marks (accents) from a Unicode string.

    The string is decomposed with NFD normalization and every character of
    category 'Mn' (non-spacing mark) is dropped, so that e.g. an accented
    letter is reduced to its base letter.

    :param s: Input string.
    :return: The string without combining marks.
    """
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

def normalizesString(s):
    """Normalize a raw sentence so it can be split into tokens on spaces.

    :param s: Raw sentence.
    :return: Lower-cased, accent-free sentence in which '.', '!' and '?' are
        separate tokens and every other non-letter run is a single space.
    """
    # Lower-case, trim surrounding whitespace and remove accents
    s = unicodeToAscii(s.lower().strip())
    # Put a space in front of '.', '!' and '?' so they become separate tokens
    s = re.sub(r"([.!?])", r" \1", s)
    # Replace every run of characters other than letters and . ! ? by one space
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

def readLangs(lang1, lang2, path):
    """Read a tab-separated parallel corpus and normalize every sentence.

    :param lang1: Name given to the input-language vocabulary.
    :param lang2: Name given to the output-language vocabulary.
    :param path: Path of a UTF-8 text file with one tab-separated pair per line.
    :return: ``(input_lang, output_lang, pairs)`` where both ``Lang`` objects
        are still empty and ``pairs`` is a list of lists of normalized strings.
    """
    # Use a context manager so the file handle is closed deterministically
    with open(path, encoding='UTF-8') as corpus:
        lines = corpus.read().strip().split('\n')
    list1 = [[normalizesString(s) for s in l.split('\t')] for l in lines]
    input_lang = Lang(lang1)
    output_lang = Lang(lang2)
    return input_lang, output_lang, list1

def filterPair(p, maxlenght, eng_prefixes):
    """Tell whether a sentence pair is short enough and starts with a prefix.

    :param p: Sequence whose first item is the source sentence and whose
        second item is the target sentence.
    :param maxlenght: Exclusive upper bound on the space-separated token count.
    :param eng_prefixes: Prefix (or tuple of prefixes) the source must start with.
    :return: True when both sentences are shorter than ``maxlenght`` tokens and
        the source sentence starts with one of the prefixes.
    """
    source = p[0]
    if len(source.split(' ')) >= maxlenght:
        return False
    if not source.startswith(eng_prefixes):
        return False
    # The target is only inspected once the source has passed both checks
    return len(p[1].split(' ')) < maxlenght

def filterPairs(pairs):
    """Keep only short pairs whose source sentence starts with a known prefix.

    :param pairs: Iterable of ``[source, target]`` sentence pairs.
    :return: List of the pairs accepted by ``filterPair`` with a length limit
        of 10 tokens and the prefixes listed below.
    """
    # Apostrophes were turned into spaces by normalization, hence "i m ", "he s "
    eng_prefixes = (
        "i am ", "i m ",
        "he is", "he s ",
        "she is", "she s ",
        "you are", "you re ",
        "we are", "we re ",
        "they are", "they re "
    )
    maxlenght = 10
    return [pair for pair in pairs if filterPair(pair, maxlenght, eng_prefixes)]

def demo04_test_filterPairs(path):
    """Print how many pairs survive filtering, plus two sample slices.

    :param path: Path of the tab-separated corpus passed to ``readLangs``.
    """
    _input_lang, _output_lang, raw_pairs = readLangs("eng", "fra", path)
    print("Before filtering pairs", len(raw_pairs))
    kept_pairs = filterPairs(raw_pairs)
    print("After filtering pairs", len(kept_pairs))

    samples = (
        ("Filtered pairs Top 5:", slice(None, 5)),
        ("Filtered pairs900-905:", slice(900, 905)),
    )
    for label, window in samples:
        print(label, kept_pairs[window])

# Text Processing Text-->Numerization
def prepareData(lang1, lang2, path, verbose=True):
    """Load the corpus, filter it and build both vocabularies.

    :param lang1: Name of the input language.
    :param lang2: Name of the output language.
    :param path: Path of the tab-separated corpus file.
    :param verbose: When true (the default), print every retained pair.
    :return: ``(input_lang, output_lang, pairs)`` with populated vocabularies.
    """
    input_lang, output_lang, pairs = readLangs(lang1, lang2, path)
    pairs = filterPairs(pairs)
    for pair in pairs:
        # Populate the vocabularies; without this word2index stays empty and
        # tensorfromSentence cannot look up any word.
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
        if verbose:
            print(f'{pair[0]}--> {pair[1]}')
    return input_lang, output_lang, pairs

# 6-2 Text Preprocessing Function Test: Text--->Numerization
def demo05_test_prepareData(path):
    """Print both vocabulary sizes and three randomly drawn sentence pairs.

    :param path: Path of the tab-separated corpus passed to ``prepareData``.
    """
    input_lang, output_lang, pairs = prepareData('eng', 'fra', path)
    # Sizes noted by the author for the article's data set: 2803 and 4345
    print("input_n_words:", input_lang.n_words)
    print("output_n_words:", output_lang.n_words)

    remaining = 3
    while remaining > 0:
        sample = random.choice(pairs)
        print('Randomly select a language pair random.choice(pairs)--->', sample)
        remaining -= 1
    # Example output: ['he s on his way .', 'il est en route .']

def tensorfromSentence(lang, sentence):
    """Convert a sentence into a column tensor of vocabulary indices.

    :param lang: Vocabulary object exposing ``word2index``.
    :param sentence: Normalized sentence; tokens are separated by single spaces.
    :return: ``torch.long`` tensor reshaped with ``view(-1, 1)`` holding the
        word indices followed by ``EOS_token``.
    :raises KeyError: If a token is missing from ``lang.word2index``.
    """
    token_ids = []
    for token in sentence.split(' '):
        token_ids.append(lang.word2index[token])
    # Terminate the sequence with the end-of-sentence marker
    token_ids.append(EOS_token)
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    return flat.view(-1, 1)

def tensorfromPair(input_lang, output_lang, pair):
    """Convert one sentence pair into an ``(input, target)`` tensor tuple.

    :param input_lang: Vocabulary used for ``pair[0]``.
    :param output_lang: Vocabulary used for ``pair[1]``.
    :param pair: Sequence holding the source and the target sentence.
    :return: Tuple of the two tensors built by ``tensorfromSentence``.
    """
    return (
        tensorfromSentence(input_lang, pair[0]),
        tensorfromSentence(output_lang, pair[1]),
    )

if __name__ == '__main__':
    # Manual experiments; everything except the corpus path is disabled.
    # demo1_testlong()
    # s = 'what are you doing? hhhh iam missing you!'
    # print(normalizesString(s))
    path = '../data/eng-fra.txt'
    # demo04_test_filterPairs(path)
    # input_lang, output_lang, pairs = readLangs('english', 'french', '../data/eng-fra.txt')
    # print("input_lang:", input_lang)
    # print("output_lang:", output_lang)
    # print("pairs before filtering", len(pairs))
    # pairs_after = filterPairs(pairs)
    # print("pairs after filtering", len(pairs_after))
    # print("first 5 filtered pairs:", pairs_after[:5])
    # print("filtered pairs 900-905:", pairs_after[900:905])
    # input_lang, output_lang, pairs = readLangs('english', 'french', '../data/eng-fra.txt')
    # print(input_lang.n_words)
    # print(output_lang.n_words)
    # tensorfromSentence(input_lang, 'asdaskj aslkjda asdlkn')
    # for i in pairs:
    #     print(tensorfromPair(input_lang, output_lang, i))

model building

import random

import torch
import torch.nn as nn
import torch.nn.functional as F

# Select the compute device: CUDA when it is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class EncoderRNN(nn.Module):
    """GRU encoder that consumes one token index per call."""

    def __init__(self, input_size, hidden_size):
        """Build the embedding table and the GRU.

        :param input_size: Number of rows of the embedding table.
        :param hidden_size: Embedding width, also used as GRU input/hidden size.
        """
        super().__init__()
        self.input, self.hidden = input_size, hidden_size
        self.embeding = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)
        self.gru = nn.GRU(input_size=hidden_size, hidden_size=hidden_size)

    def forward(self, input1, hidden):
        """Encode a single token.

        :param input1: Tensor holding one token index.
        :param hidden: Previous hidden state passed to the GRU.
        :return: ``(output, hidden)`` exactly as returned by the GRU.
        """
        # The GRU wants a 3-D input, so the embedding is reshaped to (1, 1, -1)
        step_input = self.embeding(input1).view(1, 1, -1)
        return self.gru(step_input, hidden)

    def initHiddenTensor(self):
        """Return an all-zero hidden state of shape (1, 1, hidden) on ``device``."""
        shape = (1, 1, self.hidden)
        return torch.zeros(shape, device=device)

def test_encoder():
    """Run one encoder step on the first token of a random training pair."""
    import load_data

    path = '../data/eng-fra.txt'
    input_lang, output_lang, pairs = load_data.prepareData('english', 'frensh', path)
    # tensorfromPair needs both vocabularies in addition to the pair itself
    pair_tensor = load_data.tensorfromPair(input_lang, output_lang, random.choice(pairs))
    hidden_size = 25
    input_size = 20
    # First token of the input sentence, moved to this module's device
    word = pair_tensor[0][0].to(device)
    # The demo embedding only has input_size rows, so any index outside of it
    # is replaced by a fixed in-range index to avoid an out-of-range lookup.
    if word.item() >= input_size:
        word = torch.tensor([6], device=device)
    hidden = torch.zeros(1, 1, hidden_size, device=device)
    encoder = EncoderRNN(input_size, hidden_size).to(device)
    encoder_output, hidden = encoder(word, hidden)
    print('encoder structure', encoder)
    print('encoderoutput.shape---->', encoder_output.shape, encoder_output)

class DecoderRNN(nn.Module):
    """Plain GRU decoder (no attention) that emits log-probabilities."""

    def __init__(self, hidden_size, output_size):
        """Build the embedding, GRU, projection and log-softmax layers.

        :param hidden_size: Embedding width and GRU hidden size.
        :param output_size: Number of output classes (rows of the embedding).
        """
        super().__init__()
        self.hidden_size, self.output_size = hidden_size, output_size
        self.embedding = nn.Embedding(num_embeddings=output_size, embedding_dim=hidden_size)
        print('embedding: ->', self.embedding)
        self.gru = nn.GRU(input_size=hidden_size, hidden_size=hidden_size)
        # Projects the GRU output onto the output classes
        self.out = nn.Linear(in_features=hidden_size, out_features=output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Decode a single token.

        :param input: Tensor holding one token index.
        :param hidden: Previous hidden state passed to the GRU.
        :return: ``(log_probs, hidden)``; ``log_probs`` is the log-softmax of
            the projected first GRU output step.
        """
        activated = F.relu(self.embedding(input).view(1, 1, -1))
        gru_out, hidden = self.gru(activated, hidden)
        print('output---->', gru_out.shape)
        # Drop the leading dimension before projecting
        log_probs = self.softmax(self.out(gru_out[0]))
        return log_probs, hidden

    def initHidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size) on ``device``."""
        shape = (1, 1, self.hidden_size)
        return torch.zeros(shape, device=device)

def test_decodeRNN():
    """Feed the same token to ``DecoderRNN`` three times and print the outputs."""
    hidden_size, output_size = 25, 12
    token = torch.tensor([1])  # a single word index
    # One step, one sequence, hidden_size features
    state = torch.zeros(1, 1, hidden_size)
    decoder = DecoderRNN(hidden_size, output_size)
    steps_left = 3
    while steps_left:
        output, state = decoder(token, state)
        print('*' * 20)
        print('decoderRnn--->', output)
        print('shape', output.shape)
        steps_left -= 1

class AttnDecoderRNN(nn.Module):
    """GRU decoder with attention over the encoder outputs.

    Attention is computed in three steps:

    1. The embedded input token (query) and the previous hidden state (key)
       are concatenated, linearly mapped to ``max_length`` scores and passed
       through a softmax to obtain the attention weights.
    2. The weights are applied to the encoder outputs (value) with ``bmm``
       and the result is concatenated with the embedded input again.
    3. A linear layer brings the concatenation back to ``hidden_size`` before
       it is fed to the GRU.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=10):
        """Build the layers of the decoder.

        :param hidden_size: Embedding width and GRU input/hidden size.
        :param output_size: Number of output classes (target vocabulary size).
        :param dropout_p: Dropout probability applied to the embedded input.
        :param max_length: Number of encoder positions the attention covers.
        """
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        # Step 1: cat(embedded, hidden) has 2 * hidden_size features and is
        # mapped to one score per encoder position.
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        # Step 3: cat(embedded, attention context) also has 2 * hidden_size
        # features and is reduced back to hidden_size for the GRU.
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Decode a single token while attending to the encoder outputs.

        :param input: Tensor holding one token index.
        :param hidden: Previous hidden state; indexed with ``[0]`` for the
            attention and passed unchanged to the GRU.
        :param encoder_outputs: Stacked encoder outputs; the callers in this
            file pass a ``(max_length, hidden_size)`` tensor.
        :return: ``(output, hidden, attn_weight)``: log-probabilities over the
            output classes, the new hidden state and the attention weights.
        """
        # The GRU wants a 3-D input, so the embedding is reshaped to (1, 1, -1)
        embedded = self.embedding(input).view(1, 1, -1)
        # Dropout on the embedding to reduce overfitting
        embedded = self.dropout(embedded)

        # Step 1: score every encoder position from cat(query, key)
        attn_weight = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)),
            dim=1
        )
        # Apply the weights to the values; both operands get a leading
        # dimension of 1 because bmm works on 3-D tensors.
        attn_applied = torch.bmm(attn_weight.unsqueeze(0), encoder_outputs.unsqueeze(0))
        # Step 2: concatenate the query with the attention context
        output = torch.cat((embedded[0], attn_applied[0]), 1)
        # Step 3: reduce to hidden_size and restore the leading dimension
        output = self.attn_combine(output).unsqueeze(0)
        output = F.relu(output)
        # The activated result goes into the GRU together with hidden
        output, hidden = self.gru(output, hidden)
        # Project the first output step and turn it into log-probabilities
        output = F.log_softmax(self.out(output[0]), dim=1)

        return output, hidden, attn_weight

def test_AttnDecoderRnn():
    """Run a single ``AttnDecoderRNN`` step on random encoder outputs."""
    hidden_size, output_size = 25, 10
    token = torch.tensor([3])  # one word index per step
    state = torch.zeros(1, 1, hidden_size)
    # Stand-in for the stacked per-step encoder outputs: 10 steps of 25 features
    encoder_outputs = torch.randn(10, 25)
    decoder = AttnDecoderRNN(hidden_size, output_size)
    output, state, attnweights = decoder(token, state, encoder_outputs)

if __name__ == '__main__':
    # Manual smoke test, disabled by default. The explicit ``pass`` keeps the
    # guard syntactically valid while the call stays commented out.
    # test_AttnDecoderRnn()
    pass

model training

import random

import seq2seq_CreateModel
import load_data
import torch
import torch.nn as nn
import torch.nn.functional as F

# Select the compute device: CUDA when it is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Probability with which a training step uses teacher forcing (see train())
teacher_forcing_ratio = 0.5

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion, maxleght=10):
    """Run one optimization step on a single sentence pair.

    Teacher forcing is used with probability ``teacher_forcing_ratio`` (a
    module-level constant).

    :param input_tensor: Source sentence tensor, one token index per row.
    :param target_tensor: Target sentence tensor, one token index per row.
    :param encoder: Encoder object.
    :param decoder: Attention decoder object.
    :param encoder_optimizer: Optimizer of the encoder parameters.
    :param decoder_optimizer: Optimizer of the decoder parameters.
    :param criterion: Loss function applied to every decoder step.
    :param maxleght: Number of rows of the encoder output buffer; the input
        sentence must not contain more tokens than this.
    :return: Loss of this pair divided by the target length.
    """
    # Initialize the encoder hidden state
    encoder_hidden = encoder.initHiddenTensor()
    # Clear the gradients accumulated by the previous step
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)  # number of input tokens
    target_length = target_tensor.size(0)  # number of target tokens

    # Buffer holding one encoder output per input position; unused rows stay 0
    encoder_outputs = torch.zeros(maxleght, encoder.hidden, device=device)
    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        # Each output is indexed with [0, 0] to store it in the 2-D buffer
        encoder_outputs[ei] = encoder_output[0, 0]

    # The decoder starts from the SOS marker and the final encoder hidden state
    decoder_input = torch.tensor([[load_data.SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    # Decide randomly whether this step uses teacher forcing
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    if use_teacher_forcing:
        # Teacher forcing: the ground-truth token is the next decoder input,
        # so an early mistake does not derail the rest of the sequence.
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]
    else:
        # No teacher forcing: the decoder's own prediction is the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)  # most likely token
            loss += criterion(decoder_output, target_tensor[di])
            # Stop as soon as the decoder predicts the end-of-sentence marker
            if topi.squeeze().item() == load_data.EOS_token:
                break
            # Detach so the next input is not part of the autograd graph
            decoder_input = topi.squeeze().detach()

    loss.backward()  # back-propagate
    # Apply the gradients to both networks
    encoder_optimizer.step()
    decoder_optimizer.step()
    # Return the average loss per target token
    return loss.item() / target_length

def timeSince(since):
    """Format the wall-clock time elapsed since ``since``.

    :param since: Start timestamp as returned by ``time.time()``.
    :return: String of the form ``'<minutes>m <seconds>s'``.
    """
    import math
    import time

    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)

def trainAgent(input_lang, output_lang, pairs, encoder, decoder, n_iters, print_every=1000, plot_every=100,
               learning_rate=0.01):
    """Train the encoder/decoder for ``n_iters`` single-pair steps.

    :param input_lang: Vocabulary of the source language.
    :param output_lang: Vocabulary of the target language.
    :param pairs: Data set of sentence pairs.
    :param encoder: Encoder object.
    :param decoder: Decoder object.
    :param n_iters: Total number of training steps.
    :param print_every: Interval (in steps) between log lines.
    :param plot_every: Interval (in steps) between points of the loss curve.
    :param learning_rate: Learning rate of both SGD optimizers.
    :return: List with the average loss of every ``plot_every`` interval.
    """
    import time
    from torch import optim
    start = time.time()
    plot_losses = []  # points of the loss curve
    print_loss_total = 0  # loss accumulated since the last log line
    plot_loss_total = 0  # loss accumulated since the last plot point

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

    criterion = nn.NLLLoss()
    num = 0
    for step in range(1, n_iters + 1):
        # Draw one random pair per step and convert it into tensors
        pari, num = choise_random_pairs(num, pairs)
        training_pair = load_data.tensorfromPair(input_lang, output_lang, pari)
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]
        loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if step % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d Percentage) %.4f' % (timeSince(start), step, step / n_iters * 100, print_loss_avg))
        if step % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    # NOTE(review): the original plotting calls are not visible in this file;
    # a plain line plot of the collected averages is assumed - confirm.
    import matplotlib.pyplot as plt
    plt.figure()
    plt.plot(plot_losses)
    plt.show()
    return plot_losses

def choise_random_pairs(num, pairs):
    """Pick a uniformly random pair and pass the counter through.

    :param num: Counter supplied by the caller.
    :param pairs: Sequence of sentence pairs to draw from.
    :return: ``(pair, num)``; ``num`` comes back as 1 when it was equal to
        ``len(pairs)`` and unchanged otherwise.
    """
    # NOTE(review): the counter only changes when it equals len(pairs), so it
    # never advances from 0 for a non-empty data set - confirm this is intended.
    counter = 1 if num == len(pairs) else num
    position = random.randrange(len(pairs))
    return pairs[position], counter

if __name__ == '__main__':
    # Train the attention model on the filtered corpus and save both networks.
    hidden_size = 256
    input_lang, output_lang, pairs = load_data.prepareData('english', 'french', '../data/eng-fra.txt')
    # Author's notes: 10599 training pairs; vocabulary sizes 2803 (English)
    # and 4345 (French).
    encoder1 = seq2seq_CreateModel.EncoderRNN(input_lang.n_words, hidden_size).to(device)
    attn_decoder1 = seq2seq_CreateModel.AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)
    # Start training
    n_iters = 7500
    print_every = 100
    trainAgent(input_lang=input_lang, output_lang=output_lang, pairs=pairs, encoder=encoder1,
               decoder=attn_decoder1,
               n_iters=n_iters, print_every=print_every, learning_rate=0.01)
    # NOTE(review): the save calls were truncated in this file; saving the
    # state dicts is assumed - confirm against the loading code.
    torch.save(encoder1.state_dict(), './encoder4.pth')
    torch.save(attn_decoder1.state_dict(), './decoder4.pth')

Model evaluation

import re
import random
import torch
import torch.nn
import torch.nn.functional as F
import seq2seq_CreateModel
import seq2seq_train
import load_data
from io import open
from torch import optim

# Select the compute device: CUDA when it is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Default sentence-length limit used by evaluate() and its callers
maxlength = 10
# Vocabulary index of the start-of-sentence marker
SOS_token = 0
# Vocabulary index of the end-of-sentence marker
EOS_token = 1

def evaluate(lang, output_lang, encoder, decoder, sentence, max_length=maxlength):
    """Translate one sentence by greedy decoding.

    :param lang: Vocabulary of the input language.
    :param output_lang: Vocabulary of the output language.
    :param encoder: Encoder.
    :param decoder: Attention decoder.
    :param sentence: Normalized sentence to translate.
    :param max_length: Maximum number of tokens to encode and to decode; it
        has to match the ``max_length`` the decoder was built with.
    :return: ``(decoder_words, decoder_attentions)``: the predicted words
        (ending with ``'<EOS>'`` when the decoder emitted the end marker) and
        the attention weights of the executed decoding steps.
    """
    with torch.no_grad():
        input_tensor = load_data.tensorfromSentence(lang, sentence)  # column of token indices
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHiddenTensor()  # initial hidden state
        # Buffer for the per-position encoder outputs; unused rows stay zero
        encoder_outputs = torch.zeros(max_length, encoder.hidden, device=device)
        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
            encoder_outputs[ei] = encoder_output[0, 0]

        # Decoding starts from the SOS marker and the final encoder state
        decoder_input = torch.tensor([[SOS_token]], device=device)
        decoder_hidden = encoder_hidden
        decoder_words = []
        decoder_attentions = torch.zeros(max_length, max_length)  # one row per step
        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data  # keep this step's weights
            topv, topi = decoder_output.data.topk(1)  # most likely token
            if topi.item() == EOS_token:
                decoder_words.append('<EOS>')
                break
            decoder_words.append(output_lang.index2word[topi.item()])

            # The prediction becomes the next decoder input
            decoder_input = topi.squeeze().detach()

        return decoder_words, decoder_attentions[:di + 1]

def agent_evaluate(encoder, decoder, n, pairs, input_lang, output_lang):
    """Translate ``n`` randomly chosen pairs and print input, label and prediction.

    :param encoder: Encoder object.
    :param decoder: Decoder object.
    :param n: Number of sentences to evaluate.
    :param pairs: Data set of sentence pairs.
    :param input_lang: Vocabulary of the input language.
    :param output_lang: Vocabulary of the output language.
    """
    for _ in range(n):
        pair = random.choice(pairs)
        print('input========>   ', pair[0])
        print('lable---->  ', pair[1])
        output_words, attentions = evaluate(input_lang, output_lang, encoder, decoder, pair[0], maxlength)
        output_sentence = ' '.join(output_words)
        print('predicted value: ', output_sentence)

def agent_evaluate_onesentence(encoder, decoder, n, sentence, input_lang, output_lang):
    """Translate one user-supplied sentence and print the prediction.

    :param encoder: Encoder object.
    :param decoder: Decoder object.
    :param n: Unused; kept so existing callers keep working.
    :param sentence: Normalized English sentence to translate.
    :param input_lang: Vocabulary of the input language.
    :param output_lang: Vocabulary of the output language.
    """
    print('input========>   ', sentence)
    output_words, attentions = evaluate(input_lang, output_lang, encoder, decoder, sentence, maxlength)
    output_sentence = ' '.join(output_words)
    print('predicted value: ', output_sentence)

if __name__ == '__main__':
    # Load the trained networks, translate random pairs and one custom sentence.
    hidden = 256
    print('Please wati a minute,it\'s load data and load model......  ')
    input_lang, output_lang, pairs = load_data.prepareData('english', 'french', '../data/eng-fra.txt')
    print(input_lang.n_words, output_lang.n_words)
    path = '/tmp/pycharm_project_270/Pytorch/day_04_Project/transformer/encoder4.pth'
    path2 = '/tmp/pycharm_project_270/Pytorch/day_04_Project/transformer/decoder4.pth'
    encoder = seq2seq_CreateModel.EncoderRNN(input_lang.n_words, hidden).to(device)
    decoder = seq2seq_CreateModel.AttnDecoderRNN(hidden, output_lang.n_words, dropout_p=0.1).to(device)
    # Without loading the checkpoints the networks would be evaluated with
    # random weights.
    # NOTE(review): assumes the files hold state dicts - confirm against the
    # training script.
    encoder.load_state_dict(torch.load(path, map_location=device))
    decoder.load_state_dict(torch.load(path2, map_location=device))
    # Inference mode: disables the decoder's dropout
    encoder.eval()
    decoder.eval()
    print('model load up ')

    # Start testing
    agent_evaluate(encoder, decoder, 500, pairs, input_lang, output_lang)
    # Custom test
    # agent_evaluate_onesentence(encoder, decoder, 1, 'we are friend', input_lang, output_lang)
    # Every word must exist in the input vocabulary, otherwise the lookup in
    # tensorfromSentence raises KeyError.
    sentence = "what re you doing ?"
    output_words, attentions = evaluate(input_lang, output_lang, encoder, decoder, sentence=sentence)
    print(output_words, len(output_words))
    # NOTE(review): the original visualization calls are not visible in this
    # file; showing the attention matrix is assumed - confirm.
    import matplotlib.pyplot as plt
    plt.matshow(attentions.numpy())
    plt.show()


First, the code above performs English translation, but getting it to work was very difficult; let's take a look at the data.
This is the control group of the A/B experiment: the result of training on 10,000 (1w) English-French pairs without data filtering. A bit of foreshadowing: there was a bug that affected all of the models, and for 4 days I could not find the reason why the loss would not go down.

This is a 7,500 round robin training, with a total of 2803 sentences on the English dataset... Nothing to say
Because the random sample gradient doesn't go down

So if it's a learning rate problem, change 0.01 to 1e-1
This guy, counted 1 e-2,7500 times he was honest

What about 15,000 training sessions?

Okay, the error doesn't go on, don't go here at 3

So is it teacher forcing? The other parameters are left unchanged;
the teacher_forcing_ratio parameter is changed from 0.5 to 0.75.

Appears to improve error correction, has a little bit of optimism about the error
So can increasing the number of training rounds reduce loss?
It's true that the loss can be lowered this way... but improving the error by only 0.5 does not bring it into a usable range, so what on earth was keeping my error from going down? The decoder input had been hard-coded to a fixed word, my God.
It's not just the most out of line. After four days of searching, I finally found the problem.
And it's ridiculous that a decoder can learn at least three words of reply by entering only one word
This is the strength of the attention mechanism. Is it really half the strength?

Let's look at atten visualization after bug resolution and attn before bug resolution

This is the attention before the fix. The bug makes the attention inaccurate, but you can still see that one word can be associated with three or more words, which indicates that the translation of a word takes at least two other words into account in addition to the word itself.
Let's look at lr=0.02 teacherfource=0.5 epoch=7500 times after fixing the bug

Translation Verification

And because you don't know French, it's not likely that Sindaya's assessment was better than it was before, but only 7,500 times

How about 75,000 times?

Very low
What about the attention matrix?

Very elegant. Note that the attention images here are all translations of the same sentence

Put parameters on

Summary of Training Experience

To be continued, the next article will detail atten's attention mechanism self-attention mechanism and QKV's application in seq2seq to understand bmm, etc.

Tags: AI Pytorch NLP ML

Posted on Wed, 27 Oct 2021 13:21:00 -0400 by TheHyipSite