Deep learning task 2: text preprocessing, language models and datasets, recurrent neural networks

1, Text preprocessing

step1 - read in text

import re
import collections

# read time_machine
def read_time_machine(path='/home/yuzhu/input/timemachine/timemachine.txt'):
    """Read a text corpus and normalise every line.

    Each line is stripped of surrounding whitespace, lower-cased, and every
    run of characters outside ``a-z`` is replaced by a single space.

    Args:
        path: Location of the text file. Defaults to the path the script
            always used, so calls without arguments behave as before.

    Returns:
        A list with one cleaned string per line of the file (blank lines
        become empty strings).
    """
    non_letters = re.compile(r'[^a-z]+')
    with open(path, 'r') as f:
        return [non_letters.sub(' ', line.strip().lower()) for line in f]

# Load the cleaned corpus and report how many lines ("sentences") it contains.
lines = read_time_machine()
print('# sentences %d' % len(lines))

step2 - tokenization

# split sentences into word or char tokens
def tokenize(sentences, token='word'):
    """Split every sentence into word-level or character-level tokens.

    Args:
        sentences: Iterable of strings.
        token: ``'word'`` splits each sentence on single spaces; ``'char'``
            splits it into individual characters.

    Returns:
        A list of token lists, one per sentence. For any other ``token``
        value an error message is printed and ``None`` is returned.
    """
    if token == 'word':
        return [sentence.split(' ') for sentence in sentences]
    if token == 'char':
        return [list(sentence) for sentence in sentences]
    # Only reached for an unsupported token type.
    print('ERROR: unknown token type ' + token)
    return None

# Word-level tokens for every line of the corpus (default token='word').
tokens = tokenize(lines)

step3 build dictionary

# build dictionary
class Vocab(object):
    """Two-way mapping between tokens and integer indices.

    Attributes set by the constructor:
        token_freqs: list of ``(token, count)`` pairs from the corpus.
        idx_to_token: list whose position ``i`` holds the token with index ``i``.
        token_to_idx: dict mapping each token to its index.
        unk: index returned for tokens that are not in the vocabulary.
        pad, bos, eos: indices of the padding / begin-of-sentence /
            end-of-sentence tokens (only when ``use_special_tokens`` is true).
    """

    def __init__(self, tokens, min_freq=0, use_special_tokens=False):
        """Build the vocabulary from tokenized sentences.

        Args:
            tokens: list of token lists (passed to ``count_corpus``).
            min_freq: tokens occurring fewer times than this are dropped.
            use_special_tokens: reserve indices 0-3 for padding, begin of
                sentence, end of sentence and unknown. Otherwise only index
                0 is reserved, for the unknown token.
        """
        counter = count_corpus(tokens)
        self.token_freqs = list(counter.items())
        if use_special_tokens:
            # padding, begin of sentence, end of sentence, unknown
            self.pad, self.bos, self.eos, self.unk = (0, 1, 2, 3)
            # Distinct names are required: identical strings would collapse
            # into a single token_to_idx entry.
            reserved = ['<pad>', '<bos>', '<eos>', '<unk>']
        else:
            self.unk = 0
            # The empty string stays as the unknown token here so that
            # existing index assignments are unchanged.
            reserved = ['']
        kept = [token for token, freq in self.token_freqs
                if freq >= min_freq and token not in reserved]
        self.idx_to_token = reserved + kept
        self.token_to_idx = {token: idx
                             for idx, token in enumerate(self.idx_to_token)}

    def __len__(self):
        """Return the number of entries, reserved tokens included."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Map a token, or a (nested) list/tuple of tokens, to indices.

        Tokens missing from the vocabulary map to ``self.unk``.
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """Map an index, or a flat list/tuple of indices, back to tokens."""
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

step4 - Convert words to indexes

def count_corpus(sentences):
    """Count how often each token occurs across all sentences.

    Args:
        sentences: Iterable of token sequences.

    Returns:
        A ``collections.Counter`` keyed by token.
    """
    frequencies = collections.Counter()
    for sentence in sentences:
        # Counter.update adds one count per element of the sentence.
        frequencies.update(sentence)
    return frequencies

# Example: build the vocabulary from the word tokens with default settings
# (min_freq=0, no special tokens).
vocab = Vocab(tokens)


# sentences 3583
[[''], ['the', 'project', 'gutenberg', 'ebook', 'of', 'the', 'time', 'machine', 'by', 'h', 'g', 'wells']]
[('', 0), ('the', 1), ('project', 2), ('gutenberg', 3), ('ebook', 4), ('of', 5), ('time', 6), ('machine', 7), ('by', 8), ('h', 9)]

Using existing tools to segment words

Two common tools are spaCy and NLTK

2, Language models and datasets

1 - language model

A natural language text can be regarded as a discrete-time sequence. Given a sequence of words $w_1, w_2, \ldots, w_T$ of length $T$, the goal of the language model is to evaluate whether the sequence is reasonable, that is, to calculate the probability of the sequence:

$$P(w_1, w_2, \ldots, w_T)$$

Suppose that each word in the sequence is generated in turn:

$$P(w_1, w_2, \ldots, w_T) = \prod_{t=1}^{T} P(w_t \mid w_1, \ldots, w_{t-1})$$


The parameters of language model are the probability of words and the conditional probability given the first few words

2 - n-grams

As the sequence length increases, the complexity of calculating and storing the probability of multiple words appearing together grows exponentially. The n-gram model simplifies the language model through the Markov assumption

The Markov assumption means that the appearance of a word depends only on the previous n words, i.e. a Markov chain of order n. If n=1, then we have

$$P(w_3 \mid w_1, w_2) = P(w_3 \mid w_2)$$

Based on a Markov chain of order n-1, we can rewrite the language model as follows:

$$P(w_1, w_2, \ldots, w_T) \approx \prod_{t=1}^{T} P(w_t \mid w_{t-(n-1)}, \ldots, w_{t-1})$$

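The n-gram model is a probabilistic language model based on a Markov chain of order n-1.
When n is equal to 1, 2 and 3, we call the models unigram, bigram and trigram respectively. For example, the probability of a sequence of length 4 under the unigram, bigram and trigram models is:

$$P(w_1, w_2, w_3, w_4) = P(w_1) P(w_2) P(w_3) P(w_4)$$

$$P(w_1, w_2, w_3, w_4) = P(w_1) P(w_2 \mid w_1) P(w_3 \mid w_2) P(w_4 \mid w_3)$$

$$P(w_1, w_2, w_3, w_4) = P(w_1) P(w_2 \mid w_1) P(w_3 \mid w_1, w_2) P(w_4 \mid w_2, w_3)$$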

3-language model data set

Read data set:

# Read the whole lyrics file into one string (the lyrics are Chinese text,
# so the encoding is stated explicitly instead of relying on the locale).
with open('/home/yuzhu/input/jayzhou_lyrics/jayzhou_lyrics.txt', encoding='utf-8') as f:
    corpus_chars = f.read()
# Replace line breaks with spaces so the corpus is one continuous stream.
corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
# Keep only the first 10000 characters.
corpus_chars = corpus_chars[: 10000]

Build character index:

# De-duplicate to get the index-to-character mapping. Sorting makes the
# assignment reproducible: the iteration order of a set of strings changes
# from one interpreter run to the next because of hash randomisation.
idx_to_char = sorted(set(corpus_chars))
# Character-to-index mapping
char_to_idx = {char: i for i, char in enumerate(idx_to_char)}
vocab_size = len(char_to_idx)
# Convert each character to its index to get a sequence of indices
corpus_indices = [char_to_idx[char] for char in corpus_chars]
# First 20 indices, kept for inspection
sample = corpus_indices[: 20]

Define the data loading function:

def load_data_jay_lyrics(path='/home/yuzhu/input/jaychou_lyrics4703/jaychou_lyrics.txt',
                         max_chars=10000):
    """Load the lyrics corpus and encode it as character indices.

    Line breaks are replaced by spaces and only the first ``max_chars``
    characters are kept.

    Args:
        path: Location of the UTF-8 lyrics file. Defaults to the path the
            function always used.
        max_chars: Number of leading characters to keep (default 10000).

    Returns:
        Tuple ``(corpus_indices, char_to_idx, idx_to_char, vocab_size)``:
        the corpus as a list of indices, the character-to-index dict, the
        index-to-character list, and the number of distinct characters.
    """
    with open(path, encoding='utf-8') as f:
        corpus_chars = f.read()
    corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
    corpus_chars = corpus_chars[:max_chars]
    # Sorted so the character indices are the same on every run; plain set
    # iteration order varies with string hash randomisation.
    idx_to_char = sorted(set(corpus_chars))
    char_to_idx = {char: i for i, char in enumerate(idx_to_char)}
    vocab_size = len(char_to_idx)
    corpus_indices = [char_to_idx[char] for char in corpus_chars]
    return corpus_indices, char_to_idx, idx_to_char, vocab_size

4 - data sampling

Sequential data sampling

In training, we need to read a mini-batch of samples and labels at random every time. A sample of time-series data usually contains consecutive characters. That is to say, if the length of the sequence is T and the number of time steps is n, then there are a total of T-n legal samples. However, these samples overlap heavily, so we usually adopt a more efficient sampling method

M1 random sampling


import torch
import random

def data_iter_random(corpus_indices, batch_size, num_steps, device=None):
    """Yield mini-batches of non-overlapping samples in random order.

    The corpus is cut into consecutive windows of ``num_steps`` indices; the
    windows are shuffled and grouped into batches, so neighbouring batches
    are generally not adjacent in the corpus.

    Args:
        corpus_indices: Sequence of integer token indices.
        batch_size: Number of windows per batch (the last batch may be
            smaller).
        num_steps: Length of each window.
        device: Torch device for the yielded tensors; defaults to CUDA when
            available, otherwise CPU.

    Yields:
        ``(x, y)`` tensors where ``y`` is ``x`` shifted one position forward
        in the corpus.
    """
    # Minus 1 because the label window starts one position after the input
    # window and must still fit inside the corpus.
    num_examples = (len(corpus_indices) - 1) // num_steps
    example_indices = [i * num_steps for i in range(num_examples)]
    # Without this shuffle the iterator would walk the corpus in order.
    random.shuffle(example_indices)

    def _data(i):
        # Window of num_steps indices starting at position i
        return corpus_indices[i: i + num_steps]

    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    for i in range(0, num_examples, batch_size):
        # Take the next batch_size shuffled window starts
        batch_indices = example_indices[i:i + batch_size]
        x = [_data(j) for j in batch_indices]
        y = [_data(j + 1) for j in batch_indices]
        yield torch.tensor(x, device=device), torch.tensor(y, device=device)

# Demo: iterate over the toy sequence 0..29 in batches of 2 samples with
# 6 time steps each and print every input/label pair.
my_seq = list(range(30))
for x, y in data_iter_random(my_seq, batch_size=2, num_steps=6):
    print('x: ', x, '\ny:', y, '\n')


x:  tensor([[ 0,  1,  2,  3,  4,  5],
        [ 6,  7,  8,  9, 10, 11]]) 
y: tensor([[ 1,  2,  3,  4,  5,  6],
        [ 7,  8,  9, 10, 11, 12]]) 

x:  tensor([[12, 13, 14, 15, 16, 17],
        [18, 19, 20, 21, 22, 23]]) 
y: tensor([[13, 14, 15, 16, 17, 18],
        [19, 20, 21, 22, 23, 24]]) 
M2-Adjacent sampling

#### 3, Recurrent neural network

##### 1 - Recurrent neural network

The goal is to predict the next character of a sequence from the current input and the past inputs. A recurrent neural network introduces a hidden variable $H$, whose value at time step $t$ is $H_t$. $H_t$ is computed from the current input $X_t$ and the previous hidden state $H_{t-1}$, so it can be regarded as a record of the sequence information up to the current character; $H_t$ is then used to predict the next character of the sequence
![Insert picture description here](,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2JrYnZ1c2h1,size_16,color_FFFFFF,t_70)
##### 2 - Structure of the recurrent neural network
![Insert picture description here](,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2JrYnZ1c2h1,size_16,color_FFFFFF,t_70)
##### 3 - Implementing the recurrent neural network

· read in data
import torch
import torch.nn as nn
import time
import math
import sys
import d2l_jay9460 as d2l
# Load the encoded lyrics corpus through the d2l_jay9460 helper package
# (presumably equivalent to a load_data_jay_lyrics defined locally — verify).
(corpus_indices, char_to_idx, idx_to_char, vocab_size) = d2l.load_data_jay_lyrics()
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

·One hot vector -- used to represent characters as vectors

def one_hot(x, n_class, dtype=torch.float32):
    """Encode a 1-D tensor of class indices as one-hot rows.

    Args:
        x: Tensor of ``n`` integer class indices.
        n_class: Width of each one-hot row.
        dtype: Element type of the result.

    Returns:
        Tensor of shape ``(n, n_class)`` on the same device as ``x`` with a
        single 1 per row at the column given by ``x``.
    """
    columns = x.long().view(-1, 1)  # one column index per row
    encoded = torch.zeros(x.shape[0], n_class, dtype=dtype, device=x.device)
    # scatter_ writes in place and returns the same tensor
    return encoded.scatter_(1, columns, 1)
# Demo: encode the indices 0 and 2 as one-hot rows of width vocab_size.
x = torch.tensor([0, 2])
x_one_hot = one_hot(x, vocab_size)
The shape of each sampled mini-batch is (batch_size, num_steps). The following function converts such a mini-batch into several matrices of shape (batch_size, vocab_size); the number of matrices is equal to the number of time steps

def to_onehot(X, n_class):
    """Turn a 2-D batch of indices into a list of one-hot matrices.

    Args:
        X: 2-D tensor of class indices; axis 1 is iterated over.
        n_class: Width of each one-hot row.

    Returns:
        A list with ``X.shape[1]`` entries, entry ``i`` being the one-hot
        encoding of column ``X[:, i]``.
    """
    per_step = []
    for step in range(X.shape[1]):
        per_step.append(one_hot(X[:, step], n_class))
    return per_step

# Demo: a (2, 5) batch holding the indices 0..9 is converted column by
# column; print the number of matrices and the shape of the first one.
X = torch.arange(10).view(2, 5)
inputs = to_onehot(X, vocab_size)
print(len(inputs), inputs[0].shape)

·Initialize model parameters

# Input and output widths both equal the vocabulary size; hidden width is 256.
num_inputs, num_hiddens, num_outputs = vocab_size, 256, vocab_size
# num_inputs: d
# num_hiddens: h, the number of hidden units is a hyperparameter
# num_outputs: q

def get_params():
    """Create the trainable parameters of the single-layer RNN.

    Sizes are read from the module-level ``num_inputs``, ``num_hiddens`` and
    ``num_outputs``; all tensors are created on the module-level ``device``.

    Returns:
        Tuple ``(W_xh, W_hh, b_h, W_hq, b_q)`` of ``torch.nn.Parameter``
        objects: weights drawn from a normal distribution with mean 0 and
        standard deviation 0.01, biases initialised to zero.
    """
    def _weight(shape):
        values = torch.zeros(shape, device=device, dtype=torch.float32)
        # normal_ fills the tensor in place and returns it
        return torch.nn.Parameter(nn.init.normal_(values, 0, 0.01))

    def _bias(size):
        return torch.nn.Parameter(torch.zeros(size, device=device))

    # The order of these three calls fixes the order of the random draws.
    W_xh = _weight((num_inputs, num_hiddens))     # input -> hidden
    W_hh = _weight((num_hiddens, num_hiddens))    # hidden -> hidden
    W_hq = _weight((num_hiddens, num_outputs))    # hidden -> output
    return (W_xh, W_hh, _bias(num_hiddens), W_hq, _bias(num_outputs))

·Define the model -- the function `rnn` computes the recurrent neural network one time step at a time using a loop

def rnn(inputs, state, params):
    """Run the RNN over a sequence of per-step inputs.

    Args:
        inputs: Iterable of matrices, one per time step, each multiplied
            with ``W_xh`` (rows are batch entries).
        state: 1-tuple ``(H,)`` holding the initial hidden state.
        params: Tuple ``(W_xh, W_hh, b_h, W_hq, b_q)``.

    Returns:
        ``(outputs, (H,))`` where ``outputs`` is a list with one output
        matrix per time step and ``H`` is the final hidden state.
    """
    W_xh, W_hh, b_h, W_hq, b_q = params
    H, = state
    outputs = []
    for X in inputs:
        # New hidden state from the current input and the previous state
        H = torch.tanh(torch.matmul(X, W_xh) + torch.matmul(H, W_hh) + b_h)
        Y = torch.matmul(H, W_hq) + b_q
        # Collect the output of every step; without this the caller
        # receives an empty list.
        outputs.append(Y)
    return outputs, (H,)

The `init_rnn_state` function initializes the hidden state; its return value is a tuple

def init_rnn_state(batch_size, num_hiddens, device):
    """Return the initial hidden state as a 1-tuple.

    Args:
        batch_size: Number of rows of the state matrix.
        num_hiddens: Number of columns of the state matrix.
        device: Torch device on which the tensor is created.

    Returns:
        ``(H,)`` where ``H`` is an all-zero ``(batch_size, num_hiddens)``
        tensor.
    """
    hidden = torch.zeros((batch_size, num_hiddens), device=device)
    return (hidden,)
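· Gradient clipping

Recurrent neural networks are prone to vanishing or exploding gradients, which can make the network almost impossible to train. Gradient clipping is used to deal with exploding gradients: if the L2 norm $\|g\|$ of all gradients concatenated into one vector $g$ exceeds a threshold $\theta$, the gradients are rescaled to $\frac{\theta}{\|g\|} g$, so that the norm never exceeds $\theta$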

def grad_clipping(params, theta, device):
    """Rescale gradients in place so their global L2 norm is at most ``theta``.

    Args:
        params: Iterable of tensors with a ``.grad`` attribute; entries
            whose gradient is ``None`` are skipped.
        theta: Maximum allowed L2 norm over all gradients together.
        device: Torch device on which the norm is accumulated.
    """
    grads = [param.grad.data for param in params if param.grad is not None]
    norm = torch.tensor([0.0], device=device)
    for grad in grads:
        norm += (grad ** 2).sum()
    norm = norm.sqrt().item()
    if norm > theta:
        for grad in grads:
            # In-place scaling keeps the tensor attached to its parameter
            grad.mul_(theta / norm)

·Define prediction function

def predict_rnn(prefix, num_chars, rnn, params, init_rnn_state,
                num_hiddens, vocab_size, device, idx_to_char, char_to_idx):
    """Generate ``num_chars`` characters that continue ``prefix``.

    Args:
        prefix: Non-empty string used to warm up the hidden state.
        num_chars: Number of characters to generate after the prefix.
        rnn: Callable ``rnn(inputs, state, params) -> (outputs, state)``
            that returns one output matrix per input step.
        params: Model parameters forwarded to ``rnn``.
        init_rnn_state: Callable ``(batch_size, num_hiddens, device)``
            returning the initial state.
        num_hiddens: Hidden size forwarded to ``init_rnn_state``.
        vocab_size: Width of the one-hot encoding.
        device: Torch device for the input tensors.
        idx_to_char: Sequence mapping index to character.
        char_to_idx: Mapping from character to index.

    Returns:
        The prefix followed by the generated characters, as one string.
    """
    state = init_rnn_state(1, num_hiddens, device)
    # output holds the indices of the prefix plus the predicted characters
    output = [char_to_idx[prefix[0]]]
    for t in range(num_chars + len(prefix) - 1):
        # The previous step's character is the input of the current step
        X = to_onehot(torch.tensor([[output[-1]]], device=device), vocab_size)
        # Compute the output and update the hidden state
        (Y, state) = rnn(X, state, params)
        if t < len(prefix) - 1:
            # Still inside the prefix: feed the known next character
            output.append(char_to_idx[prefix[t + 1]])
        else:
            # Past the prefix: take the highest-scoring character
            output.append(int(Y[0].argmax(dim=1).item()))
    return ''.join([idx_to_char[i] for i in output])


Perplexity is used to evaluate the quality of a language model. It is the value obtained by applying the exponential function to the cross-entropy loss

·Define model training function

def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, device, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    """Train the RNN and periodically print perplexity and sample text.

    Args:
        rnn: Callable ``rnn(inputs, state, params) -> (outputs, state)``.
        get_params: Callable returning the sequence of model parameters.
        init_rnn_state: Callable ``(batch_size, num_hiddens, device)``
            returning the initial state tuple.
        num_hiddens: Hidden size forwarded to ``init_rnn_state``.
        vocab_size: Width of the one-hot encoding.
        device: Torch device used for data and state.
        corpus_indices: Corpus as a sequence of token indices.
        idx_to_char: Sequence mapping index to character.
        char_to_idx: Mapping from character to index.
        is_random_iter: Use ``d2l.data_iter_random`` when true, otherwise
            ``d2l.data_iter_consecutive``.
        num_epochs: Number of passes over the corpus.
        num_steps: Time steps per sample.
        lr: Learning rate passed to ``d2l.sgd``.
        clipping_theta: Gradient-norm threshold for ``grad_clipping``.
        batch_size: Samples per mini-batch.
        pred_period: Report every ``pred_period`` epochs.
        pred_len: Number of characters to generate per prefix.
        prefixes: Iterable of prefix strings for the sample predictions.
    """
    if is_random_iter:
        data_iter_fn = d2l.data_iter_random
    else:
        data_iter_fn = d2l.data_iter_consecutive
    params = get_params()
    loss = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        if not is_random_iter:
            # Adjacent sampling: the hidden state is initialised once per epoch
            state = init_rnn_state(batch_size, num_hiddens, device)
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, device)
        for X, Y in data_iter:
            if is_random_iter:
                # Random sampling: start every mini-batch from a fresh state
                state = init_rnn_state(batch_size, num_hiddens, device)
            else:
                # Detach the carried-over state from the computation graph so
                # backpropagation stops at the mini-batch boundary
                for s in state:
                    s.detach_()
            # inputs: one one-hot matrix per time step
            inputs = to_onehot(X, vocab_size)
            # outputs: one output matrix per time step
            (outputs, state) = rnn(inputs, state, params)
            # Stack the per-step outputs along the first axis
            outputs = torch.cat(outputs, dim=0)
            # Y is transposed and flattened so that its entries line up with
            # the rows of the stacked outputs (step-major order)
            y = torch.flatten(Y.T)
            # Average cross-entropy over all predictions in the mini-batch
            l = loss(outputs, y.long())

            # Reset gradients left over from the previous step
            if params[0].grad is not None:
                for param in params:
                    param.grad.data.zero_()
            l.backward()
            grad_clipping(params, clipping_theta, device)  # clip gradients
            # The loss is already a mean, so the gradient is not averaged again
            d2l.sgd(params, lr, 1)
            l_sum += l.item() * y.shape[0]
            n += y.shape[0]

        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn(prefix, pred_len, rnn, params, init_rnn_state,
                    num_hiddens, vocab_size, device, idx_to_char, char_to_idx))
Published 2 original articles, praised 0 and visited 27
Private letter follow

Tags: network Python

Posted on Fri, 14 Feb 2020 04:12:56 -0500 by JohnnyBlaze