Hands on deep learning
- Day 1
- Day 2
Linear regression
code implementation
# import packages and modules
%matplotlib inline
import torch
from IPython import display
from matplotlib import pyplot as plt
import numpy as np
import random

# set input feature number
num_inputs = 2
# set example number
num_examples = 1000

# set true weight and bias in order to generate the corresponding labels
true_w = [2, -3.4]
true_b = 4.2

features = torch.randn(num_examples, num_inputs, dtype=torch.float32)
labels = true_w[0] * features[:, 0] + true_w[1] * features[:, 1] + true_b
labels += torch.tensor(np.random.normal(0, 0.01, size=labels.size()), dtype=torch.float32)

def data_iter(batch_size, features, labels):
    num_examples = len(features)
    indices = list(range(num_examples))
    random.shuffle(indices)  # read the samples in random order
    for i in range(0, num_examples, batch_size):
        j = torch.LongTensor(indices[i: min(i + batch_size, num_examples)])
        # the last batch may be smaller than batch_size
        yield features.index_select(0, j), labels.index_select(0, j)

batch_size = 10

for X, y in data_iter(batch_size, features, labels):
    print(X, '\n', y)
    break

w = torch.tensor(np.random.normal(0, 0.01, (num_inputs, 1)), dtype=torch.float32)
b = torch.zeros(1, dtype=torch.float32)
w.requires_grad_(requires_grad=True)
b.requires_grad_(requires_grad=True)

def linreg(X, w, b):
    return torch.mm(X, w) + b

def squared_loss(y_hat, y):
    return (y_hat - y.view(y_hat.size())) ** 2 / 2

def sgd(params, lr, batch_size):
    for param in params:
        # use .data to update the parameter without tracking the gradient
        param.data -= lr * param.grad / batch_size

# hyperparameters
lr = 0.03
num_epochs = 5

net = linreg
loss = squared_loss

# training
for epoch in range(num_epochs):  # training repeats num_epochs times
    # in each epoch, every sample in the dataset is used once
    # X is the feature and y is the label of a minibatch
    for X, y in data_iter(batch_size, features, labels):
        l = loss(net(X, w, b), y).sum()
        # calculate the gradient of the minibatch loss
        l.backward()
        # use minibatch stochastic gradient descent to update the model parameters
        sgd([w, b], lr, batch_size)
        # reset parameter gradients
        w.grad.data.zero_()
        b.grad.data.zero_()
    train_l = loss(net(features, w, b), labels)
    print('epoch %d, loss %f' % (epoch + 1, train_l.mean().item()))

print(w, true_w, b, true_b)
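For reference, the same model can be trained much more concisely with torch.nn and torch.optim. The following is a minimal sketch, not part of the original notes, that reuses the features and labels generated above; the hyperparameters simply mirror the from-scratch version.

import torch
import torch.utils.data as Data
from torch import nn, optim

# wrap the generated features/labels (defined above) in a DataLoader
dataset = Data.TensorDataset(features, labels)
data_loader = Data.DataLoader(dataset, batch_size=10, shuffle=True)

net = nn.Linear(num_inputs, 1)              # a single linear layer: y = Xw + b
loss = nn.MSELoss()                         # mean squared error loss
optimizer = optim.SGD(net.parameters(), lr=0.03)

for epoch in range(5):
    for X, y in data_loader:
        l = loss(net(X), y.view(-1, 1))     # reshape labels to (batch, 1) to match the output
        optimizer.zero_grad()
        l.backward()
        optimizer.step()
    print('epoch %d, loss %f' % (epoch + 1, l.item()))

print(net.weight.data, true_w, net.bias.data, true_b)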
Softmax and classification model
Start from scratch
import torch
import torchvision
import numpy as np
import sys
sys.path.append("/home/kesci/input")
import d2lzh1981 as d2l

batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, root='/home/kesci/input/FashionMNIST2065')

num_inputs = 784
print(28*28)
num_outputs = 10

W = torch.tensor(np.random.normal(0, 0.01, (num_inputs, num_outputs)), dtype=torch.float)
b = torch.zeros(num_outputs, dtype=torch.float)
W.requires_grad_(requires_grad=True)
b.requires_grad_(requires_grad=True)

X = torch.tensor([[1, 2, 3], [4, 5, 6]])
print(X.sum(dim=0, keepdim=True))   # dim=0: sum over rows (per column), keep the summed dimension
print(X.sum(dim=1, keepdim=True))   # dim=1: sum over columns (per row), keep the summed dimension
print(X.sum(dim=0, keepdim=False))  # dim=0: sum over rows (per column), drop the summed dimension
print(X.sum(dim=1, keepdim=False))  # dim=1: sum over columns (per row), drop the summed dimension

# define the softmax operation
def softmax(X):
    X_exp = X.exp()
    partition = X_exp.sum(dim=1, keepdim=True)
    return X_exp / partition  # broadcasting is applied here

X = torch.rand((2, 5))
X_prob = softmax(X)
print(X_prob, '\n', X_prob.sum(dim=1))

def net(X):
    return softmax(torch.mm(X.view((-1, num_inputs)), W) + b)

y_hat = torch.tensor([[0.1, 0.3, 0.6], [0.3, 0.2, 0.5]])
y = torch.LongTensor([0, 2])
y_hat.gather(1, y.view(-1, 1))

def cross_entropy(y_hat, y):
    return - torch.log(y_hat.gather(1, y.view(-1, 1)))

def accuracy(y_hat, y):
    return (y_hat.argmax(dim=1) == y).float().mean().item()

# This function is saved in the d2lzh_pytorch package for later use. It will be improved step by step;
# its complete implementation is described in the "image augmentation" section.
def evaluate_accuracy(data_iter, net):
    acc_sum, n = 0.0, 0
    for X, y in data_iter:
        acc_sum += (net(X).argmax(dim=1) == y).float().sum().item()
        n += y.shape[0]
    return acc_sum / n

num_epochs, lr = 5, 0.1

# This function is saved in the d2lzh_pytorch package for later use
def train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
              params=None, lr=None, optimizer=None):
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n = 0.0, 0.0, 0
        for X, y in train_iter:
            y_hat = net(X)
            l = loss(y_hat, y).sum()

            # clear the gradients
            if optimizer is not None:
                optimizer.zero_grad()
            elif params is not None and params[0].grad is not None:
                for param in params:
                    param.grad.data.zero_()

            l.backward()
            if optimizer is None:
                d2l.sgd(params, lr, batch_size)
            else:
                optimizer.step()

            train_l_sum += l.item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().item()
            n += y.shape[0]
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f'
              % (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc))

train_ch3(net, train_iter, test_iter, cross_entropy, num_epochs, batch_size, [W, b], lr)
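One caveat: the softmax above exponentiates the scores directly, which can overflow for large inputs. A common remedy, not used in the original code, is to subtract the per-row maximum before exponentiating; a minimal sketch:

def stable_softmax(X):
    # subtracting the row-wise max does not change the result mathematically,
    # but it keeps exp() from overflowing for large scores
    X_shifted = X - X.max(dim=1, keepdim=True)[0]
    X_exp = X_shifted.exp()
    return X_exp / X_exp.sum(dim=1, keepdim=True)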
model prediction
X, y = next(iter(test_iter))

true_labels = d2l.get_fashion_mnist_labels(y.numpy())
pred_labels = d2l.get_fashion_mnist_labels(net(X).argmax(dim=1).numpy())
titles = [true + '\n' + pred for true, pred in zip(true_labels, pred_labels)]

d2l.show_fashion_mnist(X[0:9], titles[0:9])
Multilayer perceptron
Basic knowledge
ReLU is a general-purpose activation function and the most common default choice at present. Note, however, that ReLU is normally used only in hidden layers.
Sigmoid (and combinations of it) usually works better for classifier outputs, but because of the vanishing-gradient problem, sigmoid and tanh are sometimes avoided in hidden layers.
When a network has many layers, ReLU is usually the better choice: it is simpler and cheaper to compute, whereas sigmoid and tanh require considerably more computation.
When selecting an activation function, try ReLU first; if the results are not satisfactory, try the other activation functions.
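To make the vanishing-gradient point concrete, here is a small sketch (not part of the original notes) that prints the gradients of ReLU, sigmoid, and tanh at a few inputs; for large |x| the sigmoid and tanh gradients are close to zero, while ReLU's gradient stays at 1 for positive inputs.

import torch

x = torch.tensor([-10.0, -1.0, 0.5, 1.0, 10.0], requires_grad=True)

for name, act in [('relu', torch.relu), ('sigmoid', torch.sigmoid), ('tanh', torch.tanh)]:
    y = act(x).sum()
    y.backward()          # accumulate d act(x) / d x into x.grad
    print(name, x.grad)
    x.grad.zero_()        # clear the gradient before the next activation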
code implementation
import torch
import numpy as np
import sys
sys.path.append("/home/kesci/input")
import d2lzh1981 as d2l

batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, root='/home/kesci/input/FashionMNIST2065')

num_inputs, num_outputs, num_hiddens = 784, 10, 256

W1 = torch.tensor(np.random.normal(0, 0.01, (num_inputs, num_hiddens)), dtype=torch.float)
b1 = torch.zeros(num_hiddens, dtype=torch.float)
W2 = torch.tensor(np.random.normal(0, 0.01, (num_hiddens, num_outputs)), dtype=torch.float)
b2 = torch.zeros(num_outputs, dtype=torch.float)

params = [W1, b1, W2, b2]
for param in params:
    param.requires_grad_(requires_grad=True)

def relu(X):
    return torch.max(input=X, other=torch.tensor(0.0))

def net(X):
    X = X.view((-1, num_inputs))
    H = relu(torch.matmul(X, W1) + b1)
    return torch.matmul(H, W2) + b2

loss = torch.nn.CrossEntropyLoss()

num_epochs, lr = 5, 100.0

# train_ch3 is the same training loop defined in the "Softmax and classification model" section;
# here we call the copy saved in the d2l package
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, params, lr)
pytorch implementation
import torch
from torch import nn
from torch.nn import init
import numpy as np
import sys
sys.path.append("/home/kesci/input")
import d2lzh1981 as d2l

num_inputs, num_outputs, num_hiddens = 784, 10, 256

net = nn.Sequential(
    d2l.FlattenLayer(),
    nn.Linear(num_inputs, num_hiddens),
    nn.ReLU(),
    nn.Linear(num_hiddens, num_outputs),
)

for params in net.parameters():
    init.normal_(params, mean=0, std=0.01)

batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, root='/home/kesci/input/FashionMNIST2065')
loss = torch.nn.CrossEntropyLoss()

optimizer = torch.optim.SGD(net.parameters(), lr=0.5)

num_epochs = 5
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, None, None, optimizer)

Day 2
Text preprocessing
General steps
- Read in text
- Tokenization
- Build a dictionary to map each word to a unique index
- Convert the text from a sequence of words to a sequence of indices
Code example
Read in text
import collections
import re

def read_time_machine():
    with open('/home/kesci/input/timemachine7163/timemachine.txt', 'r') as f:
        lines = [re.sub('[^a-z]+', ' ', line.strip().lower()) for line in f]
    return lines

lines = read_time_machine()
print('# sentences %d' % len(lines))
Tokenization
def tokenize(sentences, token='word'):
    """Split sentences into word or char tokens"""
    if token == 'word':
        return [sentence.split(' ') for sentence in sentences]
    elif token == 'char':
        return [list(sentence) for sentence in sentences]
    else:
        print('ERROR: unknown token type ' + token)

tokens = tokenize(lines)
tokens[0:2]
Build a dictionary
class Vocab(object):
    def __init__(self, tokens, min_freq=0, use_special_tokens=False):
        counter = count_corpus(tokens)  # a Counter of token frequencies
        self.token_freqs = list(counter.items())
        self.idx_to_token = []
        if use_special_tokens:
            # padding, begin of sentence, end of sentence, unknown
            self.pad, self.bos, self.eos, self.unk = (0, 1, 2, 3)
            self.idx_to_token += ['<pad>', '<bos>', '<eos>', '<unk>']
        else:
            self.unk = 0
            self.idx_to_token += ['<unk>']
        self.idx_to_token += [token for token, freq in self.token_freqs
                              if freq >= min_freq and token not in self.idx_to_token]
        self.token_to_idx = dict()
        for idx, token in enumerate(self.idx_to_token):
            self.token_to_idx[token] = idx

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

def count_corpus(sentences):
    tokens = [tk for st in sentences for tk in st]
    return collections.Counter(tokens)  # returns a Counter recording how often each token occurs
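Before indexing words in the next snippet, the vocabulary has to be instantiated; a minimal usage example (the variable name vocab is what the code below expects):

vocab = Vocab(tokens)
print(list(vocab.token_to_idx.items())[0:10])  # peek at the first few (token, index) pairs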
Turn words into indexes
for i in range(8, 10):
    print('words:', tokens[i])
    print('indices:', vocab[tokens[i]])
Disadvantages of the tokenization method above:
- Punctuation usually carries semantic information, but our method simply discards it.
- Contractions such as "shouldn't" and "doesn't" are handled incorrectly.
- Abbreviations such as "Mr." and "Dr." are handled incorrectly.
Existing tokenization tools handle these cases better:
text = "Mr. Chen doesn't agree with my suggestion."
1. spaCy
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(text)
print([token.text for token in doc])
2. NLTK
from nltk.tokenize import word_tokenize
from nltk import data

data.path.append('/home/kesci/input/nltk_data3784/nltk_data')
print(word_tokenize(text))
Language model
brief introduction
n-grams
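As a brief reminder of what this heading refers to (a standard definition rather than content spelled out in the notes): an n-gram language model assumes each word depends only on the previous n-1 words, so the probability of a sequence $w_1, \ldots, w_T$ factorizes as

$$P(w_1, w_2, \ldots, w_T) \approx \prod_{t=1}^{T} P(w_t \mid w_{t-(n-1)}, \ldots, w_{t-1})$$

where each conditional probability is estimated from corpus counts, e.g. for a bigram model $\hat{P}(w_t \mid w_{t-1}) = n(w_{t-1}, w_t) / n(w_{t-1})$.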
Language model dataset
Read the dataset

with open('/home/kesci/input/jaychou_lyrics4703/jaychou_lyrics.txt') as f:
    corpus_chars = f.read()
print(len(corpus_chars))
print(corpus_chars[: 40])

corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
corpus_chars = corpus_chars[: 10000]

Build a character index
idx_to_char = list(set(corpus_chars))  # deduplicate to get the index-to-character mapping
char_to_idx = {char: i for i, char in enumerate(idx_to_char)}  # character-to-index mapping
vocab_size = len(char_to_idx)
print(vocab_size)

corpus_indices = [char_to_idx[char] for char in corpus_chars]  # turn each character into an index to obtain an index sequence
sample = corpus_indices[: 20]
print('chars:', ''.join([idx_to_char[idx] for idx in sample]))
print('indices:', sample)

Define the function load_data_jay_lyrics
def load_data_jay_lyrics():
    with open('/home/kesci/input/jaychou_lyrics4703/jaychou_lyrics.txt') as f:
        corpus_chars = f.read()
    corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
    corpus_chars = corpus_chars[0:10000]
    idx_to_char = list(set(corpus_chars))
    char_to_idx = dict([(char, i) for i, char in enumerate(idx_to_char)])
    vocab_size = len(char_to_idx)
    corpus_indices = [char_to_idx[char] for char in corpus_chars]
    return corpus_indices, char_to_idx, idx_to_char, vocab_size

Sampling of time-series data
- Random sampling: each minibatch is sampled at random from the sequence
import torch
import random

def data_iter_random(corpus_indices, batch_size, num_steps, device=None):
    # minus 1 because for a sequence of length n, X contains at most the first n - 1 characters
    num_examples = (len(corpus_indices) - 1) // num_steps  # number of non-overlapping samples, rounded down
    example_indices = [i * num_steps for i in range(num_examples)]  # start index of each sample in corpus_indices
    random.shuffle(example_indices)

    def _data(i):
        # return the sequence of length num_steps starting at index i
        return corpus_indices[i: i + num_steps]

    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    for i in range(0, num_examples, batch_size):
        # each time, pick batch_size random samples
        batch_indices = example_indices[i: i + batch_size]  # start indices of the samples in the current batch
        X = [_data(j) for j in batch_indices]
        Y = [_data(j + 1) for j in batch_indices]
        yield torch.tensor(X, device=device), torch.tensor(Y, device=device)
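A quick sanity check on an artificial sequence of the integers 0 to 29; this also defines my_seq, which the consecutive-sampling example below reuses:

my_seq = list(range(30))
for X, Y in data_iter_random(my_seq, batch_size=2, num_steps=6):
    print('X: ', X, '\nY:', Y, '\n')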
- Consecutive sampling: two adjacent minibatches are adjacent on the original sequence
def data_iter_consecutive(corpus_indices, batch_size, num_steps, device=None):
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    corpus_len = len(corpus_indices) // batch_size * batch_size  # keep only a multiple of batch_size characters
    corpus_indices = corpus_indices[: corpus_len]  # keep only the first corpus_len characters
    indices = torch.tensor(corpus_indices, device=device)
    indices = indices.view(batch_size, -1)  # reshape into (batch_size, corpus_len // batch_size)
    batch_num = (indices.shape[1] - 1) // num_steps
    for i in range(batch_num):
        i = i * num_steps
        X = indices[:, i: i + num_steps]
        Y = indices[:, i + 1: i + num_steps + 1]
        yield X, Y

for X, Y in data_iter_consecutive(my_seq, batch_size=2, num_steps=6):
    print('X: ', X, '\nY:', Y, '\n')
Fundamentals of recurrent neural networks
Load data set
import os
os.listdir('/home/kesci/input')

import numpy as np
import torch
from torch import nn, optim
import torch.nn.functional as F

import sys
sys.path.append("../input/")
import d2l_jay9460 as d2l

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
(corpus_indices, char_to_idx, idx_to_char, vocab_size) = d2l.load_data_jay_lyrics()
GRU
Initialize parameters
num_inputs, num_hiddens, num_outputs = vocab_size, 256, vocab_size
print('will use', device)

def get_params():
    def _one(shape):
        ts = torch.tensor(np.random.normal(0, 0.01, size=shape), device=device, dtype=torch.float32)  # normal distribution
        return torch.nn.Parameter(ts, requires_grad=True)

    def _three():
        return (_one((num_inputs, num_hiddens)),
                _one((num_hiddens, num_hiddens)),
                torch.nn.Parameter(torch.zeros(num_hiddens, device=device, dtype=torch.float32), requires_grad=True))

    W_xz, W_hz, b_z = _three()  # update gate parameters
    W_xr, W_hr, b_r = _three()  # reset gate parameters
    W_xh, W_hh, b_h = _three()  # candidate hidden state parameters

    # output layer parameters
    W_hq = _one((num_hiddens, num_outputs))
    b_q = torch.nn.Parameter(torch.zeros(num_outputs, device=device, dtype=torch.float32), requires_grad=True)
    return nn.ParameterList([W_xz, W_hz, b_z, W_xr, W_hr, b_r, W_xh, W_hh, b_h, W_hq, b_q])

def init_gru_state(batch_size, num_hiddens, device):  # hidden state initialization
    return (torch.zeros((batch_size, num_hiddens), device=device), )
GRU model
def gru(inputs, state, params):
    W_xz, W_hz, b_z, W_xr, W_hr, b_r, W_xh, W_hh, b_h, W_hq, b_q = params
    H, = state
    outputs = []
    for X in inputs:
        Z = torch.sigmoid(torch.matmul(X, W_xz) + torch.matmul(H, W_hz) + b_z)
        R = torch.sigmoid(torch.matmul(X, W_xr) + torch.matmul(H, W_hr) + b_r)
        H_tilda = torch.tanh(torch.matmul(X, W_xh) + R * torch.matmul(H, W_hh) + b_h)
        H = Z * H + (1 - Z) * H_tilda
        Y = torch.matmul(H, W_hq) + b_q
        outputs.append(Y)
    return outputs, (H,)
train
num_epochs, num_steps, batch_size, lr, clipping_theta = 160, 35, 32, 1e2, 1e-2
pred_period, pred_len, prefixes = 40, 50, ['分开', '不分开']  # lyric prefixes from the Jay Chou corpus ("separate" / "not separate")

d2l.train_and_predict_rnn(gru, get_params, init_gru_state, num_hiddens,
                          vocab_size, device, corpus_indices, idx_to_char,
                          char_to_idx, False, num_epochs, num_steps, lr,
                          clipping_theta, batch_size, pred_period, pred_len,
                          prefixes)
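For comparison, the GRU can also be trained with the concise PyTorch API, mirroring the pattern these notes use later for the deep and bidirectional networks. This is a sketch; the learning rate of 1e-2 is an assumption borrowed from those later examples rather than a tuned value.

gru_layer = nn.GRU(input_size=vocab_size, hidden_size=num_hiddens)
model = d2l.RNNModel(gru_layer, vocab_size).to(device)
d2l.train_and_predict_rnn_pytorch(model, num_hiddens, vocab_size, device,
                                  corpus_indices, idx_to_char, char_to_idx,
                                  num_epochs, num_steps, 1e-2, clipping_theta,
                                  batch_size, pred_period, pred_len, prefixes)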
LSTM
Initialize parameters
num_inputs, num_hiddens, num_outputs = vocab_size, 256, vocab_size
print('will use', device)

def get_params():
    def _one(shape):
        ts = torch.tensor(np.random.normal(0, 0.01, size=shape), device=device, dtype=torch.float32)
        return torch.nn.Parameter(ts, requires_grad=True)

    def _three():
        return (_one((num_inputs, num_hiddens)),
                _one((num_hiddens, num_hiddens)),
                torch.nn.Parameter(torch.zeros(num_hiddens, device=device, dtype=torch.float32), requires_grad=True))

    W_xi, W_hi, b_i = _three()  # input gate parameters
    W_xf, W_hf, b_f = _three()  # forget gate parameters
    W_xo, W_ho, b_o = _three()  # output gate parameters
    W_xc, W_hc, b_c = _three()  # candidate memory cell parameters

    # output layer parameters
    W_hq = _one((num_hiddens, num_outputs))
    b_q = torch.nn.Parameter(torch.zeros(num_outputs, device=device, dtype=torch.float32), requires_grad=True)
    return nn.ParameterList([W_xi, W_hi, b_i, W_xf, W_hf, b_f, W_xo, W_ho, b_o, W_xc, W_hc, b_c, W_hq, b_q])

def init_lstm_state(batch_size, num_hiddens, device):
    return (torch.zeros((batch_size, num_hiddens), device=device),
            torch.zeros((batch_size, num_hiddens), device=device))
LSTM model
def lstm(inputs, state, params):
    [W_xi, W_hi, b_i, W_xf, W_hf, b_f, W_xo, W_ho, b_o, W_xc, W_hc, b_c, W_hq, b_q] = params
    (H, C) = state
    outputs = []
    for X in inputs:
        I = torch.sigmoid(torch.matmul(X, W_xi) + torch.matmul(H, W_hi) + b_i)
        F = torch.sigmoid(torch.matmul(X, W_xf) + torch.matmul(H, W_hf) + b_f)
        O = torch.sigmoid(torch.matmul(X, W_xo) + torch.matmul(H, W_ho) + b_o)
        C_tilda = torch.tanh(torch.matmul(X, W_xc) + torch.matmul(H, W_hc) + b_c)
        C = F * C + I * C_tilda
        H = O * C.tanh()
        Y = torch.matmul(H, W_hq) + b_q
        outputs.append(Y)
    return outputs, (H, C)
train
num_epochs, num_steps, batch_size, lr, clipping_theta = 160, 35, 32, 1e2, 1e-2
pred_period, pred_len, prefixes = 40, 50, ['分开', '不分开']  # lyric prefixes from the Jay Chou corpus ("separate" / "not separate")

d2l.train_and_predict_rnn(lstm, get_params, init_lstm_state, num_hiddens,
                          vocab_size, device, corpus_indices, idx_to_char,
                          char_to_idx, False, num_epochs, num_steps, lr,
                          clipping_theta, batch_size, pred_period, pred_len,
                          prefixes)
Deep recurrent neural network
num_hiddens = 256
num_epochs, num_steps, batch_size, lr, clipping_theta = 160, 35, 32, 1e2, 1e-2
pred_period, pred_len, prefixes = 40, 50, ['分开', '不分开']  # lyric prefixes from the Jay Chou corpus ("separate" / "not separate")
lr = 1e-2  # note: the learning rate needs adjusting for the PyTorch trainer

lstm_layer = nn.LSTM(input_size=vocab_size, hidden_size=num_hiddens, num_layers=2)  # two stacked recurrent layers
model = d2l.RNNModel(lstm_layer, vocab_size).to(device)
d2l.train_and_predict_rnn_pytorch(model, num_hiddens, vocab_size, device,
                                  corpus_indices, idx_to_char, char_to_idx,
                                  num_epochs, num_steps, lr, clipping_theta,
                                  batch_size, pred_period, pred_len, prefixes)
Bidirectional recurrent neural network
num_hiddens = 128
num_epochs, num_steps, batch_size, lr, clipping_theta = 160, 35, 32, 1e-2, 1e-2
pred_period, pred_len, prefixes = 40, 50, ['分开', '不分开']  # lyric prefixes from the Jay Chou corpus ("separate" / "not separate")
lr = 1e-2  # note: the learning rate needs adjusting

gru_layer = nn.GRU(input_size=vocab_size, hidden_size=num_hiddens, bidirectional=True)
model = d2l.RNNModel(gru_layer, vocab_size).to(device)
d2l.train_and_predict_rnn_pytorch(model, num_hiddens, vocab_size, device,
                                  corpus_indices, idx_to_char, char_to_idx,
                                  num_epochs, num_steps, lr, clipping_theta,
                                  batch_size, pred_period, pred_len, prefixes)