Neural network experiment: mapping integers to one-hot vectors. Finding: the Adam optimizer is not universally best; sometimes RMSprop converges faster.

Experiment: map the 16 integers 0-15 to one-hot vectors of length 16 with a two-layer MLP, requiring 100% accuracy.
To increase the difficulty, extend this to 100 integers with one-hot vectors of length 100.
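
For reference, a one-hot vector of length 16 for the integer 3 is all zeros except a single 1 at index 3; a minimal illustration (not part of the experiment scripts below):

import numpy as np

# One-hot vector of length 16 for the integer 3:
# all zeros except a single 1 at index 3.
one_hot = np.eye(16)[3]
print(one_hot)  # [0. 0. 0. 1. 0. ... 0.]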

With 16 integers and no preprocessing of the input:
Using the Adam optimizer, it generally takes more than 10000 iterations to reach 100% accuracy.
Using the RMSprop optimizer, 100% accuracy is reached within 2000 iterations in most cases.
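
In the scripts below, the two runs differ only in which optimizer is constructed; conceptually the swap is just the following sketch, with a stand-in model in place of the two-layer SelectNet:

import torch

# Hypothetical stand-in for the two-layer SelectNet used in the scripts below.
model = torch.nn.Linear(1, 16)

# The two configurations compared in the experiment, same learning rate:
optim_rmsprop = torch.optim.RMSprop(model.parameters(), lr=0.001)
optim_adam = torch.optim.Adam(model.parameters(), lr=0.001)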

With 100 integers:
Using the RMSprop optimizer:
Without any input preprocessing, accuracy is only 0.45 after 40000 iterations.
With min-max normalization of the input, accuracy reaches 1.0 after about 5100 iterations.
With zero-mean normalization (scaled to [-1, 1]), accuracy reaches 1.0 after about 3600 iterations.
With standardization (zero mean, unit variance; labeled 'Regularization' in the code), accuracy reaches 1.0 after about 2300 iterations.
With the Adam optimizer, none of the data processing methods lead to convergence.
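
The three preprocessing variants amount to the following transforms of the input array (a sketch mirroring the branches in the scripts below, applied here to the 100-integer case):

import numpy as np

x = np.arange(100).astype(np.float64)

# Min-max normalization to [0, 1]  ("normalization")
x_minmax = (x - x.min()) / (x.max() - x.min())

# Zero-mean normalization, scaled to [-1, 1]  ("0 Mean normalization")
x_zero_mean = (x - x.min()) / (x.max() - x.min()) * 2 - 1

# Standardization to zero mean and unit variance  ("Regularization" in the code)
x_standard = (x - x.mean()) / x.std()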

Here is the code for the 16-integer experiment; the 100-integer experiment follows.

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np


class SelectNet(nn.Module):
    def __init__(self, in_dim=1, out_dim=4*4, inter_dim=32, n_layer=2, act=nn.LeakyReLU(0.1)):
        super().__init__()
        self.mod_list = nn.ModuleList()
        for i in range(n_layer):
            d_in = in_dim if i == 0 else inter_dim
            d_out = out_dim if i == n_layer - 1 else inter_dim
            self.mod_list.append(nn.Linear(d_in, d_out))
            self.mod_list.append(act)

    def forward(self, x):
        y = x
        for m in self.mod_list:
            y = m(y)
        return y  # F.sigmoid(y)


torch.set_grad_enabled(False)

in_dim = 1
out_dim = 4*4
inter_dim = int(out_dim*1.3)
n_layer = 2
net = SelectNet(in_dim, out_dim, inter_dim, n_layer, nn.LeakyReLU(0.1))

x_train = np.arange(out_dim)

norm_type = 'Keep it as it is.'
print('Original data processing method', norm_type)
if norm_type == 'normalization':
    # Min-max normalization: with length 100, accuracy reaches 1.0 at about 5100 iterations
    x_train = (x_train - x_train.min()) / (x_train.max() - x_train.min())
elif norm_type == '0 Mean normalization':
    # Zero-mean normalization to [-1, 1]: with length 100, accuracy reaches 1.0 at about 3600 iterations
    x_train = (x_train - x_train.min()) / (x_train.max() - x_train.min()) * 2 - 1
elif norm_type == 'Regularization':
    # Standardization (zero mean, unit variance): with length 100, accuracy reaches 1.0 at about 2300 iterations
    x_train = (x_train - x_train.mean()) / np.std(x_train)
elif norm_type == 'Keep it as it is.':
    # No preprocessing: the slowest. With length 100, accuracy was only 0.45 after 50000 iterations, so training was stopped
    x_train = x_train
else:
    print('Keep as is by default')

# Build the targets greedily ("first come, first served"): feed each input through
# the untrained network and assign it the highest-scoring output class that is still free.
y_train = []
mask = np.ones(len(x_train))
for i in x_train:
    x = torch.Tensor([i])[None, ...]
    a = net(x).cpu().detach().numpy()[0]
    a -= (a.min() - 1)   # shift all scores above zero so masking works
    a *= mask            # zero out classes that are already taken
    pos = np.argmax(a)
    y_train.append(pos)
    mask[pos] = 0
# y_train = x_train.copy()
print(y_train)


def mini_train(net, x_train, y_train, batch_size=None, target_accuracy=None, max_count=None):
    # Full-batch training; stop when accuracy reaches 1.0
    optim = torch.optim.RMSprop(net.parameters(), lr=0.001)
    # optim = torch.optim.Adam(net.parameters(), lr=0.001)
    x_train = torch.Tensor(x_train)
    y_train = torch.Tensor(y_train)

    def check_accuracy(xs, ys):
        with torch.no_grad():
            out_ys = net(torch.Tensor(xs)).numpy()
            return np.sum(np.argmax(out_ys, 1) == ys.numpy()) / len(ys)

    with torch.enable_grad():
        for b in range(99999999999999999):
            batch_x = torch.Tensor(x_train).type(torch.float32)
            batch_y = torch.Tensor(y_train).type(torch.long)
            out = net(batch_x)
            loss = F.cross_entropy(out, batch_y)
            # Note: gradients are never zeroed, so they accumulate across iterations
            loss.backward()
            optim.step()
            acc = check_accuracy(x_train, y_train)
            print(b, acc, loss.item())
            if acc == 1:
                break


mini_train(net, x_train[:, None], y_train)
print('over')
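
A small sanity check that could be appended after training (not in the original script; it assumes the net, x_train and y_train variables defined above): feed every input through the trained network and compare the argmax of its output with the greedily assigned target class.

with torch.no_grad():
    preds = net(torch.Tensor(x_train)[:, None]).argmax(1)
for xi, pi, yi in zip(x_train, preds.tolist(), y_train):
    print(float(xi), '->', pi, 'target:', int(yi))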

And here is the code for the 100-integer experiment.

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np


class SelectNet(nn.Module):
    def __init__(self, in_dim=1, out_dim=4*4, inter_dim=32, n_layer=2, act=nn.LeakyReLU(0.1)):
        super().__init__()
        self.mod_list = nn.ModuleList()
        for i in range(n_layer):
            d_in = in_dim if i == 0 else inter_dim
            d_out = out_dim if i == n_layer - 1 else inter_dim
            self.mod_list.append(nn.Linear(d_in, d_out))
            self.mod_list.append(act)

    def forward(self, x):
        y = x
        for m in self.mod_list:
            y = m(y)
        return y  # F.sigmoid(y)


torch.set_grad_enabled(False)

in_dim = 1
out_dim = 10*10
inter_dim = int(out_dim*1.3)
n_layer = 2
net = SelectNet(in_dim, out_dim, inter_dim, n_layer, nn.LeakyReLU(0.1))

x_train = np.arange(out_dim)

norm_type = 'Regularization'
optim_type = torch.optim.RMSprop
# optim_type = torch.optim.Adam
print('Original data processing method', norm_type)
if norm_type == 'normalization':
    # Min-max normalization: with length 100, accuracy reaches 1.0 at about 5100 iterations
    x_train = (x_train - x_train.min()) / (x_train.max() - x_train.min())
elif norm_type == '0 Mean normalization':
    # Zero-mean normalization to [-1, 1]: with length 100, accuracy reaches 1.0 at about 3600 iterations
    x_train = (x_train - x_train.min()) / (x_train.max() - x_train.min()) * 2 - 1
elif norm_type == 'Regularization':
    # Standardization (zero mean, unit variance): with length 100, accuracy reaches 1.0 at about 2300 iterations
    x_train = (x_train - x_train.mean()) / np.std(x_train)
elif norm_type == 'Keep it as it is.':
    # No preprocessing: the slowest. With length 100, accuracy was only 0.45 after 50000 iterations, so training was stopped
    x_train = x_train
else:
    print('Keep as is by default')

# Build the targets greedily ("first come, first served"): feed each input through
# the untrained network and assign it the highest-scoring output class that is still free.
y_train = []
mask = np.ones(len(x_train))
for i in x_train:
    x = torch.Tensor([i])[None, ...]
    a = net(x).cpu().detach().numpy()[0]
    a -= (a.min() - 1)   # shift all scores above zero so masking works
    a *= mask            # zero out classes that are already taken
    pos = np.argmax(a)
    y_train.append(pos)
    mask[pos] = 0
# y_train = x_train.copy()
print(y_train)


def mini_train(net, x_train, y_train, batch_size=None, target_accuracy=None, max_count=None):
    # Full-batch training; stop when accuracy reaches 1.0
    optim = optim_type(net.parameters(), lr=0.001)
    x_train = torch.Tensor(x_train)
    y_train = torch.Tensor(y_train)

    def check_accuracy(xs, ys):
        with torch.no_grad():
            out_ys = net(torch.Tensor(xs)).numpy()
            return np.sum(np.argmax(out_ys, 1) == ys.numpy()) / len(ys)

    with torch.enable_grad():
        for b in range(99999999999999999):
            batch_x = torch.Tensor(x_train).type(torch.float32)
            batch_y = torch.Tensor(y_train).type(torch.long)
            out = net(batch_x)
            loss = F.cross_entropy(out, batch_y)
            # Note: gradients are never zeroed, so they accumulate across iterations
            loss.backward()
            optim.step()
            acc = check_accuracy(x_train, y_train)
            print(b, acc, loss.item())
            if acc == 1:
                break


mini_train(net, x_train[:, None], y_train)
print('over')

3 December 2019