- The main contents of this check-in are:
- Pretrained model loading and modification
- Model saving during training
- Pseudo-label training
- Knowledge distillation
Pretrained model loading and modification
- The pretrained model is resnet18. Load the parameters saved in the torchvision model, freeze the existing parameters, and then attach a fully connected layer for regression
- Notes on loading the resnet18 model:
  - The frozen parameters are used to extract features from the images
  - The output of resnet18's last fully connected layer is the out_features that feed into the softmax layer
  - Iterate over model.parameters() and set param.requires_grad = False to freeze the parameters
- Loading the fully connected layer:
  - The fully connected layer is built with nn.Linear(), wrapped into a sequence with nn.Sequential(), and merged into the original network in place of the old fc layer
- The code is as follows:
```python
def set_parameter_requires_grad(model, feature_extracting):
    # Freeze every parameter so the pretrained backbone only extracts features
    if feature_extracting:
        for param in model.parameters():
            param.requires_grad = False


def resnet_model(out_feature, feature_extract=True):
    # Load pretrained resnet18, freeze it, and replace the fc layer with a regression head
    model_ft = torchvision.models.resnet18(pretrained=True)
    set_parameter_requires_grad(model_ft, feature_extract)
    num_ftrs = model_ft.fc.in_features
    model_ft.fc = nn.Sequential(nn.Linear(num_ftrs, out_feature))
    return model_ft
```
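As a quick sanity check (a small sketch of my own, assuming the imports and functions above), listing the trainable parameters should show only the new regression head, confirming that the backbone is frozen:

```python
model = resnet_model(out_feature=8)   # 8 outputs, e.g. 4 keypoints with (x, y) each
trainable = [name for name, p in model.named_parameters() if p.requires_grad]
print(trainable)                      # expected: ['fc.0.weight', 'fc.0.bias']
```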
Model saving during training
- The model's training function, which includes model saving, makes up the overall training flow
```python
from torch.optim.lr_scheduler import CosineAnnealingLR


def train_model(model, trainLoader, vaildLoader, params):
    train_loss, vaild_loss = [], []
    loss_func = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=params.lr)
    scheduler = CosineAnnealingLR(optimizer, T_max=10)
    for i in range(params.epochs):
        valid_losses = []
        train_losses = []
        # Training loop
        for ibatch, (X, y) in enumerate(trainLoader):
            model.train()
            optimizer.zero_grad()
            out = model(X)
            loss = loss_func(y, out)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())
            train_loss.append(loss.item())
        # Validation loop
        for iv, (X, y) in enumerate(vaildLoader):
            model.eval()
            out = model(X)
            v_loss = loss_func(y, out)
            valid_losses.append(v_loss.item())
            vaild_loss.append(v_loss.item())
        scheduler.step()  # advance the cosine annealing learning-rate schedule once per epoch
        if i % 2 == 0:
            print("train loss: {}, vaild loss: {}".format(
                np.mean(train_losses), np.mean(valid_losses)))
            # Save a checkpoint every other epoch
            filename = params.save_path.format(epoch=i)
            save_torch(model, filename)
            print('epoch {} model saved'.format(i))
```
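As a rough usage sketch (not from the original post), the trainer can be driven with tensors wrapped in a DataLoader. The `save_torch` helper and the Params-style object are assumed to be defined as in the complete listing at the end; the tensor shapes and the reuse of one loader for validation are placeholders:

```python
from types import SimpleNamespace
import torch
from torch.utils.data import DataLoader, TensorDataset

# Placeholder data: 64 grayscale 96x96 images repeated to 3 channels, 8 regression targets each
X = torch.randn(64, 3, 96, 96)
y = torch.randn(64, 8)
loader = DataLoader(TensorDataset(X, y), batch_size=32, shuffle=True)

params = SimpleNamespace(lr=1e-3, epochs=2,
                         save_path='./model/model_epoch_{epoch}.mdl')
model = resnet_model(8)                      # frozen backbone + trainable regression head
train_model(model, loader, loader, params)   # same loader reused as a stand-in valid set
```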
Pseudo-label training
- The pseudo-label learning process is as follows:
- Steps:
  1. Split the original data into a train set and a valid set and train model1
  2. Use model1 to predict pseudo labels for the unlabeled test set
  3. Randomly sample the train set in proportion and merge it with the pseudo-labeled test set to form a new train set
  4. Train a model on the new train set to obtain model2
  5. Use model2 to predict the test set to obtain the final result
- The code for these helpers is as follows (a short end-to-end sketch follows it):
```python
def get_torh_modl(modelOri, filename):
    ### Load saved model weights ###
    modelOri.load_state_dict(torch.load(filename))
    return modelOri


def pseudo_label_creat(model1, model1_fn, test_input):
    ### Predict pseudo labels for the test data ###
    model1 = get_torh_modl(model1, model1_fn)
    pseudo_label = model1(test_input)
    return test_input.detach(), pseudo_label.detach()


def pseudo_real_concat(Xtrain, ytrain, Xtest, ytest, rate):
    ### Merge real training data with pseudo-labeled data ###
    # Randomly keep a proportion `rate` of the real training samples
    index = random.sample(range(0, len(ytrain)), int(len(ytrain) * rate))
    Xtrain = Xtrain[index, ...]
    ytrain = ytrain[index, ...]
    X = torch.cat((Xtrain, Xtest), dim=0)
    y = torch.cat((ytrain, ytest), dim=0)
    return X, y
```
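Tying the helpers together, here is a minimal sketch of one pseudo-label round (my own illustration, assuming `Xtest`, `Xinput`, `youtput`, `para` and a first-round checkpoint exist as in the complete listing below):

```python
# Step 2: a trained model1 predicts pseudo labels for the unlabeled test images
model1 = resnet_model(8)
Xtest_pseudo, ytest_pseudo = pseudo_label_creat(model1, './model/kflod/kfold_0.mdl', Xtest)

# Step 3: keep 70% of the real training data and merge it with the pseudo-labeled test data
X_new, y_new = pseudo_real_concat(Xinput, youtput, Xtest_pseudo, ytest_pseudo, rate=0.7)

# Step 4: train model2 on the merged set (k_fold_train is defined in the complete listing)
model2 = resnet_model(8)
k_fold_train(X_new, y_new, model2, para)
```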
Knowledge distillation
- Knowledge distillation for regression problems
- The main idea of knowledge distillation is to use what a complex (teacher) model has learned about the data to improve the results of a simple (student) model
- The main approach is to modify the training loss function
- It was first applied to classification problems; the article Knowledge Distillation for Regression with Teacher Bounds later applied it to regression problems
- For more about distillation, see the following blogs:
- https://blog.csdn.net/nature553863/article/details/80568658
- https://blog.csdn.net/bryant_meng/article/details/104703438
- In use, the loss function is designed with the following formula:

$$
L = L_{sL_1}(R_s, y_{reg}) + v \cdot \frac{1}{N}\sum_{i=1}^{N} L_b\left(R_s^{(i)}, R_t^{(i)}, y_{reg}^{(i)}\right)
$$

$$
L_b(R_s, R_t, y_{reg}) =
\begin{cases}
\left\| R_s - y_{reg} \right\|_2^2, & \text{if } \left\| R_s - y_{reg} \right\|_2^2 > \left\| R_t - y_{reg} \right\|_2^2 + m \\
0, & \text{otherwise}
\end{cases}
$$

- where $m$ is the margin, set to 1, and $v$ is the weight, set to 0.5
- $y_{reg}$ is the true value
- $R_t$ and $R_s$ are the regression outputs learned by the teacher model and the student model
- $L$ is the loss function; the smooth L1 loss is chosen for the first term here
- The code is as follows:
```python
def loss_fn_reg_kd(outputs, labels, teacher_output):
    # Teacher-bounded regression term: a sample contributes its MSE only when the
    # student's error exceeds the teacher's error by more than the margin m = 1
    L_teacher_student = torch.zeros(1, 1)
    for i, _ in enumerate(labels):
        L_t_s = (nn.MSELoss()(outputs[i], labels[i])
                 if nn.MSELoss()(outputs[i], labels[i]) > nn.MSELoss()(teacher_output[i], labels[i]) + 1
                 else 0)
        L_teacher_student += L_t_s
    # Smooth L1 loss against the true labels plus the weighted (v = 0.5) bounded term
    loss = nn.SmoothL1Loss()(outputs, labels) + 0.5 * L_teacher_student / len(labels)
    return loss


def distill_model_train(teacher_model, student_model, trainLoader, vaildLoader, params):
    '''
    Knowledge distillation model training
    '''
    train_loss, vaild_loss = [], []
    loss_func = nn.MSELoss()
    optimizer = torch.optim.Adam(student_model.parameters(), lr=params.lr)
    scheduler = CosineAnnealingLR(optimizer, T_max=10)
    for i in range(params.epochs):
        valid_losses = []
        train_losses = []
        for ibatch, (X, y) in enumerate(trainLoader):
            student_model.train()
            optimizer.zero_grad()
            t_out = teacher_model(X)       # teacher predictions are only used inside the loss
            s_out = student_model(X)
            loss = loss_fn_reg_kd(s_out, y, t_out)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())
            train_loss.append(loss.item())
        for iv, (X, y) in enumerate(vaildLoader):
            student_model.eval()
            out = student_model(X)
            v_loss = loss_func(y, out)
            valid_losses.append(v_loss.item())
            vaild_loss.append(v_loss.item())
        scheduler.step()
        if i % 2 == 0:
            print("train loss: {}, vaild loss: {}".format(
                np.mean(train_losses), np.mean(valid_losses)))
            filename = params.save_path.format(epoch=i)
            save_torch(student_model, filename)
            print('epoch {} model saved'.format(i))
```
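A tiny hand-checkable example (my own sketch, assuming `loss_fn_reg_kd` as defined above): with $m = 1$, only the first sample violates the teacher bound, so the extra term contributes its squared error:

```python
import torch

labels  = torch.tensor([[0.0], [0.0]])
student = torch.tensor([[2.0], [0.5]])   # per-sample squared errors: 4.0 and 0.25
teacher = torch.tensor([[1.0], [0.1]])   # per-sample squared errors: 1.0 and 0.01

# Sample 0: 4.0 > 1.0 + 1 -> bounded term adds 4.0; sample 1: 0.25 <= 0.01 + 1 -> adds 0
# Smooth L1 part: (1.5 + 0.125) / 2 = 0.8125, total = 0.8125 + 0.5 * 4.0 / 2 = 1.8125
print(loss_fn_reg_kd(student, labels, teacher))   # tensor([[1.8125]])
```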
The complete code is as follows:
```python
import os
import random

import torch
import torch.nn as nn
import torchvision
import torch.nn.functional as F
import pandas as pd
import numpy as np
import cv2
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.optim.lr_scheduler import CosineAnnealingLR


class MLP(nn.Module):
    '''Simple convolutional student network used for distillation'''
    def __init__(self, output_dims):
        super(MLP, self).__init__()
        #self.input_dims = input_dims
        self.output_dims = output_dims
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=2)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=2)
        self.conv3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=2)
        self.conv4 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=2)  # defined but not used in forward
        self.fc1 = nn.Linear(7744, 1600)
        self.fc2 = nn.Linear(1600, 800)
        self.fc3 = nn.Linear(800, 100)
        self.fc4 = nn.Linear(100, self.output_dims)

    def forward(self, X):
        X = F.relu(self.conv1(X))
        X = F.relu(self.conv2(X))
        X = F.relu(self.conv3(X))
        X = X.reshape(X.shape[0], -1)
        X = F.relu(self.fc1(X))
        X = F.relu(self.fc2(X))
        X = F.relu(self.fc3(X))
        out = self.fc4(X)
        return out


def set_parameter_requires_grad(model, feature_extracting):
    # Freeze every parameter so the pretrained backbone only extracts features
    if feature_extracting:
        for param in model.parameters():
            param.requires_grad = False


def resnet_model(out_feature, feature_extract=True):
    # Load pretrained resnet18, freeze it, and replace the fc layer with a regression head
    model_ft = torchvision.models.resnet18(pretrained=True)
    set_parameter_requires_grad(model_ft, feature_extract)
    num_ftrs = model_ft.fc.in_features
    model_ft.fc = nn.Sequential(nn.Linear(num_ftrs, out_feature))
    return model_ft


def alexnet_model(out_feature, feature_extract=True):
    # Placeholder, not used below
    model_ft = torchvision.models.resnet18(pretrained=True)
    set_parameter_requires_grad(model_ft, feature_extract)


class Params:
    pass


para = Params()
para.lr = 1e-3
para.epochs = 10
para.batch_size = 32
para.save_path = './model/model_epoch_{epoch}.mdl'


def in_out_creat(inputData, outputData):
    # Wrap input/output tensors in a shuffled DataLoader
    return DataLoader(TensorDataset(inputData, outputData),
                      batch_size=para.batch_size, shuffle=True)


def save_torch(model, filename):
    '''
    Save model
    '''
    makedir(filename)
    torch.save(model.state_dict(), filename)


def makedir(fileName):
    '''
    Create the folder for a file
    '''
    path = os.path.dirname(fileName)
    if not os.path.exists(path):
        os.makedirs(path)


def train_model(model, trainLoader, vaildLoader, params):
    train_loss, vaild_loss = [], []
    loss_func = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=params.lr)
    scheduler = CosineAnnealingLR(optimizer, T_max=10)
    for i in range(params.epochs):
        valid_losses = []
        train_losses = []
        for ibatch, (X, y) in enumerate(trainLoader):
            model.train()
            optimizer.zero_grad()
            out = model(X)
            loss = loss_func(y, out)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())
            train_loss.append(loss.item())
        for iv, (X, y) in enumerate(vaildLoader):
            model.eval()
            out = model(X)
            v_loss = loss_func(y, out)
            valid_losses.append(v_loss.item())
            vaild_loss.append(v_loss.item())
        scheduler.step()  # advance the cosine annealing learning-rate schedule once per epoch
        if i % 2 == 0:
            print("train loss: {}, vaild loss: {}".format(
                np.mean(train_losses), np.mean(valid_losses)))
            filename = params.save_path.format(epoch=i)
            save_torch(model, filename)
            print('epoch {} model saved'.format(i))


def k_fold_train(X, y, model, params):
    '''K-fold cross-validation training'''
    skf = KFold(n_splits=5, random_state=233, shuffle=True)
    for ifold, (train_ind, valid_ind) in enumerate(skf.split(X, y)):
        X_train, y_train = X[train_ind], y[train_ind]
        X_valid, y_valid = X[valid_ind], y[valid_ind]
        train_loader = in_out_creat(X_train, y_train)
        valid_loader = in_out_creat(X_valid, y_valid)
        train_model(model, train_loader, valid_loader, params)
        filename = './model/kflod/kfold_{}.mdl'.format(ifold)
        save_torch(model, filename)


'''
Pseudo-label learning
Steps:
    1. Split the original data into a train set and a valid set and train model1
    2. Use model1 to predict pseudo labels for the unlabeled test set
    3. Randomly sample the train set in proportion and merge it with the pseudo-labeled test set into a new train set
    4. Train on the new train set to obtain model2
    5. Use model2 to predict the test set and obtain the final result
'''


def get_torh_modl(modelOri, filename):
    # Load saved model weights
    modelOri.load_state_dict(torch.load(filename))
    return modelOri


def pseudo_label_creat(model1, model1_fn, test_input):
    '''
    Pseudo label generation
    '''
    model1 = get_torh_modl(model1, model1_fn)
    pseudo_label = model1(test_input)
    return test_input.detach(), pseudo_label.detach()


def pseudo_real_concat(Xtrain, ytrain, Xtest, ytest, rate):
    '''
    Merge training data and pseudo-labeled data
    '''
    index = random.sample(range(0, len(ytrain)), int(len(ytrain) * rate))
    Xtrain = Xtrain[index, ...]
    ytrain = ytrain[index, ...]
    X = torch.cat((Xtrain, Xtest), dim=0)
    y = torch.cat((ytrain, ytest), dim=0)
    return X, y


'''
Knowledge distillation for regression problems
The main idea is to use what the complex (teacher) model has learned about the data
to improve the results of the simple (student) model.
The main approach is to modify the training loss function.
It was first applied to classification problems, and then the article
"Knowledge Distillation for Regression with Teacher Bounds" applied it to regression problems.
'''


def loss_fn_reg_kd(outputs, labels, teacher_output):
    # Teacher-bounded regression term: a sample contributes its MSE only when the
    # student's error exceeds the teacher's error by more than the margin m = 1
    L_teacher_student = torch.zeros(1, 1)
    for i, _ in enumerate(labels):
        L_t_s = (nn.MSELoss()(outputs[i], labels[i])
                 if nn.MSELoss()(outputs[i], labels[i]) > nn.MSELoss()(teacher_output[i], labels[i]) + 1
                 else 0)
        L_teacher_student += L_t_s
    loss = nn.SmoothL1Loss()(outputs, labels) + 0.5 * L_teacher_student / len(labels)
    return loss


def distill_model_train(teacher_model, student_model, trainLoader, vaildLoader, params):
    '''
    Knowledge distillation model training
    '''
    train_loss, vaild_loss = [], []
    loss_func = nn.MSELoss()
    optimizer = torch.optim.Adam(student_model.parameters(), lr=params.lr)
    scheduler = CosineAnnealingLR(optimizer, T_max=10)
    for i in range(params.epochs):
        valid_losses = []
        train_losses = []
        for ibatch, (X, y) in enumerate(trainLoader):
            student_model.train()
            optimizer.zero_grad()
            t_out = teacher_model(X)
            s_out = student_model(X)
            loss = loss_fn_reg_kd(s_out, y, t_out)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())
            train_loss.append(loss.item())
        for iv, (X, y) in enumerate(vaildLoader):
            student_model.eval()
            out = student_model(X)
            v_loss = loss_func(y, out)
            valid_losses.append(v_loss.item())
            vaild_loss.append(v_loss.item())
        scheduler.step()
        if i % 2 == 0:
            print("train loss: {}, vaild loss: {}".format(
                np.mean(train_losses), np.mean(valid_losses)))
            filename = params.save_path.format(epoch=i)
            save_torch(student_model, filename)
            print('epoch {} model saved'.format(i))


if __name__ == '__main__':
    train_df = pd.read_csv('./Face key point detection challenge_data set/train.csv')
    train_img = np.load('./Face key point detection challenge_data set/train.npy/train.npy')
    test_img = np.load('./Face key point detection challenge_data set/test.npy/test.npy')
    print(train_df.head())
    print(train_img.shape)
    #print(train_df.isnull().sum())

    '''Data reading and preprocessing'''
    train_df.fillna(method='ffill', inplace=True)

    para = Params()
    para.lr = 1e-3
    para.epochs = 10
    para.batch_size = 32
    para.save_path = './model/model_epoch_{epoch}.mdl'

    model = resnet_model(8)
    Xinput = train_img.transpose(2, 0, 1)
    youtput = train_df.values.astype(np.float32)
    Xinput = torch.FloatTensor(Xinput).unsqueeze(1).repeat(1, 3, 1, 1)
    youtput = torch.FloatTensor(youtput)
    #k_fold_train(Xinput, youtput, model, para)

    '''Pseudo-label learning'''
    Xtest = test_img.transpose(2, 0, 1)  # pseudo labels are generated for the unlabeled test images
    Xtest = torch.FloatTensor(Xtest).unsqueeze(1).repeat(1, 3, 1, 1)
    model1 = model
    model1_fn = './model/kflod/kfold_{}.mdl'.format(0)
    Xtest, ytest = pseudo_label_creat(model1, model1_fn, Xtest)
    X, y = pseudo_real_concat(Xinput, youtput, Xtest, ytest, rate=0.7)
    model = resnet_model(8)
    k_fold_train(X, y, model, para)  # train model2 on the merged real + pseudo-labeled data

    '''Distillation learning'''
    Xtrain, Xvalid, ytrain, yvalid = train_test_split(
        train_img.transpose(2, 0, 1),
        train_df.values.astype(np.float32), test_size=0.1)
    Xtrain = torch.FloatTensor(Xtrain).unsqueeze(1).repeat(1, 3, 1, 1)
    ytrain = torch.FloatTensor(ytrain)
    Xvalid = torch.FloatTensor(Xvalid).unsqueeze(1).repeat(1, 3, 1, 1)
    yvalid = torch.FloatTensor(yvalid)
    trainLoader = in_out_creat(Xtrain, ytrain)
    validLoader = in_out_creat(Xvalid, yvalid)

    model = resnet_model(8)
    model1_fn = './model/kflod/kfold_{}.mdl'.format(0)
    teacher_model = get_torh_modl(model, model1_fn)
    student_model = MLP(8)
    para.save_path = './model/students/model_epoch_{epoch}.mdl'
    distill_model_train(teacher_model, student_model, trainLoader, validLoader, para)
```