#Load packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
path = 'LogiReg_data.txt'
#header=None: do not treat the first row of the file as column names
#names sets the column names
pdData = pd.read_csv(path, header=None, sep=',', names=['Exam 1', 'Exam 2', 'Admitted'])
pdData.head()   #head() returns the first five rows by default
pdData.shape
#Data visualization
positive = pdData[pdData['Admitted'] == 1]
negative = pdData[pdData['Admitted'] == 0]

#plt.subplots() returns a tuple containing the figure and axes objects,
#so fig, ax = plt.subplots() unpacks it into the two variables fig and ax.
fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(positive['Exam 1'], positive['Exam 2'], s=30, c='b', marker='o', label='Admitted')
ax.scatter(negative['Exam 1'], negative['Exam 2'], s=30, c='r', marker='x', label='Not Admitted')
ax.legend()
ax.set_xlabel('Exam 1 Score')
ax.set_ylabel('Exam 2 Score')
Logistic regression
Objective: build a classifier, i.e. solve for the three parameters θ0, θ1, θ2.
Then set a threshold and decide the admission result according to it.
Modules to be completed (the underlying formulas are sketched after this list):
sigmoid: maps values to probabilities
model: returns the predicted value
cost: computes the loss for the given parameters
gradient: computes the gradient for each parameter
descent: updates the parameters
accuracy: computes the classification accuracy
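For reference, these functions implement the standard logistic-regression formulas (plain notation; m is the number of samples):

g(z) = 1 / (1 + e^(-z))
h_θ(x) = g(θ^T x)
J(θ) = -(1/m) Σ_i [ y_i log h_θ(x_i) + (1 - y_i) log(1 - h_θ(x_i)) ]
∂J/∂θ_j = (1/m) Σ_i (h_θ(x_i) - y_i) x_ij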
#sigmoid: maps values to probabilities
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

#Create a vector of 20 values, from -10 to 9 with step 1
nums = np.arange(-10, 10, 1)
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(nums, sigmoid(nums), 'r')
① np.arange()
The function returns evenly spaced values over a half-open interval [start, stop); e.g. np.arange(1, 6, 1) gives [1, 2, 3, 4, 5]: start 1, stop 6 (excluded), step 1.
np.arange() can be called with one, two, or three parameters:
1) With one parameter, the value is the stop point; the start defaults to 0 and the step to 1.
2) With two parameters, the first is the start and the second is the stop; the step defaults to 1.
3) With three parameters, they are start, stop, and step; the step may be a decimal.
#One parameter: start defaults to 0, step to 1. Output: [0 1 2]
a = np.arange(3)
#Two parameters: step defaults to 1. Output: [3 4 5 6 7 8]
a = np.arange(3, 9)
#Three parameters: start 0, stop 3, step 0.1. Output: [0.  0.1 0.2 ... 2.8 2.9]
a = np.arange(0, 3, 0.1)
#model: returns the predicted value (the model / hypothesis function)
def model(X, theta):
    return sigmoid(np.dot(X, theta.T))
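A quick sanity check on dummy data (a sketch; the dummy names here are made up, and the shapes follow the convention used below: X is m x 3, theta is 1 x 3):

dummy_X = np.random.rand(5, 3)              #5 samples, 3 columns (bias + two exam scores)
dummy_theta = np.zeros([1, 3])
print(model(dummy_X, dummy_theta).shape)    #(5, 1); with theta = 0 every probability is 0.5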
② DataFrame.insert(loc, column, value, allow_duplicates=False): inserts a column into the DataFrame at the specified position. Parameters:
loc: int, the position of the new column; to insert as the first column, use loc=0
column: the name of the inserted column, e.g. column='new column'
value: a scalar, array, Series, etc. (try it yourself)
allow_duplicates: whether duplicate column names are allowed; True lets the new column name duplicate an existing one.
③ The shape attribute returns a tuple:
hg.shape returns (number of rows, number of columns) of hg
hg.shape[0] returns the number of rows of hg
hg.shape[1] returns the number of columns of hg
A toy example of ② and ③ follows below.
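A minimal sketch of both helpers on a throwaway frame (the frame and its values are made up for illustration):

toy = pd.DataFrame({'Exam 1': [34.6, 30.3], 'Exam 2': [78.0, 43.9]})
toy.insert(0, 'ones', 1)              #insert a column of 1s as the first column
print(toy.shape)                      #(2, 3): 2 rows, 3 columns
print(toy.shape[0], toy.shape[1])     #2 3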
pdData.insert(0, 'ones', 1)    #Insert a first column of all 1s (loc=0 means the first column)
orig_data = pdData.values      #Convert to a numpy array (.as_matrix() was removed in newer pandas)
cols = orig_data.shape[1]
X = orig_data[:, 0:cols-1]     #Extract the features
y = orig_data[:, cols-1:cols]  #Extract the label
theta = np.zeros([1, 3])       #Initialize the three parameters
#cost: the loss function
def cost(X, y, theta):
    left = np.multiply(-y, np.log(model(X, theta)))
    right = np.multiply(1 - y, np.log(1 - model(X, theta)))
    return np.sum(left - right) / (len(X))   #Objective derived from the likelihood function (average negative log-likelihood)
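The descent routine below calls a gradient function that is not shown in this excerpt; a minimal sketch consistent with the cost function above and with the (1, 3) theta layout used here:

#gradient: computes the gradient of the cost with respect to each parameter
def gradient(X, y, theta):
    grad = np.zeros(theta.shape)
    error = (model(X, theta) - y).ravel()
    for j in range(len(theta.ravel())):
        term = np.multiply(error, X[:, j])
        grad[0, j] = np.sum(term) / len(X)
    return grad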
#descent: compare three different stopping strategies for gradient descent
STOP_ITER = 0   #Stop after a fixed number of iterations
STOP_COST = 1   #Stop when the change in the loss value is small enough
STOP_GRAD = 2   #Stop when the gradient norm is small enough

def stopCriterion(type, value, threshold):
    #Three different stopping strategies
    if type == STOP_ITER:
        return value > threshold                    #threshold is the iteration count
    elif type == STOP_COST:
        return abs(value[-1] - value[-2]) < threshold
    elif type == STOP_GRAD:
        return np.linalg.norm(value) < threshold    #L2 norm (the default) of the gradient
④ shuffle() is not called directly: import the numpy.random module and call it through np.random. It operates on the original array in place, changing its order, and returns nothing (it randomly permutes all elements of x; if x is not array-like, an error is raised).
import numpy.random   #already available as np.random via the numpy import above

#Shuffle the cards (randomly reorder the samples)
def shuffleData(data):
    np.random.shuffle(data)
    cols = data.shape[1]
    X = data[:, 0:cols-1]
    y = data[:, cols-1:]
    return X, y
#descent: gradient-descent solver
import time

def descent(data, theta, batchSize, stopType, thresh, alpha):
    init_time = time.time()
    i = 0                                #Iteration counter
    k = 0                                #Batch index
    X, y = shuffleData(data)
    grad = np.zeros(theta.shape)         #Gradient
    costs = [cost(X, y, theta)]          #Loss history

    while True:
        grad = gradient(X[k:k+batchSize], y[k:k+batchSize], theta)
        k += batchSize                   #Move to the next batch
        if k >= n:                       #n: total number of samples, set globally before calling (n = 100 below)
            k = 0
            X, y = shuffleData(data)     #Reshuffle
        theta = theta - alpha * grad     #Parameter update
        costs.append(cost(X, y, theta))  #Compute the new loss
        i += 1

        if stopType == STOP_ITER:
            value = i
        elif stopType == STOP_COST:
            value = costs
        elif stopType == STOP_GRAD:
            value = grad
        if stopCriterion(stopType, value, thresh):
            break

    return theta, i-1, costs, grad, time.time() - init_time
#Plotting helper: run an experiment and plot the cost curve
def runExpe(data, theta, batchSize, stopType, thresh, alpha):
    #import pdb; pdb.set_trace();
    theta, iter, costs, grad, dur = descent(data, theta, batchSize, stopType, thresh, alpha)
    name = "Original" if (data[:,1]>2).sum() > 1 else "Scaled"
    name += " data - learning rate: {} - ".format(alpha)
    if batchSize == n:
        strDescType = "Gradient"
    elif batchSize == 1:
        strDescType = "Stochastic"
    else:
        strDescType = "Mini-batch ({})".format(batchSize)
    name += strDescType + " descent - Stop: "
    if stopType == STOP_ITER:
        strStop = "{} iterations".format(thresh)
    elif stopType == STOP_COST:
        strStop = "costs change < {}".format(thresh)
    else:
        strStop = "gradient norm < {}".format(thresh)
    name += strStop
    print("***{}\nTheta: {} - Iter: {} - Last cost: {:03.2f} - Duration: {:03.2f}s".format(
        name, theta, iter, costs[-1], dur))
    fig, ax = plt.subplots(figsize=(12,4))
    ax.plot(np.arange(len(costs)), costs, 'r')
    ax.set_xlabel('Iterations')
    ax.set_ylabel('Cost')
    ax.set_title(name.upper() + ' - Error vs. Iteration')
    return theta
(1) Batch gradient descent method based on all samples:
#Stop according to the number of iterations
n = 100   #Number of training samples; also the batch size for full-batch descent
runExpe(orig_data, theta, n, STOP_ITER, thresh=5000, alpha=0.000001)
#Stop according to the change in the loss value, threshold 1e-6; this needs roughly 110,000 iterations
runExpe(orig_data, theta, n, STOP_COST, thresh=0.000001, alpha=0.001)
#Stop according to the gradient norm, threshold 0.05; this needs roughly 40,000 iterations
runExpe(orig_data, theta, n, STOP_GRAD, thresh=0.05, alpha=0.001)
(2) Stochastic gradient descent
#Stop strategy based on the number of iterations
runExpe(orig_data, theta, 1, STOP_ITER, thresh=5000, alpha=0.001)
The cost curve is very unstable; let's try reducing the learning rate.
#Very unstable -- lower the learning rate
runExpe(orig_data, theta, 1, STOP_ITER, thresh=15000, alpha=0.000002)
Conclusion: stochastic gradient descent is fast, but it is unstable and needs a small learning rate.
(3) Mini-batch gradient descent
#Stop strategy based on the number of iterations
runExpe(orig_data, theta, 16, STOP_ITER, thresh=15000, alpha=0.001)
The fluctuation is still large. Let's try standardizing the data: for each attribute (column), subtract the column mean and divide by the column standard deviation, so that each column ends up centered around 0 with variance 1 (sketched below).
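As a minimal sketch of what this means (the sklearn pp.scale call below does the same thing; the variable names here are made up):

col_means = orig_data[:, 1:3].mean(axis=0)
col_stds = orig_data[:, 1:3].std(axis=0)
manually_scaled = (orig_data[:, 1:3] - col_means) / col_stds   #each column: mean 0, variance 1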
(4) Results of data preprocessing
① Batch gradient descent method based on all samples
#Stop strategy based on the number of iterations
from sklearn import preprocessing as pp

scaled_data = orig_data.copy()
scaled_data[:, 1:3] = pp.scale(orig_data[:, 1:3])
runExpe(scaled_data, theta, n, STOP_ITER, thresh=5000, alpha=0.001)
The cost drops to about 0.38; the smaller the cost, the better.
#Stop strategy based on the gradient norm
runExpe(scaled_data, theta, n, STOP_GRAD, thresh=0.02, alpha=0.001)
② Stochastic gradient descent
#Stochastic -- stop strategy based on the gradient norm
theta = runExpe(scaled_data, theta, 1, STOP_GRAD, thresh=0.002/5, alpha=0.001)
Although stochastic gradient descent is fast, it needs many more iterations, so mini-batch gradient descent is a good compromise.
③ Mini-batch gradient descent
#Stop strategy based on the gradient norm -- mini-batch
runExpe(scaled_data, theta, 16, STOP_GRAD, thresh=0.002*2, alpha=0.001)
Let's look at the accuracy of the model:
#accuracy: apply a 0.5 threshold to the predicted probabilities
def predict(X, theta):
    return [1 if x >= 0.5 else 0 for x in model(X, theta)]

scaled_X = scaled_data[:, :3]
y = scaled_data[:, 3]
predictions = predict(scaled_X, theta)
correct = [1 if ((a == 1 and b == 1) or (a == 0 and b == 0)) else 0 for (a, b) in zip(predictions, y)]
accuracy = sum(correct) * 100 // len(correct)   #percentage of correct predictions
print('accuracy = {0}%'.format(accuracy))
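As a cross-check (a sketch, assuming scikit-learn is available, as it was already imported above for scaling), sklearn.metrics.accuracy_score should report the same figure:

from sklearn.metrics import accuracy_score
print('accuracy = {:.0f}%'.format(accuracy_score(y, predictions) * 100))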