Python implementation
Step by step
- Split the data into two subsets (for continuous features, the left subtree receives the samples whose feature value is less than or equal to the split value, and the right subtree receives the samples whose feature value is greater than it)
```python
import numpy as np

# Obtain data subsets; the method is the same for classification and regression.
# The data set is split into two parts according to the chosen feature.
def split_dataset(data_x, data_y, fea_axis, fea_value):
    '''
    input: data_x(ndarray): feature values
           data_y(ndarray): label values
           fea_axis(int): index (column) of the feature to split on
           fea_value: value of that feature to split at
    output: data_x[equal_Idx], data_y[equal_Idx] (ndarray): samples and labels whose feature value is
            equal to (or, for continuous features, less than or equal to) the split value
            data_x[nequal_Idx], data_y[nequal_Idx] (ndarray): samples and labels whose feature value is
            not equal to (or greater than) the split value
    '''
    if isinstance(fea_value, int) or isinstance(fea_value, float):
        # Numeric (continuous) feature: discretize by thresholding at fea_value
        equal_Idx = np.where(data_x[:, fea_axis] <= fea_value)   # samples with feature value <= fea_value
        nequal_Idx = np.where(data_x[:, fea_axis] > fea_value)   # samples with feature value > fea_value
    else:
        # Discrete feature: split on equality
        equal_Idx = np.where(data_x[:, fea_axis] == fea_value)   # samples with feature value == fea_value
        nequal_Idx = np.where(data_x[:, fea_axis] != fea_value)  # samples with feature value != fea_value
    return data_x[equal_Idx], data_y[equal_Idx], data_x[nequal_Idx], data_y[nequal_Idx]
```
- Mean calculation for leaf nodes
```python
import numpy as np

# Mean of the labels in a leaf node
def reg_leaf(data_y):
    '''
    input: data_y(array): label values
    output: (float) mean value
    '''
    return np.mean(data_y)
```
- Calculate the total variance of the data set
```python
# Total variance of a data set (variance times the number of samples)
def reg_err(data_y):
    '''
    input: data_y(array): label values
    output: (float): total variance
    '''
    return np.var(data_y) * len(data_y)
```
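As a quick sanity check (a minimal sketch; the array values below are made up), multiplying the variance by the number of samples gives the sum of squared deviations from the mean, which is the impurity measure minimized when splitting:

```python
import numpy as np

y = np.array([1.0, 2.0, 3.0, 4.0])        # illustrative labels only
total_var = np.var(y) * len(y)            # what reg_err(y) returns
sse = np.sum((y - np.mean(y)) ** 2)       # sum of squared deviations from the mean
print(total_var, sse)                     # both are 5.0
```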
- Select the best split feature and its corresponding feature value
```python
def classify_get_best_fea(data_x, data_y, ops=(1, 4)):
    '''
    input: data_x(ndarray): feature values
           data_y(array): label values
           ops(tuple): the first number is the minimum error reduction required to keep splitting,
                       the second is the minimum number of samples allowed in a split subset
    output: best_fea_idx(int): index of the best split feature
            best_fea_val(float): feature value of the best split
    '''
    m, n = np.shape(data_x)
    final_s = ops[0]  # minimum error reduction to continue splitting
    final_n = ops[1]  # minimum number of samples in a split subset

    # If all samples share the same label, return a leaf node with that value
    if len(np.unique(data_y)) == 1:
        return None, reg_leaf(data_y)

    # Search for the best feature and feature value
    total_err = reg_err(data_y)  # total error before splitting
    best_err = np.inf
    best_fea_idx = 0
    best_fea_val = 0
    for i in range(n):
        feas = np.unique(data_x[:, i])
        for fea_val in feas:
            data_D1_x, data_D1_y, data_D2_x, data_D2_y = split_dataset(data_x, data_y, i, fea_val)
            # Skip splits that leave a subset smaller than the minimum size
            if data_D1_x.shape[0] < final_n or data_D2_x.shape[0] < final_n:
                continue
            con_err = reg_err(data_D1_y) + reg_err(data_D2_y)
            if con_err < best_err:
                best_err = con_err
                best_fea_idx = i
                best_fea_val = fea_val

    # Pre-pruning: if the best split reduces the error by less than final_s, stop and return a leaf
    if total_err - best_err < final_s:
        return None, reg_leaf(data_y)

    # If the best split still produces a subset that is too small, return a leaf
    data_D1_x, data_D1_y, data_D2_x, data_D2_y = split_dataset(data_x, data_y, best_fea_idx, best_fea_val)
    if data_D1_x.shape[0] < final_n or data_D2_x.shape[0] < final_n:
        return None, reg_leaf(data_y)

    return best_fea_idx, best_fea_val
```
- Generate the CART regression tree (recursively)
```python
def reg_create_tree(data_x, data_y, ops=(1, 4)):
    fea_idx, fea_val = classify_get_best_fea(data_x, data_y, ops)
    # No further split possible: return the leaf value
    if fea_idx is None:
        return fea_val
    # Recursively build the left (<= fea_val) and right (> fea_val) subtrees
    my_tree = {}
    my_tree['fea_idx'] = fea_idx
    my_tree['fea_val'] = fea_val
    data_D1_x, data_D1_y, data_D2_x, data_D2_y = split_dataset(data_x, data_y, fea_idx, fea_val)
    my_tree['left'] = reg_create_tree(data_D1_x, data_D1_y, ops)
    my_tree['right'] = reg_create_tree(data_D2_x, data_D2_y, ops)
    return my_tree
```
The returned dictionary has the form {'fea_idx': index of the best split feature at this node, 'fea_val': the corresponding split value, 'left': {...}, 'right': {...}}, where 'left' and 'right' are either another dictionary of the same form (a subtree) or a float (a leaf value).
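For illustration only (the numbers below are made up and not taken from either test set), a trained tree might look like this:

```python
example_tree = {
    'fea_idx': 5,       # split on column 5
    'fea_val': 6.94,    # samples with feature value <= 6.94 go to 'left', the rest to 'right'
    'left': 19.93,      # a float means a leaf: the mean label value of that subset
    'right': {          # a dict means another internal node of the same form
        'fea_idx': 12,
        'fea_val': 14.4,
        'left': 27.4,
        'right': 32.1,
    },
}
```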
- Prediction functions
```python
import numpy as np

# Predict the result for a single test sample
def classify(inputTree, testdata):
    '''
    input: inputTree(dict): trained CART regression tree
           testdata(ndarray): feature values of one test sample
    output: classLabel(float): predicted value for the test sample
    '''
    first_fea_idx = inputTree['fea_idx']  # index of the split feature at this node
    fea_val = inputTree['fea_val']        # split value of that feature
    classLabel = 0.0
    if testdata[first_fea_idx] > fea_val:
        # Enter the right subtree (feature value greater than the split value,
        # matching the split rule in split_dataset)
        if type(inputTree['right']).__name__ == 'dict':
            classLabel = classify(inputTree['right'], testdata)
        else:
            classLabel = inputTree['right']
    else:
        # Enter the left subtree (feature value less than or equal to the split value)
        if type(inputTree['left']).__name__ == 'dict':
            classLabel = classify(inputTree['left'], testdata)
        else:
            classLabel = inputTree['left']
    return round(classLabel, 2)

# Predict the results for all test samples
def classifytest(inputTree, testDataSet):
    '''
    input: inputTree(dict): trained decision tree
           testDataSet(ndarray): test data set
    output: (ndarray): predictions for the test set
    '''
    classLabelAll = []                  # list of predictions
    for testVec in testDataSet:         # traverse each test sample
        classLabelAll.append(classify(inputTree, testVec))
    return np.array(classLabelAll)
```
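A minimal end-to-end sketch on synthetic data (the data and the `demo_*` names are made up for illustration; it assumes the functions above are defined in the same module):

```python
import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.rand(100, 2) * 10                  # 100 samples, 2 continuous features
y_demo = 3.0 * X_demo[:, 0] + rng.randn(100)    # roughly linear target with noise

demo_tree = reg_create_tree(X_demo, y_demo, ops=(1, 4))   # larger ops values pre-prune more aggressively
print(classify(demo_tree, X_demo[0]))           # prediction for a single sample
print(classifytest(demo_tree, X_demo[:5]))      # predictions for several samples
```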
Source code (all)
```python
import numpy as np

# Obtain data subsets; the method is the same for classification and regression.
# The data set is split into two parts according to the chosen feature.
def split_dataset(data_x, data_y, fea_axis, fea_value):
    '''
    input: data_x(ndarray): feature values
           data_y(ndarray): label values
           fea_axis(int): index (column) of the feature to split on
           fea_value: value of that feature to split at
    output: data_x[equal_Idx], data_y[equal_Idx] (ndarray): samples and labels whose feature value is
            equal to (or, for continuous features, less than or equal to) the split value
            data_x[nequal_Idx], data_y[nequal_Idx] (ndarray): samples and labels whose feature value is
            not equal to (or greater than) the split value
    '''
    if isinstance(fea_value, int) or isinstance(fea_value, float):
        # Numeric (continuous) feature: discretize by thresholding at fea_value
        equal_Idx = np.where(data_x[:, fea_axis] <= fea_value)   # feature value <= fea_value
        nequal_Idx = np.where(data_x[:, fea_axis] > fea_value)   # feature value > fea_value
    else:
        # Discrete feature: split on equality
        equal_Idx = np.where(data_x[:, fea_axis] == fea_value)   # feature value == fea_value
        nequal_Idx = np.where(data_x[:, fea_axis] != fea_value)  # feature value != fea_value
    return data_x[equal_Idx], data_y[equal_Idx], data_x[nequal_Idx], data_y[nequal_Idx]

# Mean of the labels in a leaf node
def reg_leaf(data_y):
    '''
    input: data_y(array): label values
    output: (float) mean value
    '''
    return np.mean(data_y)

# Total variance of a data set (variance times the number of samples)
def reg_err(data_y):
    '''
    input: data_y(array): label values
    output: (float): total variance
    '''
    return np.var(data_y) * len(data_y)

# Select the best split feature and the corresponding feature value
def classify_get_best_fea(data_x, data_y, ops=(1, 4)):
    '''
    input: data_x(ndarray): feature values
           data_y(array): label values
           ops(tuple): the first number is the minimum error reduction required to keep splitting,
                       the second is the minimum number of samples allowed in a split subset
    output: best_fea_idx(int): index of the best split feature
            best_fea_val(float): feature value of the best split
    '''
    m, n = np.shape(data_x)
    final_s = ops[0]  # minimum error reduction to continue splitting
    final_n = ops[1]  # minimum number of samples in a split subset

    # If all samples share the same label, return a leaf node with that value
    if len(np.unique(data_y)) == 1:
        return None, reg_leaf(data_y)

    # Search for the best feature and feature value
    total_err = reg_err(data_y)  # total error before splitting
    best_err = np.inf
    best_fea_idx = 0
    best_fea_val = 0
    for i in range(n):
        feas = np.unique(data_x[:, i])
        for fea_val in feas:
            data_D1_x, data_D1_y, data_D2_x, data_D2_y = split_dataset(data_x, data_y, i, fea_val)
            # Skip splits that leave a subset smaller than the minimum size
            if data_D1_x.shape[0] < final_n or data_D2_x.shape[0] < final_n:
                continue
            con_err = reg_err(data_D1_y) + reg_err(data_D2_y)
            if con_err < best_err:
                best_err = con_err
                best_fea_idx = i
                best_fea_val = fea_val

    # Pre-pruning: if the best split reduces the error by less than final_s, stop and return a leaf
    if total_err - best_err < final_s:
        return None, reg_leaf(data_y)

    # If the best split still produces a subset that is too small, return a leaf
    data_D1_x, data_D1_y, data_D2_x, data_D2_y = split_dataset(data_x, data_y, best_fea_idx, best_fea_val)
    if data_D1_x.shape[0] < final_n or data_D2_x.shape[0] < final_n:
        return None, reg_leaf(data_y)

    return best_fea_idx, best_fea_val

# Build the CART regression tree recursively
def reg_create_tree(data_x, data_y, ops=(1, 4)):
    '''
    input: data_x(ndarray): feature values
           data_y(array): label values
           ops(tuple): the first number is the minimum error reduction required to keep splitting,
                       the second is the minimum number of samples allowed in a split subset
    output: my_tree(dict): the generated CART regression tree as a dictionary
    '''
    fea_idx, fea_val = classify_get_best_fea(data_x, data_y, ops)
    # No further split possible: return the leaf value
    if fea_idx is None:
        return fea_val
    # Recursively build the left (<= fea_val) and right (> fea_val) subtrees
    my_tree = {}
    my_tree['fea_idx'] = fea_idx
    my_tree['fea_val'] = fea_val
    data_D1_x, data_D1_y, data_D2_x, data_D2_y = split_dataset(data_x, data_y, fea_idx, fea_val)
    my_tree['left'] = reg_create_tree(data_D1_x, data_D1_y, ops)
    my_tree['right'] = reg_create_tree(data_D2_x, data_D2_y, ops)
    return my_tree

# Predict the result for a single test sample
def classify(inputTree, testdata):
    '''
    input: inputTree(dict): trained CART regression tree
           testdata(ndarray): feature values of one test sample
    output: classLabel(float): predicted value for the test sample
    '''
    first_fea_idx = inputTree['fea_idx']  # index of the split feature at this node
    fea_val = inputTree['fea_val']        # split value of that feature
    classLabel = 0.0
    if testdata[first_fea_idx] > fea_val:
        # Enter the right subtree (feature value greater than the split value)
        if type(inputTree['right']).__name__ == 'dict':
            classLabel = classify(inputTree['right'], testdata)
        else:
            classLabel = inputTree['right']
    else:
        # Enter the left subtree (feature value less than or equal to the split value)
        if type(inputTree['left']).__name__ == 'dict':
            classLabel = classify(inputTree['left'], testdata)
        else:
            classLabel = inputTree['left']
    return round(classLabel, 2)

# Predict the results for all test samples
def classifytest(inputTree, testDataSet):
    '''
    input: inputTree(dict): trained decision tree
           testDataSet(ndarray): test data set
    output: (ndarray): predictions for the test set
    '''
    classLabelAll = []                  # list of predictions
    for testVec in testDataSet:         # traverse each test sample
        classLabelAll.append(classify(inputTree, testVec))
    return np.array(classLabelAll)
```
Test set 1 (Boston house price data set)
There are thirteen attributes in total; they can be listed directly from the dataset, as shown below.
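A minimal sketch for printing the attribute names (note that `load_boston` was deprecated and removed in recent scikit-learn releases, so an older version is needed to run this):

```python
from sklearn.datasets import load_boston

boston = load_boston()
print(boston.feature_names)
# ['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO' 'B' 'LSTAT']
```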
Import dataset and train CART regression tree:
```python
# Boston house price data set
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

boston = load_boston()
data = boston.data
target = boston.target

X = data
y = target
# X = data[:200, :]   # optionally train on a subset
# y = target[:200]

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=666)

# Generate the CART regression tree
cartTree = reg_create_tree(x_train, y_train)
print(cartTree)
```
The CART tree obtained from training is shown in the following figure:
The predicted results are shown in the figure below:
```python
classlist = classifytest(cartTree, x_test)
print('Forecast data', classlist)
print('Real data', y_test)
print("The average error is:", abs(np.sum(classlist) - np.sum(y_test)) / len(y_test))
```
Test set 2 (diabetes dataset)
There are ten attributes in total, as shown in the following figure:
Import dataset and train CART regression tree:
```python
# Diabetes data set
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

diabetes = load_diabetes()
data = diabetes.data
target = diabetes.target

X = data[:500, :]   # the data set has 442 samples, so this keeps all of them
y = target[:500]

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)

# Generate the CART regression tree
cartTree = reg_create_tree(x_train, y_train)
print(cartTree)
```
The CART tree obtained from training is shown in the figure below (500 pieces of data):
The predicted results are shown in the figure below:
```python
classlist = classifytest(cartTree, x_test)
print('Forecast data', classlist)
print('Real data', y_test)
print("The average error is:", abs(np.sum(classlist) - np.sum(y_test)) / len(y_test))
```
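The "average error" printed above compares the sums of the predictions and the true labels. As an alternative sketch (assuming `classlist` and `y_test` from the scripts above), the per-sample mean absolute error and mean squared error mentioned in the summary could also be computed directly:

```python
# Illustrative alternative metrics
mae = np.mean(np.abs(classlist - y_test))   # mean absolute error
mse = np.mean((classlist - y_test) ** 2)    # mean squared error
print("MAE:", mae, "MSE:", mse)
```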
Summary
(1) The label values predicted by a CART regression tree are continuous. Discrete label values could also be handled for classification, but that is not needed here. For each feature, the values may be continuous or discrete, and different features may differ in order of magnitude, because each feature is considered independently when splitting. (Neural networks, by contrast, require normalization.)
(2) The average error (the sum of the errors averaged over the samples) and the mean squared error (the average of the squared errors) can be used to evaluate the accuracy of the model.
(3) The splitting criterion differs from the Gini index used by the CART classification tree: the CART regression tree chooses splits by minimizing the squared error (total variance) of the resulting subsets, as written out below.
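Written out, the split chosen by `classify_get_best_fea` minimizes the total squared error of the two resulting subsets (the standard CART regression criterion; the region notation below is introduced here for clarity):

```latex
\min_{j,\,s}\;\left[\;\sum_{x_i \in R_1(j,s)} \bigl(y_i - \bar{y}_{R_1}\bigr)^2
\;+\; \sum_{x_i \in R_2(j,s)} \bigl(y_i - \bar{y}_{R_2}\bigr)^2\;\right],
\qquad R_1(j,s) = \{x \mid x_j \le s\},\quad R_2(j,s) = \{x \mid x_j > s\}
```

Here \bar{y}_R is the mean label value in region R (what reg_leaf returns for that subset), and the bracketed quantity is exactly reg_err(data_D1_y) + reg_err(data_D2_y) in the code.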