Study notes: a Boston house price prediction model with GridSearchCV

# coding: utf-8

import numpy as np
import pandas as pd

#Read the data and separate the target (MEDV) from the features

data = pd.read_csv('housing.csv')
prices = data['MEDV']
features = data.drop('MEDV', axis = 1)
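
#A quick sanity check (not in the original walkthrough): preview the data to
#confirm it contains the columns used later (RM, LSTAT, PTRATIO, MEDV).

print(data.head())
print(data.shape)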

#Examine the basic characteristics of the data
#Goal: calculate the minimum price

minimum_price = np.min(prices)

#Goal: calculate the maximum price

maximum_price = np.max(prices)

#Goal: calculate the mean price

mean_price = np.mean(prices)

#Goal: calculate the median price

median_price = np.median(prices)

#Goal: calculate the standard deviation of prices

std_price = np.std(prices)

#Goal: output calculated results

print("Statistics for Boston housing dataset:\n")
print("Minimum price: ${:,.2f}".format(minimum_price))
print("Maximum price: ${:,.2f}".format(maximum_price))
print("Mean price: ${:,.2f}".format(mean_price))
print("Median price ${:,.2f}".format(median_price))
print("Standard deviation of prices: ${:,.2f}".format(std_price))

#Visualize the relationship between each feature and the label with scatter plots

import matplotlib.pyplot as plt      
rm = data['RM']
medv = data['MEDV']
plt.scatter(rm, medv, c='b')
plt.show()
lstat = data['LSTAT']
plt.scatter(lstat, medv, c='c')
plt.show()
ptratio = data['PTRATIO']
plt.scatter(ptratio, medv, c='g')
plt.show()
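
#An alternative layout (illustrative sketch, not part of the original code):
#draw the three scatter plots side by side with labelled axes so the
#feature/price relationships can be compared in a single figure.

fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for ax, col, color in zip(axes, ['RM', 'LSTAT', 'PTRATIO'], ['b', 'c', 'g']):
    ax.scatter(data[col], medv, c=color, s=10)
    ax.set_xlabel(col)
    ax.set_ylabel('MEDV')
plt.tight_layout()
plt.show()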

#Define the scoring function for the model; the R2 (coefficient of determination) metric is used

from sklearn.metrics import r2_score
def performance_metric(y_true, y_predict):
    """Calculate and return the R2 score between the true and predicted values"""
    score = r2_score(y_true, y_predict)
    return score
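
#A quick usage check for performance_metric on hand-made values (illustrative
#only): R2 is 1.0 for a perfect prediction and lower for poorer fits.

example_true = [3.0, -0.5, 2.0, 7.0]
example_pred = [2.5, 0.0, 2.0, 8.0]
print("R2 on toy data: {:.3f}".format(performance_metric(example_true, example_pred)))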

#Build the prediction model and find the best decision tree via GridSearchCV

from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
def fit_model(X, y):
    """Use grid search on the input data [X, y] to find the optimal decision tree model"""
    cross_validator = KFold(n_splits=10, shuffle=False)
    regressor = DecisionTreeRegressor()
    params = {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}
    scoring_fnc = make_scorer(performance_metric)
    grid = GridSearchCV(estimator=regressor, param_grid=params, scoring=scoring_fnc, cv=cross_validator)
    # Run the grid search on the input data [X, y]
    grid = grid.fit(X, y)
    # Return the optimal model found by the grid search
    return grid.best_estimator_
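
#A sketch (not part of the original walkthrough) for inspecting how every
#candidate max_depth scored during the search: refit GridSearchCV directly
#and read its cv_results_ attribute. It reuses the imports above.

def report_grid_scores(X, y):
    """Print the mean cross-validated R2 for each candidate max_depth."""
    params = {'max_depth': list(range(1, 11))}
    grid = GridSearchCV(estimator=DecisionTreeRegressor(),
                        param_grid=params,
                        scoring=make_scorer(performance_metric),
                        cv=KFold(n_splits=10))
    grid.fit(X, y)
    for depth, score in zip(grid.cv_results_['param_max_depth'],
                            grid.cv_results_['mean_test_score']):
        print("max_depth={}: mean R2 = {:.3f}".format(depth, score))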

#Split the data into training and test sets using train_test_split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, prices, test_size=0.20, random_state=0)
print("Train test split success!")

#Fit the optimal model on the training data

optimal_reg = fit_model(X_train, y_train)

#Output the 'max_depth' parameter of the optimal model

print("Parameter 'max_depth' is {} for the optimal model.".format(optimal_reg.get_params()['max_depth'])) 

#Generate data for three clients and forecast the corresponding prices

client_data = [[5, 17, 15], # Client 1
               [4, 32, 22], # Client 2
               [8, 3, 12]]  # Client 3

#Forecast

predicted_price = optimal_reg.predict(client_data)
print(predicted_price)
for i, price in enumerate(predicted_price):
    print("Predicted selling price for Client {}'s home: ${:,.2f}".format(i+1, price))
