# coding: utf-8

```python
import numpy as np
import pandas as pd
```

```data = pd.read_csv('housing.csv')
prices = data['MEDV']
features = data.drop('MEDV', axis = 1)
```

#Observation data characteristics
#Objective: to calculate the minimum value of value

```minimum_price = np.min(prices)
```

#Objective: calculate the maximum value of value

```maximum_price = np.max(prices)
```

#Objective: calculate the average value

```mean_price = np.mean(prices)
```

#Goal: calculate the median value

```median_price = np.median(prices)
```

#Objective: calculate the standard deviation of value

```std_price = np.std(prices)
```

# Goal: print the calculated statistics

```print("Statistics for Boston housing dataset:\n")
print("Minimum price: \${:,.2f}".format(minimum_price))
print("Maximum price: \${:,.2f}".format(maximum_price))
print("Mean price: \${:,.2f}".format(mean_price))
print("Median price \${:,.2f}".format(median_price))
print("Standard deviation of prices: \${:,.2f}".format(std_price))
```

# Visualize the relationship between each feature and the label (MEDV) with scatter plots

```import matplotlib.pyplot as plt
rm = data['RM']
medv = data['MEDV']
plt.scatter(rm, medv, c='b')
plt.show()
lstat = data['LSTAT']
plt.scatter(lstat, medv, c='c')
plt.show()
ptratio = data['PTRATIO']
plt.scatter(ptratio, medv, c='g')
plt.show()
```

# Choose the scoring metric for predictions: the R^2 score (coefficient of determination)

```from sklearn.metrics import r2_score
def performance_metric(y_true, y_predict):
"""Calculate and return the fraction of the predicted value compared to the predicted value"""
score = r2_score(y_true, y_predict, sample_weight=None, multioutput=None)
return score
```

# Build the prediction model: use GridSearchCV to find the best decision tree model

```from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
def fit_model(X, y):
""" Based on input data [X,y]，It is helpful for grid search to find the optimal decision tree model"""
cross_validator = KFold(n_splits=10, shuffle=False, random_state=None)
regressor = DecisionTreeRegressor()
params = {'max_depth':[1,2,3,4,5,6,7,8,9,10]}
scoring_fnc = make_scorer(performance_metric)
grid = GridSearchCV(estimator=regressor, param_grid=params, scoring=scoring_fnc, cv=cross_validator)
# Grid search based on input data [X,y]
```

grid = grid.fit(X, y)
return grid.best_estimator_

# Split the data set into training and test sets with train_test_split

```from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, prices, test_size=0.20, random_state=0)
print("Train test split success!")
```

# Based on the training data, the optimal model is obtained

```optimal_reg = fit_model(X_train, y_train)
```

# The 'max UU depth' parameter of the output optimal model

```print("Parameter 'max_depth' is {} for the optimal model.".format(optimal_reg.get_params()['max_depth']))
```

# Generate data of three customers and forecast the corresponding price

```client_data = [[5, 17, 15], # Customer 1
[4, 32, 22], # Customer 2
[8, 3, 12]]  # Customer 3
```

# #Forecast

```predicted_price = optimal_reg.predict(client_data)
print(predicted_price)
for i, price in enumerate(predicted_price):
print("Predicted selling price for Client {}'s home: \${:,.2f}".format(i+1, price))
```

Posted on Wed, 06 Nov 2019 10:31:59 -0500 by God Ownz