1, Prediction of house prices by multiple linear regression model
1. Basic package import
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.read_csv('house_prices.csv')
df.info()
df.head()
2. Variable exploration
# Outlier handling
# ================ Outlier test function: two methods, IQR & Z-score ================
def outlier_test(data, column, method=None, z=2):
    """Detect outliers (their indexes) in one column, using either the
    upper/lower cutoff-point (IQR) method or the Z-score method.

    params:
        data: the complete DataFrame
        column: name of the column to check, e.g. 'price'
        method: detection method (optional); None (default) uses the
                upper/lower cutoff-point (IQR) method, 'z' uses the Z-score method
        z: Z quantile, default 2 (roughly the outer 2% on each side of the
           normal curve, depending on the sign of the score); change it freely
           to extract any top percentage of the data
    returns:
        outlier: DataFrame of the outliers
        upper: upper cutoff point
        lower: lower cutoff point
    """
    # ================ Upper/lower cutoff-point (IQR) method ================
    if method == None:
        print(f'Using column {column}, detecting outliers with the upper/lower cutoff-point (IQR) method...')
        print('=' * 70)
        # Interquartile range
        column_iqr = np.quantile(data[column], 0.75) - np.quantile(data[column], 0.25)
        # First and third quartiles (q1, q3)
        (q1, q3) = np.quantile(data[column], 0.25), np.quantile(data[column], 0.75)
        # Compute the upper and lower cutoff points
        upper, lower = (q3 + 1.5 * column_iqr), (q1 - 1.5 * column_iqr)
        # Detect the outliers
        outlier = data[(data[column] <= lower) | (data[column] >= upper)]
        print(f'First quartile: {q1}, third quartile: {q3}, interquartile range: {column_iqr}')
        print(f'Upper cutoff point: {upper}, lower cutoff point: {lower}')
        return outlier, upper, lower
    # ================ Z-score method ================
    if method == 'z':
        print(f'Using column {column}, detecting outliers with the Z-score method, z quantile = {z}...')
        print('=' * 70)
        # Compute the cutoff values at the two Z-score points
        mean, std = np.mean(data[column]), np.std(data[column])
        upper, lower = (mean + z * std), (mean - z * std)
        print(f'With z = {z}: values greater than {upper} or less than {lower} are considered outliers.')
        print('=' * 70)
        # Detect the outliers
        outlier = data[(data[column] <= lower) | (data[column] >= upper)]
        return outlier, upper, lower
outlier, upper, lower = outlier_test(data=df, column='price', method='z')
outlier.info()
outlier.sample(5)
# Simply discard the outliers here
df.drop(index=outlier.index, inplace=True)
# Categorical variables, also known as nominal variables
nominal_vars = ['neighborhood', 'style']

for each in nominal_vars:
    print(each, ':')
    print(df[each].agg(['value_counts']).T)
    # A plain .value_counts().T cannot produce this layout; agg is required,
    # and the brackets [] inside it cannot be omitted
    print('=' * 35)
# The counts per category look reasonable, which prepares for the analysis of variance below
# Correlation heatmap
def heatmap(data, method='pearson', camp='RdYlGn', figsize=(10, 8)):
    """
    data: the whole DataFrame
    method: correlation method, default 'pearson'
    camp: colormap; default 'RdYlGn' (red-yellow-green); 'YlGnBu'
          (yellow-green-blue) and 'Blues'/'Greens' are also good choices
    figsize: figure size, default (10, 8)
    """
    plt.figure(figsize=figsize, dpi=80)
    sns.heatmap(data.corr(method=method),
                xticklabels=data.corr(method=method).columns,
                yticklabels=data.corr(method=method).columns,
                cmap=camp, center=0, annot=True)
    # To keep only one triangle of the matrix (no duplicated color blocks across
    # the diagonal), build a mask and pass mask=mask to sns.heatmap:
    # mask = np.zeros_like(data.corr())
    # mask[np.tril_indices_from(mask)] = True
# The heatmap shows that variables such as area, bedrooms and bathrooms have a
# fairly strong relationship with price, so they are worth putting into the model;
# the relationship between the categorical variables style and neighborhood and price is still unknown
heatmap(data=df, figsize=(6, 5))
# The exploration above showed that style and neighborhood each have three categories;
# with only two categories a chi-square test would suffice, so analysis of variance is used here.
# ANOVA is applied within the regression model; only statsmodels provides an ANOVA API,
# which extracts the analysis-of-variance table from a fitted linear regression
import statsmodels.api as sm
from statsmodels.formula.api import ols  # ols builds a linear regression model
from statsmodels.stats.anova import anova_lm
A note on sample size and the significance level α (practical experience for choosing the α-level):
sample size        α-level
n ≤ 100            10%
100 < n ≤ 500      5%
500 < n ≤ 1000     1%
n > 2000           0.1%
When the sample size is too large, the α-level loses its meaning: with very large data volumes the p-value becomes uninformative, so the sample used for a test usually should not exceed about 5000. To show that the relationship between two variables is stable, the sample size should therefore be kept under control, as the small simulation below illustrates.
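A minimal simulation sketch of this point (using synthetic data, not the house-price set): a negligible effect that is far from significant at n = 100 becomes "highly significant" at n = 100000, so with huge samples the p-value alone says little about practical relevance.

import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
for n in (100, 100_000):
    x = rng.normal(size=n)
    y = 0.02 * x + rng.normal(size=n)  # tiny, practically irrelevant effect
    r, p = stats.pearsonr(x, y)        # Pearson correlation and its p-value
    print(f'n={n:>7}  r={r:+.4f}  p={p:.4g}')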
# Number of samples in the dataset: 6028; 600 samples are randomly drawn here.
# If you want stratified sampling, please refer to the article:
df = df.copy().sample(600)

# C() tells Python that a variable is categorical; otherwise it would be treated
# as continuous. ANOVA is applied directly to all categorical variables here.
# The lines below are the standard recipe for ANOVA with the statistics library
lm = ols('price ~ C(neighborhood) + C(style)', data=df).fit()
anova_lm(lm)

# The Residual row is the within-group variation the model cannot explain;
# the other rows are the between-group variation it can explain
# df: degrees of freedom; for a categorical variable, the number of categories minus 1
# sum_sq: between-group sum of squares (SSM); the Residual row's sum_sq is SSE
# mean_sq: MSM; the Residual row's mean_sq is MSE
# F: the F statistic, to be compared against the F distribution
# PR(>F): the p-value
# Re-running this a few times shows both variables stay highly significant,
# so they are also worth putting into the model
3. Multiple linear regression modeling
from statsmodels.formula.api import ols

lm = ols('price ~ area + bedrooms + bathrooms', data=df).fit()
lm.summary()
4. Model optimization
The accuracy found above is not high enough. Here the model is improved by adding dummy variables and by using the variance inflation factor (VIF) to detect multicollinearity.
# Set up dummy variables
# Take the nominal variable neighborhood as an example
nominal_data = df['neighborhood']

# Create the dummy variables
dummies = pd.get_dummies(nominal_data)
dummies.sample()  # pandas names the columns automatically

# One of the dummy columns generated from each nominal variable must be dropped;
# here column C is dropped as an example
dummies.drop(columns=['C'], inplace=True)
dummies.sample()
# Concatenate the dummies with the original dataset
results = pd.concat(objs=[df, dummies], axis='columns')  # merge by column
results.sample(3)
# Handling the nominal variable style the same way is left as an exercise
# Model again
lm = ols('price ~ area + bedrooms + bathrooms + A + B', data=results).fit()
lm.summary()
# Hand-rolled variance inflation factor (VIF) detection formula
def vif(df, col_i):
    """
    df: the whole DataFrame
    col_i: name of the column to check
    """
    cols = list(df.columns)
    cols.remove(col_i)
    cols_noti = cols
    formula = col_i + '~' + '+'.join(cols_noti)
    r2 = ols(formula, df).fit().rsquared
    return 1. / (1. - r2)
test_data = results[['area', 'bedrooms', 'bathrooms', 'A', 'B']]
for i in test_data.columns:
    print(i, '\t', vif(df=test_data, col_i=i))
# bedrooms and bathrooms turn out to be strongly correlated; they may be explaining the same thing
# Run the multicollinearity check again, with bedrooms dropped
test_data = df[['area', 'bathrooms']]
for i in test_data.columns:
    print(i, '\t', vif(df=test_data, col_i=i))
2, Redo the above multiple linear regression with Excel to solve the regression equation
House price forecast based on multiple linear regression
Abstract
The trend of market house prices is affected by many factors. Analyzing those factors helps produce a more accurate assessment of future price movements.
Multiple linear regression is suitable for analyzing data affected by several factors: predicting or estimating the dependent variable from the optimal combination of multiple independent variables is more effective and practical than relying on a single predictor. Based on this mathematical model, this paper organizes historical data such as house sale prices in a certain area, analyzes the data with multiple linear regression, and forecasts the area's future house price trend.
Keywords: multiple linear regression; house price forecasting; data analysis
Introduction
The prediction of future house prices affects the development of the social economy to a certain extent. Broadly, accurate house price prediction helps the state to macro-regulate the trend of market house prices; more narrowly, it is part of enterprise strategic planning, and for consumers it supports the rational planning of personal finances. Because house prices are related to many factors, and some of those factors have a linear relationship with price, a multiple linear regression model is an appropriate choice for studying this problem.
Through linear regression analysis of house prices sold in a certain area over a certain period, this study explores the main factors affecting house prices, analyzes the degree of their influence, and uses the results of the analysis to predict the future trend of house prices.
Theoretical basis of linear regression
Univariate linear regression analyzes the linear correlation between a single independent variable x and the dependent variable y. Its mathematical model is y = a + bx + ε.
The least squares estimates of the unknown parameters a and b are obtained by taking the partial derivatives of the sum of squared deviations, Q(a, b) = Σ(yi − a − bxi)², with respect to a and b and setting them to zero; the unique solutions for a and b are shown in the figure.
**Least squares estimation of parameters**
To simplify the significance test of the regression effect, three symbols Lxx, Lyy and Lxy are introduced based on the estimate of b; their definitions are shown in the figure.
Figure: mathematical definitions of Lxx, Lyy and Lxy
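Since the figure itself is not reproduced here, the standard definitions it refers to, together with the resulting least squares solutions, are:

$$
L_{xx} = \sum_{i=1}^{n}(x_i - \bar{x})^2, \qquad
L_{yy} = \sum_{i=1}^{n}(y_i - \bar{y})^2, \qquad
L_{xy} = \sum_{i=1}^{n}(x_i - \bar{x})(y_i - \bar{y}),
$$

$$
\hat{b} = \frac{L_{xy}}{L_{xx}}, \qquad \hat{a} = \bar{y} - \hat{b}\,\bar{x}.
$$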
In the study of practical problems, the change of the dependent variable is often affected by several important factors. Then two or more influencing factors must be used as independent variables to explain the change of the dependent variable; this is multiple regression. In other words, when several independent variables have a linear relationship with the dependent variable, the regression analysis is multiple regression. The mathematical model of multiple linear regression is y = β0 + β1x1 + β2x2 + … + βpxp + ε. Taking partial derivatives of the residual sum of squares with respect to each parameter βi (i = 0, 1, …, p) yields the least squares estimates of the unknown parameters βi; the matrix form of the estimate is shown in the figure.
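The matrix form referred to above is the standard normal-equation solution, restated here because the figure is missing:

$$
Y = X\beta + \varepsilon, \qquad
Q(\beta) = (Y - X\beta)^{\top}(Y - X\beta), \qquad
\hat{\beta} = (X^{\top}X)^{-1}X^{\top}Y.
$$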
Significance test of regression effect
For points scattered over the plane without order, a linear regression equation solved by least squares is meaningless; a quantitative index is needed to measure whether the trend described by the linear regression is reasonable.
The total fluctuation of the data can be described by the total sum of squared deviations Lyy, the sum of the squared deviations yi − ȳ. The larger Lyy is, the more the yi values fluctuate, i.e. the more dispersed they are. Lyy can be decomposed into U, the sum of squared deviations of the regression values from ȳ, plus Q, the sum of squared differences between the yi and the regression values. U is the dispersion of y caused by the linear correlation between x and y, and Q is the dispersion caused by random error; the decomposition of yi − ȳ is shown in Figure 2-4. The larger the share of U in the total, the smaller the share of random error and the more significant the regression effect. The coefficient of determination R² can therefore be used to measure whether the linear regression effect is significant: as the goodness of fit, R² describes how well a straight line fits the data, and R² = U / Lyy.
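Written out explicitly (the standard decomposition, consistent with the definitions above, where ŷi denotes the value on the regression line):

$$
L_{yy} = \sum_{i}(y_i - \bar{y})^2
       = \underbrace{\sum_{i}(\hat{y}_i - \bar{y})^2}_{U}
       + \underbrace{\sum_{i}(y_i - \hat{y}_i)^2}_{Q},
\qquad
R^2 = \frac{U}{L_{yy}}.
$$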
The square root of R² is the Pearson correlation coefficient r, which measures how close the two data sets are to lying on a line, and hence the strength of the linear relationship between the two variables. The larger the absolute value of the correlation coefficient, the stronger the correlation: the closer it is to 1 or −1, the stronger, and the closer to 0, the weaker. When |r| ≥ 0.8, x and y are considered strongly correlated; when |r| < 0.3, weakly correlated. The definition of the Pearson correlation coefficient is shown in the figure.
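In terms of the symbols introduced earlier, the definition the figure refers to is the standard one:

$$
r = \frac{L_{xy}}{\sqrt{L_{xx}\,L_{yy}}}
  = \frac{\sum_{i}(x_i - \bar{x})(y_i - \bar{y})}
         {\sqrt{\sum_{i}(x_i - \bar{x})^2}\,\sqrt{\sum_{i}(y_i - \bar{y})^2}}.
$$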
For the univariate linear regression model, the significance of the regression effect can be tested with the hypotheses H0: b = 0 versus H1: b ≠ 0. The available methods are the F test and the t test. The F test is a significance test of the regression equation as a whole, checking whether x and y are related at all; the t test is a significance test of the regression coefficient, checking whether the variable x is useful. The test statistics, defined under H0, are shown in Figure 2-6 and Figure 2-7. For a given significance level α, if F > F1-α(1, n-2) the regression effect is judged significant, and if |t| > t1-α/2(n-2) the influence of the regression coefficient is judged significant; otherwise the effect is not significant.
Univariate linear regression F-test
Univariate linear regression t-test
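Since Figures 2-6 and 2-7 are not reproduced here, the standard forms of the two statistics under H0 are:

$$
F = \frac{U}{Q/(n-2)} \sim F(1,\,n-2), \qquad
t = \frac{\hat{b}\sqrt{L_{xx}}}{\hat{\sigma}} \sim t(n-2), \qquad
\hat{\sigma}^2 = \frac{Q}{n-2}.
$$

In simple linear regression t² = F, so the two tests are equivalent.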
For the multiple linear regression model, the significance of the overall regression effect can be tested with the F test, using the hypotheses H0: β1 = β2 = … = βp = 0 versus H1: not all βi (i = 1, …, p) are 0. The definition of the F statistic under H0 is shown in Figure 2-8. For a given significance level α, if F > F1-α(p, n-p-1) the regression effect is significant.
The significance of an individual regression coefficient can be tested with the t test, using H0: βi = 0 versus H1: βi ≠ 0. The definition of the t statistic under H0 is shown in Figure 2-9. For a given significance level α, if |t| > t1-α/2(n-p-1) the influence of the regression coefficient is judged significant; otherwise it is not.
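The standard forms of these two statistics, restated because Figures 2-8 and 2-9 are missing:

$$
F = \frac{U/p}{Q/(n-p-1)} \sim F(p,\,n-p-1), \qquad
t_i = \frac{\hat{\beta}_i}{\sqrt{c_{ii}}\,\hat{\sigma}} \sim t(n-p-1),
$$

where U and Q are the regression and residual sums of squares, \hat{\sigma}^2 = Q/(n-p-1), and c_{ii} is the i-th diagonal element of (X^{\top}X)^{-1}.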
Linear regression and gradient descent based on machine learning
Machine learning spans many disciplines, such as computer science, engineering and statistics. It has penetrated every field of production and life and is applied in all walks of life, so in today's fiercely competitive world it is essential to understand and master its basic models and methods.
The linear regression model in machine learning is based on the linear regression model of mathematical statistics; it fits the data points with a straight line. In machine learning, solving the regression problem means finding the best-fitting parameter set, i.e. the parameters that minimize the discrepancy between the estimated values and the actual values. This is expressed through a loss function, whose definition is shown in Figure 2-10; minimizing the loss function yields the best-fitting parameters, and the gradient descent method can be used to do the minimization.
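The loss function of Figure 2-10 is, in the usual machine learning convention, the halved mean squared error between the model's estimates and the actual values:

$$
J(\theta) = \frac{1}{2m}\sum_{i=1}^{m}\bigl(h_\theta(x^{(i)}) - y^{(i)}\bigr)^2, \qquad
h_\theta(x) = \theta_0 + \theta_1 x_1 + \dots + \theta_p x_p.
$$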
Gradient descent searches for a minimum by moving along the direction of the negative gradient (or for a maximum along the positive gradient). If the gradient vector is 0, an extreme point has been reached and the magnitude of the gradient is also 0. When gradient descent is used for optimization, the iteration terminates once the magnitude of the gradient vector is close to 0, i.e. below a very small constant threshold. The process of gradient descent is shown in the figure.
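As the figure is not reproduced, here is a minimal NumPy sketch of batch gradient descent for the linear model above; the learning rate, iteration count and synthetic data are illustrative assumptions, not values from the text:

import numpy as np

def gradient_descent(X, y, lr=0.1, n_iter=5000, tol=1e-8):
    """Batch gradient descent for linear regression on the MSE loss."""
    m, p = X.shape
    Xb = np.c_[np.ones(m), X]  # prepend a column of ones for the intercept
    theta = np.zeros(p + 1)
    for _ in range(n_iter):
        grad = Xb.T @ (Xb @ theta - y) / m  # gradient of the MSE loss
        if np.linalg.norm(grad) < tol:      # stop once the gradient magnitude is ~0
            break
        theta -= lr * grad                  # step along the negative gradient
    return theta

# Illustrative synthetic data: y = 4 + 3x + noise
rng = np.random.default_rng(42)
x = rng.uniform(0, 2, size=(200, 1))
y = 4 + 3 * x[:, 0] + rng.normal(scale=0.5, size=200)
print(gradient_descent(x, y))  # should approach [4, 3]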
Data analysis using Excel
Select the x and y value ranges
The Multiple R field is the multiple correlation coefficient R, i.e. the square root of R Square, also called the correlation coefficient; it measures the degree of correlation between the independent variables and y.
R Square is the coefficient of determination, i.e. the square of the correlation coefficient R.
Adjusted R Square is the coefficient of determination R² adjusted for the number of predictors; see the formula after this list.
The standard error measures the goodness of fit and is also used to calculate other statistics related to the regression; the smaller the value, the better the fit.
Observations is the number of data points used to estimate the regression equation; this dataset contains 20 records, so the value is 20.
The Coefficients column gives the estimated regression coefficients; the Intercept row among them is the constant term.
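The adjusted R² mentioned above follows the standard formula, with n observations and p predictors:

$$
\bar{R}^2 = 1 - (1 - R^2)\,\frac{n-1}{n-p-1}.
$$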
Let the dependent variable (house price) be y, and let the independent variables area, bedrooms and bathrooms be x1, x2 and x3 respectively.
So we get the equation:
y=10072.11+345.911x1-2925.81x2+7345.392x3
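As a quick illustration, the fitted equation can be evaluated in Python; the example house below (area 2000, 3 bedrooms, 2 bathrooms) is a made-up input, not a record from the dataset:

# Coefficients taken from the Excel regression output above
intercept, b_area, b_bedrooms, b_bathrooms = 10072.11, 345.911, -2925.81, 7345.392

def predict_price(area, bedrooms, bathrooms):
    """Evaluate the fitted regression equation."""
    return intercept + b_area * area + b_bedrooms * bedrooms + b_bathrooms * bathrooms

# Hypothetical house: area 2000, 3 bedrooms, 2 bathrooms
print(predict_price(2000, 3, 2))  # about 707807.46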
3, Redo the above multiple linear regression with the machine learning library scikit-learn
1. Solve directly without processing
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt  # plotting
from sklearn import linear_model  # linear model

data = pd.read_csv('house_prices.csv')
data.head()  # show the data
new_data = data.iloc[:, 1:]  # drop the house_id column
new_data.head()
new_data.corr()  # correlation coefficient matrix (numeric columns only)
"""take are,bedrooms and bathroom As X,price by Y Find linear regression.""" x_data = new_data.iloc[:, 1:4] #Corresponding columns of are, bedrooms and bathroom y_data = new_data.iloc[:, -1] #price corresponding column print(x_data, y_data, len(x_data))
# Apply the model
model = linear_model.LinearRegression()
model.fit(x_data, y_data)
print("Regression coefficients:", model.coef_)
print("Intercept:", model.intercept_)
print('Regression equation: price =',
      model.coef_[0], '*area +',
      model.coef_[1], '*bedrooms +',
      model.coef_[2], '*bathrooms +',
      model.intercept_)
2. Clean the data before solving
new_data_Z = new_data.iloc[:, 0:]
new_data_IQR = new_data.iloc[:, 0:]

def outlier_test(data, column, method=None, z=2):
    if method == None:
        print(f'Using column {column}, detecting outliers with the upper/lower cutoff-point (IQR) method...')
        print('=' * 70)
        column_iqr = np.quantile(data[column], 0.75) - np.quantile(data[column], 0.25)
        (q1, q3) = np.quantile(data[column], 0.25), np.quantile(data[column], 0.75)
        upper, lower = (q3 + 1.5 * column_iqr), (q1 - 1.5 * column_iqr)
        outlier = data[(data[column] <= lower) | (data[column] >= upper)]
        print(f'First quartile: {q1}, third quartile: {q3}, interquartile range: {column_iqr}')
        print(f'Upper cutoff point: {upper}, lower cutoff point: {lower}')
        return outlier, upper, lower
    if method == 'z':
        print(f'Using column {column}, detecting outliers with the Z-score method, z quantile = {z}...')
        print('=' * 70)
        mean, std = np.mean(data[column]), np.std(data[column])
        upper, lower = (mean + z * std), (mean - z * std)
        print(f'With z = {z}: values greater than {upper} or less than {lower} are considered outliers.')
        print('=' * 70)
        outlier = data[(data[column] <= lower) | (data[column] >= upper)]
        return outlier, upper, lower

outlier, upper, lower = outlier_test(data=new_data_Z, column='price', method='z')
outlier.info()
outlier.sample(5)

# Simply discard the outliers here
new_data_Z.drop(index=outlier.index, inplace=True)
outlier, upper, lower = outlier_test(data=new_data_IQR, column='price')
outlier.info()
outlier.sample(5)

# Simply discard the outliers here
new_data_IQR.drop(index=outlier.index, inplace=True)
print("Original data correlation matrix") new_data.corr()
print("z Correlation matrix processed by method") new_data_Z.corr()
print("IQR Data correlation matrix processed by method") new_data_IQR.corr()
x_data = new_data_Z.iloc[:, 1:4]
y_data = new_data_Z.iloc[:, -1]

# Apply the model
model = linear_model.LinearRegression()
model.fit(x_data, y_data)
print("Regression coefficients:", model.coef_)
print("Intercept:", model.intercept_)
print('Regression equation: price =',
      model.coef_[0], '*area +',
      model.coef_[1], '*bedrooms +',
      model.coef_[2], '*bathrooms +',
      model.intercept_)
x_data = new_data_IQR.iloc[:, 1:4]
y_data = new_data_IQR.iloc[:, -1]

# Apply the model
model = linear_model.LinearRegression()
model.fit(x_data, y_data)
print("Regression coefficients:", model.coef_)
print("Intercept:", model.intercept_)
print('Regression equation: price =',
      model.coef_[0], '*area +',
      model.coef_[1], '*bedrooms +',
      model.coef_[2], '*bathrooms +',
      model.intercept_)
3. Comparison
No data processing:
price = 345.911018840024*area + (-2925.806324666705)*bedrooms + 7345.391713693825*bathrooms + 10072.107046726742

Z-score cleaned data:
price = 226.4211697383351*area + 49931.50311720713*bedrooms + (-12224.71724496588)*bathrooms + 64356.04135007458

IQR cleaned data:
price = 242.6111551782956*area + 41547.43068790577*bedrooms + (-6415.78250090158)*bathrooms + 58018.13845504692
4, References
https://blog.csdn.net/qq_55691662/article/details/120960932
https://blog.csdn.net/weixin_43196118/article/details/108462140
https://blog.csdn.net/m0_51120713/article/details/120969812