Advanced Applications of NumPy (Examples)

Import packages, set up a Chinese font for Matplotlib, and configure the inline figure format

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['STfangsong']
plt.rcParams['axes.unicode_minus'] = False
%config InlineBackend.figure_format = 'svg'

Common functions

array1 = np.arange(1, 10).reshape(3, 3)
array1
array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])
array2 = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]])
array2
array([[1, 1, 1],
       [2, 2, 2],
       [3, 3, 3]])
# Horizontal stacking
np.hstack((array1, array2))
array([[1, 2, 3, 1, 1, 1],
       [4, 5, 6, 2, 2, 2],
       [7, 8, 9, 3, 3, 3]])
# Vertical stacking
np.vstack((array1, array2))
array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9],
       [1, 1, 1],
       [2, 2, 2],
       [3, 3, 3]])
# Concatenate along the specified axis (axis=0 by default)
np.concatenate((array1, array2))
array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9],
       [1, 1, 1],
       [2, 2, 2],
       [3, 3, 3]])
np.concatenate((array1, array2), axis=1)
array([[1, 2, 3, 1, 1, 1],
       [4, 5, 6, 2, 2, 2],
       [7, 8, 9, 3, 3, 3]])
# Split vertically
np.vsplit(array2, 3)
[array([[1, 1, 1]]), array([[2, 2, 2]]), array([[3, 3, 3]])]
# Split horizontally
np.hsplit(array2, 3)
[array([[1],
        [2],
        [3]]),
 array([[1],
        [2],
        [3]]),
 array([[1],
        [2],
        [3]])]
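Both split helpers are shorthand for np.split with an explicit axis; a minimal sketch of the equivalence:

np.split(array2, 3, axis=0)  # same result as np.vsplit(array2, 3)
np.split(array2, 3, axis=1)  # same result as np.hsplit(array2, 3)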
# Append an element at the end (note: the array is flattened and a copy is returned)
np.append(array1, 10)
array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])
# Insert an element at the specified position (also flattens when no axis is given)
np.insert(array1, 0, 0)
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
array1[array1 % 3 == 0]
array([3, 6, 9])
# Filter data by criteria
np.extract(array1 % 3 == 0, array1)
array([3, 6, 9])
# Select data by condition lists and choice lists: for each element,
# np.select takes the first choice whose condition is True, else default
x = np.arange(10)
condlist = [x < 3, x > 5]
choicelist = [x, x ** 2]
np.select(condlist, choicelist, default=np.nan)
array([ 0.,  1.,  2., nan, nan, nan, 36., 49., 64., 81.])
# Choose elementwise between two expressions based on a condition
np.where(x < 5, x, 10 * x)
array([ 0,  1,  2,  3,  4, 50, 60, 70, 80, 90])
def fib(n):
    a, b = 0, 1
    for _ in range(n):
        a, b = b, a + b
        yield a

        
gen = fib(20)
# Create an array object from an iterator (here, a generator)
array3 = np.fromiter(gen, dtype=np.int64, count=10)
array3
array([ 1,  1,  2,  3,  5,  8, 13, 21, 34, 55], dtype=int64)
# Resize the array (np.resize repeats the data cyclically to fill the new shape)
np.resize(array1, (4, 4))
array([[1, 2, 3, 4],
       [5, 6, 7, 8],
       [9, 1, 2, 3],
       [4, 5, 6, 7]])

Vectors

Dot product

$$A \cdot B = a_1 b_1 + a_2 b_2 = \lvert A \rvert \lvert B \rvert \cos\theta$$

$$A \cdot B = \sum_{i=1}^{n} a_i b_i = \lvert A \rvert \lvert B \rvert \cos\theta$$

v1 = np.array([3, 5])
v2 = np.array([1, 3])
# inner_prod = np.dot(v1, v2)
inner_prod = np.inner(v1, v2)
print('Vector dot product:', inner_prod)
Vector dot product: 18

Note: in Euclidean geometry, the dot product of two Cartesian coordinate vectors is also called the inner product, but the inner product is the more general notion: the dot product is a special case of the inner product in Euclidean space $\mathbb{R}^n$, while the inner product can be extended to normed vector spaces.
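For 1-D arrays np.dot and np.inner coincide, which is why either call works above; for 2-D arrays they do not. A minimal sketch (results worked out by hand in the comments):

m1 = np.array([[1, 2], [3, 4]])
m2 = np.array([[5, 6], [7, 8]])
np.dot(m1, m2)    # matrix product, rows of m1 with columns of m2: [[19, 22], [43, 50]]
np.inner(m1, m2)  # rows of m1 with rows of m2: [[17, 23], [39, 53]]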

v1_norm = np.linalg.norm(v1)
v2_norm = np.linalg.norm(v2)
print('Norm of v1:', np.round(v1_norm, 6))
print('Norm of v2:', np.round(v2_norm, 6))
Norm of v1: 5.830952
Norm of v2: 3.162278
cos_theta = inner_prod / (v1_norm * v2_norm)
print('Cosine value of vector angle:', cos_theta)
print('included angle:', np.arccos(cos_theta) * 180 / np.pi)
Cosine value of vector angle: 0.9761870601839526
 included angle: 12.52880770915155

Determinants

$$\det(A) = \sum_{n!} \pm a_{1\alpha} a_{2\beta} a_{3\gamma} \cdots a_{n\omega}$$

array4 = np.stack((v1, v2))
array4
array([[3, 5],
       [1, 3]])

$$\det \begin{vmatrix} 3 & 5 \\ 1 & 3 \end{vmatrix} = 4$$

# Calculate the value of the determinant
np.round(np.linalg.det(array4), 2)
4.0
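For a 2×2 matrix the sum above has only 2! = 2 terms, which gives the familiar shortcut:

$$\det \begin{vmatrix} a & b \\ c & d \end{vmatrix} = ad - bc, \qquad \text{here } 3 \times 3 - 5 \times 1 = 4$$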

$$\det \begin{vmatrix} 1 & 2 & 3 \\ 4 & 5 & 6 \\ 7 & 8 & 9 \end{vmatrix} = 0$$

np.linalg.det(array1)
0.0

Matrices

array1 = np.arange(1, 10).reshape((3, 3))
array1
array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

Apply a set of elementary row operations to array1 above to see why its rank is 2.

$$\begin{vmatrix} 1 & 2 & 3 \\ 4 & 5 & 6 \\ 7 & 8 & 9 \end{vmatrix} \quad \to \quad \begin{vmatrix} 1 & 2 & 3 \\ 0 & -3 & -6 \\ 0 & -6 & -12 \end{vmatrix}$$
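These row operations are easy to verify in NumPy (a small sketch, not part of the original code):

m = np.arange(1, 10).reshape(3, 3).astype(float)
m[1] -= 4 * m[0]  # R2 <- R2 - 4*R1 gives [0, -3, -6]
m[2] -= 7 * m[0]  # R3 <- R3 - 7*R1 gives [0, -6, -12]
# The third row is now exactly twice the second, so only two rows
# are linearly independent and the rank is 2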

# Inverse matrix
# LinAlgError: Singular matrix --> a singular matrix has no inverse
# np.linalg.inv(array1)
array2 = np.array([[1, 2], [3, 4]])
array2
array([[1, 2],
       [3, 4]])
array3 = np.linalg.inv(array2)
array3
array([[-2. ,  1. ],
       [ 1.5, -0.5]])

$$A \cdot A^{-1} = I$$

np.round(array2 @ array3, 2)
array([[1., 0.],
       [0., 1.]])
# Find the rank of matrix
np.linalg.matrix_rank(array1)
2
array1[2, 2] = 8
array1
array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 8]])
np.linalg.matrix_rank(array1)
3

Solving a system of linear equations:

$$\begin{cases} 3x + y = 9 \\ x + 2y = 8 \end{cases}$$

A = np.array([[3, 1], [1, 2]])
b = np.array([9, 8]).reshape(-1, 1)
np.linalg.solve(A, b)
array([[2.],
       [3.]])

$$Ax = b$$

$$A^{-1}Ax = A^{-1}b$$

$$Ix = A^{-1}b$$

A_1 = np.linalg.inv(A)
A_1
array([[ 0.4, -0.2],
       [-0.2,  0.6]])
A_1 @ b
array([[2.],
       [3.]])

Least squares solution

!pip install scikit-learn
Looking in indexes: https://pypi.doubanio.com/simple
Requirement already satisfied: scikit-learn in d:\programs\python\python38\lib\site-packages (0.24.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in d:\programs\python\python38\lib\site-packages (from scikit-learn) (2.2.0)
Requirement already satisfied: numpy>=1.13.3 in d:\programs\python\python38\lib\site-packages (from scikit-learn) (1.21.2)
Requirement already satisfied: joblib>=0.11 in d:\programs\python\python38\lib\site-packages (from scikit-learn) (1.0.1)
Requirement already satisfied: scipy>=0.19.1 in d:\programs\python\python38\lib\site-packages (from scikit-learn) (1.7.1)
from sklearn.datasets import load_boston

# Get Boston house price data
dataset = load_boston()
print(dataset.DESCR)
.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
        - B        1000(Bk - 0.63)^2 where Bk is the proportion of black people by town
        - LSTAT    % lower status of the population
        - MEDV     Median value of owner-occupied homes in $1000's

    :Missing Attribute Values: None

    :Creator: Harrison, D. and Rubinfeld, D.L.

This is a copy of UCI ML housing dataset.
https://archive.ics.uci.edu/ml/machine-learning-databases/housing/


This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.

The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
prices and the demand for clean air', J. Environ. Economics & Management,
vol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics
...', Wiley, 1980.   N.B. Various transformations are used in the table on
pages 244-261 of the latter.

The Boston house-price data has been used in many machine learning papers that address regression
problems.   
     
.. topic:: References

   - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.
   - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.

dataset.data.shape
(506, 13)
dataset.feature_names
array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')
# Create a DataFrame object from data
df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)
df
        CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  PTRATIO       B  LSTAT
0    0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0     15.3  396.90   4.98
1    0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0     17.8  396.90   9.14
2    0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0     17.8  392.83   4.03
3    0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  222.0     18.7  394.63   2.94
4    0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0     18.7  396.90   5.33
..       ...   ...    ...   ...    ...    ...   ...     ...  ...    ...      ...     ...    ...
501  0.06263   0.0  11.93   0.0  0.573  6.593  69.1  2.4786  1.0  273.0     21.0  391.99   9.67
502  0.04527   0.0  11.93   0.0  0.573  6.120  76.7  2.2875  1.0  273.0     21.0  396.90   9.08
503  0.06076   0.0  11.93   0.0  0.573  6.976  91.0  2.1675  1.0  273.0     21.0  396.90   5.64
504  0.10959   0.0  11.93   0.0  0.573  6.794  89.3  2.3889  1.0  273.0     21.0  393.45   6.48
505  0.04741   0.0  11.93   0.0  0.573  6.030  80.8  2.5050  1.0  273.0     21.0  396.90   7.88

506 rows × 13 columns

# Add house price column
df['PRICE'] = dataset.target
df
        CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  PTRATIO       B  LSTAT  PRICE
0    0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0     15.3  396.90   4.98   24.0
1    0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0     17.8  396.90   9.14   21.6
2    0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0     17.8  392.83   4.03   34.7
3    0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  222.0     18.7  394.63   2.94   33.4
4    0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0     18.7  396.90   5.33   36.2
..       ...   ...    ...   ...    ...    ...   ...     ...  ...    ...      ...     ...    ...    ...
501  0.06263   0.0  11.93   0.0  0.573  6.593  69.1  2.4786  1.0  273.0     21.0  391.99   9.67   22.4
502  0.04527   0.0  11.93   0.0  0.573  6.120  76.7  2.2875  1.0  273.0     21.0  396.90   9.08   20.6
503  0.06076   0.0  11.93   0.0  0.573  6.976  91.0  2.1675  1.0  273.0     21.0  396.90   5.64   23.9
504  0.10959   0.0  11.93   0.0  0.573  6.794  89.3  2.3889  1.0  273.0     21.0  393.45   6.48   22.0
505  0.04741   0.0  11.93   0.0  0.573  6.030  80.8  2.5050  1.0  273.0     21.0  396.90   7.88   11.9

506 rows × 14 columns

# Calculate the covariance matrix
df.cov()
              CRIM            ZN        INDUS       CHAS        NOX         RM          AGE         DIS          RAD           TAX     PTRATIO            B       LSTAT       PRICE
CRIM      73.986578   -40.215956    23.992339  -0.122109   0.419594  -1.325038    85.405322   -6.876722    46.847761    844.821538    5.399331  -302.381816   27.986168  -30.718508
ZN       -40.215956   543.936814   -85.412648  -0.252925  -1.396148   5.112513  -373.901548   32.629304   -63.348695  -1236.453735  -19.776571   373.721402  -68.783037   77.315176
INDUS     23.992339   -85.412648    47.064442   0.109669   0.607074  -1.887957   124.513903  -10.228097    35.549971    833.360290    5.692104  -223.579756   29.580270  -30.520823
CHAS      -0.122109    -0.252925     0.109669   0.064513   0.002684   0.016285     0.618571   -0.053043    -0.016296     -1.523367   -0.066819     1.131325   -0.097816    0.409409
NOX        0.419594    -1.396148     0.607074   0.002684   0.013428  -0.024603     2.385927   -0.187696     0.616929     13.046286    0.047397    -4.020570    0.488946   -0.455412
RM        -1.325038     5.112513    -1.887957   0.016285  -0.024603   0.493671    -4.751929    0.303663    -1.283815    -34.583448   -0.540763     8.215006   -3.079741    4.493446
AGE       85.405322  -373.901548   124.513903   0.618571   2.385927  -4.751929   792.358399  -44.329379   111.770846   2402.690122   15.936921  -702.940328  121.077725  -97.589017
DIS       -6.876722    32.629304   -10.228097  -0.053043  -0.187696   0.303663   -44.329379    4.434015    -9.068252   -189.664592   -1.059775    56.040356   -7.473329    4.840229
RAD       46.847761   -63.348695    35.549971  -0.016296   0.616929  -1.283815   111.770846   -9.068252    75.816366   1335.756577    8.760716  -353.276219   30.385442  -30.561228
TAX      844.821538  -1236.453735  833.360290  -1.523367  13.046286 -34.583448  2402.690122 -189.664592  1335.756577  28404.759488  168.153141 -6797.911215  654.714520 -726.255716
PTRATIO    5.399331   -19.776571     5.692104  -0.066819   0.047397  -0.540763    15.936921   -1.059775     8.760716    168.153141    4.686989   -35.059527    5.782729  -10.110657
B       -302.381816   373.721402  -223.579756   1.131325  -4.020570   8.215006  -702.940328   56.040356  -353.276219  -6797.911215  -35.059527  8334.752263 -238.667516  279.989834
LSTAT     27.986168   -68.783037    29.580270  -0.097816   0.488946  -3.079741   121.077725   -7.473329    30.385442    654.714520    5.782729  -238.667516   50.994760  -48.447538
PRICE    -30.718508    77.315176   -30.520823   0.409409  -0.455412   4.493446   -97.589017    4.840229   -30.561228   -726.255716  -10.110657   279.989834  -48.447538   84.586724
# Calculate the Pearson correlation coefficients
np.round(df.corr(), 2)
          CRIM     ZN  INDUS   CHAS    NOX     RM    AGE    DIS    RAD    TAX  PTRATIO      B  LSTAT  PRICE
CRIM      1.00  -0.20   0.41  -0.06   0.42  -0.22   0.35  -0.38   0.63   0.58     0.29  -0.39   0.46  -0.39
ZN       -0.20   1.00  -0.53  -0.04  -0.52   0.31  -0.57   0.66  -0.31  -0.31    -0.39   0.18  -0.41   0.36
INDUS     0.41  -0.53   1.00   0.06   0.76  -0.39   0.64  -0.71   0.60   0.72     0.38  -0.36   0.60  -0.48
CHAS     -0.06  -0.04   0.06   1.00   0.09   0.09   0.09  -0.10  -0.01  -0.04    -0.12   0.05  -0.05   0.18
NOX       0.42  -0.52   0.76   0.09   1.00  -0.30   0.73  -0.77   0.61   0.67     0.19  -0.38   0.59  -0.43
RM       -0.22   0.31  -0.39   0.09  -0.30   1.00  -0.24   0.21  -0.21  -0.29    -0.36   0.13  -0.61   0.70
AGE       0.35  -0.57   0.64   0.09   0.73  -0.24   1.00  -0.75   0.46   0.51     0.26  -0.27   0.60  -0.38
DIS      -0.38   0.66  -0.71  -0.10  -0.77   0.21  -0.75   1.00  -0.49  -0.53    -0.23   0.29  -0.50   0.25
RAD       0.63  -0.31   0.60  -0.01   0.61  -0.21   0.46  -0.49   1.00   0.91     0.46  -0.44   0.49  -0.38
TAX       0.58  -0.31   0.72  -0.04   0.67  -0.29   0.51  -0.53   0.91   1.00     0.46  -0.44   0.54  -0.47
PTRATIO   0.29  -0.39   0.38  -0.12   0.19  -0.36   0.26  -0.23   0.46   0.46     1.00  -0.18   0.37  -0.51
B        -0.39   0.18  -0.36   0.05  -0.38   0.13  -0.27   0.29  -0.44  -0.44    -0.18   1.00  -0.37   0.33
LSTAT     0.46  -0.41   0.60  -0.05   0.59  -0.61   0.60  -0.50   0.49   0.54     0.37  -0.37   1.00  -0.74
PRICE    -0.39   0.36  -0.48   0.18  -0.43   0.70  -0.38   0.25  -0.38  -0.47    -0.51   0.33  -0.74   1.00
rooms = df['RM'].values
prices = df['PRICE'].values
history_data = {room: price for room, price in zip(rooms, prices)}
history_data
{6.575: 24.0,
 6.421: 21.6,
 7.185: 34.9,
 6.998: 33.4,
 7.147: 36.2,
 6.43: 28.7,
 6.012: 22.9,
 6.172: 27.1,
 5.631: 16.5,
 6.004: 20.3,
 6.377: 15.0,
 6.009: 21.7,
 5.889: 21.7,
 5.949: 20.4,
 6.096: 13.5,
 5.834: 19.9,
 5.935: 8.4,
 5.99: 17.5,
 5.456: 20.2,
 5.727: 18.2,
 5.57: 13.6,
 5.965: 19.6,
 6.142: 15.2,
 5.813: 16.6,
 5.924: 15.6,
 5.599: 13.9,
 6.047: 14.8,
 6.495: 26.4,
 6.674: 21.0,
 5.713: 20.1,
 6.072: 14.5,
 5.95: 13.2,
 5.701: 13.1,
 5.933: 18.9,
 5.841: 20.0,
 5.85: 21.0,
 5.966: 16.0,
 6.595: 30.8,
 7.024: 34.9,
 6.77: 26.6,
 6.169: 25.3,
 6.211: 25.0,
 6.069: 21.2,
 5.682: 19.3,
 5.786: 20.0,
 6.03: 11.9,
 5.399: 14.4,
 5.602: 19.4,
 5.963: 19.7,
 6.115: 20.5,
 6.511: 25.0,
 5.998: 23.4,
 5.888: 23.3,
 7.249: 35.4,
 6.383: 24.7,
 6.816: 31.6,
 6.145: 23.3,
 5.927: 19.6,
 5.741: 18.7,
 6.456: 22.2,
 6.762: 25.0,
 7.104: 33.0,
 6.29: 23.5,
 5.787: 19.4,
 5.878: 22.0,
 5.594: 17.4,
 5.885: 20.9,
 6.417: 13.0,
 5.961: 20.5,
 6.065: 22.8,
 6.245: 23.4,
 6.273: 24.1,
 6.286: 21.4,
 6.279: 20.0,
 6.14: 20.8,
 6.232: 21.2,
 5.874: 20.3,
 6.727: 27.5,
 6.619: 23.9,
 6.302: 24.8,
 6.167: 19.9,
 6.389: 23.9,
 6.63: 27.9,
 6.015: 22.5,
 6.121: 22.2,
 7.007: 23.6,
 7.079: 28.7,
 6.405: 12.5,
 6.442: 22.9,
 6.249: 20.6,
 6.625: 28.4,
 6.163: 21.4,
 8.069: 38.7,
 7.82: 45.4,
 7.416: 33.2,
 6.781: 26.5,
 6.137: 19.3,
 5.851: 19.5,
 5.836: 19.5,
 6.127: 22.7,
 6.474: 19.8,
 6.229: 21.4,
 6.195: 21.7,
 6.715: 22.8,
 5.913: 18.8,
 6.092: 18.7,
 6.254: 18.5,
 5.928: 18.3,
 6.176: 21.2,
 6.021: 19.2,
 5.872: 20.4,
 5.731: 19.3,
 5.87: 22.0,
 5.856: 21.1,
 5.879: 18.8,
 5.986: 21.4,
 5.613: 15.7,
 5.693: 16.2,
 6.431: 24.6,
 5.637: 14.3,
 6.458: 19.2,
 6.326: 24.4,
 6.372: 23.0,
 5.822: 18.4,
 5.757: 15.0,
 6.335: 18.1,
 5.942: 17.4,
 6.454: 17.1,
 5.857: 13.3,
 6.151: 17.8,
 6.174: 14.0,
 5.019: 14.4,
 5.403: 13.4,
 5.468: 15.6,
 4.903: 11.8,
 6.13: 13.8,
 5.628: 15.6,
 4.926: 14.6,
 5.186: 17.8,
 5.597: 15.4,
 6.122: 22.1,
 5.404: 19.3,
 5.012: 15.3,
 5.709: 19.4,
 6.129: 17.0,
 6.152: 8.7,
 5.272: 13.1,
 6.943: 41.3,
 6.066: 24.3,
 6.51: 23.3,
 6.25: 27.0,
 7.489: 50.0,
 7.802: 50.0,
 8.375: 50.0,
 5.854: 10.8,
 6.101: 25.0,
 7.929: 50.0,
 5.877: 23.8,
 6.319: 23.8,
 6.402: 22.3,
 5.875: 50.0,
 5.88: 19.1,
 5.572: 23.1,
 6.416: 23.6,
 5.859: 22.6,
 6.546: 29.4,
 6.02: 23.2,
 6.315: 22.3,
 6.86: 29.9,
 6.98: 29.8,
 7.765: 39.8,
 6.144: 19.8,
 7.155: 37.9,
 6.563: 32.5,
 5.604: 26.4,
 6.153: 29.6,
 7.831: 50.0,
 6.782: 7.5,
 6.556: 29.8,
 6.951: 26.7,
 6.739: 30.5,
 7.178: 36.4,
 6.8: 31.1,
 6.604: 29.1,
 7.875: 50.0,
 7.287: 33.3,
 7.107: 30.3,
 7.274: 34.6,
 6.975: 34.9,
 7.135: 32.9,
 6.162: 13.3,
 7.61: 42.3,
 7.853: 48.5,
 8.034: 50.0,
 5.891: 22.6,
 5.783: 22.5,
 6.064: 24.4,
 5.344: 20.0,
 5.96: 21.7,
 5.807: 22.4,
 6.375: 28.1,
 5.412: 23.7,
 6.182: 25.0,
 6.642: 28.7,
 5.951: 21.5,
 6.373: 23.0,
 6.164: 21.7,
 6.879: 27.5,
 6.618: 30.1,
 8.266: 44.8,
 8.725: 50.0,
 8.04: 37.6,
 7.163: 31.6,
 7.686: 46.7,
 6.552: 31.5,
 5.981: 24.3,
 7.412: 31.7,
 8.337: 41.7,
 8.247: 48.3,
 6.726: 29.0,
 6.086: 24.0,
 6.631: 25.1,
 7.358: 31.5,
 6.481: 23.7,
 6.606: 23.3,
 6.897: 22.0,
 6.095: 20.1,
 6.358: 22.2,
 6.393: 23.7,
 5.593: 17.6,
 5.605: 18.5,
 6.108: 21.9,
 6.226: 20.5,
 6.433: 24.5,
 6.718: 26.2,
 6.487: 24.4,
 6.438: 24.8,
 6.957: 29.6,
 8.259: 42.8,
 5.876: 20.9,
 7.454: 44.0,
 8.704: 50.0,
 7.333: 36.0,
 6.842: 30.1,
 7.203: 33.8,
 7.52: 43.1,
 8.398: 48.8,
 7.327: 31.0,
 7.206: 36.5,
 5.56: 22.8,
 7.014: 30.7,
 8.297: 50.0,
 7.47: 43.5,
 5.92: 20.7,
 6.24: 25.2,
 6.538: 24.4,
 7.691: 35.2,
 6.758: 32.4,
 6.854: 32.0,
 7.267: 33.2,
 6.826: 33.1,
 6.482: 29.1,
 6.812: 35.1,
 6.968: 10.4,
 7.645: 46.0,
 7.923: 50.0,
 7.088: 32.2,
 6.453: 22.0,
 6.23: 20.1,
 6.209: 21.4,
 6.565: 24.8,
 6.861: 28.5,
 7.148: 37.3,
 6.678: 28.6,
 6.549: 27.1,
 5.79: 20.3,
 6.345: 22.5,
 7.041: 29.0,
 6.871: 24.8,
 6.59: 22.0,
 6.982: 33.1,
 7.236: 36.1,
 6.616: 28.4,
 7.42: 33.4,
 6.849: 28.2,
 6.635: 24.5,
 5.972: 20.3,
 4.973: 16.1,
 6.023: 19.4,
 6.266: 21.6,
 6.567: 23.8,
 5.705: 16.2,
 5.914: 17.8,
 5.782: 19.8,
 6.382: 23.1,
 6.113: 21.0,
 6.426: 23.8,
 6.376: 17.7,
 6.041: 20.4,
 5.708: 18.5,
 6.415: 25.0,
 6.312: 21.2,
 6.083: 22.2,
 5.868: 19.3,
 6.333: 22.6,
 5.706: 17.1,
 6.031: 19.4,
 6.316: 22.2,
 6.31: 20.7,
 6.037: 21.1,
 5.869: 19.5,
 5.895: 18.5,
 6.059: 20.6,
 5.985: 19.0,
 5.968: 18.7,
 7.241: 32.7,
 6.54: 16.5,
 6.696: 23.9,
 6.874: 31.2,
 6.014: 17.5,
 5.898: 17.2,
 6.516: 23.1,
 6.939: 26.6,
 6.49: 22.9,
 6.579: 24.1,
 5.884: 18.6,
 6.728: 14.9,
 5.663: 18.2,
 5.936: 13.5,
 6.212: 17.8,
 6.395: 21.7,
 6.112: 22.6,
 6.398: 25.0,
 6.251: 12.6,
 5.362: 20.8,
 5.803: 16.8,
 8.78: 21.9,
 3.561: 27.5,
 4.963: 21.9,
 3.863: 23.1,
 4.97: 50.0,
 6.683: 50.0,
 7.016: 50.0,
 6.216: 50.0,
 4.906: 13.8,
 4.138: 11.9,
 7.313: 15.0,
 6.649: 13.9,
 6.794: 22.0,
 6.38: 9.5,
 6.223: 10.2,
 6.545: 10.9,
 5.536: 11.3,
 5.52: 12.3,
 4.368: 8.8,
 5.277: 7.2,
 4.652: 10.5,
 5.0: 7.4,
 4.88: 10.2,
 5.39: 19.7,
 6.051: 23.2,
 5.036: 9.7,
 6.193: 11.0,
 5.887: 12.7,
 6.471: 13.1,
 5.747: 8.5,
 5.453: 5.0,
 5.852: 6.3,
 5.987: 5.6,
 6.343: 7.2,
 6.404: 12.1,
 5.349: 8.3,
 5.531: 8.5,
 5.683: 5.0,
 5.608: 27.9,
 5.617: 17.2,
 6.852: 27.5,
 6.657: 17.2,
 4.628: 17.9,
 5.155: 16.3,
 4.519: 7.0,
 6.434: 7.2,
 5.304: 12.0,
 5.957: 8.8,
 6.824: 8.4,
 6.411: 16.7,
 6.006: 14.2,
 5.648: 20.8,
 6.103: 13.4,
 5.565: 11.7,
 5.896: 8.3,
 5.837: 10.2,
 6.202: 10.9,
 6.348: 14.5,
 6.833: 14.1,
 6.425: 16.1,
 6.436: 14.3,
 6.208: 11.7,
 6.629: 13.4,
 6.461: 9.6,
 5.627: 12.8,
 5.818: 10.5,
 6.406: 17.1,
 6.219: 18.4,
 6.485: 15.4,
 6.459: 11.8,
 6.341: 14.9,
 6.185: 14.6,
 6.749: 13.4,
 6.655: 15.2,
 6.297: 16.1,
 7.393: 17.8,
 6.525: 14.1,
 5.976: 12.7,
 6.301: 14.9,
 6.081: 20.0,
 6.701: 16.4,
 6.317: 19.5,
 6.513: 20.2,
 5.759: 19.9,
 5.952: 19.0,
 6.003: 19.1,
 5.926: 24.5,
 6.437: 23.2,
 5.427: 13.8,
 6.484: 16.7,
 6.242: 23.0,
 6.75: 23.7,
 7.061: 25.0,
 5.762: 21.8,
 5.871: 20.6,
 6.114: 19.1,
 5.905: 20.6,
 5.454: 15.2,
 5.414: 7.0,
 5.093: 8.1,
 5.983: 20.1,
 5.707: 21.8,
 5.67: 23.1,
 5.794: 18.3,
 6.019: 21.2,
 5.569: 17.5,
 6.027: 16.8,
 6.593: 22.4,
 6.12: 20.6,
 6.976: 23.9}

The Pearson correlation coefficients show a fairly strong positive correlation (about 0.70) between the number of rooms and the house price. Next we study the historical data with the goal of predicting the house price from the number of rooms. (Note that keying the dict above by room count collapses duplicate RM values, since later rows overwrite earlier ones; that is acceptable for this simple demo.)

import heapq

nums = [35, 98, 76, 12, 55, 68, 47, 92]
print(heapq.nlargest(3, nums))
print(heapq.nsmallest(3, nums))
[98, 92, 76]
[12, 35, 47]
import heapq

# kNN: predict the price as the mean price of the k nearest room counts
def predict_price_by_knn(history_data, room, k=5):
    # keys = sorted(history_data, key=lambda x: (x - room) ** 2)[:k]
    keys = heapq.nsmallest(k, history_data, key=lambda x: (x - room) ** 2)
    return np.mean([history_data[key] for key in keys])
# Forecast house price
np.round(predict_price_by_knn(history_data, 6.25), 2)
20.42
np.round(predict_price_by_knn(history_data, 5.125), 2)
13.26
# Examine the relationship between the variables with a scatter plot
plt.scatter(rooms, prices)
plt.show()


The scatter plot shows a roughly linear relationship between the number of rooms and the house price, so next we try to predict the price with a linear function.

Loss function

Regression equation: $x$ is the number of rooms and $y$ is the house price to be predicted.

$$y = ax + b$$

Our problem now is to find the pair $a$, $b$ that makes the prediction as good as possible (the smaller the error, the better).

Mean squared error: the $a$ and $b$ that minimize the MSE give the best fit.

$$MSE = \frac{1}{N} \sum (\hat{y_i} - y_i)^2$$

def get_loss(x, y, a, b):
    """loss function """
    y_hat = a * x + b
    return np.mean((y_hat - y) ** 2)
# Search for the best-fitting a and b by Monte Carlo simulation
import random

best_a, best_b = None, None
min_loss = np.inf

for _ in range(1000):
    # Randomly generate a and b in the range [-100, 100)
    a = random.random() * 200 - 100
    b = random.random() * 200 - 100
    # Compute the loss (MSE)
    curr_loss = get_loss(rooms, prices, a, b)
    # The a and b with the smaller loss fit the data better
    if curr_loss < min_loss:
        min_loss = curr_loss
        best_a, best_b = a, b
print(best_a, best_b)
print(min_loss)
12.414266461017732 -56.48722240398021
50.00741553150247

Gradient descent

The loss function is a convex function. To find the values of a and b that minimize it, we can repeatedly update them as follows:

$$a^\prime = a + (-1) \times \frac{\partial loss(a, b)}{\partial a} \times \Delta$$

$$b^\prime = b + (-1) \times \frac{\partial loss(a, b)}{\partial b} \times \Delta$$

For the MSE loss function, the partial derivatives can be computed with the following formulas:

$$f(a, b) = \frac{1}{N} \sum_{i=1}^{N} (y_i - (ax_i + b))^2$$

$$\frac{\partial f(a, b)}{\partial a} = \frac{2}{N} \sum_{i=1}^{N} (-x_i y_i + x_i^2 a + x_i b)$$

$$\frac{\partial f(a, b)}{\partial b} = \frac{2}{N} \sum_{i=1}^{N} (-y_i + x_i a + b)$$

# Find the partial derivative of a
def partial_a(x, y, a, b):
    return 2 * np.mean((y - a * x - b) * (-x))


# Find the partial derivative of b
def partial_b(x, y, a, b):
    return 2 * np.mean(-y + a * x + b)
# Approach the minimum by gradient descent
# This method finds a well-fitting a and b much faster than random search
# The initial values of a and b can be arbitrary, but the step size delta must be small enough
a, b = 35, -35
delta = 0.01

for _ in range(100):
    a = a - partial_a(rooms, prices, a, b) * delta
    b = b - partial_b(rooms, prices, a, b) * delta
print(a, b)
print(get_loss(rooms, prices, a, b))
9.276809660789766 -35.781905844032686
43.61576735104159
# Forecast house prices through linear regression equation
def predict_price_by_regression(a, b, x):
    return a * x + b
# Forecast house price
print(np.round(predict_price_by_regression(best_a, best_b, 6.25), 2))
print(np.round(predict_price_by_regression(a, b, 6.25), 2))
21.1
22.2
print(np.round(predict_price_by_regression(best_a, best_b, 5.12), 2))
print(np.round(predict_price_by_regression(a, b, 5.12), 2))
7.07
11.72
# Compare the two fitting curves
y_hat1 = best_a * rooms + best_b
y_hat2 = a * rooms + b
plt.scatter(rooms, prices)
plt.plot(rooms, y_hat1, color='red', linewidth=4)
plt.plot(rooms, y_hat2, color='green', linewidth=4)
plt.show()


The least squares solution uses the available historical data (the values of x and y) to find the a and b that best fit that data.

$$y = ax + b$$

For the equation above, we can treat $a$ and $b$ as the unknowns: $x$ is the coefficient of $a$ and $1$ is the coefficient of $b$, as the matrix form below shows.
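Written out for all $N$ data points, the system to be solved in the least squares sense is:

$$\begin{bmatrix} x_1 & 1 \\ x_2 & 1 \\ \vdots & \vdots \\ x_N & 1 \end{bmatrix} \begin{bmatrix} a \\ b \end{bmatrix} = \begin{bmatrix} y_1 \\ y_2 \\ \vdots \\ y_N \end{bmatrix}$$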

Description of lstsq function parameters

The first parameter of the lstsq function is $\begin{bmatrix} x \\ 1 \end{bmatrix}^T$, an $N \times 2$ matrix whose rows are $[x_i, 1]$, and the second parameter is $y$. The rcond parameter is ignored for now and set directly to None.

# The first argument to the lstsq function
param1 = np.vstack([rooms, np.ones(rooms.size)]).T
param1
array([[6.575, 1.   ],
       [6.421, 1.   ],
       [7.185, 1.   ],
       ...,
       [6.976, 1.   ],
       [6.794, 1.   ],
       [6.03 , 1.   ]])
# The second argument to the lstsq function
param2 = prices
param2
array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
       18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
       15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
       13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
       21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
       35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,
       19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,
       20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,
       23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,
       33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,
       21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,
       20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,
       23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,
       15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21.5, 19.6, 15.3, 19.4,
       17. , 15.6, 13.1, 41.3, 24.3, 23.3, 27. , 50. , 50. , 50. , 22.7,
       25. , 50. , 23.8, 23.8, 22.3, 17.4, 19.1, 23.1, 23.6, 22.6, 29.4,
       23.2, 24.6, 29.9, 37.2, 39.8, 36.2, 37.9, 32.5, 26.4, 29.6, 50. ,
       32. , 29.8, 34.9, 37. , 30.5, 36.4, 31.1, 29.1, 50. , 33.3, 30.3,
       34.6, 34.9, 32.9, 24.1, 42.3, 48.5, 50. , 22.6, 24.4, 22.5, 24.4,
       20. , 21.7, 19.3, 22.4, 28.1, 23.7, 25. , 23.3, 28.7, 21.5, 23. ,
       26.7, 21.7, 27.5, 30.1, 44.8, 50. , 37.6, 31.6, 46.7, 31.5, 24.3,
       31.7, 41.7, 48.3, 29. , 24. , 25.1, 31.5, 23.7, 23.3, 22. , 20.1,
       22.2, 23.7, 17.6, 18.5, 24.3, 20.5, 24.5, 26.2, 24.4, 24.8, 29.6,
       42.8, 21.9, 20.9, 44. , 50. , 36. , 30.1, 33.8, 43.1, 48.8, 31. ,
       36.5, 22.8, 30.7, 50. , 43.5, 20.7, 21.1, 25.2, 24.4, 35.2, 32.4,
       32. , 33.2, 33.1, 29.1, 35.1, 45.4, 35.4, 46. , 50. , 32.2, 22. ,
       20.1, 23.2, 22.3, 24.8, 28.5, 37.3, 27.9, 23.9, 21.7, 28.6, 27.1,
       20.3, 22.5, 29. , 24.8, 22. , 26.4, 33.1, 36.1, 28.4, 33.4, 28.2,
       22.8, 20.3, 16.1, 22.1, 19.4, 21.6, 23.8, 16.2, 17.8, 19.8, 23.1,
       21. , 23.8, 23.1, 20.4, 18.5, 25. , 24.6, 23. , 22.2, 19.3, 22.6,
       19.8, 17.1, 19.4, 22.2, 20.7, 21.1, 19.5, 18.5, 20.6, 19. , 18.7,
       32.7, 16.5, 23.9, 31.2, 17.5, 17.2, 23.1, 24.5, 26.6, 22.9, 24.1,
       18.6, 30.1, 18.2, 20.6, 17.8, 21.7, 22.7, 22.6, 25. , 19.9, 20.8,
       16.8, 21.9, 27.5, 21.9, 23.1, 50. , 50. , 50. , 50. , 50. , 13.8,
       13.8, 15. , 13.9, 13.3, 13.1, 10.2, 10.4, 10.9, 11.3, 12.3,  8.8,
        7.2, 10.5,  7.4, 10.2, 11.5, 15.1, 23.2,  9.7, 13.8, 12.7, 13.1,
       12.5,  8.5,  5. ,  6.3,  5.6,  7.2, 12.1,  8.3,  8.5,  5. , 11.9,
       27.9, 17.2, 27.5, 15. , 17.2, 17.9, 16.3,  7. ,  7.2,  7.5, 10.4,
        8.8,  8.4, 16.7, 14.2, 20.8, 13.4, 11.7,  8.3, 10.2, 10.9, 11. ,
        9.5, 14.5, 14.1, 16.1, 14.3, 11.7, 13.4,  9.6,  8.7,  8.4, 12.8,
       10.5, 17.1, 18.4, 15.4, 10.8, 11.8, 14.9, 12.6, 14.1, 13. , 13.4,
       15.2, 16.1, 17.8, 14.9, 14.1, 12.7, 13.5, 14.9, 20. , 16.4, 17.7,
       19.5, 20.2, 21.4, 19.9, 19. , 19.1, 19.1, 20.1, 19.9, 19.6, 23.2,
       29.8, 13.8, 13.3, 16.7, 12. , 14.6, 21.4, 23. , 23.7, 25. , 21.8,
       20.6, 21.2, 19.1, 20.6, 15.2,  7. ,  8.1, 13.6, 20.1, 21.8, 24.5,
       23.1, 19.7, 18.3, 21.2, 17.5, 16.8, 22.4, 20.6, 23.9, 22. , 11.9])

Description of the lstsq function's return value

The lstsq function returns a 4-tuple: the first element holds the fitted coefficients, the second is the sum of squared residuals, the third is the rank of the coefficient matrix, and the fourth contains its singular values.

# The rcond parameter is set directly to None (not explained here)
result = np.linalg.lstsq(param1, param2, rcond=None)
result
(array([  9.10210898, -34.67062078]),
 array([22061.87919621]),
 2,
 array([143.99484122,   2.46656609]))
a, b = result[0]
mse = result[1][0] / rooms.size
print(a, b)
print(mse)
9.102108981180313 -34.67062077643857
43.600551771169584
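As a cross-check (my own sketch, not part of the original notebook), the same coefficients drop out of the normal equations $\theta = (X^T X)^{-1} X^T y$:

X, y = param1, param2
theta = np.linalg.inv(X.T @ X) @ X.T @ y
theta  # approximately array([9.10210898, -34.67062078]), matching lstsq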
# Compare the two fitting curves
plt.scatter(rooms, prices)
# House prices predicted by a and b given by gradient descent method
plt.plot(rooms, y_hat2, color='red', linewidth=4)
# House prices predicted by a and b given by lstsq function
y_hat3 = a * rooms + b
plt.plot(rooms, y_hat3, color='green', linewidth=4)
plt.show()

