# Machine learning KNN algorithm

KNN (k-nearest neighbours) is one of the simplest algorithms in machine learning.

There is no training phase: a new sample is simply compared against a data set whose labels are already known.

Principle: the k known points closest to the new sample vote, and the category with the most votes is assigned to the new sample.

Example 1: classification of film types

```python
#1. Build data set
import pandas as pd

# One row per film: two numeric features plus the genre used as the label.
rowdata = {
    'Movie title': ['There is no question.', 'Later we', 'Former 3',
                    'Red Sea Action', 'Investigation of Chinatown', 'Wolf 2'],
    'Fighting scenes': [1, 5, 12, 108, 112, 115],
    'Kissing lens': [101, 89, 97, 5, 9, 8],
    'Movie genre': ['Affectional film', 'Affectional film', 'Affectional film',
                    'Action movie', 'Action movie', 'Action movie'],
}
movie_data = pd.DataFrame(rowdata)

# 2. Euclidean distance from the new film to each of the six known films,
#    computed over the two feature columns only.
new_data = [24, 67]
feature_gap = movie_data.iloc[:6, 1:3] - new_data
dist = list((feature_gap ** 2).sum(1) ** 0.5)

# Pair every distance with its genre, sort ascending and keep the k nearest.
dist_l = pd.DataFrame({'dist': dist, 'lables': movie_data.iloc[:6, 3]})
k = 4
nearest = dist_l.sort_values(by='dist').head(k)

# Count how often each genre occurs among the k nearest films.
genre_votes = nearest['lables'].value_counts()

# The most frequent genre is the prediction.
result = [genre_votes.index[0]]
print(result)

```

Example 2: dating website matching effect

Code:

```python
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 13 18:07:01 2020

@author: fypc
"""

#1. Prepare data
import pandas as pd
# NOTE(review): `datingTest` is used by every later step but is never assigned
# anywhere in this listing -- the data-loading statement appears to be missing
# and has to be restored before the script can run.
# The triple-quoted string below is a bare string literal (it executes nothing);
# it only records the exploration commands that were run on the loaded frame.
'''
//Check the first few lines
//Viewing scale
print(datingTest.shape)
//View column data types
datingTest.info()
'''
#2. Analysis data
import matplotlib as mpl
import matplotlib.pyplot as plt

# Colour every sample by its label (last column of datingTest). Any label other
# than the two listed here is drawn in red.
label_colors = {'didntLike': 'black', 'smallDoses': 'orange'}
Colors = [label_colors.get(m, 'red') for m in datingTest.iloc[:, -1]]

#Draw a scatter diagram between two features
# NOTE(review): SimHei is a CJK typeface, presumably selected so that Chinese
# axis labels render; it does not make the text bold.
plt.rcParams['font.sans-serif'] = ['Simhei']
pl = plt.figure(figsize=(12, 8))

#Using the concept of canvas and sub canvas
# Each feature pair gets its own sub-plot. Drawing all three scatters on the
# same axes would overlay unrelated features and leave only the last pair of
# axis labels visible.
feature_pairs = [
    (1, 2, 'Time spent playing video games',
     'Litres of ice cream consumed per week'),
    (0, 1, 'Frequent flyer mileage per year',
     'Time spent playing video games'),
    (0, 2, 'Frequent flyer mileage per year',
     'Litres of ice cream consumed per week'),
]
for position, (x_col, y_col, x_label, y_label) in enumerate(feature_pairs, start=1):
    ax = pl.add_subplot(2, 2, position)
    ax.scatter(datingTest.iloc[:, x_col], datingTest.iloc[:, y_col],
               marker='.', c=Colors)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)

plt.show()

#3. Data normalization: make each feature have the same weight on distance
#Use 0-1 standardization here
def minmax(dataSet):
    """Rescale *dataSet* to the range [0, 1] (0-1 / min-max standardization).

    For a DataFrame every column is scaled independently with
    ``(x - min) / (max - min)``; for a Series the single min and max are used.

    A constant column (max == min) would divide by zero and turn into NaN,
    which would poison every distance computed from it. Such a column is
    mapped to 0 instead.

    Parameters:
        dataSet: numeric pandas DataFrame or Series.

    Returns:
        A new object of the same shape holding the scaled values; *dataSet*
        itself is not modified.
    """
    minDf = dataSet.min()
    maxDf = dataSet.max()
    spread = maxDf - minDf
    # Swap a zero spread for 1 so that constant data becomes 0 rather than NaN.
    if isinstance(spread, pd.Series):
        spread = spread.replace(0, 1)
    elif spread == 0:
        spread = 1
    normSet = (dataSet - minDf) / spread
    return normSet

# Scale the three feature columns, then put the untouched label column
# (column 3) back next to them.
scaled_features = minmax(datingTest.iloc[:, :3])
label_column = datingTest.iloc[:, 3]
datingT = pd.concat([scaled_features, label_column], axis=1)

#4. Divide training set and test set
def randSplit(dataSet,rate=0.9):
    """Split *dataSet* by row position into a training part and a test part.

    Despite the name, nothing is shuffled: the first ``int(n * rate)`` rows
    become the training set and the remaining rows the test set, so the input
    rows should already be in random order.

    Parameters:
        dataSet: pandas DataFrame to split.
        rate: fraction of the rows that goes to the training set (default 0.9).

    Returns:
        ``(train, test)``. ``train`` keeps its original index; ``test`` is
        re-indexed from 0.
    """
    n = dataSet.shape[0]
    m = int(n * rate)
    train = dataSet.iloc[:m, :]
    # reset_index(drop=True) renumbers the rows from 0 and returns a new frame,
    # so callers can add columns to the test set without touching a slice of
    # the original data.
    test = dataSet.iloc[m:, :].reset_index(drop=True)
    return train, test

# randSplit's default rate of 0.9 applies here.
train, test = randSplit(datingT)

#5. Build classifier
def datingClass(train,test,k):
    """Classify every row of *test* by a k-nearest-neighbour vote over *train*.

    Both frames are expected to hold the feature columns first and the label
    in the last column. For each test row the Euclidean distance to every
    training row is computed, the k closest training rows are selected and the
    most frequent label among them becomes the prediction.

    Parameters:
        train: DataFrame of labelled samples used as the neighbours.
        test: DataFrame with the same column layout as *train*.
        k: number of neighbours that vote.

    Returns:
        A copy of *test* with an extra ``'predict'`` column. The caller's
        frame is left untouched.

    Side effects:
        Prints the fraction of rows whose prediction equals the true label.
    """
    n = train.shape[1] - 1
    # These slices do not change between test rows, so take them once.
    train_features = train.iloc[:, :n]
    train_labels = train.iloc[:, n]
    test_features = test.iloc[:, :n]

    predictions = []
    for i in range(test.shape[0]):
        gap = train_features - test_features.iloc[i]
        dist = list(((gap ** 2).sum(1)) ** 0.5)
        neighbours = pd.DataFrame({'dist': dist, 'labels': train_labels})
        nearest = neighbours.sort_values(by='dist')[:k]
        votes = nearest.loc[:, 'labels'].value_counts()
        predictions.append(votes.index[0])

    # Work on a copy so the caller's frame is not mutated, and assign the list
    # positionally so the predictions line up whatever index *test* carries
    # (a 0-based Series would yield NaN for any other index).
    scored = test.copy()
    scored['predict'] = predictions
    # After the assignment the last column is the prediction and the one
    # before it is the true label.
    acc = (scored.iloc[:, -1] == scored.iloc[:, -2]).mean()
    print(f'The prediction accuracy of the model is{acc}')
    return scored

#Verification model
datingClass(train,test,5)
```

50 original articles published, 40 praised, 10000 visitors+

Posted on Thu, 13 Feb 2020 09:26:46 -0500 by tibiz