Machine learning KNN algorithm

KNN is one of the simplest algorithms in machine learning.

There is no training phase: new samples are simply compared against a labeled data set.

Principle: the k points closest to the new data vote to decide which category the new data belongs to.

Example 1: classification of film types

#1. Build the data set
import pandas as pd

# Labelled samples: two numeric features (fighting/kissing scene counts)
# plus the known genre used as the class label.
rowdata={'Movie title':['There is no question.','Later we','Former 3','Red Sea Action','Investigation of Chinatown','Wolf 2'],
         'Fighting scenes':[1,5,12,108,112,115],
         'Kissing lens':[101,89,97,5,9,8],
         'Movie genre':['Affectional film','Affectional film','Affectional film','Action movie','Action movie','Action movie']}
movie_data=pd.DataFrame(rowdata)

#2. Euclidean distance from the new film to every known film.
new_data=[24,67]
# Use every row (the original hard-coded a [:6] slice) and name the axis
# explicitly so the row-wise sum is unambiguous.
dist=list((((movie_data.iloc[:,1:3]-new_data)**2).sum(axis=1))**0.5)

#Sort distances ascending, then keep the k nearest neighbours.
# Fix: 'lables' -> 'labels' (typo), matching the spelling used by the
# dating-site classifier later in the article.
dist_l=pd.DataFrame({'dist':dist,'labels':movie_data.iloc[:,3]})
k=4
dr=dist_l.sort_values(by='dist')[:k]

#Count how often each genre occurs among the k nearest neighbours.
re=dr.loc[:,'labels'].value_counts()

#Output the majority genre for the new film.
result=[]
result.append(re.index[0])
print(result)

 

Example 2: dating website matching effect

Code:

# -*- coding: utf-8 -*-
"""
Created on Thu Feb 13 18:07:01 2020

@author: fypc
"""

#1. Prepare data: load the raw dating data set (tab-separated, no header row).
import pandas as pd
# NOTE(review): the last column appears to hold the class label
# ('didntLike' / 'smallDoses' / other) — see the colour mapping below —
# and the three leading columns are numeric features; confirm against the
# data file, which is not part of this article.
datingTest=pd.read_table('datingTestSet.txt',header=None)
# Commented-out exploratory checks, kept for reference:
'''
//Check the first few lines
print(datingTest.head())
//Viewing scale
print(datingTest.shape)
//View column data types
datingTest.info()
'''
#2. Analyse the data: draw pairwise feature scatter plots, colouring each
#   sample point by its class label.
import matplotlib as mpl
import matplotlib.pyplot as plt

# Map each known label to a plot colour; any other label is drawn red.
_label_colour = {'didntLike': 'black', 'smallDoses': 'orange'}
Colors = [_label_colour.get(lbl, 'red') for lbl in datingTest.iloc[:, -1]]

# SimHei font so non-ASCII text renders correctly in the figure.
plt.rcParams['font.sans-serif']=['Simhei']
pl=plt.figure(figsize=(12,8))

# One entry per panel: (subplot position, x column, y column, x label, y label).
_panels = [
    (221, 1, 2, 'Time spent playing video games', 'Litres of ice cream consumed per week'),
    (222, 0, 1, 'Frequent flyer mileage per year', 'Time spent playing video games'),
    (223, 0, 2, 'Frequent flyer mileage per year', 'Litres of ice cream consumed per week'),
]
for pos, xcol, ycol, xlab, ylab in _panels:
    pl.add_subplot(pos)
    plt.scatter(datingTest.iloc[:, xcol], datingTest.iloc[:, ycol], marker='.', c=Colors)
    plt.xlabel(xlab)
    plt.ylabel(ylab)

plt.show()

#3. Data normalization: make each feature have the same weight on distance
#Use 0-1 standardization here
def minmax(dataSet):
    """Scale every column of *dataSet* to the [0, 1] range.

    Each value becomes (x - column min) / (column max - column min), so all
    features carry equal weight in a distance computation.
    """
    col_min = dataSet.min()
    col_span = dataSet.max() - col_min
    return (dataSet - col_min) / col_span

# Normalise the three numeric feature columns, then re-attach the label column.
datingT=pd.concat([minmax(datingTest.iloc[:,:3]),datingTest.iloc[:,3]],axis=1)

#4. Divide training set and test set
def randSplit(dataSet,rate=0.9):
    """Split *dataSet* into a training set (first *rate* share of rows) and
    a test set (the remaining rows).

    NOTE: despite the name, the split is not randomised — rows are taken in
    order, so the data should already be shuffled.

    Returns (train, test); the test frame's index is reset to 0..len-1.
    """
    n=dataSet.shape[0]
    m=int(n*rate)
    train=dataSet.iloc[:m,:]
    # Fix: copy the slice before re-indexing it, so we never mutate a view
    # of the caller's DataFrame (pandas view-vs-copy ambiguity on slices).
    test=dataSet.iloc[m:,:].copy()
    test.index=range(test.shape[0]) #Reset index line number
    return train,test

# Hold out the last 10% of the normalised rows as the test set.
train,test=randSplit(datingT)

#5. Build classifier
def datingClass(train,test,k):
    """k-nearest-neighbour classify every row of *test* against *train*.

    Both frames must hold the numeric feature columns first and the class
    label in the last column. Adds a 'predict' column to *test* (mutated in
    place), prints the accuracy, and returns *test*.
    """
    n=train.shape[1]-1          # number of feature columns
    m=test.shape[0]
    result=[]
    for i in range(m):
        # Euclidean distance from test row i to every training row.
        dist=list((((train.iloc[:,:n]-test.iloc[i,:n])**2).sum(axis=1))**0.5)
        dist_l=pd.DataFrame({'dist':dist,'labels':(train.iloc[:,n])})
        dr=dist_l.sort_values(by='dist')[:k]
        re=dr.loc[:,'labels'].value_counts()
        result.append(re.index[0])   # majority label among the k nearest
    # Fix: build the Series on test's own index. The original used a default
    # RangeIndex, which silently misaligns (NaN predictions) whenever test's
    # index is not exactly 0..m-1.
    test['predict']=pd.Series(result,index=test.index)
    # 'predict' is now the last column, the true label the second-to-last.
    acc=(test.iloc[:,-1]==test.iloc[:,-2]).mean()
    # Fix: missing space in the original format string ("is{acc}").
    print(f'The prediction accuracy of the model is {acc}')
    return test

#Verify the model: classify the held-out test rows with k = 5 neighbours.
datingClass(train,test,5)

 

50 original articles published, 40 praised, 10000 visitors+
Private letter follow

Posted on Thu, 13 Feb 2020 09:26:46 -0500 by tibiz