Machine learning: the KNN algorithm


KNN (k-nearest neighbours) is arguably the simplest algorithm in machine learning.

There is no training step: a new sample is simply compared against the already-labelled data set.

Principle: the k points closest to the new sample vote on which category the new sample belongs to.
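Both examples below measure "closest" with the ordinary Euclidean distance between feature vectors, which is exactly what the code computes; written out for two points x and y with n features:

d(x, y) = sqrt((x1 - y1)^2 + (x2 - y2)^2 + ... + (xn - yn)^2)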

Example 1: classifying movie genres

#1. Build the data set
import pandas as pd

rowdata = {'Movie title': ['There is no question.', 'Later we', 'Former 3',
                           'Red Sea Action', 'Investigation of Chinatown', 'Wolf 2'],
           'Fighting scenes': [1, 5, 12, 108, 112, 115],
           'Kissing scenes': [101, 89, 97, 5, 9, 8],
           'Movie genre': ['Romance', 'Romance', 'Romance',
                           'Action', 'Action', 'Action']}
movie_data = pd.DataFrame(rowdata)

#2. Calculate the distance from the new point to every known point
new_data = [24, 67]
dist = list((((movie_data.iloc[:6, 1:3] - new_data) ** 2).sum(1)) ** 0.5)

#3. Arrange the distances in ascending order, then select the k points with the smallest distance
dist_l = pd.DataFrame({'dist': dist, 'labels': movie_data.iloc[:6, 3]})
k = 4
dr = dist_l.sort_values(by='dist')[:k]

#4. Determine how often each category occurs among the first k points
re = dr.loc[:, 'labels'].value_counts()

#5. Output the result: the most frequent label wins the vote
result = []
result.append(re.index[0])
print(result)
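These five steps generalize to any labelled table, so they can be wrapped into a reusable function. A minimal sketch, not from the original post (the name knn_classify and its signature are mine):

import pandas as pd

def knn_classify(new_point, data, feature_cols, label_col, k):
    #Euclidean distance from new_point to every labelled point
    dist = (((data[feature_cols] - new_point) ** 2).sum(1)) ** 0.5
    #Keep the k nearest points; their most frequent label wins the vote
    nearest = dist.sort_values()[:k].index
    return data.loc[nearest, label_col].value_counts().index[0]

#Same query as the step-by-step version above
print(knn_classify([24, 67], movie_data,
                   ['Fighting scenes', 'Kissing scenes'], 'Movie genre', k=4))

On the data above, the four nearest neighbours of [24, 67] are the three romance films plus Red Sea Action, so the vote is 3 to 1 and the new film is classified as Romance.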

Example 2: predicting dating website matches

Code:

# -*- coding: utf-8 -*-
"""
Created on Thu Feb 13 18:07:01 2020
@author: fypc
"""

#1. Prepare the data
import pandas as pd

datingTest = pd.read_table('datingTestSet.txt', header=None)
'''
#Check the first few lines
print(datingTest.head())
#Check the shape
print(datingTest.shape)
#Check column data types
datingTest.info()
'''

#2. Analyze the data
import matplotlib.pyplot as plt

Colors = []
for i in range(datingTest.shape[0]):
    m = datingTest.iloc[i, -1]
    if m == 'didntLike':
        Colors.append('black')
    elif m == 'smallDoses':
        Colors.append('orange')
    else:
        Colors.append('red')

#Draw scatter plots for each pair of features
plt.rcParams['font.sans-serif'] = ['SimHei']  #Use the SimHei font
pl = plt.figure(figsize=(12, 8))  #One canvas holding several sub-plots
fig1 = pl.add_subplot(221)
plt.scatter(datingTest.iloc[:, 1], datingTest.iloc[:, 2], marker='.', c=Colors)
plt.xlabel('Time spent playing video games')
plt.ylabel('Litres of ice cream consumed per week')
fig2 = pl.add_subplot(222)
plt.scatter(datingTest.iloc[:, 0], datingTest.iloc[:, 1], marker='.', c=Colors)
plt.xlabel('Frequent flyer mileage per year')
plt.ylabel('Time spent playing video games')
fig3 = pl.add_subplot(223)
plt.scatter(datingTest.iloc[:, 0], datingTest.iloc[:, 2], marker='.', c=Colors)
plt.xlabel('Frequent flyer mileage per year')
plt.ylabel('Litres of ice cream consumed per week')
plt.show()

#3. Normalize the data so that each feature carries the same weight in the distance
#0-1 normalization: x' = (x - min) / (max - min)
def minmax(dataSet):
    minDf = dataSet.min()
    maxDf = dataSet.max()
    normSet = (dataSet - minDf) / (maxDf - minDf)
    return normSet

datingT = pd.concat([minmax(datingTest.iloc[:, :3]), datingTest.iloc[:, 3]], axis=1)

#4. Split into training set and test set (first 90% train, the rest test)
def randSplit(dataSet, rate=0.9):
    n = dataSet.shape[0]
    m = int(n * rate)
    train = dataSet.iloc[:m, :]
    test = dataSet.iloc[m:, :].copy()  #Copy to avoid SettingWithCopyWarning below
    test.index = range(test.shape[0])  #Reset the index
    return train, test

train, test = randSplit(datingT)

#5. Build the classifier
def datingClass(train, test, k):
    n = train.shape[1] - 1
    m = test.shape[0]
    result = []
    for i in range(m):
        #Euclidean distance from the i-th test point to every training point
        dist = list((((train.iloc[:, :n] - test.iloc[i, :n]) ** 2).sum(1)) ** 0.5)
        dist_l = pd.DataFrame({'dist': dist, 'labels': train.iloc[:, n]})
        dr = dist_l.sort_values(by='dist')[:k]
        re = dr.loc[:, 'labels'].value_counts()
        result.append(re.index[0])
    result = pd.Series(result)
    test['predict'] = result
    acc = (test.iloc[:, -1] == test.iloc[:, -2]).mean()
    print(f'The prediction accuracy of the model is {acc}')
    return test

#6. Validate the model
datingClass(train, test, 5)
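For comparison, the same pipeline can be reproduced in a few lines with scikit-learn. This is an addition of mine, not part of the original post; shuffle=False mimics the sequential 90/10 split of randSplit:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler

datingTest = pd.read_table('datingTestSet.txt', header=None)
X = MinMaxScaler().fit_transform(datingTest.iloc[:, :3])  #0-1 normalization
y = datingTest.iloc[:, 3]

#Sequential 90/10 split, like randSplit above
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, shuffle=False)

clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(X_train, y_train)
print(f'The prediction accuracy of the model is {clf.score(X_test, y_test)}')

In real use the split should be shuffled, and the normalization fitted on the training set only, to avoid leaking test information; the sequential split is kept here only to match the hand-rolled version.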
