k-nearest neighbours (kNN): the simplest algorithm in machine learning
There is no training step: new data is simply compared against a known, labelled data set.
Principle: the k known points closest to the new data point vote to decide which category the new point belongs to.
Example 1: classifying movie genres
# Example script: classify one new movie with k-nearest neighbours.
# Each known movie is described by two numeric features (fighting scenes,
# kissing scenes) and a genre label; the new movie gets the genre that is
# most common among its k closest known movies.

# 1. Build data set
import pandas as pd

rowdata = {
    'Movie title': ['There is no question.', 'Later we', 'Former 3',
                    'Red Sea Action', 'Investigation of Chinatown', 'Wolf 2'],
    'Fighting scenes': [1, 5, 12, 108, 112, 115],
    'Kissing lens': [101, 89, 97, 5, 9, 8],
    'Movie genre': ['Affectional film', 'Affectional film', 'Affectional film',
                    'Action movie', 'Action movie', 'Action movie'],
}
movie_data = pd.DataFrame(rowdata)

# 2. Calculate the distance
# Euclidean distance from the new sample to every known movie.  All rows are
# used (no hard-coded row count), so rows can be added to `rowdata` freely.
# Columns 1 and 2 are the two numeric features.
new_data = [24, 67]
dist = list((((movie_data.iloc[:, 1:3] - new_data) ** 2).sum(1)) ** 0.5)

# Arrange the distance in ascending order, then select the k points with the
# smallest distance.  Column 3 holds the genre label of each known movie.
dist_l = pd.DataFrame({'dist': dist, 'lables': movie_data.iloc[:, 3]})
k = 4
dr = dist_l.sort_values(by='dist')[:k]

# Determine the occurrence frequency of the category of the first k points.
# value_counts() orders labels by descending count, so index[0] is the winner.
re = dr.loc[:, 'lables'].value_counts()

# Output result
result = [re.index[0]]
print(result)
Example 2: improving match results on a dating website
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 13 18:07:01 2020

@author: fypc
"""
# 1. Prepare data
import pandas as pd

# The file is read without a header row; the code below uses columns 0-2 as
# numeric features and the last column as the class label.
datingTest = pd.read_table('datingTestSet.txt', header=None)
# Optional sanity checks:
#   print(datingTest.head())   # check the first few lines
#   print(datingTest.shape)    # viewing scale
#   (view column data types)

# 2. Analysis data
import matplotlib as mpl
import matplotlib.pyplot as plt

# One colour per sample, chosen from its label (last column).  Any label other
# than 'didntLike' / 'smallDoses' is drawn in red.
_label_colors = {'didntLike': 'black', 'smallDoses': 'orange'}
Colors = [_label_colors.get(label, 'red') for label in datingTest.iloc[:, -1]]

# Draw a scatter diagram between two features
plt.rcParams['font.sans-serif'] = ['Simhei']  # Select the SimHei typeface for sans-serif text
pl = plt.figure(figsize=(12, 8))

# Using the concept of canvas and sub canvas: three panels of a 2x2 grid, one
# per pair of features.
fig1 = pl.add_subplot(221)
plt.scatter(datingTest.iloc[:, 1], datingTest.iloc[:, 2], marker='.', c=Colors)
plt.xlabel('Time spent playing video games')
plt.ylabel('Litres of ice cream consumed per week')

fig2 = pl.add_subplot(222)
plt.scatter(datingTest.iloc[:, 0], datingTest.iloc[:, 1], marker='.', c=Colors)
plt.xlabel('Frequent flyer mileage per year')
plt.ylabel('Time spent playing video games')

fig3 = pl.add_subplot(223)
plt.scatter(datingTest.iloc[:, 0], datingTest.iloc[:, 2], marker='.', c=Colors)
plt.xlabel('Frequent flyer mileage per year')
plt.ylabel('Litres of ice cream consumed per week')


# 3. Data normalization: make each feature have the same weight on distance
# Use 0-1 standardization here
def minmax(dataSet):
    """Rescale every column of *dataSet* to the range [0, 1].

    Each value becomes ``(value - column_min) / (column_max - column_min)``.

    Args:
        dataSet: numeric DataFrame (or Series) to normalise.

    Returns:
        A new object of the same shape holding the rescaled values.  A column
        whose values are all equal has a zero range and comes out as NaN.
    """
    minDf = dataSet.min()
    maxDf = dataSet.max()
    normSet = (dataSet - minDf) / (maxDf - minDf)
    return normSet


# Normalise the three feature columns and re-attach the untouched label column.
datingT = pd.concat([minmax(datingTest.iloc[:, :3]), datingTest.iloc[:, 3]], axis=1)


# 4. Divide training set and test set
def randSplit(dataSet, rate=0.9):
    """Split *dataSet* into a training part and a test part by row order.

    NOTE: despite the name, no shuffling is done.  The first ``int(n * rate)``
    rows become the training set and the remaining rows the test set, so the
    input should already be in random order.

    Args:
        dataSet: DataFrame to split.
        rate: fraction of rows (from the top) assigned to the training set.

    Returns:
        ``(train, test)``.  ``test`` is re-indexed from 0 and is an independent
        frame, so columns can be added to it without pandas' chained-assignment
        warning.
    """
    n = dataSet.shape[0]
    m = int(n * rate)
    train = dataSet.iloc[:m, :]
    # Reset index line number; reset_index returns a new frame rather than
    # relabelling a slice of dataSet in place.
    test = dataSet.iloc[m:, :].reset_index(drop=True)
    return train, test


train, test = randSplit(datingT)

# 5.
# Build classifier
def datingClass(train, test, k):
    """Label every row of *test* by k-nearest-neighbour vote and print accuracy.

    The last column of *train* is used as the label and all columns before it
    as features; the same leading columns of *test* are used as its features.
    For each test row the Euclidean distance to every training row is
    computed, the k closest training rows are kept, and the label that occurs
    most often among them becomes the prediction.

    Args:
        train: DataFrame of feature columns followed by one label column.
        test: DataFrame with the same leading feature columns; the column just
            after them is assumed to hold the true label, because it is the
            one compared with the predictions.
        k: number of nearest neighbours that take part in the vote.

    Returns:
        *test* itself (modified in place) with a 'predict' column holding the
        predicted label of each row.

    Side effects:
        Prints the accuracy, i.e. the fraction of rows whose last column
        ('predict') equals the column immediately before it.
    """
    n = train.shape[1] - 1
    # Loop-invariant slices of the training data.
    train_features = train.iloc[:, :n]
    train_labels = train.iloc[:, n]

    predictions = []
    for i in range(test.shape[0]):
        sample = test.iloc[i, :n]
        dist = (((train_features - sample) ** 2).sum(1)) ** 0.5
        dist_l = pd.DataFrame({'dist': dist, 'labels': train_labels})
        nearest = dist_l.sort_values(by='dist')[:k]
        # value_counts() orders labels by descending count: index[0] wins.
        votes = nearest.loc[:, 'labels'].value_counts()
        predictions.append(votes.index[0])

    # Assign positionally (plain list) so the result does not depend on
    # test's index labels lining up with 0..m-1.
    test['predict'] = predictions
    acc = (test.iloc[:, -1] == test.iloc[:, -2]).mean()
    print(f'The prediction accuracy of the model is {acc}')
    return test


# Verification model
datingClass(train, test, 5)