Machine Learning in Action -- the naive Bayes method
Naive Bayes is a classification method that applies Bayes' rule to compute the posterior probability of each class from the prior probability, under the assumption that the features are independent of one another, and outputs the class with the maximum posterior probability.
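In symbols (following the book's notation, where w is a document's word vector and ci a class), Bayes' rule gives

    p(ci|w) = p(w|ci) * p(ci) / p(w)

and the "naive" independence assumption factorizes the likelihood as p(w|ci) = p(w1|ci) * p(w2|ci) * ... * p(wn|ci). Since p(w) is the same for every class, the classifier only needs to compare the numerators, which is exactly what classfyNB below does in log space.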
First, I will go over the problems I ran into with the code. For the code itself, please refer to the book Machine Learning in Action.
Question 1:
When running program listing 4-5 on page 66 of Machine Learning in Action, it always reports an error:
UnicodeDecodeError: 'gbk' codec can't decode byte 0xae in position 199: illegal multibyte sequence
Solution:
Open the file 23.txt and you will find that it contains a stray "?" character ("SciFinance?is"); deleting the "?" fixes the error.
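Alternatively, you can leave the file alone and make the read itself tolerant. A minimal sketch, assuming 23.txt is the copy in the ham folder that spamTest below reads from; open()'s encoding and errors parameters are standard Python:

import os

# decode with an explicit codec and drop any undecodable bytes,
# instead of relying on the platform default (gbk on a Chinese Windows)
path = r'E:\machine learning\machinelearninginaction\Ch04\email\ham\23.txt'
with open(path, encoding='utf-8', errors='ignore') as f:
    text = f.read()  # the stray byte is silently skipped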
Question 2:
The line del(trainingSet[randIndex]) reports an error:
TypeError: 'range' object doesn't support item deletion
Main reason: a version difference. The book's code is written for Python 2, where range returns a list; in Python 3, range returns a range object, which does not support item deletion.
Solution: change trainingSet = range(50) to trainingSet = list(range(50)).
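A minimal demonstration of the difference in Python 3:

trainingSet = range(50)        # an immutable range object in Python 3
# del trainingSet[0]           # TypeError: 'range' object doesn't support item deletion
trainingSet = list(range(50))  # materialize it as a real list
del trainingSet[0]             # now deletion works
print(len(trainingSet))        # 49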
Question 3:
When parsing text with the regular expression from the book:
import re

strings = 'Hello python,the best language!'
print(strings)
listOfTokens = re.split(r'\W*', strings)
print(listOfTokens)
Output:
Hello python,the best language!
['', 'H', 'e', 'l', 'l', 'o', '', 'p', 'y', 't', 'h', 'o', 'n', '', 't', 'h', 'e', '', 'b', 'e', 's', 't', '', 'l', 'a', 'n', 'g', 'u', 'a', 'g', 'e', '', '']
Clearly the text is not being split into words correctly.
Solution:
Change listOfTokens = re.split(r'\W*', strings) to listOfTokens = re.split(r'\W', strings), i.e. remove the *. The pattern \W* can match the empty string, so (since Python 3.7) re.split splits at every position between characters; \W matches exactly one non-word character.
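With the fixed pattern the words come out whole; a trailing empty string remains because of the final '!', and using r'\W+' instead would also collapse runs of consecutive separators:

import re

strings = 'Hello python,the best language!'
print(re.split(r'\W', strings))
# ['Hello', 'python', 'the', 'best', 'language', '']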
My own understanding of the code is given in the comments below.
from numpy import *


"""Constructing word vectors from text"""

def loadDataSet():
    """Create a small experimental data set"""
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']
    ]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 for insulting speech, 0 for normal speech
    return postingList, classVec


def creatVocabList(dataSet):
    """Create a list of all the unique words appearing in any document"""
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # set union removes duplicate words
    return list(vocabSet)


def setOfWords2Vec(vocabList, inputSet):  # set-of-words model
    """Input: vocabulary and a document; mark whether each word of the document is in the vocabulary"""
    returnVec = [0] * len(vocabList)  # create a zero vector with the length of vocabList
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word: " + word + " is not in my Vocabulary!")
    return returnVec


def bagOfWords2Vec(vocabList, inputSet):  # bag-of-words model
    """Input: vocabulary and a document; count how many times each word of the document occurs"""
    returnVec = [0] * len(vocabList)  # create a zero vector with the length of vocabList
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
        else:
            print("the word: " + word + " is not in my Vocabulary!")
    return returnVec


"""Calculating probabilities from word vectors"""

def trainNB0(trainMatrix, trainCategory):
    """Input: document matrix, vector of document class labels"""
    numTrainDocs = len(trainMatrix)  # number of documents
    numWords = len(trainMatrix[0])   # number of words in the vocabulary
    pAbusive = sum(trainCategory) / float(numTrainDocs)  # P(1)
    # initialize counts to 1 and denominators to 2 (Laplace smoothing)
    p0Num = ones(numWords)  # per-word occurrence counts over the class-0 documents
    p1Num = ones(numWords)  # per-word occurrence counts over the class-1 documents
    p0Denom = 2.0  # total number of words in the class-0 documents
    p1Denom = 2.0  # total number of words in the class-1 documents
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = log(p1Num / p1Denom)  # log p(w|c1), where w is the word vector of a document
    p0Vect = log(p0Num / p0Denom)  # log p(w|c0)
    return p0Vect, p1Vect, pAbusive


def classfyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)        # log[p(w1|c1)*p(w2|c1)*...*p(wn|c1)*p(1)]
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)  # log[p(w1|c0)*p(w2|c0)*...*p(wn|c0)*p(0)]
    if p1 > p0:
        return 1
    else:
        return 0


def testingNB():
    listOPosts, listClasses = loadDataSet()
    myVocabList = creatVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
    testEntry = ['him', 'to', 'dog']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as:', classfyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as:', classfyNB(thisDoc, p0V, p1V, pAb))


def textParse(bigString):
    """Text parsing: split on non-word characters, keep lowercase tokens longer than 2 characters"""
    import re
    listOfTokens = re.split(r'\W', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]


def spamTest():
    docList = []; classList = []; fullText = []
    for i in range(1, 26):
        # parse a spam text
        wordList = textParse(open(r'E:\machine learning\machinelearninginaction\Ch04\email\spam\%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)  # class label
        # parse a ham text
        wordList = textParse(open(r'E:\machine learning\machinelearninginaction\Ch04\email\ham\%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = creatVocabList(docList)  # vocabulary
    trainingSet = list(range(50))  # indices of the training documents
    testSet = []                   # indices of the test documents
    for i in range(10):  # randomly select 10 texts as the test set
        randIndex = int(random.uniform(0, len(trainingSet)))  # pick a text at random
        testSet.append(trainingSet[randIndex])  # add it to the test set
        del(trainingSet[randIndex])             # and remove it from the training set
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))  # training vectors
        trainClasses.append(classList[docIndex])                       # training labels
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))   # compute the probabilities
    errorCount = 0
    for docIndex in testSet:  # test
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classfyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print('classification error:', docList[docIndex])
    print('the error rate is:', float(errorCount) / len(testSet))


if __name__ == "__main__":
    spamTest()
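Two details in trainNB0 and classfyNB deserve a comment. Initializing the counts to ones(numWords) and the denominators to 2.0 is Laplace smoothing: a word never seen in one class then gets a small nonzero probability instead of zeroing out the whole product p(w1|ci)*...*p(wn|ci). And taking logs turns that product into a sum, which avoids floating-point underflow when many small factors are multiplied. A minimal sketch of the underflow problem:

from numpy import array, log, prod

probs = array([0.01] * 200)  # 200 per-word likelihoods of 0.01 each
print(prod(probs))           # 0.0 -- the raw product underflows to zero
print(sum(log(probs)))       # about -921.03 -- the log-sum stays finite and comparable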
Output:
classification error: ['home', 'based', 'business', 'opportunity', 'knocking', 'your', 'door', 'don', 'rude', 'and', 'let', 'this', 'chance', 'you', 'can', 'earn', 'great', 'income', 'and', 'find', 'your', 'financial', 'life', 'transformed', 'learn', 'more', 'here', 'your', 'success', 'work', 'from', 'home', 'finder', 'experts']
the error rate is: 0.1