# Machine learning practice -- naive Bayes method (code explanation)

## Machine learning practice -- naive Bayes method

Naive Bayes is a classification method that applies Bayes' rule under the assumption that the features are mutually independent: it computes the posterior probability of each class from the prior probabilities and outputs the class with the maximum posterior probability.

First, I will describe the problems I ran into with the code below. For the full code, please refer to the book *Machine Learning in Action*.

Question 1:
When running program listing 4-5 in machine learning practice P66, it always reports an error:
UnicodeDecodeError: 'gbk' codec can't decode byte 0xae in position 199: illegal multib
Solution:
Open the file 23.txt and you will find it contains a stray "?" character (in "SciFinance?is"). Deleting the "?" fixes the error.
Question two:
del(trainingSet[randIndex]) this code reports an error
Error 'range' object doesn't support item deletion
Main reason: version differences. The book targets Python 2, where range returns a list; in Python 3 range returns a range object, which does not support item deletion.
Solution: change trainingSet = range(50) to trainingSet = list(range(50))

Question three:
When parsing text

```strings = 'Hello python,the best language!'
print(strings)
import re
listOfTokens = re.split(r'\W*',strings)
print(listOfTokens)
```

Output result

```Hello python,the best language!
['', 'H', 'e', 'l', 'l', 'o', '', 'p', 'y', 't', 'h', 'o', 'n', '', 't', 'h', 'e', '', 'b', 'e', 's', 't', '', 'l', 'a', 'n', 'g', 'u', 'a', 'g', 'e', '', '']
```

Visible file parsing is not correct.
Solution:
Change listOfTokens = re.split(r'\W*', strings) to listOfTokens = re.split(r'\W', strings) — that is, remove the *.

The code gives its own understanding in the following comments

```from numpy import *
"""Constructing word vector from text"""
def loadDataSet():
"""Create an experiment sample"""
postingList=[
['my','dog','has','flea','probiems','help','please'],
['maybe','not','take','him','to','dog','park','stupid'],
['my','dalmation','is','so','cute','I','love','him'],
['stop','posting','stupid','worthless','garbage'],
['mr','licke','ate','my','steak','how','to','stop','him'],
['quit','buying','worthless','dog','food','atupid']
]
classVec = [0,1,0,1,0,1] #1 for insulting words, 0 for normal speech
return postingList,classVec

def creatVocabList(dataSet):
"""Create a list of all documents without duplicate words"""
vocabSet = set([])
for document in dataSet:
vocabSet = vocabSet | set(document) #set() removes duplicate words from the list
return list(vocabSet)

def setOfWords2Vec(vocabList,inputSet):
#Word set model
"""Enter as glossary and document, check if the words in the document are in the glossary"""
returnVet = [0]*len(vocabList) #Create a 0 vector with a length of vocabList
for word in inputSet:
if word in vocabList:
returnVet[vocabList.index(word)] = 1
else:
print("the word: " + word + " is not in my Vocabulary!")
return returnVet

def bagOfWords2Vec(vocabList,inputSet):
#Word bag model
"""Enter as glossary and document and check the number of times words in the document appear in the glossary"""
returnVet = [0]*len(vocabList) #Create a 0 vector with a length of vocabList
for word in inputSet:
if word in vocabList:
returnVet[vocabList.index(word)] += 1
else:
print("the word: " + word + " is not in my Vocabulary!")
return returnVet

"""Calculating probability from word vector"""
def trainNB0(trainMatrix,trainCategory):
"""Input: document matrix, document category label vector"""
numTrainDocs = len(trainMatrix) #Number of documents
numWords = len(trainMatrix[0]) #Number of words in the document
pAbusive = sum(trainCategory)/float(numTrainDocs) # P(1)
#Initialization probability
p0Nnm = ones(numWords) #Vector of the number of times each word appears in the vocabulary in the document with category 0
p1Num = ones(numWords) #Files of category 1
p0Denom = 2.0 #Total number of words in the vocabulary for documents with category 0
p1Denom = 2.0 #The total number of words in the glossary for documents with category 1
for i in range(numTrainDocs):
if trainCategory[i] == 1:
p1Num += trainMatrix[i]
p1Denom += sum(trainMatrix[i])
else:
p0Nnm += trainMatrix[i]
p0Denom += sum(trainMatrix[i])
p1Vect = log(p1Num/p1Denom) #p(w|c1) where W is the word vector of each document
p0Vect = log(p0Nnm/p0Denom) #p(w|c0)
return p0Vect,p1Vect,pAbusive

def classfyNB(vec2Classify,p0Vec,p1Vec,pClass1):
p1 = sum(vec2Classify * p1Vec) + log(pClass1)
#p(w1|c1)*p(w2|c1)*...*p(wn|c1)*p(1)
p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
#p(w1|c1)*p(w2|c1)*...*p(wn|c1)*p(0)
if p1 > p0:
return 1
else:
return 0

def testingNB():
listOPosts,listClasses = loadDataSet()
myVocabList = creatVocabList(listOPosts)
trainMat = []
for postinDoc in listOPosts:
trainMat.append(setOfWords2Vec(myVocabList,postinDoc))
p0V,p1V,pAb = trainNB0(array(trainMat),array(listClasses))
testEntry = ['him','to','dog']
thisDoc = array(setOfWords2Vec(myVocabList,testEntry))
print(testEntry,'classfied as:',classfyNB(thisDoc,p0V,p1V,pAb))
testEntry = ['stupid','garbage']
thisDoc = array(setOfWords2Vec(myVocabList,testEntry))
print(testEntry,'classfied as:',classfyNB(thisDoc,p0V,p1V,pAb))

def textParse(bigString):
"""Text parsing"""
import re
listOfTokens = re.split(r'\W',bigString)
return [tok.lower() for tok in listOfTokens if len(tok) > 2]

def spamTest():
docList = []; classList = []; fullText = []
for i in range(1,26): #Parsed text
wordList = textParse(open(r'E:\machine learning\machinelearninginaction\Ch04\email\spam\%d.txt' % i).read())
docList.append(wordList)
fullText.extend(wordList)
classList.append(1) # Classified labels
wordList = textParse(open(r'E:\machine learning\machinelearninginaction\Ch04\email\ham\%d.txt' % i).read())
docList.append(wordList)
fullText.extend(wordList)
classList.append(0)
vocabList = creatVocabList(docList) #Glossary
trainingSet = list(range(50)) #Training dataset coordinates
testSet = [] #Test data set coordinates
for i in range(10): #Randomly select 10 texts as the test set
randIndex = int(random.uniform(0,len(trainingSet))) #Randomly select text
testSet.append(trainingSet[randIndex]) #Join test set
del(trainingSet[randIndex]) #Remove test data from training set
trainMat = []; trainClasses = []
for docIndex in trainingSet:
trainMat.append(setOfWords2Vec(vocabList,docList[docIndex])) #Training set
trainClasses.append(classList[docIndex]) #Training set category
p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses)) #Computing probability
errorCount = 0
for docIndex in testSet: #test
wordVector = setOfWords2Vec(vocabList,docList[docIndex])
if classfyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:
errorCount += 1
print('classcation error:',docList[docIndex])
print('the error rate is:',float(errorCount)/len(testSet))

if __name__ == "__main__":
spamTest()

```

Operation result

```classcation error: ['home', 'based', 'business', 'opportunity', 'knocking', 'your', 'door', 'don A kind of', 'rude', 'and', 'let', 'this', 'chance', 'you', 'can', 'earn', 'great', 'income', 'and', 'find', 'your', 'financial', 'life', 'transformed', 'learn', 'more', 'here', 'your', 'success', 'work', 'from', 'home', 'finder', 'experts']
the error rate is: 0.1
```
Published 4 original articles, won praise 3, visited 95

Tags: Python codec

Posted on Sat, 01 Feb 2020 11:24:04 -0500 by Inkyskin