Machine Learning in Action -- naive Bayes method (code explanation)

Naive Bayes is a classification method that applies Bayes' rule under the assumption that all features are independent of each other: it combines the class prior probability with the per-feature likelihoods to obtain the posterior probability of each class, and outputs the class with the maximum posterior probability.
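
As a quick illustration (a minimal toy sketch with made-up numbers, not the book's code), the decision rule is: posterior score = prior * product of per-feature likelihoods, and the class with the larger score wins:

# Toy two-class naive Bayes decision for one document with two observed words
# (all probabilities here are hypothetical numbers chosen for the example)
prior = {0: 0.5, 1: 0.5}                        # P(c0), P(c1)
likelihood = {0: [0.10, 0.20], 1: [0.40, 0.30]} # P(word_i | class)
score = {c: prior[c] * likelihood[c][0] * likelihood[c][1] for c in (0, 1)}
print(max(score, key=score.get))                # prints 1: 0.06 > 0.01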

First, I will go over the problems I ran into with the code. For the original code, please refer to the book Machine Learning in Action.

Question 1:
When running program listing 4-5 on page 66 of Machine Learning in Action, it always reports an error:
UnicodeDecodeError: 'gbk' codec can't decode byte 0xae in position 199: illegal multibyte sequence
Solution:
Open the file 23.txt and you will find a stray "?" character ("SciFinance?is"). Delete the "?" and the error goes away.
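
An alternative workaround (my suggestion, not from the book) is to open the file with an explicit encoding and tell Python to skip bytes it cannot decode:

# Read the file as UTF-8 and ignore undecodable bytes instead of crashing
# (the path is illustrative; point it at your own Ch04/email directory)
text = open(r'email\ham\23.txt', encoding='utf-8', errors='ignore').read()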
Question 2:
The line del(trainingSet[randIndex]) reports an error:
TypeError: 'range' object doesn't support item deletion
Main reason: a version difference. The book's code is written for Python 2, where range returns a list; in Python 3 it returns a range object, which does not support item deletion.
Solution: change trainingSet = range(50) to trainingSet = list(range(50)).
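
A minimal demonstration of the difference in Python 3:

nums = range(5)
# del nums[0]          # TypeError: 'range' object doesn't support item deletion
nums = list(range(5))  # list() materializes the range as a mutable list
del nums[0]            # now deletion works
print(nums)            # [1, 2, 3, 4]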

Question 3:
When parsing text:

strings = 'Hello python,the best language!'
print(strings)
import re
listOfTokens = re.split(r'\W*',strings)
print(listOfTokens)

Output result

Hello python,the best language!
['', 'H', 'e', 'l', 'l', 'o', '', 'p', 'y', 't', 'h', 'o', 'n', '', 't', 'h', 'e', '', 'b', 'e', 's', 't', '', 'l', 'a', 'n', 'g', 'u', 'a', 'g', 'e', '', '']

Clearly the file is not parsed correctly. The reason is that \W* can match the empty string between any two characters, so the split happens at every position and the text falls apart into single letters.
Solution:
Change listOfTokens = re.split(r'\W*', strings) to listOfTokens = re.split(r'\W', strings), i.e. remove the *.
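
After the fix, the same test yields whole words (the trailing empty string comes from the final "!" and is filtered out later by textParse's length check):

import re
strings = 'Hello python,the best language!'
print(re.split(r'\W', strings))
# ['Hello', 'python', 'the', 'best', 'language', '']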

The full code is given below, with my own understanding added in the comments.

from numpy import *
"""Constructing word vector from text"""
def loadDataSet():
    """Create an experiment sample"""
    postingList=[
                ['my','dog','has','flea','problems','help','please'],
                ['maybe','not','take','him','to','dog','park','stupid'],
                ['my','dalmation','is','so','cute','I','love','him'],
                ['stop','posting','stupid','worthless','garbage'],
                ['mr','licks','ate','my','steak','how','to','stop','him'],
                ['quit','buying','worthless','dog','food','stupid']
    ]
    classVec = [0,1,0,1,0,1] #1 = abusive post, 0 = normal post
    return postingList,classVec

def creatVocabList(dataSet):
    """Create a list of all documents without duplicate words"""
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document) #set() removes duplicate words from the list
    return list(vocabSet)

def setOfWords2Vec(vocabList,inputSet):
    #Set-of-words model
    """Input: vocabulary list and a document; mark with 1 each vocabulary word that appears in the document"""
    returnVet = [0]*len(vocabList) #Create a zero vector the same length as vocabList
    for word in inputSet:
        if word in vocabList:
            returnVet[vocabList.index(word)] = 1
        else:
            print("the word: " + word + " is not in my Vocabulary!")
    return returnVet
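
# A quick usage sketch (toy input). Note the vocabulary comes from set(), so
# the word order, and hence the position of each 1, varies between runs:
#   >>> vocab = creatVocabList([['my','dog'],['dog','park']])
#   >>> setOfWords2Vec(vocab, ['dog','cat'])
#   the word: cat is not in my Vocabulary!
#   [0, 1, 0]   # a single 1 at the index of 'dog'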

def bagOfWords2Vec(vocabList,inputSet):
    #Bag-of-words model
    """Input: vocabulary list and a document; count how many times each vocabulary word appears in the document"""
    returnVet = [0]*len(vocabList) #Create a zero vector the same length as vocabList
    for word in inputSet:
        if word in vocabList:
            returnVet[vocabList.index(word)] += 1
        else:
            print("the word: " + word + " is not in my Vocabulary!")
    return returnVet
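
# Difference from setOfWords2Vec: the set-of-words model only records whether a
# word occurs (0/1), while the bag-of-words model counts occurrences, so repeated
# words carry more weight. With an explicit vocabulary the output is deterministic:
#   >>> bagOfWords2Vec(['my','dog','park'], ['dog','dog','my'])
#   [1, 2, 0]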

"""Calculating probability from word vector"""
def trainNB0(trainMatrix,trainCategory):
    """Input: document matrix, document category label vector"""
    numTrainDocs = len(trainMatrix) #Number of documents
    numWords = len(trainMatrix[0]) #Length of each document vector, i.e. the vocabulary size
    pAbusive = sum(trainCategory)/float(numTrainDocs) #P(c1): fraction of abusive documents
    #Initialize word counts to 1 and denominators to 2.0 (Laplace smoothing), so a
    #word that never appears in one class does not make the whole product zero
    p0Num = ones(numWords) #Per-word occurrence counts over the documents of class 0
    p1Num = ones(numWords) #Per-word occurrence counts over the documents of class 1
    p0Denom = 2.0 #Total word count over the documents of class 0
    p1Denom = 2.0 #Total word count over the documents of class 1
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = log(p1Num/p1Denom) #log p(w|c1); logs prevent underflow when many probabilities are multiplied later
    p0Vect = log(p0Num/p0Denom) #log p(w|c0)
    return p0Vect,p1Vect,pAbusive
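
# Why logarithms: the classifier multiplies many small per-word probabilities,
# and the raw product underflows to 0.0, while the sum of logs stays finite:
#   >>> 0.0001 ** 1000          # raw product of 1000 small probabilities
#   0.0
#   >>> 1000 * log(0.0001)      # same quantity in log space (numpy's log)
#   -9210.34...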

def classfyNB(vec2Classify,p0Vec,p1Vec,pClass1):
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
     #log p(w1|c1) + ... + log p(wn|c1) + log p(c1), i.e. the log of p(w|c1)*p(c1)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
     #log p(w1|c0) + ... + log p(wn|c0) + log p(c0), i.e. the log of p(w|c0)*p(c0)
    if p1 > p0:
        return 1
    else:
        return 0
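
# Note: Bayes' rule gives p(ci|w) = p(w|ci)*p(ci)/p(w). The denominator p(w) is
# identical for both classes, so comparing the log numerators p1 and p0 above is
# enough to pick the class with the larger posterior; p(w) is never computed.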

def testingNB():
    listOPosts,listClasses = loadDataSet() 
    myVocabList = creatVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList,postinDoc))
    p0V,p1V,pAb = trainNB0(array(trainMat),array(listClasses))
    testEntry = ['him','to','dog']
    thisDoc = array(setOfWords2Vec(myVocabList,testEntry))
    print(testEntry,'classified as:',classfyNB(thisDoc,p0V,p1V,pAb))
    testEntry = ['stupid','garbage']
    thisDoc = array(setOfWords2Vec(myVocabList,testEntry))
    print(testEntry,'classified as:',classfyNB(thisDoc,p0V,p1V,pAb))

def textParse(bigString):
    """Text parsing"""
    import re
    listOfTokens = re.split(r'\W',bigString) #Split on non-alphanumeric characters (see Question 3 above)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2] #Lowercase; drop empty strings and tokens shorter than 3 characters
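
# Quick sketch of what textParse returns (toy input):
#   >>> textParse('SciFinance is a great tool, IT rocks!')
#   ['scifinance', 'great', 'tool', 'rocks']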

def spamTest():
    docList = []; classList = []; fullText = []
    for i in range(1,26): #Parse the 25 spam and 25 ham emails
        wordList = textParse(open(r'E:\machine learning\machinelearninginaction\Ch04\email\spam\%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1) #Label 1 = spam
        wordList = textParse(open(r'E:\machine learning\machinelearninginaction\Ch04\email\ham\%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0) #Label 0 = ham
    vocabList = creatVocabList(docList) #Glossary
    trainingSet = list(range(50)) #Indices of the 50 documents; list(), not range() (see Question 2)
    testSet = [] #Indices of the test documents
    for i in range(10): #Randomly hold out 10 documents as the test set
        randIndex = int(random.uniform(0,len(trainingSet))) #Pick a random remaining index
        testSet.append(trainingSet[randIndex]) #Add it to the test set
        del(trainingSet[randIndex]) #Remove it from the training set
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList,docList[docIndex])) #Training set
        trainClasses.append(classList[docIndex]) #Training set category
    p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses)) #Computing probability
    errorCount = 0
    for docIndex in testSet: #test
        wordVector = setOfWords2Vec(vocabList,docList[docIndex])
        if classfyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:
            errorCount += 1
            print('classification error:',docList[docIndex])
    print('the error rate is:',float(errorCount)/len(testSet))
 

if __name__ == "__main__":
    spamTest()



Run result:

classification error: ['home', 'based', 'business', 'opportunity', 'knocking', 'your', 'door', 'don', 'rude', 'and', 'let', 'this', 'chance', 'you', 'can', 'earn', 'great', 'income', 'and', 'find', 'your', 'financial', 'life', 'transformed', 'learn', 'more', 'here', 'your', 'success', 'work', 'from', 'home', 'finder', 'experts']
the error rate is: 0.1
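
Because the 10 test documents are chosen at random, the error rate varies from run to run. A simple refinement (my addition, not in the book's listing) is to make spamTest return the error rate instead of only printing it, then average over several runs:

# Hypothetical sketch: assumes spamTest() is modified so that its last line is
# `return float(errorCount)/len(testSet)` rather than just printing the rate
numRuns = 10
total = 0.0
for _ in range(numRuns):
    total += spamTest()
print('average error rate over %d runs: %f' % (numRuns, total / numRuns))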