Spam classification 2

1. Read the data set

# 1. Read data set
def read_dataset():
    file_path = r'SMSSpamCollection'
    sms = open(file_path, encoding='utf-8')
    sms_data = []
    sms_label = []
    csv_reader = csv.reader(sms, delimiter='\t')  # each line is "label<TAB>message"
    for line in csv_reader:
        sms_label.append(line[0])  # extract the label ('ham' or 'spam')
        sms_data.append(preprocessing(line[1]))  # extract features: preprocess the message text
    sms.close()
    return sms_data, sms_label

2. Data preprocessing

# 2. Data preprocessing
def preprocessing(text):
    # Tokenize: split the text into sentences, then each sentence into words
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    tokens = [token.lower() for token in tokens if len(token) >= 3]  # lowercase, drop short tokens
    stops = stopwords.words('english')  # English stopword list
    tokens = [token for token in tokens if token not in stops]  # remove stop words
    wnl = WordNetLemmatizer()
    tag = nltk.pos_tag(tokens)  # part-of-speech tagging
    # Lemmatize each token according to its POS tag
    tokens = [wnl.lemmatize(token, pos=get_wordnet_pos(tag[i][1])) for i, token in enumerate(tokens)]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text
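
The snippet above relies on a get_wordnet_pos helper that maps the Penn Treebank tags returned by nltk.pos_tag to the WordNet constants that lemmatize() accepts; it is defined in the full code at the end and reproduced here for reference:

import nltk

def get_wordnet_pos(treebank_tag):
    # Map a Penn Treebank POS tag to the WordNet constant lemmatize() expects
    if treebank_tag.startswith('J'):
        return nltk.corpus.wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return nltk.corpus.wordnet.VERB
    elif treebank_tag.startswith('R'):
        return nltk.corpus.wordnet.ADV
    return nltk.corpus.wordnet.NOUN  # nouns and anything unrecognized default to noun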

3. Splitting the data into training and test sets

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=0, stratify=target)

# 3. Partition data set
def split_dataset(data, label):
    x_train, x_test, y_train, y_test = train_test_split(data, label, test_size=0.2, random_state=0, stratify=label)
    return x_train, x_test, y_train, y_test
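
Because ham messages far outnumber spam in this corpus (roughly four to five times as many), stratify=label keeps the ham:spam ratio the same in both splits. A quick sanity check, assuming the split above has already run:

from collections import Counter

print(Counter(y_train))  # class counts in the training split
print(Counter(y_test))   # should show approximately the same ham:spam ratio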

4. Text feature extraction

sklearn.feature_extraction.text.CountVectorizer

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html?highlight=sklearn%20feature_extraction%20text%20tfidfvectorizer

sklearn.feature_extraction.text.TfidfVectorizer

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html?highlight=sklearn%20feature_extraction%20text%20tfidfvectorizer#sklearn.feature_extraction.text.TfidfVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf2 = TfidfVectorizer()

Observe the relationship between a message and its vector.

Map a vector back to its original message.

# 4. Text feature extraction
# Transform the text into a TF-IDF feature matrix
def tfidf_dataset(x_train, x_test):
    tfidf = TfidfVectorizer()
    X_train = tfidf.fit_transform(x_train)  # fit on the training set builds the vocabulary
    X_test = tfidf.transform(x_test)  # the test set reuses the training vocabulary
    return X_train, X_test, tfidf

# Map a vector back to its original message
def revert_mail(x_train, X_train, model):
    s = X_train.toarray()[0]
    print("The first email's vector representation:", s)
    a = np.flatnonzero(X_train.toarray()[0])  # indices of the non-zero elements
    print("Indices of non-zero elements:", a)
    print("Values of the non-zero elements:", s[a])
    b = model.vocabulary_  # vocabulary: word -> column index
    key_list = []
    for key, value in b.items():
        if value in a:
            key_list.append(key)  # collect the words whose column index is non-zero
    print("Words behind the non-zero elements:", key_list)
    print("Message before vectorization:", x_train[0])

5. Model selection

from sklearn.naive_bayes import GaussianNB

from sklearn.naive_bayes import MultinomialNB

Why choose this model? TF-IDF features are non-negative term weights (essentially scaled counts), which matches the multinomial assumption behind MultinomialNB. GaussianNB assumes continuous, normally distributed features and requires dense input, so it is a poor fit for a large sparse term matrix.
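
A minimal sketch of the difference, assuming X_train and y_train are the sparse TF-IDF matrix and labels from steps 3 and 4:

from sklearn.naive_bayes import GaussianNB, MultinomialNB

mnb = MultinomialNB().fit(X_train, y_train)  # accepts the sparse matrix directly
# GaussianNB().fit(X_train, y_train)  # raises TypeError: dense data is required
gnb = GaussianNB().fit(X_train.toarray(), y_train)  # works only after densifying, which is memory-hungry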

# 5. Model selection
def mnb_model(x_train, x_test, y_train, y_test):
    mnb = MultinomialNB()
    mnb.fit(x_train, y_train)
    pre = mnb.predict(x_test)
    print("Total:", len(y_test))
    print("Correct predictions:", (pre == y_test).sum())
    print("Prediction accuracy:", sum(pre == y_test) / len(y_test))
    return pre

6. Model evaluation: confusion matrix, classification report

from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_predict)

What does the confusion matrix mean? Each row corresponds to the true class and each column to the predicted class, so the diagonal counts the correctly classified samples and the off-diagonal cells count the errors.
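
A minimal illustration with toy labels (not the actual data set):

from sklearn.metrics import confusion_matrix

y_true = ['ham', 'ham', 'ham', 'spam', 'spam']
y_pred = ['ham', 'ham', 'spam', 'spam', 'spam']
print(confusion_matrix(y_true, y_pred, labels=['ham', 'spam']))
# [[2 1]   row 0: true ham  -> 2 classified as ham, 1 misclassified as spam
#  [0 2]]  row 1: true spam -> 0 missed, 2 caught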

from sklearn.metrics import classification_report

What do accuracy, precision, recall and the F1 score each measure? Accuracy is the fraction of all messages classified correctly; precision is the fraction of messages flagged as spam that really are spam; recall is the fraction of actual spam that was caught; F1 is the harmonic mean of precision and recall.
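
A sketch of how these numbers fall out of the four confusion-matrix cells, using illustrative counts rather than real results:

tn, fp, fn, tp = 960, 6, 14, 135  # illustrative counts for the (ham, spam) ordering above
accuracy = (tp + tn) / (tp + tn + fp + fn)
precision = tp / (tp + fp)  # of the messages flagged as spam, how many really are
recall = tp / (tp + fn)  # of the actual spam, how many were caught
f1 = 2 * precision * recall / (precision + recall)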

# 6. Model evaluation
def class_report(pre, y_test):
    conf_matrix = confusion_matrix(y_test, pre)
    print("=====================================================")
    print("Confusion matrix:\n", conf_matrix)
    c = classification_report(y_test, pre)
    print("Classification report:\n", c)
    print("Model accuracy:", (conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix))

Full code:

# -*- coding:utf-8 -*-
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import csv

# Map a Penn Treebank POS tag to the WordNet constant that lemmatize() expects
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):  # adjective
        return nltk.corpus.wordnet.ADJ
    elif treebank_tag.startswith('V'):  # verb
        return nltk.corpus.wordnet.VERB
    elif treebank_tag.startswith('N'):  # noun
        return nltk.corpus.wordnet.NOUN
    elif treebank_tag.startswith('R'):  # adverb
        return nltk.corpus.wordnet.ADV
    else:
        return nltk.corpus.wordnet.NOUN  # default to noun

# Preprocessing
def preprocessing(text):
    # Tokenize: split the text into sentences, then each sentence into words
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    tokens = [token.lower() for token in tokens if len(token) >= 3]  # lowercase, drop short tokens
    stops = stopwords.words('english')  # English stopword list
    tokens = [token for token in tokens if token not in stops]  # remove stop words
    lmtzr = WordNetLemmatizer()
    tag = nltk.pos_tag(tokens)  # part-of-speech tagging
    # Lemmatize each token according to its POS tag
    tokens = [lmtzr.lemmatize(token, pos=get_wordnet_pos(tag[i][1])) for i, token in enumerate(tokens)]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

# Read data set
def read_dataset():
    file_path = r'SMSSpamCollection'
    sms = open(file_path, encoding='utf-8')  # open the data file
    sms_label = []  # stores the labels
    sms_data = []  # stores the message texts
    csv_reader = csv.reader(sms, delimiter='\t')  # each line is "label<TAB>message"
    for line in csv_reader:
        sms_label.append(line[0])  # extract the label ('ham' or 'spam')
        sms_data.append(preprocessing(line[1]))  # preprocess each message
    sms.close()
    return sms_data, sms_label

# Partition data set
def split_dataset(data, label):
    x_train, x_test, y_train, y_test = train_test_split(data, label, test_size=0.2, random_state=0, stratify=label)
    return x_train, x_test, y_train, y_test

# Transform the raw text into a TF-IDF feature matrix
def tfidf_dataset(x_train, x_test):
    tfidf = TfidfVectorizer()
    X_train = tfidf.fit_transform(x_train)  # fit_transform on the training set builds the vocabulary
    X_test = tfidf.transform(x_test)  # transform only, so the test set reuses the training vocabulary
    return X_train, X_test, tfidf

# Map a vector back to its original message
def revert_mail(x_train, X_train, model):
    s = X_train.toarray()[0]
    print("The first email's vector representation:", s)
    # np.flatnonzero returns the indices of the non-zero elements of the flattened array
    a = np.flatnonzero(X_train.toarray()[0])
    print("Indices of non-zero elements:", a)
    print("Values of the non-zero elements:", s[a])
    b = model.vocabulary_  # vocabulary: word -> column index
    key_list = []
    for key, value in b.items():
        if value in a:
            key_list.append(key)  # collect the words whose column index is non-zero
    print("Words behind the non-zero elements:", key_list)
    print("Message before vectorization:", x_train[0])

# Model selection (MultinomialNB: the features are non-negative term weights)
def mnb_model(x_train, x_test, y_train, y_test):
    mnb = MultinomialNB()
    mnb.fit(x_train, y_train)
    ypre_mnb = mnb.predict(x_test)
    print("Total:", len(y_test))
    print("Correct predictions:", (ypre_mnb == y_test).sum())
    return ypre_mnb

# Model evaluation: confusion matrix, classification report
def class_report(ypre_mnb, y_test):
    conf_matrix = confusion_matrix(y_test, ypre_mnb)
    print("Confusion matrix:\n", conf_matrix)
    c = classification_report(y_test, ypre_mnb)
    print("------------------------------------------")
    print("Classification report:\n", c)
    print("Model accuracy:", (conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix))

if __name__ == '__main__':
    sms_data, sms_label = read_dataset()  # read the data set
    x_train, x_test, y_train, y_test = split_dataset(sms_data, sms_label)  # partition the data set
    X_train, X_test, tfidf = tfidf_dataset(x_train, x_test)  # transform the text into a TF-IDF feature matrix
    revert_mail(x_train, X_train, tfidf)  # map a vector back to its message
    y_mnb = mnb_model(X_train, X_test, y_train, y_test)  # model selection
    class_report(y_mnb, y_test)  # model evaluation

7. Comparison and summary

If CountVectorizer is used for text feature generation, how does it compare with TfidfVectorizer?

  • CountVectorizer: considers only how often each word occurs in a text; it produces bag-of-words features.
  • TfidfVectorizer: besides a word's frequency within a text, it also accounts for how many texts contain the word, which downweights frequent but uninformative words and surfaces more meaningful features; it produces TF-IDF features.
  • In this experiment, CountVectorizer predicted the negative class more accurately than TfidfVectorizer but the positive class less accurately, and its overall accuracy was slightly higher, so CountVectorizer appears to be the better fit here (see the sketch below for how to swap it in).
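
A minimal sketch of the swap (count_dataset is a hypothetical drop-in for the tfidf_dataset function above; only the vectorizer changes):

from sklearn.feature_extraction.text import CountVectorizer

def count_dataset(x_train, x_test):
    cv = CountVectorizer()
    X_train = cv.fit_transform(x_train)  # raw term counts instead of TF-IDF weights
    X_test = cv.transform(x_test)
    return X_train, X_test, cv

# Drop-in replacement in the main block:
# X_train, X_test, vec = count_dataset(x_train, x_test)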
