1. Read the data set
# 1. Read the data set
def read_dataset():
    file_path = r'SMSSpamCollection'
    sms = open(file_path, encoding='utf-8')
    sms_data = []
    sms_label = []
    csv_reader = csv.reader(sms, delimiter='\t')
    for line in csv_reader:
        sms_label.append(line[0])                # extract the label (ham/spam)
        sms_data.append(preprocessing(line[1]))  # extract the features (preprocessed text)
    sms.close()
    return sms_data, sms_label
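Each line of SMSSpamCollection holds a label ('ham' or 'spam'), a tab, and the raw message text. A quick sanity check after loading (a sketch; the exact count depends on the copy of the UCI corpus, which holds roughly 5,500 messages):

sms_data, sms_label = read_dataset()
print("Messages:", len(sms_data))
print("First sample:", sms_label[0], '->', sms_data[0])  # label and preprocessed text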
2. Data preprocessing
# 2. Data preprocessing
def preprocessing(text):
    tokens = [word for sent in nltk.sent_tokenize(text)
              for word in nltk.word_tokenize(sent)]                  # tokenize into words
    tokens = [token.lower() for token in tokens if len(token) >= 3]  # lowercase first, drop short tokens
    stops = stopwords.words('english')                               # English stopword list
    tokens = [token for token in tokens if token not in stops]       # remove stop words (now all lowercase)
    wnl = WordNetLemmatizer()
    tag = nltk.pos_tag(tokens)                                       # POS tagging
    tokens = [wnl.lemmatize(token, pos=get_wordnet_pos(tag[i][1]))   # get_wordnet_pos (defined in the full code below)
              for i, token in enumerate(tokens)]                     # lemmatize according to POS
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text
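What the pipeline does to one message, illustrated (the exact output depends on the installed NLTK models, so treat it as approximate):

print(preprocessing("The cats are running to the bigger houses!"))
# -> roughly: 'cat run big house'  (stop words and short tokens dropped, the rest lemmatized)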
3. Split the data into training and test sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=0, stratify=target)
# 3. Partition the data set
def split_dataset(data, label):
    x_train, x_test, y_train, y_test = train_test_split(
        data, label, test_size=0.2, random_state=0, stratify=label)
    return x_train, x_test, y_train, y_test
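Because stratify=label is passed, the ham:spam ratio of the full corpus is preserved in both splits; a quick check (illustrative, run after split_dataset):

from collections import Counter

print(Counter(y_train))  # roughly 87% ham / 13% spam in this corpus
print(Counter(y_test))   # nearly identical proportions, thanks to stratification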
4. Text feature extraction
sklearn.feature_extraction.text.CountVectorizer
sklearn.feature_extraction.text.TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf2 = TfidfVectorizer()
Observe the relationship between a message and its vector.
Restore a vector back to the original message.
# 4. Text feature extraction: turn the text into a TF-IDF feature matrix
def tfidf_dataset(x_train, x_test):
    tfidf = TfidfVectorizer()
    X_train = tfidf.fit_transform(x_train)  # fit builds the vocabulary from the training set
    X_test = tfidf.transform(x_test)        # reuse the same vocabulary for the test set
    return X_train, X_test, tfidf

# Restore a vector back to its message
def revert_mail(x_train, X_train, model):
    s = X_train.toarray()[0]
    print("The first email vector is expressed as:", s)
    a = np.flatnonzero(s)  # indices of the non-zero elements
    print("Location of non-zero elements:", a)
    print("Values of the non-zero elements:", s[a])
    b = model.vocabulary_  # word -> column index mapping
    key_list = []
    for key, value in b.items():
        if value in a:
            key_list.append(key)  # words corresponding to the non-zero elements
    print("Words for the non-zero elements:", key_list)
    print("Message before vectorization:", x_train[0])
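A shorter route to the same word list is scikit-learn's built-in inverse_transform, which returns, for each row, the terms whose weight is non-zero (word order within the message is not recoverable):

print(tfidf.inverse_transform(X_train[0]))  # array of the words with non-zero TF-IDF weight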
5. Model selection
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
Why this model? The TF-IDF features are non-negative weights derived from word counts, which matches MultinomialNB's multinomial event model; GaussianNB instead assumes continuous, normally distributed features and requires a dense matrix, so it suits sparse text data poorly.
# 5. Model selection
def mnb_model(x_train, x_test, y_train, y_test):
    mnb = MultinomialNB()
    mnb.fit(x_train, y_train)
    pre = mnb.predict(x_test)
    print("Total:", len(y_test))
    print("Predicted correctly:", (pre == y_test).sum())
    print("Prediction accuracy:", sum(pre == y_test) / len(y_test))
    return pre
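For contrast, GaussianNB can be tried on the same features, but it cannot consume a sparse matrix directly, so the TF-IDF matrix must be densified first (memory-hungry for large vocabularies); a minimal sketch:

from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train.toarray(), y_train)  # GaussianNB requires a dense feature matrix
print("GaussianNB accuracy:", gnb.score(X_test.toarray(), y_test))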
6. Model evaluation: confusion matrix and classification report
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_predict)  # don't shadow the imported confusion_matrix function
Meaning of the confusion matrix: row i, column j counts the samples whose true class is i and predicted class is j, so the diagonal holds the correct predictions and the off-diagonal cells hold the two kinds of errors.
from sklearn.metrics import classification_report
Significance of accuracy, precision, recall and the F1 score: accuracy is the share of all predictions that are correct; precision is, of the samples predicted positive, the share that are truly positive; recall is, of the truly positive samples, the share that are found; the F1 score is the harmonic mean of precision and recall.
# 6. Model evaluation
def class_report(pre, y_test):
    conf_matrix = confusion_matrix(y_test, pre)
    print("=====================================================")
    print("Confusion matrix:\n", conf_matrix)
    c = classification_report(y_test, pre)
    print("Classification report:\n", c)
    print("Model accuracy:", (conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix))
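Reading the matrix: scikit-learn orders classes alphabetically (check mnb.classes_), so here row/column 0 is ham and 1 is spam. Treating spam as the positive class, the report's numbers can be recomputed by hand; a minimal sketch:

tn, fp, fn, tp = conf_matrix.ravel()  # true ham/spam on rows, predicted ham/spam on columns
precision = tp / (tp + fp)            # of messages flagged as spam, the share that really are
recall = tp / (tp + fn)               # of real spam, the share that was caught
f1 = 2 * precision * recall / (precision + recall)  # harmonic mean of precision and recall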
Full code:
# -*- coding:utf-8 -*-
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import csv


def get_wordnet_pos(treebank_tag):  # map a Treebank POS tag to the WordNet POS used by the lemmatizer
    if treebank_tag.startswith('J'):    # adjective
        return nltk.corpus.wordnet.ADJ
    elif treebank_tag.startswith('V'):  # verb
        return nltk.corpus.wordnet.VERB
    elif treebank_tag.startswith('N'):  # noun
        return nltk.corpus.wordnet.NOUN
    elif treebank_tag.startswith('R'):  # adverb
        return nltk.corpus.wordnet.ADV
    else:
        return nltk.corpus.wordnet.NOUN


# Preprocessing
def preprocessing(text):
    tokens = [word for sent in nltk.sent_tokenize(text)
              for word in nltk.word_tokenize(sent)]                  # tokenize into words
    tokens = [token.lower() for token in tokens if len(token) >= 3]  # lowercase first, drop short tokens
    stops = stopwords.words('english')                               # English stopword list
    tokens = [token for token in tokens if token not in stops]       # remove stop words (now all lowercase)
    lmtzr = WordNetLemmatizer()
    tag = nltk.pos_tag(tokens)                                       # POS tagging
    tokens = [lmtzr.lemmatize(token, pos=get_wordnet_pos(tag[i][1]))
              for i, token in enumerate(tokens)]                     # lemmatize according to POS
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text


# Read the data set
def read_dataset():
    file_path = r'SMSSpamCollection'
    sms = open(file_path, encoding='utf-8')  # read the data
    sms_label = []  # store the labels
    sms_data = []   # store the message texts
    csv_reader = csv.reader(sms, delimiter='\t')
    for line in csv_reader:
        sms_label.append(line[0])                # extract the label
        sms_data.append(preprocessing(line[1]))  # preprocess each message
    sms.close()
    return sms_data, sms_label


# Partition the data set
def split_dataset(data, label):
    x_train, x_test, y_train, y_test = train_test_split(
        data, label, test_size=0.2, random_state=0, stratify=label)
    return x_train, x_test, y_train, y_test


# Turn the raw text into a TF-IDF feature matrix
def tfidf_dataset(x_train, x_test):
    tfidf = TfidfVectorizer()
    X_train = tfidf.fit_transform(x_train)  # fit_transform builds the vocabulary from the training set
    X_test = tfidf.transform(x_test)        # X_test must share X_train's vocabulary, so only transform here
    return X_train, X_test, tfidf


# Restore a vector back to its message
def revert_mail(x_train, X_train, model):
    s = X_train.toarray()[0]
    print("The first email vector is expressed as:", s)
    a = np.flatnonzero(s)  # np.flatnonzero returns the indices of the non-zero elements of the flattened array
    print("Location of non-zero elements:", a)
    print("Values of the non-zero elements:", s[a])
    b = model.vocabulary_  # word -> column index mapping
    key_list = []
    for key, value in b.items():
        if value in a:
            key_list.append(key)  # words corresponding to the non-zero elements
    print("Words for the non-zero elements:", key_list)
    print("Message before vectorization:", x_train[0])


# Model selection (multinomial distribution, matching the count-based features)
def mnb_model(x_train, x_test, y_train, y_test):
    mnb = MultinomialNB()
    mnb.fit(x_train, y_train)
    ypre_mnb = mnb.predict(x_test)
    print("Total:", len(y_test))
    print("Predicted correctly:", (ypre_mnb == y_test).sum())
    return ypre_mnb


# Model evaluation: confusion matrix and classification report
def class_report(ypre_mnb, y_test):
    conf_matrix = confusion_matrix(y_test, ypre_mnb)
    print("Confusion matrix:\n", conf_matrix)
    c = classification_report(y_test, ypre_mnb)
    print("------------------------------------------")
    print("Classification report:\n", c)
    print("Model accuracy:", (conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix))


if __name__ == '__main__':
    sms_data, sms_label = read_dataset()                                   # read the data set
    x_train, x_test, y_train, y_test = split_dataset(sms_data, sms_label)  # partition the data set
    X_train, X_test, tfidf = tfidf_dataset(x_train, x_test)                # turn the text into a TF-IDF matrix
    revert_mail(x_train, X_train, tfidf)                                   # restore a vector back to its message
    y_mnb = mnb_model(X_train, X_test, y_train, y_test)                    # model selection
    class_report(y_mnb, y_test)                                            # model evaluation
7. Comparison and summary
If CountVectorizer is used for feature extraction instead, how does it compare with TfidfVectorizer?
- CountVectorizer: only considers how often each word appears in a document; these are bag-of-words features.
- TfidfVectorizer: besides a word's frequency within a document, it also accounts for how many documents contain the word, which down-weights frequent but uninformative words and surfaces more discriminative features; these are TF-IDF features.
- In this experiment, CountVectorizer predicted the negative class more accurately but the positive class less accurately than TfidfVectorizer, and its overall accuracy was slightly higher, so CountVectorizer appears to be the better fit for this task. The swap-in sketch below shows how to reproduce the comparison.
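Only the vectorizer needs swapping; a minimal sketch of the count-based variant, written as a drop-in replacement for tfidf_dataset (the rest of the pipeline is unchanged):

from sklearn.feature_extraction.text import CountVectorizer

def count_dataset(x_train, x_test):
    cv = CountVectorizer()
    X_train = cv.fit_transform(x_train)  # raw term counts instead of TF-IDF weights
    X_test = cv.transform(x_test)        # same vocabulary as the training set
    return X_train, X_test, cv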