The Python final exam requires a feature extraction project. Today I'll show my results.
```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Linear discriminant analysis (LDA) projects the data onto axes that maximize
# between-class separation; it is used here as dimensionality-reduction
# preprocessing for classification.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

outputfile = 'LDA6.xlsx'  # workbook for the dimension-reduced data


def lda(data, target, n_dim):
    '''
    :param data: (n_samples, n_features) feature matrix
    :param target: class label of each sample
    :param n_dim: target dimension
    :return: (n_samples, n_dim) projected data
    '''
    clusters = np.unique(target)
    if n_dim > len(clusters) - 1:
        print("n_dim is too large: LDA can keep at most n_classes - 1 dimensions. Please input again.")
        exit(0)

    # within-class scatter matrix Sw
    Sw = np.zeros((data.shape[1], data.shape[1]))
    for i in clusters:                      # loop over each class
        datai = data[target == i]
        datai = datai - datai.mean(0)
        Sw += datai.T.dot(datai)

    # between-class scatter matrix SB
    SB = np.zeros((data.shape[1], data.shape[1]))
    u = data.mean(0)                        # mean of all samples
    for i in clusters:
        Ni = data[target == i].shape[0]
        ui = data[target == i].mean(0)      # mean of one class
        SB += Ni * np.outer(ui - u, ui - u)

    # Eigen-decomposition of Sw^-1 @ SB; keep the eigenvectors belonging to the
    # n_dim largest eigenvalues (discard tiny imaginary parts from round-off).
    S = np.linalg.inv(Sw).dot(SB)
    eigVals, eigVects = np.linalg.eig(S)
    eigVals, eigVects = eigVals.real, eigVects.real
    eigValInd = np.argsort(eigVals)[:(-n_dim - 1):-1]
    w = eigVects[:, eigValInd]
    return np.dot(data, w)


if __name__ == '__main__':
    data1 = pd.read_excel('yy.xlsx', names=range(0, 129))
    X = data1.drop([0], axis=1).values   # feature columns
    Y = data1.values[:, 0]               # label column

    # The iris dataset can be used for a quick test instead:
    # from sklearn.datasets import load_iris
    # iris = load_iris()
    # X, Y = iris.data, iris.target

    # Projections to 5, 7 and 9 dimensions, plus sklearn's LDA for comparison.
    data_1 = lda(X, Y, 5)
    data_2 = LinearDiscriminantAnalysis(n_components=5).fit_transform(X, Y)
    data_3 = data_4 = data_5 = data_1                      # 5-dimensional projection
    data_6 = data_7 = data_8 = data_9 = lda(X, Y, 7)       # 7-dimensional projection
    data_10 = lda(X, Y, 9)                                 # 9-dimensional projection
    data_11 = data_12 = data_13 = data_14 = data_15 = data_16 = data_10

    # (title, projected data, x column, y column) for each of the 16 panels
    panels = [
        ('LDA',   data_1,  0, 1), ('sklearn_LDA', data_2,  0, 1),
        ('LDA2',  data_3,  1, 2), ('LDA3',        data_4,  2, 3),
        ('LDA4',  data_5,  3, 4), ('LDA5',        data_6,  5, 6),
        ('LDA6',  data_7,  1, 4), ('LDA7',        data_8,  3, 6),
        ('LDA8',  data_9,  2, 5), ('LDA9',        data_10, 3, 8),
        ('LDA10', data_11, 7, 8), ('LDA11',       data_12, 2, 7),
        ('LDA12', data_13, 1, 8), ('LDA13',       data_14, 5, 8),
        ('LDA14', data_15, 0, 6), ('LDA15',       data_16, 0, 8),
    ]

    plt.figure(figsize=(20, 15))
    for k, (title, d, cx, cy) in enumerate(panels, start=1):
        plt.subplot(4, 4, k)
        plt.title(title)
        plt.xlim([d[:, cx].min() * 1.2, d[:, cx].max() * 1.2])
        plt.ylim([d[:, cy].min() * 1.3, d[:, cy].max() * 1.3])
        plt.xticks([])
        plt.yticks([])
        plt.scatter(d[:, cx], d[:, cy], c=Y)
    plt.savefig('LDA4.png', dpi=800)
    plt.show()

    # Write every projection to its own sheet of the output workbook.
    writer = pd.ExcelWriter(outputfile)
    for title, d, _, _ in panels:
        pd.DataFrame(d).to_excel(writer, sheet_name=title)
    writer.save()  # on recent pandas versions use writer.close() instead
```
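Because the script produces both the hand-written projection (`data_1`) and sklearn's (`data_2`), a quick sanity check is to see how strongly the matching columns of the two projections correlate. The sketch below is only an illustration and assumes it is run after the script above; LDA directions are defined only up to sign and scale, and sklearn uses a different solver, so moderate differences are expected:

```python
import numpy as np

def component_correlation(a, b):
    """Absolute Pearson correlation between matching columns of two projections."""
    a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
    k = min(a.shape[1], b.shape[1])
    return np.array([abs(np.corrcoef(a[:, j], b[:, j])[0, 1]) for j in range(k)])

# Example (after running the LDA script above):
# print(component_correlation(data_1, data_2))
```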
```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier  # random forest (imported for later experiments)

# A hand-written PCA kept here for reference; the script below uses sklearn's PCA.
# def pca(X, k):  # k is the number of components to keep
#     n_samples, n_features = X.shape
#     # centre the data on the mean of each feature
#     mean = np.array([np.mean(X[:, i]) for i in range(n_features)])
#     norm_X = X - mean
#     # scatter matrix and its eigen-decomposition
#     scatter_matrix = np.dot(np.transpose(norm_X), norm_X)
#     eig_val, eig_vec = np.linalg.eig(scatter_matrix)
#     eig_pairs = [(np.abs(eig_val[i]), eig_vec[:, i]) for i in range(n_features)]
#     # sort eigenvectors by eigenvalue, largest first, and keep the top k
#     eig_pairs.sort(reverse=True)
#     feature = np.array([ele[1] for ele in eig_pairs[:k]])
#     # project the data onto the selected eigenvectors
#     return np.dot(norm_X, np.transpose(feature))

outputfile = 'PCA6.xlsx'  # workbook for the PCA results

data1 = pd.read_excel('yy.xlsx', names=range(0, 129))
print(data1.shape)

# Feature columns (column 0 holds the labels)
data = data1.drop([0], axis=1)
# Standardize the data (zero mean, unit variance)
scale = (data - data.mean()) / data.std()
X = data.values
Y = data1.values[:, 0]

# Keep the first 5 principal components
pca = PCA(n_components=5)
pca.fit(scale)
data_1 = pca.transform(scale)

# Scatter plot of the first two principal components
plt.figure(figsize=(4, 4))
plt.title("PCA")
plt.scatter(data_1[:, 0], data_1[:, 1], c=Y)
plt.savefig("pca.png", dpi=300)
# plt.show()

# Eigenvectors (principal axes) of the model
feature_vector = pca.components_
# Explained variance ratio (contribution rate) of each component
contri_rate = pca.explained_variance_ratio_
print(contri_rate.sum())  # cumulative contribution rate of the kept components

# The reduced data can also be mapped back to the original space:
# low_d = pca.transform(scale)
# newx = pca.inverse_transform(low_d)
# print(newx, newx.shape)

# Write the results to Excel
writer = pd.ExcelWriter(outputfile)
pd.DataFrame(data_1).to_excel(writer, sheet_name='principal component')
pd.DataFrame(contri_rate).to_excel(writer, sheet_name='contribution rate')
pd.DataFrame(feature_vector).to_excel(writer, sheet_name='feature vector')
pd.DataFrame(scale).to_excel(writer, sheet_name='standardized data')
writer.save()  # on recent pandas versions use writer.close() instead
```

2, Operation results
1. Pictures
2. Dataset
The dataset is not attached here for now; it can be regenerated by running the code above.
3, Design report
1. Purpose and significance
Objective: to extract features from the data set, reducing the original set of variables to a smaller, more manageable group of features that still describes the original data accurately and completely for subsequent use.
Significance: first, dimensionality reduction lowers data storage and input bandwidth requirements and removes redundancy. Second, classification becomes easier in the lower-dimensional space, and more meaningful latent variables can be found, which helps to build and deepen understanding of the data. Finally, the features most effective for classification and recognition are selected from the many original features, compressing the feature space and yielding a smaller, more accurate set of classification features with a low probability of classification error.
2. Detailed program design
First, the principal component analysis (PCA) algorithm is used to reduce the dimensionality of the data set.
The eigenvector matrix is generated as follows: compute the mean of each feature, subtract the column mean from each dimension, compute the scatter matrix of the centred features, then compute its eigenvalues and eigenvectors, sort the eigenvalues from largest to smallest, and finally take the first k eigenvectors and project the data onto them to obtain the reduced feature matrix. At the same time, the principal components, contribution rates, eigenvectors and standardized data of the data set are obtained; a sketch of how the contribution rate can guide the choice of k follows below.
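As a rough illustration (not part of the graded script), the contribution rate can also be used to pick the number of components automatically. The sketch below assumes the same standardized matrix `scale` built in the PCA script above and an illustrative 80% threshold:

```python
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

# Same preprocessing as the PCA script above: column 0 holds the labels.
data1 = pd.read_excel('yy.xlsx', names=range(0, 129))
data = data1.drop([0], axis=1)
scale = (data - data.mean()) / data.std()

# Fit with all components first, then keep just enough of them to reach
# a cumulative contribution rate (explained variance ratio) of 80%.
full = PCA().fit(scale)
cum_rate = np.cumsum(full.explained_variance_ratio_)
k = int(np.searchsorted(cum_rate, 0.80) + 1)
print(f"{k} components explain {cum_rate[k - 1]:.1%} of the variance")

reduced = PCA(n_components=k).fit_transform(scale)
print(reduced.shape)
```

sklearn can also do this selection internally: passing a float between 0 and 1, e.g. `PCA(n_components=0.80)`, keeps the smallest number of components whose cumulative explained variance reaches that fraction.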
The data set is also processed with linear discriminant analysis, i.e. the LDA algorithm. After dimensionality reduction, different pairs of the resulting components are combined and compared by class, and matplotlib is used to visualize them, producing a variety of comparison scatter plots together with the data analysis results; a small sketch of how the reduced features could feed a classifier follows below.
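To check that the reduced features really keep the class structure, a quick experiment like the sketch below could be run. It assumes the feature matrix X and labels Y defined in the scripts above and reuses the RandomForestClassifier that the PCA script already imports; the specific classifier, parameters and fold count are illustrative choices, not part of the original project:

```python
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# Same loading convention as the scripts above: column 0 holds the labels.
data1 = pd.read_excel('yy.xlsx', names=range(0, 129))
X = data1.drop([0], axis=1).values
Y = data1.values[:, 0]

clf = RandomForestClassifier(n_estimators=100, random_state=0)

# Cross-validated accuracy on the raw features versus the 5-dimensional
# LDA projection (n_components must be at most n_classes - 1).
raw_scores = cross_val_score(clf, X, Y, cv=5)
lda_scores = cross_val_score(
    make_pipeline(LinearDiscriminantAnalysis(n_components=5), clf), X, Y, cv=5)

print("raw features :", raw_scores.mean())
print("LDA (5 dims) :", lda_scores.mean())
```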
In the flow charts below, the one on the left is the main flow chart for processing the data set with the PCA algorithm, and the one on the right is the main flow chart for processing it with the LDA algorithm.
3. Program running results
Results of dimensionality reduction based on LDA:
Part of the data set produced by the LDA dimensionality reduction:
Results of dimensionality reduction based on PCA:
Part of the data set produced by the PCA dimensionality reduction:
4. Report summary
In this project, feature extraction was carried out on the data set mainly with the LDA and PCA methods to reduce its dimensionality. The feature arrangements for the different class combinations after dimension reduction were obtained, and scatter plots and reduced data sets were produced. The results of processing the data set with the LDA and PCA algorithms are presented from several angles: comparison plots of the combined features after dimension reduction, along with the principal components, contribution rates, eigenvectors and standardized data sets.
In the course of this project I exercised my practical skills and my ability to summarize what I learn: I studied feature extraction in machine learning, dimensionality reduction and classification with the PCA and LDA algorithms, and visualization with matplotlib. On the one hand it consolidated my Python knowledge; on the other it taught me new material and gave me some insight into dimensionality-reduction algorithms and the layout of visual plots.
In short, this project still has many shortcomings, but I have made great progress compared with where I started, when I knew and had practised very little. It has also made me very interested in dimensionality-reduction algorithms and the visualization side of feature extraction. Learning never ends; I will keep drawing on the experience and lessons from this project, continue studying, and improve and extend it further.