# preface

The Python final exam requires a feature-extraction project. Today I'll show my results.

# 1, Code

```python
import numpy as np

# LinearDiscriminantAnalysis is imported from sklearn's discriminant_analysis module. LDA finds the axes that maximize the separation between classes and is used to reduce dimensionality as a preprocessing step for classification.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import matplotlib.pyplot as plt
import pandas as pd

outputfile = 'LDA6.xlsx'   # Path of the Excel workbook the projected data sets are written to

def lda(data, target, n_dim):
    """Project ``data`` onto its leading linear-discriminant directions.

    The directions are the eigenvectors of ``pinv(Sw) @ Sb`` with the largest
    eigenvalues, where ``Sw`` is the within-class scatter matrix and ``Sb``
    the between-class scatter matrix.

    :param data: (n_samples, n_features) array-like of numeric features
    :param target: (n_samples,) array-like holding the class of each sample
    :param n_dim: target dimension, at most ``n_classes - 1``
    :return: (n_samples, n_dim) ndarray; the (uncentered) samples projected
        onto the selected directions, strongest direction first
    :raises ValueError: if ``n_dim`` is larger than ``n_classes - 1``
    """
    # Work on plain ndarrays so that `@` is always a true matrix product,
    # whatever array-like (list, DataFrame, matrix) the caller hands in.
    data = np.asarray(data, dtype=float)
    target = np.asarray(target)
    clusters = np.unique(target)

    if n_dim > len(clusters) - 1:
        # Raise instead of print + exit(0): a library function must not
        # terminate the interpreter, least of all with a success status.
        raise ValueError(
            "K is too much: n_dim={} but LDA yields at most "
            "n_classes - 1 = {} components".format(n_dim, len(clusters) - 1)
        )

    n_features = data.shape[1]
    overall_mean = data.mean(axis=0)

    # Accumulate both scatter matrices in a single pass over the classes.
    Sw = np.zeros((n_features, n_features))  # within-class scatter
    Sb = np.zeros((n_features, n_features))  # between-class scatter
    for label in clusters:
        members = data[target == label]
        class_mean = members.mean(axis=0)
        centered = members - class_mean
        Sw += centered.T @ centered
        shift = (class_mean - overall_mean)[:, np.newaxis]
        Sb += members.shape[0] * (shift @ shift.T)

    # Matrix product, not element-wise `*`: Sw and Sb are ndarrays, so `*`
    # would multiply entry by entry. pinv equals inv for a well-conditioned
    # Sw and additionally tolerates a singular one.
    S = np.linalg.pinv(Sw) @ Sb
    eig_vals, eig_vecs = np.linalg.eig(S)
    # eig() may report a complex dtype for this non-symmetric product; the
    # eigenvalues of interest are real in exact arithmetic, so any imaginary
    # part is treated as round-off and dropped.
    eig_vals, eig_vecs = eig_vals.real, eig_vecs.real

    # Indices of the n_dim largest eigenvalues, largest first.
    top = np.argsort(eig_vals)[::-1][:n_dim]
    w = eig_vecs[:, top]

    return data @ w

if __name__ == '__main__':
    # NOTE(review): `data1` is never assigned in the visible source. It must be
    # loaded before this block runs -- presumably a pandas DataFrame whose
    # column labelled 0 holds the class label (TODO confirm).
    # X represents data and Y represents label
    X = data1.drop([0], axis=1)
    Y = data1.values[:, 0]

    # lda() is deterministic, so one projection per target dimension is
    # computed and shared by every panel/sheet that uses that dimension.
    projections = {n_dim: lda(X, Y, n_dim) for n_dim in (5, 7, 9)}
    sklearn_projection = LinearDiscriminantAnalysis(n_components=5).fit_transform(X, Y)

    # One entry per subplot, in grid order. The title doubles as the Excel
    # sheet name. `framed` panels get padded axis limits and no ticks; the
    # sklearn reference panel keeps matplotlib's default axes.
    # (title, projected data, x column, y column, framed)
    panels = [
        ("LDA", projections[5], 0, 1, True),
        ("sklearn_LDA", sklearn_projection, 0, 1, False),
        ("LDA2", projections[5], 1, 2, True),
        ("LDA3", projections[5], 2, 3, True),
        ("LDA4", projections[5], 3, 4, True),
        ("LDA5", projections[7], 5, 6, True),
        ("LDA6", projections[7], 1, 4, True),
        ("LDA7", projections[7], 3, 6, True),
        ("LDA8", projections[7], 2, 5, True),
        ("LDA9", projections[9], 3, 8, True),
        ("LDA10", projections[9], 7, 8, True),
        ("LDA11", projections[9], 2, 7, True),
        ("LDA12", projections[9], 1, 8, True),
        ("LDA13", projections[9], 5, 8, True),
        ("LDA14", projections[9], 0, 6, True),
        ("LDA15", projections[9], 0, 8, True),
    ]

    # 4 x 4 grid of scatter plots, each showing one pair of components
    # coloured by class label.
    plt.figure(figsize=(20, 15))
    for position, (title, projected, x_col, y_col, framed) in enumerate(panels, start=1):
        plt.subplot(4, 4, position)
        plt.title(title)
        xs = projected[:, x_col]
        ys = projected[:, y_col]
        if framed:
            plt.xlim([xs.min() * 1.2, xs.max() * 1.2])
            plt.ylim([ys.min() * 1.3, ys.max() * 1.3])
            plt.xticks([])
            plt.yticks([])
        plt.scatter(xs, ys, c=Y)

    plt.savefig("LDA4.png", dpi=800)
    plt.show()

    # One sheet per panel holding the full projection behind it. The context
    # manager saves and closes the workbook; ExcelWriter.save() no longer
    # exists in pandas >= 2.0.
    with pd.ExcelWriter(outputfile) as writer:
        for title, projected, _, _, _ in panels:
            pd.DataFrame(projected).to_excel(writer, sheet_name=title)

```
```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier     # Random forest

# def pca(X, k):  # k is the components you want
#     # Mean of each feature
#     #Number of samples. Number of features
#     n_samples, n_features = X.shape
#     mean = np.array([np.mean(X[:, i]) for i in range(n_features)])
#     # Normalize data
#     norm_X = X - mean
#     # Scatter matrix
#     scatter_matrix = np.dot(np.transpose(norm_X), norm_X)
#     # Calculate the eigenvectors and eigenvalues
#     eig_val, eig_vec = np.linalg.eig(scatter_matrix)
#     eig_pairs = [(np.abs(eig_val[i]), eig_vec[:, i]) for i in range(n_features)]
#     # According to Eig_ Val from highest to lowest on Eig_ Sort eig_vec based on eig_val from highest to lowest
#     eig_pairs.sort(reverse=True)
#     # select the top k eig_vec
#     feature = np.array([ele[1] for ele in eig_pairs[:k]])
#     # get new data
#     data = np.dot(norm_X, np.transpose(feature))
#     return data

# inputfile = 'wy.csv'  # raw data
# outputfile = r'../DATA/28PCA6.xlsx'  # alternative output location
outputfile = 'PCA6.xlsx'  # Excel workbook that receives the PCA results

from sklearn.decomposition import PCA

# NOTE(review): `data1` is never assigned in the visible source. It must be
# loaded before this script runs -- presumably a pandas DataFrame read from
# the raw-data file, with the class label in the column labelled 0
# (TODO confirm).
print(data1.shape)

# Feature columns: everything except the column labelled 0.
data = data1.drop([0], axis=1)

# Standardize every feature column: subtract its mean, divide by its
# standard deviation.
scale = (data - data.mean()) / (data.std())

# X represents data and Y represents label
X = data.values
Y = data1.values[:, 0]

# Keep the first five principal components of the standardized data.
pca = PCA(n_components=5)
pca.fit(scale)
data_1 = pca.transform(scale)

# Scatter plot of the first two principal components, coloured by label.
plt.figure(figsize=(4, 4))
plt.title("PCA")
plt.scatter(data_1[:, 0], data_1[:, 1], c=Y)
plt.savefig("pca.png", dpi=300)

# Principal axes of the fitted model (one row per component).
feature_vector = pca.components_

# Fraction of the total variance explained by each kept component
# (its "contribution rate"); the printed sum is the cumulative rate.
contri_rate = pca.explained_variance_ratio_
print(contri_rate.sum())

# Write results to excel. The context manager saves and closes the workbook;
# ExcelWriter.save() no longer exists in pandas >= 2.0.
with pd.ExcelWriter(outputfile) as writer:
    pd.DataFrame(data_1).to_excel(writer, sheet_name='principal component')
    pd.DataFrame(contri_rate).to_excel(writer, sheet_name='Contribution rate')
    pd.DataFrame(feature_vector).to_excel(writer, sheet_name='feature vector ')
    pd.DataFrame(scale).to_excel(writer, sheet_name='Standardized data')

```

# 2, Operation results

## 2. Dataset

The dataset is not uploaded for the time being; the result files are generated by running the code.

# 3, Design report

## 1. Purpose and significance

Objective: to extract features from the data set, reducing the initial set of original variables to a smaller, more manageable group of features that still describes the original data set accurately and completely for subsequent use.
Significance: first, feature extraction reduces data storage and input bandwidth and removes redundancy. Second, classification in the lower-dimensional space is improved, and more meaningful latent variables can be found, which helps build an in-depth understanding of the data. Finally, the features most effective for classification and recognition are selected from the many available, compressing the dimension of the feature space and yielding a smaller, more accurate set of classification features with a low probability of classification error.

## 2. Detailed program design

First, the principal component analysis (PCA) algorithm is used to reduce the dimension of the data set.
Build the feature matrix, compute the mean of each feature, and subtract the column mean from every sample; compute the scatter matrix of the centered features, then compute its eigenvalues and eigenvectors; sort the eigenvalues from largest to smallest; finally, take the first K eigenvectors and project the centered data onto them to obtain the reduced-dimension feature matrix. At the same time, the principal components, contribution rates, eigenvectors and standardized data of the data set are computed.
Second, the data set is processed with linear discriminant analysis (the LDA algorithm). After dimensionality reduction, different pairs of discriminant components are combined, and matplotlib is used to draw a variety of comparison scatter plots and to present the data analysis results.
In the following flow charts, the one on the left is the main flow chart for processing data sets using PCA algorithm, and the one on the right is the main flow chart for processing data sets using LDA Algorithm.

## 3. Program running results

Operation results of dimensionality reduction based on LDA:

Partial data sets of operation results of dimensionality reduction based on LDA:

Operation results of dimensionality reduction based on PCA:

Partial data sets of operation results of dimensionality reduction based on PCA:

## 4. Report summary

This project performs feature extraction on the data set using the LDA and PCA methods to reduce its dimensionality. For each method, the reduced features are obtained for different combinations of components, and scatter plots and result data sets are produced. The results of processing the data set with the LDA and PCA algorithms are presented from several angles: comparison plots of the component combinations after dimensionality reduction, and an analysis of the principal components, contribution rates, eigenvectors and standardized data.

In the course of this project I exercised my practical skills and my ability to learn and summarize. I learned about feature extraction in machine learning, about dimensionality reduction and classification with the PCA and LDA algorithms, and about visualization with matplotlib. On the one hand, the project consolidated my Python knowledge; on the other hand, I learned new material and gained some insight into dimensionality-reduction algorithms, the layout of visualizations, and so on.

In short, this project still has many shortcomings, but I have made great progress compared with the beginning, when I knew little and had no practice. The project has also made me very interested in dimensionality-reduction algorithms and in visualizing extracted features. There is no end to learning: in the future I will keep drawing on the experience and lessons of this project, keep studying, and further improve and extend it.

Posted on Mon, 11 Oct 2021 00:09:10 -0400 by mattcass