Principal Components Analysis

Given a collection of points in a multidimensional space, a "best-fitting" line can be defined as one that minimizes the average squared distance from the points to the line. The next best-fitting line can be chosen in the same way from among the directions perpendicular to the first. Repeating this process yields an orthogonal basis in which the individual dimensions of the data are uncorrelated. These basis vectors are called principal components, and several related procedures are collectively known as principal component analysis (PCA).
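
Concretely, the principal components of a mean-centered data matrix are its right singular vectors, and the squared singular values (divided by n - 1) give the variance captured along each component. The short NumPy sketch below illustrates this directly; the random data and variable names are purely illustrative and are separate from the scikit-learn example that follows.

import numpy as np

# Generate an illustrative random dataset: 200 points in 5 dimensions.
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))

# PCA operates on mean-centered data.
X_centered = X - X.mean(axis=0)

# The rows of Vt are the principal components (an orthonormal basis).
U, S, Vt = np.linalg.svd(X_centered, full_matrices=False)

# Variance captured along each component.
explained_variance = S**2 / (len(X) - 1)

# Project the data onto the first two principal components.
X_2d = X_centered @ Vt[:2].T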

Python Example

The complete script is listed below.

"""
dimensionality_reduction_using_scikit_learn.py
"""
# Import needed libraries.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Set parameters.
random_state = 0
test_data_proportion = 0.25
number_of_neighbors = 5
number_of_components = 2

# Load Digits dataset
X, y = datasets.load_digits(return_X_y=True)

# Split data into X/y train/test
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=test_data_proportion,
                                                    stratify=y,
                                                    random_state=random_state)

# Reduce dimension to 2 with PCA
pca = make_pipeline(StandardScaler(),
                    PCA(n_components=number_of_components,
                        random_state=random_state))

# Reduce dimension to 2 with LinearDiscriminantAnalysis
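# (Note: LDA can produce at most n_classes - 1 components; the digits
# dataset has 10 classes, so 2 components is well within that limit.)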
lda = make_pipeline(StandardScaler(),
                    LinearDiscriminantAnalysis(n_components=number_of_components))

# Instantiate a k nearest neighbors classifier for evaluation.
knn = KNeighborsClassifier(n_neighbors=number_of_neighbors)

# Make a list of the methods to be compared
dim_reduction_methods = [('PCA', pca), ('LDA', lda)]

for name, model in dim_reduction_methods:
    # Create a separate figure for each method.
    plt.figure()

    # Fit the method's model
    model.fit(X_train, y_train)

    # Fit a nearest neighbor classifier on the embedded training set
    knn.fit(model.transform(X_train), y_train)

    # Compute the nearest neighbor accuracy on the embedded test set
    acc_knn = knn.score(model.transform(X_test), y_test)

    # Embed the data set in 2 dimensions using the fitted model
    X_embedded = model.transform(X)

    # Plot the projected points and show the evaluation score
    plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y, s=30, cmap='Set1')
    plt.title("{}, KNN (k={})\nTest accuracy = {:.2f}".format(name,
                                                              number_of_neighbors,
                                                              acc_knn))
plt.show()
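
As a quick follow-up (not part of the original script), one can check how much of the total variance the two retained PCA components capture. In a scikit-learn pipeline built with make_pipeline, the fitted PCA step is reached through named_steps:

# Fraction of the total variance captured by each of the two PCA
# components (run after the loop above, once pca has been fitted).
print(pca.named_steps['pca'].explained_variance_ratio_)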

Results are shown below:

[Figures: two-dimensional scatter plots of the digits data embedded by PCA and by LDA, colored by class, with each title reporting the KNN test accuracy.]