Author: Blake Conrad

Objective: Reduce high dimensionality based on maximum variance distributions.

In [1]:
# - - - - - - - - - - - - - - - #
# Author: Blake Conrad
# Purpose: HW1 CSCI 48100
# - - - - - - - - - - - - - - - #

from __future__ import division
import sys
import numpy as np
from numpy import linalg as LA
import pandas as pd
In [2]:
# - - - - - - - - - - - - - - - #
# Import the data
# - - - - - - - - - - - - - - - #

# textFileNameInCurDir = sys.args[1]
textFileNameInCurDir = "magic04.txt"
df = pd.read_csv(textFileNameInCurDir, 
                 sep=",",
                 header=None,
                names=[ "v1",
                        "v2",
                        "v3",
                        "v4",
                        "v5",
                        "v6",
                        "v7",
                        "v8",
                        "v9",
                        "v10",
                        "v11"],
                dtype={'v1': np.float64,
                       'v2': np.float64,
                       'v3': np.float64,
                       'v4': np.float64,
                       'v5': np.float64,
                       'v6': np.float64,
                       'v7': np.float64,
                       'v8': np.float64,
                       'v9': np.float64,
                       'v10': np.float64})
df = df.drop('v11',1)
In [3]:
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -#
# 1: 
# Subroutine to compute a matrix's covariance (Eq. 2.31)
# Confirmation that the subroutine acts as NumPy.cov()
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -#

def getCovariance(df):
    
    # - - - - - - - - - - - - - - #
    # Z = D - transpose(mu)*1     #
    # - - - - - - - - - - - - - - #
    
    # Get transpose(mu)*1; the mean matrix
    mu_T1 = df.mean()
    mu_T2 = pd.DataFrame([mu_T1],dtype=np.float64)
    mu_T_times_1 = pd.concat([mu_T2] * len(df.index))
    
    # Get D - tranpose(mu)*1; the centered matrix
    Z = np.subtract(df.as_matrix(),mu_T_times_1.as_matrix())    

    # Get tranpose(Z)*Z/n; the covariance matrix
    n_rows, n_cols = np.shape(Z)
    ZT = Z.transpose()
    ZTZ = np.dot(ZT,Z)
    cov_mat = np.true_divide(ZTZ,n_rows)
    
    # Return the covariance matrix object created; np.ndarray object
    return (Z,cov_mat)

# Get the covariance matrix; from my function
Z, my_cov = getCovariance(df)

# Get the covariance matrix; from NumPy
real_cov = np.cov(df.as_matrix().transpose(), bias=True)

# Compare the results; true
print(np.allclose(my_cov,real_cov))
True
In [4]:
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -#
# 2. 
# Get the two dominant eigenvectors
# Project the data onto this new eigenspace
# Print the variance of the new projected datapoints
# - - - - - - - - - - - - - - - - - - - - - - - - - - - #

# Get eigen vectors and values; default sorted
LAMBDA,U = LA.eig(my_cov)

# Sort eigen values, just in case
idx = LAMBDA.argsort()[::-1]   
LAMBDA = LAMBDA[idx]
U = U[:,idx]

# Create the new projection space
df_eig = pd.DataFrame({'eig0':U[0],
                        'eig1':U[1]})

# Project the old data onto the new eigen space
projectedData = pd.DataFrame(np.dot(Z,df_eig.as_matrix()))

# Print only the variance of this new vectorspace
Z2, projectedVariance = getCovariance(projectedData)
print(np.shape(projectedVariance))
print(projectedVariance)
(2, 2)
[[  479.15757873  -666.32826151]
 [ -666.32826151  2437.6707179 ]]
In [5]:
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -#
# 3.
# Print the covariance matrix Σ in its eigen-
#  decomposition form (UΛUT)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -#

#Use linalg.eig to find all the eigenvectors, and print the 

# Get eigen vectors and values; default sorted
Z3, my_cov = getCovariance(df)

# Get the eigen values and vectors
LAMBDA,U = LA.eig(my_cov)

# Format the decomposition and compute 
UT = U.transpose()
U_LAMBDA = U * LAMBDA
SIGMA = U_LAMBDA * UT

# Print the SIGMA
print(np.shape(SIGMA))
print(SIGMA)
(10, 10)
[[  7.07033179e+02   5.89487275e+01  -4.04272137e-01   1.06269377e+00
    3.01078665e-02  -3.27734998e+00  -2.84280723e+00   1.16569616e-06
    2.61443721e-07   1.50124413e-08]
 [  1.00644360e+02   1.11889533e+01  -2.52518380e-02  -1.09538689e-01
    1.84069365e-03   6.99771500e+00   6.19365703e+01   1.19375311e-06
    8.57531173e-07   9.02441261e-09]
 [ -1.31920680e+00  -4.82633045e-02   2.26257302e-03  -4.69278908e-03
   -4.76733203e-04  -3.12472801e-02   1.03815775e+00  -9.99612408e-04
   -2.12223982e-04   1.73366293e-06]
 [  5.27021089e+00  -3.18179642e-01  -7.13200806e-03   9.00676061e-03
    1.42052336e-03  -4.28286107e-03  -2.72689334e-02  -3.30913746e-04
    7.64001085e-04   5.00768534e-05]
 [  3.24660140e-01   1.16256032e-02  -1.57537997e-03   3.08871266e-03
    5.15695272e-04   5.26660042e-05   2.17515014e-02   1.58453957e-04
   -5.48795487e-03  -3.42255451e-05]
 [ -4.97332113e+01   6.21963534e+01  -1.45310445e-01  -1.31050236e-02
    7.41148205e-05   3.38774157e-02  -2.56305019e-02  -4.35822380e-05
   -1.00774802e-08  -2.61202051e-12]
 [ -1.59422711e+02   2.03439333e+03   1.78413232e+01  -3.08355053e-01
    1.13120946e-01  -9.47188047e-02   9.98716992e-02   4.47589470e-06
   -4.42581046e-08   1.49876750e-11]
 [  8.87770948e-02   5.32494427e-02  -2.33296450e+01  -5.08171741e+00
    1.11910197e+00  -2.18726503e-01   6.07844713e-03   1.85060608e-09
    1.55162785e-10  -1.10846529e-13]
 [  1.60225730e-01   3.07814411e-01  -3.98574428e+01   9.44122086e+01
   -3.11900150e+02  -4.06988665e-04  -4.83664346e-04   1.24860780e-09
    3.96156560e-09  -1.86053844e-12]
 [  2.56353160e-01   9.02591596e-02   9.07220685e+00   1.72426703e+02
   -5.41986833e+01  -2.93927521e-06   4.56371840e-06  -2.48538622e-11
   -5.18408312e-11   3.02495729e-15]]
In [6]:
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -#
# 4: Write a subroutine to implement PCA Algorithm (Algorithm 7.1, Page 198).
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -#

def get_reduced_dimension(LAMBDA, alpha):
    
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - #
    # Choose the smallest r s.t f(r) >= alpha given that
    #  f(r) = sum(i->r lambda_i)/sum(i->d lambda_i)
    #   when r = {1,2,...,d}
    #
    #
    #
    # So basically, pick the first appearance of r >= alpha.
    #
    #
    #
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - #
    
    # The denominator doesn't change.
    denominator = np.sum(LAMBDA)
    
    # Update a list of each quotient
    each_possible_r = []
    for r in range(len(LAMBDA)):
        numerator_i = np.sum([LAMBDA[i] for i in range(r)])
        each_possible_r.append(np.divide(numerator_i,denominator))
        
    # Grab the smallest r s.t r captures alpha*100 % variance
    smallest_r=0
    for i in range(len(each_possible_r)):
        print each_possible_r[i], " "
        if(each_possible_r[i] >= alpha):
            print("Max dimension of: ",i)
            smallest_r=i
            break
            
    # Return the smallest r with respect to alpha
    return smallest_r

def PCA(df, alpha):
    # - - - - - - - - - - - - - - - #
    Z4, coV = getCovariance(df)
    LAMBDA, U = LA.eig(coV)

    # Sort eigen values, just in case
    idx = LAMBDA.argsort()[::-1]   
    LAMBDA = U[idx]
    U = U[:,idx]

    # Find the reduced vectorspace of R^r
    r = get_reduced_dimension(LAMBDA, alpha)

    # compute the projection of the points from df onto 

    df_eig = pd.DataFrame({'eig'.join(str(dim)):U[dim]
                              for dim in range(0,r)})
    print(df_eig.head())
    A = np.dot(Z4,df_eig.as_matrix())
    
    # Return the projected data matrix onto the new vector space
    return(A)
    
df_onto_u_at_alpha_thresh = PCA(df, alpha=0.90)

# Print the first 10 data observations in our new vector space
# capturing 90% variance
print(df_onto_u_at_alpha_thresh[:10])
-0.0  
0.0422813601377  
0.266317550455  
0.540134466499  
0.516531244727  
0.76166258661  
0.926997094849  
('Max dimension of: ', 6)
          0         1         2         3         4         5
0  0.327812  0.113810  0.003061 -0.000927 -0.000522 -0.380700
1  0.134406  0.053884 -0.000659  0.000258  0.000153 -0.693897
2 -0.065508  0.018994 -0.001059  0.000510  0.000295  0.540077
3 -0.863988 -0.319766 -0.006930  0.002606  0.001513 -0.285130
4 -0.094551  0.019755 -0.002652  0.001539  0.000919  0.005484
[[  -4.37451898  -14.72489059    0.47587644  -74.01427705   85.29049027
    14.0265176 ]
 [  -3.55913418  -16.40020321   17.44255944   19.4606638    -0.36860138
    15.84860872]
 [  27.90616902   86.00346935   26.31441721  -18.12100001  -86.82609289
  -120.27070976]
 [ -16.68135491   11.00981256   11.62048934  -31.7215939    71.77924275
    19.5278976 ]
 [  14.55484735  -13.61627625   -7.92288297  114.44087381 -119.55622699
   -13.79941536]
 [  11.63625053  -32.00054517    0.70820766   46.10681419  -22.57146002
     2.63044549]
 [   7.43948841  -26.81513233   -0.82446377   34.94186693   -7.19973718
     6.12737696]
 [  -5.79847681  -13.6364257    13.73014459   43.57682511  -22.73858015
    16.27575164]
 [  45.20803082  -65.69654439  -30.82752175   61.58304781  -25.32733622
   -29.77212774]
 [   8.81610021  -33.15879171   11.33582669  -37.45150504   85.35977094
     8.18149063]]
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
www.000webhost.com