Sanjiv R. Das
%pylab inline
import pandas as pd
from ipypublish import nb_setup
Populating the interactive namespace from numpy and matplotlib
# LOAD DATA: NCAA tournament team statistics (tab-delimited text file)
ncaa = pd.read_csv("DSTMAA_data/ncaa.txt", delimiter="\t")
ncaa.head()
No NAME | GMS | PTS | REB | AST | TO | A/T | STL | BLK | PF | FG | FT | 3P | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1. NorthCarolina | 6 | 84.2 | 41.5 | 17.8 | 12.8 | 1.39 | 6.7 | 3.8 | 16.7 | 0.514 | 0.664 | 0.417 |
1 | 2. Illinois | 6 | 74.5 | 34.0 | 19.0 | 10.2 | 1.87 | 8.0 | 1.7 | 16.5 | 0.457 | 0.753 | 0.361 |
2 | 3. Louisville | 5 | 77.4 | 35.4 | 13.6 | 11.0 | 1.24 | 5.4 | 4.2 | 16.6 | 0.479 | 0.702 | 0.376 |
3 | 4. MichiganState | 5 | 80.8 | 37.8 | 13.0 | 12.6 | 1.03 | 8.4 | 2.4 | 19.8 | 0.445 | 0.783 | 0.329 |
4 | 5. Arizona | 4 | 79.8 | 35.0 | 15.8 | 14.5 | 1.09 | 6.0 | 6.5 | 13.3 | 0.542 | 0.759 | 0.397 |
# CREATE FEATURES
# Keep the 11 statistics columns (positions 2..12), dropping the leading
# rank/name and games-played columns.
feature_cols = ncaa.columns[2:13]
X = ncaa[feature_cols]
X.head()
PTS | REB | AST | TO | A/T | STL | BLK | PF | FG | FT | 3P | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 84.2 | 41.5 | 17.8 | 12.8 | 1.39 | 6.7 | 3.8 | 16.7 | 0.514 | 0.664 | 0.417 |
1 | 74.5 | 34.0 | 19.0 | 10.2 | 1.87 | 8.0 | 1.7 | 16.5 | 0.457 | 0.753 | 0.361 |
2 | 77.4 | 35.4 | 13.6 | 11.0 | 1.24 | 5.4 | 4.2 | 16.6 | 0.479 | 0.702 | 0.376 |
3 | 80.8 | 37.8 | 13.0 | 12.6 | 1.03 | 8.4 | 2.4 | 19.8 | 0.445 | 0.783 | 0.329 |
4 | 79.8 | 35.0 | 15.8 | 14.5 | 1.09 | 6.0 | 6.5 | 13.3 | 0.542 | 0.759 | 0.397 |
# NORMALIZE
# Z-score every feature column (sklearn's scale standardizes with the
# population std, ddof=0), keeping the original column labels.
from sklearn.preprocessing import scale

Xs = pd.DataFrame(scale(X), columns=X.columns)
print(Xs.shape)
Xs.head()
(64, 11)
PTS | REB | AST | TO | A/T | STL | BLK | PF | FG | FT | 3P | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.791399 | 1.394302 | 1.320092 | -0.297755 | 1.116502 | -0.057707 | 0.602062 | -0.625961 | 1.619338 | -0.211355 | 0.945894 |
1 | 0.775133 | -0.092623 | 1.633680 | -0.966397 | 2.416689 | 0.550039 | -0.602062 | -0.689957 | 0.601708 | 0.473444 | 0.312470 |
2 | 1.078965 | 0.184936 | 0.222533 | -0.760661 | 0.710193 | -0.665452 | 0.831418 | -0.657959 | 0.994477 | 0.081031 | 0.482137 |
3 | 1.435182 | 0.660753 | 0.065739 | -0.349189 | 0.141362 | 0.737038 | -0.200687 | 0.365977 | 0.387470 | 0.704275 | -0.049486 |
4 | 1.330412 | 0.105634 | 0.797445 | 0.139434 | 0.303885 | -0.384954 | 2.150220 | -1.713894 | 2.119227 | 0.519611 | 0.719671 |
Suppose we reduce the $k=11$ dimensional feature space $X$ to a reduced factor space $R$ with $p=3$ dimensions. We translate with a loadings matrix $L$:
$$ R = X \cdot L $$where $R$ is $(64 \times 3)$, $X$ is $(64 \times 11)$, and $L$ is $(11 \times 3)$.
# REDUCED DATA
# Project the 11 standardized features onto 3 principal components.
from sklearn import decomposition

pca = decomposition.PCA(n_components=3)
R = pca.fit_transform(Xs)
print(R.shape)

# Covariance matrix of the standardized features. The diagonal is
# n/(n-1) = 64/63 ~ 1.0159 rather than exactly 1 because scale() used
# ddof=0 while DataFrame.cov() uses ddof=1.
C = Xs.cov()
C
PTS | REB | AST | TO | A/T | STL | BLK | PF | FG | FT | 3P | |
---|---|---|---|---|---|---|---|---|---|---|---|
PTS | 1.015873 | 0.065418 | 0.616990 | -0.148496 | 0.524359 | 0.026626 | 0.080895 | 0.042184 | 0.652497 | 0.117021 | 0.426928 |
REB | 0.065418 | 1.015873 | -0.232696 | 0.168788 | -0.329797 | -0.189989 | 0.203458 | -0.145426 | -0.207746 | -0.200005 | -0.229139 |
AST | 0.616990 | -0.232696 | 1.015873 | 0.020450 | 0.706120 | 0.096488 | -0.081354 | -0.013186 | 0.640854 | -0.155134 | 0.355424 |
TO | -0.148496 | 0.168788 | 0.020450 | 1.015873 | -0.635876 | 0.187273 | -0.042798 | 0.078073 | 0.018424 | -0.060498 | -0.040244 |
A/T | 0.524359 | -0.329797 | 0.706120 | -0.635876 | 1.015873 | -0.071913 | -0.036536 | -0.052240 | 0.414333 | -0.008547 | 0.197645 |
STL | 0.026626 | -0.189989 | 0.096488 | 0.187273 | -0.071913 | 1.015873 | 0.179408 | 0.286513 | -0.172642 | 0.139247 | -0.027863 |
BLK | 0.080895 | 0.203458 | -0.081354 | -0.042798 | -0.036536 | 0.179408 | 1.015873 | -0.009174 | -0.059236 | 0.076592 | 0.081699 |
PF | 0.042184 | -0.145426 | -0.013186 | 0.078073 | -0.052240 | 0.286513 | -0.009174 | 1.015873 | -0.232258 | 0.079845 | -0.021644 |
FG | 0.652497 | -0.207746 | 0.640854 | 0.018424 | 0.414333 | -0.172642 | -0.059236 | -0.232258 | 1.015873 | -0.116949 | 0.467952 |
FT | 0.117021 | -0.200005 | -0.155134 | -0.060498 | -0.008547 | 0.139247 | 0.076592 | 0.079845 | -0.116949 | 1.015873 | -0.113899 |
3P | 0.426928 | -0.229139 | 0.355424 | -0.040244 | 0.197645 | -0.027863 | 0.081699 | -0.021644 | 0.467952 | -0.113899 | 1.015873 |
# LOADINGS MATRIX L
# Rows of pca.components_ are the principal directions; transpose so each
# COLUMN of L holds one component's loadings (shape: 11 features x 3).
L = pca.components_.transpose()
print(L.shape)
print(X.columns)
L
(11, 3) Index(['PTS ', 'REB ', 'AST ', 'TO ', 'A/T ', 'STL ', 'BLK ', 'PF ', 'FG ', 'FT ', '3P'], dtype='object')
array([[-0.43884425, 0.02285078, -0.18631231], [ 0.1867903 , -0.43294301, -0.21835274], [-0.47238137, 0.04962787, -0.18252437], [ 0.17651088, -0.02325077, -0.68627945], [-0.45266018, 0.08602947, 0.37681287], [ 0.03888779, 0.57289362, -0.29086594], [ 0.02703794, 0.08087251, -0.16285993], [ 0.05607815, 0.51652087, -0.12761847], [-0.44993945, -0.18657986, -0.21900567], [ 0.03599791, 0.40620447, 0.17479897], [-0.32279124, -0.01667957, -0.25572689]])
# CHECK THAT DECOMPOSITION IS CORRECT
# NOTE: the original `sum(R - Xs.dot(L))` only worked because %pylab
# star-imported numpy's `sum`; the builtin `sum` would iterate the
# DataFrame's column labels instead of its values. Call numpy explicitly.
diff = R - Xs.dot(L)
print(np.sum(diff))              # per-column residuals: floating-point noise ~0
print(np.allclose(R, Xs.dot(L))) # stronger check: True iff R == Xs @ L
0 -1.893624e-14 1 3.152340e-14 2 1.458902e-15 dtype: float64
# LOAD DATA: monthly US Treasury yield series (tab-delimited text file)
rates = pd.read_csv("DSTMAA_data/tryrates.txt", delimiter="\t")
print(rates.shape)
rates.head()
(367, 9)
DATE | FYGM3 | FYGM6 | FYGT1 | FYGT2 | FYGT3 | FYGT5 | FYGT7 | FYGT10 | |
---|---|---|---|---|---|---|---|---|---|
0 | Jun-76 | 5.41 | 5.77 | 6.52 | 7.06 | 7.31 | 7.61 | 7.75 | 7.86 |
1 | Jul-76 | 5.23 | 5.53 | 6.20 | 6.85 | 7.12 | 7.49 | 7.70 | 7.83 |
2 | Aug-76 | 5.14 | 5.40 | 6.00 | 6.63 | 6.86 | 7.31 | 7.58 | 7.77 |
3 | Sep-76 | 5.08 | 5.30 | 5.84 | 6.42 | 6.66 | 7.13 | 7.41 | 7.59 |
4 | Oct-76 | 4.92 | 5.06 | 5.50 | 5.98 | 6.24 | 6.75 | 7.16 | 7.41 |
# PCA
# Fit 2 principal components on the raw (unscaled) yield columns.
X = rates.drop(columns=["DATE"])
pca = decomposition.PCA(n_components=2)
Y = pca.fit_transform(X)
Y.shape
# Pairwise correlations of the yield series (all strongly positive).
X.corr()
FYGM3 | FYGM6 | FYGT1 | FYGT2 | FYGT3 | FYGT5 | FYGT7 | FYGT10 | |
---|---|---|---|---|---|---|---|---|
FYGM3 | 1.000000 | 0.997537 | 0.991125 | 0.975089 | 0.961225 | 0.938329 | 0.922041 | 0.906564 |
FYGM6 | 0.997537 | 1.000000 | 0.997350 | 0.985125 | 0.972844 | 0.951266 | 0.935603 | 0.920542 |
FYGT1 | 0.991125 | 0.997350 | 1.000000 | 0.993696 | 0.984692 | 0.966859 | 0.953130 | 0.939686 |
FYGT2 | 0.975089 | 0.985125 | 0.993696 | 1.000000 | 0.997767 | 0.987892 | 0.978651 | 0.968093 |
FYGT3 | 0.961225 | 0.972844 | 0.984692 | 0.997767 | 1.000000 | 0.995622 | 0.989403 | 0.981307 |
FYGT5 | 0.938329 | 0.951266 | 0.966859 | 0.987892 | 0.995622 | 1.000000 | 0.998435 | 0.994569 |
FYGT7 | 0.922041 | 0.935603 | 0.953130 | 0.978651 | 0.989403 | 0.998435 | 1.000000 | 0.998493 |
FYGT10 | 0.906564 | 0.920542 | 0.939686 | 0.968093 | 0.981307 | 0.994569 | 0.998493 | 1.000000 |
#EXPLAINED VARIANCE
# Fraction of total variance captured by each of the 2 fitted components
# (here PC1 alone accounts for ~97.6% -- see the array printed below).
pca.explained_variance_ratio_
array([0.97558798, 0.02283477])
#PLOT COMPONENTS
# Put each component on its own figure: without figure() the second plot()
# draws onto the PC1 axes and title('PC2') overwrites the first title when
# these lines run as a single script/cell.
figure()
plot(Y[:, 0])
title('PC1')
grid()

figure()
plot(Y[:, 1])
title('PC2')
grid()