Sanjiv R. Das
%pylab inline
import pandas as pd
import os
from ipypublish import nb_setup
Populating the interactive namespace from numpy and matplotlib
This dataset is highly imbalanced. We applied an oversampling algorithm (SMOTE) to it and balanced the minority class to be equal in size to the majority class. We saved this in a pickle file and read it in here.
#LOAD IN CREDIT CARD DATA
import pickle

# NOTE: unpickling can execute arbitrary code; only load this file from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open("DSTMAA_data/CCdata.p", "rb") as ccdata_file:
    CCdata = pickle.load(ccdata_file)

# The pickle holds a dict with the pre-split features and labels.
X_train = CCdata['X_train']
y_train = CCdata['y_train']
X_test = CCdata['X_test']
y_test = CCdata['y_test']

# Label distribution of the training split, then of the test split
hist(y_train,3)
show()
hist(y_test,3)
show()
# Presumably renders the two LDA reference figures; each call passes a single image at width=600.
nb_setup.images_hconcat(["DSTMAA_images/lda.jpeg"], width=600)
nb_setup.images_hconcat(["DSTMAA_images/lda2.jpeg"], width=600)
$D$ is often replaced by $Z$, which leads to the notion of a "Z-score" or discriminant score.
Notes: http://srdas.github.io/MLBook/DiscriminantFactorAnalysis.html#discriminant-analysis
#FIT THE LDA MODEL
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
# LDA classifier with default settings, fitted on the training split
model = LDA()
model.fit(X_train, y_train)
LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None, solver='svd', store_covariance=False, tol=0.0001)
#PREDICTION ON TEST DATA
# Evaluation helpers used by the cells that follow
from sklearn.metrics import (
    accuracy_score,
    auc,
    classification_report,
    confusion_matrix,
    roc_curve,
)

# Hard class predictions for the held-out test set
y_hat = model.predict(X_test)
#ACCURACY
#Out of sample
# Share of test labels predicted correctly. The test set is heavily imbalanced
# (152 positives out of 93,987 in the classification report below), so read
# this figure alongside the per-class precision and recall.
accuracy_score(y_test,y_hat)
0.9850085650143104
#CLASSIFICATION REPORT
# Per-class precision, recall, F1 and support on the test set
print(classification_report(y_test, y_hat))
precision recall f1-score support 0 1.00 0.99 0.99 93835 1 0.09 0.88 0.16 152 micro avg 0.99 0.99 0.99 93987 macro avg 0.54 0.93 0.58 93987 weighted avg 1.00 0.99 0.99 93987
#ROC, AUC
# Score each test observation with the predicted probability of class 1
# (second column of predict_proba).
y_score = model.predict_proba(X_test)[:,1]
fpr, tpr, _ = roc_curve(y_test, y_score)
title('ROC curve')
# The x-axis is the false positive rate, FP/(FP+TN) = 1 - specificity; it is not precision.
xlabel('FPR (1 - Specificity)')
ylabel('TPR (Recall)')
plot(fpr,tpr)
# Chance diagonal from (0,0) to (1,1), with both coordinates given explicitly
plot((0,1), (0,1), ls='dashed',color='black')
plt.show()
print('Area under curve (AUC): ', auc(fpr,tpr))
Area under curve (AUC): 0.9757804853424124
#CONFUSION MATRIX
# Rows are true classes, columns are predicted classes (scikit-learn convention)
cm = confusion_matrix(y_test, y_hat)
cm
array([[92444, 1391], [ 18, 134]])
# Load the tab-separated NCAA team statistics
ncaa = pd.read_csv("DSTMAA_data/ncaa.txt", sep='\t')
# Binary target: 1.0 for the first 32 rows, 0.0 for the next 32.
# ones()/zeros() already return arrays, so no list() round-trip is needed.
yy = append(ones(32), zeros(32))
ncaa["y"] = yy
ncaa.head()
No NAME | GMS | PTS | REB | AST | TO | A/T | STL | BLK | PF | FG | FT | 3P | y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1. NorthCarolina | 6 | 84.2 | 41.5 | 17.8 | 12.8 | 1.39 | 6.7 | 3.8 | 16.7 | 0.514 | 0.664 | 0.417 | 1.0 |
1 | 2. Illinois | 6 | 74.5 | 34.0 | 19.0 | 10.2 | 1.87 | 8.0 | 1.7 | 16.5 | 0.457 | 0.753 | 0.361 | 1.0 |
2 | 3. Louisville | 5 | 77.4 | 35.4 | 13.6 | 11.0 | 1.24 | 5.4 | 4.2 | 16.6 | 0.479 | 0.702 | 0.376 | 1.0 |
3 | 4. MichiganState | 5 | 80.8 | 37.8 | 13.0 | 12.6 | 1.03 | 8.4 | 2.4 | 19.8 | 0.445 | 0.783 | 0.329 | 1.0 |
4 | 5. Arizona | 4 | 79.8 | 35.0 | 15.8 | 14.5 | 1.09 | 6.0 | 6.5 | 13.3 | 0.542 | 0.759 | 0.397 | 1.0 |
#CREATE FEATURES
# Target: the 0/1 label column added to the frame
y = ncaa['y']
# Positional columns 2..12, i.e. PTS through 3P in the table above;
# this leaves out the team name, GMS and the y label.
X = ncaa.iloc[:,2:13]
X.head()
PTS | REB | AST | TO | A/T | STL | BLK | PF | FG | FT | 3P | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 84.2 | 41.5 | 17.8 | 12.8 | 1.39 | 6.7 | 3.8 | 16.7 | 0.514 | 0.664 | 0.417 |
1 | 74.5 | 34.0 | 19.0 | 10.2 | 1.87 | 8.0 | 1.7 | 16.5 | 0.457 | 0.753 | 0.361 |
2 | 77.4 | 35.4 | 13.6 | 11.0 | 1.24 | 5.4 | 4.2 | 16.6 | 0.479 | 0.702 | 0.376 |
3 | 80.8 | 37.8 | 13.0 | 12.6 | 1.03 | 8.4 | 2.4 | 19.8 | 0.445 | 0.783 | 0.329 |
4 | 79.8 | 35.0 | 15.8 | 14.5 | 1.09 | 6.0 | 6.5 | 13.3 | 0.542 | 0.759 | 0.397 |
#FIT MODEL
# Fit LDA on the full NCAA sample, then predict the same rows (in-sample).
# fit() returns the fitted estimator, so construction and fitting chain.
model = LDA().fit(X,y)
ypred = model.predict(X)
#CONFUSION MATRIX
# Rows are true classes, columns are predicted classes (scikit-learn convention)
cm = confusion_matrix(y, ypred)
cm
array([[27, 5], [ 5, 27]])
#ACCURACY
# In-sample accuracy: ypred was produced on the same rows the model was fitted to
accuracy_score(y,ypred)
0.84375
#CLASSIFICATION REPORT
# Per-class precision, recall, F1 and support for the in-sample predictions
print(classification_report(y, ypred))
precision recall f1-score support 0.0 0.84 0.84 0.84 32 1.0 0.84 0.84 0.84 32 micro avg 0.84 0.84 0.84 64 macro avg 0.84 0.84 0.84 64 weighted avg 0.84 0.84 0.84 64
#ROC, AUC
# Score each row with the predicted probability of class 1
# (second column of predict_proba); this is an in-sample ROC.
y_score = model.predict_proba(X)[:,1]
fpr, tpr, _ = roc_curve(y, y_score)
title('ROC curve')
# The x-axis is the false positive rate, FP/(FP+TN) = 1 - specificity; it is not precision.
xlabel('FPR (1 - Specificity)')
ylabel('TPR (Recall)')
plot(fpr,tpr)
# Chance diagonal from (0,0) to (1,1), with both coordinates given explicitly
plot((0,1), (0,1), ls='dashed',color='black')
plt.show()
print('Area under curve (AUC): ', auc(fpr,tpr))
Area under curve (AUC): 0.92578125