Sanjiv R. Das
%pylab inline
import pandas as pd
import os
from ipypublish import nb_setup
Populating the interactive namespace from numpy and matplotlib
This dataset is highly imbalanced. We applied an oversampling algorithm (SMOTE) to it and balanced the minority class to be equal in size to the majority class. We saved this in a pickle file and read it in here.
#LOAD IN CREDIT CARD DATA
import pickle

# NOTE: unpickling can execute arbitrary code; only load this file if it comes
# from a trusted source.
# The context manager closes the file handle as soon as the data is read
# (the bare open() call previously left it open).
with open("DSTMAA_data/CCdata.p", "rb") as cc_file:
    CCdata = pickle.load(cc_file)

# The pickle holds a dict with separate train/test features and labels.
X_train = CCdata['X_train']
y_train = CCdata['y_train']
X_test = CCdata['X_test']
y_test = CCdata['y_test']

# Histogram of the training labels (3 bins) to inspect the class balance.
hist(y_train,3)
show()
# Same view for the test labels.
hist(y_test,3)
show()
# Hand the first LDA illustration (lda.jpeg) to nb_setup.images_hconcat with
# width=600 -- presumably this renders the image inline; confirm against nb_setup.
nb_setup.images_hconcat(["DSTMAA_images/lda.jpeg"], width=600)
# Same call for the second LDA illustration (lda2.jpeg).
nb_setup.images_hconcat(["DSTMAA_images/lda2.jpeg"], width=600)
$D$ is often replaced by $Z$, which leads to the notion of a "Z-score" or discriminant score.
Notes: http://srdas.github.io/MLBook/DiscriminantFactorAnalysis.html#discriminant-analysis
#FIT THE LDA MODEL 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
# LDA with all constructor defaults, fitted on the training split.
model = LDA()
# Left as a bare expression so the notebook echoes the fitted estimator.
model.fit(X_train, y_train)
LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)
#PREDICTION ON TEST DATA
# Metric helpers used by the evaluation cells that follow.
from sklearn.metrics import (
    accuracy_score,
    auc,
    classification_report,
    confusion_matrix,
    roc_curve,
)

# Hard class labels predicted for the held-out test rows.
y_hat = model.predict(X_test)
#ACCURACY
#Out of sample
# Share of test rows whose predicted label equals the true label.
# NOTE(review): the test split appears heavily imbalanced (see the per-class
# supports in the classification report output), so accuracy alone can look
# high even when the minority class is predicted poorly.
accuracy_score(y_test,y_hat)
0.9850085650143104
#CLASSIFICATION REPORT
# Per-class precision, recall, F1 and support for the test-set predictions.
print(classification_report(y_test, y_hat))
              precision    recall  f1-score   support
           0       1.00      0.99      0.99     93835
           1       0.09      0.88      0.16       152
   micro avg       0.99      0.99      0.99     93987
   macro avg       0.54      0.93      0.58     93987
weighted avg       1.00      0.99      0.99     93987
#ROC, AUC
# Score each test row with the predicted probability of the second class
# (column 1 of predict_proba), then sweep thresholds to get the ROC points.
y_score = model.predict_proba(X_test)[:,1]
fpr, tpr, _ = roc_curve(y_test, y_score)
title('ROC curve')
# FPR = FP / (FP + TN) = 1 - specificity. It is not precision
# (TP / (TP + FP)), so the axis label must not call it that.
xlabel('FPR (1 - Specificity)')
ylabel('TPR (Recall)')
plot(fpr,tpr)
# Chance diagonal from (0, 0) to (1, 1), with x and y given explicitly.
plot((0,1), (0,1), ls='dashed',color='black')
plt.show()
print('Area under curve (AUC): ', auc(fpr,tpr))
Area under curve (AUC): 0.9757804853424124
#CONFUSION MATRIX
# scikit-learn convention: rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_hat)
# Bare expression so the notebook displays the matrix.
cm
array([[92444,  1391],
       [   18,   134]])
# Read the tab-separated NCAA team statistics.
ncaa = pd.read_csv("DSTMAA_data/ncaa.txt", sep='\t')
# Label vector: 1.0 for the first 32 rows, 0.0 for the next 32.
yy = append(ones(32), zeros(32))
ncaa["y"] = yy
# Preview the first rows, including the new label column.
ncaa.head()
| | No NAME | GMS | PTS | REB | AST | TO | A/T | STL | BLK | PF | FG | FT | 3P | y |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1. NorthCarolina | 6 | 84.2 | 41.5 | 17.8 | 12.8 | 1.39 | 6.7 | 3.8 | 16.7 | 0.514 | 0.664 | 0.417 | 1.0 | 
| 1 | 2. Illinois | 6 | 74.5 | 34.0 | 19.0 | 10.2 | 1.87 | 8.0 | 1.7 | 16.5 | 0.457 | 0.753 | 0.361 | 1.0 | 
| 2 | 3. Louisville | 5 | 77.4 | 35.4 | 13.6 | 11.0 | 1.24 | 5.4 | 4.2 | 16.6 | 0.479 | 0.702 | 0.376 | 1.0 | 
| 3 | 4. MichiganState | 5 | 80.8 | 37.8 | 13.0 | 12.6 | 1.03 | 8.4 | 2.4 | 19.8 | 0.445 | 0.783 | 0.329 | 1.0 | 
| 4 | 5. Arizona | 4 | 79.8 | 35.0 | 15.8 | 14.5 | 1.09 | 6.0 | 6.5 | 13.3 | 0.542 | 0.759 | 0.397 | 1.0 | 
#CREATE FEATURES
# Target: the label column stored under "y".
y = ncaa['y']
# Features: positional columns 2 through 12 (PTS ... 3P in the preview output),
# which leaves out the team name, GMS and the label column.
X = ncaa.iloc[:,2:13]
X.head()
| | PTS | REB | AST | TO | A/T | STL | BLK | PF | FG | FT | 3P |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 84.2 | 41.5 | 17.8 | 12.8 | 1.39 | 6.7 | 3.8 | 16.7 | 0.514 | 0.664 | 0.417 | 
| 1 | 74.5 | 34.0 | 19.0 | 10.2 | 1.87 | 8.0 | 1.7 | 16.5 | 0.457 | 0.753 | 0.361 | 
| 2 | 77.4 | 35.4 | 13.6 | 11.0 | 1.24 | 5.4 | 4.2 | 16.6 | 0.479 | 0.702 | 0.376 | 
| 3 | 80.8 | 37.8 | 13.0 | 12.6 | 1.03 | 8.4 | 2.4 | 19.8 | 0.445 | 0.783 | 0.329 | 
| 4 | 79.8 | 35.0 | 15.8 | 14.5 | 1.09 | 6.0 | 6.5 | 13.3 | 0.542 | 0.759 | 0.397 | 
#FIT MODEL
# Fresh default LDA fitted on the NCAA features; fit() returns the estimator.
model = LDA().fit(X, y)
# In-sample predictions: the rows used for fitting are the rows being scored.
ypred = model.predict(X)
#CONFUSION MATRIX
# scikit-learn convention: rows are true classes, columns are predicted classes.
cm = confusion_matrix(y, ypred)
# Bare expression so the notebook displays the matrix.
cm
array([[27,  5],
       [ 5, 27]])
#ACCURACY
# In-sample accuracy: ypred was produced on the same rows the model was fit on.
accuracy_score(y,ypred)
0.84375
#CLASSIFICATION REPORT
# Per-class precision, recall, F1 and support for the in-sample predictions.
print(classification_report(y, ypred))
              precision    recall  f1-score   support
         0.0       0.84      0.84      0.84        32
         1.0       0.84      0.84      0.84        32
   micro avg       0.84      0.84      0.84        64
   macro avg       0.84      0.84      0.84        64
weighted avg       0.84      0.84      0.84        64
#ROC, AUC
# Score each row with the predicted probability of the second class
# (column 1 of predict_proba), then sweep thresholds to get the ROC points.
y_score = model.predict_proba(X)[:,1]
fpr, tpr, _ = roc_curve(y, y_score)
title('ROC curve')
# FPR = FP / (FP + TN) = 1 - specificity. It is not precision
# (TP / (TP + FP)), so the axis label must not call it that.
xlabel('FPR (1 - Specificity)')
ylabel('TPR (Recall)')
plot(fpr,tpr)
# Chance diagonal from (0, 0) to (1, 1), with x and y given explicitly.
plot((0,1), (0,1), ls='dashed',color='black')
plt.show()
print('Area under curve (AUC): ', auc(fpr,tpr))
Area under curve (AUC): 0.92578125