%pylab inline
import pandas as pd
import os
from ipypublish import nb_setup

Populating the interactive namespace from numpy and matplotlib


# LOAD IN CREDIT CARD DATA
# The pickle file holds a mapping with the feature and label arrays stored
# under the keys 'X_train', 'y_train', 'X_test' and 'y_test'.
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
import pickle

# A context manager guarantees the file handle is closed after loading
# (the bare open(...) call used previously never closed it).
with open("DSTMAA_data/CCdata.p", "rb") as cc_file:
    CCdata = pickle.load(cc_file)

X_train = CCdata['X_train']
y_train = CCdata['y_train']
X_test = CCdata['X_test']
y_test = CCdata['y_test']


# Distribution of the training labels, bucketed into 3 bins.
hist(y_train, bins=3)
show()


# Same view for the test labels.
hist(y_test, bins=3)
show()


# Show the first LDA illustration (ipypublish helper; presumably renders the
# listed image file(s) inline at the requested width -- confirm in ipypublish docs).
nb_setup.images_hconcat(["DSTMAA_images/lda.jpeg"], width=600)


# Show the second LDA illustration with the same width setting.
nb_setup.images_hconcat(["DSTMAA_images/lda2.jpeg"], width=600)


#FIT THE LDA MODEL 
# LinearDiscriminantAnalysis is aliased to LDA for brevity.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
# No constructor arguments are passed, so the estimator uses its defaults.
model = LDA()
# Fit on the training split only; the test split is kept for evaluation below.
# As the last expression of the cell, the fitted estimator's repr is displayed.
model.fit(X_train, y_train)

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)


# PREDICTION ON TEST DATA
# Evaluation helpers used by the cells that follow.
from sklearn.metrics import (
    accuracy_score,
    auc,
    classification_report,
    confusion_matrix,
    roc_curve,
)

# Predicted class labels for the test observations.
y_hat = model.predict(X_test)


#ACCURACY
#Out of sample
# Share of test labels that the model predicts correctly.
# NOTE: the test set is heavily imbalanced (152 positives out of 93987 rows in
# the classification report), so accuracy alone says little about how well the
# rare class is detected; read it together with the report and confusion matrix.
accuracy_score(y_test,y_hat)

0.9850085650143104


# CLASSIFICATION REPORT
# Per-class precision, recall, f1-score and support on the test split.
test_report = classification_report(y_test, y_hat)
print(test_report)

              precision    recall  f1-score   support

           0       1.00      0.99      0.99     93835
           1       0.09      0.88      0.16       152

   micro avg       0.99      0.99      0.99     93987
   macro avg       0.54      0.93      0.58     93987
weighted avg       1.00      0.99      0.99     93987


# ROC, AUC
# Score every test observation with the predicted probability in column 1 of
# predict_proba (the second class, labelled 1 here) and let roc_curve sweep
# the decision threshold over those scores.
y_score = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score)

title('ROC curve')
# The x-axis is the false positive rate, FP / (FP + TN) = 1 - specificity.
# It is not precision, so the axis label says so explicitly.
xlabel('FPR (1 - Specificity)')
ylabel('TPR (Recall)')

plot(fpr, tpr)
# Chance-level diagonal from (0, 0) to (1, 1), with x and y given explicitly
# instead of relying on implicit x indices.
plot([0, 1], [0, 1], ls='dashed', color='black')
plt.show()
print('Area under curve (AUC): ', auc(fpr, tpr))

Area under curve (AUC):  0.9757804853424124


#CONFUSION MATRIX
# Rows are the true classes and columns the predicted classes: each row of the
# displayed matrix sums to that class's support in the classification report.
cm = confusion_matrix(y_test, y_hat)
# Bare expression so the notebook displays the matrix.
cm

array([[92444,  1391],
       [   18,   134]])


# Read the tab-separated NCAA team statistics.
ncaa = pd.read_csv("DSTMAA_data/ncaa.txt", sep='\t')
# Target vector: 32 ones followed by 32 zeros, so the first 32 rows are
# labelled 1 and the next 32 are labelled 0.
yy = append(ones(32), zeros(32))
ncaa["y"] = yy
# Preview the first rows, including the new label column.
ncaa.head()


#CREATE FEATURES
# Target: the 0/1 label column added when the data was loaded.
y = ncaa['y']
# Features: columns at positions 2..12 (PTS through 3P in the preview output),
# which leaves out the first two columns and the trailing label column.
# NOTE(review): positional slicing silently breaks if the file's column order changes.
X = ncaa.iloc[:,2:13]
# Preview the feature table.
X.head()


# FIT MODEL
# A fresh LDA estimator replaces the credit-card model bound to `model`.
model = LDA().fit(X, y)
# Predictions are made on the same rows used for fitting, so every metric
# computed from `ypred` is in-sample.
ypred = model.predict(X)


#CONFUSION MATRIX
# Counts of true class (rows) versus predicted class (columns) for the NCAA
# data, using predictions made on the rows the model was fitted on.
cm = confusion_matrix(y, ypred)
# Bare expression so the notebook displays the matrix.
cm

array([[27,  5],
       [ 5, 27]])


#ACCURACY
# Share of labels predicted correctly. The predictions come from the same rows
# the model was fitted on, so this is an in-sample figure.
accuracy_score(y,ypred)

0.84375


# CLASSIFICATION REPORT
# Per-class precision, recall, f1-score and support for the NCAA predictions.
ncaa_report = classification_report(y, ypred)
print(ncaa_report)

              precision    recall  f1-score   support

         0.0       0.84      0.84      0.84        32
         1.0       0.84      0.84      0.84        32

   micro avg       0.84      0.84      0.84        64
   macro avg       0.84      0.84      0.84        64
weighted avg       0.84      0.84      0.84        64


# ROC, AUC
# Score every row with the predicted probability in column 1 of predict_proba
# (the second class, labelled 1.0 here) and let roc_curve sweep the decision
# threshold. X is the data the model was fitted on, so this curve is in-sample.
y_score = model.predict_proba(X)[:, 1]
fpr, tpr, _ = roc_curve(y, y_score)

title('ROC curve')
# The x-axis is the false positive rate, FP / (FP + TN) = 1 - specificity.
# It is not precision, so the axis label says so explicitly.
xlabel('FPR (1 - Specificity)')
ylabel('TPR (Recall)')

plot(fpr, tpr)
# Chance-level diagonal from (0, 0) to (1, 1), with x and y given explicitly
# instead of relying on implicit x indices.
plot([0, 1], [0, 1], ls='dashed', color='black')
plt.show()
print('Area under curve (AUC): ', auc(fpr, tpr))

Area under curve (AUC):  0.92578125

	No NAME	GMS	PTS	REB	AST	TO	A/T	STL	BLK	PF	FG	FT	3P	y
0	1. NorthCarolina	6	84.2	41.5	17.8	12.8	1.39	6.7	3.8	16.7	0.514	0.664	0.417	1.0
1	2. Illinois	6	74.5	34.0	19.0	10.2	1.87	8.0	1.7	16.5	0.457	0.753	0.361	1.0
2	3. Louisville	5	77.4	35.4	13.6	11.0	1.24	5.4	4.2	16.6	0.479	0.702	0.376	1.0
3	4. MichiganState	5	80.8	37.8	13.0	12.6	1.03	8.4	2.4	19.8	0.445	0.783	0.329	1.0
4	5. Arizona	4	79.8	35.0	15.8	14.5	1.09	6.0	6.5	13.3	0.542	0.759	0.397	1.0

	PTS	REB	AST	TO	A/T	STL	BLK	PF	FG	FT	3P
0	84.2	41.5	17.8	12.8	1.39	6.7	3.8	16.7	0.514	0.664	0.417
1	74.5	34.0	19.0	10.2	1.87	8.0	1.7	16.5	0.457	0.753	0.361
2	77.4	35.4	13.6	11.0	1.24	5.4	4.2	16.6	0.479	0.702	0.376
3	80.8	37.8	13.0	12.6	1.03	8.4	2.4	19.8	0.445	0.783	0.329
4	79.8	35.0	15.8	14.5	1.09	6.0	6.5	13.3	0.542	0.759	0.397

Discriminant Analysis

Credit Card Dataset

Linear Discriminant Analysis

Discriminant Function

NCAA Dataset