%pylab inline
import pandas as pd

Populating the interactive namespace from numpy and matplotlib


#PREDICTION ON TEST DATA
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve,auc
from sklearn.metrics import confusion_matrix


# Read the tab-separated NCAA statistics file into a DataFrame.
ncaa = pd.read_csv("DSTMAA_data/ncaa.txt", sep="\t")
# Label vector: 32 ones followed by 32 zeros, stored as column "y".
yy = append(ones(32), zeros(32))
ncaa["y"] = yy
ncaa.head()


#CREATE FEATURES
# Features are the columns at positions 2..12 (PTS through 3P in the
# displayed frame); the target is the "y" label column.
X = ncaa.iloc[:, 2:13]
y = ncaa["y"]
X.head()


#FIT MODEL
from sklearn.naive_bayes import GaussianNB as NB
# Fit Gaussian naive Bayes on all rows, then predict on those same rows
# (in-sample predictions; there is no hold-out split here).
model = NB().fit(X, y)
ypred = model.predict(X)


#CONFUSION MATRIX
# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_true=y, y_pred=ypred)
cm

array([[26,  6],
       [ 8, 24]])


#ACCURACY
# Fraction of rows whose predicted label equals the true label.
accuracy_score(y_true=y, y_pred=ypred)

0.78125


#CLASSIFICATION REPORT
# Per-class precision, recall, f1-score and support as a text table.
print(classification_report(y_true=y, y_pred=ypred))

              precision    recall  f1-score   support

         0.0       0.76      0.81      0.79        32
         1.0       0.80      0.75      0.77        32

   micro avg       0.78      0.78      0.78        64
   macro avg       0.78      0.78      0.78        64
weighted avg       0.78      0.78      0.78        64


#ROC, AUC
# Score each row with the predicted probability in column 1 of
# predict_proba (the class labelled 1).
y_score = model.predict_proba(X)[:,1]
fpr, tpr, _ = roc_curve(y, y_score)

title('ROC curve')
# FPR is the false positive rate, i.e. 1 - specificity; it is not precision.
xlabel('FPR (1 - Specificity)')
ylabel('TPR (Recall)')

plot(fpr,tpr)
# Chance-level diagonal from (0, 0) to (1, 1), with explicit x and y.
plot((0,1), (0,1), ls='dashed',color='black')
plt.show()
print('Area under curve (AUC): ', auc(fpr,tpr))

Area under curve (AUC):  0.9140625


#LOAD IN CREDIT CARD DATA
import pickle
# NOTE: unpickling can execute arbitrary code; only load this file when it
# comes from a trusted source.
# The context manager closes the file handle once loading finishes.
with open("DSTMAA_data/CCdata.p", "rb") as cc_file:
    CCdata = pickle.load(cc_file)
# Unpack the train/test splits stored under these four keys.
X_train = CCdata['X_train']
y_train = CCdata['y_train']
X_test = CCdata['X_test']
y_test = CCdata['y_test']


#FIT MODEL
from sklearn.naive_bayes import GaussianNB as NB
# Train Gaussian naive Bayes on the training split only; the fit call is
# the final expression so the fitted estimator is displayed.
model = NB()
model.fit(X=X_train, y=y_train)

GaussianNB(priors=None, var_smoothing=1e-09)


#CONFUSION MATRIX
# Predict on the held-out test split and tabulate true (rows) against
# predicted (columns) classes.
ypred = model.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=ypred)
cm

array([[93094,   741],
       [   24,   128]])


#ACCURACY
# Fraction of test rows whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=ypred)

0.9918605764627023


#CLASSIFICATION REPORT
# Per-class precision, recall, f1-score and support on the test split.
print(classification_report(y_true=y_test, y_pred=ypred))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     93835
           1       0.15      0.84      0.25       152

   micro avg       0.99      0.99      0.99     93987
   macro avg       0.57      0.92      0.62     93987
weighted avg       1.00      0.99      0.99     93987


#ROC, AUC
# Score each test row with the predicted probability in column 1 of
# predict_proba (the class labelled 1).
y_score = model.predict_proba(X_test)[:,1]
fpr, tpr, _ = roc_curve(y_test, y_score)

title('ROC curve')
# FPR is the false positive rate, i.e. 1 - specificity; it is not precision.
xlabel('FPR (1 - Specificity)')
ylabel('TPR (Recall)')

plot(fpr,tpr)
# Chance-level diagonal from (0, 0) to (1, 1), with explicit x and y.
plot((0,1), (0,1), ls='dashed',color='black')
plt.show()
print('Area under curve (AUC): ', auc(fpr,tpr))

Area under curve (AUC):  0.9719790547798067

	No NAME	GMS	PTS	REB	AST	TO	A/T	STL	BLK	PF	FG	FT	3P	y
0	1. NorthCarolina	6	84.2	41.5	17.8	12.8	1.39	6.7	3.8	16.7	0.514	0.664	0.417	1.0
1	2. Illinois	6	74.5	34.0	19.0	10.2	1.87	8.0	1.7	16.5	0.457	0.753	0.361	1.0
2	3. Louisville	5	77.4	35.4	13.6	11.0	1.24	5.4	4.2	16.6	0.479	0.702	0.376	1.0
3	4. MichiganState	5	80.8	37.8	13.0	12.6	1.03	8.4	2.4	19.8	0.445	0.783	0.329	1.0
4	5. Arizona	4	79.8	35.0	15.8	14.5	1.09	6.0	6.5	13.3	0.542	0.759	0.397	1.0

	PTS	REB	AST	TO	A/T	STL	BLK	PF	FG	FT	3P
0	84.2	41.5	17.8	12.8	1.39	6.7	3.8	16.7	0.514	0.664	0.417
1	74.5	34.0	19.0	10.2	1.87	8.0	1.7	16.5	0.457	0.753	0.361
2	77.4	35.4	13.6	11.0	1.24	5.4	4.2	16.6	0.479	0.702	0.376
3	80.8	37.8	13.0	12.6	1.03	8.4	2.4	19.8	0.445	0.783	0.329
4	79.8	35.0	15.8	14.5	1.09	6.0	6.5	13.3	0.542	0.759	0.397

Naive Bayes Classifier

What is Naive Bayes?

NCAA Dataset

Credit Card Dataset