Sanjiv R. Das
%pylab inline
import pandas as pd
Populating the interactive namespace from numpy and matplotlib
Classification based on the class with the highest posterior probability:
$$ Pr[C_j | x_1,...,x_n] = \frac{Pr[x_1,...,x_n | C_j] \cdot Pr[C_j]}{\sum_i Pr[x_1,...,x_n | C_i] \cdot Pr[C_i]} $$
and
$$ Pr[x_1,...,x_n | C_j] = f[x_1|C_j] \cdot f[x_2|C_j] \cdots f[x_n|C_j] $$
where the last equation encapsulates "naivety", i.e., $x_1,...,x_n$ are assumed to be independent given the class, and each is Gaussian, $x_i | C_j \sim N(\mu_{ij}, \sigma_{ij}^2)$, with density $f[x_i|C_j]$ whose mean and variance are computed from the raw data.
#IMPORT EVALUATION METRICS AND LOAD NCAA DATA
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve,auc
from sklearn.metrics import confusion_matrix
# Read the tab-separated NCAA team statistics into a DataFrame.
ncaa = pd.read_csv("DSTMAA_data/ncaa.txt", sep='\t')
# Class labels: 1.0 for the first 32 rows, 0.0 for the next 32 rows.
yy = append(ones(32), zeros(32))
ncaa["y"] = yy
# Display the first rows as the cell output.
ncaa.head()
| | No NAME | GMS | PTS | REB | AST | TO | A/T | STL | BLK | PF | FG | FT | 3P | y |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1. NorthCarolina | 6 | 84.2 | 41.5 | 17.8 | 12.8 | 1.39 | 6.7 | 3.8 | 16.7 | 0.514 | 0.664 | 0.417 | 1.0 | 
| 1 | 2. Illinois | 6 | 74.5 | 34.0 | 19.0 | 10.2 | 1.87 | 8.0 | 1.7 | 16.5 | 0.457 | 0.753 | 0.361 | 1.0 | 
| 2 | 3. Louisville | 5 | 77.4 | 35.4 | 13.6 | 11.0 | 1.24 | 5.4 | 4.2 | 16.6 | 0.479 | 0.702 | 0.376 | 1.0 | 
| 3 | 4. MichiganState | 5 | 80.8 | 37.8 | 13.0 | 12.6 | 1.03 | 8.4 | 2.4 | 19.8 | 0.445 | 0.783 | 0.329 | 1.0 | 
| 4 | 5. Arizona | 4 | 79.8 | 35.0 | 15.8 | 14.5 | 1.09 | 6.0 | 6.5 | 13.3 | 0.542 | 0.759 | 0.397 | 1.0 | 
#CREATE FEATURES
# Target: the 0/1 label column added to the frame.
y = ncaa['y']
# Predictors: columns at positions 2 through 12 (PTS ... 3P in the displayed
# table), which leaves out the team name, GMS, and the label.
X = ncaa.iloc[:,2:13]
# Display the first rows of the feature matrix as the cell output.
X.head()
| | PTS | REB | AST | TO | A/T | STL | BLK | PF | FG | FT | 3P |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 84.2 | 41.5 | 17.8 | 12.8 | 1.39 | 6.7 | 3.8 | 16.7 | 0.514 | 0.664 | 0.417 | 
| 1 | 74.5 | 34.0 | 19.0 | 10.2 | 1.87 | 8.0 | 1.7 | 16.5 | 0.457 | 0.753 | 0.361 | 
| 2 | 77.4 | 35.4 | 13.6 | 11.0 | 1.24 | 5.4 | 4.2 | 16.6 | 0.479 | 0.702 | 0.376 | 
| 3 | 80.8 | 37.8 | 13.0 | 12.6 | 1.03 | 8.4 | 2.4 | 19.8 | 0.445 | 0.783 | 0.329 | 
| 4 | 79.8 | 35.0 | 15.8 | 14.5 | 1.09 | 6.0 | 6.5 | 13.3 | 0.542 | 0.759 | 0.397 | 
#FIT MODEL
from sklearn.naive_bayes import GaussianNB as NB

# Fit a Gaussian naive Bayes classifier on the full NCAA feature matrix;
# fit() returns the fitted estimator, so construction and fitting are chained.
model = NB().fit(X, y)

# Predictions are made on the same rows used for fitting (in-sample).
ypred = model.predict(X)
#CONFUSION MATRIX
# Cross-tabulate true labels (rows) against predicted labels (columns).
cm = confusion_matrix(y, ypred)
# Display the matrix as the cell output.
cm
array([[26,  6],
       [ 8, 24]])
#ACCURACY
# Fraction of rows whose predicted label equals the true label
# (in-sample, since ypred was computed on the fitting data).
accuracy_score(y,ypred)
0.78125
#CLASSIFICATION REPORT
# Print per-class precision, recall, F1 and support, plus the averages.
print(classification_report(y, ypred))
              precision    recall  f1-score   support
         0.0       0.76      0.81      0.79        32
         1.0       0.80      0.75      0.77        32
   micro avg       0.78      0.78      0.78        64
   macro avg       0.78      0.78      0.78        64
weighted avg       0.78      0.78      0.78        64
#ROC, AUC
# Score each row with the predicted probability in column 1 of predict_proba,
# i.e. the probability of the second class (label 1 here).
y_score = model.predict_proba(X)[:,1]
# False/true positive rates over all score thresholds (thresholds unused).
fpr, tpr, _ = roc_curve(y, y_score)
title('ROC curve')
# The x-axis is the false positive rate (1 - specificity), not precision.
xlabel('FPR (1 - Specificity)')
ylabel('TPR (Recall)')
plot(fpr, tpr)
# Chance diagonal from (0, 0) to (1, 1), with both axes given explicitly.
plot((0, 1), (0, 1), ls='dashed', color='black')
plt.show()
print('Area under curve (AUC): ', auc(fpr,tpr))
Area under curve (AUC): 0.9140625
#LOAD IN CREDIT CARD DATA
import pickle

# NOTE: unpickling can execute arbitrary code; only load this file if it
# comes from a trusted source.
# The context manager closes the file handle once the data is loaded.
with open("DSTMAA_data/CCdata.p", "rb") as f:
    CCdata = pickle.load(f)

# The pickled object is indexed by these four keys: the train/test split
# of features and labels.
X_train = CCdata['X_train']
y_train = CCdata['y_train']
X_test = CCdata['X_test']
y_test = CCdata['y_test']
#FIT MODEL
# Same import as in the NCAA section; repeated so this cell runs on its own.
from sklearn.naive_bayes import GaussianNB as NB
# Rebinds `model`, replacing the NCAA classifier with a fresh one.
model = NB()
# Fit on the training split only; as the last expression of the cell, the
# fitted estimator's repr is what gets displayed.
model.fit(X_train,y_train)
GaussianNB(priors=None, var_smoothing=1e-09)
#CONFUSION MATRIX
# Predict on the held-out test split (not the data used for fitting).
ypred = model.predict(X_test)
# Cross-tabulate true labels (rows) against predicted labels (columns).
cm = confusion_matrix(y_test, ypred)
# Display the matrix as the cell output.
cm
array([[93094,   741],
       [   24,   128]])
#ACCURACY
# Fraction of test rows predicted correctly. Only 152 of the 93,987 test
# rows are class 1, so this figure is dominated by the majority class and
# says little about how well class 1 is detected.
accuracy_score(y_test,ypred)
0.9918605764627023
#CLASSIFICATION REPORT
# Print per-class precision, recall, F1 and support on the test split,
# plus the averages.
print(classification_report(y_test, ypred))
              precision    recall  f1-score   support
           0       1.00      0.99      1.00     93835
           1       0.15      0.84      0.25       152
   micro avg       0.99      0.99      0.99     93987
   macro avg       0.57      0.92      0.62     93987
weighted avg       1.00      0.99      0.99     93987
#ROC, AUC
# Score each test row with the predicted probability in column 1 of
# predict_proba, i.e. the probability of the second class (label 1 here).
y_score = model.predict_proba(X_test)[:,1]
# False/true positive rates over all score thresholds (thresholds unused).
fpr, tpr, _ = roc_curve(y_test, y_score)
title('ROC curve')
# The x-axis is the false positive rate (1 - specificity), not precision.
xlabel('FPR (1 - Specificity)')
ylabel('TPR (Recall)')
plot(fpr, tpr)
# Chance diagonal from (0, 0) to (1, 1), with both axes given explicitly.
plot((0, 1), (0, 1), ls='dashed', color='black')
plt.show()
print('Area under curve (AUC): ', auc(fpr,tpr))
Area under curve (AUC): 0.9719790547798067