%pylab inline
import pandas as pd
from ipypublish import nb_setup
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTEENN 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve,auc
from sklearn.metrics import confusion_matrix

Populating the interactive namespace from numpy and matplotlib


data = pd.read_csv('DSTMAA_data/creditcard.csv')
print(data.shape)
data.head()

(284807, 31)


data.describe()


data[["Class","V1"]].groupby(["Class"]).count()


data[["Class","Amount"]].groupby(["Class"]).mean()


X_train, X_test, y_train, y_test = train_test_split(data.drop('Class',axis=1), data['Class'], test_size=0.33)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(190820, 30)
(190820,)
(93987, 30)
(93987,)


nb_setup.images_hconcat(["DSTMAA_images/smote.png"], width=600)


## Keep original training data before SMOTE
X_train0 = X_train
y_train0 = y_train


sme = SMOTEENN()
X_train, y_train = sme.fit_sample(X_train, y_train)
print(X_train.shape)
print(y_train.shape)
unique(y_train, return_counts=True)

(357425, 30)
(357425,)

(array([0, 1], dtype=int64), array([174933, 182492], dtype=int64))


#SAVE TO PICKLE
import pickle
CCdata = {'X_train':X_train, 'X_test':X_test, 'y_train':y_train, 'y_test':y_test}
pickle.dump(CCdata, open( "DSTMAA_data/CCdata.p", "wb" ))


mean(y_train)  #Corresponds to counts from previous block

0.5105742463453872


a = X_train[:,29]   #Collect the Amount column
print(mean(a))
print(mean(a[y_train==0]))
print(mean(a[y_train==1]))

95.2820114172119
86.70202071650289
103.50661037632864


%%time
#DEFAULT: (~30 secs)
#class sklearn.ensemble.RandomForestClassifier(n_estimators=10, 
#criterion=’gini’, max_depth=None, min_samples_split=2, 
#min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
#max_features=’auto’, max_leaf_nodes=None, min_impurity_decrease=0.0, 
#min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=1, 
#random_state=None, verbose=0, warm_start=False, class_weight=None)[source]

clf = RandomForestClassifier(n_estimators=10)
clf = clf.fit(X_train,y_train)

y_test_hat = clf.predict(X_test)

Wall time: 29.7 s


#In sample
y_train_hat = clf.predict(X_train0)
accuracy_score(y_train0,y_train_hat)

0.9999109108059951


#Out of sample
accuracy_score(y_test,y_test_hat)

0.9994254524561907


# Classification Report
print(classification_report(y_test, y_test_hat))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     93825
           1       0.83      0.83      0.83       162

   micro avg       1.00      1.00      1.00     93987
   macro avg       0.92      0.92      0.92     93987
weighted avg       1.00      1.00      1.00     93987


y_score = clf.predict_proba(X_test)[:,1]
fpr, tpr, _ = roc_curve(y_test, y_score)

title('Random Forest ROC curve: CC Fraud')
xlabel('FPR (Precision)')
ylabel('TPR (Recall)')

plot(fpr,tpr)
plot((0,1), ls='dashed',color='black')
plt.show()
#print 'Area under curve (AUC): ', auc(fpr,tpr)
print('Area under curve (AUC): ', auc(fpr,tpr))

Area under curve (AUC):  0.955217291187626


def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap) 
    plt.title(title)
    class_labels = ['Valid','Fraud']
    plt.colorbar()
    
    tick_marks = np.arange(len(class_labels)) 
    plt.xticks(tick_marks, class_labels, rotation=90) 
    plt.yticks(tick_marks, class_labels) 
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


cm = confusion_matrix(y_test, y_test_hat)
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] 
plt.figure(figsize=(5,5))
plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix')


#Out of sample
print(cm)
print("False positive rate = ",cm[1][0]/sum(cm[1]))

[[93798    27]
 [   27   135]]
False positive rate =  0.16666666666666666


#print classification_report(y_test, y_test_hat)
print(classification_report(y_test, y_test_hat))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     93820
           1       0.84      0.85      0.85       167

   micro avg       1.00      1.00      1.00     93987
   macro avg       0.92      0.93      0.92     93987
weighted avg       1.00      1.00      1.00     93987


#Type I error = 1 - precision
type1err = 1 - cm[1][1]/(cm[1][1]+cm[0][1])
print("Type I error =", type1err)

Type I error = 0.1597633136094675


#Type II error = 1 - recall
type2err = 1 - cm[1][1]/(cm[1][1]+cm[1][0])
print("Type II error =", type2err)

Type II error = 0.14970059880239517


#F1 score = harmonic mean of precision and recall
precision = cm[1][1]/(cm[1][1]+cm[0][1])
recall = cm[1][1]/(cm[1][1]+cm[1][0])
f_1 = 2.0/(1/precision + 1/recall)
print("F1 = ",f_1)

F1 =  0.8452380952380952


#Sensitivity = Recall = True positive rate = probability of detection
sensitivity = recall
print("Sensitivity = ",sensitivity)

Sensitivity =  0.8502994011976048


#Specificity = true negative rate = TN/(TN+FP)
specificity = cm[0][0]/(cm[0][0]+cm[0][1])
print("Specificity = ",specificity)

Specificity =  0.9997122148795566


#Recheck in sample
cm2 = confusion_matrix(y_train0, y_train_hat)
print(cm2)
print("False positive rate = ",cm2[1][0]/sum(cm[1]))

[[190481     14]
 [     1    324]]
False positive rate =  0.005988023952095809


from sklearn.linear_model import LogisticRegression
logit = LogisticRegression()
logit.fit(X_train,y_train)

C:\Users\srdas\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)


y_score = logit.predict_proba(X_test)[:,1]
fpr, tpr, _ = roc_curve(y_test, y_score)

title('Logit ROC curve: CC Fraud')
xlabel('FPR (Precision)')
ylabel('TPR (Recall)')

plot(fpr,tpr)
plot((0,1), ls='dashed',color='black')
plt.show()
#print 'Area under curve (AUC): ', auc(fpr,tpr)
print('Area under curve (AUC): ', auc(fpr,tpr))

Area under curve (AUC):  0.9782273547128723


y_test_hat = logit.predict(X_test)

cm = confusion_matrix(y_test, y_test_hat)
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] 
plt.figure(figsize=(5,5))
plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix')


print(cm)
#print classification_report(y_test, y_test_hat)
print(classification_report(y_test, y_test_hat))

[[92136  1684]
 [   12   155]]
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     93820
           1       0.08      0.93      0.15       167

   micro avg       0.98      0.98      0.98     93987
   macro avg       0.54      0.96      0.57     93987
weighted avg       1.00      0.98      0.99     93987


#Chisq test for significance of confusion matrix
from scipy.stats import chi2

cmA = cm
cmRowsums = matrix(cmA.sum(axis=1))
cmColsums = matrix(cmA.sum(axis=0))
cmE = cmRowsums.T.dot(cmColsums)/sum(cmA)
cmA = matrix(cm)
print("cmA = ",cmA)
print("cmE = ",cmE)

chisq_stat = sum((cmA-cmE)**2/cmE)
print("Chisq statistic = ",chisq_stat)
print("P-value = ",chi2.sf(chisq_stat,1))

cmA =  [[92136  1684]
 [   12   155]]
cmE =  [[9.19842676e+04 1.83573239e+03]
 [1.63732389e+02 3.26761148e+00]]
Chisq statistic =  13785.659783915933
P-value =  0.0

	Time	V1	V2	V3	V4	V5	V6	V7	V8	V9	...	V21	V22	V23	V24	V25	V26	V27	V28	Amount
0	0.0	-1.359807	-0.072781	2.536347	1.378155	-0.338321	0.462388	0.239599	0.098698	0.363787	...	-0.018307	0.277838	-0.110474	0.066928	0.128539	-0.189115	0.133558	-0.021053	149.62
1	0.0	1.191857	0.266151	0.166480	0.448154	0.060018	-0.082361	-0.078803	0.085102	-0.255425	...	-0.225775	-0.638672	0.101288	-0.339846	0.167170	0.125895	-0.008983	0.014724	2.69
2	1.0	-1.358354	-1.340163	1.773209	0.379780	-0.503198	1.800499	0.791461	0.247676	-1.514654	...	0.247998	0.771679	0.909412	-0.689281	-0.327642	-0.139097	-0.055353	-0.059752	378.66
3	1.0	-0.966272	-0.185226	1.792993	-0.863291	-0.010309	1.247203	0.237609	0.377436	-1.387024	...	-0.108300	0.005274	-0.190321	-1.175575	0.647376	-0.221929	0.062723	0.061458	123.50
4	2.0	-1.158233	0.877737	1.548718	0.403034	-0.407193	0.095921	0.592941	-0.270533	0.817739	...	-0.009431	0.798278	-0.137458	0.141267	-0.206010	0.502292	0.219422	0.215153	69.99

	Time	V1	V2	V3	V4	V5	V6	V7	V8	V9	...	V21	V22	V23	V24	V25	V26	V27	V28	Amount	Class
count	284807.000000	2.848070e+05	2.848070e+05	2.848070e+05	2.848070e+05	2.848070e+05	2.848070e+05	2.848070e+05	2.848070e+05	2.848070e+05	...	2.848070e+05	2.848070e+05	2.848070e+05	2.848070e+05	2.848070e+05	2.848070e+05	2.848070e+05	2.848070e+05	284807.000000	284807.000000
mean	94813.859575	3.919560e-15	5.688174e-16	-8.769071e-15	2.782312e-15	-1.552563e-15	2.010663e-15	-1.694249e-15	-1.927028e-16	-3.137024e-15	...	1.537294e-16	7.959909e-16	5.367590e-16	4.458112e-15	1.453003e-15	1.699104e-15	-3.660161e-16	-1.206049e-16	88.349619	0.001727
std	47488.145955	1.958696e+00	1.651309e+00	1.516255e+00	1.415869e+00	1.380247e+00	1.332271e+00	1.237094e+00	1.194353e+00	1.098632e+00	...	7.345240e-01	7.257016e-01	6.244603e-01	6.056471e-01	5.212781e-01	4.822270e-01	4.036325e-01	3.300833e-01	250.120109	0.041527
min	0.000000	-5.640751e+01	-7.271573e+01	-4.832559e+01	-5.683171e+00	-1.137433e+02	-2.616051e+01	-4.355724e+01	-7.321672e+01	-1.343407e+01	...	-3.483038e+01	-1.093314e+01	-4.480774e+01	-2.836627e+00	-1.029540e+01	-2.604551e+00	-2.256568e+01	-1.543008e+01	0.000000	0.000000
25%	54201.500000	-9.203734e-01	-5.985499e-01	-8.903648e-01	-8.486401e-01	-6.915971e-01	-7.682956e-01	-5.540759e-01	-2.086297e-01	-6.430976e-01	...	-2.283949e-01	-5.423504e-01	-1.618463e-01	-3.545861e-01	-3.171451e-01	-3.269839e-01	-7.083953e-02	-5.295979e-02	5.600000	0.000000
50%	84692.000000	1.810880e-02	6.548556e-02	1.798463e-01	-1.984653e-02	-5.433583e-02	-2.741871e-01	4.010308e-02	2.235804e-02	-5.142873e-02	...	-2.945017e-02	6.781943e-03	-1.119293e-02	4.097606e-02	1.659350e-02	-5.213911e-02	1.342146e-03	1.124383e-02	22.000000	0.000000
75%	139320.500000	1.315642e+00	8.037239e-01	1.027196e+00	7.433413e-01	6.119264e-01	3.985649e-01	5.704361e-01	3.273459e-01	5.971390e-01	...	1.863772e-01	5.285536e-01	1.476421e-01	4.395266e-01	3.507156e-01	2.409522e-01	9.104512e-02	7.827995e-02	77.165000	0.000000
max	172792.000000	2.454930e+00	2.205773e+01	9.382558e+00	1.687534e+01	3.480167e+01	7.330163e+01	1.205895e+02	2.000721e+01	1.559499e+01	...	2.720284e+01	1.050309e+01	2.252841e+01	4.584549e+00	7.519589e+00	3.517346e+00	3.161220e+01	3.384781e+01	25691.160000	1.000000

Random Forest Classifier¶

Kaggle's Credit Card Fraud Dataset - RF¶

Quick Class counts¶

Mean Amount in Each class¶

Under/over-sample with SMOTE ENN to overcome class imbalance¶

Different types of Re-sampling methods¶

How does SMOTE work?¶

Train & Predict¶

Evaluate predictions¶

Accuracy¶

SciKitLearn's classification report gives us a more complete picture.¶

ROC Curve & AUC¶

Confusion Matrix¶

Logistic Regression Reprise (after oversampling)¶

	V1
Class
0	284315
1	492

	Amount
Class
0	88.291022
1	122.211321