Sanjiv R. Das
%pylab inline
import pandas as pd
from ipypublish import nb_setup
Populating the interactive namespace from numpy and matplotlib
Second bifurcation: $$ SSE_2 = \sum_{i, Income < K} (x_i - p_L)^2 + \sum_{i, Income \geq K} (x_i - p_R)^2 $$
By choosing $K$ correctly, our recursive partitioning algorithm maximizes the gain, i.e., $\delta = (SSE_1 - SSE_2)$. We stop branching further when, at a given tree level, $\delta$ falls below a pre-specified threshold.
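To make the split search concrete, here is a minimal sketch (not from the original notebook) that scans candidate thresholds $K$ on a single feature and keeps the one minimizing $SSE_2$; the variable names (income, x) and the toy data are illustrative only.
#SKETCH: BEST SSE SPLIT ON ONE FEATURE (illustrative, not part of the original code)
import numpy as np

def best_split_sse(income, x):
    #scan candidate thresholds K and return the one minimizing SSE_2
    best_K, best_sse = None, np.inf
    for K in np.unique(income)[1:]:
        left, right = x[income < K], x[income >= K]
        sse = ((left - left.mean())**2).sum() + ((right - right.mean())**2).sum()
        if sse < best_sse:
            best_K, best_sse = K, sse
    return best_K, best_sse

income = np.array([20, 30, 40, 80, 90, 100.0])   #toy data
x = np.array([1.0, 2.0, 1.0, 8.0, 9.0, 10.0])
sse1 = ((x - x.mean())**2).sum()
K, sse2 = best_split_sse(income, x)
print(K, sse1 - sse2)   #the gain, delta = SSE_1 - SSE_2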
Recursive partitioning proceeds as in the previous case, but instead of minimizing the sum of squared errors between the sample data $x$ and the fitted value $p$ at each level, the goal here is to minimize entropy, which maximizes the information gain. The natural entropy ($H$) of the data $x$ is defined as
$$ H = -\sum_x f(x) \cdot \ln f(x) $$ where $f(x)$ is the probability density of $x$. This is intuitive because after the optimal split, as we recurse down the tree, the distribution of $x$ becomes narrower, lowering entropy. This measure is also often known as "differential entropy."
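As a small illustration (assumption: entropy is computed over discrete class proportions, as when growing a classification tree, rather than a continuous density), the sketch below computes the entropy of a node and the information gain from a candidate binary split; the toy labels are made up.
#SKETCH: ENTROPY AND INFORMATION GAIN FOR A CANDIDATE SPLIT (illustrative)
import numpy as np

def entropy(labels):
    #H = -sum_c p_c * ln(p_c), over the class proportions in a node
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return -(p * np.log(p)).sum()

parent = np.array([0, 0, 0, 0, 1, 1, 1, 1])   #toy labels
left, right = parent[:3], parent[3:]          #a candidate split
w_left, w_right = len(left)/len(parent), len(right)/len(parent)
gain = entropy(parent) - w_left*entropy(left) - w_right*entropy(right)
print(gain)   #positive gain means the split lowers the weighted entropy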
#READ IN THE NCAA DATA AND SET UP PREDICTION METRICS
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve,auc
from sklearn.metrics import confusion_matrix
ncaa = pd.read_csv("DSTMAA_data/ncaa.txt", sep="\t")
yy = append(list(ones(32)), list(zeros(32)))
ncaa["y"] = yy
ncaa.head()
| | No NAME | GMS | PTS | REB | AST | TO | A/T | STL | BLK | PF | FG | FT | 3P | y |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1. NorthCarolina | 6 | 84.2 | 41.5 | 17.8 | 12.8 | 1.39 | 6.7 | 3.8 | 16.7 | 0.514 | 0.664 | 0.417 | 1.0 |
| 1 | 2. Illinois | 6 | 74.5 | 34.0 | 19.0 | 10.2 | 1.87 | 8.0 | 1.7 | 16.5 | 0.457 | 0.753 | 0.361 | 1.0 |
| 2 | 3. Louisville | 5 | 77.4 | 35.4 | 13.6 | 11.0 | 1.24 | 5.4 | 4.2 | 16.6 | 0.479 | 0.702 | 0.376 | 1.0 |
| 3 | 4. MichiganState | 5 | 80.8 | 37.8 | 13.0 | 12.6 | 1.03 | 8.4 | 2.4 | 19.8 | 0.445 | 0.783 | 0.329 | 1.0 |
| 4 | 5. Arizona | 4 | 79.8 | 35.0 | 15.8 | 14.5 | 1.09 | 6.0 | 6.5 | 13.3 | 0.542 | 0.759 | 0.397 | 1.0 |
#CREATE FEATURES
y = ncaa['y']
X = ncaa.iloc[:,2:13]
X.head()
| | PTS | REB | AST | TO | A/T | STL | BLK | PF | FG | FT | 3P |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 84.2 | 41.5 | 17.8 | 12.8 | 1.39 | 6.7 | 3.8 | 16.7 | 0.514 | 0.664 | 0.417 |
| 1 | 74.5 | 34.0 | 19.0 | 10.2 | 1.87 | 8.0 | 1.7 | 16.5 | 0.457 | 0.753 | 0.361 |
| 2 | 77.4 | 35.4 | 13.6 | 11.0 | 1.24 | 5.4 | 4.2 | 16.6 | 0.479 | 0.702 | 0.376 |
| 3 | 80.8 | 37.8 | 13.0 | 12.6 | 1.03 | 8.4 | 2.4 | 19.8 | 0.445 | 0.783 | 0.329 |
| 4 | 79.8 | 35.0 | 15.8 | 14.5 | 1.09 | 6.0 | 6.5 | 13.3 | 0.542 | 0.759 | 0.397 |
#FIT MODEL
from sklearn.tree import DecisionTreeClassifier as CART
model = CART()
model.fit(X,y)
ypred = model.predict(X)
#CONFUSION MATRIX
cm = confusion_matrix(y, ypred)
cm
array([[32, 0], [ 0, 32]])
#ACCURACY
accuracy_score(y,ypred)
1.0
#CLASSIFICATION REPORT
print(classification_report(y, ypred))
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        32
         1.0       1.00      1.00      1.00        32

    accuracy                           1.00        64
   macro avg       1.00      1.00      1.00        64
weighted avg       1.00      1.00      1.00        64
#ROC, AUC
y_score = model.predict_proba(X)[:,1]
fpr, tpr, _ = roc_curve(y, y_score)
title('ROC curve')
xlabel('False Positive Rate (FPR)')
ylabel('TPR (Recall)')
plot(fpr,tpr)
plot((0,1), ls='dashed',color='black')
plt.show()
print('Area under curve (AUC): ', auc(fpr,tpr))
Area under curve (AUC): 1.0
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus
dot_data = StringIO()
export_graphviz(model, out_file=dot_data,
filled=True, rounded=True,
special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
#May need: sudo aptitude install graphviz
Image(graph.create_png())
The Gini index measures the quality of the split. It is defined as
$$ Gini = 1 - \sum_{c=1}^C P_c^2 $$ where $c$ indexes the $C$ categories and $P_c$ is the proportion of the node's observations in category $c$, i.e., the probability of category $c$ at that node.
Here we have binary classification, so we need just two probabilities, one for each class. For example, look at the third row of the tree, second box from the left (shaded blue):
$$ Gini = 1 - (2/21)^2 - (19/21)^2 = 0.172 $$ The smaller the Gini, the purer the node and the better the split. Notice that at the top (root) node the Gini is 0.5.
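A quick check of these numbers (illustrative; the 2/21 and 19/21 class proportions are read off the node in the tree plot above, and the root node holds the 32/32 split of winners and losers):
#CHECK THE GINI VALUES QUOTED ABOVE
gini_node = 1 - (2/21)**2 - (19/21)**2   #third row, second box from the left
gini_root = 1 - (32/64)**2 - (32/64)**2  #root node, 32 of each class
print(round(gini_node, 3), gini_root)    #0.172 0.5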
#LOAD IN CREDIT CARD DATA
import pickle
CCdata = pickle.load(open("DSTMAA_data/CCdata.p", "rb"))
X_train = CCdata['X_train']
y_train = CCdata['y_train']
X_test = CCdata['X_test']
y_test = CCdata['y_test']
#FIT MODEL
from sklearn.tree import DecisionTreeClassifier as CART
model = CART()
model.fit(X_train,y_train)
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')
#CONFUSION MATRIX
ypred = model.predict(X_test)
cm = confusion_matrix(y_test, ypred)
cm
array([[93609, 227], [ 22, 129]])
#ACCURACY
accuracy_score(y_test,ypred)
0.9973506974368795
#CLASSIFICATION REPORT
print(classification_report(y_test, ypred))
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     93836
           1       0.36      0.85      0.51       151

    accuracy                           1.00     93987
   macro avg       0.68      0.93      0.75     93987
weighted avg       1.00      1.00      1.00     93987
#ROC, AUC
y_score = model.predict_proba(X_test)[:,1]
fpr, tpr, _ = roc_curve(y_test, y_score)
title('ROC curve')
xlabel('False Positive Rate (FPR)')
ylabel('TPR (Recall)')
plot(fpr,tpr)
plot((0,1), ls='dashed',color='black')
plt.show()
print('Area under curve (AUC): ', auc(fpr,tpr))
Area under curve (AUC): 0.9259427607811742
dot_data = StringIO()
export_graphviz(model, out_file=dot_data,
filled=True, rounded=True,
special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
#graph.write('images/tree.dot')
File "<ipython-input-19-d0c01cde5bd5>", line 5 graph = pydot plus.graph_from_dot_data(dot_data.getvalue()) ^ SyntaxError: invalid syntax
Image(graph.create_png())
Recent attempts to make decision models more explainable are reducing the "black-box" criticism of machine learning. See, for example, the LIME framework: Local Interpretable Model-Agnostic Explanations, Ribeiro, Singh, and Guestrin (2016).
This is related to the sensitivity issue, i.e., how sensitive the lending decision is to any one variable. For example, if we flipped a single bit in the feature set, e.g., changed employed to unemployed, would the decision made by the classifier change? An interesting proof of a conjecture on this is discussed here (pdf).
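As a minimal sketch of such a sensitivity check (not the cited proof), the code below flips one binary feature in a single record and asks whether the fitted tree's decision changes; the column index and the "employed" interpretation are hypothetical, since the credit card features here are not named.
#SKETCH: FLIP ONE BINARY FEATURE AND SEE IF THE DECISION CHANGES (illustrative)
import numpy as np

def flips_decision(model, row, col):
    #returns True if flipping a single 0/1 feature changes the prediction
    base = model.predict(row.reshape(1, -1))[0]
    flipped = row.copy()
    flipped[col] = 1 - flipped[col]   #e.g., employed -> unemployed (hypothetical)
    return model.predict(flipped.reshape(1, -1))[0] != base

#example usage (col=3 is an arbitrary choice):
#print(flips_decision(model, np.asarray(X_test)[0], col=3))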