Sanjiv R. Das
These extensions allow the notebooks to be used in myriad ways; see: https://blog.jupyter.org/99-ways-to-extend-the-jupyter-ecosystem-11e5dab7c54
%pylab inline
import pandas as pd
import os
%load_ext rpy2.ipython
Populating the interactive namespace from numpy and matplotlib
%%capture
!pip install ipypublish
from ipypublish import nb_setup
# Basic lines of code needed to import a data file with permissions from Google Drive
from google.colab import drive
# drive.mount("/content/drive", force_remount=True)
drive.mount('/content/drive')
os.chdir("drive/My Drive/Teaching/OnlineMSFA_FNCE2431/FNCE2431_Machine Learning for Finance/3_Course Content/Notebooks/")
Mounted at /content/drive
nb_setup.images_hconcat(["DSTMAA_images/ML_AI.png"], width=700)
nb_setup.images_hconcat(["DSTMAA_images/ML_use_cases.jpg"], width=600)
nb_setup.images_hconcat(["DSTMAA_images/AI_solutions.jpg"], width=600)
https://igniteoutsourcing.com/fintech/machine-learning-in-finance/
For projects, start looking at Kaggle for finance datasets you may be able to use.
https://news.efinancialcareers.com/uk-en/285249/machine-learning-and-big-data-j-p-morgan
"You won't need to be a machine learning expert, you will need to be an excellent quant and an excellent programmer
J.P. Morgan says the skillset for the role of data scientists is virtually the same as for any other quantitative researchers. Existing buy side and sell side quants with backgrounds in computer science, statistics, maths, financial engineering, econometrics and natural sciences should therefore be able to reinvent themselves. Expertise in quantitative trading strategies will be the crucial skill. "It is much easier for a quant researcher to change the format/size of a dataset, and employ better statistical and Machine Learning tools, than for an IT expert, silicon valley entrepreneur, or academic to learn how to design a viable trading strategy," say Kolanovic and Krishnamacharc."
nb_setup.images_hconcat(["DSTMAA_images/JPMorgan-machine-learning-2.jpg"], width=600)
Credit scoring, sentiment analysis, document search: https://emerj.com/ai-sector-overviews/natural-language-processing-applications-in-finance-3-current-applications/
Gather real-time intelligence on specific stocks; Provide key hire alerts; Monitor company sentiment; Anticipate client concerns; Upgrade quality of analyst reporting; Understand and respond to news events; Detect insider trading: https://www.ibm.com/blogs/watson/2016/06/natural-language-processing-transforming-financial-industry-2/
Ravenpack: https://www.ravenpack.com/; https://www.ravenpack.com/research/browse/
#Import the SBA Loans dataset
sba = pd.read_csv("DSTMAA_data/SBA.csv")
print(sba.columns)
print(sba.shape)
sba.head()
Index(['LoanID', 'GrossApproval', 'SBAGuaranteedApproval', 'subpgmdesc',
       'ApprovalFiscalYear', 'InitialInterestRate', 'TermInMonths',
       'ProjectState', 'BusinessType', 'LoanStatus', 'RevolverStatus',
       'JobsSupported'],
      dtype='object')
(527700, 12)
| | LoanID | GrossApproval | SBAGuaranteedApproval | subpgmdesc | ApprovalFiscalYear | InitialInterestRate | TermInMonths | ProjectState | BusinessType | LoanStatus | RevolverStatus | JobsSupported |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 733784 | 50000 | 25000 | FA$TRK (Small Loan Express) | 2006 | 11.25 | 84 | IN | CORPORATION | CANCLD | 1 | 4 |
| 1 | 733785 | 35000 | 17500 | FA$TRK (Small Loan Express) | 2006 | 12.00 | 84 | IL | CORPORATION | CANCLD | 0 | 3 |
| 2 | 733786 | 15000 | 7500 | FA$TRK (Small Loan Express) | 2006 | 12.00 | 84 | WV | INDIVIDUAL | CANCLD | 0 | 4 |
| 3 | 733787 | 16000 | 13600 | Community Express | 2006 | 11.50 | 84 | MD | CORPORATION | PIF | 0 | 1 |
| 4 | 733788 | 16000 | 13600 | Community Express | 2006 | 11.50 | 84 | MD | CORPORATION | CANCLD | 0 | 1 |
#Feature engineering
#GuaranteePct = fraction of the loan guaranteed by the SBA
sba["GuaranteePct"] = sba.SBAGuaranteedApproval.astype("float")/sba.GrossApproval.astype("float")
X = sba[['ApprovalFiscalYear', 'InitialInterestRate', 'TermInMonths',
         'RevolverStatus', 'JobsSupported', 'GuaranteePct']]
#One-hot encode the categorical variables as dummy columns
x1 = pd.get_dummies(sba.subpgmdesc)
X = pd.concat([X, x1], axis=1)
x2 = pd.get_dummies(sba.BusinessType)
X = pd.concat([X, x2], axis=1)
X.head()
| | ApprovalFiscalYear | InitialInterestRate | TermInMonths | RevolverStatus | JobsSupported | GuaranteePct | 509 - DEALER FLOOR PLAN | Community Advantage Initiative | Community Express | Contract Guaranty | EXPORT IMPORT HARMONIZATION | FA$TRK (Small Loan Express) | Guaranty | Gulf Opportunity | International Trade - Sec, 7(a) (16) | Lender Advantage Initiative | Patriot Express | Revolving Line of Credit Exports - Sec. 7(a) (14) | Rural Lender Advantage | Seasonal Line of Credit | Small Asset Based | Small General Contractors - Sec. 7(a) (9) | Standard Asset Based | CORPORATION | INDIVIDUAL | PARTNERSHIP |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2006 | 11.25 | 84 | 1 | 4 | 0.50 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 1 | 2006 | 12.00 | 84 | 0 | 3 | 0.50 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 2 | 2006 | 12.00 | 84 | 0 | 4 | 0.50 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 3 | 2006 | 11.50 | 84 | 0 | 1 | 0.85 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 4 | 2006 | 11.50 | 84 | 0 | 1 | 0.85 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
The dependent variable may be discrete, either binomial or multinomial; that is, it is a limited dependent variable. In such cases, we need a different approach.
Discrete dependent variables are a special case of limited dependent variables. The Logit model we look at here is a discrete dependent variable model. Such models are also often called qualitative response (QR) models.
The Logit model posits that the probability of the outcome is a sigmoid transform of a linear function of the features:

$$ y = \frac{e^{f(x_1,x_2,...,x_n)}}{1+e^{f(x_1,x_2,...,x_n)}} \in (0,1) $$

where

$$ f(x_1,x_2,...,x_n) = a_0 + a_1 x_1 + ... + a_n x_n \in (-\infty,+\infty) $$

#Sigmoid Function
def logit(fx):
    return exp(fx)/(1+exp(fx))   # maps (-inf,+inf) into (0,1)
fx = linspace(-4,4,100)
y = logit(fx)
plot(fx,y)
xlabel('f(x)')
ylabel('Logit value')
grid()
#Dependent categorical variable
y = pd.get_dummies(sba.LoanStatus)
y.head()
| | CANCLD | CHGOFF | EXEMPT | PIF |
|---|---|---|---|---|
| 0 | 1 | 0 | 0 | 0 |
| 1 | 1 | 0 | 0 | 0 |
| 2 | 1 | 0 | 0 | 0 |
| 3 | 0 | 0 | 0 | 1 |
| 4 | 1 | 0 | 0 | 0 |
#Prepare the X and y variables for chargeoffs vs paid in full
idx1 = list(where(y.CHGOFF==1)[0])   # row indices of charged-off loans
idx2 = list(where(y.PIF==1)[0])      # row indices of loans paid in full
idx = append(idx1, idx2)
print(len(idx))
X = X.iloc[idx]
X["Intercept"] = 1.0   # explicit intercept column (sklearn's LogisticRegression also fits its own intercept by default)
y = y.CHGOFF.iloc[idx]
#Save for later
y_SBA = y
X_SBA = X
223647
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
# instantiate a logistic regression model, and fit with X and y
model = LogisticRegression(max_iter=10000) # higher number of iterations needed if the convergence rate is slow
model = model.fit(X, y)
# check the accuracy on the training set
model.score(X, y)
0.8278000599158495
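For context, this in-sample accuracy should be compared with the base rate from always predicting the majority class. A minimal check, using only the variables already defined (no numbers are asserted here; run the cell to see the base rate):

#Base-rate benchmark: accuracy from always predicting the majority class
print(max(y.mean(), 1 - y.mean()))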
#Show the coefficients
pd.DataFrame({'X':X.columns, 'Coeff':model.coef_[0]})
| | X | Coeff |
|---|---|---|
| 0 | ApprovalFiscalYear | -0.000790 |
| 1 | InitialInterestRate | 0.334618 |
| 2 | TermInMonths | -0.042856 |
| 3 | RevolverStatus | -0.348371 |
| 4 | JobsSupported | -0.000127 |
| 5 | GuaranteePct | -0.142879 |
| 6 | 509 - DEALER FLOOR PLAN | -0.052420 |
| 7 | Community Advantage Initiative | -0.010575 |
| 8 | Community Express | 1.525305 |
| 9 | Contract Guaranty | -0.294455 |
| 10 | EXPORT IMPORT HARMONIZATION | -0.025977 |
| 11 | FA$TRK (Small Loan Express) | -0.067749 |
| 12 | Guaranty | 1.595878 |
| 13 | Gulf Opportunity | -1.240564 |
| 14 | International Trade - Sec, 7(a) (16) | -0.009282 |
| 15 | Lender Advantage Initiative | -0.476432 |
| 16 | Patriot Express | 1.513717 |
| 17 | Revolving Line of Credit Exports - Sec. 7(a) (14) | -1.638254 |
| 18 | Rural Lender Advantage | -0.091847 |
| 19 | Seasonal Line of Credit | -0.046767 |
| 20 | Small Asset Based | -0.035860 |
| 21 | Small General Contractors - Sec. 7(a) (9) | -0.128380 |
| 22 | Standard Asset Based | -0.488682 |
| 23 | CORPORATION | 0.178202 |
| 24 | INDIVIDUAL | 0.193262 |
| 25 | PARTNERSHIP | -0.342430 |
| 26 | Intercept | 0.027656 |
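In a logistic regression, each coefficient is the change in the log-odds of chargeoff per unit change in that feature, so exponentiating a coefficient gives an odds ratio. A minimal sketch (the OddsRatio column name is introduced here for illustration):

#Odds ratios: exp(coef) = multiplicative change in the odds per unit increase in the feature
pd.DataFrame({'X': X.columns, 'OddsRatio': exp(model.coef_[0])})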
As we will see below, we split our data sample into training and testing subsets. Training data is used to fit (train) a model, and the same model is then applied to the test data (out-of-sample) to make sure it performs well on data it has not already seen. The model is also applied back to the data on which it was trained. We therefore get two accuracy scores, one for the training data set and another for the test data set. One hopes that a model performs as accurately on the test data as it does on the training data.
When accuracy is low on the training data, the model "underfits" the data. Conversely, a model may show a very high level of accuracy on the training data. This is a good thing, unless it achieves a very low accuracy level on the test data, in which case we say that the model is "overfitted" to the data. An overfitted model is so specifically attuned to the training data that it is useless for data outside the training data set. This occurs when the model has so many free parameters that it almost "memorizes" the training data, which explains why it performs poorly on data it has not seen before. An analogy is students who memorize math homework problems without understanding the underlying concepts: when faced with a slightly different problem on the exam, they fail miserably.
We often break a data sample down into 3 types of data: training, validation, and testing data. Say we keep 20% of our data aside for testing; this is also known as "holdout" data. Of the remaining 80% of the data, we may randomly sample 75% of it and train the model so that it performs well on the remaining 25%. Then we randomly sample a different 75% and train to fit the remaining 25%, starting either from the current model or afresh. This is also called "rotation sampling". If we repeat this $n$ times to get the best model, we are said to undertake "$n$-fold cross-validation", and the results are averaged to assess fit. Once a model has been trained through this cross-validated process, it is taken to the test data to assess how well it performs, and a determination is made as to the extent of overfitting, if any.
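Since cross_val_score was already imported above, here is a minimal sketch of 5-fold cross-validation on the chargeoff data (cv_model is a fresh estimator introduced here; the fold scores will depend on the run):

#5-fold cross-validation: fit on 4 folds, score on the held-out fold, repeat 5 times
cv_model = LogisticRegression(max_iter=10000)
scores = cross_val_score(cv_model, X, y, cv=5, scoring='accuracy')
print("Fold accuracies:", scores)
print("Mean CV accuracy:", scores.mean())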
The figure below provides a visual depiction of under- and over-fitting.
nb_setup.images_vconcat(["DSTMAA_images/overfitting.png"], width=600)
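To make overfitting concrete, here is a tiny simulated illustration (not part of the SBA analysis, and using only the %pylab namespace): a degree-9 polynomial fit to 10 noisy points from a straight line matches the training points almost exactly, but oscillates in between, so it would predict poorly on new points.

#Overfitting demo: as many parameters (10) as data points
x = linspace(0, 1, 10)
ynoisy = 2*x + 0.1*randn(10)
p = polyfit(x, ynoisy, 9)        # degree-9 polynomial interpolates the sample
xgrid = linspace(0, 1, 100)
plot(x, ynoisy, 'o')
plot(xgrid, polyval(p, xgrid))
grid()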
# Evaluate the model by splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
model2 = LogisticRegression(max_iter=10000)
model2.fit(X_train, y_train)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=10000, multi_class='auto', n_jobs=None, penalty='l2', random_state=None, solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)
# Predict class labels for the test set
predicted = model2.predict(X_test)
print(predicted)
[0 0 0 ... 1 1 1]
# Generate class probabilities
probs = model2.predict_proba(X_test)
print(probs)
[[0.93356543 0.06643457]
 [0.95553833 0.04446167]
 [0.83625796 0.16374204]
 ...
 [0.34094718 0.65905282]
 [0.47334767 0.52665233]
 [0.19629927 0.80370073]]
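In the binary case, predict is equivalent to applying a 0.5 cutoff to the class-1 probability. A quick sketch to confirm this (labels_05 is a name introduced here for illustration):

#predict() is equivalent to thresholding the class-1 probability at 0.5
labels_05 = (probs[:,1] > 0.5).astype(int)
print((labels_05 == predicted).mean())   # prints 1.0 if the two agree everywhere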
Accuracy: the fraction of class values predicted correctly.
TPR = sensitivity or recall = TP/(TP+FN)
FPR = (1 − specificity) = FP/(FP+TN)
# Confusion Matrix: sklearn's convention is confusion_matrix(y_true, y_pred),
# so rows are actual classes and columns are predicted classes
print(confusion_matrix(y_test, predicted))
[[43693  3495]
 [ 7996 11911]]
Precision = $\frac{TP}{TP+FP}$
Recall = $\frac{TP}{TP+FN}$
F1 score = $\frac{2}{\frac{1}{Precision} + \frac{1}{Recall}}$
(F1 is the harmonic mean of precision and recall.)
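As a sanity check, these metrics can be recomputed by hand; the TP/TN/FP/FN values below are read off the confusion matrix above (rows = actual, columns = predicted):

#Hand computation of the metrics from the confusion matrix
tn, fp = 43693, 3495
fn, tp = 7996, 11911
precision = tp/(tp+fp)
recall = tp/(tp+fn)              # also the TPR
fpr = fp/(fp+tn)                 # the FPR used in the ROC curve below
f1 = 2/(1/precision + 1/recall)
print(precision, recall, fpr, f1)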
print(classification_report(y_test, predicted))
              precision    recall  f1-score   support

           0       0.85      0.93      0.88     47188
           1       0.77      0.60      0.67     19907

    accuracy                           0.83     67095
   macro avg       0.81      0.76      0.78     67095
weighted avg       0.82      0.83      0.82     67095
https://en.wikipedia.org/wiki/Matthews_correlation_coefficient
This is a useful classification metric that is not as widely used as it deserves to be. See: https://towardsdatascience.com/the-best-classification-metric-youve-never-heard-of-the-matthews-correlation-coefficient-3bf50a2f3e9a
def MCC(tp,tn,fp,fn):
    return (tp*tn - fp*fn)/sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))
Let's take the confusion matrix from above and apply the numbers therein to compute MCC.
mcc = MCC(11911, 43693, 3495, 7996)   # tp, tn, fp, fn read off the matrix above
print("MCC =", mcc)
MCC = 0.5694125590112834
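sklearn also provides this metric directly via matthews_corrcoef, so we can cross-check the hand-rolled version:

from sklearn.metrics import matthews_corrcoef
print(matthews_corrcoef(y_test, predicted))   # should agree with the MCC computed above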
The Receiver-Operating Characteristic (ROC) curve is a plot of the True Positive Rate (TPR) against the False Positive Rate (FPR) for different levels of the cut-off posterior probability. This is an essential trade-off in all classification systems.
nb_setup.images_hconcat(["DSTMAA_images/roc_example.jpg"], width=600)
# generate evaluation metrics
print('Accuracy =', accuracy_score(y_test, predicted))
print('AUC =', roc_auc_score(y_test, probs[:, 1]))
#ROC, AUC
from sklearn.metrics import roc_curve, auc
y_score = model2.predict_proba(X_test)[:,1]   # use model2, which was fit on the training set only
fpr, tpr, _ = roc_curve(y_test, y_score)
title('ROC curve')
xlabel('FPR (1 - Specificity)')
ylabel('TPR (Recall)')
plot(fpr,tpr)
plot((0,1), ls='dashed',color='black')
plt.show()
print('Area under curve (AUC): ', auc(fpr,tpr))
Area under curve (AUC): 0.8634222585901785
A terrific article in Scientific American on ROC curves: Swets, Dawes, and Monahan (2000). See also Dawes (1979) on the use of "Improper Linear Models".
https://en.wikipedia.org/wiki/Receiver_operating_characteristic
nb_setup.images_hconcat(["DSTMAA_images/all_metrics.png"], width=600)