Sanjiv R. Das
%pylab inline
import pandas as pd
from ipypublish import nb_setup
Populating the interactive namespace from numpy and matplotlib
Grouping individuals, firms, projects, etc.
Cluster analysis comprises a group of techniques that use distance metrics to group data into categories.
There are two broad approaches; a short code sketch contrasting them follows their descriptions.
Partitioning or Top-down: In this approach, the entire set of $n$ entities is divided into a prespecified number $k$ of clusters, and each entity is then assigned to one of the $k$ clusters. The $k$-means algorithm used below is the canonical example.
Agglomerative or Hierarchical or Bottom-up: Here every entity starts in its own cluster, so the analysis begins with $n$ clusters. The closest pair of clusters under a chosen distance metric is then merged, and the merging repeats. In this way a hierarchy of clusters is built up, and the researcher can choose which grouping is preferred.
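Here is that sketch: both approaches run on synthetic data via scikit-learn. The make_blobs dataset and every parameter choice are illustrative assumptions, not part of the SBA loan analysis below.
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans, AgglomerativeClustering
#TOY DATA: 300 points scattered around 3 centers (illustrative only)
pts, _ = make_blobs(n_samples=300, centers=3, random_state=0)
#PARTITIONING (TOP-DOWN): fix k up front, then assign each entity to a cluster
km_labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(pts)
#AGGLOMERATIVE (BOTTOM-UP): start from n singleton clusters, repeatedly merge the closest pair
ag_labels = AgglomerativeClustering(n_clusters=3).fit_predict(pts)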
import pickle
#LOAD SBA LOAN DATA
SBAdata = pickle.load(open("DSTMAA_data/SBAdata.p", "rb"))
X = SBAdata["sba"]
print(X.shape)
X.head()
(527700, 24)
 | GrossApproval | ApprovalFiscalYear | InitialInterestRate | TermInMonths | RevolverStatus | JobsSupported | Community Advantage Initiative | Community Express | Contract Guaranty | EXPORT IMPORT HARMONIZATION | ... | Lender Advantage Initiative | Patriot Express | Revolving Line of Credit Exports - Sec. 7(a) (14) | Rural Lender Advantage | Seasonal Line of Credit | Small Asset Based | Small General Contractors - Sec. 7(a) (9) | Standard Asset Based | INDIVIDUAL | PARTNERSHIP
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 50000 | 2006 | 11.25 | 84 | 1 | 4 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 35000 | 2006 | 12.00 | 84 | 0 | 3 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 15000 | 2006 | 12.00 | 84 | 0 | 4 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
3 | 16000 | 2006 | 11.50 | 84 | 0 | 1 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 16000 | 2006 | 11.50 | 84 | 0 | 1 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 24 columns
#FIT MODEL
#Normalize data, min-max scaling
Xn = (X-X.min())/(X.max()-X.min())
from sklearn.cluster import KMeans
model = KMeans(n_clusters=4).fit(Xn)
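Equivalently, the min-max scaling can be done with scikit-learn's MinMaxScaler; this is a sketch of an alternative, not the notebook's original code, and Xn_alt is a hypothetical name not used elsewhere.
from sklearn.preprocessing import MinMaxScaler
#EQUIVALENT MIN-MAX SCALING; fit_transform returns an array, so rewrap as a DataFrame
Xn_alt = pd.DataFrame(MinMaxScaler().fit_transform(X), columns=X.columns)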
#GET CLUSTERS
clusters = model.labels_
Xn["Cluster"] = clusters
Xn.groupby(["Cluster"]).mean()
Cluster | GrossApproval | ApprovalFiscalYear | InitialInterestRate | TermInMonths | RevolverStatus | JobsSupported | Community Advantage Initiative | Community Express | Contract Guaranty | EXPORT IMPORT HARMONIZATION | ... | Lender Advantage Initiative | Patriot Express | Revolving Line of Credit Exports - Sec. 7(a) (14) | Rural Lender Advantage | Seasonal Line of Credit | Small Asset Based | Small General Contractors - Sec. 7(a) (9) | Standard Asset Based | INDIVIDUAL | PARTNERSHIP
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0.015145 | 0.422470 | 0.139692 | 0.096157 | 1.000000 | 0.001701 | 0.000000 | 0.000000 | 0.001754 | 0.000062 | ... | 0.000000 | 0.012489 | 0.004524 | 0.000005 | 0.000605 | 0.000251 | 0.000385 | 0.010217 | 0.178651 | 0.013530 |
1 | 0.140723 | 0.536712 | 0.096212 | 0.238915 | 0.000290 | 0.003711 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.090780 | 0.029103 |
2 | 0.015780 | 0.381570 | 0.132424 | 0.105840 | 0.000000 | 0.001719 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.213993 | 0.015966 |
3 | 0.024071 | 0.502155 | 0.121932 | 0.139783 | 0.019229 | 0.001843 | 0.029091 | 0.389474 | 0.000558 | 0.000078 | ... | 0.351094 | 0.115357 | 0.003753 | 0.065284 | 0.000264 | 0.000202 | 0.000853 | 0.001396 | 0.207018 | 0.022066 |
4 rows × 24 columns
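The group means are easier to interpret alongside the cluster sizes; a quick check, added here and not part of the original notebook:
#NUMBER OF LOANS ASSIGNED TO EACH CLUSTER
Xn["Cluster"].value_counts().sort_index()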
#FOUR CLUSTERS
from sklearn.decomposition import PCA
reduced_data = PCA(n_components=2).fit_transform(Xn.drop(columns=["Cluster"]))  #drop the label column added above so it does not leak into the projection
kmeans = KMeans(init='k-means++', n_clusters=4, n_init=10)
kmeans.fit(reduced_data)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300, n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto', random_state=None, tol=0.0001, verbose=0)
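Before trusting a 2-D picture of 24-dimensional data, it is worth checking how much variance the two components retain; a short sketch, not in the original:
#VARIANCE EXPLAINED by the two principal components
pca = PCA(n_components=2).fit(Xn.drop(columns=["Cluster"]))
print(pca.explained_variance_ratio_, pca.explained_variance_ratio_.sum())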
#PLOT SET UP
# Step size of the mesh. Decrease to increase the quality of the VQ.
h = .02  # spacing between points in the mesh [x_min, x_max] x [y_min, y_max]
# Plot the decision boundary by assigning a color to each point in the mesh
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# Obtain labels for each point in mesh. Use last trained model.
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
figure(1)
imshow(Z, interpolation='nearest', extent=(xx.min(), xx.max(), yy.min(),
yy.max()), cmap=cm.Paired, aspect='auto', origin='lower')
plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
# Plot the centroids as a white X
centroids = kmeans.cluster_centers_
scatter(centroids[:, 0], centroids[:, 1], marker='x', s=169, linewidths=3,
color='w', zorder=10)
xlim(x_min, x_max); ylim(y_min, y_max)
xticks(()); yticks(())
show()
#ELEVEN CLUSTERS
kmeans = KMeans(init='k-means++', n_clusters=11, n_init=10)
kmeans.fit(reduced_data)
#PLOT SET UP
# Step size of the mesh. Decrease to increase the quality of the VQ.
h = .02  # spacing between points in the mesh [x_min, x_max] x [y_min, y_max]
# Plot the decision boundary by assigning a color to each point in the mesh
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# Obtain labels for each point in mesh. Use last trained model.
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
figure(1)
imshow(Z, interpolation='nearest', extent=(xx.min(), xx.max(), yy.min(),
yy.max()), cmap=cm.Paired, aspect='auto', origin='lower')
plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
# Plot the centroids as a white X
centroids = kmeans.cluster_centers_
scatter(centroids[:, 0], centroids[:, 1], marker='x', s=169, linewidths=3,
color='w', zorder=10)
xlim(x_min, x_max); ylim(y_min, y_max)
xticks(()); yticks(())
show()
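The jump from four to eleven clusters above is ad hoc. A common diagnostic for choosing $k$, sketched here as an illustration rather than as part of the original analysis, is to plot the within-cluster sum of squares (the inertia_ attribute in scikit-learn) against candidate values of $k$ and look for an elbow:
#ELBOW DIAGNOSTIC: within-cluster sum of squares for k = 1,...,12
ks = range(1, 13)
inertias = [KMeans(n_clusters=k, n_init=10).fit(reduced_data).inertia_ for k in ks]
figure()
plot(list(ks), inertias, 'o-')
xlabel('number of clusters k'); ylabel('within-cluster sum of squares')
show()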
ncaa = pd.read_csv("DSTMAA_data/ncaa.txt", sep="\t")
ncaa.head()
 | No NAME | GMS | PTS | REB | AST | TO | A/T | STL | BLK | PF | FG | FT | 3P
---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 1. NorthCarolina | 6 | 84.2 | 41.5 | 17.8 | 12.8 | 1.39 | 6.7 | 3.8 | 16.7 | 0.514 | 0.664 | 0.417 |
1 | 2. Illinois | 6 | 74.5 | 34.0 | 19.0 | 10.2 | 1.87 | 8.0 | 1.7 | 16.5 | 0.457 | 0.753 | 0.361 |
2 | 3. Louisville | 5 | 77.4 | 35.4 | 13.6 | 11.0 | 1.24 | 5.4 | 4.2 | 16.6 | 0.479 | 0.702 | 0.376 |
3 | 4. MichiganState | 5 | 80.8 | 37.8 | 13.0 | 12.6 | 1.03 | 8.4 | 2.4 | 19.8 | 0.445 | 0.783 | 0.329 |
4 | 5. Arizona | 4 | 79.8 | 35.0 | 15.8 | 14.5 | 1.09 | 6.0 | 6.5 | 13.3 | 0.542 | 0.759 | 0.397 |
#CREATE FEATURES
X = ncaa.iloc[:,2:13]
X.head()
 | PTS | REB | AST | TO | A/T | STL | BLK | PF | FG | FT | 3P
---|---|---|---|---|---|---|---|---|---|---|---
0 | 84.2 | 41.5 | 17.8 | 12.8 | 1.39 | 6.7 | 3.8 | 16.7 | 0.514 | 0.664 | 0.417 |
1 | 74.5 | 34.0 | 19.0 | 10.2 | 1.87 | 8.0 | 1.7 | 16.5 | 0.457 | 0.753 | 0.361 |
2 | 77.4 | 35.4 | 13.6 | 11.0 | 1.24 | 5.4 | 4.2 | 16.6 | 0.479 | 0.702 | 0.376 |
3 | 80.8 | 37.8 | 13.0 | 12.6 | 1.03 | 8.4 | 2.4 | 19.8 | 0.445 | 0.783 | 0.329 |
4 | 79.8 | 35.0 | 15.8 | 14.5 | 1.09 | 6.0 | 6.5 | 13.3 | 0.542 | 0.759 | 0.397 |
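These features sit on very different scales (points per game near 80, shooting percentages below 1), so Euclidean distances will be dominated by the large-scale columns. The analysis below clusters the raw features as in the original; a standardized variant, offered only as an assumption-flagged alternative, would be:
#OPTIONAL: z-score each column so no single statistic dominates the distance metric
Xs = (X - X.mean()) / X.std()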
#HIERARCHICAL CLUSTERING
from scipy.cluster.hierarchy import dendrogram, linkage
Z = linkage(X, 'ward')
#DENDROGRAM
figure(figsize=(20, 5))
title('Hierarchical Clustering Dendrogram')
xlabel('sample index')
ylabel('distance')
dendrogram(Z,
leaf_rotation=90., # rotates the x axis labels
leaf_font_size=8., # font size for the x axis labels
)
show()
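The linkage matrix Z can also be cut into a flat cluster assignment, analogous to R's cutree used below; a minimal sketch:
from scipy.cluster.hierarchy import fcluster
#CUT THE WARD TREE into k = 2 flat clusters (labels are 1 and 2)
groups = fcluster(Z, t=2, criterion='maxclust')
print(groups)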
%load_ext rpy2.ipython
%%R
#CLUSTER
ncaa = read.table("DSTMAA_data/ncaa.txt",header=TRUE)
d = dist(ncaa[,3:14], method="euclidean")
fit = hclust(d, method="ward.D")
plot(fit,main="NCAA Teams")
groups = cutree(fit, k=2)
rect.hclust(fit, k=2, border="blue")
%%R
#CLUSTER PLOT
library(cluster)
clusplot(ncaa[,3:14],groups,color=TRUE,shade=TRUE,labels=2,lines=0)