Sanjiv R. Das
%pylab inline
import pandas as pd
from ipypublish import nb_setup
Populating the interactive namespace from numpy and matplotlib
Grouping individuals, firms, projects, etc.
Cluster analysis comprises a group of techniques that use distance metrics to group data into categories.
There are two broad approaches; a short code sketch contrasting them follows their descriptions.
Partitioning or Top-down: In this approach, the entire set of $n$ entities is divided into a prespecified number $k$ of clusters, and each entity is then assigned to one of the $k$ clusters. The $k$-means algorithm used below is the canonical example.
Agglomerative or Hierarchical or Bottom-up: Here every entity starts in its own cluster, so the analysis begins with $n$ clusters. The closest pair of clusters under a chosen distance metric is then merged, and the merging repeats. In this way a hierarchy of clusters is built up, and the researcher can choose which grouping is preferred.
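Here is that sketch: both approaches run on synthetic data via scikit-learn. The make_blobs dataset and every parameter choice are illustrative assumptions, not part of the SBA loan analysis below.
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans, AgglomerativeClustering
#TOY DATA: 300 points scattered around 3 centers (illustrative only)
pts, _ = make_blobs(n_samples=300, centers=3, random_state=0)
#PARTITIONING (TOP-DOWN): fix k up front, then assign each entity to a cluster
km_labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(pts)
#AGGLOMERATIVE (BOTTOM-UP): start from n singleton clusters, repeatedly merge the closest pair
ag_labels = AgglomerativeClustering(n_clusters=3).fit_predict(pts)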
import pickle
#LOAD SBA LOAN DATA
SBAdata = pickle.load(open("DSTMAA_data/SBAdata.p", "rb"))
X = SBAdata["sba"]
print(X.shape)
X.head()
(527700, 24)
 | GrossApproval | ApprovalFiscalYear | InitialInterestRate | TermInMonths | RevolverStatus | JobsSupported | Community Advantage Initiative | Community Express | Contract Guaranty | EXPORT IMPORT HARMONIZATION | ... | Lender Advantage Initiative | Patriot Express | Revolving Line of Credit Exports - Sec. 7(a) (14) | Rural Lender Advantage | Seasonal Line of Credit | Small Asset Based | Small General Contractors - Sec. 7(a) (9) | Standard Asset Based | INDIVIDUAL | PARTNERSHIP
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 50000 | 2006 | 11.25 | 84 | 1 | 4 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 35000 | 2006 | 12.00 | 84 | 0 | 3 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 15000 | 2006 | 12.00 | 84 | 0 | 4 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
3 | 16000 | 2006 | 11.50 | 84 | 0 | 1 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 16000 | 2006 | 11.50 | 84 | 0 | 1 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 24 columns
#FIT MODEL
#Normalize data, min-max scaling
Xn = (X-X.min())/(X.max()-X.min())
from sklearn.cluster import KMeans
model = KMeans(n_clusters=4).fit(Xn)
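Equivalently, the min-max scaling can be done with scikit-learn's MinMaxScaler; this is a sketch of an alternative, not the notebook's original code, and Xn_alt is a hypothetical name not used elsewhere.
from sklearn.preprocessing import MinMaxScaler
#EQUIVALENT MIN-MAX SCALING; fit_transform returns an array, so rewrap as a DataFrame
Xn_alt = pd.DataFrame(MinMaxScaler().fit_transform(X), columns=X.columns)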
#GET CLUSTERS
clusters = model.labels_
Xn["Cluster"] = clusters
Xn.groupby(["Cluster"]).mean()
Cluster | GrossApproval | ApprovalFiscalYear | InitialInterestRate | TermInMonths | RevolverStatus | JobsSupported | Community Advantage Initiative | Community Express | Contract Guaranty | EXPORT IMPORT HARMONIZATION | ... | Lender Advantage Initiative | Patriot Express | Revolving Line of Credit Exports - Sec. 7(a) (14) | Rural Lender Advantage | Seasonal Line of Credit | Small Asset Based | Small General Contractors - Sec. 7(a) (9) | Standard Asset Based | INDIVIDUAL | PARTNERSHIP
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0.015145 | 0.422470 | 0.139692 | 0.096157 | 1.000000 | 0.001701 | 0.000000 | 0.000000 | 0.001754 | 0.000062 | ... | 0.000000 | 0.012489 | 0.004524 | 0.000005 | 0.000605 | 0.000251 | 0.000385 | 0.010217 | 0.178651 | 0.013530 |
1 | 0.140723 | 0.536712 | 0.096212 | 0.238915 | 0.000290 | 0.003711 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.090780 | 0.029103 |
2 | 0.015780 | 0.381570 | 0.132424 | 0.105840 | 0.000000 | 0.001719 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.213993 | 0.015966 |
3 | 0.024071 | 0.502155 | 0.121932 | 0.139783 | 0.019229 | 0.001843 | 0.029091 | 0.389474 | 0.000558 | 0.000078 | ... | 0.351094 | 0.115357 | 0.003753 | 0.065284 | 0.000264 | 0.000202 | 0.000853 | 0.001396 | 0.207018 | 0.022066 |
4 rows × 24 columns
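The group means are easier to interpret alongside the cluster sizes; a quick check, added here and not part of the original notebook:
#NUMBER OF LOANS ASSIGNED TO EACH CLUSTER
Xn["Cluster"].value_counts().sort_index()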
#FOUR CLUSTERS
from sklearn.decomposition import PCA
reduced_data = PCA(n_components=2).fit_transform(Xn.drop(columns=["Cluster"]))  #drop the label column added above so it does not leak into the projection
kmeans = KMeans(init='k-means++', n_clusters=4, n_init=10)
kmeans.fit(reduced_data)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300, n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto', random_state=None, tol=0.0001, verbose=0)
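Before trusting a 2-D picture of 24-dimensional data, it is worth checking how much variance the two components retain; a short sketch, not in the original:
#VARIANCE EXPLAINED by the two principal components
pca = PCA(n_components=2).fit(Xn.drop(columns=["Cluster"]))
print(pca.explained_variance_ratio_, pca.explained_variance_ratio_.sum())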
#PLOT SET UP
# Step size of the mesh. Decrease to increase the quality of the VQ.
h = .02  # spacing between points in the mesh [x_min, x_max] x [y_min, y_max]
# Plot the decision boundary by assigning a color to each point in the mesh
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# Obtain labels for each point in mesh. Use last trained model.
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
figure(1)
imshow(Z, interpolation='nearest', extent=(xx.min(), xx.max(), yy.min(),
yy.max()), cmap=cm.Paired, aspect='auto', origin='lower')
plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
# Plot the centroids as a white X
centroids = kmeans.cluster_centers_
scatter(centroids[:, 0], centroids[:, 1], marker='x', s=169, linewidths=3,
color='w', zorder=10)
xlim(x_min, x_max); ylim(y_min, y_max)
xticks(()); yticks(())
show()
#ELEVEN CLUSTERS
kmeans = KMeans(init='k-means++', n_clusters=11, n_init=10)
kmeans.fit(reduced_data)
#PLOT SET UP
# Step size of the mesh. Decrease to increase the quality of the VQ.
h = .02  # spacing between points in the mesh [x_min, x_max] x [y_min, y_max]
# Plot the decision boundary by assigning a color to each point in the mesh
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# Obtain labels for each point in mesh. Use last trained model.
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
figure(1)
imshow(Z, interpolation='nearest', extent=(xx.min(), xx.max(), yy.min(),
yy.max()), cmap=cm.Paired, aspect='auto', origin='lower')
plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
# Plot the centroids as a white X
centroids = kmeans.cluster_centers_
scatter(centroids[:, 0], centroids[:, 1], marker='x', s=169, linewidths=3,
color='w', zorder=10)
xlim(x_min, x_max); ylim(y_min, y_max)
xticks(()); yticks(())
show()
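The jump from four to eleven clusters above is ad hoc. A common diagnostic for choosing $k$, sketched here as an illustration rather than as part of the original analysis, is to plot the within-cluster sum of squares (the inertia_ attribute in scikit-learn) against candidate values of $k$ and look for an elbow:
#ELBOW DIAGNOSTIC: within-cluster sum of squares for k = 1,...,12
ks = range(1, 13)
inertias = [KMeans(n_clusters=k, n_init=10).fit(reduced_data).inertia_ for k in ks]
figure()
plot(list(ks), inertias, 'o-')
xlabel('number of clusters k'); ylabel('within-cluster sum of squares')
show()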
ncaa = pd.read_csv("DSTMAA_data/ncaa.txt", sep="\t")
ncaa.head()
 | No NAME | GMS | PTS | REB | AST | TO | A/T | STL | BLK | PF | FG | FT | 3P
---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 1. NorthCarolina | 6 | 84.2 | 41.5 | 17.8 | 12.8 | 1.39 | 6.7 | 3.8 | 16.7 | 0.514 | 0.664 | 0.417 |
1 | 2. Illinois | 6 | 74.5 | 34.0 | 19.0 | 10.2 | 1.87 | 8.0 | 1.7 | 16.5 | 0.457 | 0.753 | 0.361 |
2 | 3. Louisville | 5 | 77.4 | 35.4 | 13.6 | 11.0 | 1.24 | 5.4 | 4.2 | 16.6 | 0.479 | 0.702 | 0.376 |
3 | 4. MichiganState | 5 | 80.8 | 37.8 | 13.0 | 12.6 | 1.03 | 8.4 | 2.4 | 19.8 | 0.445 | 0.783 | 0.329 |
4 | 5. Arizona | 4 | 79.8 | 35.0 | 15.8 | 14.5 | 1.09 | 6.0 | 6.5 | 13.3 | 0.542 | 0.759 | 0.397 |
#CREATE FEATURES
X = ncaa.iloc[:,2:13]
X.head()
 | PTS | REB | AST | TO | A/T | STL | BLK | PF | FG | FT | 3P
---|---|---|---|---|---|---|---|---|---|---|---
0 | 84.2 | 41.5 | 17.8 | 12.8 | 1.39 | 6.7 | 3.8 | 16.7 | 0.514 | 0.664 | 0.417 |
1 | 74.5 | 34.0 | 19.0 | 10.2 | 1.87 | 8.0 | 1.7 | 16.5 | 0.457 | 0.753 | 0.361 |
2 | 77.4 | 35.4 | 13.6 | 11.0 | 1.24 | 5.4 | 4.2 | 16.6 | 0.479 | 0.702 | 0.376 |
3 | 80.8 | 37.8 | 13.0 | 12.6 | 1.03 | 8.4 | 2.4 | 19.8 | 0.445 | 0.783 | 0.329 |
4 | 79.8 | 35.0 | 15.8 | 14.5 | 1.09 | 6.0 | 6.5 | 13.3 | 0.542 | 0.759 | 0.397 |
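These features sit on very different scales (points per game near 80, shooting percentages below 1), so Euclidean distances will be dominated by the large-scale columns. The analysis below clusters the raw features as in the original; a standardized variant, offered only as an assumption-flagged alternative, would be:
#OPTIONAL: z-score each column so no single statistic dominates the distance metric
Xs = (X - X.mean()) / X.std()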
#HIERARCHICAL CLUSTERING
from scipy.cluster.hierarchy import dendrogram, linkage
Z = linkage(X, 'ward')
#DENDROGRAM
figure(figsize=(20, 5))
title('Hierarchical Clustering Dendrogram')
xlabel('sample index')
ylabel('distance')
dendrogram(Z,
leaf_rotation=90., # rotates the x axis labels
leaf_font_size=8., # font size for the x axis labels
)
show()
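The linkage matrix Z can also be cut into a flat cluster assignment, analogous to R's cutree used below; a minimal sketch:
from scipy.cluster.hierarchy import fcluster
#CUT THE WARD TREE into k = 2 flat clusters (labels are 1 and 2)
groups = fcluster(Z, t=2, criterion='maxclust')
print(groups)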
%load_ext rpy2.ipython
%%R
#CLUSTER
ncaa = read.table("DSTMAA_data/ncaa.txt",header=TRUE)
d = dist(ncaa[,3:14], method="euclidean")
fit = hclust(d, method="ward.D")
plot(fit,main="NCAA Teams")
groups = cutree(fit, k=2)
rect.hclust(fit, k=2, border="blue")
%%R
#CLUSTER PLOT
library(cluster)
clusplot(ncaa[,3:14],groups,color=TRUE,shade=TRUE,labels=2,lines=0)