In the Same Boat: Cluster Analysis and Prediction Trees {#ClusterAnalysis}

Overview

There are many aspects of data analysis that call for grouping individuals, firms, projects, etc. These tasks fall under the rubric of classification analysis. Cluster analysis comprises a group of techniques that use distance metrics to bunch data into categories.

There are two broad approaches to cluster analysis:

  1. Agglomerative or Hierarchical or Bottom-up: In this case we begin by giving each entity in the analysis its own cluster, so that we start with $n$ clusters. Then, entities are merged into clusters based on a given distance metric between each pair of entities. In this way a hierarchy of clusters is built up, and the researcher can choose which grouping is preferred.

  2. Partitioning or Top-down: In this approach, the entire set of $n$ entities is assumed to be a cluster. Then it is progressively partitioned into smaller and smaller clusters.

We will employ both clustering approaches and examine their properties with various data sets as examples.

k-means

This approach is top-down. If we have a sample of $n$ observations to be allocated to $k$ clusters, we can initialize the clusters in many ways. One approach is to treat each of $k$ chosen observations as a cluster unto itself. We then take each of the remaining observations and allocate it to the nearest cluster using a distance metric; at the outset, this simply means allocating an observation to its nearest neighbor. The steps are as follows (a from-scratch sketch appears after the list):

  1. Suppose you have $n$ observations for clustering.
  2. k-means first identifies $k$ reasonable starting nodes as the sole members of $k$ initial clusters.
  3. Then it takes the remaining $(n-k)$ observations one by one and allocates each to the nearest cluster's centroid, after which all $n$ observations belong to one of the $k$ clusters.
  4. Then we examine each of the $n$ observations to see if it is closer to the centroid of another cluster and, if so, reassign it.
  5. Repeat step 4 as many times as desired.
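Below is a minimal from-scratch sketch of these steps in R (essentially Lloyd's algorithm). It is illustrative only; the function name my_kmeans and its arguments are ours, empty clusters are not handled, and in practice one would use the built-in kmeans function discussed later.

```r
# Minimal sketch of the k-means steps above (illustrative only)
my_kmeans <- function(x, k, iters = 10) {
  x <- as.matrix(x)
  centroids <- x[sample(nrow(x), k), , drop = FALSE]   # step 2: k starting nodes
  cluster <- rep(1, nrow(x))
  for (it in 1:iters) {                                # steps 3-5: allocate, update, repeat
    # distances from every observation to every current centroid
    d <- as.matrix(dist(rbind(centroids, x)))[-(1:k), 1:k, drop = FALSE]
    cluster <- apply(d, 1, which.min)                  # assign to the nearest centroid
    for (j in 1:k)                                     # recompute each centroid
      centroids[j, ] <- colMeans(x[cluster == j, , drop = FALSE])
  }
  list(cluster = cluster, centers = centroids)
}
```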

Nearness / distance metrics

How is nearness measured? We need a distance metric, and one common choice is Euclidean distance. Suppose we have two observations $x_i$ and $x_j$, each represented by a vector of attributes. Suppose our observations are people and the attributes are {height, weight, IQ}, so that $x_i = \{h_i, w_i, I_i\}$ for the $i$-th individual. Then the Euclidean distance between two individuals $i$ and $j$ is

$$ d_{ij} = \sqrt{(h_i-h_j)^2 + (w_i-w_j)^2 + (I_i - I_j)^2} $$

It is usually computed using normalized variables, so that no single variable with a large scale dominates the distance calculation. (Normalization here means subtracting the variable's mean from each observation and then dividing by its standard deviation.)

In contrast, the "Manhattan" distance is given by (when is this more appropriate?)

$$ d_{ij} = |h_i-h_j| + |w_i-w_j| + |I_i - I_j| $$
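As a small illustration, both metrics can be computed in R with the dist function; the numbers below are made up, and the attributes are normalized first with scale so that no variable dominates.

```r
# hypothetical {height, weight, IQ} values for three individuals
x <- matrix(c(170, 70, 110,
              180, 85, 100,
              165, 60, 125), ncol = 3, byrow = TRUE)
colnames(x) <- c("height", "weight", "IQ")
xs <- scale(x)                    # normalize: subtract mean, divide by standard deviation
dist(xs, method = "euclidean")    # Euclidean distance matrix
dist(xs, method = "manhattan")    # Manhattan distance matrix
```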

We may use other metrics, such as the cosine distance or the Mahalanobis distance. The $n \times n$ matrix of all $d_{ij}$ values is called the distance matrix. Using this distance metric we assign nodes to clusters or attach them to nearest neighbors. After a few iterations clusters are no longer made up of singleton observations, the number of clusters reaches $k$, the preset number required, and all nodes are assigned to one of these $k$ clusters. As we examine each observation we then assign it (or re-assign it) to the nearest cluster, where the distance is measured from the observation to some representative node of the cluster.

Some common choices of the representative node in a cluster are:

  1. Centroid of the cluster. This is the mean of the observations in the cluster for each attribute. The centroid of the two observations above is the average vector $\{(h_i+h_j)/2, (w_i+w_j)/2, (I_i + I_j)/2\}$. This is often called the center of the cluster. If there are more nodes, the centroid is the coordinate-wise average over all nodes in the cluster.
  2. Closest member of the cluster.
  3. Furthest member of the cluster.

The algorithm converges when no re-assignments of observations to clusters occur. Note that $k$-means is a randomized algorithm and may not return the same clusters every time it is run. Also, one needs to specify the number of clusters at the outset, and there may be no a priori way to ascertain the correct number; hence, trial and error and examination of the results are called for. Finally, the algorithm tends to produce clusters of similar size, which may not always be appropriate.

In R, we may construct the distance matrix using the dist function. Using the NCAA data we are already familiar with, we have:
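(The sketch below assumes the NCAA data have been read into a data frame named ncaa; the file name ncaa.txt and the use of columns 4 through 14 as the numeric performance attributes are illustrative assumptions, to be matched to the actual data file.)

```r
ncaa <- read.table("ncaa.txt", header = TRUE)   # hypothetical file of 64 teams
x <- scale(ncaa[, 4:14])                        # normalize the numeric attributes (assumed columns)
d <- dist(x)                                    # pairwise Euclidean distances
length(d)                                       # should equal n(n-1)/2
```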

Examining this object shows that it contains $n(n-1)/2$ elements, i.e., one for each pair of nodes. When printed, only the lower triangular part of the distance matrix is populated.

Clustering takes many observations with their characteristics and then allocates them into buckets or clusters based on their similarity. In finance, we may use cluster analysis to determine groups of similar firms.

Unlike regression analysis, cluster analysis uses only the right-hand side variables, and there is no dependent variable required. We group observations purely on their overall similarity across characteristics. Hence, it is closely linked to the notion of communities that we studied in network analysis, though that concept lives primarily in the domain of networks.

Example: Randomly generated data in k-means

Here we use the example from the kmeans function to see how the clusters appear. This function is standard issue, i.e., it comes with the stats package, which is included in the base R distribution and does not need to be separately installed. The data are randomly generated but contain two bunches of items with different means, so we should easily be able to see two separate clusters. You will need the graphics package, which is also part of the base installation.
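A lightly adapted version of that example is sketched below.

```r
require(graphics)
# two bunches of two-dimensional points with different means
x <- rbind(matrix(rnorm(100, sd = 0.3), ncol = 2),
           matrix(rnorm(100, mean = 1, sd = 0.3), ncol = 2))
colnames(x) <- c("x", "y")
cl <- kmeans(x, centers = 2)                      # ask for two clusters
plot(x, col = cl$cluster)                         # points colored by assigned cluster
points(cl$centers, col = 1:2, pch = 8, cex = 2)   # mark the two centroids
```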

Example: NCAA teams

We revisit our NCAA data set, and form clusters there.
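A hedged sketch follows, continuing with the ncaa data frame and the normalized attribute matrix x assumed earlier; the choice of four clusters is illustrative.

```r
set.seed(1)
fit <- kmeans(x, centers = 4)   # four clusters, chosen here by trial and error
fit$size                        # number of teams in each cluster
fit$centers                     # centroids of the normalized attributes
```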

Hierarchical Clustering

Hierarchical clustering comes in both flavors: a top-down (divisive) approach and a bottom-up (agglomerative) approach. At the top level there is just one cluster. A level below, this may be broken into a few clusters, which are then further broken into sub-clusters a level below, and so on. This clustering approach is computationally expensive, and the divisive approach in particular is exponentially expensive in $n$, the number of entities being clustered. In fact, the algorithm is ${\cal O}(2^n)$.

The function for clustering is hclust and is included in the stats package in the base R distribution.

We begin by computing the distance matrix. Then we call the hclust function, and applying the plot function to the fitted object gives what is known as a dendrogram, a plot of the cluster hierarchy. We may pick clusters at any level. In this case, we choose a cut level that yields four clusters, and the rect.hclust function superimposes boxes on the clusters so we can see the grouping more clearly. The result is plotted in the Figure below.
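A sketch of these steps, again using the normalized NCAA attribute matrix x assumed earlier:

```r
d <- dist(x)                              # distance matrix
fit <- hclust(d, method = "ward.D")       # agglomerative (bottom-up) clustering
plot(fit)                                 # dendrogram of the cluster hierarchy
groups <- cutree(fit, k = 4)              # cut the tree into four clusters
rect.hclust(fit, k = 4, border = "red")   # superimpose boxes on the four clusters
```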

We can also visualize the clusters loaded on to the top two principal components as follows, using the clusplot function that resides in package cluster. The result is plotted in the Figure below.
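A sketch of the call, reusing the cluster assignments in groups from above:

```r
library(cluster)
# plot the clusters on the first two principal components
clusplot(x, groups, color = TRUE, shade = TRUE, labels = 2, lines = 0)
```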

k Nearest Neighbors

This is one of the simplest algorithms for classification and grouping. Simply define a distance metric over a set of observations, each with $M$ characteristics, i.e., $x_1, x_2,..., x_M$. Compute the pairwise distance between each pair of observations, using any of the metrics above.

Next, fix $k$, the number of nearest neighbors to be considered. Finally, assign to the case we are trying to classify the category that holds the majority among its $k$ nearest neighbors.

We see an example in R using the iris data set that we examined before.
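A sketch using the knn function from the class package, with a random split of the iris data into training and test sets:

```r
library(class)
set.seed(1)
idx <- sample(1:nrow(iris), 100)           # indices of the training observations
train_x <- iris[idx, 1:4]                  # the four numeric characteristics
test_x  <- iris[-idx, 1:4]
pred <- knn(train_x, test_x, cl = iris$Species[idx], k = 3)
table(Predicted = pred, Actual = iris$Species[-idx])   # confusion matrix
```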

Prediction Trees

Prediction trees are a natural outcome of recursive partitioning of the data. Hence, they are a particular form of clustering at different levels. Usual cluster analysis results in a flat partition, but prediction trees develop a multi-level hierarchy of clusters. The term used here is CART, which stands for classification and regression trees. But prediction trees differ from vanilla clustering in an important way -- there is a dependent variable, i.e., a category or a range of values (e.g., a score) that one is attempting to predict.

Prediction trees are of two types: (a) classification trees, where the leaves of the tree are different categories of discrete outcomes; and (b) regression trees, where the leaves are continuous outcomes. We may think of the former as a generalized form of limited dependent variable models, and the latter as a generalized form of regression analysis.

To fix ideas, suppose we want to predict the credit score of an individual using age, income, and education as explanatory variables. Assume that income is the best explanatory variable of the three. Then, at the top of the tree, income will be the branching variable, i.e., if income is less than some threshold, we go down the left branch of the tree, else we go down the right. At the next level, it may be that we use education to make the next bifurcation, and at the third level we use age. A variable may even be used repeatedly at more than one level. This leads to several leaves at the bottom of the tree, each containing the average credit score of the observations that reach it. For example, an individual of young age, low income, and no education will very likely follow a path down the tree that leads to a low average credit score. Instead of credit scores (an example of a regression tree), consider credit ratings of companies (an example of a classification tree). These ideas will become clearer once we present some examples.

Fitting the tree

Recursive partitioning is the main algorithmic construct behind prediction trees. We take the data and, using a single explanatory variable, try to bifurcate it into two categories such that the split yields better information than we had before it. For example, suppose we are trying to predict who will make donations and who will not using a single variable -- income. If we have a sample of people and have not yet analyzed their incomes, we only have the raw frequency $p$ of how many people made donations, i.e., a number between 0 and 1. The information in the predicted likelihood $p$ is inversely related to the sum of squared errors (SSE) between this value $p$ and the 0 and 1 values of the observations.

$$ SSE_1 = \sum_{i=1}^n (x_i - p)^2 $$

where $x_i = \{0,1\}$, depending on whether person $i$ made a donation or not.

Now, suppose we bifurcate the sample based on income: to the left we have people with income less than $K$, and to the right, people with incomes greater than or equal to $K$. If we find that the proportion of people making donations on the left is $p_L < p$ and on the right is $p_R > p$, then the new sum of squared errors is:

$$ SSE_2 = \sum_{i, Income < K} (x_i - p_L)^2 + \sum_{i, Income \geq K} (x_i - p_R)^2 $$

By choosing $K$ correctly, our recursive partitioning algorithm will maximize the gain, i.e., $\delta = (SSE_1 - SSE_2)$. We stop branching further when at a given tree level $\delta$ is less than a pre-specified threshold.
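A toy illustration of this criterion in R; the data are simulated and the variable names are ours.

```r
set.seed(1)
income <- runif(500, 10, 100)                   # simulated incomes
donate <- rbinom(500, 1, prob = income / 100)   # richer people donate more often
sse1 <- sum((donate - mean(donate))^2)          # SSE before any split
K_grid <- seq(20, 90, by = 5)                   # candidate thresholds K
gain <- sapply(K_grid, function(K) {
  L <- donate[income < K]
  R <- donate[income >= K]
  sse2 <- sum((L - mean(L))^2) + sum((R - mean(R))^2)
  sse1 - sse2                                   # delta = SSE1 - SSE2
})
K_grid[which.max(gain)]                         # threshold with the largest gain
```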

We note that as $n$ gets large, the computation of binary splits on any variable is expensive, i.e., of order ${\cal O}(2^n)$. But as we go down the tree, and use smaller subsamples, the algorithm becomes faster and faster. In general, this is quite an efficient algorithm to implement.

The motivation for prediction trees is to emulate a human decision tree. They also help make sense of complicated regression settings with many interacting variables, where it becomes difficult to interpret the meaning and importance of individual explanatory variables in a prediction exercise. By proceeding hierarchically down a tree, the decision analysis becomes transparent, and it can also be used in practical settings to make decisions.

Classification Trees

To demonstrate this, let's use a data set that is already in R. We use the kyphosis data set, which contains data on children who have had spinal surgery. We wish to fit a model that predicts whether a child has a post-operative deformity (variable: Kyphosis = {absent, present}). The explanatory variables are age in months (Age), the number of vertebrae operated on (Number), and the first vertebra in the range operated on (Start). The package used is called rpart, which stands for recursive partitioning.
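A minimal sketch of the fit:

```r
library(rpart)
fit <- rpart(Kyphosis ~ Age + Number + Start, data = kyphosis)
print(fit)      # the fitted tree in text form
summary(fit)    # splits, node details, and variable importance
```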

We can also plot the tree using the plot command. The dendrogram-like tree shows the allocation of the $n=81$ cases to the various branches of the tree.
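For example:

```r
plot(fit, uniform = TRUE, margin = 0.1)   # draw the tree
text(fit, use.n = TRUE)                   # label the nodes; use.n adds case counts
```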

C4.5 Classifier

This classifier also uses recursive partitioning as in the previous case, but instead of minimizing the sum of squared errors between the sample data $x$ and the fitted value $p$ at each level, the goal here is to minimize entropy, which improves the information gain. The (natural-log) entropy $H$ of the data $x$ is defined as

$$ H = -\sum_x f(x) \cdot \ln f(x) $$

where $f(x)$ is the probability density of $x$. This is intuitive because, after the optimal split in recursing down the tree, the distribution of $x$ becomes narrower, lowering entropy. When $x$ is continuous and the sum is replaced by an integral, this measure is known as ``differential entropy.''

To see this, let's do a quick example. We compute the entropy for two normal distributions of varying spread (standard deviation).
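A minimal sketch, numerically approximating $-\int f(x) \ln f(x)\, dx$ for a normal density; the function entropy_normal is ours.

```r
entropy_normal <- function(sigma, dx = 0.001) {
  x <- seq(-10 * sigma, 10 * sigma, by = dx)
  f <- dnorm(x, mean = 0, sd = sigma)
  -sum(f * log(f)) * dx            # Riemann-sum approximation of differential entropy
}
entropy_normal(1)   # close to the analytical value 0.5 * log(2 * pi * exp(1)) = 1.4189
entropy_normal(3)   # wider distribution, higher entropy
```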

Therefore, we see that entropy increases as the normal distribution becomes wider.

Regression Trees

We move from classification trees (discrete outcomes) to regression trees (scored or continuous outcomes). Again, we use an example that already exists in R, i.e., the car data in the cu.summary data frame that comes with the rpart package. Let's load it up.

We will try to predict Mileage using the other variables. (Note: if we tried to predict Reliability, we would be back in the realm of classification trees; here we are looking at regression trees.)
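A sketch of the fit with rpart:

```r
library(rpart)
data(cu.summary)
fit <- rpart(Mileage ~ Price + Country + Reliability + Type,
             data = cu.summary, method = "anova")   # "anova" gives a regression tree
printcp(fit)                                        # complexity parameter table
plot(fit, uniform = TRUE, margin = 0.1)
text(fit, use.n = TRUE)
```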

Example: California Home Data

This example is taken from a data set posted by Cosma Shalizi at CMU. We use a different package here, called tree, though most of its functionality has since been subsumed by the rpart package used earlier. The analysis is as follows:
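A hedged sketch follows; the file name and column names below are assumptions based on the California housing data and should be matched to the actual file used here.

```r
library(tree)
calif <- read.csv("california_housing.csv")   # hypothetical file name
# hypothetical column names: MedianHouseValue, Longitude, Latitude
fit <- tree(log(MedianHouseValue) ~ Longitude + Latitude, data = calif)
summary(fit)
plot(fit)
text(fit, cex = 0.7)
```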

Random Forests

A random forest model is an extension of the CART class of models. In CART, at each decision node all variables in the feature set are considered, and the best one is chosen for the bifurcation rule at that node. This approach tends to overfit the model to the training data.

To ameliorate overfitting, Breiman (2001) suggested generating classification and regression trees using a random subset of the feature set at each node. Random trees are grown one at a time in this fashion. By building a large set of random trees (the default number in R is 500), we get a "random forest" of decision trees, and when the algorithm is run, each tree in the forest classifies the input. The output classification is determined by taking the modal classification across all trees.

The number of variables sampled at each node from a feature set of $p$ variables defaults to $p/3$, rounded down, for a regression tree, and $\sqrt{p}$, rounded down, for a classification tree.

Reference: @Breiman:2001:RF:570181.570182

For the NCAA data, take the top 32 teams and make their dependent variable 1, and that of the bottom 32 teams zero.
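A hedged sketch with the randomForest package; it assumes the ncaa data frame holds 64 teams ordered from best to worst, with numeric performance attributes in columns 4 through 14 (illustrative).

```r
library(randomForest)
y <- as.factor(c(rep(1, 32), rep(0, 32)))   # top 32 teams = 1, bottom 32 = 0
x <- ncaa[, 4:14]                           # assumed numeric attribute columns
set.seed(1)
fit <- randomForest(x = x, y = y, ntree = 500)
print(fit)          # out-of-bag error and confusion matrix
importance(fit)     # variable importance measures
```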

Top Ten Algorithms in Data Science

The top voted algorithms in machine learning are: C4.5, k-means, SVM, Apriori, EM, PageRank, AdaBoost, kNN, Naive Bayes, CART. (This is just from one source, and differences of opinion will remain.)

Boosting

Boosting is an immensely popular machine learning technique. It is an iterative approach that takes weak learning algorithms and "boosts" them into strong learners. The method is intuitive. Start out with a classification algorithm such as logit for binary classification and run one pass to fit the model. Check which cases are correctly predicted in-sample, and which are incorrect. In the next classification pass (also known as a round), reweight the misclassified observations versus the correctly classified ones, by overweighting the former, and underweighting the latter. This forces the learner to "focus" more on the tougher cases, and adjust so that it gets these classified more accurately. Through multiple rounds, the results are boosted to higher levels of accuracy. Because there are many different weighting schemes, data scientists have evolved many different boosting algorithms. AdaBoost is one such popular algorithm, developed by @Schapire99improvedboosting.

In recent times, these boosting algorithms have improved in their computer implementation, mostly through parallelization, which speeds them up on huge data sets. Such versions are known as "extreme gradient" boosting algorithms. In R, the package xgboost contains an easy-to-use implementation. We illustrate with an example.

We use the sample data that comes with the xgboost package. Read in the data for the model.
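For example:

```r
library(xgboost)
data(agaricus.train, package = "xgboost")   # sample (mushroom) training data
data(agaricus.test, package = "xgboost")    # sample test data
train <- agaricus.train   # a list with a sparse feature matrix $data and a 0/1 $label
test  <- agaricus.test
```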

Fit the model. All that is needed is a single-line call to the xgboost function.
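The parameter values below follow the package's introductory example and are not tuned.

```r
bst <- xgboost(data = train$data, label = train$label,
               max_depth = 2, eta = 1, nrounds = 2,
               objective = "binary:logistic")
```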

Undertake prediction using the predict function and then examine the confusion matrix for performance.
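A sketch, classifying at the 0.5 probability threshold:

```r
pred <- predict(bst, test$data)                        # predicted probabilities
pred_label <- as.numeric(pred > 0.5)                   # classify at the 0.5 threshold
table(Predicted = pred_label, Actual = test$label)     # confusion matrix
mean(pred_label == test$label)                         # overall accuracy
```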

There are many types of algorithms and objectives that may be used with boosting; see the documentation of the xgboost function in R. Here are some of the options.

Let's repeat the exercise using the NCAA data.
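A hedged sketch, reusing the ncaa data frame, the assumed attribute columns, and the 0/1 labels from the random forest example above; here the fit is evaluated in-sample only.

```r
x <- as.matrix(ncaa[, 4:14])                     # assumed numeric attribute columns
y <- c(rep(1, 32), rep(0, 32))                   # top 32 teams = 1, bottom 32 = 0
bst_ncaa <- xgboost(data = x, label = y, max_depth = 2, eta = 1,
                    nrounds = 10, objective = "binary:logistic", verbose = 0)
pred <- as.numeric(predict(bst_ncaa, x) > 0.5)   # in-sample predictions
table(Predicted = pred, Actual = y)              # confusion matrix
```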