%pylab inline
import pandas as pd
import os
from ipypublish import nb_setup
%load_ext rpy2.ipython
#%load_ext RWinOut

Populating the interactive namespace from numpy and matplotlib


%%R
#LOG-LIKELIHOOD FUNCTION
LL = function(params,x) {
    mu = params[1]; sigsq = params[2]
    f = (1/sqrt(2*pi*sigsq))*exp(-0.5*(x-mu)^2/sigsq)
    LL = -sum(log(f))
}


%%R
#GENERATE DATA FROM A NORMAL DISTRIBUTION
x = rnorm(10000, mean=5, sd=3)

#MAXIMIZE LOG-LIKELIHOOD
params = c(4,2)  #Create starting guess for parameters
res = nlm(LL,params,x)
print(res)

$minimum
[1] 25177.91

$estimate
[1] 5.040839 9.004325

$gradient
[1]  5.051907e-06 -4.444283e-06

$code
[1] 1

$iterations
[1] 11


%%R
logit = function(fx) { res = exp(fx)/(1+exp(fx)) }
fx = seq(-4,4,0.01)
y = logit(fx)
plot(fx,y,type="l",xlab="x",ylab="f(x)",col="blue",lwd=3)


%%R
ncaa = read.table("DSTMAA_data/ncaa.txt",header=TRUE)

y1 = 1:32
y1 = y1*0+1
y2 = y1*0
y = c(y1,y2)
print(y)
x = as.matrix(ncaa[4:14])

h = glm(y~x, family=binomial(link="logit"))
names(h)

 [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0
[39] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [1] "coefficients"      "residuals"         "fitted.values"    
 [4] "effects"           "R"                 "rank"             
 [7] "qr"                "family"            "linear.predictors"
[10] "deviance"          "aic"               "null.deviance"    
[13] "iter"              "weights"           "prior.weights"    
[16] "df.residual"       "df.null"           "y"                
[19] "converged"         "boundary"          "model"            
[22] "call"              "formula"           "terms"            
[25] "data"              "offset"            "control"          
[28] "method"            "contrasts"         "xlevels"


%%R
print(logLik(h))
summary(h)

'log Lik.' -21.44779 (df=12)

Call:
glm(formula = y ~ x, family = binomial(link = "logit"))

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.80174  -0.40502  -0.00238   0.37584   2.31767  

Coefficients:
             Estimate Std. Error z value Pr(>|z|)   
(Intercept) -45.83315   14.97564  -3.061  0.00221 **
xPTS         -0.06127    0.09549  -0.642  0.52108   
xREB          0.49037    0.18089   2.711  0.00671 **
xAST          0.16422    0.26804   0.613  0.54010   
xTO          -0.38405    0.23434  -1.639  0.10124   
xA.T          1.56351    3.17091   0.493  0.62196   
xSTL          0.78360    0.32605   2.403  0.01625 * 
xBLK          0.07867    0.23482   0.335  0.73761   
xPF           0.02602    0.13644   0.191  0.84874   
xFG          46.21374   17.33685   2.666  0.00768 **
xFT          10.72992    4.47729   2.397  0.01655 * 
xX3P          5.41985    5.77966   0.938  0.34838   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 88.723  on 63  degrees of freedom
Residual deviance: 42.896  on 52  degrees of freedom
AIC: 66.896

Number of Fisher Scoring iterations: 6


%%R
h$fitted.values

           1            2            3            4            5            6 
0.9998267965 0.9983229192 0.9686755530 0.9909359265 0.9977011039 0.9639506326 
           7            8            9           10           11           12 
0.5381841865 0.9505255187 0.4329829232 0.7413280575 0.9793554057 0.7273235463 
          13           14           15           16           17           18 
0.2309261473 0.9905414749 0.7344407215 0.9936312074 0.2269619354 0.8779507370 
          19           20           21           22           23           24 
0.2572796426 0.9335376447 0.9765843274 0.7836742557 0.9967552281 0.9966486903 
          25           26           27           28           29           30 
0.9715110760 0.0681674628 0.4984153630 0.9607522159 0.8624544140 0.6988578200 
          31           32           33           34           35           36 
0.9265057217 0.7472357037 0.5589318497 0.2552381741 0.0051790298 0.4394307950 
          37           38           39           40           41           42 
0.0205919396 0.0545333361 0.0100662111 0.0995262051 0.1219394290 0.0025416737 
          43           44           45           46           47           48 
0.3191888357 0.0149772804 0.0685930622 0.3457439539 0.0034943441 0.5767386617 
          49           50           51           52           53           54 
0.5489544863 0.4637012227 0.2354894587 0.0487342700 0.6359622098 0.8027221707 
          55           56           57           58           59           60 
0.0003240393 0.0479116454 0.3422867567 0.4649889328 0.0547385409 0.0722894447 
          61           62           63           64 
0.0228629774 0.0002730981 0.0570387301 0.2830628760


%%R
h = glm(y~x, family=binomial(link="probit"))
print(logLik(h))
summary(h)

'log Lik.' -21.27924 (df=12)

Call:
glm(formula = y ~ x, family = binomial(link = "probit"))

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.76353  -0.41212  -0.00031   0.34996   2.24568  

Coefficients:
             Estimate Std. Error z value Pr(>|z|)   
(Intercept) -26.28219    8.09608  -3.246  0.00117 **
xPTS         -0.03463    0.05385  -0.643  0.52020   
xREB          0.28493    0.09939   2.867  0.00415 **
xAST          0.10894    0.15735   0.692  0.48874   
xTO          -0.23742    0.13642  -1.740  0.08180 . 
xA.T          0.71485    1.86701   0.383  0.70181   
xSTL          0.45963    0.18414   2.496  0.01256 * 
xBLK          0.03029    0.13631   0.222  0.82415   
xPF           0.01041    0.07907   0.132  0.89529   
xFG          26.58461    9.38711   2.832  0.00463 **
xFT           6.28278    2.51452   2.499  0.01247 * 
xX3P          3.15824    3.37841   0.935  0.34988   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 88.723  on 63  degrees of freedom
Residual deviance: 42.558  on 52  degrees of freedom
AIC: 66.558

Number of Fisher Scoring iterations: 8


%%R
h$fitted.values

           1            2            3            4            5            6 
9.999998e-01 9.999048e-01 9.769711e-01 9.972812e-01 9.997756e-01 9.721166e-01 
           7            8            9           10           11           12 
5.590209e-01 9.584564e-01 4.367808e-01 7.362946e-01 9.898112e-01 7.262200e-01 
          13           14           15           16           17           18 
2.444006e-01 9.968605e-01 7.292286e-01 9.985910e-01 2.528807e-01 8.751178e-01 
          19           20           21           22           23           24 
2.544738e-01 9.435318e-01 9.850437e-01 7.841357e-01 9.995601e-01 9.996077e-01 
          25           26           27           28           29           30 
9.825306e-01 8.033540e-02 5.101626e-01 9.666841e-01 8.564489e-01 6.657773e-01 
          31           32           33           34           35           36 
9.314164e-01 7.481401e-01 5.810465e-01 2.488875e-01 1.279599e-03 4.391782e-01 
          37           38           39           40           41           42 
1.020269e-02 5.461190e-02 4.267754e-03 1.067584e-01 1.234915e-01 2.665101e-04 
          43           44           45           46           47           48 
3.212605e-01 6.434112e-03 7.362892e-02 3.673105e-01 4.875193e-04 6.020993e-01 
          49           50           51           52           53           54 
5.605770e-01 4.786576e-01 2.731573e-01 4.485079e-02 6.194202e-01 7.888145e-01 
          55           56           57           58           59           60 
1.630556e-06 4.325189e-02 3.899566e-01 4.809365e-01 5.043005e-02 7.330590e-02 
          61           62           63           64 
1.498018e-02 8.425836e-07 5.515960e-02 3.218696e-01


%%R
p = 0.3
OR = p/(1-p); print(OR)
beta = 2
OR_new = OR * exp(beta); print(OR_new)
p_new = OR_new/(1+OR_new); print(p_new)

[1] 0.4285714
[1] 3.166738
[1] 0.7600041


%%R
h = glm(y~x, family=binomial(link="logit"))
b = h$coefficients

#Odds ratio is the exponentiated coefficients
print(exp(b))
x1 = c(1,as.numeric(x[18,]))  #Take row 18 and create the RHS variables array
p1 = 1/(1+exp(-sum(b*x1)))
print(p1)
OR1 = p1/(1-p1)
cat("Odds Ratio = "); print(OR1)

 (Intercept)         xPTS         xREB         xAST          xTO         xA.T 
1.244270e-20 9.405653e-01 1.632927e+00 1.178470e+00 6.810995e-01 4.775577e+00 
        xSTL         xBLK          xPF          xFG          xFT         xX3P 
2.189332e+00 1.081849e+00 1.026364e+00 1.175903e+20 4.570325e+04 2.258450e+02 
[1] 0.8779507
Odds Ratio = [1] 7.193413


%%R
x2 = x1
x2[3] = x2[3] + 1
p2 = 1/(1+exp(-sum(b*x2)))
print(p2)

[1] 0.921546


%%R
OR2 = OR1 * exp(b[3])
print(OR2/(1+OR2))

    xREB 
0.921546


%%R
h = glm(y~x, family=binomial(link="logit"))
beta = h$coefficients
print(beta)
print(dim(x))
beta = as.matrix(beta)
print(dim(beta))

wuns = matrix(1,64,1)
x = cbind(wuns,x)
xbar = as.matrix(colMeans(x))
xbar

 (Intercept)         xPTS         xREB         xAST          xTO         xA.T 
-45.83315262  -0.06127422   0.49037435   0.16421685  -0.38404689   1.56351478 
        xSTL         xBLK          xPF          xFG          xFT         xX3P 
  0.78359670   0.07867125   0.02602243  46.21373793  10.72992472   5.41984900 
[1] 64 11
[1] 12  1
          [,1]
     1.0000000
PTS 67.1015625
REB 34.4671875
AST 12.7484375
TO  13.9578125
A.T  0.9778125
STL  6.8234375
BLK  2.7500000
PF  18.6562500
FG   0.4232969
FT   0.6914687
X3P  0.3333750


%%R
logitfunction = exp(t(beta) %*% xbar)/(1+exp(t(beta) %*% xbar))
print(logitfunction)
slopes = beta * logitfunction[1] * (1-logitfunction[1])
slopes

          [,1]
[1,] 0.5139925
                     [,1]
(Intercept) -11.449314459
xPTS         -0.015306558
xREB          0.122497576
xAST          0.041022062
xTO          -0.095936529
xA.T          0.390572574
xSTL          0.195745753
xBLK          0.019652410
xPF           0.006500512
xFG          11.544386272
xFT           2.680380362
xX3P          1.353901094


%%R
ncaa = read.table("DSTMAA_data/ncaa.txt",header=TRUE)

y1 = 1:32
y1 = y1*0+1
y2 = y1*0
y = c(y1,y2)
print(y)
x = as.matrix(ncaa[4:14])

 [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0
[39] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


%%R
h = glm(y~x, family=binomial(link="probit"))
beta = h$coefficients
print(beta)
print(dim(x))
beta = as.matrix(beta)
print(dim(beta))

 (Intercept)         xPTS         xREB         xAST          xTO         xA.T 
-26.28219202  -0.03462510   0.28493498   0.10893727  -0.23742076   0.71484863 
        xSTL         xBLK          xPF          xFG          xFT         xX3P 
  0.45963279   0.03029006   0.01040612  26.58460638   6.28277680   3.15823537 
[1] 64 11
[1] 12  1


%%R
wuns = matrix(1,64,1)
x = cbind(wuns,x)
xbar = as.matrix(colMeans(x))
print(xbar)

probitfunction = t(beta) %*% xbar
slopes = probitfunction[1] * beta
slopes

          [,1]
     1.0000000
PTS 67.1015625
REB 34.4671875
AST 12.7484375
TO  13.9578125
A.T  0.9778125
STL  6.8234375
BLK  2.7500000
PF  18.6562500
FG   0.4232969
FT   0.6914687
X3P  0.3333750
                    [,1]
(Intercept) -1.401478911
xPTS        -0.001846358
xREB         0.015193952
xAST         0.005809001
xTO         -0.012660291
xA.T         0.038118787
xSTL         0.024509587
xBLK         0.001615196
xPF          0.000554899
xFG          1.417604938
xFT          0.335024536
xX3P         0.168410621


%%R
ncaa = read.table("DSTMAA_data/ncaa.txt",header=TRUE)
x = as.matrix(ncaa[4:14])
w1 = (1:16)*0 + 1
w0 = (1:16)*0
y1 = c(w1,w0,w0,w0)
y2 = c(w0,w1,w0,w0)
y3 = c(w0,w0,w1,w0)
y4 = c(w0,w0,w0,w1)
y = cbind(y1,y2,y3,y4)

library(nnet)
res  = multinom(y~x)
res

# weights:  52 (36 variable)
initial  value 88.722839 
iter  10 value 71.177975
iter  20 value 60.076921
iter  30 value 51.167439
iter  40 value 47.005269
iter  50 value 45.196280
iter  60 value 44.305029
iter  70 value 43.341689
iter  80 value 43.260097
iter  90 value 43.247324
iter 100 value 43.141297
final  value 43.141297 
stopped after 100 iterations
Call:
multinom(formula = y ~ x)

Coefficients:
   (Intercept)       xPTS       xREB       xAST        xTO       xA.T      xSTL
y2   -8.847514 -0.1595873  0.3134622  0.6198001 -0.2629260 -2.1647350 -0.813519
y3   65.688912  0.2983748 -0.7309783 -0.6059289  0.9284964 -0.5720152 -1.310701
y4   31.513342 -0.1382873 -0.2432960  0.2887910  0.2204605 -2.6409780 -1.470406
          xBLK        xPF       xFG        xFT      xX3P
y2  0.01472506  0.6521056 -13.77579  10.374888 -3.436073
y3  0.63038878 -0.1788238 -86.37410 -24.769245 -4.897203
y4 -0.31863373  0.5392835 -45.18077   6.701026 -7.841990

Residual Deviance: 86.28259 
AIC: 158.2826


%%R
print(names(res))
res$fitted.values

 [1] "n"             "nunits"        "nconn"         "conn"         
 [5] "nsunits"       "decay"         "entropy"       "softmax"      
 [9] "censored"      "value"         "wts"           "convergence"  
[13] "fitted.values" "residuals"     "call"          "terms"        
[17] "weights"       "deviance"      "rank"          "lab"          
[21] "coefnames"     "vcoefnames"    "xlevels"       "edf"          
[25] "AIC"          
             y1           y2           y3           y4
1  6.785454e-01 3.214178e-01 7.032345e-06 2.972107e-05
2  6.168467e-01 3.817718e-01 2.797313e-06 1.378715e-03
3  7.784836e-01 1.990510e-01 1.688098e-02 5.584445e-03
4  5.962949e-01 3.988588e-01 5.018346e-04 4.344392e-03
5  9.815286e-01 1.694721e-02 1.442350e-03 8.179230e-05
6  9.271150e-01 6.330104e-02 4.916966e-03 4.666964e-03
7  4.515721e-01 9.303667e-02 3.488898e-02 4.205023e-01
8  8.210631e-01 1.530721e-01 7.631770e-03 1.823302e-02
9  1.567804e-01 9.375075e-02 6.413693e-01 1.080996e-01
10 8.403357e-01 9.793135e-03 1.396393e-01 1.023186e-02
11 9.163789e-01 6.747946e-02 7.847380e-05 1.606316e-02
12 2.448850e-01 4.256001e-01 2.880803e-01 4.143463e-02
13 1.040352e-01 1.534272e-01 1.369554e-01 6.055822e-01
14 8.468755e-01 1.506311e-01 5.083480e-04 1.985036e-03
15 7.136048e-01 1.294146e-01 7.385294e-02 8.312770e-02
16 9.885439e-01 1.114547e-02 2.187311e-05 2.887256e-04
17 6.478074e-02 3.547072e-01 1.988993e-01 3.816127e-01
18 4.414721e-01 4.497228e-01 4.716550e-02 6.163956e-02
19 6.024508e-03 3.608270e-01 7.837087e-02 5.547777e-01
20 4.553205e-01 4.270499e-01 3.614863e-04 1.172681e-01
21 1.342122e-01 8.627911e-01 1.759865e-03 1.236845e-03
22 1.877123e-02 6.423037e-01 5.456372e-05 3.388705e-01
23 5.620528e-01 4.359459e-01 5.606424e-04 1.440645e-03
24 2.837494e-01 7.154506e-01 2.190456e-04 5.809815e-04
25 1.787749e-01 8.037335e-01 3.361806e-04 1.715541e-02
26 3.274874e-02 3.484005e-02 1.307795e-01 8.016317e-01
27 1.635480e-01 3.471676e-01 1.131599e-01 3.761245e-01
28 2.360922e-01 7.235497e-01 3.375018e-02 6.607966e-03
29 1.618602e-02 7.233098e-01 5.762083e-06 2.604984e-01
30 3.037741e-02 8.550873e-01 7.487804e-02 3.965729e-02
31 1.122897e-01 8.648388e-01 3.935657e-03 1.893584e-02
32 2.312231e-01 6.607587e-01 4.770775e-02 6.031045e-02
33 6.743125e-01 2.028181e-02 2.612683e-01 4.413746e-02
34 1.407693e-01 4.089518e-02 7.007541e-01 1.175815e-01
35 6.919547e-04 4.194577e-05 9.950322e-01 4.233924e-03
36 8.051225e-02 4.213965e-03 9.151287e-01 1.450423e-04
37 5.691220e-05 7.480549e-02 5.171594e-01 4.079782e-01
38 2.709867e-02 3.808987e-02 6.193969e-01 3.154145e-01
39 4.531001e-05 2.248580e-08 9.999542e-01 4.626258e-07
40 1.021976e-01 4.597678e-03 5.133839e-01 3.798208e-01
41 2.005837e-02 2.063200e-01 5.925050e-01 1.811166e-01
42 1.829028e-04 1.378795e-03 6.182839e-01 3.801544e-01
43 1.734296e-01 9.025284e-04 7.758862e-01 4.978171e-02
44 4.314938e-05 3.131390e-06 9.997892e-01 1.645004e-04
45 1.516231e-02 2.060325e-03 9.792594e-01 3.517926e-03
46 2.917597e-01 6.351166e-02 4.943818e-01 1.503468e-01
47 1.278933e-04 1.773509e-03 1.209486e-01 8.771500e-01
48 1.320000e-01 2.064338e-01 6.324904e-01 2.907578e-02
49 1.683221e-02 4.007848e-01 1.628981e-03 5.807540e-01
50 9.670085e-02 4.314765e-01 7.669035e-03 4.641536e-01
51 4.953577e-02 1.370037e-01 9.882004e-02 7.146405e-01
52 1.787927e-02 9.825660e-02 2.203037e-01 6.635604e-01
53 1.174053e-02 4.723628e-01 2.430072e-03 5.134666e-01
54 2.053871e-01 6.721356e-01 4.169640e-02 8.078090e-02
55 3.060369e-06 1.418623e-03 1.072549e-02 9.878528e-01
56 1.122164e-02 6.566169e-02 3.080641e-01 6.150525e-01
57 8.873716e-03 4.996907e-01 8.222034e-03 4.832136e-01
58 2.164962e-02 2.874313e-01 1.136455e-03 6.897826e-01
59 5.230443e-03 6.430174e-04 9.816825e-01 1.244406e-02
60 8.743368e-02 6.710327e-02 4.260116e-01 4.194514e-01
61 1.913578e-01 6.458463e-04 3.307553e-01 4.772410e-01
62 6.450967e-07 5.035697e-05 7.448285e-01 2.551205e-01
63 2.400365e-04 4.651537e-03 8.183390e-06 9.951002e-01
64 1.515894e-04 2.631451e-01 1.002332e-05 7.366933e-01


%%R
rowSums(res$fitted.values)

 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 
 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 
27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 
 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 
53 54 55 56 57 58 59 60 61 62 63 64 
 1  1  1  1  1  1  1  1  1  1  1  1


%%R
dnorm(-1)/(1-pnorm(-1))

[1] 0.2876


%%R
-dnorm(1)/pnorm(1)

[1] -0.2876


%%R
library(sampleSelection)
data(Mroz87)
Mroz87$kids = (Mroz87$kids5 + Mroz87$kids618 > 0)
Mroz87$numkids = Mroz87$kids5 + Mroz87$kids618
summary(Mroz87)

/anaconda3/lib/python3.6/site-packages/rpy2/rinterface/__init__.py:146: RRuntimeWarning: Loading required package: maxLik

  warnings.warn(x, RRuntimeWarning)
/anaconda3/lib/python3.6/site-packages/rpy2/rinterface/__init__.py:146: RRuntimeWarning: Loading required package: miscTools

  warnings.warn(x, RRuntimeWarning)
/anaconda3/lib/python3.6/site-packages/rpy2/rinterface/__init__.py:146: RRuntimeWarning: 
Please cite the 'maxLik' package as:
Henningsen, Arne and Toomet, Ott (2011). maxLik: A package for maximum likelihood estimation in R. Computational Statistics 26(3), 443-458. DOI 10.1007/s00180-010-0217-1.

If you have questions, suggestions, or comments regarding the 'maxLik' package, please use a forum or 'tracker' at maxLik's R-Forge site:
https://r-forge.r-project.org/projects/maxlik/

  warnings.warn(x, RRuntimeWarning)

      lfp             hours            kids5           kids618     
 Min.   :0.0000   Min.   :   0.0   Min.   :0.0000   Min.   :0.000  
 1st Qu.:0.0000   1st Qu.:   0.0   1st Qu.:0.0000   1st Qu.:0.000  
 Median :1.0000   Median : 288.0   Median :0.0000   Median :1.000  
 Mean   :0.5684   Mean   : 740.6   Mean   :0.2377   Mean   :1.353  
 3rd Qu.:1.0000   3rd Qu.:1516.0   3rd Qu.:0.0000   3rd Qu.:2.000  
 Max.   :1.0000   Max.   :4950.0   Max.   :3.0000   Max.   :8.000  
      age             educ            wage           repwage         hushrs    
 Min.   :30.00   Min.   : 5.00   Min.   : 0.000   Min.   :0.00   Min.   : 175  
 1st Qu.:36.00   1st Qu.:12.00   1st Qu.: 0.000   1st Qu.:0.00   1st Qu.:1928  
 Median :43.00   Median :12.00   Median : 1.625   Median :0.00   Median :2164  
 Mean   :42.54   Mean   :12.29   Mean   : 2.375   Mean   :1.85   Mean   :2267  
 3rd Qu.:49.00   3rd Qu.:13.00   3rd Qu.: 3.788   3rd Qu.:3.58   3rd Qu.:2553  
 Max.   :60.00   Max.   :17.00   Max.   :25.000   Max.   :9.98   Max.   :5010  
     husage         huseduc         huswage            faminc     
 Min.   :30.00   Min.   : 3.00   Min.   : 0.4121   Min.   : 1500  
 1st Qu.:38.00   1st Qu.:11.00   1st Qu.: 4.7883   1st Qu.:15428  
 Median :46.00   Median :12.00   Median : 6.9758   Median :20880  
 Mean   :45.12   Mean   :12.49   Mean   : 7.4822   Mean   :23081  
 3rd Qu.:52.00   3rd Qu.:15.00   3rd Qu.: 9.1667   3rd Qu.:28200  
 Max.   :60.00   Max.   :17.00   Max.   :40.5090   Max.   :96000  
      mtr            motheduc         fatheduc           unem       
 Min.   :0.4415   Min.   : 0.000   Min.   : 0.000   Min.   : 3.000  
 1st Qu.:0.6215   1st Qu.: 7.000   1st Qu.: 7.000   1st Qu.: 7.500  
 Median :0.6915   Median :10.000   Median : 7.000   Median : 7.500  
 Mean   :0.6789   Mean   : 9.251   Mean   : 8.809   Mean   : 8.624  
 3rd Qu.:0.7215   3rd Qu.:12.000   3rd Qu.:12.000   3rd Qu.:11.000  
 Max.   :0.9415   Max.   :17.000   Max.   :17.000   Max.   :14.000  
      city            exper          nwifeinc         wifecoll    huscoll   
 Min.   :0.0000   Min.   : 0.00   Min.   :-0.02906    TRUE:212    TRUE:295  
 1st Qu.:0.0000   1st Qu.: 4.00   1st Qu.:13.02504   FALSE:541   FALSE:458  
 Median :1.0000   Median : 9.00   Median :17.70000                          
 Mean   :0.6428   Mean   :10.63   Mean   :20.12896                          
 3rd Qu.:1.0000   3rd Qu.:15.00   3rd Qu.:24.46600                          
 Max.   :1.0000   Max.   :45.00   Max.   :96.00000                          
    kids            numkids     
 Mode :logical   Min.   :0.000  
 FALSE:229       1st Qu.:0.000  
 TRUE :524       Median :1.000  
                 Mean   :1.591  
                 3rd Qu.:3.000  
                 Max.   :8.000


%%R
res = lm(wage ~ age + age^2 + educ + city, data=Mroz87)
summary(res)

Call:
lm(formula = wage ~ age + age^2 + educ + city, data = Mroz87)

Residuals:
    Min      1Q  Median      3Q     Max 
-4.5331 -2.2710 -0.4765  1.3975 22.7241 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -3.2490882  0.9094210  -3.573 0.000376 ***
age          0.0008193  0.0141084   0.058 0.953708    
educ         0.4496393  0.0503591   8.929  < 2e-16 ***
city         0.0998064  0.2388551   0.418 0.676174    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.079 on 749 degrees of freedom
Multiple R-squared:  0.1016,	Adjusted R-squared:  0.09799 
F-statistic: 28.23 on 3 and 749 DF,  p-value: < 2.2e-16


%%R
res = lm(wage ~ age + age^2 + lfp + city, data=Mroz87)
summary(res)

Call:
lm(formula = wage ~ age + age^2 + lfp + city, data = Mroz87)

Residuals:
    Min      1Q  Median      3Q     Max 
-4.1815 -0.9869 -0.1624  0.3081 20.6809 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -0.478793   0.513001  -0.933   0.3510    
age          0.004163   0.011333   0.367   0.7135    
lfp          4.185897   0.183727  22.783   <2e-16 ***
city         0.462158   0.190176   2.430   0.0153 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.489 on 749 degrees of freedom
Multiple R-squared:  0.4129,	Adjusted R-squared:  0.4105 
F-statistic: 175.6 on 3 and 749 DF,  p-value: < 2.2e-16


%%R
#LET'S TRY BOTH VARIABLES
Mroz87$educlfp = Mroz87$educ*Mroz87$lfp
res = lm(wage ~ age + age^2 + lfp + educ + city + educlfp , data=Mroz87)
summary(res)

Call:
lm(formula = wage ~ age + age^2 + lfp + educ + city + educlfp, 
    data = Mroz87)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.8139 -0.7307 -0.0712  0.2261 21.1120 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -0.528196   0.904949  -0.584   0.5596    
age          0.009299   0.010801   0.861   0.3895    
lfp         -2.028354   0.963841  -2.104   0.0357 *  
educ        -0.002723   0.060710  -0.045   0.9642    
city         0.244245   0.182220   1.340   0.1805    
educlfp      0.491515   0.077942   6.306 4.89e-10 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.347 on 747 degrees of freedom
Multiple R-squared:  0.4792,	Adjusted R-squared:  0.4757 
F-statistic: 137.4 on 5 and 747 DF,  p-value: < 2.2e-16


%%R
#LET'S DROP INTERACTION
res = lm(wage ~ age + age^2 + lfp + educ + city, data=Mroz87)
summary(res)

Call:
lm(formula = wage ~ age + age^2 + lfp + educ + city, data = Mroz87)

Residuals:
    Min      1Q  Median      3Q     Max 
-4.9849 -1.1053 -0.1626  0.4762 21.0179 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -4.18595    0.71239  -5.876 6.33e-09 ***
age          0.01421    0.01105   1.286    0.199    
lfp          3.94731    0.18073  21.841  < 2e-16 ***
educ         0.29043    0.04005   7.252 1.03e-12 ***
city         0.22401    0.18685   1.199    0.231    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.407 on 748 degrees of freedom
Multiple R-squared:  0.4514,	Adjusted R-squared:  0.4485 
F-statistic: 153.9 on 4 and 748 DF,  p-value: < 2.2e-16


%%R
res = selection(lfp ~ age + age^2 + faminc + kids5 + educ,
                wage ~ exper + exper^2 + educ + city, data=Mroz87, method = "2step" )
summary(res)

--------------------------------------------
Tobit 2 model (sample selection model)
2-step Heckman / heckit estimation
753 observations (325 censored and 428 observed)
12 free parameters (df = 742)
Probit selection equation:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  3.394e-01  4.119e-01   0.824    0.410    
age         -3.424e-02  6.728e-03  -5.090 4.55e-07 ***
faminc       3.390e-06  4.267e-06   0.795    0.427    
kids5       -8.624e-01  1.111e-01  -7.762 2.78e-14 ***
educ         1.162e-01  2.361e-02   4.923 1.05e-06 ***
Outcome equation:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -2.66736    1.30192  -2.049   0.0408 *  
exper        0.02370    0.01886   1.256   0.2093    
educ         0.48816    0.07946   6.144 1.31e-09 ***
city         0.44936    0.31585   1.423   0.1553    
Multiple R-Squared:0.1248,	Adjusted R-Squared:0.1165
   Error terms:
              Estimate Std. Error t value Pr(>|t|)
invMillsRatio  0.11082    0.73108   0.152     0.88
sigma          3.09434         NA      NA       NA
rho            0.03581         NA      NA       NA
--------------------------------------------


%%R
## Example using binary outcome for selection model.
## We estimate the probability of womens' education on their
## chances to get high wage (> $5/hr in 1975 USD), using PSID data
## We use education as explanatory variable
## and add age, kids, and non-work income as exclusion restrictions.
library(mvtnorm)
data(Mroz87)
m <- selection(lfp ~ educ + age + kids5 + kids618 + nwifeinc,
wage >= 5 ~ educ, data = Mroz87 )
summary(m)

--------------------------------------------
Tobit 2 model with binary outcome (sample selection model)
Maximum Likelihood estimation
BHHH maximisation, 8 iterations
Return code 2: successive function values within tolerance limit
Log-Likelihood: -653.2037 
753 observations (325 censored and 428 observed)
9 free parameters (df = 744)
Probit selection equation:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.430362   0.475966   0.904    0.366    
educ         0.156223   0.023811   6.561 1.00e-10 ***
age         -0.034713   0.007649  -4.538 6.61e-06 ***
kids5       -0.890560   0.112663  -7.905 9.69e-15 ***
kids618     -0.038167   0.039320  -0.971    0.332    
nwifeinc    -0.020948   0.004318  -4.851 1.49e-06 ***
Outcome equation:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -4.5213     0.5611  -8.058 3.08e-15 ***
educ          0.2879     0.0369   7.800 2.09e-14 ***
   Error terms:
    Estimate Std. Error t value Pr(>|t|)
rho   0.1164     0.2706    0.43    0.667
--------------------------------------------


%%R
#CHECK THAT THE NUMBER OF KIDS MATTERS OR NOT
Mroz87$numkids = Mroz87$kids5 + Mroz87$kids618
summary(lm(wage ~ numkids, data=Mroz87))

Call:
lm(formula = wage ~ numkids, data = Mroz87)

Residuals:
    Min      1Q  Median      3Q     Max 
-2.6814 -2.2957 -0.8125  1.3186 23.0900 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  2.68138    0.17421   15.39   <2e-16 ***
numkids     -0.19285    0.08069   -2.39   0.0171 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.232 on 751 degrees of freedom
Multiple R-squared:  0.007548,	Adjusted R-squared:  0.006227 
F-statistic: 5.712 on 1 and 751 DF,  p-value: 0.0171


%%R
res = selection(lfp ~ age + I(age^2) + faminc + numkids + educ,
                wage ~ exper + I(exper^2) + educ + city + numkids, data=Mroz87, method = "2step" )
summary(res)

--------------------------------------------
Tobit 2 model (sample selection model)
2-step Heckman / heckit estimation
753 observations (325 censored and 428 observed)
15 free parameters (df = 739)
Probit selection equation:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -3.725e+00  1.398e+00  -2.664  0.00789 ** 
age          1.656e-01  6.482e-02   2.554  0.01084 *  
I(age^2)    -2.198e-03  7.537e-04  -2.917  0.00365 ** 
faminc       4.001e-06  4.204e-06   0.952  0.34161    
numkids     -1.513e-01  3.827e-02  -3.955 8.39e-05 ***
educ         9.224e-02  2.302e-02   4.007 6.77e-05 ***
Outcome equation:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -2.2476932  2.0702572  -1.086    0.278    
exper        0.0271253  0.0635033   0.427    0.669    
I(exper^2)  -0.0001957  0.0019429  -0.101    0.920    
educ         0.4726828  0.1037086   4.558 6.05e-06 ***
city         0.4389577  0.3166504   1.386    0.166    
numkids     -0.0471181  0.1420580  -0.332    0.740    
Multiple R-Squared:0.1252,	Adjusted R-Squared:0.1128
   Error terms:
              Estimate Std. Error t value Pr(>|t|)
invMillsRatio -0.11737    1.38036  -0.085    0.932
sigma          3.09374         NA      NA       NA
rho           -0.03794         NA      NA       NA
--------------------------------------------


%%R
library(survival)


%%R
SURV = read.table("DSTMAA_data/survival_data.txt",header=TRUE)
SURV

  id time death age female
1  1    1     1  20      0
2  2    4     0  21      1
3  3    7     1  19      0
4  4   10     1  22      1
5  5   12     0  20      0
6  6   13     1  24      1


%%R
#SIMPLE REGRESSION APPROACH
summary(lm(death ~ age+female, SURV))

Call:
lm(formula = death ~ age + female, data = SURV)

Residuals:
       1        2        3        4        5        6 
 0.27083 -0.41667  0.45833  0.39583 -0.72917  0.02083 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)  -3.0208     5.2751  -0.573    0.607
age           0.1875     0.2676   0.701    0.534
female       -0.5000     0.8740  -0.572    0.607

Residual standard error: 0.618 on 3 degrees of freedom
Multiple R-squared:  0.1406,	Adjusted R-squared:  -0.4323 
F-statistic: 0.2455 on 2 and 3 DF,  p-value: 0.7967


%%R
#COX APPROACH
res = coxph(Surv(time, death) ~ female + age, data = SURV)
summary(res)
plot(survfit(res))   #Plot the baseline survival function


%%R
cox.zph(res)

          rho chisq     p
female  0.563 1.504 0.220
age    -0.472 0.743 0.389
GLOBAL     NA 1.762 0.414


%%R
covs <- data.frame(age = 21, female = 0)
summary(survfit(res, newdata = covs, type = "aalen"))

Call: survfit(formula = res, newdata = covs, type = "aalen")

 time n.risk n.event survival std.err lower 95% CI upper 95% CI
    1      6       1   0.9475   0.108     7.58e-01            1
    7      4       1   0.8672   0.236     5.08e-01            1
   10      3       1   0.7000   0.394     2.32e-01            1
   13      1       1   0.0184   0.117     7.14e-08            1


%%R
library(glmnet)

ncaa = read.table("DSTMAA_data/ncaa.txt",header=TRUE)
y = c(rep(1,32),rep(0,32))
x = as.matrix(ncaa[4:14])
res = cv.glmnet(x = x, y = y, family = 'binomial', alpha = 1, type.measure = "auc")
plot(res)


%%R
res = glmnet(x = x, y = y, family = 'binomial', alpha = 1)
print(names(res))
print(res)

 [1] "a0"         "beta"       "df"         "dim"        "lambda"    
 [6] "dev.ratio"  "nulldev"    "npasses"    "jerr"       "offset"    
[11] "classnames" "call"       "nobs"      

Call:  glmnet(x = x, y = y, family = "binomial", alpha = 1) 

      Df      %Dev    Lambda
 [1,]  0 1.602e-16 0.2615000
 [2,]  1 3.357e-02 0.2383000
 [3,]  1 6.172e-02 0.2171000
 [4,]  1 8.554e-02 0.1978000
 [5,]  1 1.058e-01 0.1803000
 [6,]  1 1.231e-01 0.1642000
 [7,]  1 1.380e-01 0.1496000
 [8,]  1 1.508e-01 0.1364000
 [9,]  1 1.618e-01 0.1242000
[10,]  2 1.721e-01 0.1132000
[11,]  4 1.851e-01 0.1031000
[12,]  5 1.990e-01 0.0939800
[13,]  4 2.153e-01 0.0856300
[14,]  4 2.293e-01 0.0780300
[15,]  4 2.415e-01 0.0711000
[16,]  5 2.540e-01 0.0647800
[17,]  8 2.730e-01 0.0590200
[18,]  8 2.994e-01 0.0537800
[19,]  8 3.225e-01 0.0490000
[20,]  8 3.428e-01 0.0446500
[21,]  8 3.608e-01 0.0406800
[22,]  8 3.766e-01 0.0370700
[23,]  8 3.908e-01 0.0337800
[24,]  8 4.033e-01 0.0307800
[25,]  8 4.145e-01 0.0280400
[26,]  9 4.252e-01 0.0255500
[27,] 10 4.356e-01 0.0232800
[28,] 10 4.450e-01 0.0212100
[29,] 10 4.534e-01 0.0193300
[30,] 10 4.609e-01 0.0176100
[31,] 10 4.676e-01 0.0160500
[32,] 10 4.735e-01 0.0146200
[33,] 10 4.789e-01 0.0133200
[34,] 10 4.836e-01 0.0121400
[35,] 10 4.878e-01 0.0110600
[36,]  9 4.912e-01 0.0100800
[37,]  9 4.938e-01 0.0091820
[38,]  9 4.963e-01 0.0083670
[39,]  9 4.984e-01 0.0076230
[40,]  9 5.002e-01 0.0069460
[41,]  9 5.018e-01 0.0063290
[42,]  9 5.032e-01 0.0057670
[43,]  9 5.044e-01 0.0052540
[44,]  9 5.055e-01 0.0047880
[45,]  9 5.064e-01 0.0043620
[46,]  9 5.071e-01 0.0039750
[47,] 10 5.084e-01 0.0036220
[48,] 10 5.095e-01 0.0033000
[49,] 10 5.105e-01 0.0030070
[50,] 10 5.114e-01 0.0027400
[51,] 10 5.121e-01 0.0024960
[52,] 10 5.127e-01 0.0022750
[53,] 11 5.133e-01 0.0020720
[54,] 11 5.138e-01 0.0018880
[55,] 11 5.142e-01 0.0017210
[56,] 11 5.146e-01 0.0015680
[57,] 11 5.149e-01 0.0014280
[58,] 11 5.152e-01 0.0013020
[59,] 11 5.154e-01 0.0011860
[60,] 11 5.156e-01 0.0010810
[61,] 11 5.157e-01 0.0009846
[62,] 11 5.158e-01 0.0008971
[63,] 11 5.160e-01 0.0008174
[64,] 11 5.160e-01 0.0007448
[65,] 11 5.161e-01 0.0006786
[66,] 11 5.162e-01 0.0006183
[67,] 11 5.162e-01 0.0005634
[68,] 11 5.163e-01 0.0005134
[69,] 11 5.163e-01 0.0004678
[70,] 11 5.164e-01 0.0004262
[71,] 11 5.164e-01 0.0003883
[72,] 11 5.164e-01 0.0003538
[73,] 11 5.164e-01 0.0003224
[74,] 11 5.164e-01 0.0002938
[75,] 11 5.165e-01 0.0002677
[76,] 11 5.165e-01 0.0002439
[77,] 11 5.165e-01 0.0002222


%%R
b = coef(res)[,25]   #Choose the best case with 8 coefficients
print(b)

x1 = c(1,as.numeric(x[18,]))
p = 1/(1+exp(-sum(b*x1)))
print(p)

 (Intercept)          PTS          REB          AST           TO          A.T 
-17.30807196   0.04224762   0.13304541   0.00000000  -0.13440922   0.63059336 
         STL          BLK           PF           FG           FT          X3P 
  0.21867734   0.11635708   0.00000000  17.14864198   3.00069901   0.00000000 
[1] 0.7696481


%%R
preds = predict(res, x, type = 'response')
print(dim(preds))
preds = preds[,25]   #Take the 25th case
print(preds)
print(glmnet:::auc(y, preds))
print(table(y,round(preds,0)))   #rounding needed to make 0,1

[1] 64 77
 [1] 0.97443940 0.90157397 0.87711437 0.89911656 0.95684199 0.82949042
 [7] 0.53186622 0.83745812 0.45979765 0.58355756 0.78726183 0.55050365
[13] 0.30633472 0.93605170 0.70646742 0.85811465 0.42394178 0.76964806
[19] 0.40172414 0.66137964 0.69620096 0.61569705 0.88800581 0.92834645
[25] 0.82719624 0.17209046 0.66881541 0.84149477 0.58937886 0.64674446
[31] 0.79368965 0.51186217 0.58500925 0.61275721 0.17532362 0.47406867
[37] 0.24314471 0.11843924 0.26787937 0.24296988 0.21129918 0.05041436
[43] 0.30109650 0.14989973 0.17976216 0.57119150 0.05514704 0.46220128
[49] 0.63788393 0.32605605 0.35544396 0.12647374 0.61772958 0.63883954
[55] 0.02306762 0.21285032 0.36455131 0.53953727 0.18563868 0.23598354
[61] 0.11821886 0.04258418 0.19603015 0.24630145
[1] 0.9072266
   
y    0  1
  0 25  7
  1  5 27


%%R
suppressMessages(library(pROC))
res = roc(response=y,predictor=preds)
print(res)
plot(res); grid()

Call:
roc.default(response = y, predictor = preds)

Data: preds in 32 controls (y 0) < 32 cases (y 1).
Area under the curve: 0.9072


%%R
SURV = read.table("DSTMAA_data/survival_data.txt",header=TRUE)
print(SURV)
names(SURV)[3] = "status"
y = as.matrix(SURV[,2:3])
x = as.matrix(SURV[,4:5])

  id time death age female
1  1    1     1  20      0
2  2    4     0  21      1
3  3    7     1  19      0
4  4   10     1  22      1
5  5   12     0  20      0
6  6   13     1  24      1


%%R
res = glmnet(x, y, family = "cox")
print(res)

Call:  glmnet(x = x, y = y, family = "cox") 

      Df    %Dev   Lambda
 [1,]  0 0.00000 0.331700
 [2,]  1 0.02347 0.302200
 [3,]  1 0.04337 0.275400
 [4,]  1 0.06027 0.250900
 [5,]  1 0.07466 0.228600
 [6,]  1 0.08690 0.208300
 [7,]  1 0.09734 0.189800
 [8,]  1 0.10620 0.172900
 [9,]  1 0.11380 0.157600
[10,]  1 0.12020 0.143600
[11,]  1 0.12570 0.130800
[12,]  1 0.13040 0.119200
[13,]  1 0.13430 0.108600
[14,]  1 0.13770 0.098970
[15,]  1 0.14050 0.090180
[16,]  1 0.14300 0.082170
[17,]  1 0.14500 0.074870
[18,]  1 0.14670 0.068210
[19,]  1 0.14820 0.062150
[20,]  1 0.14940 0.056630
[21,]  1 0.15040 0.051600
[22,]  1 0.15130 0.047020
[23,]  1 0.15200 0.042840
[24,]  1 0.15260 0.039040
[25,]  1 0.15310 0.035570
[26,]  2 0.15930 0.032410
[27,]  2 0.16480 0.029530
[28,]  2 0.16930 0.026910
[29,]  2 0.17320 0.024520
[30,]  2 0.17640 0.022340
[31,]  2 0.17910 0.020350
[32,]  2 0.18140 0.018540
[33,]  2 0.18330 0.016900
[34,]  2 0.18490 0.015400
[35,]  2 0.18630 0.014030
[36,]  2 0.18740 0.012780
[37,]  2 0.18830 0.011650
[38,]  2 0.18910 0.010610
[39,]  2 0.18980 0.009669
[40,]  2 0.19030 0.008810
[41,]  2 0.19080 0.008028
[42,]  2 0.19120 0.007314
[43,]  2 0.19150 0.006665
[44,]  2 0.19180 0.006073
[45,]  2 0.19200 0.005533
[46,]  2 0.19220 0.005042
[47,]  2 0.19240 0.004594
[48,]  2 0.19250 0.004186
[49,]  2 0.19260 0.003814
[50,]  2 0.19270 0.003475
[51,]  2 0.19280 0.003166
[52,]  2 0.19280 0.002885
[53,]  2 0.19290 0.002629
[54,]  2 0.19290 0.002395
[55,]  2 0.19300 0.002182
[56,]  2 0.19300 0.001988


%%R
plot(res)
print(coef(res))

2 x 56 sparse Matrix of class "dgCMatrix"
                                                                             
age    . -0.03232796 -0.06240328 -0.09044971 -0.1166396 -0.1411157 -0.1639991
female .  .           .           .           .          .          .        
                                                                      
age    -0.185342 -0.2053471 -0.2240373 -0.2414872 -0.2577658 -0.272938
female  .         .          .          .          .          .       
                                                                        
age    -0.2870651 -0.3002053 -0.3124148 -0.3237473 -0.3342545 -0.3440275
female  .          .          .          .          .          .        
                                                                        
age    -0.3530249 -0.3613422 -0.3690231 -0.3761098 -0.3826423 -0.3886591
female  .          .          .          .          .          .        
                                                                        
age    -0.4300447 -0.4704889 -0.5078614 -0.5424838 -0.5745449 -0.6042077
female  0.1232263  0.2429576  0.3522138  0.4522592  0.5439278  0.6279337
                                                                        
age    -0.6316057 -0.6569988 -0.6804703 -0.7022042 -0.7222141 -0.7407295
female  0.7048655  0.7754539  0.8403575  0.9000510  0.9546989  1.0049765
                                                                       
age    -0.7577467 -0.773467 -0.7878944 -0.8012225 -0.8133071 -0.8246563
female  1.0509715  1.093264  1.1319284  1.1675026  1.1999905  1.2297716
                                                                      
age    -0.8349496 -0.8442393 -0.8528942 -0.860838 -0.8680639 -0.874736
female  1.2570025  1.2817654  1.3045389  1.325398  1.3443458  1.361801
                                                                        
age    -0.8808466 -0.8863844 -0.8915045 -0.8961894 -0.9004172 -0.9043351
female  1.3777603  1.3922138  1.4055495  1.4177359  1.4287319  1.4389022
                 
age    -0.9079181
female  1.4481934


%%R
cvfit = cv.glmnet(x, y, family = "cox")
plot(cvfit)
print(cvfit$lambda.min)
print(coef(cvfit,s=cvfit$lambda.min))

[1] 0.0989681
2 x 1 sparse Matrix of class "dgCMatrix"
                1
age    -0.2870651
female  .

Truncate and Estimate: Limited Dependent Variables {#LimitedDependentVariables}¶

Maximum-Likelihood Estimation (MLE)¶

Likelihood¶

Implementation¶

Logit and Probit Models¶

Relation to classifiers¶

Logit¶

Example¶

Probit¶

Analysis¶

Odds Ratio and Slopes (Coefficients) in a Logit¶

Calculus of the logit coefficients¶

How about the Probit model?¶

Maximum-Likelihood Estimation (MLE) of these Choice Models¶

Multinomial Logit¶

When OLS fails¶

Truncated Variables and Sample Selection¶

Inverse Mills Ratio¶

Example: Limited Dependent Variables in VC Syndications¶

Sample Selection Problems (and endogeneity)¶

Example: Women in the Labor Market¶

Endogeity: Some Theory to Wrap Up¶

Cox Proportional Hazards Model¶

GLMNET: Lasso and Ridge Regressions¶

Prediction on test data¶

ROC Curves¶

Glmnet Cox Models¶