Inquiries regarding this code contact gabriel.carrasco@upch.pe

rm(list=ls())
library(dplyr)
library(ggplot2)
library(epiDisplay)

ANOVA

No variance

anova_gce %>% ggplot(aes(x=grp, y=sit2, col=grp, grp=as.factor(grp))) +
  geom_jitter(height = 0, width = .1, size=3, alpha=.5) +
  geom_crossbar(stat="summary", fun.y=mean, fun.ymax=mean, fun.ymin=mean) +
  theme_classic()

(a<-summary(aov(sit2 ~ grp, data = anova_gce)))

##             Df Sum Sq Mean Sq F value Pr(>F)
## grp          3      0       0               
## Residuals   16      0       0

No between-groups variance (0) and within-group variance (0)

No within-group variance

anova_gce %>% ggplot(aes(x=grp, y=sit3, col=grp, grp=as.factor(grp))) +
  geom_jitter(height = 0, width = .1, size=3, alpha=.5) +
  geom_crossbar(stat="summary", fun.y=mean, fun.ymax=mean, fun.ymin=mean) +
  theme_classic()

(a<-summary(aov(sit3 ~ grp, data = anova_gce)))

##             Df Sum Sq Mean Sq   F value Pr(>F)    
## grp          3    162   53.99 2.443e+31 <2e-16 ***
## Residuals   16      0    0.00                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

High between-groups variance (162) and no within-group variance (0)

No between-groups variance

anova_gce %>% ggplot(aes(x=grp, y=sit4, col=grp, grp=as.factor(grp))) +
  geom_jitter(height = 0, width = .1, size=3, alpha=.5) +
  geom_crossbar(stat="summary", fun.y=mean, fun.ymax=mean, fun.ymin=mean) +
  theme_classic()

(a<-summary(aov(sit4 ~ grp, data = anova_gce)))

##             Df Sum Sq Mean Sq F value Pr(>F)
## grp          3    0.0   0.000       0      1
## Residuals   16  111.2   6.947

No between-groups variance (0) and high within-group variance (111.2)

Case

anova_gce %>% ggplot(aes(x=grp, y=sit1, col=grp, grp=as.factor(grp))) +
  geom_jitter(height = 0, width = .1, size=3, alpha=.5) +
  geom_crossbar(stat="summary", fun.y=mean, fun.ymax=mean, fun.ymin=mean) +
  theme_classic()

(a<-summary(aov(sit1 ~ grp, data = anova_gce)))

##             Df Sum Sq Mean Sq F value   Pr(>F)    
## grp          3 162.12   54.04   47.81 3.28e-08 ***
## Residuals   16  18.08    1.13                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

a[[1]]$'Sum Sq'[1]/(a[[1]]$'Sum Sq'[1]+a[[1]]$'Sum Sq'[2])

## [1] 0.8996485

More between-groups variance (162.12) than within-group variance (18.08)

R-squared = 0.8996485

LINEAR REGRESSION

Assumptions:

Existence: Y is a random variable with a distribution of values for each specific combination of values of X’s.
Independence: Observations of Y are statistically independent of each other.
Linearity: Mean of Y for each specific combination of X’s is μY|X =β0 +β1X1 +β2X2 +⋯+βkXk .
Homoscedasticity: Variance of Y is the same for any specific combination of X’s. For hypothesis testing, need additional assumption
Normality: Y is normally distributed for each specific combination of X’s. Equivalently, E is normally distributed.

Model

LM <- lm(sit1 ~ grp, data = anova_gce)
anova(LM)

## Analysis of Variance Table
## 
## Response: sit1
##           Df  Sum Sq Mean Sq F value    Pr(>F)    
## grp        3 162.117  54.039  47.813 3.277e-08 ***
## Residuals 16  18.083   1.130                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(LM)

## 
## Call:
## lm(formula = sit1 ~ grp, data = anova_gce)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.40000 -0.76250 -0.06667  0.66667  1.66667 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  13.8000     0.4754  29.026 2.88e-15 ***
## grpb          5.5333     0.6437   8.596 2.16e-07 ***
## grpc         -2.0500     0.7132  -2.875  0.01101 *  
## grpd          2.6000     0.6724   3.867  0.00137 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.063 on 16 degrees of freedom
## Multiple R-squared:  0.8996, Adjusted R-squared:  0.8808 
## F-statistic: 47.81 on 3 and 16 DF,  p-value: 3.277e-08

R-squared = 0.8996

Test nested models

anova_gce$v2 <- rbinom(n=20, size=1, prob=0.5)

(full_model<-summary(aov(lm(sit1 ~ grp + v2, data = anova_gce))))

##             Df Sum Sq Mean Sq F value   Pr(>F)    
## grp          3 162.12   54.04  66.241 6.98e-09 ***
## v2           1   5.85    5.85   7.167   0.0172 *  
## Residuals   15  12.24    0.82                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(reduced_model<-summary(aov(lm(sit1 ~ grp, data = anova_gce))))

##             Df Sum Sq Mean Sq F value   Pr(>F)    
## grp          3 162.12   54.04   47.81 3.28e-08 ***
## Residuals   16  18.08    1.13                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

SSEp <- reduced_model[[1]]$'Sum Sq'[2]
SSEk <- full_model[[1]]$'Sum Sq'[3]
MSEk <- full_model[[1]]$'Mean Sq'[3]
k <- length(full_model[[1]]$'Sum Sq')-1
p <- length(reduced_model[[1]]$'Sum Sq')-1
nkp <- full_model[[1]]$'Df'[3]

Fstat <- ((SSEp-SSEk)/(k-p))/MSEk
(PrF <- 1-pf(Fstat, df1=k-p, df2=nkp))

## [1] 0.01723378

lm.full <- lm(sit1 ~ grp + v2, data = anova_gce)
lm.red <- lm(sit1 ~ grp, data = anova_gce)
anova(lm.full,lm.red)

## Analysis of Variance Table
## 
## Model 1: sit1 ~ grp + v2
## Model 2: sit1 ~ grp
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1     15 12.237                              
## 2     16 18.083 -1   -5.8464 7.1665 0.01723 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

LOGISTIC REGRESSION

head(log_dt)

## # A tibble: 6 x 10
##   infarto  tabaco alcohol `_est_m0` `_est_m1` `_est_mod1` `_est_mod2`     p
##   <dbl+l> <dbl+l> <dbl+l>     <dbl>     <dbl>       <dbl>       <dbl> <dbl>
## 1 1 [Yes] 1 [Yes] 1 [Yes]         1         1           1           1 0.822
## 2 1 [Yes] 1 [Yes] 1 [Yes]         1         1           1           1 0.822
## 3 1 [Yes] 1 [Yes] 1 [Yes]         1         1           1           1 0.822
## 4 1 [Yes] 1 [Yes] 1 [Yes]         1         1           1           1 0.822
## 5 1 [Yes] 1 [Yes] 1 [Yes]         1         1           1           1 0.822
## 6 1 [Yes] 1 [Yes] 1 [Yes]         1         1           1           1 0.822
## # … with 2 more variables: resid_std <dbl>, age <dbl>

log_red <- glm(infarto ~ alcohol, family=binomial(link="logit") ,data=log_dt)
summary(log_red)

## 
## Call:
## glm(formula = infarto ~ alcohol, family = binomial(link = "logit"), 
##     data = log_dt)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2049  -1.2049  -0.8203   1.1501   1.5829  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.9163     0.2092  -4.381 1.18e-05 ***
## alcohol       0.9808     0.2555   3.839 0.000124 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 407.18  on 297  degrees of freedom
## Residual deviance: 391.67  on 296  degrees of freedom
## AIC: 395.67
## 
## Number of Fisher Scoring iterations: 4

logistic.display(log_red)

## 
## Logistic regression predicting infarto 
##  
##                  OR(95%CI)        P(Wald's test) P(LR-test)
## alcohol: 1 vs 0  2.67 (1.62,4.4)  < 0.001        < 0.001   
##                                                            
## Log-likelihood = -195.8348
## No. of observations = 298
## AIC value = 395.6696

(LLred<-as.numeric(logLik(log_red)))

## [1] -195.8348

Test nested models

log_full <- glm(infarto ~ alcohol + tabaco, family=binomial(link="logit") ,data=log_dt)
summary(log_full)

## 
## Call:
## glm(formula = infarto ~ alcohol + tabaco, family = binomial(link = "logit"), 
##     data = log_dt)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8592  -0.7828  -0.6960   0.6253   1.7530  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.2943     0.2333  -5.548 2.90e-08 ***
## alcohol       0.2684     0.3011   0.891    0.373    
## tabaco        2.5587     0.3215   7.959 1.73e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 407.18  on 297  degrees of freedom
## Residual deviance: 313.12  on 295  degrees of freedom
## AIC: 319.12
## 
## Number of Fisher Scoring iterations: 4

logistic.display(log_full)

## 
## Logistic regression predicting infarto 
##  
##                  crude OR(95%CI)     adj. OR(95%CI)      P(Wald's test)
## alcohol: 1 vs 0  2.67 (1.62,4.4)     1.31 (0.72,2.36)    0.373         
##                                                                        
## tabaco: 1 vs 0   14.07 (7.68,25.79)  12.92 (6.88,24.26)  < 0.001       
##                                                                        
##                  P(LR-test)
## alcohol: 1 vs 0  0.373     
##                            
## tabaco: 1 vs 0   < 0.001   
##                            
## Log-likelihood = -156.5576
## No. of observations = 298
## AIC value = 319.1152

(LLfull<-as.numeric(logLik(log_full)))

## [1] -156.5576

(LRtest <- 2*(-156.5576-(-195.83478)))

## [1] 78.55436

(LRtest <- 2*(LLfull-(LLred)))

## [1] 78.55439

lrtest (log_full, log_red)

## Likelihood ratio test for MLE method 
## Chi-squared 1 d.f. =  78.55439 , P value =  7.782556e-19

Predicted Values

library(plotROC); library(pROC)
log_dt$p1 <- predict(log_red, newdata=log_dt, type="response")
log_dt %>% ggplot(aes(d = infarto, m = p1)) + geom_roc() + 
  geom_abline(intercept=0, slope=1, colour="red", linetype="dashed") +
  theme_bw()

auc(log_dt$infarto, log_dt$p1)

## Area under the curve: 0.6103

log_age <- glm(infarto ~ alcohol + tabaco + age, family=binomial(link="logit") ,data=log_dt)
log_dt$p2 <- predict(log_age, newdata=log_dt, type="response")
log_dt %>% ggplot(aes(d = infarto, m = p2)) + geom_roc() + 
  geom_abline(intercept=0, slope=1, colour="red", linetype="dashed") +
  theme_bw()

auc(log_dt$infarto, log_dt$p2)

## Area under the curve: 0.9777

Residuals

log_dt$p1 <- predict(log_red, newdata=log_dt, type="response")
log_dt$p2 <- predict(log_age, newdata=log_dt, type="response")

log_dt$resid_std <- rstudent(log_age)
shapiro.test(log_dt$resid_std)

## 
##  Shapiro-Wilk normality test
## 
## data:  log_dt$resid_std
## W = 0.79336, p-value < 2.2e-16

log_dt %>% ggplot(aes(sample=resid_std)) + stat_qq() + stat_qq_line(col="red", linetype="dashed") + theme_classic()

# experimental
log_dt$influy2 <- dfbetas(log_age)[,4]*100
log_dt %>% filter(influy2>1)

## # A tibble: 167 x 13
##    infarto  tabaco alcohol `_est_m0` `_est_m1` `_est_mod1` `_est_mod2`
##    <dbl+l> <dbl+l> <dbl+l>     <dbl>     <dbl>       <dbl>       <dbl>
##  1 1 [Yes] 1 [Yes] 1 [Yes]         1         1           1           1
##  2 1 [Yes] 1 [Yes] 1 [Yes]         1         1           1           1
##  3 1 [Yes] 1 [Yes] 1 [Yes]         1         1           1           1
##  4 1 [Yes] 1 [Yes] 1 [Yes]         1         1           1           1
##  5 1 [Yes] 1 [Yes] 1 [Yes]         1         1           1           1
##  6 1 [Yes] 1 [Yes] 1 [Yes]         1         1           1           1
##  7 1 [Yes] 1 [Yes] 1 [Yes]         1         1           1           1
##  8 1 [Yes] 1 [Yes] 1 [Yes]         1         1           1           1
##  9 1 [Yes] 1 [Yes] 1 [Yes]         1         1           1           1
## 10 1 [Yes] 1 [Yes] 1 [Yes]         1         1           1           1
## # … with 157 more rows, and 6 more variables: p <dbl>,
## #   resid_std <dbl+lbl>, age <dbl>, p1 <dbl>, p2 <dbl>, influy2 <dbl>

log_dt %>% ggplot(aes(x=p2, y=influy2)) + geom_point()

Routine for PH 628

Lecture 1: Linear and Logistic Regression

Gabriel Carrasco Escobar

ANOVA

No variance

No within-group variance

No between-groups variance

Case

LINEAR REGRESSION

Assumptions:

Model

Test nested models

LOGISTIC REGRESSION

Test nested models

Predicted Values

Residuals