Inquiries regarding this code contact gabriel.carrasco@upch.pe
rm(list=ls())
library(dplyr)
library(ggplot2)
library(epiDisplay)
anova_gce %>% ggplot(aes(x=grp, y=sit2, col=grp, grp=as.factor(grp))) +
geom_jitter(height = 0, width = .1, size=3, alpha=.5) +
geom_crossbar(stat="summary", fun.y=mean, fun.ymax=mean, fun.ymin=mean) +
theme_classic()
(a<-summary(aov(sit2 ~ grp, data = anova_gce)))
## Df Sum Sq Mean Sq F value Pr(>F)
## grp 3 0 0
## Residuals 16 0 0
No between-groups variance (0) and within-group variance (0)
anova_gce %>% ggplot(aes(x=grp, y=sit3, col=grp, grp=as.factor(grp))) +
geom_jitter(height = 0, width = .1, size=3, alpha=.5) +
geom_crossbar(stat="summary", fun.y=mean, fun.ymax=mean, fun.ymin=mean) +
theme_classic()
(a<-summary(aov(sit3 ~ grp, data = anova_gce)))
## Df Sum Sq Mean Sq F value Pr(>F)
## grp 3 162 53.99 2.443e+31 <2e-16 ***
## Residuals 16 0 0.00
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
High between-groups variance (162) and no within-group variance (0)
anova_gce %>% ggplot(aes(x=grp, y=sit4, col=grp, grp=as.factor(grp))) +
geom_jitter(height = 0, width = .1, size=3, alpha=.5) +
geom_crossbar(stat="summary", fun.y=mean, fun.ymax=mean, fun.ymin=mean) +
theme_classic()
(a<-summary(aov(sit4 ~ grp, data = anova_gce)))
## Df Sum Sq Mean Sq F value Pr(>F)
## grp 3 0.0 0.000 0 1
## Residuals 16 111.2 6.947
No between-groups variance (0) and high within-group variance (111.2)
anova_gce %>% ggplot(aes(x=grp, y=sit1, col=grp, grp=as.factor(grp))) +
geom_jitter(height = 0, width = .1, size=3, alpha=.5) +
geom_crossbar(stat="summary", fun.y=mean, fun.ymax=mean, fun.ymin=mean) +
theme_classic()
(a<-summary(aov(sit1 ~ grp, data = anova_gce)))
## Df Sum Sq Mean Sq F value Pr(>F)
## grp 3 162.12 54.04 47.81 3.28e-08 ***
## Residuals 16 18.08 1.13
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
a[[1]]$'Sum Sq'[1]/(a[[1]]$'Sum Sq'[1]+a[[1]]$'Sum Sq'[2])
## [1] 0.8996485
More between-groups variance (162.12) than within-group variance (18.08)
R-squared = 0.8996485
LM <- lm(sit1 ~ grp, data = anova_gce)
anova(LM)
## Analysis of Variance Table
##
## Response: sit1
## Df Sum Sq Mean Sq F value Pr(>F)
## grp 3 162.117 54.039 47.813 3.277e-08 ***
## Residuals 16 18.083 1.130
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(LM)
##
## Call:
## lm(formula = sit1 ~ grp, data = anova_gce)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.40000 -0.76250 -0.06667 0.66667 1.66667
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.8000 0.4754 29.026 2.88e-15 ***
## grpb 5.5333 0.6437 8.596 2.16e-07 ***
## grpc -2.0500 0.7132 -2.875 0.01101 *
## grpd 2.6000 0.6724 3.867 0.00137 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.063 on 16 degrees of freedom
## Multiple R-squared: 0.8996, Adjusted R-squared: 0.8808
## F-statistic: 47.81 on 3 and 16 DF, p-value: 3.277e-08
R-squared = 0.8996
anova_gce$v2 <- rbinom(n=20, size=1, prob=0.5)
(full_model<-summary(aov(lm(sit1 ~ grp + v2, data = anova_gce))))
## Df Sum Sq Mean Sq F value Pr(>F)
## grp 3 162.12 54.04 66.241 6.98e-09 ***
## v2 1 5.85 5.85 7.167 0.0172 *
## Residuals 15 12.24 0.82
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(reduced_model<-summary(aov(lm(sit1 ~ grp, data = anova_gce))))
## Df Sum Sq Mean Sq F value Pr(>F)
## grp 3 162.12 54.04 47.81 3.28e-08 ***
## Residuals 16 18.08 1.13
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
SSEp <- reduced_model[[1]]$'Sum Sq'[2]
SSEk <- full_model[[1]]$'Sum Sq'[3]
MSEk <- full_model[[1]]$'Mean Sq'[3]
k <- length(full_model[[1]]$'Sum Sq')-1
p <- length(reduced_model[[1]]$'Sum Sq')-1
nkp <- full_model[[1]]$'Df'[3]
Fstat <- ((SSEp-SSEk)/(k-p))/MSEk
(PrF <- 1-pf(Fstat, df1=k-p, df2=nkp))
## [1] 0.01723378
lm.full <- lm(sit1 ~ grp + v2, data = anova_gce)
lm.red <- lm(sit1 ~ grp, data = anova_gce)
anova(lm.full,lm.red)
## Analysis of Variance Table
##
## Model 1: sit1 ~ grp + v2
## Model 2: sit1 ~ grp
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 15 12.237
## 2 16 18.083 -1 -5.8464 7.1665 0.01723 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
head(log_dt)
## # A tibble: 6 x 10
## infarto tabaco alcohol `_est_m0` `_est_m1` `_est_mod1` `_est_mod2` p
## <dbl+l> <dbl+l> <dbl+l> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 [Yes] 1 [Yes] 1 [Yes] 1 1 1 1 0.822
## 2 1 [Yes] 1 [Yes] 1 [Yes] 1 1 1 1 0.822
## 3 1 [Yes] 1 [Yes] 1 [Yes] 1 1 1 1 0.822
## 4 1 [Yes] 1 [Yes] 1 [Yes] 1 1 1 1 0.822
## 5 1 [Yes] 1 [Yes] 1 [Yes] 1 1 1 1 0.822
## 6 1 [Yes] 1 [Yes] 1 [Yes] 1 1 1 1 0.822
## # … with 2 more variables: resid_std <dbl>, age <dbl>
log_red <- glm(infarto ~ alcohol, family=binomial(link="logit") ,data=log_dt)
summary(log_red)
##
## Call:
## glm(formula = infarto ~ alcohol, family = binomial(link = "logit"),
## data = log_dt)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2049 -1.2049 -0.8203 1.1501 1.5829
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.9163 0.2092 -4.381 1.18e-05 ***
## alcohol 0.9808 0.2555 3.839 0.000124 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 407.18 on 297 degrees of freedom
## Residual deviance: 391.67 on 296 degrees of freedom
## AIC: 395.67
##
## Number of Fisher Scoring iterations: 4
logistic.display(log_red)
##
## Logistic regression predicting infarto
##
## OR(95%CI) P(Wald's test) P(LR-test)
## alcohol: 1 vs 0 2.67 (1.62,4.4) < 0.001 < 0.001
##
## Log-likelihood = -195.8348
## No. of observations = 298
## AIC value = 395.6696
(LLred<-as.numeric(logLik(log_red)))
## [1] -195.8348
log_full <- glm(infarto ~ alcohol + tabaco, family=binomial(link="logit") ,data=log_dt)
summary(log_full)
##
## Call:
## glm(formula = infarto ~ alcohol + tabaco, family = binomial(link = "logit"),
## data = log_dt)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8592 -0.7828 -0.6960 0.6253 1.7530
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.2943 0.2333 -5.548 2.90e-08 ***
## alcohol 0.2684 0.3011 0.891 0.373
## tabaco 2.5587 0.3215 7.959 1.73e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 407.18 on 297 degrees of freedom
## Residual deviance: 313.12 on 295 degrees of freedom
## AIC: 319.12
##
## Number of Fisher Scoring iterations: 4
logistic.display(log_full)
##
## Logistic regression predicting infarto
##
## crude OR(95%CI) adj. OR(95%CI) P(Wald's test)
## alcohol: 1 vs 0 2.67 (1.62,4.4) 1.31 (0.72,2.36) 0.373
##
## tabaco: 1 vs 0 14.07 (7.68,25.79) 12.92 (6.88,24.26) < 0.001
##
## P(LR-test)
## alcohol: 1 vs 0 0.373
##
## tabaco: 1 vs 0 < 0.001
##
## Log-likelihood = -156.5576
## No. of observations = 298
## AIC value = 319.1152
(LLfull<-as.numeric(logLik(log_full)))
## [1] -156.5576
(LRtest <- 2*(-156.5576-(-195.83478)))
## [1] 78.55436
(LRtest <- 2*(LLfull-(LLred)))
## [1] 78.55439
lrtest (log_full, log_red)
## Likelihood ratio test for MLE method
## Chi-squared 1 d.f. = 78.55439 , P value = 7.782556e-19
library(plotROC); library(pROC)
log_dt$p1 <- predict(log_red, newdata=log_dt, type="response")
log_dt %>% ggplot(aes(d = infarto, m = p1)) + geom_roc() +
geom_abline(intercept=0, slope=1, colour="red", linetype="dashed") +
theme_bw()
auc(log_dt$infarto, log_dt$p1)
## Area under the curve: 0.6103
log_age <- glm(infarto ~ alcohol + tabaco + age, family=binomial(link="logit") ,data=log_dt)
log_dt$p2 <- predict(log_age, newdata=log_dt, type="response")
log_dt %>% ggplot(aes(d = infarto, m = p2)) + geom_roc() +
geom_abline(intercept=0, slope=1, colour="red", linetype="dashed") +
theme_bw()
auc(log_dt$infarto, log_dt$p2)
## Area under the curve: 0.9777
log_dt$p1 <- predict(log_red, newdata=log_dt, type="response")
log_dt$p2 <- predict(log_age, newdata=log_dt, type="response")
log_dt$resid_std <- rstudent(log_age)
shapiro.test(log_dt$resid_std)
##
## Shapiro-Wilk normality test
##
## data: log_dt$resid_std
## W = 0.79336, p-value < 2.2e-16
log_dt %>% ggplot(aes(sample=resid_std)) + stat_qq() + stat_qq_line(col="red", linetype="dashed") + theme_classic()
# experimental
log_dt$influy2 <- dfbetas(log_age)[,4]*100
log_dt %>% filter(influy2>1)
## # A tibble: 167 x 13
## infarto tabaco alcohol `_est_m0` `_est_m1` `_est_mod1` `_est_mod2`
## <dbl+l> <dbl+l> <dbl+l> <dbl> <dbl> <dbl> <dbl>
## 1 1 [Yes] 1 [Yes] 1 [Yes] 1 1 1 1
## 2 1 [Yes] 1 [Yes] 1 [Yes] 1 1 1 1
## 3 1 [Yes] 1 [Yes] 1 [Yes] 1 1 1 1
## 4 1 [Yes] 1 [Yes] 1 [Yes] 1 1 1 1
## 5 1 [Yes] 1 [Yes] 1 [Yes] 1 1 1 1
## 6 1 [Yes] 1 [Yes] 1 [Yes] 1 1 1 1
## 7 1 [Yes] 1 [Yes] 1 [Yes] 1 1 1 1
## 8 1 [Yes] 1 [Yes] 1 [Yes] 1 1 1 1
## 9 1 [Yes] 1 [Yes] 1 [Yes] 1 1 1 1
## 10 1 [Yes] 1 [Yes] 1 [Yes] 1 1 1 1
## # … with 157 more rows, and 6 more variables: p <dbl>,
## # resid_std <dbl+lbl>, age <dbl>, p1 <dbl>, p2 <dbl>, influy2 <dbl>
log_dt %>% ggplot(aes(x=p2, y=influy2)) + geom_point()