Inquiries regarding this code contact gabriel.carrasco@upch.pe
rm(list=ls())
library(tidyverse)
head(dat)
## SEX RESP1 PERFST CS2 SURVSTAT AGE HISTDX1 CLNSTG2 SURVTIME TRTGRP
## 1 0 2 100 1 3 49.2 1 0 8.45722 0
## 2 1 1 100 1 3 12.3 1 0 1.01848 1
## 3 1 2 30 1 3 62.4 1 0 0.70363 1
## 4 0 1 100 1 1 43.7 1 1 8.30116 1
## 5 1 1 70 1 3 49.8 1 1 3.27447 1
## 6 0 2 80 1 3 69.2 1 0 3.74538 1
## MAXSIZE RELSTAT RFS K67 PSGRP TMRSIZE
## 1 2 3 86.68639 13 1 0
## 2 10 3 10.55227 33 1 1
## 3 2 3 5.45694 76 0 0
## 4 15 3 15.87771 51 1 1
## 5 2 3 8.67850 54 0 0
## 6 1 3 10.38790 34 1 0
full_model<-lm(K67 ~ SEX + AGE + CS2 + CLNSTG2 + PSGRP + TMRSIZE, data = dat,na.action=na.exclude)
null_model<-lm(K67 ~ 1, data = full_model$model,na.action=na.exclude)
anova(null_model,full_model)
## Analysis of Variance Table
##
## Model 1: K67 ~ 1
## Model 2: K67 ~ SEX + AGE + CS2 + CLNSTG2 + PSGRP + TMRSIZE
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 102 43060
## 2 96 39370 6 3690 1.4996 0.1865
summary(aov(full_model))
## Df Sum Sq Mean Sq F value Pr(>F)
## SEX 1 1774 1774.4 4.327 0.0402 *
## AGE 1 0 0.5 0.001 0.9723
## CS2 1 1248 1248.4 3.044 0.0842 .
## CLNSTG2 1 1 0.6 0.001 0.9699
## PSGRP 1 415 415.5 1.013 0.3167
## TMRSIZE 1 251 250.6 0.611 0.4363
## Residuals 96 39370 410.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 2 observations deleted due to missingness
summary(full_model)
##
## Call:
## lm(formula = K67 ~ SEX + AGE + CS2 + CLNSTG2 + PSGRP + TMRSIZE,
## data = dat, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -36.289 -14.812 -0.961 12.851 50.133
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.71803 7.16908 5.819 7.79e-08 ***
## SEX 8.72413 4.17346 2.090 0.0392 *
## AGE -0.02403 0.10264 -0.234 0.8154
## CS2 -6.06455 4.53965 -1.336 0.1847
## CLNSTG2 1.15575 5.07790 0.228 0.8204
## PSGRP -4.19745 4.21521 -0.996 0.3219
## TMRSIZE 3.87147 4.95254 0.782 0.4363
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.25 on 96 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.0857, Adjusted R-squared: 0.02855
## F-statistic: 1.5 on 6 and 96 DF, p-value: 0.1865
inspect <- dat %>% mutate(predicted=predict(full_model),
ord_res=residuals(full_model),
stu_res=rstandard(full_model),
jkn_res=rstudent(full_model),
cookD=cooks.distance(full_model),
leverage=hatvalues(full_model)) %>%
dplyr::select(K67,predicted,ord_res,stu_res,jkn_res,cookD,leverage)
inspect %>% ggplot(aes(predicted,jkn_res)) +
geom_point(size=2) +
geom_hline(yintercept = c(-3,0,3), col="red", linetype="dashed") +
theme_classic()
# No extreme observations noted (< −3 or > +3)
# Note: Using a Bonferroni correction (alpha/n) -> beacuse we are comparing all observations (k groups=n individuals)
# Calculating c h for SAS output, pp 2-23: α = .05, n = 103, k = 6
a <- 0.05
n <- 103 #// n<-nrow(full_model$model)
k <- 6 #// k<-ncol(full_model$model)-1
Fstat <- qf(1-(a/n),k, n-k-1)
L <- (k/(n-k-1))*Fstat
(hc1 <- (L+(1/n))/(1+L))
## [1] 0.2262855
# Rule-of-thumb #2
# h-hat=k+1/n (always true)
# Suggests flagging as suspicious any value of hi > 2(k+1)/n
(hc2 <- 2*(k+1)/n)
## [1] 0.1359223
inspect %>% ggplot(aes(predicted,leverage)) +
geom_point(size=2) +
geom_hline(yintercept = c(hc1,hc2), col=c("blue","red"), linetype="dashed") +
geom_text(aes(label=ifelse(leverage>hc2,paste0("obs= ",rownames(inspect)),"")), col="red",hjust = 0, vjust=-.5) +
theme_classic()
# No extremely large values noted (hi>hc)
inspect %>% ggplot(aes(rownames(inspect),leverage)) +
geom_point(size=2) +
geom_hline(yintercept = c(hc1,hc2), col=c("blue","red"), linetype="dashed") +
geom_text(aes(label=ifelse(leverage>hc2,paste0("obs= ",rownames(inspect)),"")), col="red",hjust = 0, vjust=-.5) +
theme_classic()
# No extremely large values noted (hi>hc)
# Rule-of-thumb #2
# Critical values dc given for α, n, k.
# Actually, table lists dc * (n-k-1), so divide number provided in table by (n−k−1) to get dc.
# Example: α=0.05, n=200, k=10
a <-0.05
n<-200
k<-10
dc <- qf(1-a/n,k,n-k-1) ## PENDING ##
inspect %>% ggplot(aes(predicted,cookD)) +
geom_point(size=2) +
geom_hline(yintercept = 1, col="red", linetype="dashed") +
geom_text(aes(label=ifelse(leverage>1,paste0("obs= ",rownames(inspect)),"")), col="red",hjust = 0, vjust=-.5) +
theme_classic()
# No extremely large values noted (di>1 or di>dc)