Inquiries regarding this code contact gabriel.carrasco@upch.pe

rm(list=ls())
library(tidyverse)

Residual Analysis

head(dat)
##   SEX RESP1 PERFST CS2 SURVSTAT  AGE HISTDX1 CLNSTG2 SURVTIME TRTGRP
## 1   0     2    100   1        3 49.2       1       0  8.45722      0
## 2   1     1    100   1        3 12.3       1       0  1.01848      1
## 3   1     2     30   1        3 62.4       1       0  0.70363      1
## 4   0     1    100   1        1 43.7       1       1  8.30116      1
## 5   1     1     70   1        3 49.8       1       1  3.27447      1
## 6   0     2     80   1        3 69.2       1       0  3.74538      1
##   MAXSIZE RELSTAT      RFS K67 PSGRP TMRSIZE
## 1       2       3 86.68639  13     1       0
## 2      10       3 10.55227  33     1       1
## 3       2       3  5.45694  76     0       0
## 4      15       3 15.87771  51     1       1
## 5       2       3  8.67850  54     0       0
## 6       1       3 10.38790  34     1       0
full_model<-lm(K67 ~ SEX + AGE + CS2 + CLNSTG2 + PSGRP + TMRSIZE, data = dat,na.action=na.exclude)
null_model<-lm(K67 ~ 1, data = full_model$model,na.action=na.exclude)
anova(null_model,full_model) # Anova of entire model SS(model)=3690, df(Model)=6, MSR=SS/df=615
## Analysis of Variance Table
## 
## Model 1: K67 ~ 1
## Model 2: K67 ~ SEX + AGE + CS2 + CLNSTG2 + PSGRP + TMRSIZE
##   Res.Df   RSS Df Sum of Sq      F Pr(>F)
## 1    102 43060                           
## 2     96 39370  6      3690 1.4996 0.1865
summary(aov(full_model)) # Anova of each term, SS(error)=39370, df(error)=96, MSE=410.1
##             Df Sum Sq Mean Sq F value Pr(>F)  
## SEX          1   1774  1774.4   4.327 0.0402 *
## AGE          1      0     0.5   0.001 0.9723  
## CS2          1   1248  1248.4   3.044 0.0842 .
## CLNSTG2      1      1     0.6   0.001 0.9699  
## PSGRP        1    415   415.5   1.013 0.3167  
## TMRSIZE      1    251   250.6   0.611 0.4363  
## Residuals   96  39370   410.1                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 2 observations deleted due to missingness
(Fs<-615/410.1) # F=MSR/MSE
## [1] 1.499634
(PrF <- 1-pf(Fs, df1=6, df2=96)) #df1=df(Model), df2=df(error)
## [1] 0.1864902
summary(full_model)
## 
## Call:
## lm(formula = K67 ~ SEX + AGE + CS2 + CLNSTG2 + PSGRP + TMRSIZE, 
##     data = dat, na.action = na.exclude)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -36.289 -14.812  -0.961  12.851  50.133 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 41.71803    7.16908   5.819 7.79e-08 ***
## SEX          8.72413    4.17346   2.090   0.0392 *  
## AGE         -0.02403    0.10264  -0.234   0.8154    
## CS2         -6.06455    4.53965  -1.336   0.1847    
## CLNSTG2      1.15575    5.07790   0.228   0.8204    
## PSGRP       -4.19745    4.21521  -0.996   0.3219    
## TMRSIZE      3.87147    4.95254   0.782   0.4363    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.25 on 96 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.0857, Adjusted R-squared:  0.02855 
## F-statistic:   1.5 on 6 and 96 DF,  p-value: 0.1865
inspect <- dat %>% mutate(predicted=predict(full_model),
                          ord_res=residuals(full_model),
                          stu_res=rstandard(full_model),
                          jkn_res=rstudent(full_model)) %>%
  dplyr::select(K67,predicted,ord_res,stu_res,jkn_res)

Checking normality assumption

inspect %>% ggplot(aes(ord_res)) +
  geom_histogram(aes(y = ..density..), bins = 8, fill="white", col="black") + 
  stat_function(fun = dnorm, col="red", linetype="dashed", args = list(mean = mean(inspect$ord_res,na.rm = T), sd = sd(inspect$ord_res,na.rm = T))) +
  theme_classic()

inspect %>%  ggplot(aes(sample = ord_res)) + stat_qq() + stat_qq_line(col="red", linetype="dashed") + theme_classic()

Checking linearity, constant variance assumptions

inspect %>% ggplot(aes(predicted,ord_res)) +
  geom_point(size=2) +
  geom_hline(yintercept = 0, col="red", linetype="dashed") +
  theme_classic()

inspect %>% ggplot(aes(predicted,stu_res)) +
  geom_point(size=2) +
  geom_hline(yintercept = 0, col="red", linetype="dashed") +
  theme_classic()

# pattern scattered around 0; no obvious departures from model assumptions. No Studentized residuals are <-3 or >+3

inspect %>% ggplot(aes(predicted,jkn_res)) +
  geom_point(size=2) +
  geom_hline(yintercept = 0, col="red", linetype="dashed") +
  theme_classic()

# No extreme observations noted