Inquiries regarding this code contact gabriel.carrasco@upch.pe

rm(list=ls())
library(tidyverse)

Diagnostic stats for detecting outliers:

Residuals (standardized, studentized, jackknife)
Leverages
Cook’s distances

head(dat)

##   SEX RESP1 PERFST CS2 SURVSTAT  AGE HISTDX1 CLNSTG2 SURVTIME TRTGRP
## 1   0     2    100   1        3 49.2       1       0  8.45722      0
## 2   1     1    100   1        3 12.3       1       0  1.01848      1
## 3   1     2     30   1        3 62.4       1       0  0.70363      1
## 4   0     1    100   1        1 43.7       1       1  8.30116      1
## 5   1     1     70   1        3 49.8       1       1  3.27447      1
## 6   0     2     80   1        3 69.2       1       0  3.74538      1
##   MAXSIZE RELSTAT      RFS K67 PSGRP TMRSIZE
## 1       2       3 86.68639  13     1       0
## 2      10       3 10.55227  33     1       1
## 3       2       3  5.45694  76     0       0
## 4      15       3 15.87771  51     1       1
## 5       2       3  8.67850  54     0       0
## 6       1       3 10.38790  34     1       0

full_model<-lm(K67 ~ SEX + AGE + CS2 + CLNSTG2 + PSGRP + TMRSIZE, data = dat,na.action=na.exclude)
null_model<-lm(K67 ~ 1, data = full_model$model,na.action=na.exclude)
anova(null_model,full_model)

## Analysis of Variance Table
## 
## Model 1: K67 ~ 1
## Model 2: K67 ~ SEX + AGE + CS2 + CLNSTG2 + PSGRP + TMRSIZE
##   Res.Df   RSS Df Sum of Sq      F Pr(>F)
## 1    102 43060                           
## 2     96 39370  6      3690 1.4996 0.1865

summary(aov(full_model))

##             Df Sum Sq Mean Sq F value Pr(>F)  
## SEX          1   1774  1774.4   4.327 0.0402 *
## AGE          1      0     0.5   0.001 0.9723  
## CS2          1   1248  1248.4   3.044 0.0842 .
## CLNSTG2      1      1     0.6   0.001 0.9699  
## PSGRP        1    415   415.5   1.013 0.3167  
## TMRSIZE      1    251   250.6   0.611 0.4363  
## Residuals   96  39370   410.1                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 2 observations deleted due to missingness

summary(full_model)

## 
## Call:
## lm(formula = K67 ~ SEX + AGE + CS2 + CLNSTG2 + PSGRP + TMRSIZE, 
##     data = dat, na.action = na.exclude)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -36.289 -14.812  -0.961  12.851  50.133 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 41.71803    7.16908   5.819 7.79e-08 ***
## SEX          8.72413    4.17346   2.090   0.0392 *  
## AGE         -0.02403    0.10264  -0.234   0.8154    
## CS2         -6.06455    4.53965  -1.336   0.1847    
## CLNSTG2      1.15575    5.07790   0.228   0.8204    
## PSGRP       -4.19745    4.21521  -0.996   0.3219    
## TMRSIZE      3.87147    4.95254   0.782   0.4363    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.25 on 96 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.0857, Adjusted R-squared:  0.02855 
## F-statistic:   1.5 on 6 and 96 DF,  p-value: 0.1865

inspect <- dat %>% mutate(predicted=predict(full_model),
                          ord_res=residuals(full_model),
                          stu_res=rstandard(full_model),
                          jkn_res=rstudent(full_model),
                          cookD=cooks.distance(full_model),
                          leverage=hatvalues(full_model)) %>%
  dplyr::select(K67,predicted,ord_res,stu_res,jkn_res,cookD,leverage)

RESIDUALS

inspect %>% ggplot(aes(predicted,jkn_res)) +
  geom_point(size=2) +
  geom_hline(yintercept = c(-3,0,3), col="red", linetype="dashed") +
  theme_classic()

# No extreme observations noted (< −3 or > +3)

LEVERAGES

# Note: Using a Bonferroni correction (alpha/n) -> beacuse we are comparing all observations (k groups=n individuals)
# Calculating c h for SAS output, pp 2-23: α = .05, n = 103, k = 6
a <- 0.05
n <- 103 #// n<-nrow(full_model$model)
k <- 6 #// k<-ncol(full_model$model)-1
Fstat <- qf(1-(a/n),k, n-k-1) 
L <- (k/(n-k-1))*Fstat
(hc1 <- (L+(1/n))/(1+L))

## [1] 0.2262855

# Rule-of-thumb #2
# h-hat=k+1/n (always true)
# Suggests flagging as suspicious any value of hi > 2(k+1)/n

(hc2 <- 2*(k+1)/n)

## [1] 0.1359223

inspect %>% ggplot(aes(predicted,leverage)) +
  geom_point(size=2) +
  geom_hline(yintercept = c(hc1,hc2), col=c("blue","red"), linetype="dashed") +
  geom_text(aes(label=ifelse(leverage>hc2,paste0("obs= ",rownames(inspect)),"")), col="red",hjust = 0, vjust=-.5) +
  theme_classic()

# No extremely large values noted (hi>hc)

inspect %>% ggplot(aes(rownames(inspect),leverage)) +
  geom_point(size=2) +
  geom_hline(yintercept = c(hc1,hc2), col=c("blue","red"), linetype="dashed") +
  geom_text(aes(label=ifelse(leverage>hc2,paste0("obs= ",rownames(inspect)),"")), col="red",hjust = 0, vjust=-.5) +
  theme_classic()

# No extremely large values noted (hi>hc)

Cook’s distances (Cook’s D)

# Rule-of-thumb #2
# Critical values dc given for α, n, k. 
# Actually, table lists dc * (n-k-1), so divide number provided in table by (n−k−1) to get dc.
# Example: α=0.05, n=200, k=10
a <-0.05
n<-200
k<-10
dc <- qf(1-a/n,k,n-k-1)  ## PENDING ##

inspect %>% ggplot(aes(predicted,cookD)) +
  geom_point(size=2) +
  geom_hline(yintercept = 1, col="red", linetype="dashed") +
  geom_text(aes(label=ifelse(leverage>1,paste0("obs= ",rownames(inspect)),"")), col="red",hjust = 0, vjust=-.5) +
  theme_classic()

# No extremely large values noted (di>1 or di>dc)

Routine for PH 628

Lecture 3: Outliers

Gabriel Carrasco Escobar

Diagnostic stats for detecting outliers:

RESIDUALS

LEVERAGES

Cook’s distances (Cook’s D)