Inquiries regarding this code contact gabriel.carrasco@upch.pe

rm(list=ls())
library(tidyverse); library(epiDisplay); library(ResourceSelection); library(regclass); library(mctest)

Hosmer-Lemeshow goodness-of-fit test

head(dat)

##   SEX RESP1 PERFST CS2 SURVSTAT  AGE HISTDX1 CLNSTG2 SURVTIME TRTGRP
## 1   0     0    100   1        3 49.2       1       0  8.45722      0
## 2   1     1    100   1        3 12.3       1       0  1.01848      1
## 3   1     0     30   1        3 62.4       1       0  0.70363      1
## 4   0     1    100   1        1 43.7       1       1  8.30116      1
## 5   1     1     70   1        3 49.8       1       1  3.27447      1
## 6   0     0     80   1        3 69.2       1       0  3.74538      1
##   MAXSIZE RELSTAT      RFS K67 PSGRP TMRSIZE
## 1       2       3 86.68639  13     1       0
## 2      10       3 10.55227  33     1       1
## 3       2       3  5.45694  76     0       0
## 4      15       3 15.87771  51     1       1
## 5       2       3  8.67850  54     0       0
## 6       1       3 10.38790  34     1       0

log_cont <- glm(RESP1 ~ CLNSTG2 + AGE + K67, family=binomial(link="logit") ,data=dat)
summary(log_cont)

## 
## Call:
## glm(formula = RESP1 ~ CLNSTG2 + AGE + K67, family = binomial(link = "logit"), 
##     data = dat)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2325  -1.0601   0.4277   1.0692   1.4832  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.081265   0.797810   1.355 0.175325    
## CLNSTG2      2.357205   0.658713   3.578 0.000346 ***
## AGE         -0.015124   0.011542  -1.310 0.190101    
## K67         -0.009274   0.011474  -0.808 0.418935    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 136.62  on 101  degrees of freedom
## Residual deviance: 115.58  on  98  degrees of freedom
## AIC: 123.58
## 
## Number of Fisher Scoring iterations: 4

logistic.display(log_cont)

## 
## Logistic regression predicting RESP1 
##  
##                  crude OR(95%CI)         adj. OR(95%CI)         
## CLNSTG2: 1 vs 0  10.16 (2.83,36.47)      10.56 (2.9,38.41)      
##                                                                 
## AGE (cont. var.) 0.99 (0.97,1.01)        0.98 (0.96,1.01)       
##                                                                 
## K67 (cont. var.) 0.9921 (0.9728,1.0119)  0.9908 (0.9687,1.0133) 
##                                                                 
##                  P(Wald's test) P(LR-test)
## CLNSTG2: 1 vs 0  < 0.001        < 0.001   
##                                           
## AGE (cont. var.) 0.19           0.185     
##                                           
## K67 (cont. var.) 0.419          0.417     
##                                           
## Log-likelihood = -57.7908
## No. of observations = 102
## AIC value = 123.5816

(h_cont <- hoslem.test(log_cont$model$RESP1, fitted(log_cont), g=10))

## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  log_cont$model$RESP1, fitted(log_cont)
## X-squared = 5.3577, df = 8, p-value = 0.7187

# H-L pvalue>0.1 -> "Good fit"
cbind(h_cont$observed,h_cont$expected)

##               y0 y1     yhat0     yhat1
## [0.332,0.393]  6  5 7.0816143  3.918386
## (0.393,0.421]  6  4 5.8899039  4.110096
## (0.421,0.442]  6  4 5.6891314  4.310869
## (0.442,0.504]  7  3 5.2885876  4.711412
## (0.504,0.54]   5  5 4.8439400  5.156060
## (0.54,0.585]   3  7 4.4058707  5.594129
## (0.585,0.774]  4  6 3.8009522  6.199048
## (0.774,0.899]  0 10 1.3013618  8.698638
## (0.899,0.916]  2  8 0.9584768  9.041523
## (0.916,0.951]  1 10 0.7401617 10.259838

Adaptation H-L goodness-of-fit test for small # of covariate patterns

log_bin <- glm(RESP1 ~ CLNSTG2 + PSGRP, family=binomial(link="logit") ,data=dat)
summary(log_bin)

## 
## Call:
## glm(formula = RESP1 ~ CLNSTG2 + PSGRP, family = binomial(link = "logit"), 
##     data = dat)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2075  -1.0696   0.4279   1.1241   1.2892  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.2590     0.3141  -0.825 0.409515    
## CLNSTG2       2.2184     0.6613   3.355 0.000794 ***
## PSGRP         0.3857     0.4499   0.857 0.391314    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 136.62  on 101  degrees of freedom
## Residual deviance: 117.28  on  99  degrees of freedom
## AIC: 123.28
## 
## Number of Fisher Scoring iterations: 4

logistic.display(log_bin)

## 
## Logistic regression predicting RESP1 
##  
##                  crude OR(95%CI)     adj. OR(95%CI)    P(Wald's test)
## CLNSTG2: 1 vs 0  10.16 (2.83,36.47)  9.19 (2.52,33.6)  < 0.001       
##                                                                      
## PSGRP: 1 vs 0    2.14 (0.95,4.81)    1.47 (0.61,3.55)  0.391         
##                                                                      
##                  P(LR-test)
## CLNSTG2: 1 vs 0  < 0.001   
##                            
## PSGRP: 1 vs 0    0.391     
##                            
## Log-likelihood = -58.6389
## No. of observations = 102
## AIC value = 123.2778

(h_bin <- hoslem.test(log_bin$model$RESP1, fitted(log_bin), g=4)) # g=4 -> number of covarites patters (2x2)

## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  log_bin$model$RESP1, fitted(log_bin)
## X-squared = 1.6597, df = 2, p-value = 0.4361

# H-L pvalue>0.1 -> "Good fit"
cbind(h_bin$observed,h_bin$expected)

##               y0 y1      yhat0     yhat1
## [0.436,0.532] 37 34 37.0000000 34.000000
## (0.532,0.876]  0  8  0.9883233  7.011677
## (0.876,0.913]  3 20  2.0116768 20.988323

Regression diagnostics: residuals

inspect_cont <- dat %>% mutate(pearson=residuals(log_cont, "pearson"),
                               deviance=residuals(log_cont, "deviance"),
                               leverage=hatvalues(log_cont)) %>%
  dplyr::select(RESP1,CLNSTG2, AGE, K67, pearson, deviance, leverage)

inspect_cont %>% ggplot(aes(rownames(inspect_cont),pearson)) +
  geom_point(size=2) +
  geom_hline(yintercept = c(0,-2,2), col=c("blue","red","red"), linetype="dashed") +
  geom_text(aes(label=ifelse(pearson<(-2) | pearson>(2),rownames(inspect_cont),"")), col="red",hjust = 0, vjust=-.5) +
  theme_classic()

# pattern scattered around 0; no obious departures from model assumptions. Except obs 42, 94, 95.

inspect_cont %>% ggplot(aes(rownames(inspect_cont),deviance)) +
  geom_point(size=2) +
  geom_hline(yintercept = c(0,-2,2), col=c("blue","red","red"), linetype="dashed") +
  geom_text(aes(label=ifelse(deviance<(-2) | deviance>(2),rownames(inspect_cont),"")), col="red",hjust = 0, vjust=-.5) +
  theme_classic()

# pattern scattered around 0; no obious departures from model assumptions. Except obs 42, 94, 95.

# Inspect suspect values 42, 94, 95.
inspect_cont[c(42,94,95),]

##    RESP1 CLNSTG2  AGE K67   pearson  deviance   leverage
## 42     0       1 41.0  66 -3.013660 -2.149775 0.04376740
## 94     0       1 45.6  37 -3.329580 -2.232513 0.02988009
## 95     0       1 73.2  15 -2.992643 -2.143906 0.04474055

Residual outliers: Case 42 (p 35) and cases 94 and 95 (p 37). Case 42 did not have complete response (resid<0), but was in stage 1 or 2, which predicts complete response. Likewise with cases 94 and 95. This discrepancy between observed and predicted responses made these patients outliers.

Leverage (called hat matrix diagonal in SAS)

n<-nrow(log_cont$model) # Num of obs
k<-ncol(log_cont$model)-1 # Num of covariates
(hc <- 2*(k+1)/n)

## [1] 0.07843137

inspect_cont %>% ggplot(aes(rownames(inspect_cont),leverage)) +
  geom_point(size=2) +
  geom_hline(yintercept = hc, col="red", linetype="dashed") +
  geom_text(aes(label=ifelse(leverage>hc,rownames(inspect_cont),"")), col="red",hjust = 0, vjust=-.5) +
  theme_classic()

# Extremely large values noted (hi>hc) -> obs 20, 26.

# Inspect suspect values 20, 26.
inspect_cont[c(20,26),]

##    RESP1 CLNSTG2  AGE K67   pearson  deviance   leverage
## 20     0       0 21.8  77 -1.018907 -1.193360 0.09420449
## 26     0       0 38.1  89 -0.851998 -1.044748 0.10138057

Hat matrix (leverage) outliers: Cases 20 and 26 (p 34). Case 20 was in stage 3 or 4, was 21.8 years old, and had Ki-67 = 77%. Case 26 was in stage 3 or 4, was 38.1 years old, and had Ki-67 = 89%. This combination of young age and aggressive disease appears to be an unusual profile, making these patients outliers. (Note: Observed or predicted response is irrelevant when looking at hat matrix outliers.) Calculation of hat matrix diagonal (leverages) does not depend on outcome.

Collinearity

summary(log_cont)

## 
## Call:
## glm(formula = RESP1 ~ CLNSTG2 + AGE + K67, family = binomial(link = "logit"), 
##     data = dat)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2325  -1.0601   0.4277   1.0692   1.4832  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.081265   0.797810   1.355 0.175325    
## CLNSTG2      2.357205   0.658713   3.578 0.000346 ***
## AGE         -0.015124   0.011542  -1.310 0.190101    
## K67         -0.009274   0.011474  -0.808 0.418935    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 136.62  on 101  degrees of freedom
## Residual deviance: 115.58  on  98  degrees of freedom
## AIC: 123.58
## 
## Number of Fisher Scoring iterations: 4

VIF(log_cont)

##  CLNSTG2      AGE      K67 
## 1.007211 1.005875 1.001354

omcdiag(log_cont$model,log_cont$model$RESP1)

## 
## Call:
## omcdiag(x = log_cont$model, y = log_cont$model$RESP1)
## 
## 
## Overall Multicollinearity Diagnostics
## 
##                        MC Results detection
## Determinant |X'X|:         0.8190         0
## Farrar Chi-Square:        19.5152         1
## Red Indicator:             0.1741         0
## Sum of Lambda Inverse:     4.4406         0
## Theil's Method:           -2.6307         0
## Condition Number:          9.2750         0
## 
## 1 --> COLLINEARITY is detected by the test 
## 0 --> COLLINEARITY is not detected by the test

imcdiag(log_cont$model,log_cont$model$RESP1)

## 
## Call:
## imcdiag(x = log_cont$model, y = log_cont$model$RESP1)
## 
## 
## All Individual Multicollinearity Diagnostics Result
## 
##            VIF    TOL     Wi      Fi Leamer CVIF Klein
## RESP1   1.2194 0.8201 7.1664 10.8593 0.9056    0     0
## CLNSTG2 1.1935 0.8379 6.3206  9.5777 0.9154    0     0
## AGE     1.0192 0.9812 0.6264  0.9492 0.9905    0     0
## K67     1.0085 0.9916 0.2782  0.4215 0.9958    0     0
## 
## 1 --> COLLINEARITY is detected by the test 
## 0 --> COLLINEARITY is not detected by the test
## 
## CLNSTG2 , AGE , K67 , coefficient(s) are non-significant may be due to multicollinearity
## 
## R-square of y on all x: 1 
## 
## * use method argument to check which regressors may be the reason of collinearity
## ===================================

Routine for PH 628

Lecture 5: Goodness-of-Fit and Regression Diagnostics for Logistic Regression

Gabriel Carrasco Escobar

Hosmer-Lemeshow goodness-of-fit test

Adaptation H-L goodness-of-fit test for small # of covariate patterns

Regression diagnostics: residuals

Leverage (called hat matrix diagonal in SAS)

Collinearity