Inquiries regarding this code contact gabriel.carrasco@upch.pe
rm(list=ls())
library(tidyverse); library(epiDisplay); library(ResourceSelection); library(regclass); library(mctest)
head(dat)
## SEX RESP1 PERFST CS2 SURVSTAT AGE HISTDX1 CLNSTG2 SURVTIME TRTGRP
## 1 0 0 100 1 3 49.2 1 0 8.45722 0
## 2 1 1 100 1 3 12.3 1 0 1.01848 1
## 3 1 0 30 1 3 62.4 1 0 0.70363 1
## 4 0 1 100 1 1 43.7 1 1 8.30116 1
## 5 1 1 70 1 3 49.8 1 1 3.27447 1
## 6 0 0 80 1 3 69.2 1 0 3.74538 1
## MAXSIZE RELSTAT RFS K67 PSGRP TMRSIZE
## 1 2 3 86.68639 13 1 0
## 2 10 3 10.55227 33 1 1
## 3 2 3 5.45694 76 0 0
## 4 15 3 15.87771 51 1 1
## 5 2 3 8.67850 54 0 0
## 6 1 3 10.38790 34 1 0
log_cont <- glm(RESP1 ~ CLNSTG2 + AGE + K67, family=binomial(link="logit") ,data=dat)
summary(log_cont)
##
## Call:
## glm(formula = RESP1 ~ CLNSTG2 + AGE + K67, family = binomial(link = "logit"),
## data = dat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2325 -1.0601 0.4277 1.0692 1.4832
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.081265 0.797810 1.355 0.175325
## CLNSTG2 2.357205 0.658713 3.578 0.000346 ***
## AGE -0.015124 0.011542 -1.310 0.190101
## K67 -0.009274 0.011474 -0.808 0.418935
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 136.62 on 101 degrees of freedom
## Residual deviance: 115.58 on 98 degrees of freedom
## AIC: 123.58
##
## Number of Fisher Scoring iterations: 4
logistic.display(log_cont)
##
## Logistic regression predicting RESP1
##
## crude OR(95%CI) adj. OR(95%CI)
## CLNSTG2: 1 vs 0 10.16 (2.83,36.47) 10.56 (2.9,38.41)
##
## AGE (cont. var.) 0.99 (0.97,1.01) 0.98 (0.96,1.01)
##
## K67 (cont. var.) 0.9921 (0.9728,1.0119) 0.9908 (0.9687,1.0133)
##
## P(Wald's test) P(LR-test)
## CLNSTG2: 1 vs 0 < 0.001 < 0.001
##
## AGE (cont. var.) 0.19 0.185
##
## K67 (cont. var.) 0.419 0.417
##
## Log-likelihood = -57.7908
## No. of observations = 102
## AIC value = 123.5816
(h_cont <- hoslem.test(log_cont$model$RESP1, fitted(log_cont), g=10))
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: log_cont$model$RESP1, fitted(log_cont)
## X-squared = 5.3577, df = 8, p-value = 0.7187
# H-L pvalue>0.1 -> "Good fit"
cbind(h_cont$observed,h_cont$expected)
## y0 y1 yhat0 yhat1
## [0.332,0.393] 6 5 7.0816143 3.918386
## (0.393,0.421] 6 4 5.8899039 4.110096
## (0.421,0.442] 6 4 5.6891314 4.310869
## (0.442,0.504] 7 3 5.2885876 4.711412
## (0.504,0.54] 5 5 4.8439400 5.156060
## (0.54,0.585] 3 7 4.4058707 5.594129
## (0.585,0.774] 4 6 3.8009522 6.199048
## (0.774,0.899] 0 10 1.3013618 8.698638
## (0.899,0.916] 2 8 0.9584768 9.041523
## (0.916,0.951] 1 10 0.7401617 10.259838
log_bin <- glm(RESP1 ~ CLNSTG2 + PSGRP, family=binomial(link="logit") ,data=dat)
summary(log_bin)
##
## Call:
## glm(formula = RESP1 ~ CLNSTG2 + PSGRP, family = binomial(link = "logit"),
## data = dat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2075 -1.0696 0.4279 1.1241 1.2892
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.2590 0.3141 -0.825 0.409515
## CLNSTG2 2.2184 0.6613 3.355 0.000794 ***
## PSGRP 0.3857 0.4499 0.857 0.391314
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 136.62 on 101 degrees of freedom
## Residual deviance: 117.28 on 99 degrees of freedom
## AIC: 123.28
##
## Number of Fisher Scoring iterations: 4
logistic.display(log_bin)
##
## Logistic regression predicting RESP1
##
## crude OR(95%CI) adj. OR(95%CI) P(Wald's test)
## CLNSTG2: 1 vs 0 10.16 (2.83,36.47) 9.19 (2.52,33.6) < 0.001
##
## PSGRP: 1 vs 0 2.14 (0.95,4.81) 1.47 (0.61,3.55) 0.391
##
## P(LR-test)
## CLNSTG2: 1 vs 0 < 0.001
##
## PSGRP: 1 vs 0 0.391
##
## Log-likelihood = -58.6389
## No. of observations = 102
## AIC value = 123.2778
(h_bin <- hoslem.test(log_bin$model$RESP1, fitted(log_bin), g=4)) # g=4 -> number of covarites patters (2x2)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: log_bin$model$RESP1, fitted(log_bin)
## X-squared = 1.6597, df = 2, p-value = 0.4361
# H-L pvalue>0.1 -> "Good fit"
cbind(h_bin$observed,h_bin$expected)
## y0 y1 yhat0 yhat1
## [0.436,0.532] 37 34 37.0000000 34.000000
## (0.532,0.876] 0 8 0.9883233 7.011677
## (0.876,0.913] 3 20 2.0116768 20.988323
inspect_cont <- dat %>% mutate(pearson=residuals(log_cont, "pearson"),
deviance=residuals(log_cont, "deviance"),
leverage=hatvalues(log_cont)) %>%
dplyr::select(RESP1,CLNSTG2, AGE, K67, pearson, deviance, leverage)
inspect_cont %>% ggplot(aes(rownames(inspect_cont),pearson)) +
geom_point(size=2) +
geom_hline(yintercept = c(0,-2,2), col=c("blue","red","red"), linetype="dashed") +
geom_text(aes(label=ifelse(pearson<(-2) | pearson>(2),rownames(inspect_cont),"")), col="red",hjust = 0, vjust=-.5) +
theme_classic()
# pattern scattered around 0; no obious departures from model assumptions. Except obs 42, 94, 95.
inspect_cont %>% ggplot(aes(rownames(inspect_cont),deviance)) +
geom_point(size=2) +
geom_hline(yintercept = c(0,-2,2), col=c("blue","red","red"), linetype="dashed") +
geom_text(aes(label=ifelse(deviance<(-2) | deviance>(2),rownames(inspect_cont),"")), col="red",hjust = 0, vjust=-.5) +
theme_classic()
# pattern scattered around 0; no obious departures from model assumptions. Except obs 42, 94, 95.
# Inspect suspect values 42, 94, 95.
inspect_cont[c(42,94,95),]
## RESP1 CLNSTG2 AGE K67 pearson deviance leverage
## 42 0 1 41.0 66 -3.013660 -2.149775 0.04376740
## 94 0 1 45.6 37 -3.329580 -2.232513 0.02988009
## 95 0 1 73.2 15 -2.992643 -2.143906 0.04474055
Residual outliers: Case 42 (p 35) and cases 94 and 95 (p 37). Case 42 did not have complete response (resid<0), but was in stage 1 or 2, which predicts complete response. Likewise with cases 94 and 95. This discrepancy between observed and predicted responses made these patients outliers.
n<-nrow(log_cont$model) # Num of obs
k<-ncol(log_cont$model)-1 # Num of covariates
(hc <- 2*(k+1)/n)
## [1] 0.07843137
inspect_cont %>% ggplot(aes(rownames(inspect_cont),leverage)) +
geom_point(size=2) +
geom_hline(yintercept = hc, col="red", linetype="dashed") +
geom_text(aes(label=ifelse(leverage>hc,rownames(inspect_cont),"")), col="red",hjust = 0, vjust=-.5) +
theme_classic()
# Extremely large values noted (hi>hc) -> obs 20, 26.
# Inspect suspect values 20, 26.
inspect_cont[c(20,26),]
## RESP1 CLNSTG2 AGE K67 pearson deviance leverage
## 20 0 0 21.8 77 -1.018907 -1.193360 0.09420449
## 26 0 0 38.1 89 -0.851998 -1.044748 0.10138057
Hat matrix (leverage) outliers: Cases 20 and 26 (p 34). Case 20 was in stage 3 or 4, was 21.8 years old, and had Ki-67 = 77%. Case 26 was in stage 3 or 4, was 38.1 years old, and had Ki-67 = 89%. This combination of young age and aggressive disease appears to be an unusual profile, making these patients outliers. (Note: Observed or predicted response is irrelevant when looking at hat matrix outliers.) Calculation of hat matrix diagonal (leverages) does not depend on outcome.
summary(log_cont)
##
## Call:
## glm(formula = RESP1 ~ CLNSTG2 + AGE + K67, family = binomial(link = "logit"),
## data = dat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2325 -1.0601 0.4277 1.0692 1.4832
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.081265 0.797810 1.355 0.175325
## CLNSTG2 2.357205 0.658713 3.578 0.000346 ***
## AGE -0.015124 0.011542 -1.310 0.190101
## K67 -0.009274 0.011474 -0.808 0.418935
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 136.62 on 101 degrees of freedom
## Residual deviance: 115.58 on 98 degrees of freedom
## AIC: 123.58
##
## Number of Fisher Scoring iterations: 4
VIF(log_cont)
## CLNSTG2 AGE K67
## 1.007211 1.005875 1.001354
omcdiag(log_cont$model,log_cont$model$RESP1)
##
## Call:
## omcdiag(x = log_cont$model, y = log_cont$model$RESP1)
##
##
## Overall Multicollinearity Diagnostics
##
## MC Results detection
## Determinant |X'X|: 0.8190 0
## Farrar Chi-Square: 19.5152 1
## Red Indicator: 0.1741 0
## Sum of Lambda Inverse: 4.4406 0
## Theil's Method: -2.6307 0
## Condition Number: 9.2750 0
##
## 1 --> COLLINEARITY is detected by the test
## 0 --> COLLINEARITY is not detected by the test
imcdiag(log_cont$model,log_cont$model$RESP1)
##
## Call:
## imcdiag(x = log_cont$model, y = log_cont$model$RESP1)
##
##
## All Individual Multicollinearity Diagnostics Result
##
## VIF TOL Wi Fi Leamer CVIF Klein
## RESP1 1.2194 0.8201 7.1664 10.8593 0.9056 0 0
## CLNSTG2 1.1935 0.8379 6.3206 9.5777 0.9154 0 0
## AGE 1.0192 0.9812 0.6264 0.9492 0.9905 0 0
## K67 1.0085 0.9916 0.2782 0.4215 0.9958 0 0
##
## 1 --> COLLINEARITY is detected by the test
## 0 --> COLLINEARITY is not detected by the test
##
## CLNSTG2 , AGE , K67 , coefficient(s) are non-significant may be due to multicollinearity
##
## R-square of y on all x: 1
##
## * use method argument to check which regressors may be the reason of collinearity
## ===================================