Inquiries regarding this code contact gabriel.carrasco@upch.pe
rm(list=ls())
library(tidyverse); library(GGally); library(regclass); library(mctest)
head(dat)
## SEX RESP1 PERFST CS2 SURVSTAT AGE HISTDX1 CLNSTG2 SURVTIME TRTGRP
## 1 0 2 100 1 3 49.2 1 0 8.45722 0
## 2 1 1 100 1 3 12.3 1 0 1.01848 1
## 3 1 2 30 1 3 62.4 1 0 0.70363 1
## 4 0 1 100 1 1 43.7 1 1 8.30116 1
## 5 1 1 70 1 3 49.8 1 1 3.27447 1
## 6 0 2 80 1 3 69.2 1 0 3.74538 1
## MAXSIZE RELSTAT RFS K67 PSGRP TMRSIZE
## 1 2 3 86.68639 13 1 0
## 2 10 3 10.55227 33 1 1
## 3 2 3 5.45694 76 0 0
## 4 15 3 15.87771 51 1 1
## 5 2 3 8.67850 54 0 0
## 6 1 3 10.38790 34 1 0
dat <- dat %>% mutate(AGELINC = 2*AGE+3) # create dummy collinear variable AGELINC
model<-lm(K67 ~ SEX + AGE + AGELINC + CS2 + CLNSTG2 + PSGRP + TMRSIZE, data = dat,na.action=na.exclude)
summary(model)
##
## Call:
## lm(formula = K67 ~ SEX + AGE + AGELINC + CS2 + CLNSTG2 + PSGRP +
## TMRSIZE, data = dat, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -36.289 -14.812 -0.961 12.851 50.133
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.71803 7.16908 5.819 7.79e-08 ***
## SEX 8.72413 4.17346 2.090 0.0392 *
## AGE -0.02403 0.10264 -0.234 0.8154
## AGELINC NA NA NA NA
## CS2 -6.06455 4.53965 -1.336 0.1847
## CLNSTG2 1.15575 5.07790 0.228 0.8204
## PSGRP -4.19745 4.21521 -0.996 0.3219
## TMRSIZE 3.87147 4.95254 0.782 0.4363
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.25 on 96 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.0857, Adjusted R-squared: 0.02855
## F-statistic: 1.5 on 6 and 96 DF, p-value: 0.1865
model_orig<-lm(K67 ~ SEX + AGE + CS2 + CLNSTG2 + PSGRP + TMRSIZE, data = dat,na.action=na.exclude)
summary(model_orig)
##
## Call:
## lm(formula = K67 ~ SEX + AGE + CS2 + CLNSTG2 + PSGRP + TMRSIZE,
## data = dat, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -36.289 -14.812 -0.961 12.851 50.133
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.71803 7.16908 5.819 7.79e-08 ***
## SEX 8.72413 4.17346 2.090 0.0392 *
## AGE -0.02403 0.10264 -0.234 0.8154
## CS2 -6.06455 4.53965 -1.336 0.1847
## CLNSTG2 1.15575 5.07790 0.228 0.8204
## PSGRP -4.19745 4.21521 -0.996 0.3219
## TMRSIZE 3.87147 4.95254 0.782 0.4363
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.25 on 96 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.0857, Adjusted R-squared: 0.02855
## F-statistic: 1.5 on 6 and 96 DF, p-value: 0.1865
dat <- dat %>% mutate(AGESQ=AGE*AGE) # create dummy collinear variable AGESQ
dat %>% dplyr::select(K67, SEX, AGE, AGESQ, CS2, CLNSTG2, PSGRP, TMRSIZE) %>% ggcorr(label = TRUE, label_round = 4) #Correlation
model_cuad<-lm(K67 ~ SEX + AGE + AGESQ + CS2 + CLNSTG2 + PSGRP + TMRSIZE, data = dat,na.action=na.exclude)
summary(model_cuad)
##
## Call:
## lm(formula = K67 ~ SEX + AGE + AGESQ + CS2 + CLNSTG2 + PSGRP +
## TMRSIZE, data = dat, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33.642 -15.708 -1.379 13.414 46.800
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 58.997073 14.507616 4.067 9.84e-05 ***
## SEX 8.261439 4.168394 1.982 0.0504 .
## AGE -0.822268 0.592383 -1.388 0.1684
## AGESQ 0.008020 0.005863 1.368 0.1745
## CS2 -6.169016 4.519831 -1.365 0.1755
## CLNSTG2 0.993888 5.056392 0.197 0.8446
## PSGRP -4.899019 4.227434 -1.159 0.2494
## TMRSIZE 3.955068 4.930593 0.802 0.4245
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.16 on 95 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.1034, Adjusted R-squared: 0.03729
## F-statistic: 1.564 on 7 and 95 DF, p-value: 0.1555
VIF(model_cuad)
## SEX AGE AGESQ CS2 CLNSTG2 PSGRP TMRSIZE
## 1.046010 34.582419 34.291094 1.279601 1.310716 1.131350 1.034913
omcdiag(model_cuad$model,model_cuad$model$K67)
##
## Call:
## omcdiag(x = model_cuad$model, y = model_cuad$model$K67)
##
##
## Overall Multicollinearity Diagnostics
##
## MC Results detection
## Determinant |X'X|: 0.0177 0
## Farrar Chi-Square: 400.8248 1
## Red Indicator: 0.2322 0
## Sum of Lambda Inverse: 77.2601 1
## Theil's Method: -4.2321 0
## Condition Number: 51.6710 1
##
## 1 --> COLLINEARITY is detected by the test
## 0 --> COLLINEARITY is not detected by the test
imcdiag(model_cuad$model,model_cuad$model$K67)
##
## Call:
## imcdiag(x = model_cuad$model, y = model_cuad$model$K67)
##
##
## All Individual Multicollinearity Diagnostics Result
##
## VIF TOL Wi Fi Leamer CVIF Klein
## K67 1.1153 0.8966 1.5644 1.8444 0.9469 0 0
## SEX 1.0893 0.9181 1.2114 1.4282 0.9582 0 0
## AGE 35.2838 0.0283 465.2801 548.5408 0.1683 0 0
## AGESQ 34.9666 0.0286 460.9754 543.4657 0.1691 0 0
## CS2 1.3047 0.7665 4.1351 4.8751 0.8755 0 0
## CLNSTG2 1.3112 0.7626 4.2241 4.9800 0.8733 0 0
## PSGRP 1.1473 0.8716 1.9997 2.3575 0.9336 0 0
## TMRSIZE 1.0419 0.9598 0.5689 0.6708 0.9797 0 0
##
## 1 --> COLLINEARITY is detected by the test
## 0 --> COLLINEARITY is not detected by the test
##
## SEX , CS2 , CLNSTG2 , PSGRP , coefficient(s) are non-significant may be due to multicollinearity
##
## R-square of y on all x: 1
##
## * use method argument to check which regressors may be the reason of collinearity
## ===================================
imcdiag(model_cuad$model,model_cuad$model$K67)$idiags[,1]>10
## K67 SEX AGE AGESQ CS2 CLNSTG2 PSGRP TMRSIZE
## FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE
imcdiag(model_cuad$model,model_cuad$model$K67)$idiags[,2]<0.1
## K67 SEX AGE AGESQ CS2 CLNSTG2 PSGRP TMRSIZE
## FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE
# AGE and AGESQ have VIF>10 and TOL<0.1 suggesting multicollinearity
# Model Estimates of AGE
summary(model_orig)$coefficients["AGE","Estimate"] # without AGESQ
## [1] -0.02403084
summary(model_cuad)$coefficients["AGE","Estimate"] # with AGESQ
## [1] -0.822268
# Model standard error of AGE
summary(model_orig)$coefficients["AGE","Std. Error"] # without AGESQ
## [1] 0.1026433
summary(model_cuad)$coefficients["AGE","Std. Error"] # with AGESQ
## [1] 0.5923831
# Consecuentely, the presence of AGESQ dramatically affects the estimatesof AGE
# Centering to remove the collinerity
dat <- dat %>% mutate(CTRAGE=AGE-mean(AGE), CTRAGESQ = CTRAGE*CTRAGE) # create centered dummy collinear variable CTRAGESQ
dat %>% dplyr::select(K67, SEX, CTRAGE, CTRAGESQ, CS2, CLNSTG2, PSGRP, TMRSIZE) %>% ggcorr(label = TRUE, label_round = 4) #Correlation
model_ctr<-lm(K67 ~ SEX + CTRAGE + CTRAGESQ + CS2 + CLNSTG2 + PSGRP + TMRSIZE, data = dat,na.action=na.exclude)
summary(model_ctr)
##
## Call:
## lm(formula = K67 ~ SEX + CTRAGE + CTRAGESQ + CS2 + CLNSTG2 +
## PSGRP + TMRSIZE, data = dat, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33.642 -15.708 -1.379 13.414 46.800
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.956626 4.363593 8.698 9.96e-14 ***
## SEX 8.261439 4.168394 1.982 0.0504 .
## CTRAGE 0.033491 0.110494 0.303 0.7625
## CTRAGESQ 0.008020 0.005863 1.368 0.1745
## CS2 -6.169016 4.519831 -1.365 0.1755
## CLNSTG2 0.993888 5.056392 0.197 0.8446
## PSGRP -4.899019 4.227434 -1.159 0.2494
## TMRSIZE 3.955068 4.930593 0.802 0.4245
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.16 on 95 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.1034, Adjusted R-squared: 0.03729
## F-statistic: 1.564 on 7 and 95 DF, p-value: 0.1555
VIF(model_ctr)
## SEX CTRAGE CTRAGESQ CS2 CLNSTG2 PSGRP TMRSIZE
## 1.046010 1.203167 1.225923 1.279601 1.310716 1.131350 1.034913
omcdiag(model_ctr$model,model_ctr$model$K67)
##
## Call:
## omcdiag(x = model_ctr$model, y = model_ctr$model$K67)
##
##
## Overall Multicollinearity Diagnostics
##
## MC Results detection
## Determinant |X'X|: 0.4955 0
## Farrar Chi-Square: 69.7877 1
## Red Indicator: 0.1643 0
## Sum of Lambda Inverse: 9.4641 0
## Theil's Method: -5.8054 0
## Condition Number: 7.7158 0
##
## 1 --> COLLINEARITY is detected by the test
## 0 --> COLLINEARITY is not detected by the test
imcdiag(model_ctr$model,model_ctr$model$K67)
##
## Call:
## imcdiag(x = model_ctr$model, y = model_ctr$model$K67)
##
##
## All Individual Multicollinearity Diagnostics Result
##
## VIF TOL Wi Fi Leamer CVIF Klein
## K67 1.1153 0.8966 1.5644 1.8444 0.9469 0 0
## SEX 1.0893 0.9181 1.2114 1.4282 0.9582 0 0
## CTRAGE 1.2043 0.8303 2.7731 3.2693 0.9112 0 0
## CTRAGESQ 1.2501 0.8000 3.3939 4.0012 0.8944 0 0
## CS2 1.3047 0.7665 4.1351 4.8751 0.8755 0 0
## CLNSTG2 1.3112 0.7626 4.2241 4.9800 0.8733 0 0
## PSGRP 1.1473 0.8716 1.9997 2.3575 0.9336 0 0
## TMRSIZE 1.0419 0.9598 0.5689 0.6708 0.9797 0 0
##
## 1 --> COLLINEARITY is detected by the test
## 0 --> COLLINEARITY is not detected by the test
##
## SEX , CTRAGE , CS2 , CLNSTG2 , PSGRP , coefficient(s) are non-significant may be due to multicollinearity
##
## R-square of y on all x: 1
##
## * use method argument to check which regressors may be the reason of collinearity
## ===================================
imcdiag(model_ctr$model,model_ctr$model$K67)$idiags[,1]>10
## K67 SEX CTRAGE CTRAGESQ CS2 CLNSTG2 PSGRP TMRSIZE
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
imcdiag(model_ctr$model,model_ctr$model$K67)$idiags[,2]<0.1
## K67 SEX CTRAGE CTRAGESQ CS2 CLNSTG2 PSGRP TMRSIZE
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# CTRAGE and CTRAGESQ have VIF<10 and TOL>0.1 suggesting NO multicollinearity
# Model Estimates of AGE
summary(model_orig)$coefficients["AGE","Estimate"] # without AGESQ
## [1] -0.02403084
summary(model_cuad)$coefficients["AGE","Estimate"] # with AGESQ
## [1] -0.822268
summary(model_ctr)$coefficients["CTRAGE","Estimate"] # with AGESQ
## [1] 0.03349068