Inquiries regarding this code contact gabriel.carrasco@upch.pe

rm(list=ls())
library(tidyverse); library(GGally); library(regclass); library(mctest)

Perfect collinearity

head(dat)
##   SEX RESP1 PERFST CS2 SURVSTAT  AGE HISTDX1 CLNSTG2 SURVTIME TRTGRP
## 1   0     2    100   1        3 49.2       1       0  8.45722      0
## 2   1     1    100   1        3 12.3       1       0  1.01848      1
## 3   1     2     30   1        3 62.4       1       0  0.70363      1
## 4   0     1    100   1        1 43.7       1       1  8.30116      1
## 5   1     1     70   1        3 49.8       1       1  3.27447      1
## 6   0     2     80   1        3 69.2       1       0  3.74538      1
##   MAXSIZE RELSTAT      RFS K67 PSGRP TMRSIZE
## 1       2       3 86.68639  13     1       0
## 2      10       3 10.55227  33     1       1
## 3       2       3  5.45694  76     0       0
## 4      15       3 15.87771  51     1       1
## 5       2       3  8.67850  54     0       0
## 6       1       3 10.38790  34     1       0
dat <- dat %>% mutate(AGELINC = 2*AGE+3) # create dummy collinear variable AGELINC
model<-lm(K67 ~ SEX + AGE + AGELINC + CS2 + CLNSTG2 + PSGRP + TMRSIZE, data = dat,na.action=na.exclude)
summary(model)
## 
## Call:
## lm(formula = K67 ~ SEX + AGE + AGELINC + CS2 + CLNSTG2 + PSGRP + 
##     TMRSIZE, data = dat, na.action = na.exclude)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -36.289 -14.812  -0.961  12.851  50.133 
## 
## Coefficients: (1 not defined because of singularities)
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 41.71803    7.16908   5.819 7.79e-08 ***
## SEX          8.72413    4.17346   2.090   0.0392 *  
## AGE         -0.02403    0.10264  -0.234   0.8154    
## AGELINC           NA         NA      NA       NA    
## CS2         -6.06455    4.53965  -1.336   0.1847    
## CLNSTG2      1.15575    5.07790   0.228   0.8204    
## PSGRP       -4.19745    4.21521  -0.996   0.3219    
## TMRSIZE      3.87147    4.95254   0.782   0.4363    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.25 on 96 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.0857, Adjusted R-squared:  0.02855 
## F-statistic:   1.5 on 6 and 96 DF,  p-value: 0.1865

Assessing collinearity

model_orig<-lm(K67 ~ SEX + AGE + CS2 + CLNSTG2 + PSGRP + TMRSIZE, data = dat,na.action=na.exclude)
summary(model_orig)
## 
## Call:
## lm(formula = K67 ~ SEX + AGE + CS2 + CLNSTG2 + PSGRP + TMRSIZE, 
##     data = dat, na.action = na.exclude)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -36.289 -14.812  -0.961  12.851  50.133 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 41.71803    7.16908   5.819 7.79e-08 ***
## SEX          8.72413    4.17346   2.090   0.0392 *  
## AGE         -0.02403    0.10264  -0.234   0.8154    
## CS2         -6.06455    4.53965  -1.336   0.1847    
## CLNSTG2      1.15575    5.07790   0.228   0.8204    
## PSGRP       -4.19745    4.21521  -0.996   0.3219    
## TMRSIZE      3.87147    4.95254   0.782   0.4363    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.25 on 96 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.0857, Adjusted R-squared:  0.02855 
## F-statistic:   1.5 on 6 and 96 DF,  p-value: 0.1865
dat <- dat %>% mutate(AGESQ=AGE*AGE) # create dummy collinear variable AGESQ
dat %>% dplyr::select(K67, SEX, AGE, AGESQ, CS2, CLNSTG2, PSGRP, TMRSIZE) %>% ggcorr(label = TRUE, label_round = 4) #Correlation

model_cuad<-lm(K67 ~ SEX + AGE + AGESQ + CS2 + CLNSTG2 + PSGRP + TMRSIZE, data = dat,na.action=na.exclude)
summary(model_cuad)
## 
## Call:
## lm(formula = K67 ~ SEX + AGE + AGESQ + CS2 + CLNSTG2 + PSGRP + 
##     TMRSIZE, data = dat, na.action = na.exclude)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -33.642 -15.708  -1.379  13.414  46.800 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 58.997073  14.507616   4.067 9.84e-05 ***
## SEX          8.261439   4.168394   1.982   0.0504 .  
## AGE         -0.822268   0.592383  -1.388   0.1684    
## AGESQ        0.008020   0.005863   1.368   0.1745    
## CS2         -6.169016   4.519831  -1.365   0.1755    
## CLNSTG2      0.993888   5.056392   0.197   0.8446    
## PSGRP       -4.899019   4.227434  -1.159   0.2494    
## TMRSIZE      3.955068   4.930593   0.802   0.4245    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.16 on 95 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.1034, Adjusted R-squared:  0.03729 
## F-statistic: 1.564 on 7 and 95 DF,  p-value: 0.1555
VIF(model_cuad)
##       SEX       AGE     AGESQ       CS2   CLNSTG2     PSGRP   TMRSIZE 
##  1.046010 34.582419 34.291094  1.279601  1.310716  1.131350  1.034913
omcdiag(model_cuad$model,model_cuad$model$K67)
## 
## Call:
## omcdiag(x = model_cuad$model, y = model_cuad$model$K67)
## 
## 
## Overall Multicollinearity Diagnostics
## 
##                        MC Results detection
## Determinant |X'X|:         0.0177         0
## Farrar Chi-Square:       400.8248         1
## Red Indicator:             0.2322         0
## Sum of Lambda Inverse:    77.2601         1
## Theil's Method:           -4.2321         0
## Condition Number:         51.6710         1
## 
## 1 --> COLLINEARITY is detected by the test 
## 0 --> COLLINEARITY is not detected by the test
imcdiag(model_cuad$model,model_cuad$model$K67)
## 
## Call:
## imcdiag(x = model_cuad$model, y = model_cuad$model$K67)
## 
## 
## All Individual Multicollinearity Diagnostics Result
## 
##             VIF    TOL       Wi       Fi Leamer CVIF Klein
## K67      1.1153 0.8966   1.5644   1.8444 0.9469    0     0
## SEX      1.0893 0.9181   1.2114   1.4282 0.9582    0     0
## AGE     35.2838 0.0283 465.2801 548.5408 0.1683    0     0
## AGESQ   34.9666 0.0286 460.9754 543.4657 0.1691    0     0
## CS2      1.3047 0.7665   4.1351   4.8751 0.8755    0     0
## CLNSTG2  1.3112 0.7626   4.2241   4.9800 0.8733    0     0
## PSGRP    1.1473 0.8716   1.9997   2.3575 0.9336    0     0
## TMRSIZE  1.0419 0.9598   0.5689   0.6708 0.9797    0     0
## 
## 1 --> COLLINEARITY is detected by the test 
## 0 --> COLLINEARITY is not detected by the test
## 
## SEX , CS2 , CLNSTG2 , PSGRP , coefficient(s) are non-significant may be due to multicollinearity
## 
## R-square of y on all x: 1 
## 
## * use method argument to check which regressors may be the reason of collinearity
## ===================================
imcdiag(model_cuad$model,model_cuad$model$K67)$idiags[,1]>10
##     K67     SEX     AGE   AGESQ     CS2 CLNSTG2   PSGRP TMRSIZE 
##   FALSE   FALSE    TRUE    TRUE   FALSE   FALSE   FALSE   FALSE
imcdiag(model_cuad$model,model_cuad$model$K67)$idiags[,2]<0.1
##     K67     SEX     AGE   AGESQ     CS2 CLNSTG2   PSGRP TMRSIZE 
##   FALSE   FALSE    TRUE    TRUE   FALSE   FALSE   FALSE   FALSE
# AGE and AGESQ have VIF>10 and TOL<0.1 suggesting multicollinearity

# Model Estimates of AGE
summary(model_orig)$coefficients["AGE","Estimate"] # without AGESQ
## [1] -0.02403084
summary(model_cuad)$coefficients["AGE","Estimate"] # with AGESQ
## [1] -0.822268
# Model standard error of AGE
summary(model_orig)$coefficients["AGE","Std. Error"] # without AGESQ
## [1] 0.1026433
summary(model_cuad)$coefficients["AGE","Std. Error"] # with AGESQ
## [1] 0.5923831
# Consecuentely, the presence of AGESQ dramatically affects the estimatesof AGE

How collinearity may be treated

# Centering to remove the collinerity
dat <- dat %>% mutate(CTRAGE=AGE-mean(AGE), CTRAGESQ = CTRAGE*CTRAGE) # create centered dummy collinear variable CTRAGESQ
dat %>% dplyr::select(K67, SEX, CTRAGE, CTRAGESQ, CS2, CLNSTG2, PSGRP, TMRSIZE) %>% ggcorr(label = TRUE, label_round = 4) #Correlation

model_ctr<-lm(K67 ~ SEX + CTRAGE + CTRAGESQ + CS2 + CLNSTG2 + PSGRP + TMRSIZE, data = dat,na.action=na.exclude)
summary(model_ctr)
## 
## Call:
## lm(formula = K67 ~ SEX + CTRAGE + CTRAGESQ + CS2 + CLNSTG2 + 
##     PSGRP + TMRSIZE, data = dat, na.action = na.exclude)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -33.642 -15.708  -1.379  13.414  46.800 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 37.956626   4.363593   8.698 9.96e-14 ***
## SEX          8.261439   4.168394   1.982   0.0504 .  
## CTRAGE       0.033491   0.110494   0.303   0.7625    
## CTRAGESQ     0.008020   0.005863   1.368   0.1745    
## CS2         -6.169016   4.519831  -1.365   0.1755    
## CLNSTG2      0.993888   5.056392   0.197   0.8446    
## PSGRP       -4.899019   4.227434  -1.159   0.2494    
## TMRSIZE      3.955068   4.930593   0.802   0.4245    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.16 on 95 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.1034, Adjusted R-squared:  0.03729 
## F-statistic: 1.564 on 7 and 95 DF,  p-value: 0.1555
VIF(model_ctr)
##      SEX   CTRAGE CTRAGESQ      CS2  CLNSTG2    PSGRP  TMRSIZE 
## 1.046010 1.203167 1.225923 1.279601 1.310716 1.131350 1.034913
omcdiag(model_ctr$model,model_ctr$model$K67)
## 
## Call:
## omcdiag(x = model_ctr$model, y = model_ctr$model$K67)
## 
## 
## Overall Multicollinearity Diagnostics
## 
##                        MC Results detection
## Determinant |X'X|:         0.4955         0
## Farrar Chi-Square:        69.7877         1
## Red Indicator:             0.1643         0
## Sum of Lambda Inverse:     9.4641         0
## Theil's Method:           -5.8054         0
## Condition Number:          7.7158         0
## 
## 1 --> COLLINEARITY is detected by the test 
## 0 --> COLLINEARITY is not detected by the test
imcdiag(model_ctr$model,model_ctr$model$K67)
## 
## Call:
## imcdiag(x = model_ctr$model, y = model_ctr$model$K67)
## 
## 
## All Individual Multicollinearity Diagnostics Result
## 
##             VIF    TOL     Wi     Fi Leamer CVIF Klein
## K67      1.1153 0.8966 1.5644 1.8444 0.9469    0     0
## SEX      1.0893 0.9181 1.2114 1.4282 0.9582    0     0
## CTRAGE   1.2043 0.8303 2.7731 3.2693 0.9112    0     0
## CTRAGESQ 1.2501 0.8000 3.3939 4.0012 0.8944    0     0
## CS2      1.3047 0.7665 4.1351 4.8751 0.8755    0     0
## CLNSTG2  1.3112 0.7626 4.2241 4.9800 0.8733    0     0
## PSGRP    1.1473 0.8716 1.9997 2.3575 0.9336    0     0
## TMRSIZE  1.0419 0.9598 0.5689 0.6708 0.9797    0     0
## 
## 1 --> COLLINEARITY is detected by the test 
## 0 --> COLLINEARITY is not detected by the test
## 
## SEX , CTRAGE , CS2 , CLNSTG2 , PSGRP , coefficient(s) are non-significant may be due to multicollinearity
## 
## R-square of y on all x: 1 
## 
## * use method argument to check which regressors may be the reason of collinearity
## ===================================
imcdiag(model_ctr$model,model_ctr$model$K67)$idiags[,1]>10
##      K67      SEX   CTRAGE CTRAGESQ      CS2  CLNSTG2    PSGRP  TMRSIZE 
##    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE
imcdiag(model_ctr$model,model_ctr$model$K67)$idiags[,2]<0.1
##      K67      SEX   CTRAGE CTRAGESQ      CS2  CLNSTG2    PSGRP  TMRSIZE 
##    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE
# CTRAGE and CTRAGESQ have VIF<10 and TOL>0.1 suggesting NO multicollinearity

# Model Estimates of AGE
summary(model_orig)$coefficients["AGE","Estimate"] # without AGESQ
## [1] -0.02403084
summary(model_cuad)$coefficients["AGE","Estimate"] # with AGESQ
## [1] -0.822268
summary(model_ctr)$coefficients["CTRAGE","Estimate"] # with AGESQ
## [1] 0.03349068