Inquiries regarding this code contact gabriel.carrasco@upch.pe

rm(list=ls())
library(tidyverse); library(epiDisplay); library(GGally); library(haven); library(factoextra); library(reshape2)

Uses of Principal Components Analysis (PCA)

head(dat)
## # A tibble: 6 x 14
##     AGE SHOCKTYP   SBP   MAP   DBP CARDIND APPTIME   MCT    UO   HGB   HCT
##   <dbl>    <dbl> <dbl> <dbl> <dbl>   <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl>
## 1    68        2   114    88    73    0.66    11.5  22.5   110  11.3    34
## 2    37        2   149   115    97    3.55     8.2  15.6    40  12.7    39
## 3    50        2   146   101    74    4.05     5.6  12.5     0  13.4    41
## 4    53        2   107    83    70    0.95     6.4  38       0  15.5    46
## 5    75        2   141    65    82    1.9     12.6  29.7    42  13.7    42
## 6    66        2   114    59    44    3.48     9    16.8     0   9.3    28
## # … with 3 more variables: LCARDIND <dbl>, LAPPTIME <dbl>, LMCT <dbl>
res.pca <- dat %>% dplyr::select(SBP,MAP,DBP,LCARDIND,LAPPTIME,LMCT,UO,HGB,HCT) %>%
  #prcomp(scale = TRUE)
  princomp(cor = TRUE, scores = TRUE)
  

dat %>% dplyr::select(SBP,MAP,DBP,LCARDIND,LAPPTIME,LMCT,UO,HGB,HCT) %>%
  ggcorr(label = TRUE, label_round = 4) #Correlation

# High correlation in three groups: SBP-MAP-DBP; LAPPTIME-LMCT; HGB-HCT

summary(res.pca)
## Importance of components:
##                           Comp.1    Comp.2    Comp.3    Comp.4    Comp.5
## Standard deviation     1.8301235 1.7168993 0.9822378 0.9650414 0.6997882
## Proportion of Variance 0.3721502 0.3275270 0.1071990 0.1034783 0.0544115
## Cumulative Proportion  0.3721502 0.6996773 0.8068763 0.9103546 0.9647661
##                            Comp.6     Comp.7      Comp.8      Comp.9
## Standard deviation     0.39435469 0.33144463 0.165823143 0.155680978
## Proportion of Variance 0.01727951 0.01220617 0.003055257 0.002692952
## Cumulative Proportion  0.98204562 0.99425179 0.997307048 1.000000000
# Eigenvalues
eig <- (res.pca$sdev)^2
# Diff Eigenvalues
diff <- -diff(eig)
diff[length(eig)]<-NA
# Variances in proportion
variance <- eig/sum(eig)
# Cumulative variances
cumvar <- cumsum(variance)

(eig.summ <- data.frame(eig = eig, diff= diff, variance = variance,cumvariance = cumvar))
##               eig        diff    variance cumvariance
## Comp.1 3.34935212 0.401608942 0.372150236   0.3721502
## Comp.2 2.94774318 1.982952058 0.327527020   0.6996773
## Comp.3 0.96479112 0.033486131 0.107199013   0.8068763
## Comp.4 0.93130499 0.441601447 0.103478332   0.9103546
## Comp.5 0.48970354 0.334187926 0.054411505   0.9647661
## Comp.6 0.15551562 0.045660074 0.017279513   0.9820456
## Comp.7 0.10985554 0.082358229 0.012206172   0.9942518
## Comp.8 0.02749731 0.003260748 0.003055257   0.9973070
## Comp.9 0.02423657          NA 0.002692952   1.0000000
res.pca$loadings # princomp proc - small values have been kept blank here
## 
## Loadings:
##          Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9
## SBP              0.530  0.251                0.723  0.249         0.248
## MAP       0.147  0.546  0.168               -0.181         0.107 -0.775
## DBP       0.212  0.512  0.111               -0.569 -0.145 -0.131  0.559
## LCARDIND -0.398  0.153         0.249  0.817 -0.190  0.213              
## LAPPTIME  0.375 -0.227  0.455 -0.153  0.494  0.121 -0.557              
## LMCT      0.431 -0.234  0.323 -0.213  0.102 -0.237  0.728  0.111       
## UO       -0.101  0.163 -0.346 -0.902  0.168                            
## HGB       0.466        -0.479  0.150  0.141        -0.110  0.695       
## HCT       0.464        -0.479  0.180  0.139         0.113 -0.682 -0.126
## 
##                Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## SS loadings     1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000
## Proportion Var  0.111  0.111  0.111  0.111  0.111  0.111  0.111  0.111
## Cumulative Var  0.111  0.222  0.333  0.444  0.556  0.667  0.778  0.889
##                Comp.9
## SS loadings     1.000
## Proportion Var  0.111
## Cumulative Var  1.000
(b<-as.data.frame.matrix(res.pca$loadings)) # princomp proc - complete loadings
##               Comp.1      Comp.2      Comp.3        Comp.4      Comp.5
## SBP       0.07963231  0.52954936  0.25056114  0.0148834022  0.05897088
## MAP       0.14650709  0.54632524  0.16787094  0.0007011633 -0.02923805
## DBP       0.21232722  0.51213344  0.11113288  0.0086675560 -0.08100578
## LCARDIND -0.39759990  0.15308048 -0.08140771  0.2486439833  0.81697148
## LAPPTIME  0.37466156 -0.22715660  0.45497484 -0.1534176640  0.49399003
## LMCT      0.43129853 -0.23418415  0.32327408 -0.2125844956  0.10159298
## UO       -0.10102035  0.16278799 -0.34619167 -0.9022237630  0.16849268
## HGB       0.46623828  0.03740492 -0.47926940  0.1504575693  0.14050126
## HCT       0.46373795  0.03241178 -0.47854431  0.1802905699  0.13851196
##               Comp.6      Comp.7      Comp.8       Comp.9
## SBP       0.72338339  0.24868982  0.01635055  0.247706889
## MAP      -0.18124834 -0.08056568  0.10676769 -0.774782078
## DBP      -0.56894266 -0.14456738 -0.13080243  0.558519613
## LCARDIND -0.18965298  0.21322238  0.03211282  0.011134373
## LAPPTIME  0.12138699 -0.55701964 -0.09167052  0.007361287
## LMCT     -0.23702226  0.72813659  0.11067012 -0.022574693
## UO        0.01862822 -0.01126139 -0.02288404  0.007160897
## HGB       0.06105056 -0.10980629  0.69545535  0.098502588
## HCT       0.09598419  0.11252122 -0.68217273 -0.126364759
colfunc<-colorRampPalette(c("red","white","red"))
melt(as.matrix(b)) %>% ggplot(aes(x = Var2, y = Var1, fill = value)) + # prcomp proc
  geom_tile() +
  scale_fill_gradientn(colours = colfunc(10), limits=c(-1,1)) + theme_minimal() +
  xlab(NULL) + ylab(NULL) +
  scale_y_discrete(limits=c("HCT","HGB","UO","LMCT","LAPPTIME","LCARDIND","DBP","MAP","SBP"))

Data reduction

#1) ∑VarCi/T ≥ p where p is .70, .80, .90, or some other fixed value.
eig.summ %>% filter(cumvariance<.91) # First 3 components explains 90% of variance
##         eig       diff  variance cumvariance
## 1 3.3493521 0.40160894 0.3721502   0.3721502
## 2 2.9477432 1.98295206 0.3275270   0.6996773
## 3 0.9647911 0.03348613 0.1071990   0.8068763
#2) Another way to choose h is to make scree plot—eigenvalue (Var Ci ) versus component #: Cut-off just before plot flattens out.
fviz_eig(res.pca)

#3) If correlation matrix is used, still another rule is to select all components with eigenvalue ≥ 1 . 
#   Reasoning is that a component with eigenvalue < 1 explains less than a single (standardized) variable, and so is not worth using.
eig.summ %>% filter(eig>1) # First 2 components has eigenvalues>1
##        eig      diff  variance cumvariance
## 1 3.349352 0.4016089 0.3721502   0.3721502
## 2 2.947743 1.9829521 0.3275270   0.6996773
a <-as.data.frame.matrix(res.pca$loadings) # princomp proc

a[,1]=a[,1]*(sqrt(eig[1]))
a[,2]=a[,2]*(sqrt(eig[2]))
a[,3]=a[,3]*(sqrt(eig[3]))
a[,4]=a[,4]*(sqrt(eig[4]))
a[,5]=a[,5]*(sqrt(eig[5]))
a[,6]=a[,6]*(sqrt(eig[6]))
a[,7]=a[,7]*(sqrt(eig[7]))
a[,8]=a[,8]*(sqrt(eig[8]))
a[,9]=a[,9]*(sqrt(eig[9]))
a$var=rownames(a)

colfunc2<-colorRampPalette(c("blue","white","red"))
melt(a) %>% ggplot(aes(x = variable, y = var, fill = value)) + geom_tile() +
  scale_fill_gradientn(colours = colfunc2(10), limits=c(-1,1)) + theme_minimal() +
  xlab(NULL) + ylab(NULL) +
  scale_y_discrete(limits=c("HCT","HGB","UO","LMCT","LAPPTIME","LCARDIND","DBP","MAP","SBP"))  + 
  geom_point(aes(size= ifelse(value>.5 | value<(-.5),1,NA)), col="#234458") +
  guides(size = "none")

# We could use aij ’s to compute scores for each subject and use in place of Xj ’s. This is the only use for aij ’s.
# (SAS output pag.61)
pred.pca <- as.data.frame(predict(res.pca, newdata = dat)) %>% bind_cols(dat)

res.pca$rotation
## NULL
(eig.summ <- eig.summ %>% mutate(a_crit=.5/(sqrt(eig))))
##          eig        diff    variance cumvariance    a_crit
## 1 3.34935212 0.401608942 0.372150236   0.3721502 0.2732056
## 2 2.94774318 1.982952058 0.327527020   0.6996773 0.2912227
## 3 0.96479112 0.033486131 0.107199013   0.8068763 0.5090417
## 4 0.93130499 0.441601447 0.103478332   0.9103546 0.5181125
## 5 0.48970354 0.334187926 0.054411505   0.9647661 0.7145019
## 6 0.15551562 0.045660074 0.017279513   0.9820456 1.2678942
## 7 0.10985554 0.082358229 0.012206172   0.9942518 1.5085476
## 8 0.02749731 0.003260748 0.003055257   0.9973070 3.0152607
## 9 0.02423657          NA 0.002692952   1.0000000 3.2116962

Additional

summary(res.pca)
## Importance of components:
##                           Comp.1    Comp.2    Comp.3    Comp.4    Comp.5
## Standard deviation     1.8301235 1.7168993 0.9822378 0.9650414 0.6997882
## Proportion of Variance 0.3721502 0.3275270 0.1071990 0.1034783 0.0544115
## Cumulative Proportion  0.3721502 0.6996773 0.8068763 0.9103546 0.9647661
##                            Comp.6     Comp.7      Comp.8      Comp.9
## Standard deviation     0.39435469 0.33144463 0.165823143 0.155680978
## Proportion of Variance 0.01727951 0.01220617 0.003055257 0.002692952
## Cumulative Proportion  0.98204562 0.99425179 0.997307048 1.000000000
# Graph of individuals. Individuals with a similar profile are grouped together.
fviz_pca_ind(res.pca,
             col.ind = "cos2", # Color by the quality of representation
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE)     # Avoid text overlapping

# Graph of variables. Positive correlated variables point to the same side of the plot. Negative correlated variables point to opposite sides of the graph.
fviz_pca_var(res.pca,
             col.var = "contrib", # Color by contributions to the PC
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE)    # Avoid text overlapping

# Biplot of individuals and variables
fviz_pca_biplot(res.pca, repel = TRUE,
                col.var = "#2E9FDF", # Variables color
                col.ind = "#696969"  # Individuals color
)