Inquiries regarding this code contact gabriel.carrasco@upch.pe
rm(list=ls())
library(tidyverse); library(epiDisplay); library(GGally); library(haven); library(factoextra); library(reshape2)
head(dat)
## # A tibble: 6 x 14
## AGE SHOCKTYP SBP MAP DBP CARDIND APPTIME MCT UO HGB HCT
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 68 2 114 88 73 0.66 11.5 22.5 110 11.3 34
## 2 37 2 149 115 97 3.55 8.2 15.6 40 12.7 39
## 3 50 2 146 101 74 4.05 5.6 12.5 0 13.4 41
## 4 53 2 107 83 70 0.95 6.4 38 0 15.5 46
## 5 75 2 141 65 82 1.9 12.6 29.7 42 13.7 42
## 6 66 2 114 59 44 3.48 9 16.8 0 9.3 28
## # … with 3 more variables: LCARDIND <dbl>, LAPPTIME <dbl>, LMCT <dbl>
res.pca <- dat %>% dplyr::select(SBP,MAP,DBP,LCARDIND,LAPPTIME,LMCT,UO,HGB,HCT) %>%
#prcomp(scale = TRUE)
princomp(cor = TRUE, scores = TRUE)
dat %>% dplyr::select(SBP,MAP,DBP,LCARDIND,LAPPTIME,LMCT,UO,HGB,HCT) %>%
ggcorr(label = TRUE, label_round = 4) #Correlation
# High correlation in three groups: SBP-MAP-DBP; LAPPTIME-LMCT; HGB-HCT
summary(res.pca)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 1.8301235 1.7168993 0.9822378 0.9650414 0.6997882
## Proportion of Variance 0.3721502 0.3275270 0.1071990 0.1034783 0.0544115
## Cumulative Proportion 0.3721502 0.6996773 0.8068763 0.9103546 0.9647661
## Comp.6 Comp.7 Comp.8 Comp.9
## Standard deviation 0.39435469 0.33144463 0.165823143 0.155680978
## Proportion of Variance 0.01727951 0.01220617 0.003055257 0.002692952
## Cumulative Proportion 0.98204562 0.99425179 0.997307048 1.000000000
# Eigenvalues
eig <- (res.pca$sdev)^2
# Diff Eigenvalues
diff <- -diff(eig)
diff[length(eig)]<-NA
# Variances in proportion
variance <- eig/sum(eig)
# Cumulative variances
cumvar <- cumsum(variance)
(eig.summ <- data.frame(eig = eig, diff= diff, variance = variance,cumvariance = cumvar))
## eig diff variance cumvariance
## Comp.1 3.34935212 0.401608942 0.372150236 0.3721502
## Comp.2 2.94774318 1.982952058 0.327527020 0.6996773
## Comp.3 0.96479112 0.033486131 0.107199013 0.8068763
## Comp.4 0.93130499 0.441601447 0.103478332 0.9103546
## Comp.5 0.48970354 0.334187926 0.054411505 0.9647661
## Comp.6 0.15551562 0.045660074 0.017279513 0.9820456
## Comp.7 0.10985554 0.082358229 0.012206172 0.9942518
## Comp.8 0.02749731 0.003260748 0.003055257 0.9973070
## Comp.9 0.02423657 NA 0.002692952 1.0000000
res.pca$loadings # princomp proc - small values have been kept blank here
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9
## SBP 0.530 0.251 0.723 0.249 0.248
## MAP 0.147 0.546 0.168 -0.181 0.107 -0.775
## DBP 0.212 0.512 0.111 -0.569 -0.145 -0.131 0.559
## LCARDIND -0.398 0.153 0.249 0.817 -0.190 0.213
## LAPPTIME 0.375 -0.227 0.455 -0.153 0.494 0.121 -0.557
## LMCT 0.431 -0.234 0.323 -0.213 0.102 -0.237 0.728 0.111
## UO -0.101 0.163 -0.346 -0.902 0.168
## HGB 0.466 -0.479 0.150 0.141 -0.110 0.695
## HCT 0.464 -0.479 0.180 0.139 0.113 -0.682 -0.126
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.111 0.111 0.111 0.111 0.111 0.111 0.111 0.111
## Cumulative Var 0.111 0.222 0.333 0.444 0.556 0.667 0.778 0.889
## Comp.9
## SS loadings 1.000
## Proportion Var 0.111
## Cumulative Var 1.000
(b<-as.data.frame.matrix(res.pca$loadings)) # princomp proc - complete loadings
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## SBP 0.07963231 0.52954936 0.25056114 0.0148834022 0.05897088
## MAP 0.14650709 0.54632524 0.16787094 0.0007011633 -0.02923805
## DBP 0.21232722 0.51213344 0.11113288 0.0086675560 -0.08100578
## LCARDIND -0.39759990 0.15308048 -0.08140771 0.2486439833 0.81697148
## LAPPTIME 0.37466156 -0.22715660 0.45497484 -0.1534176640 0.49399003
## LMCT 0.43129853 -0.23418415 0.32327408 -0.2125844956 0.10159298
## UO -0.10102035 0.16278799 -0.34619167 -0.9022237630 0.16849268
## HGB 0.46623828 0.03740492 -0.47926940 0.1504575693 0.14050126
## HCT 0.46373795 0.03241178 -0.47854431 0.1802905699 0.13851196
## Comp.6 Comp.7 Comp.8 Comp.9
## SBP 0.72338339 0.24868982 0.01635055 0.247706889
## MAP -0.18124834 -0.08056568 0.10676769 -0.774782078
## DBP -0.56894266 -0.14456738 -0.13080243 0.558519613
## LCARDIND -0.18965298 0.21322238 0.03211282 0.011134373
## LAPPTIME 0.12138699 -0.55701964 -0.09167052 0.007361287
## LMCT -0.23702226 0.72813659 0.11067012 -0.022574693
## UO 0.01862822 -0.01126139 -0.02288404 0.007160897
## HGB 0.06105056 -0.10980629 0.69545535 0.098502588
## HCT 0.09598419 0.11252122 -0.68217273 -0.126364759
colfunc<-colorRampPalette(c("red","white","red"))
melt(as.matrix(b)) %>% ggplot(aes(x = Var2, y = Var1, fill = value)) + # prcomp proc
geom_tile() +
scale_fill_gradientn(colours = colfunc(10), limits=c(-1,1)) + theme_minimal() +
xlab(NULL) + ylab(NULL) +
scale_y_discrete(limits=c("HCT","HGB","UO","LMCT","LAPPTIME","LCARDIND","DBP","MAP","SBP"))
#1) ∑VarCi/T ≥ p where p is .70, .80, .90, or some other fixed value.
eig.summ %>% filter(cumvariance<.91) # First 3 components explains 90% of variance
## eig diff variance cumvariance
## 1 3.3493521 0.40160894 0.3721502 0.3721502
## 2 2.9477432 1.98295206 0.3275270 0.6996773
## 3 0.9647911 0.03348613 0.1071990 0.8068763
#2) Another way to choose h is to make scree plot—eigenvalue (Var Ci ) versus component #: Cut-off just before plot flattens out.
fviz_eig(res.pca)
#3) If correlation matrix is used, still another rule is to select all components with eigenvalue ≥ 1 .
# Reasoning is that a component with eigenvalue < 1 explains less than a single (standardized) variable, and so is not worth using.
eig.summ %>% filter(eig>1) # First 2 components has eigenvalues>1
## eig diff variance cumvariance
## 1 3.349352 0.4016089 0.3721502 0.3721502
## 2 2.947743 1.9829521 0.3275270 0.6996773
a <-as.data.frame.matrix(res.pca$loadings) # princomp proc
a[,1]=a[,1]*(sqrt(eig[1]))
a[,2]=a[,2]*(sqrt(eig[2]))
a[,3]=a[,3]*(sqrt(eig[3]))
a[,4]=a[,4]*(sqrt(eig[4]))
a[,5]=a[,5]*(sqrt(eig[5]))
a[,6]=a[,6]*(sqrt(eig[6]))
a[,7]=a[,7]*(sqrt(eig[7]))
a[,8]=a[,8]*(sqrt(eig[8]))
a[,9]=a[,9]*(sqrt(eig[9]))
a$var=rownames(a)
colfunc2<-colorRampPalette(c("blue","white","red"))
melt(a) %>% ggplot(aes(x = variable, y = var, fill = value)) + geom_tile() +
scale_fill_gradientn(colours = colfunc2(10), limits=c(-1,1)) + theme_minimal() +
xlab(NULL) + ylab(NULL) +
scale_y_discrete(limits=c("HCT","HGB","UO","LMCT","LAPPTIME","LCARDIND","DBP","MAP","SBP")) +
geom_point(aes(size= ifelse(value>.5 | value<(-.5),1,NA)), col="#234458") +
guides(size = "none")
# We could use aij ’s to compute scores for each subject and use in place of Xj ’s. This is the only use for aij ’s.
# (SAS output pag.61)
pred.pca <- as.data.frame(predict(res.pca, newdata = dat)) %>% bind_cols(dat)
res.pca$rotation
## NULL
(eig.summ <- eig.summ %>% mutate(a_crit=.5/(sqrt(eig))))
## eig diff variance cumvariance a_crit
## 1 3.34935212 0.401608942 0.372150236 0.3721502 0.2732056
## 2 2.94774318 1.982952058 0.327527020 0.6996773 0.2912227
## 3 0.96479112 0.033486131 0.107199013 0.8068763 0.5090417
## 4 0.93130499 0.441601447 0.103478332 0.9103546 0.5181125
## 5 0.48970354 0.334187926 0.054411505 0.9647661 0.7145019
## 6 0.15551562 0.045660074 0.017279513 0.9820456 1.2678942
## 7 0.10985554 0.082358229 0.012206172 0.9942518 1.5085476
## 8 0.02749731 0.003260748 0.003055257 0.9973070 3.0152607
## 9 0.02423657 NA 0.002692952 1.0000000 3.2116962
summary(res.pca)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 1.8301235 1.7168993 0.9822378 0.9650414 0.6997882
## Proportion of Variance 0.3721502 0.3275270 0.1071990 0.1034783 0.0544115
## Cumulative Proportion 0.3721502 0.6996773 0.8068763 0.9103546 0.9647661
## Comp.6 Comp.7 Comp.8 Comp.9
## Standard deviation 0.39435469 0.33144463 0.165823143 0.155680978
## Proportion of Variance 0.01727951 0.01220617 0.003055257 0.002692952
## Cumulative Proportion 0.98204562 0.99425179 0.997307048 1.000000000
# Graph of individuals. Individuals with a similar profile are grouped together.
fviz_pca_ind(res.pca,
col.ind = "cos2", # Color by the quality of representation
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE) # Avoid text overlapping
# Graph of variables. Positive correlated variables point to the same side of the plot. Negative correlated variables point to opposite sides of the graph.
fviz_pca_var(res.pca,
col.var = "contrib", # Color by contributions to the PC
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE) # Avoid text overlapping
# Biplot of individuals and variables
fviz_pca_biplot(res.pca, repel = TRUE,
col.var = "#2E9FDF", # Variables color
col.ind = "#696969" # Individuals color
)