Inquiries regarding this code contact gabriel.carrasco@upch.pe
rm(list=ls())
library(tidyverse); library(epiDisplay); library(tidyr); library(haven); library(MASS); library(doBy)
library(mvnfast);library(DiscriMiner); library(haven); library(doBy)
head(dat)
## MAP HCT LCARDIND LAPPTIME LMCT UO SHOCKGRP
## 1 88 34 -0.18045606 1.0606978 1.352183 110 1
## 2 115 39 0.55022835 0.9138139 1.193125 40 1
## 3 101 41 0.60745502 0.7481880 1.096910 0 1
## 4 83 46 -0.02227639 0.8061800 1.579784 0 1
## 5 65 42 0.27875360 1.1003705 1.472756 42 1
## 6 59 28 0.54157924 0.9542425 1.225309 0 1
# Simple statistics
options(scipen=999)
a<-as.data.frame(t(summaryBy(.~SHOCKGRP, data=as.data.frame(dat), FUN=c(length,mean, var, sd))))
a[order(row.names(a)),]
## 1 2
## HCT.length 34.00000000 79.00000000
## HCT.mean 33.86764706 35.31392405
## HCT.sd 7.09919844 8.09483428
## HCT.var 50.39861854 65.52634210
## LAPPTIME.length 34.00000000 79.00000000
## LAPPTIME.mean 0.88981480 0.98558421
## LAPPTIME.sd 0.24013079 0.20413907
## LAPPTIME.var 0.05766279 0.04167276
## LCARDIND.length 34.00000000 79.00000000
## LCARDIND.mean 0.41819672 0.29914843
## LCARDIND.sd 0.27478439 0.26523784
## LCARDIND.var 0.07550646 0.07035111
## LMCT.length 34.00000000 79.00000000
## LMCT.mean 1.25342078 1.34094148
## LMCT.sd 0.18819632 0.18873614
## LMCT.var 0.03541786 0.03562133
## MAP.length 34.00000000 79.00000000
## MAP.mean 85.52941176 68.21518987
## MAP.sd 18.12454437 21.56128468
## MAP.var 328.49910873 464.88899708
## SHOCKGRP 1.00000000 2.00000000
## UO.length 34.00000000 79.00000000
## UO.mean 103.67647059 33.36708861
## UO.sd 156.63501690 78.90064788
## UO.var 24534.52852050 6225.31223629
# Generalized Squared Distance to SHOCKGRP
#***Falta
# Multivariate Analysis of Variance (MANOVA)
summary(manova(cbind(MAP,HCT,LCARDIND,LAPPTIME,LMCT,UO)~SHOCKGRP,data=dat),test="Wilks")
## Df Wilks approx F num Df den Df Pr(>F)
## SHOCKGRP 1 0.78021 4.9768 6 106 0.0001584 ***
## Residuals 111
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(manova(cbind(MAP,HCT,LCARDIND,LAPPTIME,LMCT,UO)~SHOCKGRP,data=dat),test="Pillai")
## Df Pillai approx F num Df den Df Pr(>F)
## SHOCKGRP 1 0.21979 4.9768 6 106 0.0001584 ***
## Residuals 111
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(manova(cbind(MAP,HCT,LCARDIND,LAPPTIME,LMCT,UO)~SHOCKGRP,data=dat),test='Hotelling-Lawley')
## Df Hotelling-Lawley approx F num Df den Df Pr(>F)
## SHOCKGRP 1 0.2817 4.9768 6 106 0.0001584 ***
## Residuals 111
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(manova(cbind(MAP,HCT,LCARDIND,LAPPTIME,LMCT,UO)~SHOCKGRP,data=dat),test="Roy")
## Df Roy approx F num Df den Df Pr(>F)
## SHOCKGRP 1 0.2817 4.9768 6 106 0.0001584 ***
## Residuals 111
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Linear Discriminant Function
lda <- linDA(variables=dat[,1:6],group = dat[,7],prior = c(0.5,0.5)) # Equal priors - unknown
summary(lda)
## Length Class Mode
## functions 14 -none- numeric
## confusion 4 table numeric
## scores 226 -none- numeric
## classification 113 factor numeric
## error_rate 1 -none- numeric
## specs 6 -none- list
lda$functions
## 1 2
## constant -85.24037672 -81.61191980
## MAP 0.22913827 0.18711223
## HCT 0.28466034 0.30503554
## LCARDIND 52.64757863 51.14622554
## LAPPTIME -39.84210643 -38.93587888
## LMCT 120.85697747 120.04070271
## UO 0.01741983 0.01223854
# Posterior probability of membership (calculates l*)
lda.pred <- lda(SHOCKGRP ~ ., prior=c(0.5,0.5), data=dat)
head(dt.lda.pred <- lda.pred %>% predict(dat) %>% as.data.frame() %>%
mutate(SHOCKGRP=dat$SHOCKGRP, miss=ifelse(class==SHOCKGRP,"","*")))
## class posterior.1 posterior.2 LD1 SHOCKGRP miss
## 1 2 0.4547765 0.5452235 0.1581498 1 *
## 2 1 0.8304808 0.1695192 -1.3854488 1
## 3 1 0.7130302 0.2869698 -0.7935374 1
## 4 2 0.3653903 0.6346097 0.4813153 1 *
## 5 2 0.2867108 0.7132892 0.7946415 1 *
## 6 2 0.3162854 0.6837146 0.6721277 1 *
# There is no constant in R because by default, R function 'lda' from the MASS package, centers the data.
#For obtain constant term in Discriminant Analysis on R (with library MASS):
#groupmean<-(lda$prior%*%lda$means)
#constant<-(groupmean%*%lda$scaling)
# Error matrix
lda$confusion
## predicted
## original 1 2
## 1 20 14
## 2 16 63
prop.table(lda$confusion,1)
## predicted
## original 1 2
## 1 0.5882353 0.4117647
## 2 0.2025316 0.7974684
1-sum(diag(prop.table(lda$confusion))) # total error rate
## [1] 0.2654867
(err<-tabpct(dt.lda.pred$SHOCKGRP,dt.lda.pred$class))
##
## Original table
## dt.lda.pred$class
## dt.lda.pred$SHOCKGRP 1 2 Total
## 1 20 14 34
## 2 16 63 79
## Total 36 77 113
##
## Row percent
## dt.lda.pred$class
## dt.lda.pred$SHOCKGRP 1 2 Total
## 1 20 14 34
## (58.8) (41.2) (100)
## 2 16 63 79
## (20.3) (79.7) (100)
##
## Column percent
## dt.lda.pred$class
## dt.lda.pred$SHOCKGRP 1 % 2 %
## 1 20 (55.6) 14 (18.2)
## 2 16 (44.4) 63 (81.8)
## Total 36 (100) 77 (100)
## $table.row.percent
## dt.lda.pred$class
## dt.lda.pred$SHOCKGRP 1 2
## 1 58.82353 41.17647
## 2 20.25316 79.74684
##
## $table.column.percent
## dt.lda.pred$class
## dt.lda.pred$SHOCKGRP 1 2
## 1 55.55556 18.18182
## 2 44.44444 81.81818
# Adjust PRIORS
tab1(dat$SHOCKGRP)
## dat$SHOCKGRP :
## Frequency Percent Cum. percent
## 1 34 30.1 30.1
## 2 79 69.9 100.0
## Total 113 100.0 100.0
# Suppose costs of misclassification are c1 = 1, c2 = 3. Then p1′ = p1c1/p1c1+p2c2 = .30(1)/.30(1)+.70(3) =.125
# p2' = p2'=p2c2/p1c1+p2c2 = .70(3)/.30(1)+.70(3) =.875
lda.pp <- linDA(variables=dat[,1:6],group = dat[,7], validation = "crossval")
lda.pp.pred <- lda(SHOCKGRP ~ ., prior=c(0.125,0.875), data=dat, CV=TRUE)
dt.lda.pp.pred <- data.frame(class=lda.pp.pred$class, posterior=lda.pp.pred$posterior,SHOCKGRP=dat$SHOCKGRP) %>%
mutate(miss=ifelse(class==SHOCKGRP,"","*"))
(err.pp<-tabpct(dt.lda.pp.pred$SHOCKGRP,dt.lda.pp.pred$class))
##
## Original table
## dt.lda.pp.pred$class
## dt.lda.pp.pred$SHOCKGRP 1 2 Total
## 1 6 28 34
## 2 2 77 79
## Total 8 105 113
##
## Row percent
## dt.lda.pp.pred$class
## dt.lda.pp.pred$SHOCKGRP 1 2 Total
## 1 6 28 34
## (17.6) (82.4) (100)
## 2 2 77 79
## (2.5) (97.5) (100)
##
## Column percent
## dt.lda.pp.pred$class
## dt.lda.pp.pred$SHOCKGRP 1 % 2 %
## 1 6 (75) 28 (26.7)
## 2 2 (25) 77 (73.3)
## Total 8 (100) 105 (100)
## $table.row.percent
## dt.lda.pp.pred$class
## dt.lda.pp.pred$SHOCKGRP 1 2
## 1 17.647059 82.352941
## 2 2.531646 97.468354
##
## $table.column.percent
## dt.lda.pp.pred$class
## dt.lda.pp.pred$SHOCKGRP 1 2
## 1 75.00000 26.66667
## 2 25.00000 73.33333
1-sum(diag(prop.table(table(dt.lda.pp.pred$SHOCKGRP,dt.lda.pp.pred$class)))) # total error rate
## [1] 0.2654867
# Priors are higher for SHOCK, thus fewer people will be classified into NOSHOCK
# Go to SAS output, pp 88-98
lda.jkn <- linDA(variables=dat[,1:6],group = dat[,7], validation = "crossval")
lda.jkn.pred <- lda(SHOCKGRP ~ ., prior=c(0.5,0.5), data=dat, CV=TRUE)
dt.lda.jkn.pred <- data.frame(class=lda.jkn.pred$class, posterior=lda.jkn.pred$posterior,SHOCKGRP=dat$SHOCKGRP) %>%
mutate(miss=ifelse(class==SHOCKGRP,"","*"))
# Error rate
(err.jkn<-tabpct(dt.lda.jkn.pred$SHOCKGRP,dt.lda.jkn.pred$class))
##
## Original table
## dt.lda.jkn.pred$class
## dt.lda.jkn.pred$SHOCKGRP 1 2 Total
## 1 19 15 34
## 2 19 60 79
## Total 38 75 113
##
## Row percent
## dt.lda.jkn.pred$class
## dt.lda.jkn.pred$SHOCKGRP 1 2 Total
## 1 19 15 34
## (55.9) (44.1) (100)
## 2 19 60 79
## (24.1) (75.9) (100)
##
## Column percent
## dt.lda.jkn.pred$class
## dt.lda.jkn.pred$SHOCKGRP 1 % 2 %
## 1 19 (50) 15 (20)
## 2 19 (50) 60 (80)
## Total 38 (100) 75 (100)
## $table.row.percent
## dt.lda.jkn.pred$class
## dt.lda.jkn.pred$SHOCKGRP 1 2
## 1 55.88235 44.11765
## 2 24.05063 75.94937
##
## $table.column.percent
## dt.lda.jkn.pred$class
## dt.lda.jkn.pred$SHOCKGRP 1 2
## 1 50 20
## 2 50 80
1-sum(diag(prop.table(table(dt.lda.jkn.pred$SHOCKGRP,dt.lda.jkn.pred$class)))) # total error rate
## [1] 0.300885
head(dat)
## Prin1 Prin2 UO SHOCKGRP
## 1 1.170012573 0.5269088 110 1
## 2 0.657838528 3.1610322 40 1
## 3 -0.006187395 2.4082767 0 1
## 4 2.512347511 0.2266346 0 1
## 5 1.835852042 0.7102252 42 1
## 6 -1.500648679 -0.5371938 0 1
# Simple statistics
options(scipen=999)
a<-as.data.frame(t(summaryBy(.~SHOCKGRP, data=as.data.frame(dat), FUN=c(length,mean, var, sd))))
a[order(row.names(a)),]
## 1 2
## Prin1.length 34.0000000 79.0000000
## Prin1.mean -0.3276147 0.1409988
## Prin1.sd 1.6008693 1.9126293
## Prin1.var 2.5627824 3.6581510
## Prin2.length 34.0000000 79.0000000
## Prin2.mean 1.1283780 -0.4856310
## Prin2.sd 1.5026979 1.5758949
## Prin2.var 2.2581011 2.4834447
## SHOCKGRP 1.0000000 2.0000000
## UO.length 34.0000000 79.0000000
## UO.mean 103.6764706 33.3670886
## UO.sd 156.6350169 78.9006479
## UO.var 24534.5285205 6225.3122363
# Multivariate Analysis of Variance (MANOVA)
summary(manova(cbind(Prin1,Prin2,UO)~SHOCKGRP,data=dat),test="Wilks")
## Df Wilks approx F num Df den Df Pr(>F)
## SHOCKGRP 1 0.77466 10.569 3 109 0.000003712 ***
## Residuals 111
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Linear Discriminant Function
lda.pca <- linDA(variables=dat[,1:3],group = dat[,4], validation = "crossval") # Equal priors - unknown
lda.pca$functions
## 1 2
## constant -1.823135137 -0.488526363
## Prin1 -0.043106088 0.089892014
## Prin2 0.372648670 -0.253727161
## UO 0.007808955 0.003754608
# Posterior probability of membership (calculates l*)
lda.pca.pred <- lda(SHOCKGRP ~ ., prior=c(0.5,0.5), data=dat, CV=TRUE)
head(dt.lda.pca.pred <- data.frame(class=lda.pca.pred$class, posterior=lda.pca.pred$posterior,SHOCKGRP=dat$SHOCKGRP) %>%
mutate(miss=ifelse(class==SHOCKGRP,"","*")))
## class posterior.1 posterior.2 SHOCKGRP miss
## 1 1 0.5238408 0.4761592 1
## 2 1 0.8168466 0.1831534 1
## 3 1 0.7220861 0.2779139 1
## 4 2 0.3074135 0.6925865 1 *
## 5 2 0.4553771 0.5446229 1 *
## 6 2 0.3287087 0.6712913 1 *
(err.pca<-tabpct(dt.lda.pca.pred$SHOCKGRP,dt.lda.pca.pred$class))
##
## Original table
## dt.lda.pca.pred$class
## dt.lda.pca.pred$SHOCKGRP 1 2 Total
## 1 22 12 34
## 2 17 62 79
## Total 39 74 113
##
## Row percent
## dt.lda.pca.pred$class
## dt.lda.pca.pred$SHOCKGRP 1 2 Total
## 1 22 12 34
## (64.7) (35.3) (100)
## 2 17 62 79
## (21.5) (78.5) (100)
##
## Column percent
## dt.lda.pca.pred$class
## dt.lda.pca.pred$SHOCKGRP 1 % 2 %
## 1 22 (56.4) 12 (16.2)
## 2 17 (43.6) 62 (83.8)
## Total 39 (100) 74 (100)
## $table.row.percent
## dt.lda.pca.pred$class
## dt.lda.pca.pred$SHOCKGRP 1 2
## 1 64.70588 35.29412
## 2 21.51899 78.48101
##
## $table.column.percent
## dt.lda.pca.pred$class
## dt.lda.pca.pred$SHOCKGRP 1 2
## 1 56.41026 16.21622
## 2 43.58974 83.78378
head(dat)
## Prin1 Prin2 UO SHOCKTYP
## 1 1.170012573 0.5269088 110 NONSHOCK
## 2 0.657838528 3.1610322 40 NONSHOCK
## 3 -0.006187395 2.4082767 0 NONSHOCK
## 4 2.512347511 0.2266346 0 NONSHOCK
## 5 1.835852042 0.7102252 42 NONSHOCK
## 6 -1.500648679 -0.5371938 0 NONSHOCK
# Simple statistics
options(scipen=999)
a<-as.data.frame(t(summaryBy(.~SHOCKTYP, data=as.data.frame(dat), FUN=c(length,mean, var, sd))))
a[order(row.names(a)),]
## 1 2 3 4 5
## Prin1.length 34 17 20 16 16
## Prin1.mean -0.32761474 -0.60400587 1.77016747 -1.28780551 0.09293491
## Prin1.sd 1.600869 1.980040 1.481867 1.207295 1.608278
## Prin1.var 2.562782 3.920556 2.195929 1.457562 2.586559
## Prin2.length 34 17 20 16 16
## Prin2.mean 1.1283780 -0.6584137 -0.5598924 -0.4682162 -0.1417048
## Prin2.sd 1.5026979 1.9071942 1.6501165 0.9563811 1.4787739
## Prin2.var 2.2581011 3.6373896 2.7228845 0.9146648 2.1867724
## SHOCKTYP NONSHOCK HYPOVOL CARDIO BACTER NEURO
## UO.length 34 17 20 16 16
## UO.mean 103.67647 31.76471 16.10000 32.81250 71.87500
## UO.sd 156.63502 92.63539 32.96873 83.92157 111.09688
## UO.var 24534.529 8581.316 1086.937 7042.829 12342.517
# Multivariate Analysis of Variance (MANOVA)
summary(manova(cbind(Prin1,Prin2,UO)~SHOCKTYP,data=dat),test="Wilks") # test=“Pillai”, “Wilks”, “Hotelling-Lawley”, “Roy”
## Df Wilks approx F num Df den Df Pr(>F)
## SHOCKTYP 4 0.54011 5.555 12 254.28 0.00000001959 ***
## Residuals 98
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Since it is statistically significant it tells us one or more of the groups are different
# from each other. However, no pairwise tests are provided to assess which groups differ.
# Linear Discriminant Function
lda.pca <- linDA(variables=dat[,1:3],group = dat[,4], validation = "crossval") # Equal priors - unknown
lda.pca$functions
## NONSHOCK HYPOVOL CARDIO BACTER NEURO
## constant -1.708014013 -1.997795203 -2.449846095 -2.237278664 -2.101434579
## Prin1 -0.086184902 -0.180175287 0.768336704 -0.468201822 0.118731879
## Prin2 0.398723142 -0.297379067 -0.358435601 -0.178829361 -0.150133837
## UO 0.006955727 0.002768284 0.003784395 0.001938076 0.006209108
# Posterior probability of membership (calculates l*)
lda.pca.pred <- lda(SHOCKTYP ~ ., prior=c(0.2,0.2,0.2,0.2,0.2), data=dat, CV=TRUE)
head(dt.lda.pca.pred <- data.frame(class=lda.pca.pred$class, posterior=lda.pca.pred$posterior,SHOCKTYP=dat$SHOCKTYP) %>%
mutate(miss=ifelse(class==SHOCKTYP,"","*")))
## class posterior.NONSHOCK posterior.HYPOVOL posterior.CARDIO
## 1 NEURO 0.22907889 0.14052676 0.24906010
## 2 NONSHOCK 0.58823134 0.08216509 0.07328114
## 3 NONSHOCK 0.46031030 0.13688351 0.06422665
## 4 CARDIO 0.08170681 0.09659001 0.57884868
## 5 CARDIO 0.17604798 0.12237064 0.37875957
## 6 BACTER 0.11151153 0.30578235 0.04143129
## posterior.BACTER posterior.NEURO SHOCKTYP miss
## 1 0.08143475 0.2998995 NONSHOCK *
## 2 0.08004616 0.1762763 NONSHOCK
## 3 0.15292417 0.1856554 NONSHOCK
## 4 0.03928463 0.2035699 NONSHOCK *
## 5 0.06293128 0.2598905 NONSHOCK *
## 6 0.36939600 0.1718788 NONSHOCK *
# Error matrix
lda.pca$confusion
## predicted
## original NONSHOCK HYPOVOL CARDIO BACTER NEURO
## NONSHOCK 24 2 5 3 0
## HYPOVOL 5 4 3 5 0
## CARDIO 1 2 16 1 0
## BACTER 4 1 2 9 0
## NEURO 11 1 2 1 1
(err.pca<-tabpct(dt.lda.pca.pred$SHOCKTYP,dt.lda.pca.pred$class))
##
## Original table
## dt.lda.pca.pred$class
## dt.lda.pca.pred$SHOCKTYP NONSHOCK HYPOVOL CARDIO BACTER NEURO Total
## NONSHOCK 17 3 5 6 3 34
## HYPOVOL 3 4 3 7 0 17
## CARDIO 1 2 16 1 0 20
## BACTER 2 2 2 9 1 16
## NEURO 4 3 2 3 4 16
## Total 27 14 28 26 8 103
##
## Row percent
## dt.lda.pca.pred$class
## dt.lda.pca.pred$SHOCKTYP NONSHOCK HYPOVOL CARDIO BACTER NEURO Total
## NONSHOCK 17 3 5 6 3 34
## (50) (8.8) (14.7) (17.6) (8.8) (100)
## HYPOVOL 3 4 3 7 0 17
## (17.6) (23.5) (17.6) (41.2) (0) (100)
## CARDIO 1 2 16 1 0 20
## (5) (10) (80) (5) (0) (100)
## BACTER 2 2 2 9 1 16
## (12.5) (12.5) (12.5) (56.2) (6.2) (100)
## NEURO 4 3 2 3 4 16
## (25) (18.8) (12.5) (18.8) (25) (100)
##
## Column percent
## dt.lda.pca.pred$class
## dt.lda.pca.pred$SHOCKTYP NONSHOCK % HYPOVOL % CARDIO
## NONSHOCK 17 (63.0) 3 (21.4) 5
## HYPOVOL 3 (11.1) 4 (28.6) 3
## CARDIO 1 (3.7) 2 (14.3) 16
## BACTER 2 (7.4) 2 (14.3) 2
## NEURO 4 (14.8) 3 (21.4) 2
## Total 27 (100) 14 (100) 28
## dt.lda.pca.pred$class
## dt.lda.pca.pred$SHOCKTYP % BACTER % NEURO %
## NONSHOCK (17.9) 6 (23.1) 3 (37.5)
## HYPOVOL (10.7) 7 (26.9) 0 (0.0)
## CARDIO (57.1) 1 (3.8) 0 (0.0)
## BACTER (7.1) 9 (34.6) 1 (12.5)
## NEURO (7.1) 3 (11.5) 4 (50.0)
## Total (100) 26 (100) 8 (100)
## $table.row.percent
## dt.lda.pca.pred$class
## dt.lda.pca.pred$SHOCKTYP NONSHOCK HYPOVOL CARDIO BACTER NEURO
## NONSHOCK 50.000000 8.823529 14.705882 17.647059 8.823529
## HYPOVOL 17.647059 23.529412 17.647059 41.176471 0.000000
## CARDIO 5.000000 10.000000 80.000000 5.000000 0.000000
## BACTER 12.500000 12.500000 12.500000 56.250000 6.250000
## NEURO 25.000000 18.750000 12.500000 18.750000 25.000000
##
## $table.column.percent
## dt.lda.pca.pred$class
## dt.lda.pca.pred$SHOCKTYP NONSHOCK HYPOVOL CARDIO BACTER NEURO
## NONSHOCK 62.962963 21.428571 17.857143 23.076923 37.500000
## HYPOVOL 11.111111 28.571429 10.714286 26.923077 0.000000
## CARDIO 3.703704 14.285714 57.142857 3.846154 0.000000
## BACTER 7.407407 14.285714 7.142857 34.615385 12.500000
## NEURO 14.814815 21.428571 7.142857 11.538462 50.000000