Inquiries regarding this code contact gabriel.carrasco@upch.pe
rm(list=ls())
library(tidyverse); library(epiDisplay); library(cluster); library(factoextra); library(TDA)
library(purrr); library(dendextend); library(tidyr)
head(dat_a)
## LOAG LSULFIDE LNH3N LAS LCD CR CU LPB
## 1 8.221748 6.269096 0.9707789 5.209486 0.5877867 30.9 173 4.820282
## 2 8.455531 6.594413 1.2412686 5.318120 0.7419373 40.1 211 5.288267
## 3 7.812378 6.204558 1.3737156 4.795791 0.4252677 29.0 179 4.997212
## 4 6.216606 6.447306 1.4906544 3.637586 0.3220835 20.0 53 3.737670
## 5 4.709530 1.481605 0.2776317 2.484907 0.1222176 10.2 13 1.945910
## 6 6.781058 3.610918 0.4187103 4.330733 0.2926696 22.6 72 4.248495
## LZN FE MN LPAH LPHTH
## 1 5.560682 21600 219 8.846209 7.102499
## 2 5.814131 23700 302 9.051813 9.118006
## 3 5.313206 19300 230 8.496174 6.782192
## 4 5.049856 13800 179 7.343426 4.882802
## 5 3.367296 7380 69 3.332205 4.955827
## 6 4.890349 19300 177 6.873164 6.732211
d <- get_dist(dat_a,method = "euclidean") # method="euclidean", "maximum", "manhattan", "canberra", "binary", "minkowski", "pearson", "spearman" or "kendall"
fviz_dist(d, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
# In SAS and R, the Euclidean distance is used by default.
Distance between clusters is defined as distance between centroids (using chosen distance measure—Euclidean, Mahalanobis, etc.).
hc1 <- hclust(d, method = "centroid")
labels(hc1)<-dat$STATION[hc1$order]
plot(hc1, cex = 0.6, hang = -1)
Calculate all distances between points in one cluster and points in other cluster (for instance, if 7 points in one cluster and 5 points in other, then 7 × 5 = 35 distances to calculate). Distance between clusters is defined as shortest of distances calculated.
hc2 <- hclust(d, method = "single" )
labels(hc2)<-dat$STATION[hc2$order]
plot(hc2, cex = 0.6, hang = -1)
# Alternative compute with agnes [cluster package]
hc2.1 <- agnes(dat_a, method = "single")
pltree(hc2.1, cex = 0.6, hang = -1)
# Agglomerative coefficient
hc2.1$ac
## [1] 0.7755984
Calculate all distances between points in one cluster and points in other cluster. Distance between clusters is defined as the mean of all distances calculated.
hc4 <- hclust(d, method = "average")
labels(hc4)<-dat$STATION[hc4$order]
plot(hc4, cex = 0.6, hang = -1)
hc4.1 <- agnes(dat_a, method = "average")
pltree(hc4.1, cex = 0.6, hang = -1,labels = dat$STATION)
hc4.1$ac
## [1] 0.9351761
# methods to assess
m <- c( "average", "single", "complete", "ward", "weighted")
names(m) <- c( "average", "single", "complete", "ward", "weighted")
# function to compute coefficient
ac <- function(x) {
agnes(dat_a, method = x)$ac
}
map_dbl(m, ac)
## average single complete ward weighted
## 0.9351761 0.7755984 0.9641158 0.9756548 0.9475407
fviz_nbclust(dat_a, kmeans, method = "gap_stat")
fviz_nbclust(dat_a, FUN = hcut, method = "wss",k.max = 20)
# Cut in 3 groups and color by groups
fviz_dend(hc1, k = 3, # Cut in three groups
cex = 0.5, # label size
palette = "jco",
color_labels_by_k = TRUE, # color labels by groups
rect = TRUE # Add rectangle around groups
)
df.stats.hc1 <- dat_a %>% mutate(cluster_cent=cutree(hc1, k=3)) %>%
gather(variable, value, -cluster_cent) %>% mutate(value=as.numeric(value)) %>%
group_by(cluster_cent, variable) %>%
dplyr::summarise(N=n(),Mean=mean(value),Std_Dev=sd(value),Min = min(value), Max = max(value),
q25 = quantile(value, 0.25), median = median(value), q75 = quantile(value, 0.75))
# Cut in 4 groups and color by groups
fviz_dend(hc4, k = 4, # Cut in four groups
cex = 0.5, # label size
k_colors = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"),
color_labels_by_k = TRUE, # color labels by groups
rect = TRUE # Add rectangle around groups
)
df.stats.hc4 <- dat_a %>% mutate(cluster_cent=cutree(hc4, k=4)) %>%
gather(variable, value, -cluster_cent) %>% mutate(value=as.numeric(value)) %>%
group_by(cluster_cent, variable) %>%
dplyr::summarise(N=n(),Mean=mean(value),Std_Dev=sd(value),Min = min(value), Max = max(value),
q25 = quantile(value, 0.25), median = median(value), q75 = quantile(value, 0.75))
# Compare two dendograms
dend1 <- as.dendrogram (hc1)
dend2 <- as.dendrogram (hc2)
tanglegram(dend1, dend2)
dend_list <- dendlist(dend1, dend2)
tanglegram(dend1, dend2,
highlight_distinct_edges = FALSE, # Turn-off dashed lines
common_subtrees_color_lines = FALSE, # Turn-off line colors
common_subtrees_color_branches = TRUE, # Color common branches
main = paste("entanglement =", round(entanglement(dend_list), 2))
)