Inquiries regarding this code contact gabriel.carrasco@upch.pe

rm(list=ls())
library(tidyverse); library(epiDisplay); library(cluster); library(factoextra); library(TDA)
library(purrr); library(dendextend); library(tidyr)

Distance between two points

head(dat_a)

##       LOAG LSULFIDE     LNH3N      LAS       LCD   CR  CU      LPB
## 1 8.221748 6.269096 0.9707789 5.209486 0.5877867 30.9 173 4.820282
## 2 8.455531 6.594413 1.2412686 5.318120 0.7419373 40.1 211 5.288267
## 3 7.812378 6.204558 1.3737156 4.795791 0.4252677 29.0 179 4.997212
## 4 6.216606 6.447306 1.4906544 3.637586 0.3220835 20.0  53 3.737670
## 5 4.709530 1.481605 0.2776317 2.484907 0.1222176 10.2  13 1.945910
## 6 6.781058 3.610918 0.4187103 4.330733 0.2926696 22.6  72 4.248495
##        LZN    FE  MN     LPAH    LPHTH
## 1 5.560682 21600 219 8.846209 7.102499
## 2 5.814131 23700 302 9.051813 9.118006
## 3 5.313206 19300 230 8.496174 6.782192
## 4 5.049856 13800 179 7.343426 4.882802
## 5 3.367296  7380  69 3.332205 4.955827
## 6 4.890349 19300 177 6.873164 6.732211

d <- get_dist(dat_a,method = "euclidean") # method="euclidean", "maximum", "manhattan", "canberra", "binary", "minkowski", "pearson", "spearman" or "kendall"
fviz_dist(d, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

# In SAS and R, the Euclidean distance is used by default.

Methods of forming clusters

Centroid (“METHOD = CENTROID” in SAS)

Distance between clusters is defined as distance between centroids (using chosen distance measure—Euclidean, Mahalanobis, etc.).

hc1 <- hclust(d, method = "centroid")
labels(hc1)<-dat$STATION[hc1$order]
plot(hc1, cex = 0.6, hang = -1)

Nearest neighbor (“METHOD = SINGLE” in SAS)

Calculate all distances between points in one cluster and points in other cluster (for instance, if 7 points in one cluster and 5 points in other, then 7 × 5 = 35 distances to calculate). Distance between clusters is defined as shortest of distances calculated.

hc2 <- hclust(d, method = "single" )
labels(hc2)<-dat$STATION[hc2$order]
plot(hc2, cex = 0.6, hang = -1)

# Alternative compute with agnes [cluster package]
hc2.1 <- agnes(dat_a, method = "single")
pltree(hc2.1, cex = 0.6, hang = -1)

# Agglomerative coefficient
hc2.1$ac

## [1] 0.7755984

Average linkage (“METHOD = AVERAGE” in SAS)

Calculate all distances between points in one cluster and points in other cluster. Distance between clusters is defined as the mean of all distances calculated.

hc4 <- hclust(d, method = "average")
labels(hc4)<-dat$STATION[hc4$order]
plot(hc4, cex = 0.6, hang = -1)

hc4.1 <- agnes(dat_a, method = "average")
pltree(hc4.1, cex = 0.6, hang = -1,labels = dat$STATION)

hc4.1$ac

## [1] 0.9351761

Extra:

Compare different clustering methods

# methods to assess
m <- c( "average", "single", "complete", "ward", "weighted")
names(m) <- c( "average", "single", "complete", "ward", "weighted")

# function to compute coefficient
ac <- function(x) {
  agnes(dat_a, method = x)$ac
}

map_dbl(m, ac)

##   average    single  complete      ward  weighted 
## 0.9351761 0.7755984 0.9641158 0.9756548 0.9475407

Determine the optimal number of clusters

fviz_nbclust(dat_a, kmeans, method = "gap_stat")

fviz_nbclust(dat_a, FUN = hcut, method = "wss",k.max = 20)

# Cut in 3 groups and color by groups
fviz_dend(hc1, k = 3, # Cut in three groups
          cex = 0.5, # label size
          palette = "jco",
          color_labels_by_k = TRUE, # color labels by groups
          rect = TRUE # Add rectangle around groups
)

df.stats.hc1 <- dat_a %>% mutate(cluster_cent=cutree(hc1, k=3)) %>%
  gather(variable, value, -cluster_cent) %>% mutate(value=as.numeric(value)) %>%
  group_by(cluster_cent, variable) %>%
  dplyr::summarise(N=n(),Mean=mean(value),Std_Dev=sd(value),Min = min(value), Max = max(value),
                   q25 = quantile(value, 0.25), median = median(value), q75 = quantile(value, 0.75))

# Cut in 4 groups and color by groups
fviz_dend(hc4, k = 4, # Cut in four groups
          cex = 0.5, # label size
          k_colors = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"),
          color_labels_by_k = TRUE, # color labels by groups
          rect = TRUE # Add rectangle around groups
)

df.stats.hc4 <- dat_a %>% mutate(cluster_cent=cutree(hc4, k=4)) %>%
  gather(variable, value, -cluster_cent) %>% mutate(value=as.numeric(value)) %>%
  group_by(cluster_cent, variable) %>%
  dplyr::summarise(N=n(),Mean=mean(value),Std_Dev=sd(value),Min = min(value), Max = max(value),
                   q25 = quantile(value, 0.25), median = median(value), q75 = quantile(value, 0.75))

# Compare two dendograms
dend1 <- as.dendrogram (hc1)
dend2 <- as.dendrogram (hc2)
tanglegram(dend1, dend2)

dend_list <- dendlist(dend1, dend2)
tanglegram(dend1, dend2,
           highlight_distinct_edges = FALSE, # Turn-off dashed lines
           common_subtrees_color_lines = FALSE, # Turn-off line colors
           common_subtrees_color_branches = TRUE, # Color common branches 
           main = paste("entanglement =", round(entanglement(dend_list), 2))
)

Routine for PH 628

Lecture 9: Cluster Analysis

Gabriel Carrasco Escobar