---
title: "Merging and hierarchical cluster examples"
output:
  prettydoc::html_pretty:
    theme: tactile
    highlight: vignette
vignette: >
  %\VignetteIndexEntry{Merging and hierarchical cluster examples}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
  %\VignetteDepends{ggplot2,patchwork}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  out.width = "100%",
  fig.asp = 0.5,
  dpi = 120
)
```

This section contains several examples of how to merge cluster data, either
generated with **clugenr** or from other sources. Although it is possible to
merge data in any dimension, these examples will focus on merging 2D data.
Therefore, we'll use the same `plot_examples_2d()` function used in
`vignette("examples2d")`, requiring the following setup code:

```{r setup, message=FALSE, warning=FALSE}
# Load the clugenr library
library(clugenr)

# Load functions for plotting examples
source("plot_examples_2d.R", local = knitr::knit_global())

# Keep examples reproducible in newer R versions
RNGversion("3.6.0")
```

## Merging two data sets generated with `clugen()`

```{r}
seed1 <- 4444
seed2 <- 5555
```

```{r}
# First data set, generated with seed1
e090 <- clugen(2, 5, 1000, c(1, 1), pi / 12, c(20, 20), 14, 1.2, 1.5,
               seed = seed1, proj_dist_fn = "unif", point_dist_fn = "n")

# Second data set, generated with seed2 and a non-default cluster offset
e091 <- clugen(2, 3, 1500, c(1, 0), 0.05, c(20, 20), 0, 0, 4,
               seed = seed2, point_dist_fn = "n", cluster_offset = c(20, 0))

# Merge both data sets into a single one
e092 <- clumerge(e090, e091)
```

```{r}
plot_examples_2d(list(e = e090, t = "e090: data set 1"),
                 list(e = e091, t = "e091: data set 2"),
                 list(e = e092, t = "e092: merged data sets"))
```

In the previous example, clusters from individual data sets remain as separate
clusters in the merged data set.
It's also possible to maintain the original cluster labels by setting the
`clusters_field` parameter to `NA`:

```{r}
e093 <- clumerge(e090, e091, clusters_field = NA)
```

```{r}
plot_examples_2d(list(e = e090, t = "e090: data set 1"),
                 list(e = e091, t = "e091: data set 2"),
                 list(e = e093, t = "e093: merged data sets"),
                 palette = "set2")
```

## Adding noise to a `clugen()`-generated data set

```{r}
set.seed(333)

# 500 uniform random 2D points with coordinates between -60 and 60, all
# assigned to cluster 9
e094 <- list(points = matrix(120 * runif(2 * 500) - 60, ncol = 2),
             clusters = factor(rep(9L, 500)))

# Merge the noise with the previously merged data, keeping the noise label
# as 9 (no relabeling, since `clusters_field` is NA)
e095 <- clumerge(e094, e092, clusters_field = NA)
```

```{r}
plot_examples_2d(list(e = e092, t = "e092: original merged data sets"),
                 list(e = e094, t = "e094: random uniform noise"),
                 list(e = e095, t = "e095: data sets with noise"),
                 palette = "set2", pmargin = -0.1)
```

## Merging with data not generated with `clugen()`

Data generated with `clugen()` can be merged with other data sets, for example
as a way of augmenting them. In this example we apply `stats::prcomp()` to the
`datasets::swiss` dataset to reduce its 6 variables to 2.
As there are only 47 observations in this dataset, we'll use `clugen()` to
generate very similar clusters and then use `clumerge()` to create an augmented
dataset:

```{r}
# Seed the global RNG; presumably this is what makes the clugen() call below
# reproducible, since it is not given a `seed` argument
set.seed(888)

# Principal components of the swiss data; keep the first two as 2D points and
# label every observation as cluster 4
spc <- prcomp(swiss)
e096 <- list(points = spc$x[, 1:2],
             clusters = factor(rep(4L, nrow(spc$x))))

# clugen-generated data with explicitly specified cluster sizes, centers,
# angle deltas and line lengths
e097 <- clugen(2, 3, 50, c(-0.3, 1), 0, c(0, 0), 0, 0, 3,
               proj_dist_fn = "unif",
               clusizes_fn = c(20, 6, 25),
               clucenters_fn = matrix(c(-58, -5, 10, -40, 38, 0),
                                      byrow = TRUE, ncol = 2),
               angle_deltas_fn = c(0.02, -0.03, 0.1),
               llengths_fn = c(55, 100, 60))

# Merge both sources, keeping the cluster labels as they are
e098 <- clumerge(e096, e097, clusters_field = NA)
```

```{r}
plot_examples_2d(list(e = e096, t = "e096: swiss PCA"),
                 list(e = e097, t = "e097: clugen-generated"),
                 list(e = e098, t = "e098: merged"),
                 palette = "seaborn")
```

We can also hierarchize clusters from different sources:

```{r}
# Add an `hclusters` field to each data set: label 1 for all swiss PCA points,
# label 2 for all clugen-generated points
e099 <- c(e096, list(hclusters = factor(rep(1L, nrow(spc$x)))))
e100 <- c(e097, list(hclusters = factor(rep(2L, length(e097$clusters)))))

# Merge only the points, using `hclusters` as the cluster labels field
e101 <- clumerge(e099, e100, fields = "points", clusters_field = "hclusters")
```

```{r}
plot_examples_2d(list(e = e099, t = "e099: swiss PCA (1 cluster)"),
                 list(e = e100, t = "e100: clugen-generated (1 cluster)"),
                 list(e = e101, t = "e101: merged (2 clusters)"),
                 clusters_field = "hclusters", palette = "seaborn")
```

Note that augmenting a dataset this way is probably not a statistically sound
approach for most problems.
In any case, if we perform clustering on the merged dataset and assume there
are 3 clusters, the results are interesting:

```{r}
# For display purposes, assign all points in merged data to a single cluster
e102 <- list(points = e101$points,
             hclusters = factor(rep(1L, length(e101$hclusters))))

# Perform hierarchical clustering with Manhattan distance, complete linkage
# (stated explicitly; it is also the hclust() default) and assuming 3 clusters
clusts <- cutree(hclust(dist(e101$points, method = "manhattan"),
                        method = "complete"),
                 k = 3)

# Create data set with merged points and new cluster labels
e103 <- list(points = e101$points, hclusters = factor(clusts))
```

```{r}
plot_examples_2d(list(e = e101, t = "e101: merged (2 clusters)"),
                 list(e = e102, t = "e102: merged (1 cluster)"),
                 list(e = e103, t = "e103: after clustering (3 clusters)"),
                 clusters_field = "hclusters", palette = "seaborn")
```