# Predmet: "Analyza zhlukov a klasifikacia dat" # Studijny program: "Pravdepodobnost a matematicka statistika" # Vyucujuci: Radoslav Harman, KAMS FMFI UK Bratislava # Cvicenie k particnym metodam analyzy zhlukov: porovnanie na umelych datach # Porovnanie viacerych metod zhlukovania na roznych umelych datach # Co maju vsetky porovnania spolocne: # Mame len 2 zhluky (a algoritmy to vedia) # Pocet objektov/bodov v kazdom zhluku je rovnaky a "maly" porovnaj <- function(X) { # X je matica realnych cisel velkosti N krat 2 # Funkcia nakresli zlukovanie na dva zhluky # pomocou metod k-means, k-medoids, Mclust, dbscan par(mfrow = c(2, 2)) # k-means clust.kmeans <- kmeans(X, centers = 2, nstart = 1000)$cluster plot(X, pch = 19, col = clust.kmeans, main = "k-means", xlab = "x", ylab = "y", asp = 1) grid() # k-medoids library(cluster) clust.kmedoids <- pam(X, k = 2)$cluster plot(X, pch = 19, col = clust.kmedoids, main = "k-medoids", xlab = "x", ylab = "y", asp = 1) grid() # model-based clustering library(mclust) clust.model <- Mclust(X, G = 2, modelName = "VVV")$classification plot(X, pch = 19, col = clust.model, main = "model-based", xlab = "x", ylab = "y", asp = 1) grid() # DBScan library(dbscan) # Heuristicky napad: # Najdeme take eps, ktore dava dva zhluky (a sum) a maximalizuje n1*n2, # kde n1 je velkost prveho zhluku a n2 je velkost druheho zhluku # Nasledovne priamociare prehladavanie obcas zlyha, ale nam staci eps.vec <- seq(0.1, 5, by = 0.001) n.eps <- length(eps.vec) res <- c(); crit <- c() for (i in 1:n.eps) { clust.db <- dbscan(X, eps = eps.vec[i])$cluster if ((sum(clust.db == 2) > 0) && (sum(clust.db == 3) == 0)) { res <- c(res, eps.vec[i]) n1 <- sum(clust.db == 1) n2 <- sum(clust.db == 2) crit <- c(crit, n1*n2) } } eps2 <- res[which.max(crit)] clust.db <- 3 - dbscan(X, eps = eps2)$cluster plot(X, pch = 19, col = clust.db, main = "DBScan", xlab = "x", ylab = "y", asp = 1) grid() par(mfrow = c(1, 1)) } n <- 100 # Kazdy zhluk bude mat rovnaky pocet bodov (mohli by sme variovat aj velkosti) # Jasne oddelene gulovite zhluky rovnakej hustoty data1x <- rnorm(n, mean = -3); data1y <- rnorm(n) data2x <- rnorm(n, mean = 3); data2y <- rnorm(n) X <- rbind(cbind(data1x, data1y), cbind(data2x, data2y)) porovnaj(X) # model-based obcas pozoruhodne "zlyha" # Jasne oddelene elipticke zhluky rovnakej hustoty data1x <- rnorm(n, mean = -3); data1y <- rnorm(n, sd = 5) data2x <- rnorm(n, mean = 3); data2y <- rnorm(n, sd = 5) X <- rbind(cbind(data1x, data1y), cbind(data2x, data2y)) porovnaj(X) # Jasne oddelene gulovite zhluky roznej hustoty data1x <- rnorm(n, mean = -3, sd = 2); data1y <- rnorm(n, sd = 3) data2x <- rnorm(n, mean = 3, sd = 0.5); data2y <- rnorm(n, sd = 0.5) X <- rbind(cbind(data1x, data1y), cbind(data2x, data2y)) porovnaj(X) # Kriziace sa elipticke zhluky rovnakej hustoty data1x <- rnorm(n, sd = 0.25); data1y <- rnorm(n, sd = 4) data2x <- rnorm(n, sd = 4); data2y <- rnorm(n, sd = 0.25) X <- rbind(cbind(data1x, data1y), cbind(data2x, data2y)) porovnaj(X) # Zhluk mensej hustoty v zhluku vacsej hustoty data1x <- rnorm(n, sd = 0.5); data1y <- rnorm(n, sd = 0.5) data2x <- rnorm(n, sd = 3); data2y <- rnorm(n, sd = 3) X <- rbind(cbind(data1x, data1y), cbind(data2x, data2y)) porovnaj(X) # Malo separovane prstencovite zhluky r.annulus <- function(n, R1, R2) { # Generuje m bodov v medzikruzi s vnutornym a vonkajsim polomerom R1, resp. R2 psi <- 2*pi*runif(n) R <- sqrt(runif(n)*(R2^2 - R1^2) + R1^2) return(R*cbind(cos(psi), sin(psi))) } X1 <- r.annulus(n, 0.2, 0.5) X2 <- r.annulus(n, 0.7, 1) X <- rbind(X1, X2) porovnaj(X) # Velmi separovane prstencovite zhluky X1 <- r.annulus(n, 0.1, 0.3) X2 <- r.annulus(n, 0.8, 1) X <- rbind(X1, X2) porovnaj(X) # Zakliesnene bananovite zhluky X1 <- r.annulus(2*n, 0.8, 1.2) X1 <- X1[X1[, 2] > 0, ] X1[, 2] <- X1[, 2] - 0.2 X1[, 1] <- X1[, 1] - 0.5 X2 <- r.annulus(2*n, 0.8, 1.2) X2 <- X2[X2[, 2] < 0, ] X2[, 2] <- X2[, 2] + 0.2 X2[, 1] <- X2[, 1] + 0.5 X <- rbind(X1, X2) porovnaj(X)