# Predmet: "Analyza zhlukov a klasifikacia dat" # Studijny program: "Pravdepodobnost a matematicka statistika" # Vyucujuci: Radoslav Harman, KAMS FMFI UK Bratislava # Analyza zhlukov 1: k-means, k-medoids, model-based clustering, DBScan planets <- read.csv("C:/planets.csv") # Data som vytvoril specialnou selekciou a poolovanim zaznamov z databaz: # https://exoplanetarchive.ipac.caltech.edu/docs/data.html # Data su prisposobene pedagogickym ucelom, nie su podkladom pre vedecky vyskum! # Hladame zhluky v *objavenych* planetach, ktore su zrejme nereprezentativnym vyberom... dim(planets) n <- nrow(planets) planets table(planets$discoverymethod) planets[planets$discoverymethod == "Orbital Brightness Modulation", ] # COLUMN pl_name: Planet Name # COLUMN discoverymethod: Discovery Method # COLUMN disc_year: Discovery Year # COLUMN pl_orbper: Orbital Period [days] # COLUMN pl_orbsmax: Orbit Semi-Major Axis [au] # COLUMN pl_rade: Planet Radius [Earth Radius] # COLUMN pl_bmasse: Planet Mass [Earth Mass] # COLUMN pl_orbeccen: Eccentricity # COLUMN st_rad: Stellar Radius [Solar Radius] # COLUMN st_mass: Stellar Mass [Solar mass] plot(planets[, 4:10], pch = 19, cex = 0.1) planets[planets$pl_orbper > 600, ] planets <- planets[planets$pl_orbper < 600, ] # Urobime jednoduchy feature engineering par(mfrow = c(2, 5)) for (i in 4:8) hist(planets[, i], main = names(planets)[i]) for (i in 4:8) hist(log(planets[, i]), main = names(planets)[i]) par(mfrow = c(1, 1)) for (i in c(4,5,8)) planets[, i] <- log(planets[, i]) planets[, 7] <- (planets[, 7])^(1/3) plot(planets[, 4:8], pch = 19, cex = 0.1) apply(planets[, 4:8], 2, sd) planets[, 4:8] <- scale(planets[, 4:8]) plot(prcomp(planets[, 4:8])$x[, 1:2], pch = 19) ### k-means planets.kmeans <- kmeans(planets[, 4:8], centers = 4, nstart = 1000) planets.kmeans planets.kmeans$cluster planets.kmeans$withinss sum(planets.kmeans$withinss) planets.kmeans$tot.withinss # Rozklad "celkovej sumy stvorcov" planets.kmeans$betweenss planets.kmeans$betweenss + planets.kmeans$tot.withinss planets.kmeans$totss plot(planets[, 4:8], pch = 19, cex = 0.1, col = planets.kmeans$cluster) plot(prcomp(planets[, 4:8])$x[, 1:2], pch = 19, col = planets.kmeans$cluster) # Skusme si nakreslit 3D obrazok library(rgl) X <- prcomp(planets[, 4:8])$x[, 1:3] plot3d(X, col = planets.kmeans$cluster, type = "s", radius = 0.1) # Laktovy diagram best <- rep(0, 10) for (k in 1:10) best[k] <- kmeans(planets[, 4:8], centers = k, nstart = 100)$tot.withinss plot(1:10, best, type = "b") # Skusme teda len 2 zhluky planets.kmeans <- kmeans(planets[, 4:8], centers = 2, nstart = 1000) plot(planets[, 4:8], pch = 19, cex = 0.1, col = planets.kmeans$cluster) plot(prcomp(planets[, 4:8])$x[, 1:2], pch = 19, col = planets.kmeans$cluster) plot3d(X, col = planets.kmeans$cluster, type = "s", radius = 0.1) ### k-medoids library(cluster) planets.kmedoids <- pam(planets[, 4:8], k = 4) planets.kmedoids planets.kmedoids$medoids planets[planets.kmedoids$id.med, ] planets.kmedoids$clustering planets.kmedoids$clusinfo plot(planets[, 4:8], pch = 19, cex = 0.1, col = planets.kmedoids$cluster) plot(prcomp(planets[, 4:8])$x[, 1:2], pch = 19, col = planets.kmedoids$cluster) plot(planets.kmedoids) # Aj toto skusme pre k=2 planets.kmedoids <- pam(planets[, 4:8], k = 2) plot(planets[, 4:8], pch = 19, cex = 0.1, col = planets.kmedoids$cluster) plot(prcomp(planets[, 4:8])$x[, 1:2], pch = 19, col = planets.kmedoids$cluster) plot(planets.kmedoids) planets.kmedoids <- pam(planets[, 4:8], k = 2) ### model-based clustering library(mclust) help(Mclust) planets.model <- Mclust(planets[, 4:8], modelName = "VVV") planets.model planets.model$classification planets.model$parameters planets.model$z plot(planets.model) plot(planets[, 4:8], pch = 19, cex = 0.1, col = planets.model$classification) plot(prcomp(planets[, 4:8])$x[, 1:2], pch = 19, col = planets.model$classification) # Vlastny napad: Analogia silhouette vypocitana z model-based clustering probs <- planets.model$z clust <- planets.model$classification ord <- rep(0, n) for (k in 1:5) plot(sort(probs[clust == k, k]), type = "b", col = k, pch = 19) # Skusme si nakreslit aj tu 3D obrazok plot3d(X, col = planets.model$classification, type = "s", radius = 0.1) ### DBScan library(dbscan) help(dbscan) planets.db <- dbscan(planets[, 4:8], eps = 1) planets.db$cluster # graf "k-distance" head(kNNdist(planets[, 4:8], k = 5)) res <- as.vector(kNNdist(planets[, 4:8], k = 5)) plot(ecdf(res), xlab = "eps", ylab = "podiel bodov") # Pozrime sa na pocty zhlukov pre rozne epsilon (noise sa tu rata ako jeden zhluk) eps.vec <- seq(0.1, 3, by = 0.02) n.eps <- length(eps.vec) n.clst <- rep(0, n.eps) for (i in 1:n.eps) n.clst[i] <- length(table(dbscan(planets[, 4:8], eps = eps.vec[i])$cluster)) plot(eps.vec, n.clst, type = "b", pch = 19); grid() # Zobrazme si 3-PCA priemet pre rozne epsilon library(rgl) X <- prcomp(planets[, 4:8])$x[, 1:3] planets.db <- dbscan(planets[, 4:8], eps = 0.2) plot3d(X, col = planets.db$cluster + 1, type = "s", radius = 0.1) planets.db <- dbscan(planets[, 4:8], eps = 0.4) plot3d(X, col = planets.db$cluster + 1, type = "s", radius = 0.1) planets.db <- dbscan(planets[, 4:8], eps = 0.7) plot3d(X, col = planets.db$cluster + 1, type = "s", radius = 0.1) # Mozeme mat "nizky limit na hustotu pre spajanie" # Niekedy sa odporuca k = 2*p # Mala graficka analyza rozumnych epsilonov res <- as.vector(kNNdist(planets[, 4:8], k = 10)) plot(ecdf(res), xlab = "eps", ylab = "podiel bodov") for (i in 1:n.eps) n.clst[i] <- length(table(dbscan(planets[, 4:8], minPts = 10, eps = eps.vec[i])$cluster)) plot(eps.vec, n.clst, type = "b", pch = 19); grid() # Opat zobrazme si 3-PCA priemet pre rozumne epsilon planets.db <- dbscan(planets[, 4:8], minPts = 10, eps = 0.7) plot3d(X, col = planets.db$cluster + 1, type = "s", radius = 0.1)