# Self-organizing maps

library(kohonen)      # SOMs
library(plotly)
library(cluster)      # pam
library(RColorBrewer) # brewer.pal

#================
# Umele data ####

# Vygenerovanie
# (Podla Hastie et al.: Elements of statistical learning, 2nd ed.
# Example 14.5)
n0 <- 30
t1 <- runif(n0, -pi/8, pi/8)
r1 <- runif(n0, 0, 2*pi)
x1 <- sin(t1)*cos(r1) + rnorm(n0, sd=0.3)
y1 <- sin(t1)*sin(r1) + rnorm(n0, sd=0.3)
z1 <- cos(t1) + rnorm(n0, sd=0.3)

t2 <- runif(n0, pi/2-pi/4, pi/2+pi/4)
r2 <- runif(n0, -pi/5, pi/5)
x2 <- sin(t2)*cos(r2) + rnorm(n0, sd=0.3)
y2 <- sin(t2)*sin(r2) + rnorm(n0, sd=0.3)
z2 <- cos(t2) + rnorm(n0, sd=0.3)

t3 <- runif(n0, pi/2-pi/4, pi/2+pi/4)
r3 <- runif(n0, pi/2-pi/5, pi/2+pi/5)
x3 <- sin(t3)*cos(r3) + rnorm(n0, sd=0.3)
y3 <- sin(t3)*sin(r3) + rnorm(n0, sd=0.3)
z3 <- cos(t3) + rnorm(n0, sd=0.3)
X <- rbind(cbind(x1, y1, z1),
           cbind(x2, y2, z2),
           cbind(x3, y3, z3)) # data

lab <- rep(c(1,2,3), each=n0) # triedy

cols0 <- c("#2ca02c","#ff7f0e", "#636efa")

# Vykreslenie
fig <- plot_ly(x = ~X[, 1], 
               y = ~X[, 2], 
               z = ~X[, 3],
               color = ~as.factor(lab),
               colors = cols0)
fig <- fig %>% add_markers()
fig

# ulozenie
#library(htmlwidgets)
#saveWidget(partial_bundle(fig), "11_SOMdata.html")

# SOM
?somgrid
g <- somgrid(xdim = 5, ydim = 5, topo = "rectangular")  # priprava siete
?som
map <- som(X, grid = g)

?plot.kohonen
plot(map, type="mapping", labels=lab, pchs=16, col=cols0[lab])
  # nie realne pozicie v ramci uzlu, len nahodne zasumene

# Uzly
kody <- getCodes(map, 1)  
head(kody)
# reprezentativne vektory uzlov ... codebook vectors

cols <- c("#2ca02c","#ff7f0e", "#636efa", "black")
fig <- plot_ly(x = ~X[, 1], 
               y = ~X[, 2], 
               z = ~X[, 3],
               color = ~as.factor(lab),
               colors = cols)
fig <- fig %>% add_markers()
fig <- fig %>% add_trace(x = ~kody[, 1], 
                         y = ~kody[, 2], 
                         z = ~kody[, 3],
                         mode = "markers", 
                         type = "scatter3d",
                         color = ~as.factor(rep(4, nrow(kody))),
                         colors = cols,
                         marker = list(size = 15))
fig

#library(htmlwidgets)
#saveWidget(partial_bundle(fig), "11_SOMkody.html")

# vs. k-means
set.seed(9846)
umele.km <- kmeans(X, centers=3)$cluster

# ako boli data rozdelene do zhlukov v k-means:
par(mfrow=c(1, 3))
nn <- sum(umele.km==1)
A <- matrix(0, nrow=nn, ncol=2)
plot(jitter(A), pch=16, col=cols[lab[umele.km == 1]])  # zhluk 1

nn <- sum(umele.km==2)
A <- matrix(0, nrow=nn, ncol=2)
plot(jitter(A), pch=16, col=cols[lab[umele.km == 2]])  # zhluk 2

nn <- sum(umele.km==3)
A <- matrix(0, nrow=nn, ncol=2)
plot(jitter(A), pch=16, col=cols[lab[umele.km == 3]])  # zhluk 3
# zase nie suradnice dat, ale nahodne umiestnene body

dev.off()

# Zhlukovanie na zaklade SOM
D <- object.distances(map, "codes")  # vzdialenosti code vectors
attributes(D)  # matica vzdialenosti
D
set.seed(9856)
map.km <- kmeans(D, centers=3)$cluster  # k-means pre D

plot(map, type="mapping", labels=lab, pchs=16, col=cols[lab])
add.cluster.boundaries(map, map.km)   # zhluky

#====================
# Zlozenie jedla ####

food <- read.table("01_food.txt", header=TRUE, sep=" ")
X <- food[,9:14]   # data predelene vahou
colnames(X) <- c("Fat", "Energy", "Carbohydrates", "Protein", "Cholesterol", "Saturated.fat")

# skalovanie
X <- scale(X, center=FALSE, scale=TRUE)

nrow(food)
sqrt(nrow(food) / 5) # aby v priemere bolo v uzloch ~ 5 bodov

set.seed(881359)
g <- somgrid(xdim = 13, ydim = 13, topo = "hexagonal")
map <- som(as.matrix(X), grid = g)

plot(map, type="counts") # pocty bodov v uzloch
plot(map, type="counts", shape = "straight")  # nie ako kruhy

kody <- getCodes(map, 1)  
head(kody)
# reprezentativne vektory uzlov ... codebook vectors

plot(map, type="codes", palette.name=rainbow) 
# zastupenie zloziek v codebook vectors

# jednotlive zlozky
for(j in 1:ncol(kody)) {
  plot(map, type = "property", property = kody[, j],
       main = colnames(kody)[j])
  invisible(readline(prompt="Press [enter] to continue"))
}

# Zhlukovanie na zaklade SOM
D <- object.distances(map, "codes")  
set.seed(9856)
map.med <- pam(D, 5)$cluster  # k-medoids pre D

plot(map, type="codes", palette.name=rainbow) 
add.cluster.boundaries(map, map.med)   # vykreslenie pre zlozky

plot(map, type="counts")
add.cluster.boundaries(map, map.med)   # vykreslenie pre pocty bodov

# vykreslenie U-matrix:
# - package Umatrix
# - google -> github

#================
# MNIST data ####

M <- read.table("MNIST.txt")
lab.all <- M[, 1]
X.all <- M[, 2:ncol(M)]

# Iba 2500 riadkov
set.seed(12398754)
ind <- sample.int(nrow(M), 2500, replace=FALSE) 
lab <- M[ind,1]
X.mnist <- M[ind,-1]

col.pal <- c(brewer.pal(9, "Set1"),"black")
cols <- col.pal[lab+1]

# PCA skratenie
Z <- prcomp(X.mnist)
Zn <- Z$x[,1:30]

nrow(Zn)
sqrt(nrow(Zn) / 5) # aby v priemere bolo v uzloch ~ 5 bodov

set.seed(881359)
g <- somgrid(xdim = 22, ydim = 22, topo = "hexagonal")
map <- som(as.matrix(Zn), grid = g)

plot(map, type="mapping", labels=lab, pchs=16, col=cols)

# Zhlukovanie na zaklade SOM
D <- object.distances(map, "codes")  
set.seed(9856)
map.med <- pam(D, 10)$cluster  # k-medoids pre D
plot(map, type="mapping", labels=lab, pchs=16, col=cols)
add.cluster.boundaries(map, map.med)   # dokreslenie zhlukov

# Zhlukovanie na povodnych datach
mnist.med <- pam(X.mnist, 10)$cluster
par(mfrow=c(3, 4))
for(i in 1:10) {
  nn <- sum(mnist.med==i)
  A <- matrix(0, nrow=nn, ncol=2)
  plot(jitter(A), pch=16, col=cols[mnist.med == i])
}
lims <- par("usr")
legend(lims[2],lims[4], legend=0:9, pch=16, col=col.pal, xpd=NA)
dev.off()
