# Exercises for the course "Cluster Analysis and Data Classification"
# Study programme: "Probability and Mathematical Statistics"
# Radoslav Harman, FMFI UK, Bratislava
#
# Demonstration of support vector machines
# Loosely based on an example from the kernlab manual (Karatzoglou, Smola, Hornik)
# http://cran.r-project.org/web/packages/kernlab/kernlab.pdf
# The second part contains a short comparison with other methods

library(kernlab)
data(spam)
help(spam)  # detailed description of the data set

# Basic information about the data set
dim(spam)
n <- nrow(spam)
names(spam)
table(spam[, 58])  # variable 58 is the known true classification

# Numerical overview of the variables
print(t(apply(spam[, -58], 2, summary)))
# Note that even the 1st quartile equals 0 for most of the variables.
# The distributions are clearly non-normal.

# Graphical overview of the variables (for simplicity the first 12 only)
par(mfrow = c(3, 4))
for (i in 1:12) hist(spam[, i], main = names(spam)[i])
# Besides the many zero values we see that the data are skewed
for (i in 1:12) hist(log(spam[, i] + 0.01), main = names(spam)[i])
# This clearly improved the data, so the logarithm is a transformation
# that can help the discrimination
spam[, -58] <- log(spam[, -58] + 0.01)

# It is useful to inspect the pairwise correlations and scatter plots;
# for readability only 4 randomly chosen variables, in randomized plotting order
rand.order <- sample(1:n)
smpl <- spam[rand.order, c(sample(1:57, 4), 58)]
plot(smpl[, 1:4], col = as.numeric(smpl[, 5]), cex = 0.7, pch = 19)

# Projection of the data into 2D via PCA
par(mfrow = c(1, 1))
# Compute the PCA once and reuse it (the original called prcomp() twice,
# recomputing the full decomposition for each coordinate)
pca <- prcomp(spam[, -58])
x <- pca$x[, 1]
y <- pca$x[, 2]
plot(x, y, col = as.numeric(spam[, 58]), pch = 19, cex = 0.5)
# We see that a meaningful classification should be possible,
# even if we used only the first two PCA scores

# Create a training sample and a validation sample
index <- sample(1:n)
ind.train <- index[1:floor(0.8 * n)]
ind.test <- index[(floor(0.8 * n) + 1):n]
spam.train <- spam[ind.train, ]
spam.test <- spam[ind.test, ]

# "Train" the SVM, i.e., compute the optimal values of the alpha variables
# (see the lecture)
help(ksvm)
filter <- ksvm(type ~ ., data = spam.train, kernel = "rbfdot", type = "C-svc",
               kpar = list(sigma = 0.05), C = 5, cross = 10)
# We used a Gaussian kernel with parameter sigma = 0.05 and penalty constant C = 5;
# cross = 10 is the "k" for k-fold cross-validation

# Basic info:
print(filter)
# Number of support vectors, i.e., points i where the optimal alpha_i is
# strictly > 0 (see the theory)
print(nSV(filter))
# Indices i of the support vectors
print(alphaindex(filter))
# We can also inspect the concrete optimal values of the nonzero alphas
print(alpha(filter))

# Compute the classification of the test e-mails
mailtype <- predict(filter, spam.test)
# Confusion matrix and the raw test (out-of-sample) error.
# NOTE(review): the original comment called this the in-sample (resubstitution)
# error, but the predictions are made on the held-out test set.
table(mailtype, spam.test[, 58])
1 - mean(mailtype == spam.test[, 58])

# We can play with the parameters, for example:
filter <- ksvm(type ~ ., data = spam, kernel = "rbfdot",
               kpar = list(sigma = 0.05), C = 5, cross = 10)
print(filter)
filter <- ksvm(type ~ ., data = spam, kernel = "rbfdot",
               kpar = list(sigma = 0.01), C = 10, cross = 10)
print(filter)
# We could run a grid search (hyperparameter tuning)
# over various sigma and C to find the best setting

# Compare with the other methods, but only mechanically

# Comparison with LDA
library(MASS)
lda.filter <- lda(type ~ ., data = spam.train)
lda.pred <- predict(lda.filter, spam.test)
table(lda.pred$class, spam.test[, 58])
1 - mean(lda.pred$class == spam.test[, 58])

# Comparison with KNN
library(class)
# Use spam.train / spam.test consistently; spam.train[, 58] is identical to
# spam[ind.train, 58] (same for the test rows), the original mixed both forms
knn.pred <- knn(spam.train[, -58], spam.test[, -58], spam.train[, 58], k = 5)
table(knn.pred, spam.test[, 58])
1 - mean(knn.pred == spam.test[, 58])

# Comparison with a classification tree
library(rpart)
tree.filter <- rpart(type ~ ., data = spam.train, method = "class",
                     parms = list(split = "information"))
plot(tree.filter)
text(tree.filter)
tree.pred <- predict(tree.filter, spam.test)
# Confusion table for the classification tree: predict() returned class
# probabilities, so a first-column value (probability of the first factor
# level, "nonspam") below 0.5 means the e-mail is classified as spam
table(tree.pred[, 1] < 0.5, spam.test[, 58])
# Test error of the tree: map the spam/nonspam decision to the factor codes
# 2/1 and compare with the numeric codes of the true labels
1 - mean(ifelse(tree.pred[, 1] < 0.5, 2, 1) == as.numeric(spam.test[, 58]))

# Comparison with a random forest
library(randomForest)
for.filter <- randomForest(type ~ ., data = spam.train)
for.pred <- predict(for.filter, spam.test)
table(for.pred, spam.test[, 58])
1 - mean(for.pred == spam.test[, 58])