# Exercise for the course "Cluster Analysis and Classification of Data"
# Study programme "Probability and Mathematical Statistics", FMFI UK
# Lecturer: Radoslav Harman
#
# Demonstration of the k-nearest-neighbours method and of basic
# classification concepts.
# The data are measurements of characteristics of cancer-cell samples
# (breast cancer Wisconsin data set).
# Data source: http://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)

can <- read.table("http://www.iam.fmph.uniba.sk/ospm/Harman/data/wdbc.txt",
                  header = FALSE, sep = ",")
dim(can)
head(can)
n <- nrow(can)

# V1:      subject code
# V2:      tumour type
# V3:      mean of the "radii" of the nuclei
# V4:      mean of the "grey-level variances" within the nuclei
# V5:      mean of the "distances of the cells around" the nuclei
# V6:      mean of the "areas" of the nuclei
# V7:      mean of the "smoothness of the contours" of the nuclei
# V8:      mean of the "compactness" of the nuclei
# V9:      mean of the "degree of prevalence of concavities" of the nuclei
# V10:     mean of the numbers of "concave points" of the nuclei
# V11:     mean of the "symmetry measures" of the nuclei
# V12:     mean of the "fractal dimensions" of the nucleus boundaries
# V13-V22: the same, but the extreme value is taken instead of the mean
# V23-V32: the same, but the standard deviation is taken instead of the mean

# Look at the class frequencies
table(can$V2)

# For simplicity we focus on the first 10 variables only.
# We also drop the subject code and use 1, 2 instead of the categories M, B
# (i.e. we treat M as the "positives" and B as the "negatives").
# Keep the tumour type plus the first ten measurement variables,
# recoding M -> 1 ("positives") and B -> 2 ("negatives")
can <- can[, 2:12]
can$V2 <- as.numeric(can$V2 == "B") + 1
head(can)

# Visualise the data
plot(can[, 2:11], pch = 19, cex = 0.1, col = can$V2)

# Remove some variables that are apparently redundant for classification
can <- can[, c(1:3, 6:7, 9:11)]
plot(can[, 2:8], pch = 19, cex = 0.1, col = can$V2)

# Standardise the variables so that each one has a "fair chance
# to influence the classification"
can[, 2:8] <- scale(can[, 2:8])
plot(can[, 2:8], pch = 19, cex = 0.1, col = can$V2)

# We can also view the data in the coordinates given by the principal components
plot(as.data.frame(prcomp(can[, 2:8])$x), pch = 19, cex = 0.1,
     col = can$V2, xlim = c(-8, 6), ylim = c(-8, 6))

######
# For the moment we only use k nearest neighbours
library(class); help(knn)

#########
# Resubstitution estimate of the confusion matrix.
# BUGFIX: pass only the feature columns 2:8 to knn(); the original code
# passed the whole data frame, so the class label V2 itself was used as
# a predictor (label leakage), biasing all error estimates below.
can.predict <- knn(can[, 2:8], can[, 2:8], can$V2, k = 11)

confusion.info <- function(nnC) {
  # Print the non-normalized and normalized confusion matrix together with
  # estimates of several classification characteristics.
  #
  # nnC ... non-normalized contingency table of the classification
  #         (confusion matrix); row/column 1 = positives, 2 = negatives
  #
  # Called for its printed output; returns the last printed value invisibly.
  print("Non-normalized confusion matrix"); print(nnC)
  C <- nnC/sum(nnC)
  print("Normalized confusion matrix"); print(round(C, 4))
  # Sensitivity: fraction of true positives classified as positive
  print(c("Sensitivity", round(C[1, 1]/(C[1, 1] + C[1, 2]), 4)))
  # Specificity: fraction of true negatives classified as negative
  print(c("Specificity", round(C[2, 2]/(C[2, 1] + C[2, 2]), 4)))
  print(c("Accuracy", round(C[2, 2] + C[1, 1], 4)))
  # Pearson-Yule (phi) coefficient of association of the 2x2 table
  PY <- (C[1, 1]*C[2, 2] - C[1, 2]*C[2, 1]) /
    sqrt(sum(C[1, ])*sum(C[2, ])*sum(C[, 1])*sum(C[, 2]))
  print(c("Pearson-Yule", round(PY, 4)))
}

confusion.info(table(can$V2, can.predict))

# For some types of estimates of C, such as resubstitution,
# we can visualise the situation (misclassified points drawn larger)
chyba <- as.numeric(can$V2 != can.predict)
pcs <- as.data.frame(prcomp(can[, 2:8])$x[, 1:2])
plot(pcs, type = "n"); grid()
points(pcs, pch = 19, col = can$V2, cex = 2*chyba + 1)

########
# Estimate based on an independent (validation) subset:
# a random half of the observations is held out for validation
train <- rep(TRUE, n)
train[sample(1:n, round(0.5*n))] <- FALSE
n.train <- sum(train)
# Train on the training half, predict the validation half
# (features only; see the label-leakage note above)
can.predict <- knn(can[train, 2:8], can[!train, 2:8], can$V2[train], k = 11)

# Non-normalized and normalized confusion matrix and the derived estimates
confusion.info(table(can$V2[!train], can.predict))

# Plot the errors on the validation set
chyba <- as.numeric(can$V2[!train] != can.predict)
pcs <- as.data.frame(prcomp(can[, 2:8])$x[!train, 1:2])
plot(pcs, type = "n"); grid()
points(pcs, pch = 19, col = can$V2[!train], cex = 2*chyba + 1)

########
# Leave-one-out cross-validation
can.predict <- rep(0, n)
for (i in seq_len(n)) {
  # knn() returns a factor with levels "1", "2"; convert via the label,
  # not the internal integer code, which only matches by coincidence
  can.predict[i] <- as.numeric(as.character(
    knn(can[-i, 2:8], can[i, 2:8], can$V2[-i], k = 11)))
}

# Non-normalized and normalized confusion matrix and the derived estimates
confusion.info(table(can$V2, can.predict))

# Plot the leave-one-out errors
chyba <- as.numeric(can$V2 != can.predict)
pcs <- as.data.frame(prcomp(can[, 2:8])$x[, 1:2])
plot(pcs, type = "n"); grid()
points(pcs, pch = 19, col = can$V2, cex = 2*chyba + 1)