# Course: "Cluster Analysis and Data Classification"
# Study programme: "Probability and Mathematical Statistics"
# Lecturer: Radoslav Harman, KAMS FMFI UK Bratislava
#
# Classification trees and recursive partitioning

clev <- read.table("http://www.iam.fmph.uniba.sk/ospm/Harman/data/clev.txt", header = TRUE)
dim(clev); head(clev); n <- nrow(clev)

# Heart-disease data on 297 people
# (several rows were excluded because of missing values of some variables).
# The type of heart disorder is coded by the variable cat, where 0 means "no heart disorder".
# More at https://www.kaggle.com/ronitf/heart-disease-uci

# A quick graphical overview of the values of the variables and (some of) their correlations
perm <- sample(1:n)
plot(clev[perm, 1:7], pch = 19, col = as.numeric(clev$cat[perm] > 0) + 1)
plot(clev[perm, 8:14], pch = 19, col = as.numeric(clev$cat[perm] > 0) + 1)

# Since we will only care whether a heart disorder is present or not (and not about its type),
# we recode all non-zero values of cat to one and store the result as the factor cls:
clev$cls <- as.factor(as.numeric(clev$cat > 0))

# Numeric variables
# age  ... age
# rbps ... resting blood pressure (systolic)
# chol ... serum cholesterol
# hr   ... heart rate
# op   ... "old peak" ST depression induced by exercise
# ca   ... the number of major vessels coloured by fluoroscopy

# Some of the explanatory variables are qualitative (categorical),
# so we redefine their type to factor. (We know that categorical explanatory variables
# are treated differently from ordinal ones in trees, so rpart must know which ones they are.)
clev$sex <- as.factor(clev$sex)  # Sex
clev$cp  <- as.factor(clev$cp)   # Chest pain
clev$fbs <- as.factor(clev$fbs)  # Fasting blood sugar
clev$ecg <- as.factor(clev$ecg)  # Resting electrocardiographic results
clev$exa <- as.factor(clev$exa)  # Exercise-induced angina
clev$sl  <- as.factor(clev$sl)   # Slope of the peak exercise ST segment
clev$thl <- as.factor(clev$thl)  # Type of defect (3 = no defect)

# Load the library for recursive partitioning
library(rpart)
help(rpart)
# The rpart procedure constructs several types of trees, classification trees as well as
# so-called regression trees; for our lecture the classification trees are relevant, i.e. method = "class".
# The argument formula has the form correct_classification~explanatory_variable_1+...+explanatory_variable_p.
# The parameter parms allows, among other things, choosing how "impurity" is measured, e.g.
# parms = list(split = "information") or parms = list(split = "gini").
help(rpart.control)
# rpart.control allows setting more detailed parameters, for example the number of
# internal cross-validations used to estimate the error: control = rpart.control(xval = 100)

clev.tree <- rpart(cls~age+sex+cp+rbps+chol+fbs+ecg+hr+exa+op+sl+ca+thl,
                   data = clev, method = "class",
                   parms = list(split = "information"),
                   control = rpart.control(xval = 100))

summary(clev.tree)
# This command gives detailed textual information about the constructed tree.
# The table at the beginning corresponds to pruning the maximal tree at the limiting values
# of the complexity parameter (CP):
# nsplit is the number of splits (non-terminal nodes) of the given tree,
# rel error is the resubstitution error relative to the error of the "trivial" tree,
# xerror is like rel error, but estimated by cross-validation instead of resubstitution.
# The rest of the output describes the tree itself
# (the condition in a non-terminal node is the first of the "primary splits").
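# Side note (a minimal sketch): the CP table can also be printed with printcp() and plotted
# with plotcp(), and prune() cuts the tree back to a chosen complexity parameter, for example
# the one with the smallest cross-validated error xerror. The object names cp.min and
# clev.tree.pruned are only illustrative.
printcp(clev.tree)
plotcp(clev.tree)
cp.min <- clev.tree$cptable[which.min(clev.tree$cptable[, "xerror"]), "CP"]
clev.tree.pruned <- prune(clev.tree, cp = cp.min)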
# We can visualize the tree as follows:
plot(clev.tree); text(clev.tree)
# Homework: find a package for a nicer visualization of the tree

# To classify objects with the constructed tree we can use the function predict.
# For example, the tree estimates the class membership probabilities of the first 20 people
# from the training data as follows:
predict(clev.tree, clev[1:20, 1:13])
# Let us also look at the correct classification alongside these probabilities
cbind(clev$cls[1:20], predict(clev.tree, clev[1:20, 1:13]))

# Let us compute an estimate of the whole confusion matrix by resubstitution
nnC.res <- matrix(0, ncol = 2, nrow = 2)
clev.pred.res <- apply(predict(clev.tree, clev[, 1:13]), 1, which.max)
nnC.res[1, 1] <- sum((clev$cls == 0) & (clev.pred.res == 1))
nnC.res[1, 2] <- sum((clev$cls == 0) & (clev.pred.res == 2))
nnC.res[2, 1] <- sum((clev$cls == 1) & (clev.pred.res == 1))
nnC.res[2, 2] <- sum((clev$cls == 1) & (clev.pred.res == 2))

confusion.info <- function(nnC) {
  # Non-normalized and normalized confusion matrix and estimates of several characteristics
  # nnC ... non-normalized contingency table of the classification (confusion matrix)
  print("Non-normalized confusion matrix"); print(nnC)
  C <- nnC/sum(nnC); print("Normalized confusion matrix"); print(round(C, 4))
  print(c("Sensitivity", round(C[1, 1]/(C[1, 1] + C[1, 2]), 4)))
  print(c("Specificity", round(C[2, 2]/(C[2, 1] + C[2, 2]), 4)))
  print(c("Accuracy", round(C[2, 2] + C[1, 1], 4)))
  PY <- (C[1, 1]*C[2, 2] - C[1, 2]*C[2, 1]) /
    sqrt(sum(C[1, ])*sum(C[2, ])*sum(C[, 1])*sum(C[, 2]))
  print(c("Pearson-Yule", round(PY, 4)))
}

confusion.info(nnC.res)

# Let us compute an estimate of the confusion matrix by leave-one-out cross-validation
nnC.loo <- matrix(0, ncol = 2, nrow = 2)
for (i in 1:n) {
  clev.tree.i <- rpart(cls~age+sex+cp+rbps+chol+fbs+ecg+hr+exa+op+sl+ca+thl,
                       data = clev[-i, ], method = "class",
                       parms = list(split = "information"))
  clev.pred.loo.i <- predict(clev.tree.i, clev[i, 1:13])
  cls.cor <- as.numeric(clev$cls[i])
  cls.pred <- which.max(clev.pred.loo.i)
  nnC.loo[cls.cor, cls.pred] <- nnC.loo[cls.cor, cls.pred] + 1
}
confusion.info(nnC.loo)

# Let us try a classification forest:
library(randomForest)
help("randomForest")
clev.forest <- randomForest(cls~age+sex+cp+rbps+chol+fbs+ecg+hr+exa+op+sl+ca+thl, data = clev)

# Compute the confusion matrix using the so-called out-of-bag samples
nnC.oob <- matrix(0, ncol = 2, nrow = 2)
nnC.oob[1, 1] <- sum((clev$cls == 0) & (clev.forest$predicted == 0))
nnC.oob[1, 2] <- sum((clev$cls == 0) & (clev.forest$predicted == 1))
nnC.oob[2, 1] <- sum((clev$cls == 1) & (clev.forest$predicted == 0))
nnC.oob[2, 2] <- sum((clev$cls == 1) & (clev.forest$predicted == 1))
clev.forest$confusion
confusion.info(nnC.oob)
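# Side note (a minimal sketch of an equivalent computation): clev.forest$predicted contains
# the out-of-bag predictions, so the same confusion matrix can be obtained with table();
# the result should match nnC.oob and, up to the added class.error column, the
# clev.forest$confusion printed above.
table(actual = clev$cls, predicted = clev.forest$predicted)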