# Stepwise methods ####

#=========================
# Students: full data ####

# Source: https://archive.ics.uci.edu/ml/datasets.php

studenti <- read.table("07_student-mat.csv", sep=";", header=TRUE)

# drop the intermediate grades G1, G2 (keep only the final grade G3)
studenti <- studenti[, !(colnames(studenti) %in% c("G1", "G2"))]

# treat the ordinal columns as factors;
# levels 0 and 1 of Medu/Fedu are merged -- too few observations in level 0
studenti$Medu <- as.factor(pmax(studenti$Medu, 1))
studenti$Fedu <- as.factor(pmax(studenti$Fedu, 1))
studenti$traveltime <- as.factor(studenti$traveltime)
studenti$studytime <- as.factor(studenti$studytime)

n <- nrow(studenti)

# predictor names (everything except the response G3 in the last column)
mena <- colnames(studenti)[seq_len(ncol(studenti) - 1)]

# Train-Val-Test split ####
# NOTE: the sample() calls below see exactly the same inputs as before,
# so the random split is reproducible under the same seed.
set.seed(123456)
n.train <- round(0.7 * n)
n.val <- round((n - n.train) / 2)
n.test <- n - n.train - n.val
ind.train <- sample.int(n, n.train, replace=FALSE)
ind.val <- sample(setdiff(seq_len(n), ind.train), n.val, replace=FALSE)
ind.test <- setdiff(seq_len(n), c(ind.train, ind.val))
train <- studenti[ind.train, ]
val <- studenti[ind.val, ]
test <- studenti[ind.test, ]

# for IC and CV: train and validation data pooled together
train2 <- studenti[-ind.test, ]
n.train2 <- nrow(train2)

#===============
# AIC, BIC ####

?step

# full model and the formula used as the upper search scope
m.full <- lm(G3 ~ ., data=train2)
form.full <- paste("G3 ~", paste0(mena, collapse="+"))
form.full

# null (intercept-only) model and the scope, fitted/built once and reused
scope.full <- as.formula(form.full)
m.null <- lm(G3 ~ 1, data=train2)

# AIC (penalty k = 2)

# backward elimination, starting from the full model m.full
backward.AIC <- step(m.full, direction="backward", k=2)
length(backward.AIC$coef) # number of coefficients

# forward selection, from the null model up to at most the full model
forward.AIC <- step(m.null, scope=scope.full, direction="forward", k=2)
length(forward.AIC$coef)

# bidirectional search, starting from the null model
both.AIC <- step(m.null, scope=scope.full, direction="both", k=2)
length(both.AIC$coef)

# step (unlike regsubsets) adds categorical variables as a whole

# penalty k = log(n), i.e. BIC
backward.BIC <- step(m.full, direction="backward", k=log(n.train2))
length(backward.BIC$coef)
forward.BIC <- step(m.null, scope=scope.full, direction="forward",
                    k=log(n.train2))
length(forward.BIC$coef)
both.BIC <- step(m.null, scope=scope.full, direction="both",
                 k=log(n.train2))
length(both.BIC$coef)

# MSE_test: prediction errors ####

# predictions of each selected model on the test set
y.bAIC <- predict(backward.AIC, newdata=test)
y.fAIC <- predict(forward.AIC, newdata=test)
y.bothAIC <- predict(both.AIC, newdata=test)
y.bBIC <- predict(backward.BIC, newdata=test)
y.fBIC <- predict(forward.BIC, newdata=test)
y.bothBIC <- predict(both.BIC, newdata=test)

# mean squared prediction error on the test set
mse <- function(pred) sum((test$G3 - pred)^2) / nrow(test)

MSE.bAIC <- mse(y.bAIC)
MSE.fAIC <- mse(y.fAIC)
MSE.bothAIC <- mse(y.bothAIC)
MSE.bBIC <- mse(y.bBIC)
MSE.fBIC <- mse(y.fBIC)
MSE.bothBIC <- mse(y.bothBIC)

paste("backward AIC: ", formatC(MSE.bAIC, 6, format="f", digits=2))
paste("forward AIC:  ", formatC(MSE.fAIC, 6, format="f", digits=2))
paste("both AIC:     ", formatC(MSE.bothAIC, 6, format="f", digits=2))
paste("backward BIC: ", formatC(MSE.bBIC, 6, format="f", digits=2))
paste("forward BIC:  ", formatC(MSE.fBIC, 6, format="f", digits=2))
paste("both BIC:     ", formatC(MSE.bothBIC, 6, format="f", digits=2))

#========================
# Manual validation ####

# design matrix of the whole data set and its train/val/test slices
X.all <- model.matrix(G3 ~ ., data=studenti)
X.train <- X.all[ind.train, ]
X.val <- X.all[ind.val, ]
X.test <- X.all[ind.test, ]

# the response split the same way (identical to train$G3, val$G3, test$G3)
y.train <- studenti$G3[ind.train]
y.val <- studenti$G3[ind.val]
y.test <- studenti$G3[ind.test]

# total number of columns of the design matrix (incl. the intercept)
m <- ncol(X.all)

# Forward selection tuned on the validation set: starting from the
# intercept-only model, greedily add the single column of X.train that
# lowers the validation MSE the most, until all m columns are in.
selected <- rep(FALSE,m)  # which columns are already in the model
selected[1] <- TRUE       # start from the null (intercept-only) model
m1 <- lm(y.train ~ X.train[, selected] - 1) # fit the null model
b <- m1$coef
# with a single selected column X.train/X.val subsetting drops to a vector,
# so plain elementwise * with the single coefficient is used here
pred <- X.val[, selected] * b         # null-model predictions
MSE.val <- sum((y.val-pred)^2)/n.val  # and its validation MSE

added <- 1   # column numbers in the order they were added
for(iter in 2:m) {
  MSE.val <- c(MSE.val,Inf)
  added <- c(added,0)
  for(i in 2:m) {    # try adding each remaining column
    if(selected[i]==FALSE) {
      sel2 <- selected
      sel2[i] <- TRUE
      m1 <- lm(y.train~X.train[,sel2]-1) # model with column i added
      b <- m1$coef
      b[is.na(b)] <- 0  # coefficients lm cannot estimate are treated as 0
      pred <- X.val[,sel2] %*% b
      MSE <- sum((y.val-pred)^2)/n.val  # validation error
      if(MSE < MSE.val[iter]) {         # keep the smallest validation error
        MSE.val[iter] <- MSE
        added[iter] <- i
      }
    }
  }
  selected[added[iter]] <- TRUE           # add the best column for this size
}
# The models are fitted "manually" (y ~ X) to avoid problems with
# categorical variables; predict() does not work with models written
# this way, hence the explicit matrix products above.

q <- which(MSE.val==min(MSE.val))[1]  # chosen number of variables

plot(1:m, MSE.val, type="l", lwd=3, main="Validacna chyba")
lines(c(q,q), c(0,MSE.val[q]), lwd=3, col="red")
colnames(X.train)[added]

# Test and training errors along the forward-selection path ####
# For each model size i, refit the model on the first i added columns and
# record its training MSE and its test MSE.
MSE.test <- MSE.train <- rep(0, m)
for(i in seq_len(m)) {
  sel <- added[1:i]                # chosen model of size i
  # drop=FALSE keeps the single-column case (i == 1) a matrix, so the
  # same lm() fit and %*% prediction work for every size -- this removes
  # the fragile if(i==1) special case caused by dimension dropping
  m1 <- lm(y.train ~ X.train[, sel, drop=FALSE] - 1)
  b <- m1$coef
  b[is.na(b)] <- 0  # coefficients lm cannot estimate are treated as 0
  pred <- X.test[, sel, drop=FALSE] %*% b
  MSE.test[i] <- sum((y.test - pred)^2)/n.test
  MSE.train[i] <- sum(resid(m1)^2)/n.train
}

# Plot the three error curves side by side ####
# NOTE(review): par(mfrow=...) stays in effect after this block; any later
# plots will also be drawn in a 1x3 layout unless par is reset.
par(mfrow=c(1,3))
plot(MSE.train, type="l", lwd=2, main="Train")
points(q, MSE.train[q], cex=2, pch=16, col="red")
plot(MSE.val, type="l", lwd=2, main="Validation")
lines(rep(q, 2), c(0, MSE.val[q]), lwd=3, col="red")
plot(MSE.test, type="l", lwd=2, main="Test")
points(q, MSE.test[q], cex=2, pch=16, col="red")

#===================
# Crossvalidation ####

library(caret)

# all non-test observations (train + validation pooled)
X.all <- model.matrix(G3 ~ ., data=studenti)
X <- X.all[-ind.test, ]
y <- studenti$G3[-ind.test]

# Model settings:
# rfe ... recursive feature elimination
# lmFuncs ... use linear regression (fitted with lm())
# method="cv", number=10 ... 10-fold crossvalidation
ctrl <- rfeControl(functions = lmFuncs,
                   method = "cv",
                   number = 10)

# Feature selection via CV ####
q.max <- 30
subsets <- seq_len(q.max)   # model sizes we are interested in
lmProfile <- rfe(X, y, sizes=subsets, rfeControl=ctrl)
lmProfile

# rfe keeps only the single best model found, not the best model of each size

prem.cv <- lmProfile$optVariables            # selected variables
sel.cv <- c(1, match(prem.cv, colnames(X)))  # their column numbers
m.cv <- lmProfile$fit                        # the selected model
b.cv <- coef(m.cv)
b.cv[is.na(b.cv)] <- 0  # inestimable coefficients treated as 0 (as above)
# Align test-matrix columns with the coefficients BY NAME: sel.cv lists the
# columns in importance order (optVariables), which need not be the order of
# coef(m.cv), so a positional product could silently misalign coefficients.
# NOTE(review): assumes every coefficient name (incl. "(Intercept)") appears
# among the model-matrix columns; stopifnot guards against silent NA indexing.
idx.cv <- match(names(b.cv), colnames(X.test))
stopifnot(!anyNA(idx.cv))
y.cv <- X.test[, idx.cv, drop=FALSE] %*% b.cv  # predictions
MSE.test.cv <- sum((y.cv - y.test)^2)/n.test   # test error

paste("MSE test: ", formatC(MSE.test.cv, 6, format="f", digits=2))
paste("RMSE test:", formatC(sqrt(MSE.test.cv), 6, format="f", digits=2))
