# All subset combinations ####

#=========================
# Students: full data ####

# Source: https://archive.ics.uci.edu/ml/datasets.php

studenti <- read.table("07_student-mat.csv", sep=";", header=TRUE)

# drop the interim grades G1, G2 — only the final grade G3 is predicted
studenti <- studenti[, - which(colnames(studenti) %in% c("G1", "G2"))] 

# make sure categorical variables are treated as factors
studenti$Medu[studenti$Medu %in% c(0, 1)] <- 1   # levels 0 and 1 are too rare -> merge them
studenti$Medu <- as.factor(studenti$Medu)
studenti$Fedu[studenti$Fedu %in% c(0, 1)] <- 1
studenti$Fedu <- as.factor(studenti$Fedu)
studenti$traveltime <- as.factor(studenti$traveltime)
studenti$studytime <- as.factor(studenti$studytime)

n <- nrow(studenti)

# column names without the response G3 (G3 is the last column)
mena <- colnames(studenti[, 1:(ncol(studenti)-1)]) 

# Train-Val-Test split (~70/15/15)
# NOTE(review): the seed and the exact order of the two sample() calls
# determine the split — do not reorder these statements
set.seed(123456)
n.train <- round(0.7*n)
n.val <- round((n - n.train) / 2)
n.test <- n - n.train - n.val
ind.train <- sample.int(n, n.train, replace=FALSE)
ind.val <- sample((1:n)[-ind.train], n.val, replace=FALSE)
ind.test <- (1:n)[-c(ind.train,ind.val)]
train <- studenti[ind.train, ]
val <- studenti[ind.val, ]
test <- studenti[ind.test, ]

# For IC and CV approaches: Train+Val pooled together
train2 <- studenti[-ind.test, ]

#==============
# AIC, BIC ####

#-----------------------------------
# Manually, for just a few variables ####

prem.cisla <- c(2, 15, 16, 17, 29)
prem <- mena[prem.cisla]    # only these predictors are considered
form.best <- rep("a", 5)    # lowest-RSS model formula for each size q
for (q in 1:5) {
  komb <- combn(prem, q)    # all q-element combinations of the predictors
  rss.min <- Inf
  for (s in seq_len(ncol(komb))) {
    f <- paste("G3 ~", paste0(komb[, s], collapse="+"))  # model formula
    rss <- sum(resid(lm(f, data=train2))^2)
    if (rss < rss.min) {    # keep the smallest RSS among the size-q models
      rss.min <- rss
      form.best[q] <- f
    }
  }
}
form.best

# caution: a categorical variable with >2 levels
#          increases the number of fitted parameters by more than 1

form.best <- c("G3 ~ 1", form.best)  # prepend the null model (for q=1)

# AIC, BIC for the best model of each size
RSS <- aic <- bic <- rep(0, 6)
for (j in 1:6) {
  m <- lm(form.best[j], data=train2)
  RSS[j] <- sum(resid(m)^2)
  aic[j] <- AIC(m)
  bic[j] <- BIC(m)
}
plot(1:6, RSS, lwd=2)
plot(1:6, aic, lwd=2)
plot(1:6, bic, lwd=2)

# hard to compare from the plots, so print the values
RSS
aic
which.min(aic)  # single index even if several models tie
which.min(bic)
# q=4 wins for AIC, q=3 for BIC

# MSE_test
# which.min() guarantees exactly one formula is selected;
# form.best[aic == min(aic)] could return several formulas on ties
# and lm() would then fail
m.AIC <- lm(form.best[which.min(aic)], data=train2)
m.BIC <- lm(form.best[which.min(bic)], data=train2)
y.AIC <- predict(m.AIC, newdata=test)
y.BIC <- predict(m.BIC, newdata=test)

m.full <- lm(G3 ~ ., data=train2)  # full model, for comparison
y.full <- predict(m.full, newdata=test)

MSE.test.AIC <- sum((test$G3 - y.AIC) ^ 2) / n.test
MSE.test.BIC <- sum((test$G3 - y.BIC) ^ 2) / n.test
MSE.test.full <- sum((test$G3 - y.full) ^ 2) / n.test

formatC(MSE.test.AIC, format="f", digits=2)
formatC(MSE.test.BIC, format="f", digits=2)
formatC(MSE.test.full, format="f", digits=2)
  # for didactic purposes only
  # in practice: compute MSE_test only for the final chosen model
  # (i.e. we cannot change the decision afterwards)

#---------------------
# R package ####

# Formula with all predictors
form.full <- paste("G3 ~", paste0(mena, collapse="+"))
form.full

# Variable selection via exhaustive best-subset search
library(leaps)
vyber <- regsubsets(as.formula(form.full), data=train2,
                    nbest=4, nvmax=10, method="exhaustive", really.big=TRUE)
# nbest: keep the 4 best models (by RSS) for every size q
# nvmax: consider models up to q=10 variables
res <- summary(vyber)
vars <- res$which                    # inclusion indicators for each candidate model
sizes <- unname(apply(vars, 1, sum)) # model sizes
for (r in seq_len(nrow(vars))) {
  print(paste0(c(r, colnames(vars)[vars[r, ]]), collapse=" "))
}

# regsubsets may 'split' a categorical variable: include only some of its
# dummies (e.g. only Medu4 was included here).
# If all dummies of a factor must stay together, this has to be enforced
# manually (or with different functions).

# AIC, BIC for the selected models
X <- model.matrix(as.formula(form.full), data=train2)
ytrain <- train2$G3
d <- nrow(vars)
AIC.hodnoty <- BIC.hodnoty <- rep(0, d)
for (r in seq_len(d)) {
  fit <- lm(ytrain ~ X[, vars[r, ]] - 1)  # intercept is already a column of X
  AIC.hodnoty[r] <- AIC(fit)
  BIC.hodnoty[r] <- BIC(fit)
}
plot(sizes, AIC.hodnoty, lwd=2)
plot(sizes, BIC.hodnoty, lwd=2)

# Best models
# which.min() returns a single row index; which(x == min(x)) could return
# several rows on ties, making vars[prem.AIC, ] a matrix and breaking the
# subsetting below
prem.AIC <- which.min(AIC.hodnoty)
sizes[prem.AIC]                                  # AIC picks the largest model considered
colnames(vars)[ vars[prem.AIC,] ]                # which variables it selects
m.AIC <- lm(ytrain ~ X[,vars[prem.AIC,]] -1)     # fitted model
b.AIC <- m.AIC$coef                              # coefficients

prem.BIC <- which.min(BIC.hodnoty)
sizes[prem.BIC]  # only 6 variables
colnames(vars)[ vars[prem.BIC,] ]
m.BIC <- lm(ytrain ~ X[,vars[prem.BIC,]] -1)
b.BIC <- m.BIC$coef

# Full model
m.full <- lm(as.formula(form.full), data=train2)
b.full <- m.full$coef

# MSE_test
Xtest <- model.matrix(as.formula(form.full), data=test)

X.AIC <- Xtest[, vars[prem.AIC, ]]                # model matrix of the AIC model
y.AIC <- X.AIC %*% b.AIC                          # predictions
MSE.AIC <- sum((test$G3 - y.AIC)^2) / nrow(test)  # prediction error

X.BIC <- Xtest[, vars[prem.BIC, ]]
y.BIC <- X.BIC %*% b.BIC
MSE.BIC <- sum((test$G3 - y.BIC)^2) / nrow(test)

y.full <- Xtest %*% b.full
MSE.full <- sum((test$G3 - y.full)^2) / nrow(test)

paste("AIC: ", formatC(MSE.AIC, 6, format="f", digits=2))
paste("BIC: ", formatC(MSE.BIC, 6, format="f", digits=2))
paste("Full:", formatC(MSE.full, 6, format="f", digits=2))
  # again for demonstration only

# Plot the predictions against the truth
rozsah <- c(min(y.AIC, y.BIC, y.full), max(y.AIC, y.BIC, y.full))
plot(test$G3, y.AIC, ylim=rozsah, pch=16)
abline(c(0,1))  # the "optimal" line
points(test$G3, y.BIC, lwd=2, col="red")
points(test$G3, y.full, pch=4, lwd=2, col="blue")

#===============
# Validation ####

# Manually, for just a few variables

# FIX: the original computed k0 <- length(prem) BEFORE reassigning
# prem <- mena[prem.cisla]; it only worked because the stale prem from the
# previous section happened to have the same length. Define prem first.
prem.cisla <- c(2, 15, 16, 17, 29)
prem <- mena[prem.cisla]       # only these predictors are considered
k0 <- length(prem)             # number of candidate predictors

# Prepare X, y
form <- as.formula(paste("G3 ~", paste0(prem, collapse="+")))
X <- model.matrix(form, data=train)    # full-model design matrix (training sample)
y <- train$G3
X.val <- model.matrix(form, data=val)  # validation sample
y.val <- val$G3
n.val <- nrow(val)

sel.best <- vector("list", k0) # best variable subset for each size
MSE.val <- rep(Inf, k0)        # best validation MSE achieved for each size
for (q in 1:k0) {
  komb <- combn(k0, q)         # all q-element combinations of predictor indices
  for (s in seq_len(ncol(komb))) {
    # NB: komb[, s] is a length-1 vector when q == 1, so no special case needed
    sel <- komb[, s]
    fit <- lm(y ~ X[, c(1, sel + 1)] - 1)  # intercept column + selected columns
    b <- fit$coef
    b[is.na(b)] <- 0           # inestimable coefficients are treated as 0
    pred <- X.val[, c(1, sel + 1)] %*% b
    mse <- sum((y.val - pred)^2) / n.val
    if (mse < MSE.val[q]) {    # keep the smallest validation error of size q
      MSE.val[q] <- mse
      sel.best[[q]] <- sel
    }
  }
}
# Models are fitted "manually" (y ~ X) to avoid issues with categorical
# variables. Note that predict() does not work with this formulation.

# + Null (intercept-only) model
m1 <- lm(y ~ 1)
b <- m1$coef
pred <- b                             # null-model prediction (a constant)
MSE <- sum((y.val-pred)^2)/n.val      # and its validation MSE
MSE.val <- c(MSE, MSE.val)
sel.best <- append(0, sel.best)

# print the MSE and the selected variables for each model size
for(i in 1:(k0+1)){
  print(c(format(MSE.val[i], format="f", digits=2, nsmall=2),
          prem[sel.best[[i]]]))
}

# chosen number of terms (incl. intercept)
# FIX: which.min() returns a single index; which(MSE.val == min(MSE.val))
# could return several on ties and break lines(c(q.val, q.val), ...) below
q.val <- which.min(MSE.val)

# Plot
plot(MSE.val, type="l", lwd=2, main="Validacna chyba")
lines(c(q.val,q.val), c(0,MSE.val[q.val]), lwd=3, col="red")

# Test errors for every selected model size
form <- as.formula(paste("G3 ~", paste0(prem, collapse="+")))
X.test <- model.matrix(form, data=test)  # full-model design matrix on the test sample
y.test <- test$G3
m <- length(MSE.val)
MSE.test.val <- rep(0, m)
for (i in 1:m) {
  sel <- sel.best[[i]]
  if (i == 1) {
    # null model: constant prediction
    fit <- lm(y ~ 1)
    pred <- fit$coef
  } else {
    # refit on the training sample, predict on the test sample
    fit <- lm(y ~ X[, c(1, sel + 1)] - 1)
    pred <- X.test[, c(1, sel + 1)] %*% fit$coef
  }
  MSE.test.val[i] <- sum((y.test - pred)^2) / n.test
}

# Plots: validation error vs. test error
par(mfrow=c(1,2))
plot(MSE.val, type="l", lwd=2, main="Validacna chyba")
lines(c(q.val,q.val), c(0,MSE.val[q.val]), lwd=3, col="red")
plot(MSE.test.val, type="l", lwd=2, main="Test")
points(q.val, MSE.test.val[q.val], cex=2, pch=16, col="black")

#===================
# Cross-validation ####

# Manually, for just a few variables

# Assign observations to K folds
K <- 5
n.cv <- nrow(train2)
set.seed(18624)
sady <- sample.int(K, n.cv, replace=TRUE)
table(sady)

# FIX: the original computed k0 <- length(prem) BEFORE reassigning
# prem <- mena[prem.cisla]; it only worked because the stale prem
# happened to have the same length. Define prem first.
prem.cisla <- c(2, 15, 16, 17, 29)
prem <- mena[prem.cisla]       # only these predictors are considered
k0 <- length(prem)             # number of candidate predictors

# Prepare X, y
form <- as.formula(paste("G3 ~", paste0(prem, collapse="+")))
X <- model.matrix(form, data=train2)  # full-model design matrix (train+val)
y <- train2$G3

sel.best <- vector("list", k0) # best variable subset for each size
MSE.cv <- rep(Inf, k0)         # best cross-validation MSE achieved for each size
for (q in 1:k0) {
  komb <- combn(k0, q)         # all q-element combinations of predictor indices
  for (s in seq_len(ncol(komb))) {
    # NB: komb[, s] is a length-1 vector when q == 1, so no special case needed
    sel <- komb[, s]
    MSE.sady <- rep(Inf, K)
    for (fold in 1:K) {        # cross-validation over the K folds
      X.cv <- X[sady != fold, c(1, sel + 1)]
      y.cv <- y[sady != fold]
      fit <- lm(y.cv ~ X.cv - 1)  # model with the selected variables
      b <- fit$coef
      b[is.na(b)] <- 0         # inestimable coefficients are treated as 0
      pred <- X[sady == fold, c(1, sel + 1)] %*% b
      # per-fold validation error
      MSE.sady[fold] <- sum((y[sady == fold] - pred)^2) / length(pred)
    }
    mse <- mean(MSE.sady)      # overall CV error
    if (mse < MSE.cv[q]) {     # keep the smallest CV error of size q
      MSE.cv[q] <- mse
      sel.best[[q]] <- sel
    }
  }
}

# + Null (intercept-only) model
# FIX: initialize MSE.sady instead of relying on the vector leaked from the
# loop above (the original errored if that loop had not been run first)
MSE.sady <- rep(0, K)
for(j in 1:K) {
  y.cv <- y[sady!=j]
  m1 <- lm(y.cv~1)  # quality of the null model on this fold
  b <- m1$coef
  pred <- b         # null-model prediction (a constant)
  n.sada <- length(y[sady==j])
  MSE.sady[j] <- sum((y[sady==j]-pred)^2)/n.sada  # its per-fold MSE
}
MSE.cv <- c(mean(MSE.sady), MSE.cv)
sel.best <- append(0, sel.best)

# print the MSE and the selected variables for each model size
for(i in 1:(k0+1)){
  print(c(format(MSE.cv[i], format="f", digits=2, nsmall=2),
          prem[sel.best[[i]]]))
}

# chosen number of terms (incl. intercept)
# FIX: which.min() returns a single index; which(MSE.cv == min(MSE.cv))
# could return several on ties and break the plotting code below
q.cv <- which.min(MSE.cv)

# Final models are refitted on the whole training data

# Test errors for every selected model size
form <- as.formula(paste("G3 ~", paste0(prem, collapse="+")))
X.test <- model.matrix(form, data=test)  # full-model design matrix on the test sample
y.test <- test$G3
m <- length(MSE.cv)
MSE.test.cv <- rep(0, m)
for (i in 1:m) {
  sel <- sel.best[[i]]
  if (i == 1) {
    # null model: constant prediction, fitted on all of train2
    fit <- lm(y ~ 1)
    pred <- fit$coef
  } else {
    # refit on all of train2, predict on the test sample
    fit <- lm(y ~ X[, c(1, sel + 1)] - 1)
    pred <- X.test[, c(1, sel + 1)] %*% fit$coef
  }
  MSE.test.cv[i] <- sum((y.test - pred)^2) / n.test
}

# Plots: CV error vs. test error
par(mfrow=c(1,2))
plot(MSE.cv, type="l", lwd=2, main="Krosvalidacna chyba")
lines(c(q.cv,q.cv), c(0,MSE.cv[q.cv]), lwd=3, col="red")
plot(MSE.test.cv, type="l", lwd=2, main="Test")
points(q.cv, MSE.test.cv[q.cv], cex=2, pch=16, col="black")