library(ISLR2)      # Hitters data
library(factoextra) # fviz_pca
library(pls)        # pcr
library(glmnet)     # cv.glmnet

# Data
data(Hitters)
data <- na.omit(Hitters) # vyradit NA
n <- nrow(data)

# Rozdelenie na Train, Test
set.seed(278643)
n.test <- round(n/10)
n.train <- n - n.test
ind.train <- sample.int(n, n.train, replace=FALSE)
ind.test <- (1:n)[-ind.train]
train <- data[ind.train, ]
test <- data[ind.test, ]

# X, y
y <- data$Salary
X <- data[, - which(colnames(data) == "Salary")]
X$League <- factor(X$League, labels=c(0, 1))  # dummy prekodovanie
X$Division <- factor(X$Division, labels=c(0, 1))
X$NewLeague <- factor(X$NewLeague, labels=c(0, 1))
X <- data.matrix(X) # konverzia na maticu

X.train <- X[ind.train, ]
X.test <- X[ind.test, ]
y.train <- y[ind.train]
y.test <- y[ind.test]

#=========
# PCA ####

hit.pca <- prcomp(X.train, scale=TRUE)
round(hit.pca$rotation[, 1:2], 2)
fviz_pca(hit.pca, geom="point")
summary(hit.pca)

#==================
# PCR manualne ####
q <- 2
Z <- hit.pca$x[, 1:q] # prvych q stlpcov

m.man <- lm(y.train ~ Z)
summary(m.man)

#==================
# PCR funkciou ####

?pcr

set.seed(8520)
m.pcr <- pcr(Salary ~ ., data = train, scale = TRUE, validation = "CV")
  # standardizovat
  # validacia pomocou CV
  # segments = 10 je default (?mvrCv)

summary(m.pcr)

# vykreslenie CV
validationplot(m.pcr)

CVs <- RMSEP(m.pcr)
CVs
which(CVs$val[1, , ] == min(CVs$val[1, , ]))
  # najde minimum RMSE pre CV
q <- which(CVs$val[1, , ] == min(CVs$val[1, , ])) - 1

# predikcie
?predict.mvr
predict(m.pcr, newdata = test) # pre kazde "q" ine predikcie

#=======================
# Kvalita predikcie ####

# RMSE test
y.pcr <- predict(m.pcr, newdata = test, ncomp = q) # pre opt. pocet PC
sqrt(sum((y.test - y.pcr)^2))

# plny model
m.full <- lm(Salary ~ ., data=train)
m.full
y.full <- predict(m.full, newdata = test)
sqrt(sum((y.test - y.full)^2))

# Ridge
?predict.cv.glmnet
set.seed(123)
mRidge <- cv.glmnet(X.train, y.train, family="gaussian", alpha=0, nfolds=10)
y.ridge <- predict(mRidge, newx = X.test, s = "lambda.min")
sqrt(sum((y.test - y.ridge)^2))

# Lasso
?predict.cv.glmnet
set.seed(123)
mLasso <- cv.glmnet(X.train, y.train, family="gaussian", alpha=1, nfolds=10)
y.lasso <- predict(mLasso, newx = X.test, s = "lambda.min")
sqrt(sum((y.test - y.lasso)^2))

print(c("RMSE PCR:  ", round(sqrt(sum((y.test - y.pcr)^2)), 2)))
print(c("RMSE Full: ", round(sqrt(sum((y.test - y.full)^2)), 2)))
print(c("RMSE Ridge:", round(sqrt(sum((y.test - y.ridge)^2)), 2)))
print(c("RMSE Lasso:", round(sqrt(sum((y.test - y.lasso)^2)), 2)))

