boston <- read.table("http://www.iam.fmph.uniba.sk/ospm/Filova/vsa/bostonh.dat")
Premenné:
-V1: miera kriminality
-V2: podiel veľkých pozemkov
-V3: podiel veľkoobchodných priestorov
-V4: Rieka Charles
-V5: koncentrácia oxidu dusitého
-V6: priemerný počet izieb v obydlí
-V7: podiel zastavby staršej ako 1940
-V8: vážená vzdialenosť od centier zamestnanosti
-V9: index prístupnosti k diaľniciam
-V10: miera zdanenia
-V11: počet učiteľov na žiaka
-V12: index afroamerického obyvateľstva
-V13: percento populácie s nízkym príjmom
-V14: mediánová hodnota bytov v tisíckach dolárov
head(boston)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98 24.0
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14 21.6
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03 34.7
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94 33.4
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33 36.2
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21 28.7
plot(boston)
Takýto spôsob vizualizácie je neprehľadný, aj keď vidíme všetky premenné naraz Vezmime si len V1-V3 a V13-V14:
plot(boston[,-c(4:12)])
pairs(boston[,-c(4:12)], pch=as.numeric(as.factor(boston$V4)), col=as.numeric(as.factor(boston$V4)))
pairs(boston[,-c(4:12)], pch=16, col=as.numeric(as.factor(boston$V11)))
boxplot(boston)
z <- boston
for(i in 1:14) z[, i] <- (boston[, i] - mean(boston[, i]))/(sqrt(var(boston[, i])))
boxplot(z)
xt <- boston
xt[, 1] <- log(boston[, 1])
xt[, 2] <- boston[, 2]/10
xt[, 3] <- log(boston[, 3])
xt[, 5] <- log(boston[, 5])
xt[, 6] <- log(boston[, 6])
xt[, 7] <- ((boston[, 7]^2.5))/10000
xt[, 8] <- log(boston[, 8])
xt[, 9] <- log(boston[, 9])
xt[, 10] <- log(boston[, 10])
xt[, 11] <- exp(0.4 * boston[, 11])/1000
xt[, 12] <- boston[, 12]/100
xt[, 13] <- sqrt(boston[, 13])
xt[, 14] <- log(boston[, 14])
tablex = rbind(t(apply(boston, 2, mean)), t(apply(boston, 2, median)), t((apply(boston, 2, sd))^2), t(apply(boston, 2, sd)))
print(" Mean Median Variance SE")
## [1] " Mean Median Variance SE"
t(round(tablex, 4))
## [,1] [,2] [,3] [,4]
## V1 3.6135 0.2565 73.9866 8.6015
## V2 11.3636 0.0000 543.9368 23.3225
## V3 11.1368 9.6900 47.0644 6.8604
## V4 0.0692 0.0000 0.0645 0.2540
## V5 0.5547 0.5380 0.0134 0.1159
## V6 6.2846 6.2085 0.4937 0.7026
## V7 68.5749 77.5000 792.3584 28.1489
## V8 3.7950 3.2074 4.4340 2.1057
## V9 9.5494 5.0000 75.8164 8.7073
## V10 408.2372 330.0000 28404.7595 168.5371
## V11 18.4555 19.0500 4.6870 2.1649
## V12 356.6740 391.4400 8334.7523 91.2949
## V13 12.6531 11.3600 50.9948 7.1411
## V14 22.5328 21.2000 84.5867 9.1971
zt <- xt
for(i in 1:14) zt[, i] <- (xt[, i] - mean(xt[, i]))/(sqrt(var(xt[, i])))
boxplot(zt)
print(cov(boston))
## V1 V2 V3 V4 V5
## V1 73.9865782 -40.2159560 23.9923388 -0.122108643 0.419593894
## V2 -40.2159560 543.9368137 -85.4126481 -0.252925293 -1.396148200
## V3 23.9923388 -85.4126481 47.0644425 0.109668806 0.607073693
## V4 -0.1221086 -0.2529253 0.1096688 0.064512973 0.002684303
## V5 0.4195939 -1.3961482 0.6070737 0.002684303 0.013427636
## V6 -1.3250378 5.1125134 -1.8879566 0.016284745 -0.024603450
## V7 85.4053223 -373.9015482 124.5139031 0.618571205 2.385927202
## V8 -6.8767215 32.6293041 -10.2280975 -0.053042959 -0.187695836
## V9 46.8477610 -63.3486949 35.5499714 -0.016295543 0.616929453
## V10 844.8215381 -1236.4537354 833.3602902 -1.523367119 13.046285530
## V11 5.3993308 -19.7765707 5.6921040 -0.066819160 0.047397324
## V12 -302.3818163 373.7214023 -223.5797555 1.131324541 -4.020569639
## V13 27.9861679 -68.7830369 29.5802703 -0.097816264 0.488946168
## V14 -30.7185080 77.3151755 -30.5208228 0.409409463 -0.455412432
## V6 V7 V8 V9 V10
## V1 -1.32503785 85.4053223 -6.87672154 46.84776101 844.821538
## V2 5.11251341 -373.9015482 32.62930405 -63.34869487 -1236.453735
## V3 -1.88795657 124.5139031 -10.22809746 35.54997135 833.360290
## V4 0.01628475 0.6185712 -0.05304296 -0.01629554 -1.523367
## V5 -0.02460345 2.3859272 -0.18769584 0.61692945 13.046286
## V6 0.49367085 -4.7519292 0.30366342 -1.28381457 -34.583448
## V7 -4.75192919 792.3583985 -44.32937946 111.77084648 2402.690122
## V8 0.30366342 -44.3293795 4.43401514 -9.06825201 -189.664592
## V9 -1.28381457 111.7708465 -9.06825201 75.81636598 1335.756577
## V10 -34.58344778 2402.6901225 -189.66459173 1335.75657653 28404.759488
## V11 -0.54076322 15.9369213 -1.05977455 8.76071616 168.153141
## V12 8.21500573 -702.9403283 56.04035576 -353.27621939 -6797.911215
## V13 -3.07974141 121.0777246 -7.47332906 30.38544241 654.714520
## V14 4.49344588 -97.5890166 4.84022864 -30.56122804 -726.255716
## V11 V12 V13 V14
## V1 5.39933079 -302.381816 27.98616788 -30.7185080
## V2 -19.77657066 373.721402 -68.78303690 77.3151755
## V3 5.69210400 -223.579756 29.58027028 -30.5208228
## V4 -0.06681916 1.131325 -0.09781626 0.4094095
## V5 0.04739732 -4.020570 0.48894617 -0.4554124
## V6 -0.54076322 8.215006 -3.07974141 4.4934459
## V7 15.93692134 -702.940328 121.07772456 -97.5890166
## V8 -1.05977455 56.040356 -7.47332906 4.8402286
## V9 8.76071616 -353.276219 30.38544241 -30.5612280
## V10 168.15314053 -6797.911215 654.71451963 -726.2557164
## V11 4.68698912 -35.059527 5.78272856 -10.1106571
## V12 -35.05952730 8334.752263 -238.66751554 279.9898338
## V13 5.78272856 -238.667516 50.99475951 -48.4475383
## V14 -10.11065714 279.989834 -48.44753832 84.5867236
print(cor(boston))
## V1 V2 V3 V4 V5
## V1 1.00000000 -0.20046922 0.40658341 -0.055891582 0.42097171
## V2 -0.20046922 1.00000000 -0.53382819 -0.042696719 -0.51660371
## V3 0.40658341 -0.53382819 1.00000000 0.062938027 0.76365145
## V4 -0.05589158 -0.04269672 0.06293803 1.000000000 0.09120281
## V5 0.42097171 -0.51660371 0.76365145 0.091202807 1.00000000
## V6 -0.21924670 0.31199059 -0.39167585 0.091251225 -0.30218819
## V7 0.35273425 -0.56953734 0.64477851 0.086517774 0.73147010
## V8 -0.37967009 0.66440822 -0.70802699 -0.099175780 -0.76923011
## V9 0.62550515 -0.31194783 0.59512927 -0.007368241 0.61144056
## V10 0.58276431 -0.31456332 0.72076018 -0.035586518 0.66802320
## V11 0.28994558 -0.39167855 0.38324756 -0.121515174 0.18893268
## V12 -0.38506394 0.17552032 -0.35697654 0.048788485 -0.38005064
## V13 0.45562148 -0.41299457 0.60379972 -0.053929298 0.59087892
## V14 -0.38830461 0.36044534 -0.48372516 0.175260177 -0.42732077
## V6 V7 V8 V9 V10
## V1 -0.21924670 0.35273425 -0.37967009 0.625505145 0.58276431
## V2 0.31199059 -0.56953734 0.66440822 -0.311947826 -0.31456332
## V3 -0.39167585 0.64477851 -0.70802699 0.595129275 0.72076018
## V4 0.09125123 0.08651777 -0.09917578 -0.007368241 -0.03558652
## V5 -0.30218819 0.73147010 -0.76923011 0.611440563 0.66802320
## V6 1.00000000 -0.24026493 0.20524621 -0.209846668 -0.29204783
## V7 -0.24026493 1.00000000 -0.74788054 0.456022452 0.50645559
## V8 0.20524621 -0.74788054 1.00000000 -0.494587930 -0.53443158
## V9 -0.20984667 0.45602245 -0.49458793 1.000000000 0.91022819
## V10 -0.29204783 0.50645559 -0.53443158 0.910228189 1.00000000
## V11 -0.35550149 0.26151501 -0.23247054 0.464741179 0.46085304
## V12 0.12806864 -0.27353398 0.29151167 -0.444412816 -0.44180801
## V13 -0.61380827 0.60233853 -0.49699583 0.488676335 0.54399341
## V14 0.69535995 -0.37695457 0.24992873 -0.381626231 -0.46853593
## V11 V12 V13 V14
## V1 0.2899456 -0.38506394 0.4556215 -0.3883046
## V2 -0.3916785 0.17552032 -0.4129946 0.3604453
## V3 0.3832476 -0.35697654 0.6037997 -0.4837252
## V4 -0.1215152 0.04878848 -0.0539293 0.1752602
## V5 0.1889327 -0.38005064 0.5908789 -0.4273208
## V6 -0.3555015 0.12806864 -0.6138083 0.6953599
## V7 0.2615150 -0.27353398 0.6023385 -0.3769546
## V8 -0.2324705 0.29151167 -0.4969958 0.2499287
## V9 0.4647412 -0.44441282 0.4886763 -0.3816262
## V10 0.4608530 -0.44180801 0.5439934 -0.4685359
## V11 1.0000000 -0.17738330 0.3740443 -0.5077867
## V12 -0.1773833 1.00000000 -0.3660869 0.3334608
## V13 0.3740443 -0.36608690 1.0000000 -0.7376627
## V14 -0.5077867 0.33346082 -0.7376627 1.0000000
Peknú vizualizáciu korelačnej matice vieme vyrobiť pomocou balíka corrplot:
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.6.1
## corrplot 0.84 loaded
corrplot.mixed(cor(boston), lower = "number", lower.col = "black", upper = "ellipse", tl.pos="lt")
tablext = cbind(apply(xt, 2, mean), apply(xt, 2, median), (apply(xt, 2, sd))^2, apply(xt, 2, sd))
print(" Mean Median Variance SE")
## [1] " Mean Median Variance SE"
round(tablext, 4)
## [,1] [,2] [,3] [,4]
## V1 -0.7804 -1.3606 4.6745 2.1621
## V2 1.1364 0.0000 5.4394 2.3322
## V3 2.1602 2.2711 0.6037 0.7770
## V4 0.0692 0.0000 0.0645 0.2540
## V5 -0.6100 -0.6199 0.0406 0.2015
## V6 1.8319 1.8259 0.0126 0.1123
## V7 5.0608 5.2876 12.7203 3.5665
## V8 1.1880 1.1655 0.2911 0.5395
## V9 1.8677 1.6094 0.7653 0.8748
## V10 5.9314 5.7991 0.1571 0.3964
## V11 2.1501 2.0390 1.8605 1.3640
## V12 3.5667 3.9144 0.8335 0.9129
## V13 3.4177 3.3705 0.9745 0.9872
## V14 3.0345 3.0540 0.1671 0.4088
Pre transformované dáta:
corrplot.mixed(cor(xt), lower = "number", lower.col = "black", upper = "ellipse", tl.pos="lt")
library(KernSmooth) #kniznica pre jadrove odhady hustoty
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
d <- bkde2D(boston[, 5:6], bandwidth = 1.06*c(sd(boston[, 5]), sd(boston[, 6]))* 200^(-1/5))
contour(d$x1, d$x2, d$fhat, col = c("blue", "black", "yellow", "cyan", "red", "magenta", "green", "blue","black"), lwd=3, cex.axis = 1)
library(misc3d)
## Warning: package 'misc3d' was built under R version 3.6.1
d <- kde3d(boston[, 5], boston[, 6], boston[, 7], n = 15)
contour3d(d$d, level = c(max(d$d[10,10,])*.02, max(d$d[10,10,])*.5, max(d$d[10,10,])*1.3), fill = c(FALSE, FALSE, TRUE), col.mesh = c("darkgreen","red","darkblue") , engine = "standard", screen=list(z=210,x=-40,y=-295), scale=TRUE)
Krajšie vizualizácie vieme vytvoriť pomocoun plot3d. Ďalšie užitočné balíky pre vizualizáciu dát sú rgl a ggplot2
library(plot3D)
## Warning: package 'plot3D' was built under R version 3.6.1
x <- boston$V1
y <- boston$V13
z <- boston$V14
scatter3D(x, y, z, bty = "g", colkey = FALSE, main ="bty= 'g'")
scatter3D(x, y, z, bty = "g", pch = 18, col = gg.col(100))
scatter3D(x, y, z, phi = 0, bty = "g", type = "h", ticktype = "detailed", pch = 19, cex = 0.5)
library(ggplot2)
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
ggpairs(data = boston[6:14],
title = "Boston Housing Correlation Plot",
upper = list(continuous = wrap("cor", size = 3)),
lower = list(continuous = "points")
)
library(aplpack)
## Warning: package 'aplpack' was built under R version 3.6.1
xx <- boston[83:118,]
ncolors <- 30
faces(xx, nrow = 6,face.type=1,scale=TRUE, col.nose = rainbow(ncolors), col.eyes = rainbow(ncolors,
start = 0.6, end = 0.85), col.hair = terrain.colors(ncolors),
col.face = heat.colors(ncolors), col.lips = rainbow(ncolors,
start = 0, end = 1), col.ears = rainbow(ncolors, start = 0,
end = 0.8), plot.faces = TRUE)
## effect of variables:
## modified item Var
## "height of face " "V1"
## "width of face " "V2"
## "structure of face" "V3"
## "height of mouth " "V4"
## "width of mouth " "V5"
## "smiling " "V6"
## "height of eyes " "V7"
## "width of eyes " "V8"
## "height of hair " "V9"
## "width of hair " "V10"
## "style of hair " "V11"
## "height of nose " "V12"
## "width of nose " "V13"
## "width of ear " "V14"
## "height of ear " "V1"
Všetky premenné sa najskôr naškálujú medzi 0 a 1.
Horizontálna os: súradnice j (j=1,…,p)
Vertikálna os: hodnota škálovanej premennej \(x_{ij}\)
Použitie:
korelácia: ak sú čiary takmer rovnobežné, usudzujeme na kladnú lineárnu závislosť. Ak sa pretínajú cca v strede, ide o negatívnu lineárnu závislosť.
detekcia podskupín: čiary konvergujúce k rôznym bodom.
Najskôr na menších dátach:
library(MASS)
z <- boston[, 14]
z[boston[, 14] <= median(boston[, 14])] <- 1
z[boston[, 14] > median(boston[, 14])] <- 2
parcoord(boston[sample(506, 10), c(1,2,3,13,14)], col = z, frame = TRUE)
axis(side = 2, at = seq(0, 1, 0.2), labels = seq(0, 1, 0.2))
parcoord(boston[, seq(1, 14, 1)], lty = z, lwd = 0.7, col = z, main = "PCP for Boston Housing", frame = TRUE)
axis(side = 2, at = seq(0, 1, 0.2), labels = seq(0, 1, 0.2))
Tu je farba podľa toho, či je hodnota pod alebo nad mediánom.
Vykreslite logaritmus mediánovej ceny nehnuteľnosti v závislosti od toho, či je pri rieke a od toho, akú má prístupnosť k diaľnici. Zlepší sa symetria dát logaritmickou transformáciou?
Graficky vyšetrite závislosť kriminality a logaritmu mediánovej ceny nehnuteľnosti. Ako ďalšiu premennú zahrňte dostupnosť diaľnice. Prečo by ste (ne)odporúčali použiť mieru kriminality ako prediktor ceny nehnuteľnosti?
Načítajte dáta ‘spam’ z knižnice ‘ElemStatLearn’. Tieto dáta reprezentujú 4601 mailov, ktoré boli klasifikované na základe toho, či ide o spam alebo nie. Vyskúšajte vizualizovať tieto dáta na základe niektorých vybraných premenných. Viete na základe týchto vizualizácií usúdiť, aké správy sú pravdepodobne spam?