library(MASS)     # lda, qda
library(rpart)    # classification trees
library(class)    # knn
library(nnet)     # neural networks
library(e1071)    # svm
library(ggplot2)  # box plots below
library(grid)     # viewport layout below
In this section we will study the performance of the classification methods discussed earlier. We will use the misclassification rate, estimated via cross-validation.
As we saw, each of the methods has a slightly different list of arguments. It is therefore worthwhile to write a single routine that runs them all.
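The routine makes use of a helper msr for the misclassification rate. A minimal version consistent with how it is called here (true classes first, predictions second, rate reported in percent) would be

msr <- function(x, y)
  100*mean(x != y)  # percent of cases classified incorrectly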
do.class <- function(df, B=100, which=1:6) {
  # df: data frame with the group variable in the first column
  # B: number of random training/evaluation splits
  # which: which of the seven classifiers to run
  miss.rate <- matrix(0, B, 7)
  n <- dim(df)[1]
  colnames(miss.rate) <- c("LDA", "QDA", "Tree", "NN",
                           "SVM", "knn 3", "knn 9")
  for(i in 1:B) {
    # split the data at random into a training and an evaluation set
    I <- sample(1:n, size=floor(n/2))
    train <- df[I, ]
    colnames(train)[1] <- "group"
    if(1 %in% which) {
      fit <- lda(group~., data=train)
      pred <- predict(fit, df[-I, -1])$class
      miss.rate[i, "LDA"] <- msr(factor(df[-I, 1]), pred)
    }
    if(2 %in% which) {
      fit <- qda(group~., data=train)
      pred <- predict(fit, df[-I, -1])$class
      miss.rate[i, "QDA"] <- msr(factor(df[-I, 1]), pred)
    }
    if(3 %in% which) {
      fit <- rpart(group~., data=train, method="class")
      pred <- predict(fit, df[-I, -1], type="class")
      miss.rate[i, "Tree"] <- msr(factor(df[-I, 1]), pred)
    }
    if(4 %in% which) {
      fit <- nnet(factor(group)~., data=train, size=2,
                  rang=0.1, trace=0,
                  decay=5e-4, maxit=200)
      pred <- predict(fit, df[-I, -1], type="class")
      miss.rate[i, "NN"] <- msr(df[-I, 1], pred)
    }
    if(5 %in% which) {
      fit <- svm(factor(group)~., data=train)
      pred <- predict(fit, df[-I, -1])
      miss.rate[i, "SVM"] <- msr(df[-I, 1], pred)
    }
    if(6 %in% which) {
      pred <- factor(
        knn(df[I, -1], df[-I, -1], cl=df[I, 1], k=3))
      miss.rate[i, "knn 3"] <- msr(factor(df[-I, 1]), pred)
    }
    if(7 %in% which) {
      pred <- factor(
        knn(df[I, -1], df[-I, -1], cl=df[I, 1], k=9))
      miss.rate[i, "knn 9"] <- msr(factor(df[-I, 1]), pred)
    }
  }
  # average misclassification rate of each method over the B splits
  apply(miss.rate[, which], 2, mean)
}
df <- gen.ex(1)
sort(do.class(df[, c(3, 1, 2)]))
## LDA QDA SVM NN knn 3 Tree
## 8.40 9.16 9.16 10.40 10.54 15.26
df <- gen.ex(2)
sort(do.class(df[, c(3, 1, 2)]))
## SVM QDA knn 3 NN Tree LDA
## 5.74 6.92 8.52 10.92 15.56 18.60
df <- gen.ex(3)
sort(do.class(df[, c(3, 1, 2)]))
## NN LDA QDA SVM knn 3 Tree
## 9.666 9.772 9.983 10.544 11.188 18.585
sort(do.class(iris[, c(5, 1:4)]))
## LDA QDA NN knn 3 SVM Tree
## 2.570 3.293 4.729 4.802 4.876 6.417
sort(do.class(kyphosis))
## Error in qda.default(x, grouping, ...): some group is too small for 'qda'
QDA does not work here. Essentially there is not enough data in one of the groups to fit a quadratic model.
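We can check the group sizes directly (kyphosis comes with the rpart package, with the response in the first column):

table(kyphosis$Kyphosis)

Only 17 of the 81 children have kyphosis present, so a random half of the data easily leaves the present group too small for qda. So we drop QDA: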
sort(do.class(kyphosis, which=c(1, 3:7)))
## LDA SVM knn 9 NN Tree knn 3
## 21.521 21.667 23.375 24.252 24.789 25.103
Next we consider the painters data set from the MASS package: the subjective assessment, on a 0 to 20 integer scale, of 54 classical painters. The painters were assessed on four characteristics: composition, drawing, colour and expression. They were also grouped into 8 “Schools”. The data is due to the eighteenth-century art critic de Piles.
kable.nice(head(painters))
|               | Composition | Drawing | Colour | Expression | School |
|---------------|-------------|---------|--------|------------|--------|
| Da Udine      | 10          | 8       | 16     | 3          | A      |
| Da Vinci      | 15          | 16      | 4      | 14         | A      |
| Del Piombo    | 8           | 13      | 16     | 7          | A      |
| Del Sarto     | 12          | 16      | 9      | 8          | A      |
| Fr. Penni     | 0           | 15      | 8      | 0          | A      |
| Guilio Romano | 15          | 16      | 4      | 14         | A      |
pushViewport(viewport(layout = grid.layout(2, 2)))
print(ggplot(data=painters, aes(School, Composition)) +
geom_boxplot(),
vp=viewport(layout.pos.row=1, layout.pos.col=1))
print(ggplot(data=painters, aes(School, Drawing)) +
geom_boxplot(),
vp=viewport(layout.pos.row=1, layout.pos.col=2))
print(ggplot(data=painters, aes(School, Colour)) +
geom_boxplot(),
vp=viewport(layout.pos.row=2, layout.pos.col=1))
print(ggplot(data=painters, aes(School, Expression)) +
geom_boxplot(),
vp=viewport(layout.pos.row=2, layout.pos.col=2))
sort(do.class(painters[, c(5, 1:4)]))
## Error in qda.default(x, grouping, ...): some group is too small for 'qda'
Again QDA does not work here, for the same reason: some of the schools have only a handful of painters. So
sort(do.class(painters[, c(5, 1:4)], which=c(1, 3:7)))
## LDA SVM knn 3 Tree knn 9 NN
## 70.733 75.728 78.688 78.799 80.945 84.497
This is clearly a very difficult classification problem; none of the methods does well.
It should also be pointed out that we have used all these methods essentially with their defaults. In real life one would play around with the tuning parameters to get better performance.
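For example, the e1071 package has the routine tune.svm, which does a grid search over the tuning parameters of svm using cross-validation. A minimal sketch for the painters data (the gamma and cost grids here are arbitrary choices, not recommendations):

# try all combinations of gamma and cost on a small grid
fit.tune <- tune.svm(School~., data=painters,
                     gamma=10^(-2:1), cost=10^(0:2))
summary(fit.tune)

The best combination found this way could then be used in the svm call inside do.class. The other methods have similar knobs, for example size and decay for nnet, k for knn and the complexity parameter cp for rpart.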