I've written the code below to do some machine-learning work in R. I'm not entirely happy with parts of it, and I suspect it could be improved quite a bit. Two things I'm specifically interested in: how to deal with appending to vectors in the loop at the bottom, and whether I can combine the functions kf.knn, kf.nnet and kf.svm into one function with various arguments.
library(class)     # knn
library(nnet)      # nnet
library(epibasix)  # epiKappa
library(CVThresh)  # cvtype
library(e1071)     # svm
df <- read.table("semeion.data.data")
# Columns V257..V266 are a one-hot encoding of the digit labels 0..9;
# collapse them into a single digit column (42 is a sentinel for unlabelled rows).
df$digit <- rep(42, nrow(df))
for (d in 0:9) {
  df$digit[as.logical(df[[paste0("V", 257 + d)]])] <- d
}
data <- df[, c(1:256, 267)]  # 256 pixel columns plus the digit label
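As an aside, because the label columns are one-hot, the whole decoding block can collapse to a single line, assuming every row has exactly one flag set (max.col breaks ties arbitrarily, so the 42 sentinel is lost):

df$digit <- max.col(df[, 257:266]) - 1  # column offset of the 1 gives the digit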
# Kappa is only defined for a square confusion matrix, so coerce both
# factors onto the union of their levels before tabulating.
squareTable <- function(x, y) {
  x <- factor(x)
  y <- factor(y)
  commonLevels <- sort(unique(c(levels(x), levels(y))))
  x <- factor(x, levels = commonLevels)
  y <- factor(y, levels = commonLevels)
  table(x, y)
}
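A quick illustration with made-up vectors: a plain table of the pair below would be 3x2 because the predictions never contain a 2, but squareTable pads it to 3x3 so epiKappa can use it.

squareTable(c(0, 1, 2, 2), c(0, 1, 1, 1))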
kf.knn <- function(data, k, nearest)
{
  N <- nrow(data)
  data <- data[sample(1:N), ]  # shuffle before splitting into folds
  folds.index <- cvtype(N, cv.bsize=1, cv.kfold=k, FALSE)$cv.index
  total <- 0
  for (i in 1:k)
  {
    test <- data[folds.index[i, ], 1:256]
    test.labels <- data[folds.index[i, ], 257]
    rest <- as.vector(folds.index[-i, ])
    train <- data[rest, 1:256]
    train.labels <- data[rest, 257]
    knnpredict <- knn(train, test, train.labels, nearest)
    # squareTable keeps the confusion matrix square even when a class
    # is missing from the predictions, which epiKappa needs.
    t <- squareTable(test.labels, knnpredict)
    total <- total + epiKappa(t)$kappa
  }
  return(total / k)
}
kf.nnet <- function(data, k, hidden)
{
  N <- nrow(data)
  data <- data[sample(1:N), ]
  folds.index <- cvtype(N, cv.bsize=1, cv.kfold=k, FALSE)$cv.index
  total <- 0
  for (i in 1:k)  # was 2:k, which skipped a fold but still divided by k
  {
    test <- data[folds.index[i, ], 1:256]
    test.labels <- data[folds.index[i, ], 257]
    rest <- as.vector(folds.index[-i, ])
    train <- data[rest, 1:256]
    train$label <- as.factor(data[rest, 257])
    # nnet fits a softmax classifier automatically when the response is a
    # factor, so linout = TRUE was redundant.
    nnetwork <- nnet(label ~ ., data=train, size=hidden, MaxNWts=10000, maxit=200)
    nnetpredict <- predict(nnetwork, test, type="class")
    t <- squareTable(test.labels, nnetpredict)
    print(t)
    total <- total + epiKappa(t)$kappa
  }
  return(total / k)
}
kf.svm <- function(data, k)
{
  N <- nrow(data)
  data <- data[sample(1:N), ]
  folds.index <- cvtype(N, cv.bsize=1, cv.kfold=k, FALSE)$cv.index
  total <- 0
  for (i in 1:k)  # was 2:k, which skipped a fold but still divided by k
  {
    test <- data[folds.index[i, ], 1:256]
    test.labels <- data[folds.index[i, ], 257]
    rest <- as.vector(folds.index[-i, ])
    train <- data[rest, 1:256]
    train$label <- as.factor(data[rest, 257])
    # cross=2 only ran an internal accuracy estimate that was never used
    svmmodel <- svm(label ~ ., data=train, cost=8, gamma=0.001)
    svmpredict <- predict(svmmodel, test)
    t <- squareTable(test.labels, svmpredict)
    print(t)
    total <- total + epiKappa(t)$kappa
  }
  return(total / k)
}
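On the combining question, the direction I have in mind is a single driver that takes the per-fold model step as a function argument. A minimal sketch, assuming the fold bookkeeping above stays as-is; kf.cv and the fit.* helpers are made-up names:

kf.cv <- function(data, k, fit)
{
  N <- nrow(data)
  data <- data[sample(1:N), ]
  folds.index <- cvtype(N, cv.bsize=1, cv.kfold=k, FALSE)$cv.index
  kappas <- numeric(k)  # preallocated, no append() bookkeeping
  for (i in 1:k)
  {
    test <- data[folds.index[i, ], 1:256]
    test.labels <- data[folds.index[i, ], 257]
    rest <- as.vector(folds.index[-i, ])
    train <- data[rest, 1:256]
    train.labels <- data[rest, 257]
    pred <- fit(train, train.labels, test)
    kappas[i] <- epiKappa(squareTable(test.labels, pred))$kappa
  }
  mean(kappas)
}

# Each model then shrinks to a small closure that returns predicted labels:
fit.knn <- function(nearest) function(train, labels, test)
  knn(train, test, labels, nearest)
fit.nnet <- function(hidden) function(train, labels, test) {
  train$label <- as.factor(labels)
  predict(nnet(label ~ ., data=train, size=hidden,
               MaxNWts=10000, maxit=200), test, type="class")
}
fit.svm <- function() function(train, labels, test) {
  train$label <- as.factor(labels)
  predict(svm(label ~ ., data=train, cost=8, gamma=0.001), test)
}

Usage would then be kf.cv(data, 5, fit.knn(3)) or kf.cv(data, 5, fit.nnet(8)).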
# Sweep the number of hidden units, collecting one mean kappa per setting.
hidden.sizes <- 2:10
result <- numeric(length(hidden.sizes))  # preallocate instead of append()
for (j in seq_along(hidden.sizes))
{
  result[j] <- kf.nnet(data, 5, hidden.sizes[j])
  cat("!!!!!!!", hidden.sizes[j], result[j], "\n")
}
results <- data.frame(n = hidden.sizes, kappa = result)  # don't clobber df
plot(results$n, results$kappa)  # was df$res, a column that doesn't exist
print(results)
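Alternatively, the sweep loop can disappear entirely: sapply does the collecting, so there is no append() bookkeeping at all. A minimal sketch, assuming kf.nnet as defined above:

result <- sapply(2:10, function(h) kf.nnet(data, 5, h))
results <- data.frame(n = 2:10, kappa = result)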