diff --git a/hw3/.RData b/hw3/.RData index d058cb3..75b0d6a 100644 Binary files a/hw3/.RData and b/hw3/.RData differ diff --git a/hw3/.Rhistory b/hw3/.Rhistory index c181128..3a641a7 100644 --- a/hw3/.Rhistory +++ b/hw3/.Rhistory @@ -1,426 +1,79 @@ auto = read.table("auto.data",header=T,na.strings="?") -length(x=auto$mpg) -glm -glm.pred -help(rep) -glm.pred=rep(FALSE,397) -glm.pred -medium(auto$mpg) -median(auto$mpg) -glm.pred[auto$mpg>median(auto$mpg)]=T -glm.pred -contour(auto) -contour(glm.pred ~ auto$mpg) -contour(glm.pred,auto$mpg) -help(contour) -contour(auto$mpg,auto$horsepower,glm.pred) -glm.pred -length(glm.pred) -table(glm.pred,auto$mpg) -table(glm.pred,auto$mpg,auto$horsepower) -glm.pred=rep(0,397) -glm.pred[auto$mpg>median(auto$mpg)]=1 -glm.pred auto$mpg01=rep(0,397) auto$mpg01[auto$mpg>median(auto$mpg)]=1 -auto$mpg01 -auto$mpg01 -auto$mpg01 -plots(auto) -plot(auto) -boxplot(auto) -boxplot.matrix(auto) -help(boxplot) -boxplot(auto$mpg01,auto) -boxplot(auto$mpg,auto) -boxplot(auto$mpg) -boxplot(auto) -boxplot(mpg01 ~ auto) -boxplot(mpg01 ~) -boxplot(auto$mpg01 ~ auto) -attach(auto) -boxplot(mpg01) -boxplot(mpg01 ~ auto) -boxplot(mpg01 ~ auto,auto) -boxplot(mpg01 ~ auto,data = auto) -help(plot.table) -plot.table(auto) -help(plot.table) -plot(auto) -plot(auto,t="box") -help(plot.table) -help(plot.table,plot.frame=1) -help(plot.table) -help(plot.table,frame.plot=1) -help(plot.table) -help(plot.table,frame.plot=is.num) -help(plot.table) -plot(auto,t="box",frame.plot=1) -plot(auto,frame.plot=1) -plot(auto,frame.plot=1) -plot(auto,frame.plot=is.num) -plot(auto,frame.plot=0) -plot(auto,frame.plot="0") -plot(auto,frame.plot="1") -plot(auto,frame.plot=TRUE) -plot(auto,frame.plot=FALSE) -plot(auto,frame.plot=TRUE) -plot(auto,frame.plot=T) -plot(auto,frame.plot=1) -boxplot(mpg~mpg01,auto) -boxplot(mpg01 ~ mpg,auto) -boxplot(mpg01 ~ *,auto) -boxplot(mpg01 ~ ,auto) -boxplot(mpg01 ~ auto,auto) -boxplot(mpg01,auto) -boxplot(auto) -boxplot(auto,y=mpg01) 
-boxplot(auto,y=mpg) -boxplot(data = auto) -boxplot(auto) -help(for) -plot(auto,frame.plot=1) -plot(auto) -names(auto) -auto$name -help(sample) -x <- 1:12 -x -sample(x) -help(sample) -sample(x,replace=T) -sample(x,replace=T) -sample(x,replace=F) -c -x -sample(x,replace=T) -x -help(sample) -sample(x[x>9]) -sample(x[x>8]) -help(sample) -x <- 1:10 -sample(x[x>8]) -sample(x[x>]) -help(sample) -help(sample) -help(sample) -sample(auto,size=length(mpg01)/2) -x <- length(mpg01) -sample(x,size=length(mpg01)/2) -auto[sample(x,size=length(mpg01)/2)] -auto$mpg[sample(x,size=length(mpg01)/2)] -help(data.frame) -data.frame( -help(data.frame) -auto[sample(x,size=length(mpg01)/2)] -train = sample(x,size=length(mpg01)/2) -train = -auto[train] -auto$mpg[train] -auto$mpg[train,] -auto$mpg[train] -auto$mpg[23] -auto$mpg[228] -auto$mpg[391] -auto.test=auto[!train] -auto.train=auto[train] -auto.test -summary(auto.test) -train=(mpg<15) -train -train = (sample(x,size=length(mpg01)/2)) -train -head(auto) -auto[,train[ -auto[,train] -train -help(contains) -auto[1,train] -train -auto[[,train]] -auto[[1,train]] -autoi -head(auto) -head(auto[sample(nrow(auto),397/2)]) -head(auto[sample(nrow(auto),3)]) -data = data.frame(auto) -data -head(data[sample(nrow(data),3)]) -nrow(data) -head(data[sample(ncol(data),3)]) -head(data[sample(ncol(data),397/2)]) -head(data[sample(ncol(data),3)]) -head(data[sample(ncol(data),3)]) -head(data[sample(ncol(data),3)]) -head(data[sample(ncol(data),3)]) -head(data[,sample(ncol(data),3)]) -head(data[,sample(ncol(data),3)]) -head(data[,sample(ncol(data),3)]) -head(data[,sample(ncol(data),3)]) -head(data[,sample(ncol(data),3)]) -head(data[sample(ncol(data),3),]) -head(data[sample(ncol(data),3),]) -head(data[sample(ncol(data),3),]) -head(data[sample(nrow(data),3),]) -head(data[sample(nrow(data),397/2),]) -head(data[sample(nrow(data),397/2),]) -head(data[sample(nrow(data),397/2),]) -head(data[sample(nrow(data),397/2),]) -head(data[sample(nrow(data),397/2),]) 
-head(auto[sample(nrow(auto),397/2),]) -head(auto[sample(nrow(auto),397/2),]) -head(auto[sample(nrow(auto),397/2),]) -head(auto[sample(nrow(auto),397/2),]) -head(auto[sample(nrow(auto),397/2),]) -head(auto[sample(nrow(auto),397/2),]) -head(auto[sample(nrow(auto),397/2),]) -train = auto[sample(nrow(auto),397/2),] -[sample(nrow(auto),397/2),] -sample(nrow(auto),397/2) -train sample(nrow(auto),397/2) -train = sample(nrow(auto),397/2) -autp[train,] -auto[train,] -train = sample(nrow(auto),397/2) -head(auto[train,]) -head(auto[!train,]) -traindata = auto[train,] -testdata = auto[!train,] -testdata -traindata -length(traindata) -length(traindata$mpg) -198*2 -summary(testdata) -testdata = auto[!train] -testdata -testdata = auto[!train,] -train -summary(train) -names(train) -head(traindata) - -testdata = auto[!train,] -testdata -!train -train -?sample -sort(train) -train_vals = train -train = rep(false,397) -train = rep(F,397) -train -help for -?for -?for -help)for) -help(for) -help(for) -help lapply() -?lapply -sapply(train, -?sapply -sapply(train, -?sapply -train[train_vals]=T -train -traindata = auto[train,] -traindata -length(auto) -length(traindata) -length(traindata$mpg) -testdata=auto[!train,] -length(testdate$mpg) -length(testdata$mpg) -training_indices = sample(nrow(auto),397/2) -train_bools = rep(F,length(auto$mpg)) -train_bools[training_indices]=T -head(train_bools) -length(train_bools) +library(ISLR) +library(MASS) +library(class) +train_bools <- (auto$year %% 2 == 0) train_data = auto[train_bools,] test_data = auto[!train_bools,] -summary(train_data) -summary(test_data) -lda.fit -library(MASS) -lda.fit -lda() -detach(auto) -mpg01 -mpg -attach(test_data) -mpg01 -names() -names(test_data) -ldf.fit=lda(mpg01 ~ horsepower + weight + acceleration + displacement,data=test_data) -detach(test_data) -ldf.fit=lda(mpg01 ~ horsepower + weight + acceleration + displacement,data=test_data) -lda.fit -lda.fit=lda(mpg01 ~ horsepower + weight + acceleration + 
displacement,data=test_data) -lda.fit -summary(lda.fit) -coefficients(lda.fit) -plot(lda.fit) -lda.pred=predict(lda.fit,test_data) -lda.pred=predict(lda.fit, !training_bools) -lda.pred=predict(lda.fit, !training_indices) -test_data -lda.pred=predict(lda.fit, test_data) -lda.pred -plot(lda.pred) -names(lda.pred) -lda.class=lda.pres$class -lda.class=lda.pred$class -table(lda.class,testdata) -table(lda.class,test_data) -length(lda.class) -length(test_data) -table(lda.class,test_data$mpg01) -mean(lda.class==test_data$mpg01) -sum(lda.pred$posterior[,1]>=.5) -sum(lda.pred$posterior[,1]<.5) -lda.pred$posterior[,1] -sum(lda.pred$posterior<.5) -lda.pred$posterior -lda.pred$posterior<5 -lda.pred$posterior<.5 -sum(lda.pred$posterior<.5) -sum(lda.pred$posterior<.5[,1]) -sum(lda.pred$posterior<.5[1]) -sum(lda.pred$posterior<.5[2]) -lda.pred$posterior<.5[2] -lda.pred$posterior<.5 -lda.pred$posterior -lda.pred$posterior[,1] -lda.pred$posterior[1,] -lda.pred$posterior[,2] -lda.pred$posterior[,1] -lda.pred$posterior[,1]>.5 -sum(lda.pred$posterior[,1]>.5) -sum.bool(lda.pred$posterior[,1]>.5) -?sum -sum.bool(lda.pred$posterior[,1]>.5,na.rm=T) -sum(lda.pred$posterior[,1]>.5,na.rm=T) -sum(lda.pred$posterior[,1]>.5) -sum(lda.pred$posterior[,1]>.5,na.rm=T) -sum(lda.pred$posterior[,1]>=.5,na.rm=T) -sum(lda.pred$posterior[,1]<.5,na.rm=T) -mean(lda.pred$[,1]==test_data,na.rm=T) -lda.pred -lda.pred$class -lda.pred$class==test_data$mpg01 -mean(lda.pred$class==test_data$mpg01,na.rm=T) -mean(lda.pred$class!=test_data$mpg01,na.rm=T) -lda.fit=lda(mpg01 ~ horsepower + weight + acceleration + displacement,data=train_data) -lda.fit -mean(lda.pred$class==test_data$mpg01,na.rm=T) -lda.pred=predict(lda.fit, test_data) -mean(lda.pred$class==test_data$mpg01,na.rm=T) -mean(lda.pred$class!=test_data$mpg01,na.rm=T) -train_data == test_data -train_data$mpg01 == test_data$mpg01 -lda.fit=lda(mpg01 ~ horsepower + weight + acceleration + displacement,data=train_data) -lda.pred=predict(lda.fit, test_data) 
-mean(lda.pred$class!=test_data$mpg01,na.rm=T) -lda.pred -lda.pred$posterior[,1] -summary(lda.fit) -lda.fit -lda.fit=lda(mpg01 ~ horsepower + weight + acceleration + displacement,data=test_data) -lda.fit -mean(lda.pred$class!=test_data$mpg01,na.rm=T) -lda.pred=predict(lda.fit, test_data) -mean(lda.pred$class!=test_data$mpg01,na.rm=T) -head(lda.pred) -lda.fit=lda(mpg01 ~ horsepower + weight + acceleration + displacement,data=train_data) -lda.pred=predict(lda.fit, test_data) -head(lda.pred) -mean(lda.pred$class!=test_data$mpg01,na.rm=T) -qda.fit=qda(mpg01 ~ horsepower + weight + acceleration + displacement,data=train_data) -qda.fit -qda.class=predict(qda.fit,test_data)$class -qda.class=predict(qda.fit,test_data,na.rm=T)$class -qda.class=predict(qda.fit,test_data)$class -qda.class -mean(qda.pred$class!=test_data$mpg01,na.rm=T) -qda.pred=predict(qda.fit,test_data) -qda.pred=predict(qda.fit,test_data,na.rm=T) -mean(qda.pred$class!=test_data$mpg01,na.rm=T) -glm.fit=glm(mpg01 ~ horsepower + weight + acceleration + displacement,data=train_data,family=binomial) -glm.probs=predict(glm.fit,test_data,type="response") -glm.pred=rep(0,199) -glm.pred[glm.probs>.5]=1 -table(glm.pred,test_data$mpg01) -mean(glm.pred!=test_data$mpg01) -library(class) -?cbind +help(knn) +help(knn) + train <- rbind(iris3[1:25,,1], iris3[1:25,,2], iris3[1:25,,3]) + test <- rbind(iris3[26:50,,1], iris3[26:50,,2], iris3[26:50,,3]) +train +test ?knn -knn.fit = knn(train_data,test_data,auto$mpg01[training_indices]) -knn.fit = knn(train_data,test_data,auto$mpg01[training_indices],k=1) -knn.fit = knn(train_data,test_data,auto$mpg01[training_indices],k=1) -?knn -training_indices -train_bools -knn.fit = knn(train_data,test_data,auto$mpg01[train_bools],k=1) -sdf = (mpg01<1) -sdf = (auto$mpg01<1) -sdf -train_bools -cbind(horsepower,displacement) -cbind(train_data$horsepower,displacement) -cbind(train_data$horsepower,train_data$displacement) -cbind(auto$horsepower,auto$displacement)[train_bools] 
-cbind(auto$horsepower,auto$displacement)[train_bools,] -cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,] -cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,] -train.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,] -test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,] -train.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,] -test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,] -train.mpg01 = auto[train_bools] -train.mpg01 = auto$mpg01[train_bools] -test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,] -train.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,] -test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,] -train.mpg01 = auto$mpg01[train_bools] -set.seed(56) -knn.pred = knn(train.X,test.X,train.mpg01,k=1) -?cbind -?Knn -?knn -train.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,] -test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,] -train.mpg01 = auto$mpg01[train_bools] -train.X = train.X[!is.na(train.X)] -test.X = data.frame(test.X, -train.mpg01 = train.mpg01[!is.na(train.mpg01)] -knn.pred = knn(train.X,test.X,train.mpg01,k=1) -length(train.mpg01) -length(test.X) -text.X -test.X -test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,] -length(test.X) -test.X -knn.pred = knn(train.X,test.X,train.mpg01,k=1) + cl <- factor(c(rep("s",25), rep("c",25), rep("v",25))) +cl +length(cl) +length(train) +nrows(train) +nrow(train) train.X train.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,] train.X +test.X = 
cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,] test.X -knn.pred = knn(train.X,test.X,train.mpg01,k=1) -?knn -length(train.X) -length(train.X[1,]) -length(train.X[,1]) -?knn -plot(auto) -train.X = cbind(auto$horsepower,auto$displacement)[train_bools,] -test.X = cbind(auto$horsepower,auto$displacement)[!train_bools,] -train.mpg01 = auto$mpg01[train_bools] -knn.pred = knn(train.X,test.X,train.mpg01,k=1) train.X -test.X +train.mpg01 = auto$mpg01[train_bools] train.mpg01 +length(train.mpg01) +nrow(train.X) +knn(train.X,train.Y,train.mpg01,K=1) +knn(train.X,train.Y,train.mpg01,k=1) +knn(train.X,test.X,train.mpg01,k=1) +train.X +na.omit(train.X) +?na.omit +na.omit(train.X) +na.omit(train.X) +knn(na.omit(train.X),test.X,train.mpg01,k=1) +knn(na.omit(train.X),test.X,na.omit(train.mpg01),k=1) +knn(na.omit(train.X),na.omit(test.X),na.omit(train.mpg01),k=1) +train.mpg012 = na.omit(auto$mpg01)[train_bools] +train.mpg012 +train.mpg01 +nrow(train) +na.omit(auto) +auto +na.omit(auto) +summary(auto) +summary(na.omit(auto)) +Auto = na.omit(auto) +auto = na.omit(auto) +ncol(auto) +nrow(auto) +auto <- na.omit(auto) +train_bools <- (auto$year %% 2 == 0) +train_data = auto[train_bools,] +test_data = auto[!train_bools,] +train.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,] +test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,] +train.mpg01 = auto$mpg01[train_bools] knn.pred = knn(train.X,test.X,train.mpg01,k=1) +mean(knn.pred != auto$mpg01) +mean(knn.pred != test_data$mpg01) +knn.pred = knn(train.X,test.X,train.mpg01,k=2) +mean(knn.pred != test_data$mpg01) +knn.pred = knn(train.X,test.X,train.mpg01,k=3) +mean(knn.pred != test_data$mpg01) +knn.pred = knn(train.X,test.X,train.mpg01,k=4) +mean(knn.pred != test_data$mpg0) +knn.pred +length(knn.pred) +dim(knn.pred) +length(test_data) +ncol(test_data) +nrow(test_data) q() diff --git a/hw3/answers b/hw3/answers index 
bea2e50..a4e2011 100644 --- a/hw3/answers +++ b/hw3/answers @@ -85,6 +85,7 @@ Part B: Choose one of Questions 10 or 11 given car gets high or low gas mileage based on the Auto data set. +────────────────────────────────────────────────────────────────────────── (a) Create a binary variable, mpg01 , that contains a 1 if mpg contains a value above its median, and a 0 if mpg contains a value below its median. You can compute the median using the @@ -92,6 +93,9 @@ Part B: Choose one of Questions 10 or 11 data.frame() function to create a single data set containing both mpg01 and the other Auto variables. + > auto$mpg01=rep(0,397) + > auto$mpg01[auto$mpg>median(auto$mpg)]=1 + > auto$mpg01 [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 1 1 1 1 0 0 0 0 0 1 1 1 1 0 0 0 0 [38] 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 @@ -106,6 +110,7 @@ Part B: Choose one of Questions 10 or 11 [371] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 +────────────────────────────────────────────────────────────────────────── (b) Explore the data graphically in order to investigate the associ- ation between mpg01 and the other features. Which of the other features seem most likely to be useful in predicting mpg01 @@ -119,6 +124,16 @@ Part B: Choose one of Questions 10 or 11 Displacement is on the cusp and the other variables don't have a terribly useful relationship with this median. + The boxplots indicate that acceleration really isn't a great + predictor of mpg01, but displacement is. It also confirms + horsepower and weight as good predictors, and cylinders also + seems to be very strong, even though I didn't take that from + the scatter plots. + + I will use mpg01 ~ horsepower + weight + cylinders + displacement + + +────────────────────────────────────────────────────────────────────────── (c) Split the data into a training set and a test set. Seems like a 50/50 random sampling is appropriate enough. 
@@ -127,87 +142,133 @@ Part B: Choose one of Questions 10 or 11 > train_bools = rep(F,length(auto$mpg)) > train_bools[training_indices]=T > head(train_bools) - [1] FALSE TRUE FALSE FALSE TRUE FALSE + [1] TRUE TRUE TRUE FALSE TRUE FALSE > length(train_bools) [1] 397 > train_data = auto[train_bools,] > test_data = auto[!train_bools,] + Actually, I changed this now, because a solution I found + online suggested a different test split and I was having + trouble with the KNN model, so I followed their style. I used: + > train <- (auto$year %% 2 == 0) + + and then the rest the same + +────────────────────────────────────────────────────────────────────────── (d) Perform LDA on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained? > lda.fit Call: - lda(mpg01 ~ horsepower + weight + acceleration + displacement, - data = train_data) + lda(mpg01 ~ horsepower + weight + cylinders + displacement, data = train_data) Prior probabilities of groups: 0 1 - 0.5431472 0.4568528 + 0.4666667 0.5333333 Group means: - horsepower weight acceleration displacement - 0 129.08411 3557.757 14.55981 269.729 - 1 79.64444 2345.233 16.39222 116.800 + horsepower weight cylinders displacement + 0 131.96939 3579.827 6.755102 268.4082 + 1 77.96429 2313.598 4.071429 111.7188 Coefficients of linear discriminants: - LD1 - horsepower 0.005678626 - weight -0.001137499 - acceleration -0.014950459 - displacement -0.007401647 + LD1 + horsepower 0.0060634365 + weight -0.0011442212 + cylinders -0.6390942259 + displacement 0.0004517291 - Error Rate against test data: + + ***Test Data Error Rate: > mean(lda.pred$class!=test_data$mpg01,na.rm=T) - [1] 0.1179487 + [1] 0.1428571 +────────────────────────────────────────────────────────────────────────── (e) Perform QDA on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). 
What is the test error of the model obtained? - > qda.fit=qda(mpg01 ~ horsepower + weight + acceleration + displacement,data=train_data) > qda.fit Call: - qda(mpg01 ~ horsepower + weight + acceleration + displacement, - data = train_data) + lda(mpg01 ~ horsepower + weight + cylinders + displacement, data = train_data) Prior probabilities of groups: 0 1 - 0.5431472 0.4568528 + 0.4666667 0.5333333 Group means: - horsepower weight acceleration displacement - 0 129.08411 3557.757 14.55981 269.729 - 1 79.64444 2345.233 16.39222 116.800 + horsepower weight cylinders displacement + 0 131.96939 3579.827 6.755102 268.4082 + 1 77.96429 2313.598 4.071429 111.7188 + + Coefficients of linear discriminants: + LD1 + horsepower 0.0060634365 + weight -0.0011442212 + cylinders -0.6390942259 + displacement 0.0004517291 - Error Rate: + ***Test Data Error Rate: > mean(qda.pred$class!=test_data$mpg01,na.rm=T) - [1] 0.1025641 + [1] 0.1428571 +────────────────────────────────────────────────────────────────────────── (f) Perform logistic regression on the training data in order to pre- dict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained? - > glm.fit=glm(mpg01 ~ horsepower + weight + acceleration + displacement,data=train_data,family=binomial) + > glm.fit=glm(mpg01 ~ horsepower + weight + cylinders + displacement,data=train_data,family=binomial) > glm.probs=predict(glm.fit,test_data,type="response") > glm.pred=rep(0,199) > glm.pred[glm.probs>.5]=1 + + ***Test Data Error Rate: > mean(glm.pred!=test_data$mpg01) - [1] 0.120603 + [1] 0.1407035 +────────────────────────────────────────────────────────────────────────── (g) Perform KNN on the training data, with several values of K, in order to predict mpg01 . Use only the variables that seemed most associated with mpg01 in (b). What test errors do you obtain? Which value of K seems to perform the best on this data set? 
- + The knn function cannot handle NA values, so I first removed the rows with missing data: + + > set.seed(1) + > auto <- na.omit(auto) + > train_bools <- (auto$year %% 2 == 0) + > train_data = auto[train_bools,] + > test_data = auto[!train_bools,] + + > train.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,] + > test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,] + > train.mpg01 = auto$mpg01[train_bools] + + ***Test Data Error Rates: + k = 1 + > mean(knn.pred != test_data$mpg01) + [1] 0.1483516 + k = 2 + > mean(knn.pred != test_data$mpg01) + [1] 0.1593407 + k = 3 + > mean(knn.pred != test_data$mpg01) + [1] 0.1648352 + k = 4 + > mean(knn.pred != test_data$mpg0) + [1] 0.1813187 + + k = 1 performs best of the values tried (k = 1 to 4), since the test error rate rises steadily as k increases. + +