finished hw3

2026-02-18 12:20:13 +00:00 · 2017-02-09 22:59:23 -05:00 · 2017-02-09 22:59:23 -05:00 · 7d230c0b30
commit 7d230c0b30
parent b0c1b3ed08
3 changed files with 152 additions and 438 deletions
--- a/hw3/.RData
+++ b/hw3/.RData
--- a/hw3/.Rhistory
+++ b/hw3/.Rhistory
@ -1,426 +1,79 @@
 auto = read.table("auto.data",header=T,na.strings="?")
 length(x=auto$mpg)
 glm
 glm.pred
 help(rep)
 glm.pred=rep(FALSE,397)
 glm.pred
 medium(auto$mpg)
 median(auto$mpg)
 glm.pred[auto$mpg>median(auto$mpg)]=T
 glm.pred
 contour(auto)
 contour(glm.pred ~ auto$mpg)
 contour(glm.pred,auto$mpg)
 help(contour)
 contour(auto$mpg,auto$horsepower,glm.pred)
 glm.pred
 length(glm.pred)
 table(glm.pred,auto$mpg)
 table(glm.pred,auto$mpg,auto$horsepower)
 glm.pred=rep(0,397)
 glm.pred[auto$mpg>median(auto$mpg)]=1
 glm.pred
 auto$mpg01=rep(0,397)
 auto$mpg01[auto$mpg>median(auto$mpg)]=1
-auto$mpg01
+library(ISLR)
-auto$mpg01
+library(MASS)
-auto$mpg01
+library(class)
-plots(auto)
+train_bools <- (auto$year %% 2 == 0)
 plot(auto)
 boxplot(auto)
 boxplot.matrix(auto)
 help(boxplot)
 boxplot(auto$mpg01,auto)
 boxplot(auto$mpg,auto)
 boxplot(auto$mpg)
 boxplot(auto)
 boxplot(mpg01 ~ auto)
 boxplot(mpg01 ~)
 boxplot(auto$mpg01 ~ auto)
 attach(auto)
 boxplot(mpg01)
 boxplot(mpg01 ~ auto)
 boxplot(mpg01 ~ auto,auto)
 boxplot(mpg01 ~ auto,data = auto)
 help(plot.table)
 plot.table(auto)
 help(plot.table)
 plot(auto)
 plot(auto,t="box")
 help(plot.table)
 help(plot.table,plot.frame=1)
 help(plot.table)
 help(plot.table,frame.plot=1)
 help(plot.table)
 help(plot.table,frame.plot=is.num)
 help(plot.table)
 plot(auto,t="box",frame.plot=1)
 plot(auto,frame.plot=1)
 plot(auto,frame.plot=1)
 plot(auto,frame.plot=is.num)
 plot(auto,frame.plot=0)
 plot(auto,frame.plot="0")
 plot(auto,frame.plot="1")
 plot(auto,frame.plot=TRUE)
 plot(auto,frame.plot=FALSE)
 plot(auto,frame.plot=TRUE)
 plot(auto,frame.plot=T)
 plot(auto,frame.plot=1)
 boxplot(mpg~mpg01,auto)
 boxplot(mpg01 ~ mpg,auto)
 boxplot(mpg01 ~ *,auto)
 boxplot(mpg01 ~ ,auto)
 boxplot(mpg01 ~ auto,auto)
 boxplot(mpg01,auto)
 boxplot(auto)
 boxplot(auto,y=mpg01)
 boxplot(auto,y=mpg)
 boxplot(data = auto)
 boxplot(auto)
 help(for)
 plot(auto,frame.plot=1)
 plot(auto)
 names(auto)
 auto$name
 help(sample)
 x <- 1:12
 x
 sample(x)
 help(sample)
 sample(x,replace=T)
 sample(x,replace=T)
 sample(x,replace=F)
 c
 x
 sample(x,replace=T)
 x
 help(sample)
 sample(x[x>9])
 sample(x[x>8])
 help(sample)
 x <- 1:10
 sample(x[x>8])
 sample(x[x>])
 help(sample)
 help(sample)
 help(sample)
 sample(auto,size=length(mpg01)/2)
 x <- length(mpg01)
 sample(x,size=length(mpg01)/2)
 auto[sample(x,size=length(mpg01)/2)]
 auto$mpg[sample(x,size=length(mpg01)/2)]
 help(data.frame)
 data.frame(
 help(data.frame)
 auto[sample(x,size=length(mpg01)/2)]
 train = sample(x,size=length(mpg01)/2)
 train = 
 auto[train]
 auto$mpg[train]
 auto$mpg[train,]
 auto$mpg[train]
 auto$mpg[23]
 auto$mpg[228]
 auto$mpg[391]
 auto.test=auto[!train]
 auto.train=auto[train]
 auto.test
 summary(auto.test)
 train=(mpg<15)
 train
 train = (sample(x,size=length(mpg01)/2))
 train
 head(auto)
 auto[,train[
 auto[,train]
 train
 help(contains)
 auto[1,train]
 train
 auto[[,train]]
 auto[[1,train]]
 autoi
 head(auto)
 head(auto[sample(nrow(auto),397/2)])
 head(auto[sample(nrow(auto),3)])
 data = data.frame(auto)
 data
 head(data[sample(nrow(data),3)])
 nrow(data)
 head(data[sample(ncol(data),3)])
 head(data[sample(ncol(data),397/2)])
 head(data[sample(ncol(data),3)])
 head(data[sample(ncol(data),3)])
 head(data[sample(ncol(data),3)])
 head(data[sample(ncol(data),3)])
 head(data[,sample(ncol(data),3)])
 head(data[,sample(ncol(data),3)])
 head(data[,sample(ncol(data),3)])
 head(data[,sample(ncol(data),3)])
 head(data[,sample(ncol(data),3)])
 head(data[sample(ncol(data),3),])
 head(data[sample(ncol(data),3),])
 head(data[sample(ncol(data),3),])
 head(data[sample(nrow(data),3),])
 head(data[sample(nrow(data),397/2),])
 head(data[sample(nrow(data),397/2),])
 head(data[sample(nrow(data),397/2),])
 head(data[sample(nrow(data),397/2),])
 head(data[sample(nrow(data),397/2),])
 head(auto[sample(nrow(auto),397/2),])
 head(auto[sample(nrow(auto),397/2),])
 head(auto[sample(nrow(auto),397/2),])
 head(auto[sample(nrow(auto),397/2),])
 head(auto[sample(nrow(auto),397/2),])
 head(auto[sample(nrow(auto),397/2),])
 head(auto[sample(nrow(auto),397/2),])
 train = auto[sample(nrow(auto),397/2),]
 [sample(nrow(auto),397/2),]
 sample(nrow(auto),397/2)
 train sample(nrow(auto),397/2)
 train = sample(nrow(auto),397/2)
 autp[train,]
 auto[train,]
 train = sample(nrow(auto),397/2)
 head(auto[train,])
 head(auto[!train,])
 traindata = auto[train,]
 testdata = auto[!train,]
 testdata
 traindata
 length(traindata)
 length(traindata$mpg)
 198*2
 summary(testdata)
 testdata = auto[!train]
 testdata
 testdata = auto[!train,]
 train
 summary(train)
 names(train)
 head(traindata)
 testdata = auto[!train,]
 testdata
 !train
 train
 ?sample
 sort(train)
 train_vals = train
 train = rep(false,397)
 train = rep(F,397)
 train
 help for
 ?for
 ?for
 help)for)
 help(for)
 help(for)
 help lapply()
 ?lapply
 sapply(train,
 ?sapply
 sapply(train,
 ?sapply
 train[train_vals]=T
 train
 traindata = auto[train,]
 traindata
 length(auto)
 length(traindata)
 length(traindata$mpg)
 testdata=auto[!train,]
 length(testdate$mpg)
 length(testdata$mpg)
 training_indices = sample(nrow(auto),397/2)
 train_bools = rep(F,length(auto$mpg))
 train_bools[training_indices]=T
 head(train_bools)
 length(train_bools)
 train_data = auto[train_bools,]
 test_data = auto[!train_bools,]
-summary(train_data)
+help(knn)
-summary(test_data)
+help(knn)
-lda.fit
+     train <- rbind(iris3[1:25,,1], iris3[1:25,,2], iris3[1:25,,3])
-library(MASS)
+     test <- rbind(iris3[26:50,,1], iris3[26:50,,2], iris3[26:50,,3])
-lda.fit
+train
-lda()
+test
 detach(auto)
 mpg01
 mpg
 attach(test_data)
 mpg01
 names()
 names(test_data)
 ldf.fit=lda(mpg01 ~ horsepower + weight + acceleration + displacement,data=test_data)
 detach(test_data)
 ldf.fit=lda(mpg01 ~ horsepower + weight + acceleration + displacement,data=test_data)
 lda.fit
 lda.fit=lda(mpg01 ~ horsepower + weight + acceleration + displacement,data=test_data)
 lda.fit
 summary(lda.fit)
 coefficients(lda.fit)
 plot(lda.fit)
 lda.pred=predict(lda.fit,test_data)
 lda.pred=predict(lda.fit, !training_bools)
 lda.pred=predict(lda.fit, !training_indices)
 test_data
 lda.pred=predict(lda.fit, test_data)
 lda.pred
 plot(lda.pred)
 names(lda.pred)
 lda.class=lda.pres$class
 lda.class=lda.pred$class
 table(lda.class,testdata)
 table(lda.class,test_data)
 length(lda.class)
 length(test_data)
 table(lda.class,test_data$mpg01)
 mean(lda.class==test_data$mpg01)
 sum(lda.pred$posterior[,1]>=.5)
 sum(lda.pred$posterior[,1]<.5)
 lda.pred$posterior[,1]
 sum(lda.pred$posterior<.5)
 lda.pred$posterior
 lda.pred$posterior<5
 lda.pred$posterior<.5
 sum(lda.pred$posterior<.5)
 sum(lda.pred$posterior<.5[,1])
 sum(lda.pred$posterior<.5[1])
 sum(lda.pred$posterior<.5[2])
 lda.pred$posterior<.5[2]
 lda.pred$posterior<.5
 lda.pred$posterior
 lda.pred$posterior[,1]
 lda.pred$posterior[1,]
 lda.pred$posterior[,2]
 lda.pred$posterior[,1]
 lda.pred$posterior[,1]>.5
 sum(lda.pred$posterior[,1]>.5)
 sum.bool(lda.pred$posterior[,1]>.5)
 ?sum
 sum.bool(lda.pred$posterior[,1]>.5,na.rm=T)
 sum(lda.pred$posterior[,1]>.5,na.rm=T)
 sum(lda.pred$posterior[,1]>.5)
 sum(lda.pred$posterior[,1]>.5,na.rm=T)
 sum(lda.pred$posterior[,1]>=.5,na.rm=T)
 sum(lda.pred$posterior[,1]<.5,na.rm=T)
 mean(lda.pred$[,1]==test_data,na.rm=T)
 lda.pred
 lda.pred$class
 lda.pred$class==test_data$mpg01
 mean(lda.pred$class==test_data$mpg01,na.rm=T)
 mean(lda.pred$class!=test_data$mpg01,na.rm=T)
 lda.fit=lda(mpg01 ~ horsepower + weight + acceleration + displacement,data=train_data)
 lda.fit
 mean(lda.pred$class==test_data$mpg01,na.rm=T)
 lda.pred=predict(lda.fit, test_data)
 mean(lda.pred$class==test_data$mpg01,na.rm=T)
 mean(lda.pred$class!=test_data$mpg01,na.rm=T)
 train_data == test_data
 train_data$mpg01 == test_data$mpg01
 lda.fit=lda(mpg01 ~ horsepower + weight + acceleration + displacement,data=train_data)
 lda.pred=predict(lda.fit, test_data)
 mean(lda.pred$class!=test_data$mpg01,na.rm=T)
 lda.pred
 lda.pred$posterior[,1]
 summary(lda.fit)
 lda.fit
 lda.fit=lda(mpg01 ~ horsepower + weight + acceleration + displacement,data=test_data)
 lda.fit
 mean(lda.pred$class!=test_data$mpg01,na.rm=T)
 lda.pred=predict(lda.fit, test_data)
 mean(lda.pred$class!=test_data$mpg01,na.rm=T)
 head(lda.pred)
 lda.fit=lda(mpg01 ~ horsepower + weight + acceleration + displacement,data=train_data)
 lda.pred=predict(lda.fit, test_data)
 head(lda.pred)
 mean(lda.pred$class!=test_data$mpg01,na.rm=T)
 qda.fit=qda(mpg01 ~ horsepower + weight + acceleration + displacement,data=train_data)
 qda.fit
 qda.class=predict(qda.fit,test_data)$class
 qda.class=predict(qda.fit,test_data,na.rm=T)$class
 qda.class=predict(qda.fit,test_data)$class
 qda.class
 mean(qda.pred$class!=test_data$mpg01,na.rm=T)
 qda.pred=predict(qda.fit,test_data)
 qda.pred=predict(qda.fit,test_data,na.rm=T)
 mean(qda.pred$class!=test_data$mpg01,na.rm=T)
 glm.fit=glm(mpg01 ~ horsepower + weight + acceleration + displacement,data=train_data,family=binomial)
 glm.probs=predict(glm.fit,test_data,type="response")
 glm.pred=rep(0,199)
 glm.pred[glm.probs>.5]=1
 table(glm.pred,test_data$mpg01)
 mean(glm.pred!=test_data$mpg01)
 library(class)
 ?cbind
 ?knn
-knn.fit = knn(train_data,test_data,auto$mpg01[training_indices])
+     cl <- factor(c(rep("s",25), rep("c",25), rep("v",25)))
-knn.fit = knn(train_data,test_data,auto$mpg01[training_indices],k=1)
+cl
-knn.fit = knn(train_data,test_data,auto$mpg01[training_indices],k=1)
+length(cl)
-?knn
+length(train)
-training_indices
+nrows(train)
-train_bools
+nrow(train)
 knn.fit = knn(train_data,test_data,auto$mpg01[train_bools],k=1)
 sdf = (mpg01<1)
 sdf = (auto$mpg01<1)
 sdf
 train_bools
 cbind(horsepower,displacement)
 cbind(train_data$horsepower,displacement)
 cbind(train_data$horsepower,train_data$displacement)
 cbind(auto$horsepower,auto$displacement)[train_bools]
 cbind(auto$horsepower,auto$displacement)[train_bools,]
 cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,]
 cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,]
 train.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,]
 test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,]
 train.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,]
 test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,]
 train.mpg01 = auto[train_bools]
 train.mpg01 = auto$mpg01[train_bools]
 test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,]
 train.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,]
 test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,]
 train.mpg01 = auto$mpg01[train_bools]
 set.seed(56)
 knn.pred = knn(train.X,test.X,train.mpg01,k=1)
 ?cbind
 ?Knn
 ?knn
 train.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,]
 test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,]
 train.mpg01 = auto$mpg01[train_bools]
 train.X = train.X[!is.na(train.X)]
 test.X = data.frame(test.X,
 train.mpg01 = train.mpg01[!is.na(train.mpg01)]
 knn.pred = knn(train.X,test.X,train.mpg01,k=1)
 length(train.mpg01)
 length(test.X)
 text.X
 test.X
 test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,]
 length(test.X)
 test.X
 knn.pred = knn(train.X,test.X,train.mpg01,k=1)
 train.X
 train.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,]
 train.X
 test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,]
 test.X
 knn.pred = knn(train.X,test.X,train.mpg01,k=1)
 ?knn
 length(train.X)
 length(train.X[1,])
 length(train.X[,1])
 ?knn
 plot(auto)
 train.X = cbind(auto$horsepower,auto$displacement)[train_bools,]
 test.X = cbind(auto$horsepower,auto$displacement)[!train_bools,]
 train.mpg01 = auto$mpg01[train_bools]
 knn.pred = knn(train.X,test.X,train.mpg01,k=1)
 train.X
-test.X
+train.mpg01 = auto$mpg01[train_bools]
 train.mpg01
 length(train.mpg01)
 nrow(train.X)
 knn(train.X,train.Y,train.mpg01,K=1)
 knn(train.X,train.Y,train.mpg01,k=1)
 knn(train.X,test.X,train.mpg01,k=1)
 train.X
 na.omit(train.X)
 ?na.omit
 na.omit(train.X)
 na.omit(train.X)
 knn(na.omit(train.X),test.X,train.mpg01,k=1)
 knn(na.omit(train.X),test.X,na.omit(train.mpg01),k=1)
 knn(na.omit(train.X),na.omit(test.X),na.omit(train.mpg01),k=1)
 train.mpg012 = na.omit(auto$mpg01)[train_bools]
 train.mpg012
 train.mpg01
 nrow(train)
 na.omit(auto)
 auto
 na.omit(auto)
 summary(auto)
 summary(na.omit(auto))
 Auto = na.omit(auto)
 auto = na.omit(auto)
 ncol(auto)
 nrow(auto)
 auto <- na.omit(auto)
 train_bools <- (auto$year %% 2 == 0)
 train_data = auto[train_bools,]
 test_data = auto[!train_bools,]
 train.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,]
 test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,]
 train.mpg01 = auto$mpg01[train_bools]
 knn.pred = knn(train.X,test.X,train.mpg01,k=1)
 mean(knn.pred != auto$mpg01)
 mean(knn.pred != test_data$mpg01)
 knn.pred = knn(train.X,test.X,train.mpg01,k=2)
 mean(knn.pred != test_data$mpg01)
 knn.pred = knn(train.X,test.X,train.mpg01,k=3)
 mean(knn.pred != test_data$mpg01)
 knn.pred = knn(train.X,test.X,train.mpg01,k=4)
 mean(knn.pred != test_data$mpg0)
 knn.pred
 length(knn.pred)
 dim(knn.pred)
 length(test_data)
 ncol(test_data)
 nrow(test_data)
 q()
--- a/hw3/answers
+++ b/hw3/answers
@ -85,6 +85,7 @@ Part B: Choose one of Questions 10 or 11
    given car gets high or low gas mileage based on the Auto data
    set.
 ──────────────────────────────────────────────────────────────────────────
    (a) Create a binary variable, mpg01 , that contains a 1 if mpg
    contains a value above its median, and a 0 if mpg contains a
    value below its median. You can compute the median using the
@ -92,6 +93,9 @@ Part B: Choose one of Questions 10 or 11
    data.frame() function to create a single data set containing
    both mpg01 and the other Auto variables.
        > auto$mpg01=rep(0,397)
        > auto$mpg01[auto$mpg>median(auto$mpg)]=1
 > auto$mpg01
  [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 1 1 1 1 0 0 0 0 0 1 1 1 1 0 0 0 0
 [38] 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
@ -106,6 +110,7 @@ Part B: Choose one of Questions 10 or 11
 [371] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1
 ──────────────────────────────────────────────────────────────────────────
    (b) Explore the data graphically in order to investigate the
    association between mpg01 and the other features. Which of the
    other features seem most likely to be useful in predicting mpg01
@ -119,6 +124,16 @@ Part B: Choose one of Questions 10 or 11
        Displacement is on the cusp and the other variables don't
        have a terribly useful relationship with this median.
        The boxplots indicate that acceleration really isn't a great
        predictor of mpg01, but displacement is. It also confirms
        horsepower and weight as good predictors, and cylinders also
        seems to be very strong, even though I didn't take that from
        the scatter plots.
        I will use mpg01 ~ horsepower + weight + cylinders + displacement
 ──────────────────────────────────────────────────────────────────────────
    (c) Split the data into a training set and a test set.
        Seems like a 50/50 random sampling is appropriate enough. 
@ -127,87 +142,133 @@ Part B: Choose one of Questions 10 or 11
        > train_bools = rep(F,length(auto$mpg))
        > train_bools[training_indices]=T
        > head(train_bools)
-        [1] FALSE  TRUE FALSE FALSE  TRUE FALSE
+        [1]  TRUE  TRUE  TRUE FALSE  TRUE FALSE
        > length(train_bools)
        [1] 397
        > train_data = auto[train_bools,]
        > test_data = auto[!train_bools,]
        Actually, I later changed the split: a solution I found online
        used a different train/test split, and since I was having trouble
        with the KNN model I followed their approach. I used:
        > train_bools <- (auto$year %% 2 == 0)
        and then kept the rest of the steps the same.
 ──────────────────────────────────────────────────────────────────────────
    (d) Perform LDA on the training data in order to predict mpg01
    using the variables that seemed most associated with mpg01 in
    (b). What is the test error of the model obtained?
        > lda.fit
        Call:
-        lda(mpg01 ~ horsepower + weight + acceleration + displacement, 
+        lda(mpg01 ~ horsepower + weight + cylinders + displacement, 
            data = train_data)
        Prior probabilities of groups:
                0         1 
-        0.5431472 0.4568528 
+        0.4666667 0.5333333 
        Group means:
-          horsepower   weight acceleration displacement
+          horsepower   weight cylinders displacement
-        0  129.08411 3557.757     14.55981      269.729
+        0  131.96939 3579.827  6.755102     268.4082
-        1   79.64444 2345.233     16.39222      116.800
+        1   77.96429 2313.598  4.071429     111.7188
        Coefficients of linear discriminants:
                               LD1
-        horsepower    0.005678626
+        horsepower    0.0060634365
-        weight       -0.001137499
+        weight       -0.0011442212
-        acceleration -0.014950459
+        cylinders    -0.6390942259
-        displacement -0.007401647
+        displacement  0.0004517291
-        Error Rate against test data:
+
     ***Test Data Error Rate:
        > mean(lda.pred$class!=test_data$mpg01,na.rm=T)
-        [1] 0.1179487
+        [1] 0.1428571
 ──────────────────────────────────────────────────────────────────────────
    (e) Perform QDA on the training data in order to predict mpg01
    using the variables that seemed most associated with mpg01 in
    (b). What is the test error of the model obtained?
        > qda.fit=qda(mpg01 ~ horsepower + weight + acceleration + displacement,data=train_data)
        > qda.fit
        Call:
-        qda(mpg01 ~ horsepower + weight + acceleration + displacement, 
+        lda(mpg01 ~ horsepower + weight + cylinders + displacement, data = train_data)
            data = train_data)
        Prior probabilities of groups:
                0         1 
-        0.5431472 0.4568528 
+        0.4666667 0.5333333 
        Group means:
-          horsepower   weight acceleration displacement
+          horsepower   weight cylinders displacement
-        0  129.08411 3557.757     14.55981      269.729
+        0  131.96939 3579.827  6.755102     268.4082
-        1   79.64444 2345.233     16.39222      116.800
+        1   77.96429 2313.598  4.071429     111.7188
-        Error Rate:
+        Coefficients of linear discriminants:
                               LD1
        horsepower    0.0060634365
        weight       -0.0011442212
        cylinders    -0.6390942259
        displacement  0.0004517291
    ***Test Data Error Rate:
        > mean(qda.pred$class!=test_data$mpg01,na.rm=T)
-        [1] 0.1025641
+        [1] 0.1428571
 ──────────────────────────────────────────────────────────────────────────
    (f) Perform logistic regression on the training data in order to
    predict mpg01 using the variables that seemed most associated
    with mpg01 in (b). What is the test error of the model obtained?
-        > glm.fit=glm(mpg01 ~ horsepower + weight + acceleration + displacement,data=train_data,family=binomial)
+        > glm.fit=glm(mpg01 ~ horsepower + weight + cylinders + displacement,data=train_data,family=binomial)
        > glm.probs=predict(glm.fit,test_data,type="response")
        > glm.pred=rep(0,199)
        > glm.pred[glm.probs>.5]=1
     ***Test Data Error Rate:
        > mean(glm.pred!=test_data$mpg01)
-        [1] 0.120603
+        [1] 0.1407035
 ──────────────────────────────────────────────────────────────────────────
    (g) Perform KNN on the training data, with several values of K,
    in order to predict mpg01 . Use only the variables that seemed
    most associated with mpg01 in (b). What test errors do you
    obtain? Which value of K seems to perform the best on this data
    set?
       The knn method can't handle NA values, so rows with missing values are dropped first:
        > set.seed(1)
        > auto <- na.omit(auto)
        > train_bools <- (auto$year %% 2 == 0)
        > train_data = auto[train_bools,]
        > test_data = auto[!train_bools,]
        > train.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,]
        > test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,]
        > train.mpg01 = auto$mpg01[train_bools]
     ***Test Data Error Rates:
     k = 1
        > mean(knn.pred != test_data$mpg01)
        [1] 0.1483516
     k = 2
        > mean(knn.pred != test_data$mpg01)
        [1] 0.1593407
     k = 3
        > mean(knn.pred != test_data$mpg01)
        [1] 0.1648352
     k = 4
        > mean(knn.pred != test_data$mpg01)
        [1] 0.1813187
        k = 1 looks like the best, since the error rate increases with k.