finished hw3

This commit is contained in:
caes 2017-02-09 22:59:23 -05:00
parent b0c1b3ed08
commit 7d230c0b30
3 changed files with 152 additions and 438 deletions

Binary file not shown.


@@ -1,426 +1,79 @@
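# hw3 console history: predicting high/low gas mileage (mpg01) from the Auto data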
auto = read.table("auto.data",header=T,na.strings="?")
length(x=auto$mpg)
glm
glm.pred
help(rep)
glm.pred=rep(FALSE,397)
glm.pred
medium(auto$mpg)
median(auto$mpg)
glm.pred[auto$mpg>median(auto$mpg)]=T
glm.pred
contour(auto)
contour(glm.pred ~ auto$mpg)
contour(glm.pred,auto$mpg)
help(contour)
contour(auto$mpg,auto$horsepower,glm.pred)
glm.pred
length(glm.pred)
table(glm.pred,auto$mpg)
table(glm.pred,auto$mpg,auto$horsepower)
glm.pred=rep(0,397)
glm.pred[auto$mpg>median(auto$mpg)]=1
glm.pred
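# (a) Create the binary response mpg01: 1 if mpg is above its median, 0 otherwise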
auto$mpg01=rep(0,397)
auto$mpg01[auto$mpg>median(auto$mpg)]=1
auto$mpg01
auto$mpg01
auto$mpg01
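# (b) Explore the association between mpg01 and the other features graphically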
plots(auto)
plot(auto)
boxplot(auto)
boxplot.matrix(auto)
help(boxplot)
boxplot(auto$mpg01,auto)
boxplot(auto$mpg,auto)
boxplot(auto$mpg)
boxplot(auto)
boxplot(mpg01 ~ auto)
boxplot(mpg01 ~)
boxplot(auto$mpg01 ~ auto)
attach(auto)
boxplot(mpg01)
boxplot(mpg01 ~ auto)
boxplot(mpg01 ~ auto,auto)
boxplot(mpg01 ~ auto,data = auto)
help(plot.table)
plot.table(auto)
help(plot.table)
plot(auto)
plot(auto,t="box")
help(plot.table)
help(plot.table,plot.frame=1)
help(plot.table)
help(plot.table,frame.plot=1)
help(plot.table)
help(plot.table,frame.plot=is.num)
help(plot.table)
plot(auto,t="box",frame.plot=1)
plot(auto,frame.plot=1)
plot(auto,frame.plot=1)
plot(auto,frame.plot=is.num)
plot(auto,frame.plot=0)
plot(auto,frame.plot="0")
plot(auto,frame.plot="1")
plot(auto,frame.plot=TRUE)
plot(auto,frame.plot=FALSE)
plot(auto,frame.plot=TRUE)
plot(auto,frame.plot=T)
plot(auto,frame.plot=1)
boxplot(mpg~mpg01,auto)
boxplot(mpg01 ~ mpg,auto)
boxplot(mpg01 ~ *,auto)
boxplot(mpg01 ~ ,auto)
boxplot(mpg01 ~ auto,auto)
boxplot(mpg01,auto)
boxplot(auto)
boxplot(auto,y=mpg01)
boxplot(auto,y=mpg)
boxplot(data = auto)
boxplot(auto)
help(for)
plot(auto,frame.plot=1)
plot(auto)
names(auto)
auto$name
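# (c) Experiment with sample() to build a 50/50 train/test split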
help(sample)
x <- 1:12
x
sample(x)
help(sample)
sample(x,replace=T)
sample(x,replace=T)
sample(x,replace=F)
c
x
sample(x,replace=T)
x
help(sample)
sample(x[x>9])
sample(x[x>8])
help(sample)
x <- 1:10
sample(x[x>8])
sample(x[x>])
help(sample)
help(sample)
help(sample)
sample(auto,size=length(mpg01)/2)
x <- length(mpg01)
sample(x,size=length(mpg01)/2)
auto[sample(x,size=length(mpg01)/2)]
auto$mpg[sample(x,size=length(mpg01)/2)]
help(data.frame)
data.frame(
help(data.frame)
auto[sample(x,size=length(mpg01)/2)]
train = sample(x,size=length(mpg01)/2)
train =
auto[train]
auto$mpg[train]
auto$mpg[train,]
auto$mpg[train]
auto$mpg[23]
auto$mpg[228]
auto$mpg[391]
auto.test=auto[!train]
auto.train=auto[train]
auto.test
summary(auto.test)
train=(mpg<15)
train
train = (sample(x,size=length(mpg01)/2))
train
head(auto)
auto[,train[
auto[,train]
train
help(contains)
auto[1,train]
train
auto[[,train]]
auto[[1,train]]
autoi
head(auto)
head(auto[sample(nrow(auto),397/2)])
head(auto[sample(nrow(auto),3)])
data = data.frame(auto)
data
head(data[sample(nrow(data),3)])
nrow(data)
head(data[sample(ncol(data),3)])
head(data[sample(ncol(data),397/2)])
head(data[sample(ncol(data),3)])
head(data[sample(ncol(data),3)])
head(data[sample(ncol(data),3)])
head(data[sample(ncol(data),3)])
head(data[,sample(ncol(data),3)])
head(data[,sample(ncol(data),3)])
head(data[,sample(ncol(data),3)])
head(data[,sample(ncol(data),3)])
head(data[,sample(ncol(data),3)])
head(data[sample(ncol(data),3),])
head(data[sample(ncol(data),3),])
head(data[sample(ncol(data),3),])
head(data[sample(nrow(data),3),])
head(data[sample(nrow(data),397/2),])
head(data[sample(nrow(data),397/2),])
head(data[sample(nrow(data),397/2),])
head(data[sample(nrow(data),397/2),])
head(data[sample(nrow(data),397/2),])
head(auto[sample(nrow(auto),397/2),])
head(auto[sample(nrow(auto),397/2),])
head(auto[sample(nrow(auto),397/2),])
head(auto[sample(nrow(auto),397/2),])
head(auto[sample(nrow(auto),397/2),])
head(auto[sample(nrow(auto),397/2),])
head(auto[sample(nrow(auto),397/2),])
train = auto[sample(nrow(auto),397/2),]
[sample(nrow(auto),397/2),]
sample(nrow(auto),397/2)
train sample(nrow(auto),397/2)
train = sample(nrow(auto),397/2)
autp[train,]
auto[train,]
train = sample(nrow(auto),397/2)
head(auto[train,])
head(auto[!train,])
traindata = auto[train,]
testdata = auto[!train,]
testdata
traindata
length(traindata)
length(traindata$mpg)
198*2
summary(testdata)
testdata = auto[!train]
testdata
testdata = auto[!train,]
train
summary(train)
names(train)
head(traindata)
testdata = auto[!train,]
testdata
!train
train
?sample
sort(train)
train_vals = train
train = rep(false,397)
train = rep(F,397)
train
help for
?for
?for
help)for)
help(for)
help(for)
help lapply()
?lapply
sapply(train,
?sapply
sapply(train,
?sapply
train[train_vals]=T
train
traindata = auto[train,]
traindata
length(auto)
length(traindata)
length(traindata$mpg)
testdata=auto[!train,]
length(testdate$mpg)
length(testdata$mpg)
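# Mark a random half of the rows as the training set with a logical vector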
training_indices = sample(nrow(auto),397/2)
train_bools = rep(F,length(auto$mpg))
train_bools[training_indices]=T
head(train_bools)
length(train_bools)
library(ISLR)
library(MASS)
library(class)
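# Final split used in the writeup: training set = even model years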
train_bools <- (auto$year %% 2 == 0)
train_data = auto[train_bools,]
test_data = auto[!train_bools,]
summary(train_data)
summary(test_data)
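# (d) Fit LDA on the training data and compute the test error rate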
lda.fit
library(MASS)
lda.fit
lda()
detach(auto)
mpg01
mpg
attach(test_data)
mpg01
names()
names(test_data)
ldf.fit=lda(mpg01 ~ horsepower + weight + acceleration + displacement,data=test_data)
detach(test_data)
ldf.fit=lda(mpg01 ~ horsepower + weight + acceleration + displacement,data=test_data)
lda.fit
lda.fit=lda(mpg01 ~ horsepower + weight + acceleration + displacement,data=test_data)
lda.fit
summary(lda.fit)
coefficients(lda.fit)
plot(lda.fit)
lda.pred=predict(lda.fit,test_data)
lda.pred=predict(lda.fit, !training_bools)
lda.pred=predict(lda.fit, !training_indices)
test_data
lda.pred=predict(lda.fit, test_data)
lda.pred
plot(lda.pred)
names(lda.pred)
lda.class=lda.pres$class
lda.class=lda.pred$class
table(lda.class,testdata)
table(lda.class,test_data)
length(lda.class)
length(test_data)
table(lda.class,test_data$mpg01)
mean(lda.class==test_data$mpg01)
sum(lda.pred$posterior[,1]>=.5)
sum(lda.pred$posterior[,1]<.5)
lda.pred$posterior[,1]
sum(lda.pred$posterior<.5)
lda.pred$posterior
lda.pred$posterior<5
lda.pred$posterior<.5
sum(lda.pred$posterior<.5)
sum(lda.pred$posterior<.5[,1])
sum(lda.pred$posterior<.5[1])
sum(lda.pred$posterior<.5[2])
lda.pred$posterior<.5[2]
lda.pred$posterior<.5
lda.pred$posterior
lda.pred$posterior[,1]
lda.pred$posterior[1,]
lda.pred$posterior[,2]
lda.pred$posterior[,1]
lda.pred$posterior[,1]>.5
sum(lda.pred$posterior[,1]>.5)
sum.bool(lda.pred$posterior[,1]>.5)
?sum
sum.bool(lda.pred$posterior[,1]>.5,na.rm=T)
sum(lda.pred$posterior[,1]>.5,na.rm=T)
sum(lda.pred$posterior[,1]>.5)
sum(lda.pred$posterior[,1]>.5,na.rm=T)
sum(lda.pred$posterior[,1]>=.5,na.rm=T)
sum(lda.pred$posterior[,1]<.5,na.rm=T)
mean(lda.pred$[,1]==test_data,na.rm=T)
lda.pred
lda.pred$class
lda.pred$class==test_data$mpg01
mean(lda.pred$class==test_data$mpg01,na.rm=T)
mean(lda.pred$class!=test_data$mpg01,na.rm=T)
lda.fit=lda(mpg01 ~ horsepower + weight + acceleration + displacement,data=train_data)
lda.fit
mean(lda.pred$class==test_data$mpg01,na.rm=T)
lda.pred=predict(lda.fit, test_data)
mean(lda.pred$class==test_data$mpg01,na.rm=T)
mean(lda.pred$class!=test_data$mpg01,na.rm=T)
train_data == test_data
train_data$mpg01 == test_data$mpg01
lda.fit=lda(mpg01 ~ horsepower + weight + acceleration + displacement,data=train_data)
lda.pred=predict(lda.fit, test_data)
mean(lda.pred$class!=test_data$mpg01,na.rm=T)
lda.pred
lda.pred$posterior[,1]
summary(lda.fit)
lda.fit
lda.fit=lda(mpg01 ~ horsepower + weight + acceleration + displacement,data=test_data)
lda.fit
mean(lda.pred$class!=test_data$mpg01,na.rm=T)
lda.pred=predict(lda.fit, test_data)
mean(lda.pred$class!=test_data$mpg01,na.rm=T)
head(lda.pred)
lda.fit=lda(mpg01 ~ horsepower + weight + acceleration + displacement,data=train_data)
lda.pred=predict(lda.fit, test_data)
head(lda.pred)
mean(lda.pred$class!=test_data$mpg01,na.rm=T)
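# (e) QDA with the same predictors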
qda.fit=qda(mpg01 ~ horsepower + weight + acceleration + displacement,data=train_data)
qda.fit
qda.class=predict(qda.fit,test_data)$class
qda.class=predict(qda.fit,test_data,na.rm=T)$class
qda.class=predict(qda.fit,test_data)$class
qda.class
mean(qda.pred$class!=test_data$mpg01,na.rm=T)
qda.pred=predict(qda.fit,test_data)
qda.pred=predict(qda.fit,test_data,na.rm=T)
mean(qda.pred$class!=test_data$mpg01,na.rm=T)
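# (f) Logistic regression on the training data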
glm.fit=glm(mpg01 ~ horsepower + weight + acceleration + displacement,data=train_data,family=binomial)
glm.probs=predict(glm.fit,test_data,type="response")
glm.pred=rep(0,199)
glm.pred[glm.probs>.5]=1
table(glm.pred,test_data$mpg01)
mean(glm.pred!=test_data$mpg01)
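# (g) KNN: load the class package and work out the input format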
library(class)
?cbind
help(knn)
help(knn)
train <- rbind(iris3[1:25,,1], iris3[1:25,,2], iris3[1:25,,3])
test <- rbind(iris3[26:50,,1], iris3[26:50,,2], iris3[26:50,,3])
train
test
?knn
knn.fit = knn(train_data,test_data,auto$mpg01[training_indices])
knn.fit = knn(train_data,test_data,auto$mpg01[training_indices],k=1)
knn.fit = knn(train_data,test_data,auto$mpg01[training_indices],k=1)
?knn
training_indices
train_bools
knn.fit = knn(train_data,test_data,auto$mpg01[train_bools],k=1)
sdf = (mpg01<1)
sdf = (auto$mpg01<1)
sdf
train_bools
cbind(horsepower,displacement)
cbind(train_data$horsepower,displacement)
cbind(train_data$horsepower,train_data$displacement)
cbind(auto$horsepower,auto$displacement)[train_bools]
cbind(auto$horsepower,auto$displacement)[train_bools,]
cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,]
cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,]
train.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,]
test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,]
train.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,]
test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,]
train.mpg01 = auto[train_bools]
train.mpg01 = auto$mpg01[train_bools]
test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,]
train.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,]
test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,]
train.mpg01 = auto$mpg01[train_bools]
set.seed(56)
knn.pred = knn(train.X,test.X,train.mpg01,k=1)
?cbind
?Knn
?knn
train.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,]
test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,]
train.mpg01 = auto$mpg01[train_bools]
train.X = train.X[!is.na(train.X)]
test.X = data.frame(test.X,
train.mpg01 = train.mpg01[!is.na(train.mpg01)]
knn.pred = knn(train.X,test.X,train.mpg01,k=1)
length(train.mpg01)
length(test.X)
text.X
test.X
test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,]
length(test.X)
test.X
knn.pred = knn(train.X,test.X,train.mpg01,k=1)
cl <- factor(c(rep("s",25), rep("c",25), rep("v",25)))
cl
length(cl)
length(train)
nrows(train)
nrow(train)
train.X
train.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,]
train.X
test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,]
test.X
knn.pred = knn(train.X,test.X,train.mpg01,k=1)
?knn
length(train.X)
length(train.X[1,])
length(train.X[,1])
?knn
plot(auto)
train.X = cbind(auto$horsepower,auto$displacement)[train_bools,]
test.X = cbind(auto$horsepower,auto$displacement)[!train_bools,]
train.mpg01 = auto$mpg01[train_bools]
knn.pred = knn(train.X,test.X,train.mpg01,k=1)
train.X
test.X
train.mpg01 = auto$mpg01[train_bools]
train.mpg01
length(train.mpg01)
nrow(train.X)
knn(train.X,train.Y,train.mpg01,K=1)
knn(train.X,train.Y,train.mpg01,k=1)
knn(train.X,test.X,train.mpg01,k=1)
train.X
na.omit(train.X)
?na.omit
na.omit(train.X)
na.omit(train.X)
knn(na.omit(train.X),test.X,train.mpg01,k=1)
knn(na.omit(train.X),test.X,na.omit(train.mpg01),k=1)
knn(na.omit(train.X),na.omit(test.X),na.omit(train.mpg01),k=1)
train.mpg012 = na.omit(auto$mpg01)[train_bools]
train.mpg012
train.mpg01
nrow(train)
na.omit(auto)
auto
na.omit(auto)
summary(auto)
summary(na.omit(auto))
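# knn() cannot handle NAs: drop incomplete rows, then rebuild the split and predictor matrices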
Auto = na.omit(auto)
auto = na.omit(auto)
ncol(auto)
nrow(auto)
auto <- na.omit(auto)
train_bools <- (auto$year %% 2 == 0)
train_data = auto[train_bools,]
test_data = auto[!train_bools,]
train.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,]
test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,]
train.mpg01 = auto$mpg01[train_bools]
knn.pred = knn(train.X,test.X,train.mpg01,k=1)
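# Compare test error rates for k = 1 through 4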
mean(knn.pred != auto$mpg01)
mean(knn.pred != test_data$mpg01)
knn.pred = knn(train.X,test.X,train.mpg01,k=2)
mean(knn.pred != test_data$mpg01)
knn.pred = knn(train.X,test.X,train.mpg01,k=3)
mean(knn.pred != test_data$mpg01)
knn.pred = knn(train.X,test.X,train.mpg01,k=4)
mean(knn.pred != test_data$mpg0)
knn.pred
length(knn.pred)
dim(knn.pred)
length(test_data)
ncol(test_data)
nrow(test_data)
q()


@@ -85,6 +85,7 @@ Part B: Choose one of Questions 10 or 11
given car gets high or low gas mileage based on the Auto data
set.
──────────────────────────────────────────────────────────────────────────
(a) Create a binary variable, mpg01, that contains a 1 if mpg
contains a value above its median, and a 0 if mpg contains a
value below its median. You can compute the median using the
@@ -92,6 +93,9 @@ Part B: Choose one of Questions 10 or 11
data.frame() function to create a single data set containing
both mpg01 and the other Auto variables.
> auto$mpg01=rep(0,397)
> auto$mpg01[auto$mpg>median(auto$mpg)]=1
> auto$mpg01
[1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 1 1 1 1 0 0 0 0 0 1 1 1 1 0 0 0 0
[38] 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
@@ -106,6 +110,7 @@ Part B: Choose one of Questions 10 or 11
[371] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1
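(Equivalent one-liner, for reference; not what was run:)
> auto$mpg01 <- as.integer(auto$mpg > median(auto$mpg))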
──────────────────────────────────────────────────────────────────────────
(b) Explore the data graphically in order to investigate the
association between mpg01 and the other features. Which of the
other features seem most likely to be useful in predicting mpg01
@@ -119,6 +124,16 @@ Part B: Choose one of Questions 10 or 11
Displacement is on the cusp and the other variables don't
have a terribly useful relationship with this median.
The boxplots indicate that acceleration really isn't a great
predictor of mpg01, but displacement is. They also confirm
horsepower and weight as good predictors, and cylinders
seems very strong as well, even though I hadn't taken that
from the scatter plots.
I will use mpg01 ~ horsepower + weight + cylinders + displacement
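A minimal sketch of the plots behind this (scatterplot matrix plus one boxplot per candidate predictor):
> plot(auto)
> boxplot(horsepower ~ mpg01, data = auto)   # likewise for weight, cylinders, displacement, acceleration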
──────────────────────────────────────────────────────────────────────────
(c) Split the data into a training set and a test set.
Seems like a 50/50 random sampling is appropriate enough.
@@ -127,87 +142,133 @@ Part B: Choose one of Questions 10 or 11
> train_bools = rep(F,length(auto$mpg))
> train_bools[training_indices]=T
> head(train_bools)
[1] FALSE TRUE FALSE FALSE TRUE FALSE
[1] TRUE TRUE TRUE FALSE TRUE FALSE
> length(train_bools)
[1] 397
> train_data = auto[train_bools,]
> test_data = auto[!train_bools,]
Actually, I changed this later: a solution I found online
suggested a different train/test split, and I was having
trouble with the KNN model, so I followed their approach. I used:
> train <- (auto$year %% 2 == 0)
and kept the rest the same.
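Spelled out, the year-based split as run in the console session:
> train_bools <- (auto$year %% 2 == 0)
> train_data <- auto[train_bools,]
> test_data <- auto[!train_bools,]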
──────────────────────────────────────────────────────────────────────────
(d) Perform LDA on the training data in order to predict mpg01
using the variables that seemed most associated with mpg01 in
(b). What is the test error of the model obtained?
> lda.fit
Call:
lda(mpg01 ~ horsepower + weight + acceleration + displacement,
data = train_data)
lda(mpg01 ~ horsepower + weight + cylinders + displacement, data = train_data)
Prior probabilities of groups:
0 1
0.5431472 0.4568528
0.4666667 0.5333333
Group means:
horsepower weight acceleration displacement
0 129.08411 3557.757 14.55981 269.729
1 79.64444 2345.233 16.39222 116.800
horsepower weight cylinders displacement
0 131.96939 3579.827 6.755102 268.4082
1 77.96429 2313.598 4.071429 111.7188
Coefficients of linear discriminants:
LD1
horsepower 0.005678626
weight -0.001137499
acceleration -0.014950459
displacement -0.007401647
LD1
horsepower 0.0060634365
weight -0.0011442212
cylinders -0.6390942259
displacement 0.0004517291
Error Rate against test data:
***Test Data Error Rate:
> mean(lda.pred$class!=test_data$mpg01,na.rm=T)
[1] 0.1179487
[1] 0.1428571
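For reference, the calls behind this output and error rate (names as in the console session):
> lda.fit <- lda(mpg01 ~ horsepower + weight + cylinders + displacement, data = train_data)
> lda.pred <- predict(lda.fit, test_data)
> mean(lda.pred$class != test_data$mpg01, na.rm = T)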
──────────────────────────────────────────────────────────────────────────
(e) Perform QDA on the training data in order to predict mpg01
using the variables that seemed most associated with mpg01 in
(b). What is the test error of the model obtained?
> qda.fit=qda(mpg01 ~ horsepower + weight + acceleration + displacement,data=train_data)
> qda.fit
Call:
qda(mpg01 ~ horsepower + weight + acceleration + displacement,
data = train_data)
qda(mpg01 ~ horsepower + weight + cylinders + displacement, data = train_data)
Prior probabilities of groups:
0 1
0.5431472 0.4568528
0.4666667 0.5333333
Group means:
horsepower weight acceleration displacement
0 129.08411 3557.757 14.55981 269.729
1 79.64444 2345.233 16.39222 116.800
horsepower weight cylinders displacement
0 131.96939 3579.827 6.755102 268.4082
1 77.96429 2313.598 4.071429 111.7188
Error Rate:
***Test Data Error Rate:
> mean(qda.pred$class!=test_data$mpg01,na.rm=T)
[1] 0.1025641
[1] 0.1428571
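For reference, a sketch of the corresponding QDA calls with the cylinders formula used above:
> qda.fit <- qda(mpg01 ~ horsepower + weight + cylinders + displacement, data = train_data)
> qda.pred <- predict(qda.fit, test_data)
> mean(qda.pred$class != test_data$mpg01, na.rm = T)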
──────────────────────────────────────────────────────────────────────────
(f) Perform logistic regression on the training data in order to
predict mpg01 using the variables that seemed most associated
with mpg01 in (b). What is the test error of the model obtained?
> glm.fit=glm(mpg01 ~ horsepower + weight + acceleration + displacement,data=train_data,family=binomial)
> glm.fit=glm(mpg01 ~ horsepower + weight + cylinders + displacement,data=train_data,family=binomial)
> glm.probs=predict(glm.fit,test_data,type="response")
> glm.pred=rep(0,199)
> glm.pred[glm.probs>.5]=1
***Test Data Error Rate:
> mean(glm.pred!=test_data$mpg01)
[1] 0.120603
[1] 0.1407035
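The corresponding confusion matrix can be printed as in the console session:
> table(glm.pred, test_data$mpg01)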
──────────────────────────────────────────────────────────────────────────
(g) Perform KNN on the training data, with several values of K,
in order to predict mpg01. Use only the variables that seemed
most associated with mpg01 in (b). What test errors do you
obtain? Which value of K seems to perform the best on this data
set?
The knn() function can't handle NA values, so I dropped the incomplete rows and rebuilt the split first:
> set.seed(1)
> auto <- na.omit(auto)
> train_bools <- (auto$year %% 2 == 0)
> train_data = auto[train_bools,]
> test_data = auto[!train_bools,]
> train.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,]
> test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,]
> train.mpg01 = auto$mpg01[train_bools]
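The prediction itself, rerun for each value of k below:
> knn.pred <- knn(train.X, test.X, train.mpg01, k = 1)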
***Test Data Error Rates:
k = 1
> mean(knn.pred != test_data$mpg01)
[1] 0.1483516
k = 2
> mean(knn.pred != test_data$mpg01)
[1] 0.1593407
k = 3
> mean(knn.pred != test_data$mpg01)
[1] 0.1648352
k = 4
> mean(knn.pred != test_data$mpg01)
[1] 0.1813187
k = 1 looks like the best, since the error rate increases with k.
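A compact way to sweep k (a sketch, not what was run; uses the objects built above):
> for (k in 1:4) {
+   knn.pred <- knn(train.X, test.X, train.mpg01, k = k)
+   cat("k =", k, " error rate =", mean(knn.pred != test_data$mpg01), "\n")
+ }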