finished hw3

This commit is contained in:
caes 2017-02-09 22:59:23 -05:00
parent b0c1b3ed08
commit 7d230c0b30
3 changed files with 152 additions and 438 deletions

Binary file not shown.

View File

@ -1,426 +1,79 @@
auto = read.table("auto.data",header=T,na.strings="?") auto = read.table("auto.data",header=T,na.strings="?")
length(x=auto$mpg)
glm
glm.pred
help(rep)
glm.pred=rep(FALSE,397)
glm.pred
medium(auto$mpg)
median(auto$mpg)
glm.pred[auto$mpg>median(auto$mpg)]=T
glm.pred
contour(auto)
contour(glm.pred ~ auto$mpg)
contour(glm.pred,auto$mpg)
help(contour)
contour(auto$mpg,auto$horsepower,glm.pred)
glm.pred
length(glm.pred)
table(glm.pred,auto$mpg)
table(glm.pred,auto$mpg,auto$horsepower)
glm.pred=rep(0,397)
glm.pred[auto$mpg>median(auto$mpg)]=1
glm.pred
auto$mpg01=rep(0,397) auto$mpg01=rep(0,397)
auto$mpg01[auto$mpg>median(auto$mpg)]=1 auto$mpg01[auto$mpg>median(auto$mpg)]=1
auto$mpg01 library(ISLR)
auto$mpg01 library(MASS)
auto$mpg01 library(class)
plots(auto) train_bools <- (auto$year %% 2 == 0)
plot(auto)
boxplot(auto)
boxplot.matrix(auto)
help(boxplot)
boxplot(auto$mpg01,auto)
boxplot(auto$mpg,auto)
boxplot(auto$mpg)
boxplot(auto)
boxplot(mpg01 ~ auto)
boxplot(mpg01 ~)
boxplot(auto$mpg01 ~ auto)
attach(auto)
boxplot(mpg01)
boxplot(mpg01 ~ auto)
boxplot(mpg01 ~ auto,auto)
boxplot(mpg01 ~ auto,data = auto)
help(plot.table)
plot.table(auto)
help(plot.table)
plot(auto)
plot(auto,t="box")
help(plot.table)
help(plot.table,plot.frame=1)
help(plot.table)
help(plot.table,frame.plot=1)
help(plot.table)
help(plot.table,frame.plot=is.num)
help(plot.table)
plot(auto,t="box",frame.plot=1)
plot(auto,frame.plot=1)
plot(auto,frame.plot=1)
plot(auto,frame.plot=is.num)
plot(auto,frame.plot=0)
plot(auto,frame.plot="0")
plot(auto,frame.plot="1")
plot(auto,frame.plot=TRUE)
plot(auto,frame.plot=FALSE)
plot(auto,frame.plot=TRUE)
plot(auto,frame.plot=T)
plot(auto,frame.plot=1)
boxplot(mpg~mpg01,auto)
boxplot(mpg01 ~ mpg,auto)
boxplot(mpg01 ~ *,auto)
boxplot(mpg01 ~ ,auto)
boxplot(mpg01 ~ auto,auto)
boxplot(mpg01,auto)
boxplot(auto)
boxplot(auto,y=mpg01)
boxplot(auto,y=mpg)
boxplot(data = auto)
boxplot(auto)
help(for)
plot(auto,frame.plot=1)
plot(auto)
names(auto)
auto$name
help(sample)
x <- 1:12
x
sample(x)
help(sample)
sample(x,replace=T)
sample(x,replace=T)
sample(x,replace=F)
c
x
sample(x,replace=T)
x
help(sample)
sample(x[x>9])
sample(x[x>8])
help(sample)
x <- 1:10
sample(x[x>8])
sample(x[x>])
help(sample)
help(sample)
help(sample)
sample(auto,size=length(mpg01)/2)
x <- length(mpg01)
sample(x,size=length(mpg01)/2)
auto[sample(x,size=length(mpg01)/2)]
auto$mpg[sample(x,size=length(mpg01)/2)]
help(data.frame)
data.frame(
help(data.frame)
auto[sample(x,size=length(mpg01)/2)]
train = sample(x,size=length(mpg01)/2)
train =
auto[train]
auto$mpg[train]
auto$mpg[train,]
auto$mpg[train]
auto$mpg[23]
auto$mpg[228]
auto$mpg[391]
auto.test=auto[!train]
auto.train=auto[train]
auto.test
summary(auto.test)
train=(mpg<15)
train
train = (sample(x,size=length(mpg01)/2))
train
head(auto)
auto[,train[
auto[,train]
train
help(contains)
auto[1,train]
train
auto[[,train]]
auto[[1,train]]
autoi
head(auto)
head(auto[sample(nrow(auto),397/2)])
head(auto[sample(nrow(auto),3)])
data = data.frame(auto)
data
head(data[sample(nrow(data),3)])
nrow(data)
head(data[sample(ncol(data),3)])
head(data[sample(ncol(data),397/2)])
head(data[sample(ncol(data),3)])
head(data[sample(ncol(data),3)])
head(data[sample(ncol(data),3)])
head(data[sample(ncol(data),3)])
head(data[,sample(ncol(data),3)])
head(data[,sample(ncol(data),3)])
head(data[,sample(ncol(data),3)])
head(data[,sample(ncol(data),3)])
head(data[,sample(ncol(data),3)])
head(data[sample(ncol(data),3),])
head(data[sample(ncol(data),3),])
head(data[sample(ncol(data),3),])
head(data[sample(nrow(data),3),])
head(data[sample(nrow(data),397/2),])
head(data[sample(nrow(data),397/2),])
head(data[sample(nrow(data),397/2),])
head(data[sample(nrow(data),397/2),])
head(data[sample(nrow(data),397/2),])
head(auto[sample(nrow(auto),397/2),])
head(auto[sample(nrow(auto),397/2),])
head(auto[sample(nrow(auto),397/2),])
head(auto[sample(nrow(auto),397/2),])
head(auto[sample(nrow(auto),397/2),])
head(auto[sample(nrow(auto),397/2),])
head(auto[sample(nrow(auto),397/2),])
train = auto[sample(nrow(auto),397/2),]
[sample(nrow(auto),397/2),]
sample(nrow(auto),397/2)
train sample(nrow(auto),397/2)
train = sample(nrow(auto),397/2)
autp[train,]
auto[train,]
train = sample(nrow(auto),397/2)
head(auto[train,])
head(auto[!train,])
traindata = auto[train,]
testdata = auto[!train,]
testdata
traindata
length(traindata)
length(traindata$mpg)
198*2
summary(testdata)
testdata = auto[!train]
testdata
testdata = auto[!train,]
train
summary(train)
names(train)
head(traindata)
testdata = auto[!train,]
testdata
!train
train
?sample
sort(train)
train_vals = train
train = rep(false,397)
train = rep(F,397)
train
help for
?for
?for
help)for)
help(for)
help(for)
help lapply()
?lapply
sapply(train,
?sapply
sapply(train,
?sapply
train[train_vals]=T
train
traindata = auto[train,]
traindata
length(auto)
length(traindata)
length(traindata$mpg)
testdata=auto[!train,]
length(testdate$mpg)
length(testdata$mpg)
training_indices = sample(nrow(auto),397/2)
train_bools = rep(F,length(auto$mpg))
train_bools[training_indices]=T
head(train_bools)
length(train_bools)
train_data = auto[train_bools,] train_data = auto[train_bools,]
test_data = auto[!train_bools,] test_data = auto[!train_bools,]
summary(train_data) help(knn)
summary(test_data) help(knn)
lda.fit train <- rbind(iris3[1:25,,1], iris3[1:25,,2], iris3[1:25,,3])
library(MASS) test <- rbind(iris3[26:50,,1], iris3[26:50,,2], iris3[26:50,,3])
lda.fit train
lda() test
detach(auto)
mpg01
mpg
attach(test_data)
mpg01
names()
names(test_data)
ldf.fit=lda(mpg01 ~ horsepower + weight + acceleration + displacement,data=test_data)
detach(test_data)
ldf.fit=lda(mpg01 ~ horsepower + weight + acceleration + displacement,data=test_data)
lda.fit
lda.fit=lda(mpg01 ~ horsepower + weight + acceleration + displacement,data=test_data)
lda.fit
summary(lda.fit)
coefficients(lda.fit)
plot(lda.fit)
lda.pred=predict(lda.fit,test_data)
lda.pred=predict(lda.fit, !training_bools)
lda.pred=predict(lda.fit, !training_indices)
test_data
lda.pred=predict(lda.fit, test_data)
lda.pred
plot(lda.pred)
names(lda.pred)
lda.class=lda.pres$class
lda.class=lda.pred$class
table(lda.class,testdata)
table(lda.class,test_data)
length(lda.class)
length(test_data)
table(lda.class,test_data$mpg01)
mean(lda.class==test_data$mpg01)
sum(lda.pred$posterior[,1]>=.5)
sum(lda.pred$posterior[,1]<.5)
lda.pred$posterior[,1]
sum(lda.pred$posterior<.5)
lda.pred$posterior
lda.pred$posterior<5
lda.pred$posterior<.5
sum(lda.pred$posterior<.5)
sum(lda.pred$posterior<.5[,1])
sum(lda.pred$posterior<.5[1])
sum(lda.pred$posterior<.5[2])
lda.pred$posterior<.5[2]
lda.pred$posterior<.5
lda.pred$posterior
lda.pred$posterior[,1]
lda.pred$posterior[1,]
lda.pred$posterior[,2]
lda.pred$posterior[,1]
lda.pred$posterior[,1]>.5
sum(lda.pred$posterior[,1]>.5)
sum.bool(lda.pred$posterior[,1]>.5)
?sum
sum.bool(lda.pred$posterior[,1]>.5,na.rm=T)
sum(lda.pred$posterior[,1]>.5,na.rm=T)
sum(lda.pred$posterior[,1]>.5)
sum(lda.pred$posterior[,1]>.5,na.rm=T)
sum(lda.pred$posterior[,1]>=.5,na.rm=T)
sum(lda.pred$posterior[,1]<.5,na.rm=T)
mean(lda.pred$[,1]==test_data,na.rm=T)
lda.pred
lda.pred$class
lda.pred$class==test_data$mpg01
mean(lda.pred$class==test_data$mpg01,na.rm=T)
mean(lda.pred$class!=test_data$mpg01,na.rm=T)
lda.fit=lda(mpg01 ~ horsepower + weight + acceleration + displacement,data=train_data)
lda.fit
mean(lda.pred$class==test_data$mpg01,na.rm=T)
lda.pred=predict(lda.fit, test_data)
mean(lda.pred$class==test_data$mpg01,na.rm=T)
mean(lda.pred$class!=test_data$mpg01,na.rm=T)
train_data == test_data
train_data$mpg01 == test_data$mpg01
lda.fit=lda(mpg01 ~ horsepower + weight + acceleration + displacement,data=train_data)
lda.pred=predict(lda.fit, test_data)
mean(lda.pred$class!=test_data$mpg01,na.rm=T)
lda.pred
lda.pred$posterior[,1]
summary(lda.fit)
lda.fit
lda.fit=lda(mpg01 ~ horsepower + weight + acceleration + displacement,data=test_data)
lda.fit
mean(lda.pred$class!=test_data$mpg01,na.rm=T)
lda.pred=predict(lda.fit, test_data)
mean(lda.pred$class!=test_data$mpg01,na.rm=T)
head(lda.pred)
lda.fit=lda(mpg01 ~ horsepower + weight + acceleration + displacement,data=train_data)
lda.pred=predict(lda.fit, test_data)
head(lda.pred)
mean(lda.pred$class!=test_data$mpg01,na.rm=T)
qda.fit=qda(mpg01 ~ horsepower + weight + acceleration + displacement,data=train_data)
qda.fit
qda.class=predict(qda.fit,test_data)$class
qda.class=predict(qda.fit,test_data,na.rm=T)$class
qda.class=predict(qda.fit,test_data)$class
qda.class
mean(qda.pred$class!=test_data$mpg01,na.rm=T)
qda.pred=predict(qda.fit,test_data)
qda.pred=predict(qda.fit,test_data,na.rm=T)
mean(qda.pred$class!=test_data$mpg01,na.rm=T)
glm.fit=glm(mpg01 ~ horsepower + weight + acceleration + displacement,data=train_data,family=binomial)
glm.probs=predict(glm.fit,test_data,type="response")
glm.pred=rep(0,199)
glm.pred[glm.probs>.5]=1
table(glm.pred,test_data$mpg01)
mean(glm.pred!=test_data$mpg01)
library(class)
?cbind
?knn ?knn
knn.fit = knn(train_data,test_data,auto$mpg01[training_indices]) cl <- factor(c(rep("s",25), rep("c",25), rep("v",25)))
knn.fit = knn(train_data,test_data,auto$mpg01[training_indices],k=1) cl
knn.fit = knn(train_data,test_data,auto$mpg01[training_indices],k=1) length(cl)
?knn length(train)
training_indices nrows(train)
train_bools nrow(train)
knn.fit = knn(train_data,test_data,auto$mpg01[train_bools],k=1)
sdf = (mpg01<1)
sdf = (auto$mpg01<1)
sdf
train_bools
cbind(horsepower,displacement)
cbind(train_data$horsepower,displacement)
cbind(train_data$horsepower,train_data$displacement)
cbind(auto$horsepower,auto$displacement)[train_bools]
cbind(auto$horsepower,auto$displacement)[train_bools,]
cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,]
cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,]
train.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,]
test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,]
train.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,]
test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,]
train.mpg01 = auto[train_bools]
train.mpg01 = auto$mpg01[train_bools]
test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,]
train.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,]
test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,]
train.mpg01 = auto$mpg01[train_bools]
set.seed(56)
knn.pred = knn(train.X,test.X,train.mpg01,k=1)
?cbind
?Knn
?knn
train.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,]
test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,]
train.mpg01 = auto$mpg01[train_bools]
train.X = train.X[!is.na(train.X)]
test.X = data.frame(test.X,
train.mpg01 = train.mpg01[!is.na(train.mpg01)]
knn.pred = knn(train.X,test.X,train.mpg01,k=1)
length(train.mpg01)
length(test.X)
text.X
test.X
test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,]
length(test.X)
test.X
knn.pred = knn(train.X,test.X,train.mpg01,k=1)
train.X train.X
train.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,] train.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,]
train.X train.X
test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,]
test.X test.X
knn.pred = knn(train.X,test.X,train.mpg01,k=1)
?knn
length(train.X)
length(train.X[1,])
length(train.X[,1])
?knn
plot(auto)
train.X = cbind(auto$horsepower,auto$displacement)[train_bools,]
test.X = cbind(auto$horsepower,auto$displacement)[!train_bools,]
train.mpg01 = auto$mpg01[train_bools]
knn.pred = knn(train.X,test.X,train.mpg01,k=1)
train.X train.X
test.X train.mpg01 = auto$mpg01[train_bools]
train.mpg01 train.mpg01
length(train.mpg01)
nrow(train.X)
knn(train.X,train.Y,train.mpg01,K=1)
knn(train.X,train.Y,train.mpg01,k=1)
knn(train.X,test.X,train.mpg01,k=1)
train.X
na.omit(train.X)
?na.omit
na.omit(train.X)
na.omit(train.X)
knn(na.omit(train.X),test.X,train.mpg01,k=1)
knn(na.omit(train.X),test.X,na.omit(train.mpg01),k=1)
knn(na.omit(train.X),na.omit(test.X),na.omit(train.mpg01),k=1)
train.mpg012 = na.omit(auto$mpg01)[train_bools]
train.mpg012
train.mpg01
nrow(train)
na.omit(auto)
auto
na.omit(auto)
summary(auto)
summary(na.omit(auto))
Auto = na.omit(auto)
auto = na.omit(auto)
ncol(auto)
nrow(auto)
auto <- na.omit(auto)
train_bools <- (auto$year %% 2 == 0)
train_data = auto[train_bools,]
test_data = auto[!train_bools,]
train.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,]
test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,]
train.mpg01 = auto$mpg01[train_bools]
knn.pred = knn(train.X,test.X,train.mpg01,k=1) knn.pred = knn(train.X,test.X,train.mpg01,k=1)
mean(knn.pred != auto$mpg01)
mean(knn.pred != test_data$mpg01)
knn.pred = knn(train.X,test.X,train.mpg01,k=2)
mean(knn.pred != test_data$mpg01)
knn.pred = knn(train.X,test.X,train.mpg01,k=3)
mean(knn.pred != test_data$mpg01)
knn.pred = knn(train.X,test.X,train.mpg01,k=4)
mean(knn.pred != test_data$mpg0)
knn.pred
length(knn.pred)
dim(knn.pred)
length(test_data)
ncol(test_data)
nrow(test_data)
q() q()

View File

@ -85,6 +85,7 @@ Part B: Choose one of Questions 10 or 11
given car gets high or low gas mileage based on the Auto data given car gets high or low gas mileage based on the Auto data
set. set.
──────────────────────────────────────────────────────────────────────────
(a) Create a binary variable, mpg01 , that contains a 1 if mpg (a) Create a binary variable, mpg01 , that contains a 1 if mpg
contains a value above its median, and a 0 if mpg contains a contains a value above its median, and a 0 if mpg contains a
value below its median. You can compute the median using the value below its median. You can compute the median using the
@ -92,6 +93,9 @@ Part B: Choose one of Questions 10 or 11
data.frame() function to create a single data set containing data.frame() function to create a single data set containing
both mpg01 and the other Auto variables. both mpg01 and the other Auto variables.
> auto$mpg01=rep(0,397)
> auto$mpg01[auto$mpg>median(auto$mpg)]=1
> auto$mpg01 > auto$mpg01
[1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 1 1 1 1 0 0 0 0 0 1 1 1 1 0 0 0 0 [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 1 1 1 1 0 0 0 0 0 1 1 1 1 0 0 0 0
[38] 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 [38] 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
@ -106,6 +110,7 @@ Part B: Choose one of Questions 10 or 11
[371] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 [371] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1
──────────────────────────────────────────────────────────────────────────
(b) Explore the data graphically in order to investigate the (b) Explore the data graphically in order to investigate the
association between mpg01 and the other features. Which of the association between mpg01 and the other features. Which of the
other features seem most likely to be useful in predicting mpg01 other features seem most likely to be useful in predicting mpg01
@ -119,6 +124,16 @@ Part B: Choose one of Questions 10 or 11
Displacement is on the cusp and the other variables don't Displacement is on the cusp and the other variables don't
have a terribly useful relationship with this median. have a terribly useful relationship with this median.
The boxplots indicate that acceleration really isn't a great
predictor of mpg01, but displacement is. They also confirm
horsepower and weight as good predictors, and cylinders also
seems to be a very strong one, even though I didn't pick that up
from the scatter plots.
I will use mpg01 ~ horsepower + weight + cylinders + displacement
──────────────────────────────────────────────────────────────────────────
(c) Split the data into a training set and a test set. (c) Split the data into a training set and a test set.
Seems like a 50/50 random sampling is appropriate enough. Seems like a 50/50 random sampling is appropriate enough.
@ -127,87 +142,133 @@ Part B: Choose one of Questions 10 or 11
> train_bools = rep(F,length(auto$mpg)) > train_bools = rep(F,length(auto$mpg))
> train_bools[training_indices]=T > train_bools[training_indices]=T
> head(train_bools) > head(train_bools)
[1] FALSE TRUE FALSE FALSE TRUE FALSE [1] TRUE TRUE TRUE FALSE TRUE FALSE
> length(train_bools) > length(train_bools)
[1] 397 [1] 397
> train_data = auto[train_bools,] > train_data = auto[train_bools,]
> test_data = auto[!train_bools,] > test_data = auto[!train_bools,]
Actually, I changed this afterwards: a solution I found
online used a different train/test split, and I was having
trouble with the KNN model, so I followed their approach. I used:
> train_bools <- (auto$year %% 2 == 0)
and kept the rest the same.
──────────────────────────────────────────────────────────────────────────
(d) Perform LDA on the training data in order to predict mpg01 (d) Perform LDA on the training data in order to predict mpg01
using the variables that seemed most associated with mpg01 in using the variables that seemed most associated with mpg01 in
(b). What is the test error of the model obtained? (b). What is the test error of the model obtained?
> lda.fit > lda.fit
Call: Call:
lda(mpg01 ~ horsepower + weight + acceleration + displacement, lda(mpg01 ~ horsepower + weight + cylinders + displacement, data = train_data)
data = train_data)
Prior probabilities of groups: Prior probabilities of groups:
0 1 0 1
0.5431472 0.4568528 0.4666667 0.5333333
Group means: Group means:
horsepower weight acceleration displacement horsepower weight cylinders displacement
0 129.08411 3557.757 14.55981 269.729 0 131.96939 3579.827 6.755102 268.4082
1 79.64444 2345.233 16.39222 116.800 1 77.96429 2313.598 4.071429 111.7188
Coefficients of linear discriminants: Coefficients of linear discriminants:
LD1 LD1
horsepower 0.005678626 horsepower 0.0060634365
weight -0.001137499 weight -0.0011442212
acceleration -0.014950459 cylinders -0.6390942259
displacement -0.007401647 displacement 0.0004517291
Error Rate against test data:
***Test Data Error Rate:
> mean(lda.pred$class!=test_data$mpg01,na.rm=T) > mean(lda.pred$class!=test_data$mpg01,na.rm=T)
[1] 0.1179487 [1] 0.1428571
──────────────────────────────────────────────────────────────────────────
(e) Perform QDA on the training data in order to predict mpg01 (e) Perform QDA on the training data in order to predict mpg01
using the variables that seemed most associated with mpg01 in using the variables that seemed most associated with mpg01 in
(b). What is the test error of the model obtained? (b). What is the test error of the model obtained?
> qda.fit=qda(mpg01 ~ horsepower + weight + acceleration + displacement,data=train_data)
> qda.fit > qda.fit
Call: Call:
qda(mpg01 ~ horsepower + weight + acceleration + displacement, lda(mpg01 ~ horsepower + weight + cylinders + displacement, data = train_data)
data = train_data)
Prior probabilities of groups: Prior probabilities of groups:
0 1 0 1
0.5431472 0.4568528 0.4666667 0.5333333
Group means: Group means:
horsepower weight acceleration displacement horsepower weight cylinders displacement
0 129.08411 3557.757 14.55981 269.729 0 131.96939 3579.827 6.755102 268.4082
1 79.64444 2345.233 16.39222 116.800 1 77.96429 2313.598 4.071429 111.7188
Error Rate: Coefficients of linear discriminants:
LD1
horsepower 0.0060634365
weight -0.0011442212
cylinders -0.6390942259
displacement 0.0004517291
***Test Data Error Rate:
> mean(qda.pred$class!=test_data$mpg01,na.rm=T) > mean(qda.pred$class!=test_data$mpg01,na.rm=T)
[1] 0.1025641 [1] 0.1428571
──────────────────────────────────────────────────────────────────────────
(f) Perform logistic regression on the training data in order to (f) Perform logistic regression on the training data in order to
predict mpg01 using the variables that seemed most associated predict mpg01 using the variables that seemed most associated
with mpg01 in (b). What is the test error of the model obtained? with mpg01 in (b). What is the test error of the model obtained?
> glm.fit=glm(mpg01 ~ horsepower + weight + acceleration + displacement,data=train_data,family=binomial) > glm.fit=glm(mpg01 ~ horsepower + weight + cylinders + displacement,data=train_data,family=binomial)
> glm.probs=predict(glm.fit,test_data,type="response") > glm.probs=predict(glm.fit,test_data,type="response")
> glm.pred=rep(0,199) > glm.pred=rep(0,199)
> glm.pred[glm.probs>.5]=1 > glm.pred[glm.probs>.5]=1
***Test Data Error Rate:
> mean(glm.pred!=test_data$mpg01) > mean(glm.pred!=test_data$mpg01)
[1] 0.120603 [1] 0.1407035
──────────────────────────────────────────────────────────────────────────
(g) Perform KNN on the training data, with several values of K, (g) Perform KNN on the training data, with several values of K,
in order to predict mpg01 . Use only the variables that seemed in order to predict mpg01 . Use only the variables that seemed
most associated with mpg01 in (b). What test errors do you most associated with mpg01 in (b). What test errors do you
obtain? Which value of K seems to perform the best on this data obtain? Which value of K seems to perform the best on this data
set? set?
The knn method can't handle NA values, so I dropped the incomplete rows with na.omit() and rebuilt the split:
> set.seed(1)
> auto <- na.omit(auto)
> train_bools <- (auto$year %% 2 == 0)
> train_data = auto[train_bools,]
> test_data = auto[!train_bools,]
> train.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,]
> test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,]
> train.mpg01 = auto$mpg01[train_bools]
***Test Data Error Rates:
k = 1
> mean(knn.pred != test_data$mpg01)
[1] 0.1483516
k = 2
> mean(knn.pred != test_data$mpg01)
[1] 0.1593407
k = 3
> mean(knn.pred != test_data$mpg01)
[1] 0.1648352
k = 4
> mean(knn.pred != test_data$mpg01)
[1] 0.1813187
k = 1 looks like the best, since the error rate increases with k.