finished hw3

This commit is contained in:
caes 2017-02-09 22:59:23 -05:00
parent b0c1b3ed08
commit 7d230c0b30
3 changed files with 152 additions and 438 deletions

Binary file not shown.

View File

@@ -1,426 +1,79 @@
auto = read.table("auto.data",header=T,na.strings="?")
length(x=auto$mpg)
glm
glm.pred
help(rep)
glm.pred=rep(FALSE,397)
glm.pred
medium(auto$mpg)
median(auto$mpg)
glm.pred[auto$mpg>median(auto$mpg)]=T
glm.pred
contour(auto)
contour(glm.pred ~ auto$mpg)
contour(glm.pred,auto$mpg)
help(contour)
contour(auto$mpg,auto$horsepower,glm.pred)
glm.pred
length(glm.pred)
table(glm.pred,auto$mpg)
table(glm.pred,auto$mpg,auto$horsepower)
glm.pred=rep(0,397)
glm.pred[auto$mpg>median(auto$mpg)]=1
glm.pred
auto$mpg01=rep(0,397)
auto$mpg01[auto$mpg>median(auto$mpg)]=1
auto$mpg01
auto$mpg01
auto$mpg01
plots(auto)
plot(auto)
boxplot(auto)
boxplot.matrix(auto)
help(boxplot)
boxplot(auto$mpg01,auto)
boxplot(auto$mpg,auto)
boxplot(auto$mpg)
boxplot(auto)
boxplot(mpg01 ~ auto)
boxplot(mpg01 ~)
boxplot(auto$mpg01 ~ auto)
attach(auto)
boxplot(mpg01)
boxplot(mpg01 ~ auto)
boxplot(mpg01 ~ auto,auto)
boxplot(mpg01 ~ auto,data = auto)
help(plot.table)
plot.table(auto)
help(plot.table)
plot(auto)
plot(auto,t="box")
help(plot.table)
help(plot.table,plot.frame=1)
help(plot.table)
help(plot.table,frame.plot=1)
help(plot.table)
help(plot.table,frame.plot=is.num)
help(plot.table)
plot(auto,t="box",frame.plot=1)
plot(auto,frame.plot=1)
plot(auto,frame.plot=1)
plot(auto,frame.plot=is.num)
plot(auto,frame.plot=0)
plot(auto,frame.plot="0")
plot(auto,frame.plot="1")
plot(auto,frame.plot=TRUE)
plot(auto,frame.plot=FALSE)
plot(auto,frame.plot=TRUE)
plot(auto,frame.plot=T)
plot(auto,frame.plot=1)
boxplot(mpg~mpg01,auto)
boxplot(mpg01 ~ mpg,auto)
boxplot(mpg01 ~ *,auto)
boxplot(mpg01 ~ ,auto)
boxplot(mpg01 ~ auto,auto)
boxplot(mpg01,auto)
boxplot(auto)
boxplot(auto,y=mpg01)
boxplot(auto,y=mpg)
boxplot(data = auto)
boxplot(auto)
help(for)
plot(auto,frame.plot=1)
plot(auto)
names(auto)
auto$name
help(sample)
x <- 1:12
x
sample(x)
help(sample)
sample(x,replace=T)
sample(x,replace=T)
sample(x,replace=F)
c
x
sample(x,replace=T)
x
help(sample)
sample(x[x>9])
sample(x[x>8])
help(sample)
x <- 1:10
sample(x[x>8])
sample(x[x>])
help(sample)
help(sample)
help(sample)
sample(auto,size=length(mpg01)/2)
x <- length(mpg01)
sample(x,size=length(mpg01)/2)
auto[sample(x,size=length(mpg01)/2)]
auto$mpg[sample(x,size=length(mpg01)/2)]
help(data.frame)
data.frame(
help(data.frame)
auto[sample(x,size=length(mpg01)/2)]
train = sample(x,size=length(mpg01)/2)
train =
auto[train]
auto$mpg[train]
auto$mpg[train,]
auto$mpg[train]
auto$mpg[23]
auto$mpg[228]
auto$mpg[391]
auto.test=auto[!train]
auto.train=auto[train]
auto.test
summary(auto.test)
train=(mpg<15)
train
train = (sample(x,size=length(mpg01)/2))
train
head(auto)
auto[,train[
auto[,train]
train
help(contains)
auto[1,train]
train
auto[[,train]]
auto[[1,train]]
autoi
head(auto)
head(auto[sample(nrow(auto),397/2)])
head(auto[sample(nrow(auto),3)])
data = data.frame(auto)
data
head(data[sample(nrow(data),3)])
nrow(data)
head(data[sample(ncol(data),3)])
head(data[sample(ncol(data),397/2)])
head(data[sample(ncol(data),3)])
head(data[sample(ncol(data),3)])
head(data[sample(ncol(data),3)])
head(data[sample(ncol(data),3)])
head(data[,sample(ncol(data),3)])
head(data[,sample(ncol(data),3)])
head(data[,sample(ncol(data),3)])
head(data[,sample(ncol(data),3)])
head(data[,sample(ncol(data),3)])
head(data[sample(ncol(data),3),])
head(data[sample(ncol(data),3),])
head(data[sample(ncol(data),3),])
head(data[sample(nrow(data),3),])
head(data[sample(nrow(data),397/2),])
head(data[sample(nrow(data),397/2),])
head(data[sample(nrow(data),397/2),])
head(data[sample(nrow(data),397/2),])
head(data[sample(nrow(data),397/2),])
head(auto[sample(nrow(auto),397/2),])
head(auto[sample(nrow(auto),397/2),])
head(auto[sample(nrow(auto),397/2),])
head(auto[sample(nrow(auto),397/2),])
head(auto[sample(nrow(auto),397/2),])
head(auto[sample(nrow(auto),397/2),])
head(auto[sample(nrow(auto),397/2),])
train = auto[sample(nrow(auto),397/2),]
[sample(nrow(auto),397/2),]
sample(nrow(auto),397/2)
train sample(nrow(auto),397/2)
train = sample(nrow(auto),397/2)
autp[train,]
auto[train,]
train = sample(nrow(auto),397/2)
head(auto[train,])
head(auto[!train,])
traindata = auto[train,]
testdata = auto[!train,]
testdata
traindata
length(traindata)
length(traindata$mpg)
198*2
summary(testdata)
testdata = auto[!train]
testdata
testdata = auto[!train,]
train
summary(train)
names(train)
head(traindata)
testdata = auto[!train,]
testdata
!train
train
?sample
sort(train)
train_vals = train
train = rep(false,397)
train = rep(F,397)
train
help for
?for
?for
help)for)
help(for)
help(for)
help lapply()
?lapply
sapply(train,
?sapply
sapply(train,
?sapply
train[train_vals]=T
train
traindata = auto[train,]
traindata
length(auto)
length(traindata)
length(traindata$mpg)
testdata=auto[!train,]
length(testdate$mpg)
length(testdata$mpg)
training_indices = sample(nrow(auto),397/2)
train_bools = rep(F,length(auto$mpg))
train_bools[training_indices]=T
head(train_bools)
length(train_bools)
library(ISLR)
library(MASS)
library(class)
train_bools <- (auto$year %% 2 == 0)
train_data = auto[train_bools,]
test_data = auto[!train_bools,]
summary(train_data)
summary(test_data)
lda.fit
library(MASS)
lda.fit
lda()
detach(auto)
mpg01
mpg
attach(test_data)
mpg01
names()
names(test_data)
ldf.fit=lda(mpg01 ~ horsepower + weight + acceleration + displacement,data=test_data)
detach(test_data)
ldf.fit=lda(mpg01 ~ horsepower + weight + acceleration + displacement,data=test_data)
lda.fit
lda.fit=lda(mpg01 ~ horsepower + weight + acceleration + displacement,data=test_data)
lda.fit
summary(lda.fit)
coefficients(lda.fit)
plot(lda.fit)
lda.pred=predict(lda.fit,test_data)
lda.pred=predict(lda.fit, !training_bools)
lda.pred=predict(lda.fit, !training_indices)
test_data
lda.pred=predict(lda.fit, test_data)
lda.pred
plot(lda.pred)
names(lda.pred)
lda.class=lda.pres$class
lda.class=lda.pred$class
table(lda.class,testdata)
table(lda.class,test_data)
length(lda.class)
length(test_data)
table(lda.class,test_data$mpg01)
mean(lda.class==test_data$mpg01)
sum(lda.pred$posterior[,1]>=.5)
sum(lda.pred$posterior[,1]<.5)
lda.pred$posterior[,1]
sum(lda.pred$posterior<.5)
lda.pred$posterior
lda.pred$posterior<5
lda.pred$posterior<.5
sum(lda.pred$posterior<.5)
sum(lda.pred$posterior<.5[,1])
sum(lda.pred$posterior<.5[1])
sum(lda.pred$posterior<.5[2])
lda.pred$posterior<.5[2]
lda.pred$posterior<.5
lda.pred$posterior
lda.pred$posterior[,1]
lda.pred$posterior[1,]
lda.pred$posterior[,2]
lda.pred$posterior[,1]
lda.pred$posterior[,1]>.5
sum(lda.pred$posterior[,1]>.5)
sum.bool(lda.pred$posterior[,1]>.5)
?sum
sum.bool(lda.pred$posterior[,1]>.5,na.rm=T)
sum(lda.pred$posterior[,1]>.5,na.rm=T)
sum(lda.pred$posterior[,1]>.5)
sum(lda.pred$posterior[,1]>.5,na.rm=T)
sum(lda.pred$posterior[,1]>=.5,na.rm=T)
sum(lda.pred$posterior[,1]<.5,na.rm=T)
mean(lda.pred$[,1]==test_data,na.rm=T)
lda.pred
lda.pred$class
lda.pred$class==test_data$mpg01
mean(lda.pred$class==test_data$mpg01,na.rm=T)
mean(lda.pred$class!=test_data$mpg01,na.rm=T)
lda.fit=lda(mpg01 ~ horsepower + weight + acceleration + displacement,data=train_data)
lda.fit
mean(lda.pred$class==test_data$mpg01,na.rm=T)
lda.pred=predict(lda.fit, test_data)
mean(lda.pred$class==test_data$mpg01,na.rm=T)
mean(lda.pred$class!=test_data$mpg01,na.rm=T)
train_data == test_data
train_data$mpg01 == test_data$mpg01
lda.fit=lda(mpg01 ~ horsepower + weight + acceleration + displacement,data=train_data)
lda.pred=predict(lda.fit, test_data)
mean(lda.pred$class!=test_data$mpg01,na.rm=T)
lda.pred
lda.pred$posterior[,1]
summary(lda.fit)
lda.fit
lda.fit=lda(mpg01 ~ horsepower + weight + acceleration + displacement,data=test_data)
lda.fit
mean(lda.pred$class!=test_data$mpg01,na.rm=T)
lda.pred=predict(lda.fit, test_data)
mean(lda.pred$class!=test_data$mpg01,na.rm=T)
head(lda.pred)
lda.fit=lda(mpg01 ~ horsepower + weight + acceleration + displacement,data=train_data)
lda.pred=predict(lda.fit, test_data)
head(lda.pred)
mean(lda.pred$class!=test_data$mpg01,na.rm=T)
qda.fit=qda(mpg01 ~ horsepower + weight + acceleration + displacement,data=train_data)
qda.fit
qda.class=predict(qda.fit,test_data)$class
qda.class=predict(qda.fit,test_data,na.rm=T)$class
qda.class=predict(qda.fit,test_data)$class
qda.class
mean(qda.pred$class!=test_data$mpg01,na.rm=T)
qda.pred=predict(qda.fit,test_data)
qda.pred=predict(qda.fit,test_data,na.rm=T)
mean(qda.pred$class!=test_data$mpg01,na.rm=T)
glm.fit=glm(mpg01 ~ horsepower + weight + acceleration + displacement,data=train_data,family=binomial)
glm.probs=predict(glm.fit,test_data,type="response")
glm.pred=rep(0,199)
glm.pred[glm.probs>.5]=1
table(glm.pred,test_data$mpg01)
mean(glm.pred!=test_data$mpg01)
library(class)
?cbind
help(knn)
help(knn)
train <- rbind(iris3[1:25,,1], iris3[1:25,,2], iris3[1:25,,3])
test <- rbind(iris3[26:50,,1], iris3[26:50,,2], iris3[26:50,,3])
train
test
?knn
knn.fit = knn(train_data,test_data,auto$mpg01[training_indices])
knn.fit = knn(train_data,test_data,auto$mpg01[training_indices],k=1)
knn.fit = knn(train_data,test_data,auto$mpg01[training_indices],k=1)
?knn
training_indices
train_bools
knn.fit = knn(train_data,test_data,auto$mpg01[train_bools],k=1)
sdf = (mpg01<1)
sdf = (auto$mpg01<1)
sdf
train_bools
cbind(horsepower,displacement)
cbind(train_data$horsepower,displacement)
cbind(train_data$horsepower,train_data$displacement)
cbind(auto$horsepower,auto$displacement)[train_bools]
cbind(auto$horsepower,auto$displacement)[train_bools,]
cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,]
cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,]
train.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,]
test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,]
train.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,]
test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,]
train.mpg01 = auto[train_bools]
train.mpg01 = auto$mpg01[train_bools]
test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,]
train.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,]
test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,]
train.mpg01 = auto$mpg01[train_bools]
set.seed(56)
knn.pred = knn(train.X,test.X,train.mpg01,k=1)
?cbind
?Knn
?knn
train.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,]
test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,]
train.mpg01 = auto$mpg01[train_bools]
train.X = train.X[!is.na(train.X)]
test.X = data.frame(test.X,
train.mpg01 = train.mpg01[!is.na(train.mpg01)]
knn.pred = knn(train.X,test.X,train.mpg01,k=1)
length(train.mpg01)
length(test.X)
text.X
test.X
test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,]
length(test.X)
test.X
knn.pred = knn(train.X,test.X,train.mpg01,k=1)
cl <- factor(c(rep("s",25), rep("c",25), rep("v",25)))
cl
length(cl)
length(train)
nrows(train)
nrow(train)
train.X
train.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,]
train.X
test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,]
test.X
knn.pred = knn(train.X,test.X,train.mpg01,k=1)
?knn
length(train.X)
length(train.X[1,])
length(train.X[,1])
?knn
plot(auto)
train.X = cbind(auto$horsepower,auto$displacement)[train_bools,]
test.X = cbind(auto$horsepower,auto$displacement)[!train_bools,]
train.mpg01 = auto$mpg01[train_bools]
knn.pred = knn(train.X,test.X,train.mpg01,k=1)
train.X
test.X
train.mpg01 = auto$mpg01[train_bools]
train.mpg01
length(train.mpg01)
nrow(train.X)
knn(train.X,train.Y,train.mpg01,K=1)
knn(train.X,train.Y,train.mpg01,k=1)
knn(train.X,test.X,train.mpg01,k=1)
train.X
na.omit(train.X)
?na.omit
na.omit(train.X)
na.omit(train.X)
knn(na.omit(train.X),test.X,train.mpg01,k=1)
knn(na.omit(train.X),test.X,na.omit(train.mpg01),k=1)
knn(na.omit(train.X),na.omit(test.X),na.omit(train.mpg01),k=1)
train.mpg012 = na.omit(auto$mpg01)[train_bools]
train.mpg012
train.mpg01
nrow(train)
na.omit(auto)
auto
na.omit(auto)
summary(auto)
summary(na.omit(auto))
Auto = na.omit(auto)
auto = na.omit(auto)
ncol(auto)
nrow(auto)
auto <- na.omit(auto)
train_bools <- (auto$year %% 2 == 0)
train_data = auto[train_bools,]
test_data = auto[!train_bools,]
train.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,]
test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,]
train.mpg01 = auto$mpg01[train_bools]
knn.pred = knn(train.X,test.X,train.mpg01,k=1)
mean(knn.pred != auto$mpg01)
mean(knn.pred != test_data$mpg01)
knn.pred = knn(train.X,test.X,train.mpg01,k=2)
mean(knn.pred != test_data$mpg01)
knn.pred = knn(train.X,test.X,train.mpg01,k=3)
mean(knn.pred != test_data$mpg01)
knn.pred = knn(train.X,test.X,train.mpg01,k=4)
mean(knn.pred != test_data$mpg0)
knn.pred
length(knn.pred)
dim(knn.pred)
length(test_data)
ncol(test_data)
nrow(test_data)
q()

View File

@@ -85,6 +85,7 @@ Part B: Choose one of Questions 10 or 11
given car gets high or low gas mileage based on the Auto data
set.
──────────────────────────────────────────────────────────────────────────
(a) Create a binary variable, mpg01 , that contains a 1 if mpg
contains a value above its median, and a 0 if mpg contains a
value below its median. You can compute the median using the
@@ -92,6 +93,9 @@ Part B: Choose one of Questions 10 or 11
data.frame() function to create a single data set containing
both mpg01 and the other Auto variables.
> auto$mpg01=rep(0,397)
> auto$mpg01[auto$mpg>median(auto$mpg)]=1
> auto$mpg01
[1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 1 1 1 1 0 0 0 0 0 1 1 1 1 0 0 0 0
[38] 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
@@ -106,6 +110,7 @@ Part B: Choose one of Questions 10 or 11
[371] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1
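The data.frame() route the prompt mentions would look roughly like this (a sketch starting from the raw auto table, before mpg01 is attached; auto01 is just an illustrative name):
# alternative: build mpg01 and bundle it with the other Auto variables in one step
mpg01 <- as.numeric(auto$mpg > median(auto$mpg))
auto01 <- data.frame(auto, mpg01)   # auto01 is a hypothetical name for the combined set
table(auto01$mpg01)                 # roughly half 0s, half 1s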
──────────────────────────────────────────────────────────────────────────
(b) Explore the data graphically in order to investigate the
association between mpg01 and the other features. Which of the
other features seem most likely to be useful in predicting mpg01
@@ -119,6 +124,16 @@ Part B: Choose one of Questions 10 or 11
Displacement is on the cusp, and the other variables don't
have a terribly useful relationship with the median split.
The boxplots indicate that acceleration really isn't a great
predictor of mpg01, but displacement is. They also confirm
horsepower and weight as good predictors, and cylinders also
looks very strong, even though I didn't take that from
the scatter plots.
I will use mpg01 ~ horsepower + weight + cylinders + displacement
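The plotting commands themselves weren't pasted into the writeup; a sketch of what was run, reconstructed from the session history above, is roughly:
# scatterplot matrix of all the Auto variables (mpg01 included)
plot(auto)
# boxplots of each candidate predictor, split by mpg01
boxplot(horsepower ~ mpg01, data = auto, ylab = "horsepower")
boxplot(weight ~ mpg01, data = auto, ylab = "weight")
boxplot(displacement ~ mpg01, data = auto, ylab = "displacement")
boxplot(acceleration ~ mpg01, data = auto, ylab = "acceleration")
boxplot(cylinders ~ mpg01, data = auto, ylab = "cylinders")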
──────────────────────────────────────────────────────────────────────────
(c) Split the data into a training set and a test set.
Seems like a 50/50 random sampling is appropriate enough.
@@ -127,87 +142,133 @@ Part B: Choose one of Questions 10 or 11
> train_bools = rep(F,length(auto$mpg))
> train_bools[training_indices]=T
> head(train_bools)
[1] TRUE TRUE TRUE FALSE TRUE FALSE
> length(train_bools)
[1] 397
> train_data = auto[train_bools,]
> test_data = auto[!train_bools,]
Actually, I changed this afterwards: a solution I found
online suggested a different train/test split, and I was having
trouble with the KNN model, so I followed their style. I used:
> train_bools <- (auto$year %% 2 == 0)
and then kept the rest the same.
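So the final split, in one place (a sketch; this is the same year-parity rule as above):
# final split: even model years form the training set, odd years the test set
train_bools <- (auto$year %% 2 == 0)
train_data  <- auto[train_bools, ]
test_data   <- auto[!train_bools, ]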
──────────────────────────────────────────────────────────────────────────
(d) Perform LDA on the training data in order to predict mpg01
using the variables that seemed most associated with mpg01 in
(b). What is the test error of the model obtained?
> lda.fit
Call:
lda(mpg01 ~ horsepower + weight + cylinders + displacement, data = train_data)
Prior probabilities of groups:
        0         1 
0.4666667 0.5333333 
Group means:
  horsepower   weight cylinders displacement
0  131.96939 3579.827  6.755102     268.4082
1   77.96429 2313.598  4.071429     111.7188
Coefficients of linear discriminants:
                       LD1
horsepower    0.0060634365
weight       -0.0011442212
cylinders    -0.6390942259
displacement  0.0004517291
***Test Data Error Rate:
> mean(lda.pred$class!=test_data$mpg01,na.rm=T)
[1] 0.1428571
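For reference, the commands behind these numbers (a sketch; lda() and its predict method come from the MASS package):
library(MASS)
# fit LDA on the training years, then classify the held-out test years
lda.fit  <- lda(mpg01 ~ horsepower + weight + cylinders + displacement, data = train_data)
lda.pred <- predict(lda.fit, test_data)
# test error: fraction of test cars whose predicted class differs from mpg01
mean(lda.pred$class != test_data$mpg01, na.rm = TRUE)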
──────────────────────────────────────────────────────────────────────────
(e) Perform QDA on the training data in order to predict mpg01
using the variables that seemed most associated with mpg01 in
(b). What is the test error of the model obtained?
> qda.fit=qda(mpg01 ~ horsepower + weight + acceleration + displacement,data=train_data)
> qda.fit
Call:
lda(mpg01 ~ horsepower + weight + cylinders + displacement, data = train_data)
Prior probabilities of groups:
        0         1 
0.4666667 0.5333333 
Group means:
  horsepower   weight cylinders displacement
0  131.96939 3579.827  6.755102     268.4082
1   77.96429 2313.598  4.071429     111.7188
Coefficients of linear discriminants:
                       LD1
horsepower    0.0060634365
weight       -0.0011442212
cylinders    -0.6390942259
displacement  0.0004517291
***Test Data Error Rate:
> mean(qda.pred$class!=test_data$mpg01,na.rm=T)
[1] 0.1428571
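A sketch of the corresponding QDA commands, assuming the same cylinders formula as in (d) (the session history above shows an acceleration-based fit here; qda() is also in MASS, and its printout shows only the call, the priors, and the group means):
# same recipe with quadratic discriminant analysis
qda.fit  <- qda(mpg01 ~ horsepower + weight + cylinders + displacement, data = train_data)
qda.pred <- predict(qda.fit, test_data)
mean(qda.pred$class != test_data$mpg01, na.rm = TRUE)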
──────────────────────────────────────────────────────────────────────────
(f) Perform logistic regression on the training data in order to
predict mpg01 using the variables that seemed most associated
with mpg01 in (b). What is the test error of the model obtained?
> glm.fit=glm(mpg01 ~ horsepower + weight + cylinders + displacement,data=train_data,family=binomial)
> glm.probs=predict(glm.fit,test_data,type="response")
> glm.pred=rep(0,199)
> glm.pred[glm.probs>.5]=1
***Test Data Error Rate:
> mean(glm.pred!=test_data$mpg01)
[1] 0.1407035
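A small variation that sizes the prediction vector from the data instead of hard-coding 199, and that tolerates NA fitted probabilities (a sketch):
# one 0/1 prediction per test row; which() skips any NA probabilities
glm.pred <- rep(0, nrow(test_data))
glm.pred[which(glm.probs > .5)] <- 1
mean(glm.pred != test_data$mpg01, na.rm = TRUE)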
──────────────────────────────────────────────────────────────────────────
(g) Perform KNN on the training data, with several values of K,
in order to predict mpg01 . Use only the variables that seemed
most associated with mpg01 in (b). What test errors do you
obtain? Which value of K seems to perform the best on this data
set?
The knn method can't handle the NA values, so I dropped them first and rebuilt the split:
> set.seed(1)
> auto <- na.omit(auto)
> train_bools <- (auto$year %% 2 == 0)
> train_data = auto[train_bools,]
> test_data = auto[!train_bools,]
> train.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[train_bools,]
> test.X = cbind(auto$horsepower,auto$displacement,auto$weight,auto$acceleration)[!train_bools,]
> train.mpg01 = auto$mpg01[train_bools]
***Test Data Error Rates:
k = 1
> mean(knn.pred != test_data$mpg01)
[1] 0.1483516
k = 2
> mean(knn.pred != test_data$mpg01)
[1] 0.1593407
k = 3
> mean(knn.pred != test_data$mpg01)
[1] 0.1648352
k = 4
> mean(knn.pred != test_data$mpg01)
[1] 0.1813187
k = 1 looks like the best, since the error rate increases with k.
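Each of those numbers came from re-running the same call with a different k; a compact version of that sweep (a sketch, using knn() from the class package and the train.X / test.X / train.mpg01 objects built above):
library(class)
set.seed(1)                      # knn breaks ties randomly, so fix the seed
for (k in 1:4) {
  knn.pred <- knn(train.X, test.X, train.mpg01, k = k)
  cat("k =", k, " test error:", mean(knn.pred != test_data$mpg01), "\n")
}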