#Using pipes to select the features I have identified as potential candidates for modelling
car_data = car_data %>%
select(region,
price,
year,
manufacturer,
model,
condition,
cylinders,
fuel,
odometer,
title_status,
transmission,
drive,
size,
type,
paint_color,
state) %>%
#Mutating the features in order to use them more effectively
mutate(region = as.factor(region),
manufacturer = as.factor(manufacturer),
model = as.factor(model),
condition = as.factor(condition),
cylinders = as.factor(cylinders),
title_status = as.factor(title_status),
transmission = as.factor(transmission),
drive = as.factor(drive),
type = as.factor(type),
paint_color = as.factor(paint_color),
state = as.factor(state),
year = as.numeric(year),
size = as.factor(size))%>%
#Filtering Data
filter(between(price,500,50000),
odometer < 250000,
year > 1990)
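As a quick sanity check that the filtering behaved as intended, the summaries below should show prices between 500 and 50,000, odometer readings under 250,000 and years after 1990 (a minimal check; exact values depend on the listings data).
summary(car_data %>% select(price, year, odometer))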
par(mfrow=c(2,2))
#Exploratory analysis on a random sample of 10,000 observations
set.seed(1934)
gg_base_sample = ggplot(data = car_data[sample(nrow(car_data),
size = 10000),] %>%
na.omit())
#Histogram and Density Plots
hist_price = gg_base_sample+geom_histogram(aes(x = price),
color = 'red',
fill = 'light blue') +
labs(title = 'Histogram of Prices',
subtitle = "Randomly Sampled 10,000 Obs",
x = 'Price',
y = 'Count')
hist_odometer = gg_base_sample+geom_histogram(aes(x = odometer),
color = 'red',
fill = 'light blue') +
labs(title = 'Histogram of Mileage',
subtitle = "Randomly Sampled 10,000 Obs",
x = 'Mileage',
y = 'Count')
hist_year = gg_base_sample+geom_histogram(aes(x = year),
color = 'red',
fill = 'light blue') +
labs(title = 'Histogram of Year',
subtitle = "Randomly Sampled 10,000 Obs",
x = 'Year',
y = 'Count')+
theme_classic()
gridExtra::grid.arrange(hist_price,
hist_odometer,
hist_year,
nrow = 1)
hist_names = c('state', 'condition', 'cylinders', 'drive', 'manufacturer',
'paint_color', 'size', 'fuel', 'type', 'title_status')
hist_list = list()
for(i in 1:10){
hist_list[[i]] = gg_base_sample +
geom_bar(aes_string(x = hist_names[i],
fill = hist_names[i])) +
# scale_fill_viridis_d(option = 'viridis')+
scale_fill_brewer(palette = "Dark2")+
labs(title = paste('Histogram of', hist_names[i]),
subtitle = "Randomly Sampled 10,000 Obs",
x = hist_names[i],
y = 'Count')+theme_classic()+theme (legend.position = "none")
}
# Testing the interactions with price
price_odom = gg_base_sample + geom_point(aes(y = price,
x = odometer,
color = size))+
scale_color_brewer(palette = "Dark2")+
geom_smooth(aes(y = price,
x = odometer),
size = 2,
color = 'red')+
labs(title = "GAM of Price against Odometer Reading",
y = "List Price",
x = "Odometer Reading (Miles)")
price_year = gg_base_sample + geom_point(aes(y = price,
x = year,
color = size))+
scale_color_brewer(palette = "Dark2")+
geom_smooth(aes(y = price,
x = year),
size = 2,
color = 'red')+
labs(title = "GAM of Price against Year",
y = "List Price",
x = "Year")
gridExtra::grid.arrange(price_odom,
price_year,
nrow = 1,
ncol = 2)
gridExtra::grid.arrange(hist_list[[2]],
hist_list[[3]],
hist_list[[7]],
hist_list[[10]],
nrow = 2,
ncol = 2)
gridExtra::grid.arrange(hist_list[[6]],
hist_list[[4]],
nrow = 1,
ncol = 2)
options(warn=-1)
#boxplots
#Selecting the features I want to put into boxplots
bps = car_data %>%
select(condition,
cylinders,
title_status,
transmission,
drive,
type,
paint_color,
price) %>%
na.omit()
var_list = colnames(bps)
plot_list = list()
for(i in 1:7){
plot_list[[i]] = ggplot(data = bps) +
geom_boxplot(aes_string(x = var_list[i],
y = var_list[8],
fill = var_list[i])) +
labs(title = paste("Boxplot of Price vs", var_list[i]),
x = var_list[i],
y = "Price")+theme (legend.position = "none")
}
gridExtra::grid.arrange(plot_list[[1]],
plot_list[[2]],
plot_list[[3]],
plot_list[[7]],
nrow = 2,
ncol = 2)
gridExtra::grid.arrange(plot_list[[4]],
plot_list[[5]],
plot_list[[6]],
nrow = 2,
ncol = 2)
#Subsetting the Data into Training and Testing for my Model
#The training data consists of a randomly sampled 80% of the data
#Setting a seed for reproducibility, using the last four digits of my student ID
set.seed(1934)
#Sampling 80% to obtain indexes with which the data will be divided
train_index = sample(nrow(car_data),
size = (nrow(car_data)*0.8),
replace = FALSE)
#Using the Sampled Elements to subset the data into test and training sets
For the models below I use year, odometer, state and type as predictors: the exploratory plots show clear relationships between price and both year and odometer, the boxplots suggest that price varies with vehicle type, and state is included to capture regional differences in the market.
Linear regression is unlikely to be suitable for this data, as its assumptions (linearity, constant error variance, normal errors) will not hold, but I include it for prediction and as a baseline for comparison with the other models (a quick diagnostic check follows the fitted model below).
#Let's fit some models
#Removing the training data's NAs
train = car_data[train_index,] %>%
na.omit()
lr_model = lm(price~year+odometer+state+type,
data = train)
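As a quick illustration of the assumption violations mentioned above, the standard lm diagnostic plots can be drawn for the fitted model (a minimal sketch; the exact patterns depend on the data).
par(mfrow = c(2, 2))
plot(lr_model)
par(mfrow = c(1, 1))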
#This is the test dataset we will use for predictions; the NAs are omitted here as well
test = car_data[-train_index,] %>%
na.omit()
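A quick check of the realised split sizes; because the NAs are dropped after the 80/20 split, the final row counts will not be in an exact 4:1 ratio.
nrow(train)
nrow(test)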
lr_predictions = predict(lr_model,newdata=test)
lr_rmse = sqrt(mean((lr_predictions-test$price)**2))
library(glmnet)
#glmnet requires a numeric design matrix as its input
#-1 is to omit the intercept
covariateMatrix = model.matrix( ~ year+odometer+state+type -1 ,
data = train)
testMatrix = model.matrix( ~ year+odometer+state+type -1 ,
data = test)
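It is worth checking the expanded design matrix: model.matrix() one-hot encodes the factor covariates (state and type), so the matrix has many more columns than raw covariates.
dim(covariateMatrix)
head(colnames(covariateMatrix))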
#Fitting an initial ridge regression model
ridge = glmnet(covariateMatrix,
train$price,
alpha=0)
#With the design matrix in place, I use the glmnet package to fit a cross-validated ridge regression model on the training data
#I will use 10-fold cross-validation
cvRidge = cv.glmnet(covariateMatrix,
train$price,
alpha = 0,
nfolds = 10)
# lambda.min: value of lambda that minimises
# the cross-validation error
par(mfrow=c(1, 2))
plot(ridge, xvar="lambda")
abline(v=log(cvRidge$lambda.min), lwd=4, lty=2)
plot(cvRidge)
abline(v=log(cvRidge$lambda.min), lwd=4, lty=2)
#The cross-validation error is minimised at the log(lambda) value marked by the dashed vertical line
#The penalised fit differs noticeably from the unpenalised (maximum likelihood) fit, most likely because there is considerable collinearity among the covariates
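To make this concrete, the selected penalty and a few of the corresponding coefficients can be inspected directly (exact values depend on the cross-validation folds).
cvRidge$lambda.min
log(cvRidge$lambda.min)
coef(cvRidge, s = "lambda.min")[1:5, ]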
ridgePrediction = predict(cvRidge,newx=testMatrix)
ridge_rmse = sqrt(mean((ridgePrediction-test$price)**2))
lasso = glmnet(covariateMatrix,
train$price,
alpha=1)
#Using the same design matrix, I use glmnet to fit a cross-validated lasso model on the training data
#I will use 10-fold cross-validation
cvLasso = cv.glmnet(covariateMatrix,
train$price,
alpha = 1,
nfolds = 10)
# lambda.min: value of lambda that minimises
# the cross-validation error
par(mfrow=c(1, 2))
plot(lasso, xvar="lambda")
abline(v=log(cvLasso$lambda.min), lwd=4, lty=2)
plot(cvLasso)
abline(v=log(cvLasso$lambda.min), lwd=4, lty=2)
LassoPrediction = predict(cvLasso,newx=testMatrix)
lasso_rmse = sqrt(mean((LassoPrediction-test$price)**2))
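Unlike ridge, the lasso can shrink coefficients exactly to zero, so it is worth checking how many of the dummy-encoded covariates it actually retains at the selected penalty.
sum(coef(cvLasso, s = "lambda.min") != 0)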
a = 0.5 #Choosing an alpha parameter that weights the ridge and lasso penalties equally
elastic = glmnet(covariateMatrix,
train$price,
alpha=a)
#Using the same design matrix, I use glmnet to fit a cross-validated elastic net model on the training data
#I will use 10-fold cross-validation
cvElastic = cv.glmnet(covariateMatrix,
train$price,
alpha = a,
nfolds = 10)
# lambda.min: value of lambda that minimises
# the cross-validation error
par(mfrow=c(1, 2))
plot(elastic, xvar="lambda")
abline(v=log(cvElastic$lambda.min), lwd=4, lty=2)
plot(cvElastic)
abline(v=log(cvElastic$lambda.min), lwd=4, lty=2)
ElasticPrediction = predict(cvElastic,newx=testMatrix)
elastic_rmse = sqrt(mean((ElasticPrediction-test$price)**2))
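Rather than fixing alpha = 0.5, the mixing parameter could itself be chosen by cross-validation over a small grid; the sketch below is purely illustrative (alpha_grid, foldid and cv_errors are names introduced here, and a common foldid keeps the folds comparable across the candidate alphas).
alpha_grid = c(0.25, 0.5, 0.75)
foldid = sample(rep(1:10, length.out = nrow(covariateMatrix)))
cv_errors = sapply(alpha_grid, function(a_try) {
min(cv.glmnet(covariateMatrix, train$price, alpha = a_try, foldid = foldid)$cvm)
})
#Alpha value with the lowest cross-validated error
alpha_grid[which.min(cv_errors)]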
I also decided to fit a generalised additive model.
#Fitting penalised regression splines for the two numeric covariates, with basis dimension k = 10 for each smooth
PRS_Model = mgcv::gam(price ~ s(odometer, k = 10) + s(year, k = 10) + state + type,
data=train)
prs_pred = predict(PRS_Model,newdata=test)
prs_rmse = sqrt(mean((prs_pred-test$price)**2))
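To inspect the fitted spline model, summary() reports the effective degrees of freedom of each smooth and plot() draws the estimated smooth functions for odometer and year (a quick illustrative check).
summary(PRS_Model)
plot(PRS_Model, pages = 1)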
#Obtaining the prices from the test data to compare the random forest predictions against
#Subsetting the data because it is quite large
rf_train = train[sample(nrow(train),
size = 10000),]
rf_test = test[sample(nrow(test),
size = 10000),]
testResponse = test$price
testCovariates = test %>%
select(year, odometer, state, type)
# Testing the data on the random forest algorithm
library(randomForest)
rf1 = randomForest(price ~ year + odometer + state + type,
data = rf_train,
xtest = testCovariates,
ytest = testResponse,
mtry = 4,
ntree = 150,
do.trace = FALSE)
rf1_TE = sqrt(rf1$test$mse)
#Plotting the Root MSE against the number of trees to evaluate the point where returns begin to diminish
plot(rf1_TE,
type = 'p',
ylab = 'Test Root MSE',
xlab = "Number of Trees",
main = "Evaluation of Root MSE against Number of Trees")
abline(v = 40,
col = 'green',
lwd =3)
text(x = 100,
y = 7000,
"RMSE plateaus at about 40 trees",
cex = 1.5,
col = 'brown')
#Building Random Forest Model and Computing the RMSE
rf_model = randomForest(price ~ year + odometer + state + type,
data=train,
mtry = 4,
ntree = 40,
do.trace = FALSE)
rf_pred = predict(rf_model,newdata=test)
rf_rmse = sqrt(mean((rf_pred-test$price)**2))
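Variable importance gives a rough indication of which covariates drive the forest's predictions (node-impurity based importance, as reported by the randomForest package).
importance(rf_model)
varImpPlot(rf_model)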
set.seed(1934)
sample_comp = sample(length(rf_pred),
size = 500)
#Plotting an Example of Predictions vs Observed in a Selected 2 Dimensional Case
ggplot() + geom_line(aes(y = rf_pred[sample_comp],
x = test$odometer[sample_comp]),
color = 'blue',
size = 1.5,
alpha = 0.4)+
geom_line(aes(y = test$price[sample_comp],
x = test$odometer[sample_comp]),
color = 'yellow',
size = 1.5,
alpha = 0.4)+
labs(title = "Price vs Odometer",
subtitle = "Random Forest Predictions (Blue) vs Observed (Yellow)",
y = "Price ($)",
x = "Odometer (Mileage)")+theme_classic()
RMSE = as.data.frame(rbind("Random Forest" = rf_rmse,
"Penalised Regression Spline" = prs_rmse,
"Elastic Net Model" = elastic_rmse,
"LASSO Model" = lasso_rmse,
"Ridge Regression" = ridge_rmse,
"Multiple Linear Regression" = lr_rmse))
colnames(RMSE) = "Root Mean Squared Error"
RMSE
## Root Mean Squared Error
## Random Forest 5122.263
## Penalised Regression Spline 6261.475
## Elastic Net Model 6519.284
## LASSO Model 6530.457
## Ridge Regression 6557.076
## Multiple Linear Regression 6500.726
Though the error is still quite large, the two-dimensional comparison above shows that the random forest tracks the observed prices reasonably well. This is a positive sign for the random forest model and motivates further development and better methods to reduce the generalisation error.