NAME: Giacomo Saccaggi
BADGE: 833063
NICKNAME: g.saccaggi
TEAM: Random_Forest_512
ROUND: 1st
Packages we used: caret tree randomForest
Bike sharing systems are a means of renting bicycles where the process of obtaining membership, rental, and bike return is automated via a network of kiosk locations throughout a city. Using these systems, people are able rent a bike from a one location and return it to a different place on an as-needed basis. Currently, there are over 500 bike-sharing programs around the world.
The data generated by these systems makes them attractive for researchers because the duration of travel, departure location, arrival location, and time elapsed is explicitly recorded. Bike sharing systems therefore function as a sensor network, which can be used for studying mobility in a city. > In this competition, participants are asked to combine historical usage patterns with weather data > in order to forecast bike rental demand.
Submissions are evaluated by the Root Mean Squared Logarithmic Error (RMSLE). The RMSLE is calculated as
RMSE = sqrt( mean( (log(y+1) - log(yhat+1) )^2 ) )
Where:
yhat is your predicted count y is the actual count log(x) is the natural logarithm
During the competition, the leaderboard displays your partial score, which is the RMSE for 3000 >(random) observations of the test set. At the end of the contest, the leaderboard will display the final score, which is the RMSE for the >remaining 3493 observations of the test set. The final score will determine the final winner. This >method prevents users from overfitting to the leaderboard.
# get the required R packages
library(caret)
library(tree)
library(randomForest)
# import data
set.seed(512)
train <- read.csv("http://bee-fore.s3-eu-west-1.amazonaws.com/datasets/99.csv", stringsAsFactors=T)
test <- read.csv("http://bee-fore.s3-eu-west-1.amazonaws.com/datasets/100.csv", stringsAsFactors=F)
# Transform date
exdata<-anno<-mese<-giorno<-ora<-numeric()
for(i in 1:length(train$datetime)){
exdata<-as.character(train$datetime[i])
anno[i]<-as.numeric(substr(exdata, 1,4))
mese[i]<-as.numeric(substr(exdata, 6,7))
giorno[i]<-as.numeric(substr(exdata, 9,10))
ora[i]<-as.numeric(substr(exdata, 12,13))
};train<-cbind(train[,-1],anno,mese,giorno,ora)
exdata<-anno<-mese<-giorno<-ora<-numeric()
for(i in 1:length(test$datetime)){
exdata<-as.character(test$datetime[i])
anno[i]<-as.numeric(substr(exdata, 1,4))
mese[i]<-as.numeric(substr(exdata, 6,7))
giorno[i]<-as.numeric(substr(exdata, 9,10))
ora[i]<-as.numeric(substr(exdata, 12,13))
};test<-cbind(test[,-1],anno,mese,giorno,ora)
# Features engineering
##1
oreworkingday<-rep(0,length(train$ora))
orenoworkingday<-rep(0, length(train$ora))
d<-which(train$workingday==1)
for(i in 1:length(d)){oreworkingday[d[i]]<-train$ora[d[i]]}
d<-which(train$workingday==0)
orenoworkingday[d]<-train$ora[d]
train<-cbind(train,orenoworkingday,oreworkingday)
oreworkingday<-rep(0,length(test$ora))
orenoworkingday<-rep(0, length(test$ora))
d<-which(test$workingday==1)
for(i in 1:length(d)){oreworkingday[d[i]]<-test$ora[d[i]]}
d<-which(test$workingday==0)
orenoworkingday[d]<-test$ora[d]
test<-cbind(test,orenoworkingday,oreworkingday)
##2
train<-train[,-which(colnames(train)=="temp")]
test<-test[,-which(colnames(test)=="temp")]
##3
windinverno<-rep(0,length(train$windspeed))
d<-which(train$season==1)
for(i in 1:length(d)){windinverno[d[i]]<-train$windspeed[d[i]]}
windestate<-rep(0,length(train$windspeed))
d<-which(train$season==3)
for(i in 1:length(d)){windestate[d[i]]<-train$windspeed[d[i]]}
windprimavera<-rep(0,length(train$windspeed))
d<-which(train$season==2)
for(i in 1:length(d)){windprimavera[d[i]]<-train$windspeed[d[i]]}
windautunno<-rep(0,length(train$windspeed))
d<-which(train$season==4)
for(i in 1:length(d)){windautunno[d[i]]<-train$windspeed[d[i]]}
train<-cbind(train,windinverno,windautunno,windestate,windprimavera)
windinverno<-rep(0,length(test$windspeed))
d<-which(test$season==1)
for(i in 1:length(d)){windinverno[d[i]]<-test$windspeed[d[i]]}
windestate<-rep(0,length(test$windspeed))
d<-which(test$season==3)
for(i in 1:length(d)){windestate[d[i]]<-test$windspeed[d[i]]}
windprimavera<-rep(0,length(test$windspeed))
d<-which(test$season==2)
for(i in 1:length(d)){windprimavera[d[i]]<-test$windspeed[d[i]]}
windautunno<-rep(0,length(test$windspeed))
d<-which(test$season==4)
for(i in 1:length(d)){windautunno[d[i]]<-test$windspeed[d[i]]}
test<-cbind(test,windinverno,windautunno,windestate,windprimavera)
#4
y_1<-rep(0, length(test[,1]))
y_1train<-rep(0, length(train[,1]))
for(i in 1:12){
d<-which(train$mese==i & train$anno==2011)
dt<-which(test$mese==i & test$anno==2011)
fit<-lm(train$count~train$giorno)
yhat<-predict(fit, newdata=train[d,])
y_1train[d]<-yhat
yhat<-predict(fit, newdata=test[dt,])
y_1[dt]<-yhat
}
for(i in 1:12){
d<-which(train$mese==i & train$anno==2012)
dt<-which(test$mese==i & test$anno==2012)
fit<-lm(train$count~train$giorno)
yhat<-predict(fit, newdata=train[d,])
y_1train[d]<-yhat
yhat<-predict(fit, newdata=test[dt,])
y_1[dt]<-yhat
}
train<-cbind(train,y_1train)
colnames(train)[length(train)]<-"y_1"
test<-cbind(test,y_1)
# Outliers
mod <- lm(registered ~ ., data=train[,-c(5,7)])
cooksd <- cooks.distance(mod)
righeout <- as.numeric(names(cooksd)[(cooksd > 12*mean(cooksd, na.rm=T))])
train<-train[-as.integer(righeout), ]
mod <- lm(casual ~ ., data=train[,-c(6,7)])
cooksd <- cooks.distance(mod)
righeout <- as.numeric(names(cooksd)[(cooksd > 12*mean(cooksd, na.rm=T))])
train<-train[-as.integer(righeout), ]
# Models
as.numeric.factor <- function(x) {as.numeric(levels(x))[x]}
fit.rf=randomForest(as.factor(log1p(train$registered)) ~ ., data=train[,-c(8,10,13)], ntree=1000)
yhatRFTot1= predict(fit.rf, newdata=test)
fitrf1<-fit.rf$predicted
fit.rf=randomForest(as.factor(log1p(train$casual)) ~ ., data=train[,-c(9,10,13)], ntree=1000)
yhatRFTot2= predict(fit.rf, newdata=test)
fitrf2<-fit.rf$predicted
yhatRFTot<-expm1(as.numeric.factor(yhatRFTot1))+expm1(as.numeric.factor(yhatRFTot2))
mserf<-mean((train$count-(expm1(as.numeric.factor(fitrf1))+expm1(as.numeric.factor(fitrf2))))^2)
write.table(file="bikeshaingrfrf.txt", yhatRFTot, row.names = FALSE, col.names = FALSE)
head(yhatRFTot)
[1] 5 4 2 1 1 3