NAME : Giacomo Saccaggi
BADGE: 833063
NICKNAME: g.saccaggi
TEAM: Random_Forest_512
ROUND: 1st
Packages we used: caret tree randomForest
The Ames Housing dataset was compiled by Dean De Cock for use in data science.
With 81 predictors describing (almost) every aspect of residential homes in Ames, Iowa, this competition challenges >you to predict the final price of each home.
See DeCook “Ames Housing dataset” description in Data Sets. See Kaggle competition “House Prices: Advanced Regression Techniques”
Submissions are evaluated on Root-Mean-Squared-Error (RMSE) between the logarithm of the predicted value and the >logarithm of the observed sales price. (Taking logs means that errors in predicting expensive houses and cheap >houses will affect the result equally.)
RMSE = sqrt( mean( ( log(y) - log(yhat) )^2 ) )
During the competition, the leaderboard displays your partial score, which is the RMSE for 735 (random) houses of >the test set. At the end of the contest, the leaderboard will display the final score, which is the RMSE for the remaining 735 >houses of the test set. The final score will determine the final winner. This method prevents users from >overfitting to the leaderboard.
# get the required R packages
library(tree)
library(caret)
library(randomForest)
library(ranger)
library(Boruta)
set.seed(512)
train <- read.csv("http://bee-fore.s3-eu-west-1.amazonaws.com/datasets/60.csv", stringsAsFactors=T)
test <- read.csv("http://bee-fore.s3-eu-west-1.amazonaws.com/datasets/61.csv", stringsAsFactors=F)
train<-train[,-c(1,2)]
test<-test[,-c(1,2)]
ordinalinormali<-c(7,9,11,27,28,40,42,53,55,65)
ordinaliconNAN<-c(30,31,32,33,35,57,60,63,64,72,73)
qualordinali<-function(i,ordine){
varb=levels(train[ ,i])
ordinal<-rep(0,length(train[ ,i]))
ordinaltest<-rep(0,length(test[ ,i]))
varb<-varb[ordine]
for( t in 1:length(varb)){
d<-which(as.character(train[ ,i])==as.character(varb[t]))
ordinal[d]<-t
d<-which(as.character(test[ ,i])==varb[t])
ordinaltest[d]<-t
}
return(list(ordinal,ordinaltest))
}
# Second type (with NAN means the lack of a characteristic):
qualordinaliconNAN<-function(i,ordine){
varb=levels(train[ ,i])
varb<-varb[ordine]
listaa<-list()
listab<-list()
listac<-numeric()
esp<-1:1000/100
for (h in 1:length(esp)) {
r<-esp[h]
ordinal<-rep(0,length(train[ ,i]))
ordinaltest<-rep(0,length(test[ ,i]))
d<-which(is.na(train[ ,i])==T)
ordinal[d]<-0
d<-which(is.na(test[ ,i])==T)
ordinaltest[d]<-0
for( t in 1:length(varb)){
d<-which(as.character(train[ ,i])==varb[t])
ordinal[d]<-(t+1)^r
d<-which(as.character(test[ ,i])==varb[t])
ordinaltest[d]<-(t+1)^r
}
listaa[[h]]<-ordinal
listab[[h]]<-ordinaltest
listac[h]<-cor(ordinal,train$SalePrice)
}
d<- 0
d<-which.max(abs(listac))
return(list(listaa[[d]],listab[[d]],esp[d]))
}
### First type
#lot shape
k=ordinalinormali[1]
#colnames(train)[k];levels(train[ ,k])
x<-numeric()
x<-qualordinali(k,c(4,1,2,3))
train[ ,k]<-x[[1]]
test[ ,k]<-x[[2]]
#Utilities -> ha un solo livello è una variabile inutile
k=ordinalinormali[2]
#colnames(train)[k];levels(train[ ,k])
#Land.Slope
k=ordinalinormali[3]
#colnames(train)[k];levels(train[ ,k])
x<-numeric()
x<-qualordinali(k,c(1,2,3))
train[ ,k]<-x[[1]]
test[ ,k]<-x[[2]]
#Exter.Qual
k=ordinalinormali[4]
#colnames(train)[k];levels(train[ ,k])
x<-numeric()
x<-qualordinali(k,c(1,3,4,2))
train[ ,k]<-x[[1]]
test[ ,k]<-x[[2]]
#Exter.Cond
k=ordinalinormali[5]
#colnames(train)[k];levels(train[ ,k])
x<-numeric()
x<-qualordinali(k,c(1,3,5,2,4))
train[ ,k]<-x[[1]]
test[ ,k]<-x[[2]]
#Heating.QC
k=ordinalinormali[6]
#colnames(train)[k];levels(train[ ,k])
x<-numeric()
x<-qualordinali(k,c(1,3,5,2,4))
train[ ,k]<-x[[1]]
test[ ,k]<-x[[2]]
#Electrical
k=ordinalinormali[7]
#colnames(train)[k];levels(train[ ,k])
x<-numeric()
x<-qualordinali(k,c(5,2,3,4)) # elimino livello che corrisponde al niente
train[ ,k]<-x[[1]]
test[ ,k]<-x[[2]]
#Kitchen.Qual
k=ordinalinormali[8]
#colnames(train)[k];levels(train[ ,k])
x<-numeric()
x<-qualordinali(k,c(1,3,4,2))
train[ ,k]<-x[[1]]
test[ ,k]<-x[[2]]
#Functional
k=ordinalinormali[9]
#colnames(train)[k];levels(train[ ,k])
x<-numeric()
x<-qualordinali(k,c(8,3,4,5,1,2,7,6))
train[ ,k]<-x[[1]]
test[ ,k]<-x[[2]]
#Paved.Drive
k=ordinalinormali[10]
#colnames(train)[k];levels(train[ ,k])
x<-numeric()
x<-qualordinali(k,c(3,2,1))
train[ ,k]<-x[[1]]
test[ ,k]<-x[[2]]
### Second type
#Bsmt.Qual
k=ordinaliconNAN[1]
#k;colnames(train)[k];levels(train[ ,k])
x<-numeric()
x<-qualordinaliconNAN(k,c(4,2,5,3,1))
train[ ,k]<-x[[1]]
test[ ,k]<-x[[2]]
#Bsmt.Cond
k=ordinaliconNAN[2]
#k;colnames(train)[k];levels(train[ ,k])
x<-numeric()
x<-qualordinaliconNAN(k,c(4,2,5,3,1))
train[ ,k]<-x[[1]]
test[ ,k]<-x[[2]]
#Bsmt.Exposure
k=ordinaliconNAN[3]
#k;colnames(train)[k];levels(train[ ,k])
x<-numeric()
x<-qualordinaliconNAN(k,c(5,4,2,3)) #due valori senza senso il 28 917
train[ ,k]<-x[[1]]
test[ ,k]<-x[[2]]
#BsmtFin.Type.1
k=ordinaliconNAN[4]
#k;colnames(train)[k];levels(train[ ,k])
x<-numeric()
x<-qualordinaliconNAN(k,c(6,4,5,2,1,3))
train[ ,k]<-x[[1]]
test[ ,k]<-x[[2]]
#BsmtFin.Type.2
k=ordinaliconNAN[5]
#k;colnames(train)[k];levels(train[ ,k])
x<-numeric()
x<-qualordinaliconNAN(k,c(7,5,6,3,2,4)) #ci sono dei valori strani
train[ ,k]<-x[[1]]
test[ ,k]<-x[[2]]
#Fireplace.Qu
k=ordinaliconNAN[6]
#k;colnames(train)[k];levels(train[ ,k])
x<-numeric()
x<-qualordinaliconNAN(k,c(4,2,5,3,1))
train[ ,k]<-x[[1]]
test[ ,k]<-x[[2]]
#Garage.Finish
k=ordinaliconNAN[7]
#k;colnames(train)[k];levels(train[ ,k])
x<-numeric()
x<-qualordinaliconNAN(k,c(3,2,1))
train[ ,k]<-x[[1]]
test[ ,k]<-x[[2]]
#Garage.Qual
k=ordinaliconNAN[8]
#k;colnames(train)[k];levels(train[ ,k])
x<-numeric()
x<-qualordinaliconNAN(k,c(3,1,4,2))
train[ ,k]<-x[[1]]
test[ ,k]<-x[[2]]
#Garage.Cond
k=ordinaliconNAN[9]
#k;colnames(train)[k];levels(train[ ,k])
x<-numeric()
x<-qualordinaliconNAN(k,c(4,2,5,3,1))
train[ ,k]<-x[[1]]
test[ ,k]<-x[[2]]
#Pool.QC
k=ordinaliconNAN[10]
#k;colnames(train)[k];levels(train[ ,k])
x<-numeric()
x<-qualordinaliconNAN(k,c(3,2,1))
train[ ,k]<-x[[1]]
test[ ,k]<-x[[2]]
#Fence
k=ordinaliconNAN[11]
#k;colnames(train)[k];levels(train[ ,k])
x<-numeric()
x<-qualordinaliconNAN(k,c(4,2,3,1))
train[ ,k]<-x[[1]]
test[ ,k]<-x[[2]]
############
#dummy
no<-numeric()
trainnum<-numeric()
for(i in 1:length(train)){
ifelse(is.numeric(train[,i])==TRUE & is.factor(train[,i])==FALSE, trainnum<-cbind(trainnum, train[,i]),no[length(no)+1]<-i)
}
trainqualitative<-train[,no]
testqualitative<-test[,no]
train<-train[,-no]
test<-test[,-no]
dummyqualitative<-function(i){
varb=levels(trainqualitative[ ,i])
dummy<-numeric()
dummytest<-numeric()
for( t in 1:length(varb)){
d<-0
d<-which(as.character(trainqualitative[ ,i])==as.character(varb[t]))
vett<-rep(0,length(trainqualitative[ ,i]))
vett[d]<-1
dummy<-cbind(dummy,vett)
d<-0
d<-which(as.character(testqualitative[ ,i])==as.character(varb[t]))
vetttest<-rep(0,length(testqualitative[ ,i]))
vetttest[d]<-1
dummytest<-cbind(dummytest,vetttest)
}
colnames(dummy)<-paste0("Dummy.",colnames(trainqualitative)[i],".",varb)
colnames(dummytest)<-paste0("Dummy.",colnames(trainqualitative)[i],".",varb)
return(list(dummy,dummytest))
}
for (i in 1:length(trainqualitative)) {
x<-0
x<-dummyqualitative(i)
train<-cbind(train,x[[1]])
test<-cbind(test,x[[2]])
}
# Outliers
mod <- lm(SalePrice ~ ., data=train)
cooksd <- cooks.distance(mod)
righeout <- as.numeric(names(cooksd)[(cooksd > 7*mean(cooksd, na.rm=T))])
righeout<-righeout[-which(is.na(righeout))]
train<-train[-as.integer(righeout), ]
# Missing data
data<-train[,-57]
data<-data.frame(data)
train<-data.frame(train)
test<-data.frame(test)
for(i in 1:length(data)){
d<-numeric()
d<-which(is.na(data[,i])==T)
ifelse(length(d)!=0,data<-data[-d,],"")
}
# Garage.Yr.Blt
abero<-yhat<-d<-numeric()
albero <- tree(data$Garage.Yr.Blt~. , data=data)
yhat<-predict(albero, newdata=train)
d<-which(is.na(train$Garage.Yr.Blt)==T)
train$Garage.Yr.Blt[d]<-yhat[d]
yhat<-d<-numeric()
yhat<-predict(albero, newdata=test)
d<-which(is.na(test$Garage.Yr.Blt)==T)
test$Garage.Yr.Blt[d]<-yhat[d]
#Bsmt.Full.Bath
abero<-yhat<-d<-numeric()
albero <- tree(data$Bsmt.Full.Bath~. , data=data, split="deviance")
yhat<-predict(albero, newdata=train)
d<-which(is.na(train$Bsmt.Full.Bath)==T)
train$Bsmt.Full.Bath[d]<-yhat[d]
yhat<-d<-numeric()
yhat<-predict(albero, newdata=test)
d<-which(is.na(test$Bsmt.Full.Bath)==T)
test$Bsmt.Full.Bath[d]<-yhat[d]
#Bsmt.Half.Bath
abero<-yhat<-d<-numeric()
albero <- tree(data$Bsmt.Half.Bath~. , data=data, split="deviance")
yhat<-predict(albero, newdata=train)
d<-which(is.na(train$Bsmt.Half.Bath)==T)
train$Bsmt.Half.Bath[d]<-yhat[d]
yhat<-d<-numeric()
yhat<-predict(albero, newdata=test)
d<-which(is.na(test$Bsmt.Half.Bath)==T)
test$Bsmt.Half.Bath[d]<-yhat[d]
#BsmtFin.SF.2
abero<-yhat<-d<-numeric()
albero <- tree(data$BsmtFin.SF.2~. , data=data, split="deviance")
yhat<-predict(albero, newdata=train)
d<-which(is.na(train$BsmtFin.SF.2)==T)
train$BsmtFin.SF.2[d]<-yhat[d]
yhat<-d<-numeric()
yhat<-predict(albero, newdata=test)
d<-which(is.na(test$BsmtFin.SF.2)==T)
test$BsmtFin.SF.2[d]<-yhat[d]
#Bsmt.Unf.SF
abero<-yhat<-d<-numeric()
albero <- tree(data$Bsmt.Unf.SF~. , data=data, split="deviance")
yhat<-predict(albero, newdata=train)
d<-which(is.na(train$Bsmt.Unf.SF)==T)
train$Bsmt.Unf.SF[d]<-yhat[d]
yhat<-d<-numeric()
yhat<-predict(albero, newdata=test)
d<-which(is.na(test$Bsmt.Unf.SF)==T)
test$Bsmt.Unf.SF[d]<-yhat[d]
#Total.Bsmt.SF
abero<-yhat<-d<-numeric()
albero <- tree(data$Total.Bsmt.SF~. , data=data, split="deviance")
yhat<-predict(albero, newdata=train)
d<-which(is.na(train$Total.Bsmt.SF)==T)
train$Total.Bsmt.SF[d]<-yhat[d]
yhat<-d<-numeric()
yhat<-predict(albero, newdata=test)
d<-which(is.na(test$Total.Bsmt.SF)==T)
test$Total.Bsmt.SF[d]<-yhat[d]
#BsmtFin.SF.1
abero<-yhat<-d<-numeric()
albero <- tree(data$BsmtFin.SF.1~. , data=data, split="deviance")
yhat<-predict(albero, newdata=train)
d<-which(is.na(train$BsmtFin.SF.1)==T)
train$BsmtFin.SF.1[d]<-yhat[d]
yhat<-d<-numeric()
yhat<-predict(albero, newdata=test)
d<-which(is.na(test$BsmtFin.SF.1)==T)
test$BsmtFin.SF.1[d]<-yhat[d]
#Mas.Vnr.Area
abero<-yhat<-d<-numeric()
albero <- tree(data$Mas.Vnr.Area~. , data=data, split="deviance")
yhat<-predict(albero, newdata=train)
d<-which(is.na(train$Mas.Vnr.Area)==T)
train$Mas.Vnr.Area[d]<-yhat[d]
yhat<-d<-numeric()
yhat<-predict(albero, newdata=test)
d<-which(is.na(test$Mas.Vnr.Area)==T)
test$Mas.Vnr.Area[d]<-yhat[d]
#Garage.Cars
abero<-yhat<-d<-numeric()
albero <- tree(data$Garage.Cars~. , data=data, split="deviance")
yhat<-predict(albero, newdata=train)
d<-which(is.na(train$Garage.Cars)==T)
train$Garage.Cars[d]<-yhat[d]
yhat<-d<-numeric()
yhat<-predict(albero, newdata=test)
d<-which(is.na(test$Garage.Cars)==T)
test$Garage.Cars[d]<-yhat[d]
#Garage.Area
abero<-yhat<-d<-numeric()
albero <- tree(data$Garage.Area~. , data=data, split="deviance")
yhat<-predict(albero, newdata=train)
d<-which(is.na(train$Garage.Area)==T)
train$Garage.Area[d]<-yhat[d]
yhat<-d<-numeric()
yhat<-predict(albero, newdata=test)
d<-which(is.na(test$Garage.Area)==T)
test$Garage.Area[d]<-yhat[d]
#Lot.Frontage
abero<-yhat<-d<-numeric()
albero <- tree(data$Lot.Frontage~. , data=data, split="deviance")
yhat<-predict(albero, newdata=train)
d<-which(is.na(train$Lot.Frontage)==T)
train$Lot.Frontage[d]<-yhat[d]
yhat<-d<-numeric()
yhat<-predict(albero, newdata=test)
d<-which(is.na(test$Lot.Frontage)==T)
test$Lot.Frontage[d]<-yhat[d]
# PCA
#1
# cucina
KITCH<-cbind(train$Kitchen.AbvGr,train$Kitchen.Qual,train$SalePrice)
colnames(KITCH)<-c("Kitchen.AbvGr","Kitchen.Qual","sales.price")
kitch<-prcomp(KITCH[,-3])
Kitchen.PCA<-0
Kitchen.PCA<-train$Kitchen.AbvGr*kitch$rotation[1]+train$Kitchen.Qual*kitch$rotation[2]
d<-c(which(colnames(train)=="Kitchen.AbvGr"),which(colnames(train)=="Kitchen.Qual"))
train<-cbind(train[,-d],Kitchen.PCA)
Kitchen.PCA<-0
Kitchen.PCA<-test$Kitchen.AbvGr*kitch$rotation[1]+test$Kitchen.Qual*kitch$rotation[2]
d<-c(which(colnames(test)=="Kitchen.AbvGr"),which(colnames(test)=="Kitchen.Qual"))
test<-cbind(test[,-d],Kitchen.PCA)
#2
# fireplaces
FIRE<-cbind((train$Fireplace.Qu),(train$Fireplaces),train$SalePrice)
colnames(FIRE)<-c("Fireplace.Qu","Fireplaces","salesprice")
firepca<-prcomp(FIRE[,-length(colnames(FIRE))])
Fire.PCA<-0
Fire.PCA<-firepca$rotation[1]*train$Fireplace.Qu+firepca$rotation[2]*train$Fireplaces
d<-c(which(colnames(train)=="Fireplace.Qu"),which(colnames(train)=="Fireplaces"))
train<-cbind(train[,-d],Fire.PCA)
Fire.PCA<-0
Fire.PCA<-firepca$rotation[1]*test$Fireplace.Qu+firepca$rotation[2]*test$Fireplaces
d<-c(which(colnames(test)=="Fireplace.Qu"),which(colnames(test)=="Fireplaces"))
test<-cbind(test[,-d],Fire.PCA)
# Models
y<-train$SalePrice
train<-train[,-36]
train<-cbind(train,y)
colnames(train)[length(train)]<-"SalePrice"
fit.RF<-randomForest(train$SalePrice~.,data=train)
yhat.rf<-predict(fit.RF,newdata=test,type="response")
write.table(file="AmesHousePrice_RF.txt", yhat.rf, row.names = FALSE, col.names = FALSE)
head(yhat.rf)
1 2 3 4 5 6
183155.7 231716.1 187002.1 206298.2 179257.8 181690.0