NAME : Giacomo Saccaggi

BADGE: 833063

NICKNAME: g.saccaggi

TEAM: Random_Forest_512

ROUND: 1st

All my projects Introduction Information Script in R Results

Packages we used:
caret
tree
randomForest


Competition 1: Ames house prices

Description

The Ames Housing dataset was compiled by Dean De Cock for use in data science.

With 81 predictors describing (almost) every aspect of residential homes in Ames, Iowa, this competition challenges >you to predict the final price of each home.

See DeCook “Ames Housing dataset” description in Data Sets. See Kaggle competition “House Prices: Advanced Regression Techniques”

Valutation

Submissions are evaluated on Root-Mean-Squared-Error (RMSE) between the logarithm of the predicted value and the >logarithm of the observed sales price. (Taking logs means that errors in predicting expensive houses and cheap >houses will affect the result equally.)

RMSE = sqrt( mean( ( log(y) - log(yhat) )^2 ) )

During the competition, the leaderboard displays your partial score, which is the RMSE for 735 (random) houses of >the test set. At the end of the contest, the leaderboard will display the final score, which is the RMSE for the remaining 735 >houses of the test set. The final score will determine the final winner. This method prevents users from >overfitting to the leaderboard.


My strategies was:

    1. Transforming ordinal variables in a numerical measurement that is able to trap more information.
    2. Transforming qualitative predictors into dummy variables.
    3. Elimination outliers through the analysis of the cook's distance
    4. Imputation missing values with prediction trees.
    5. Creation new variables with principal component analysis (PCA)
    6. Random Forest

References:

  • The part of detection outliers:

Inspired by

Models

  • Random Forest

Non-standard R packages

  • caret
  • tree
  • randomForest


R code to reproduce the last submission:

# get the required R packages
library(tree)
library(caret)
library(randomForest)
library(ranger)
library(Boruta)

set.seed(512)
train <- read.csv("http://bee-fore.s3-eu-west-1.amazonaws.com/datasets/60.csv", stringsAsFactors=T)
test <- read.csv("http://bee-fore.s3-eu-west-1.amazonaws.com/datasets/61.csv", stringsAsFactors=F)

train<-train[,-c(1,2)]
test<-test[,-c(1,2)]
ordinalinormali<-c(7,9,11,27,28,40,42,53,55,65)

ordinaliconNAN<-c(30,31,32,33,35,57,60,63,64,72,73)
qualordinali<-function(i,ordine){
        varb=levels(train[ ,i])
        ordinal<-rep(0,length(train[ ,i])) 
        ordinaltest<-rep(0,length(test[ ,i])) 
        varb<-varb[ordine]
        for( t in 1:length(varb)){
          d<-which(as.character(train[ ,i])==as.character(varb[t]))
          ordinal[d]<-t
          d<-which(as.character(test[ ,i])==varb[t])
          ordinaltest[d]<-t
        }
        return(list(ordinal,ordinaltest))
}
# Second type (with NAN means the lack of a characteristic):
      qualordinaliconNAN<-function(i,ordine){
        varb=levels(train[ ,i])
        varb<-varb[ordine]
        listaa<-list()
        listab<-list()
        listac<-numeric()
        esp<-1:1000/100
        for (h in 1:length(esp)) {
         r<-esp[h]
         ordinal<-rep(0,length(train[ ,i])) 
         ordinaltest<-rep(0,length(test[ ,i])) 
         d<-which(is.na(train[ ,i])==T)
         ordinal[d]<-0
         d<-which(is.na(test[ ,i])==T)
         ordinaltest[d]<-0
        for( t in 1:length(varb)){
            d<-which(as.character(train[ ,i])==varb[t])
            ordinal[d]<-(t+1)^r
            d<-which(as.character(test[ ,i])==varb[t])
            ordinaltest[d]<-(t+1)^r       
          }
          listaa[[h]]<-ordinal
          listab[[h]]<-ordinaltest
          listac[h]<-cor(ordinal,train$SalePrice)
        }
        
        d<- 0
        d<-which.max(abs(listac))
        
        return(list(listaa[[d]],listab[[d]],esp[d]))
      }
### First type  
#lot shape

    k=ordinalinormali[1]
    
    #colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinali(k,c(4,1,2,3))
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]]


#Utilities -> ha un solo livello è una variabile inutile
    
    k=ordinalinormali[2]
    
    #colnames(train)[k];levels(train[ ,k])
    
#Land.Slope
    
    k=ordinalinormali[3]
    
    #colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinali(k,c(1,2,3))
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]]
    
#Exter.Qual
    
    k=ordinalinormali[4]
    
    #colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinali(k,c(1,3,4,2))
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]]
    
    
#Exter.Cond
    
    k=ordinalinormali[5]
    
    #colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinali(k,c(1,3,5,2,4))
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]]

#Heating.QC
    
    k=ordinalinormali[6]
    
    #colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinali(k,c(1,3,5,2,4))
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]]

#Electrical
    
    k=ordinalinormali[7]
    
    #colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinali(k,c(5,2,3,4)) # elimino livello che corrisponde al niente
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]]
    
    
#Kitchen.Qual
    
    k=ordinalinormali[8]
    
    #colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinali(k,c(1,3,4,2)) 
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]]

#Functional
    
    k=ordinalinormali[9]
    
    #colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinali(k,c(8,3,4,5,1,2,7,6))
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]]
    
    
    
#Paved.Drive
    
    k=ordinalinormali[10]
    
    #colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinali(k,c(3,2,1))
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]]
    
    
### Second type 
#Bsmt.Qual
    
    k=ordinaliconNAN[1]
    
    #k;colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinaliconNAN(k,c(4,2,5,3,1))
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]]   
    
    
#Bsmt.Cond
    
    k=ordinaliconNAN[2]
    
    #k;colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinaliconNAN(k,c(4,2,5,3,1))
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]]
    
    
#Bsmt.Exposure
    
    k=ordinaliconNAN[3]
    
    #k;colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinaliconNAN(k,c(5,4,2,3)) #due valori senza senso il 28 917
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]]
    
    
#BsmtFin.Type.1
    
    k=ordinaliconNAN[4]
    
    #k;colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinaliconNAN(k,c(6,4,5,2,1,3))
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]]  
    
#BsmtFin.Type.2
    
    k=ordinaliconNAN[5]
    
    #k;colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinaliconNAN(k,c(7,5,6,3,2,4)) #ci sono dei valori strani
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]]  
    
#Fireplace.Qu
    
    k=ordinaliconNAN[6]
    
    #k;colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinaliconNAN(k,c(4,2,5,3,1))
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]] 
    
    
#Garage.Finish
    
    k=ordinaliconNAN[7]
    
    #k;colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinaliconNAN(k,c(3,2,1))
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]] 
    
#Garage.Qual
    
    k=ordinaliconNAN[8]
    
    #k;colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinaliconNAN(k,c(3,1,4,2))
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]] 
    
    
#Garage.Cond
    
    k=ordinaliconNAN[9]
    
    #k;colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinaliconNAN(k,c(4,2,5,3,1))
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]] 
    
    
#Pool.QC
    
    k=ordinaliconNAN[10]
    
    #k;colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinaliconNAN(k,c(3,2,1))
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]] 
    
#Fence
    
    k=ordinaliconNAN[11]
    
    #k;colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinaliconNAN(k,c(4,2,3,1))
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]] 
############
    #dummy
no<-numeric()
trainnum<-numeric()
for(i in 1:length(train)){
  ifelse(is.numeric(train[,i])==TRUE & is.factor(train[,i])==FALSE, trainnum<-cbind(trainnum, train[,i]),no[length(no)+1]<-i)
}
trainqualitative<-train[,no]
testqualitative<-test[,no]
train<-train[,-no]
test<-test[,-no]
dummyqualitative<-function(i){
  
  varb=levels(trainqualitative[ ,i])
  
    dummy<-numeric()
    dummytest<-numeric()
    for( t in 1:length(varb)){
      d<-0
      d<-which(as.character(trainqualitative[ ,i])==as.character(varb[t]))
      vett<-rep(0,length(trainqualitative[ ,i]))
      vett[d]<-1
      dummy<-cbind(dummy,vett)
      d<-0
      d<-which(as.character(testqualitative[ ,i])==as.character(varb[t]))
      vetttest<-rep(0,length(testqualitative[ ,i]))
      vetttest[d]<-1
      dummytest<-cbind(dummytest,vetttest)
    }
    colnames(dummy)<-paste0("Dummy.",colnames(trainqualitative)[i],".",varb)
    colnames(dummytest)<-paste0("Dummy.",colnames(trainqualitative)[i],".",varb)
    return(list(dummy,dummytest))
}
    
for (i in 1:length(trainqualitative)) {
  x<-0
  x<-dummyqualitative(i)
  train<-cbind(train,x[[1]])
  test<-cbind(test,x[[2]])
}
# Outliers
mod <- lm(SalePrice ~ ., data=train)
cooksd <- cooks.distance(mod)
righeout <- as.numeric(names(cooksd)[(cooksd > 7*mean(cooksd, na.rm=T))])  
righeout<-righeout[-which(is.na(righeout))]
train<-train[-as.integer(righeout), ]

# Missing data

data<-train[,-57]
data<-data.frame(data)
train<-data.frame(train)
test<-data.frame(test)
for(i in 1:length(data)){
  d<-numeric()
  d<-which(is.na(data[,i])==T)
  ifelse(length(d)!=0,data<-data[-d,],"")
  
}
# Garage.Yr.Blt
abero<-yhat<-d<-numeric()
albero <- tree(data$Garage.Yr.Blt~. , data=data)
yhat<-predict(albero, newdata=train)
d<-which(is.na(train$Garage.Yr.Blt)==T)
train$Garage.Yr.Blt[d]<-yhat[d]
yhat<-d<-numeric()
yhat<-predict(albero, newdata=test)
d<-which(is.na(test$Garage.Yr.Blt)==T)
test$Garage.Yr.Blt[d]<-yhat[d]

#Bsmt.Full.Bath
abero<-yhat<-d<-numeric()
albero <- tree(data$Bsmt.Full.Bath~. , data=data, split="deviance")
yhat<-predict(albero, newdata=train)
d<-which(is.na(train$Bsmt.Full.Bath)==T)
train$Bsmt.Full.Bath[d]<-yhat[d]
yhat<-d<-numeric()
yhat<-predict(albero, newdata=test)
d<-which(is.na(test$Bsmt.Full.Bath)==T)
test$Bsmt.Full.Bath[d]<-yhat[d]


#Bsmt.Half.Bath
abero<-yhat<-d<-numeric()
albero <- tree(data$Bsmt.Half.Bath~. , data=data, split="deviance")
yhat<-predict(albero, newdata=train)
d<-which(is.na(train$Bsmt.Half.Bath)==T)
train$Bsmt.Half.Bath[d]<-yhat[d]
yhat<-d<-numeric()
yhat<-predict(albero, newdata=test)
d<-which(is.na(test$Bsmt.Half.Bath)==T)
test$Bsmt.Half.Bath[d]<-yhat[d]

#BsmtFin.SF.2
abero<-yhat<-d<-numeric()
albero <- tree(data$BsmtFin.SF.2~. , data=data, split="deviance")
yhat<-predict(albero, newdata=train)
d<-which(is.na(train$BsmtFin.SF.2)==T)
train$BsmtFin.SF.2[d]<-yhat[d]
yhat<-d<-numeric()
yhat<-predict(albero, newdata=test)
d<-which(is.na(test$BsmtFin.SF.2)==T)
test$BsmtFin.SF.2[d]<-yhat[d]

#Bsmt.Unf.SF
abero<-yhat<-d<-numeric()
albero <- tree(data$Bsmt.Unf.SF~. , data=data, split="deviance")
yhat<-predict(albero, newdata=train)
d<-which(is.na(train$Bsmt.Unf.SF)==T)
train$Bsmt.Unf.SF[d]<-yhat[d]
yhat<-d<-numeric()
yhat<-predict(albero, newdata=test)
d<-which(is.na(test$Bsmt.Unf.SF)==T)
test$Bsmt.Unf.SF[d]<-yhat[d]

#Total.Bsmt.SF
abero<-yhat<-d<-numeric()
albero <- tree(data$Total.Bsmt.SF~. , data=data, split="deviance")
yhat<-predict(albero, newdata=train)
d<-which(is.na(train$Total.Bsmt.SF)==T)
train$Total.Bsmt.SF[d]<-yhat[d]
yhat<-d<-numeric()
yhat<-predict(albero, newdata=test)
d<-which(is.na(test$Total.Bsmt.SF)==T)
test$Total.Bsmt.SF[d]<-yhat[d]

#BsmtFin.SF.1
abero<-yhat<-d<-numeric()
albero <- tree(data$BsmtFin.SF.1~. , data=data, split="deviance")
yhat<-predict(albero, newdata=train)
d<-which(is.na(train$BsmtFin.SF.1)==T)
train$BsmtFin.SF.1[d]<-yhat[d]
yhat<-d<-numeric()
yhat<-predict(albero, newdata=test)
d<-which(is.na(test$BsmtFin.SF.1)==T)
test$BsmtFin.SF.1[d]<-yhat[d]



#Mas.Vnr.Area
abero<-yhat<-d<-numeric()
albero <- tree(data$Mas.Vnr.Area~. , data=data, split="deviance")
yhat<-predict(albero, newdata=train)
d<-which(is.na(train$Mas.Vnr.Area)==T)
train$Mas.Vnr.Area[d]<-yhat[d]
yhat<-d<-numeric()
yhat<-predict(albero, newdata=test)
d<-which(is.na(test$Mas.Vnr.Area)==T)
test$Mas.Vnr.Area[d]<-yhat[d]


#Garage.Cars
abero<-yhat<-d<-numeric()
albero <- tree(data$Garage.Cars~. , data=data, split="deviance")
yhat<-predict(albero, newdata=train)
d<-which(is.na(train$Garage.Cars)==T)
train$Garage.Cars[d]<-yhat[d]
yhat<-d<-numeric()
yhat<-predict(albero, newdata=test)
d<-which(is.na(test$Garage.Cars)==T)
test$Garage.Cars[d]<-yhat[d]

#Garage.Area
abero<-yhat<-d<-numeric()
albero <- tree(data$Garage.Area~. , data=data, split="deviance")
yhat<-predict(albero, newdata=train)
d<-which(is.na(train$Garage.Area)==T)
train$Garage.Area[d]<-yhat[d]
yhat<-d<-numeric()
yhat<-predict(albero, newdata=test)
d<-which(is.na(test$Garage.Area)==T)
test$Garage.Area[d]<-yhat[d]

#Lot.Frontage
abero<-yhat<-d<-numeric()
albero <- tree(data$Lot.Frontage~. , data=data, split="deviance")
yhat<-predict(albero, newdata=train)
d<-which(is.na(train$Lot.Frontage)==T)
train$Lot.Frontage[d]<-yhat[d]
yhat<-d<-numeric()
yhat<-predict(albero, newdata=test)
d<-which(is.na(test$Lot.Frontage)==T)
test$Lot.Frontage[d]<-yhat[d]



# PCA
#1
# cucina
KITCH<-cbind(train$Kitchen.AbvGr,train$Kitchen.Qual,train$SalePrice)
colnames(KITCH)<-c("Kitchen.AbvGr","Kitchen.Qual","sales.price")
kitch<-prcomp(KITCH[,-3])
Kitchen.PCA<-0
Kitchen.PCA<-train$Kitchen.AbvGr*kitch$rotation[1]+train$Kitchen.Qual*kitch$rotation[2]
d<-c(which(colnames(train)=="Kitchen.AbvGr"),which(colnames(train)=="Kitchen.Qual"))
train<-cbind(train[,-d],Kitchen.PCA)
Kitchen.PCA<-0
Kitchen.PCA<-test$Kitchen.AbvGr*kitch$rotation[1]+test$Kitchen.Qual*kitch$rotation[2]
d<-c(which(colnames(test)=="Kitchen.AbvGr"),which(colnames(test)=="Kitchen.Qual"))
test<-cbind(test[,-d],Kitchen.PCA)
#2
# fireplaces
FIRE<-cbind((train$Fireplace.Qu),(train$Fireplaces),train$SalePrice)
colnames(FIRE)<-c("Fireplace.Qu","Fireplaces","salesprice")
firepca<-prcomp(FIRE[,-length(colnames(FIRE))])
Fire.PCA<-0
Fire.PCA<-firepca$rotation[1]*train$Fireplace.Qu+firepca$rotation[2]*train$Fireplaces
d<-c(which(colnames(train)=="Fireplace.Qu"),which(colnames(train)=="Fireplaces"))
train<-cbind(train[,-d],Fire.PCA)
Fire.PCA<-0
Fire.PCA<-firepca$rotation[1]*test$Fireplace.Qu+firepca$rotation[2]*test$Fireplaces
d<-c(which(colnames(test)=="Fireplace.Qu"),which(colnames(test)=="Fireplaces"))
test<-cbind(test[,-d],Fire.PCA)

# Models
y<-train$SalePrice
train<-train[,-36]
train<-cbind(train,y)
colnames(train)[length(train)]<-"SalePrice"


fit.RF<-randomForest(train$SalePrice~.,data=train)
yhat.rf<-predict(fit.RF,newdata=test,type="response")
write.table(file="AmesHousePrice_RF.txt", yhat.rf, row.names = FALSE, col.names = FALSE)

    head(yhat.rf)
       1        2        3        4        5        6 
183155.7 231716.1 187002.1 206298.2 179257.8 181690.0