Competition 1: Ames house prices

Description

The Ames Housing dataset was compiled by Dean De Cock for use in data science.

With 81 predictors describing (almost) every aspect of residential homes in Ames, Iowa, this competition challenges >you to predict the final price of each home.

See DeCook “Ames Housing dataset” description in Data Sets. See Kaggle competition “House Prices: Advanced Regression Techniques”

Valutation

Submissions are evaluated on Root-Mean-Squared-Error (RMSE) between the logarithm of the predicted value and the >logarithm of the observed sales price. (Taking logs means that errors in predicting expensive houses and cheap >houses will affect the result equally.)

RMSE = sqrt( mean( ( log(y) - log(yhat) )^2 ) )

During the competition, the leaderboard displays your partial score, which is the RMSE for 735 (random) houses of >the test set. At the end of the contest, the leaderboard will display the final score, which is the RMSE for the remaining 735 >houses of the test set. The final score will determine the final winner. This method prevents users from >overfitting to the leaderboard.

My strategies was:

References:

The part of detection outliers:

Inspired by

Models

Random Forest

Non-standard R packages

caret
tree
randomForest

R code to reproduce the last submission:

# get the required R packages
library(tree)
library(caret)
library(randomForest)
library(ranger)
library(Boruta)

set.seed(512)
train <- read.csv("http://bee-fore.s3-eu-west-1.amazonaws.com/datasets/60.csv", stringsAsFactors=T)
test <- read.csv("http://bee-fore.s3-eu-west-1.amazonaws.com/datasets/61.csv", stringsAsFactors=F)

train<-train[,-c(1,2)]
test<-test[,-c(1,2)]
ordinalinormali<-c(7,9,11,27,28,40,42,53,55,65)

ordinaliconNAN<-c(30,31,32,33,35,57,60,63,64,72,73)
qualordinali<-function(i,ordine){
        varb=levels(train[ ,i])
        ordinal<-rep(0,length(train[ ,i])) 
        ordinaltest<-rep(0,length(test[ ,i])) 
        varb<-varb[ordine]
        for( t in 1:length(varb)){
          d<-which(as.character(train[ ,i])==as.character(varb[t]))
          ordinal[d]<-t
          d<-which(as.character(test[ ,i])==varb[t])
          ordinaltest[d]<-t
        }
        return(list(ordinal,ordinaltest))
}
# Second type (with NAN means the lack of a characteristic):
      qualordinaliconNAN<-function(i,ordine){
        varb=levels(train[ ,i])
        varb<-varb[ordine]
        listaa<-list()
        listab<-list()
        listac<-numeric()
        esp<-1:1000/100
        for (h in 1:length(esp)) {
         r<-esp[h]
         ordinal<-rep(0,length(train[ ,i])) 
         ordinaltest<-rep(0,length(test[ ,i])) 
         d<-which(is.na(train[ ,i])==T)
         ordinal[d]<-0
         d<-which(is.na(test[ ,i])==T)
         ordinaltest[d]<-0
        for( t in 1:length(varb)){
            d<-which(as.character(train[ ,i])==varb[t])
            ordinal[d]<-(t+1)^r
            d<-which(as.character(test[ ,i])==varb[t])
            ordinaltest[d]<-(t+1)^r       
          }
          listaa[[h]]<-ordinal
          listab[[h]]<-ordinaltest
          listac[h]<-cor(ordinal,train$SalePrice)
        }
        
        d<- 0
        d<-which.max(abs(listac))
        
        return(list(listaa[[d]],listab[[d]],esp[d]))
      }
### First type  
#lot shape

    k=ordinalinormali[1]
    
    #colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinali(k,c(4,1,2,3))
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]]


#Utilities -> ha un solo livello è una variabile inutile
    
    k=ordinalinormali[2]
    
    #colnames(train)[k];levels(train[ ,k])
    
#Land.Slope
    
    k=ordinalinormali[3]
    
    #colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinali(k,c(1,2,3))
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]]
    
#Exter.Qual
    
    k=ordinalinormali[4]
    
    #colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinali(k,c(1,3,4,2))
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]]
    
    
#Exter.Cond
    
    k=ordinalinormali[5]
    
    #colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinali(k,c(1,3,5,2,4))
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]]

#Heating.QC
    
    k=ordinalinormali[6]
    
    #colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinali(k,c(1,3,5,2,4))
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]]

#Electrical
    
    k=ordinalinormali[7]
    
    #colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinali(k,c(5,2,3,4)) # elimino livello che corrisponde al niente
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]]
    
    
#Kitchen.Qual
    
    k=ordinalinormali[8]
    
    #colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinali(k,c(1,3,4,2)) 
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]]

#Functional
    
    k=ordinalinormali[9]
    
    #colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinali(k,c(8,3,4,5,1,2,7,6))
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]]
    
    
    
#Paved.Drive
    
    k=ordinalinormali[10]
    
    #colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinali(k,c(3,2,1))
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]]
    
    
### Second type 
#Bsmt.Qual
    
    k=ordinaliconNAN[1]
    
    #k;colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinaliconNAN(k,c(4,2,5,3,1))
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]]   
    
    
#Bsmt.Cond
    
    k=ordinaliconNAN[2]
    
    #k;colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinaliconNAN(k,c(4,2,5,3,1))
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]]
    
    
#Bsmt.Exposure
    
    k=ordinaliconNAN[3]
    
    #k;colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinaliconNAN(k,c(5,4,2,3)) #due valori senza senso il 28 917
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]]
    
    
#BsmtFin.Type.1
    
    k=ordinaliconNAN[4]
    
    #k;colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinaliconNAN(k,c(6,4,5,2,1,3))
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]]  
    
#BsmtFin.Type.2
    
    k=ordinaliconNAN[5]
    
    #k;colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinaliconNAN(k,c(7,5,6,3,2,4)) #ci sono dei valori strani
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]]  
    
#Fireplace.Qu
    
    k=ordinaliconNAN[6]
    
    #k;colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinaliconNAN(k,c(4,2,5,3,1))
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]] 
    
    
#Garage.Finish
    
    k=ordinaliconNAN[7]
    
    #k;colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinaliconNAN(k,c(3,2,1))
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]] 
    
#Garage.Qual
    
    k=ordinaliconNAN[8]
    
    #k;colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinaliconNAN(k,c(3,1,4,2))
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]] 
    
    
#Garage.Cond
    
    k=ordinaliconNAN[9]
    
    #k;colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinaliconNAN(k,c(4,2,5,3,1))
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]] 
    
    
#Pool.QC
    
    k=ordinaliconNAN[10]
    
    #k;colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinaliconNAN(k,c(3,2,1))
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]] 
    
#Fence
    
    k=ordinaliconNAN[11]
    
    #k;colnames(train)[k];levels(train[ ,k])
    x<-numeric()
    x<-qualordinaliconNAN(k,c(4,2,3,1))
    
    train[ ,k]<-x[[1]]
    test[ ,k]<-x[[2]] 
############
    #dummy
no<-numeric()
trainnum<-numeric()
for(i in 1:length(train)){
  ifelse(is.numeric(train[,i])==TRUE & is.factor(train[,i])==FALSE, trainnum<-cbind(trainnum, train[,i]),no[length(no)+1]<-i)
}
trainqualitative<-train[,no]
testqualitative<-test[,no]
train<-train[,-no]
test<-test[,-no]
dummyqualitative<-function(i){
  
  varb=levels(trainqualitative[ ,i])
  
    dummy<-numeric()
    dummytest<-numeric()
    for( t in 1:length(varb)){
      d<-0
      d<-which(as.character(trainqualitative[ ,i])==as.character(varb[t]))
      vett<-rep(0,length(trainqualitative[ ,i]))
      vett[d]<-1
      dummy<-cbind(dummy,vett)
      d<-0
      d<-which(as.character(testqualitative[ ,i])==as.character(varb[t]))
      vetttest<-rep(0,length(testqualitative[ ,i]))
      vetttest[d]<-1
      dummytest<-cbind(dummytest,vetttest)
    }
    colnames(dummy)<-paste0("Dummy.",colnames(trainqualitative)[i],".",varb)
    colnames(dummytest)<-paste0("Dummy.",colnames(trainqualitative)[i],".",varb)
    return(list(dummy,dummytest))
}
    
for (i in 1:length(trainqualitative)) {
  x<-0
  x<-dummyqualitative(i)
  train<-cbind(train,x[[1]])
  test<-cbind(test,x[[2]])
}
# Outliers
mod <- lm(SalePrice ~ ., data=train)
cooksd <- cooks.distance(mod)
righeout <- as.numeric(names(cooksd)[(cooksd > 7*mean(cooksd, na.rm=T))])  
righeout<-righeout[-which(is.na(righeout))]
train<-train[-as.integer(righeout), ]

# Missing data

data<-train[,-57]
data<-data.frame(data)
train<-data.frame(train)
test<-data.frame(test)
for(i in 1:length(data)){
  d<-numeric()
  d<-which(is.na(data[,i])==T)
  ifelse(length(d)!=0,data<-data[-d,],"")
  
}
# Garage.Yr.Blt
abero<-yhat<-d<-numeric()
albero <- tree(data$Garage.Yr.Blt~. , data=data)
yhat<-predict(albero, newdata=train)
d<-which(is.na(train$Garage.Yr.Blt)==T)
train$Garage.Yr.Blt[d]<-yhat[d]
yhat<-d<-numeric()
yhat<-predict(albero, newdata=test)
d<-which(is.na(test$Garage.Yr.Blt)==T)
test$Garage.Yr.Blt[d]<-yhat[d]

#Bsmt.Full.Bath
abero<-yhat<-d<-numeric()
albero <- tree(data$Bsmt.Full.Bath~. , data=data, split="deviance")
yhat<-predict(albero, newdata=train)
d<-which(is.na(train$Bsmt.Full.Bath)==T)
train$Bsmt.Full.Bath[d]<-yhat[d]
yhat<-d<-numeric()
yhat<-predict(albero, newdata=test)
d<-which(is.na(test$Bsmt.Full.Bath)==T)
test$Bsmt.Full.Bath[d]<-yhat[d]


#Bsmt.Half.Bath
abero<-yhat<-d<-numeric()
albero <- tree(data$Bsmt.Half.Bath~. , data=data, split="deviance")
yhat<-predict(albero, newdata=train)
d<-which(is.na(train$Bsmt.Half.Bath)==T)
train$Bsmt.Half.Bath[d]<-yhat[d]
yhat<-d<-numeric()
yhat<-predict(albero, newdata=test)
d<-which(is.na(test$Bsmt.Half.Bath)==T)
test$Bsmt.Half.Bath[d]<-yhat[d]

#BsmtFin.SF.2
abero<-yhat<-d<-numeric()
albero <- tree(data$BsmtFin.SF.2~. , data=data, split="deviance")
yhat<-predict(albero, newdata=train)
d<-which(is.na(train$BsmtFin.SF.2)==T)
train$BsmtFin.SF.2[d]<-yhat[d]
yhat<-d<-numeric()
yhat<-predict(albero, newdata=test)
d<-which(is.na(test$BsmtFin.SF.2)==T)
test$BsmtFin.SF.2[d]<-yhat[d]

#Bsmt.Unf.SF
abero<-yhat<-d<-numeric()
albero <- tree(data$Bsmt.Unf.SF~. , data=data, split="deviance")
yhat<-predict(albero, newdata=train)
d<-which(is.na(train$Bsmt.Unf.SF)==T)
train$Bsmt.Unf.SF[d]<-yhat[d]
yhat<-d<-numeric()
yhat<-predict(albero, newdata=test)
d<-which(is.na(test$Bsmt.Unf.SF)==T)
test$Bsmt.Unf.SF[d]<-yhat[d]

#Total.Bsmt.SF
abero<-yhat<-d<-numeric()
albero <- tree(data$Total.Bsmt.SF~. , data=data, split="deviance")
yhat<-predict(albero, newdata=train)
d<-which(is.na(train$Total.Bsmt.SF)==T)
train$Total.Bsmt.SF[d]<-yhat[d]
yhat<-d<-numeric()
yhat<-predict(albero, newdata=test)
d<-which(is.na(test$Total.Bsmt.SF)==T)
test$Total.Bsmt.SF[d]<-yhat[d]

#BsmtFin.SF.1
abero<-yhat<-d<-numeric()
albero <- tree(data$BsmtFin.SF.1~. , data=data, split="deviance")
yhat<-predict(albero, newdata=train)
d<-which(is.na(train$BsmtFin.SF.1)==T)
train$BsmtFin.SF.1[d]<-yhat[d]
yhat<-d<-numeric()
yhat<-predict(albero, newdata=test)
d<-which(is.na(test$BsmtFin.SF.1)==T)
test$BsmtFin.SF.1[d]<-yhat[d]



#Mas.Vnr.Area
abero<-yhat<-d<-numeric()
albero <- tree(data$Mas.Vnr.Area~. , data=data, split="deviance")
yhat<-predict(albero, newdata=train)
d<-which(is.na(train$Mas.Vnr.Area)==T)
train$Mas.Vnr.Area[d]<-yhat[d]
yhat<-d<-numeric()
yhat<-predict(albero, newdata=test)
d<-which(is.na(test$Mas.Vnr.Area)==T)
test$Mas.Vnr.Area[d]<-yhat[d]


#Garage.Cars
abero<-yhat<-d<-numeric()
albero <- tree(data$Garage.Cars~. , data=data, split="deviance")
yhat<-predict(albero, newdata=train)
d<-which(is.na(train$Garage.Cars)==T)
train$Garage.Cars[d]<-yhat[d]
yhat<-d<-numeric()
yhat<-predict(albero, newdata=test)
d<-which(is.na(test$Garage.Cars)==T)
test$Garage.Cars[d]<-yhat[d]

#Garage.Area
abero<-yhat<-d<-numeric()
albero <- tree(data$Garage.Area~. , data=data, split="deviance")
yhat<-predict(albero, newdata=train)
d<-which(is.na(train$Garage.Area)==T)
train$Garage.Area[d]<-yhat[d]
yhat<-d<-numeric()
yhat<-predict(albero, newdata=test)
d<-which(is.na(test$Garage.Area)==T)
test$Garage.Area[d]<-yhat[d]

#Lot.Frontage
abero<-yhat<-d<-numeric()
albero <- tree(data$Lot.Frontage~. , data=data, split="deviance")
yhat<-predict(albero, newdata=train)
d<-which(is.na(train$Lot.Frontage)==T)
train$Lot.Frontage[d]<-yhat[d]
yhat<-d<-numeric()
yhat<-predict(albero, newdata=test)
d<-which(is.na(test$Lot.Frontage)==T)
test$Lot.Frontage[d]<-yhat[d]



# PCA
#1
# cucina
KITCH<-cbind(train$Kitchen.AbvGr,train$Kitchen.Qual,train$SalePrice)
colnames(KITCH)<-c("Kitchen.AbvGr","Kitchen.Qual","sales.price")
kitch<-prcomp(KITCH[,-3])
Kitchen.PCA<-0
Kitchen.PCA<-train$Kitchen.AbvGr*kitch$rotation[1]+train$Kitchen.Qual*kitch$rotation[2]
d<-c(which(colnames(train)=="Kitchen.AbvGr"),which(colnames(train)=="Kitchen.Qual"))
train<-cbind(train[,-d],Kitchen.PCA)
Kitchen.PCA<-0
Kitchen.PCA<-test$Kitchen.AbvGr*kitch$rotation[1]+test$Kitchen.Qual*kitch$rotation[2]
d<-c(which(colnames(test)=="Kitchen.AbvGr"),which(colnames(test)=="Kitchen.Qual"))
test<-cbind(test[,-d],Kitchen.PCA)
#2
# fireplaces
FIRE<-cbind((train$Fireplace.Qu),(train$Fireplaces),train$SalePrice)
colnames(FIRE)<-c("Fireplace.Qu","Fireplaces","salesprice")
firepca<-prcomp(FIRE[,-length(colnames(FIRE))])
Fire.PCA<-0
Fire.PCA<-firepca$rotation[1]*train$Fireplace.Qu+firepca$rotation[2]*train$Fireplaces
d<-c(which(colnames(train)=="Fireplace.Qu"),which(colnames(train)=="Fireplaces"))
train<-cbind(train[,-d],Fire.PCA)
Fire.PCA<-0
Fire.PCA<-firepca$rotation[1]*test$Fireplace.Qu+firepca$rotation[2]*test$Fireplaces
d<-c(which(colnames(test)=="Fireplace.Qu"),which(colnames(test)=="Fireplaces"))
test<-cbind(test[,-d],Fire.PCA)

# Models
y<-train$SalePrice
train<-train[,-36]
train<-cbind(train,y)
colnames(train)[length(train)]<-"SalePrice"


fit.RF<-randomForest(train$SalePrice~.,data=train)
yhat.rf<-predict(fit.RF,newdata=test,type="response")
write.table(file="AmesHousePrice_RF.txt", yhat.rf, row.names = FALSE, col.names = FALSE)

    head(yhat.rf)

       1        2        3        4        5        6 
183155.7 231716.1 187002.1 206298.2 179257.8 181690.0

Ames House Prices

Competition 1: Ames house prices

Description

Valutation

My strategies was:

References:

Models

Non-standard R packages

R code to reproduce the last submission: