NAME : Giacomo Saccaggi

BADGE: 833063

NICKNAME: g.saccaggi

TEAM: Random_Forest_512

ROUND: 1st

All my projects Introduction Information Script in R Results

Packages we used:
caret
tree
randomForest
ranger
Boruta
e1071


Competition 2: OK Cupid

Description:

OkCupid is an online dating site that serves international users. Kim and Escobedo-Land (2015) describe a data set where over 50,000 profiles from the San Francisco area were made available.

The goal will be to predict whether a person’s profession is in the STEM fields (science, technology, engineering, and math) using a random sample of the overall dataset.

Evaluation:

Submissions are evaluated by the Area Under the Curve (AUC).

During the competition, the leaderboard displays your partial score, which is the AUC for 500 (random) observations of the test set. At the end of the contest, the leaderboard will display the final score, which is the AUC for the remaining 500 observations of the test set. The final score will determine the final winner. This method prevents users from overfitting to the leaderboard.


My strategy was:

    1. Create a dataset combining the test set and the train set, with no missing values and without the response variable.
    2. Identify missing values.
    3. Predict missing values through classification trees.
    4. Perform feature selection to eliminate any distorting (uninformative) variables.
    5. Random Forest and Naive Bayes

Models

  • Random Forest
  • Naive Bayes

Non-standard R packages

  • caret
  • tree
  • randomForest
  • ranger
  • Boruta
  • e1071


R code to reproduce the last submission:

# get the required R packages
library(tree)
library(caret)
library(randomForest)
library(ranger)
library(Boruta)
library(e1071)

# Fix the RNG seed so that the random steps later in the script are
# reproducible.
set.seed(512)

# Download the competition data. The train set is read with strings as
# factors, the test set with plain character columns.
train <- read.csv("http://bee-fore.s3-eu-west-1.amazonaws.com/datasets/101.csv", stringsAsFactors = TRUE)
test <- read.csv("http://bee-fore.s3-eu-west-1.amazonaws.com/datasets/102.csv", stringsAsFactors = FALSE)

# Placeholder response column for the test set.
test$Class <- NA


# JOIN TRAIN SET AND TEST SET
# The response is dropped by name from both sets (not by position), then the
# train rows are stacked on top of the test rows.
stopifnot("Class" %in% colnames(train))
col.target <- which(colnames(train) == "Class")
train_90 <- train[, colnames(train) != "Class", drop = FALSE]
test_90 <- test[, colnames(test) != "Class", drop = FALSE]
file_90 <- rbind(train_90, test_90)

#----- MISSING VALUES

# $body_type: the placeholder "body_type_missing" becomes a real NA
colonna_1 <- which(names(file_90) == "body_type")
righe_1 <- which(file_90[[colonna_1]] == "body_type_missing")
file_90[[colonna_1]][righe_1] <- NA

# $diet: the placeholder "diet_missing" becomes its own category
colonna_2 <- which(names(file_90) == "diet")
righe_2 <- which(file_90[[colonna_2]] == "diet_missing")
file_90[[colonna_2]] <- as.character(file_90[[colonna_2]])
file_90[[colonna_2]][righe_2] <- "no_important"

# $drinks: the placeholder "drinks_missing" becomes a real NA
colonna_3 <- which(names(file_90) == "drinks")
righe_3 <- which(file_90[[colonna_3]] == "drinks_missing")
file_90[[colonna_3]][righe_3] <- NA

# $drugs: the placeholder "drugs_missing" becomes a real NA, then the column
# is turned into an ordered factor
colonna_4 <- which(names(file_90) == "drugs")
righe_4 <- which(file_90[[colonna_4]] == "drugs_missing")
file_90[[colonna_4]][righe_4] <- NA
file_90$drugs <- factor(
  file_90$drugs,
  levels = c("0", "never", "sometimes", "often"),
  ordered = TRUE
)

# $education: both the placeholder "ed_missing" and real NAs become the
# category "no_answer"
colonna_5 <- which(names(file_90) == "education")
file_90[[colonna_5]] <- as.character(file_90[[colonna_5]])
righe_5 <- which(file_90[[colonna_5]] == "ed_missing")
file_90[[colonna_5]][righe_5] <- "no_answer"
eduvm <- which(is.na(file_90[[colonna_5]]))
file_90[[colonna_5]][eduvm] <- "no_answer"

# $income
# Convert the income labels to numbers in one vectorised pass:
#   - "missing" becomes 0;
#   - any other label is parsed from its 4th character onwards (the first
#     three characters are skipped as a prefix);
#   - an NA label stays NA.
# Only the non-missing labels are parsed, so as.numeric() never sees the
# text "missing" and raises no coercion warning for it.
income_chr <- as.character(file_90$income)
income_num <- rep(0, length(income_chr))
income_num[is.na(income_chr)] <- NA
income_reported <- !is.na(income_chr) & income_chr != "missing"
income_num[income_reported] <- as.numeric(
  substr(income_chr[income_reported], 4, nchar(income_chr[income_reported]))
)
file_90$income <- income_num

# $offspring: the placeholder "kids_missing" becomes the category "no_answer"
# (author's note: 2840 missing values out of 5000)
colonna_6 <- which(names(file_90) == "offspring")
righe_6 <- which(file_90[[colonna_6]] == "kids_missing")
file_90[[colonna_6]] <- as.character(file_90[[colonna_6]])
file_90[[colonna_6]][righe_6] <- "no_answer"

# $pets: the placeholder "pets_missing" becomes the category "no_answer"
# (author's note: 1465 missing values out of 5000)
colonna_7 <- which(names(file_90) == "pets")
righe_7 <- which(file_90[[colonna_7]] == "pets_missing")
file_90[[colonna_7]] <- as.character(file_90[[colonna_7]])
file_90[[colonna_7]][righe_7] <- "no_answer"

# $sign: the placeholder "sign_missing" becomes a real NA
colonna_8 <- which(names(file_90) == "sign")
righe_8 <- which(file_90[[colonna_8]] == "sign_missing")
file_90[[colonna_8]][righe_8] <- NA

# $smokes: the placeholder "smokes_missing" becomes a real NA
colonna_9 <- which(names(file_90) == "smokes")
righe_9 <- which(file_90[[colonna_9]] == "smokes_missing")
file_90[[colonna_9]][righe_9] <- NA

# $male: rename the column to "sex" and recode 0 as "F", anything else as "M"
colonna_10 <- which(names(file_90) == "male")
names(file_90)[colonna_10] <- "sex"
sex_is_zero <- file_90$sex == 0
file_90$sex <- ifelse(sex_is_zero, "F", "M")

# $religion_modifer
# Encode each answer as a numeric code, row by row, through a lookup table.
# The previous loop assigned to the whole column (`file_90$religion_modifer=1`)
# instead of to row i, which turned the column into a single constant.
colonna_10 <- which(colnames(file_90) == "religion_modifer")
righe_10 <- which(file_90[, colonna_10] == "religion_mod_missing")
religion_codes <- c(
  "religion_mod_missing" = 0,
  "and_laughing_about_it" = 1,
  "but_not_too_serious_about_it" = 2,
  "and_somewhat_serious_about_it" = 3,
  "and_very_serious_about_it" = 4
)
religion_raw <- as.character(file_90[, colonna_10])
religion_num <- unname(religion_codes[religion_raw])
# A value outside the lookup table would otherwise become NA silently.
if (any(is.na(religion_num) & !is.na(religion_raw))) {
  warning("unrecognised religion_modifer values were set to NA", call. = FALSE)
}
file_90$religion_modifer <- religion_num

# $sign_modifer - MISSING VALUES
# Same encoding for sign_modifer. The previous loop wrote its codes into
# religion_modifer (copy-paste slip), so sign_modifer was never encoded and
# religion_modifer was overwritten a second time.
colonna_11 <- which(colnames(file_90) == "sign_modifer")
righe_11 <- which(file_90[, colonna_11] == "sign_mod_missing")
sign_codes <- c(
  "sign_mod_missing" = 0,
  "and_its_fun_to_think_about" = 1,
  "but_it_doesnt_matter" = 2,
  "and_it_matters_a_lot" = 3
)
sign_raw <- as.character(file_90[, colonna_11])
sign_num <- unname(sign_codes[sign_raw])
if (any(is.na(sign_num) & !is.na(sign_raw))) {
  warning("unrecognised sign_modifer values were set to NA", call. = FALSE)
}
file_90$sign_modifer <- sign_num

# THE VARIABLES:
#$asian 
#$black 
#$hispanic_latin 
#$indian $middle_eastern 
#$native_american 
#$pacific_islander 
#$white
# WITHOUT VALUE, DELETE

# CLASSIFICATION TREE TO MAKE FORECASTS ON THE MISSING VALUE

# Row indices with a missing value, one vector per column to be imputed.
val.manc_1 <- which(is.na(file_90$body_type))
val.manc_2 <- which(is.na(file_90$drinks))
val.manc_3 <- which(is.na(file_90$drugs))
val.manc_5 <- which(is.na(file_90$sign))
val.manc_6 <- which(is.na(file_90$smokes))
# Sorted, de-duplicated union of all those row indices.
valmancantitot <- sort(unique(c(val.manc_1, val.manc_2, val.manc_3, val.manc_5, val.manc_6)))

# Rows with none of the five values missing. They are selected positively
# because file_90[-integer(0), ] would return zero rows, not all of them.
righe_complete <- setdiff(seq_len(nrow(file_90)), valmancantitot)

# Diagnostic: positions of any NA still present among the complete rows.
which(is.na(file_90[righe_complete, ]))

# Diagnostic: number of NAs per column among the complete rows.
asd <- file_90[righe_complete, ]
for (i in seq_along(asd)) {
  print(paste(colnames(asd)[i], sum(is.na(asd[, i]))))
}


# Rows to impute, one data frame per target column.
test_body_type1 <- file_90[val.manc_1, ]

test_drinks1 <- file_90[val.manc_2, ]

test_drugs1 <- file_90[val.manc_3, ]

test_sign1 <- file_90[val.manc_5, ]

test_smokes1 <- file_90[val.manc_6, ]

# Training data for the imputation trees: complete rows only, with a subset
# of the columns. Columns are removed by position, and each removal shifts
# the positions used by the next one, so the order of these lines matters.
# NOTE(review): presumably this leaves only the tree predictors and the five
# imputation targets - confirm against the column layout of the data.
file_NMV <- file_90[righe_complete, -18]

file_NMV <- file_NMV[, -c(31:length(file_NMV))]
file_NMV <- file_NMV[, -c(20:27)]
file_NMV <- file_NMV[, -6]
file_NMV <- file_NMV[, -c(20, 21)]

# Columns that are converted to factors before fitting / predicting.
colonne_fattore <- c("diet", "offspring", "pets", "sign_modifer", "sex")

# Return `dati` with the columns listed in colonne_fattore turned into factors.
in_fattori <- function(dati) {
  dati[colonne_fattore] <- lapply(dati[colonne_fattore], as.factor)
  dati
}

file_NMV <- in_fattori(file_NMV)
test_body_type1 <- in_fattori(test_body_type1)

# Print the available column names in "+ name" form.
cat(paste("+", colnames(file_NMV)))

#age + diet + height + income + last_online + offspring + orientation + pets + religion + status + sex + religion_modifer + sign_modifer + essay_link

# Right-hand side shared by all five imputation trees.
predittori <- "age + diet + height + income + last_online + offspring + orientation + pets + religion + status + sex + religion_modifer + sign_modifer + essay_link"

# body_type: fit on the complete rows, predict the class for the rows where
# it is missing, and write the predictions back into file_90.
fit_1 <- tree(as.formula(paste("body_type ~", predittori)), data = file_NMV)
prev_1 <- predict(fit_1, newdata = test_body_type1, type = "class")
file_90[[colonna_1]][val.manc_1] <- prev_1

# drinks
test_drinks1 <- in_fattori(test_drinks1)

fit_2 <- tree(as.formula(paste("as.factor(drinks) ~", predittori)), data = file_NMV)
prev_2 <- predict(fit_2, test_drinks1, type = "class")
file_90[[colonna_3]][val.manc_2] <- prev_2

# drugs
test_drugs1 <- in_fattori(test_drugs1)

fit_3 <- tree(as.formula(paste("as.factor(drugs) ~", predittori)), data = file_NMV)
prev_3 <- predict(fit_3, test_drugs1, type = "class")
test_drugs1$drugs <- prev_3
file_90[[colonna_4]][val.manc_3] <- test_drugs1$drugs

# sign
test_sign1 <- in_fattori(test_sign1)

fit_5 <- tree(as.formula(paste("as.factor(sign) ~", predittori)), data = file_NMV)
prev_5 <- predict(fit_5, test_sign1, type = "class")
test_sign1$sign <- prev_5
file_90[[colonna_8]][val.manc_5] <- test_sign1$sign

# smokes
test_smokes1 <- in_fattori(test_smokes1)

fit_6 <- tree(as.formula(paste("as.factor(smokes) ~", predittori)), data = file_NMV)
prev_6 <- predict(fit_6, test_smokes1, type = "class")
test_smokes1$smokes <- prev_6
file_90[[colonna_9]][val.manc_6] <- test_smokes1$smokes

# Turn the recoded character columns into factors (each column once).
for (nome_col in c("diet", "education", "offspring", "pets", "sex", "sign_modifer")) {
  file_90[[nome_col]] <- as.factor(file_90[[nome_col]])
}

#------------------ SPLIT THE FILE BACK INTO TRAIN AND TEST ROWS
# file_90 was built as train rows followed by test rows, so the first
# nrow(train) rows are the train part (4000 and 1000 rows in the competition
# data). The sizes are taken from the data instead of being hard-coded.
n_train <- nrow(train)
righe_train <- seq_len(nrow(file_90)) <= n_train
Trainn <- file_90[righe_train, ]
Test <- file_90[!righe_train, ]
# Re-attach the response under its name rather than by column position.
Train <- cbind(Trainn, Class = train$Class)

# Feature selection with Boruta on a copy of the train data in which every
# column is a factor.
filefs <- Train
convert <- seq_len(ncol(filefs))
# Convert column by column. apply() would first coerce the data frame to a
# character matrix, so the as.factor() result would be lost (under R >= 4.0
# the columns come back as plain character).
filefs[convert] <- lapply(filefs[convert], as.factor)
boruta.train <- Boruta(Class ~ ., data = filefs, doTrace = 2)

boruta.train$finalDecision

# Resolve attributes left as "Tentative", then collect per-attribute stats.
final.boruta <- TentativeRoughFix(boruta.train)
boruta.df <- attStats(final.boruta)
# important variables
varimpportant <- row.names(boruta.df)
# NOTE(review): abs() also keeps attributes whose mean importance is strongly
# negative - confirm this is intended.
val <- abs(boruta.df$meanImp)
d <- which(val > 1.3)
# author's note: val > 1.3 selected 48 important attributes
varimpportant <- varimpportant[d]
# drop = FALSE keeps a data frame even if only one attribute is selected.
dataimp <- cbind(Train[, varimpportant, drop = FALSE], Class = Train$Class)
dataimptest <- Test[, varimpportant, drop = FALSE]

# Naive Bayes: fit on the selected attributes and keep the predicted
# probability of the "stem" class as a one-column matrix.
fit.bayes <- naiveBayes(as.factor(dataimp$Class) ~ ., data = dataimp)
prob.bayes <- predict(fit.bayes, newdata = dataimptest, type = "raw")
yhat.bayes <- prob.bayes[, "stem", drop = FALSE]

# Random forest: same data, same extraction of the "stem" probability.
fit.RF <- randomForest(as.factor(dataimp$Class) ~ ., data = dataimp)
prob.RF <- predict(fit.RF, newdata = dataimptest, type = "prob")
yhat.RF <- prob.RF[, "stem", drop = FALSE]

# Final prediction: plain average of the two probabilities, written one value
# per line without headers.
yhat.mean <- (yhat.bayes + yhat.RF) / 2
write.table(yhat.mean, file = "OK.Cupid.txt", row.names = FALSE, col.names = FALSE)
head(yhat.mean)
          stem
[1,] 0.6772953
[2,] 0.6920000
[3,] 0.0910000
[4,] 0.0268952
[5,] 0.4165008
[6,] 0.1826057