NAME : Giacomo Saccaggi
BADGE: 833063
NICKNAME: g.saccaggi
TEAM: Random_Forest_512
ROUND: 1st
Packages used: caret, tree, randomForest, ranger, Boruta, e1071.
OkCupid is an online dating site that serves international users. Kim and Escobedo-Land (2015) describe a data set where over 50,000 profiles from the San Francisco area were made available.
The goal will be to predict whether a person’s profession is in the STEM fields (science, technology, engineering, and math) using a random sample of the overall dataset.
Submissions are evaluated by the Area Under the Curve (AUC).
During the competition, the leaderboard displays your partial score, which is the AUC for 500 (random) observations of the test set. At the end of the contest, the leaderboard will display the final score, which is the AUC for the remaining 500 observations of the test set. The final score will determine the final winner. This method prevents users from overfitting to the leaderboard.
# get the required R packages
library(tree)
library(caret)
library(randomForest)
library(ranger)
library(Boruta)
library(e1071)
# Fix the random seed before any of the randomised steps that follow.
set.seed(512)
# Training profiles (text columns read as factors) and test profiles (text
# columns kept as character), both downloaded over HTTP.
train <- read.csv("http://bee-fore.s3-eu-west-1.amazonaws.com/datasets/101.csv", stringsAsFactors = TRUE)
test <- read.csv("http://bee-fore.s3-eu-west-1.amazonaws.com/datasets/102.csv", stringsAsFactors = FALSE)
# Give the test set an (empty) target column.
test$Class <- NA
# JOIN TRAIN SET AND TEST SET
# The target is dropped from both tables by name rather than by position, so
# the join does not depend on how many columns the CSV files happen to have.
# col.target keeps the position of the target inside `train`.
col.target <- which(colnames(train) == "Class")
train_90 <- train[, colnames(train) != "Class", drop = FALSE]
test_90 <- test[, colnames(test) != "Class", drop = FALSE]
# Stack predictors only: training rows first, test rows after them.
file_90 <- rbind(train_90, test_90)
#----- MISSING VALUES
# Every "<name>_missing" sentinel is either turned into NA or replaced by an
# explicit "no answer" style category, one variable at a time.
# colonna_* holds the column position of a variable and righe_* the rows that
# carry its sentinel; they are kept as top-level variables because other parts
# of the script may refer to them.

# $body_type: sentinel -> NA
colonna_1 <- which(colnames(file_90) == "body_type")
righe_1 <- which(file_90[, colonna_1] == "body_type_missing")
file_90[righe_1, colonna_1] <- NA

# $diet: sentinel -> "no_important" (column becomes character)
colonna_2 <- which(colnames(file_90) == "diet")
righe_2 <- which(file_90[, colonna_2] == "diet_missing")
file_90[, colonna_2] <- as.character(file_90[, colonna_2])
file_90[righe_2, colonna_2] <- "no_important"

# $drinks: sentinel -> NA
colonna_3 <- which(colnames(file_90) == "drinks")
righe_3 <- which(file_90[, colonna_3] == "drinks_missing")
file_90[righe_3, colonna_3] <- NA

# $drugs: sentinel -> NA, then an ordered factor. The "0" level is declared
# but nothing in this section maps a value onto it.
colonna_4 <- which(colnames(file_90) == "drugs")
righe_4 <- which(file_90[, colonna_4] == "drugs_missing")
file_90[righe_4, colonna_4] <- NA
file_90$drugs <- ordered(file_90$drugs,
                         levels = c("0", "never", "sometimes", "often"))

# $education: both the sentinel and genuine NAs -> "no_answer"
file_90$education <- as.character(file_90$education)
colonna_5 <- which(colnames(file_90) == "education")
righe_5 <- which(file_90[, colonna_5] == "ed_missing")
file_90[righe_5, colonna_5] <- "no_answer"
eduvm <- which(is.na(file_90[, colonna_5]))
file_90[eduvm, colonna_5] <- "no_answer"

# $income: "missing" -> 0; any other value has its first three characters
# stripped and the remainder parsed as a number. Done on the whole column at
# once instead of one data-frame cell at a time; NA stays NA.
income_raw <- as.character(file_90$income)
income_is_sentinel <- !is.na(income_raw) & income_raw == "missing"
income_is_value <- !is.na(income_raw) & income_raw != "missing"
income_num <- rep(NA_real_, length(income_raw))
income_num[income_is_value] <- as.numeric(
  substr(income_raw[income_is_value], 4, nchar(income_raw[income_is_value]))
)
income_num[income_is_sentinel] <- 0
file_90$income <- income_num

# $offspring: sentinel -> "no_answer" (column becomes character)
colonna_6 <- which(colnames(file_90) == "offspring")
righe_6 <- which(file_90[, colonna_6] == "kids_missing")
# 2840 missing values out of 5000 (author's note)
file_90[, colonna_6] <- as.character(file_90[, colonna_6])
file_90[righe_6, colonna_6] <- "no_answer"

# $pets: sentinel -> "no_answer" (column becomes character)
colonna_7 <- which(colnames(file_90) == "pets")
righe_7 <- which(file_90[, colonna_7] == "pets_missing")
# 1465 missing values out of 5000 (author's note)
file_90[, colonna_7] <- as.character(file_90[, colonna_7])
file_90[righe_7, colonna_7] <- "no_answer"

# $sign: sentinel -> NA
colonna_8 <- which(colnames(file_90) == "sign")
righe_8 <- which(file_90[, colonna_8] == "sign_missing")
file_90[righe_8, colonna_8] <- NA

# $smokes: sentinel -> NA
colonna_9 <- which(colnames(file_90) == "smokes")
righe_9 <- which(file_90[, colonna_9] == "smokes_missing")
file_90[righe_9, colonna_9] <- NA

# $male: renamed to "sex" and recoded 0 -> "F", anything else -> "M"
colonna_10 <- which(colnames(file_90) == "male")
colnames(file_90)[colonna_10] <- "sex"
file_90$sex <- ifelse(file_90$sex == 0, "F", "M")
# Turn a modifier column into integer codes: 0 for the missing sentinel (and
# for NA), 1..k for the k phrases in `ordered_labels`, in that order.
# A vectorised lookup is used on purpose: writing `file_90$col <- k` inside a
# per-row loop replaces the ENTIRE column with k, which leaves the column
# constant instead of coding each row.
encode_modifier <- function(values, missing_label, ordered_labels) {
  labels <- as.character(values)
  unexpected <- setdiff(labels[!is.na(labels)],
                        c(missing_label, ordered_labels))
  if (length(unexpected) > 0) {
    # Do not hide labels this recoding does not know about.
    warning("Unrecognised modifier label(s) coded as 0: ",
            paste(unexpected, collapse = ", "), call. = FALSE)
  }
  codes <- match(labels, ordered_labels)
  codes[is.na(codes)] <- 0L
  codes
}

# $religion_modifer
colonna_10 <- which(colnames(file_90) == "religion_modifer")
righe_10 <- which(file_90[, colonna_10] == "religion_mod_missing")
file_90$religion_modifer <- encode_modifier(
  file_90$religion_modifer,
  missing_label = "religion_mod_missing",
  ordered_labels = c("and_laughing_about_it",
                     "but_not_too_serious_about_it",
                     "and_somewhat_serious_about_it",
                     "and_very_serious_about_it")
)

# $sign_modifer - MISSING VALUES
# The codes are written to sign_modifer itself; religion_modifer is not
# touched by this step.
colonna_11 <- which(colnames(file_90) == "sign_modifer")
righe_11 <- which(file_90[, colonna_11] == "sign_mod_missing")
file_90$sign_modifer <- encode_modifier(
  file_90$sign_modifer,
  missing_label = "sign_mod_missing",
  ordered_labels = c("and_its_fun_to_think_about",
                     "but_it_doesnt_matter",
                     "and_it_matters_a_lot")
)
# THE VARIABLES:
#$asian
#$black
#$hispanic_latin
#$indian $middle_eastern
#$native_american
#$pacific_islander
#$white
# WITHOUT VALUE, DELETE
# CLASSIFICATION TREE TO MAKE FORECASTS ON THE MISSING VALUE
# For each of body_type, drinks, drugs, sign and smokes, a classification tree
# is fitted on the rows where all five are known and used to fill in the rows
# where that variable is NA.

# Rows with a missing value, per variable.
val.manc_1 <- which(is.na(file_90$body_type))
val.manc_2 <- which(is.na(file_90$drinks))
val.manc_3 <- which(is.na(file_90$drugs))
val.manc_5 <- which(is.na(file_90$sign))
val.manc_6 <- which(is.na(file_90$smokes))
# Rows with at least one of the five values missing (sorted, no duplicates).
valmancantitot <- sort(unique(c(val.manc_1, val.manc_2, val.manc_3,
                                val.manc_5, val.manc_6)))
# A logical mask is used instead of file_90[-valmancantitot, ]: with an empty
# index vector the negative form would select no rows at all.
complete_rows <- !(seq_len(nrow(file_90)) %in% valmancantitot)

# Diagnostics: positions of any NA left in the complete rows, then the NA
# count of every column.
which(is.na(file_90[complete_rows, ]))
asd <- file_90[complete_rows, ]
for (i in seq_along(asd)) {
  print(paste(colnames(asd)[i], length(which(is.na(asd[, i])))))
}

# Training table for the trees: complete rows, with columns removed by
# position. NOTE(review): the positions are taken as-is from the script; the
# column list printed by cat() below shows what is left.
file_NMV <- file_90[complete_rows, -18]
file_NMV <- file_NMV[, -c(31:length(file_NMV))]
file_NMV <- file_NMV[, -c(20:27)]
file_NMV <- file_NMV[, -6]
file_NMV <- file_NMV[, -c(20, 21)]

# Predictors that must be factors, both when fitting and when predicting.
tree_factor_cols <- c("diet", "offspring", "pets", "sign_modifer", "sex")
file_NMV[tree_factor_cols] <- lapply(file_NMV[tree_factor_cols], as.factor)
cat(paste("+", colnames(file_NMV)))

# Predictors shared by all five trees.
tree_predictors <- c("age", "diet", "height", "income", "last_online",
                     "offspring", "orientation", "pets", "religion", "status",
                     "sex", "religion_modifer", "sign_modifer", "essay_link")

# Fit a classification tree for `target` on `train_data` and return the
# predicted class for every row of `new_rows`.
impute_by_tree <- function(target, train_data, new_rows) {
  new_rows[tree_factor_cols] <- lapply(new_rows[tree_factor_cols], as.factor)
  tree_formula <- as.formula(
    paste0("as.factor(", target, ") ~ ",
           paste(tree_predictors, collapse = " + "))
  )
  fit <- tree(tree_formula, data = train_data)
  predict(fit, newdata = new_rows, type = "class")
}

# The targets are not predictors of one another, so the order in which they
# are filled in does not change any prediction.
rows_to_impute <- list(body_type = val.manc_1,
                       drinks = val.manc_2,
                       drugs = val.manc_3,
                       sign = val.manc_5,
                       smokes = val.manc_6)
for (target in names(rows_to_impute)) {
  target_rows <- rows_to_impute[[target]]
  if (length(target_rows) == 0) {
    next  # nothing missing for this variable
  }
  file_90[target_rows, target] <- impute_by_tree(target, file_NMV,
                                                 file_90[target_rows, ])
}
# Columns that were recoded as text (or numeric codes) become factors again
# before the joined table is split.
recoded_cols <- c("diet", "education", "offspring", "pets", "sex",
                  "sign_modifer")
file_90[recoded_cols] <- lapply(file_90[recoded_cols], as.factor)

#------------------ SPLIT THE FILE BACK INTO TRAIN AND TEST ROWS
# The joined table holds the training rows first, so the split point is the
# number of rows of `train` rather than a fixed 4000 / 1000.
n_train <- nrow(train)
is_train_row <- seq_len(nrow(file_90)) <= n_train
Trainn <- file_90[is_train_row, ]
Test <- file_90[!is_train_row, ]
# Re-attach the target as the last column of the training table.
Train <- cbind(Trainn, train$Class)
colnames(Train)[ncol(Train)] <- "Class"
# Feature selection with Boruta on a copy of the training table in which
# EVERY column (numeric ones included) is converted to a factor.
filefs <- Train
convert <- seq_len(ncol(filefs))
# lapply() converts column by column. apply() would first coerce the data
# frame to a character matrix, and since R 4.0 data.frame() no longer turns
# strings into factors, so the columns would silently end up as character.
filefs[convert] <- lapply(filefs[convert], as.factor)

boruta.train <- Boruta(filefs$Class ~ ., data = filefs, doTrace = 2)
boruta.train$finalDecision
# Settle the attributes Boruta left as "Tentative", then collect the
# per-attribute statistics.
final.boruta <- TentativeRoughFix(boruta.train)
boruta.df <- attStats(final.boruta)

# Important variables: keep attributes whose absolute mean importance
# exceeds 1.3.
varimpportant <- row.names(boruta.df)
val <- abs(boruta.df$meanImp)
d <- which(val > 1.3)
# val > 1.3 selected 48 important attributes (author's note)
varimpportant <- varimpportant[d]

# Modelling tables restricted to the selected variables. drop = FALSE keeps
# them data frames even if only one variable is selected.
dataimp <- cbind(Train[, varimpportant, drop = FALSE], Train$Class)
colnames(dataimp)[length(colnames(dataimp))] <- "Class"
dataimptest <- Test[, varimpportant, drop = FALSE]
# Naive Bayes: probability of the "stem" class for every test row, kept as a
# one-column matrix.
fit.bayes <- naiveBayes(as.factor(dataimp$Class) ~ ., data = dataimp)
prob.bayes <- predict(fit.bayes, newdata = dataimptest, type = "raw")
yhat.bayes <- prob.bayes[, "stem", drop = FALSE]

# Random forest: same quantity from the forest's class probabilities.
fit.RF <- randomForest(as.factor(dataimp$Class) ~ ., data = dataimp)
prob.RF <- predict(fit.RF, newdata = dataimptest, type = "prob")
yhat.RF <- prob.RF[, "stem", drop = FALSE]

# Final score: plain average of the two models, written one value per line
# without row or column names.
yhat.mean <- (yhat.bayes + yhat.RF) / 2
write.table(yhat.mean, file = "OK.Cupid.txt",
            row.names = FALSE, col.names = FALSE)
head(yhat.mean)
stem
[1,] 0.6772953
[2,] 0.6920000
[3,] 0.0910000
[4,] 0.0268952
[5,] 0.4165008
[6,] 0.1826057