Poop Sheet

Melbourne Housing

This is the example used in Kaggle’s Basic Data Exploration.

melbourne_data <- read.csv("melb_data.csv", header = T, na.strings = "?", stringsAsFactors = T)
train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
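A rough R equivalent of the exploration steps in that lesson (the lesson itself uses pandas, so these are just the usual base R calls):

summary(melbourne_data)   # roughly the lesson's describe() step
head(melbourne_data)      # first few rows
dim(melbourne_data)       # number of rows and columns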

Introductory Example

Kaggle provides a training set (which includes the outcome, Survived, along with various predictors), a test set that doesn't include Survived, and a sample submission, gender_submission.csv, showing how results should be formatted.
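A quick structural check of the three files makes the split clear (assuming they have been downloaded to the working directory):

train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
test <- read.csv("test.csv", header = T, na.strings = "?", stringsAsFactors = T)
gender_submission <- read.csv("gender_submission.csv", header = T)
names(train)              # Survived plus the predictors
names(test)               # the same predictors, but no Survived
names(gender_submission)  # just PassengerId and Survived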

Writing submission file

The model behind gender_submission.csv is that all female passengers survived and all male passengers didn't.

test <- read.csv("test.csv", header = T, na.strings = "?", stringsAsFactors = T)
test$Survived <- as.integer(test$Sex == "female")  # 1 if female, 0 if male
write.csv(test[,c("PassengerId","Survived")], "submission.csv", quote = F, row.names = F)

Checking accuracy of model

train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
train$Prediction <- as.integer(train$Sex == "female")
(sum(train$Survived == train$Prediction)/length(train$Survived)) * 100  # accuracy as a percentage

This shows that assuming all females survived and all males died is about 79% accurate on the training set.
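One way to see where the errors fall is a quick cross-tabulation of actual against predicted:

table(Actual = train$Survived, Predicted = train$Prediction)  # rows: actual, columns: predicted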

Making the model more complex

train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
model <- glm(Survived ~ Sex, data = train)
train$Prediction <- round(predict(model, newdata = train))
(sum(train$Survived == train$Prediction)/length(train$Survived)) * 100

predict(model) allocates 0.1889081 to males and 0.7420382 to females, so rounding produces the same result as assuming all females survived and all males didn't: 78.67565%.
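Those two values are just the per-sex survival rates: glm() defaults to family = gaussian, so this is an ordinary linear regression, and with the default treatment contrasts female is the reference level. The same numbers can be read off directly:

coef(model)                              # intercept = female rate, Sexmale = the difference
tapply(train$Survived, train$Sex, mean)  # the 0.742 and 0.189 figures directly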

Making age a factor

A snag here is that Age is NA for many entries.
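A quick count shows the size of the gap before deciding how to handle it:

sum(is.na(train$Age))          # number of passengers with no Age recorded
mean(is.na(train$Age)) * 100   # as a percentage of the training set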

Subset train to only rows with Age provided
train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
has_age <- subset(train, !is.na(train$Age))
model <- glm(Survived ~ Sex + Age, data = has_age)
has_age$Prediction <- round(predict(model, newdata = has_age))
(sum(has_age$Survived == has_age$Prediction)/length(has_age$Survived)) * 100

This slightly reduces the accuracy of the model, to 78.0112%.

Subset train to only rows with no Age provided
train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
no_age <- subset(train, is.na(train$Age))
model <- glm(Survived ~ Sex, data = no_age)
no_age$Prediction <- round(predict(model, newdata = no_age))
(sum(no_age$Survived == no_age$Prediction)/length(no_age$Survived)) * 100

Passengers with no age recorded had lower survival rates: 0.1290323 for males and 0.6792453 for females. The accuracy of the prediction on this subset rose to 81.35593%.

Subset to children and adults
train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
children <- subset(train, train$Age < 15)
model <- glm(Survived ~ Sex, data = children)
children$Prediction <- round(predict(model, newdata = children))
(sum(children$Survived == children$Prediction)/length(children$Survived)) * 100

The survival rate for males under 15 rose to 0.5384615, while the rate for females dropped to 0.6153846. The accuracy fell to 57.69231%.

train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
adults <- subset(train, train$Age >= 15)
model <- glm(Survived ~ Sex, data = adults)
adults$Prediction <- round(predict(model, newdata = adults))
(sum(adults$Survived == adults$Prediction)/length(adults$Survived)) * 100

The survival rate for adult males fell to 0.1739130, while the rate for adult females, 0.7792793, was slightly above the overall female rate. Accuracy rose to 80.97484%.
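As before, the fitted values are just the per-sex survival rates within each subset; assuming the children and adults data frames above are still around, tapply() reproduces them without fitting a model:

tapply(children$Survived, children$Sex, mean)  # rates for under-15s
tapply(adults$Survived, adults$Sex, mean)      # rates for 15 and over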

Assuming all females and children under 14 survived

train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
train$Prediction <- as.integer(train$Sex == "female" | (!is.na(train$Age) & train$Age < 14))
(sum(train$Survived == train$Prediction)/length(train$Survived)) * 100

This increases the accuracy slightly to 79.23681%.

Subsetting by class

train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
first <- subset(train, Pclass == 1)
(sum(first$Survived == 1)/length(first$PassengerId)) * 100
first_male <- subset(train, Pclass == 1 & Sex == "male")
(sum(first_male$Survived == 1)/length(first_male$PassengerId)) * 100
first_female <- subset(train, Pclass == 1 & Sex == "female")
(sum(first_female$Survived == 1)/length(first_female$PassengerId)) * 100

62.96296% of first class passengers survived. The survival rate for males rose to 36.88525% and for females to 96.80851%.

train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
second <- subset(train, Pclass == 2)
(sum(second$Survived == 1)/length(second$PassengerId)) * 100
second_male <- subset(train, Pclass == 2 & Sex == "male")
(sum(second_male$Survived == 1)/length(second_male$PassengerId)) * 100
second_female <- subset(train, Pclass == 2 & Sex == "female")
(sum(second_female$Survived == 1)/length(second_female$PassengerId)) * 100

In second class, 47.28261% survived overall: 15.74074% of males and 92.10526% of females.

train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
third <- subset(train, Pclass == 3)
(sum(third$Survived == 1)/length(third$PassengerId)) * 100
third_male <- subset(train, Pclass == 3 & Sex == "male")
(sum(third_male$Survived == 1)/length(third_male$PassengerId)) * 100
third_female <- subset(train, Pclass == 3 & Sex == "female")
(sum(third_female$Survived == 1)/length(third_female$PassengerId)) * 100

Only 24.23625% of third class passengers survived: just 13.54467% of males and 50% of females.

train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
above_third <- subset(train, Pclass < 3)
(sum(above_third$Survived == 1)/length(above_third$PassengerId)) * 100
above_third_male <- subset(train, Pclass < 3 & Sex == "male")
(sum(above_third_male$Survived == 1)/length(above_third_male$PassengerId)) * 100
above_third_female <- subset(train, Pclass < 3 & Sex == "female")
(sum(above_third_female$Survived == 1)/length(above_third_female$PassengerId)) * 100

Passengers in better than 3rd class (1st and 2nd combined) survived at 55.75% overall: 26.95652% of males and 94.70588% of females.
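All of the class-by-sex rates above can also be produced in one pass instead of subset by subset, for example:

aggregate(Survived ~ Pclass + Sex, data = train, FUN = mean)           # one row per class/sex combination
round(tapply(train$Survived, list(train$Pclass, train$Sex), mean), 4)  # the same rates as a 3 x 2 table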

train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
third_children <- subset(train, Pclass == 3 & Age < 14)
(sum(third_children$Survived == 1)/length(third_children$PassengerId)) * 100
above_third_children <- subset(train, Pclass < 3 & Age < 14)
(sum(above_third_children$Survived == 1)/length(above_third_children$PassengerId)) * 100

Only 42.85714% of 3rd class children survived while 95.45455% of children in 2nd and 1st class survived.

Assuming females and children in 1st or 2nd class survived, and all 3rd class passengers died.

train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
train$Prediction <- as.integer((train$Sex == "female" & train$Pclass < 3) | 
  (!is.na(train$Age) & train$Age < 14 & train$Pclass < 3))
(sum(train$Survived == train$Prediction)/length(train$Survived)) * 100

This raises the accuracy to 80.02245%.

test <- read.csv("test.csv", header = T, na.strings = "?", stringsAsFactors = T)
test$Survived <- as.integer((test$Sex == "female" & test$Pclass < 3) | 
  (!is.na(test$Age) & test$Age < 14 & test$Pclass < 3))
write.csv(test[,c("PassengerId","Survived")], "submission.csv", quote = F, row.names = F)
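Before uploading it's worth a sanity check that the file has the expected shape, one prediction per test passenger:

submission <- read.csv("submission.csv")
nrow(submission)    # should equal nrow(test)
head(submission)    # two columns: PassengerId and Survived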

This worked and received a public score of 0.78468, placing me 1783rd on the leaderboard.