House Prices
This is another learning example offered by Kaggle.
The object is to predict SalePrice which is provided in train.csv, but missing in test.csv. The submission should just have two columns, Id and SalePrice.
train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
ncol(train)
There are 81 columns, so an overhwelming number.
Qualitative Summary
summary(train$MSZoning)
C (all) FV RH RL RM
10 65 16 1151 218
summary(train$Id)
“MSSubClass” “MSZoning” “LotFrontage”
[5] “LotArea” “Street” “Alley” “LotShape”
[9] “LandContour” “Utilities” “LotConfig” “LandSlope”
[13] “Neighborhood” “Condition1” “Condition2” “BldgType”
[17] “HouseStyle” “OverallQual” “OverallCond” “YearBuilt”
[21] “YearRemodAdd” “RoofStyle” “RoofMatl” “Exterior1st”
[25] “Exterior2nd” “MasVnrType” “MasVnrArea” “ExterQual”
[29] “ExterCond” “Foundation” “BsmtQual” “BsmtCond”
[33] “BsmtExposure” “BsmtFinType1” “BsmtFinSF1” “BsmtFinType2”
[37] “BsmtFinSF2” “BsmtUnfSF” “TotalBsmtSF” “Heating”
[41] “HeatingQC” “CentralAir” “Electrical” “X1stFlrSF”
[45] “X2ndFlrSF” “LowQualFinSF” “GrLivArea” “BsmtFullBath”
[49] “BsmtHalfBath” “FullBath” “HalfBath” “BedroomAbvGr”
[53] “KitchenAbvGr” “KitchenQual” “TotRmsAbvGrd” “Functional”
[57] “Fireplaces” “FireplaceQu” “GarageType” “GarageYrBlt”
[61] “GarageFinish” “GarageCars” “GarageArea” “GarageQual”
[65] “GarageCond” “PavedDrive” “WoodDeckSF” “OpenPorchSF”
[69] “EnclosedPorch” “X3SsnPorch” “ScreenPorch” “PoolArea”
[73] “PoolQC” “Fence” “MiscFeature” “MiscVal”
[77] “MoSold” “YrSold” “SaleType” “SaleCondition”
[81] “SalePrice”
Best Subset Selection
The code below crashes my laptop after several hours.
library(leaps)
train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
hpsubsets <- regsubsets(SalePrice ~ . - Id - , data = train, really.big = T)
Foundation Heating CentralAir Electrical GrLivArea
That causes a crash.
library(leaps)
train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
hpsubsets <- regsubsets(SalePrice ~ OverallQual + OverallCond + LotArea + RoofMatl + ExterQual +
BsmtQual + BsmtFinSF1 + X1stFlrSF + X2ndFlrSF + KitchenQual, data = train)
To get a clue which of the many factors to focus on:
train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
model <- lm(SalePrice ~ . , data = train)
summary(model)
Next limiting this to those with three asterisks. A snag is one of the rows in KitchenQual has an “NA” which needs to be replaced by some other value for predict not to throw an error.
train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
model <- lm(SalePrice ~ OverallQual + OverallCond + LotArea + RoofMatl + ExterQual +
BsmtQual + BsmtFinSF1 + X1stFlrSF + X2ndFlrSF + KitchenQual, data = train)
test <- read.csv("test.csv", header = T, na.strings = "?", stringsAsFactors = T)
test[c(96), c("KitchenQual")] <- "TA"
test$BsmtFinSF1 <- as.numeric(test$BsmtFinSF1)
test$SalePrice <- predict(object = model, newdata = test)
write.csv(test[,c("Id","SalePrice")], "submission.csv", quote = F, row.names = F)
This produced a really lousy score of 0.17, so time to step back and work through the basics of linear regression.
train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
model <- lm(SalePrice ~ OverallQual + I(OverallQual^2) + I(OverallQual^3), data = train)
test <- read.csv("test.csv", header = T, na.strings = "?", stringsAsFactors = T)
test$SalePrice <- predict(object = model, newdata = test)
write.csv(test[,c("Id","SalePrice")], "submission.csv", quote = F, row.names = F)
This produced a score of 0.23, so better than the first attempt.
Straight Line
Looking at plot(SalePrice ~ OverallQual, data = train) shows a relatively linear looking relation of SalePrice = -96206 + (45436 * OverallQual)
train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
model <- lm(SalePrice ~ OverallQual, data = train)
svg("simple-line1.svg", width = 11, pointsize = 12, family = "sans")
plot(SalePrice ~ OverallQual, data = train, col = OverallCond, pch = 20)
abline(model)
dev.off()
Exponential
myfunc <- function(x) {
y = 151604 - 38006*x + 6676*x^2
return(y)
}
train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
model <- lm(SalePrice ~ OverallQual + I(OverallQual^2), data = train)
svg("qual-exp2.svg", width = 11, pointsize = 12, family = "sans")
plot(SalePrice ~ OverallQual, data = train, col = OverallCond, pch = 20)
curve(myfunc, from = 0, to = 10, n = 10, add = TRUE)
dev.off()
train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
model1 <- lm(SalePrice ~ OverallQual, data = train)
model2 <- lm(SalePrice ~ OverallQual + I(OverallQual^2), data = train)
anova(model1, model2)
Model 1: SalePrice ~ OverallQual
Model 2: SalePrice ~ OverallQual + I(OverallQual^2)
Res.Df RSS Df Sum of Sq F Pr(>F)
1 1458 3.4470e+12
2 1457 2.9608e+12 1 4.8617e+11 239.24 < 2.2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
myfunc <- function(x) {
y = 32828.2 + 26931.6*x - 4472.4*x^2 + 604.9*x^3
return(y)
}
train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
model <- lm(SalePrice ~ OverallQual + I(OverallQual^2) + I(OverallQual^3), data = train)
svg("qual-exp3.svg", width = 11, pointsize = 12, family = "sans")
plot(SalePrice ~ OverallQual, data = train, col = OverallCond, pch = 20)
curve(myfunc, from = 0, to = 10, n = 10, add = TRUE)
dev.off()
train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
model2 <- lm(SalePrice ~ OverallQual + I(OverallQual^2), data = train)
model3 <- lm(SalePrice ~ OverallQual + I(OverallQual^2) + I(OverallQual^3), data = train)
anova(model2, model3)
Model 1: SalePrice ~ OverallQual + I(OverallQual^2)
Model 2: SalePrice ~ OverallQual + I(OverallQual^2) + I(OverallQual^3)
Res.Df RSS Df Sum of Sq F Pr(>F)
1 1457 2.9608e+12
2 1456 2.9327e+12 1 2.8078e+10 13.94 0.000196 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
model1 <- lm(SalePrice ~ OverallQual, data = train)
model3 <- lm(SalePrice ~ OverallQual + I(OverallQual^2) + I(OverallQual^3), data = train)
anova(model1, model3)
Analysis of Variance Table
Model 1: SalePrice ~ OverallQual
Model 2: SalePrice ~ OverallQual + I(OverallQual^2) + I(OverallQual^3)
Res.Df RSS Df Sum of Sq F Pr(>F)
1 1458 3.4470e+12
2 1456 2.9327e+12 2 5.1425e+11 127.65 < 2.2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
model <- lm(SalePrice ~ OverallCond, data = train)
svg("simple-line2.svg", width = 11, pointsize = 12, family = "sans")
plot(SalePrice ~ OverallCond, data = train, col = OverallQual, pch = 20)
abline(model)
dev.off()
Is There a Relationship Between the Response and Predictors?
There are an overwhelming number of columns:
names(train)
[1] "Id" "MSSubClass" "MSZoning" "LotFrontage"
[5] "LotArea" "Street" "Alley" "LotShape"
[9] "LandContour" "Utilities" "LotConfig" "LandSlope"
[13] "Neighborhood" "Condition1" "Condition2" "BldgType"
[17] "HouseStyle" "OverallQual" "OverallCond" "YearBuilt"
[21] "YearRemodAdd" "RoofStyle" "RoofMatl" "Exterior1st"
[25] "Exterior2nd" "MasVnrType" "MasVnrArea" "ExterQual"
[29] "ExterCond" "Foundation" "BsmtQual" "BsmtCond"
[33] "BsmtExposure" "BsmtFinType1" "BsmtFinSF1" "BsmtFinType2"
[37] "BsmtFinSF2" "BsmtUnfSF" "TotalBsmtSF" "Heating"
[41] "HeatingQC" "CentralAir" "Electrical" "X1stFlrSF"
[45] "X2ndFlrSF" "LowQualFinSF" "GrLivArea" "BsmtFullBath"
[49] "BsmtHalfBath" "FullBath" "HalfBath" "BedroomAbvGr"
[53] "KitchenAbvGr" "KitchenQual" "TotRmsAbvGrd" "Functional"
[57] "Fireplaces" "FireplaceQu" "GarageType" "GarageYrBlt"
[61] "GarageFinish" "GarageCars" "GarageArea" "GarageQual"
[65] "GarageCond" "PavedDrive" "WoodDeckSF" "OpenPorchSF"
[69] "EnclosedPorch" "X3SsnPorch" "ScreenPorch" "PoolArea"
[73] "PoolQC" "Fence" "MiscFeature" "MiscVal"
[77] "MoSold" "YrSold" "SaleType" "SaleCondition"
[81] "SalePrice"