House Prices

August 30, 2025

This is another learning example offered by Kaggle.

The object is to predict SalePrice which is provided in train.csv, but missing in test.csv. The submission should just have two columns, Id and SalePrice.

train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
ncol(train)

There are 81 columns, so an overhwelming number.

Qualitative Summary

summary(train$MSZoning)
C (all)      FV      RH      RL      RM 
     10      65      16    1151     218

summary(train$Id)

“MSSubClass” “MSZoning” “LotFrontage”
[5] “LotArea” “Street” “Alley” “LotShape”
[9] “LandContour” “Utilities” “LotConfig” “LandSlope”
[13] “Neighborhood” “Condition1” “Condition2” “BldgType”
[17] “HouseStyle” “OverallQual” “OverallCond” “YearBuilt”
[21] “YearRemodAdd” “RoofStyle” “RoofMatl” “Exterior1st”
[25] “Exterior2nd” “MasVnrType” “MasVnrArea” “ExterQual”
[29] “ExterCond” “Foundation” “BsmtQual” “BsmtCond”
[33] “BsmtExposure” “BsmtFinType1” “BsmtFinSF1” “BsmtFinType2” [37] “BsmtFinSF2” “BsmtUnfSF” “TotalBsmtSF” “Heating”
[41] “HeatingQC” “CentralAir” “Electrical” “X1stFlrSF”
[45] “X2ndFlrSF” “LowQualFinSF” “GrLivArea” “BsmtFullBath” [49] “BsmtHalfBath” “FullBath” “HalfBath” “BedroomAbvGr” [53] “KitchenAbvGr” “KitchenQual” “TotRmsAbvGrd” “Functional”
[57] “Fireplaces” “FireplaceQu” “GarageType” “GarageYrBlt”
[61] “GarageFinish” “GarageCars” “GarageArea” “GarageQual”
[65] “GarageCond” “PavedDrive” “WoodDeckSF” “OpenPorchSF”
[69] “EnclosedPorch” “X3SsnPorch” “ScreenPorch” “PoolArea”
[73] “PoolQC” “Fence” “MiscFeature” “MiscVal”
[77] “MoSold” “YrSold” “SaleType” “SaleCondition” [81] “SalePrice”

Best Subset Selection

The code below crashes my laptop after several hours.

library(leaps)
train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
hpsubsets <- regsubsets(SalePrice ~ . - Id - , data = train, really.big = T)

Foundation Heating CentralAir Electrical GrLivArea

That causes a crash.

library(leaps)
train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
hpsubsets <- regsubsets(SalePrice ~ OverallQual + OverallCond + LotArea + RoofMatl + ExterQual + 
  BsmtQual + BsmtFinSF1 + X1stFlrSF + X2ndFlrSF + KitchenQual, data = train)

To get a clue which of the many factors to focus on:

train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
model <- lm(SalePrice ~ . , data = train)
summary(model)

Next limiting this to those with three asterisks. A snag is one of the rows in KitchenQual has an “NA” which needs to be replaced by some other value for predict not to throw an error.

train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
model <- lm(SalePrice ~ OverallQual + OverallCond + LotArea + RoofMatl + ExterQual + 
  BsmtQual + BsmtFinSF1 + X1stFlrSF + X2ndFlrSF + KitchenQual, data = train)
test <- read.csv("test.csv", header = T, na.strings = "?", stringsAsFactors = T)
test[c(96), c("KitchenQual")] <- "TA"
test$BsmtFinSF1 <- as.numeric(test$BsmtFinSF1)
test$SalePrice <- predict(object = model, newdata = test)
write.csv(test[,c("Id","SalePrice")], "submission.csv", quote = F, row.names = F)

This produced a really lousy score of 0.17, so time to step back and work through the basics of linear regression.

train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
model <- lm(SalePrice ~ OverallQual + I(OverallQual^2) + I(OverallQual^3), data = train)
test <- read.csv("test.csv", header = T, na.strings = "?", stringsAsFactors = T)
test$SalePrice <- predict(object = model, newdata = test)
write.csv(test[,c("Id","SalePrice")], "submission.csv", quote = F, row.names = F)

This produced a score of 0.23, so better than the first attempt.

Straight Line

Looking at plot(SalePrice ~ OverallQual, data = train) shows a relatively linear looking relation of SalePrice = -96206 + (45436 * OverallQual)

train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
model <- lm(SalePrice ~ OverallQual, data = train)
svg("simple-line1.svg", width = 11, pointsize = 12, family = "sans")
plot(SalePrice ~ OverallQual, data = train, col = OverallCond, pch = 20)
abline(model)
dev.off()

straight line

Exponential

myfunc <- function(x) {
  y = 151604 - 38006*x + 6676*x^2
  return(y)
}
train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
model <- lm(SalePrice ~ OverallQual + I(OverallQual^2), data = train)
svg("qual-exp2.svg", width = 11, pointsize = 12, family = "sans")
plot(SalePrice ~ OverallQual, data = train, col = OverallCond, pch = 20)
curve(myfunc, from = 0, to = 10, n = 10, add = TRUE)
dev.off()

SalePrice = 151604 - 38006OverallQual + 6676OverallQual^2

train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
model1 <- lm(SalePrice ~ OverallQual, data = train)
model2 <- lm(SalePrice ~ OverallQual + I(OverallQual^2), data = train)
anova(model1, model2)

Model 1: SalePrice ~ OverallQual
Model 2: SalePrice ~ OverallQual + I(OverallQual^2)
  Res.Df        RSS Df  Sum of Sq      F    Pr(>F)    
1   1458 3.4470e+12                                   
2   1457 2.9608e+12  1 4.8617e+11 239.24 < 2.2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

myfunc <- function(x) {
  y = 32828.2 + 26931.6*x - 4472.4*x^2 + 604.9*x^3 
  return(y)
}
train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
model <- lm(SalePrice ~ OverallQual + I(OverallQual^2) + I(OverallQual^3), data = train)
svg("qual-exp3.svg", width = 11, pointsize = 12, family = "sans")
plot(SalePrice ~ OverallQual, data = train, col = OverallCond, pch = 20)
curve(myfunc, from = 0, to = 10, n = 10, add = TRUE)
dev.off()

y = 32828.2 + 26931.6x - 4472.4x^2 + 604.9*x^3

train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
model2 <- lm(SalePrice ~ OverallQual + I(OverallQual^2), data = train)
model3 <- lm(SalePrice ~ OverallQual + I(OverallQual^2) + I(OverallQual^3), data = train)
anova(model2, model3)

Model 1: SalePrice ~ OverallQual + I(OverallQual^2)
Model 2: SalePrice ~ OverallQual + I(OverallQual^2) + I(OverallQual^3)
  Res.Df        RSS Df  Sum of Sq     F   Pr(>F)    
1   1457 2.9608e+12                                 
2   1456 2.9327e+12  1 2.8078e+10 13.94 0.000196 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
model1 <- lm(SalePrice ~ OverallQual, data = train)
model3 <- lm(SalePrice ~ OverallQual + I(OverallQual^2) + I(OverallQual^3), data = train)
anova(model1, model3)

Analysis of Variance Table

Model 1: SalePrice ~ OverallQual
Model 2: SalePrice ~ OverallQual + I(OverallQual^2) + I(OverallQual^3)
  Res.Df        RSS Df  Sum of Sq      F    Pr(>F)    
1   1458 3.4470e+12                                   
2   1456 2.9327e+12  2 5.1425e+11 127.65 < 2.2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

train <- read.csv("train.csv", header = T, na.strings = "?", stringsAsFactors = T)
model <- lm(SalePrice ~ OverallCond, data = train)
svg("simple-line2.svg", width = 11, pointsize = 12, family = "sans")
plot(SalePrice ~ OverallCond, data = train, col = OverallQual, pch = 20)
abline(model)
dev.off()

Is There a Relationship Between the Response and Predictors?

There are an overwhelming number of columns:

names(train)
 [1] "Id"            "MSSubClass"    "MSZoning"      "LotFrontage"  
 [5] "LotArea"       "Street"        "Alley"         "LotShape"     
 [9] "LandContour"   "Utilities"     "LotConfig"     "LandSlope"    
[13] "Neighborhood"  "Condition1"    "Condition2"    "BldgType"     
[17] "HouseStyle"    "OverallQual"   "OverallCond"   "YearBuilt"    
[21] "YearRemodAdd"  "RoofStyle"     "RoofMatl"      "Exterior1st"  
[25] "Exterior2nd"   "MasVnrType"    "MasVnrArea"    "ExterQual"    
[29] "ExterCond"     "Foundation"    "BsmtQual"      "BsmtCond"     
[33] "BsmtExposure"  "BsmtFinType1"  "BsmtFinSF1"    "BsmtFinType2" 
[37] "BsmtFinSF2"    "BsmtUnfSF"     "TotalBsmtSF"   "Heating"      
[41] "HeatingQC"     "CentralAir"    "Electrical"    "X1stFlrSF"    
[45] "X2ndFlrSF"     "LowQualFinSF"  "GrLivArea"     "BsmtFullBath" 
[49] "BsmtHalfBath"  "FullBath"      "HalfBath"      "BedroomAbvGr" 
[53] "KitchenAbvGr"  "KitchenQual"   "TotRmsAbvGrd"  "Functional"   
[57] "Fireplaces"    "FireplaceQu"   "GarageType"    "GarageYrBlt"  
[61] "GarageFinish"  "GarageCars"    "GarageArea"    "GarageQual"   
[65] "GarageCond"    "PavedDrive"    "WoodDeckSF"    "OpenPorchSF"  
[69] "EnclosedPorch" "X3SsnPorch"    "ScreenPorch"   "PoolArea"     
[73] "PoolQC"        "Fence"         "MiscFeature"   "MiscVal"      
[77] "MoSold"        "YrSold"        "SaleType"      "SaleCondition"
[81] "SalePrice"