Introduction

This is an exploratory data analysis for the Kaggle House Prices competition.

The use here of rpart to impute missing values leans heavily on the analysis by Bisaria, while some of the feature engineering was suggested by that of Amit Choudhary. The correlation plot was borrowed from AiO.

The data consists of a testing set with 1460 house sale price observations with 81 features. The evaluation set consists of 1459 observations of the same features. The evaluation metric is the RMSE of the logs of the predicted vs. true sale prices.

# load  packages 
require(ggplot2)
require(caret)
require(rpart)
require(corrplot)

# Set working directory and import datafiles
setwd("~/code/kaggle/housepricesART")

train <- read.csv("./input/train.csv", stringsAsFactors = F)
test <- read.csv("./input/test.csv", stringsAsFactors = F)
  
dim(train) 
## [1] 1460   81
dim(test)
## [1] 1459   80
# take log of sale price
train$SalePrice <- log(train$SalePrice)
test$SalePrice <- rep(NA)

df <- rbind(train, test)

str(df)
## 'data.frame':    2919 obs. of  81 variables:
##  $ Id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ MSSubClass   : int  60 20 60 70 60 50 20 60 50 190 ...
##  $ MSZoning     : chr  "RL" "RL" "RL" "RL" ...
##  $ LotFrontage  : int  65 80 68 60 84 85 75 NA 51 50 ...
##  $ LotArea      : int  8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
##  $ Street       : chr  "Pave" "Pave" "Pave" "Pave" ...
##  $ Alley        : chr  NA NA NA NA ...
##  $ LotShape     : chr  "Reg" "Reg" "IR1" "IR1" ...
##  $ LandContour  : chr  "Lvl" "Lvl" "Lvl" "Lvl" ...
##  $ Utilities    : chr  "AllPub" "AllPub" "AllPub" "AllPub" ...
##  $ LotConfig    : chr  "Inside" "FR2" "Inside" "Corner" ...
##  $ LandSlope    : chr  "Gtl" "Gtl" "Gtl" "Gtl" ...
##  $ Neighborhood : chr  "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
##  $ Condition1   : chr  "Norm" "Feedr" "Norm" "Norm" ...
##  $ Condition2   : chr  "Norm" "Norm" "Norm" "Norm" ...
##  $ BldgType     : chr  "1Fam" "1Fam" "1Fam" "1Fam" ...
##  $ HouseStyle   : chr  "2Story" "1Story" "2Story" "2Story" ...
##  $ OverallQual  : int  7 6 7 7 8 5 8 7 7 5 ...
##  $ OverallCond  : int  5 8 5 5 5 5 5 6 5 6 ...
##  $ YearBuilt    : int  2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
##  $ YearRemodAdd : int  2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
##  $ RoofStyle    : chr  "Gable" "Gable" "Gable" "Gable" ...
##  $ RoofMatl     : chr  "CompShg" "CompShg" "CompShg" "CompShg" ...
##  $ Exterior1st  : chr  "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
##  $ Exterior2nd  : chr  "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
##  $ MasVnrType   : chr  "BrkFace" "None" "BrkFace" "None" ...
##  $ MasVnrArea   : int  196 0 162 0 350 0 186 240 0 0 ...
##  $ ExterQual    : chr  "Gd" "TA" "Gd" "TA" ...
##  $ ExterCond    : chr  "TA" "TA" "TA" "TA" ...
##  $ Foundation   : chr  "PConc" "CBlock" "PConc" "BrkTil" ...
##  $ BsmtQual     : chr  "Gd" "Gd" "Gd" "TA" ...
##  $ BsmtCond     : chr  "TA" "TA" "TA" "Gd" ...
##  $ BsmtExposure : chr  "No" "Gd" "Mn" "No" ...
##  $ BsmtFinType1 : chr  "GLQ" "ALQ" "GLQ" "ALQ" ...
##  $ BsmtFinSF1   : int  706 978 486 216 655 732 1369 859 0 851 ...
##  $ BsmtFinType2 : chr  "Unf" "Unf" "Unf" "Unf" ...
##  $ BsmtFinSF2   : int  0 0 0 0 0 0 0 32 0 0 ...
##  $ BsmtUnfSF    : int  150 284 434 540 490 64 317 216 952 140 ...
##  $ TotalBsmtSF  : int  856 1262 920 756 1145 796 1686 1107 952 991 ...
##  $ Heating      : chr  "GasA" "GasA" "GasA" "GasA" ...
##  $ HeatingQC    : chr  "Ex" "Ex" "Ex" "Gd" ...
##  $ CentralAir   : chr  "Y" "Y" "Y" "Y" ...
##  $ Electrical   : chr  "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
##  $ X1stFlrSF    : int  856 1262 920 961 1145 796 1694 1107 1022 1077 ...
##  $ X2ndFlrSF    : int  854 0 866 756 1053 566 0 983 752 0 ...
##  $ LowQualFinSF : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ GrLivArea    : int  1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
##  $ BsmtFullBath : int  1 0 1 1 1 1 1 1 0 1 ...
##  $ BsmtHalfBath : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ FullBath     : int  2 2 2 1 2 1 2 2 2 1 ...
##  $ HalfBath     : int  1 0 1 0 1 1 0 1 0 0 ...
##  $ BedroomAbvGr : int  3 3 3 3 4 1 3 3 2 2 ...
##  $ KitchenAbvGr : int  1 1 1 1 1 1 1 1 2 2 ...
##  $ KitchenQual  : chr  "Gd" "TA" "Gd" "Gd" ...
##  $ TotRmsAbvGrd : int  8 6 6 7 9 5 7 7 8 5 ...
##  $ Functional   : chr  "Typ" "Typ" "Typ" "Typ" ...
##  $ Fireplaces   : int  0 1 1 1 1 0 1 2 2 2 ...
##  $ FireplaceQu  : chr  NA "TA" "TA" "Gd" ...
##  $ GarageType   : chr  "Attchd" "Attchd" "Attchd" "Detchd" ...
##  $ GarageYrBlt  : int  2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
##  $ GarageFinish : chr  "RFn" "RFn" "RFn" "Unf" ...
##  $ GarageCars   : int  2 2 2 3 3 2 2 2 2 1 ...
##  $ GarageArea   : int  548 460 608 642 836 480 636 484 468 205 ...
##  $ GarageQual   : chr  "TA" "TA" "TA" "TA" ...
##  $ GarageCond   : chr  "TA" "TA" "TA" "TA" ...
##  $ PavedDrive   : chr  "Y" "Y" "Y" "Y" ...
##  $ WoodDeckSF   : int  0 298 0 0 192 40 255 235 90 0 ...
##  $ OpenPorchSF  : int  61 0 42 35 84 30 57 204 0 4 ...
##  $ EnclosedPorch: int  0 0 0 272 0 0 0 228 205 0 ...
##  $ X3SsnPorch   : int  0 0 0 0 0 320 0 0 0 0 ...
##  $ ScreenPorch  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolArea     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolQC       : chr  NA NA NA NA ...
##  $ Fence        : chr  NA NA NA NA ...
##  $ MiscFeature  : chr  NA NA NA NA ...
##  $ MiscVal      : int  0 0 0 0 0 700 0 350 0 0 ...
##  $ MoSold       : int  2 5 9 2 12 10 8 11 4 1 ...
##  $ YrSold       : int  2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
##  $ SaleType     : chr  "WD" "WD" "WD" "WD" ...
##  $ SaleCondition: chr  "Normal" "Normal" "Normal" "Abnorml" ...
##  $ SalePrice    : num  12.2 12.1 12.3 11.8 12.4 ...
colSums(sapply(df, is.na))
##            Id    MSSubClass      MSZoning   LotFrontage       LotArea 
##             0             0             4           486             0 
##        Street         Alley      LotShape   LandContour     Utilities 
##             0          2721             0             0             2 
##     LotConfig     LandSlope  Neighborhood    Condition1    Condition2 
##             0             0             0             0             0 
##      BldgType    HouseStyle   OverallQual   OverallCond     YearBuilt 
##             0             0             0             0             0 
##  YearRemodAdd     RoofStyle      RoofMatl   Exterior1st   Exterior2nd 
##             0             0             0             1             1 
##    MasVnrType    MasVnrArea     ExterQual     ExterCond    Foundation 
##            24            23             0             0             0 
##      BsmtQual      BsmtCond  BsmtExposure  BsmtFinType1    BsmtFinSF1 
##            81            82            82            79             1 
##  BsmtFinType2    BsmtFinSF2     BsmtUnfSF   TotalBsmtSF       Heating 
##            80             1             1             1             0 
##     HeatingQC    CentralAir    Electrical     X1stFlrSF     X2ndFlrSF 
##             0             0             1             0             0 
##  LowQualFinSF     GrLivArea  BsmtFullBath  BsmtHalfBath      FullBath 
##             0             0             2             2             0 
##      HalfBath  BedroomAbvGr  KitchenAbvGr   KitchenQual  TotRmsAbvGrd 
##             0             0             0             1             0 
##    Functional    Fireplaces   FireplaceQu    GarageType   GarageYrBlt 
##             2             0          1420           157           159 
##  GarageFinish    GarageCars    GarageArea    GarageQual    GarageCond 
##           159             1             1           159           159 
##    PavedDrive    WoodDeckSF   OpenPorchSF EnclosedPorch    X3SsnPorch 
##             0             0             0             0             0 
##   ScreenPorch      PoolArea        PoolQC         Fence   MiscFeature 
##             0             0          2909          2348          2814 
##       MiscVal        MoSold        YrSold      SaleType SaleCondition 
##             0             0             0             1             0 
##     SalePrice 
##          1459

Zero Round

After a full consideration of the analysis in the later sections, we return here to add an early stage of data processing. Based on the observations in the analysis below, we identified a set of possible outliers. We then ran the sci-kit learn model LassoLarsCV with 10 folds on the various data sets obtained by dropping combinations of outliers. The best dataset resulted in an decrease in RMS error of around 15% vs the set that includes all points.

Our strategy then is to produce data sets for some of these possibilities so that various exploratory modeling can be done in python. We will design some code to make it relatively easy to generate the necessary data sets and write them to coded file names.

train_size = 1460

# Failsafe in case some operation after dropping rows changes the indexing
df$HouseId <- df$Id

rel_outliers = c(31, 496, 524, 534, 692, 917, 969, 1183, 1299)

# Modify by hand to drop desired subset of outliers, then adjust the output file name
# drop_set <- c(524, 692, 1183, 1299)
# 
# 
# 
# df <- df[-drop_set,]
# train_size <- train_size - length(drop_set)

# Use one-hot coding in output file name
# train_output <- "train_tidy-new.csv"
# test_output <- "test_tidy-new.csv"
train_output <- "train_tidy_000000000.csv"
test_output <- "test_tidy_000000000.csv"

First Round

In the first round, we want to tidy the data, so we will look for inconsistencies and impute missing values. Both the testing and test set have been combined into the dataframe df. We will start with categorical features first, except those that form groups like those for the Basement and Garage.

MSZoning: Identifies the general zoning classification of the sale.

summary(as.factor(df$MSZoning))
## C (all)      FV      RH      RL      RM    NA's 
##      25     139      26    2265     460       4
which(is.na(df$MSZoning))
## [1] 1916 2217 2251 2905

Typical predictors of the zoning might be Neighborhood and local conditions. We then use rpart to impute the missing values.

mszoning.pred <- c("MSZoning", "Neighborhood", "Condition1", "Condition2", "HouseStyle", "BldgType")
mszoning.rpart <- rpart(as.factor(MSZoning) ~ ., data = df[!is.na(df$MSZoning),mszoning.pred], method = "class",
                                na.action=na.omit)

as.character(predict(mszoning.rpart, df[is.na(df$MSZoning),mszoning.pred], type = "class"))
## [1] "RM" "RM" "RM" "RL"
df$MSZoning[is.na(df$MSZoning)] <- as.character(predict(mszoning.rpart, df[is.na(df$MSZoning),mszoning.pred], 
                                                        type = "class"))

Alley: Type of alley access to property

summary(as.factor(df$Alley))
## Grvl Pave NA's 
##  120   78 2721
df$Alley[is.na(df$Alley)] <- rep('None')

Here we simply substitute missing values with ‘None’.

Utilities: Type of utilities available

summary(as.factor(df$Utilities))
## AllPub NoSeWa   NA's 
##   2916      1      2
df$Utilities <- NULL

Only at most 3 values show variation, so we drop this feature.

Exterior1st/2nd: Exterior covering on house

summary(as.factor(df$Exterior1st))
## AsbShng AsphShn BrkComm BrkFace  CBlock CemntBd HdBoard ImStucc MetalSd 
##      44       2       6      87       2     126     442       1     450 
## Plywood   Stone  Stucco VinylSd Wd Sdng WdShing    NA's 
##     221       2      43    1025     411      56       1
which(is.na(df$Exterior1st))
## [1] 2152
ext1.pred <- c("Neighborhood", "BldgType", "HouseStyle", "OverallQual", "OverallCond", "YearBuilt", 
              "YearRemodAdd", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", 
              "MasVnrType", "MasVnrArea", "ExterQual", "ExterCond")
ext1.rpart <- rpart(as.factor(Exterior1st) ~ ., data = df[!is.na(df$Exterior1st),ext1.pred], method = "class",
                    na.action=na.omit)

as.character(predict(ext1.rpart, df[is.na(df$Exterior1st),ext1.pred], type = "class"))
## [1] "Wd Sdng"
df$Exterior1st[is.na(df$Exterior1st)] <- 
  as.character(predict(ext1.rpart, df[is.na(df$Exterior1st),ext1.pred], type = "class"))

levels(as.factor(train$Exterior1st))
##  [1] "AsbShng" "AsphShn" "BrkComm" "BrkFace" "CBlock"  "CemntBd" "HdBoard"
##  [8] "ImStucc" "MetalSd" "Plywood" "Stone"   "Stucco"  "VinylSd" "Wd Sdng"
## [15] "WdShing"
levels(as.factor(test$Exterior1st))
##  [1] "AsbShng" "AsphShn" "BrkComm" "BrkFace" "CBlock"  "CemntBd" "HdBoard"
##  [8] "MetalSd" "Plywood" "Stucco"  "VinylSd" "Wd Sdng" "WdShing"

We note that “ImStucc” and “Stone” are not in the test set, so we will remove the corresponding one-hot variables later in the preprocessing.

summary(as.factor(df$Exterior2nd))
## AsbShng AsphShn Brk Cmn BrkFace  CBlock CmentBd HdBoard ImStucc MetalSd 
##      38       4      22      47       3     126     406      15     447 
##   Other Plywood   Stone  Stucco VinylSd Wd Sdng Wd Shng    NA's 
##       1     270       6      47    1014     391      81       1
which(is.na(df$Exterior2nd))
## [1] 2152
ext2.rpart <- rpart(as.factor(Exterior2nd) ~ ., data = df[!is.na(df$Exterior2nd),ext1.pred], method = "class",
                    na.action=na.omit)

as.character(predict(ext2.rpart, df[is.na(df$Exterior2nd),ext1.pred], type = "class"))
## [1] "Wd Sdng"
df$Exterior2nd[is.na(df$Exterior2nd)] <- 
  as.character(predict(ext1.rpart, df[is.na(df$Exterior2nd),ext1.pred], type = "class"))

levels(as.factor(train$Exterior2nd))
##  [1] "AsbShng" "AsphShn" "Brk Cmn" "BrkFace" "CBlock"  "CmentBd" "HdBoard"
##  [8] "ImStucc" "MetalSd" "Other"   "Plywood" "Stone"   "Stucco"  "VinylSd"
## [15] "Wd Sdng" "Wd Shng"
levels(as.factor(test$Exterior2nd))
##  [1] "AsbShng" "AsphShn" "Brk Cmn" "BrkFace" "CBlock"  "CmentBd" "HdBoard"
##  [8] "ImStucc" "MetalSd" "Plywood" "Stone"   "Stucco"  "VinylSd" "Wd Sdng"
## [15] "Wd Shng"

“Other” is not in the test set.

Electrical: Electrical system

summary(as.factor(df$Electrical))
## FuseA FuseF FuseP   Mix SBrkr  NA's 
##   188    50     8     1  2671     1
elec.pred <- c("BldgType", "HouseStyle", "OverallQual", "OverallCond", "YearBuilt", "YearRemodAdd", "Electrical")
elec.rpart <- rpart(as.factor(Electrical) ~ ., data = df[!is.na(df$Electrical),elec.pred],
                    method = "class", na.action=na.omit)
as.character(predict(elec.rpart, df[is.na(df$Electrical),elec.pred], type = "class"))
## [1] "SBrkr"
df$Electrical[is.na(df$Electrical)] <- 
  as.character(predict(elec.rpart, df[is.na(df$Electrical),elec.pred], type = "class"))

levels(as.factor(train$Electrical))
## [1] "FuseA" "FuseF" "FuseP" "Mix"   "SBrkr"
levels(as.factor(test$Electrical))
## [1] "FuseA" "FuseF" "FuseP" "SBrkr"

‘Mix’ is not in the test set.

KitchenQual: Kitchen quality

summary(as.factor(df$KitchenQual))
##   Ex   Fa   Gd   TA NA's 
##  205   70 1151 1492    1
kit.pred <- c("BldgType", "HouseStyle", "OverallQual", "OverallCond", "YearBuilt", "YearRemodAdd", "KitchenQual")
kit.rpart <- rpart(as.factor(KitchenQual) ~ .,data = df[!is.na(df$KitchenQual),kit.pred],
                   method = "class",na.action=na.omit)
as.character(predict(kit.rpart, df[is.na(df$KitchenQual),kit.pred], type = "class"))
## [1] "TA"
df$KitchenQual[is.na(df$KitchenQual)] <- 
  as.character(predict(kit.rpart, df[is.na(df$KitchenQual),kit.pred], type = "class"))

Functional: Home functionality (Assume typical unless deductions are warranted)

summary(as.factor(df$Functional))
## Maj1 Maj2 Min1 Min2  Mod  Sev  Typ NA's 
##   19    9   65   70   35    2 2717    2
func.pred <- c("OverallQual", "OverallCond", "YearBuilt", "YearRemodAdd", "ExterQual", "ExterCond","BsmtQual", 
              "BsmtCond","GarageQual", "GarageCond","SaleType", "SaleCondition", "Functional")
func.rpart <- rpart(as.factor(Functional) ~ .,data = df[!is.na(df$Functional),func.pred],
                    method = "class",na.action=na.omit)
as.character(predict(func.rpart, df[is.na(df$Functional),func.pred], type = "class"))
## [1] "Typ" "Typ"
df$Functional[is.na(df$Functional)] <- 
  as.character(predict(func.rpart, df[is.na(df$Functional),func.pred], type = "class"))

FireplaceQu: Fireplace quality

summary(as.factor(df$FireplaceQu))
##   Ex   Fa   Gd   Po   TA NA's 
##   43   74  744   46  592 1420
which(df$Fireplaces != 0 & is.na(df$FireplaceQu))
## integer(0)
df$FireplaceQu[is.na(df$FireplaceQu)] <- rep('None')

The NAs here had no fireplace.

PoolQC: Pool quality

summary(as.factor(df$PoolQC))
##   Ex   Fa   Gd NA's 
##    4    2    4 2909
df$PoolQC[which(df$PoolArea == 0 & is.na(df$PoolQC))] <- "None"

These had no pools.

which(df$PoolArea > 0 & is.na(df$PoolQC))
## [1] 2421 2504 2600
poolq.pred <- c("YearBuilt","YearRemodAdd", "PoolQC", "PoolArea","ExterQual","ExterCond",
              "YrSold","SaleType","SaleCondition")
poolq.rpart <- rpart(as.factor(PoolQC) ~ ., data = df[!is.na(df$PoolQC),poolq.pred],
                    method = "class", na.action=na.omit)
as.character(predict(poolq.rpart,df[is.na(df$PoolQC),poolq.pred], type="class"))
## [1] "Ex" "Ex" "Ex"
df$PoolQC[is.na(df$PoolQC)] <- as.character(predict(poolq.rpart,df[is.na(df$PoolQC),poolq.pred], type="class"))

levels(as.factor(train$PoolQC))
## [1] "Ex" "Fa" "Gd"
levels(as.factor(test$PoolQC))
## [1] "Ex" "Gd"

‘Fa’ is missing in the test set.

Fence: Fence quality

table(is.na(df$Fence))
## 
## FALSE  TRUE 
##   571  2348
df$Fence[is.na(df$Fence)] <- rep('None')

These had no fence.

MiscFeature: Miscellaneous feature not covered in other categories

table(is.na(df$MiscFeature))
## 
## FALSE  TRUE 
##   105  2814
df$MiscFeature[is.na(df$MiscFeature)] <- rep('None')

These had no misc. feature.

SaleType: Type of sale

table(is.na(df$SaleType))
## 
## FALSE  TRUE 
##  2918     1
sale.pred <- c("MSSubClass", "MSZoning", "Street", "Alley", "Neighborhood", "Condition1", 
              "Condition2", "BldgType", "HouseStyle", "OverallQual", "OverallCond", "YearBuilt", 
              "YearRemodAdd", "ExterQual", "ExterCond", "BsmtQual", "BsmtCond", "KitchenQual", 
              "FireplaceQu", "GarageQual", "GarageCond", "PavedDrive", "PoolQC", "YrSold", 
              "SaleType", "SaleCondition")
sale.rpart <- rpart(as.factor(SaleType) ~ ., data = df[!is.na(df$SaleType),sale.pred],
                    method = "class", na.action=na.omit)
as.character(predict(sale.rpart, df[is.na(df$SaleType),sale.pred], type = "class"))
## [1] "WD"
df$SaleType[is.na(df$SaleType)] <- as.character(predict(sale.rpart, 
                                                        df[is.na(df$SaleType),sale.pred], type = "class"))

MasVnrType: Masonry veneer type

summary(as.factor(df$MasVnrType))
##  BrkCmn BrkFace    None   Stone    NA's 
##      25     879    1742     249      24
summary(df$MasVnrArea)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.0     0.0     0.0   102.2   164.0  1600.0      23
which(is.na(df$MasVnrType) & !is.na(df$MasVnrArea))
## [1] 2611
df$MasVnrArea[which(is.na(df$MasVnrType) & !is.na(df$MasVnrArea))]
## [1] 198

We have 24 missing types, 23 missing areas and one obs. with an area but no type. For the 23 with no type or area we will assume that there is no veneer:

df$MasVnrArea[is.na(df$MasVnrArea)] <-rep(0)
df$MasVnrType[is.na(df$MasVnrType) & df$MasVnrArea == 0] <- rep("None")

Let’s look for missing types with zero area

which(is.na(df$MasVnrType) & df$MasVnrArea == 0)
## integer(0)

None of these. How about non-None types where the area is zero?

which(df$MasVnrType != "None" & df$MasVnrArea == 0)
## [1]  689 1242 2320
df$MasVnrType[which(df$MasVnrType != "None" & df$MasVnrArea == 0)] <- rep("None")

We assume that the zero area is right and these have no veneer. Are there nonzero areas for None type?

table(df$MasVnrArea[df$MasVnrType == 'None'])
## 
##    0    1  285  288  312  344 
## 1761    3    1    1    1    1
df$MasVnrArea <- ifelse(df$MasVnrArea == 1,0,df$MasVnrArea)
df$MasVnrType[df$MasVnrArea > 0 & df$MasVnrType == "None" & !is.na(df$MasVnrType)] <- rep(NA)

The ones with area 1 are probably wrong and we set those to zero area. The others should get their type re-imputed so we set to NA.

masvnr.rpart <- rpart(as.factor(MasVnrType) ~ MasVnrArea, 
                    data = df[!is.na(df$MasVnrType),c("MasVnrType","MasVnrArea")], 
                    method = "class", na.action=na.omit)
as.character(predict(masvnr.rpart,df[is.na(df$MasVnrType),c("MasVnrType","MasVnrArea")], type="class"))
## [1] "BrkFace" "BrkFace" "BrkFace" "BrkFace" "BrkFace"
df$MasVnrType[is.na(df$MasVnrType)] <- 
      as.character(predict(masvnr.rpart,df[is.na(df$MasVnrType),c("MasVnrType","MasVnrArea")], type="class"))

Basement Features

Quite a few houses have no basement, so there are many missing values among the features associated with basements, which are

c("TotalBsmtSF", "BsmtExposure", "BsmtCond", "BsmtQual","BsmtFinType1", "BsmtFinType2", 
              "BsmtFinSF1","BsmtFinSF2", "BsmtUnfSF")
## [1] "TotalBsmtSF"  "BsmtExposure" "BsmtCond"     "BsmtQual"    
## [5] "BsmtFinType1" "BsmtFinType2" "BsmtFinSF1"   "BsmtFinSF2"  
## [9] "BsmtUnfSF"
summary(as.factor(df$BsmtQual))
##   Ex   Fa   Gd   TA NA's 
##  258   88 1209 1283   81
table(is.na(df$BsmtExposure))
## 
## FALSE  TRUE 
##  2837    82
table(is.na(df$BsmtCond))
## 
## FALSE  TRUE 
##  2837    82
table(is.na(df$BsmtQual))
## 
## FALSE  TRUE 
##  2838    81
table(is.na(df$BsmtFinType2))
## 
## FALSE  TRUE 
##  2839    80
table(is.na(df$BsmtFinType1))
## 
## FALSE  TRUE 
##  2840    79
table(is.na(df$BsmtFinSF1) & is.na(df$BsmtFinSF2) & is.na(df$BsmtUnfSF))
## 
## FALSE  TRUE 
##  2918     1
table(is.na(df$BsmtFullBath) & is.na(df$BsmtHalfBath))
## 
## FALSE  TRUE 
##  2917     2
table(df$TotalBsmtSF == 0 & is.na(df$BsmtExposure))
## 
## FALSE  TRUE 
##  2840    78

The categorical features are

bsmt.col <- c("BsmtExposure", "BsmtCond", "BsmtQual","BsmtFinType1", "BsmtFinType2")
which(is.na(df$BsmtExposure) & is.na(df$TotalBsmtSF))
## [1] 2121
df$TotalBsmtSF[which(is.na(df$BsmtExposure) & is.na(df$TotalBsmtSF))] <- 0

2121 has NAs for everything including TotalBsmtSF, so we assume there is no basement. There are 79 entries with zero area as well, so we set these to no basement:

df[df$TotalBsmtSF == 0 & is.na(df$BsmtExposure), bsmt.col] <- 
  apply(df[df$TotalBsmtSF == 0 & is.na(df$BsmtExposure), bsmt.col], 2, function(x) x <- rep("None"))

We’ll use the following predictors for the missing values (I tried adding BldgType and Foundation here, but rpart is too slow):

bsmt.pred <- c("BsmtExposure", "BsmtCond", "BsmtQual","BsmtFinType1", "BsmtFinType2","TotalBsmtSF","YearBuilt")

BsmtFinType2.rpart <- rpart(as.factor(BsmtFinType2) ~ ., data = df[!is.na(df$BsmtFinType2),bsmt.pred], 
                            method = "class", na.action=na.omit)

df$BsmtFinType2[is.na(df$BsmtFinType2)] <- 
  as.character(predict(BsmtFinType2.rpart, df[is.na(df$BsmtFinType2),bsmt.pred], type="class"))

BsmtQual.rpart <- rpart(as.factor(BsmtQual) ~ ., data = df[!is.na(df$BsmtQual),bsmt.pred], 
                        method = "class", na.action=na.omit)

df$BsmtQual[is.na(df$BsmtQual)] <- 
  as.character(predict(BsmtQual.rpart, df[is.na(df$BsmtQual),bsmt.pred], type="class"))

BsmtCond.rpart <- rpart(as.factor(BsmtCond) ~ ., data = df[!is.na(df$BsmtCond),bsmt.pred], 
                        method = "class",  na.action=na.omit)

df$BsmtCond[is.na(df$BsmtCond)] <- 
  as.character(predict(BsmtCond.rpart, df[is.na(df$BsmtCond),bsmt.pred],  type="class"))

BsmtExposure.rpart <- rpart(as.factor(BsmtExposure) ~ ., data = df[!is.na(df$BsmtExposure),bsmt.pred], 
                            method = "class", na.action=na.omit)

df$BsmtExposure[is.na(df$BsmtExposure)] <- 
  as.character(predict(BsmtExposure.rpart, df[is.na(df$BsmtExposure),bsmt.pred], type="class"))

df[is.na(df$BsmtFinSF1)|is.na(df$BsmtFinSF2)|is.na(df$BsmtUnfSF),
  c(bsmt.pred, c("BsmtFinSF1", "BsmtFinSF2","BsmtUnfSF", "BsmtFullBath","BsmtHalfBath"))]
##      BsmtExposure BsmtCond BsmtQual BsmtFinType1 BsmtFinType2 TotalBsmtSF
## 2121         None     None     None         None         None           0
##      YearBuilt BsmtFinSF1 BsmtFinSF2 BsmtUnfSF BsmtFullBath BsmtHalfBath
## 2121      1946         NA         NA        NA           NA           NA
df$BsmtFinSF1[is.na(df$BsmtFinSF1)|is.na(df$BsmtFinSF2)|is.na(df$BsmtUnfSF)] <- 0
df$BsmtFinSF2[is.na(df$BsmtFinSF1)|is.na(df$BsmtFinSF2)|is.na(df$BsmtUnfSF)] <- 0
df$BsmtUnfSF[is.na(df$BsmtFinSF1)|is.na(df$BsmtFinSF2)|is.na(df$BsmtUnfSF)] <- 0
df$BsmtFullBath[df$TotalBsmtSF == 0 & is.na(df$BsmtFullBath)] <- rep(0)
df$BsmtHalfBath[df$TotalBsmtSF == 0 & is.na(df$BsmtHalfBath)] <- rep(0)

The entry 2121 was assigned no basement.

Garage features

summary(df$GarageYrBlt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    1895    1960    1979    1978    2002    2207     159
which(df$GarageYrBlt == 2207)
## [1] 2593
df$YearBuilt[which(df$GarageYrBlt == 2207)]
## [1] 2006
df$GarageYrBlt[which(df$GarageYrBlt == 2207)] <- 2007

We’re assuming 2207 was a typo and should be 2007, since the house was built in 2006.

table(is.na(df$GarageType))
## 
## FALSE  TRUE 
##  2762   157
table(is.na(df$GarageArea))
## 
## FALSE  TRUE 
##  2918     1
table(is.na(df$GarageCars))
## 
## FALSE  TRUE 
##  2918     1
table(is.na(df$GarageArea) & is.na(df$GarageCars))
## 
## FALSE  TRUE 
##  2918     1
table(is.na(df$GarageYrBlt))
## 
## FALSE  TRUE 
##  2760   159
table(is.na(df$GarageFinish))
## 
## FALSE  TRUE 
##  2760   159
table(is.na(df$GarageQual))
## 
## FALSE  TRUE 
##  2760   159
table(is.na(df$GarageCond))
## 
## FALSE  TRUE 
##  2760   159
table(is.na(df$GarageYrBlt) & is.na(df$GarageFinish) & is.na(df$GarageQual) & is.na(df$GarageCond))
## 
## FALSE  TRUE 
##  2760   159
table(df$GarageArea == 0 & df$GarageCars==0 & is.na(df$GarageType))
## 
## FALSE  TRUE 
##  2762   157
col.Garage <- c("GarageType", "GarageYrBlt", "GarageFinish", "GarageQual","GarageCond")

We find 157 with GarageType NA,one with GarageArea and GarageCars as NA, and 159 with GarageYrBlt, GarageFinish, GarageQual and GarageCond as NA. For the 157 houses with GarageArea = 0 and no GarageType we set all other garage features to None:

df[df$GarageArea == 0 & df$GarageCars==0 & is.na(df$GarageType), col.Garage] <- 
  apply(df[df$GarageArea == 0 & df$GarageCars==0 & is.na(df$GarageType), 
           col.Garage], 2, function(x) x <- rep("None"))

For the remaining values, we impute:

col.pred <- c("GarageType", "GarageYrBlt", "GarageFinish", "GarageQual","GarageCond","YearBuilt", 
              "GarageCars", "GarageArea")

area.rpart <- rpart(GarageArea ~ ., data = df[!is.na(df$GarageArea),col.pred],
                    method = "anova", na.action=na.omit)
df$GarageArea[is.na(df$GarageArea)] <- round(predict(area.rpart, df[is.na(df$GarageArea),col.pred]))


cars.rpart <- rpart(GarageCars ~ .,data = df[!is.na(df$GarageCars),col.pred],
                    method = "anova", na.action=na.omit)
df$GarageCars[is.na(df$GarageCars)] <- round(predict(cars.rpart, df[is.na(df$GarageCars),col.pred]))

blt.rpart <- rpart(as.factor(GarageYrBlt) ~ .,data = df[!is.na(df$GarageYrBlt),col.pred],
                   method = "class",na.action=na.omit)

df$GarageYrBlt[is.na(df$GarageYrBlt)] <- 
  as.numeric(as.character(predict(blt.rpart, 
                                  df[is.na(df$GarageYrBlt),col.pred], type = "class")))

df[is.na(df$GarageFinish) & is.na(df$GarageQual) & is.na(df$GarageCond),
   c(col.Garage, c("GarageCars", "GarageArea"))]
##      GarageType GarageYrBlt GarageFinish GarageQual GarageCond GarageCars
## 2127     Detchd        1950         <NA>       <NA>       <NA>          1
## 2577     Detchd        1950         <NA>       <NA>       <NA>          1
##      GarageArea
## 2127        360
## 2577        300

For the two remaining houses with missing data, we’ll look at comparables:

df[which(df$GarageType == "Detchd" &  df$GarageCars == 1 & df$GarageYrBlt == 1950),col.Garage]
##      GarageType GarageYrBlt GarageFinish GarageQual GarageCond
## 460      Detchd        1950          Unf         TA         TA
## 986      Detchd        1950          Unf         TA         TA
## 1449     Detchd        1950          Unf         Fa         TA
## 1528     Detchd        1950          Unf         TA         TA
## 1542     Detchd        1950          Unf         TA         TA
## 1602     Detchd        1950          Unf         TA         TA
## 1813     Detchd        1950          Unf         TA         TA
## 2089     Detchd        1950          Unf         TA         TA
## 2112     Detchd        1950          Unf         TA         TA
## 2127     Detchd        1950         <NA>       <NA>       <NA>
## 2456     Detchd        1950          Unf         TA         TA
## 2464     Detchd        1950          Unf         TA         TA
## 2471     Detchd        1950          Fin         TA         TA
## 2577     Detchd        1950         <NA>       <NA>       <NA>
## 2579     Detchd        1950          Unf         TA         Fa
## 2766     Detchd        1950          Unf         Fa         TA
df$GarageFinish[df$GarageType == "Detchd" &  df$GarageYrBlt == 1950 & is.na(df$GarageFinish)] <- "Unf"
df$GarageQual[df$GarageType == "Detchd" &  df$GarageYrBlt == 1950 & is.na(df$GarageQual)] <- "TA"
df$GarageCond[df$GarageType == "Detchd" &  df$GarageYrBlt == 1950 & is.na(df$GarageCond)] <- "TA"

levels(as.factor(train$GarageQual))
## [1] "Ex" "Fa" "Gd" "Po" "TA"
levels(as.factor(test$GarageQual))
## [1] "Fa" "Gd" "Po" "TA"

The GarageQual level “Ex” is not in the test set

LotFrontage: Linear feet of street connected to property

summary(df$LotFrontage)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   21.00   59.00   68.00   69.31   80.00  313.00     486

There are 486 NA’s. Let’s impute these values.

lotf.pred <- c("MSSubClass", "MSZoning", "LotFrontage", "Street", "Alley", "LotArea", "LotShape", 
              "LandContour", "LotConfig", "LandSlope",  "Neighborhood", "GarageType")
lotf.rpart <- rpart(LotFrontage ~ ., data = df[!is.na(df$LotFrontage),lotf.pred],
                       method = "anova", na.action=na.omit)
df.frontage <- as.data.frame(rbind(cbind(rep("Existing", nrow(df[!is.na(df$LotFrontage),])),
                                         df[!is.na(df$LotFrontage), "LotFrontage"]),
                                   cbind(rep("Imputed", nrow(df[is.na(df$LotFrontage),])),
                                         ceiling(predict(lotf.rpart, df[is.na(df$LotFrontage),lotf.pred])))))

The imputed values are somewhat less-normal/more-skewed than the existing ones:

ggplot(df.frontage, aes (x = as.numeric(as.character(V2)), colour = V1)) +
  geom_density() + xlab("Lot Frontage") + theme(legend.title=element_blank())

df$LotFrontage[is.na(df$LotFrontage)] <- ceiling(predict(lotf.rpart, df[is.na(df$LotFrontage),lotf.pred]))

MSSubClass: Identifies the type of dwelling involved in the sale.

summary(as.factor(df$MSSubClass))
##   20   30   40   45   50   60   70   75   80   85   90  120  150  160  180 
## 1079  139    6   18  287  575  128   23  118   48  109  182    1  128   17 
##  190 
##   61

There are no NAs, but some factors have low counts and therefore poor statistical significance. We note that much of the information in these factors is duplicated in HouseStyle and BldgType. Where there are inconsistencies, we will trust the latter information, since MSSubClass relies on translation to a numerical code, which could be a source of coding error.

which(df$HouseStyle == "1.5Fin" & (df$MSSubClass != 50 & df$MSSubClass != 90 & df$MSSubClass != 150 &
                                     df$MSSubClass != 190))
## [1]  730 1534 1825 2025 2197 2468 2555
df$MSSubClass[which(df$HouseStyle == "1.5Fin" & (df$MSSubClass != 50 & df$MSSubClass != 90 &
                                                   df$MSSubClass != 150 & df$MSSubClass != 190))]
## [1] 30 45 70 60 30 45 40
df$MSSubClass[which(df$HouseStyle == "1.5Fin" & (df$MSSubClass != 50 & df$MSSubClass != 90 & 
                                                   df$MSSubClass != 150 & df$MSSubClass != 190))] <- 50

which(df$HouseStyle == "1.5Unf" & (df$MSSubClass != 45 & df$MSSubClass != 90 & 
                                     df$MSSubClass != 150 & df$MSSubClass != 190))
## [1] 1444 2792
df$MSSubClass[which(df$HouseStyle == "1.5Unf" & (df$MSSubClass != 45 & df$MSSubClass != 90 &
                                                   df$MSSubClass != 150 & df$MSSubClass != 190))]
## [1] 30 50
df$MSSubClass[which(df$HouseStyle == "1.5Unf" & (df$MSSubClass != 45 & df$MSSubClass != 90 & 
                                                   df$MSSubClass != 150 & df$MSSubClass != 190))] <- 45

which(df$HouseStyle == "1Story" & df$MSSubClass != 20 &  df$MSSubClass != 30 &  df$MSSubClass != 40 & 
        df$MSSubClass != 90 & df$MSSubClass != 120 & df$MSSubClass != 190)
## integer(0)
which(df$HouseStyle == "2.5Fin" & df$MSSubClass != 75 & df$MSSubClass != 90 & df$MSSubClass != 160 &
        df$MSSubClass != 190)
## [1] 1441
df$MSSubClass[which(df$HouseStyle == "2.5Fin" & df$MSSubClass != 75 & df$MSSubClass != 90 & 
                      df$MSSubClass != 160 &df$MSSubClass != 190)]
## [1] 70
df$MSSubClass[which(df$HouseStyle == "2.5Fin" & df$MSSubClass != 75 & df$MSSubClass != 90 & 
                      df$MSSubClass != 160 & df$MSSubClass != 190)] <- 75

which(df$HouseStyle == "2Story" & df$MSSubClass != 60 &  df$MSSubClass != 70 & df$MSSubClass != 90 & 
        df$MSSubClass != 160 & df$MSSubClass != 190)
## [1]   75   80  198  608 1449 2102 2881
df$MSSubClass[which(df$HouseStyle == "2Story" & df$MSSubClass != 60 &  df$MSSubClass != 70 & 
                      df$MSSubClass != 90 &  df$MSSubClass != 160 & df$MSSubClass != 190)]
## [1] 50 50 75 20 50 75 50
df$MSSubClass[which(df$YearBuilt >= 1946 & df$HouseStyle == "2Story" & df$MSSubClass != 60 &  
                      df$MSSubClass != 70 & df$MSSubClass != 90 &   df$MSSubClass != 160 & 
                      df$MSSubClass != 190)] <- 60
df$MSSubClass[which(df$YearBuilt <= 1945 & df$HouseStyle == "2Story" &  df$MSSubClass != 60 &  
                      df$MSSubClass != 70 & df$MSSubClass != 90 & df$MSSubClass != 160 &
                      df$MSSubClass != 190)] <- 70

which(df$HouseStyle == "SFoyer" & (df$MSSubClass != 85 & df$MSSubClass != 90 & 
                                     df$MSSubClass != 180 & df$MSSubClass != 190))
## [1]  544 2601
df$MSSubClass[which(df$HouseStyle == "SFoyer" & (df$MSSubClass != 85 & df$MSSubClass != 90 & 
                                                   df$MSSubClass != 180 & df$MSSubClass != 190))]
## [1] 120 120
df$MSSubClass[which(df$HouseStyle == "SFoyer" & (df$MSSubClass != 85 & df$MSSubClass != 90 & 
                                                   df$MSSubClass != 180 & df$MSSubClass != 190))] <- 180

which(df$HouseStyle == "SLvl" & (df$MSSubClass != 80 & df$MSSubClass != 90 & 
                                   df$MSSubClass != 180 & df$MSSubClass != 190))
## [1]  945 1440 2034
df$MSSubClass[which(df$HouseStyle == "SLvl" & (df$MSSubClass != 80 & df$MSSubClass != 90 &
                                                 df$MSSubClass != 180 & df$MSSubClass != 190))]
## [1]  20  60 160
df$MSSubClass[which(df$MSSubClass == 160 & df$HouseStyle == "SLvl" )] <- 180
df$MSSubClass[which(df$HouseStyle == "SLvl" & 
                      (df$MSSubClass == 20 | df$MSSubClass == 60) )] <- 80

which(df$BldgType == "1Fam" & df$MSSubClass == 190)
## [1] 2147
df$HouseStyle[which(df$BldgType == "1Fam" & df$MSSubClass == 190)]
## [1] "1Story"
df$YearBuilt[which(df$BldgType == "1Fam" & df$MSSubClass == 190)]
## [1] 1960
df$MSSubClass[which(df$BldgType == "1Fam" & df$MSSubClass == 190)] <- 20

which(df$BldgType == "2fmCon" & df$MSSubClass != 190)
## [1]  822 2127
df$MSSubClass[which(df$BldgType == "2fmCon" & df$MSSubClass != 190)] <- 190

which(df$BldgType == "Duplex" & df$MSSubClass != 90)
## integer(0)
which((df$BldgType == "Twnhs" | df$BldgType == "TwnhsE") & 
        (df$MSSubClass != 120 & df$MSSubClass != 150 & df$MSSubClass != 160 & df$MSSubClass != 180))
## [1] 2665
df$MSSubClass[which((df$BldgType == "Twnhs" | df$BldgType == "TwnhsE") & 
                      (df$MSSubClass != 120 & df$MSSubClass != 150 & df$MSSubClass != 160 & df$MSSubClass != 180))]
## [1] 20
df$MSSubClass[which((df$BldgType == "Twnhs" | df$BldgType == "TwnhsE") & 
                      (df$MSSubClass != 120 & df$MSSubClass != 150 & df$MSSubClass != 160 & df$MSSubClass != 180))] <- 120

# this is a categorical variable
df$MSSubClass <- as.factor(df$MSSubClass)
summary(df$MSSubClass)
##   20   30   40   45   50   60   70   75   80   85   90  120  150  160  180 
## 1076  136    5   18  289  573  132   22  120   48  109  181    1  127   20 
##  190 
##   62

Second Round

In this 2nd round, we begin engineering features. We will start with the categorical variables and then move on to the numerical ones.

MSSubClass

Most of the information here is duplicated by HouseStyle and BldgType. We’ll build a numeric feature that increases with the median price in the testing class

mssc = levels(df$MSSubClass)
mssc_df = as.data.frame(mssc)
colnames(mssc_df) <- "MSSubClass"

mssc_meds = (1:length(mssc))
for (i in (1:length(mssc))){
  mssc_meds[i] <- median(df$SalePrice[df$MSSubClass == mssc[i]], na.rm = TRUE)
}
# class 150 is not in the testing set
mssc_meds[[13]] <- 
  median(df$SalePrice[df$MSSubClass == '120' | df$MSSubClass == '160' | df$MSSubClass == '180' ], na.rm = TRUE)
mssc_df$meds = mssc_meds

mssc_df[ order(mssc_df$meds), ]
##    MSSubClass     meds
## 15        180 11.41861
## 2          30 11.49272
## 4          45 11.60368
## 16        190 11.75587
## 5          50 11.79622
## 11         90 11.82026
## 10         85 11.85447
## 3          40 11.86487
## 14        160 11.89136
## 7          70 11.95118
## 1          20 11.97980
## 8          75 12.00456
## 13        150 12.00914
## 9          80 12.02273
## 12        120 12.16655
## 6          60 12.28303
df$MSSubClassordered[df$MSSubClass == "180"] <- 1
df$MSSubClassordered[df$MSSubClass == "30"] <- 2
df$MSSubClassordered[df$MSSubClass == "45"] <- 3
df$MSSubClassordered[df$MSSubClass == "190"] <- 4
df$MSSubClassordered[df$MSSubClass == "50"] <- 5
df$MSSubClassordered[df$MSSubClass == "90"] <- 6
df$MSSubClassordered[df$MSSubClass == "85"] <- 7
df$MSSubClassordered[df$MSSubClass == "40"] <- 8
df$MSSubClassordered[df$MSSubClass == "160"] <- 9
df$MSSubClassordered[df$MSSubClass == "70"] <- 10
df$MSSubClassordered[df$MSSubClass == "20"] <- 11
df$MSSubClassordered[df$MSSubClass == "75"] <- 12
df$MSSubClassordered[df$MSSubClass == "150"] <- 13
df$MSSubClassordered[df$MSSubClass == "80"] <- 14
df$MSSubClassordered[df$MSSubClass == "120"] <- 15
df$MSSubClassordered[df$MSSubClass == "60"] <- 16
ggplot(data=df, aes(as.factor(df$MSSubClassordered),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

ggplot(data=df, aes(df$MSSubClassordered,df$SalePrice)) + geom_point()
## Warning: Removed 1459 rows containing missing values (geom_point).

MSZoning

df$MSZoning <- as.factor(df$MSZoning)

summary(df$MSZoning)
## C (all)      FV      RH      RL      RM 
##      25     139      26    2266     463
ggplot(data=df, aes(df$MSZoning,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

mszoning = levels(df$MSZoning)
mszoning_df = as.data.frame(mszoning)
colnames(mszoning_df) <- "MSZoning"

mszoning_meds = (1:length(mszoning))
for (i in (1:length(mszoning))){
  mszoning_meds[i] <- median(df$SalePrice[df$MSZoning == mszoning[i]], na.rm = TRUE)
}
mszoning_df$meds = mszoning_meds
mszoning_df[ order(mszoning_df$meds), ]
##   MSZoning     meds
## 1  C (all) 11.21767
## 5       RM 11.69941
## 3       RH 11.82375
## 4       RL 12.06681
## 2       FV 12.23539
df$MSZoningordered[df$MSZoning == "C (all)"] <- 1
df$MSZoningordered[df$MSZoning == "RM"] <- 2
df$MSZoningordered[df$MSZoning == "RH"] <- 3
df$MSZoningordered[df$MSZoning == "RL"] <- 4
df$MSZoningordered[df$MSZoning == "FV"] <- 5
ggplot(data=df, aes(as.factor(df$MSZoningordered),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

Street

df$Street <- as.factor(df$Street)

summary(df$Street)
## Grvl Pave 
##   12 2907
ggplot(data=df, aes(df$Street,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

Alley

df$Alley <- as.factor(df$Alley)
summary(df$Alley)
## Grvl None Pave 
##  120 2721   78
ggplot(data=df, aes(df$Alley,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

LotShape

df$LotShape <- as.factor(df$LotShape)
summary(df$LotShape)
##  IR1  IR2  IR3  Reg 
##  968   76   16 1859
ggplot(data=df, aes(df$LotShape,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

LandContour

df$LandContour <- as.factor(df$LandContour)
summary(df$LandContour)
##  Bnk  HLS  Low  Lvl 
##  117  120   60 2622
ggplot(data=df, aes(df$LandContour,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

df$LandContourordered[df$LandContour == "Bnk"] <- 1
df$LandContourordered[df$LandContour == "Lvl"] <- 2
df$LandContourordered[df$LandContour == "Low"] <- 3
df$LandContourordered[df$LandContour == "HLS"] <- 4
ggplot(data=df, aes(as.factor(df$LandContourordered),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

LotConfig

df$LotConfig <- as.factor(df$LotConfig)
summary(df$LotConfig)
##  Corner CulDSac     FR2     FR3  Inside 
##     511     176      85      14    2133
ggplot(data=df, aes(df$LotConfig,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

df$LotConfigordered[df$LotConfig == "Inside"] <- 1
df$LotConfigordered[df$LotConfig == "Corner"] <- 2
df$LotConfigordered[df$LotConfig == "FR2"] <- 3
df$LotConfigordered[df$LotConfig == "FR3"] <- 4
df$LotConfigordered[df$LotConfig == "CulDSac"] <- 5
ggplot(data=df, aes(as.factor(df$LotConfigordered),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

LandSlope

df$LandSlope <- as.factor(df$LandSlope)
summary(df$LandSlope)
##  Gtl  Mod  Sev 
## 2778  125   16
ggplot(data=df, aes(df$LandSlope,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

df$LandSlopeordered[df$LandSlope == "Gtl"] <- 1
df$LandSlopeordered[df$LandSlope == "Sev"] <- 2
df$LandSlopeordered[df$LandSlope == "Mod"] <- 3
ggplot(data=df, aes(as.factor(df$LandSlopeordered),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

Neighborhood

df$Neighborhood <- as.factor(df$Neighborhood)
summary(df$Neighborhood)
## Blmngtn Blueste  BrDale BrkSide ClearCr CollgCr Crawfor Edwards Gilbert 
##      28      10      30     108      44     267     103     194     165 
##  IDOTRR MeadowV Mitchel   NAmes NoRidge NPkVill NridgHt  NWAmes OldTown 
##      93      37     114     443      71      23     166     131     239 
##  Sawyer SawyerW Somerst StoneBr   SWISU  Timber Veenker 
##     151     125     182      51      48      72      24
neigh = levels(df$Neighborhood)
neigh_df = as.data.frame(neigh)
colnames(neigh_df) <- "Neighborhood"
neigh_meds = (1:length(neigh))
for (i in (1:length(neigh))){
  neigh_meds[i] <- median(df$SalePrice[df$Neighborhood == neigh[i]], na.rm = TRUE)
}
 neigh_df$meds = neigh_meds
for (i in (1:length(neigh))){
   neigh_df$count[i] <- length(df$Neighborhood[df$Neighborhood == neigh[i]])
}

ggplot(data=neigh_df, aes(neigh_df$meds,neigh_df$count)) + geom_point()

neigh_df[ order(neigh_df$meds), ]
##    Neighborhood     meds count
## 11      MeadowV 11.38509    37
## 10       IDOTRR 11.54248    93
## 3        BrDale 11.57119    30
## 18      OldTown 11.68688   239
## 8       Edwards 11.70962   194
## 4       BrkSide 11.73022   108
## 19       Sawyer 11.81303   151
## 2       Blueste 11.82654    10
## 23        SWISU 11.84582    48
## 13        NAmes 11.84940   443
## 15      NPkVill 11.89136    23
## 12      Mitchel 11.94146   114
## 20      SawyerW 12.10016   125
## 9       Gilbert 12.10625   165
## 17       NWAmes 12.11669   131
## 1       Blmngtn 12.16003    28
## 6       CollgCr 12.19197   267
## 5       ClearCr 12.20732    44
## 7       Crawfor 12.20919   103
## 25      Veenker 12.29225    24
## 21      Somerst 12.32607   182
## 24       Timber 12.33918    72
## 22      StoneBr 12.53538    51
## 14      NoRidge 12.61653    71
## 16      NridgHt 12.66033   166
median(df$SalePrice, na.rm = TRUE)
## [1] 12.00151
sd(df$SalePrice, na.rm = TRUE)/2
## [1] 0.1997259
ggplot(data=df, aes(as.factor(df$Neighborhood),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

We will cut price bins around 250000, 195000, 170000, 130000, 115000

df$Neighborhoodbins[df$Neighborhood == "MeadowV"] <- 1
df$Neighborhoodbins[df$Neighborhood == "IDOTRR"] <- 1
df$Neighborhoodbins[df$Neighborhood == "BrDale"] <- 1
df$Neighborhoodbins[df$Neighborhood == "OldTown"] <- 2
df$Neighborhoodbins[df$Neighborhood == "Edwards"] <- 2
df$Neighborhoodbins[df$Neighborhood == "BrkSide"] <- 2
df$Neighborhoodbins[df$Neighborhood == "Sawyer"] <- 3
df$Neighborhoodbins[df$Neighborhood == "Blueste"] <- 3
df$Neighborhoodbins[df$Neighborhood == "SWISU"] <- 3
df$Neighborhoodbins[df$Neighborhood == "NAmes"] <- 3
df$Neighborhoodbins[df$Neighborhood == "NPkVill"] <- 3
df$Neighborhoodbins[df$Neighborhood == "Mitchel"] <- 3
df$Neighborhoodbins[df$Neighborhood == "SawyerW"] <- 4
df$Neighborhoodbins[df$Neighborhood == "Gilbert"] <- 4
df$Neighborhoodbins[df$Neighborhood == "NWAmes"] <- 4
df$Neighborhoodbins[df$Neighborhood == "Blmngtn"] <- 4
df$Neighborhoodbins[df$Neighborhood == "CollgCr"] <- 5
df$Neighborhoodbins[df$Neighborhood == "ClearCr"] <- 5
df$Neighborhoodbins[df$Neighborhood == "Crawfor"] <- 5
df$Neighborhoodbins[df$Neighborhood == "Veenker"] <- 5
df$Neighborhoodbins[df$Neighborhood == "Somerst"] <- 5
df$Neighborhoodbins[df$Neighborhood == "Timber"] <- 5
df$Neighborhoodbins[df$Neighborhood == "StoneBr"] <- 6
df$Neighborhoodbins[df$Neighborhood == "NoRidge"] <- 6
df$Neighborhoodbins[df$Neighborhood == "NridgHt"] <- 6
ggplot(data=df, aes(as.factor(df$Neighborhoodbins),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

df$GoodNeighborhood <- 0
df$GoodNeighborhood[df$Neighborhood == "CollgCr"] <- 1
df$GoodNeighborhood[df$Neighborhood == "ClearCr"] <- 1
df$GoodNeighborhood[df$Neighborhood == "Crawfor"] <- 1
df$GoodNeighborhood[df$Neighborhood == "Veenker"] <- 1
df$GoodNeighborhood[df$Neighborhood == "Somerst"] <- 1
df$GoodNeighborhood[df$Neighborhood == "Timber"] <- 1
df$GoodNeighborhood[df$Neighborhood == "StoneBr"] <- 1
df$GoodNeighborhood[df$Neighborhood == "NoRidge"] <- 1
df$GoodNeighborhood[df$Neighborhood == "NridgHt"] <- 1

df$GreatNeighborhood <- 0
df$GreatNeighborhood[df$Neighborhood == "StoneBr"] <- 1
df$GreatNeighborhood[df$Neighborhood == "NoRidge"] <- 1
df$GreatNeighborhood[df$Neighborhood == "NridgHt"] <- 1

We also created features that indicate two categories of premium neighborhoods.

Condition1

df$Condition1 <- as.factor(df$Condition1)
summary(df$Condition1)
## Artery  Feedr   Norm   PosA   PosN   RRAe   RRAn   RRNe   RRNn 
##     92    164   2511     20     39     28     50      6      9
ggplot(data=df, aes(df$Condition1,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

condition1 = levels(df$Condition1)
condition1_df = as.data.frame(condition1)
colnames(condition1_df) <- "Condition1"
condition1_meds = (1:length(condition1))
for (i in (1:length(condition1))){
  condition1_meds[i] <- median(df$SalePrice[df$Condition1 == condition1[i]], na.rm = TRUE)
}
condition1_df$meds = condition1_meds
condition1_df[ order(condition1_df$meds), ]
##   Condition1     meds
## 1     Artery 11.69149
## 2      Feedr 11.84940
## 6       RRAe 11.86710
## 3       Norm 12.02275
## 7       RRAn 12.05227
## 8       RRNe 12.15853
## 5       PosN 12.20607
## 4       PosA 12.26106
## 9       RRNn 12.27373
df$Condition1nums[df$Condition1 == "Artery"] <- 1
df$Condition1nums[df$Condition1 == "Feedr"] <- 2
df$Condition1nums[df$Condition1 == "RRAe"] <- 3
df$Condition1nums[df$Condition1 == "Norm"] <- 4
df$Condition1nums[df$Condition1 == "RRAn"] <- 5
df$Condition1nums[df$Condition1 == "RRNe"] <- 6
df$Condition1nums[df$Condition1 == "PosN"] <- 7
df$Condition1nums[df$Condition1 == "PosA"] <- 8
df$Condition1nums[df$Condition1 == "RRNn"] <- 9
ggplot(data=df, aes(as.factor(df$Condition1nums),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

Condition2

df$Condition2 <- as.factor(df$Condition2)
summary(df$Condition2)
## Artery  Feedr   Norm   PosA   PosN   RRAe   RRAn   RRNn 
##      5     13   2889      4      4      1      1      2
levels(as.factor(train$Condition2))
## [1] "Artery" "Feedr"  "Norm"   "PosA"   "PosN"   "RRAe"   "RRAn"   "RRNn"
levels(as.factor(test$Condition2))
## [1] "Artery" "Feedr"  "Norm"   "PosA"   "PosN"

We note that the levelss “RRAe”, “RRAn”, and “RRNn” are missing from the test set. We will need to remove these from the testing set after encoding.

BldgType

df$BldgType <- as.factor(df$BldgType)
summary(df$BldgType)
##   1Fam 2fmCon Duplex  Twnhs TwnhsE 
##   2425     62    109     96    227
ggplot(data=df, aes(df$BldgType,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

bldgtype = levels(df$BldgType)
bldgtype_df = as.data.frame(bldgtype)
colnames(bldgtype_df) <- "BldgType"
bldgtype_meds = (1:length(bldgtype))
for (i in (1:length(bldgtype))){
  bldgtype_meds[i] <- median(df$SalePrice[df$BldgType == bldgtype[i]], na.rm = TRUE)
}
bldgtype_df$meds = bldgtype_meds
bldgtype_df[ order(bldgtype_df$meds), ]
##   BldgType     meds
## 2   2fmCon 11.75587
## 3   Duplex 11.82026
## 4    Twnhs 11.83138
## 1     1Fam 12.03112
## 5   TwnhsE 12.05641
df$BldgTypenums[df$BldgType == "2fmCon"] <- 1
df$BldgTypenums[df$BldgType == "Duplex"] <- 2
df$BldgTypenums[df$BldgType == "Twnhs"] <- 3
df$BldgTypenums[df$BldgType == "1Fam"] <- 4
df$BldgTypenums[df$BldgType == "TwnhsE"] <- 5
ggplot(data=df, aes(as.factor(df$BldgTypenums),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

HouseStyle

df$HouseStyle <- as.factor(df$HouseStyle)
summary(df$HouseStyle)
## 1.5Fin 1.5Unf 1Story 2.5Fin 2.5Unf 2Story SFoyer   SLvl 
##    314     19   1471      8     24    872     83    128
ggplot(data=df, aes(df$HouseStyle,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

housestyle = levels(df$HouseStyle)
housestyle_df = as.data.frame(housestyle)
colnames(housestyle_df) <- "HouseStyle"
housestyle_meds = (1:length(housestyle))
for (i in (1:length(housestyle))){
  housestyle_meds[i] <- median(df$SalePrice[df$HouseStyle == housestyle[i]], na.rm = TRUE)
}
housestyle_df$meds = housestyle_meds
housestyle_df[ order(housestyle_df$meds), ]
##   HouseStyle     meds
## 2     1.5Unf 11.61941
## 1     1.5Fin 11.79055
## 5     2.5Unf 11.80485
## 7     SFoyer 11.82012
## 3     1Story 11.94956
## 8       SLvl 12.01067
## 6     2Story 12.15478
## 4     2.5Fin 12.17549
df$HouseStylenums[df$HouseStyle == "1.5Unf"] <- 1
df$HouseStylenums[df$HouseStyle == "1.5Fin"] <- 2
df$HouseStylenums[df$HouseStyle == "2.5Unf"] <- 3
df$HouseStylenums[df$HouseStyle == "SFoyer"] <- 4
df$HouseStylenums[df$HouseStyle == "1Story"] <- 5
df$HouseStylenums[df$HouseStyle == "SLvl"] <- 6
df$HouseStylenums[df$HouseStyle == "2Story"] <- 7
df$HouseStylenums[df$HouseStyle == "2.5Fin"] <- 8
ggplot(data=df, aes(as.factor(df$HouseStylenums),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

levels(as.factor(train$HouseStyle))
## [1] "1.5Fin" "1.5Unf" "1Story" "2.5Fin" "2.5Unf" "2Story" "SFoyer" "SLvl"
levels(as.factor(test$HouseStyle))
## [1] "1.5Fin" "1.5Unf" "1Story" "2.5Unf" "2Story" "SFoyer" "SLvl"

Here “2.5Fin” is missing from the test set.

OverallQual

While seemingly ordinal, this variable is also a translation of categorical ratings “Poor”, “Fair”, etc. We’ll set a dummy ordinal for this and OverallCond below.

df$OverallQualcat <- as.factor(df$OverallQual)
summary(df$OverallQualcat)
##   1   2   3   4   5   6   7   8   9  10 
##   4  13  40 226 825 731 600 342 107  31
ggplot(data=df, aes(as.factor(df$OverallQual),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

We will make bins for the top and low ends.

df$OverallQualGood <- 0
df$OverallQualBad <- 0
df$OverallQualGood[df$OverallQual == 8] <- 1
df$OverallQualGood[df$OverallQual == 9] <- 1
df$OverallQualGood[df$OverallQual == 10] <- 1
df$OverallQualBad[df$OverallQual == 1] <- 1
df$OverallQualBad[df$OverallQual == 2] <- 1

OverallCond

df$OverallCondcat <- as.factor(df$OverallCond)
summary(df$OverallCondcat)
##    1    2    3    4    5    6    7    8    9 
##    7   10   50  101 1645  531  390  144   41
ggplot(data=df, aes(as.factor(df$OverallCond),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

df$OverallCondBad <- 0
df$OverallCondBad[df$OverallCond == 1] <- 1
df$OverallCondBad[df$OverallCond == 2] <- 1
df$OverallCondBad[df$OverallCond == 3] <- 1
df$OverallCondBad[df$OverallCond == 4] <- 1

ggplot(data=df, aes(df$OverallCond,df$SalePrice)) + geom_point() + 
  geom_smooth(method=lm, data=df, formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).

Only need a bin for bad condition. In the last plot, we see that there are some outliers and the ordinal nature of the variable doesn’t suggest a useful transformation to make this any more linear.

RoofStyle

df$RoofStyle <- as.factor(df$RoofStyle)
summary(df$RoofStyle)
##    Flat   Gable Gambrel     Hip Mansard    Shed 
##      20    2310      22     551      11       5
ggplot(data=df, aes(df$RoofStyle,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

roofstyle = levels(df$RoofStyle)
roofstyle_df = as.data.frame(roofstyle)
colnames(roofstyle_df) <- "RoofStyle"
roofstyle_meds = (1:length(roofstyle))
for (i in (1:length(roofstyle))){
  roofstyle_meds[i] <- median(df$SalePrice[df$RoofStyle == roofstyle[i]], na.rm = TRUE)
}
# 
roofstyle_df$meds = roofstyle_meds
roofstyle_df[ order(roofstyle_df$meds), ]
##   RoofStyle     meds
## 3   Gambrel 11.84223
## 2     Gable 11.98293
## 5   Mansard 12.07254
## 4       Hip 12.08108
## 1      Flat 12.12811
## 6      Shed 12.31161
df$RoofStylenums[df$RoofStyle == "Gambrel"] <- 1
df$RoofStylenums[df$RoofStyle == "Gable"] <- 2
df$RoofStylenums[df$RoofStyle == "Mansard"] <- 3
df$RoofStylenums[df$RoofStyle == "Hip"] <- 4
df$RoofStylenums[df$RoofStyle == "Flat"] <- 5
df$RoofStylenums[df$RoofStyle == "Shed"] <- 6
ggplot(data=df, aes(as.factor(df$RoofStylenums),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

RoofMatl

df$RoofMatl <- as.factor(df$RoofMatl)
summary(df$RoofMatl)
## ClyTile CompShg Membran   Metal    Roll Tar&Grv WdShake WdShngl 
##       1    2876       1       1       1      23       9       7
ggplot(data=df, aes(as.factor(df$RoofMatl),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

levels(as.factor(train$RoofMatl))
## [1] "ClyTile" "CompShg" "Membran" "Metal"   "Roll"    "Tar&Grv" "WdShake"
## [8] "WdShngl"
levels(as.factor(test$RoofMatl))
## [1] "CompShg" "Tar&Grv" "WdShake" "WdShngl"

Here “ClyTile”, “Membran”, “Metal”, and “Roll” are not in the test set. Overall low variation suggests dropping this at some point.

Exterior1st

df$Exterior1st <- as.factor(df$Exterior1st)
summary(df$Exterior1st)
## AsbShng AsphShn BrkComm BrkFace  CBlock CemntBd HdBoard ImStucc MetalSd 
##      44       2       6      87       2     126     442       1     450 
## Plywood   Stone  Stucco VinylSd Wd Sdng WdShing 
##     221       2      43    1025     412      56
exterior1st = levels(df$Exterior1st)
exterior1st_df = as.data.frame(exterior1st)
colnames(exterior1st_df) <- "Exterior1st"
exterior1st_meds = (1:length(exterior1st))
for (i in (1:length(exterior1st))){
  exterior1st_meds[i] <- median(df$SalePrice[df$Exterior1st == exterior1st[i]], na.rm = TRUE)
}
exterior1st_df$meds = exterior1st_meds
exterior1st_df[ order(exterior1st_df$meds), ]
##    Exterior1st     meds
## 3      BrkComm 11.15829
## 2      AsphShn 11.51293
## 5       CBlock 11.56172
## 1      AsbShng 11.58972
## 15     WdShing 11.76520
## 14     Wd Sdng 11.84182
## 9      MetalSd 11.84223
## 12      Stucco 11.87757
## 7      HdBoard 11.91772
## 4      BrkFace 12.01823
## 10     Plywood 12.02844
## 13     VinylSd 12.20607
## 6      CemntBd 12.37370
## 11       Stone 12.45654
## 8      ImStucc 12.47610
df$Exterior1stnums[df$Exterior1st == "BrkComm"] <- 1
df$Exterior1stnums[df$Exterior1st == "AsphShn"] <- 2
df$Exterior1stnums[df$Exterior1st == "CBlock"] <- 3
df$Exterior1stnums[df$Exterior1st == "AsbShng"] <- 4
df$Exterior1stnums[df$Exterior1st == "WdShing"] <- 5
df$Exterior1stnums[df$Exterior1st == "Wd Sdng"] <- 6
df$Exterior1stnums[df$Exterior1st == "MetalSd"] <- 7
df$Exterior1stnums[df$Exterior1st == "Stucco"] <- 8
df$Exterior1stnums[df$Exterior1st == "HdBoard"] <- 9
df$Exterior1stnums[df$Exterior1st == "BrkFace"] <- 10
df$Exterior1stnums[df$Exterior1st == "Plywood"] <- 11
df$Exterior1stnums[df$Exterior1st == "VinylSd"] <- 12
df$Exterior1stnums[df$Exterior1st == "CemntBd"] <- 13
df$Exterior1stnums[df$Exterior1st == "Stone"] <- 14
df$Exterior1stnums[df$Exterior1st == "ImStucc"] <- 15
ggplot(data=df, aes(as.factor(df$Exterior1stnums),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

Exterior2nd

df$Exterior2nd <- as.factor(df$Exterior2nd)
summary(df$Exterior2nd)
## AsbShng AsphShn Brk Cmn BrkFace  CBlock CmentBd HdBoard ImStucc MetalSd 
##      38       4      22      47       3     126     406      15     447 
##   Other Plywood   Stone  Stucco VinylSd Wd Sdng Wd Shng 
##       1     270       6      47    1014     392      81
exterior2nd = levels(df$Exterior2nd)
exterior2nd_df = as.data.frame(exterior2nd)
colnames(exterior2nd_df) <- "Exterior2nd"
exterior2nd_meds = (1:length(exterior2nd))
for (i in (1:length(exterior2nd))){
  exterior2nd_meds[i] <- median(df$SalePrice[df$Exterior2nd == exterior2nd[i]], na.rm = TRUE)
}
exterior2nd_df$meds = exterior2nd_meds
exterior2nd_df[ order(exterior2nd_df$meds), ]
##    Exterior2nd     meds
## 5       CBlock 11.56172
## 1      AsbShng 11.61724
## 15     Wd Sdng 11.83501
## 16     Wd Shng 11.83662
## 9      MetalSd 11.84043
## 2      AsphShn 11.84223
## 13      Stucco 11.86348
## 3      Brk Cmn 11.89819
## 7      HdBoard 11.95118
## 4      BrkFace 11.98293
## 11     Plywood 11.98758
## 12       Stone 12.08391
## 8      ImStucc 12.14106
## 14     VinylSd 12.20643
## 6      CmentBd 12.38313
## 10       Other 12.67295
df$Exterior2ndnums[df$Exterior2nd == "Brk Cmn"] <- 8
df$Exterior2ndnums[df$Exterior2nd == "AsphShn"] <- 6
df$Exterior2ndnums[df$Exterior2nd == "CBlock"] <- 1
df$Exterior2ndnums[df$Exterior2nd == "AsbShng"] <- 2
df$Exterior2ndnums[df$Exterior2nd == "Wd Shng"] <- 4
df$Exterior2ndnums[df$Exterior2nd == "Wd Sdng"] <- 3
df$Exterior2ndnums[df$Exterior2nd == "MetalSd"] <- 5
df$Exterior2ndnums[df$Exterior2nd == "Stucco"] <- 7
df$Exterior2ndnums[df$Exterior2nd == "HdBoard"] <- 9
df$Exterior2ndnums[df$Exterior2nd == "BrkFace"] <- 10
df$Exterior2ndnums[df$Exterior2nd == "Plywood"] <- 11
df$Exterior2ndnums[df$Exterior2nd == "VinylSd"] <- 14
df$Exterior2ndnums[df$Exterior2nd == "CmentBd"] <- 15
df$Exterior2ndnums[df$Exterior2nd == "Stone"] <- 12
df$Exterior2ndnums[df$Exterior2nd == "ImStucc"] <- 13
df$Exterior2ndnums[df$Exterior2nd == "Other"] <- 16
ggplot(data=df, aes(as.factor(df$Exterior2ndnums),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

MasVnrType

df$MasVnrType <- as.factor(df$MasVnrType)
summary(df$MasVnrType)
##  BrkCmn BrkFace    None   Stone 
##      25     882    1764     248
ggplot(data=df, aes(factor(df$MasVnrType),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

df$MasVnrTypenums[df$MasVnrType == "BrkCmn"] <- 1
df$MasVnrTypenums[df$MasVnrType == "None"] <- 2
df$MasVnrTypenums[df$MasVnrType == "BrkFace"] <- 3
df$MasVnrTypenums[df$MasVnrType == "Stone"] <- 4
ggplot(data=df, aes(as.factor(df$MasVnrTypenums),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

ExterQual

df$ExterQual <- as.factor(df$ExterQual)
summary(df$ExterQual)
##   Ex   Fa   Gd   TA 
##  107   35  979 1798
ggplot(data=df, aes(factor(df$ExterQual),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

df$ExterQualnums[df$ExterQual == "Fa"] <- 1
df$ExterQualnums[df$ExterQual == "TA"] <- 2
df$ExterQualnums[df$ExterQual == "Gd"] <- 3
df$ExterQualnums[df$ExterQual == "Ex"] <- 4

ExterCond

df$ExterCond <- as.factor(df$ExterCond)
summary(df$ExterCond)
##   Ex   Fa   Gd   Po   TA 
##   12   67  299    3 2538
ggplot(data=df, aes(factor(df$ExterCond),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

df$ExterCondnums[df$ExterCond == "Po"] <- 0
df$ExterCondnums[df$ExterCond == "Fa"] <- 1
df$ExterCondnums[df$ExterCond == "TA"] <- 2
df$ExterCondnums[df$ExterCond == "Gd"] <- 3
df$ExterCondnums[df$ExterCond == "Ex"] <- 4

Foundation

df$Foundation <- as.factor(df$Foundation)
summary(df$Foundation)
## BrkTil CBlock  PConc   Slab  Stone   Wood 
##    311   1235   1308     49     11      5
ggplot(data=df, aes(factor(df$Foundation),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

foundation = levels(df$Foundation)
foundation_df = as.data.frame(foundation)
colnames(foundation_df) <- "Foundation"
foundation_meds = (1:length(foundation))
for (i in (1:length(foundation))){
  foundation_meds[i] <- median(df$SalePrice[df$Foundation == foundation[i]], na.rm = TRUE)
}
foundation_df$meds = foundation_meds
foundation_df[ order(foundation_df$meds), ]
##   Foundation     meds
## 4       Slab 11.55333
## 1     BrkTil 11.73807
## 5      Stone 11.74454
## 2     CBlock 11.86005
## 6       Wood 12.00762
## 3      PConc 12.23077
df$Foundationnums[df$Foundation == "Slab"] <- 1
df$Foundationnums[df$Foundation == "BrkTil"] <- 2
df$Foundationnums[df$Foundation == "Stone"] <- 3
df$Foundationnums[df$Foundation == "CBlock"] <- 4
df$Foundationnums[df$Foundation == "Wood"] <- 5
df$Foundationnums[df$Foundation == "PConc"] <- 6
ggplot(data=df, aes(as.factor(df$Foundationnums),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

BsmtQual

df$BsmtQual <- as.factor(df$BsmtQual)
summary(df$BsmtQual)
##   Ex   Fa   Gd None   TA 
##  258   88 1209   79 1285
ggplot(data=df, aes(df$BsmtQual,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

df$BsmtQualnums[df$BsmtQual == "None"] <- 0
df$BsmtQualnums[df$BsmtQual == "Fa"] <- 1
df$BsmtQualnums[df$BsmtQual == "TA"] <- 2
df$BsmtQualnums[df$BsmtQual == "Gd"] <- 3
df$BsmtQualnums[df$BsmtQual == "Ex"] <- 4

BsmtCond

df$BsmtCond <- as.factor(df$BsmtCond)
summary(df$BsmtCond)
##   Fa   Gd None   Po   TA 
##  104  122   79    5 2609
ggplot(data=df, aes(df$BsmtCond,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

df$BsmtCondnums[df$BsmtCond == "Po"] <- 1
df$BsmtCondnums[df$BsmtCond == "None"] <- 2
df$BsmtCondnums[df$BsmtCond == "Fa"] <- 3
df$BsmtCondnums[df$BsmtCond == "TA"] <- 4
df$BsmtCondnums[df$BsmtCond == "Gd"] <- 5

BsmtExposure

df$BsmtExposure <- as.factor(df$BsmtExposure)
summary(df$BsmtExposure)
##   Av   Gd   Mn   No None 
##  418  276  239 1907   79
ggplot(data=df, aes(df$BsmtExposure,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

df$BsmtExposurenums[df$BsmtExposure == "None"] <- 0
df$BsmtExposurenums[df$BsmtExposure == "No"] <- 1
df$BsmtExposurenums[df$BsmtExposure == "Mn"] <- 2
df$BsmtExposurenums[df$BsmtExposure == "Av"] <- 3
df$BsmtExposurenums[df$BsmtExposure == "Gd"] <- 4

BsmtFinType1

df$BsmtFinType1 <- as.factor(df$BsmtFinType1)
summary(df$BsmtFinType1)
##  ALQ  BLQ  GLQ  LwQ None  Rec  Unf 
##  429  269  849  154   79  288  851
ggplot(data=df, aes(df$BsmtFinType1,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

bsmtfintype1 = levels(df$BsmtFinType1)
bsmtfintype1_df = as.data.frame(bsmtfintype1)
colnames(bsmtfintype1_df) <- "BsmtFinType1"
bsmtfintype1_meds = (1:length(bsmtfintype1))
for (i in (1:length(bsmtfintype1))){
  bsmtfintype1_meds[i] <- median(df$SalePrice[df$BsmtFinType1 == bsmtfintype1[i]], na.rm = TRUE)
}
bsmtfintype1_df$meds = bsmtfintype1_meds
bsmtfintype1_df[ order(bsmtfintype1_df$meds), ]
##   BsmtFinType1     meds
## 5         None 11.53077
## 4          LwQ 11.84223
## 2          BLQ 11.84295
## 6          Rec 11.86358
## 1          ALQ 11.91338
## 7          Unf 11.99381
## 3          GLQ 12.27256
df$BsmtFinType1nums[df$BsmtFinType1 == "None"] <- 0
df$BsmtFinType1nums[df$BsmtFinType1 == "LwQ"] <- 1
df$BsmtFinType1nums[df$BsmtFinType1 == "BLQ"] <- 2
df$BsmtFinType1nums[df$BsmtFinType1 == "Rec"] <- 3
df$BsmtFinType1nums[df$BsmtFinType1 == "ALQ"] <- 4
df$BsmtFinType1nums[df$BsmtFinType1 == "Unf"] <- 5
df$BsmtFinType1nums[df$BsmtFinType1 == "GLQ"] <- 6
ggplot(data=df, aes(as.factor(df$BsmtFinType1nums),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

BsmtFinType2

df$BsmtFinType2 <- as.factor(df$BsmtFinType2)
summary(df$BsmtFinType2)
##  ALQ  BLQ  GLQ  LwQ None  Rec  Unf 
##   52   68   34   87   79  105 2494
ggplot(data=df, aes(df$BsmtFinType2,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

bsmtfintype2 = levels(df$BsmtFinType2)
bsmtfintype2_df = as.data.frame(bsmtfintype2)
colnames(bsmtfintype2_df) <- "BsmtFinType2"
bsmtfintype2_meds = (1:length(bsmtfintype2))
for (i in (1:length(bsmtfintype2))){
 bsmtfintype2_meds[i] <- median(df$SalePrice[df$BsmtFinType2 == bsmtfintype2[i]], na.rm = TRUE)
}
bsmtfintype2_df$meds = bsmtfintype2_meds
bsmtfintype2_df[ order(bsmtfintype2_df$meds), ]
##   BsmtFinType2     meds
## 5         None 11.53077
## 2          BLQ 11.87060
## 6          Rec 11.90995
## 4          LwQ 11.94469
## 7          Unf 12.02575
## 1          ALQ 12.07197
## 3          GLQ 12.22033
df$BsmtFinType2nums[df$BsmtFinType2 == "None"] <- 0
df$BsmtFinType2nums[df$BsmtFinType2 == "BLQ"] <- 1
df$BsmtFinType2nums[df$BsmtFinType2 == "Rec"] <- 2
df$BsmtFinType2nums[df$BsmtFinType2 == "LwQ"] <- 3
df$BsmtFinType2nums[df$BsmtFinType2 == "Unf"] <- 4
df$BsmtFinType2nums[df$BsmtFinType2 == "ALQ"] <- 5
df$BsmtFinType2nums[df$BsmtFinType2 == "GLQ"] <- 6
ggplot(data=df, aes(as.factor(df$BsmtFinType2nums),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

Heating

df$Heating <- as.factor(df$Heating) 
summary(df$Heating)
## Floor  GasA  GasW  Grav  OthW  Wall 
##     1  2874    27     9     2     6
ggplot(data=df, aes(df$Heating,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

df$NoGasHeat <- 1
df$NoGasHeat[df$Heating == 'GasA' | df$Heating == 'GasW'] <- 0

df$Heating <- NULL

HeatingQC

df$HeatingQC <- as.factor(df$HeatingQC) 
summary(df$HeatingQC)
##   Ex   Fa   Gd   Po   TA 
## 1493   92  474    3  857
ggplot(data=df, aes(df$HeatingQC,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

df$BadHeating <- 0
df$BadHeating[df$HeatingQC == "Po"] <- 1
df$BadHeating[df$HeatingQC == "Fa"] <- 1

df$HeatingQCnums[df$HeatingQC == "Po"] <- 1
df$HeatingQCnums[df$HeatingQC == "Fa"] <- 2
df$HeatingQCnums[df$HeatingQC == "TA"] <- 3
df$HeatingQCnums[df$HeatingQC == "Gd"] <- 4
df$HeatingQCnums[df$HeatingQC == "Ex"] <- 5
ggplot(data=df, aes(as.factor(df$HeatingQCnums),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

CentralAir

df$CentralAir <- as.factor(df$CentralAir)
summary(df$CentralAir)
##    N    Y 
##  196 2723
ggplot(data=df, aes(df$CentralAir,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

df$NoCentralAir <- 0
df$NoCentralAir[df$CentralAir == "N"] <- 1
df$CentralAir <- NULL

Electrical

df$Electrical <- as.factor(df$Electrical)
summary(df$Electrical)
## FuseA FuseF FuseP   Mix SBrkr 
##   188    50     8     1  2672
ggplot(data=df, aes(df$Electrical,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

df$HasFuse <- 1
df$HasFuse[df$Electrical == "SBrkr"] <- 0

df$Electrical <- NULL

KitchenQual

df$KitchenQual <- as.factor(df$KitchenQual)
summary(df$KitchenQual)
##   Ex   Fa   Gd   TA 
##  205   70 1151 1493
ggplot(data=df, aes(df$KitchenQual,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

df$KitchenQualnums[df$KitchenQual == "Fa"] <- 1
df$KitchenQualnums[df$KitchenQual == "TA"] <- 2
df$KitchenQualnums[df$KitchenQual == "Gd"] <- 3
df$KitchenQualnums[df$KitchenQual == "Ex"] <- 4

Functional

df$Functional <- as.factor(df$Functional)
summary(df$Functional)
## Maj1 Maj2 Min1 Min2  Mod  Sev  Typ 
##   19    9   65   70   35    2 2719
ggplot(data=df, aes(df$Functional,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

This doesn’t seem to be a good regressor for price. The level “Maj2” has only 9 entries and is statistically dubious.

FireplaceQu

df$FireplaceQu <- as.factor(df$FireplaceQu)
summary(df$FireplaceQu)
##   Ex   Fa   Gd None   Po   TA 
##   43   74  744 1420   46  592
ggplot(data=df, aes(df$FireplaceQu,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

df$BadFireplace <- 0
df$BadFireplace[df$FireplaceQu == "TA"] <- 1
df$BadFireplace[df$FireplaceQu == "Po"] <- 1
df$BadFireplace[df$FireplaceQu == "None"] <- 1

df$FireplaceQunums[df$FireplaceQu == "Po"] <- 1
df$FireplaceQunums[df$FireplaceQu == "None"] <- 2
df$FireplaceQunums[df$FireplaceQu == "Fa"] <- 3
df$FireplaceQunums[df$FireplaceQu == "TA"] <- 4
df$FireplaceQunums[df$FireplaceQu == "Gd"] <- 5
df$FireplaceQunums[df$FireplaceQu == "Ex"] <- 6

GarageType

df$GarageType <- as.factor(df$GarageType)
summary(df$GarageType)
##  2Types  Attchd Basment BuiltIn CarPort  Detchd    None 
##      23    1723      36     186      15     779     157
ggplot(data=df, aes(df$GarageType,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

df$GarageDetach <- 0
df$GarageDetach[df$GarageType == "Detchd"] <- 1
df$GarageDetach[df$GarageType == "CarPort"] <- 1
df$GarageDetach[df$GarageType == "None"] <- 1

df$GarageTypenums[df$GarageType == "None"] <- 0
df$GarageTypenums[df$GarageType == "CarPort"] <- 1
df$GarageTypenums[df$GarageType == "Detchd"] <- 2
df$GarageTypenums[df$GarageType == "Basment"] <- 3
df$GarageTypenums[df$GarageType == "2Types"] <- 4
df$GarageTypenums[df$GarageType == "Attchd"] <- 5
df$GarageTypenums[df$GarageType == "BuiltIn"] <- 6
ggplot(data=df, aes(as.factor(df$GarageTypenums),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

GarageFinish: Interior finish of the garage

df$GarageFinish = as.factor(df$GarageFinish)
summary(df$GarageFinish)
##  Fin None  RFn  Unf 
##  719  157  811 1232
ggplot(data=df, aes(df$GarageFinish,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

df$GarageFinishnums[df$GarageFinish == "None"] <- 0
df$GarageFinishnums[df$GarageFinish == "Unf"] <- 1
df$GarageFinishnums[df$GarageFinish == "RFn"] <- 2
df$GarageFinishnums[df$GarageFinish == "Fin"] <- 3
ggplot(data=df, aes(as.factor(df$GarageFinishnums),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

GarageQual

df$GarageQual <- as.factor(df$GarageQual)
summary(df$GarageQual)
##   Ex   Fa   Gd None   Po   TA 
##    3  124   24  157    5 2606
ggplot(data=df, aes(df$GarageQual,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

df$BadGarageQ <- 0
df$BadGarageQ[df$GarageQual == "Fa"] <- 1
df$BadGarageQ[df$GarageQual == "Po"] <- 1
df$BadGarageQ[df$GarageQual == "None"] <- 1

df$GarageQualnums[df$GarageQual == "Po"] <- 0
df$GarageQualnums[df$GarageQual == "None"] <- 1
df$GarageQualnums[df$GarageQual == "Fa"] <- 2
df$GarageQualnums[df$GarageQual == "Ex"] <- 3
df$GarageQualnums[df$GarageQual == "TA"] <- 4
df$GarageQualnums[df$GarageQual == "Gd"] <- 5
ggplot(data=df, aes(as.factor(df$GarageQualnums),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

GarageCond

df$GarageCond <- as.factor(df$GarageCond)
summary(df$GarageCond)
##   Ex   Fa   Gd None   Po   TA 
##    3   74   15  157   14 2656
ggplot(data=df, aes(df$GarageCond,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

df$BadGarageC <- 0
df$BadGarageC[df$GarageCond == "Fa"] <- 1
df$BadGarageC[df$GarageCond == "Po"] <- 1
df$BadGarageC[df$GarageCond == "None"] <- 1

df$GarageCondnums[df$GarageCond == "None"] <- 0
df$GarageCondnums[df$GarageCond == "Po"] <- 1
df$GarageCondnums[df$GarageCond == "Fa"] <- 2
df$GarageCondnums[df$GarageCond == "Ex"] <- 3
df$GarageCondnums[df$GarageCond == "Gd"] <- 4
df$GarageCondnums[df$GarageCond == "TA"] <- 5
ggplot(data=df, aes(as.factor(df$GarageCondnums),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

PavedDrive

df$PavedDrive <- as.factor(df$PavedDrive)
summary(df$PavedDrive)
##    N    P    Y 
##  216   62 2641
ggplot(data=df, aes(df$PavedDrive,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

PoolQC

df$PoolQC <- as.factor(df$PoolQC)
summary(df$PoolQC)
##   Ex   Fa   Gd None 
##    7    2    4 2906
ggplot(data=df, aes(df$PoolQC,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

df$PoolQCnums[df$PoolQC == "None"] <- 0
df$PoolQCnums[df$PoolQC == "TA"] <- 1
df$PoolQCnums[df$PoolQC == "Gd"] <- 2
df$PoolQCnums[df$PoolQC == "Fa"] <- 3
df$PoolQCnums[df$PoolQC == "Ex"] <- 4

Fence

df$Fence <- as.factor(df$Fence)
summary(df$Fence)
## GdPrv  GdWo MnPrv  MnWw  None 
##   118   112   329    12  2348
ggplot(data=df, aes(df$Fence,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

df$Fencenums[df$Fence == "MnWw"] <- 1
df$Fencenums[df$Fence == "MnPrv"] <- 2
df$Fencenums[df$Fence == "GdWo"] <- 3
df$Fencenums[df$Fence == "None"] <- 4
df$Fencenums[df$Fence == "GdPrv"] <- 5

MiscFeature

df$MiscFeature <- as.factor(df$MiscFeature)
summary(df$MiscFeature)
## Gar2 None Othr Shed TenC 
##    5 2814    4   95    1
ggplot(data=df, aes(df$MiscFeature,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

MoSold: Month Sold (MM)

This is coded numeric, but might be useful to treat as a categorical variable.

df$MoSoldcat <- as.factor(df$MoSold)
summary(df$MoSoldcat)
##   1   2   3   4   5   6   7   8   9  10  11  12 
## 122 133 232 279 394 503 446 233 158 173 142 104
mosold = levels(df$MoSoldcat)
mosold_df = as.data.frame(mosold)
colnames(mosold_df) <- "MoSold"
mosold_meds = (1:length(mosold))
for (i in (1:length(mosold))){
  mosold_meds[i] <- median(df$SalePrice[df$MoSoldcat == mosold[i]], na.rm = TRUE)
}
mosold_df$meds = mosold_meds
for (i in (1:length(mosold))){
 mosold_df$count[i] <- length(df$MoSoldcat[df$MoSoldcat == mosold[i]])
}
mosold_df[ order(mosold_df$meds), ]
##    MoSold     meds count
## 4       4 11.89819   279
## 1       1 11.92918   122
## 5       5 11.94471   394
## 10     10 11.97666   173
## 6       6 12.00151   503
## 7       7 12.01488   446
## 3       3 12.01516   232
## 11     11 12.06105   142
## 2       2 12.06249   133
## 8       8 12.06939   233
## 12     12 12.08954   104
## 9       9 12.13296   158
mosold_df[ order(mosold_df$count), ]
##    MoSold     meds count
## 12     12 12.08954   104
## 1       1 11.92918   122
## 2       2 12.06249   133
## 11     11 12.06105   142
## 9       9 12.13296   158
## 10     10 11.97666   173
## 3       3 12.01516   232
## 8       8 12.06939   233
## 4       4 11.89819   279
## 5       5 11.94471   394
## 7       7 12.01488   446
## 6       6 12.00151   503
df$MoSoldnums[df$MoSoldcat == "4"] <- 1
df$MoSoldnums[df$MoSoldcat == "1"] <- 2
df$MoSoldnums[df$MoSoldcat == "5"] <- 3
df$MoSoldnums[df$MoSoldcat == "10"] <- 4
df$MoSoldnums[df$MoSoldcat == "6"] <- 5
df$MoSoldnums[df$MoSoldcat == "7"] <- 6
df$MoSoldnums[df$MoSoldcat == "3"] <- 7
df$MoSoldnums[df$MoSoldcat == "11"] <- 8
df$MoSoldnums[df$MoSoldcat == "2"] <- 9
df$MoSoldnums[df$MoSoldcat == "8"] <- 10
df$MoSoldnums[df$MoSoldcat == "12"] <- 11
df$MoSoldnums[df$MoSoldcat == "9"] <- 12
ggplot(data=df, aes(as.factor(df$MoSoldnums),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

df$MoSoldcat <- NULL

df$SummerSale <- 0
df$SummerSale[df$MoSold == 5 | df$MoSold == 6 | df$MoSold == 7] <- 1

YrSold: Year Sold (YYYY)

df$YrSoldcat <- as.factor(df$YrSold)
ggplot(data=df, aes(df$YrSoldcat,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

SaleType: Type of sale

df$SaleType <- as.factor(df$SaleType)
summary(df$SaleType)
##   COD   Con ConLD ConLI ConLw   CWD   New   Oth    WD 
##    87     5    26     9     8    12   239     7  2526
ggplot(data=df, aes(df$SaleType,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

saletype = levels(df$SaleType)
saletype_df = as.data.frame(saletype)
colnames(saletype_df) <- "SaleType"
saletype_meds = (1:length(saletype))
for (i in (1:length(saletype))){
  saletype_meds[i] <- median(df$SalePrice[df$SaleType == saletype[i]], na.rm = TRUE)
}
saletype_df$meds = saletype_meds
saletype_df[ order(saletype_df$meds), ]
##   SaleType     meds
## 8      Oth 11.66178
## 4    ConLI 11.73607
## 1      COD 11.84223
## 3    ConLD 11.84940
## 5    ConLw 11.87757
## 9       WD 11.97035
## 6      CWD 12.13473
## 7      New 12.41897
## 2      Con 12.48391
df$SaleTypenums[df$SaleType == "Oth"] <- 1
df$SaleTypenums[df$SaleType == "ConLI"] <- 2
df$SaleTypenums[df$SaleType == "COD"] <- 3
df$SaleTypenums[df$SaleType == "ConLD"] <- 4
df$SaleTypenums[df$SaleType == "ConLw"] <- 5
df$SaleTypenums[df$SaleType == "WD"] <- 6
df$SaleTypenums[df$SaleType == "CWD"] <- 7
df$SaleTypenums[df$SaleType == "New"] <- 8
df$SaleTypenums[df$SaleType == "Con"] <- 9
ggplot(data=df, aes(as.factor(df$SaleTypenums),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

Bin ConLI CWD New together, since they seem to compare to a premium price:

df$SaleTypePrem <- 0
df$SaleTypePrem[df$SaleType == "ConLI"] <- 1    
df$SaleTypePrem[df$SaleType == "CWD"]  <- 1     
df$SaleTypePrem[df$SaleType == "New"]  <- 1 

SaleCondition: Condition of sale

df$SaleCondition <- as.factor(df$SaleCondition)
summary(df$SaleCondition)
## Abnorml AdjLand  Alloca  Family  Normal Partial 
##     190      12      24      46    2402     245
ggplot(data=df, aes(df$SaleCondition,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

salecondition = levels(df$SaleCondition)
salecondition_df = as.data.frame(salecondition)
colnames(salecondition_df) <- "SaleCondition"
salecondition_meds = (1:length(salecondition))
for (i in (1:length(salecondition))){
  salecondition_meds[i] <- median(df$SalePrice[df$SaleCondition == salecondition[i]], na.rm = TRUE)
}
salecondition_df$meds = salecondition_meds
salecondition_df[ order(salecondition_df$meds), ]
##   SaleCondition     meds
## 2       AdjLand 11.52926
## 1       Abnorml 11.77529
## 4        Family 11.85245
## 3        Alloca 11.90533
## 5        Normal 11.98293
## 6       Partial 12.40738
df$SaleConditionnums[df$SaleCondition == "AdjLand"] <- 1
df$SaleConditionnums[df$SaleCondition == "Abnorml"] <- 2
df$SaleConditionnums[df$SaleCondition == "Family"] <- 3
df$SaleConditionnums[df$SaleCondition == "Alloca"] <- 4
df$SaleConditionnums[df$SaleCondition == "Normal"] <- 5
df$SaleConditionnums[df$SaleCondition == "Partial"] <- 6
ggplot(data=df, aes(as.factor(df$SaleConditionnums),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

Numerical Features

We want to examine the base numeric features and their relationship with the SalePrice and with each other. We will look for transformations to new features that are more suitable for regression. We also examine potential outlier and determine the effect of removing observations from the dataset.

These are the base numeric features

base_numerics <- c('Id', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 
                   'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'X1stFlrSF', 
                   'X2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 
                   'HalfBath', 'BedroomAbvGr','KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', #'GarageYrBlt', 
                   'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 'X3SsnPorch',
                   'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'SalePrice')

# GarageYrBlt is set aside for now since the 'None' entries make it non-numeric

train_nums <- df[1:train_size, base_numerics]

# str(train_nums)

correlations <- cor(train_nums[,-1])


# correlations
row_indic <- apply(correlations, 1, function(x) sum(x > 0.3 | x < -0.3) > 1)

correlations <- correlations[row_indic ,row_indic ]
corrplot(correlations, method="square")

correlations[order(correlations[,'SalePrice'], decreasing = TRUE),'SalePrice']
##     SalePrice   OverallQual     GrLivArea    GarageCars    GarageArea 
##    1.00000000    0.81718442    0.70092665    0.68062481    0.65088756 
##   TotalBsmtSF     X1stFlrSF      FullBath     YearBuilt  YearRemodAdd 
##    0.61213398    0.59698105    0.59477054    0.58657024    0.56560783 
##  TotRmsAbvGrd    Fireplaces    MasVnrArea    BsmtFinSF1   LotFrontage 
##    0.53442220    0.48944943    0.42677562    0.37202307    0.34867197 
##    WoodDeckSF   OpenPorchSF     X2ndFlrSF      HalfBath       LotArea 
##    0.33413507    0.32105297    0.31929998    0.31398225    0.25731989 
##  BsmtFullBath     BsmtUnfSF  BedroomAbvGr   OverallCond EnclosedPorch 
##    0.23622411    0.22198505    0.20904368   -0.03686799   -0.14905028

We will consider these features in order of correlation with the response.

OverallQual

ggplot(data=df, aes(df$OverallQual,df$SalePrice)) + geom_point() + geom_smooth(method='lm')
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).

OverallQual.mod = lm(SalePrice ~ OverallQual, data=train_nums)
LogOverallQual.mod = lm(SalePrice ~ log(OverallQual+1), data=train_nums)

summary(OverallQual.mod)
## 
## Call:
## lm(formula = SalePrice ~ OverallQual, data = train_nums)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.06831 -0.12974  0.01309  0.13332  0.92438 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.58444    0.02727  388.18   <2e-16 ***
## OverallQual  0.23603    0.00436   54.14   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2303 on 1458 degrees of freedom
## Multiple R-squared:  0.6678, Adjusted R-squared:  0.6676 
## F-statistic:  2931 on 1 and 1458 DF,  p-value: < 2.2e-16
summary(LogOverallQual.mod)
## 
## Call:
## lm(formula = SalePrice ~ log(OverallQual + 1), data = train_nums)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.03898 -0.13573  0.01371  0.13573  0.97284 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           8.94631    0.06034  148.26   <2e-16 ***
## log(OverallQual + 1)  1.58621    0.03093   51.28   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2386 on 1458 degrees of freedom
## Multiple R-squared:  0.6433, Adjusted R-squared:  0.6431 
## F-statistic:  2630 on 1 and 1458 DF,  p-value: < 2.2e-16

OverallQual isn’t especially skewed and a log-transform doesn’t appreciably improve the linear fit.

GrLivArea: Above grade (ground) living area square feet

ggplot(data=df, aes(df$GrLivArea)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=df, aes(df$GrLivArea,df$SalePrice)) + geom_point()+
  geom_smooth(method='lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).

ggplot(data=df, aes(log(df$GrLivArea+1),df$SalePrice)) + geom_point()+
  geom_smooth(method='lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).

## Warning: Removed 1459 rows containing missing values (geom_point).

df$LogGrLivArea <- log(df$GrLivArea+1)

GrLivArea is skewed, but taking the log leads to a fairly linear trend. There are a series of outliers at large values of Area, as well as at low SalePrice. These are:

which(df$GrLivArea[1:train_size] > 4000) 
## [1]  524  692 1183 1299
which(df$SalePrice[1:train_size] < 10.7)
## [1]  31 496 534 917 969

Before continuing to examine other promising regressors, we can generate a new feature based on binning roughly around the mean

df$GrLivAreabin <- 0
df$GrLivAreabin[df$GrLivArea < 800] <- 1
df$GrLivAreabin[df$GrLivArea >= 800 & df$GrLivArea < 1000] <- 2
df$GrLivAreabin[df$GrLivArea >= 1000 & df$GrLivArea < 1200] <- 3
df$GrLivAreabin[df$GrLivArea >= 1200 & df$GrLivArea < 1400] <- 4
df$GrLivAreabin[df$GrLivArea >= 1400 & df$GrLivArea < 1600] <- 5
df$GrLivAreabin[df$GrLivArea >= 1600 & df$GrLivArea < 1800] <- 6
df$GrLivAreabin[df$GrLivArea >= 1800 & df$GrLivArea < 2000] <- 7
df$GrLivAreabin[df$GrLivArea >= 2000 & df$GrLivArea < 2200] <- 8
df$GrLivAreabin[df$GrLivArea >= 2200 & df$GrLivArea < 2400] <- 9
df$GrLivAreabin[df$GrLivArea >= 2400] <- 10

GarageCars and Area

ggplot(data=df, aes(df$GarageCars,df$SalePrice)) + geom_point()+
  geom_smooth(method='lm', formula = y ~ x)
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).

ggplot(data=df, aes(df$GarageArea,df$SalePrice)) + geom_point()+
  geom_smooth(method='lm', formula = y ~ x)
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).

## Warning: Removed 1459 rows containing missing values (geom_point).

which(df$GarageArea[1:train_size] > 1240)
## [1]  582 1062 1191 1299

There are around 160 houses (81 in the training set) without a garage, but the distribution appears to be such that they don’t put too much pressure on the fit. It might be worth doing some statistics on the subset of houses w/garages. Note that only one of the outliers here, 1299, overlaps with the GrLivArea outliers.

df$NoGarage <- 0
df$NoGarage[df$GarageArea == 0] <- 1

TotalBsmtSF

df$NoBasement <- 0
df$NoBasement[df$TotalBsmtSF == 0] <- 1

ggplot(data=df, aes(df$TotalBsmtSF,df$SalePrice)) + geom_point()+
  geom_smooth(method='lm', formula = y ~ x)
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).

ggplot(data=df, aes(log(df$TotalBsmtSF+1),df$SalePrice)) + geom_point()+
  geom_smooth(method='lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).

## Warning: Removed 1459 rows containing missing values (geom_point).

which(df$TotalBsmtSF[1:train_size] > 3000)
## [1]  333  441  497  524 1299

Here the log is particular bad because of the accumulation of houses with no basement. The set of houses with no basement in the training set has around 80 houses, which is probably too small to get good results. It might be useful to do fits on the set of ~ 1420 houses with basement in the training set.

Finally, we note that two outliers, 524 and 1299, overlap with the GrLivArea set.

We can create a new feature by adding GrLivArea and TotalBsmtSF.

df$TotalHouseArea <- df$GrLivArea + df$TotalBsmtSF

ggplot(data=df, aes(df$TotalHouseArea,df$SalePrice)) + geom_point()+
  geom_smooth(method='lm', formula = y ~ x)
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).

ggplot(data=df, aes(log(df$TotalHouseArea+1),df$SalePrice)) + geom_point()+
  geom_smooth(method='lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).

## Warning: Removed 1459 rows containing missing values (geom_point).

df$TotalHouseArea <- log(df$TotalHouseArea+1)

TotalHouseArea.mod = lm(SalePrice ~ TotalHouseArea + sqrt(TotalHouseArea), data=df)
summary(TotalHouseArea.mod)
## 
## Call:
## lm(formula = SalePrice ~ TotalHouseArea + sqrt(TotalHouseArea), 
##     data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.99580 -0.11295  0.02835  0.15434  0.71377 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           39.8814     7.4339   5.365 9.41e-08 ***
## TotalHouseArea         5.6630     0.9614   5.890 4.78e-09 ***
## sqrt(TotalHouseArea) -25.7977     5.3468  -4.825 1.55e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.236 on 1457 degrees of freedom
##   (1459 observations deleted due to missingness)
## Multiple R-squared:  0.6514, Adjusted R-squared:  0.6509 
## F-statistic:  1361 on 2 and 1457 DF,  p-value: < 2.2e-16
df$TotalHouseArea <- as.numeric(TotalHouseArea.mod$coefficients[1]) * df$TotalHouseArea + 
  as.numeric(TotalHouseArea.mod$coefficients[2]) * sqrt(df$TotalHouseArea) 

ggplot(data=df, aes(df$TotalHouseArea,df$SalePrice)) + geom_point()+
  geom_smooth(method='lm', formula = y ~ x)
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).

## Warning: Removed 1459 rows containing missing values (geom_point).

1stFlrSF: First Floor square feet

ggplot(data=df, aes(df$X1stFlrSF,df$SalePrice)) + geom_point()+
  geom_smooth(method='lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).

ggplot(data=df, aes(log(df$X1stFlrSF+1),df$SalePrice)) + geom_point()+
  geom_smooth(method='lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).

## Warning: Removed 1459 rows containing missing values (geom_point).

The logarithm helps with the skewness, but there is some significant nonlinearity left. We can build a new feature to remove this:

df$X1stLin <- log(df$X1stFlrSF+1)

X1stLin.mod = lm(SalePrice ~ X1stLin + sqrt(X1stLin), data=df)
summary(X1stLin.mod)
## 
## Call:
## lm(formula = SalePrice ~ X1stLin + sqrt(X1stLin), data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.34019 -0.18760 -0.03179  0.23515  0.91934 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept)     27.624     11.053   2.499   0.0126 *
## X1stLin          3.759      1.578   2.383   0.0173 *
## sqrt(X1stLin)  -15.849      8.353  -1.897   0.0580 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3167 on 1457 degrees of freedom
##   (1459 observations deleted due to missingness)
## Multiple R-squared:  0.3724, Adjusted R-squared:  0.3715 
## F-statistic: 432.2 on 2 and 1457 DF,  p-value: < 2.2e-16
df$X1stLin <- as.numeric(X1stLin.mod$coefficients[1]) * df$X1stLin + 
  as.numeric(X1stLin.mod$coefficients[2]) * sqrt(df$X1stLin) 

ggplot(data=df, aes(df$X1stLin,df$SalePrice)) + geom_point()+
  geom_smooth(method='lm', formula = y ~ x)
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).

FullBath

ggplot(data=df, aes(df$FullBath,df$SalePrice)) + geom_point()+
  geom_smooth(method='lm', formula = y ~ x)
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).

We would like to engineer a new feature that takes all of the bathrooms into account. We’ll get weights from a simple regression:

bathmodel = lm(SalePrice ~ FullBath + HalfBath + BsmtFullBath + BsmtHalfBath, data = df)
summary(bathmodel)
## 
## Call:
## lm(formula = SalePrice ~ FullBath + HalfBath + BsmtFullBath + 
##     BsmtHalfBath, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.14743 -0.16214  0.00057  0.16762  1.03304 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  11.18402    0.02423 461.666  < 2e-16 ***
## FullBath      0.42365    0.01372  30.885  < 2e-16 ***
## HalfBath      0.19400    0.01497  12.956  < 2e-16 ***
## BsmtFullBath  0.22497    0.01457  15.436  < 2e-16 ***
## BsmtHalfBath  0.12204    0.03165   3.856  0.00012 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2849 on 1455 degrees of freedom
##   (1459 observations deleted due to missingness)
## Multiple R-squared:  0.4929, Adjusted R-squared:  0.4915 
## F-statistic: 353.5 on 4 and 1455 DF,  p-value: < 2.2e-16
as.numeric(bathmodel$coefficients[2])
## [1] 0.4236537
df$TotalBath <- as.numeric(bathmodel$coefficients[2]) * df$FullBath + 
  as.numeric(bathmodel$coefficients[3]) * df$HalfBath + 
  as.numeric(bathmodel$coefficients[4]) * df$BsmtFullBath + 
  as.numeric(bathmodel$coefficients[5]) * df$BsmtHalfBath 

ggplot(data=df, aes(df$TotalBath,df$SalePrice)) + geom_point()+
  geom_smooth(method='lm', formula = y ~ x)
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).

This data frame has the testing data plus whatever transformations we’ve made so far. Now we should combine some numerical variables before we perform any scalings or standardization.

YearBuilt

ggplot(data=df, aes(df$YearBuilt,df$SalePrice)) + geom_point() + 
  geom_smooth(method = 'lm', formula = y ~ x + x^2 + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).

ggplot(data=df, aes(log(df$YearBuilt+1),df$SalePrice)) + geom_point() + 
  geom_smooth(method = 'lm', formula = y ~ x + x^2 + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).

## Warning: Removed 1459 rows containing missing values (geom_point).

It might be easier to interpret the age of the house at the time of sale as a feature, this is

df$Age <- df$YrSold - df$YearBuilt 

ggplot(data=df, aes(df$Age)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=df, aes(df$Age,df$SalePrice)) + geom_point() + 
  geom_smooth(method = 'lm', formula = y ~ x  + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).

ggplot(data=df, aes(log(df$Age+2),df$SalePrice)) + geom_point() + 
  geom_smooth(method = 'lm', formula = y ~ x  + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).

## Warning: Removed 1459 rows containing missing values (geom_point).

ggplot(data=df, aes(df$Age)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

It appears that the logarithm helps smooth the distribution out. Note that some houses were sold before they were built, so we have to take log(x+2). Let’s build a linearized feature:

age.mod = lm(data=df, SalePrice ~ log(Age + 2) + sqrt(log(Age+2)))
summary(age.mod)
## 
## Call:
## lm(formula = SalePrice ~ log(Age + 2) + sqrt(log(Age + 2)), data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.34757 -0.18443 -0.01755  0.16385  1.45468 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        11.84951    0.14739  80.395  < 2e-16 ***
## log(Age + 2)       -0.57193    0.06341  -9.020  < 2e-16 ***
## sqrt(log(Age + 2))  1.14003    0.19833   5.748  1.1e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3118 on 1457 degrees of freedom
##   (1459 observations deleted due to missingness)
## Multiple R-squared:  0.3916, Adjusted R-squared:  0.3908 
## F-statistic: 468.9 on 2 and 1457 DF,  p-value: < 2.2e-16
df$AgeLin <- as.numeric(age.mod$coefficients[2]) * log(df$Age + 2) +
  as.numeric(age.mod$coefficients[3]) * sqrt(log(df$Age+2))

ggplot(data=df, aes(df$AgeLin,df$SalePrice)) + geom_point() + 
  geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).

We should also create some new features, including some bins

# very new sales
df$VNewSale <- 0
df$VNewSale[df$Age <= 1] <- 1

# 10 year bins
df$Agebin <- 0
df$Agebin[df$Age < 10] <- 1
df$Agebin[df$Age >= 10 & df$Age < 20] <- 2
df$Agebin[df$Age >= 20 & df$Age < 30] <- 3
df$Agebin[df$Age >= 30 & df$Age < 40] <- 4
df$Agebin[df$Age >= 40 & df$Age < 50] <- 5
df$Agebin[df$Age >= 50 & df$Age < 60] <- 6
df$Agebin[df$Age >= 60 & df$Age < 70] <- 7
df$Agebin[df$Age >= 70 & df$Age < 80] <- 8
df$Agebin[df$Age >= 80 & df$Age < 90] <- 9
df$Agebin[df$Age >= 90 & df$Age < 100] <- 10
df$Agebin[df$Age >= 100] <- 11

# note whether a house was remodeled
df$Remodeled <- 0
df$Remodeled[which(df$YearRemodAdd != df$YearBuilt)] <- 1

# have to get rid of YearBuilt because it is collinear with Age
df$YearBuilt <- NULL

YearRemodAdd: Remodel date (same as construction date if no remodeling or additions)

Part of the correlation with SalePrice is probably due to the identification with construction date for houses that weren’t remodeled. We will again convert to an age variable.

df$RemodAge <- df$YrSold - df$YearRemodAdd

# sold soon after remodel?
df$RemodSale <- 0
df$RemodSale[df$RemodAge < 1] <- 1

df$YearRemodAdd <- NULL

# less than 1/2 of houses were remodeled
summary(as.factor(df$Remodeled))
##    0    1 
## 1560 1359
ggplot(data=df, aes(df$RemodAge)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=df, aes(df$RemodAge,df$SalePrice)) + geom_point() + geom_smooth(method = 'lm')
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).

ggplot(data=df, aes(df$RemodAge,df$SalePrice)) + geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x +  sqrt(x) )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning in sqrt(x): NaNs produced

## Warning in sqrt(x): NaNs produced
## Warning: Removed 1459 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_smooth).

ggplot(data=df, aes(log(df$RemodAge+3),df$SalePrice)) + geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).

remodage.mod = lm(SalePrice ~ log(RemodAge+3) + sqrt(log(RemodAge+3)), data = df)
summary(remodage.mod)
## 
## Call:
## lm(formula = SalePrice ~ log(RemodAge + 3) + sqrt(log(RemodAge + 
##     3)), data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.36637 -0.19174  0.00784  0.18951  1.43793 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             11.55795    0.23777  48.611  < 2e-16 ***
## log(RemodAge + 3)       -0.66577    0.09854  -6.756 2.04e-11 ***
## sqrt(log(RemodAge + 3))  1.42289    0.31129   4.571 5.27e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3319 on 1457 degrees of freedom
##   (1459 observations deleted due to missingness)
## Multiple R-squared:  0.3105, Adjusted R-squared:  0.3096 
## F-statistic: 328.1 on 2 and 1457 DF,  p-value: < 2.2e-16
df$RemodAgeLin <- as.numeric(remodage.mod$coefficients[2]) * log(df$RemodAge+3) +
  as.numeric(remodage.mod$coefficients[3]) * sqrt(log(df$RemodAge+3)) 

ggplot(data=df, aes(df$RemodAgeLin,df$SalePrice)) + geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).

## Warning: Removed 1459 rows containing missing values (geom_point).

TotRmsAbvGrd: Total rooms above grade (does not include bathrooms)

ggplot(data=df, aes(df$TotRmsAbvGrd,df$SalePrice)) + geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).

Fireplaces: Number of fireplaces

ggplot(data=df, aes(df$Fireplaces,df$SalePrice)) + geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).

MasVnrArea

ggplot(data=df, aes(df$MasVnrArea,df$SalePrice)) + geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).

ggplot(data=df, aes(log(df$MasVnrArea+1),df$SalePrice)) + geom_point()+ 
  geom_smooth(method='lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).

## Warning: Removed 1459 rows containing missing values (geom_point).

masvnr.mod = lm(data=df, SalePrice ~ log(MasVnrArea+1) + sqrt(log(MasVnrArea+1)))
summary(masvnr.mod)
## 
## Call:
## lm(formula = SalePrice ~ log(MasVnrArea + 1) + sqrt(log(MasVnrArea + 
##     1)), data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.43848 -0.23563 -0.01423  0.24198  1.62242 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               11.89872    0.01222 973.346  < 2e-16 ***
## log(MasVnrArea + 1)        0.25532    0.03802   6.715 2.69e-11 ***
## sqrt(log(MasVnrArea + 1)) -0.45110    0.08869  -5.086 4.13e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3608 on 1457 degrees of freedom
##   (1459 observations deleted due to missingness)
## Multiple R-squared:  0.1853, Adjusted R-squared:  0.1842 
## F-statistic: 165.7 on 2 and 1457 DF,  p-value: < 2.2e-16
df$MasVnrAreaLin =  as.numeric(masvnr.mod$coefficients[2]) * (log(df$MasVnrArea+1)) + 
  as.numeric(masvnr.mod$coefficients[3]) * sqrt(log(df$MasVnrArea+1))     

ggplot(data=df, aes(df$MasVnrAreaLin,df$SalePrice)) + geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).

## Warning: Removed 1459 rows containing missing values (geom_point).

BsmtFinSF1

ggplot(data=df, aes(df$BsmtFinSF1,df$SalePrice)) + geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).

ggplot(data=df, aes(log(df$BsmtFinSF1+1),df$SalePrice)) + geom_point()+ 
  geom_smooth(method='lm', formula = y ~ x + sqrt(x) )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).

## Warning: Removed 1459 rows containing missing values (geom_point).

This isn’t a great regressor. Later, we will create a new feature by adding GrLivArea, BsmtFinSF1 and BsmtFinSF2.

LotFrontage: Linear feet of street connected to property

We had to impute a large number of these values, so we should be suspicious.

ggplot(data=df, aes(df$LotFrontage)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=df, aes(df$LotFrontage,df$SalePrice)) + geom_point() + 
  geom_smooth(method='lm', formula = y ~ x + x^2 + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).

which(df$LotFrontage > 200)
## [1]  935 1299
# only 1299 is in the GrLivArea outliers

df$LotFrontagebin <- 0
df$LotFrontagebin[df$LotFrontage < 55] <- 1
df$LotFrontagebin[df$LotFrontage >= 55 & df$LotFrontage < 65] <- 2
df$LotFrontagebin[df$LotFrontage >= 65 & df$LotFrontage < 75] <- 3
df$LotFrontagebin[df$LotFrontage >= 75 & df$LotFrontage < 85] <- 4
df$LotFrontagebin[df$LotFrontage >= 85 & df$LotFrontage < 95] <- 5
df$LotFrontagebin[df$LotFrontage >= 95 & df$LotFrontage < 105] <- 6
df$LotFrontagebin[df$LotFrontage >= 105] <- 7
ggplot(data=df, aes(as.factor(df$LotFrontagebin),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

Again not a great regressor. We might choose to drop this and keep the binned feature. Also note that 1299 was also an outlier here.

Porches and Decks

Some consolidation might be in order here.

porch.mod = lm(data=df, SalePrice ~ WoodDeckSF + OpenPorchSF + EnclosedPorch + X3SsnPorch + ScreenPorch)
summary(porch.mod)
## 
## Call:
## lm(formula = SalePrice ~ WoodDeckSF + OpenPorchSF + EnclosedPorch + 
##     X3SsnPorch + ScreenPorch, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.23450 -0.21007 -0.00246  0.20779  1.37074 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.184e+01  1.427e-02 829.558  < 2e-16 ***
## WoodDeckSF     1.019e-03  7.448e-05  13.678  < 2e-16 ***
## OpenPorchSF    1.732e-03  1.401e-04  12.367  < 2e-16 ***
## EnclosedPorch -4.546e-04  1.531e-04  -2.968  0.00305 ** 
## X3SsnPorch     9.296e-04  3.147e-04   2.954  0.00319 ** 
## ScreenPorch    8.594e-04  1.668e-04   5.151 2.95e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3516 on 1454 degrees of freedom
##   (1459 observations deleted due to missingness)
## Multiple R-squared:  0.2277, Adjusted R-squared:  0.225 
## F-statistic: 85.74 on 5 and 1454 DF,  p-value: < 2.2e-16
df$DeckPorch <- 1.019e-03* df$WoodDeckSF + 1.732e-03 * df$OpenPorchSF - 4.546e-04 *  df$EnclosedPorch + 
  9.296e-04 * df$X3SsnPorch + 8.594e-04 * df$ScreenPorch

ggplot(data=df, aes(df$DeckPorch,df$SalePrice)) + geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning in sqrt(x): NaNs produced

## Warning in sqrt(x): NaNs produced
## Warning: Removed 1459 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing missing values (geom_smooth).

summary(df$DeckPorch)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.2232  0.0000  0.1625  0.1835  0.2927  1.4510
ggplot(data=df, aes(log(df$DeckPorch+3),df$SalePrice)) + geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).

deckporch.mod = lm(data=df, SalePrice ~ log(df$DeckPorch+3) + sqrt(log(df$DeckPorch+3)))
summary(deckporch.mod)
## 
## Call:
## lm(formula = SalePrice ~ log(df$DeckPorch + 3) + sqrt(log(df$DeckPorch + 
##     3)), data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.96368 -0.20708 -0.00681  0.20375  1.34252 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  -50.105      9.897  -5.062 4.67e-07 ***
## log(df$DeckPorch + 3)        -46.146      8.398  -5.495 4.61e-08 ***
## sqrt(log(df$DeckPorch + 3))  107.433     18.237   5.891 4.76e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3457 on 1457 degrees of freedom
##   (1459 observations deleted due to missingness)
## Multiple R-squared:  0.2521, Adjusted R-squared:  0.251 
## F-statistic: 245.5 on 2 and 1457 DF,  p-value: < 2.2e-16
df$DeckPorchLin <- as.numeric(deckporch.mod$coefficients[2]) * sqrt(log(df$DeckPorch+3)) + as.numeric(deckporch.mod$coefficients[3]) * log(df$DeckPorch+3)

df$DeckPorch <- NULL

ggplot(data=df, aes(df$DeckPorchLin,df$SalePrice)) + geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).

## Warning: Removed 1459 rows containing missing values (geom_point).

df$HasWoodDeck <-0
df$HasWoodDeck[df$WoodDeckSF != 0] <- 1

df$HasOpenPorch <-0
df$HasOpenPorch[df$OpenPorchSF != 0] <- 1

df$HasEnclosedPorch <-0
df$HasEnclosedPorch[df$EnclosedPorch != 0] <- 1

df$Has3SsnPorch <-0
df$Has3SsnPorch[df$X3SsnPorch != 0] <- 1

df$HasScreenPorch <-0
df$HasScreenPorch[df$ScreenPorch != 0] <- 1

df$HasDeckorPorch <- 0 
df$HasDeckorPorch <- (df$HasWoodDeck == 1 | df$HasOpenPorch == 1 | df$HasEnclosedPorch == 1 | 
                        df$Has3SsnPorch == 1 | df$HasScreenPorch)*1

X2ndFlrSF

ggplot(data=df, aes(df$X2ndFlrSF,df$SalePrice)) + geom_point()+
  geom_smooth(method='lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).

ggplot(data=df, aes(log(df$X2ndFlrSF+1),df$SalePrice)) + geom_point()+
  geom_smooth(method='lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).

## Warning: Removed 1459 rows containing missing values (geom_point).

df$X2ndLin <- log(df$X2ndFlrSF+1)

X2ndLin.mod = lm(SalePrice ~ X2ndLin + sqrt(X2ndLin), data=df)
summary(X2ndLin.mod)
## 
## Call:
## lm(formula = SalePrice ~ X2ndLin + sqrt(X2ndLin), data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.51146 -0.19843 -0.01829  0.17561  1.35222 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   11.97170    0.01240  965.44   <2e-16 ***
## X2ndLin        1.40976    0.07893   17.86   <2e-16 ***
## sqrt(X2ndLin) -3.58378    0.20368  -17.59   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.357 on 1457 degrees of freedom
##   (1459 observations deleted due to missingness)
## Multiple R-squared:  0.2022, Adjusted R-squared:  0.2011 
## F-statistic: 184.6 on 2 and 1457 DF,  p-value: < 2.2e-16
df$X2ndLin <- as.numeric(X2ndLin.mod$coefficients[2]) * df$X2ndLin +
  as.numeric(X2ndLin.mod$coefficients[3]) * sqrt(df$X2ndLin)

ggplot(data=df, aes(df$X2ndLin,df$SalePrice)) + geom_point()+
  geom_smooth(method='lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).

## Warning: Removed 1459 rows containing missing values (geom_point).

LotArea: Lot size in square feet

summary(df$LotArea)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1300    7478    9453   10170   11570  215200
ggplot(data=df, aes(df$LotArea)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=df, aes(df$LotArea,df$SalePrice)) + geom_point() + geom_smooth(method = 'lm')
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).

which(df$LotArea[1:train_size] > 50000)
##  [1]   54  250  314  336  385  452  458  707  770 1299 1397
# 1299 is an outlier again

ggplot(data=df, aes(df$LotArea,df$SalePrice)) + geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).

## Warning: Removed 1459 rows containing missing values (geom_point).

# bin
df$LotAreabin <- 0
df$LotAreabin[df$LotArea < 3000] <- 1
df$LotAreabin[df$LotArea >= 3000 & df$LotArea < 4500] <- 2
df$LotAreabin[df$LotArea >= 4500 & df$LotArea < 6000] <- 3
df$LotAreabin[df$LotArea >= 6000 & df$LotArea < 7500] <- 4
df$LotAreabin[df$LotArea >= 7500 & df$LotArea < 9000] <- 5
df$LotAreabin[df$LotArea >= 9000 & df$LotArea < 10500] <- 6
df$LotAreabin[df$LotArea >= 9000 & df$LotArea < 10500] <- 7
df$LotAreabin[df$LotArea >= 10500 & df$LotArea < 12000] <- 8
df$LotAreabin[df$LotArea >= 12000 & df$LotArea < 13500] <- 9
df$LotAreabin[df$LotArea >= 13500] <- 10
ggplot(data=df, aes(as.factor(df$LotAreabin),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).

The presence of so many outliers makes this a poor regressor, though it suggests 1299 is again a global outlier.

OverallCond

ggplot(data=df, aes(df$OverallCond)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=df, aes(df$OverallCond,df$SalePrice)) + geom_point() + 
  geom_smooth(method = 'lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).

ggplot(data=df, aes(log(df$OverallCond+1),df$SalePrice)) + geom_point() + 
  geom_smooth(method = 'lm', formula = y ~ x + sqrt(x) )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).

## Warning: Removed 1459 rows containing missing values (geom_point).

If anything it seems like higher values of OverallCond don’t correlate with higher sale prices. So the market is happy with a certain level of condition and doesn’t value anything rated higher with a particular premium. Our OverallCondBad boolean might capture the actual variation due to the condition rating.

Additional Feature Combinations

There are several other logical combinations of features that we want to explore.

AllSizesSum

This is a feature introduced by Choudhary. It adds lengths and areas, but we’ll include it along with several variations.

df$AllSizesSum <- df$LotFrontage + df$LotArea + df$MasVnrArea + df$BsmtFinSF1 + df$BsmtFinSF2 + df$BsmtUnfSF +
                 df$TotalBsmtSF + df$X1stFlrSF + df$X2ndFlrSF + df$GrLivArea + df$GarageArea + df$WoodDeckSF + 
                 df$OpenPorchSF + df$EnclosedPorch + df$X3SsnPorch + df$ScreenPorch + df$LowQualFinSF + df$PoolArea

ggplot(data=df, aes(df$AllSizesSum,df$SalePrice)) + geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).

ggplot(data=df, aes(log(df$AllSizesSum),df$SalePrice)) + geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).

## Warning: Removed 1459 rows containing missing values (geom_point).

which(df$AllSizesSum > 100000)
## [1] 250 314 336 707
which(df$AllSizesSum > 75000)
## [1]  250  314  336  452  707 1299
which(df$AllSizesSum[1:train_size] > 50000)
##  [1]   54  250  314  336  385  452  458  524  662  707  770  849 1299 1397
# 524 and 1299 were GrLivArea outliers

Because of the pressure of outliers, we will also linearize this

df$AllSizesSumLin <- log(df$AllSizesSum+1)

allsizessum_mod = lm(SalePrice ~ AllSizesSumLin + sqrt(AllSizesSumLin), data = df)
summary(allsizessum_mod)
## 
## Call:
## lm(formula = SalePrice ~ AllSizesSumLin + sqrt(AllSizesSumLin), 
##     data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.39276 -0.20120 -0.01933  0.22291  1.07282 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -35.7211     8.5088  -4.198 2.85e-05 ***
## AllSizesSumLin        -3.6767     0.8745  -4.204 2.78e-05 ***
## sqrt(AllSizesSumLin)  26.8017     5.4550   4.913 9.97e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.32 on 1457 degrees of freedom
##   (1459 observations deleted due to missingness)
## Multiple R-squared:  0.3593, Adjusted R-squared:  0.3584 
## F-statistic: 408.5 on 2 and 1457 DF,  p-value: < 2.2e-16
df$AllSizesSumLin <- as.numeric(allsizessum_mod$coefficients[2]) * df$AllSizesSumLin +
  as.numeric(allsizessum_mod$coefficients[3]) * sqrt(df$AllSizesSumLin) 

ggplot(data=df, aes(df$AllSizesSumLin,df$SalePrice)) + geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).

In keeping with the theme, we also examine the weighted sum of all areas:

areamodel = lm(SalePrice ~ LotArea +  TotalBsmtSF + GrLivArea + GarageArea + WoodDeckSF +
                 OpenPorchSF + EnclosedPorch + X3SsnPorch + ScreenPorch + PoolArea, data =df)
summary(areamodel)
## 
## Call:
## lm(formula = SalePrice ~ LotArea + TotalBsmtSF + GrLivArea + 
##     GarageArea + WoodDeckSF + OpenPorchSF + EnclosedPorch + X3SsnPorch + 
##     ScreenPorch + PoolArea, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.81157 -0.09421  0.02328  0.12855  0.74747 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.101e+01  2.002e-02 550.059  < 2e-16 ***
## LotArea        4.643e-07  6.137e-07   0.757  0.44940    
## TotalBsmtSF    2.154e-04  1.621e-05  13.286  < 2e-16 ***
## GrLivArea      3.173e-04  1.391e-05  22.813  < 2e-16 ***
## GarageArea     5.539e-04  3.315e-05  16.711  < 2e-16 ***
## WoodDeckSF     3.347e-04  4.945e-05   6.768 1.89e-11 ***
## OpenPorchSF    2.405e-04  9.413e-05   2.555  0.01073 *  
## EnclosedPorch -4.505e-04  9.777e-05  -4.608 4.43e-06 ***
## X3SsnPorch     3.936e-04  1.986e-04   1.982  0.04771 *  
## ScreenPorch    3.305e-04  1.060e-04   3.118  0.00186 ** 
## PoolArea      -5.825e-04  1.473e-04  -3.954 8.06e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2215 on 1449 degrees of freedom
##   (1459 observations deleted due to missingness)
## Multiple R-squared:  0.6946, Adjusted R-squared:  0.6925 
## F-statistic: 329.5 on 10 and 1449 DF,  p-value: < 2.2e-16
df$AreasSum <- as.numeric(areamodel$coefficients[2]) * df$LotArea + 
  as.numeric(areamodel$coefficients[3]) * df$TotalBsmtSF + 
  as.numeric(areamodel$coefficients[4]) * df$GrLivArea + 
  as.numeric(areamodel$coefficients[5]) * df$GarageArea +
  as.numeric(areamodel$coefficients[6]) * df$WoodDeckSF +
  as.numeric(areamodel$coefficients[7]) * df$OpenPorchSF + 
  as.numeric(areamodel$coefficients[8]) * df$EnclosedPorch +
  as.numeric(areamodel$coefficients[9]) * df$X3SsnPorch + 
  as.numeric(areamodel$coefficients[10]) * df$ScreenPorch +
  as.numeric(areamodel$coefficients[11]) * df$PoolArea

ggplot(data=df, aes(df$AreasSum,df$SalePrice)) + geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).

ggplot(data=df, aes(log(df$AreasSum+1),df$SalePrice)) + geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x +sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).

## Warning: Removed 1459 rows containing missing values (geom_point).

df$AreasSum <-  log(df$AreasSum+1)

Another possibility is the total living area

livingareamodel = lm(SalePrice ~ BsmtFinSF1 + BsmtFinSF2  + GrLivArea, data = df)
summary(livingareamodel)
## 
## Call:
## lm(formula = SalePrice ~ BsmtFinSF1 + BsmtFinSF2 + GrLivArea, 
##     data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.1645 -0.1348  0.0326  0.1528  0.8011 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.118e+01  2.184e-02 511.764   <2e-16 ***
## BsmtFinSF1  2.080e-04  1.584e-05  13.129   <2e-16 ***
## BsmtFinSF2  5.700e-05  4.381e-05   1.301    0.194    
## GrLivArea   4.954e-04  1.373e-05  36.069   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2696 on 1456 degrees of freedom
##   (1459 observations deleted due to missingness)
## Multiple R-squared:  0.5453, Adjusted R-squared:  0.5443 
## F-statistic:   582 on 3 and 1456 DF,  p-value: < 2.2e-16
df$LivArea <- df$BsmtFinSF1 + df$BsmtFinSF2 + df$GrLivArea

df$LivAreaWt <- as.numeric(livingareamodel$coefficients[2]) * df$BsmtFinSF1 + 
  as.numeric(livingareamodel$coefficients[3]) * df$BsmtFinSF2 + 
  as.numeric(livingareamodel$coefficients[4]) * df$GrLivArea

ggplot(data=df, aes(df$LivArea,df$SalePrice)) + geom_point()+
  geom_smooth(method='lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).

ggplot(data=df, aes(log(df$LivArea+1),df$SalePrice)) + geom_point()+
  geom_smooth(method='lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).

## Warning: Removed 1459 rows containing missing values (geom_point).

ggplot(data=df, aes(df$LivAreaWt,df$SalePrice)) + geom_point()+
  geom_smooth(method='lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).

## Warning: Removed 1459 rows containing missing values (geom_point).

ggplot(data=df, aes(log(df$LivAreaWt+1),df$SalePrice)) + geom_point()+
  geom_smooth(method='lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).

## Warning: Removed 1459 rows containing missing values (geom_point).

df$LivArea <- log(df$LivArea+1)
df$LivAreaWt <- log(df$LivAreaWt+1)

LivArea.mod = lm(SalePrice ~ LivArea + sqrt(LivArea), data=df)
summary(LivArea.mod)
## 
## Call:
## lm(formula = SalePrice ~ LivArea + sqrt(LivArea), data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.54966 -0.16713 -0.01589  0.17313  0.92012 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept)     10.117      7.299   1.386    0.166
## LivArea          1.373      0.972   1.412    0.158
## sqrt(LivArea)   -3.075      5.328  -0.577    0.564
## 
## Residual standard error: 0.2698 on 1457 degrees of freedom
##   (1459 observations deleted due to missingness)
## Multiple R-squared:  0.5445, Adjusted R-squared:  0.5439 
## F-statistic: 870.9 on 2 and 1457 DF,  p-value: < 2.2e-16
df$LivArea <- as.numeric(LivArea.mod$coefficients[2]) * df$LivArea + 
  as.numeric(LivArea.mod$coefficients[3]) * sqrt(df$LivArea)

LivAreaWt.mod = lm(SalePrice ~ LivAreaWt + sqrt(LivAreaWt), data=df)
summary(LivAreaWt.mod)
## 
## Call:
## lm(formula = SalePrice ~ LivAreaWt + sqrt(LivAreaWt), data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.32534 -0.13569  0.01827  0.15437  0.73786 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       8.5256     0.2751  30.994  < 2e-16 ***
## LivAreaWt        -1.5833     0.4485  -3.530 0.000428 ***
## sqrt(LivAreaWt)   5.7819     0.7044   8.209 4.86e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2482 on 1457 degrees of freedom
##   (1459 observations deleted due to missingness)
## Multiple R-squared:  0.6144, Adjusted R-squared:  0.6139 
## F-statistic:  1161 on 2 and 1457 DF,  p-value: < 2.2e-16
df$LivAreaWt <- as.numeric(LivAreaWt.mod$coefficients[2]) * df$LivAreaWt + 
  as.numeric(LivAreaWt.mod$coefficients[3]) * sqrt(df$LivAreaWt)

ggplot(data=df, aes(df$LivArea,df$SalePrice)) + geom_point()+
  geom_smooth(method='lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).

## Warning: Removed 1459 rows containing missing values (geom_point).

ggplot(data=df, aes(df$LivAreaWt,df$SalePrice)) + geom_point()+
  geom_smooth(method='lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).

## Warning: Removed 1459 rows containing missing values (geom_point).

Remaining Numerical Features

BsmtUnfSF

ggplot(data=df, aes(df$BsmtUnfSF,df$SalePrice)) + geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).

ggplot(data=df, aes(log(df$BsmtUnfSF+1),df$SalePrice)) + geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).

## Warning: Removed 1459 rows containing missing values (geom_point).

This is not great, but there’s no good reason to transform it.

LowQualFinSF

ggplot(data=df, aes(df$LowQualFinSF,df$SalePrice)) + geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).

ggplot(data=df, aes(log(df$LowQualFinSF+1),df$SalePrice)) + geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).

## Warning: Removed 1459 rows containing missing values (geom_point).

length(which(df$LowQualFinSF > 0))
## [1] 40

This has very low statistics, so we might want to drop it.

BedroomAbvGr and KitchenAbvGr

ggplot(data=df, aes(df$BedroomAbvGr,df$SalePrice)) + geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).

ggplot(data=df, aes(log(df$BedroomAbvGr+1),df$SalePrice)) + geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).

## Warning: Removed 1459 rows containing missing values (geom_point).

ggplot(data=df, aes(df$KitchenAbvGr,df$SalePrice)) + geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).

## Warning: Removed 1459 rows containing missing values (geom_point).

ggplot(data=df, aes(log(df$KitchenAbvGr+1),df$SalePrice)) + geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).

## Warning: Removed 1459 rows containing missing values (geom_point).

GarageYrBlt

We will set this up in order to do an separate regression on only the homes with garages. When regressing on the entire data set, we should drop the columns associated with the GarageAge.

df_garage <- subset(df, df$NoGarage == 0)
df_garage$GarageYrBlt <- as.numeric(df_garage$GarageYrBlt)

df_garage$GarageAge <- df_garage$YrSold - df_garage$GarageYrBlt 
  
ggplot(data=df_garage, aes(df_garage$GarageAge,df_garage$SalePrice)) + geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1383 rows containing non-finite values (stat_smooth).
## Warning: Removed 1383 rows containing missing values (geom_point).

ggplot(data=df_garage, aes(log(df_garage$GarageAge+2),df_garage$SalePrice)) + geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1383 rows containing non-finite values (stat_smooth).

## Warning: Removed 1383 rows containing missing values (geom_point).

summary(df_garage$GarageAge)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -1.00    6.00   29.00   29.77   48.00  114.00
df_garage$GarageAgeLin <- log(df_garage$GarageAge+2)

garageage.mod = lm(data=df_garage, SalePrice ~ GarageAgeLin + sqrt(GarageAgeLin))
summary(garageage.mod)
## 
## Call:
## lm(formula = SalePrice ~ GarageAgeLin + sqrt(GarageAgeLin), data = df_garage)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.39325 -0.19050 -0.01748  0.16530  1.40009 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        12.20766    0.14402  84.766  < 2e-16 ***
## GarageAgeLin       -0.38708    0.06517  -5.940 3.61e-09 ***
## sqrt(GarageAgeLin)  0.59245    0.19862   2.983  0.00291 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3106 on 1376 degrees of freedom
##   (1383 observations deleted due to missingness)
## Multiple R-squared:  0.3336, Adjusted R-squared:  0.3327 
## F-statistic: 344.4 on 2 and 1376 DF,  p-value: < 2.2e-16
df_garage$GarageAgeLin <- as.numeric(garageage.mod$coefficients[2]) * df_garage$GarageAgeLin +
  as.numeric(garageage.mod$coefficients[3]) * sqrt(df_garage$GarageAgeLin)

ggplot(data=df_garage, aes(df_garage$GarageAgeLin,df_garage$SalePrice)) + geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1383 rows containing non-finite values (stat_smooth).

## Warning: Removed 1383 rows containing missing values (geom_point).

df <- merge(x = df, y = df_garage, by = names(df), all.x = TRUE)

# df$GarageAge[which(is.na(df$GarageAge))] <- rep('None')
# df$GarageAgeLin[which(is.na(df$GarageAgeLin))] <- rep('None')

df$GarageYrBlt <- NULL

PoolArea

With only 13 houses with pools, we can’t regress on this with any significance. We will setup a boolean:

length(which(df$PoolArea > 0))
## [1] 13
df$HasPool <- 0
df$HasPool[df$PoolArea > 0] <- 1
df$PoolArea <- NULL

MiscVal

length(which(df$MiscVal > 0))
## [1] 103
ggplot(data=df, aes(df$MiscVal,df$SalePrice)) + geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).

ggplot(data=df, aes(log(df$MiscVal+1),df$SalePrice)) + geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).

## Warning: Removed 1459 rows containing missing values (geom_point).

df$MiscVal <- NULL

There doesn’t seem to be any useful functional relationship here, even if we forget about the large number of observations at zero. We won’t regress on this.

MoSold

ggplot(data=df, aes(df$MoSold,df$SalePrice)) + geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).

ggplot(data=df, aes(log(df$MoSold+1),df$SalePrice)) + geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).

## Warning: Removed 1459 rows containing missing values (geom_point).

df$MoSold <- as.factor(df$MoSold)
df$YrSold <- as.factor(df$YrSold)
df$YrSoldcat <- NULL
# cleaning up unnecessary features

This doesn’t seem too interesting.

More Preprocessing

At this point, the only NAs are in SalePrice, GarageAge and GarageAgeLin. We want to range the numeric features, besides Id and SalePrice, as well as create numerical dummies for the factors. We will use caret.

train_df <- df[1:train_size, -72]
saleprice_df <- data.frame(df[1:train_size, c(1,72,73)])
test_df <- df[(train_size + 1):(train_size + 1459),-72]

train_df[,'HouseId'] <- sapply(train_df[,'HouseId'] , as.character)
test_df[,'HouseId'] <- sapply(test_df[,'HouseId'] , as.character)

pprules <- preProcess(train_df,  method = c("range"))

train_pp_df <- data.frame(predict(pprules, newdata = train_df))
test_pp_df <- data.frame(predict(pprules, newdata = test_df))

train_pp_df$Id <- as.numeric(row.names(train_pp_df))
test_pp_df$Id <- as.numeric(row.names(test_pp_df))

train_pp_df[,'HouseId'] <- sapply(train_pp_df[,'HouseId'] , as.numeric)
test_pp_df[,'HouseId'] <- sapply(test_pp_df[,'HouseId'] , as.numeric)

# train_pp_df <- merge(x = train_pp_df, y = saleprice_df, by = 'Id', all.x = TRUE)

train_pp_df <- merge(x = train_pp_df, y = saleprice_df, by = 'HouseId', all.x = TRUE)

train_pp_df$Id.x <- NULL
train_pp_df$Id.y <- NULL

dummies <- dummyVars(~ ., data = train_pp_df)

train_ppd_df <- data.frame(predict(dummies, newdata = train_pp_df))

# test needs to have same columns as train
test_pp_df$SalePrice <- rep(0)
test_ppd_df <- data.frame(predict(dummies, newdata = test_pp_df))
test_ppd_df$SalePrice <- NULL

We have to remove the levels that were missing in the test set

train_ppd_df$Condition.2RRAe <- NULL
train_ppd_df$Condition.2RRAn <- NULL
train_ppd_df$Condition.2RRNn <- NULL
train_ppd_df$HouseStyle.2.5Fin <- NULL
train_ppd_df$RoofMatl.ClyTile <- NULL
train_ppd_df$RoofMatl.Membran <- NULL
train_ppd_df$RoofMatl.Metal <- NULL
train_ppd_df$RoofMatl.Roll <- NULL
train_ppd_df$Exterior1st.ImStucc <- NULL
train_ppd_df$Exterior1st.Stone <- NULL
train_ppd_df$Exterior2nd.Other <- NULL
train_ppd_df$Heating.Floor <- NULL
train_ppd_df$Heating.OthW <- NULL
train_ppd_df$Electrical.Mix <- NULL
train_ppd_df$GarageQual.Ex <- NULL
train_ppd_df$PoolQC.Fa <- NULL

test_ppd_df$Condition.2RRAe <- NULL
test_ppd_df$Condition.2RRAn <- NULL
test_ppd_df$Condition.2RRNn <- NULL
test_ppd_df$HouseStyle.2.5Fin <- NULL
test_ppd_df$RoofMatl.ClyTile <- NULL
test_ppd_df$RoofMatl.Membran <- NULL
test_ppd_df$RoofMatl.Metal <- NULL
test_ppd_df$RoofMatl.Roll <- NULL
test_ppd_df$Exterior1st.ImStucc <- NULL
test_ppd_df$Exterior1st.Stone <- NULL
test_ppd_df$Exterior2nd.Other <- NULL
test_ppd_df$Heating.Floor <- NULL
test_ppd_df$Heating.OthW <- NULL
test_ppd_df$Electrical.Mix <- NULL
test_ppd_df$GarageQual.Ex <- NULL
test_ppd_df$PoolQC.Fa <- NULL

Test for near-zero variance.

x = nearZeroVar(train_ppd_df, saveMetrics = TRUE)
x[x[,"zeroVar"] > 0, ]
##                freqRatio percentUnique zeroVar  nzv
## MSSubClass.150         0    0.06849315    TRUE TRUE
summary(as.factor(train_ppd_df$MSSubClass.150))
##    0 
## 1460
summary(as.factor(test_ppd_df$MSSubClass.150))
##    0    1 
## 1458    1
train_ppd_df$MSSubClass.150 <- NULL
test_ppd_df$MSSubClass.150 <- NULL

Look more features with very low statistics:

train_ppd_cols = colnames(train_ppd_df)

train_ppd_cols_df = as.data.frame(train_ppd_cols)
for (i in 1:nrow(train_ppd_cols_df)){
  train_ppd_cols_df$count[i] <- length(which(train_ppd_df[train_ppd_cols[i]] !=0
))
}

lowstat_df <- train_ppd_cols_df[which(train_ppd_cols_df$count <= 10),]
lowstat_df
##            train_ppd_cols count
## 4           MSSubClass.40     4
## 17       MSZoning.C..all.    10
## 24            Street.Grvl     6
## 31           LotShape.IR3    10
## 40          LotConfig.FR3     4
## 46   Neighborhood.Blueste     2
## 59   Neighborhood.NPkVill     9
## 73        Condition1.PosA     8
## 77        Condition1.RRNe     2
## 78        Condition1.RRNn     5
## 79      Condition2.Artery     2
## 80       Condition2.Feedr     6
## 82        Condition2.PosA     1
## 83        Condition2.PosN     2
## 84        Condition2.RRAe     1
## 85        Condition2.RRAn     1
## 86        Condition2.RRNn     2
## 105     RoofStyle.Mansard     7
## 106        RoofStyle.Shed     2
## 109      RoofMatl.WdShake     5
## 110      RoofMatl.WdShngl     6
## 112   Exterior1st.AsphShn     1
## 113   Exterior1st.BrkComm     2
## 115    Exterior1st.CBlock     1
## 125   Exterior2nd.AsphShn     3
## 126   Exterior2nd.Brk.Cmn     7
## 128    Exterior2nd.CBlock     1
## 131   Exterior2nd.ImStucc    10
## 134     Exterior2nd.Stone     5
## 148          ExterCond.Ex     3
## 151          ExterCond.Po     1
## 157      Foundation.Stone     6
## 158       Foundation.Wood     3
## 167           BsmtCond.Po     2
## 195          HeatingQC.Po     1
## 213       Functional.Maj2     5
## 217        Functional.Sev     1
## 226     GarageType.2Types     6
## 230    GarageType.CarPort     9
## 242         GarageQual.Po     3
## 244         GarageCond.Ex     2
## 246         GarageCond.Gd     9
## 248         GarageCond.Po     7
## 258             PoolQC.Ex     2
## 259             PoolQC.Gd     3
## 266      MiscFeature.Gar2     2
## 268      MiscFeature.Othr     2
## 270      MiscFeature.TenC     1
## 289          SaleType.Con     2
## 290        SaleType.ConLD     9
## 291        SaleType.ConLI     5
## 292        SaleType.ConLw     5
## 293          SaleType.CWD     4
## 295          SaleType.Oth     3
## 298 SaleCondition.AdjLand     4
## 314      OverallQualcat.1     2
## 315      OverallQualcat.2     3
## 325        OverallQualBad     5
## 326      OverallCondcat.1     1
## 327      OverallCondcat.2     5
## 363            PoolQCnums     7
## 403               HasPool     7
verylowstat_df <- train_ppd_cols_df[which(train_ppd_cols_df$count <= 5),]
verylowstat_df
##            train_ppd_cols count
## 4           MSSubClass.40     4
## 40          LotConfig.FR3     4
## 46   Neighborhood.Blueste     2
## 77        Condition1.RRNe     2
## 78        Condition1.RRNn     5
## 79      Condition2.Artery     2
## 82        Condition2.PosA     1
## 83        Condition2.PosN     2
## 84        Condition2.RRAe     1
## 85        Condition2.RRAn     1
## 86        Condition2.RRNn     2
## 106        RoofStyle.Shed     2
## 109      RoofMatl.WdShake     5
## 112   Exterior1st.AsphShn     1
## 113   Exterior1st.BrkComm     2
## 115    Exterior1st.CBlock     1
## 125   Exterior2nd.AsphShn     3
## 128    Exterior2nd.CBlock     1
## 134     Exterior2nd.Stone     5
## 148          ExterCond.Ex     3
## 151          ExterCond.Po     1
## 158       Foundation.Wood     3
## 167           BsmtCond.Po     2
## 195          HeatingQC.Po     1
## 213       Functional.Maj2     5
## 217        Functional.Sev     1
## 242         GarageQual.Po     3
## 244         GarageCond.Ex     2
## 258             PoolQC.Ex     2
## 259             PoolQC.Gd     3
## 266      MiscFeature.Gar2     2
## 268      MiscFeature.Othr     2
## 270      MiscFeature.TenC     1
## 289          SaleType.Con     2
## 291        SaleType.ConLI     5
## 292        SaleType.ConLw     5
## 293          SaleType.CWD     4
## 295          SaleType.Oth     3
## 298 SaleCondition.AdjLand     4
## 314      OverallQualcat.1     2
## 315      OverallQualcat.2     3
## 325        OverallQualBad     5
## 326      OverallCondcat.1     1
## 327      OverallCondcat.2     5
verylowstat_cols <- as.factor(verylowstat_df$train_ppd_cols)

for (i in verylowstat_cols){
  train_ppd_df[i] <- NULL
  test_ppd_df[i] <- NULL
}

We’ve dropped any feature that had 5 or less observations in the training set. Now we want to look for very highly correlated features. We can start by guessing that there are some in the Garage and Basement subsets

train_ppd_cols <- colnames(train_ppd_df)


garage_cols <- grep("Garage+", train_ppd_cols, perl=TRUE, value=TRUE)
garage_cor  <- abs(cor(train_ppd_df[garage_cols]))
diag(garage_cor) <- 0
which(garage_cor == 1, arr.ind = T)
##                   row col
## GarageFinish.None   9   7
## GarageQual.None    16   7
## GarageCond.None    20   7
## NoGarage           30   7
## GarageType.None     7   9
## GarageQual.None    16   9
## GarageCond.None    20   9
## NoGarage           30   9
## GarageType.None     7  16
## GarageFinish.None   9  16
## GarageCond.None    20  16
## NoGarage           30  16
## GarageType.None     7  20
## GarageFinish.None   9  20
## GarageQual.None    16  20
## NoGarage           30  20
## GarageType.None     7  30
## GarageFinish.None   9  30
## GarageQual.None    16  30
## GarageCond.None    20  30
train_ppd_df$GarageFinish.None <- NULL
train_ppd_df$GarageQual.None <- NULL 
train_ppd_df$GarageCond.None <- NULL 
train_ppd_df$GarageType.None <- NULL
test_ppd_df$GarageFinish.None <- NULL
test_ppd_df$GarageQual.None <- NULL 
test_ppd_df$GarageCond.None <- NULL 
test_ppd_df$GarageType.None <- NULL

basement_cols <- grep("Bsmt+", train_ppd_cols, perl=TRUE, value=TRUE)
basement_cols <- append(basement_cols, "NoBasement")
basement_cols
##  [1] "BsmtQual.Ex"       "BsmtQual.Fa"       "BsmtQual.Gd"      
##  [4] "BsmtQual.None"     "BsmtQual.TA"       "BsmtCond.Fa"      
##  [7] "BsmtCond.Gd"       "BsmtCond.None"     "BsmtCond.TA"      
## [10] "BsmtExposure.Av"   "BsmtExposure.Gd"   "BsmtExposure.Mn"  
## [13] "BsmtExposure.No"   "BsmtExposure.None" "BsmtFinType1.ALQ" 
## [16] "BsmtFinType1.BLQ"  "BsmtFinType1.GLQ"  "BsmtFinType1.LwQ" 
## [19] "BsmtFinType1.None" "BsmtFinType1.Rec"  "BsmtFinType1.Unf" 
## [22] "BsmtFinSF1"        "BsmtFinType2.ALQ"  "BsmtFinType2.BLQ" 
## [25] "BsmtFinType2.GLQ"  "BsmtFinType2.LwQ"  "BsmtFinType2.None"
## [28] "BsmtFinType2.Rec"  "BsmtFinType2.Unf"  "BsmtFinSF2"       
## [31] "BsmtUnfSF"         "TotalBsmtSF"       "BsmtFullBath"     
## [34] "BsmtHalfBath"      "BsmtQualnums"      "BsmtCondnums"     
## [37] "BsmtExposurenums"  "BsmtFinType1nums"  "BsmtFinType2nums" 
## [40] "NoBasement"
basement_cor  <- abs(cor(train_ppd_df[basement_cols]))
diag(basement_cor) <- 0
which(basement_cor > 0.99, arr.ind = T)
##                   row col
## BsmtCond.None       8   4
## BsmtExposure.None  14   4
## BsmtFinType1.None  19   4
## BsmtFinType2.None  27   4
## NoBasement         40   4
## BsmtQual.None       4   8
## BsmtExposure.None  14   8
## BsmtFinType1.None  19   8
## BsmtFinType2.None  27   8
## NoBasement         40   8
## BsmtQual.None       4  14
## BsmtCond.None       8  14
## BsmtFinType1.None  19  14
## BsmtFinType2.None  27  14
## NoBasement         40  14
## BsmtQual.None       4  19
## BsmtCond.None       8  19
## BsmtExposure.None  14  19
## BsmtFinType2.None  27  19
## NoBasement         40  19
## BsmtQual.None       4  27
## BsmtCond.None       8  27
## BsmtExposure.None  14  27
## BsmtFinType1.None  19  27
## NoBasement         40  27
## BsmtQual.None       4  40
## BsmtCond.None       8  40
## BsmtExposure.None  14  40
## BsmtFinType1.None  19  40
## BsmtFinType2.None  27  40
train_ppd_df$BsmtCond.None <- NULL
train_ppd_df$BsmtExposure.None <- NULL
train_ppd_df$BsmtFinType1.None <- NULL
train_ppd_df$BsmtFinType2.None <- NULL
train_ppd_df$BsmtQual.None <- NULL 
test_ppd_df$BsmtCond.None <- NULL
test_ppd_df$BsmtExposure.None <- NULL
test_ppd_df$BsmtFinType1.None <- NULL
test_ppd_df$BsmtFinType2.None <- NULL
test_ppd_df$BsmtQual.None <- NULL 

Now the numerical variables.

nums <- sapply(train_ppd_df, is.numeric)
nums["SalePrice"] <- FALSE

correlmatrix <- abs(cor(train_ppd_df[nums]))
diag(correlmatrix) <- 0
which(correlmatrix > 0.99, arr.ind = T)
##                 row col
## BldgType.Duplex  78  11
## BldgType.2fmCon  77  15
## Street.Pave      24  23
## Street.Grvl      23  24
## MSSubClass.190   15  77
## MSSubClass.90    11  78
## HasPool         350 221
## Agebin          327 324
## Age             324 327
## PoolQC.None     221 350
train_ppd_df$Street.Pave <- NULL
train_ppd_df$MSSubClass.190 <- NULL
train_ppd_df$MSSubClass.90 <- NULL
train_ppd_df$PoolQC.None <- NULL
test_ppd_df$Street.Pave <- NULL
test_ppd_df$MSSubClass.190 <- NULL
test_ppd_df$MSSubClass.90 <- NULL
test_ppd_df$PoolQC.None <- NULL

Finally we’ll write our tidy data to a file.

dim(train_ppd_df)
## [1] 1460  347
dim(test_ppd_df)
## [1] 1459  346
# write our cleaned data to a file
write.csv(train_ppd_df, file = train_output, row.names = FALSE)
write.csv(test_ppd_df, file = test_output, row.names = FALSE)

We’ll fit our models in python.

Note: GarageAge and GarageAgeLin both have NAs and should be dropped if we’re not restricting to the subset with NoGarage == 0.