This is an exploratory data analysis for the Kaggle House Prices competition.
The use here of rpart to impute missing values leans heavily on the analysis by Bisaria, while some of the feature engineering was suggested by that of Amit Choudhary. The correlation plot was borrowed from AiO.
The data consists of a testing set with 1460 house sale price observations with 81 features. The evaluation set consists of 1459 observations of the same features. The evaluation metric is the RMSE of the logs of the predicted vs. true sale prices.
# load packages
require(ggplot2)
require(caret)
require(rpart)
require(corrplot)
# Set working directory and import datafiles
setwd("~/code/kaggle/housepricesART")
train <- read.csv("./input/train.csv", stringsAsFactors = F)
test <- read.csv("./input/test.csv", stringsAsFactors = F)
dim(train)
## [1] 1460 81
dim(test)
## [1] 1459 80
# take log of sale price
train$SalePrice <- log(train$SalePrice)
test$SalePrice <- rep(NA)
df <- rbind(train, test)
str(df)
## 'data.frame': 2919 obs. of 81 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ MSSubClass : int 60 20 60 70 60 50 20 60 50 190 ...
## $ MSZoning : chr "RL" "RL" "RL" "RL" ...
## $ LotFrontage : int 65 80 68 60 84 85 75 NA 51 50 ...
## $ LotArea : int 8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
## $ Street : chr "Pave" "Pave" "Pave" "Pave" ...
## $ Alley : chr NA NA NA NA ...
## $ LotShape : chr "Reg" "Reg" "IR1" "IR1" ...
## $ LandContour : chr "Lvl" "Lvl" "Lvl" "Lvl" ...
## $ Utilities : chr "AllPub" "AllPub" "AllPub" "AllPub" ...
## $ LotConfig : chr "Inside" "FR2" "Inside" "Corner" ...
## $ LandSlope : chr "Gtl" "Gtl" "Gtl" "Gtl" ...
## $ Neighborhood : chr "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
## $ Condition1 : chr "Norm" "Feedr" "Norm" "Norm" ...
## $ Condition2 : chr "Norm" "Norm" "Norm" "Norm" ...
## $ BldgType : chr "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ HouseStyle : chr "2Story" "1Story" "2Story" "2Story" ...
## $ OverallQual : int 7 6 7 7 8 5 8 7 7 5 ...
## $ OverallCond : int 5 8 5 5 5 5 5 6 5 6 ...
## $ YearBuilt : int 2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
## $ YearRemodAdd : int 2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
## $ RoofStyle : chr "Gable" "Gable" "Gable" "Gable" ...
## $ RoofMatl : chr "CompShg" "CompShg" "CompShg" "CompShg" ...
## $ Exterior1st : chr "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
## $ Exterior2nd : chr "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
## $ MasVnrType : chr "BrkFace" "None" "BrkFace" "None" ...
## $ MasVnrArea : int 196 0 162 0 350 0 186 240 0 0 ...
## $ ExterQual : chr "Gd" "TA" "Gd" "TA" ...
## $ ExterCond : chr "TA" "TA" "TA" "TA" ...
## $ Foundation : chr "PConc" "CBlock" "PConc" "BrkTil" ...
## $ BsmtQual : chr "Gd" "Gd" "Gd" "TA" ...
## $ BsmtCond : chr "TA" "TA" "TA" "Gd" ...
## $ BsmtExposure : chr "No" "Gd" "Mn" "No" ...
## $ BsmtFinType1 : chr "GLQ" "ALQ" "GLQ" "ALQ" ...
## $ BsmtFinSF1 : int 706 978 486 216 655 732 1369 859 0 851 ...
## $ BsmtFinType2 : chr "Unf" "Unf" "Unf" "Unf" ...
## $ BsmtFinSF2 : int 0 0 0 0 0 0 0 32 0 0 ...
## $ BsmtUnfSF : int 150 284 434 540 490 64 317 216 952 140 ...
## $ TotalBsmtSF : int 856 1262 920 756 1145 796 1686 1107 952 991 ...
## $ Heating : chr "GasA" "GasA" "GasA" "GasA" ...
## $ HeatingQC : chr "Ex" "Ex" "Ex" "Gd" ...
## $ CentralAir : chr "Y" "Y" "Y" "Y" ...
## $ Electrical : chr "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
## $ X1stFlrSF : int 856 1262 920 961 1145 796 1694 1107 1022 1077 ...
## $ X2ndFlrSF : int 854 0 866 756 1053 566 0 983 752 0 ...
## $ LowQualFinSF : int 0 0 0 0 0 0 0 0 0 0 ...
## $ GrLivArea : int 1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
## $ BsmtFullBath : int 1 0 1 1 1 1 1 1 0 1 ...
## $ BsmtHalfBath : int 0 1 0 0 0 0 0 0 0 0 ...
## $ FullBath : int 2 2 2 1 2 1 2 2 2 1 ...
## $ HalfBath : int 1 0 1 0 1 1 0 1 0 0 ...
## $ BedroomAbvGr : int 3 3 3 3 4 1 3 3 2 2 ...
## $ KitchenAbvGr : int 1 1 1 1 1 1 1 1 2 2 ...
## $ KitchenQual : chr "Gd" "TA" "Gd" "Gd" ...
## $ TotRmsAbvGrd : int 8 6 6 7 9 5 7 7 8 5 ...
## $ Functional : chr "Typ" "Typ" "Typ" "Typ" ...
## $ Fireplaces : int 0 1 1 1 1 0 1 2 2 2 ...
## $ FireplaceQu : chr NA "TA" "TA" "Gd" ...
## $ GarageType : chr "Attchd" "Attchd" "Attchd" "Detchd" ...
## $ GarageYrBlt : int 2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
## $ GarageFinish : chr "RFn" "RFn" "RFn" "Unf" ...
## $ GarageCars : int 2 2 2 3 3 2 2 2 2 1 ...
## $ GarageArea : int 548 460 608 642 836 480 636 484 468 205 ...
## $ GarageQual : chr "TA" "TA" "TA" "TA" ...
## $ GarageCond : chr "TA" "TA" "TA" "TA" ...
## $ PavedDrive : chr "Y" "Y" "Y" "Y" ...
## $ WoodDeckSF : int 0 298 0 0 192 40 255 235 90 0 ...
## $ OpenPorchSF : int 61 0 42 35 84 30 57 204 0 4 ...
## $ EnclosedPorch: int 0 0 0 272 0 0 0 228 205 0 ...
## $ X3SsnPorch : int 0 0 0 0 0 320 0 0 0 0 ...
## $ ScreenPorch : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolArea : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolQC : chr NA NA NA NA ...
## $ Fence : chr NA NA NA NA ...
## $ MiscFeature : chr NA NA NA NA ...
## $ MiscVal : int 0 0 0 0 0 700 0 350 0 0 ...
## $ MoSold : int 2 5 9 2 12 10 8 11 4 1 ...
## $ YrSold : int 2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
## $ SaleType : chr "WD" "WD" "WD" "WD" ...
## $ SaleCondition: chr "Normal" "Normal" "Normal" "Abnorml" ...
## $ SalePrice : num 12.2 12.1 12.3 11.8 12.4 ...
colSums(sapply(df, is.na))
## Id MSSubClass MSZoning LotFrontage LotArea
## 0 0 4 486 0
## Street Alley LotShape LandContour Utilities
## 0 2721 0 0 2
## LotConfig LandSlope Neighborhood Condition1 Condition2
## 0 0 0 0 0
## BldgType HouseStyle OverallQual OverallCond YearBuilt
## 0 0 0 0 0
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd
## 0 0 0 1 1
## MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 24 23 0 0 0
## BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1
## 81 82 82 79 1
## BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## 80 1 1 1 0
## HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF
## 0 0 1 0 0
## LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 0 0 2 2 0
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd
## 0 0 0 1 0
## Functional Fireplaces FireplaceQu GarageType GarageYrBlt
## 2 0 1420 157 159
## GarageFinish GarageCars GarageArea GarageQual GarageCond
## 159 1 1 159 159
## PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 0 0 0 0 0
## ScreenPorch PoolArea PoolQC Fence MiscFeature
## 0 0 2909 2348 2814
## MiscVal MoSold YrSold SaleType SaleCondition
## 0 0 0 1 0
## SalePrice
## 1459
After a full consideration of the analysis in the later sections, we return here to add an early stage of data processing. Based on the observations in the analysis below, we identified a set of possible outliers. We then ran the sci-kit learn model LassoLarsCV with 10 folds on the various data sets obtained by dropping combinations of outliers. The best dataset resulted in an decrease in RMS error of around 15% vs the set that includes all points.
Our strategy then is to produce data sets for some of these possibilities so that various exploratory modeling can be done in python. We will design some code to make it relatively easy to generate the necessary data sets and write them to coded file names.
train_size = 1460
# Failsafe in case some operation after dropping rows changes the indexing
df$HouseId <- df$Id
rel_outliers = c(31, 496, 524, 534, 692, 917, 969, 1183, 1299)
# Modify by hand to drop desired subset of outliers, then adjust the output file name
# drop_set <- c(524, 692, 1183, 1299)
#
#
#
# df <- df[-drop_set,]
# train_size <- train_size - length(drop_set)
# Use one-hot coding in output file name
# train_output <- "train_tidy-new.csv"
# test_output <- "test_tidy-new.csv"
train_output <- "train_tidy_000000000.csv"
test_output <- "test_tidy_000000000.csv"
In the first round, we want to tidy the data, so we will look for inconsistencies and impute missing values. Both the testing and test set have been combined into the dataframe df. We will start with categorical features first, except those that form groups like those for the Basement and Garage.
summary(as.factor(df$MSZoning))
## C (all) FV RH RL RM NA's
## 25 139 26 2265 460 4
which(is.na(df$MSZoning))
## [1] 1916 2217 2251 2905
Typical predictors of the zoning might be Neighborhood and local conditions. We then use rpart to impute the missing values.
mszoning.pred <- c("MSZoning", "Neighborhood", "Condition1", "Condition2", "HouseStyle", "BldgType")
mszoning.rpart <- rpart(as.factor(MSZoning) ~ ., data = df[!is.na(df$MSZoning),mszoning.pred], method = "class",
na.action=na.omit)
as.character(predict(mszoning.rpart, df[is.na(df$MSZoning),mszoning.pred], type = "class"))
## [1] "RM" "RM" "RM" "RL"
df$MSZoning[is.na(df$MSZoning)] <- as.character(predict(mszoning.rpart, df[is.na(df$MSZoning),mszoning.pred],
type = "class"))
summary(as.factor(df$Alley))
## Grvl Pave NA's
## 120 78 2721
df$Alley[is.na(df$Alley)] <- rep('None')
Here we simply substitute missing values with ‘None’.
summary(as.factor(df$Utilities))
## AllPub NoSeWa NA's
## 2916 1 2
df$Utilities <- NULL
Only at most 3 values show variation, so we drop this feature.
summary(as.factor(df$Exterior1st))
## AsbShng AsphShn BrkComm BrkFace CBlock CemntBd HdBoard ImStucc MetalSd
## 44 2 6 87 2 126 442 1 450
## Plywood Stone Stucco VinylSd Wd Sdng WdShing NA's
## 221 2 43 1025 411 56 1
which(is.na(df$Exterior1st))
## [1] 2152
ext1.pred <- c("Neighborhood", "BldgType", "HouseStyle", "OverallQual", "OverallCond", "YearBuilt",
"YearRemodAdd", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd",
"MasVnrType", "MasVnrArea", "ExterQual", "ExterCond")
ext1.rpart <- rpart(as.factor(Exterior1st) ~ ., data = df[!is.na(df$Exterior1st),ext1.pred], method = "class",
na.action=na.omit)
as.character(predict(ext1.rpart, df[is.na(df$Exterior1st),ext1.pred], type = "class"))
## [1] "Wd Sdng"
df$Exterior1st[is.na(df$Exterior1st)] <-
as.character(predict(ext1.rpart, df[is.na(df$Exterior1st),ext1.pred], type = "class"))
levels(as.factor(train$Exterior1st))
## [1] "AsbShng" "AsphShn" "BrkComm" "BrkFace" "CBlock" "CemntBd" "HdBoard"
## [8] "ImStucc" "MetalSd" "Plywood" "Stone" "Stucco" "VinylSd" "Wd Sdng"
## [15] "WdShing"
levels(as.factor(test$Exterior1st))
## [1] "AsbShng" "AsphShn" "BrkComm" "BrkFace" "CBlock" "CemntBd" "HdBoard"
## [8] "MetalSd" "Plywood" "Stucco" "VinylSd" "Wd Sdng" "WdShing"
We note that “ImStucc” and “Stone” are not in the test set, so we will remove the corresponding one-hot variables later in the preprocessing.
summary(as.factor(df$Exterior2nd))
## AsbShng AsphShn Brk Cmn BrkFace CBlock CmentBd HdBoard ImStucc MetalSd
## 38 4 22 47 3 126 406 15 447
## Other Plywood Stone Stucco VinylSd Wd Sdng Wd Shng NA's
## 1 270 6 47 1014 391 81 1
which(is.na(df$Exterior2nd))
## [1] 2152
ext2.rpart <- rpart(as.factor(Exterior2nd) ~ ., data = df[!is.na(df$Exterior2nd),ext1.pred], method = "class",
na.action=na.omit)
as.character(predict(ext2.rpart, df[is.na(df$Exterior2nd),ext1.pred], type = "class"))
## [1] "Wd Sdng"
df$Exterior2nd[is.na(df$Exterior2nd)] <-
as.character(predict(ext1.rpart, df[is.na(df$Exterior2nd),ext1.pred], type = "class"))
levels(as.factor(train$Exterior2nd))
## [1] "AsbShng" "AsphShn" "Brk Cmn" "BrkFace" "CBlock" "CmentBd" "HdBoard"
## [8] "ImStucc" "MetalSd" "Other" "Plywood" "Stone" "Stucco" "VinylSd"
## [15] "Wd Sdng" "Wd Shng"
levels(as.factor(test$Exterior2nd))
## [1] "AsbShng" "AsphShn" "Brk Cmn" "BrkFace" "CBlock" "CmentBd" "HdBoard"
## [8] "ImStucc" "MetalSd" "Plywood" "Stone" "Stucco" "VinylSd" "Wd Sdng"
## [15] "Wd Shng"
“Other” is not in the test set.
summary(as.factor(df$Electrical))
## FuseA FuseF FuseP Mix SBrkr NA's
## 188 50 8 1 2671 1
elec.pred <- c("BldgType", "HouseStyle", "OverallQual", "OverallCond", "YearBuilt", "YearRemodAdd", "Electrical")
elec.rpart <- rpart(as.factor(Electrical) ~ ., data = df[!is.na(df$Electrical),elec.pred],
method = "class", na.action=na.omit)
as.character(predict(elec.rpart, df[is.na(df$Electrical),elec.pred], type = "class"))
## [1] "SBrkr"
df$Electrical[is.na(df$Electrical)] <-
as.character(predict(elec.rpart, df[is.na(df$Electrical),elec.pred], type = "class"))
levels(as.factor(train$Electrical))
## [1] "FuseA" "FuseF" "FuseP" "Mix" "SBrkr"
levels(as.factor(test$Electrical))
## [1] "FuseA" "FuseF" "FuseP" "SBrkr"
‘Mix’ is not in the test set.
summary(as.factor(df$KitchenQual))
## Ex Fa Gd TA NA's
## 205 70 1151 1492 1
kit.pred <- c("BldgType", "HouseStyle", "OverallQual", "OverallCond", "YearBuilt", "YearRemodAdd", "KitchenQual")
kit.rpart <- rpart(as.factor(KitchenQual) ~ .,data = df[!is.na(df$KitchenQual),kit.pred],
method = "class",na.action=na.omit)
as.character(predict(kit.rpart, df[is.na(df$KitchenQual),kit.pred], type = "class"))
## [1] "TA"
df$KitchenQual[is.na(df$KitchenQual)] <-
as.character(predict(kit.rpart, df[is.na(df$KitchenQual),kit.pred], type = "class"))
summary(as.factor(df$Functional))
## Maj1 Maj2 Min1 Min2 Mod Sev Typ NA's
## 19 9 65 70 35 2 2717 2
func.pred <- c("OverallQual", "OverallCond", "YearBuilt", "YearRemodAdd", "ExterQual", "ExterCond","BsmtQual",
"BsmtCond","GarageQual", "GarageCond","SaleType", "SaleCondition", "Functional")
func.rpart <- rpart(as.factor(Functional) ~ .,data = df[!is.na(df$Functional),func.pred],
method = "class",na.action=na.omit)
as.character(predict(func.rpart, df[is.na(df$Functional),func.pred], type = "class"))
## [1] "Typ" "Typ"
df$Functional[is.na(df$Functional)] <-
as.character(predict(func.rpart, df[is.na(df$Functional),func.pred], type = "class"))
summary(as.factor(df$FireplaceQu))
## Ex Fa Gd Po TA NA's
## 43 74 744 46 592 1420
which(df$Fireplaces != 0 & is.na(df$FireplaceQu))
## integer(0)
df$FireplaceQu[is.na(df$FireplaceQu)] <- rep('None')
The NAs here had no fireplace.
summary(as.factor(df$PoolQC))
## Ex Fa Gd NA's
## 4 2 4 2909
df$PoolQC[which(df$PoolArea == 0 & is.na(df$PoolQC))] <- "None"
These had no pools.
which(df$PoolArea > 0 & is.na(df$PoolQC))
## [1] 2421 2504 2600
poolq.pred <- c("YearBuilt","YearRemodAdd", "PoolQC", "PoolArea","ExterQual","ExterCond",
"YrSold","SaleType","SaleCondition")
poolq.rpart <- rpart(as.factor(PoolQC) ~ ., data = df[!is.na(df$PoolQC),poolq.pred],
method = "class", na.action=na.omit)
as.character(predict(poolq.rpart,df[is.na(df$PoolQC),poolq.pred], type="class"))
## [1] "Ex" "Ex" "Ex"
df$PoolQC[is.na(df$PoolQC)] <- as.character(predict(poolq.rpart,df[is.na(df$PoolQC),poolq.pred], type="class"))
levels(as.factor(train$PoolQC))
## [1] "Ex" "Fa" "Gd"
levels(as.factor(test$PoolQC))
## [1] "Ex" "Gd"
‘Fa’ is missing in the test set.
table(is.na(df$Fence))
##
## FALSE TRUE
## 571 2348
df$Fence[is.na(df$Fence)] <- rep('None')
These had no fence.
table(is.na(df$MiscFeature))
##
## FALSE TRUE
## 105 2814
df$MiscFeature[is.na(df$MiscFeature)] <- rep('None')
These had no misc. feature.
table(is.na(df$SaleType))
##
## FALSE TRUE
## 2918 1
sale.pred <- c("MSSubClass", "MSZoning", "Street", "Alley", "Neighborhood", "Condition1",
"Condition2", "BldgType", "HouseStyle", "OverallQual", "OverallCond", "YearBuilt",
"YearRemodAdd", "ExterQual", "ExterCond", "BsmtQual", "BsmtCond", "KitchenQual",
"FireplaceQu", "GarageQual", "GarageCond", "PavedDrive", "PoolQC", "YrSold",
"SaleType", "SaleCondition")
sale.rpart <- rpart(as.factor(SaleType) ~ ., data = df[!is.na(df$SaleType),sale.pred],
method = "class", na.action=na.omit)
as.character(predict(sale.rpart, df[is.na(df$SaleType),sale.pred], type = "class"))
## [1] "WD"
df$SaleType[is.na(df$SaleType)] <- as.character(predict(sale.rpart,
df[is.na(df$SaleType),sale.pred], type = "class"))
summary(as.factor(df$MasVnrType))
## BrkCmn BrkFace None Stone NA's
## 25 879 1742 249 24
summary(df$MasVnrArea)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0 0.0 0.0 102.2 164.0 1600.0 23
which(is.na(df$MasVnrType) & !is.na(df$MasVnrArea))
## [1] 2611
df$MasVnrArea[which(is.na(df$MasVnrType) & !is.na(df$MasVnrArea))]
## [1] 198
We have 24 missing types, 23 missing areas and one obs. with an area but no type. For the 23 with no type or area we will assume that there is no veneer:
df$MasVnrArea[is.na(df$MasVnrArea)] <-rep(0)
df$MasVnrType[is.na(df$MasVnrType) & df$MasVnrArea == 0] <- rep("None")
Let’s look for missing types with zero area
which(is.na(df$MasVnrType) & df$MasVnrArea == 0)
## integer(0)
None of these. How about non-None types where the area is zero?
which(df$MasVnrType != "None" & df$MasVnrArea == 0)
## [1] 689 1242 2320
df$MasVnrType[which(df$MasVnrType != "None" & df$MasVnrArea == 0)] <- rep("None")
We assume that the zero area is right and these have no veneer. Are there nonzero areas for None type?
table(df$MasVnrArea[df$MasVnrType == 'None'])
##
## 0 1 285 288 312 344
## 1761 3 1 1 1 1
df$MasVnrArea <- ifelse(df$MasVnrArea == 1,0,df$MasVnrArea)
df$MasVnrType[df$MasVnrArea > 0 & df$MasVnrType == "None" & !is.na(df$MasVnrType)] <- rep(NA)
The ones with area 1 are probably wrong and we set those to zero area. The others should get their type re-imputed so we set to NA.
masvnr.rpart <- rpart(as.factor(MasVnrType) ~ MasVnrArea,
data = df[!is.na(df$MasVnrType),c("MasVnrType","MasVnrArea")],
method = "class", na.action=na.omit)
as.character(predict(masvnr.rpart,df[is.na(df$MasVnrType),c("MasVnrType","MasVnrArea")], type="class"))
## [1] "BrkFace" "BrkFace" "BrkFace" "BrkFace" "BrkFace"
df$MasVnrType[is.na(df$MasVnrType)] <-
as.character(predict(masvnr.rpart,df[is.na(df$MasVnrType),c("MasVnrType","MasVnrArea")], type="class"))
Quite a few houses have no basement, so there are many missing values among the features associated with basements, which are
c("TotalBsmtSF", "BsmtExposure", "BsmtCond", "BsmtQual","BsmtFinType1", "BsmtFinType2",
"BsmtFinSF1","BsmtFinSF2", "BsmtUnfSF")
## [1] "TotalBsmtSF" "BsmtExposure" "BsmtCond" "BsmtQual"
## [5] "BsmtFinType1" "BsmtFinType2" "BsmtFinSF1" "BsmtFinSF2"
## [9] "BsmtUnfSF"
summary(as.factor(df$BsmtQual))
## Ex Fa Gd TA NA's
## 258 88 1209 1283 81
table(is.na(df$BsmtExposure))
##
## FALSE TRUE
## 2837 82
table(is.na(df$BsmtCond))
##
## FALSE TRUE
## 2837 82
table(is.na(df$BsmtQual))
##
## FALSE TRUE
## 2838 81
table(is.na(df$BsmtFinType2))
##
## FALSE TRUE
## 2839 80
table(is.na(df$BsmtFinType1))
##
## FALSE TRUE
## 2840 79
table(is.na(df$BsmtFinSF1) & is.na(df$BsmtFinSF2) & is.na(df$BsmtUnfSF))
##
## FALSE TRUE
## 2918 1
table(is.na(df$BsmtFullBath) & is.na(df$BsmtHalfBath))
##
## FALSE TRUE
## 2917 2
table(df$TotalBsmtSF == 0 & is.na(df$BsmtExposure))
##
## FALSE TRUE
## 2840 78
The categorical features are
bsmt.col <- c("BsmtExposure", "BsmtCond", "BsmtQual","BsmtFinType1", "BsmtFinType2")
which(is.na(df$BsmtExposure) & is.na(df$TotalBsmtSF))
## [1] 2121
df$TotalBsmtSF[which(is.na(df$BsmtExposure) & is.na(df$TotalBsmtSF))] <- 0
2121 has NAs for everything including TotalBsmtSF, so we assume there is no basement. There are 79 entries with zero area as well, so we set these to no basement:
df[df$TotalBsmtSF == 0 & is.na(df$BsmtExposure), bsmt.col] <-
apply(df[df$TotalBsmtSF == 0 & is.na(df$BsmtExposure), bsmt.col], 2, function(x) x <- rep("None"))
We’ll use the following predictors for the missing values (I tried adding BldgType and Foundation here, but rpart is too slow):
bsmt.pred <- c("BsmtExposure", "BsmtCond", "BsmtQual","BsmtFinType1", "BsmtFinType2","TotalBsmtSF","YearBuilt")
BsmtFinType2.rpart <- rpart(as.factor(BsmtFinType2) ~ ., data = df[!is.na(df$BsmtFinType2),bsmt.pred],
method = "class", na.action=na.omit)
df$BsmtFinType2[is.na(df$BsmtFinType2)] <-
as.character(predict(BsmtFinType2.rpart, df[is.na(df$BsmtFinType2),bsmt.pred], type="class"))
BsmtQual.rpart <- rpart(as.factor(BsmtQual) ~ ., data = df[!is.na(df$BsmtQual),bsmt.pred],
method = "class", na.action=na.omit)
df$BsmtQual[is.na(df$BsmtQual)] <-
as.character(predict(BsmtQual.rpart, df[is.na(df$BsmtQual),bsmt.pred], type="class"))
BsmtCond.rpart <- rpart(as.factor(BsmtCond) ~ ., data = df[!is.na(df$BsmtCond),bsmt.pred],
method = "class", na.action=na.omit)
df$BsmtCond[is.na(df$BsmtCond)] <-
as.character(predict(BsmtCond.rpart, df[is.na(df$BsmtCond),bsmt.pred], type="class"))
BsmtExposure.rpart <- rpart(as.factor(BsmtExposure) ~ ., data = df[!is.na(df$BsmtExposure),bsmt.pred],
method = "class", na.action=na.omit)
df$BsmtExposure[is.na(df$BsmtExposure)] <-
as.character(predict(BsmtExposure.rpart, df[is.na(df$BsmtExposure),bsmt.pred], type="class"))
df[is.na(df$BsmtFinSF1)|is.na(df$BsmtFinSF2)|is.na(df$BsmtUnfSF),
c(bsmt.pred, c("BsmtFinSF1", "BsmtFinSF2","BsmtUnfSF", "BsmtFullBath","BsmtHalfBath"))]
## BsmtExposure BsmtCond BsmtQual BsmtFinType1 BsmtFinType2 TotalBsmtSF
## 2121 None None None None None 0
## YearBuilt BsmtFinSF1 BsmtFinSF2 BsmtUnfSF BsmtFullBath BsmtHalfBath
## 2121 1946 NA NA NA NA NA
df$BsmtFinSF1[is.na(df$BsmtFinSF1)|is.na(df$BsmtFinSF2)|is.na(df$BsmtUnfSF)] <- 0
df$BsmtFinSF2[is.na(df$BsmtFinSF1)|is.na(df$BsmtFinSF2)|is.na(df$BsmtUnfSF)] <- 0
df$BsmtUnfSF[is.na(df$BsmtFinSF1)|is.na(df$BsmtFinSF2)|is.na(df$BsmtUnfSF)] <- 0
df$BsmtFullBath[df$TotalBsmtSF == 0 & is.na(df$BsmtFullBath)] <- rep(0)
df$BsmtHalfBath[df$TotalBsmtSF == 0 & is.na(df$BsmtHalfBath)] <- rep(0)
The entry 2121 was assigned no basement.
summary(df$GarageYrBlt)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1895 1960 1979 1978 2002 2207 159
which(df$GarageYrBlt == 2207)
## [1] 2593
df$YearBuilt[which(df$GarageYrBlt == 2207)]
## [1] 2006
df$GarageYrBlt[which(df$GarageYrBlt == 2207)] <- 2007
We’re assuming 2207 was a typo and should be 2007, since the house was built in 2006.
table(is.na(df$GarageType))
##
## FALSE TRUE
## 2762 157
table(is.na(df$GarageArea))
##
## FALSE TRUE
## 2918 1
table(is.na(df$GarageCars))
##
## FALSE TRUE
## 2918 1
table(is.na(df$GarageArea) & is.na(df$GarageCars))
##
## FALSE TRUE
## 2918 1
table(is.na(df$GarageYrBlt))
##
## FALSE TRUE
## 2760 159
table(is.na(df$GarageFinish))
##
## FALSE TRUE
## 2760 159
table(is.na(df$GarageQual))
##
## FALSE TRUE
## 2760 159
table(is.na(df$GarageCond))
##
## FALSE TRUE
## 2760 159
table(is.na(df$GarageYrBlt) & is.na(df$GarageFinish) & is.na(df$GarageQual) & is.na(df$GarageCond))
##
## FALSE TRUE
## 2760 159
table(df$GarageArea == 0 & df$GarageCars==0 & is.na(df$GarageType))
##
## FALSE TRUE
## 2762 157
col.Garage <- c("GarageType", "GarageYrBlt", "GarageFinish", "GarageQual","GarageCond")
We find 157 with GarageType NA,one with GarageArea and GarageCars as NA, and 159 with GarageYrBlt, GarageFinish, GarageQual and GarageCond as NA. For the 157 houses with GarageArea = 0 and no GarageType we set all other garage features to None:
df[df$GarageArea == 0 & df$GarageCars==0 & is.na(df$GarageType), col.Garage] <-
apply(df[df$GarageArea == 0 & df$GarageCars==0 & is.na(df$GarageType),
col.Garage], 2, function(x) x <- rep("None"))
For the remaining values, we impute:
col.pred <- c("GarageType", "GarageYrBlt", "GarageFinish", "GarageQual","GarageCond","YearBuilt",
"GarageCars", "GarageArea")
area.rpart <- rpart(GarageArea ~ ., data = df[!is.na(df$GarageArea),col.pred],
method = "anova", na.action=na.omit)
df$GarageArea[is.na(df$GarageArea)] <- round(predict(area.rpart, df[is.na(df$GarageArea),col.pred]))
cars.rpart <- rpart(GarageCars ~ .,data = df[!is.na(df$GarageCars),col.pred],
method = "anova", na.action=na.omit)
df$GarageCars[is.na(df$GarageCars)] <- round(predict(cars.rpart, df[is.na(df$GarageCars),col.pred]))
blt.rpart <- rpart(as.factor(GarageYrBlt) ~ .,data = df[!is.na(df$GarageYrBlt),col.pred],
method = "class",na.action=na.omit)
df$GarageYrBlt[is.na(df$GarageYrBlt)] <-
as.numeric(as.character(predict(blt.rpart,
df[is.na(df$GarageYrBlt),col.pred], type = "class")))
df[is.na(df$GarageFinish) & is.na(df$GarageQual) & is.na(df$GarageCond),
c(col.Garage, c("GarageCars", "GarageArea"))]
## GarageType GarageYrBlt GarageFinish GarageQual GarageCond GarageCars
## 2127 Detchd 1950 <NA> <NA> <NA> 1
## 2577 Detchd 1950 <NA> <NA> <NA> 1
## GarageArea
## 2127 360
## 2577 300
For the two remaining houses with missing data, we’ll look at comparables:
df[which(df$GarageType == "Detchd" & df$GarageCars == 1 & df$GarageYrBlt == 1950),col.Garage]
## GarageType GarageYrBlt GarageFinish GarageQual GarageCond
## 460 Detchd 1950 Unf TA TA
## 986 Detchd 1950 Unf TA TA
## 1449 Detchd 1950 Unf Fa TA
## 1528 Detchd 1950 Unf TA TA
## 1542 Detchd 1950 Unf TA TA
## 1602 Detchd 1950 Unf TA TA
## 1813 Detchd 1950 Unf TA TA
## 2089 Detchd 1950 Unf TA TA
## 2112 Detchd 1950 Unf TA TA
## 2127 Detchd 1950 <NA> <NA> <NA>
## 2456 Detchd 1950 Unf TA TA
## 2464 Detchd 1950 Unf TA TA
## 2471 Detchd 1950 Fin TA TA
## 2577 Detchd 1950 <NA> <NA> <NA>
## 2579 Detchd 1950 Unf TA Fa
## 2766 Detchd 1950 Unf Fa TA
df$GarageFinish[df$GarageType == "Detchd" & df$GarageYrBlt == 1950 & is.na(df$GarageFinish)] <- "Unf"
df$GarageQual[df$GarageType == "Detchd" & df$GarageYrBlt == 1950 & is.na(df$GarageQual)] <- "TA"
df$GarageCond[df$GarageType == "Detchd" & df$GarageYrBlt == 1950 & is.na(df$GarageCond)] <- "TA"
levels(as.factor(train$GarageQual))
## [1] "Ex" "Fa" "Gd" "Po" "TA"
levels(as.factor(test$GarageQual))
## [1] "Fa" "Gd" "Po" "TA"
The GarageQual level “Ex” is not in the test set
summary(df$LotFrontage)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 21.00 59.00 68.00 69.31 80.00 313.00 486
There are 486 NA’s. Let’s impute these values.
lotf.pred <- c("MSSubClass", "MSZoning", "LotFrontage", "Street", "Alley", "LotArea", "LotShape",
"LandContour", "LotConfig", "LandSlope", "Neighborhood", "GarageType")
lotf.rpart <- rpart(LotFrontage ~ ., data = df[!is.na(df$LotFrontage),lotf.pred],
method = "anova", na.action=na.omit)
df.frontage <- as.data.frame(rbind(cbind(rep("Existing", nrow(df[!is.na(df$LotFrontage),])),
df[!is.na(df$LotFrontage), "LotFrontage"]),
cbind(rep("Imputed", nrow(df[is.na(df$LotFrontage),])),
ceiling(predict(lotf.rpart, df[is.na(df$LotFrontage),lotf.pred])))))
The imputed values are somewhat less-normal/more-skewed than the existing ones:
ggplot(df.frontage, aes (x = as.numeric(as.character(V2)), colour = V1)) +
geom_density() + xlab("Lot Frontage") + theme(legend.title=element_blank())
df$LotFrontage[is.na(df$LotFrontage)] <- ceiling(predict(lotf.rpart, df[is.na(df$LotFrontage),lotf.pred]))
summary(as.factor(df$MSSubClass))
## 20 30 40 45 50 60 70 75 80 85 90 120 150 160 180
## 1079 139 6 18 287 575 128 23 118 48 109 182 1 128 17
## 190
## 61
There are no NAs, but some factors have low counts and therefore poor statistical significance. We note that much of the information in these factors is duplicated in HouseStyle and BldgType. Where there are inconsistencies, we will trust the latter information, since MSSubClass relies on translation to a numerical code, which could be a source of coding error.
which(df$HouseStyle == "1.5Fin" & (df$MSSubClass != 50 & df$MSSubClass != 90 & df$MSSubClass != 150 &
df$MSSubClass != 190))
## [1] 730 1534 1825 2025 2197 2468 2555
df$MSSubClass[which(df$HouseStyle == "1.5Fin" & (df$MSSubClass != 50 & df$MSSubClass != 90 &
df$MSSubClass != 150 & df$MSSubClass != 190))]
## [1] 30 45 70 60 30 45 40
df$MSSubClass[which(df$HouseStyle == "1.5Fin" & (df$MSSubClass != 50 & df$MSSubClass != 90 &
df$MSSubClass != 150 & df$MSSubClass != 190))] <- 50
which(df$HouseStyle == "1.5Unf" & (df$MSSubClass != 45 & df$MSSubClass != 90 &
df$MSSubClass != 150 & df$MSSubClass != 190))
## [1] 1444 2792
df$MSSubClass[which(df$HouseStyle == "1.5Unf" & (df$MSSubClass != 45 & df$MSSubClass != 90 &
df$MSSubClass != 150 & df$MSSubClass != 190))]
## [1] 30 50
df$MSSubClass[which(df$HouseStyle == "1.5Unf" & (df$MSSubClass != 45 & df$MSSubClass != 90 &
df$MSSubClass != 150 & df$MSSubClass != 190))] <- 45
which(df$HouseStyle == "1Story" & df$MSSubClass != 20 & df$MSSubClass != 30 & df$MSSubClass != 40 &
df$MSSubClass != 90 & df$MSSubClass != 120 & df$MSSubClass != 190)
## integer(0)
which(df$HouseStyle == "2.5Fin" & df$MSSubClass != 75 & df$MSSubClass != 90 & df$MSSubClass != 160 &
df$MSSubClass != 190)
## [1] 1441
df$MSSubClass[which(df$HouseStyle == "2.5Fin" & df$MSSubClass != 75 & df$MSSubClass != 90 &
df$MSSubClass != 160 &df$MSSubClass != 190)]
## [1] 70
df$MSSubClass[which(df$HouseStyle == "2.5Fin" & df$MSSubClass != 75 & df$MSSubClass != 90 &
df$MSSubClass != 160 & df$MSSubClass != 190)] <- 75
which(df$HouseStyle == "2Story" & df$MSSubClass != 60 & df$MSSubClass != 70 & df$MSSubClass != 90 &
df$MSSubClass != 160 & df$MSSubClass != 190)
## [1] 75 80 198 608 1449 2102 2881
df$MSSubClass[which(df$HouseStyle == "2Story" & df$MSSubClass != 60 & df$MSSubClass != 70 &
df$MSSubClass != 90 & df$MSSubClass != 160 & df$MSSubClass != 190)]
## [1] 50 50 75 20 50 75 50
df$MSSubClass[which(df$YearBuilt >= 1946 & df$HouseStyle == "2Story" & df$MSSubClass != 60 &
df$MSSubClass != 70 & df$MSSubClass != 90 & df$MSSubClass != 160 &
df$MSSubClass != 190)] <- 60
df$MSSubClass[which(df$YearBuilt <= 1945 & df$HouseStyle == "2Story" & df$MSSubClass != 60 &
df$MSSubClass != 70 & df$MSSubClass != 90 & df$MSSubClass != 160 &
df$MSSubClass != 190)] <- 70
which(df$HouseStyle == "SFoyer" & (df$MSSubClass != 85 & df$MSSubClass != 90 &
df$MSSubClass != 180 & df$MSSubClass != 190))
## [1] 544 2601
df$MSSubClass[which(df$HouseStyle == "SFoyer" & (df$MSSubClass != 85 & df$MSSubClass != 90 &
df$MSSubClass != 180 & df$MSSubClass != 190))]
## [1] 120 120
df$MSSubClass[which(df$HouseStyle == "SFoyer" & (df$MSSubClass != 85 & df$MSSubClass != 90 &
df$MSSubClass != 180 & df$MSSubClass != 190))] <- 180
which(df$HouseStyle == "SLvl" & (df$MSSubClass != 80 & df$MSSubClass != 90 &
df$MSSubClass != 180 & df$MSSubClass != 190))
## [1] 945 1440 2034
df$MSSubClass[which(df$HouseStyle == "SLvl" & (df$MSSubClass != 80 & df$MSSubClass != 90 &
df$MSSubClass != 180 & df$MSSubClass != 190))]
## [1] 20 60 160
df$MSSubClass[which(df$MSSubClass == 160 & df$HouseStyle == "SLvl" )] <- 180
df$MSSubClass[which(df$HouseStyle == "SLvl" &
(df$MSSubClass == 20 | df$MSSubClass == 60) )] <- 80
which(df$BldgType == "1Fam" & df$MSSubClass == 190)
## [1] 2147
df$HouseStyle[which(df$BldgType == "1Fam" & df$MSSubClass == 190)]
## [1] "1Story"
df$YearBuilt[which(df$BldgType == "1Fam" & df$MSSubClass == 190)]
## [1] 1960
df$MSSubClass[which(df$BldgType == "1Fam" & df$MSSubClass == 190)] <- 20
which(df$BldgType == "2fmCon" & df$MSSubClass != 190)
## [1] 822 2127
df$MSSubClass[which(df$BldgType == "2fmCon" & df$MSSubClass != 190)] <- 190
which(df$BldgType == "Duplex" & df$MSSubClass != 90)
## integer(0)
which((df$BldgType == "Twnhs" | df$BldgType == "TwnhsE") &
(df$MSSubClass != 120 & df$MSSubClass != 150 & df$MSSubClass != 160 & df$MSSubClass != 180))
## [1] 2665
df$MSSubClass[which((df$BldgType == "Twnhs" | df$BldgType == "TwnhsE") &
(df$MSSubClass != 120 & df$MSSubClass != 150 & df$MSSubClass != 160 & df$MSSubClass != 180))]
## [1] 20
df$MSSubClass[which((df$BldgType == "Twnhs" | df$BldgType == "TwnhsE") &
(df$MSSubClass != 120 & df$MSSubClass != 150 & df$MSSubClass != 160 & df$MSSubClass != 180))] <- 120
# this is a categorical variable
df$MSSubClass <- as.factor(df$MSSubClass)
summary(df$MSSubClass)
## 20 30 40 45 50 60 70 75 80 85 90 120 150 160 180
## 1076 136 5 18 289 573 132 22 120 48 109 181 1 127 20
## 190
## 62
In this 2nd round, we begin engineering features. We will start with the categorical variables and then move on to the numerical ones.
Most of the information here is duplicated by HouseStyle and BldgType. We’ll build a numeric feature that increases with the median price in the testing class
mssc = levels(df$MSSubClass)
mssc_df = as.data.frame(mssc)
colnames(mssc_df) <- "MSSubClass"
mssc_meds = (1:length(mssc))
for (i in (1:length(mssc))){
mssc_meds[i] <- median(df$SalePrice[df$MSSubClass == mssc[i]], na.rm = TRUE)
}
# class 150 is not in the testing set
mssc_meds[[13]] <-
median(df$SalePrice[df$MSSubClass == '120' | df$MSSubClass == '160' | df$MSSubClass == '180' ], na.rm = TRUE)
mssc_df$meds = mssc_meds
mssc_df[ order(mssc_df$meds), ]
## MSSubClass meds
## 15 180 11.41861
## 2 30 11.49272
## 4 45 11.60368
## 16 190 11.75587
## 5 50 11.79622
## 11 90 11.82026
## 10 85 11.85447
## 3 40 11.86487
## 14 160 11.89136
## 7 70 11.95118
## 1 20 11.97980
## 8 75 12.00456
## 13 150 12.00914
## 9 80 12.02273
## 12 120 12.16655
## 6 60 12.28303
df$MSSubClassordered[df$MSSubClass == "180"] <- 1
df$MSSubClassordered[df$MSSubClass == "30"] <- 2
df$MSSubClassordered[df$MSSubClass == "45"] <- 3
df$MSSubClassordered[df$MSSubClass == "190"] <- 4
df$MSSubClassordered[df$MSSubClass == "50"] <- 5
df$MSSubClassordered[df$MSSubClass == "90"] <- 6
df$MSSubClassordered[df$MSSubClass == "85"] <- 7
df$MSSubClassordered[df$MSSubClass == "40"] <- 8
df$MSSubClassordered[df$MSSubClass == "160"] <- 9
df$MSSubClassordered[df$MSSubClass == "70"] <- 10
df$MSSubClassordered[df$MSSubClass == "20"] <- 11
df$MSSubClassordered[df$MSSubClass == "75"] <- 12
df$MSSubClassordered[df$MSSubClass == "150"] <- 13
df$MSSubClassordered[df$MSSubClass == "80"] <- 14
df$MSSubClassordered[df$MSSubClass == "120"] <- 15
df$MSSubClassordered[df$MSSubClass == "60"] <- 16
ggplot(data=df, aes(as.factor(df$MSSubClassordered),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
ggplot(data=df, aes(df$MSSubClassordered,df$SalePrice)) + geom_point()
## Warning: Removed 1459 rows containing missing values (geom_point).
df$MSZoning <- as.factor(df$MSZoning)
summary(df$MSZoning)
## C (all) FV RH RL RM
## 25 139 26 2266 463
ggplot(data=df, aes(df$MSZoning,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
mszoning = levels(df$MSZoning)
mszoning_df = as.data.frame(mszoning)
colnames(mszoning_df) <- "MSZoning"
mszoning_meds = (1:length(mszoning))
for (i in (1:length(mszoning))){
mszoning_meds[i] <- median(df$SalePrice[df$MSZoning == mszoning[i]], na.rm = TRUE)
}
mszoning_df$meds = mszoning_meds
mszoning_df[ order(mszoning_df$meds), ]
## MSZoning meds
## 1 C (all) 11.21767
## 5 RM 11.69941
## 3 RH 11.82375
## 4 RL 12.06681
## 2 FV 12.23539
df$MSZoningordered[df$MSZoning == "C (all)"] <- 1
df$MSZoningordered[df$MSZoning == "RM"] <- 2
df$MSZoningordered[df$MSZoning == "RH"] <- 3
df$MSZoningordered[df$MSZoning == "RL"] <- 4
df$MSZoningordered[df$MSZoning == "FV"] <- 5
ggplot(data=df, aes(as.factor(df$MSZoningordered),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$Street <- as.factor(df$Street)
summary(df$Street)
## Grvl Pave
## 12 2907
ggplot(data=df, aes(df$Street,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$Alley <- as.factor(df$Alley)
summary(df$Alley)
## Grvl None Pave
## 120 2721 78
ggplot(data=df, aes(df$Alley,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$LotShape <- as.factor(df$LotShape)
summary(df$LotShape)
## IR1 IR2 IR3 Reg
## 968 76 16 1859
ggplot(data=df, aes(df$LotShape,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$LandContour <- as.factor(df$LandContour)
summary(df$LandContour)
## Bnk HLS Low Lvl
## 117 120 60 2622
ggplot(data=df, aes(df$LandContour,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$LandContourordered[df$LandContour == "Bnk"] <- 1
df$LandContourordered[df$LandContour == "Lvl"] <- 2
df$LandContourordered[df$LandContour == "Low"] <- 3
df$LandContourordered[df$LandContour == "HLS"] <- 4
ggplot(data=df, aes(as.factor(df$LandContourordered),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$LotConfig <- as.factor(df$LotConfig)
summary(df$LotConfig)
## Corner CulDSac FR2 FR3 Inside
## 511 176 85 14 2133
ggplot(data=df, aes(df$LotConfig,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$LotConfigordered[df$LotConfig == "Inside"] <- 1
df$LotConfigordered[df$LotConfig == "Corner"] <- 2
df$LotConfigordered[df$LotConfig == "FR2"] <- 3
df$LotConfigordered[df$LotConfig == "FR3"] <- 4
df$LotConfigordered[df$LotConfig == "CulDSac"] <- 5
ggplot(data=df, aes(as.factor(df$LotConfigordered),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$LandSlope <- as.factor(df$LandSlope)
summary(df$LandSlope)
## Gtl Mod Sev
## 2778 125 16
ggplot(data=df, aes(df$LandSlope,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$LandSlopeordered[df$LandSlope == "Gtl"] <- 1
df$LandSlopeordered[df$LandSlope == "Sev"] <- 2
df$LandSlopeordered[df$LandSlope == "Mod"] <- 3
ggplot(data=df, aes(as.factor(df$LandSlopeordered),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$Neighborhood <- as.factor(df$Neighborhood)
summary(df$Neighborhood)
## Blmngtn Blueste BrDale BrkSide ClearCr CollgCr Crawfor Edwards Gilbert
## 28 10 30 108 44 267 103 194 165
## IDOTRR MeadowV Mitchel NAmes NoRidge NPkVill NridgHt NWAmes OldTown
## 93 37 114 443 71 23 166 131 239
## Sawyer SawyerW Somerst StoneBr SWISU Timber Veenker
## 151 125 182 51 48 72 24
neigh = levels(df$Neighborhood)
neigh_df = as.data.frame(neigh)
colnames(neigh_df) <- "Neighborhood"
neigh_meds = (1:length(neigh))
for (i in (1:length(neigh))){
neigh_meds[i] <- median(df$SalePrice[df$Neighborhood == neigh[i]], na.rm = TRUE)
}
neigh_df$meds = neigh_meds
for (i in (1:length(neigh))){
neigh_df$count[i] <- length(df$Neighborhood[df$Neighborhood == neigh[i]])
}
ggplot(data=neigh_df, aes(neigh_df$meds,neigh_df$count)) + geom_point()
neigh_df[ order(neigh_df$meds), ]
## Neighborhood meds count
## 11 MeadowV 11.38509 37
## 10 IDOTRR 11.54248 93
## 3 BrDale 11.57119 30
## 18 OldTown 11.68688 239
## 8 Edwards 11.70962 194
## 4 BrkSide 11.73022 108
## 19 Sawyer 11.81303 151
## 2 Blueste 11.82654 10
## 23 SWISU 11.84582 48
## 13 NAmes 11.84940 443
## 15 NPkVill 11.89136 23
## 12 Mitchel 11.94146 114
## 20 SawyerW 12.10016 125
## 9 Gilbert 12.10625 165
## 17 NWAmes 12.11669 131
## 1 Blmngtn 12.16003 28
## 6 CollgCr 12.19197 267
## 5 ClearCr 12.20732 44
## 7 Crawfor 12.20919 103
## 25 Veenker 12.29225 24
## 21 Somerst 12.32607 182
## 24 Timber 12.33918 72
## 22 StoneBr 12.53538 51
## 14 NoRidge 12.61653 71
## 16 NridgHt 12.66033 166
median(df$SalePrice, na.rm = TRUE)
## [1] 12.00151
sd(df$SalePrice, na.rm = TRUE)/2
## [1] 0.1997259
ggplot(data=df, aes(as.factor(df$Neighborhood),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
We will cut price bins around 250000, 195000, 170000, 130000, 115000
df$Neighborhoodbins[df$Neighborhood == "MeadowV"] <- 1
df$Neighborhoodbins[df$Neighborhood == "IDOTRR"] <- 1
df$Neighborhoodbins[df$Neighborhood == "BrDale"] <- 1
df$Neighborhoodbins[df$Neighborhood == "OldTown"] <- 2
df$Neighborhoodbins[df$Neighborhood == "Edwards"] <- 2
df$Neighborhoodbins[df$Neighborhood == "BrkSide"] <- 2
df$Neighborhoodbins[df$Neighborhood == "Sawyer"] <- 3
df$Neighborhoodbins[df$Neighborhood == "Blueste"] <- 3
df$Neighborhoodbins[df$Neighborhood == "SWISU"] <- 3
df$Neighborhoodbins[df$Neighborhood == "NAmes"] <- 3
df$Neighborhoodbins[df$Neighborhood == "NPkVill"] <- 3
df$Neighborhoodbins[df$Neighborhood == "Mitchel"] <- 3
df$Neighborhoodbins[df$Neighborhood == "SawyerW"] <- 4
df$Neighborhoodbins[df$Neighborhood == "Gilbert"] <- 4
df$Neighborhoodbins[df$Neighborhood == "NWAmes"] <- 4
df$Neighborhoodbins[df$Neighborhood == "Blmngtn"] <- 4
df$Neighborhoodbins[df$Neighborhood == "CollgCr"] <- 5
df$Neighborhoodbins[df$Neighborhood == "ClearCr"] <- 5
df$Neighborhoodbins[df$Neighborhood == "Crawfor"] <- 5
df$Neighborhoodbins[df$Neighborhood == "Veenker"] <- 5
df$Neighborhoodbins[df$Neighborhood == "Somerst"] <- 5
df$Neighborhoodbins[df$Neighborhood == "Timber"] <- 5
df$Neighborhoodbins[df$Neighborhood == "StoneBr"] <- 6
df$Neighborhoodbins[df$Neighborhood == "NoRidge"] <- 6
df$Neighborhoodbins[df$Neighborhood == "NridgHt"] <- 6
ggplot(data=df, aes(as.factor(df$Neighborhoodbins),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$GoodNeighborhood <- 0
df$GoodNeighborhood[df$Neighborhood == "CollgCr"] <- 1
df$GoodNeighborhood[df$Neighborhood == "ClearCr"] <- 1
df$GoodNeighborhood[df$Neighborhood == "Crawfor"] <- 1
df$GoodNeighborhood[df$Neighborhood == "Veenker"] <- 1
df$GoodNeighborhood[df$Neighborhood == "Somerst"] <- 1
df$GoodNeighborhood[df$Neighborhood == "Timber"] <- 1
df$GoodNeighborhood[df$Neighborhood == "StoneBr"] <- 1
df$GoodNeighborhood[df$Neighborhood == "NoRidge"] <- 1
df$GoodNeighborhood[df$Neighborhood == "NridgHt"] <- 1
df$GreatNeighborhood <- 0
df$GreatNeighborhood[df$Neighborhood == "StoneBr"] <- 1
df$GreatNeighborhood[df$Neighborhood == "NoRidge"] <- 1
df$GreatNeighborhood[df$Neighborhood == "NridgHt"] <- 1
We also created features that indicate two categories of premium neighborhoods.
df$Condition1 <- as.factor(df$Condition1)
summary(df$Condition1)
## Artery Feedr Norm PosA PosN RRAe RRAn RRNe RRNn
## 92 164 2511 20 39 28 50 6 9
ggplot(data=df, aes(df$Condition1,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
condition1 = levels(df$Condition1)
condition1_df = as.data.frame(condition1)
colnames(condition1_df) <- "Condition1"
condition1_meds = (1:length(condition1))
for (i in (1:length(condition1))){
condition1_meds[i] <- median(df$SalePrice[df$Condition1 == condition1[i]], na.rm = TRUE)
}
condition1_df$meds = condition1_meds
condition1_df[ order(condition1_df$meds), ]
## Condition1 meds
## 1 Artery 11.69149
## 2 Feedr 11.84940
## 6 RRAe 11.86710
## 3 Norm 12.02275
## 7 RRAn 12.05227
## 8 RRNe 12.15853
## 5 PosN 12.20607
## 4 PosA 12.26106
## 9 RRNn 12.27373
df$Condition1nums[df$Condition1 == "Artery"] <- 1
df$Condition1nums[df$Condition1 == "Feedr"] <- 2
df$Condition1nums[df$Condition1 == "RRAe"] <- 3
df$Condition1nums[df$Condition1 == "Norm"] <- 4
df$Condition1nums[df$Condition1 == "RRAn"] <- 5
df$Condition1nums[df$Condition1 == "RRNe"] <- 6
df$Condition1nums[df$Condition1 == "PosN"] <- 7
df$Condition1nums[df$Condition1 == "PosA"] <- 8
df$Condition1nums[df$Condition1 == "RRNn"] <- 9
ggplot(data=df, aes(as.factor(df$Condition1nums),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$Condition2 <- as.factor(df$Condition2)
summary(df$Condition2)
## Artery Feedr Norm PosA PosN RRAe RRAn RRNn
## 5 13 2889 4 4 1 1 2
levels(as.factor(train$Condition2))
## [1] "Artery" "Feedr" "Norm" "PosA" "PosN" "RRAe" "RRAn" "RRNn"
levels(as.factor(test$Condition2))
## [1] "Artery" "Feedr" "Norm" "PosA" "PosN"
We note that the levelss “RRAe”, “RRAn”, and “RRNn” are missing from the test set. We will need to remove these from the testing set after encoding.
df$BldgType <- as.factor(df$BldgType)
summary(df$BldgType)
## 1Fam 2fmCon Duplex Twnhs TwnhsE
## 2425 62 109 96 227
ggplot(data=df, aes(df$BldgType,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
bldgtype = levels(df$BldgType)
bldgtype_df = as.data.frame(bldgtype)
colnames(bldgtype_df) <- "BldgType"
bldgtype_meds = (1:length(bldgtype))
for (i in (1:length(bldgtype))){
bldgtype_meds[i] <- median(df$SalePrice[df$BldgType == bldgtype[i]], na.rm = TRUE)
}
bldgtype_df$meds = bldgtype_meds
bldgtype_df[ order(bldgtype_df$meds), ]
## BldgType meds
## 2 2fmCon 11.75587
## 3 Duplex 11.82026
## 4 Twnhs 11.83138
## 1 1Fam 12.03112
## 5 TwnhsE 12.05641
df$BldgTypenums[df$BldgType == "2fmCon"] <- 1
df$BldgTypenums[df$BldgType == "Duplex"] <- 2
df$BldgTypenums[df$BldgType == "Twnhs"] <- 3
df$BldgTypenums[df$BldgType == "1Fam"] <- 4
df$BldgTypenums[df$BldgType == "TwnhsE"] <- 5
ggplot(data=df, aes(as.factor(df$BldgTypenums),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$HouseStyle <- as.factor(df$HouseStyle)
summary(df$HouseStyle)
## 1.5Fin 1.5Unf 1Story 2.5Fin 2.5Unf 2Story SFoyer SLvl
## 314 19 1471 8 24 872 83 128
ggplot(data=df, aes(df$HouseStyle,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
housestyle = levels(df$HouseStyle)
housestyle_df = as.data.frame(housestyle)
colnames(housestyle_df) <- "HouseStyle"
housestyle_meds = (1:length(housestyle))
for (i in (1:length(housestyle))){
housestyle_meds[i] <- median(df$SalePrice[df$HouseStyle == housestyle[i]], na.rm = TRUE)
}
housestyle_df$meds = housestyle_meds
housestyle_df[ order(housestyle_df$meds), ]
## HouseStyle meds
## 2 1.5Unf 11.61941
## 1 1.5Fin 11.79055
## 5 2.5Unf 11.80485
## 7 SFoyer 11.82012
## 3 1Story 11.94956
## 8 SLvl 12.01067
## 6 2Story 12.15478
## 4 2.5Fin 12.17549
df$HouseStylenums[df$HouseStyle == "1.5Unf"] <- 1
df$HouseStylenums[df$HouseStyle == "1.5Fin"] <- 2
df$HouseStylenums[df$HouseStyle == "2.5Unf"] <- 3
df$HouseStylenums[df$HouseStyle == "SFoyer"] <- 4
df$HouseStylenums[df$HouseStyle == "1Story"] <- 5
df$HouseStylenums[df$HouseStyle == "SLvl"] <- 6
df$HouseStylenums[df$HouseStyle == "2Story"] <- 7
df$HouseStylenums[df$HouseStyle == "2.5Fin"] <- 8
ggplot(data=df, aes(as.factor(df$HouseStylenums),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
levels(as.factor(train$HouseStyle))
## [1] "1.5Fin" "1.5Unf" "1Story" "2.5Fin" "2.5Unf" "2Story" "SFoyer" "SLvl"
levels(as.factor(test$HouseStyle))
## [1] "1.5Fin" "1.5Unf" "1Story" "2.5Unf" "2Story" "SFoyer" "SLvl"
Here “2.5Fin” is missing from the test set.
While seemingly ordinal, this variable is also a translation of categorical ratings “Poor”, “Fair”, etc. We’ll set a dummy ordinal for this and OverallCond below.
df$OverallQualcat <- as.factor(df$OverallQual)
summary(df$OverallQualcat)
## 1 2 3 4 5 6 7 8 9 10
## 4 13 40 226 825 731 600 342 107 31
ggplot(data=df, aes(as.factor(df$OverallQual),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
We will make bins for the top and low ends.
df$OverallQualGood <- 0
df$OverallQualBad <- 0
df$OverallQualGood[df$OverallQual == 8] <- 1
df$OverallQualGood[df$OverallQual == 9] <- 1
df$OverallQualGood[df$OverallQual == 10] <- 1
df$OverallQualBad[df$OverallQual == 1] <- 1
df$OverallQualBad[df$OverallQual == 2] <- 1
df$OverallCondcat <- as.factor(df$OverallCond)
summary(df$OverallCondcat)
## 1 2 3 4 5 6 7 8 9
## 7 10 50 101 1645 531 390 144 41
ggplot(data=df, aes(as.factor(df$OverallCond),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$OverallCondBad <- 0
df$OverallCondBad[df$OverallCond == 1] <- 1
df$OverallCondBad[df$OverallCond == 2] <- 1
df$OverallCondBad[df$OverallCond == 3] <- 1
df$OverallCondBad[df$OverallCond == 4] <- 1
ggplot(data=df, aes(df$OverallCond,df$SalePrice)) + geom_point() +
geom_smooth(method=lm, data=df, formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
Only need a bin for bad condition. In the last plot, we see that there are some outliers and the ordinal nature of the variable doesn’t suggest a useful transformation to make this any more linear.
df$RoofStyle <- as.factor(df$RoofStyle)
summary(df$RoofStyle)
## Flat Gable Gambrel Hip Mansard Shed
## 20 2310 22 551 11 5
ggplot(data=df, aes(df$RoofStyle,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
roofstyle = levels(df$RoofStyle)
roofstyle_df = as.data.frame(roofstyle)
colnames(roofstyle_df) <- "RoofStyle"
roofstyle_meds = (1:length(roofstyle))
for (i in (1:length(roofstyle))){
roofstyle_meds[i] <- median(df$SalePrice[df$RoofStyle == roofstyle[i]], na.rm = TRUE)
}
#
roofstyle_df$meds = roofstyle_meds
roofstyle_df[ order(roofstyle_df$meds), ]
## RoofStyle meds
## 3 Gambrel 11.84223
## 2 Gable 11.98293
## 5 Mansard 12.07254
## 4 Hip 12.08108
## 1 Flat 12.12811
## 6 Shed 12.31161
df$RoofStylenums[df$RoofStyle == "Gambrel"] <- 1
df$RoofStylenums[df$RoofStyle == "Gable"] <- 2
df$RoofStylenums[df$RoofStyle == "Mansard"] <- 3
df$RoofStylenums[df$RoofStyle == "Hip"] <- 4
df$RoofStylenums[df$RoofStyle == "Flat"] <- 5
df$RoofStylenums[df$RoofStyle == "Shed"] <- 6
ggplot(data=df, aes(as.factor(df$RoofStylenums),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$RoofMatl <- as.factor(df$RoofMatl)
summary(df$RoofMatl)
## ClyTile CompShg Membran Metal Roll Tar&Grv WdShake WdShngl
## 1 2876 1 1 1 23 9 7
ggplot(data=df, aes(as.factor(df$RoofMatl),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
levels(as.factor(train$RoofMatl))
## [1] "ClyTile" "CompShg" "Membran" "Metal" "Roll" "Tar&Grv" "WdShake"
## [8] "WdShngl"
levels(as.factor(test$RoofMatl))
## [1] "CompShg" "Tar&Grv" "WdShake" "WdShngl"
Here “ClyTile”, “Membran”, “Metal”, and “Roll” are not in the test set. Overall low variation suggests dropping this at some point.
df$Exterior1st <- as.factor(df$Exterior1st)
summary(df$Exterior1st)
## AsbShng AsphShn BrkComm BrkFace CBlock CemntBd HdBoard ImStucc MetalSd
## 44 2 6 87 2 126 442 1 450
## Plywood Stone Stucco VinylSd Wd Sdng WdShing
## 221 2 43 1025 412 56
exterior1st = levels(df$Exterior1st)
exterior1st_df = as.data.frame(exterior1st)
colnames(exterior1st_df) <- "Exterior1st"
exterior1st_meds = (1:length(exterior1st))
for (i in (1:length(exterior1st))){
exterior1st_meds[i] <- median(df$SalePrice[df$Exterior1st == exterior1st[i]], na.rm = TRUE)
}
exterior1st_df$meds = exterior1st_meds
exterior1st_df[ order(exterior1st_df$meds), ]
## Exterior1st meds
## 3 BrkComm 11.15829
## 2 AsphShn 11.51293
## 5 CBlock 11.56172
## 1 AsbShng 11.58972
## 15 WdShing 11.76520
## 14 Wd Sdng 11.84182
## 9 MetalSd 11.84223
## 12 Stucco 11.87757
## 7 HdBoard 11.91772
## 4 BrkFace 12.01823
## 10 Plywood 12.02844
## 13 VinylSd 12.20607
## 6 CemntBd 12.37370
## 11 Stone 12.45654
## 8 ImStucc 12.47610
df$Exterior1stnums[df$Exterior1st == "BrkComm"] <- 1
df$Exterior1stnums[df$Exterior1st == "AsphShn"] <- 2
df$Exterior1stnums[df$Exterior1st == "CBlock"] <- 3
df$Exterior1stnums[df$Exterior1st == "AsbShng"] <- 4
df$Exterior1stnums[df$Exterior1st == "WdShing"] <- 5
df$Exterior1stnums[df$Exterior1st == "Wd Sdng"] <- 6
df$Exterior1stnums[df$Exterior1st == "MetalSd"] <- 7
df$Exterior1stnums[df$Exterior1st == "Stucco"] <- 8
df$Exterior1stnums[df$Exterior1st == "HdBoard"] <- 9
df$Exterior1stnums[df$Exterior1st == "BrkFace"] <- 10
df$Exterior1stnums[df$Exterior1st == "Plywood"] <- 11
df$Exterior1stnums[df$Exterior1st == "VinylSd"] <- 12
df$Exterior1stnums[df$Exterior1st == "CemntBd"] <- 13
df$Exterior1stnums[df$Exterior1st == "Stone"] <- 14
df$Exterior1stnums[df$Exterior1st == "ImStucc"] <- 15
ggplot(data=df, aes(as.factor(df$Exterior1stnums),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$Exterior2nd <- as.factor(df$Exterior2nd)
summary(df$Exterior2nd)
## AsbShng AsphShn Brk Cmn BrkFace CBlock CmentBd HdBoard ImStucc MetalSd
## 38 4 22 47 3 126 406 15 447
## Other Plywood Stone Stucco VinylSd Wd Sdng Wd Shng
## 1 270 6 47 1014 392 81
exterior2nd = levels(df$Exterior2nd)
exterior2nd_df = as.data.frame(exterior2nd)
colnames(exterior2nd_df) <- "Exterior2nd"
exterior2nd_meds = (1:length(exterior2nd))
for (i in (1:length(exterior2nd))){
exterior2nd_meds[i] <- median(df$SalePrice[df$Exterior2nd == exterior2nd[i]], na.rm = TRUE)
}
exterior2nd_df$meds = exterior2nd_meds
exterior2nd_df[ order(exterior2nd_df$meds), ]
## Exterior2nd meds
## 5 CBlock 11.56172
## 1 AsbShng 11.61724
## 15 Wd Sdng 11.83501
## 16 Wd Shng 11.83662
## 9 MetalSd 11.84043
## 2 AsphShn 11.84223
## 13 Stucco 11.86348
## 3 Brk Cmn 11.89819
## 7 HdBoard 11.95118
## 4 BrkFace 11.98293
## 11 Plywood 11.98758
## 12 Stone 12.08391
## 8 ImStucc 12.14106
## 14 VinylSd 12.20643
## 6 CmentBd 12.38313
## 10 Other 12.67295
df$Exterior2ndnums[df$Exterior2nd == "Brk Cmn"] <- 8
df$Exterior2ndnums[df$Exterior2nd == "AsphShn"] <- 6
df$Exterior2ndnums[df$Exterior2nd == "CBlock"] <- 1
df$Exterior2ndnums[df$Exterior2nd == "AsbShng"] <- 2
df$Exterior2ndnums[df$Exterior2nd == "Wd Shng"] <- 4
df$Exterior2ndnums[df$Exterior2nd == "Wd Sdng"] <- 3
df$Exterior2ndnums[df$Exterior2nd == "MetalSd"] <- 5
df$Exterior2ndnums[df$Exterior2nd == "Stucco"] <- 7
df$Exterior2ndnums[df$Exterior2nd == "HdBoard"] <- 9
df$Exterior2ndnums[df$Exterior2nd == "BrkFace"] <- 10
df$Exterior2ndnums[df$Exterior2nd == "Plywood"] <- 11
df$Exterior2ndnums[df$Exterior2nd == "VinylSd"] <- 14
df$Exterior2ndnums[df$Exterior2nd == "CmentBd"] <- 15
df$Exterior2ndnums[df$Exterior2nd == "Stone"] <- 12
df$Exterior2ndnums[df$Exterior2nd == "ImStucc"] <- 13
df$Exterior2ndnums[df$Exterior2nd == "Other"] <- 16
ggplot(data=df, aes(as.factor(df$Exterior2ndnums),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$MasVnrType <- as.factor(df$MasVnrType)
summary(df$MasVnrType)
## BrkCmn BrkFace None Stone
## 25 882 1764 248
ggplot(data=df, aes(factor(df$MasVnrType),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$MasVnrTypenums[df$MasVnrType == "BrkCmn"] <- 1
df$MasVnrTypenums[df$MasVnrType == "None"] <- 2
df$MasVnrTypenums[df$MasVnrType == "BrkFace"] <- 3
df$MasVnrTypenums[df$MasVnrType == "Stone"] <- 4
ggplot(data=df, aes(as.factor(df$MasVnrTypenums),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$ExterQual <- as.factor(df$ExterQual)
summary(df$ExterQual)
## Ex Fa Gd TA
## 107 35 979 1798
ggplot(data=df, aes(factor(df$ExterQual),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$ExterQualnums[df$ExterQual == "Fa"] <- 1
df$ExterQualnums[df$ExterQual == "TA"] <- 2
df$ExterQualnums[df$ExterQual == "Gd"] <- 3
df$ExterQualnums[df$ExterQual == "Ex"] <- 4
df$ExterCond <- as.factor(df$ExterCond)
summary(df$ExterCond)
## Ex Fa Gd Po TA
## 12 67 299 3 2538
ggplot(data=df, aes(factor(df$ExterCond),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$ExterCondnums[df$ExterCond == "Po"] <- 0
df$ExterCondnums[df$ExterCond == "Fa"] <- 1
df$ExterCondnums[df$ExterCond == "TA"] <- 2
df$ExterCondnums[df$ExterCond == "Gd"] <- 3
df$ExterCondnums[df$ExterCond == "Ex"] <- 4
df$Foundation <- as.factor(df$Foundation)
summary(df$Foundation)
## BrkTil CBlock PConc Slab Stone Wood
## 311 1235 1308 49 11 5
ggplot(data=df, aes(factor(df$Foundation),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
foundation = levels(df$Foundation)
foundation_df = as.data.frame(foundation)
colnames(foundation_df) <- "Foundation"
foundation_meds = (1:length(foundation))
for (i in (1:length(foundation))){
foundation_meds[i] <- median(df$SalePrice[df$Foundation == foundation[i]], na.rm = TRUE)
}
foundation_df$meds = foundation_meds
foundation_df[ order(foundation_df$meds), ]
## Foundation meds
## 4 Slab 11.55333
## 1 BrkTil 11.73807
## 5 Stone 11.74454
## 2 CBlock 11.86005
## 6 Wood 12.00762
## 3 PConc 12.23077
df$Foundationnums[df$Foundation == "Slab"] <- 1
df$Foundationnums[df$Foundation == "BrkTil"] <- 2
df$Foundationnums[df$Foundation == "Stone"] <- 3
df$Foundationnums[df$Foundation == "CBlock"] <- 4
df$Foundationnums[df$Foundation == "Wood"] <- 5
df$Foundationnums[df$Foundation == "PConc"] <- 6
ggplot(data=df, aes(as.factor(df$Foundationnums),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$BsmtQual <- as.factor(df$BsmtQual)
summary(df$BsmtQual)
## Ex Fa Gd None TA
## 258 88 1209 79 1285
ggplot(data=df, aes(df$BsmtQual,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$BsmtQualnums[df$BsmtQual == "None"] <- 0
df$BsmtQualnums[df$BsmtQual == "Fa"] <- 1
df$BsmtQualnums[df$BsmtQual == "TA"] <- 2
df$BsmtQualnums[df$BsmtQual == "Gd"] <- 3
df$BsmtQualnums[df$BsmtQual == "Ex"] <- 4
df$BsmtCond <- as.factor(df$BsmtCond)
summary(df$BsmtCond)
## Fa Gd None Po TA
## 104 122 79 5 2609
ggplot(data=df, aes(df$BsmtCond,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$BsmtCondnums[df$BsmtCond == "Po"] <- 1
df$BsmtCondnums[df$BsmtCond == "None"] <- 2
df$BsmtCondnums[df$BsmtCond == "Fa"] <- 3
df$BsmtCondnums[df$BsmtCond == "TA"] <- 4
df$BsmtCondnums[df$BsmtCond == "Gd"] <- 5
df$BsmtExposure <- as.factor(df$BsmtExposure)
summary(df$BsmtExposure)
## Av Gd Mn No None
## 418 276 239 1907 79
ggplot(data=df, aes(df$BsmtExposure,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$BsmtExposurenums[df$BsmtExposure == "None"] <- 0
df$BsmtExposurenums[df$BsmtExposure == "No"] <- 1
df$BsmtExposurenums[df$BsmtExposure == "Mn"] <- 2
df$BsmtExposurenums[df$BsmtExposure == "Av"] <- 3
df$BsmtExposurenums[df$BsmtExposure == "Gd"] <- 4
df$BsmtFinType1 <- as.factor(df$BsmtFinType1)
summary(df$BsmtFinType1)
## ALQ BLQ GLQ LwQ None Rec Unf
## 429 269 849 154 79 288 851
ggplot(data=df, aes(df$BsmtFinType1,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
bsmtfintype1 = levels(df$BsmtFinType1)
bsmtfintype1_df = as.data.frame(bsmtfintype1)
colnames(bsmtfintype1_df) <- "BsmtFinType1"
bsmtfintype1_meds = (1:length(bsmtfintype1))
for (i in (1:length(bsmtfintype1))){
bsmtfintype1_meds[i] <- median(df$SalePrice[df$BsmtFinType1 == bsmtfintype1[i]], na.rm = TRUE)
}
bsmtfintype1_df$meds = bsmtfintype1_meds
bsmtfintype1_df[ order(bsmtfintype1_df$meds), ]
## BsmtFinType1 meds
## 5 None 11.53077
## 4 LwQ 11.84223
## 2 BLQ 11.84295
## 6 Rec 11.86358
## 1 ALQ 11.91338
## 7 Unf 11.99381
## 3 GLQ 12.27256
df$BsmtFinType1nums[df$BsmtFinType1 == "None"] <- 0
df$BsmtFinType1nums[df$BsmtFinType1 == "LwQ"] <- 1
df$BsmtFinType1nums[df$BsmtFinType1 == "BLQ"] <- 2
df$BsmtFinType1nums[df$BsmtFinType1 == "Rec"] <- 3
df$BsmtFinType1nums[df$BsmtFinType1 == "ALQ"] <- 4
df$BsmtFinType1nums[df$BsmtFinType1 == "Unf"] <- 5
df$BsmtFinType1nums[df$BsmtFinType1 == "GLQ"] <- 6
ggplot(data=df, aes(as.factor(df$BsmtFinType1nums),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$BsmtFinType2 <- as.factor(df$BsmtFinType2)
summary(df$BsmtFinType2)
## ALQ BLQ GLQ LwQ None Rec Unf
## 52 68 34 87 79 105 2494
ggplot(data=df, aes(df$BsmtFinType2,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
bsmtfintype2 = levels(df$BsmtFinType2)
bsmtfintype2_df = as.data.frame(bsmtfintype2)
colnames(bsmtfintype2_df) <- "BsmtFinType2"
bsmtfintype2_meds = (1:length(bsmtfintype2))
for (i in (1:length(bsmtfintype2))){
bsmtfintype2_meds[i] <- median(df$SalePrice[df$BsmtFinType2 == bsmtfintype2[i]], na.rm = TRUE)
}
bsmtfintype2_df$meds = bsmtfintype2_meds
bsmtfintype2_df[ order(bsmtfintype2_df$meds), ]
## BsmtFinType2 meds
## 5 None 11.53077
## 2 BLQ 11.87060
## 6 Rec 11.90995
## 4 LwQ 11.94469
## 7 Unf 12.02575
## 1 ALQ 12.07197
## 3 GLQ 12.22033
df$BsmtFinType2nums[df$BsmtFinType2 == "None"] <- 0
df$BsmtFinType2nums[df$BsmtFinType2 == "BLQ"] <- 1
df$BsmtFinType2nums[df$BsmtFinType2 == "Rec"] <- 2
df$BsmtFinType2nums[df$BsmtFinType2 == "LwQ"] <- 3
df$BsmtFinType2nums[df$BsmtFinType2 == "Unf"] <- 4
df$BsmtFinType2nums[df$BsmtFinType2 == "ALQ"] <- 5
df$BsmtFinType2nums[df$BsmtFinType2 == "GLQ"] <- 6
ggplot(data=df, aes(as.factor(df$BsmtFinType2nums),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$Heating <- as.factor(df$Heating)
summary(df$Heating)
## Floor GasA GasW Grav OthW Wall
## 1 2874 27 9 2 6
ggplot(data=df, aes(df$Heating,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$NoGasHeat <- 1
df$NoGasHeat[df$Heating == 'GasA' | df$Heating == 'GasW'] <- 0
df$Heating <- NULL
df$HeatingQC <- as.factor(df$HeatingQC)
summary(df$HeatingQC)
## Ex Fa Gd Po TA
## 1493 92 474 3 857
ggplot(data=df, aes(df$HeatingQC,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$BadHeating <- 0
df$BadHeating[df$HeatingQC == "Po"] <- 1
df$BadHeating[df$HeatingQC == "Fa"] <- 1
df$HeatingQCnums[df$HeatingQC == "Po"] <- 1
df$HeatingQCnums[df$HeatingQC == "Fa"] <- 2
df$HeatingQCnums[df$HeatingQC == "TA"] <- 3
df$HeatingQCnums[df$HeatingQC == "Gd"] <- 4
df$HeatingQCnums[df$HeatingQC == "Ex"] <- 5
ggplot(data=df, aes(as.factor(df$HeatingQCnums),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$CentralAir <- as.factor(df$CentralAir)
summary(df$CentralAir)
## N Y
## 196 2723
ggplot(data=df, aes(df$CentralAir,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$NoCentralAir <- 0
df$NoCentralAir[df$CentralAir == "N"] <- 1
df$CentralAir <- NULL
df$Electrical <- as.factor(df$Electrical)
summary(df$Electrical)
## FuseA FuseF FuseP Mix SBrkr
## 188 50 8 1 2672
ggplot(data=df, aes(df$Electrical,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$HasFuse <- 1
df$HasFuse[df$Electrical == "SBrkr"] <- 0
df$Electrical <- NULL
df$KitchenQual <- as.factor(df$KitchenQual)
summary(df$KitchenQual)
## Ex Fa Gd TA
## 205 70 1151 1493
ggplot(data=df, aes(df$KitchenQual,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$KitchenQualnums[df$KitchenQual == "Fa"] <- 1
df$KitchenQualnums[df$KitchenQual == "TA"] <- 2
df$KitchenQualnums[df$KitchenQual == "Gd"] <- 3
df$KitchenQualnums[df$KitchenQual == "Ex"] <- 4
df$Functional <- as.factor(df$Functional)
summary(df$Functional)
## Maj1 Maj2 Min1 Min2 Mod Sev Typ
## 19 9 65 70 35 2 2719
ggplot(data=df, aes(df$Functional,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
This doesn’t seem to be a good regressor for price. The level “Maj2” has only 9 entries and is statistically dubious.
df$FireplaceQu <- as.factor(df$FireplaceQu)
summary(df$FireplaceQu)
## Ex Fa Gd None Po TA
## 43 74 744 1420 46 592
ggplot(data=df, aes(df$FireplaceQu,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$BadFireplace <- 0
df$BadFireplace[df$FireplaceQu == "TA"] <- 1
df$BadFireplace[df$FireplaceQu == "Po"] <- 1
df$BadFireplace[df$FireplaceQu == "None"] <- 1
df$FireplaceQunums[df$FireplaceQu == "Po"] <- 1
df$FireplaceQunums[df$FireplaceQu == "None"] <- 2
df$FireplaceQunums[df$FireplaceQu == "Fa"] <- 3
df$FireplaceQunums[df$FireplaceQu == "TA"] <- 4
df$FireplaceQunums[df$FireplaceQu == "Gd"] <- 5
df$FireplaceQunums[df$FireplaceQu == "Ex"] <- 6
df$GarageType <- as.factor(df$GarageType)
summary(df$GarageType)
## 2Types Attchd Basment BuiltIn CarPort Detchd None
## 23 1723 36 186 15 779 157
ggplot(data=df, aes(df$GarageType,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$GarageDetach <- 0
df$GarageDetach[df$GarageType == "Detchd"] <- 1
df$GarageDetach[df$GarageType == "CarPort"] <- 1
df$GarageDetach[df$GarageType == "None"] <- 1
df$GarageTypenums[df$GarageType == "None"] <- 0
df$GarageTypenums[df$GarageType == "CarPort"] <- 1
df$GarageTypenums[df$GarageType == "Detchd"] <- 2
df$GarageTypenums[df$GarageType == "Basment"] <- 3
df$GarageTypenums[df$GarageType == "2Types"] <- 4
df$GarageTypenums[df$GarageType == "Attchd"] <- 5
df$GarageTypenums[df$GarageType == "BuiltIn"] <- 6
ggplot(data=df, aes(as.factor(df$GarageTypenums),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$GarageFinish = as.factor(df$GarageFinish)
summary(df$GarageFinish)
## Fin None RFn Unf
## 719 157 811 1232
ggplot(data=df, aes(df$GarageFinish,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$GarageFinishnums[df$GarageFinish == "None"] <- 0
df$GarageFinishnums[df$GarageFinish == "Unf"] <- 1
df$GarageFinishnums[df$GarageFinish == "RFn"] <- 2
df$GarageFinishnums[df$GarageFinish == "Fin"] <- 3
ggplot(data=df, aes(as.factor(df$GarageFinishnums),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$GarageQual <- as.factor(df$GarageQual)
summary(df$GarageQual)
## Ex Fa Gd None Po TA
## 3 124 24 157 5 2606
ggplot(data=df, aes(df$GarageQual,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$BadGarageQ <- 0
df$BadGarageQ[df$GarageQual == "Fa"] <- 1
df$BadGarageQ[df$GarageQual == "Po"] <- 1
df$BadGarageQ[df$GarageQual == "None"] <- 1
df$GarageQualnums[df$GarageQual == "Po"] <- 0
df$GarageQualnums[df$GarageQual == "None"] <- 1
df$GarageQualnums[df$GarageQual == "Fa"] <- 2
df$GarageQualnums[df$GarageQual == "Ex"] <- 3
df$GarageQualnums[df$GarageQual == "TA"] <- 4
df$GarageQualnums[df$GarageQual == "Gd"] <- 5
ggplot(data=df, aes(as.factor(df$GarageQualnums),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$GarageCond <- as.factor(df$GarageCond)
summary(df$GarageCond)
## Ex Fa Gd None Po TA
## 3 74 15 157 14 2656
ggplot(data=df, aes(df$GarageCond,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$BadGarageC <- 0
df$BadGarageC[df$GarageCond == "Fa"] <- 1
df$BadGarageC[df$GarageCond == "Po"] <- 1
df$BadGarageC[df$GarageCond == "None"] <- 1
df$GarageCondnums[df$GarageCond == "None"] <- 0
df$GarageCondnums[df$GarageCond == "Po"] <- 1
df$GarageCondnums[df$GarageCond == "Fa"] <- 2
df$GarageCondnums[df$GarageCond == "Ex"] <- 3
df$GarageCondnums[df$GarageCond == "Gd"] <- 4
df$GarageCondnums[df$GarageCond == "TA"] <- 5
ggplot(data=df, aes(as.factor(df$GarageCondnums),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$PavedDrive <- as.factor(df$PavedDrive)
summary(df$PavedDrive)
## N P Y
## 216 62 2641
ggplot(data=df, aes(df$PavedDrive,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$PoolQC <- as.factor(df$PoolQC)
summary(df$PoolQC)
## Ex Fa Gd None
## 7 2 4 2906
ggplot(data=df, aes(df$PoolQC,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$PoolQCnums[df$PoolQC == "None"] <- 0
df$PoolQCnums[df$PoolQC == "TA"] <- 1
df$PoolQCnums[df$PoolQC == "Gd"] <- 2
df$PoolQCnums[df$PoolQC == "Fa"] <- 3
df$PoolQCnums[df$PoolQC == "Ex"] <- 4
df$Fence <- as.factor(df$Fence)
summary(df$Fence)
## GdPrv GdWo MnPrv MnWw None
## 118 112 329 12 2348
ggplot(data=df, aes(df$Fence,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$Fencenums[df$Fence == "MnWw"] <- 1
df$Fencenums[df$Fence == "MnPrv"] <- 2
df$Fencenums[df$Fence == "GdWo"] <- 3
df$Fencenums[df$Fence == "None"] <- 4
df$Fencenums[df$Fence == "GdPrv"] <- 5
df$MiscFeature <- as.factor(df$MiscFeature)
summary(df$MiscFeature)
## Gar2 None Othr Shed TenC
## 5 2814 4 95 1
ggplot(data=df, aes(df$MiscFeature,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
This is coded numeric, but might be useful to treat as a categorical variable.
df$MoSoldcat <- as.factor(df$MoSold)
summary(df$MoSoldcat)
## 1 2 3 4 5 6 7 8 9 10 11 12
## 122 133 232 279 394 503 446 233 158 173 142 104
mosold = levels(df$MoSoldcat)
mosold_df = as.data.frame(mosold)
colnames(mosold_df) <- "MoSold"
mosold_meds = (1:length(mosold))
for (i in (1:length(mosold))){
mosold_meds[i] <- median(df$SalePrice[df$MoSoldcat == mosold[i]], na.rm = TRUE)
}
mosold_df$meds = mosold_meds
for (i in (1:length(mosold))){
mosold_df$count[i] <- length(df$MoSoldcat[df$MoSoldcat == mosold[i]])
}
mosold_df[ order(mosold_df$meds), ]
## MoSold meds count
## 4 4 11.89819 279
## 1 1 11.92918 122
## 5 5 11.94471 394
## 10 10 11.97666 173
## 6 6 12.00151 503
## 7 7 12.01488 446
## 3 3 12.01516 232
## 11 11 12.06105 142
## 2 2 12.06249 133
## 8 8 12.06939 233
## 12 12 12.08954 104
## 9 9 12.13296 158
mosold_df[ order(mosold_df$count), ]
## MoSold meds count
## 12 12 12.08954 104
## 1 1 11.92918 122
## 2 2 12.06249 133
## 11 11 12.06105 142
## 9 9 12.13296 158
## 10 10 11.97666 173
## 3 3 12.01516 232
## 8 8 12.06939 233
## 4 4 11.89819 279
## 5 5 11.94471 394
## 7 7 12.01488 446
## 6 6 12.00151 503
df$MoSoldnums[df$MoSoldcat == "4"] <- 1
df$MoSoldnums[df$MoSoldcat == "1"] <- 2
df$MoSoldnums[df$MoSoldcat == "5"] <- 3
df$MoSoldnums[df$MoSoldcat == "10"] <- 4
df$MoSoldnums[df$MoSoldcat == "6"] <- 5
df$MoSoldnums[df$MoSoldcat == "7"] <- 6
df$MoSoldnums[df$MoSoldcat == "3"] <- 7
df$MoSoldnums[df$MoSoldcat == "11"] <- 8
df$MoSoldnums[df$MoSoldcat == "2"] <- 9
df$MoSoldnums[df$MoSoldcat == "8"] <- 10
df$MoSoldnums[df$MoSoldcat == "12"] <- 11
df$MoSoldnums[df$MoSoldcat == "9"] <- 12
ggplot(data=df, aes(as.factor(df$MoSoldnums),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$MoSoldcat <- NULL
df$SummerSale <- 0
df$SummerSale[df$MoSold == 5 | df$MoSold == 6 | df$MoSold == 7] <- 1
df$YrSoldcat <- as.factor(df$YrSold)
ggplot(data=df, aes(df$YrSoldcat,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
df$SaleType <- as.factor(df$SaleType)
summary(df$SaleType)
## COD Con ConLD ConLI ConLw CWD New Oth WD
## 87 5 26 9 8 12 239 7 2526
ggplot(data=df, aes(df$SaleType,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
saletype = levels(df$SaleType)
saletype_df = as.data.frame(saletype)
colnames(saletype_df) <- "SaleType"
saletype_meds = (1:length(saletype))
for (i in (1:length(saletype))){
saletype_meds[i] <- median(df$SalePrice[df$SaleType == saletype[i]], na.rm = TRUE)
}
saletype_df$meds = saletype_meds
saletype_df[ order(saletype_df$meds), ]
## SaleType meds
## 8 Oth 11.66178
## 4 ConLI 11.73607
## 1 COD 11.84223
## 3 ConLD 11.84940
## 5 ConLw 11.87757
## 9 WD 11.97035
## 6 CWD 12.13473
## 7 New 12.41897
## 2 Con 12.48391
df$SaleTypenums[df$SaleType == "Oth"] <- 1
df$SaleTypenums[df$SaleType == "ConLI"] <- 2
df$SaleTypenums[df$SaleType == "COD"] <- 3
df$SaleTypenums[df$SaleType == "ConLD"] <- 4
df$SaleTypenums[df$SaleType == "ConLw"] <- 5
df$SaleTypenums[df$SaleType == "WD"] <- 6
df$SaleTypenums[df$SaleType == "CWD"] <- 7
df$SaleTypenums[df$SaleType == "New"] <- 8
df$SaleTypenums[df$SaleType == "Con"] <- 9
ggplot(data=df, aes(as.factor(df$SaleTypenums),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
Bin ConLI CWD New together, since they seem to compare to a premium price:
df$SaleTypePrem <- 0
df$SaleTypePrem[df$SaleType == "ConLI"] <- 1
df$SaleTypePrem[df$SaleType == "CWD"] <- 1
df$SaleTypePrem[df$SaleType == "New"] <- 1
df$SaleCondition <- as.factor(df$SaleCondition)
summary(df$SaleCondition)
## Abnorml AdjLand Alloca Family Normal Partial
## 190 12 24 46 2402 245
ggplot(data=df, aes(df$SaleCondition,df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
salecondition = levels(df$SaleCondition)
salecondition_df = as.data.frame(salecondition)
colnames(salecondition_df) <- "SaleCondition"
salecondition_meds = (1:length(salecondition))
for (i in (1:length(salecondition))){
salecondition_meds[i] <- median(df$SalePrice[df$SaleCondition == salecondition[i]], na.rm = TRUE)
}
salecondition_df$meds = salecondition_meds
salecondition_df[ order(salecondition_df$meds), ]
## SaleCondition meds
## 2 AdjLand 11.52926
## 1 Abnorml 11.77529
## 4 Family 11.85245
## 3 Alloca 11.90533
## 5 Normal 11.98293
## 6 Partial 12.40738
df$SaleConditionnums[df$SaleCondition == "AdjLand"] <- 1
df$SaleConditionnums[df$SaleCondition == "Abnorml"] <- 2
df$SaleConditionnums[df$SaleCondition == "Family"] <- 3
df$SaleConditionnums[df$SaleCondition == "Alloca"] <- 4
df$SaleConditionnums[df$SaleCondition == "Normal"] <- 5
df$SaleConditionnums[df$SaleCondition == "Partial"] <- 6
ggplot(data=df, aes(as.factor(df$SaleConditionnums),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
We want to examine the base numeric features and their relationship with the SalePrice and with each other. We will look for transformations to new features that are more suitable for regression. We also examine potential outlier and determine the effect of removing observations from the dataset.
These are the base numeric features
base_numerics <- c('Id', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'X1stFlrSF',
'X2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
'HalfBath', 'BedroomAbvGr','KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', #'GarageYrBlt',
'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 'X3SsnPorch',
'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'SalePrice')
# GarageYrBlt is set aside for now since the 'None' entries make it non-numeric
train_nums <- df[1:train_size, base_numerics]
# str(train_nums)
correlations <- cor(train_nums[,-1])
# correlations
row_indic <- apply(correlations, 1, function(x) sum(x > 0.3 | x < -0.3) > 1)
correlations <- correlations[row_indic ,row_indic ]
corrplot(correlations, method="square")
correlations[order(correlations[,'SalePrice'], decreasing = TRUE),'SalePrice']
## SalePrice OverallQual GrLivArea GarageCars GarageArea
## 1.00000000 0.81718442 0.70092665 0.68062481 0.65088756
## TotalBsmtSF X1stFlrSF FullBath YearBuilt YearRemodAdd
## 0.61213398 0.59698105 0.59477054 0.58657024 0.56560783
## TotRmsAbvGrd Fireplaces MasVnrArea BsmtFinSF1 LotFrontage
## 0.53442220 0.48944943 0.42677562 0.37202307 0.34867197
## WoodDeckSF OpenPorchSF X2ndFlrSF HalfBath LotArea
## 0.33413507 0.32105297 0.31929998 0.31398225 0.25731989
## BsmtFullBath BsmtUnfSF BedroomAbvGr OverallCond EnclosedPorch
## 0.23622411 0.22198505 0.20904368 -0.03686799 -0.14905028
We will consider these features in order of correlation with the response.
ggplot(data=df, aes(df$OverallQual,df$SalePrice)) + geom_point() + geom_smooth(method='lm')
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
OverallQual.mod = lm(SalePrice ~ OverallQual, data=train_nums)
LogOverallQual.mod = lm(SalePrice ~ log(OverallQual+1), data=train_nums)
summary(OverallQual.mod)
##
## Call:
## lm(formula = SalePrice ~ OverallQual, data = train_nums)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.06831 -0.12974 0.01309 0.13332 0.92438
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.58444 0.02727 388.18 <2e-16 ***
## OverallQual 0.23603 0.00436 54.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2303 on 1458 degrees of freedom
## Multiple R-squared: 0.6678, Adjusted R-squared: 0.6676
## F-statistic: 2931 on 1 and 1458 DF, p-value: < 2.2e-16
summary(LogOverallQual.mod)
##
## Call:
## lm(formula = SalePrice ~ log(OverallQual + 1), data = train_nums)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.03898 -0.13573 0.01371 0.13573 0.97284
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.94631 0.06034 148.26 <2e-16 ***
## log(OverallQual + 1) 1.58621 0.03093 51.28 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2386 on 1458 degrees of freedom
## Multiple R-squared: 0.6433, Adjusted R-squared: 0.6431
## F-statistic: 2630 on 1 and 1458 DF, p-value: < 2.2e-16
OverallQual isn’t especially skewed and a log-transform doesn’t appreciably improve the linear fit.
ggplot(data=df, aes(df$GrLivArea)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data=df, aes(df$GrLivArea,df$SalePrice)) + geom_point()+
geom_smooth(method='lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
ggplot(data=df, aes(log(df$GrLivArea+1),df$SalePrice)) + geom_point()+
geom_smooth(method='lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
df$LogGrLivArea <- log(df$GrLivArea+1)
GrLivArea is skewed, but taking the log leads to a fairly linear trend. There are a series of outliers at large values of Area, as well as at low SalePrice. These are:
which(df$GrLivArea[1:train_size] > 4000)
## [1] 524 692 1183 1299
which(df$SalePrice[1:train_size] < 10.7)
## [1] 31 496 534 917 969
Before continuing to examine other promising regressors, we can generate a new feature based on binning roughly around the mean
df$GrLivAreabin <- 0
df$GrLivAreabin[df$GrLivArea < 800] <- 1
df$GrLivAreabin[df$GrLivArea >= 800 & df$GrLivArea < 1000] <- 2
df$GrLivAreabin[df$GrLivArea >= 1000 & df$GrLivArea < 1200] <- 3
df$GrLivAreabin[df$GrLivArea >= 1200 & df$GrLivArea < 1400] <- 4
df$GrLivAreabin[df$GrLivArea >= 1400 & df$GrLivArea < 1600] <- 5
df$GrLivAreabin[df$GrLivArea >= 1600 & df$GrLivArea < 1800] <- 6
df$GrLivAreabin[df$GrLivArea >= 1800 & df$GrLivArea < 2000] <- 7
df$GrLivAreabin[df$GrLivArea >= 2000 & df$GrLivArea < 2200] <- 8
df$GrLivAreabin[df$GrLivArea >= 2200 & df$GrLivArea < 2400] <- 9
df$GrLivAreabin[df$GrLivArea >= 2400] <- 10
ggplot(data=df, aes(df$GarageCars,df$SalePrice)) + geom_point()+
geom_smooth(method='lm', formula = y ~ x)
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
ggplot(data=df, aes(df$GarageArea,df$SalePrice)) + geom_point()+
geom_smooth(method='lm', formula = y ~ x)
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
which(df$GarageArea[1:train_size] > 1240)
## [1] 582 1062 1191 1299
There are around 160 houses (81 in the training set) without a garage, but the distribution appears to be such that they don’t put too much pressure on the fit. It might be worth doing some statistics on the subset of houses w/garages. Note that only one of the outliers here, 1299, overlaps with the GrLivArea outliers.
df$NoGarage <- 0
df$NoGarage[df$GarageArea == 0] <- 1
df$NoBasement <- 0
df$NoBasement[df$TotalBsmtSF == 0] <- 1
ggplot(data=df, aes(df$TotalBsmtSF,df$SalePrice)) + geom_point()+
geom_smooth(method='lm', formula = y ~ x)
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
ggplot(data=df, aes(log(df$TotalBsmtSF+1),df$SalePrice)) + geom_point()+
geom_smooth(method='lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
which(df$TotalBsmtSF[1:train_size] > 3000)
## [1] 333 441 497 524 1299
Here the log is particular bad because of the accumulation of houses with no basement. The set of houses with no basement in the training set has around 80 houses, which is probably too small to get good results. It might be useful to do fits on the set of ~ 1420 houses with basement in the training set.
Finally, we note that two outliers, 524 and 1299, overlap with the GrLivArea set.
We can create a new feature by adding GrLivArea and TotalBsmtSF.
df$TotalHouseArea <- df$GrLivArea + df$TotalBsmtSF
ggplot(data=df, aes(df$TotalHouseArea,df$SalePrice)) + geom_point()+
geom_smooth(method='lm', formula = y ~ x)
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
ggplot(data=df, aes(log(df$TotalHouseArea+1),df$SalePrice)) + geom_point()+
geom_smooth(method='lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
df$TotalHouseArea <- log(df$TotalHouseArea+1)
TotalHouseArea.mod = lm(SalePrice ~ TotalHouseArea + sqrt(TotalHouseArea), data=df)
summary(TotalHouseArea.mod)
##
## Call:
## lm(formula = SalePrice ~ TotalHouseArea + sqrt(TotalHouseArea),
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.99580 -0.11295 0.02835 0.15434 0.71377
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.8814 7.4339 5.365 9.41e-08 ***
## TotalHouseArea 5.6630 0.9614 5.890 4.78e-09 ***
## sqrt(TotalHouseArea) -25.7977 5.3468 -4.825 1.55e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.236 on 1457 degrees of freedom
## (1459 observations deleted due to missingness)
## Multiple R-squared: 0.6514, Adjusted R-squared: 0.6509
## F-statistic: 1361 on 2 and 1457 DF, p-value: < 2.2e-16
df$TotalHouseArea <- as.numeric(TotalHouseArea.mod$coefficients[1]) * df$TotalHouseArea +
as.numeric(TotalHouseArea.mod$coefficients[2]) * sqrt(df$TotalHouseArea)
ggplot(data=df, aes(df$TotalHouseArea,df$SalePrice)) + geom_point()+
geom_smooth(method='lm', formula = y ~ x)
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
ggplot(data=df, aes(df$X1stFlrSF,df$SalePrice)) + geom_point()+
geom_smooth(method='lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
ggplot(data=df, aes(log(df$X1stFlrSF+1),df$SalePrice)) + geom_point()+
geom_smooth(method='lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
The logarithm helps with the skewness, but there is some significant nonlinearity left. We can build a new feature to remove this:
df$X1stLin <- log(df$X1stFlrSF+1)
X1stLin.mod = lm(SalePrice ~ X1stLin + sqrt(X1stLin), data=df)
summary(X1stLin.mod)
##
## Call:
## lm(formula = SalePrice ~ X1stLin + sqrt(X1stLin), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.34019 -0.18760 -0.03179 0.23515 0.91934
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.624 11.053 2.499 0.0126 *
## X1stLin 3.759 1.578 2.383 0.0173 *
## sqrt(X1stLin) -15.849 8.353 -1.897 0.0580 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3167 on 1457 degrees of freedom
## (1459 observations deleted due to missingness)
## Multiple R-squared: 0.3724, Adjusted R-squared: 0.3715
## F-statistic: 432.2 on 2 and 1457 DF, p-value: < 2.2e-16
df$X1stLin <- as.numeric(X1stLin.mod$coefficients[1]) * df$X1stLin +
as.numeric(X1stLin.mod$coefficients[2]) * sqrt(df$X1stLin)
ggplot(data=df, aes(df$X1stLin,df$SalePrice)) + geom_point()+
geom_smooth(method='lm', formula = y ~ x)
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
ggplot(data=df, aes(df$FullBath,df$SalePrice)) + geom_point()+
geom_smooth(method='lm', formula = y ~ x)
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
We would like to engineer a new feature that takes all of the bathrooms into account. We’ll get weights from a simple regression:
bathmodel = lm(SalePrice ~ FullBath + HalfBath + BsmtFullBath + BsmtHalfBath, data = df)
summary(bathmodel)
##
## Call:
## lm(formula = SalePrice ~ FullBath + HalfBath + BsmtFullBath +
## BsmtHalfBath, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.14743 -0.16214 0.00057 0.16762 1.03304
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.18402 0.02423 461.666 < 2e-16 ***
## FullBath 0.42365 0.01372 30.885 < 2e-16 ***
## HalfBath 0.19400 0.01497 12.956 < 2e-16 ***
## BsmtFullBath 0.22497 0.01457 15.436 < 2e-16 ***
## BsmtHalfBath 0.12204 0.03165 3.856 0.00012 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2849 on 1455 degrees of freedom
## (1459 observations deleted due to missingness)
## Multiple R-squared: 0.4929, Adjusted R-squared: 0.4915
## F-statistic: 353.5 on 4 and 1455 DF, p-value: < 2.2e-16
as.numeric(bathmodel$coefficients[2])
## [1] 0.4236537
df$TotalBath <- as.numeric(bathmodel$coefficients[2]) * df$FullBath +
as.numeric(bathmodel$coefficients[3]) * df$HalfBath +
as.numeric(bathmodel$coefficients[4]) * df$BsmtFullBath +
as.numeric(bathmodel$coefficients[5]) * df$BsmtHalfBath
ggplot(data=df, aes(df$TotalBath,df$SalePrice)) + geom_point()+
geom_smooth(method='lm', formula = y ~ x)
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
This data frame has the testing data plus whatever transformations we’ve made so far. Now we should combine some numerical variables before we perform any scalings or standardization.
ggplot(data=df, aes(df$YearBuilt,df$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x + x^2 + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
ggplot(data=df, aes(log(df$YearBuilt+1),df$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x + x^2 + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
It might be easier to interpret the age of the house at the time of sale as a feature, this is
df$Age <- df$YrSold - df$YearBuilt
ggplot(data=df, aes(df$Age)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data=df, aes(df$Age,df$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
ggplot(data=df, aes(log(df$Age+2),df$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
ggplot(data=df, aes(df$Age)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
It appears that the logarithm helps smooth the distribution out. Note that some houses were sold before they were built, so we have to take log(x+2). Let’s build a linearized feature:
age.mod = lm(data=df, SalePrice ~ log(Age + 2) + sqrt(log(Age+2)))
summary(age.mod)
##
## Call:
## lm(formula = SalePrice ~ log(Age + 2) + sqrt(log(Age + 2)), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.34757 -0.18443 -0.01755 0.16385 1.45468
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.84951 0.14739 80.395 < 2e-16 ***
## log(Age + 2) -0.57193 0.06341 -9.020 < 2e-16 ***
## sqrt(log(Age + 2)) 1.14003 0.19833 5.748 1.1e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3118 on 1457 degrees of freedom
## (1459 observations deleted due to missingness)
## Multiple R-squared: 0.3916, Adjusted R-squared: 0.3908
## F-statistic: 468.9 on 2 and 1457 DF, p-value: < 2.2e-16
df$AgeLin <- as.numeric(age.mod$coefficients[2]) * log(df$Age + 2) +
as.numeric(age.mod$coefficients[3]) * sqrt(log(df$Age+2))
ggplot(data=df, aes(df$AgeLin,df$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
We should also create some new features, including some bins
# very new sales
df$VNewSale <- 0
df$VNewSale[df$Age <= 1] <- 1
# 10 year bins
df$Agebin <- 0
df$Agebin[df$Age < 10] <- 1
df$Agebin[df$Age >= 10 & df$Age < 20] <- 2
df$Agebin[df$Age >= 20 & df$Age < 30] <- 3
df$Agebin[df$Age >= 30 & df$Age < 40] <- 4
df$Agebin[df$Age >= 40 & df$Age < 50] <- 5
df$Agebin[df$Age >= 50 & df$Age < 60] <- 6
df$Agebin[df$Age >= 60 & df$Age < 70] <- 7
df$Agebin[df$Age >= 70 & df$Age < 80] <- 8
df$Agebin[df$Age >= 80 & df$Age < 90] <- 9
df$Agebin[df$Age >= 90 & df$Age < 100] <- 10
df$Agebin[df$Age >= 100] <- 11
# note whether a house was remodeled
df$Remodeled <- 0
df$Remodeled[which(df$YearRemodAdd != df$YearBuilt)] <- 1
# have to get rid of YearBuilt because it is collinear with Age
df$YearBuilt <- NULL
Part of the correlation with SalePrice is probably due to the identification with construction date for houses that weren’t remodeled. We will again convert to an age variable.
df$RemodAge <- df$YrSold - df$YearRemodAdd
# sold soon after remodel?
df$RemodSale <- 0
df$RemodSale[df$RemodAge < 1] <- 1
df$YearRemodAdd <- NULL
# less than 1/2 of houses were remodeled
summary(as.factor(df$Remodeled))
## 0 1
## 1560 1359
ggplot(data=df, aes(df$RemodAge)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data=df, aes(df$RemodAge,df$SalePrice)) + geom_point() + geom_smooth(method = 'lm')
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
ggplot(data=df, aes(df$RemodAge,df$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x + sqrt(x) )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning in sqrt(x): NaNs produced
## Warning in sqrt(x): NaNs produced
## Warning: Removed 1459 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_smooth).
ggplot(data=df, aes(log(df$RemodAge+3),df$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
remodage.mod = lm(SalePrice ~ log(RemodAge+3) + sqrt(log(RemodAge+3)), data = df)
summary(remodage.mod)
##
## Call:
## lm(formula = SalePrice ~ log(RemodAge + 3) + sqrt(log(RemodAge +
## 3)), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.36637 -0.19174 0.00784 0.18951 1.43793
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.55795 0.23777 48.611 < 2e-16 ***
## log(RemodAge + 3) -0.66577 0.09854 -6.756 2.04e-11 ***
## sqrt(log(RemodAge + 3)) 1.42289 0.31129 4.571 5.27e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3319 on 1457 degrees of freedom
## (1459 observations deleted due to missingness)
## Multiple R-squared: 0.3105, Adjusted R-squared: 0.3096
## F-statistic: 328.1 on 2 and 1457 DF, p-value: < 2.2e-16
df$RemodAgeLin <- as.numeric(remodage.mod$coefficients[2]) * log(df$RemodAge+3) +
as.numeric(remodage.mod$coefficients[3]) * sqrt(log(df$RemodAge+3))
ggplot(data=df, aes(df$RemodAgeLin,df$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
ggplot(data=df, aes(df$TotRmsAbvGrd,df$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
ggplot(data=df, aes(df$Fireplaces,df$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
ggplot(data=df, aes(df$MasVnrArea,df$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
ggplot(data=df, aes(log(df$MasVnrArea+1),df$SalePrice)) + geom_point()+
geom_smooth(method='lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
masvnr.mod = lm(data=df, SalePrice ~ log(MasVnrArea+1) + sqrt(log(MasVnrArea+1)))
summary(masvnr.mod)
##
## Call:
## lm(formula = SalePrice ~ log(MasVnrArea + 1) + sqrt(log(MasVnrArea +
## 1)), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.43848 -0.23563 -0.01423 0.24198 1.62242
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.89872 0.01222 973.346 < 2e-16 ***
## log(MasVnrArea + 1) 0.25532 0.03802 6.715 2.69e-11 ***
## sqrt(log(MasVnrArea + 1)) -0.45110 0.08869 -5.086 4.13e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3608 on 1457 degrees of freedom
## (1459 observations deleted due to missingness)
## Multiple R-squared: 0.1853, Adjusted R-squared: 0.1842
## F-statistic: 165.7 on 2 and 1457 DF, p-value: < 2.2e-16
df$MasVnrAreaLin = as.numeric(masvnr.mod$coefficients[2]) * (log(df$MasVnrArea+1)) +
as.numeric(masvnr.mod$coefficients[3]) * sqrt(log(df$MasVnrArea+1))
ggplot(data=df, aes(df$MasVnrAreaLin,df$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
ggplot(data=df, aes(df$BsmtFinSF1,df$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
ggplot(data=df, aes(log(df$BsmtFinSF1+1),df$SalePrice)) + geom_point()+
geom_smooth(method='lm', formula = y ~ x + sqrt(x) )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
This isn’t a great regressor. Later, we will create a new feature by adding GrLivArea, BsmtFinSF1 and BsmtFinSF2.
We had to impute a large number of these values, so we should be suspicious.
ggplot(data=df, aes(df$LotFrontage)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data=df, aes(df$LotFrontage,df$SalePrice)) + geom_point() +
geom_smooth(method='lm', formula = y ~ x + x^2 + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
which(df$LotFrontage > 200)
## [1] 935 1299
# only 1299 is in the GrLivArea outliers
df$LotFrontagebin <- 0
df$LotFrontagebin[df$LotFrontage < 55] <- 1
df$LotFrontagebin[df$LotFrontage >= 55 & df$LotFrontage < 65] <- 2
df$LotFrontagebin[df$LotFrontage >= 65 & df$LotFrontage < 75] <- 3
df$LotFrontagebin[df$LotFrontage >= 75 & df$LotFrontage < 85] <- 4
df$LotFrontagebin[df$LotFrontage >= 85 & df$LotFrontage < 95] <- 5
df$LotFrontagebin[df$LotFrontage >= 95 & df$LotFrontage < 105] <- 6
df$LotFrontagebin[df$LotFrontage >= 105] <- 7
ggplot(data=df, aes(as.factor(df$LotFrontagebin),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
Again not a great regressor. We might choose to drop this and keep the binned feature. Also note that 1299 was also an outlier here.
Some consolidation might be in order here.
porch.mod = lm(data=df, SalePrice ~ WoodDeckSF + OpenPorchSF + EnclosedPorch + X3SsnPorch + ScreenPorch)
summary(porch.mod)
##
## Call:
## lm(formula = SalePrice ~ WoodDeckSF + OpenPorchSF + EnclosedPorch +
## X3SsnPorch + ScreenPorch, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.23450 -0.21007 -0.00246 0.20779 1.37074
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.184e+01 1.427e-02 829.558 < 2e-16 ***
## WoodDeckSF 1.019e-03 7.448e-05 13.678 < 2e-16 ***
## OpenPorchSF 1.732e-03 1.401e-04 12.367 < 2e-16 ***
## EnclosedPorch -4.546e-04 1.531e-04 -2.968 0.00305 **
## X3SsnPorch 9.296e-04 3.147e-04 2.954 0.00319 **
## ScreenPorch 8.594e-04 1.668e-04 5.151 2.95e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3516 on 1454 degrees of freedom
## (1459 observations deleted due to missingness)
## Multiple R-squared: 0.2277, Adjusted R-squared: 0.225
## F-statistic: 85.74 on 5 and 1454 DF, p-value: < 2.2e-16
df$DeckPorch <- 1.019e-03* df$WoodDeckSF + 1.732e-03 * df$OpenPorchSF - 4.546e-04 * df$EnclosedPorch +
9.296e-04 * df$X3SsnPorch + 8.594e-04 * df$ScreenPorch
ggplot(data=df, aes(df$DeckPorch,df$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning in sqrt(x): NaNs produced
## Warning in sqrt(x): NaNs produced
## Warning: Removed 1459 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing missing values (geom_smooth).
summary(df$DeckPorch)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.2232 0.0000 0.1625 0.1835 0.2927 1.4510
ggplot(data=df, aes(log(df$DeckPorch+3),df$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
deckporch.mod = lm(data=df, SalePrice ~ log(df$DeckPorch+3) + sqrt(log(df$DeckPorch+3)))
summary(deckporch.mod)
##
## Call:
## lm(formula = SalePrice ~ log(df$DeckPorch + 3) + sqrt(log(df$DeckPorch +
## 3)), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.96368 -0.20708 -0.00681 0.20375 1.34252
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -50.105 9.897 -5.062 4.67e-07 ***
## log(df$DeckPorch + 3) -46.146 8.398 -5.495 4.61e-08 ***
## sqrt(log(df$DeckPorch + 3)) 107.433 18.237 5.891 4.76e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3457 on 1457 degrees of freedom
## (1459 observations deleted due to missingness)
## Multiple R-squared: 0.2521, Adjusted R-squared: 0.251
## F-statistic: 245.5 on 2 and 1457 DF, p-value: < 2.2e-16
df$DeckPorchLin <- as.numeric(deckporch.mod$coefficients[2]) * sqrt(log(df$DeckPorch+3)) + as.numeric(deckporch.mod$coefficients[3]) * log(df$DeckPorch+3)
df$DeckPorch <- NULL
ggplot(data=df, aes(df$DeckPorchLin,df$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
df$HasWoodDeck <-0
df$HasWoodDeck[df$WoodDeckSF != 0] <- 1
df$HasOpenPorch <-0
df$HasOpenPorch[df$OpenPorchSF != 0] <- 1
df$HasEnclosedPorch <-0
df$HasEnclosedPorch[df$EnclosedPorch != 0] <- 1
df$Has3SsnPorch <-0
df$Has3SsnPorch[df$X3SsnPorch != 0] <- 1
df$HasScreenPorch <-0
df$HasScreenPorch[df$ScreenPorch != 0] <- 1
df$HasDeckorPorch <- 0
df$HasDeckorPorch <- (df$HasWoodDeck == 1 | df$HasOpenPorch == 1 | df$HasEnclosedPorch == 1 |
df$Has3SsnPorch == 1 | df$HasScreenPorch)*1
ggplot(data=df, aes(df$X2ndFlrSF,df$SalePrice)) + geom_point()+
geom_smooth(method='lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
ggplot(data=df, aes(log(df$X2ndFlrSF+1),df$SalePrice)) + geom_point()+
geom_smooth(method='lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
df$X2ndLin <- log(df$X2ndFlrSF+1)
X2ndLin.mod = lm(SalePrice ~ X2ndLin + sqrt(X2ndLin), data=df)
summary(X2ndLin.mod)
##
## Call:
## lm(formula = SalePrice ~ X2ndLin + sqrt(X2ndLin), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.51146 -0.19843 -0.01829 0.17561 1.35222
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.97170 0.01240 965.44 <2e-16 ***
## X2ndLin 1.40976 0.07893 17.86 <2e-16 ***
## sqrt(X2ndLin) -3.58378 0.20368 -17.59 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.357 on 1457 degrees of freedom
## (1459 observations deleted due to missingness)
## Multiple R-squared: 0.2022, Adjusted R-squared: 0.2011
## F-statistic: 184.6 on 2 and 1457 DF, p-value: < 2.2e-16
df$X2ndLin <- as.numeric(X2ndLin.mod$coefficients[2]) * df$X2ndLin +
as.numeric(X2ndLin.mod$coefficients[3]) * sqrt(df$X2ndLin)
ggplot(data=df, aes(df$X2ndLin,df$SalePrice)) + geom_point()+
geom_smooth(method='lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
summary(df$LotArea)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1300 7478 9453 10170 11570 215200
ggplot(data=df, aes(df$LotArea)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data=df, aes(df$LotArea,df$SalePrice)) + geom_point() + geom_smooth(method = 'lm')
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
which(df$LotArea[1:train_size] > 50000)
## [1] 54 250 314 336 385 452 458 707 770 1299 1397
# 1299 is an outlier again
ggplot(data=df, aes(df$LotArea,df$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
# bin
df$LotAreabin <- 0
df$LotAreabin[df$LotArea < 3000] <- 1
df$LotAreabin[df$LotArea >= 3000 & df$LotArea < 4500] <- 2
df$LotAreabin[df$LotArea >= 4500 & df$LotArea < 6000] <- 3
df$LotAreabin[df$LotArea >= 6000 & df$LotArea < 7500] <- 4
df$LotAreabin[df$LotArea >= 7500 & df$LotArea < 9000] <- 5
df$LotAreabin[df$LotArea >= 9000 & df$LotArea < 10500] <- 6
df$LotAreabin[df$LotArea >= 9000 & df$LotArea < 10500] <- 7
df$LotAreabin[df$LotArea >= 10500 & df$LotArea < 12000] <- 8
df$LotAreabin[df$LotArea >= 12000 & df$LotArea < 13500] <- 9
df$LotAreabin[df$LotArea >= 13500] <- 10
ggplot(data=df, aes(as.factor(df$LotAreabin),df$SalePrice)) + geom_boxplot()
## Warning: Removed 1459 rows containing non-finite values (stat_boxplot).
The presence of so many outliers makes this a poor regressor, though it suggests 1299 is again a global outlier.
ggplot(data=df, aes(df$OverallCond)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data=df, aes(df$OverallCond,df$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
ggplot(data=df, aes(log(df$OverallCond+1),df$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x + sqrt(x) )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
If anything it seems like higher values of OverallCond don’t correlate with higher sale prices. So the market is happy with a certain level of condition and doesn’t value anything rated higher with a particular premium. Our OverallCondBad boolean might capture the actual variation due to the condition rating.
There are several other logical combinations of features that we want to explore.
This is a feature introduced by Choudhary. It adds lengths and areas, but we’ll include it along with several variations.
df$AllSizesSum <- df$LotFrontage + df$LotArea + df$MasVnrArea + df$BsmtFinSF1 + df$BsmtFinSF2 + df$BsmtUnfSF +
df$TotalBsmtSF + df$X1stFlrSF + df$X2ndFlrSF + df$GrLivArea + df$GarageArea + df$WoodDeckSF +
df$OpenPorchSF + df$EnclosedPorch + df$X3SsnPorch + df$ScreenPorch + df$LowQualFinSF + df$PoolArea
ggplot(data=df, aes(df$AllSizesSum,df$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
ggplot(data=df, aes(log(df$AllSizesSum),df$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
which(df$AllSizesSum > 100000)
## [1] 250 314 336 707
which(df$AllSizesSum > 75000)
## [1] 250 314 336 452 707 1299
which(df$AllSizesSum[1:train_size] > 50000)
## [1] 54 250 314 336 385 452 458 524 662 707 770 849 1299 1397
# 524 and 1299 were GrLivArea outliers
Because of the pressure of outliers, we will also linearize this
df$AllSizesSumLin <- log(df$AllSizesSum+1)
allsizessum_mod = lm(SalePrice ~ AllSizesSumLin + sqrt(AllSizesSumLin), data = df)
summary(allsizessum_mod)
##
## Call:
## lm(formula = SalePrice ~ AllSizesSumLin + sqrt(AllSizesSumLin),
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.39276 -0.20120 -0.01933 0.22291 1.07282
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -35.7211 8.5088 -4.198 2.85e-05 ***
## AllSizesSumLin -3.6767 0.8745 -4.204 2.78e-05 ***
## sqrt(AllSizesSumLin) 26.8017 5.4550 4.913 9.97e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.32 on 1457 degrees of freedom
## (1459 observations deleted due to missingness)
## Multiple R-squared: 0.3593, Adjusted R-squared: 0.3584
## F-statistic: 408.5 on 2 and 1457 DF, p-value: < 2.2e-16
df$AllSizesSumLin <- as.numeric(allsizessum_mod$coefficients[2]) * df$AllSizesSumLin +
as.numeric(allsizessum_mod$coefficients[3]) * sqrt(df$AllSizesSumLin)
ggplot(data=df, aes(df$AllSizesSumLin,df$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
In keeping with the theme, we also examine the weighted sum of all areas:
areamodel = lm(SalePrice ~ LotArea + TotalBsmtSF + GrLivArea + GarageArea + WoodDeckSF +
OpenPorchSF + EnclosedPorch + X3SsnPorch + ScreenPorch + PoolArea, data =df)
summary(areamodel)
##
## Call:
## lm(formula = SalePrice ~ LotArea + TotalBsmtSF + GrLivArea +
## GarageArea + WoodDeckSF + OpenPorchSF + EnclosedPorch + X3SsnPorch +
## ScreenPorch + PoolArea, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.81157 -0.09421 0.02328 0.12855 0.74747
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.101e+01 2.002e-02 550.059 < 2e-16 ***
## LotArea 4.643e-07 6.137e-07 0.757 0.44940
## TotalBsmtSF 2.154e-04 1.621e-05 13.286 < 2e-16 ***
## GrLivArea 3.173e-04 1.391e-05 22.813 < 2e-16 ***
## GarageArea 5.539e-04 3.315e-05 16.711 < 2e-16 ***
## WoodDeckSF 3.347e-04 4.945e-05 6.768 1.89e-11 ***
## OpenPorchSF 2.405e-04 9.413e-05 2.555 0.01073 *
## EnclosedPorch -4.505e-04 9.777e-05 -4.608 4.43e-06 ***
## X3SsnPorch 3.936e-04 1.986e-04 1.982 0.04771 *
## ScreenPorch 3.305e-04 1.060e-04 3.118 0.00186 **
## PoolArea -5.825e-04 1.473e-04 -3.954 8.06e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2215 on 1449 degrees of freedom
## (1459 observations deleted due to missingness)
## Multiple R-squared: 0.6946, Adjusted R-squared: 0.6925
## F-statistic: 329.5 on 10 and 1449 DF, p-value: < 2.2e-16
df$AreasSum <- as.numeric(areamodel$coefficients[2]) * df$LotArea +
as.numeric(areamodel$coefficients[3]) * df$TotalBsmtSF +
as.numeric(areamodel$coefficients[4]) * df$GrLivArea +
as.numeric(areamodel$coefficients[5]) * df$GarageArea +
as.numeric(areamodel$coefficients[6]) * df$WoodDeckSF +
as.numeric(areamodel$coefficients[7]) * df$OpenPorchSF +
as.numeric(areamodel$coefficients[8]) * df$EnclosedPorch +
as.numeric(areamodel$coefficients[9]) * df$X3SsnPorch +
as.numeric(areamodel$coefficients[10]) * df$ScreenPorch +
as.numeric(areamodel$coefficients[11]) * df$PoolArea
ggplot(data=df, aes(df$AreasSum,df$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
ggplot(data=df, aes(log(df$AreasSum+1),df$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x +sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
df$AreasSum <- log(df$AreasSum+1)
Another possibility is the total living area
livingareamodel = lm(SalePrice ~ BsmtFinSF1 + BsmtFinSF2 + GrLivArea, data = df)
summary(livingareamodel)
##
## Call:
## lm(formula = SalePrice ~ BsmtFinSF1 + BsmtFinSF2 + GrLivArea,
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.1645 -0.1348 0.0326 0.1528 0.8011
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.118e+01 2.184e-02 511.764 <2e-16 ***
## BsmtFinSF1 2.080e-04 1.584e-05 13.129 <2e-16 ***
## BsmtFinSF2 5.700e-05 4.381e-05 1.301 0.194
## GrLivArea 4.954e-04 1.373e-05 36.069 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2696 on 1456 degrees of freedom
## (1459 observations deleted due to missingness)
## Multiple R-squared: 0.5453, Adjusted R-squared: 0.5443
## F-statistic: 582 on 3 and 1456 DF, p-value: < 2.2e-16
df$LivArea <- df$BsmtFinSF1 + df$BsmtFinSF2 + df$GrLivArea
df$LivAreaWt <- as.numeric(livingareamodel$coefficients[2]) * df$BsmtFinSF1 +
as.numeric(livingareamodel$coefficients[3]) * df$BsmtFinSF2 +
as.numeric(livingareamodel$coefficients[4]) * df$GrLivArea
ggplot(data=df, aes(df$LivArea,df$SalePrice)) + geom_point()+
geom_smooth(method='lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
ggplot(data=df, aes(log(df$LivArea+1),df$SalePrice)) + geom_point()+
geom_smooth(method='lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
ggplot(data=df, aes(df$LivAreaWt,df$SalePrice)) + geom_point()+
geom_smooth(method='lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
ggplot(data=df, aes(log(df$LivAreaWt+1),df$SalePrice)) + geom_point()+
geom_smooth(method='lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
df$LivArea <- log(df$LivArea+1)
df$LivAreaWt <- log(df$LivAreaWt+1)
LivArea.mod = lm(SalePrice ~ LivArea + sqrt(LivArea), data=df)
summary(LivArea.mod)
##
## Call:
## lm(formula = SalePrice ~ LivArea + sqrt(LivArea), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.54966 -0.16713 -0.01589 0.17313 0.92012
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.117 7.299 1.386 0.166
## LivArea 1.373 0.972 1.412 0.158
## sqrt(LivArea) -3.075 5.328 -0.577 0.564
##
## Residual standard error: 0.2698 on 1457 degrees of freedom
## (1459 observations deleted due to missingness)
## Multiple R-squared: 0.5445, Adjusted R-squared: 0.5439
## F-statistic: 870.9 on 2 and 1457 DF, p-value: < 2.2e-16
df$LivArea <- as.numeric(LivArea.mod$coefficients[2]) * df$LivArea +
as.numeric(LivArea.mod$coefficients[3]) * sqrt(df$LivArea)
LivAreaWt.mod = lm(SalePrice ~ LivAreaWt + sqrt(LivAreaWt), data=df)
summary(LivAreaWt.mod)
##
## Call:
## lm(formula = SalePrice ~ LivAreaWt + sqrt(LivAreaWt), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.32534 -0.13569 0.01827 0.15437 0.73786
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.5256 0.2751 30.994 < 2e-16 ***
## LivAreaWt -1.5833 0.4485 -3.530 0.000428 ***
## sqrt(LivAreaWt) 5.7819 0.7044 8.209 4.86e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2482 on 1457 degrees of freedom
## (1459 observations deleted due to missingness)
## Multiple R-squared: 0.6144, Adjusted R-squared: 0.6139
## F-statistic: 1161 on 2 and 1457 DF, p-value: < 2.2e-16
df$LivAreaWt <- as.numeric(LivAreaWt.mod$coefficients[2]) * df$LivAreaWt +
as.numeric(LivAreaWt.mod$coefficients[3]) * sqrt(df$LivAreaWt)
ggplot(data=df, aes(df$LivArea,df$SalePrice)) + geom_point()+
geom_smooth(method='lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
ggplot(data=df, aes(df$LivAreaWt,df$SalePrice)) + geom_point()+
geom_smooth(method='lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
ggplot(data=df, aes(df$BsmtUnfSF,df$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
ggplot(data=df, aes(log(df$BsmtUnfSF+1),df$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
This is not great, but there’s no good reason to transform it.
ggplot(data=df, aes(df$LowQualFinSF,df$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
ggplot(data=df, aes(log(df$LowQualFinSF+1),df$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
length(which(df$LowQualFinSF > 0))
## [1] 40
This has very low statistics, so we might want to drop it.
ggplot(data=df, aes(df$BedroomAbvGr,df$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
ggplot(data=df, aes(log(df$BedroomAbvGr+1),df$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
ggplot(data=df, aes(df$KitchenAbvGr,df$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
ggplot(data=df, aes(log(df$KitchenAbvGr+1),df$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
We will set this up in order to do an separate regression on only the homes with garages. When regressing on the entire data set, we should drop the columns associated with the GarageAge.
df_garage <- subset(df, df$NoGarage == 0)
df_garage$GarageYrBlt <- as.numeric(df_garage$GarageYrBlt)
df_garage$GarageAge <- df_garage$YrSold - df_garage$GarageYrBlt
ggplot(data=df_garage, aes(df_garage$GarageAge,df_garage$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1383 rows containing non-finite values (stat_smooth).
## Warning: Removed 1383 rows containing missing values (geom_point).
ggplot(data=df_garage, aes(log(df_garage$GarageAge+2),df_garage$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x + sqrt(x))
## Warning: Removed 1383 rows containing non-finite values (stat_smooth).
## Warning: Removed 1383 rows containing missing values (geom_point).
summary(df_garage$GarageAge)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.00 6.00 29.00 29.77 48.00 114.00
df_garage$GarageAgeLin <- log(df_garage$GarageAge+2)
garageage.mod = lm(data=df_garage, SalePrice ~ GarageAgeLin + sqrt(GarageAgeLin))
summary(garageage.mod)
##
## Call:
## lm(formula = SalePrice ~ GarageAgeLin + sqrt(GarageAgeLin), data = df_garage)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.39325 -0.19050 -0.01748 0.16530 1.40009
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.20766 0.14402 84.766 < 2e-16 ***
## GarageAgeLin -0.38708 0.06517 -5.940 3.61e-09 ***
## sqrt(GarageAgeLin) 0.59245 0.19862 2.983 0.00291 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3106 on 1376 degrees of freedom
## (1383 observations deleted due to missingness)
## Multiple R-squared: 0.3336, Adjusted R-squared: 0.3327
## F-statistic: 344.4 on 2 and 1376 DF, p-value: < 2.2e-16
df_garage$GarageAgeLin <- as.numeric(garageage.mod$coefficients[2]) * df_garage$GarageAgeLin +
as.numeric(garageage.mod$coefficients[3]) * sqrt(df_garage$GarageAgeLin)
ggplot(data=df_garage, aes(df_garage$GarageAgeLin,df_garage$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1383 rows containing non-finite values (stat_smooth).
## Warning: Removed 1383 rows containing missing values (geom_point).
df <- merge(x = df, y = df_garage, by = names(df), all.x = TRUE)
# df$GarageAge[which(is.na(df$GarageAge))] <- rep('None')
# df$GarageAgeLin[which(is.na(df$GarageAgeLin))] <- rep('None')
df$GarageYrBlt <- NULL
With only 13 houses with pools, we can’t regress on this with any significance. We will setup a boolean:
length(which(df$PoolArea > 0))
## [1] 13
df$HasPool <- 0
df$HasPool[df$PoolArea > 0] <- 1
df$PoolArea <- NULL
length(which(df$MiscVal > 0))
## [1] 103
ggplot(data=df, aes(df$MiscVal,df$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
ggplot(data=df, aes(log(df$MiscVal+1),df$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
df$MiscVal <- NULL
There doesn’t seem to be any useful functional relationship here, even if we forget about the large number of observations at zero. We won’t regress on this.
ggplot(data=df, aes(df$MoSold,df$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
ggplot(data=df, aes(log(df$MoSold+1),df$SalePrice)) + geom_point() +
geom_smooth(method = 'lm', formula = y ~ x )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).
df$MoSold <- as.factor(df$MoSold)
df$YrSold <- as.factor(df$YrSold)
df$YrSoldcat <- NULL
# cleaning up unnecessary features
This doesn’t seem too interesting.
At this point, the only NAs are in SalePrice, GarageAge and GarageAgeLin. We want to range the numeric features, besides Id and SalePrice, as well as create numerical dummies for the factors. We will use caret.
train_df <- df[1:train_size, -72]
saleprice_df <- data.frame(df[1:train_size, c(1,72,73)])
test_df <- df[(train_size + 1):(train_size + 1459),-72]
train_df[,'HouseId'] <- sapply(train_df[,'HouseId'] , as.character)
test_df[,'HouseId'] <- sapply(test_df[,'HouseId'] , as.character)
pprules <- preProcess(train_df, method = c("range"))
train_pp_df <- data.frame(predict(pprules, newdata = train_df))
test_pp_df <- data.frame(predict(pprules, newdata = test_df))
train_pp_df$Id <- as.numeric(row.names(train_pp_df))
test_pp_df$Id <- as.numeric(row.names(test_pp_df))
train_pp_df[,'HouseId'] <- sapply(train_pp_df[,'HouseId'] , as.numeric)
test_pp_df[,'HouseId'] <- sapply(test_pp_df[,'HouseId'] , as.numeric)
# train_pp_df <- merge(x = train_pp_df, y = saleprice_df, by = 'Id', all.x = TRUE)
train_pp_df <- merge(x = train_pp_df, y = saleprice_df, by = 'HouseId', all.x = TRUE)
train_pp_df$Id.x <- NULL
train_pp_df$Id.y <- NULL
dummies <- dummyVars(~ ., data = train_pp_df)
train_ppd_df <- data.frame(predict(dummies, newdata = train_pp_df))
# test needs to have same columns as train
test_pp_df$SalePrice <- rep(0)
test_ppd_df <- data.frame(predict(dummies, newdata = test_pp_df))
test_ppd_df$SalePrice <- NULL
We have to remove the levels that were missing in the test set
train_ppd_df$Condition.2RRAe <- NULL
train_ppd_df$Condition.2RRAn <- NULL
train_ppd_df$Condition.2RRNn <- NULL
train_ppd_df$HouseStyle.2.5Fin <- NULL
train_ppd_df$RoofMatl.ClyTile <- NULL
train_ppd_df$RoofMatl.Membran <- NULL
train_ppd_df$RoofMatl.Metal <- NULL
train_ppd_df$RoofMatl.Roll <- NULL
train_ppd_df$Exterior1st.ImStucc <- NULL
train_ppd_df$Exterior1st.Stone <- NULL
train_ppd_df$Exterior2nd.Other <- NULL
train_ppd_df$Heating.Floor <- NULL
train_ppd_df$Heating.OthW <- NULL
train_ppd_df$Electrical.Mix <- NULL
train_ppd_df$GarageQual.Ex <- NULL
train_ppd_df$PoolQC.Fa <- NULL
test_ppd_df$Condition.2RRAe <- NULL
test_ppd_df$Condition.2RRAn <- NULL
test_ppd_df$Condition.2RRNn <- NULL
test_ppd_df$HouseStyle.2.5Fin <- NULL
test_ppd_df$RoofMatl.ClyTile <- NULL
test_ppd_df$RoofMatl.Membran <- NULL
test_ppd_df$RoofMatl.Metal <- NULL
test_ppd_df$RoofMatl.Roll <- NULL
test_ppd_df$Exterior1st.ImStucc <- NULL
test_ppd_df$Exterior1st.Stone <- NULL
test_ppd_df$Exterior2nd.Other <- NULL
test_ppd_df$Heating.Floor <- NULL
test_ppd_df$Heating.OthW <- NULL
test_ppd_df$Electrical.Mix <- NULL
test_ppd_df$GarageQual.Ex <- NULL
test_ppd_df$PoolQC.Fa <- NULL
Test for near-zero variance.
x = nearZeroVar(train_ppd_df, saveMetrics = TRUE)
x[x[,"zeroVar"] > 0, ]
## freqRatio percentUnique zeroVar nzv
## MSSubClass.150 0 0.06849315 TRUE TRUE
summary(as.factor(train_ppd_df$MSSubClass.150))
## 0
## 1460
summary(as.factor(test_ppd_df$MSSubClass.150))
## 0 1
## 1458 1
train_ppd_df$MSSubClass.150 <- NULL
test_ppd_df$MSSubClass.150 <- NULL
Look more features with very low statistics:
train_ppd_cols = colnames(train_ppd_df)
train_ppd_cols_df = as.data.frame(train_ppd_cols)
for (i in 1:nrow(train_ppd_cols_df)){
train_ppd_cols_df$count[i] <- length(which(train_ppd_df[train_ppd_cols[i]] !=0
))
}
lowstat_df <- train_ppd_cols_df[which(train_ppd_cols_df$count <= 10),]
lowstat_df
## train_ppd_cols count
## 4 MSSubClass.40 4
## 17 MSZoning.C..all. 10
## 24 Street.Grvl 6
## 31 LotShape.IR3 10
## 40 LotConfig.FR3 4
## 46 Neighborhood.Blueste 2
## 59 Neighborhood.NPkVill 9
## 73 Condition1.PosA 8
## 77 Condition1.RRNe 2
## 78 Condition1.RRNn 5
## 79 Condition2.Artery 2
## 80 Condition2.Feedr 6
## 82 Condition2.PosA 1
## 83 Condition2.PosN 2
## 84 Condition2.RRAe 1
## 85 Condition2.RRAn 1
## 86 Condition2.RRNn 2
## 105 RoofStyle.Mansard 7
## 106 RoofStyle.Shed 2
## 109 RoofMatl.WdShake 5
## 110 RoofMatl.WdShngl 6
## 112 Exterior1st.AsphShn 1
## 113 Exterior1st.BrkComm 2
## 115 Exterior1st.CBlock 1
## 125 Exterior2nd.AsphShn 3
## 126 Exterior2nd.Brk.Cmn 7
## 128 Exterior2nd.CBlock 1
## 131 Exterior2nd.ImStucc 10
## 134 Exterior2nd.Stone 5
## 148 ExterCond.Ex 3
## 151 ExterCond.Po 1
## 157 Foundation.Stone 6
## 158 Foundation.Wood 3
## 167 BsmtCond.Po 2
## 195 HeatingQC.Po 1
## 213 Functional.Maj2 5
## 217 Functional.Sev 1
## 226 GarageType.2Types 6
## 230 GarageType.CarPort 9
## 242 GarageQual.Po 3
## 244 GarageCond.Ex 2
## 246 GarageCond.Gd 9
## 248 GarageCond.Po 7
## 258 PoolQC.Ex 2
## 259 PoolQC.Gd 3
## 266 MiscFeature.Gar2 2
## 268 MiscFeature.Othr 2
## 270 MiscFeature.TenC 1
## 289 SaleType.Con 2
## 290 SaleType.ConLD 9
## 291 SaleType.ConLI 5
## 292 SaleType.ConLw 5
## 293 SaleType.CWD 4
## 295 SaleType.Oth 3
## 298 SaleCondition.AdjLand 4
## 314 OverallQualcat.1 2
## 315 OverallQualcat.2 3
## 325 OverallQualBad 5
## 326 OverallCondcat.1 1
## 327 OverallCondcat.2 5
## 363 PoolQCnums 7
## 403 HasPool 7
verylowstat_df <- train_ppd_cols_df[which(train_ppd_cols_df$count <= 5),]
verylowstat_df
## train_ppd_cols count
## 4 MSSubClass.40 4
## 40 LotConfig.FR3 4
## 46 Neighborhood.Blueste 2
## 77 Condition1.RRNe 2
## 78 Condition1.RRNn 5
## 79 Condition2.Artery 2
## 82 Condition2.PosA 1
## 83 Condition2.PosN 2
## 84 Condition2.RRAe 1
## 85 Condition2.RRAn 1
## 86 Condition2.RRNn 2
## 106 RoofStyle.Shed 2
## 109 RoofMatl.WdShake 5
## 112 Exterior1st.AsphShn 1
## 113 Exterior1st.BrkComm 2
## 115 Exterior1st.CBlock 1
## 125 Exterior2nd.AsphShn 3
## 128 Exterior2nd.CBlock 1
## 134 Exterior2nd.Stone 5
## 148 ExterCond.Ex 3
## 151 ExterCond.Po 1
## 158 Foundation.Wood 3
## 167 BsmtCond.Po 2
## 195 HeatingQC.Po 1
## 213 Functional.Maj2 5
## 217 Functional.Sev 1
## 242 GarageQual.Po 3
## 244 GarageCond.Ex 2
## 258 PoolQC.Ex 2
## 259 PoolQC.Gd 3
## 266 MiscFeature.Gar2 2
## 268 MiscFeature.Othr 2
## 270 MiscFeature.TenC 1
## 289 SaleType.Con 2
## 291 SaleType.ConLI 5
## 292 SaleType.ConLw 5
## 293 SaleType.CWD 4
## 295 SaleType.Oth 3
## 298 SaleCondition.AdjLand 4
## 314 OverallQualcat.1 2
## 315 OverallQualcat.2 3
## 325 OverallQualBad 5
## 326 OverallCondcat.1 1
## 327 OverallCondcat.2 5
verylowstat_cols <- as.factor(verylowstat_df$train_ppd_cols)
for (i in verylowstat_cols){
train_ppd_df[i] <- NULL
test_ppd_df[i] <- NULL
}
We’ve dropped any feature that had 5 or less observations in the training set. Now we want to look for very highly correlated features. We can start by guessing that there are some in the Garage and Basement subsets
train_ppd_cols <- colnames(train_ppd_df)
garage_cols <- grep("Garage+", train_ppd_cols, perl=TRUE, value=TRUE)
garage_cor <- abs(cor(train_ppd_df[garage_cols]))
diag(garage_cor) <- 0
which(garage_cor == 1, arr.ind = T)
## row col
## GarageFinish.None 9 7
## GarageQual.None 16 7
## GarageCond.None 20 7
## NoGarage 30 7
## GarageType.None 7 9
## GarageQual.None 16 9
## GarageCond.None 20 9
## NoGarage 30 9
## GarageType.None 7 16
## GarageFinish.None 9 16
## GarageCond.None 20 16
## NoGarage 30 16
## GarageType.None 7 20
## GarageFinish.None 9 20
## GarageQual.None 16 20
## NoGarage 30 20
## GarageType.None 7 30
## GarageFinish.None 9 30
## GarageQual.None 16 30
## GarageCond.None 20 30
train_ppd_df$GarageFinish.None <- NULL
train_ppd_df$GarageQual.None <- NULL
train_ppd_df$GarageCond.None <- NULL
train_ppd_df$GarageType.None <- NULL
test_ppd_df$GarageFinish.None <- NULL
test_ppd_df$GarageQual.None <- NULL
test_ppd_df$GarageCond.None <- NULL
test_ppd_df$GarageType.None <- NULL
basement_cols <- grep("Bsmt+", train_ppd_cols, perl=TRUE, value=TRUE)
basement_cols <- append(basement_cols, "NoBasement")
basement_cols
## [1] "BsmtQual.Ex" "BsmtQual.Fa" "BsmtQual.Gd"
## [4] "BsmtQual.None" "BsmtQual.TA" "BsmtCond.Fa"
## [7] "BsmtCond.Gd" "BsmtCond.None" "BsmtCond.TA"
## [10] "BsmtExposure.Av" "BsmtExposure.Gd" "BsmtExposure.Mn"
## [13] "BsmtExposure.No" "BsmtExposure.None" "BsmtFinType1.ALQ"
## [16] "BsmtFinType1.BLQ" "BsmtFinType1.GLQ" "BsmtFinType1.LwQ"
## [19] "BsmtFinType1.None" "BsmtFinType1.Rec" "BsmtFinType1.Unf"
## [22] "BsmtFinSF1" "BsmtFinType2.ALQ" "BsmtFinType2.BLQ"
## [25] "BsmtFinType2.GLQ" "BsmtFinType2.LwQ" "BsmtFinType2.None"
## [28] "BsmtFinType2.Rec" "BsmtFinType2.Unf" "BsmtFinSF2"
## [31] "BsmtUnfSF" "TotalBsmtSF" "BsmtFullBath"
## [34] "BsmtHalfBath" "BsmtQualnums" "BsmtCondnums"
## [37] "BsmtExposurenums" "BsmtFinType1nums" "BsmtFinType2nums"
## [40] "NoBasement"
basement_cor <- abs(cor(train_ppd_df[basement_cols]))
diag(basement_cor) <- 0
which(basement_cor > 0.99, arr.ind = T)
## row col
## BsmtCond.None 8 4
## BsmtExposure.None 14 4
## BsmtFinType1.None 19 4
## BsmtFinType2.None 27 4
## NoBasement 40 4
## BsmtQual.None 4 8
## BsmtExposure.None 14 8
## BsmtFinType1.None 19 8
## BsmtFinType2.None 27 8
## NoBasement 40 8
## BsmtQual.None 4 14
## BsmtCond.None 8 14
## BsmtFinType1.None 19 14
## BsmtFinType2.None 27 14
## NoBasement 40 14
## BsmtQual.None 4 19
## BsmtCond.None 8 19
## BsmtExposure.None 14 19
## BsmtFinType2.None 27 19
## NoBasement 40 19
## BsmtQual.None 4 27
## BsmtCond.None 8 27
## BsmtExposure.None 14 27
## BsmtFinType1.None 19 27
## NoBasement 40 27
## BsmtQual.None 4 40
## BsmtCond.None 8 40
## BsmtExposure.None 14 40
## BsmtFinType1.None 19 40
## BsmtFinType2.None 27 40
train_ppd_df$BsmtCond.None <- NULL
train_ppd_df$BsmtExposure.None <- NULL
train_ppd_df$BsmtFinType1.None <- NULL
train_ppd_df$BsmtFinType2.None <- NULL
train_ppd_df$BsmtQual.None <- NULL
test_ppd_df$BsmtCond.None <- NULL
test_ppd_df$BsmtExposure.None <- NULL
test_ppd_df$BsmtFinType1.None <- NULL
test_ppd_df$BsmtFinType2.None <- NULL
test_ppd_df$BsmtQual.None <- NULL
Now the numerical variables.
nums <- sapply(train_ppd_df, is.numeric)
nums["SalePrice"] <- FALSE
correlmatrix <- abs(cor(train_ppd_df[nums]))
diag(correlmatrix) <- 0
which(correlmatrix > 0.99, arr.ind = T)
## row col
## BldgType.Duplex 78 11
## BldgType.2fmCon 77 15
## Street.Pave 24 23
## Street.Grvl 23 24
## MSSubClass.190 15 77
## MSSubClass.90 11 78
## HasPool 350 221
## Agebin 327 324
## Age 324 327
## PoolQC.None 221 350
train_ppd_df$Street.Pave <- NULL
train_ppd_df$MSSubClass.190 <- NULL
train_ppd_df$MSSubClass.90 <- NULL
train_ppd_df$PoolQC.None <- NULL
test_ppd_df$Street.Pave <- NULL
test_ppd_df$MSSubClass.190 <- NULL
test_ppd_df$MSSubClass.90 <- NULL
test_ppd_df$PoolQC.None <- NULL
Finally we’ll write our tidy data to a file.
dim(train_ppd_df)
## [1] 1460 347
dim(test_ppd_df)
## [1] 1459 346
# write our cleaned data to a file
write.csv(train_ppd_df, file = train_output, row.names = FALSE)
write.csv(test_ppd_df, file = test_output, row.names = FALSE)
We’ll fit our models in python.
Note: GarageAge and GarageAgeLin both have NAs and should be dropped if we’re not restricting to the subset with NoGarage == 0.