In this notebook, we're specifically working on the dataset formed by dropping (31, 496, 524, 917, 1299) with all features.
We note that houses with a basement or garage have additional features, which might improve the regression. We will explore the best way to exploit this by considering subsets of the data.
import itertools
import numpy as np
import pandas as pd
import scipy
from scipy import optimize
pd.set_option('display.precision',20)
pd.set_option('display.max_colwidth',100)
from sklearn import linear_model, svm, tree
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_predict, KFold, cross_val_score, \
GridSearchCV, RandomizedSearchCV, ShuffleSplit, StratifiedShuffleSplit
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
from time import time
from scipy.stats import randint as sp_randint
import matplotlib.pylab as plt
from matplotlib.pylab import rcParams
from matplotlib import pyplot
rcParams['figure.figsize'] = 12, 4
%matplotlib inline
# def to compare goodness of fit on training set
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between true and predicted targets."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)
# Utility function to report best scores
def report(results, n_top=3):
    """Print the top-`n_top` ranked parameter settings from a CV results dict.

    `results` is expected to look like `cv_results_` from a sklearn search:
    arrays keyed by 'rank_test_score', 'mean_test_score', 'std_test_score'
    and a list under 'params'.
    """
    for rank in range(1, n_top + 1):
        for idx in np.flatnonzero(results['rank_test_score'] == rank):
            mean = results['mean_test_score'][idx]
            std = results['std_test_score'][idx]
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(mean, std))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")
# run randomized search
def random_search(regr, param_dist, n_iter_search):
    """Randomized hyper-parameter search on the module-level training data.

    Uses the global `kfold` splitter and `x_train`/`y_train`; prints the
    elapsed wall time and the top-ranked settings via `report`.
    """
    searcher = RandomizedSearchCV(
        regr,
        param_distributions=param_dist,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        n_iter=n_iter_search,
        cv=kfold,
    )  # , verbose = 4
    t0 = time()
    searcher.fit(x_train, y_train)
    elapsed = time() - t0
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % (elapsed, n_iter_search))
    report(searcher.cv_results_)
# run single parameter search (for ridge or lasso)
def single_search(regr, params):
    """Sweep one hyper-parameter over a list of values with cross-validation.

    `params` maps a parameter name to a list of candidate values (intended
    for a single key). Returns a DataFrame with one row per value holding
    the value, the CV RMSE, and a 'std dev' column.
    NOTE(review): 'std dev' is sqrt(std of the neg-MSE scores), which is not
    literally the std of the RMSE — kept as-is for behavior parity.
    """
    sweep = pd.DataFrame(dtype='float64')
    row = 0
    for name, values in params.items():
        for value in values:
            regr.set_params(**{name: value})
            sweep.loc[row, name] = value
            scores = cross_val_score(regr, x_train, y_train, cv=kfold,
                                     scoring='neg_mean_squared_error')
            sweep.loc[row, 'RMSE'] = np.sqrt(-scores.mean())
            sweep.loc[row, 'std dev'] = np.sqrt(scores.std())
            row += 1
    return sweep
# test against validation set
def validate(regr):
    """Fit `regr` on the global training split and return RMSE on the validation split."""
    regr.fit(x_train, y_train)
    return rmse(y_validation, regr.predict(x_validation))
# Cross-validation sets
# 10-fold CV splitter shared by all searches below.
# FIX: shuffle=True is required for random_state to take effect; recent
# scikit-learn raises ValueError when random_state is set with shuffle=False.
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
# Pre-tidied training and Kaggle test data for this feature configuration.
df = pd.read_csv("./input/train_tidy_111001001.csv")
test_df = pd.read_csv("./input/test_tidy_111001001.csv")
# Count houses without a garage.
df[df['NoGarage']==1].shape
Only 80 of the houses in the training data have no garage. Let us set a baseline validation error for when we don't subset.
# Hold out 20% of the rows once as a fixed validation split.
ss = ShuffleSplit(n_splits=1, test_size=0.20, random_state=71)
X = df.values
for train_idx, validation_idx in ss.split(X):
    train_df = df.iloc[train_idx]
    validation_df = df.iloc[validation_idx]
# Garage-age columns are dropped for the full-data baseline since they are
# undefined for houses with no garage — presumably sentinel-filled upstream;
# TODO confirm against the tidying notebook.
y_validation = validation_df['SalePrice'].values
x_validation = validation_df.drop(['HouseId', 'SalePrice', 'GarageAge', 'GarageAgeLin'],axis=1).values
y_train = train_df['SalePrice'].values
x_train = train_df.drop(['HouseId', 'SalePrice', 'GarageAge', 'GarageAgeLin'],axis=1).values
# Baseline validation RMSE: LassoLarsCV on all rows.
lassolarscv_regr = linear_model.LassoLarsCV()
baseline = validate(lassolarscv_regr)
baseline
Now we restrict to houses with a garage.
# Restrict to houses WITH a garage — the garage-age features can be kept.
df_hasg = df[df['NoGarage']==0]
X = df_hasg.values
for train_idx, validation_idx in ss.split(X):
    train_df = df_hasg.iloc[train_idx]
    validation_df = df_hasg.iloc[validation_idx]
y_validation = validation_df['SalePrice'].values
x_validation = validation_df.drop(['HouseId', 'SalePrice'],axis=1).values
y_train = train_df['SalePrice'].values
x_train = train_df.drop(['HouseId', 'SalePrice'],axis=1).values
# Validation RMSE on the has-garage subset, compared to the baseline.
hasg = validate(lassolarscv_regr)
(hasg, hasg - baseline)
In this case, our validation error has increased. This could be due to sample size.
Let's examine the houses with basements. Only 37 houses have no basement.
# Count houses without a basement.
df[df['NoBasement']==1].shape
# Restrict to houses WITH a basement. Garage-age columns are still dropped
# because this subset can contain garage-less houses.
df_hasb = df[df['NoBasement']==0]
X = df_hasb.values
for train_idx, validation_idx in ss.split(X):
    train_df = df_hasb.iloc[train_idx]
    validation_df = df_hasb.iloc[validation_idx]
y_validation = validation_df['SalePrice'].values
x_validation = validation_df.drop(['HouseId', 'SalePrice', 'GarageAge', 'GarageAgeLin'],axis=1).values
y_train = train_df['SalePrice'].values
x_train = train_df.drop(['HouseId', 'SalePrice', 'GarageAge', 'GarageAgeLin'],axis=1).values
# Validation RMSE on the has-basement subset, compared to the baseline.
hasb = validate(lassolarscv_regr)
(hasb, hasb - baseline)
Here, our validation error is even worse than in the garage case.
# Houses with BOTH a garage and a basement (all extra features defined).
df_hasgb = df_hasg[df_hasg['NoBasement']==0]
df_hasgb.shape
1345 houses have both a garage and basement.
# Validation run on the garage-AND-basement subset, keeping every feature.
X = df_hasgb.values
for train_idx, validation_idx in ss.split(X):
    train_df = df_hasgb.iloc[train_idx]
    validation_df = df_hasgb.iloc[validation_idx]
y_validation = validation_df['SalePrice'].values
x_validation = validation_df.drop(['HouseId', 'SalePrice'],axis=1).values
y_train = train_df['SalePrice'].values
x_train = train_df.drop(['HouseId', 'SalePrice'],axis=1).values
hasgb = validate(lassolarscv_regr)
(hasgb, hasgb - baseline)
This is a small improvement over the baseline. We will now work out a scheme to make a separate fit on this subset and then combine the predictions.
Let's store a dataframe that is the complement of df_hasgb.
# Complement of df_hasgb: rows missing a garage OR a basement.
df_nogorb = df[(df['NoGarage']==1) | (df['NoBasement']==1)]
df_nogorb.shape
We'll build the validation set from the full dataset
# Build the validation split from the FULL dataset so the combined subset
# predictions are scored against the same held-out rows as the baseline.
X = df.values
for train_idx, validation_idx in ss.split(X):
    train_df = df.iloc[train_idx]
    validation_df = df.iloc[validation_idx]
We will split the training set
# Partition the training rows into the two modelling subsets.
train_df_hasgb = train_df[(train_df['NoGarage']==0) & (train_df['NoBasement']==0)]
train_df_nogorb = train_df[(train_df['NoGarage']==1) | (train_df['NoBasement']==1)]
We need to fit models on the appropriate training data.
# One LassoLarsCV per subset.
llcv_hasgb_regr = linear_model.LassoLarsCV()
llcv_nogorb_regr = linear_model.LassoLarsCV()
# NOTE(review): the "nogorb" model is trained on the FULL training split
# (train_df, not train_df_nogorb) with the garage-age columns dropped —
# presumably intentional to keep sample size, since it is only *applied*
# to the no-garage-or-basement rows below. Confirm this is deliberate.
y_train_nogorb = train_df['SalePrice'].values
x_train_nogorb = train_df.drop(['HouseId', 'SalePrice', 'GarageAge', 'GarageAgeLin'],axis=1).values
y_train_hasgb = train_df_hasgb['SalePrice'].values
x_train_hasgb = train_df_hasgb.drop(['HouseId', 'SalePrice'],axis=1).values
llcv_hasgb_regr.fit(x_train_hasgb, y_train_hasgb)
llcv_nogorb_regr.fit(x_train_nogorb, y_train_nogorb)
# Split the validation rows the same way and predict each subset with its model.
validation_df_hasgb = validation_df[(validation_df['NoGarage']==0) & (validation_df['NoBasement']==0)]
validation_df_nogorb = validation_df[(validation_df['NoGarage']==1) | (validation_df['NoBasement']==1)]
y_validation_nogorb = validation_df_nogorb['SalePrice'].values
x_validation_nogorb = validation_df_nogorb.drop(['HouseId', 'SalePrice', 'GarageAge', 'GarageAgeLin'],axis=1).values
y_validation_hasgb = validation_df_hasgb['SalePrice'].values
x_validation_hasgb = validation_df_hasgb.drop(['HouseId', 'SalePrice'],axis=1).values
y_pred_hasgb = llcv_hasgb_regr.predict(x_validation_hasgb)
y_pred_nogorb = llcv_nogorb_regr.predict(x_validation_nogorb)
# Reassemble truth and predictions in matching (nogorb-then-hasgb) order.
y_validation = np.append(y_validation_nogorb,y_validation_hasgb)
y_pred = np.append(y_pred_nogorb,y_pred_hasgb)
error = rmse(y_validation, y_pred)
(error, error-baseline)
The improvement over the baseline is in line with the previous result.
Before passing to the Kaggle test set, let's fit LassoLars models on the whole training data.
# Refit matrices built from ALL training data before predicting the Kaggle test set.
# NOTE(review): as above, the nogorb design matrix uses every row (df), not
# just df_nogorb — presumably deliberate; confirm before changing.
y_train_nogorb = df['SalePrice'].values
x_train_nogorb = df.drop(['HouseId', 'SalePrice', 'GarageAge', 'GarageAgeLin'],axis=1).values
y_train_hasgb = df_hasgb['SalePrice'].values
x_train_hasgb = df_hasgb.drop(['HouseId', 'SalePrice'],axis=1).values
LassoLars had the best results from our modeling experiments, so we'll continue using it, but we'll hand tune the L1 regularization parameter.
# Hand-tune the LassoLars L1 penalty (alpha) for the nogorb model.
x_train = x_train_nogorb
y_train = y_train_nogorb
lassolars_regr = linear_model.LassoLars(max_iter=50000)
lassolars_params = {'alpha': np.arange(0.00008, 0.00014, 0.00001).tolist()}
lassolars_df = single_search(lassolars_regr, lassolars_params)
lassolars_df.plot(x = ['alpha'], y = ['RMSE'])
# NOTE: ascending=False lists the WORST alpha first.
lassolars_df.sort_values(['RMSE'], ascending = False)
# Final nogorb LassoLars with the alpha chosen from the sweep above.
ll_nogorb_regr = linear_model.LassoLars(alpha=0.00009, max_iter=50000)
# Same alpha sweep for the hasgb model (different search range).
x_train = x_train_hasgb
y_train = y_train_hasgb
lassolars_regr = linear_model.LassoLars(max_iter=50000)
lassolars_params = {'alpha': np.arange(0.00004, 0.00011, 0.00001).tolist()}
lassolars_df = single_search(lassolars_regr, lassolars_params)
lassolars_df.plot(x = ['alpha'], y = ['RMSE'])
# NOTE: ascending=False lists the WORST alpha first.
lassolars_df.sort_values(['RMSE'], ascending = False)
# Final hasgb LassoLars with the alpha chosen from the sweep above.
ll_hasgb_regr = linear_model.LassoLars(alpha=0.00008, max_iter=50000)
# Inspect which features survive L1 selection in the hasgb model.
ll_hasgb_regr.fit(x_train_hasgb, y_train_hasgb)
lassolars_features = zip(df_hasgb.drop(['HouseId','SalePrice'],axis=1).columns, ll_hasgb_regr.coef_)
# FIX: build the frame directly from the (feature, coefficient) pairs;
# DataFrame.from_dict expects a dict, not a zip.
lassolars_features_df = pd.DataFrame(lassolars_features, columns=["Feature", "Coeff"])
# Keep non-zero coefficients only, sorted by absolute size (largest first).
lassolars_features_df = lassolars_features_df[lassolars_features_df["Coeff"]!=0]
lassolars_features_df["sort_ind"] = abs(lassolars_features_df["Coeff"])
lassolars_features_df = lassolars_features_df.sort_values(by="sort_ind", ascending = False)
# FIX: drop(columns=...) — the positional axis argument was removed in pandas 2.0.
lassolars_features_df = lassolars_features_df.drop(columns='sort_ind')
lassolars_features_df
# Inspect which features survive L1 selection in the nogorb model.
ll_nogorb_regr.fit(x_train_nogorb, y_train_nogorb)
lassolars_features = zip(df_nogorb.drop(['HouseId','SalePrice'],axis=1).columns, ll_nogorb_regr.coef_)
# FIX: build the frame directly from the (feature, coefficient) pairs;
# DataFrame.from_dict expects a dict, not a zip.
lassolars_features_df = pd.DataFrame(lassolars_features, columns=["Feature", "Coeff"])
# Keep non-zero coefficients only, sorted by absolute size (largest first).
lassolars_features_df = lassolars_features_df[lassolars_features_df["Coeff"]!=0]
lassolars_features_df["sort_ind"] = abs(lassolars_features_df["Coeff"])
lassolars_features_df = lassolars_features_df.sort_values(by="sort_ind", ascending = False)
# FIX: drop(columns=...) — the positional axis argument was removed in pandas 2.0.
lassolars_features_df = lassolars_features_df.drop(columns='sort_ind')
lassolars_features_df
We want to be able to reassemble subset predictions on the test set back into a dataframe matched on the correct HouseId.
# Split the Kaggle test set the same way as training; reset indices so
# positional predictions align with each subset frame.
test_df_hasgb = test_df[(test_df['NoGarage']==0) & (test_df['NoBasement']==0)].reset_index(drop=True)
test_df_nogorb = test_df[(test_df['NoGarage']==1) | (test_df['NoBasement']==1)].reset_index(drop=True)
x_test_nogorb = test_df_nogorb.drop(['HouseId', 'GarageAge', 'GarageAgeLin'],axis=1).values
x_test_hasgb = test_df_hasgb.drop(['HouseId'],axis=1).values
y_pred_hasgb = ll_hasgb_regr.predict(x_test_hasgb)
y_pred_nogorb = ll_nogorb_regr.predict(x_test_nogorb)
# Predictions are in log space (presumably log SalePrice from the tidying
# step — TODO confirm); undo the transform here.
y_pred_hasgb = np.exp(y_pred_hasgb)
y_pred_nogorb = np.exp(y_pred_nogorb)
ids_hasgb = test_df_hasgb.loc[:,'HouseId'].tolist()
ids_nogorb = test_df_nogorb.loc[:,'HouseId'].tolist()
# FIX: build (HouseId, SalePrice) frames directly; DataFrame.from_dict on a
# zip relied on undocumented pass-through behaviour.
nogorb_preds_df = pd.DataFrame(zip(ids_nogorb, y_pred_nogorb), columns=["HouseId", "SalePrice"])
hasgb_preds_df = pd.DataFrame(zip(ids_hasgb, y_pred_hasgb), columns=["HouseId", "SalePrice"])
# Reassemble into one submission keyed on HouseId.
preds_df = pd.concat([nogorb_preds_df,hasgb_preds_df]).sort_values('HouseId').set_index('HouseId')
preds_df.to_csv('subset_output-111001001.csv', header=True, index_label='Id')
This results in an error of 0.11998 on the public leaderboard, for a position of 936th on 2/20/17.
We can try to do a bit better by fitting XGBoost and MLP regressors to the data and making additional predictions.
# Sweep XGBoost max_depth for the nogorb data. The other hyper-parameters
# are fixed — presumably carried over from earlier tuning runs; TODO confirm.
x_train = x_train_nogorb
y_train = y_train_nogorb
xgb_regr = xgb.XGBRegressor(
    max_depth = 3,
    min_child_weight = 6.25,
    gamma = 0.0005,
    subsample = 0.4,
    colsample_bytree = 0.3,
    reg_alpha = 0.1,
    reg_lambda = 3.25,
    learning_rate = 0.06,
    n_estimators = 500,
    objective='reg:linear',
    seed = 42,
    nthread = -1,
    silent = 1)
xgb_params = {'max_depth': np.arange(1, 9, 1).tolist()}
xgb_df = single_search(xgb_regr, xgb_params)
xgb_df.plot(x = ['max_depth'], y = ['RMSE'])
# NOTE: ascending=False lists the WORST depth first.
xgb_df.sort_values(['RMSE'], ascending = False)
# Final nogorb XGBoost model with the settings chosen above (max_depth=3).
xgb_nogorb_regr = xgb.XGBRegressor(
    max_depth = 3,
    min_child_weight = 6.25,
    gamma = 0.0005,
    subsample = 0.4,
    colsample_bytree = 0.3,
    reg_alpha = 0.1,
    reg_lambda = 3.25,
    learning_rate = 0.06,
    n_estimators = 500,
    objective='reg:linear',
    seed = 42,
    nthread = -1,
    silent = 1)
# Sweep XGBoost max_depth for the hasgb data (different fixed settings).
x_train = x_train_hasgb
y_train = y_train_hasgb
xgb_regr = xgb.XGBRegressor(
    max_depth = 2,
    min_child_weight = 1,
    gamma = 0.001,
    subsample = 0.7,
    colsample_bytree = 0.105,
    reg_alpha = 0.35,
    reg_lambda = 2.5,
    learning_rate = 0.06,
    n_estimators = 800,
    objective='reg:linear',
    seed = 42,
    nthread = -1,
    silent = 1)
xgb_params = {'max_depth': np.arange(1, 6, 1).tolist()}
xgb_df = single_search(xgb_regr, xgb_params)
xgb_df.plot(x = ['max_depth'], y = ['RMSE'])
# NOTE: ascending=False lists the WORST depth first.
xgb_df.sort_values(['RMSE'], ascending = False)
# Final hasgb XGBoost model (max_depth=2), then fit both XGB models on the
# full per-subset training matrices.
xgb_hasgb_regr = xgb.XGBRegressor(
    max_depth = 2,
    min_child_weight = 1,
    gamma = 0.001,
    subsample = 0.7,
    colsample_bytree = 0.105,
    reg_alpha = 0.35,
    reg_lambda = 2.5,
    learning_rate = 0.06,
    n_estimators = 800,
    objective='reg:linear',
    seed = 42,
    nthread = -1,
    silent = 1)
xgb_nogorb_regr.fit(x_train_nogorb, y_train_nogorb)
xgb_hasgb_regr.fit(x_train_hasgb, y_train_hasgb)
# Sweep MLP max_iter for the nogorb data (lbfgs solver, one hidden layer of 9).
x_train = x_train_nogorb
y_train = y_train_nogorb
mlp_regr = MLPRegressor(activation='relu', solver='lbfgs', random_state=641,
                        hidden_layer_sizes=(9, ),
                        alpha=0.0004,
                        tol=0.0004,
                        max_iter=250)
mlp_params = {'max_iter': np.arange(25, 325, 25).tolist()}
# Alternative sweep kept for reference:
# mlp_params = {'tol': np.logspace(-4, -2, 11).tolist()}
mlp_df = single_search(mlp_regr, mlp_params)
mlp_df.plot(x = ['max_iter'], y = ['RMSE'])
mlp_df.sort_values(['RMSE'], ascending = False)
# Final nogorb MLP with the chosen settings.
mlp_nogorb_regr = MLPRegressor(activation='relu', solver='lbfgs', random_state=641,
                               hidden_layer_sizes=(9, ),
                               alpha=0.0004,
                               tol=0.0004,
                               max_iter=250)
# Sweep MLP max_iter for the hasgb data (smaller hidden layer of 2).
x_train = x_train_hasgb
y_train = y_train_hasgb
mlp_regr = MLPRegressor(activation='relu', solver='lbfgs', random_state=641,
                        hidden_layer_sizes=(2, ),
                        alpha=0.0005,
                        tol=0.0004,
                        max_iter=250)
mlp_params = {'max_iter': np.arange(300, 425, 25).tolist()}
# Alternative sweep kept for reference:
# mlp_params = {'tol': np.logspace(-4, -2, 11).tolist()}
mlp_df = single_search(mlp_regr, mlp_params)
mlp_df.plot(x = ['max_iter'], y = ['RMSE'])
mlp_df.sort_values(['RMSE'], ascending = False)
# Final hasgb MLP; max_iter=300 chosen from the sweep above.
mlp_hasgb_regr = MLPRegressor(activation='relu', solver='lbfgs', random_state=641,
                              hidden_layer_sizes=(2, ),
                              alpha=0.0005,
                              tol=0.0004,
                              max_iter=300)
# Fit both MLPs, then generate test-set predictions from all three families
# (LassoLars, XGBoost, MLP) per subset.
mlp_nogorb_regr.fit(x_train_nogorb, y_train_nogorb)
mlp_hasgb_regr.fit(x_train_hasgb, y_train_hasgb)
y_pred_hasgb_ll = ll_hasgb_regr.predict(x_test_hasgb)
y_pred_nogorb_ll = ll_nogorb_regr.predict(x_test_nogorb)
y_pred_hasgb_xgb = xgb_hasgb_regr.predict(x_test_hasgb)
y_pred_nogorb_xgb = xgb_nogorb_regr.predict(x_test_nogorb)
y_pred_hasgb_mlp = mlp_hasgb_regr.predict(x_test_hasgb)
y_pred_nogorb_mlp = mlp_nogorb_regr.predict(x_test_nogorb)
We will weight the predictions according to the cross-validation errors.
# Inverse-CV-RMSE weighted average of the three models' log-space predictions;
# the hard-coded divisors are each model's CV RMSE from the searches above.
y_pred_hasgb = (y_pred_hasgb_ll / 0.0998 + y_pred_hasgb_xgb / 0.1053
                + y_pred_hasgb_mlp / 0.1082) / (1 / 0.0998 + 1 / 0.1053 + 1 / 0.1082)
y_pred_nogorb = (y_pred_nogorb_ll / 0.1044 + y_pred_nogorb_xgb / 0.11022
                + y_pred_nogorb_mlp / 0.1166) / (1 / 0.1044 + 1 / 0.11022 + 1 / 0.1166)
# Undo the log transform of the target.
y_pred_hasgb = np.exp(y_pred_hasgb)
y_pred_nogorb = np.exp(y_pred_nogorb)
ids_hasgb = test_df_hasgb.loc[:,'HouseId'].tolist()
ids_nogorb = test_df_nogorb.loc[:,'HouseId'].tolist()
# FIX: build (HouseId, SalePrice) frames directly; DataFrame.from_dict on a
# zip relied on undocumented pass-through behaviour.
nogorb_preds_df = pd.DataFrame(zip(ids_nogorb, y_pred_nogorb), columns=["HouseId", "SalePrice"])
hasgb_preds_df = pd.DataFrame(zip(ids_hasgb, y_pred_hasgb), columns=["HouseId", "SalePrice"])
preds_df = pd.concat([nogorb_preds_df,hasgb_preds_df]).sort_values('HouseId').set_index('HouseId')
preds_df.to_csv('subset_blend_output-111001001.csv', header=True, index_label='Id')
This yielded a 0.12211 error on the public leaderboard, which was a worse result. This is likely due to the comparatively larger variance of the XGB and MLP results, as evidenced by their cross-validation errors on the training set.
Since the tree and NN models didn't seem to do so well, it might be useful to blend with a different linear model to reduce some variance error. The Elastic Net was the 2nd performer behind LassoLars, so it is a natural choice.
# Tune ElasticNet l1_ratio for the nogorb data (alpha fixed; the commented
# lines are the earlier alpha sweep kept for reference).
x_train = x_train_nogorb
y_train = y_train_nogorb
elnet_regr = linear_model.ElasticNet(alpha=0.0006, l1_ratio=0.5, max_iter=15000, random_state=7)
# elnet_params = {'alpha': np.arange(0.0001, 0.0008, 0.0001).tolist()}
elnet_params = {'l1_ratio': np.arange(0.1, 1, 0.1).tolist()}
elnet_df = single_search(elnet_regr, elnet_params)
# elnet_df.plot(x = ['alpha'], y = ['RMSE'])
elnet_df.plot(x = ['l1_ratio'], y = ['RMSE'])
elnet_df.sort_values(['RMSE'], ascending = False)
# Final nogorb ElasticNet with the chosen settings.
elnet_nogorb_regr = linear_model.ElasticNet(alpha=0.0006, l1_ratio=0.5, max_iter=15000, random_state=7)
# Same l1_ratio sweep for the hasgb data, then fit both ElasticNets.
x_train = x_train_hasgb
y_train = y_train_hasgb
elnet_regr = linear_model.ElasticNet(alpha=0.00055, l1_ratio=0.5, max_iter=15000, random_state=7)
# elnet_params = {'alpha': np.arange(0.0001, 0.0008, 0.0001).tolist()}
elnet_params = {'l1_ratio': np.arange(0.1, 1, 0.1).tolist()}
elnet_df = single_search(elnet_regr, elnet_params)
# elnet_df.plot(x = ['alpha'], y = ['RMSE'])
elnet_df.plot(x = ['l1_ratio'], y = ['RMSE'])
elnet_df.sort_values(['RMSE'], ascending = False)
# Final hasgb ElasticNet; l1_ratio=0.6 chosen from the sweep above.
elnet_hasgb_regr = linear_model.ElasticNet(alpha=0.00055, l1_ratio=0.6, max_iter=15000, random_state=7)
elnet_nogorb_regr.fit(x_train_nogorb, y_train_nogorb)
elnet_hasgb_regr.fit(x_train_hasgb, y_train_hasgb)
# Two-model linear blend: LassoLars + ElasticNet predictions combined with
# inverse-CV-RMSE weights (divisors hard-coded from the searches above).
y_pred_hasgb_elnet = elnet_hasgb_regr.predict(x_test_hasgb)
y_pred_nogorb_elnet = elnet_nogorb_regr.predict(x_test_nogorb)
y_pred_hasgb = (y_pred_hasgb_ll / 0.0998 + y_pred_hasgb_elnet / 0.1015 ) / (1 / 0.0998 + 1 / 0.1015)
y_pred_nogorb = (y_pred_nogorb_ll / 0.1044 + y_pred_nogorb_elnet / 0.1061) / (1 / 0.1044 + 1 / 0.1061)
# Undo the log transform of the target.
y_pred_hasgb = np.exp(y_pred_hasgb)
y_pred_nogorb = np.exp(y_pred_nogorb)
# FIX: build (HouseId, SalePrice) frames directly; DataFrame.from_dict on a
# zip relied on undocumented pass-through behaviour.
nogorb_preds_df = pd.DataFrame(zip(ids_nogorb, y_pred_nogorb), columns=["HouseId", "SalePrice"])
hasgb_preds_df = pd.DataFrame(zip(ids_hasgb, y_pred_hasgb), columns=["HouseId", "SalePrice"])
preds_df = pd.concat([nogorb_preds_df,hasgb_preds_df]).sort_values('HouseId').set_index('HouseId')
preds_df.to_csv('subset_linearblend_output-111001001.csv', header=True, index_label='Id')
This result gets 0.12049 on the public leaderboard, which is better than the multimodel blend above, but not as good as the original LassoLars entry.