In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import seaborn as sb

ModuleNotFoundError: No module named 'statsmodels'

In [None]:
# Load the housing data from a CSV file that is expected to sit in the
# notebook's working directory (the path is relative).
houses = pd.read_csv("SaratogaHouses_dummies.csv") 

In [None]:
# Preview the first rows of the loaded frame.
houses.head()

In [None]:
# Pairwise correlations between all columns of the data set.
Corr_mat = houses.corr()

# Render the matrix as a heatmap with square cells on a large canvas; the
# colour scale is capped at 0.8.
fig = plt.figure(figsize=(15, 15))
sb.heatmap(data=Corr_mat, vmax=0.8, square=True, ax=fig.gca())
plt.show()

In [None]:
# Column that the regression should explain.
outcome = 'price'

# Candidate explanatory columns for the full model, one per line so that
# later removals are easy to follow.
predictors = [
    'lotSize',
    'age',
    'landValue',
    'livingArea',
    'pctCollege',
    'bedrooms',
    'fireplaces',
    'bathrooms',
    'rooms',
    'heatingElectric',
    'sewerPublic',
    'waterfront',
    'newConstruction',
    'centralAir',
]

In [None]:
# Print the Pearson correlation coefficient for every unordered pair of
# predictors; each pair appears exactly once, in list order.
for pos, first in enumerate(predictors):
    for second in predictors[pos + 1:]:
        corr, _ = pearsonr(houses[first], houses[second])
        print(f'The correlation between {first} and {second} is {corr}')

Let's see how linearly the predictors correlate with the price.

In [None]:
# Scatter every predictor against the outcome to judge visually how linear
# each relationship is. The grid is sized from the number of predictors so
# that none is left out (a fixed 3x3 grid only has room for the first nine).
n_cols = 3
n_rows = max(1, -(-len(predictors) // n_cols))  # ceiling division
fig, ax = plt.subplots(
    n_rows, n_cols,
    figsize=[4 * n_cols, 10 / 3 * n_rows],  # same panel size as a 12x10 3x3 grid
    squeeze=False,                           # always a 2-D array of axes
)

panels = list(ax.flat)
for axi, predictor in zip(panels, predictors):
    axi.scatter(houses[predictor], houses[outcome], color='tab:orange')
    axi.set_ylabel('Price')
    axi.set_xlabel(f'Predictor: {predictor}')
    axi.set_title("One predictor")

# Hide the panels left over when the grid has more cells than predictors.
for axi in panels[len(predictors):]:
    axi.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Ordinary least squares fit of the outcome on the full predictor set.
# A column of ones named `const` is appended to serve as the intercept.
model = sm.OLS(
    endog=houses[outcome],
    exog=houses[predictors].assign(const=1),
)
results = model.fit()

In [None]:
# Regression summary of the fit on the full predictor list.
results.summary()

Let's start backward elimination. The least significant predictor is fireplaces, so it is removed first.

In [None]:
# Elimination step 1: the full predictor list without 'fireplaces'.
predictors1 = [name for name in predictors if name != 'fireplaces']

In [None]:
# Refit by ordinary least squares on predictors1; the appended `const` column
# of ones serves as the intercept.
model1 = sm.OLS(
    endog=houses[outcome],
    exog=houses[predictors1].assign(const=1),
)
results1 = model1.fit()

In [None]:
# Regression summary of the fit on predictors1 (no 'fireplaces').
results1.summary()

Dropping fireplaces did not change $R^2_{adj}$ or the AIC, so removing it was justified. 
The next predictor to be dropped is sewerPublic.

In [None]:
# Elimination step 2: predictors1 without 'sewerPublic'.
predictors2 = [name for name in predictors1 if name != 'sewerPublic']

In [None]:
# Refit by ordinary least squares on predictors2; the appended `const` column
# of ones serves as the intercept.
model2 = sm.OLS(
    endog=houses[outcome],
    exog=houses[predictors2].assign(const=1),
)
results2 = model2.fit()

In [None]:
# Regression summary of the fit on predictors2 (no 'sewerPublic').
results2.summary()

Dropping sewerPublic did not change $R^2_{adj}$ or the AIC, so removing it was justified. The next predictor to be dropped is pctCollege.

In [None]:
# Elimination step 3: predictors2 without 'pctCollege'.
predictors3 = [name for name in predictors2 if name != 'pctCollege']

In [None]:
# Refit by ordinary least squares on predictors3; the appended `const` column
# of ones serves as the intercept.
model3 = sm.OLS(
    endog=houses[outcome],
    exog=houses[predictors3].assign(const=1),
)
results3 = model3.fit()

In [None]:
# Regression summary of the fit on predictors3 (no 'pctCollege').
results3.summary()

Dropping pctCollege did not change $R^2_{adj}$ or the AIC, so removing it was justified. heatingElectric is borderline significant. 
Let's drop it and look at how this influences the model.

In [None]:
# Elimination step 4: predictors3 without 'heatingElectric'.
predictors4 = [name for name in predictors3 if name != 'heatingElectric']

In [None]:
# Refit by ordinary least squares on predictors4; the appended `const` column
# of ones serves as the intercept.
model4 = sm.OLS(
    endog=houses[outcome],
    exog=houses[predictors4].assign(const=1),
)
results4 = model4.fit()

In [None]:
# Regression summary of the fit on predictors4 (no 'heatingElectric').
results4.summary()

In [None]:
# Elimination step 5: predictors4 without 'rooms'.
predictors5 = [name for name in predictors4 if name != 'rooms']

In [None]:
# Refit by ordinary least squares on predictors5; the appended `const` column
# of ones serves as the intercept.
model5 = sm.OLS(
    endog=houses[outcome],
    exog=houses[predictors5].assign(const=1),
)
results5 = model5.fit()

In [None]:
# Regression summary of the fit on predictors5 (no 'rooms').
results5.summary()

$R^2$ dropped a tiny bit, but the multicollinearity problems persist. Let's eliminate another predictor that depends on the remaining ones.

In [None]:
# Elimination step 6: predictors5 without 'bedrooms'.
predictors6 = [name for name in predictors5 if name != 'bedrooms']

In [None]:
# Refit by ordinary least squares on predictors6; the appended `const` column
# of ones serves as the intercept.
model6 = sm.OLS(
    endog=houses[outcome],
    exog=houses[predictors6].assign(const=1),
)
results6 = model6.fit()

In [None]:
# Regression summary of the fit on predictors6 (no 'bedrooms').
results6.summary()

$R^2$ dropped a tiny bit, but the multicollinearity problems persist. Let's eliminate another predictor that depends on the remaining ones.

In [None]:
# Elimination step 7: predictors6 without 'bathrooms'.
predictors7 = [name for name in predictors6 if name != 'bathrooms']

In [None]:
# Refit by ordinary least squares on predictors7; the appended `const` column
# of ones serves as the intercept.
model7 = sm.OLS(
    endog=houses[outcome],
    exog=houses[predictors7].assign(const=1),
)
results7 = model7.fit()

In [None]:
# Regression summary of the fit on predictors7 (no 'bathrooms').
results7.summary()

$R^2$ dropped a tiny bit, but the multicollinearity problems persist. Let's eliminate another predictor that depends on the remaining ones.

In [None]:
# Elimination step 8: predictors7 without 'newConstruction'.
predictors8 = [name for name in predictors7 if name != 'newConstruction']

In [None]:
# Refit by ordinary least squares on predictors8; the appended `const` column
# of ones serves as the intercept.
model8 = sm.OLS(
    endog=houses[outcome],
    exog=houses[predictors8].assign(const=1),
)
results8 = model8.fit()

In [None]:
# Regression summary of the fit on predictors8 (no 'newConstruction').
results8.summary()

In [None]:
# For comparison with NN methods, keep the mean squared error of the residuals
# of the final model (results8).
# NOTE(review): statsmodels documents `mse_resid` as the residual sum of squares
# divided by the residual degrees of freedom rather than by the number of
# observations — confirm this matches how the NN error is computed.
mse = results8.mse_resid

In [None]:
# Square root of the residual MSE: roughly by how much the predicted house
# price is off on average, expressed in the units of `price`.
mse**0.5

In [None]:
# Here are the parameters of our final model: one coefficient per column in
# predictors8, followed by the `const` intercept column.
results8.params

In [None]:
# Let's make a prediction for a single example house.
para = results8.params

# Describe the house by feature name rather than by position, so that every
# value is guaranteed to meet its own coefficient even if the predictor list
# of the final model is reordered or changed later. `const` must be 1 because
# it multiplies the intercept.
house = {
    'lotSize': 0.58,
    'age': 40,
    'landValue': 40000,
    'livingArea': 1980,
    'waterfront': 1,
    'centralAir': 1,
    'const': 1,
}

# Order the values like the fitted parameters; a feature the model needs but
# the house lacks raises a KeyError instead of silently shifting the values.
X = [house[name] for name in para.index]
prediction = np.dot(para, X)
print(f"This house should cost {prediction}$")