In this lesson, we will compare different regression models in order to assess which model fits best. We will be using polynomial regression as a means to examine this topic.
Learning objectives:
We will continue to use the House data from previous notebooks.
import pandas as pd
import numpy as np
from sklearn import linear_model
import matplotlib.pyplot as plt
### Importing data:
# Explicit dtype for every column of the house-sales CSV. Identifier-like
# columns ('id', 'zipcode', 'date') and 'floors' are read as strings; the
# remaining columns are read as int or float as listed.
dtype_dict = {
    'bathrooms': float, 'waterfront': int, 'sqft_above': int, 'sqft_living15': float, 'grade': int,
    'yr_renovated': int, 'price': float, 'bedrooms': float, 'zipcode': str, 'long': float, 'sqft_lot15': float,
    'sqft_living': float, 'floors': str, 'condition': int, 'lat': float, 'date': str, 'sqft_basement': int,
    'yr_built': int, 'id': str, 'sqft_lot': int, 'view': int,
}
# read_csv already returns a DataFrame, so no additional pd.DataFrame(...)
# wrapping is needed.
sales = pd.read_csv('C:/Users/Duy Tung/Downloads/kc_house_data.csv', dtype=dtype_dict)
sales.head()
This function creates a data frame consisting of the powers of an array up to a specific degree:
def polynominal_dataframe(feature, degree, dataset):
    """Build a data frame holding the powers of one feature column.

    Args:
        feature: Name of the column in ``dataset`` to raise to powers.
        degree: Highest power to generate. Values below 2 yield only
            ``power_1``.
        dataset: DataFrame containing the columns ``'id'``, ``'price'`` and
            ``feature``. It is not modified.

    Returns:
        A new DataFrame with the columns ``id``, ``price``, ``power_1`` (the
        feature itself) and ``power_2`` ... ``power_<degree>``.
    """
    # Work on an explicit copy: renaming in place and adding columns on a
    # selection of the caller's frame triggers pandas' SettingWithCopyWarning.
    poly_dataframe = dataset[['id', 'price', feature]].copy()
    poly_dataframe = poly_dataframe.rename(columns={feature: 'power_1'})
    # range(2, degree + 1) is empty for degree < 2, so no extra guard is needed.
    for power in range(2, degree + 1):
        poly_dataframe['power_' + str(power)] = np.power(poly_dataframe['power_1'], power)
    return poly_dataframe
Let's test our function
# Smoke-test the helper: expand 'sqft_living' up to its third power and
# look at the first rows of the result.
poly1_data = polynominal_dataframe(feature='sqft_living', degree=3, dataset=sales)
poly1_data.head()
Let's use matplotlib to visualize what a polynomial regression looks like on some real data. We start with a degree 1 polynomial using 'sqft_living' (i.e. a line) to predict 'price' and plot what it looks like.
# Sort the rows in place by 'sqft_living', breaking ties by 'price'
# (presumably so that a fitted curve plotted in row order runs left to right).
sales.sort_values(by=['sqft_living', 'price'], inplace=True)
# Degree 1 keeps only 'id', 'price' and 'power_1' (the feature itself).
poly1_data = polynominal_dataframe(feature='sqft_living', degree=1, dataset=sales)
poly1_data.head()
Let's write a function that reports the intercept and weights, and produces a scatter plot of the training data (just square feet vs. price) with the fitted model for the corresponding polynomial degree of the feature ‘sqft_living’ drawn on top.
# Module-level ordinary least squares estimator. Calling fit() on it refits
# this same object, so it only ever holds the most recently fitted model.
reg = linear_model.LinearRegression()
def plot_lines(dataset, deg):
    """Fit a degree-``deg`` polynomial of 'sqft_living' to 'price' and plot it.

    Prints the list of feature column names, the fitted coefficients and the
    intercept, then calls ``plt.scatter`` for living area vs. price and
    ``plt.plot`` for the fitted values. No figure is created or shown here.

    Args:
        dataset: DataFrame with 'id', 'price' and 'sqft_living' columns.
            The fitted curve is drawn in row order, so the rows are
            presumably expected to be sorted by 'sqft_living'.
        deg: Polynomial degree; the features used are power_1 ... power_deg.

    Returns:
        None.
    """
    data = polynominal_dataframe('sqft_living', deg, dataset)
    # Target as an (n, 1) column vector.
    y = data['price'].values.reshape(-1, 1)
    arr_x = ['power_' + str(power) for power in range(1, deg + 1)]
    print(arr_x)
    x = data[arr_x]
    # Refits the shared module-level estimator `reg`.
    model_poly1 = reg.fit(x, y)
    print('coef', model_poly1.coef_)
    print('intercept', model_poly1.intercept_)
    y_hat1 = model_poly1.predict(x)
    x_line = data['power_1']
    plt.scatter(x_line, y)
    plt.plot(x_line, y_hat1)
### Trying a 1st degree polynomial
plot_lines(dataset=sales, deg=1)
### Trying a 2nd degree polynomial
plot_lines(dataset=sales, deg=2)
### Trying a 3rd degree polynomial
plot_lines(dataset=sales, deg=3)
### Trying a 15th degree polynomial
plot_lines(dataset=sales, deg=15)
What do you think of the 15th degree polynomial? Do you think this is appropriate? If we were to change the data do you think you'd get pretty much the same curve? Let's take a look.
We're going to split the sales data into four subsets of roughly equal size. Then we will estimate a 15th degree polynomial model on each of the four subsets, print the coefficients (the original assignment suggests .print_rows(num_rows = 16) to view all of them; that is an SFrame method, and here plot_lines prints them via coef_), and plot the resulting fit (as we did above).
# Load the four subsets of the sales data, using the same column dtypes as
# the full data set.
set1 = pd.read_csv(
    filepath_or_buffer='C:/Users/Duy Tung/Downloads/wk3_kc_house_set_1_data.csv',
    dtype=dtype_dict,
)
set2 = pd.read_csv(
    filepath_or_buffer='C:/Users/Duy Tung/Downloads/wk3_kc_house_set_2_data.csv',
    dtype=dtype_dict,
)
set3 = pd.read_csv(
    filepath_or_buffer='C:/Users/Duy Tung/Downloads/wk3_kc_house_set_3_data.csv',
    dtype=dtype_dict,
)
set4 = pd.read_csv(
    filepath_or_buffer='C:/Users/Duy Tung/Downloads/wk3_kc_house_set_4_data.csv',
    dtype=dtype_dict,
)
### Trying the first data set with a 15th degree polynomial
plot_lines(set1, 15)
### Trying the second data set with a 15th degree polynomial
plot_lines(set2, 15)
### Trying the third data set with a 15th degree polynomial
plot_lines(set3, 15)
### Trying the fourth data set with a 15th degree polynomial
# Uses set4: fitting set1 a second time here would leave the fourth subset
# unexamined.
plot_lines(set4, 15)
Since the “best” polynomial degree is unknown to us, we will use a held-out validation set to select it. For each degree from 1 to 15, we fit the model on the training data and compute the RSS on the validation data:
def model_poly(dataset, deg, val_data):
    """Fit a degree-``deg`` polynomial of 'sqft_living' to 'price'.

    Prints the list of feature column names used for the fit.

    Args:
        dataset: Training DataFrame with 'id', 'price' and 'sqft_living'.
        deg: Polynomial degree; the features used are power_1 ... power_deg.
        val_data: DataFrame with the same columns, from which the matching
            evaluation features are built.

    Returns:
        A tuple ``(model, x_val)``: the LinearRegression fitted on
        ``dataset`` and the power_1 ... power_deg feature frame of
        ``val_data``.
    """
    data = polynominal_dataframe('sqft_living', deg, dataset)
    data_v = polynominal_dataframe('sqft_living', deg, val_data)
    # Target as an (n, 1) column vector.
    y = data['price'].values.reshape(-1, 1)
    arr_x = ['power_' + str(power) for power in range(1, deg + 1)]
    print(arr_x)
    # Fit a fresh estimator on every call. Returning one shared module-level
    # estimator would make every returned model the same object, silently
    # overwritten by the next fit.
    model_poly1 = linear_model.LinearRegression().fit(data[arr_x], y)
    x_val = data_v[arr_x]
    return model_poly1, x_val
def testing(training_data, validation_data, degree):
    """Print fit details and evaluation RSS for each degree in 1..``degree``.

    For every degree, a polynomial model of 'sqft_living' is fitted on
    ``training_data`` via ``model_poly``. The function then prints the model
    number, coefficients, intercept, the shapes of the evaluation features
    and target, and the residual sum of squares on ``validation_data``
    formatted as ``{:.2e}``.

    Args:
        training_data: DataFrame used to fit each model.
        validation_data: DataFrame on which each model's RSS is computed.
        degree: Highest polynomial degree to try.

    Returns:
        None.
    """
    # The target does not depend on the polynomial degree, so extract it once
    # as an (n, 1) column vector instead of rebuilding it on every iteration.
    y_val = validation_data['price'].values.reshape(-1, 1)
    for i in range(1, degree + 1):
        model1, x_val1 = model_poly(training_data, i, validation_data)
        print('Model:', i)
        print('Coefficients: \n', model1.coef_)
        print('Intercept: \n', model1.intercept_)
        print(x_val1.shape)
        print(y_val.shape)
        residuals = y_val - model1.predict(x_val1)
        RSS_val = (residuals ** 2).sum()
        print("{:.2e}".format(RSS_val))
# Load the training, validation and test splits with the shared column dtypes.
traning_wk3 = pd.read_csv(
    filepath_or_buffer='C:/Users/Duy Tung/Downloads/wk3_kc_house_train_data.csv',
    dtype=dtype_dict,
)
val_wk3 = pd.read_csv(
    filepath_or_buffer='C:/Users/Duy Tung/Downloads/wk3_kc_house_valid_data.csv',
    dtype=dtype_dict,
)
test_wk3 = pd.read_csv(
    filepath_or_buffer='C:/Users/Duy Tung/Downloads/wk3_kc_house_test_data.csv',
    dtype=dtype_dict,
)
# Print the validation RSS for every degree from 1 to 15.
testing(training_data=traning_wk3, validation_data=val_wk3, degree=15)
# Print the test-set RSS for every degree from 1 to 6.
testing(training_data=traning_wk3, validation_data=test_wk3, degree=6)