import pprint as pp
import numpy as np
import pandas as pd
import scipy.stats
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
# Read data file
filename = 'used_ford_focuses.csv'
cars = pd.read_csv(filename)
# Features and target variable
features = [
    'logMileage/year',
    'Age',
    'Horsepower',
    'Fuel_Gasoline',
    'Transmission type_Manual',
]
target = ['logPrice']
Filters: Mileage km > 1000, to filter out unused cars or false values; Horsepower of 200-400 is dropped as well.
# HELPER FUNCTIONS
# Add new variables to dataframe
def create_variables(frame):
    # Derived numeric features
    frame['Age'] = 2017 - frame['Manufactured year']
    frame['logPrice'] = np.log10(frame['Price EUR'])
    frame['Mileage/year'] = frame['Mileage km'] / frame['Age']
    frame['logMileage/year'] = np.log10(frame['Mileage/year'])
    # Flag cars in acceptable condition
    good_condition = ['Spared', 'Recent', 'Undamaged', 'Excellent', 'Normal']
    frame['Undamaged'] = frame['Condition'].isin(good_condition)
    # One-hot encode categorical features; NaN gets its own column so it can be filtered later
    cat_features = ['Form', 'Documents', 'Transmission type', 'Fuel']
    dummies = pd.get_dummies(frame[cat_features], drop_first=True, dummy_na=True)
    frame = pd.concat([frame, dummies], axis=1)
    return frame
# Filter out unwanted items
def filter_data(frame):
    return frame[(frame['Undamaged']) &
                 (frame['Mileage km'] > 1000) &
                 (frame['Manufactured year'] < 2017) &
                 (frame['Form'] != 'Minivan') &
                 (frame['Horsepower'] < 200) &
                 (frame['Age'] < 20) &
                 (frame['Documents_Valid, domestic'] == 1) &
                 (frame['Documents_nan'] == 0) &
                 (frame['Form_nan'] == 0) &
                 (frame['Fuel_nan'] == 0) &
                 (frame['Transmission type_nan'] == 0)][features + target].dropna()
def check_normality(predict, error):
    # Distribution of errors
    norm = np.random.normal(0, np.std(error), 100000)
    sns.distplot(error, hist=False)
    sns.distplot(norm, hist=False, kde_kws={'linewidth': 3, 'ls': 'dotted'})
    # Shapiro test for normality
    print(scipy.stats.shapiro(error))
# Get the dataset for modelling
cars = create_variables(cars)
data = filter_data(cars)
print(data.count())
Age and logMileage/year show somewhat higher correlation, which might cause multicollinearity.
sns.heatmap(data.corr())
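To put a number on the multicollinearity suspicion beyond the heatmap, variance inflation factors are a quick check. A minimal sketch, assuming statsmodels is available (it is not used anywhere else in this notebook); values above roughly 5-10 usually hint at problematic collinearity.
# Variance inflation factors for the model features (assumes statsmodels is installed)
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
X_vif = sm.add_constant(data[features].astype(float))  # VIF needs an intercept column
vif = pd.Series([variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])],
                index=X_vif.columns)
print(vif)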
# Split train and test sets
X_train, X_test, y_train, y_test = train_test_split(data[features],
                                                    data[target],
                                                    test_size=0.20,
                                                    random_state=543)
print('Train X: {}'.format(X_train.shape))
print('Train y: {}'.format(y_train.shape))
print('Test X: {}'.format(X_test.shape))
print('Test y: {}'.format(y_test.shape))
# Scaler override class
class OverrideScaler:
    '''
    Plug-in replacement for scalers for experimenting without
    rewriting the whole code. Leaves data intact.
    '''
    def fit(self, data):
        return self

    def transform(self, data):
        return data

    def inverse_transform(self, data):
        return data
# Scaling data
scalerX = OverrideScaler().fit(X_train)
scalerY = OverrideScaler().fit(y_train)
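Because the no-op scaler keeps the scaler interface, switching to real scaling later is a two-line swap. A sketch using the RobustScaler imported at the top, left commented out so the unscaled results below stay reproducible:
# scalerX = RobustScaler().fit(X_train)
# scalerY = RobustScaler().fit(y_train)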
# Linear regression model and fit
clf = LinearRegression()
clf.fit(scalerX.transform(X_train), scalerY.transform(y_train))
# Print model
print('Intercept: {}'.format(clf.intercept_))
print('Coefficients: {}'.format(clf.coef_))
print('r2 on train set: {}'.format(clf.score(scalerX.transform(X_train), scalerY.transform(y_train))))
print('r2 on test set: {}'.format(clf.score(scalerX.transform(X_test), scalerY.transform(y_test))))
The residual distribution on the test set is clearly not normal. AFAIK that is not necessarily a deal-breaker, but in theory the residuals should be normally distributed. Is the model biased? Maybe that's where the multicollinearity kicks in?
# Calculate errors on test set
predict = clf.predict(scalerX.transform(X_test))
error = scalerY.transform(y_test) - predict
check_normality(predict, error)
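A Q-Q plot makes the departure from normality easier to judge than the density overlay alone. A minimal sketch on the same residuals, using scipy's probplot:
# Q-Q plot of the test-set residuals against a fitted normal distribution
fig, ax = plt.subplots(figsize=(6, 6))
scipy.stats.probplot(np.ravel(error), dist='norm', plot=ax)
ax.set_title('Q-Q plot of test set residuals')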
# Read new data
new_file = 'used_ford_focuses_new_round.csv'
new_cars = pd.read_csv(new_file)
# Filter data
new_cars = create_variables(new_cars)
new_data = filter_data(new_cars)
print(new_data.count())
# Split features and target variable
X_new = new_data[features]
y_new = new_data[target]
# Score with scaled variables
print('r2 on new data: {}'.format(clf.score(scalerX.transform(X_new), scalerY.transform(y_new))))
# Calculate errors on the new data
predict = clf.predict(scalerX.transform(X_new))
error = scalerY.transform(y_new) - predict
check_normality(predict, error)
The residuals look even uglier. The model is far from perfect.
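Whether the errors are just fat-tailed or actually systematic is easier to judge from a residual-versus-prediction scatter. A quick sketch, reusing the predict and error arrays from the cell above:
# Residuals against predicted logPrice on the new data
fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(np.ravel(predict), np.ravel(error), alpha=0.3)
ax.axhline(0, color='red', ls='dotted')
ax.set_xlabel('Predicted logPrice')
ax.set_ylabel('Residual')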
The coefficients are hard to interpret in this model, as the variables may be scaled and both the target variable and one of the independent variables are log-transformed. The easiest way to translate them into human language is to create some artificial test data where only one variable changes from one observation to the next. Quick and dirty.
# Rows of artificial observations; columns follow the features order:
# logMileage/year, Age, Horsepower, Fuel_Gasoline, Transmission type_Manual
sample = np.array([
[np.log10(10000), 3, 116.0, 0, 0],
[np.log10(12000), 3, 116.0, 0, 0],
[np.log10(12000), 4, 116.0, 0, 0],
[np.log10(12000), 4, 126.0, 0, 0],
[np.log10(12000), 4, 126.0, 1, 0],
[np.log10(12000), 4, 126.0, 1, 1],
])
predict = 10 ** (scalerY.inverse_transform(clf.predict(scalerX.transform(sample))))
print('20% increase in mileage/year: {}% change in price'.format(np.round((predict[1][0] / predict[0][0] - 1) * 100, 1)))
print('1 year increase in age: {}% change in price'.format(np.round((predict[2][0] / predict[1][0] - 1) * 100, 1)))
print('10 HP increase in power: {}% change in price'.format(np.round((predict[3][0] / predict[2][0] - 1) * 100, 1)))
print('Fuel == Gasoline: {}% change in price'.format(np.round((predict[4][0] / predict[3][0] - 1) * 100, 1)))
print('Transmission == Manual: {}% change in price'.format(np.round((predict[5][0] / predict[4][0] - 1) * 100, 1)))
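Since the no-op scaler leaves the features untouched, the same numbers can also be read straight off the fitted coefficients: with a log10 target, a coefficient c on a unit change translates to a 10**c - 1 relative price change, and for the log-log mileage term a 20% mileage increase gives 1.2**c - 1. A sketch that should match the first three quick-and-dirty figures above:
# Reading the effects directly from the (unscaled) coefficients
coef = dict(zip(features, clf.coef_[0]))
print('20% increase in mileage/year: {}%'.format(np.round((1.2 ** coef['logMileage/year'] - 1) * 100, 1)))
print('1 year increase in age: {}%'.format(np.round((10 ** coef['Age'] - 1) * 100, 1)))
print('10 HP increase in power: {}%'.format(np.round((10 ** (10 * coef['Horsepower']) - 1) * 100, 1)))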
Adding the predicted price (back-transformed from the log scale) and the actual price to the dataframe for easier plotting.
new_data['Predicted_price'] = 10 ** scalerY.inverse_transform(clf.predict(scalerX.transform(X_new)))
new_data['Price'] = 10 ** new_data['logPrice']
fig, ax = plt.subplots(figsize=(10,5))
plt.title('Predicted and actual price by age on the new data')
sns.regplot(new_data['Age'], new_data['Price'], fit_reg=False, x_jitter=1, scatter_kws={'alpha': 0.5})
sns.regplot(new_data['Age'], new_data['Predicted_price'], fit_reg=False, x_jitter=1, scatter_kws={'alpha': 0.3, 'color': 'red'})
fig, ax = plt.subplots(figsize=(10,5))
plt.title('Predicted and actual price by age on the new data (log scale)')
sns.regplot(new_data['Age'], new_data['logPrice'], fit_reg=False, x_jitter=1, scatter_kws={'alpha': 0.5})
sns.regplot(new_data['Age'], np.log10(new_data['Predicted_price']), fit_reg=False, x_jitter=1, scatter_kws={'alpha': 0.3, 'color': 'red'})
total_cars = pd.concat([cars, new_cars])
total_data = filter_data(total_cars)
fig, (ax1, ax2) = plt.subplots(2,1, figsize=(10,10))
for fuel in ['Diesel', 'Gasoline']:
    sns.distplot(total_cars[total_cars['Fuel'] == fuel]['logPrice'], label=fuel, hist=False, ax=ax1)
ax1.legend()
for form in ['Combi', 'Hatchback', 'Sedan']:
    sns.distplot(total_cars[total_cars['Form'] == form]['logPrice'], label=form, hist=False, ax=ax2)
ax2.legend()
forms = ['Combi', 'Hatchback', 'Sedan']
fuels = ['Diesel', 'Gasoline']
d = total_cars[(total_cars['Fuel'].isin(fuels)) & (total_cars['Form'].isin(forms))]
grid = sns.FacetGrid(data=d, hue='Transmission type', col='Form', row='Fuel', legend_out=True)
grid.map(sns.distplot, 'logPrice', hist=False).add_legend()