In [426]:
import pprint as pp
import numpy as np
import pandas as pd
import scipy.stats
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

Initial dataset

In [427]:
# Read data file
filename = 'used_ford_focuses.csv'
cars = pd.read_csv(filename)
In [428]:
# Features and target variable
features = [
    'logMileage/year',
    'Age', 
    'Horsepower',
    'Fuel_Gasoline',
    'Transmission type_Manual',
]
target = ['logPrice']

Filters

  • No damaged cars, as they tend to be outliers
  • Mileage km > 1000, to filter out unused cars or false values
  • Only cars manufactured before 2017, to filter out new cars
  • No minivans (the C-max variant, which later became a separate model)
  • No extreme tuning versions (Focus RS) with Horsepower in the 200-400 range
  • No cars older than 20 years (obvious errors, as the Focus was introduced in 1998)
  • Only cars with valid, domestic paperwork (others tend to be outliers)
In [429]:
# HELPER FUNCTIONS

# Add new variables to dataframe
def create_variables(frame):
    frame['Age'] = 2017 - frame['Manufactured year']
    frame['logPrice'] = np.log10(frame['Price EUR'])
    frame['Mileage/year'] = frame['Mileage km'] / frame['Age']
    frame['logMileage/year'] = np.log10(frame['Mileage/year'])

    good_condition = ['Spared', 'Recent', 'Undamaged', 'Excellent', 'Normal']
    frame['Undamaged'] = frame['Condition'].isin(good_condition)

    cat_features = ['Form', 'Documents', 'Transmission type', 'Fuel']
    dummies = pd.get_dummies(frame[cat_features], drop_first = True, dummy_na=True)
    frame = pd.concat([frame, dummies], axis=1)
    return frame

# Filter out unwanted items
def filter_data(frame):
    return frame[(frame['Undamaged']) &
                 (frame['Mileage km'] > 1000) &
                 (frame['Manufactured year'] < 2017) &
                 (frame['Form'] != 'Minivan') &
                 (frame['Horsepower'] < 200) &
                 (frame['Age'] < 20) &
                 (frame['Documents_Valid, domestic'] == 1) &
                 (frame['Documents_nan'] == 0) &
                 (frame['Form_nan'] == 0) &
                 (frame['Fuel_nan'] == 0) &
                 (frame['Transmission type_nan'] == 0)][features + target].dropna()

def check_normality(predict, error):
    '''
    Compare the distribution of the errors to a normal distribution with the
    same standard deviation, and print the Shapiro-Wilk test result as
    (test statistic, p-value).
    '''
    # Distribution of errors vs. a reference normal distribution (dotted)
    norm = np.random.normal(0, np.std(error), 100000)
    sns.distplot(error, hist=False)
    sns.distplot(norm, hist=False, kde_kws={'linewidth': 3, 'ls': 'dotted'})

    # Shapiro-Wilk test: a small p-value suggests the errors are not normal
    print(scipy.stats.shapiro(error))

Getting the data ready

In [430]:
# Get the dataset for modelling
cars = create_variables(cars)
data = filter_data(cars)
print(data.count())
logMileage/year             1764
Age                         1764
Horsepower                  1764
Fuel_Gasoline               1764
Transmission type_Manual    1764
logPrice                    1764
dtype: int64

Age and logMileage/year are somewhat more strongly correlated with each other than the other predictors, which might cause multicollinearity.

In [431]:
sns.heatmap(data.corr())
Out[431]:
<matplotlib.axes._subplots.AxesSubplot at 0x1254749e8>
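
A quick way to quantify that suspicion is the variance inflation factor. This is a sketch that was not part of the original run; it assumes statsmodels is available.

# Sketch (assumes statsmodels is installed): variance inflation factors for
# the predictors; values well above ~5-10 would point to multicollinearity.
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

X_vif = sm.add_constant(data[features].astype(float))
for i, col in enumerate(X_vif.columns):
    print('{}: {:.2f}'.format(col, variance_inflation_factor(X_vif.values, i)))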

Modeling with scaled data

In [432]:
# Split train and test sets
X_train, X_test, y_train, y_test = train_test_split(data[features], 
                                                    data[target], 
                                                    test_size=0.20,
                                                    random_state=543)
print('Train X: {}'.format(X_train.shape))
print('Train y: {}'.format(y_train.shape))
print('Test X: {}'.format(X_test.shape))
print('Test y: {}'.format(y_test.shape))
Train X: (1411, 5)
Train y: (1411, 1)
Test X: (353, 5)
Test y: (353, 1)
In [433]:
# Scaler overrider class
class OverrideScaler:
    '''
    Drop-in replacement for a scaler, for experimenting without
    rewriting the rest of the code. Leaves the data intact.
    '''
    def fit(self, data):
        return self
    
    def transform(self, data):
        return data
    
    def inverse_transform(self, data):
        return data
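
RobustScaler is imported at the top but the runs below use the pass-through scaler. If real scaling is wanted, it can be swapped in without touching anything else, since it exposes the same fit/transform/inverse_transform interface. A sketch, not run in this notebook:

# Sketch: use the imported RobustScaler instead of the pass-through scaler;
# the rest of the pipeline works unchanged.
scalerX = RobustScaler().fit(X_train)
scalerY = RobustScaler().fit(y_train)
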
In [434]:
# Scaling data
scalerX = OverrideScaler().fit(X_train)
scalerY = OverrideScaler().fit(y_train)

# Linear classifier and fitting model
clf = LinearRegression()
clf.fit(scalerX.transform(X_train), scalerY.transform(y_train))

# Print model
print('Intercept: {}'.format(clf.intercept_))
print('Coefficients: {}'.format(clf.coef_))
print('r2 on train set: {}'.format(clf.score(scalerX.transform(X_train), scalerY.transform(y_train))))
print('r2 on test set: {}'.format(clf.score(scalerX.transform(X_test), scalerY.transform(y_test))))
Intercept: [ 5.01491764]
Coefficients: [[-0.22134832 -0.05955043  0.00166677  0.01463949 -0.02766354]]
r2 on train set: 0.923347200975166
r2 on test set: 0.9278383960027059

The residual distribution on the test set is clearly not normal. As far as I know, that is not necessarily a dealbreaker, but in theory it should be normal. Is the model biased? Maybe that's where multicollinearity kicks in?

In [435]:
# Calculate errors on test set
predict = clf.predict(scalerX.transform(X_test))
error = scalerY.transform(y_test) - predict

check_normality(predict, error)
(0.9803362488746643, 9.536957077216357e-05)
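
A complementary check for bias (a sketch, not part of the original run): plot the residuals against the predictions. A visible trend or funnel shape would point to systematic bias or heteroscedasticity rather than just non-normality.

# Sketch: residuals vs. predicted values on the test set
plt.figure(figsize=(8, 4))
plt.scatter(np.ravel(predict), np.ravel(error), alpha=0.5)
plt.axhline(0, color='red', ls='dotted')
plt.xlabel('Predicted logPrice')
plt.ylabel('Residual')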

New ('wild') data

In [436]:
# Read new data
new_file = 'used_ford_focuses_new_round.csv'
new_cars = pd.read_csv(new_file)
In [437]:
# Filter data
new_cars = create_variables(new_cars)
new_data = filter_data(new_cars)
print(new_data.count())
logMileage/year             917
Age                         917
Horsepower                  917
Fuel_Gasoline               917
Transmission type_Manual    917
logPrice                    917
dtype: int64
In [438]:
# Split features and target variable
X_new = new_data[features]
y_new = new_data[target]

# Score with scaled variables
print('r2 on new data: {}'.format(clf.score(scalerX.transform(X_new), scalerY.transform(y_new))))

# Calculate errors on test set
predict = clf.predict(scalerX.transform(X_new))
error = scalerY.transform(y_new) - predict

check_normality(predict, error)
r2 on new data: 0.9255053176262799
(0.9560282826423645, 6.009916342357587e-16)

The residuals look even uglier. The model is far from perfect.

Quick check

The coefficients are hard to interpret in this model, as the variables are scaled and both the target variable and one of the independent variables are log-transformed. The easiest way to translate them into plain language is to create some artificial test data where only one variable changes from one observation to the next. Quick and dirty.

In [439]:
sample = np.array([
        [np.log10(10000), 3, 116.0, 0, 0],
        [np.log10(12000), 3, 116.0, 0, 0],
        [np.log10(12000), 4, 116.0, 0, 0],
        [np.log10(12000), 4, 126.0, 0, 0],
        [np.log10(12000), 4, 126.0, 1, 0],
        [np.log10(12000), 4, 126.0, 1, 1],
                 ])
predict = 10 ** (scalerY.inverse_transform(clf.predict(scalerX.transform(sample))))

print('20% increase in mileage/year: {}% change in price'.format(np.round((predict[1][0] / predict[0][0] - 1) * 100, 1)))
print('1 year increase in age: {}% change in price'.format(np.round((predict[2][0] / predict[1][0] - 1) * 100, 1)))
print('10 HP increase in power: {}% change in price'.format(np.round((predict[3][0] / predict[2][0] - 1) * 100, 1)))
print('Fuel == Gasoline: {}% change in price'.format(np.round((predict[4][0] / predict[3][0] - 1) * 100, 1)))
print('Transmission == Manual: {}% change in price'.format(np.round((predict[5][0] / predict[4][0] - 1) * 100, 1)))
20% increase in mileage/year: -4.0% change in price
1 year increase in age: -12.8% change in price
10 HP increase in power: 3.9% change in price
Fuel == Gasoline: 3.4% change in price
Transmission == Manual: -6.2% change in price
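
Because the target is log10-transformed, the same figures also follow directly from the coefficients: a one-unit change in a predictor multiplies the price by 10^coefficient, and for the log10(mileage/year) term a 20% increase multiplies it by 1.2^coefficient. A sketch that reproduces the numbers above:

# Sketch: read the percentage effects directly off the fitted coefficients
coefs = dict(zip(features, clf.coef_[0]))
print('20% increase in mileage/year: {:.1f}%'.format((1.2 ** coefs['logMileage/year'] - 1) * 100))
print('1 year increase in age: {:.1f}%'.format((10 ** coefs['Age'] - 1) * 100))
print('10 HP increase in power: {:.1f}%'.format((10 ** (10 * coefs['Horsepower']) - 1) * 100))
print('Fuel == Gasoline: {:.1f}%'.format((10 ** coefs['Fuel_Gasoline'] - 1) * 100))
print('Transmission == Manual: {:.1f}%'.format((10 ** coefs['Transmission type_Manual'] - 1) * 100))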

Adding predicted and inverse log-transformed price to the dataframe for easier plotting.

In [440]:
new_data['Predicted_price'] = 10 ** scalerY.inverse_transform(clf.predict(scalerX.transform(X_new)))
new_data['Price'] = 10 ** new_data['logPrice']
In [441]:
fig, ax = plt.subplots(figsize=(10,5))
plt.title('Predicted and actual price by age on the new data')
sns.regplot(new_data['Age'], new_data['Price'], fit_reg=False, x_jitter=1, scatter_kws={'alpha': 0.5})
sns.regplot(new_data['Age'], new_data['Predicted_price'], fit_reg=False, x_jitter=1, scatter_kws={'alpha': 0.3, 'color': 'red'})
Out[441]:
<matplotlib.axes._subplots.AxesSubplot at 0x1286a9b70>
In [442]:
fig, ax = plt.subplots(figsize=(10,5))
plt.title('Predicted and actual price by age on the new data (log scale)')
sns.regplot(new_data['Age'], new_data['logPrice'], fit_reg=False, x_jitter=1, scatter_kws={'alpha': 0.5})
sns.regplot(new_data['Age'], np.log10(new_data['Predicted_price']), fit_reg=False, x_jitter=1, scatter_kws={'alpha': 0.3, 'color': 'red'})
Out[442]:
<matplotlib.axes._subplots.AxesSubplot at 0x122f7bb38>
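
For a sense of the error in the original units, a quick mean absolute error on the new data (a sketch, not computed in the original run):

# Sketch: average absolute prediction error in EUR on the new data
mae = (new_data['Price'] - new_data['Predicted_price']).abs().mean()
print('Mean absolute error on new data: {:.0f} EUR'.format(mae))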

Merging the datasets

In [443]:
total_cars = pd.concat([cars, new_cars])
total_data = filter_data(total_cars)
In [444]:
fig, (ax1, ax2) = plt.subplots(2,1, figsize=(10,10))
for fuel in ['Diesel', 'Gasoline']:
    sns.distplot(total_cars[total_cars['Fuel'] == fuel]['logPrice'], label=fuel, hist=False, ax=ax1)
for form in ['Combi', 'Hatchback', 'Sedan']:
    sns.distplot(total_cars[total_cars['Form'] == form]['logPrice'], label=form, hist=False, ax=ax2)
In [445]:
forms = ['Combi', 'Hatchback', 'Sedan']
fuels = ['Diesel', 'Gasoline']
d = total_cars[(total_cars['Fuel'].isin(fuels)) & (total_cars['Form'].isin(forms))]
grid = sns.FacetGrid(data=d, hue='Transmission type', col='Form', row='Fuel', legend_out=True)
grid.map(sns.distplot, 'logPrice', hist=False).add_legend()
Out[445]:
<seaborn.axisgrid.FacetGrid at 0x12545b160>