Data

To show how fit the multiple regression using R and Python, we consider the car data [car] which has the car specifications; HwyMPG, Model, etc,. We fit different regression models to predict Highway MPG using the car specifications.

Regression

Import data

import pandas as pd
car_data = pd.read_csv("https://raw.githubusercontent.com/saeidamiri1/saeidamiri1.github.io/master/public/data/cardata.csv", delimiter=";", decimal=",")
car_data.columns
## Index(['Model', 'Type', 'MinPrice', 'MidRangePrice', 'MaxPrice', 'CityMPG',
##        'HwyMPG', 'AirBags', 'DriveTrain', 'Cyl', 'EngSize', 'HPW', 'RPM',
##        'Rev', 'ManTran', 'GasTank', 'PassCap', 'Length', 'WheelBase', 'Width',
##        'U.turn', 'RearSeat', 'Luggage', 'Weight', 'Domest'],
##       dtype='object')
car_data.head(10)
##         Model     Type  MinPrice  ...  Luggage  Weight   Domest
## 0     Integra    Small      12.9  ...     11.0    2705  non-USA
## 1      Legend  Midsize      29.2  ...     15.0    3560  non-USA
## 2          90  Compact      25.9  ...     14.0    3375  non-USA
## 3         100  Midsize      30.8  ...     17.0    3405  non-USA
## 4        535i  Midsize      23.7  ...     13.0    3640  non-USA
## 5     Century  Midsize      14.2  ...     16.0    2880      USA
## 6     LeSabre    Large      19.9  ...     17.0    3470      USA
## 7  Roadmaster    Large      22.6  ...     21.0    4105      USA
## 8     Riviera  Midsize      26.3  ...     14.0    3495      USA
## 9     DeVille    Large      33.0  ...     18.0    3620      USA
## 
## [10 rows x 25 columns]

Generate a random data

Preprocessing data

Select the numerical variables to fit the simple regression model. The data have the missing values, so we run an imputuion procedure to fill the missiong values.

# Select numerical variables
car_data=car_data.select_dtypes(exclude=['object'])
# Check there is any missing values 
car_data.isnull().any().any()
## True
car_data.apply(lambda x: x.isnull().any().any(), axis=0)
# Run Imputation procedure
## MinPrice         False
## MidRangePrice    False
## MaxPrice         False
## CityMPG          False
## HwyMPG           False
## EngSize          False
## HPW              False
## RPM              False
## Rev              False
## GasTank          False
## PassCap          False
## Length           False
## WheelBase        False
## Width            False
## U.turn           False
## RearSeat          True
## Luggage           True
## Weight           False
## dtype: bool
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
imp = IterativeImputer(missing_values=np.NaN)
imp.fit(car_data)
## IterativeImputer()
m_car_data=pd.DataFrame(imp.transform(car_data))
m_car_data.columns=car_data.columns

Select variables

To select variable, calculate the correlation #### Using Python

m_car_data.corr().HwyMPG
## MinPrice        -0.579966
## MidRangePrice   -0.560680
## MaxPrice        -0.522561
## CityMPG          0.943936
## HwyMPG           1.000000
## EngSize         -0.626795
## HPW             -0.619044
## RPM              0.313469
## Rev              0.587497
## GasTank         -0.786039
## PassCap         -0.466386
## Length          -0.542897
## WheelBase       -0.615384
## Width           -0.640359
## U.turn          -0.593683
## RearSeat        -0.333649
## Luggage         -0.447480
## Weight          -0.810658
## Name: HwyMPG, dtype: float64
m_car_data.loc[:,['HwyMPG','Length']].corr()
##           HwyMPG    Length
## HwyMPG  1.000000 -0.542897
## Length -0.542897  1.000000
m_car_data.loc[:,['HwyMPG','Width']].corr()
##           HwyMPG     Width
## HwyMPG  1.000000 -0.640359
## Width  -0.640359  1.000000
import seaborn as sns
import matplotlib.pyplot as plt
aa=pd.plotting.scatter_matrix(m_car_data.loc[:,['HwyMPG','GasTank','Rev']])
plt.show()

Fit model

from sklearn.linear_model import LinearRegression
lr = LinearRegression(normalize=True)
model_r=lr.fit(m_car_data.loc[:,['Rev','RPM']], m_car_data.HwyMPG)
## /usr/local/lib/python3.9/site-packages/sklearn/linear_model/_base.py:141: FutureWarning: 'normalize' was deprecated in version 1.0 and will be removed in 1.2.
## If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:
## 
## from sklearn.pipeline import make_pipeline
## 
## model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())
## 
## If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:
## 
## kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
## model.fit(X, y, **kwargs)
## 
## 
##   warnings.warn(
model_r_pred = model_r.predict(m_car_data.loc[:,['Rev','RPM']])
from sklearn.metrics import r2_score
r2_score(m_car_data.HwyMPG, model_r_pred)
## 0.3458405982219276
model_g=lr.fit(m_car_data.loc[:,['GasTank']], m_car_data.HwyMPG)
## /usr/local/lib/python3.9/site-packages/sklearn/linear_model/_base.py:141: FutureWarning: 'normalize' was deprecated in version 1.0 and will be removed in 1.2.
## If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:
## 
## from sklearn.pipeline import make_pipeline
## 
## model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())
## 
## If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:
## 
## kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
## model.fit(X, y, **kwargs)
## 
## 
##   warnings.warn(
model_g_pred= model_g.predict(m_car_data.loc[:,['GasTank']])
r2_score(m_car_data.HwyMPG,model_g_pred)
## 0.6178567353893548
model_rg=lr.fit(m_car_data.loc[:,['Rev','GasTank']], m_car_data.HwyMPG)
## /usr/local/lib/python3.9/site-packages/sklearn/linear_model/_base.py:141: FutureWarning: 'normalize' was deprecated in version 1.0 and will be removed in 1.2.
## If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:
## 
## from sklearn.pipeline import make_pipeline
## 
## model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())
## 
## If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:
## 
## kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
## model.fit(X, y, **kwargs)
## 
## 
##   warnings.warn(
model_rg_pred = model_rg.predict(m_car_data.loc[:,['Rev','GasTank']])
r2_score(m_car_data.HwyMPG, model_rg_pred )
## 0.6365055495221749

Polynomial Regression

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

x=np.random.randn(100)
y=x**3+.3*np.random.randn(100)
data={'x':x,'y':y}
x_train, x_test, y_train, y_test = train_test_split(data['x'], data['y'], test_size=0.2)

train_data={'x':x_train,'y':y_train}
test_data={'x':x_test,'y':y_test}

def poly_regression(train_data, test_data, degree=0):
    reg_model=LinearRegression()
    x_train = train_data['x'][:, np.newaxis]
    y_train = train_data['y'][:, np.newaxis]
    x_test = test_data['x'][:, np.newaxis]
    y_test = test_data['y'][:, np.newaxis]
    poly_fts =PolynomialFeatures(degree=degree)
    x_train_poly =  poly_fts.fit_transform(x_train)
    x_test_poly =  poly_fts.fit_transform(x_test)
    reg_model.fit(x_train_poly,y_train)
    y_train_pred = reg_model.predict(x_train_poly)
    y_test_pred = reg_model.predict(x_test_poly)
    pred = {'train':y_train_pred, 'test':y_test_pred}
    return (reg_model, pred)

degree=[0,1,2,3,5,7]
preds={}
for d in degree:
    model, pred = poly_regression(train_data, test_data, degree=d)
    preds[d] = {'train':{'x':train_data['x'], 'y':pred['train']}, 'test':{'x':test_data['x'], 'y':pred['test']}}

i=1
for d in degree:
    plt.subplot(2,3,i)
    plt.scatter(preds[d]['test']['x'],preds[d]['test']['y'],s=10,label='Fit with d= {}'.format(d))
    plt.scatter(x_train, y_train,color='m', s=10, label='train data')
    plt.legend()
    i+=1

plt.show(block=False)

Ridge regression

plt.clf()
from sklearn.linear_model import Ridge

def poly_regression_ridge(train_data, test_data, degree=0,alpha=0):
    reg_model=Ridge(alpha=alpha)
    x_train = train_data['x'][:, np.newaxis]
    y_train = train_data['y'][:, np.newaxis]
    x_test = test_data['x'][:, np.newaxis]
    y_test = test_data['y'][:, np.newaxis]
    poly_fts =PolynomialFeatures(degree=degree)
    x_train_poly =  poly_fts.fit_transform(x_train)
    x_test_poly =  poly_fts.fit_transform(x_test)
    reg_model.fit(x_train_poly,y_train)
    y_train_pred = reg_model.predict(x_train_poly)
    y_test_pred = reg_model.predict(x_test_poly)
    pred = {'train':y_train_pred, 'test':y_test_pred}
    return (reg_model, pred)

degree=[0,1,2,3,5,7]
preds={}
for d in degree:
    model, pred = poly_regression_ridge(train_data, test_data, degree=d,alpha=.5)
    preds[d] = {'train':{'x':train_data['x'], 'y':pred['train']}, \
                'test':{'x':test_data['x'], 'y':pred['test']}}

i=1
for d in degree:
    plt.subplot(2,3,i)
    plt.scatter(preds[d]['test']['x'],preds[d]['test']['y'],s=10,label='Fit with d= {}'.format(d))
    plt.scatter(x_train, y_train,color='m', s=10, label='train data')
    plt.legend()
    i+=1

plt.show(block=False)