To show how fit the multiple regression using R and Python, we consider the car data [car] which has the car specifications; HwyMPG, Model, etc,. We fit different regression models to predict Highway MPG using the car specifications.
import pandas as pd
car_data = pd.read_csv("https://raw.githubusercontent.com/saeidamiri1/saeidamiri1.github.io/master/public/data/cardata.csv", delimiter=";", decimal=",")
car_data.columns
## Index(['Model', 'Type', 'MinPrice', 'MidRangePrice', 'MaxPrice', 'CityMPG',
## 'HwyMPG', 'AirBags', 'DriveTrain', 'Cyl', 'EngSize', 'HPW', 'RPM',
## 'Rev', 'ManTran', 'GasTank', 'PassCap', 'Length', 'WheelBase', 'Width',
## 'U.turn', 'RearSeat', 'Luggage', 'Weight', 'Domest'],
## dtype='object')
car_data.head(10)
## Model Type MinPrice ... Luggage Weight Domest
## 0 Integra Small 12.9 ... 11.0 2705 non-USA
## 1 Legend Midsize 29.2 ... 15.0 3560 non-USA
## 2 90 Compact 25.9 ... 14.0 3375 non-USA
## 3 100 Midsize 30.8 ... 17.0 3405 non-USA
## 4 535i Midsize 23.7 ... 13.0 3640 non-USA
## 5 Century Midsize 14.2 ... 16.0 2880 USA
## 6 LeSabre Large 19.9 ... 17.0 3470 USA
## 7 Roadmaster Large 22.6 ... 21.0 4105 USA
## 8 Riviera Midsize 26.3 ... 14.0 3495 USA
## 9 DeVille Large 33.0 ... 18.0 3620 USA
##
## [10 rows x 25 columns]
Select the numerical variables to fit the simple regression model. The data have the missing values, so we run an imputuion procedure to fill the missiong values.
# Select numerical variables
car_data=car_data.select_dtypes(exclude=['object'])
# Check there is any missing values
car_data.isnull().any().any()
## True
car_data.apply(lambda x: x.isnull().any().any(), axis=0)
# Run Imputation procedure
## MinPrice False
## MidRangePrice False
## MaxPrice False
## CityMPG False
## HwyMPG False
## EngSize False
## HPW False
## RPM False
## Rev False
## GasTank False
## PassCap False
## Length False
## WheelBase False
## Width False
## U.turn False
## RearSeat True
## Luggage True
## Weight False
## dtype: bool
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
imp = IterativeImputer(missing_values=np.NaN)
imp.fit(car_data)
## IterativeImputer()
m_car_data=pd.DataFrame(imp.transform(car_data))
m_car_data.columns=car_data.columns
To select variable, calculate the correlation #### Using Python
m_car_data.corr().HwyMPG
## MinPrice -0.579966
## MidRangePrice -0.560680
## MaxPrice -0.522561
## CityMPG 0.943936
## HwyMPG 1.000000
## EngSize -0.626795
## HPW -0.619044
## RPM 0.313469
## Rev 0.587497
## GasTank -0.786039
## PassCap -0.466386
## Length -0.542897
## WheelBase -0.615384
## Width -0.640359
## U.turn -0.593683
## RearSeat -0.333649
## Luggage -0.447480
## Weight -0.810658
## Name: HwyMPG, dtype: float64
m_car_data.loc[:,['HwyMPG','Length']].corr()
## HwyMPG Length
## HwyMPG 1.000000 -0.542897
## Length -0.542897 1.000000
m_car_data.loc[:,['HwyMPG','Width']].corr()
## HwyMPG Width
## HwyMPG 1.000000 -0.640359
## Width -0.640359 1.000000
import seaborn as sns
import matplotlib.pyplot as plt
aa=pd.plotting.scatter_matrix(m_car_data.loc[:,['HwyMPG','GasTank','Rev']])
plt.show()
from sklearn.linear_model import LinearRegression
lr = LinearRegression(normalize=True)
model_r=lr.fit(m_car_data.loc[:,['Rev','RPM']], m_car_data.HwyMPG)
## /usr/local/lib/python3.9/site-packages/sklearn/linear_model/_base.py:141: FutureWarning: 'normalize' was deprecated in version 1.0 and will be removed in 1.2.
## If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:
##
## from sklearn.pipeline import make_pipeline
##
## model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())
##
## If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:
##
## kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
## model.fit(X, y, **kwargs)
##
##
## warnings.warn(
model_r_pred = model_r.predict(m_car_data.loc[:,['Rev','RPM']])
from sklearn.metrics import r2_score
r2_score(m_car_data.HwyMPG, model_r_pred)
## 0.3458405982219276
model_g=lr.fit(m_car_data.loc[:,['GasTank']], m_car_data.HwyMPG)
## /usr/local/lib/python3.9/site-packages/sklearn/linear_model/_base.py:141: FutureWarning: 'normalize' was deprecated in version 1.0 and will be removed in 1.2.
## If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:
##
## from sklearn.pipeline import make_pipeline
##
## model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())
##
## If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:
##
## kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
## model.fit(X, y, **kwargs)
##
##
## warnings.warn(
model_g_pred= model_g.predict(m_car_data.loc[:,['GasTank']])
r2_score(m_car_data.HwyMPG,model_g_pred)
## 0.6178567353893548
model_rg=lr.fit(m_car_data.loc[:,['Rev','GasTank']], m_car_data.HwyMPG)
## /usr/local/lib/python3.9/site-packages/sklearn/linear_model/_base.py:141: FutureWarning: 'normalize' was deprecated in version 1.0 and will be removed in 1.2.
## If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:
##
## from sklearn.pipeline import make_pipeline
##
## model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())
##
## If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:
##
## kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
## model.fit(X, y, **kwargs)
##
##
## warnings.warn(
model_rg_pred = model_rg.predict(m_car_data.loc[:,['Rev','GasTank']])
r2_score(m_car_data.HwyMPG, model_rg_pred )
## 0.6365055495221749
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
x=np.random.randn(100)
y=x**3+.3*np.random.randn(100)
data={'x':x,'y':y}
x_train, x_test, y_train, y_test = train_test_split(data['x'], data['y'], test_size=0.2)
train_data={'x':x_train,'y':y_train}
test_data={'x':x_test,'y':y_test}
def poly_regression(train_data, test_data, degree=0):
reg_model=LinearRegression()
x_train = train_data['x'][:, np.newaxis]
y_train = train_data['y'][:, np.newaxis]
x_test = test_data['x'][:, np.newaxis]
y_test = test_data['y'][:, np.newaxis]
poly_fts =PolynomialFeatures(degree=degree)
x_train_poly = poly_fts.fit_transform(x_train)
x_test_poly = poly_fts.fit_transform(x_test)
reg_model.fit(x_train_poly,y_train)
y_train_pred = reg_model.predict(x_train_poly)
y_test_pred = reg_model.predict(x_test_poly)
pred = {'train':y_train_pred, 'test':y_test_pred}
return (reg_model, pred)
degree=[0,1,2,3,5,7]
preds={}
for d in degree:
model, pred = poly_regression(train_data, test_data, degree=d)
preds[d] = {'train':{'x':train_data['x'], 'y':pred['train']}, 'test':{'x':test_data['x'], 'y':pred['test']}}
i=1
for d in degree:
plt.subplot(2,3,i)
plt.scatter(preds[d]['test']['x'],preds[d]['test']['y'],s=10,label='Fit with d= {}'.format(d))
plt.scatter(x_train, y_train,color='m', s=10, label='train data')
plt.legend()
i+=1
plt.show(block=False)
plt.clf()
from sklearn.linear_model import Ridge
def poly_regression_ridge(train_data, test_data, degree=0,alpha=0):
reg_model=Ridge(alpha=alpha)
x_train = train_data['x'][:, np.newaxis]
y_train = train_data['y'][:, np.newaxis]
x_test = test_data['x'][:, np.newaxis]
y_test = test_data['y'][:, np.newaxis]
poly_fts =PolynomialFeatures(degree=degree)
x_train_poly = poly_fts.fit_transform(x_train)
x_test_poly = poly_fts.fit_transform(x_test)
reg_model.fit(x_train_poly,y_train)
y_train_pred = reg_model.predict(x_train_poly)
y_test_pred = reg_model.predict(x_test_poly)
pred = {'train':y_train_pred, 'test':y_test_pred}
return (reg_model, pred)
degree=[0,1,2,3,5,7]
preds={}
for d in degree:
model, pred = poly_regression_ridge(train_data, test_data, degree=d,alpha=.5)
preds[d] = {'train':{'x':train_data['x'], 'y':pred['train']}, \
'test':{'x':test_data['x'], 'y':pred['test']}}
i=1
for d in degree:
plt.subplot(2,3,i)
plt.scatter(preds[d]['test']['x'],preds[d]['test']['y'],s=10,label='Fit with d= {}'.format(d))
plt.scatter(x_train, y_train,color='m', s=10, label='train data')
plt.legend()
i+=1
plt.show(block=False)