import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.formula.api as sm
from statsmodels.compat import lzip
from statsmodels.graphics.regressionplots import plot_regress_exog, plot_fit, plot_leverage_resid2, influence_plot
from statsmodels.graphics import utils
#open all sheets into individual dataframes
# NOTE(review): hard-coded absolute Windows path - update for your machine,
# or replace with a relative path / CLI argument.
path = 'C:/Users/Z33/Desktop/U of T Data Science/Data Science 2/Assignments/Stats Assignment 4 - OLS Linear Regression/Assignment4_linear_regresion_data.xlsx'
xls=pd.ExcelFile(path)
# Each sheet holds one dataset; column names are forced to 'y' and 'x'.
df1=pd.read_excel(xls, 'Set 1', names = ['y', 'x'])
df2=pd.read_excel(xls, 'Set 2', names = ['y', 'x'])
df3=pd.read_excel(xls, 'Set 3', names = ['y', 'x'])
df4=pd.read_excel(xls, 'Set 4', names = ['y', 'x'])
df5=pd.read_excel(xls, 'Set 5', names = ['y', 'x'])
df6=pd.read_excel(xls, 'Set 6', names = ['y', 'x'])
#function for applying regression calculations extracting relevant information
def funct_ols(df):
    """Fit an OLS model y ~ x on df and return the data plus diagnostics.

    BUG FIX: the pasted body had lost all indentation (SyntaxError); the
    slope/intercept comments were also swapped.

    Parameters
    ----------
    df : pandas.DataFrame with numeric columns 'y' and 'x'. NOTE: df is
        modified in place ('y_prediction' and 'residuals' columns are added);
        an untouched deep copy is returned as the first element.

    Returns
    -------
    tuple of (original df copy, augmented df, residual histogram artists,
    residual describe() Series, fitted-model summary, fitted model)
    """
    df_original = df.copy(deep=True)  # untouched copy, returned for later reuse
    corr_matrix = df[['y', 'x']].corr(method = 'pearson')  # Pearson correlation of y and x
    mu_x = np.mean(df.x)  # mean of x
    mu_y = np.mean(df.y)  # mean of y
    sig_x = np.std(df.x, ddof=1)  # sample std of x (ddof=1)
    sig_y = np.std(df.y, ddof=1)  # sample std of y (ddof=1)
    # Hand-computed OLS estimates; unused below, kept as a cross-check of statsmodels.
    beta_1 = sig_y/sig_x * corr_matrix.loc['x', 'y']  # estimate the SLOPE (comment was swapped)
    beta_0 = mu_y - beta_1 * mu_x  # estimate the INTERCEPT (comment was swapped)
    m = sm.ols('y ~ x', data = df)  # statsmodels formula-API OLS
    m = m.fit()  # fit the model
    intercept, slope = m.params  # params order for 'y ~ x' is [Intercept, x]
    summary = m.summary()  # OLS regression summary table
    df['y_prediction'] = intercept + slope * df.x  # fitted values
    # NOTE(review): sign is prediction - observed, the negative of the usual
    # residual convention (y - y_hat); kept as-is to preserve downstream output.
    df['residuals'] = df['y_prediction'] - df['y']
    residuals_hist = plt.hist(df.residuals, bins=30)  # visual check for normal residuals
    residual_info = df.residuals.describe()  # summary statistics of the residuals
    return df_original, df, residuals_hist, residual_info, summary, m
#Question 1
#For each data set in Assignment4_linear_regression_data.xlsx:
#Create a scatter plot and visually decide if a linear model is appropriate
#(a matrix scatter plot would be most efficient).
for frame, plot_title in (
        (df1, 'DF1 linear relationship'),
        (df2, 'DF2 linear relationship'),
        (df3, 'DF3 non linear relationship'),
        (df4, 'DF4 non linear relationship'),
        (df5, 'DF5 linear with outlier'),
        (df6, 'DF6 linear with outlier'),
):
    plt.scatter(frame.x, frame.y, color='blue', s=5, edgecolor='none')
    plt.title(plot_title)
    plt.show()
#Questions:
#Create an OLS model for the original and transformed data if required.
#Evaluate if the OLS assumptions are met: normality of errors centered around zero, equal variance, etc...,
#for the original data and transformed data if appropriate.
#Comment how the transformation impacted the different assumptions. (This should be done only by looking at the output
#diagnostic charts created by the software)
#The following datasets are not linear, 3 and 4 and need to be transformed.
#The following datasets have outliers, dataset 5 and 6.
#QUICK ANSWER:
#I will be using the Box Cox function to transform data set 3 and Log to transform data set 4 into a more linear form.
from scipy.stats import normaltest
#Transformations ---
#Dataset #3: Box-Cox transformation of y.
#Alternatives explored (log1p, log, sqrt, and transforming x as well) were
#discarded; Box-Cox on y alone gave the most linear-looking relationship.
df3_normy, fitted_lamda1 = stats.boxcox(df3.y)  # fitted_lamda1 is the estimated lambda
plt.scatter(df3.x, df3_normy, color='blue', s=5, edgecolor='none', label='box cox')
plt.title('DF3 Linear after box-cox Transformation for Y')
plt.show()
# D'Agostino-Pearson normality test on the transformed y.
stat, p = normaltest(df3_normy)
print('Statistics=%.3f, p=%.3f' % (stat, p))
# interpret at the 5% significance level
# BUG FIX: the print statements below had lost their indentation (SyntaxError).
alpha = 0.05
if p > alpha:
    print('Sample looks normal (fail to reject H0)')
else:
    print('Sample does not look normal (reject H0)')
plt.hist(df3_normy, bins=30)  # histogram of the transformed y
# --- captured notebook output (wrapped in a string so the script stays valid Python) ---
"""
Statistics=5.584, p=0.061 Sample looks normal (fail to reject H0)
(array([3., 3., 1., 2., 4., 2., 2., 4., 5., 3., 3., 2., 4., 5., 6., 8., 3.,
3., 5., 7., 6., 1., 3., 4., 5., 2., 2., 0., 0., 2.]),
array([ 12.20963656, 15.33872053, 18.46780449, 21.59688846,
24.72597242, 27.85505638, 30.98414035, 34.11322431,
37.24230828, 40.37139224, 43.5004762 , 46.62956017,
49.75864413, 52.8877281 , 56.01681206, 59.14589602,
62.27497999, 65.40406395, 68.53314792, 71.66223188,
74.79131584, 77.92039981, 81.04948377, 84.17856774,
87.3076517 , 90.43673566, 93.56581963, 96.69490359,
99.82398756, 102.95307152, 106.08215548]),
<BarContainer object of 30 artists>)
"""
#Dataset #4
#Dataset #4: log-transform y (a Box-Cox attempt was tried and discarded).
df4_normy = np.log(df4.y)
# BUG FIX: x belongs on the horizontal axis (the original call had the
# arguments swapped relative to every other scatter in this script), and the
# legend label said 'box cox' although the transform applied here is log.
plt.scatter(df4.x, df4_normy, color='blue', s=5, edgecolor='none', label='log')
plt.title('DF4 Linear after Log Transformation')
plt.show()
#After applying the box cox transformation on dataset 3 and log transformation on dataset 4 the
#data is now linear for both data sets.
#Create an OLS model for the original and transformed data if required.
#Data set 3 Analysis and data transformation.
import statsmodels.api as gm
#DataFrame 3: Box-Cox-transformed y regressed against the original x.
#(The original comment claimed "log function for both x and y", which did not
#match the code - df3_normy comes from stats.boxcox on y above.)
transformed2 = pd.DataFrame({'y':df3_normy,'x':df3.x })
# BUG FIX: the first return value was unpacked into 'df6_original', clobbering
# a name that belongs to dataset 6; renamed to a dataset-3 name. (df6_original
# is re-created later in the dataset-6 section before it is ever used.)
df3_norm_original, df_norm3, residuals_hist_norm3, residual_info_norm3, summary_norm3, model_norm3 = funct_ols(transformed2)
print(residual_info_norm3)
print(summary_norm3)
# --- captured notebook output (wrapped in a string so the script stays valid Python) ---
"""
count 1.000000e+02
mean -6.110668e-15
std 8.330976e+00
min -2.218360e+01
25% -4.805561e+00
50% -9.475701e-01
75% 4.363414e+00
max 2.592434e+01
Name: residuals, dtype: float64
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.868
Model: OLS Adj. R-squared: 0.866
Method: Least Squares F-statistic: 641.8
Date: Sun, 21 Mar 2021 Prob (F-statistic): 8.32e-45
Time: 19:54:45 Log-Likelihood: -353.39
No. Observations: 100 AIC: 710.8
Df Residuals: 98 BIC: 716.0
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 5.7418 2.229 2.576 0.011 1.319 10.165
x 8.7229 0.344 25.333 0.000 8.040 9.406
==============================================================================
Omnibus: 3.044 Durbin-Watson: 1.854
Prob(Omnibus): 0.218 Jarque-Bera (JB): 2.483
Skew: -0.254 Prob(JB): 0.289
Kurtosis: 3.582 Cond. No. 17.6
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
"""
# Diagnostic panel (fit, residuals, partial regression, CCPR) for the
# transformed dataset-3 model.
fig = gm.graphics.plot_regress_exog(model_norm3, 'x')
fig.tight_layout(pad=1.0)
plt.show()
# DataFrame 3 Original (untransformed), fitted for comparison.
df3_original, df_3, residuals_hist_3, residual_info_3, summary_3, model_3 = funct_ols(df3)
print(residual_info_3)
print(summary_3)
# --- captured notebook output (wrapped in a string so the script stays valid Python) ---
"""
count 1.000000e+02
mean 4.956746e-13
std 1.505310e+03
min -6.202119e+03
25% -9.320867e+02
50% 1.251402e+02
75% 8.943890e+02
max 3.406664e+03
Name: residuals, dtype: float64
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.755
Model: OLS Adj. R-squared: 0.753
Method: Least Squares F-statistic: 302.4
Date: Sun, 21 Mar 2021 Prob (F-statistic): 1.04e-31
Time: 19:54:48 Log-Likelihood: -873.07
No. Observations: 100 AIC: 1750.
Df Residuals: 98 BIC: 1755.
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept -2636.1748 402.741 -6.546 0.000 -3435.400 -1836.949
x 1081.8266 62.216 17.388 0.000 958.361 1205.292
==============================================================================
Omnibus: 21.170 Durbin-Watson: 2.159
Prob(Omnibus): 0.000 Jarque-Bera (JB): 37.896
Skew: 0.863 Prob(JB): 5.90e-09
Kurtosis: 5.474 Cond. No. 17.6
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
"""
# Diagnostic panel for the untransformed dataset-3 model.
fig = gm.graphics.plot_regress_exog(model_3, 'x')
fig.tight_layout(pad=1.0)
plt.show()
#Evaluation Dataset 3: after the Box-Cox transformation of y the R-squared is about 15% higher
#(0.868 vs 0.755), so the fit improved. (The original comment said "LOG for both X and Y",
#which did not match the code; alternative transforms such as log showed a
#heteroscedasticity problem and scored ~3% lower.)
#After the transformation the residuals look approximately normal and centered around zero,
#whereas before they were not.
#Dataset 4 Linear Regression with data transformation.
# DataFrame 4 Transformed using the log function for Y only.
transformed1 = pd.DataFrame({'y':df4_normy,'x':df4.x})
df4_original, df_norm4, residuals_hist_norm4, residual_info_norm4, summary_norm4, model_norm4 = funct_ols(transformed1)
print(residual_info_norm4)
print(summary_norm4)
# --- captured notebook output (wrapped in a string so the script stays valid Python) ---
"""
count 1.000000e+02
mean -8.615331e-16
std 3.853615e-01
min -6.403407e-01
25% -2.803581e-01
50% -4.510877e-02
75% 1.901438e-01
max 1.057768e+00
Name: residuals, dtype: float64
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.983
Model: OLS Adj. R-squared: 0.983
Method: Least Squares F-statistic: 5765.
Date: Sun, 21 Mar 2021 Prob (F-statistic): 6.91e-89
Time: 19:54:54 Log-Likelihood: -46.034
No. Observations: 100 AIC: 96.07
Df Residuals: 98 BIC: 101.3
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 5.6647 0.078 72.264 0.000 5.509 5.820
x 0.9898 0.013 75.930 0.000 0.964 1.016
==============================================================================
Omnibus: 7.020 Durbin-Watson: 2.151
Prob(Omnibus): 0.030 Jarque-Bera (JB): 7.256
Skew: -0.657 Prob(JB): 0.0266
Kurtosis: 2.872 Cond. No. 12.4
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
"""
# Diagnostic panel for the log-transformed dataset-4 model.
fig = gm.graphics.plot_regress_exog(model_norm4, 'x')
fig.tight_layout(pad=1.0)
plt.show()
# DataFrame 4 Linear Regression without data transformation, for comparison.
df4_original, df_4, residuals_hist_4, residual_info_4, summary_4, model_4 = funct_ols(df4)
print(residual_info_4)
print(summary_4)
# --- captured notebook output (wrapped in a string so the script stays valid Python) ---
"""
count 1.000000e+02
mean -2.956949e-10
std 1.033103e+06
min -6.364011e+06
25% -3.134527e+05
50% 3.081833e+05
75% 6.187548e+05
max 1.039076e+06
Name: residuals, dtype: float64
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.380
Model: OLS Adj. R-squared: 0.373
Method: Least Squares F-statistic: 59.97
Date: Sun, 21 Mar 2021 Prob (F-statistic): 8.87e-12
Time: 19:54:55 Log-Likelihood: -1526.2
No. Observations: 100 AIC: 3056.
Df Residuals: 98 BIC: 3062.
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept -7.535e+05 2.1e+05 -3.585 0.001 -1.17e+06 -3.36e+05
x 2.707e+05 3.49e+04 7.744 0.000 2.01e+05 3.4e+05
==============================================================================
Omnibus: 102.143 Durbin-Watson: 2.077
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1253.666
Skew: 3.381 Prob(JB): 5.89e-273
Kurtosis: 18.973 Cond. No. 12.4
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
"""
# Diagnostic panel for the untransformed dataset-4 model.
fig = gm.graphics.plot_regress_exog(model_4, 'x')
fig.tight_layout(pad=1.0)
plt.show()
#Dataframe 4 Evaluation: After the data transformation the R squared is much higher (0.983 vs 0.380), meaning a better
#prediction model. The data transformation made a significant impact on the quality of the predictions.
#After the transformation the residuals are approximately normal and centered around zero, whereas before they were not.
#(The original comment said "stand errors"; the quantity being inspected is the residuals.)
#If datasets have outliers, remove the outliers and see the effect in the model (slope, intercept and R-square)
#Dataframes 5 and 6 have outliers in their data, so analyze the influence of the outliers for datasets 5 and 6.
#Analyze influences of outliers for dataset 5.
#Dataset 5 - Outlier remains.
df5_original, df5_ols, residuals_hist5, residual_info5, summary5, model5 = funct_ols(df5)
fig = gm.graphics.plot_leverage_resid2(model5)  # leverage vs. normalized residuals squared
fig.tight_layout(pad=1.0)
plt.show()
fig = gm.graphics.influence_plot(model5, criterion="cooks")  # Cook's-distance influence plot
fig.tight_layout(pad=1.0)
plt.show()
#The influence plot shows an observation (index 100) heavily influencing the regression.
# NOTE(review): the comment above only mentions index 100, yet row 0 is dropped
# as well - confirm that removing row 0 is intentional; kept as-is to preserve
# the published results (99 observations downstream).
df5_edited = df5_original.drop([0,100])
# BUG FIX: removed the misleading label='box cox' - no transform is applied here.
plt.scatter(df5_edited.x, df5_edited.y, color='blue', s=5, edgecolor='none')
plt.title('DF5 with outlier removed, no transformation')
plt.show()
print(summary5)
# --- captured notebook output (wrapped in a string so the script stays valid Python) ---
"""
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.806
Model: OLS Adj. R-squared: 0.804
Method: Least Squares F-statistic: 411.9
Date: Sun, 21 Mar 2021 Prob (F-statistic): 4.70e-37
Time: 19:55:02 Log-Likelihood: -334.42
No. Observations: 101 AIC: 672.8
Df Residuals: 99 BIC: 678.1
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 0.9213 1.346 0.685 0.495 -1.749 3.591
x 4.7671 0.235 20.294 0.000 4.301 5.233
==============================================================================
Omnibus: 113.783 Durbin-Watson: 1.491
Prob(Omnibus): 0.000 Jarque-Bera (JB): 2578.951
Skew: -3.591 Prob(JB): 0.00
Kurtosis: 26.691 Cond. No. 11.8
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
"""
#Dataframe 5 - Outlier removed: refit on the cleaned data.
df5_original2, df5_ols2, residuals_hist52, residual_info52, summary52, model52 = funct_ols(df5_edited)
fig = gm.graphics.influence_plot(model52, criterion="cooks")  # re-check influence after removal
fig.tight_layout(pad=1.0)
plt.show()
fig = gm.graphics.plot_regress_exog(model52, 'x')  # full diagnostic panel
fig.tight_layout(pad=1.0)
plt.show()
print(summary52)
# --- captured notebook output (wrapped in a string so the script stays valid Python) ---
"""
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.904
Model: OLS Adj. R-squared: 0.903
Method: Least Squares F-statistic: 913.8
Date: Sun, 21 Mar 2021 Prob (F-statistic): 3.65e-51
Time: 19:55:05 Log-Likelihood: -291.63
No. Observations: 99 AIC: 587.3
Df Residuals: 97 BIC: 592.4
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept -0.0746 0.942 -0.079 0.937 -1.944 1.795
x 5.0623 0.167 30.228 0.000 4.730 5.395
==============================================================================
Omnibus: 2.796 Durbin-Watson: 1.920
Prob(Omnibus): 0.247 Jarque-Bera (JB): 2.458
Skew: -0.150 Prob(JB): 0.293
Kurtosis: 3.711 Cond. No. 11.6
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
"""
#Evaluations (dataset 5): The R squared improved drastically (~10 points) by removing the outlier, so predictions
#are more accurate and the data less skewed, as shown by the much lower kurtosis. Residuals versus x is a healthy
#shape and there is no sign of heteroscedasticity. (The original comment said "no homoscedasticity",
#which is the opposite of what the diagnostics show - homoscedasticity is the desirable property.)
#Analyze influences of outliers for dataset 6.
#Dataset 6 - Outlier remains
df6_original, df6_ols, residuals_hist6, residual_info6, summary6, model6 = funct_ols(df6)
fig = gm.graphics.influence_plot(model6, criterion="cooks")  # Cook's-distance influence plot
fig.tight_layout(pad=1.0)
plt.show()
#The influence plot shows an observation (index 100) heavily influencing the regression.
# NOTE(review): as with dataset 5, row 0 is dropped alongside the flagged
# outlier at index 100 - confirm row 0 is meant to go too; kept as-is to
# preserve the published results.
df6_edited = df6_original.drop([0,100])
# BUG FIX: removed the misleading label='box cox' - no transform is applied here.
plt.scatter(df6_edited.x, df6_edited.y, color='blue', s=5, edgecolor='none')
plt.title('DF6 with outlier removed, no transformation')
plt.show()
fig = gm.graphics.plot_regress_exog(model6, 'x')  # diagnostics with outlier still present
fig.tight_layout(pad=1.0)
plt.show()
print(summary6)
# --- captured notebook output (wrapped in a string so the script stays valid Python) ---
"""
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.913
Model: OLS Adj. R-squared: 0.912
Method: Least Squares F-statistic: 1041.
Date: Sun, 21 Mar 2021 Prob (F-statistic): 2.49e-54
Time: 19:55:09 Log-Likelihood: -367.52
No. Observations: 101 AIC: 739.0
Df Residuals: 99 BIC: 744.3
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept -0.3059 1.534 -0.199 0.842 -3.350 2.739
x 7.0272 0.218 32.259 0.000 6.595 7.459
==============================================================================
Omnibus: 0.494 Durbin-Watson: 2.255
Prob(Omnibus): 0.781 Jarque-Bera (JB): 0.262
Skew: 0.120 Prob(JB): 0.877
Kurtosis: 3.070 Cond. No. 11.8
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
"""
#Dataframe 6 outlier removed: refit on the cleaned data.
df6_original2, df6_ols2, residuals_hist62, residual_info62, summary62, model62 = funct_ols(df6_edited)
fig = gm.graphics.influence_plot(model62, criterion="cooks")  # re-check influence after removal
fig.tight_layout(pad=1.0)
plt.show()
fig = gm.graphics.plot_regress_exog(model62, 'x')  # full diagnostic panel
fig.tight_layout(pad=1.0)
plt.show()
print(summary62)
# --- captured notebook output (wrapped in a string so the script stays valid Python) ---
"""
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.834
Model: OLS Adj. R-squared: 0.832
Method: Least Squares F-statistic: 487.3
Date: Sun, 21 Mar 2021 Prob (F-statistic): 1.32e-39
Time: 19:55:12 Log-Likelihood: -360.52
No. Observations: 99 AIC: 725.0
Df Residuals: 97 BIC: 730.2
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 0.3587 1.924 0.186 0.852 -3.459 4.177
x 6.9196 0.313 22.075 0.000 6.297 7.542
==============================================================================
Omnibus: 0.435 Durbin-Watson: 2.226
Prob(Omnibus): 0.804 Jarque-Bera (JB): 0.204
Skew: 0.105 Prob(JB): 0.903
Kurtosis: 3.074 Cond. No. 12.9
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
"""
#Evaluations (dataset 6): The R squared was reduced by removing the outlier, so we have a less accurate in-sample fit but less skewed
#data. Residuals versus x is a healthier shape. However, because the outlier sits directly on the regression line and removing it
#reduces R squared / prediction accuracy, I believe the outlier should remain in the dataset, as it provides valuable information
#to the model's ability to predict.

# Welcome to JGAnalytics, we offer services for hire for Data Analytics, Data Engineering, and Machine Learning.
# Copyright © JGAnalytics 2024