import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.formula.api as sm
from statsmodels.compat import lzip
from statsmodels.graphics.regressionplots import plot_regress_exog, plot_fit, plot_leverage_resid2, influence_plot
from statsmodels.graphics import utils
#open all sheets into individual dataframes
# NOTE(review): hard-coded absolute Windows path - update for your machine,
# or replace with a relative path / CLI argument.
path = 'C:/Users/Z33/Desktop/U of T Data Science/Data Science 2/Assignments/Stats Assignment 4 - OLS Linear Regression/Assignment4_linear_regresion_data.xlsx'
xls=pd.ExcelFile(path)
# Each sheet holds one dataset; column names are forced to 'y' and 'x'.
df1=pd.read_excel(xls, 'Set 1', names = ['y', 'x'])
df2=pd.read_excel(xls, 'Set 2', names = ['y', 'x'])
df3=pd.read_excel(xls, 'Set 3', names = ['y', 'x'])
df4=pd.read_excel(xls, 'Set 4', names = ['y', 'x'])
df5=pd.read_excel(xls, 'Set 5', names = ['y', 'x'])
df6=pd.read_excel(xls, 'Set 6', names = ['y', 'x'])
#function for applying regression calculations extracting relevant information
def funct_ols(df):
    """Fit an OLS model y ~ x on df and return the data plus diagnostics.

    BUG FIX: the pasted body had lost all indentation (SyntaxError); the
    slope/intercept comments were also swapped.

    Parameters
    ----------
    df : pandas.DataFrame with numeric columns 'y' and 'x'. NOTE: df is
        modified in place ('y_prediction' and 'residuals' columns are added);
        an untouched deep copy is returned as the first element.

    Returns
    -------
    tuple of (original df copy, augmented df, residual histogram artists,
    residual describe() Series, fitted-model summary, fitted model)
    """
    df_original = df.copy(deep=True)  # untouched copy, returned for later reuse
    corr_matrix = df[['y', 'x']].corr(method = 'pearson')  # Pearson correlation of y and x
    mu_x = np.mean(df.x)  # mean of x
    mu_y = np.mean(df.y)  # mean of y
    sig_x = np.std(df.x, ddof=1)  # sample std of x (ddof=1)
    sig_y = np.std(df.y, ddof=1)  # sample std of y (ddof=1)
    # Hand-computed OLS estimates; unused below, kept as a cross-check of statsmodels.
    beta_1 = sig_y/sig_x * corr_matrix.loc['x', 'y']  # estimate the SLOPE (comment was swapped)
    beta_0 = mu_y - beta_1 * mu_x  # estimate the INTERCEPT (comment was swapped)
    m = sm.ols('y ~ x', data = df)  # statsmodels formula-API OLS
    m = m.fit()  # fit the model
    intercept, slope = m.params  # params order for 'y ~ x' is [Intercept, x]
    summary = m.summary()  # OLS regression summary table
    df['y_prediction'] = intercept + slope * df.x  # fitted values
    # NOTE(review): sign is prediction - observed, the negative of the usual
    # residual convention (y - y_hat); kept as-is to preserve downstream output.
    df['residuals'] = df['y_prediction'] - df['y']
    residuals_hist = plt.hist(df.residuals, bins=30)  # visual check for normal residuals
    residual_info = df.residuals.describe()  # summary statistics of the residuals
    return df_original, df, residuals_hist, residual_info, summary, m
#Question 1
#For each data set in Assignment4_linear_regression_data.xlsx:
#Create a scatter plot and visually decide if a linear model is appropriate
#(a matrix scatter plot would be most efficient).
for frame, plot_title in (
        (df1, 'DF1 linear relationship'),
        (df2, 'DF2 linear relationship'),
        (df3, 'DF3 non linear relationship'),
        (df4, 'DF4 non linear relationship'),
        (df5, 'DF5 linear with outlier'),
        (df6, 'DF6 linear with outlier'),
):
    plt.scatter(frame.x, frame.y, color='blue', s=5, edgecolor='none')
    plt.title(plot_title)
    plt.show()
#Questions:
#Create an OLS model for the original and transformed data if required.
#Evaluate if the OLS assumptions are met: normality of errors centered around zero, equal variance, etc...,
#for the original data and transformed data if appropriate.
#Comment how the transformation impacted the different assumptions. (This should be done only by looking at the output
#diagnostic charts created by the software)
#The following datasets are not linear, 3 and 4 and need to be transformed.
#The following datasets have outliers, dataset 5 and 6.
#QUICK ANSWER:
#I will be using the Box Cox function to transform data set 3 and Log to transform data set 4 into a more linear form.
from scipy.stats import normaltest
#Transformations ---
#Dataset #3: Box-Cox transformation of y.
#Alternatives explored (log1p, log, sqrt, and transforming x as well) were
#discarded; Box-Cox on y alone gave the most linear-looking relationship.
df3_normy, fitted_lamda1 = stats.boxcox(df3.y)  # fitted_lamda1 is the estimated lambda
plt.scatter(df3.x, df3_normy, color='blue', s=5, edgecolor='none', label='box cox')
plt.title('DF3 Linear after box-cox Transformation for Y')
plt.show()
# D'Agostino-Pearson normality test on the transformed y.
stat, p = normaltest(df3_normy)
print('Statistics=%.3f, p=%.3f' % (stat, p))
# interpret at the 5% significance level
# BUG FIX: the print statements below had lost their indentation (SyntaxError).
alpha = 0.05
if p > alpha:
    print('Sample looks normal (fail to reject H0)')
else:
    print('Sample does not look normal (reject H0)')
plt.hist(df3_normy, bins=30)  # histogram of the transformed y
# --- captured notebook output (wrapped in a string so the script stays valid Python) ---
"""
Statistics=5.584, p=0.061 Sample looks normal (fail to reject H0)
(array([3., 3., 1., 2., 4., 2., 2., 4., 5., 3., 3., 2., 4., 5., 6., 8., 3.,
3., 5., 7., 6., 1., 3., 4., 5., 2., 2., 0., 0., 2.]),
array([ 12.20963656, 15.33872053, 18.46780449, 21.59688846,
24.72597242, 27.85505638, 30.98414035, 34.11322431,
37.24230828, 40.37139224, 43.5004762 , 46.62956017,
49.75864413, 52.8877281 , 56.01681206, 59.14589602,
62.27497999, 65.40406395, 68.53314792, 71.66223188,
74.79131584, 77.92039981, 81.04948377, 84.17856774,
87.3076517 , 90.43673566, 93.56581963, 96.69490359,
99.82398756, 102.95307152, 106.08215548]),
<BarContainer object of 30 artists>)
"""
#Dataset #4
#Dataset #4: log-transform y (a Box-Cox attempt was tried and discarded).
df4_normy = np.log(df4.y)
# BUG FIX: x belongs on the horizontal axis (the original call had the
# arguments swapped relative to every other scatter in this script), and the
# legend label said 'box cox' although the transform applied here is log.
plt.scatter(df4.x, df4_normy, color='blue', s=5, edgecolor='none', label='log')
plt.title('DF4 Linear after Log Transformation')
plt.show()
#After applying the box cox transformation on dataset 3 and log transformation on dataset 4 the
#data is now linear for both data sets.
#Create an OLS model for the original and transformed data if required.
#Data set 3 Analysis and data transformation.
import statsmodels.api as gm
#DataFrame 3: Box-Cox-transformed y regressed against the original x.
#(The original comment claimed "log function for both x and y", which did not
#match the code - df3_normy comes from stats.boxcox on y above.)
transformed2 = pd.DataFrame({'y':df3_normy,'x':df3.x })
# BUG FIX: the first return value was unpacked into 'df6_original', clobbering
# a name that belongs to dataset 6; renamed to a dataset-3 name. (df6_original
# is re-created later in the dataset-6 section before it is ever used.)
df3_norm_original, df_norm3, residuals_hist_norm3, residual_info_norm3, summary_norm3, model_norm3 = funct_ols(transformed2)
print(residual_info_norm3)
print(summary_norm3)
# --- captured notebook output (wrapped in a string so the script stays valid Python) ---
"""
count 1.000000e+02
mean -6.110668e-15
std 8.330976e+00
min -2.218360e+01
25% -4.805561e+00
50% -9.475701e-01
75% 4.363414e+00
max 2.592434e+01
Name: residuals, dtype: float64
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.868
Model: OLS Adj. R-squared: 0.866
Method: Least Squares F-statistic: 641.8
Date: Sun, 21 Mar 2021 Prob (F-statistic): 8.32e-45
Time: 19:54:45 Log-Likelihood: -353.39
No. Observations: 100 AIC: 710.8
Df Residuals: 98 BIC: 716.0
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 5.7418 2.229 2.576 0.011 1.319 10.165
x 8.7229 0.344 25.333 0.000 8.040 9.406
==============================================================================
Omnibus: 3.044 Durbin-Watson: 1.854
Prob(Omnibus): 0.218 Jarque-Bera (JB): 2.483
Skew: -0.254 Prob(JB): 0.289
Kurtosis: 3.582 Cond. No. 17.6
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
"""
# Diagnostic panel (fit, residuals, partial regression, CCPR) for the
# transformed dataset-3 model.
fig = gm.graphics.plot_regress_exog(model_norm3, 'x')
fig.tight_layout(pad=1.0)
plt.show()
# DataFrame 3 Original (untransformed), fitted for comparison.
df3_original, df_3, residuals_hist_3, residual_info_3, summary_3, model_3 = funct_ols(df3)
print(residual_info_3)
print(summary_3)
# --- captured notebook output (wrapped in a string so the script stays valid Python) ---
"""
count 1.000000e+02
mean 4.956746e-13
std 1.505310e+03
min -6.202119e+03
25% -9.320867e+02
50% 1.251402e+02
75% 8.943890e+02
max 3.406664e+03
Name: residuals, dtype: float64
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.755
Model: OLS Adj. R-squared: 0.753
Method: Least Squares F-statistic: 302.4
Date: Sun, 21 Mar 2021 Prob (F-statistic): 1.04e-31
Time: 19:54:48 Log-Likelihood: -873.07
No. Observations: 100 AIC: 1750.
Df Residuals: 98 BIC: 1755.
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept -2636.1748 402.741 -6.546 0.000 -3435.400 -1836.949
x 1081.8266 62.216 17.388 0.000 958.361 1205.292
==============================================================================
Omnibus: 21.170 Durbin-Watson: 2.159
Prob(Omnibus): 0.000 Jarque-Bera (JB): 37.896
Skew: 0.863 Prob(JB): 5.90e-09
Kurtosis: 5.474 Cond. No. 17.6
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
"""
# Diagnostic panel for the untransformed dataset-3 model.
fig = gm.graphics.plot_regress_exog(model_3, 'x')
fig.tight_layout(pad=1.0)
plt.show()
#Evaluation Dataset 3: after the Box-Cox transformation of y the R-squared is about 15% higher
#(0.868 vs 0.755), so the fit improved. (The original comment said "LOG for both X and Y",
#which did not match the code; alternative transforms such as log showed a
#heteroscedasticity problem and scored ~3% lower.)
#After the transformation the residuals look approximately normal and centered around zero,
#whereas before they were not.
#Dataset 4 Linear Regression with data transformation.
# DataFrame 4 Transformed using the log function for Y only.
transformed1 = pd.DataFrame({'y':df4_normy,'x':df4.x})
df4_original, df_norm4, residuals_hist_norm4, residual_info_norm4, summary_norm4, model_norm4 = funct_ols(transformed1)
print(residual_info_norm4)
print(summary_norm4)
# --- captured notebook output (wrapped in a string so the script stays valid Python) ---
"""
count 1.000000e+02
mean -8.615331e-16
std 3.853615e-01
min -6.403407e-01
25% -2.803581e-01
50% -4.510877e-02
75% 1.901438e-01
max 1.057768e+00
Name: residuals, dtype: float64
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.983
Model: OLS Adj. R-squared: 0.983
Method: Least Squares F-statistic: 5765.
Date: Sun, 21 Mar 2021 Prob (F-statistic): 6.91e-89
Time: 19:54:54 Log-Likelihood: -46.034
No. Observations: 100 AIC: 96.07
Df Residuals: 98 BIC: 101.3
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 5.6647 0.078 72.264 0.000 5.509 5.820
x 0.9898 0.013 75.930 0.000 0.964 1.016
==============================================================================
Omnibus: 7.020 Durbin-Watson: 2.151
Prob(Omnibus): 0.030 Jarque-Bera (JB): 7.256
Skew: -0.657 Prob(JB): 0.0266
Kurtosis: 2.872 Cond. No. 12.4
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
"""
# Diagnostic panel for the log-transformed dataset-4 model.
fig = gm.graphics.plot_regress_exog(model_norm4, 'x')
fig.tight_layout(pad=1.0)
plt.show()
# DataFrame 4 Linear Regression without data transformation, for comparison.
df4_original, df_4, residuals_hist_4, residual_info_4, summary_4, model_4 = funct_ols(df4)
print(residual_info_4)
print(summary_4)
# --- captured notebook output (wrapped in a string so the script stays valid Python) ---
"""
count 1.000000e+02
mean -2.956949e-10
std 1.033103e+06
min -6.364011e+06
25% -3.134527e+05
50% 3.081833e+05
75% 6.187548e+05
max 1.039076e+06
Name: residuals, dtype: float64
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.380
Model: OLS Adj. R-squared: 0.373
Method: Least Squares F-statistic: 59.97
Date: Sun, 21 Mar 2021 Prob (F-statistic): 8.87e-12
Time: 19:54:55 Log-Likelihood: -1526.2
No. Observations: 100 AIC: 3056.
Df Residuals: 98 BIC: 3062.
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept -7.535e+05 2.1e+05 -3.585 0.001 -1.17e+06 -3.36e+05
x 2.707e+05 3.49e+04 7.744 0.000 2.01e+05 3.4e+05
==============================================================================
Omnibus: 102.143 Durbin-Watson: 2.077
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1253.666
Skew: 3.381 Prob(JB): 5.89e-273
Kurtosis: 18.973 Cond. No. 12.4
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
"""
# Diagnostic panel for the untransformed dataset-4 model.
fig = gm.graphics.plot_regress_exog(model_4, 'x')
fig.tight_layout(pad=1.0)
plt.show()
#Dataframe 4 Evaluation: After the data transformation the R squared is much higher (0.983 vs 0.380), meaning a better
#prediction model. The data transformation made a significant impact on the quality of the predictions.
#After the transformation the residuals are approximately normal and centered around zero, whereas before they were not.
#(The original comment said "stand errors"; the quantity being inspected is the residuals.)
#If datasets have outliers, remove the outliers and see the effect in the model (slope, intercept and R-square)
#Dataframes 5 and 6 have outliers in their data, so analyze the influence of the outliers for datasets 5 and 6.
#Analyze influences of outliers for dataset 5.
#Dataset 5 - Outlier remains.
df5_original, df5_ols, residuals_hist5, residual_info5, summary5, model5 = funct_ols(df5)
fig = gm.graphics.plot_leverage_resid2(model5)  # leverage vs. normalized residuals squared
fig.tight_layout(pad=1.0)
plt.show()
fig = gm.graphics.influence_plot(model5, criterion="cooks")  # Cook's-distance influence plot
fig.tight_layout(pad=1.0)
plt.show()
#The influence plot shows an observation (index 100) heavily influencing the regression.
# NOTE(review): the comment above only mentions index 100, yet row 0 is dropped
# as well - confirm that removing row 0 is intentional; kept as-is to preserve
# the published results (99 observations downstream).
df5_edited = df5_original.drop([0,100])
# BUG FIX: removed the misleading label='box cox' - no transform is applied here.
plt.scatter(df5_edited.x, df5_edited.y, color='blue', s=5, edgecolor='none')
plt.title('DF5 with outlier removed, no transformation')
plt.show()
print(summary5)
# --- captured notebook output (wrapped in a string so the script stays valid Python) ---
"""
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.806
Model: OLS Adj. R-squared: 0.804
Method: Least Squares F-statistic: 411.9
Date: Sun, 21 Mar 2021 Prob (F-statistic): 4.70e-37
Time: 19:55:02 Log-Likelihood: -334.42
No. Observations: 101 AIC: 672.8
Df Residuals: 99 BIC: 678.1
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 0.9213 1.346 0.685 0.495 -1.749 3.591
x 4.7671 0.235 20.294 0.000 4.301 5.233
==============================================================================
Omnibus: 113.783 Durbin-Watson: 1.491
Prob(Omnibus): 0.000 Jarque-Bera (JB): 2578.951
Skew: -3.591 Prob(JB): 0.00
Kurtosis: 26.691 Cond. No. 11.8
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
"""
#Dataframe 5 - Outlier removed: refit on the cleaned data.
df5_original2, df5_ols2, residuals_hist52, residual_info52, summary52, model52 = funct_ols(df5_edited)
fig = gm.graphics.influence_plot(model52, criterion="cooks")  # re-check influence after removal
fig.tight_layout(pad=1.0)
plt.show()
fig = gm.graphics.plot_regress_exog(model52, 'x')  # full diagnostic panel
fig.tight_layout(pad=1.0)
plt.show()
print(summary52)
# --- captured notebook output (wrapped in a string so the script stays valid Python) ---
"""
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.904
Model: OLS Adj. R-squared: 0.903
Method: Least Squares F-statistic: 913.8
Date: Sun, 21 Mar 2021 Prob (F-statistic): 3.65e-51
Time: 19:55:05 Log-Likelihood: -291.63
No. Observations: 99 AIC: 587.3
Df Residuals: 97 BIC: 592.4
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept -0.0746 0.942 -0.079 0.937 -1.944 1.795
x 5.0623 0.167 30.228 0.000 4.730 5.395
==============================================================================
Omnibus: 2.796 Durbin-Watson: 1.920
Prob(Omnibus): 0.247 Jarque-Bera (JB): 2.458
Skew: -0.150 Prob(JB): 0.293
Kurtosis: 3.711 Cond. No. 11.6
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
"""
#Evaluations (dataset 5): The R squared improved drastically (~10 points) by removing the outlier, so predictions
#are more accurate and the data less skewed, as shown by the much lower kurtosis. Residuals versus x is a healthy
#shape and there is no sign of heteroscedasticity. (The original comment said "no homoscedasticity",
#which is the opposite of what the diagnostics show - homoscedasticity is the desirable property.)
#Analyze influences of outliers for dataset 6.
#Dataset 6 - Outlier remains
df6_original, df6_ols, residuals_hist6, residual_info6, summary6, model6 = funct_ols(df6)
fig = gm.graphics.influence_plot(model6, criterion="cooks")  # Cook's-distance influence plot
fig.tight_layout(pad=1.0)
plt.show()
#The influence plot shows an observation (index 100) heavily influencing the regression.
# NOTE(review): as with dataset 5, row 0 is dropped alongside the flagged
# outlier at index 100 - confirm row 0 is meant to go too; kept as-is to
# preserve the published results.
df6_edited = df6_original.drop([0,100])
# BUG FIX: removed the misleading label='box cox' - no transform is applied here.
plt.scatter(df6_edited.x, df6_edited.y, color='blue', s=5, edgecolor='none')
plt.title('DF6 with outlier removed, no transformation')
plt.show()
fig = gm.graphics.plot_regress_exog(model6, 'x')  # diagnostics with outlier still present
fig.tight_layout(pad=1.0)
plt.show()
print(summary6)
# --- captured notebook output (wrapped in a string so the script stays valid Python) ---
"""
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.913
Model: OLS Adj. R-squared: 0.912
Method: Least Squares F-statistic: 1041.
Date: Sun, 21 Mar 2021 Prob (F-statistic): 2.49e-54
Time: 19:55:09 Log-Likelihood: -367.52
No. Observations: 101 AIC: 739.0
Df Residuals: 99 BIC: 744.3
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept -0.3059 1.534 -0.199 0.842 -3.350 2.739
x 7.0272 0.218 32.259 0.000 6.595 7.459
==============================================================================
Omnibus: 0.494 Durbin-Watson: 2.255
Prob(Omnibus): 0.781 Jarque-Bera (JB): 0.262
Skew: 0.120 Prob(JB): 0.877
Kurtosis: 3.070 Cond. No. 11.8
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
"""
#Dataframe 6 outlier removed: refit on the cleaned data.
df6_original2, df6_ols2, residuals_hist62, residual_info62, summary62, model62 = funct_ols(df6_edited)
fig = gm.graphics.influence_plot(model62, criterion="cooks")  # re-check influence after removal
fig.tight_layout(pad=1.0)
plt.show()
fig = gm.graphics.plot_regress_exog(model62, 'x')  # full diagnostic panel
fig.tight_layout(pad=1.0)
plt.show()
print(summary62)
# --- captured notebook output (wrapped in a string so the script stays valid Python) ---
"""
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.834
Model: OLS Adj. R-squared: 0.832
Method: Least Squares F-statistic: 487.3
Date: Sun, 21 Mar 2021 Prob (F-statistic): 1.32e-39
Time: 19:55:12 Log-Likelihood: -360.52
No. Observations: 99 AIC: 725.0
Df Residuals: 97 BIC: 730.2
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 0.3587 1.924 0.186 0.852 -3.459 4.177
x 6.9196 0.313 22.075 0.000 6.297 7.542
==============================================================================
Omnibus: 0.435 Durbin-Watson: 2.226
Prob(Omnibus): 0.804 Jarque-Bera (JB): 0.204
Skew: 0.105 Prob(JB): 0.903
Kurtosis: 3.074 Cond. No. 12.9
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
"""
#Evaluations (dataset 6): The R squared was reduced by removing the outlier, so we have a less accurate in-sample fit but less skewed
#data. Residuals versus x is a healthier shape. However, because the outlier sits directly on the regression line and removing it
#reduces R squared / prediction accuracy, I believe the outlier should remain in the dataset, as it provides valuable information
#to the model's ability to predict.

# Welcome to JGAnalytics, we offer services for hire for Data Analytics, Data Engineering, and Machine Learning.
# Copyright © JGAnalytics 2024