Hide the code
%cd 3_1_LinearRegression//wd/3_1_LinearRegression
Machine Learning
%cd 3_1_LinearRegression//wd/3_1_LinearRegression
### Load necessary modules -------------------------------
# interactive plotting
# %matplotlib inline
%config InlineBackend.figure_format = 'png' # ‘png’, ‘retina’, ‘jpeg’, ‘svg’, ‘pdf’
# plotting libraries
import seaborn as sns
import matplotlib.pyplot as plt
# sns.set()
import statsmodels.api as sm
# Data management libraries
import numpy as np
import pandas as pd
import scipy.stats as stats
# Machine learning libraries
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
# others
import warnings
# Connecting statsmodels with sklearn via sklearn2pmml and patsy
import statsmodels.api as sm
from statsmodels.api import OLS
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn2pmml.statsmodels import StatsModelsRegressor
# patsy for lineal model formula language
import patsy as ps
from patsy_utils.patsy_formulaic_transformer import FormulaTransformer%run -i "./3_1_models_01.py"df = pd.read_csv("./3_1_simple_linear_regression_01.csv")
inputs =["X0"]
output = "Y"
sk_lm = LinearRegression( )
sk_lm.fit(df[inputs], df[output])LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
| fit_intercept | True | |
| copy_X | True | |
| tol | 1e-06 | |
| n_jobs | None | |
| positive | False |
# Intercept (b0) and slope (b1) of the fitted simple regression line.
# BUG FIX: the intercept lives in `intercept_`, not in `coef_`.
# The original read coef_[0] for both, so b0 and b1 came out identical
# (compare the OLS summary, where intercept ~ 3.00 and slope ~ 2.00).
b0 = sk_lm.intercept_
b1 = sk_lm.coef_[0]
b0, b1(np.float64(2.002986467673893), np.float64(2.002986467673893))
sns.set_theme(rc={'figure.figsize':(5, 5)})
X_new = np.linspace(df[inputs].min(), df[inputs].max(), 10).ravel()
Y_new = sk_lm.predict(pd.DataFrame({"X0": X_new}))
plt.plot(df[inputs], df[output], 'bo')
plt.plot(X_new, Y_new, 'r-')
plt.title("The regression line is y = {:.4} + {:.4} x".format(b0, b1))
plt.show();plt.close()%run -i "3_1_models_02.py"# %load "./3_1_Exercise_002.py"
'''
3_1 Exercise 002
'''
# Residuals of the fitted statsmodels OLS model (observed - fitted values).
residuals = model.results_.resid
print("First five residuals: ", residuals[:5])
# With an intercept in the model, OLS residuals sum to zero by construction.
print(f"\n The sum (and therefore also the mean) of the residuals is zero: {residuals.sum():.4}" )
print(f"\n The mean of the squared residuals is: {(residuals**2).mean():.4f}" )
# Rebuild the design matrix from the inputs so we can score the model on it.
X = formula_trnsf.fit_transform(df[inputs])
Y = df[output].values
# NOTE(review): sklearn-style regressors return R^2 from .score(), not RMSE;
# the "rsme" label below looks doubly suspect (typo + metric) — TODO confirm
# what StatsModelsRegressor.score actually computes.
print(f"\n The rsme is : {model.score(X, Y) :.4f}" )
model.results_.summary()Take a look at this figure and note that the lowest \(R\) value corresponds precisely to the one case where the line is indeed a good model for the data.
{width=75% fig-align="center" fig-alt="Interpretation of the Correlation Coefficient"}
Another important observation about correlation is that it should not be interpreted as causality. Too often the news headlines run sentences like “using (or eating) A linked to cases of B”. And this often comes after a study found a correlation between those two things, without the study proving (or even trying to prove) any sort of causality.
:::
# %load "./3_1_Exercise_004.py"
'''
3_1 Exercise 004
'''
R2 = model.results_.rsquared
print(f"The R^2 coefficient is {R2:.3}")
The R^2 coefficient is 0.795
# %load "3_1_IllustrateModelVariance.py"
# Set the random seed
rng = np.random.default_rng(1)
# Set the figure size
sns.set(rc={'figure.figsize':(8, 8)})
# Common error variance for the linear model
sigma = 0.5
# Number of samples
N = 5
# Sample size
n = 30
# A column to distinguish the data in each sample from the other samples
sampleId = np.repeat(np.arange(N), n)
# The X part of the samples
X = rng.uniform(size = N * n)
# The error variables
Eps = rng.normal(loc = 0, scale = sigma, size = N * n)
# And the Y part according to the model
beta0 = 4
beta1 = -2
Y = beta0 + beta1 * X + Eps
# Put it all together in a DataFrame
df = pd.DataFrame({'X': X, 'Y':Y, 'sampleId':sampleId})
#print(df.head(50)) # test the result
# And plot it using color to identify samples
showLegend = True
if(N > 10):
showLegend = False
sns.scatterplot(data = df, x = X, y = Y, hue = sampleId,
palette="deep", alpha = min(1, 10/n),
legend= showLegend)
# This function gets the coefficients for the regression line
# of Y vs X. Both are assumed to be numerical pandas series of the
# same length.
def getLM(X, Y):
    """Fit a simple linear regression of Y on X and return (intercept, slope).

    Both X and Y are expected to be numerical pandas Series of the
    same length.
    """
    features = X.values[:, np.newaxis]
    targets = Y.values
    fitted = LinearRegression(fit_intercept=True).fit(features, targets)
    return (fitted.intercept_, fitted.coef_[0])
# Now let us fit a regression line for each sample and plot the result.
palette2 = iter(sns.color_palette(palette="deep", n_colors=N))
for sample in range(N):
# select the sample
dfs = df.loc[sampleId == sample, :]
Xs = dfs.X
Ys = dfs.Y
# fit the regression line
b0, b1 = getLM(Xs,Ys)
# plot the line
Xnew = np.linspace(0, 1, num = 100)
Ynew = b0 + b1 * Xnew
plt.plot(Xnew, Ynew,
color = next(palette2), alpha = min(1, n/N))
Xnew = np.linspace(0, 1, num = 100)
Ynew = beta0 + beta1 * Xnew
plt.plot(Xnew, Ynew, "k--", lw = 3)
plt.show()# %load "3_1_ConfidencePredictionBands.py"
# # Set the random seed
# rng = np.random.default_rng(1)
# Set the figure size
sns.set(rc={'figure.figsize':(8, 8)})
# # Common error variance for the linear model
# sigma = 0.5
# # Number of samples
# N = 5
# # Sample size
# n = 80
# # A column to distinguish the data in each sample from the other samples
# sampleId = np.repeat(np.arange(N), n)
# # The X part of the samples
# X = rng.uniform(size = N * n)
# # The error variables
# Eps = rng.normal(loc = 0, scale = sigma, size = N * n)
# # And the Y part according to the model
# beta0 = 4
# beta1 = -2
# Y = beta0 + beta1 * X + Eps
# # Put it all together in a DataFrame
# df = pd.DataFrame({'X': X, 'Y':Y, 'sampleId':sampleId})
# #print(df.head(50)) # test the result
# # And plot it using color to identify samples
# showLegend = True
# if(N > 10):
# showLegend = False
# sns.scatterplot(data = df, x = X, y = Y, hue = sampleId,
# palette="deep", alpha = min(1, 10/n),
# legend= showLegend)
# This function gets the coefficients for the regression line
# of Y vs X. Both are assumed to be numerical pandas series of the
# same length.
def getLM(X, Y):
    """Return (b0, b1): intercept and slope of the OLS line of Y on X.

    X and Y are numerical pandas Series of equal length.
    """
    lm = LinearRegression(fit_intercept=True)
    lm.fit(X.values[:, np.newaxis], Y.values)
    b0, b1 = lm.intercept_, lm.coef_[0]
    return (b0, b1)
# Now let us fit a regression line for each sample and plot the result.
palette2 = iter(sns.color_palette(palette="deep", n_colors=N))
X_1 = sm.add_constant(X)
X_1
model_df0 = sm.OLS(Y, X_1) # OLS comes from Ordinary Least Squares
df0_fit = model_df0.fit()
X_new = np.linspace(X.min(), X.max(), 100)
X_new = sm.add_constant(X_new)
X_new[:5, :]
df0_fit.pred = df0_fit.get_prediction(X_new)
df0_fit.pred = df0_fit.get_prediction(X_new)
df0_fit_fitted_new = df0_fit.pred.summary_frame(alpha=0.05)["mean"]
df0_fit_confBand_low = df0_fit.pred.summary_frame(alpha=0.05)["mean_ci_lower"]
df0_fit_confBand_high = df0_fit.pred.summary_frame(alpha=0.05)["mean_ci_upper"]
############################################################
fig, ax = plt.subplots(figsize=(8, 6))
# Now let us fit a regression line for each sample and plot the result.
palette2 = iter(sns.color_palette(palette="deep", n_colors=N))
for sample in range(N):
# select the sample
dfs = df.loc[sampleId == sample, :]
Xs = dfs.X
Ys = dfs.Y
# fit the regression line
b0, b1 = getLM(Xs,Ys)
#print(sample, b0, b1,"\n", "--"*5)
# plot the line
Xnew = np.linspace(0, 1, num = 100)
Ynew = b0 + b1 * Xnew
df0_fit_predBand_low = df0_fit.pred.summary_frame(alpha=0.05)["obs_ci_lower"]
df0_fit_predBand_high = df0_fit.pred.summary_frame(alpha=0.05)["obs_ci_upper"]
ax.plot(X, Y, "o", label="data")
ax.plot(X_new[:, 1], df0_fit_predBand_low, "y--", lw = 4)
ax.plot(X_new[:, 1], df0_fit_predBand_high, "y--", lw = 4)
ax.fill_between(X_new[:, 1],
y1 = df0_fit_predBand_low,
y2 = df0_fit_predBand_high,
color='cyan', alpha = 0.75)
ax.plot(X_new[:,1], df0_fit_confBand_low, "y--", lw = 4)
ax.plot(X_new[:,1], df0_fit_confBand_high, "y--", lw = 4)
ax.fill_between(X_new[:,1],
y1 = df0_fit_confBand_low,
y2 = df0_fit_confBand_high,
color='yellow', alpha = 1)print(model.results_.summary()) OLS Regression Results
==============================================================================
Dep. Variable: Y R-squared: 0.795
Model: OLS Adj. R-squared: 0.793
Method: Least Squares F-statistic: 380.5
Date: Wed, 18 Feb 2026 Prob (F-statistic): 1.62e-35
Time: 09:00:48 Log-Likelihood: -22.754
No. Observations: 100 AIC: 49.51
Df Residuals: 98 BIC: 54.72
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 3.0034 0.058 51.402 0.000 2.887 3.119
X0 2.0030 0.103 19.507 0.000 1.799 2.207
==============================================================================
Omnibus: 0.219 Durbin-Watson: 2.144
Prob(Omnibus): 0.896 Jarque-Bera (JB): 0.162
Skew: -0.096 Prob(JB): 0.922
Kurtosis: 2.953 Cond. No. 4.19
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
First let us examine the beginning of the dataset. Note the names and types of inputs and output.
df = pd.read_csv("./3_1_data01.csv")
df.head(3)| x1 | x2 | y | |
|---|---|---|---|
| 0 | -0.083784 | -0.268621 | 9.325180 |
| 1 | -0.982944 | -0.118862 | 7.881259 |
| 2 | -1.875067 | 0.065455 | 5.441091 |
Now let us write the model formula and use it to obtain a design matrix. We show the first four rows of the design matrix.
model_Formula = "y ~ x1 + x2"
Y, X = ps.dmatrices(model_Formula, df)
X[:4]array([[ 1. , -0.08378436, -0.26862147],
[ 1. , -0.98294375, -0.11886232],
[ 1. , -1.87506732, 0.06545456],
[ 1. , -0.18614466, 1.29091527]])
# %load "./3_1_Exercise_006.py"
'''
3_1 Exercise 006
'''
# We use names that will not interfere with the previous ones
df_07 = pd.read_csv("./3_1_data07.csv")
print("This is the dataset:")
print(df_07.head(13))
model07_Formula = "y ~ x1 + C(x2)"
Y07, X07 = ps.dmatrices(model07_Formula, df_07)
print("And this is the resulting design matrix (note that there are three columns for x2):")
X07[:13]This is the dataset:
x1 x2 y
0 2.333070 1 22.866299
1 1.675955 2 22.709764
2 0.814088 3 16.807665
3 1.980600 4 30.362348
4 0.151959 1 13.194814
5 0.604424 2 15.083150
6 2.130828 3 27.724743
7 3.089289 4 36.682985
8 1.316041 1 22.400463
9 2.381619 2 29.351081
10 4.311595 3 36.712616
11 0.744393 4 21.781646
12 0.902150 1 18.510534
And this is the resulting design matrix (note that there are three columns for x2):
array([[1. , 0. , 0. , 0. , 2.33306972],
[1. , 1. , 0. , 0. , 1.67595477],
[1. , 0. , 1. , 0. , 0.81408781],
[1. , 0. , 0. , 1. , 1.98060009],
[1. , 0. , 0. , 0. , 0.15195865],
[1. , 1. , 0. , 0. , 0.60442435],
[1. , 0. , 1. , 0. , 2.13082829],
[1. , 0. , 0. , 1. , 3.0892894 ],
[1. , 0. , 0. , 0. , 1.3160413 ],
[1. , 1. , 0. , 0. , 2.38161936],
[1. , 0. , 1. , 0. , 4.31159499],
[1. , 0. , 0. , 1. , 0.74439316],
[1. , 0. , 0. , 0. , 0.9021504 ]])
model.results_.cov_params()| Intercept | X0 | |
|---|---|---|
| Intercept | 0.003414 | -0.005106 |
| X0 | -0.005106 | 0.010544 |
hat_sigma2 = np.sum(model.results_.resid**2)/(df.shape[0] - df.shape[1] -1)
hat_sigma2np.float64(0.00926630245861394)
First we load the dataset, visualize it, and create standard names for the variables:
df = pd.read_csv("./3_1_data01.csv")
df.head(4)| x1 | x2 | y | |
|---|---|---|---|
| 0 | -0.083784 | -0.268621 | 9.325180 |
| 1 | -0.982944 | -0.118862 | 7.881259 |
| 2 | -1.875067 | 0.065455 | 5.441091 |
| 3 | -0.186145 | 1.290915 | 13.861506 |
output = "y"
num_inputs = ["x1", "x2"]
cat_inputs = []
inputs = num_inputs + cat_inputs
X = df[inputs]
Y = df[output]Next we create the train/test split and associated datasets.
XTR, XTS, YTR, YTS = train_test_split(
X, Y,
test_size = 0.2,
random_state = 1)
dfTR = pd.DataFrame(XTR, columns=inputs)
dfTR[output] = YTR
dfTS = pd.DataFrame(XTS, columns=inputs)
dfTS[output] = YTSLet us do a basic EDA exploring with pairplots:
sns.set_style("white")
sns.pairplot(dfTR, corner=True, height=1.5, aspect=1.5);The plot of the output vs each one of the inputs shows linear trends. The correlation between inputs seems very low. We confirm that using the correlation matrix.
XTR.corr()| x1 | x2 | |
|---|---|---|
| x1 | 1.00000 | 0.00622 |
| x2 | 0.00622 | 1.00000 |
Let us use for this model the formula language that we have described above inside the scikit pipeline framework. Two remarks: + In order to use it in a pipeline we only need the right hand side of the formula, the part after `~` + When you just want your formula to say "using all inputs" you can use the following construction with join.
model_Formula = " + ".join(inputs)
model_Formula'x1 + x2'
And now we can use FormulaTransformer with this formula in the first step of the pipeline (it is defined in a tailor-made module patsy_utils inspired by this post by Juan Orduz). The FormulaTransformer will create (through its fit_transform method) the design matrix of the model for us as illustrated below. Then in the second step we use LinearRegression (provided by sklearn) to access all the statistical information provided by statsmodels.
FormulaTransformer(model_Formula).fit_transform(dfTR)| Intercept | x1 | x2 | |
|---|---|---|---|
| 382 | 1.0 | 0.144882 | -1.491031 |
| 994 | 1.0 | 1.443869 | -1.596614 |
| 982 | 1.0 | 1.345697 | -0.053474 |
| 47 | 1.0 | 1.075970 | 0.288895 |
| 521 | 1.0 | -1.313146 | -0.029207 |
| ... | ... | ... | ... |
| 767 | 1.0 | -0.576899 | 0.636284 |
| 72 | 1.0 | -0.084902 | -0.462210 |
| 908 | 1.0 | -0.336037 | 1.811883 |
| 235 | 1.0 | -1.191780 | 0.924044 |
| 37 | 1.0 | 1.129219 | -1.852739 |
800 rows × 3 columns
lm_pipeline = Pipeline([
("formula", FormulaTransformer(model_Formula)),
("regressor", StatsModelsRegressor(OLS, fit_intercept = False))])Once the pipeline is created, fit proceeds as usual. We create a model object, which is in fact a statsmodels regressor:
lm_pipeline.fit(dfTR[inputs], dfTR[output])
model = lm_pipeline._final_estimator
modelStatsModelsRegressor(fit_intercept=False,
model_class=<class 'statsmodels.regression.linear_model.OLS'>)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. | model_class | <class 'stats...ar_model.OLS'> | |
| fit_intercept | False |
The initial information we need is stored in the results_ property of the model. For example, this is the model’s summary, like the one we discussed previously in this session:
(In this case I recommend using print for a nicer display)
print(model.results_.summary()) OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.838
Model: OLS Adj. R-squared: 0.838
Method: Least Squares F-statistic: 2066.
Date: Wed, 18 Feb 2026 Prob (F-statistic): 4.83e-316
Time: 09:03:58 Log-Likelihood: -1474.4
No. Observations: 800 AIC: 2955.
Df Residuals: 797 BIC: 2969.
Df Model: 2
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 9.9043 0.054 182.954 0.000 9.798 10.011
x1 1.9984 0.054 37.300 0.000 1.893 2.104
x2 2.9342 0.056 52.114 0.000 2.824 3.045
==============================================================================
Omnibus: 4.181 Durbin-Watson: 2.082
Prob(Omnibus): 0.124 Jarque-Bera (JB): 4.153
Skew: 0.130 Prob(JB): 0.125
Kurtosis: 3.239 Cond. No. 1.06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Always check if the coefficients of the input variables are significant. In this case they are. And the adjusted \(R^2\) shows that this model accounts for 84% of the variability of \(Y\) in the training set.
The following script contains the definition of a convenience function that when provided with a model and its variables plots:
# %load "./3_1_ResidualPlots.py"
# %load "./3_1_ResidualPlots.py"
def ResidualPlots(model, data, num_inputs=None, cat_inputs=None, output="Y"):
    """Draw the standard residual diagnostic plots for a fitted linear model.

    Plots produced, in order:
      1. Density curve and normal QQ-plot of the residuals.
      2. Fitted values vs residuals, with a quadratic trend line to
         reveal any curvature left in the residuals.
      3. One residuals-vs-input scatterplot per numerical input (3 per row).
      4. One residuals boxplot per categorical input (3 per row).

    Parameters
    ----------
    model : fitted regressor exposing a statsmodels results object in
        ``results_`` (with ``resid`` and ``fittedvalues`` attributes).
    data : pandas.DataFrame containing the columns named in
        ``num_inputs`` and ``cat_inputs``.
    num_inputs : list of str or None
        Names of the numerical input columns (default: none).
    cat_inputs : list of str or None
        Names of the categorical input columns (default: none).
    output : str
        Name of the output column. Kept for interface compatibility;
        not used directly in the plots.
    """
    # Avoid the mutable-default-argument pitfall.
    num_inputs = [] if num_inputs is None else num_inputs
    cat_inputs = [] if cat_inputs is None else cat_inputs
    resid = model.results_.resid
    fitted = model.results_.fittedvalues
    print("-"*50)
    print("Density Curve and QQ-plot of Residuals:", num_inputs)
    print("-"*50)
    fig1, axes = plt.subplots(figsize=(6, 3), nrows = 1, ncols = 2)
    fig1.tight_layout()
    sns.kdeplot(x=resid, ax=axes[0])
    plt.xlabel('Residuals')
    plt.ylabel('Density')
    sm.qqplot(resid, line='45', fit=True, dist=stats.norm, ax=axes[1])
    plt.show(); plt.close()
    print("-"*50)
    print("Fitted Values vs Residuals:")
    print("-"*50)
    fig2, axes = plt.subplots(figsize=(3, 3))
    fig2.tight_layout()
    # BUG FIX: use the `data` argument, not the global `df`, so the
    # function does not silently depend on outer-scope state.
    sns.regplot(data=data, x=fitted, y=resid, order=2,
                line_kws=dict(color="r"), scatter_kws=dict(alpha=0.25),
                scatter=True)
    plt.ylabel('Residuals')
    plt.show(); plt.close()
    if len(num_inputs) > 0:
        print("-"*50)
        print("Numerical inputs:", num_inputs)
        print("-"*50)
    else:
        print("No numerical inputs exist.")
    if len(num_inputs) > 0:
        # BUG FIX: true ceiling division. The old (n + 1) // 3 gave only
        # one row for four inputs, making axes[1, col] raise IndexError.
        nrows = -(-len(num_inputs) // 3)
        fig, axes = plt.subplots(nrows=nrows, ncols=3, figsize=(9, 3 * nrows),
                                 sharey=False, sharex=False, squeeze=False)
        for k in range(len(num_inputs)):
            row, col = divmod(k, 3)
            sns.regplot(data=data, x=num_inputs[k], y=resid, ax=axes[row, col],
                        line_kws=dict(color="r"), scatter_kws=dict(alpha=0.25),)
            axes[row, col].set_ylabel('Residuals')
        # Hide any unused subplots in the last row.
        for i in range(len(num_inputs), nrows * 3):
            fig.delaxes(axes.flatten()[i])
        plt.show()
    print("-"*50)
    if len(cat_inputs) > 0:
        print("Categorical inputs:", cat_inputs)
    else:
        print("No categorical inputs exist.")
    print("-"*50)
    if len(cat_inputs) > 0:
        # Same ceiling-division fix as for the numerical inputs above.
        nrows = -(-len(cat_inputs) // 3)
        fig, axes = plt.subplots(nrows=nrows, ncols=3, figsize=(9, 3 * nrows),
                                 sharey=False, sharex=False, squeeze=False)
        fig.tight_layout()
        for k in range(len(cat_inputs)):
            row, col = divmod(k, 3)
            sns.boxplot(data=data, x=cat_inputs[k], y=resid, ax=axes[row, col])
            axes[row, col].set_ylabel('Residuals')
        # Hide any unused subplots in the last row.
        for i in range(len(cat_inputs), nrows * 3):
            fig.delaxes(axes.flatten()[i])
        plt.show()
ResidualPlots(model=model, data=dfTR, num_inputs=num_inputs, cat_inputs=cat_inputs, output=output)--------------------------------------------------
Density Curve and QQ-plot of Residuals: ['x1', 'x2']
--------------------------------------------------
[<Axes: > <Axes: >]
--------------------------------------------------
Fitted Values vs Residuals:
--------------------------------------------------
--------------------------------------------------
Numerical inputs: ['x1', 'x2']
--------------------------------------------------
--------------------------------------------------
No categorical inputs exist.
--------------------------------------------------
In this our first example everything works as expected. Think of the plots that we obtain here as a model fitting goal: your job is to get the model plots look like these.
df = pd.read_csv("./3_1_data02.csv")
df.head(4)| x1 | x2 | y | |
|---|---|---|---|
| 0 | -0.667721 | 7.616991 | 1.969010 |
| 1 | -3.296181 | 3.811367 | 7.340756 |
| 2 | -6.743649 | 5.733155 | -2.591455 |
| 3 | -2.077600 | -8.420373 | -52.285824 |
output = "y"
num_inputs = ["x1", "x2"]
cat_inputs = []
inputs = num_inputs + cat_inputs
X = df[inputs]
Y = df[output]Next we create the train/test split.
XTR, XTS, YTR, YTS = train_test_split(
X, Y,
test_size = 0.2,
random_state = 1)
dfTR = pd.DataFrame(XTR, columns=inputs)
dfTR[output] = YTR
dfTS = pd.DataFrame(XTS, columns=inputs)
dfTS[output] = YTSsns.set_style("white")
sns.pairplot(dfTR, corner=True, height=1.5, aspect=1.5);The obvious non linear pattern in the Y vs X2 plot is our first warning that things are different now. The correlation between inputs however looks fine.
XTR.corr()| x1 | x2 | |
|---|---|---|
| x1 | 1.000000 | -0.017873 |
| x2 | -0.017873 | 1.000000 |
Let us first use the same formula as in the previous model.
model_Formula = " + ".join(inputs)
model_Formula'x1 + x2'
We create the pipeline and fit the model.
lm_pipeline = Pipeline([
("formula", FormulaTransformer(model_Formula)),
("regressor", StatsModelsRegressor(OLS, fit_intercept = False))])lm_pipeline.fit(dfTR[inputs], dfTR[output])
model = lm_pipeline._final_estimatorThe model’s summary looks ok, all coefficients are significant, but the adjusted \(R^2\) is lower than expected.
print(model.results_.summary()) OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.664
Model: OLS Adj. R-squared: 0.663
Method: Least Squares F-statistic: 788.4
Date: Wed, 18 Feb 2026 Prob (F-statistic): 1.29e-189
Time: 09:04:58 Log-Likelihood: -3306.9
No. Observations: 800 AIC: 6620.
Df Residuals: 797 BIC: 6634.
Df Model: 2
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept -6.9606 0.538 -12.946 0.000 -8.016 -5.905
x1 1.8640 0.095 19.595 0.000 1.677 2.051
x2 3.2072 0.092 34.883 0.000 3.027 3.388
==============================================================================
Omnibus: 115.842 Durbin-Watson: 1.941
Prob(Omnibus): 0.000 Jarque-Bera (JB): 70.131
Skew: -0.594 Prob(JB): 5.91e-16
Kurtosis: 2.168 Cond. No. 5.88
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
The residual plots in this example are the key to discovering that something is wrong with this model.
ResidualPlots(data= dfTR, model=model, num_inputs=num_inputs, cat_inputs=cat_inputs, output=output)--------------------------------------------------
Density Curve and QQ-plot of Residuals: ['x1', 'x2']
--------------------------------------------------
[<Axes: > <Axes: >]
--------------------------------------------------
Fitted Values vs Residuals:
--------------------------------------------------
--------------------------------------------------
Numerical inputs: ['x1', 'x2']
--------------------------------------------------
--------------------------------------------------
No categorical inputs exist.
--------------------------------------------------
The non normality of the residuals is the first thing that we see. But then the parabolic patterns in some of the residual plots, in particular in the x2 plot, tell us that there is still signal in the residuals waiting to be captured by our models. The remedy in this case seems clear: try adding polynomial terms in x2.
We will not repeat the EDA and proceed directly to the model formula. It only needs updating with the addition of a new term
model_Formula = model_Formula + "+ I(x2**2)"
model_Formula'x1 + x2+ I(x2**2)'
We fit this new model:
lm_pipeline = Pipeline([
("formula", FormulaTransformer(model_Formula)),
("regressor", StatsModelsRegressor(OLS, fit_intercept = False))])
lm_pipeline.fit(dfTR[inputs], dfTR[output])
model = lm_pipeline._final_estimator The first piece of good news is that the model’s summary shows that all coefficients are significant, including the quadratic term. And also check the boost in the value of the adjusted \(R^2\). This model seems a better fit for the data.
print(model.results_.summary()) OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.995
Model: OLS Adj. R-squared: 0.995
Method: Least Squares F-statistic: 4.894e+04
Date: Wed, 18 Feb 2026 Prob (F-statistic): 0.00
Time: 09:05:21 Log-Likelihood: -1654.3
No. Observations: 800 AIC: 3317.
Df Residuals: 796 BIC: 3335.
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 10.2091 0.103 98.726 0.000 10.006 10.412
x1 1.9968 0.012 165.332 0.000 1.973 2.020
x2 2.9832 0.012 254.915 0.000 2.960 3.006
I(x2 ** 2) -0.5056 0.002 -220.837 0.000 -0.510 -0.501
==============================================================================
Omnibus: 0.931 Durbin-Watson: 2.053
Prob(Omnibus): 0.628 Jarque-Bera (JB): 0.810
Skew: 0.070 Prob(JB): 0.667
Kurtosis: 3.070 Cond. No. 69.0
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
The residual plots of this second model now look as expected. The residuals seem normal, and there are no obvious patterns (no signal) remaining in the scatterplots. We consider this a good model from the statistical point of view.
ResidualPlots(data=dfTR, model=model, num_inputs=num_inputs, cat_inputs=cat_inputs, output=output)--------------------------------------------------
Density Curve and QQ-plot of Residuals: ['x1', 'x2']
--------------------------------------------------
[<Axes: > <Axes: >]
--------------------------------------------------
Fitted Values vs Residuals:
--------------------------------------------------
--------------------------------------------------
Numerical inputs: ['x1', 'x2']
--------------------------------------------------
--------------------------------------------------
No categorical inputs exist.
--------------------------------------------------
df = pd.read_csv("./3_1_data03.csv")
df.head(4)| x1 | x2 | y | |
|---|---|---|---|
| 0 | 2.333070 | 3.808496 | 23.584776 |
| 1 | 1.675955 | 1.905683 | 18.272128 |
| 2 | 0.814088 | 2.866577 | 20.441051 |
| 3 | 1.980600 | -4.210186 | 11.557789 |
Let us move quickly through the modeling steps until we reach the step where a new situation appears:
output = "y"
num_inputs = ["x1", "x2"]
cat_inputs = []
inputs = num_inputs + cat_inputs
X = df[inputs]
Y = df[output]XTR, XTS, YTR, YTS = train_test_split(
X, Y,
test_size = 0.2,
random_state = 1)
dfTR = pd.DataFrame(XTR, columns=inputs)
dfTR[output] = YTR
dfTS = pd.DataFrame(XTS, columns=inputs)
dfTS[output] = YTSsns.set_style("white")
sns.pairplot(dfTR, corner=True, height=1.5, aspect=1.5);Do you notice anything unexpected here?
The correlation between inputs is ok.
XTR.corr()| x1 | x2 | |
|---|---|---|
| x1 | 1.000000 | -0.017873 |
| x2 | -0.017873 | 1.000000 |
We use the basic all in formula as starting point.
model_Formula = " + ".join(inputs)
model_Formula'x1 + x2'
We create the pipeline and fit the model.
lm_pipeline = Pipeline([
("formula", FormulaTransformer(model_Formula)),
("regressor", StatsModelsRegressor(OLS, fit_intercept = False))])lm_pipeline.fit(dfTR[inputs], dfTR[output])
model = lm_pipeline._final_estimatorThe model’s summary again looks ok, but notice that the adjusted \(R^2\) is quite low.
print(model.results_.summary()) OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.369
Model: OLS Adj. R-squared: 0.368
Method: Least Squares F-statistic: 233.2
Date: Wed, 18 Feb 2026 Prob (F-statistic): 1.83e-80
Time: 09:05:36 Log-Likelihood: -3050.8
No. Observations: 800 AIC: 6108.
Df Residuals: 797 BIC: 6122.
Df Model: 2
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 10.0687 0.772 13.049 0.000 8.554 11.583
x1 1.9683 0.276 7.124 0.000 1.426 2.511
x2 2.7387 0.134 20.512 0.000 2.477 3.001
==============================================================================
Omnibus: 37.784 Durbin-Watson: 2.008
Prob(Omnibus): 0.000 Jarque-Bera (JB): 117.220
Skew: 0.033 Prob(JB): 3.51e-26
Kurtosis: 4.874 Cond. No. 6.33
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
The residual plots in this example are the key to discovering that something is wrong with this model.
ResidualPlots(data=dfTR, model=model, num_inputs=num_inputs, cat_inputs=cat_inputs, output=output)--------------------------------------------------
Density Curve and QQ-plot of Residuals: ['x1', 'x2']
--------------------------------------------------
[<Axes: > <Axes: >]
--------------------------------------------------
Fitted Values vs Residuals:
--------------------------------------------------
--------------------------------------------------
Numerical inputs: ['x1', 'x2']
--------------------------------------------------
--------------------------------------------------
No categorical inputs exist.
--------------------------------------------------
Again there are clear signs of non normality of the residuals. And the most striking feature appears in the residuals vs x1 plot. There is a clear wedge shape that says that the variance of the residuals depends on the values of x1. This goes directly against the assumption of equal conditional variance of the errors.
Be careful: this model can still be a good predictor of Y. But we can not use it for inference and in particular we can not trust the p-values in the model’s summary or compute confidence intervals, etc. Always keep in mind that the Machine Learning goals and the statistical goals are often not exactly the same.
df = pd.read_csv("./3_1_data04.csv")
df.head(4)| x1 | x2 | y | |
|---|---|---|---|
| 0 | 2.333070 | 3.808496 | 32.086970 |
| 1 | 1.675955 | 1.905683 | 20.262496 |
| 2 | 0.814088 | 2.866577 | 41.888154 |
| 3 | 1.980600 | -4.210186 | 6.126667 |
Again let us go quickly to the interesting part:
output = "y"
num_inputs = ["x1", "x2"]
cat_inputs = []
inputs = num_inputs + cat_inputs
X = df[inputs]
Y = df[output]XTR, XTS, YTR, YTS = train_test_split(
X, Y,
test_size = 0.2,
random_state = 1)
dfTR = pd.DataFrame(XTR, columns=inputs)
dfTR[output] = YTR
dfTS = pd.DataFrame(XTS, columns=inputs)
dfTS[output] = YTSsns.set_style("white")
sns.pairplot(dfTR, corner=True, height=1.5, aspect=1.5);Probably nothing pops out here. The correlation between inputs is also ok.
XTR.corr()| x1 | x2 | |
|---|---|---|
| x1 | 1.000000 | -0.017873 |
| x2 | -0.017873 | 1.000000 |
We use the basic all in formula as starting point.
model_Formula = " + ".join(inputs)
model_Formula'x1 + x2'
We create the pipeline and fit the model.
lm_pipeline = Pipeline([
("formula", FormulaTransformer(model_Formula)),
("regressor", StatsModelsRegressor(OLS, fit_intercept = False))])lm_pipeline.fit(dfTR[inputs], dfTR[output])
model = lm_pipeline._final_estimatorThe p-values are ok, but the low adjusted \(R^2\) is intriguing.
print(model.results_.summary()) OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.438
Model: OLS Adj. R-squared: 0.437
Method: Least Squares F-statistic: 310.5
Date: Wed, 18 Feb 2026 Prob (F-statistic): 1.96e-100
Time: 09:05:59 Log-Likelihood: -2977.1
No. Observations: 800 AIC: 5960.
Df Residuals: 797 BIC: 5974.
Df Model: 2
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 21.9059 0.704 31.130 0.000 20.525 23.287
x1 2.0598 0.252 8.175 0.000 1.565 2.554
x2 2.8836 0.122 23.682 0.000 2.645 3.123
==============================================================================
Omnibus: 281.395 Durbin-Watson: 1.982
Prob(Omnibus): 0.000 Jarque-Bera (JB): 918.216
Skew: 1.711 Prob(JB): 4.09e-200
Kurtosis: 6.979 Cond. No. 6.33
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
The residual plots in this example are the key to discovering that something is wrong with this model.
ResidualPlots(data=dfTR, model=model, num_inputs=num_inputs, cat_inputs=cat_inputs, output=output)--------------------------------------------------
Density Curve and QQ-plot of Residuals: ['x1', 'x2']
--------------------------------------------------
[<Axes: > <Axes: >]
--------------------------------------------------
Fitted Values vs Residuals:
--------------------------------------------------
--------------------------------------------------
Numerical inputs: ['x1', 'x2']
--------------------------------------------------
--------------------------------------------------
No categorical inputs exist.
--------------------------------------------------
In this case the non normality of the residuals is the culprit of the problems we see in the plots. The distribution of the residuals seems skewed (non symmetrical) in most of them.
The caveats we made before about models like this being statistically flawed but possible good predictors of Y still hold here.
df = pd.read_csv("./3_1_data05.csv")
df.head(4)| x1 | x2 | x3 | y | |
|---|---|---|---|---|
| 0 | 2.333070 | 3.808496 | 8.206014 | 22.750828 |
| 1 | 1.675955 | 1.905683 | 5.138731 | 17.417961 |
| 2 | 0.814088 | 2.866577 | 4.560207 | 16.625784 |
| 3 | 1.980600 | -4.210186 | 1.041929 | 4.490612 |
# Target and input column names for this dataset (3_1_data05.csv).
output = "y"
num_inputs = ["x1", "x2", "x3"]
cat_inputs = []  # no categorical inputs in this example
inputs = num_inputs + cat_inputs
# Feature matrix used for the train/test split below
X = df[inputs]
Y = df[output]XTR, XTS, YTR, YTS = train_test_split(
X, Y,
test_size = 0.2,
random_state = 1)
dfTR = pd.DataFrame(XTR, columns=inputs)
dfTR[output] = YTR
dfTS = pd.DataFrame(XTS, columns=inputs)
dfTS[output] = YTSsns.set_style("white")
sns.pairplot(dfTR, corner=True, height=1.5, aspect=1.5);In this example EDA already indicates the problem, but that will not always be the case.
XTR.corr()| x1 | x2 | x3 | |
|---|---|---|---|
| x1 | 1.000000 | -0.017873 | 0.672476 |
| x2 | -0.017873 | 1.000000 | 0.689660 |
| x3 | 0.672476 | 0.689660 | 1.000000 |
As usual
model_Formula = " + ".join(inputs)
model_Formula'x1 + x2 + x3'
Pipeline and fit
lm_pipeline = Pipeline([
("formula", FormulaTransformer(model_Formula)),
("regressor", StatsModelsRegressor(OLS, fit_intercept = False))])lm_pipeline.fit(XTR, YTR)
model = lm_pipeline._final_estimatorNote that the p-values and \(R^2\) all look ok. But looking back at the EDA pairplot, how come that the x3 coefficient is negative?
print(model.results_.summary()) OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.976
Model: OLS Adj. R-squared: 0.976
Method: Least Squares F-statistic: 1.076e+04
Date: Wed, 18 Feb 2026 Prob (F-statistic): 0.00
Time: 09:06:21 Log-Likelihood: -1147.9
No. Observations: 800 AIC: 2304.
Df Residuals: 796 BIC: 2323.
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 10.0200 0.072 140.006 0.000 9.880 10.161
x1 4.0574 0.079 51.410 0.000 3.902 4.212
x2 3.0287 0.039 77.696 0.000 2.952 3.105
x3 -1.0469 0.038 -27.918 0.000 -1.121 -0.973
==============================================================================
Omnibus: 4.235 Durbin-Watson: 2.077
Prob(Omnibus): 0.120 Jarque-Bera (JB): 4.202
Skew: 0.132 Prob(JB): 0.122
Kurtosis: 3.237 Cond. No. 18.0
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
The residual plots also look perfectly well in this model.
ResidualPlots(data = XTR, model=model, num_inputs=num_inputs, cat_inputs=cat_inputs, output=output)--------------------------------------------------
Density Curve and QQ-plot of Residuals: ['x1', 'x2', 'x3']
--------------------------------------------------
[<Axes: > <Axes: >]
--------------------------------------------------
Fitted Values vs Residuals:
--------------------------------------------------
--------------------------------------------------
Numerical inputs: ['x1', 'x2', 'x3']
--------------------------------------------------
--------------------------------------------------
No categorical inputs exist.
--------------------------------------------------
Now let us examine the VIFS for this model. Remember that a VIF > 5 is considered a signal of multicollinearity.
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Variance Inflation Factor (VIF) of each input, computed on the TRAINING
# data; VIF > 5 is the usual warning signal for multicollinearity.
vif_df = pd.DataFrame()
# Take the variable labels from dfTR too, so labels and VIF values come
# from the same frame (the original mixed df and dfTR).
vif_df["Variable"] = dfTR[inputs].columns
# calculating VIF for each feature
vif_df["VIF"] = [variance_inflation_factor(dfTR[inputs].values, i)
                 for i in range(len(dfTR[inputs].columns))]
vif_df| Variable | VIF | |
|---|---|---|
| 0 | x1 | 34.832504 |
| 1 | x2 | 9.974203 |
| 2 | x3 | 41.052912 |
As you can see, these are clear indicators of multicollinearity. However, the predictive performance is not hurt by this, as the following train, validation and test score comparisons show.
## Create dataset to store model predictions
# Evaluation copies of the train/test feature frames with the true target
# attached; the model predictions are appended to these frames afterwards.
dfTR_eval = XTR.copy()
dfTR_eval[output] = YTR
dfTS_eval = XTS.copy()
dfTS_eval[output] = YTSdfTR_eval['model_pred'] = lm_pipeline.predict(XTR)
dfTS_eval['model_pred'] = lm_pipeline.predict(XTS)Now let us compute the RMSE score in training, validation and test:
np.sqrt(mean_squared_error(YTR, dfTR_eval["model_pred"]))np.float64(1.016091274761776)
- cross_val_score(lm_pipeline, XTR, YTR, cv=10, scoring="neg_root_mean_squared_error")array([1.08376295, 1.01528054, 1.01276996, 1.04037752, 0.98126073,
1.0291144 , 0.98930486, 1.04502459, 1.07973088, 0.93764737])
np.sqrt(mean_squared_error(YTS, dfTS_eval["model_pred"]))np.float64(0.9559781249766734)
As we see, the scores in train, validation and test are coherent.
In order to fight multicollinearity we will fit a second model removing the variable with the highest VIF value, in this case x3 (if two variables have very similar VIFs you may keep e.g. the most easily interpretable one).
# Drop x3 (the input with the highest VIF) and keep the remaining inputs.
inputs = ["x1", "x2"]
inputs['x1', 'x2']
model_Formula = " + ".join(inputs)
model_Formula'x1 + x2'
We create the pipeline and fit the model.
lm_pipeline = Pipeline([
("formula", FormulaTransformer(model_Formula)),
("regressor", StatsModelsRegressor(OLS, fit_intercept = False))])lm_pipeline.fit(dfTR[inputs], dfTR[output])
model = lm_pipeline._final_estimatorEverything looks good here:
print(model.results_.summary()) OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.952
Model: OLS Adj. R-squared: 0.952
Method: Least Squares F-statistic: 7969.
Date: Wed, 18 Feb 2026 Prob (F-statistic): 0.00
Time: 09:06:38 Log-Likelihood: -1421.0
No. Observations: 800 AIC: 2848.
Df Residuals: 797 BIC: 2862.
Df Model: 2
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 9.9864 0.101 99.260 0.000 9.789 10.184
x1 1.9733 0.036 54.781 0.000 1.903 2.044
x2 1.9968 0.017 114.703 0.000 1.963 2.031
==============================================================================
Omnibus: 1.824 Durbin-Watson: 2.107
Prob(Omnibus): 0.402 Jarque-Bera (JB): 1.745
Skew: -0.046 Prob(JB): 0.418
Kurtosis: 3.210 Cond. No. 6.33
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Note the changes in the model coefficients for the remaining variables. Even the sign has changed! Let us see the new VIFs:
# Recompute the VIFs on the training data for the reduced model.
X_vif = dfTR[inputs]
vif_df = pd.DataFrame({
    "Variable": X_vif.columns,
    # one VIF per input column
    "VIF": [variance_inflation_factor(X_vif.values, col_idx)
            for col_idx in range(X_vif.shape[1])],
})
vif_df| Variable | VIF | |
|---|---|---|
| 0 | x1 | 1.006232 |
| 1 | x2 | 1.006232 |
As you can see, the VIF values are much more moderate in this second model. This makes the coefficient estimates more reliable (the true model in this example is y = 10 + 2 x1 + 2 x2).
df = pd.read_csv("./3_1_data06.csv")
df.head(4)| x1 | x2 | y | |
|---|---|---|---|
| 0 | 2.333070 | A | 23.870012 |
| 1 | 1.675955 | B | 17.088453 |
| 2 | 0.814088 | A | 11.796125 |
| 3 | 1.980600 | B | 22.636752 |
As we can see, x2 is a factor encoded with non numeric labels. This is the easiest case as our code will be able to recognize this automatically and build the right design matrix. We store the names for the inputs and make the train/test split as in previous models:
# Target and input column names; x2 is a string-labelled categorical input.
output = "y"
num_inputs = ["x1" ]
cat_inputs = ["x2"]
inputs = num_inputs + cat_inputs
X = df[inputs]
Y = df[output]
# Hold out 20% of the rows for testing (fixed seed for reproducibility).
XTR, XTS, YTR, YTS = train_test_split(
    X, Y,
    test_size = 0.2,
    random_state = 1)
# Training/test frames that also carry the target column.
dfTR = pd.DataFrame(XTR, columns=inputs)
dfTR[output] = YTR
dfTS = pd.DataFrame(XTS, columns=inputs)
dfTS[output] = YTSWith a categorical input it is usually a good idea to incorporate its levels into the pairplots. Note, for example, the vertical arrangement of colors in the Y vs x1 scatterplot.
sns.set_style("white")
plt_df = XTR.copy()
plt_df["Y"] = YTR
sns.pairplot(plt_df, corner=True, height=1.5, aspect=1.5, hue="x2");In this example if we use a basic formula incorporating all inputs
model_Formula = " + ".join(inputs)
model_Formula'x1 + x2'
then the design matrix will automatically include a column with the right encoding of the categorical input (read the warning!). Let us see how:
# patsy recognizes the string-valued x2 and dummy-codes it automatically.
dmY, dmX = ps.dmatrices("y ~ " + model_Formula, df)
dmX[:4, :]array([[1. , 0. , 2.33306972],
[1. , 1. , 1.67595477],
[1. , 0. , 0.81408781],
[1. , 1. , 1.98060009]])
The following steps are familiar. We create the pipeline and fit the model.
lm_pipeline = Pipeline([
("formula", FormulaTransformer(model_Formula)),
("regressor", StatsModelsRegressor(OLS, fit_intercept = False))])lm_pipeline.fit(XTR, YTR)
model = lm_pipeline._final_estimatorThe rest of the assessment of the model follows.
print(model.results_.summary()) OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.784
Model: OLS Adj. R-squared: 0.784
Method: Least Squares F-statistic: 1449.
Date: Wed, 18 Feb 2026 Prob (F-statistic): 3.60e-266
Time: 09:07:01 Log-Likelihood: -2024.7
No. Observations: 800 AIC: 4055.
Df Residuals: 797 BIC: 4069.
Df Model: 2
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 11.0499 0.243 45.408 0.000 10.572 11.528
x2[T.B] -3.0235 0.216 -14.026 0.000 -3.447 -2.600
x1 3.9377 0.077 51.364 0.000 3.787 4.088
==============================================================================
Omnibus: 1.980 Durbin-Watson: 1.930
Prob(Omnibus): 0.372 Jarque-Bera (JB): 1.949
Skew: 0.071 Prob(JB): 0.377
Kurtosis: 2.805 Cond. No. 7.80
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Note that the name of the coefficient for the categorical input (x2[T.B]) indicates the level being contrasted against the reference level (here A). Here we are using a model with:
\[f(x_1, x_2) = \beta_0 + \beta_1 x_1 + \beta_2 x_2^{enc}\]
Here \(x_2^{enc}\) corresponds to the encoding of \(x_2\) in the design matrix, which is 1 when \(x_2 = B\) and 0 when \(x_2 = A\). In particular that means that the above equation can be interpreted as a straight line in \(x_1\) where the intercept depends on the level of \(x_2\): it equals \(\beta_0\) for \(x_2 = A\) (reference level) and \(\beta_0 + \beta_2\) for \(x_2 = B\). After defining the evaluation datasets we can plot a regression line for each level of x2 to illustrate this.
dfTR_eval = XTR.copy()
dfTR_eval[output] = YTR
dfTS_eval = XTS.copy()
dfTS_eval[output] = YTSsns.lmplot(data=dfTR, x="x1", y="y", hue="x2", scatter_kws={'alpha':0.25});The residual plots for this model now include parallel boxplots of the residuals across different levels of the categorical input(s). There should be no significant difference in the boxplots for each level, as in this example.
ResidualPlots(data = dfTR, model=model, num_inputs=num_inputs, cat_inputs=cat_inputs, output=output)--------------------------------------------------
Density Curve and QQ-plot of Residuals: ['x1']
--------------------------------------------------
[<Axes: > <Axes: >]
--------------------------------------------------
Fitted Values vs Residuals:
--------------------------------------------------
--------------------------------------------------
Numerical inputs: ['x1']
--------------------------------------------------
--------------------------------------------------
Categorical inputs: ['x2']
--------------------------------------------------
df = pd.read_csv("./3_1_data07.csv")
df.head(8)| x1 | x2 | y | |
|---|---|---|---|
| 0 | 2.333070 | 1 | 22.866299 |
| 1 | 1.675955 | 2 | 22.709764 |
| 2 | 0.814088 | 3 | 16.807665 |
| 3 | 1.980600 | 4 | 30.362348 |
| 4 | 0.151959 | 1 | 13.194814 |
| 5 | 0.604424 | 2 | 15.083150 |
| 6 | 2.130828 | 3 | 27.724743 |
| 7 | 3.089289 | 4 | 36.682985 |
Here, if we simply run the next two cells as in the previous example:
# Reuses `inputs` (['x1', 'x2']) from the previous section, so the
# integer-coded x2 will NOT be flagged as categorical here.
model_Formula = " + ".join(inputs)
model_Formula'x1 + x2'
then the design matrix will include a single column for the categorical input, and it will not be really considered as such.
# Without C(), the integer column x2 enters the design matrix as plain numeric.
dmY, dmX = ps.dmatrices("y ~ " + model_Formula, df)
dmX[:4, :]array([[1. , 2.33306972, 1. ],
[1. , 1.67595477, 2. ],
[1. , 0.81408781, 3. ],
[1. , 1.98060009, 4. ]])
In order to get the right encoding of categorical inputs we need to be more specific when writing the model formula:
# C(x2) tells patsy to treat the integer-coded x2 as a categorical factor.
model_Formula = "x1 + C(x2)"
model_Formula'x1 + C(x2)'
Now the design matrix correctly encodes the categorical input in three columns, leaving the first level of x2 as reference level. Again we insist this is not the same as one hot encoding (which would include all four columns).
# With C(x2) the design matrix gets dummy columns for levels 2-4
# (level 1 is the reference level and gets no column).
dmY, dmX = ps.dmatrices("y ~ " + model_Formula, df)
dmX[:8, :]array([[1. , 0. , 0. , 0. , 2.33306972],
[1. , 1. , 0. , 0. , 1.67595477],
[1. , 0. , 1. , 0. , 0.81408781],
[1. , 0. , 0. , 1. , 1.98060009],
[1. , 0. , 0. , 0. , 0.15195865],
[1. , 1. , 0. , 0. , 0.60442435],
[1. , 0. , 1. , 0. , 2.13082829],
[1. , 0. , 0. , 1. , 3.0892894 ]])
df = pd.read_csv("./3_1_data08.csv")
df.head(4)| x1 | x2 | y | |
|---|---|---|---|
| 0 | 1.058582 | B | 0.311131 |
| 1 | 1.683179 | B | -3.298882 |
| 2 | 2.241449 | A | -0.489514 |
| 3 | 3.509912 | B | -3.546480 |
Let us define the split and the associated names:
# Target and input column names for 3_1_data08.csv; x2 is categorical.
output = "y"
num_inputs = ["x1" ]
cat_inputs = ["x2"]
inputs = num_inputs + cat_inputs
X = df[inputs]
Y = df[output]
# 80/20 train/test split; shuffle=True is the default, made explicit here.
XTR, XTS, YTR, YTS = train_test_split(
    X, Y,
    test_size = 0.2,
    random_state = 1,
    shuffle=True)
# Training/test frames that also carry the target column.
dfTR = pd.DataFrame(XTR, columns=inputs)
dfTR[output] = YTR
dfTS = pd.DataFrame(XTS, columns=inputs)
dfTS[output] = YTSAn exploratory plot like the one we did before reveals the difference between this and the preceding situation:
sns.lmplot(data=dfTR, x="x1", y="y", hue="x2", scatter_kws={'alpha':0.25});The pairplot using x2 for color sends a similar warning.
sns.set_style("white")
plt_df = XTR.copy()
plt_df["Y"] = YTR
sns.pairplot(plt_df, corner=True, height=1.5, aspect=1.5, hue="x2");model_Formula = " * ".join(inputs)
model_Formula
dmY, dmX = ps.dmatrices("y ~ " + model_Formula, df)
dmX[:8, :]array([[1. , 1. , 1.05858222, 1.05858222],
[1. , 1. , 1.68317947, 1.68317947],
[1. , 0. , 2.24144939, 0. ],
[1. , 1. , 3.50991229, 3.50991229],
[1. , 0. , 4.93437 , 0. ],
[1. , 0. , 0.60442435, 0. ],
[1. , 1. , 4.32156963, 4.32156963],
[1. , 0. , 2.96544222, 0. ]])