Hide the code
%cd 2_5_EnsembleMethods_Boosting/wd/2_5_EnsembleMethods_Boosting
Machine Learning
MLMIIN repository folder? If you have missed any of these steps, you may need to restart VS Code after completing them.
Also if Python seems unresponsive at first, try restarting the kernel.
%cd 2_5_EnsembleMethods_Boosting/wd/2_5_EnsembleMethods_Boosting
Let us begin by loading the basic libraries.
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as col
import seaborn as sns
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# This is a custom module that contains some plotting utilities, the source code is in the file
# plot_utils.py in the same directory as this notebook
from plot_utils import plot_2d_data, plot_2d_classifier
from sklearn.datasets import make_moons
from sklearn.datasets import make_circles
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
We use the make_classification function to build the example. This is a utility function that you will use when you need to test your ideas, benchmark methods, etc.
N = 1000
from sklearn.datasets import make_classification
X, Y = make_classification(n_classes=2, n_samples=N, n_informative=5, random_state=1)inputs = ["X" + str(k) for k in range(X.shape[1])]
output = "Y"
df = pd.DataFrame(X, columns = inputs)
df[output] = Y
df.iloc[:,:12].head()| X0 | X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 | X11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -3.327193 | 0.832043 | 1.316756 | -0.660973 | 0.597524 | 2.027022 | -0.159786 | -2.702438 | 0.984458 | 1.116811 | -1.251926 | 0.202324 |
| 1 | 1.685766 | -0.403897 | -0.357680 | 0.172492 | -0.639168 | -1.151757 | -1.219221 | -1.012974 | 0.689497 | 0.177965 | -0.785904 | 1.199356 |
| 2 | 2.140472 | 0.377846 | -0.969882 | -0.366210 | 0.106640 | -0.595217 | -1.234516 | -0.813707 | 0.702014 | -2.371404 | -0.683127 | 0.188425 |
| 3 | 0.344467 | -0.698053 | 2.175077 | 0.426490 | -0.703525 | -0.436798 | -0.757212 | 2.192373 | 0.022481 | -0.623113 | -1.003478 | -0.160083 |
| 4 | -2.148952 | -0.387040 | 3.550959 | -1.488384 | 2.618837 | -0.323193 | -0.860368 | -4.625203 | 0.015528 | 0.705384 | -1.093792 | -0.698836 |
from sklearn.model_selection import train_test_split
XTR, XTS, YTR, YTS = train_test_split(df[inputs], df[output],
test_size=0.2, # percentage preserved as test data
random_state=1, # seed for replication
stratify = df[output]) # Preserves distribution of yn_trees = 2500
md = 6Bagging with a fixed base classifier is easily accomplished with BaggingClassifier. All the parameters of the BagggingClassifier have self describing names, except for the oob_score parameter. We will explain what this score means below.
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
BagDT = BaggingClassifier(estimator= DecisionTreeClassifier(
max_depth=md),
n_estimators=n_trees,
oob_score=True,
random_state=1)Fitting the model and getting the scores follows the usual steps:
BagDT.fit(XTR, YTR)BaggingClassifier(estimator=DecisionTreeClassifier(max_depth=6),
n_estimators=2500, oob_score=True, random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. | estimator | DecisionTreeC...r(max_depth=6) | |
| n_estimators | 2500 | |
| max_samples | 1.0 | |
| max_features | 1.0 | |
| bootstrap | True | |
| bootstrap_features | False | |
| oob_score | True | |
| warm_start | False | |
| n_jobs | None | |
| random_state | 1 | |
| verbose | 0 |
DecisionTreeClassifier(max_depth=6)
| criterion | 'gini' | |
| splitter | 'best' | |
| max_depth | 6 | |
| min_samples_split | 2 | |
| min_samples_leaf | 1 | |
| min_weight_fraction_leaf | 0.0 | |
| max_features | None | |
| random_state | None | |
| max_leaf_nodes | None | |
| min_impurity_decrease | 0.0 | |
| class_weight | None | |
| ccp_alpha | 0.0 | |
| monotonic_cst | None |
BagDT.score(XTR, YTR), BagDT.score(XTS, YTS), BagDT.oob_score_(0.9775, 0.875, 0.8875)
# Create the train/test prediction DataFrames only if they do not already
# exist in the notebook's global namespace (avoids clobbering them on re-run).
check_datasets = ['dfTR' in globals(), 'dfTR_eval' in globals(), 'dfTS' in globals(), 'dfTS_eval' in globals()]
print('Checking existence of dfTR, dfTR_eval and dfTS, dfTS_eval:', check_datasets)
if not('dfTR' in globals()):
    # Dataset for Training Predictions (features plus target column)
    dfTR = XTR.copy()
    dfTR['Y'] = YTR
    print("Created dfTR")
if not('dfTS' in globals()):
    # Dataset for Test Predictions (features plus target column)
    dfTS = XTS.copy()
    dfTS['Y'] = YTS
    print("Created dfTS")
if not('dfTR_eval' in globals()):
    # Copy of dfTR that will accumulate per-model prediction columns
    dfTR_eval = dfTR.copy()
    print("Created dfTR_eval")
if not('dfTS_eval' in globals()):
# Dataset for Training Predictions
dfTS_eval = dfTS.copy()
print("Created dfTS_eval")Checking existence of dfTR, dfTR_eval and dfTS, dfTS_eval: [False, False, False, False]
Created dfTR
Created dfTS
Created dfTR_eval
Created dfTS_eval
model = BagDT
model_name = "BagDT"# Store the actual predictions
newCol = 'Y_'+ model_name +'_prob_neg';
dfTR_eval.insert(loc=dfTR_eval.shape[1], column=newCol, value=model.predict_proba(XTR)[:, 0])
newCol = 'Y_'+ model_name +'_prob_pos';
dfTR_eval.insert(loc=dfTR_eval.shape[1], column=newCol, value=model.predict_proba(XTR)[:, 1])
dfTR_eval[newCol] = model.predict_proba(XTR)[:, 1]
newCol = 'Y_'+ model_name +'_pred';
dfTR_eval.insert(loc=dfTR_eval.shape[1], column=newCol, value=model.predict(XTR))dfTR_eval.filter(like="BagDT", axis=1).head()| Y_BagDT_prob_neg | Y_BagDT_prob_pos | Y_BagDT_pred | |
|---|---|---|---|
| 31 | 0.462173 | 0.537827 | 1 |
| 856 | 0.032527 | 0.967473 | 1 |
| 802 | 0.660871 | 0.339129 | 0 |
| 205 | 0.967945 | 0.032055 | 0 |
| 680 | 0.915283 | 0.084717 | 0 |
We do the same for the test set (this time we do not use insert):
# Test predictions dataset
dfTS_eval = XTS.copy()
dfTS_eval['Y'] = YTS
newCol = 'Y_'+ model_name +'_prob_neg';
dfTS_eval[newCol] = model.predict_proba(XTS)[:, 0]
newCol = 'Y_'+ model_name +'_prob_pos';
dfTS_eval[newCol] = model.predict_proba(XTS)[:, 1]
newCol = 'Y_'+ model_name +'_pred';
dfTS_eval[newCol] = model.predict(XTS)modelDict = {"BagDT" : {"model" : model, "inputs" : inputs}}#%load "../exclude/MLMIINprv/exercises/2_5_Exercise001.py"
# %run -i "../exclude/MLMIINprv/exercises/2_5_Exercise001.py"For example, the first decision tree in our bag of trees is obtained with:
BagDT.estimators_[0]DecisionTreeClassifier(max_depth=6, random_state=1028862084)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
| criterion | 'gini' | |
| splitter | 'best' | |
| max_depth | 6 | |
| min_samples_split | 2 | |
| min_samples_leaf | 1 | |
| min_weight_fraction_leaf | 0.0 | |
| max_features | None | |
| random_state | 1028862084 | |
| max_leaf_nodes | None | |
| min_impurity_decrease | 0.0 | |
| class_weight | None | |
| ccp_alpha | 0.0 | |
| monotonic_cst | None |
This tree was trained using a subsample with indices which you can see here (only the first few are shown).
BagDT.estimators_samples_[0][0:50]array([132, 220, 266, 791, 594, 249, 300, 365, 292, 74, 244, 316, 472,
363, 770, 191, 615, 65, 499, 53, 466, 412, 670, 228, 413, 60,
13, 487, 331, 83, 721, 376, 69, 33, 345, 86, 303, 359, 549,
412, 742, 387, 344, 774, 129, 643, 465, 395, 488, 612])
# %load "../exclude/MLMIINprv/exercises/2_5_Exercise002.py"
# %run -i "../exclude/MLMIINprv/exercises/2_5_Exercise002.py"BagDT.score(XTR, YTR), BagDT.score(XTS, YTS), BagDT.oob_score_(0.9775, 0.875, 0.8875)
from sklearn.tree import plot_tree
fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(12, 16), dpi = 300)
for i in range(12):
DT = BagDT.estimators_[i]
plot_tree(DT, max_depth=2, ax=axes[i//3, i%3])It feels as if we were aiming for this jungle-like rich diversity
.
but we stumbled upon this not at all diverse plantation of quasi-identical trees
We create an array of the right size beforehand and use it to store the probabilities.
test_bag_probs_array = np.zeros([YTS.shape[0], n_trees])
for i in range(n_trees):
probs_i = BagDT.estimators_[i].predict_proba(XTS.values)[:,1]
test_bag_probs_array[:, i] = probs_iThese is what the first ones look like:
test_bag_probs_array[0:5, 0:5]array([[0.01149425, 0. , 0. , 0. , 0. ],
[1. , 0.98760331, 1. , 1. , 0.99530516],
[0. , 0. , 0. , 0. , 0. ],
[0.88235294, 0.98760331, 0.89830508, 0.82882883, 0.95698925],
[0.88235294, 0.98760331, 0.89830508, 1. , 0.99530516]])
# %load "../exclude/MLMIINprv/exercises/2_5_Exercise004.py"
test_bag_probs_array.shape
# A row represents the predictions of the 2500 trees for a single observation
# A column represents the predictions of a single tree for all 200 test set observations
pd.DataFrame(test_bag_probs_array, columns=["T"+ str(i) for i in range(n_trees) ]).corr().iloc[0:10, 0:10]
print(f'The typical correlation between the predictions for two of these trees is {pd.DataFrame(test_bag_probs_array, columns=["T"+ str(i) for i in range(n_trees) ]).corr().median().median():.{2}f}')The typical correlation between the predictions for two of these trees is 0.65
# %load "../exclude/MLMIINprv/exercises/2_5_Exercise007.py"
# %run -i "../exclude/MLMIINprv/exercises/2_5_Exercise007.py"from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=n_trees,
max_depth=md,
max_features="sqrt",
random_state=1,
n_jobs=-1)Let us fit the model.
RF.fit(XTR, YTR)RandomForestClassifier(max_depth=6, n_estimators=2500, n_jobs=-1,
random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. | n_estimators | 2500 | |
| criterion | 'gini' | |
| max_depth | 6 | |
| min_samples_split | 2 | |
| min_samples_leaf | 1 | |
| min_weight_fraction_leaf | 0.0 | |
| max_features | 'sqrt' | |
| max_leaf_nodes | None | |
| min_impurity_decrease | 0.0 | |
| bootstrap | True | |
| oob_score | False | |
| n_jobs | -1 | |
| random_state | 1 | |
| verbose | 0 | |
| warm_start | False | |
| class_weight | None | |
| ccp_alpha | 0.0 | |
| max_samples | None | |
| monotonic_cst | None |
The scores in training and test are:
RF.score(XTR, YTR), RF.score(XTS, YTS)(0.96125, 0.86)
These do not seem to be a major improvement from bagging.
BagDT.score(XTR, YTR), BagDT.score(XTS, YTS)(0.9775, 0.875)
fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(12, 16), dpi = 300)
for i in range(12):
# fig, axes = plt.subplots(figsize=(3, 4), dpi = 300)
DT = RF.estimators_[i]
plot_tree(DT, max_depth=1, ax=axes[i//3, i%3])test_RF_probs_array = np.zeros([YTS.shape[0], n_trees])
for i in range(n_trees):
probs_i = RF.estimators_[i].predict_proba(XTS.values)[:,1]
test_RF_probs_array[:, i] = probs_itest_RF_probs_array[0:5, 0:5]array([[0.1025641 , 1. , 0. , 0.01980198, 0. ],
[1. , 1. , 1. , 0.63265306, 1. ],
[0.17073171, 0.84552846, 0. , 0. , 0. ],
[0.96551724, 0.84552846, 0.94615385, 0.01980198, 0.54166667],
[0. , 1. , 1. , 0.95454545, 1. ]])
And the correlation matrix confirms that we have succeeded in decreasing the correlations significantly.
pd.DataFrame(test_RF_probs_array, columns=["T"+ str(i) for i in range(n_trees) ]).corr().iloc[0:10, 0:10]
print(f'The typical correlation between the predictions for two of these trees is {pd.DataFrame(test_RF_probs_array, columns=["T"+ str(i) for i in range(n_trees) ]).corr().median().median():.{2}f}')The typical correlation between the predictions for two of these trees is 0.49
We use the minimum number of samples that a terminal node should contain and the total depth of the tree as hyperparameters. We include a special use case of the ‘n_estimators’ parameter (the number of trees in the forest) in which we provide a single, fixed value of that hyperparameter. By commenting and uncommenting lines in the cell below you can explore the impact of the number of trees in the performance of the random forest.
hyp_grid = {'RF__min_samples_leaf':range(5, 8),
# 'RF__n_estimators': range(100, 1001, 100),
'RF__max_depth': range(1, 6)}RF = RandomForestClassifier(random_state=1, max_features="sqrt", n_estimators=500)
from sklearn.pipeline import Pipeline
RF_pipe = Pipeline(steps=[('RF', RF)])
num_folds = 10
from sklearn.model_selection import GridSearchCV
RF_gridCV = GridSearchCV(estimator=RF_pipe,
param_grid=hyp_grid,
cv=num_folds,
return_train_score=True,
n_jobs=-1)Give the model a name and fit it.
model_name = "RF"
model = RF_gridCV
model.fit(XTR, YTR)GridSearchCV(cv=10,
estimator=Pipeline(steps=[('RF',
RandomForestClassifier(n_estimators=500,
random_state=1))]),
n_jobs=-1,
param_grid={'RF__max_depth': range(1, 6),
'RF__min_samples_leaf': range(5, 8)},
return_train_score=True)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. | estimator | Pipeline(step...om_state=1))]) | |
| param_grid | {'RF__max_depth': range(1, 6), 'RF__min_samples_leaf': range(5, 8)} | |
| scoring | None | |
| n_jobs | -1 | |
| refit | True | |
| cv | 10 | |
| verbose | 0 | |
| pre_dispatch | '2*n_jobs' | |
| error_score | nan | |
| return_train_score | True |
| n_estimators | 500 | |
| criterion | 'gini' | |
| max_depth | 5 | |
| min_samples_split | 2 | |
| min_samples_leaf | 6 | |
| min_weight_fraction_leaf | 0.0 | |
| max_features | 'sqrt' | |
| max_leaf_nodes | None | |
| min_impurity_decrease | 0.0 | |
| bootstrap | True | |
| oob_score | False | |
| n_jobs | None | |
| random_state | 1 | |
| verbose | 0 | |
| warm_start | False | |
| class_weight | None | |
| ccp_alpha | 0.0 | |
| max_samples | None | |
| monotonic_cst | None |
modelDict[model_name] = {"model" : model, "inputs" : inputs}model.score(XTR, YTR), model.score(XTS, YTS)(0.93, 0.87)
BagDT.score(XTR, YTR), BagDT.score(XTS, YTS)(0.9775, 0.875)
print(hyp_grid)
for param in hyp_grid.keys():
print(f' Best value for {param} = {model.best_params_[param]}'){'RF__min_samples_leaf': range(5, 8), 'RF__max_depth': range(1, 6)}
Best value for RF__min_samples_leaf = 6
Best value for RF__max_depth = 5
The following script plots a graph that illustrates the hyperparameter search behind this choice.
%run -i "./2_5_GridSearch_Plot.py"The diagram shows that the highest value of accuracy was obtained when considering the deepest trees in the grid. Therefore we may suspect that growing deeper trees can lead to further accuracy improvements. But we have to keep in mind the risk of overfitting that comes with deeper trees. In order to keep it in check we should probably increase the number of estimators, which would in turn result in higher training times. Hyperparameter tuning is always a tradeoff of several aspects of modeling.
# %load "../exclude/MLMIINprv/exercises/2_5_Exercise008.py"
# %run -i "../exclude/MLMIINprv/exercises/2_5_Exercise008.py"var_importances = pd.DataFrame({'var':XTR.columns,
'importance': model.best_estimator_.named_steps["RF"].feature_importances_}
).sort_values(by="importance", ascending = False).set_index('var')
var_importances| importance | |
|---|---|
| var | |
| X0 | 0.281304 |
| X16 | 0.138379 |
| X18 | 0.129207 |
| X17 | 0.119175 |
| X7 | 0.112898 |
| X14 | 0.064446 |
| X2 | 0.036064 |
| X1 | 0.014357 |
| X6 | 0.011714 |
| X3 | 0.010854 |
| X15 | 0.010737 |
| X11 | 0.009104 |
| X5 | 0.009035 |
| X13 | 0.008465 |
| X12 | 0.007889 |
| X19 | 0.007722 |
| X8 | 0.007435 |
| X10 | 0.007238 |
| X4 | 0.007041 |
| X9 | 0.006937 |
# %load "../exclude/MLMIINprv/exercises/2_5_Exercise009.py"
# %run -i "../exclude/MLMIINprv/exercises/2_5_Exercise009.py"# %run -i "../exclude/MLMIINprv/exercises/2_5_Exercise010.py"
# %load "../exclude/MLMIINprv/exercises/2_5_Exercise010.py"2026-02-10 Reached this point
\(\quad\)
Let us begin by loading the basic libraries.
In Boosting methods (and in some other Machine Learning methods) it is customary to use 1 and -1 as the target values for binary classification problems. This is because these methods use the sign of the predictions to classify the samples (you may think that 0 defines the decision boundary that separates the two classes; this is not entirely true, as we will see later, but it is a useful guiding principle).
This is the reason why we convert the 0,1 labels in our dataset to 1,-1 in the following code.
# Toy two-moons dataset for the hand-rolled AdaBoost demonstration below.
X, y = make_moons(n_samples=100, noise=0.05, random_state=13)
y = 2 * y - 1 # Convert labels from [0, 1] to [-1, 1]
n_samples, n_features = X.shape
ensemble = [] # Initialize an empty ensemble
# cmap = cm.get_cmap('Blues')
# Two shades of blue (one per class) sampled from the 'Blues' colormap.
cmap = mpl.colormaps.get_cmap('Blues')
colors = cmap(np.linspace(0, 0.5, num=2))
fig, ax = plt.subplots(1, 1, figsize=(12, 6))
# Negative class plotted as circles, positive class as squares.
ax.scatter(X[y <= 0, 0], X[y <= 0, 1], marker='o', c=col.rgb2hex(colors[0]), edgecolors='k', alpha=0.5)
ax.scatter(X[y > 0, 0], X[y > 0, 1], marker='s', c=col.rgb2hex(colors[1]), edgecolors='k', alpha=0.5)
ax.set_title('Initial classification problem for Adaboost')
# Hide the ticks; the axis scale is irrelevant for this illustration.
ax.set_xticks([])
ax.set_yticks([])
plt.show();plt.close()In this example we will train an AdaBoost ensemble model with only three stumps, using sklearn. Keep in mind that in practice you will use more stumps, and you will need to tune the hyperparameters of the model. Also it is worth mentioning that the AdaBoostClassifier in sklearn uses decision trees as weak learners by default.
n_estimators = 3
h = DecisionTreeClassifier(max_depth=1) # Initialize a decision tree stumpNext we create an array to store the weights assigned by the Adaboost iterations to the samples. Initially the weight of all samples is the same, \(1/n\), where \(n\) is the number of samples in the training set. We also create an empty list to add the (weak learners) stumps as we train them.
D = np.ones((n_samples, ))
D = D / np.sum(D)
ensemble = []h.fit(X, y, sample_weight=D)
ypred = h.predict(X)
e = 1 - accuracy_score(y, ypred, sample_weight=D) # Weighted error of the weak learnerThe stump is also assigned a (collective) weight of its own. Then the stump and its weight are added to the ensemble. When the Adaboost iterations are completed (all stumps trained and weighted) we will use the ensemble to make predictions on the test set by adding up the predictions of all stumps, each one weighted by its own weight (so more reliable stumps will have a bigger say in the final prediction).
The weight of the stump is computed as \[a = \dfrac{1}{2} \log \left( \dfrac{1 - \text{weighted error of the samples}}{\text{weighted error of the samples}} \right).\] We will return to this formula later.
a = 0.5 * np.log((1 - e) / e) # Weak learner weight
print(a)0.8673005276940532
The Adaboost algorithm next updates the weights of the samples. The weight of the misclassified samples is increased, and the weight of the correctly classified samples is decreased, according to this formula: 1. Increase the weight of misclassified examples to \(D_i e^{\alpha_t}\)
2. Decrease the weight of correctly classified examples to \(\dfrac{D_i}{e^{\alpha_t}}\)
In the code below m is used to identify correctly classified and misclassified points and assign them 1 or -1 labels. After the weights of the stump and the samples are updated, we can add the stump to the ensemble.
m = (y == ypred) * 1 + (y != ypred) * -1
D = D * np.exp(- a * m) # Update the sample weights
ensemble.append((a, h)) # Add the weak learner to the ensemble Let us visualize the decision boundary corresponding to the first stump and the weights assigned by this first iteration to the samples. We will use the following code to assign different-sized markers to the samples according to their weights. We also compute the error (1 - accuracy) of the predictions of the first stump (as a percent).
# Map each sample weight to one of four marker sizes: rescale the weights
# to [0, 1] relative to the largest weight, then bucket them into quartile
# bins (16 / 32 / 64 / 128 points).
rel_w = D / np.max(D)
s = rel_w.copy()
size_bins = (
    ((0.00 <= rel_w) & (rel_w < 0.25), 16),
    ((0.25 <= rel_w) & (rel_w < 0.50), 32),
    ((0.50 <= rel_w) & (rel_w < 0.75), 64),
    ((0.75 <= rel_w) & (rel_w <= 1.00), 128),
)
for in_bin, marker_size in size_bins:
    s[in_bin] = marker_size
err = (1 - accuracy_score(y, ypred)) * 100Now we can plot the decision boundary and the samples with their updated weights:
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 6))
ax.scatter(X[y <= 0, 0], X[y <= 0, 1], s=s[y <= 0], marker='o', c=col.rgb2hex(colors[0]), edgecolors='k', alpha=0.5)
ax.scatter(X[y > 0, 0], X[y > 0, 1], s=s[y > 0], marker='s', c=col.rgb2hex(colors[1]), edgecolors='k', alpha=0.5)
ax.set_xticks([])
ax.set_yticks([])
title = f'First Weak Learner (error = {err:3.1f}%, weight of this stump = {a:3.2f})'
plot_2d_classifier(ax, X, y, predict_function=h.predict,
alpha=0.25, xlabel=None, ylabel=None,
title=title, colormap='Blues')
pos_err = (y > 0) & (y != ypred)
pos_cor = (y > 0) & (y == ypred)
neg_err = (y <= 0) & (y != ypred)
neg_cor = (y <= 0) & (y == ypred)
ax.scatter(X[neg_err, 0], X[neg_err, 1], marker='o', c=col.rgb2hex(colors[0]), edgecolors='k', s=80)
ax.scatter(X[pos_err, 0], X[pos_err, 1], marker='s', c=col.rgb2hex(colors[1]), edgecolors='k', s=80)
ax.set_xticks([])
ax.set_yticks([])
plt.show();plt.close()for k in range(1, n_estimators):
# -- Plot the training examples in different sizes proportional to their weights
s = D / np.max(D)
s[(0.00 <= s) & (s < 0.25)] = 16
s[(0.25 <= s) & (s < 0.50)] = 32
s[(0.50 <= s) & (s < 0.75)] = 64
s[(0.75 <= s) & (s <= 1.00)] = 128
h = DecisionTreeClassifier(max_depth=1) # Initialize a decision stump
h.fit(X, y, sample_weight=D) # Train a weak learner using sample weights
ypred = h.predict(X) # Predict using the weak learner
e = 1 - accuracy_score(y, ypred, sample_weight=D) # Weighted error of the weak learner
a = 0.5 * np.log((1 - e) / e) # Weak learner weight
m = (y == ypred) * 1 + (y != ypred) * -1 # Identify correctly classified and misclassified points
D *= np.exp(-a * m) # Update the sample weights
# -- Plot the (first and last, no more than 10) individual weak learner
if ((k < 5) or (n_estimators - k < 5)):
fig, ax = plt.subplots(figsize=(12, 6))
err = (1 - accuracy_score(y, ypred)) * 100
title = f'Iteration {k + 1}: Weak Learner (error = {err:3.1f}%, weight of this stump = {a:3.2f})'
plot_2d_classifier(ax, X, y, predict_function=h.predict,
alpha=0.25, xlabel=None, ylabel=None,
title=title, colormap='Blues')
pos_err = (y > 0) & (y != ypred)
pos_cor = (y > 0) & (y == ypred)
neg_err = (y <= 0) & (y != ypred)
neg_cor = (y <= 0) & (y == ypred)
ax.scatter(X[neg_err, 0], X[neg_err, 1], marker='o', c=col.rgb2hex(colors[0]), edgecolors='k', s=80)
ax.scatter(X[pos_err, 0], X[pos_err, 1], marker='s', c=col.rgb2hex(colors[1]), edgecolors='k', s=80)
ax.set_xticks([])
ax.set_yticks([])
plt.show();plt.close()
# --
ensemble.append((a, h)) # Save the weighted weak hypothesis
def predict_boosting(X, estimators):
    """Predict class labels (+1/-1) with a weighted-vote boosting ensemble.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Samples to classify.
    estimators : iterable of (weight, estimator) pairs
        The boosted ensemble. Each estimator must expose ``predict`` and
        return labels in {-1, +1}; ``weight`` is its AdaBoost vote weight.

    Returns
    -------
    ndarray of shape (n_samples,)
        Sign of the weighted sum of the weak learners' votes. Exact ties
        (weighted sum equal to 0) are resolved to the positive class so
        the output is always a valid label.
    """
    pred = np.zeros((X.shape[0], ))
    for a, h in estimators:
        pred += a * h.predict(X)
    # np.sign maps 0 -> 0, which is neither class label; break exact ties
    # in favour of the positive class instead.
    return np.where(pred >= 0.0, 1.0, -1.0)
ypred = predict_boosting(X, ensemble)
err = (1 - accuracy_score(y, ypred)) * 100
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(4, 4))
plot_2d_classifier(ax, X, y, predict_function=predict_boosting, predict_args=(ensemble),
boundary_level=[0.0], alpha=0.25, xlabel='$x_1$', ylabel='$x_2$', s=80,
title=title, colormap='Blues')
fig.tight_layout()
ax.set_title(f'Overall ensemble (error = {err:3.1f}%)'.format(fontsize=12))Text(0.5, 1.0, 'Overall ensemble (error = 9.0%)')
# import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns
# import pandas as pd
# from sklearn.datasets import make_classification
# from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
N = 1000
X, Y = make_classification(n_classes=2, n_samples=N, n_informative=5, random_state=1)
inputs = ["X" + str(k) for k in range(X.shape[1])]
output = "Y"
df = pd.DataFrame(X, columns = inputs)
df[output] = Y
df.iloc[:,:12].head()
XTR, XTS, YTR, YTS = train_test_split(df[inputs], df[output],
test_size=0.2, # percentage preserved as test data
random_state=1, # seed for replication
stratify = df[output]) # Preserves distribution of yNow we create and fit the grid search for this model with learning rate as hyperparameter. Note the way that the stumps are defined.
# Hyperparameter grid: tune only the AdaBoost learning rate over [0.01, 1].
hyp_grid = {'AdB__learning_rate': np.linspace(0.01, 1, 11)}
hyp_grid
# Ensemble of 100 decision stumps (depth-1 trees), the classic AdaBoost setup.
n_trees = 100
stump = DecisionTreeClassifier(max_depth=1)
AdB = AdaBoostClassifier(estimator= stump,
n_estimators=n_trees,
algorithm='SAMME', # to prevent a deprecation warning
random_state=1)
# Wrap in a Pipeline so grid-search parameter names use the 'AdB__' prefix.
AdB_pipe = Pipeline(steps=[('AdB', AdB)])
num_folds = 10
# 10-fold cross-validated grid search, run in parallel on all cores.
AdB_gridCV = GridSearchCV(estimator=AdB_pipe,
param_grid=hyp_grid,
cv=num_folds,
return_train_score=True,
n_jobs=-1)
model_name = "AdB"
model = AdB_gridCV
model.fit(XTR, YTR)
# Dataset for Model Predictions (train and test copies with the target added)
dfTR_eval = XTR.copy()
dfTR_eval['Y'] = YTR
dfTS_eval = XTS.copy()
dfTS_eval['Y'] = YTS
We can now add the model to our dictionary as usual.
modelDict = {model_name : {"model" : model, "inputs" : inputs}}model.score(XTR, YTR), model.score(XTS, YTS)(0.85875, 0.845)
As you can see, it is a decent performance without much tuning. And no overfitting seems to be happening. Let us visualize the hyperparameter search.
# Plot train vs. cross-validation error as a function of the single tuned
# hyperparameter (the only key present in hyp_grid).
tuned_param = next(iter(hyp_grid))
tuned_values = hyp_grid[tuned_param]
cv_results = model.cv_results_
for scores_key, mark, curve_label in (
        ('mean_train_score', 'o', 'Mean Train Score'),
        ('mean_test_score', '*', 'Mean Validation Score')):
    plt.plot(tuned_values, 1 - cv_results[scores_key], marker=mark, label=curve_label)
plt.xlabel('Learning Rate')
plt.ylabel('Error (1 - accuracy)')
plt.title('Hyperparameter Tuning Results')
plt.legend() # Identify which curve is which
plt.show()
plt.close()The selected learning rate is:
model.best_params_{'AdB__learning_rate': np.float64(0.30700000000000005)}
from sklearn.tree import plot_tree
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(4, 3), dpi = 300)
for i in range(6):
DT = AdB_gridCV.best_estimator_.named_steps["AdB"].estimators_[i]
plot_tree(DT, max_depth=2, ax=axes[i//3, i%3])# %load "../exclude/MLMIINprv/exercises/2_5_Exercise013.py"
# %run -i "../exclude/MLMIINprv/exercises/2_5_Exercise013.py"from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting with depth-1 trees (stumps). random_state is fixed for
# reproducibility, consistent with the other ensembles in this notebook
# (BagDT, RF, AdB and HGB all use random_state=1); the original left it
# unset, making it the only model here without a pinned seed.
GradBoost = GradientBoostingClassifier(max_depth=1,
                                       n_estimators=20,
                                       learning_rate=0.75,
                                       random_state=1)
# Pipeline wrapper so grid-search parameter names use the 'GradBoost__' prefix.
GradBoost_pipe = Pipeline(steps=[('GradBoost', GradBoost)])
num_folds = 10
# Grid: learning rates 1e-6 ... 1 (log scale) crossed with ensemble sizes.
hyp_grid = {'GradBoost__learning_rate': 10.**np.arange(-6, 1, 1),
            'GradBoost__n_estimators': [20, 50, 100, 200, 500]}
GradBoost_gridCV = GridSearchCV(estimator=GradBoost_pipe,
                                param_grid=hyp_grid,
                                cv=num_folds,
                                return_train_score=True,
                                n_jobs=-1)
model_name = "GradBoost"
model = GradBoost_gridCV
# Fit the full cross-validated grid search on the training split.
model.fit(XTR, YTR)
modelDict[model_name] = {"model" : model, "inputs" : inputs}model.score(XTR, YTR), model.score(XTS, YTS)(0.86875, 0.865)
model.best_params_{'GradBoost__learning_rate': np.float64(0.1), 'GradBoost__n_estimators': 100}
from sklearn.ensemble import HistGradientBoostingClassifier
# Grid for the histogram-based gradient booster: learning rate (log scale),
# number of boosting iterations, and the number of histogram bins.
hyp_grid = {'HGB__learning_rate': 10.**np.arange(-6, 1, 1),
'HGB__max_iter':np.arange(25, 150, 25),
'HGB__max_bins':[10, 25, 50, 100]}
HGB = HistGradientBoostingClassifier(random_state=1)
# Pipeline wrapper so grid-search parameter names use the 'HGB__' prefix.
HGB_pipe = Pipeline(steps=[('HGB', HGB)])
num_folds = 10
from sklearn.model_selection import GridSearchCV
# 10-fold cross-validated grid search over the grid above, in parallel.
HGB_gridCV = GridSearchCV(estimator=HGB_pipe,
param_grid=hyp_grid,
cv=num_folds,
return_train_score=True,
n_jobs=-1)
HGB_gridCV.fit(XTR, YTR)
GridSearchCV(cv=10,
estimator=Pipeline(steps=[('HGB',
HistGradientBoostingClassifier(random_state=1))]),
n_jobs=-1,
param_grid={'HGB__learning_rate': array([1.e-06, 1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00]),
'HGB__max_bins': [10, 25, 50, 100],
'HGB__max_iter': array([ 25, 50, 75, 100, 125])},
return_train_score=True)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. | estimator | Pipeline(step...om_state=1))]) | |
| param_grid | {'HGB__learning_rate': array([1.e-06...e-01, 1.e+00]), 'HGB__max_bins': [10, 25, ...], 'HGB__max_iter': array([ 25, ...75, 100, 125])} | |
| scoring | None | |
| n_jobs | -1 | |
| refit | True | |
| cv | 10 | |
| verbose | 0 | |
| pre_dispatch | '2*n_jobs' | |
| error_score | nan | |
| return_train_score | True |
| loss | 'log_loss' | |
| learning_rate | np.float64(0.1) | |
| max_iter | np.int64(125) | |
| max_leaf_nodes | 31 | |
| max_depth | None | |
| min_samples_leaf | 20 | |
| l2_regularization | 0.0 | |
| max_features | 1.0 | |
| max_bins | 25 | |
| categorical_features | 'from_dtype' | |
| monotonic_cst | None | |
| interaction_cst | None | |
| warm_start | False | |
| early_stopping | 'auto' | |
| scoring | 'loss' | |
| validation_fraction | 0.1 | |
| n_iter_no_change | 10 | |
| tol | 1e-07 | |
| verbose | 0 | |
| random_state | 1 | |
| class_weight | None |
# import lightgbm as lgb
# lgb_pipeline = Pipeline([
# ('lgb', lgb.LGBMClassifier()) # First and only step
# ])
# lgb_hyp_grid = {
# 'lgb__num_leaves': [20, 31, 40], # Number of leaves in one tree
# 'lgb__learning_rate': [0.01, 0.05, 0.1], # Step size shrinkage
# 'lgb__n_estimators': [100, 500, 1000], # Number of boosting iterations
# 'lgb__max_depth': [-1, 10, 20], # Maximum tree depth
# 'lgb__min_child_samples': [10, 20, 30] # Minimum data points in a leaf
# }
# lgb_gridCV = GridSearchCV(estimator=lgb_pipeline,
# param_grid=lgb_hyp_grid,
# cv=5, scoring='accuracy', verbose=3, n_jobs=-1)
# lgb_gridCV.fit(XTR, YTR)
# # Step 7: Get the best hyperparameters
# best_params = lgb_gridCV.best_params_# import lightgbm as lgb
# from sklearn.experimental import enable_halving_search_cv # noqa
# from sklearn.model_selection import HalvingGridSearchCV
# lgb_pipeline = Pipeline([
# ('lgb', lgb.LGBMClassifier()) # First and only step
# ])
# lgb_hyp_grid = {
# 'lgb__num_leaves': [20, 31, 40], # Number of leaves in one tree
# 'lgb__learning_rate': [0.01, 0.05, 0.1], # Step size shrinkage
# 'lgb__n_estimators': [100, 500, 1000], # Number of boosting iterations
# 'lgb__max_depth': [-1, 10, 20], # Maximum tree depth
# 'lgb__min_child_samples': [10, 20, 30] # Minimum data points in a leaf
# }
# # This is significantly faster and gives updates on each "round"
# lgb_halvingCV = HalvingGridSearchCV(estimator=lgb_pipeline,
# param_grid=lgb_hyp_grid,
# scoring='accuracy',
# factor=3, verbose=1, n_jobs=-1)
# lgb_halvingCV.fit(XTR, YTR)
# # Step 7: Get the best hyperparameters
# best_params = lgb_halvingCV.best_params_