Hide the code
%cd 2_3_KNN_Validation/wd/2_3_KNN_Validation
Machine Learning
MLMIIN repository folder? If you have missed any of these steps, you may need to restart VS Code after completing them.
Also if Python seems unresponsive at first, try restarting the kernel.
%cd 2_3_KNN_Validation/wd/2_3_KNN_Validation
The following figure illustrates the idea for a multiclass problem, using different values of k.
Source: Figure 1-11 from (Glassner 2021), generously shared by the author.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from sklearn.datasets import make_moons
X, Y = make_moons(n_samples=1000, noise=0.2, random_state=1)Preprocessing this synthetic data will be very brief. In order to explore and use this dataset we jump directly into the train/test split.
# Hold out 20% of the data as a stratified test set, then assemble the
# training split into a labelled DataFrame (columns X1, X2, ..., plus Y).
from sklearn.model_selection import train_test_split

XTR, XTS, YTR, YTS = train_test_split(
    X, Y,
    test_size=0.2,   # fraction of the data reserved for testing
    random_state=1,  # fixed seed so the split is reproducible
    stratify=Y,      # keep the class proportions equal in both splits
)
feature_names = [f"X{i + 1}" for i in range(X.shape[1])]
dfTR = pd.DataFrame(XTR, columns=feature_names)
inputs = dfTR.columns  # captured before the target column is appended
dfTR["Y"] = YTR
output = "Y"
dfTR.head(4)| X1 | X2 | Y | |
|---|---|---|---|
| 0 | 1.276177 | -0.230873 | 1 |
| 1 | 1.141051 | -0.528525 | 1 |
| 2 | 1.791505 | -0.270981 | 1 |
| 3 | 0.253569 | -0.612456 | 1 |
You can see that there are two numerical inputs X1, X2 and the binary output Y. The problem is perfectly balanced and you can see that we have 800 data points.
dfTR[output].value_counts()Y
1 400
0 400
Name: count, dtype: int64
sns.set_theme(rc={'figure.figsize':(6, 4)})
sns.scatterplot(dfTR, x = "X1", y = "X2", hue="Y");Note that the boundary between the classes is clearly nonlinear.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
k = 600
knn_pipe = Pipeline(steps=[('scaler', StandardScaler()),
('knn', KNeighborsClassifier(n_neighbors=k))])Now we only need to call fit:
knn_pipe.fit(dfTR[inputs], dfTR[output])Pipeline(steps=[('scaler', StandardScaler()),
('knn', KNeighborsClassifier(n_neighbors=600))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. | steps | [('scaler', ...), ('knn', ...)] | |
| transform_input | None | |
| memory | None | |
| verbose | False |
| copy | True | |
| with_mean | True | |
| with_std | True |
| n_neighbors | 600 | |
| weights | 'uniform' | |
| algorithm | 'auto' | |
| leaf_size | 30 | |
| p | 2 | |
| metric | 'minkowski' | |
| metric_params | None | |
| n_jobs | None |
unique, counts = np.unique(knn_pipe.predict_proba(dfTR[inputs])[:, 1], return_counts=True)
pd.DataFrame({'unique':unique, 'counts':counts})| unique | counts | |
|---|---|---|
| 0 | 0.333333 | 51 |
| 1 | 0.335000 | 55 |
| 2 | 0.336667 | 30 |
| 3 | 0.338333 | 25 |
| 4 | 0.340000 | 10 |
| ... | ... | ... |
| 160 | 0.660000 | 17 |
| 161 | 0.661667 | 24 |
| 162 | 0.663333 | 8 |
| 163 | 0.665000 | 48 |
| 164 | 0.666667 | 85 |
165 rows × 2 columns
TR_pred = knn_pipe.predict(dfTR[inputs])
knn_pipe.score(dfTR[inputs], dfTR[output])0.81375
# %load "../exclude/code/2_3_Exercise_002.py"# %load "2_3_KNN_DecisionBoundary_Plots.py"
# %load "2_3_KNN_DecisionBoundary_Plots.py"
# Visualise the fitted KNN pipeline: predicted-class regions (left panel)
# and predicted-probability level curves (right panel) over the training data.
from matplotlib.gridspec import GridSpec
from sklearn.inspection import DecisionBoundaryDisplay
fig = plt.figure(figsize=(20, 6))
gs = GridSpec(1, 2)
plt.subplots_adjust(wspace=0.25)
# --- Left panel: class regions with the decision boundary outlined ---
row, col = [0, 0]
ax0 = fig.add_subplot(gs[row, col])
# First pass draws only the boundary contour line...
DB = DecisionBoundaryDisplay.from_estimator(knn_pipe, dfTR[inputs], response_method="predict", plot_method="contour",
colors="black", ax=ax0, alpha=0.1)
# ...second pass fills the predicted-class regions with colour.
DB = DecisionBoundaryDisplay.from_estimator(knn_pipe, dfTR[inputs], response_method="predict",
cmap="Greens_r", ax=ax0, alpha=0.5)
sns.scatterplot(dfTR, x = "X1", y = "X2", hue="Y", palette="pastel", ax=ax0);
# --- Right panel: level curves of the predicted P(Y=1) on a 100x100 grid ---
row, col = [0, 1]
ax1 = fig.add_subplot(gs[row, col])
# Evaluation grid over the input plane; axis limits are the observed
# min/max of each input scaled by 1.5
X1g, X2g = np.meshgrid(np.linspace(dfTR.X1.min()*1.5, dfTR.X1.max()*1.5, 100),
np.linspace(dfTR.X2.min()*1.5, dfTR.X2.max()*1.5, 100))
X1_X2 = pd.DataFrame(np.vstack([X1g.ravel(), X2g.ravel()]).transpose(), columns=["X1", "X2"])
pred_probs = knn_pipe.predict_proba(X1_X2)[:, 1].reshape([100, 100])
# One contour level per distinct predicted probability on the grid
uniq_probs, counts_probs = np.unique(pred_probs, return_counts=True)
# NOTE: plt.contour draws on the current axes, which is ax1 because it was
# the most recently added subplot — statement order matters here.
contours = plt.contour(X1g, X2g, pred_probs,
levels=np.linspace(0, 1, num=len(uniq_probs)),
colors="black", linewidths=0.5)
plt.clabel(contours)
sns.scatterplot(dfTR, x = "X1", y = "X2", hue="Y", palette="pastel", ax=ax1);
The decision curve (left) and level curves (right) look jagged, not smooth at all. The classifier seems to be trying too hard in some cases and getting into far too much detail. Do you think these curves will generalize well to new data?
k = 2%run -i "2_3_KNN_Try_different_k_values.py"score (pipeline method)= 0.9725
accuracy_score = 0.9725
Building on the last observation, the following plot shows the evolution of accuracy as k increases. You can see that the interesting values are in the below 50 region.
# %load "./2_3_KNN_Accuracy_vs_k.py"
'''
2_3_KNN_Accuracy_vs_k.py
'''
# Candidate neighbourhood sizes: 2, 3, ..., 24
k_start = 2
k_stop = 25
k_step = 1
k_values = np.arange(start=k_start, stop=k_stop, step=k_step).astype("int")
# Fit one scaler+KNN pipeline per k, scoring each on the training set
accrcies = []
for k in k_values:
    steps = [('scaler', StandardScaler()),
             ('knn', KNeighborsClassifier(n_neighbors=k))]
    knn_pipe = Pipeline(steps=steps)
    knn_pipe.fit(dfTR[inputs], dfTR[output])
    train_acc = knn_pipe.score(dfTR[inputs], dfTR[output])
    accrcies.append(train_acc)
accrcies = np.array(accrcies)
# Scatter + connecting line of training accuracy against k
fig = plt.figure(figsize=(12, 4))
ax_acc = sns.scatterplot(x=k_values, y=accrcies)
sns.lineplot(x=k_values, y=accrcies, ax=ax_acc)
# Axis labels
ax_acc.set(xlabel="k (num. of neighbors)",
           ylabel="Accuracy");  # %load "../exclude/code/2_3_Exercise_003.py"
'''
Exercise 2_3 003
'''
# Same sweep as above, now also marking the k with the best training accuracy.
# Candidate neighbourhood sizes: 2, 3, ..., 24
k_start = 2
k_stop = 25
k_step = 1
k_values = np.arange(start=k_start, stop=k_stop, step=k_step).astype("int")
# One scaler+KNN pipeline per k, each scored on the training data
accrcies = []
for k in k_values:
    steps = [('scaler', StandardScaler()),
             ('knn', KNeighborsClassifier(n_neighbors=k))]
    knn_pipe = Pipeline(steps=steps)
    knn_pipe.fit(dfTR[inputs], dfTR[output])
    accrcies.append(knn_pipe.score(dfTR[inputs], dfTR[output]))
accrcies = np.array(accrcies)
# Scatter + connecting line of accuracy against k
ax_acc = sns.scatterplot(x=k_values, y=accrcies)
sns.lineplot(x=k_values, y=accrcies, ax=ax_acc)
# Axis labels
ax_acc.set(xlabel="k (num. of neighbors)",
           ylabel="Accuracy")
# Highlight the k that maximises (training) accuracy
k_select = k_values[np.argmax(accrcies)]
plt.axvline(x=k_select, linestyle="--")
plt.text(k_select + 0.5, np.min(accrcies), f"k = {k_select}")
# %load "../exclude/code/2_3_Exercise_004.py"
'''
Exercise 2_3 004
'''
# As in the previous exercise, but each model is now scored on the held-out
# test set instead of the training set.
dfTS = pd.DataFrame(XTS, columns=inputs)
dfTS["Y"] = YTS
# Candidate neighbourhood sizes: 2, 3, ..., 9
k_start = 2
k_stop = 10
k_step = 1
k_values = np.arange(start=k_start, stop=k_stop, step=k_step).astype("int")
# Fit on the training data, score on the test data
accrcies = []
for k in k_values:
    steps = [('scaler', StandardScaler()),
             ('knn', KNeighborsClassifier(n_neighbors=k))]
    knn_pipe = Pipeline(steps=steps)
    knn_pipe.fit(dfTR[inputs], dfTR[output])
    accrcies.append(knn_pipe.score(dfTS[inputs], dfTS[output]))
accrcies = np.array(accrcies)
# Scatter + connecting line of test accuracy against k
ax_acc = sns.scatterplot(x=k_values, y=accrcies)
sns.lineplot(x=k_values, y=accrcies, ax=ax_acc)
# Axis labels
ax_acc.set(xlabel="k (num. of neighbors)",
           ylabel="Accuracy")
# Mark the k with the best test accuracy
k_select = k_values[np.argmax(accrcies)]
plt.axvline(x=k_select, linestyle="--")
plt.text(k_select + 0.5, np.min(accrcies), f"k = {k_select}")
# %load "./2_3_ExploringModelSampleVariance.py"
'''
2_3_ExploringModelSampleVariance.py
'''
# Overlays accuracy-vs-k validation curves obtained from several samples of a
# large synthetic "population", to visualise how the best k moves around.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from sklearn.datasets import make_moons
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
sns.set_theme(rc={'figure.figsize':(12, 4)})
# Population: one million two-moons points; each pass samples 1000 of them.
N = 1000000
XP, YP = make_moons(n_samples=N, noise=0.2, random_state=1)
sample_size = 1000
# Select the values of k
k_start = 200
k_stop = 350
k_step = 1
k_values = np.arange(start=k_start, stop=k_stop, step=k_step).astype("int")
for smpl_num in range(0, 5):
    # NOTE(review): the RNG is re-created with seed=2024 on every iteration,
    # so all five "samples" contain the identical rows; only the
    # train/validation split below (random_state=smpl_num) changes between
    # passes. If the intent is to explore sampling variance, the seed should
    # presumably vary with smpl_num — confirm with the author.
    sample = np.random.default_rng(seed=2024).integers(0, N, size=sample_size)
    X = XP[sample, :]
    Y = YP[sample]
    from sklearn.model_selection import train_test_split
    XTR, Xval, YTR, Yval = train_test_split(X, Y,
        test_size=0.2, # percentage preserved as test data
        random_state=smpl_num, # seed for replication
        stratify = Y) # Preserves distribution of y
    dfTR = pd.DataFrame(XTR, columns=["X" + str(i + 1) for i in range(X.shape[1])])
    inputs = dfTR.columns
    dfTR["Y"] = YTR
    output = "Y"
    dfval = pd.DataFrame(Xval, columns=inputs)
    dfval["Y"] = Yval
    # Create an empty list to store the accuracies
    accrcies = []
    # Loop through k values, fitting models and getting accuracies
    for k in k_values:
        knn_pipe = Pipeline(steps=[('scaler', StandardScaler()),
                                   ('knn', KNeighborsClassifier(n_neighbors=k))])
        knn_pipe.fit(dfTR[inputs], dfTR[output])
        # Score on the validation split, not the training split
        accrcies.append(knn_pipe.score(dfval[inputs], dfval[output]))
    accrcies = np.array(accrcies)
    # Plot accuracies vs k — one labelled curve per sample/pass
    ax_acc = sns.scatterplot(x = k_values, y = accrcies)
    sns.lineplot(x = k_values, y = accrcies, ax=ax_acc, label="sample_"+str(smpl_num))
# Axes labels
ax_acc.set(xlabel ="k (num. of neighbors)",
           ylabel = "Accuracy");
ax_acc.legend()
Before discussing how to move forward, let us pay a visit to the example that we have been using.
%run -i "./2_3_Code_For_Exercise_007.py"In the KNN model example, when trying to identify the best possible model we trained a model for each value of the hyperparameter k from 1 or 2 up to 300 (or similar values). To incorporate 10-fold cross validation we will train 10 models for each one of those k values (we are talking thousands of models).
%run -i "../exclude/code/2_2_Exercise_001.py"Preprocessing completed. Train and test set created.
# 15 candidate k values spread linearly from 3 up to half the training-set size.
k_values = [int(v) for v in np.ceil(np.linspace(3, XTR.shape[0] / 2, num=15))]
k_values[3, 31, 59, 87, 114, 142, 170, 198, 225, 253, 281, 309, 336, 364, 392]
And now we use that list of values to create a dictionary that will be used in the hyperparameter grid search below. The double underscore __ after knn is not arbitrary as we will see.
hyp_grid = {'knn__n_neighbors': k_values} from sklearn.compose import ColumnTransformer
num_transformer = Pipeline(
steps=[("scaler", StandardScaler())]
)
preprocessor = ColumnTransformer(
transformers=[
("num", num_transformer, num_inputs),
("cat", "passthrough", ohe_inputs),
]
)knn_pipe = Pipeline(steps=[('preproc', preprocessor),
('knn', KNeighborsClassifier())])from sklearn.model_selection import StratifiedKFold
num_folds = 10
cv_splitter = StratifiedKFold(shuffle=True, n_splits=num_folds, random_state=1)Creating the pipeline now is really simple (the commented line is the simplest version):
from sklearn.model_selection import GridSearchCV
knn_gridCV = GridSearchCV(estimator=knn_pipe,
param_grid=hyp_grid,
# cv=num_folds,
cv=cv_splitter,
return_train_score=True)knn_gridCV.fit(XTR, YTR) GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=1, shuffle=True),
estimator=Pipeline(steps=[('preproc',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('scaler',
StandardScaler())]),
['X1',
'X2']),
('cat',
'passthrough',
['X4_A',
'X4_B'])])),
('knn', KNeighborsClassifier())]),
param_grid={'knn__n_neighbors': [3, 31, 59, 87, 114, 142, 170, 198,
225, 253, 281, 309, 336, 364,
392]},
return_train_score=True)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. | estimator | Pipeline(step...lassifier())]) | |
| param_grid | {'knn__n_neighbors': [3, 31, ...]} | |
| scoring | None | |
| n_jobs | None | |
| refit | True | |
| cv | StratifiedKFo... shuffle=True) | |
| verbose | 0 | |
| pre_dispatch | '2*n_jobs' | |
| error_score | nan | |
| return_train_score | True |
| transformers | [('num', ...), ('cat', ...)] | |
| remainder | 'drop' | |
| sparse_threshold | 0.3 | |
| n_jobs | None | |
| transformer_weights | None | |
| verbose | False | |
| verbose_feature_names_out | True | |
| force_int_remainder_cols | 'deprecated' |
['X1', 'X2']
| copy | True | |
| with_mean | True | |
| with_std | True |
['X4_A', 'X4_B']
passthrough
| n_neighbors | 3 | |
| weights | 'uniform' | |
| algorithm | 'auto' | |
| leaf_size | 30 | |
| p | 2 | |
| metric | 'minkowski' | |
| metric_params | None | |
| n_jobs | None |
knn_gridCV.best_params_{'knn__n_neighbors': 3}
knn_gridCV.score(XTR, YTR), knn_gridCV.score(XTS, YTS)(0.9821200510855683, 0.9396984924623115)
These are accuracy data, and we see a sizeable difference between the result in training and test (though not too big). We can see in this a hint at a possible overfitting of the model.
# Plot the grid search's mean train and mean CV-test accuracy against k.
cv_results = knn_gridCV.cv_results_
param_values = cv_results['param_knn__n_neighbors']
curve_specs = (('mean_train_score', 'o', 'Mean Train Score'),
               ('mean_test_score', '*', 'Mean Test Score'))
for score_key, mark, curve_label in curve_specs:
    plt.plot(param_values, cv_results[score_key], marker=mark, label=curve_label)
plt.xlabel('k')
plt.ylabel('Score (accuracy)')
plt.title('Hyperparameter Tuning Results')
plt.legend()  # show which marker is which curve
plt.show()
plt.close()
# cv_split = cv_splitter.split(XTR, YTR)
# folds = [fold for fold in cv_split]
# fold_TR_0, fold_TS_0, = folds[0]
# fold_TR_0[0:10], fold_TS_0[0:10]# %load "2_3_GridSearchCV_details.py"# # train scores (mean across folds) in the first fold for all values of k
# knn_gridCV.cv_results_['split0_train_score']# # The same result from keep_cv:
# # loop with i through the values of k, getting the train scores for the first fold
# np.array([keep_cv[i]["train_score"][0] for i in range(15)])# # Using the third value of k, get the indices of the test fold number 0
# keep_cv[3]["indices"]["test"][0]# Training predictions dataset
dfTR_eval = XTR.copy()
dfTR_eval['Y'] = YTR
dfTR_eval['Y_knn_prob_neg'] = knn_gridCV.predict_proba(XTR)[:, 0]
dfTR_eval['Y_knn_prob_pos'] = knn_gridCV.predict_proba(XTR)[:, 1]
dfTR_eval['Y_knn_pred'] = knn_gridCV.predict(XTR)# Test predictions dataset
dfTS_eval = XTS.copy()
dfTS_eval['Y'] = YTS
dfTS_eval['Y_knn_prob_neg'] = knn_gridCV.predict_proba(XTS)[:, 0]
dfTS_eval['Y_knn_prob_pos'] = knn_gridCV.predict_proba(XTS)[:, 1]
dfTS_eval['Y_knn_pred'] = knn_gridCV.predict(XTS)from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
model = knn_gridCV
fig = plt.figure(constrained_layout=True, figsize=(6, 2))
spec = fig.add_gridspec(1, 3)
ax1 = fig.add_subplot(spec[0, 0]);ax1.set_title('Training'); ax1.grid(False)
ax2 = fig.add_subplot(spec[0, 2]);ax2.set_title('Test'); ax2.grid(False)
ConfusionMatrixDisplay.from_estimator(model, XTR, YTR, cmap="Greens", colorbar=False, ax=ax1, labels=[1, 0])
ConfusionMatrixDisplay.from_estimator(model, XTS, YTS, cmap="Greens", colorbar=False, ax=ax2, labels=[1, 0])
plt.show(); from sklearn.metrics import RocCurveDisplay
fig = plt.figure(figsize=(12, 4))
spec = fig.add_gridspec(1, 2)
ax1 = fig.add_subplot(spec[0, 0]);ax1.set_title('Training')
ax2 = fig.add_subplot(spec[0, 1]);ax2.set_title('Test')
RocCurveDisplay.from_estimator(model, XTR, YTR, plot_chance_level=True, ax=ax1)
RocCurveDisplay.from_estimator(model, XTS, YTS, plot_chance_level=True, ax=ax2);
plt.suptitle("ROC Curves")
plt.show(); from sklearn.calibration import CalibrationDisplay
fig = plt.figure(figsize=(12, 4))
spec = fig.add_gridspec(1, 2)
ax1 = fig.add_subplot(spec[0, 0]);ax1.set_title('Training')
ax2 = fig.add_subplot(spec[0, 1]);ax2.set_title('Test')
CalibrationDisplay.from_estimator(model, XTR, YTR, n_bins=10, ax=ax1)
CalibrationDisplay.from_estimator(model, XTS, YTS, n_bins=10, ax=ax2);
plt.suptitle("Calibration Curves")
plt.show(); plt.rcParams['figure.figsize']=plt.rcParamsDefault['figure.figsize']
from sklearn.linear_model import LogisticRegression
LogReg_pipe = Pipeline(steps=[('preproc', preprocessor),
('LogReg', LogisticRegression(penalty=None))])
LogReg_pipe.fit(XTR, YTR)
dfTS_eval['Y_LR_prob_neg'] = LogReg_pipe.predict_proba(XTS)[:, 0]
dfTS_eval['Y_LR_prob_pos'] = LogReg_pipe.predict_proba(XTS)[:, 1]
dfTS_eval['Y_LR_pred'] = LogReg_pipe.predict(XTS)from sklearn.model_selection import cross_val_score
metric = 'accuracy'
cross_val_score(LogReg_pipe, XTR, YTR, cv=10, scoring='accuracy')array([0.89873418, 0.87341772, 0.7721519 , 0.75641026, 0.83333333,
0.80769231, 0.83333333, 0.85897436, 0.82051282, 0.78205128])
metric = 'accuracy'
score = {'LogReg': cross_val_score(LogReg_pipe, XTR, YTR, cv=10, scoring='accuracy'),
'knn': cross_val_score(knn_gridCV, XTR, YTR, cv=10, scoring='accuracy')}
fig = plt.figure(figsize=(10, 4))
model_scores = pd.DataFrame(score)
sns.boxplot(model_scores.melt(var_name="model", value_name=metric), x=metric, y ="model");from sklearn.metrics import RocCurveDisplay
plt.figure(figsize=(6, 4))
fig, ax = plt.subplots()
RocCurveDisplay.from_estimator(knn_gridCV, XTS, YTS, ax=ax, name="knn_pipe", pos_label=1)
RocCurveDisplay.from_estimator(LogReg_pipe, XTS, YTS, plot_chance_level=True, ax=ax, name="LogReg_pipe", pos_label=1)
plt.title("ROC curves of the models for the Test set")
plt.show()<Figure size 600x400 with 0 Axes>
plt.rcdefaults();
from sklearn.calibration import CalibrationDisplay
plt.figure(constrained_layout=False, figsize=(12, 12))
fig, ax = plt.subplots()
CalibrationDisplay.from_estimator(knn_gridCV, XTS, YTS, n_bins=10,
name="knn_pipe", pos_label = 1, ax=ax)
CalibrationDisplay.from_estimator(LogReg_pipe, XTS, YTS, n_bins=10,
name="LogReg_pipe", pos_label = 1, ax=ax)
plt.title("Calibration curves of the models for the Test set")
plt.show()<Figure size 1200x1200 with 0 Axes>