
K-nearest neighbors

!uv pip install -q \
    pandas==2.3.2 \
    pandas-stubs==2.3.2.250827 \
    numpy==2.3.2 \
    matplotlib==3.10.6 \
    seaborn==0.13.2 \
    scikit-learn==1.7.1
import sys

# print(sys.executable)  # confirm the interpreter comes from the uv environment

Classification
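At its core, KNN stores the training set and classifies a query point by majority vote among its k closest training points. A minimal NumPy sketch of that idea, assuming Euclidean distance (knn_predict and the toy arrays are illustrative, not part of scikit-learn):

import numpy as np


def knn_predict(X_train, y_train, x_query, k=3):
    # Euclidean distance from the query to every training point
    distances = np.linalg.norm(X_train - x_query, axis=1)
    # indices of the k closest training points
    nearest = np.argsort(distances)[:k]
    # majority vote among their labels
    labels, counts = np.unique(y_train[nearest], return_counts=True)
    return labels[np.argmax(counts)]


X_toy = np.array([[0.0, 0.0], [0.1, 0.2], [5.0, 5.0], [5.1, 4.9]])
y_toy = np.array(["a", "a", "b", "b"])
knn_predict(X_toy, y_toy, np.array([4.8, 5.2]))  # -> 'b'

scikit-learn's KNeighborsClassifier, used below, implements the same vote with faster neighbor search structures.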

from typing import List

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier

sns.set_theme(style="darkgrid")

# import sklearn; print(sklearn.__version__)
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df = df.rename(
    columns={
        "sepal length (cm)": "sepal_length",
        "sepal width (cm)": "sepal_width",
        "petal length (cm)": "petal_length",
        "petal width (cm)": "petal_width",
    }
)

df["target"] = iris.target

df["target"] = df["target"].map(
    {
        0: "setosa",
        1: "versicolor",
        2: "virginica",
    }
)
df.head()
   sepal_length  sepal_width  petal_length  petal_width  target
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa
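A quick check, not in the original notebook: iris is perfectly balanced, with 50 samples of each species, so plain accuracy is a reasonable headline metric here.

df["target"].value_counts()  # 50 samples per species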

sns.scatterplot(
    x="sepal_length", y="sepal_width", hue="target", data=df, palette="cool"
)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0)
plt.show()
[Figure: scatter plot of sepal_length vs sepal_width, colored by species]

sns.scatterplot(
    data=df, x="petal_length", y="petal_width", hue="target", palette="Set1"
)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0)
plt.show()
[Figure: scatter plot of petal_length vs petal_width, colored by species]

X = df.drop("target", axis=1)
y = df["target"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
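One caveat worth flagging: KNN is distance-based, so features on larger scales dominate the metric. The iris features are all in centimetres and on similar ranges, which is why this notebook gets away without scaling; for the general case, a pipeline sketch (StandardScaler is one reasonable choice, not the notebook's method) would look like:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

scaled_knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))
scaled_knn.fit(X_train, y_train)
print(f"Scaled test accuracy: {scaled_knn.score(X_test, y_test) * 100:.2f}%")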
knn = KNeighborsClassifier(n_neighbors=3)

knn.fit(X_train, y_train)
pred = knn.predict(X_test)

evaluation = accuracy_score(y_test, pred)

print(f"Accuracy: {evaluation * 100:.2f}%")

Accuracy: 100.00%
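The fitted model scores new measurements directly. A usage sketch with a made-up flower (the values are illustrative only); wrapping it in a DataFrame with the training column names avoids scikit-learn's feature-name warning:

sample = pd.DataFrame(
    [[5.9, 3.0, 5.1, 1.8]], columns=X.columns
)  # hypothetical measurements
knn.predict(sample)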

cm = confusion_matrix(y_true=y_test, y_pred=pred)
labels = sorted(set(y_test))  # sorted class names, matching confusion_matrix's default order
cm_df = pd.DataFrame(cm, index=labels, columns=labels)
cm_df
            setosa  versicolor  virginica
setosa          10           0          0
versicolor       0           9          0
virginica        0           0         11

sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=labels,
    yticklabels=labels,
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()
[Figure: confusion matrix heatmap]
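For what it's worth, scikit-learn can draw the same plot in one call; an equivalent alternative to the manual heatmap above:

from sklearn.metrics import ConfusionMatrixDisplay

ConfusionMatrixDisplay.from_predictions(y_test, pred, cmap="Blues")
plt.title("Confusion Matrix")
plt.show()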

report_dict = classification_report(
    y_true=y_test, y_pred=pred, output_dict=True
)
report_df = pd.DataFrame(report_dict).transpose()
report_df
              precision  recall  f1-score  support
setosa              1.0     1.0       1.0     10.0
versicolor          1.0     1.0       1.0      9.0
virginica           1.0     1.0       1.0     11.0
accuracy            1.0     1.0       1.0      1.0
macro avg           1.0     1.0       1.0     30.0
weighted avg        1.0     1.0       1.0     30.0

(The accuracy row is an artifact of transposing the report dict: the single accuracy value, 1.0, is repeated across every column, including support.)
neighbors = list(range(1, 50, 2))  # odd values of k from 1 to 49
cv_scores: List[float] = []

# 10-fold cross-validation on the training set for each candidate k
for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train, y_train, cv=10, scoring="accuracy")
    cv_scores.append(scores.mean())

type(cv_scores[1])

numpy.float64

mse = [1 - x for x in cv_scores]  # misclassification error = 1 - accuracy


def find_min_index(x: List[float]) -> int:
    """Return the last index of the minimum, preferring the largest k among ties."""
    vector: List[int] = []
    for index, value in enumerate(x):
        if value == min(x):
            vector.append(index)
    return max(vector)


optimal_k = neighbors[find_min_index(mse)]
optimal_error = min(mse)  # lowest misclassification error

print(
    f"The optimal number of neighbors is {optimal_k} "
    f"with a misclassification error of {optimal_error:.4f}"
)

The optimal number of neighbors is 11 with a misclassification error of 0.0417
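The same search can be expressed with GridSearchCV, which also refits the winning model on the full training set. One subtle difference from find_min_index above: on ties, GridSearchCV keeps the first (smallest) k rather than the last. A sketch of the equivalent approach:

from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(
    KNeighborsClassifier(),
    param_grid={"n_neighbors": neighbors},
    cv=10,
    scoring="accuracy",
)
grid.fit(X_train, y_train)
print(grid.best_params_, f"CV accuracy: {grid.best_score_:.4f}")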

plt.plot(neighbors, mse, marker="o")
plt.xlabel("Number of Neighbors K")
plt.ylabel("Misclassification Error")
plt.plot(optimal_k, optimal_error, "ro")  # mark the optimal k in red
plt.show()
[Figure: misclassification error versus k, with the optimum marked in red]
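A natural closing step, not shown in the original: refit at the chosen k and confirm the score on the held-out test set.

best_knn = KNeighborsClassifier(n_neighbors=optimal_k)
best_knn.fit(X_train, y_train)
print(f"Test accuracy at k={optimal_k}: {best_knn.score(X_test, y_test) * 100:.2f}%")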