K-nearest neighbors
| !uv pip install -q \
pandas==2.3.2 \
pandas-stubs==2.3.2.250827 \
numpy==2.3.2 \
matplotlib==3.10.6 \
seaborn==0.13.2 \
scikit-learn==1.7.1
|
| import sys
# sys.executable  # confirm the interpreter path, i.e. that packages resolve from the uv cache
|
Classification
| from typing import List
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.metrics import (
accuracy_score,
classification_report,
confusion_matrix,
)
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
sns.set_theme(style="darkgrid")
# import sklearn; print(sklearn.__version__)  # check the installed version
|
| iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df.rename(
{
"sepal length (cm)": "sepal_length",
"sepal width (cm)": "sepal_width",
"petal length (cm)": "petal_length",
"petal width (cm)": "petal_width",
},
axis=1,
inplace=True,
)
df["target"] = iris.target
df["target"] = df["target"].map(
{
0: "setosa",
1: "versicolor",
2: "virginica",
}
)
df.head()
|
|   | sepal_length | sepal_width | petal_length | petal_width | target |
|---|--------------|-------------|--------------|-------------|--------|
| 0 | 5.1          | 3.5         | 1.4          | 0.2         | setosa |
| 1 | 4.9          | 3.0         | 1.4          | 0.2         | setosa |
| 2 | 4.7          | 3.2         | 1.3          | 0.2         | setosa |
| 3 | 4.6          | 3.1         | 1.5          | 0.2         | setosa |
| 4 | 5.0          | 3.6         | 1.4          | 0.2         | setosa |
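As an aside, newer scikit-learn releases can hand back a DataFrame directly via as_frame=True, which skips the manual construction above (the column renaming and target mapping would still apply). A minimal sketch; the iris_frame name is just illustrative:
| # Equivalent shortcut: the loader builds the DataFrame itself
iris_frame = load_iris(as_frame=True).frame  # features plus an integer "target" column
iris_frame.head()
|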
| sns.scatterplot(
x="sepal_length", y="sepal_width", hue="target", data=df, palette="cool"
)
plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left", borderaxespad=0.0)
plt.show()
|
| sns.scatterplot(
data=df, x="petal_length", y="petal_width", hue="target", palette="Set1"
)
plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left", borderaxespad=0.0)
plt.show()
|
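Rather than hand-picking feature pairs, seaborn can draw every pairwise scatter (plus per-feature distributions) in one call; a quick sketch:
| sns.pairplot(df, hue="target", palette="Set1")
plt.show()
|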
| X = df.drop("target", axis=1)
y = df["target"]
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
|
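One caveat before fitting: KNN is distance-based, so features on larger numeric scales dominate the neighbor search. The iris features are all in centimeters and of similar magnitude, so it changes little here, but the usual pattern is to standardize inside a pipeline. A minimal sketch (scaled_knn is an illustrative name):
| from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize, then classify; the scaler is fit on the training set only
# and the same transform is applied to anything the pipeline predicts on.
scaled_knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))
scaled_knn.fit(X_train, y_train)
print(f"Accuracy with scaling: {scaled_knn.score(X_test, y_test) * 100:.2f}%")
|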
| knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
pred = knn.predict(X_test)
evaluation = accuracy_score(y_test, pred)
print(f"Accuracy: {evaluation * 100:.2f}%")
|
Accuracy: 100.00%
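A perfect score says more about the data than the model: iris is close to linearly separable and the held-out set has only 30 samples, so a different split or a different k could still flip a prediction or two.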
| cm = confusion_matrix(y_true=y_test, y_pred=pred)
labels = sorted(set(y_test))
cm_df = pd.DataFrame(cm, index=labels, columns=labels)
cm_df
|
|            | setosa | versicolor | virginica |
|------------|--------|------------|-----------|
| setosa     | 10     | 0          | 0         |
| versicolor | 0      | 9          | 0         |
| virginica  | 0      | 0          | 11        |
| sns.heatmap(
cm,
annot=True,
fmt="d",
cmap="Blues",
xticklabels=labels,
yticklabels=labels,
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()
|
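For comparison, scikit-learn also ships a convenience plotter that computes and draws the same matrix straight from the predictions; a sketch:
| from sklearn.metrics import ConfusionMatrixDisplay

# Same confusion matrix as above, built and rendered in one call
ConfusionMatrixDisplay.from_predictions(y_test, pred, cmap="Blues")
plt.title("Confusion Matrix")
plt.show()
|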
| report_dict = classification_report(
y_true=y_test, y_pred=pred, output_dict=True
)
report_df = pd.DataFrame(report_dict).transpose()
report_df
|
|              | precision | recall | f1-score | support |
|--------------|-----------|--------|----------|---------|
| setosa       | 1.0       | 1.0    | 1.0      | 10.0    |
| versicolor   | 1.0       | 1.0    | 1.0      | 9.0     |
| virginica    | 1.0       | 1.0    | 1.0      | 11.0    |
| accuracy     | 1.0       | 1.0    | 1.0      | 1.0     |
| macro avg    | 1.0       | 1.0    | 1.0      | 30.0    |
| weighted avg | 1.0       | 1.0    | 1.0      | 30.0    |
| neighbors = list(range(1, 50, 2))
cv_scores: List[float] = []
for k in neighbors:
knn = KNeighborsClassifier(n_neighbors=k)
scores = cross_val_score(knn, X_train, y_train, cv=10, scoring="accuracy")
cv_scores.append(scores.mean())
type(cv_scores[1])  # note: ndarray.mean() returns numpy.float64, despite the List[float] hint
|
numpy.float64
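The same sweep can be delegated to GridSearchCV, which runs the cross-validation loop and keeps the bookkeeping; a sketch over the same neighbors grid. Note that it breaks ties toward the first (smallest) k, whereas find_min_index below deliberately prefers the largest:
| from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(
    KNeighborsClassifier(),
    param_grid={"n_neighbors": neighbors},
    cv=10,
    scoring="accuracy",
)
grid.fit(X_train, y_train)
print(grid.best_params_, f"best CV accuracy: {grid.best_score_:.4f}")
|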
| mse = [1 - x for x in cv_scores]  # misclassification error = 1 - CV accuracy

def find_min_index(x: List[float]) -> int:
    """Index of the minimum of x; among ties, take the largest index
    (here: prefer the largest k, which gives a smoother decision boundary)."""
    vector: List[int] = []
    for index, value in enumerate(x):
        if value == min(x):
            vector.append(index)
    return max(vector)

optimal_k = neighbors[find_min_index(mse)]
optimal_error = min(mse)  # minimum misclassification error
print(
    f"The optimal number of neighbors is {optimal_k} "
    f"with a misclassification error of {optimal_error:.4f}"
)
|
The optimal number of neighbors is 11 with a misclassification error of 0.0417
| plt.plot(neighbors, mse, marker="o")
plt.xlabel("Number of Neighbors K")
plt.ylabel("Misclassification Error")
plt.plot(optimal_k, optimal_error, "ro")  # mark the chosen k
plt.show()
|
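Because k was chosen with cross-validation on the training folds only, the test set is still untouched; a final sketch that refits at the chosen k and scores it there:
| final_knn = KNeighborsClassifier(n_neighbors=optimal_k)
final_knn.fit(X_train, y_train)
print(f"Test accuracy at k={optimal_k}: {final_knn.score(X_test, y_test) * 100:.2f}%")
|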