Skip to content

California Housing - Feature Engineering

Install packages

!uv pip install -q \
    python-dotenv==1.2.1 \
    pandas==2.3.2 \
    pandas-stubs==2.3.2.250827 \
    numpy==2.3.2 \
    matplotlib==3.10.6 \
    seaborn==0.13.2 \
    scikit-learn==1.7.1 \
    tqdm==4.67.1 \
    plotly==6.6.0 \
    nbformat==5.10.4

Append notebooks directory to sys.path

1
2
3
import sys

sys.path.append("../../..")

Import packages

import os
import pathlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from dotenv import load_dotenv
from sklearn.metrics import (
    mean_absolute_error
)
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from notebooks.python.utils.data_extraction.data_extraction import (
    KaggleDataExtractor,
    KaggleExtractionConfig,
)
from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
import plotly.express as px


pd.set_option("display.max_columns", None)

sns.set_style("darkgrid")
sns.set_theme(style="darkgrid")

%matplotlib inline

load_dotenv()  # Root directory .env file
True

Utility scripts:

KaggleDataExtractor:

import base64
import io
import logging
import os
import zipfile
from abc import ABC, abstractmethod
from dataclasses import dataclass
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)-8s | %(name)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

logger = logging.getLogger("KaggleExtractor")


class ExtractionConfig(ABC):
    """Marker base class for extraction configuration objects."""


class DataExtractionStrategy(ABC):
    """Interface for dataset-download strategies."""

    @abstractmethod
    def download_dataset(self, config: ExtractionConfig) -> None:
        """Download and extract the dataset described by *config*."""


@dataclass(frozen=True)
class KaggleExtractionConfig(ExtractionConfig):
    """Immutable settings for downloading one file from a Kaggle dataset."""

    dataset_slug: str  # e.g. "zynicide/wine-reviews"
    file_name: str  # file inside the Kaggle zip
    destination_path: str  # folder to extract to
    output_file_name: str | None = None  # optional rename


class KaggleDataExtractor(DataExtractionStrategy):
    """Downloads and extracts a single file from a Kaggle dataset zip.

    Authenticates against the Kaggle REST API with HTTP Basic auth built
    from the username / API-token pair.
    """

    def __init__(self, username: str, api_token: str) -> None:
        self.username = username
        self.api_token = api_token
        self.auth_header = self._create_auth_header()

    def _create_auth_header(self) -> dict[str, str]:
        """Build the HTTP Basic-Auth header expected by the Kaggle API."""
        token = f"{self.username}:{self.api_token}"
        base64_token = base64.b64encode(token.encode()).decode()
        return {"Authorization": f"Basic {base64_token}"}

    def download_dataset(self, config: ExtractionConfig) -> None:
        """Download the dataset zip and extract the configured file.

        Raises:
            TypeError: if *config* is not a ``KaggleExtractionConfig``.

        Network and zip failures are logged rather than re-raised, so a
        failed download leaves no exception for the caller to handle.
        """
        if not isinstance(config, KaggleExtractionConfig):
            raise TypeError("config must be a KaggleExtractionConfig instance")

        url = f"https://www.kaggle.com/api/v1/datasets/download/{config.dataset_slug}"
        request = Request(url, headers=self.auth_header)

        # Lazy %-style args: the message is only formatted if the record
        # is actually emitted.
        logger.info("Starting download from Kaggle: %s", url)

        try:
            with urlopen(request) as response:
                data = response.read()
            logger.info("Download completed. Extracting zip file...")

            os.makedirs(config.destination_path, exist_ok=True)

            with zipfile.ZipFile(io.BytesIO(data)) as z:
                # extract() returns the actual on-disk path, which is the
                # authoritative source for a subsequent rename.
                extracted_path = z.extract(
                    config.file_name, path=config.destination_path
                )

            if config.output_file_name is not None:
                new_path = os.path.join(
                    config.destination_path, config.output_file_name
                )
                # os.replace overwrites atomically on every platform;
                # os.rename would raise on Windows if new_path exists.
                os.replace(extracted_path, new_path)

            logger.info(
                "Dataset '%s' extracted successfully to: %s",
                config.file_name,
                config.destination_path,
            )

        except HTTPError as e:
            logger.error("HTTP Error %s: %s", e.code, e.reason)
        except URLError as e:
            logger.error("URL Error: %s", e.reason)
        except zipfile.BadZipFile:
            logger.error(
                "Failed to read zip file. Kaggle may have returned HTML instead of a zip."
            )
        except Exception as e:
            logger.exception(f"Unexpected error occurred: {e}")

Create data directory

1
2
3
# Data directory + one-time Kaggle download of the housing CSV.
DATA_DIR = pathlib.Path("data/california-housing-feature-engineering")
DATA_DIR.mkdir(parents=True, exist_ok=True)

username = os.getenv("KAGGLE_USERNAME")
api_token = os.getenv("KAGGLE_API_TOKEN")
file_name = "housing.csv"

extractor = KaggleDataExtractor(username=username, api_token=api_token)

config = KaggleExtractionConfig(
    dataset_slug="camnugent/california-housing-prices",
    file_name=file_name,
    destination_path=DATA_DIR,
    # The file inside the zip already has the desired name, so no rename
    # pass is needed (previously it was redundantly renamed onto itself).
    output_file_name=None,
)

# Download only once; reuse the already-extracted CSV on later runs.
if not (DATA_DIR / file_name).is_file():
    extractor.download_dataset(config)

Load dataset

1
2
3
df = pd.read_csv(DATA_DIR / "housing.csv")

df.head(n=2)
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
0 -122.23 37.88 41.0 880.0 129.0 322.0 126.0 8.3252 452600.0 NEAR BAY
1 -122.22 37.86 21.0 7099.0 1106.0 2401.0 1138.0 8.3014 358500.0 NEAR BAY
df = df.dropna()
df.info()
<class 'pandas.core.frame.DataFrame'>

Index: 20433 entries, 0 to 20639

Data columns (total 10 columns):

 #   Column              Non-Null Count  Dtype  

---  ------              --------------  -----  

 0   longitude           20433 non-null  float64

 1   latitude            20433 non-null  float64

 2   housing_median_age  20433 non-null  float64

 3   total_rooms         20433 non-null  float64

 4   total_bedrooms      20433 non-null  float64

 5   population          20433 non-null  float64

 6   households          20433 non-null  float64

 7   median_income       20433 non-null  float64

 8   median_house_value  20433 non-null  float64

 9   ocean_proximity     20433 non-null  object 

dtypes: float64(9), object(1)

memory usage: 1.7+ MB

Split

len(df) * 0.8
16346.400000000001
1
2
3
4
5
6
7
# Shuffle deterministically, then cut the frame into train/test partitions
# with clean 0-based indices.
df = df.sample(frac=1, random_state=2)
train_df = df.iloc[:17000].reset_index(drop=True)
test_df = df.iloc[17000:].reset_index(drop=True)

train_df.head()
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
0 -122.16 37.76 36.0 2781.0 574.0 1438.0 519.0 2.4598 155500.0 NEAR BAY
1 -119.95 36.80 30.0 1233.0 214.0 620.0 199.0 3.4297 112500.0 INLAND
2 -118.49 34.42 23.0 4166.0 756.0 2082.0 743.0 4.4107 213400.0 <1H OCEAN
3 -122.24 37.79 27.0 1632.0 492.0 1171.0 429.0 2.3173 125000.0 NEAR BAY
4 -121.45 36.86 11.0 1613.0 335.0 1617.0 342.0 3.1375 146200.0 INLAND

Converting target variables as array

1
2
3
4
y_train = train_df["median_house_value"].to_numpy()
y_test = test_df["median_house_value"].to_numpy()

y_train.shape, y_test.shape
((17000,), (3433,))

Baseline model, setting the result for every median_house_value as mean

1
2
3
4
average_median_house_value = train_df["median_house_value"].mean()
baseline_model_test_predictions = [average_median_house_value] * len(test_df)

mean_absolute_error(baseline_model_test_predictions, y_test)
90952.86688447764

Dimensionality Reduction

As dimensions increase, data becomes sparse, and it is difficult for the model to see patterns because there are no dense regions

  • Models overfit more easily because they can fit noise
  • Computational cost increases (more memory usage, training time increases, inference latency)
  • Feature redundancy for correlated features, and irrelevance for features that are just noise (carry no meaning)
  • Model instability: small changes in the data can cause big changes in predictions
  • Harder to debug and interpret

Correlation:

  • When the value is positive, as one variable increases the other also increases
  • When the value is negative, as one variable increases the other decreases

Principal Component Analysis (PCA) — Feature Extraction

Why is Important

  • Reduce dimensionality while preserving variance
  • Remove correlation and redundant features
  • Improve generalization and computational efficiency

When to Use

Apply PCA when:

  • Data is high-dimensional, correlated, or noisy
  • Using models sensitive to distance/geometry:
  • KNN, K-means, SVM
  • Feature count impacts training/inference cost

Avoid when:

  • Using neural networks (learn representations internally)
  • Using tree models (need original feature structure)
  • Interpretability or original feature meaning is required

Precondition

  • Apply standardization:
  • Zero mean, unit variance
  • Prevent dominance of high-variance features

Model Impact

  • KNN / K-means
  • More stable distance calculations
  • Less noise

  • SVM

  • Faster training
  • Less overfitting in high dimensions

  • Neural Networks

  • Usually unnecessary
  • Useful only for very high-dimensional, small datasets

  • Tree Models

  • Can degrade performance
  • Lose interpretability

Trade-offs

Aspect PCA Applied PCA Not Applied
Dimensionality Reduced High
Interpretability Lost Preserved
Noise Reduced Higher
Distance Stability Improved Degraded (high-dim)
Training Time Lower Higher
Information Partial Full
Complexity Higher (extra step) Lower

Key Constraint

  • PCA is linear:
  • Cannot model non-linear relationships
  • Assumes variance ≈ importance
train_df[["total_rooms", "total_bedrooms", "households"]].corr()
total_rooms total_bedrooms households
total_rooms 1.000000 0.931023 0.918161
total_bedrooms 0.931023 1.000000 0.979402
households 0.918161 0.979402 1.000000
1
2
3
4
5
X_train_3 = train_df[
    ["total_rooms", "total_bedrooms", "households"]
].to_numpy()

X_train_3.shape
(17000, 3)
1
2
3
X_test_3 = test_df[["total_rooms", "total_bedrooms", "households"]].to_numpy()

X_test_3.shape
(3433, 3)
1
2
3
4
5
forest_base = RandomForestRegressor(n_estimators=50, max_depth=5).fit(
    X_train_3, y_train
)
forest_base_test_predictions = forest_base.predict(X_test_3)
mean_absolute_error(y_test, forest_base_test_predictions)
82353.62932794042

Transform the 3 columns from X_train in 2 keeping the variance

1
2
3
4
pca = PCA(n_components=2)
pca = pca.fit(X_train_3)

print(pca)
PCA(n_components=2)
1
2
3
4
X_train_pca = pca.transform(X_train_3)
X_test_pca = pca.transform(X_test_3)

X_train_pca.shape, X_test_pca.shape
((17000, 2), (3433, 2))
1
2
3
4
5
6
forest_pca = RandomForestRegressor(n_estimators=50, max_depth=5).fit(
    X_train_pca, y_train
)
forest_pca_test_predictions = forest_pca.predict(X_test_pca)

mean_absolute_error(y_test, forest_pca_test_predictions)
79370.2691703226

Feature Scaling (Normalization / Standardization)

Why is Important

  • Ensure comparable feature magnitudes
  • Prevent one feature from dominating due to scale
  • Stabilize distance computations
  • Improve optimization convergence (neural networks)
  • Ensure consistent L1/L2 regularization

When to Use

Apply scaling when:

  • Using distance-based models:
  • KNN, K-means, SVM, PCA
  • Using gradient-based models:
  • Neural networks
  • Features have different numeric ranges

Avoid when:

  • Using tree-based models (Random Forest, XGBoost)
  • Use threshold splits
  • Do not rely on distance or scale

Key Constraint

  • Scaling changes feature scale only:
  • Does NOT reduce dimensionality
  • Does NOT remove noise
  • Does NOT fix curse of dimensionality

Get each value, subtract by mean and divide by standard deviation

  • Mean = 0
  • Standard deviation = 1

1
2
3
4
5
scaler = StandardScaler().fit(X_train_3)
X_train_3_scaled = scaler.transform(X_train_3)

plt.hist(X_train_3_scaled[:, 0])
plt.show()
output_33_0.png

Min-max style scaling maps values into [0, 1] — useful when a known maximum exists, like RGB (255)

  • Small values in [0, 1] are easier to work with than values in [0, 255]
  • NOTE(review): sklearn's Normalizer (used below) scales each sample (row) to unit norm — it is not min-max scaling; MinMaxScaler is the [0, 1] transformer. Verify which is intended.

1
2
3
4
5
normalizer = Normalizer().fit(X_train_3)
X_train_3_normalized = normalizer.transform(X_train_3)

plt.hist(X_train_3_normalized[:, 0])
plt.show()
output_35_0.png

1
2
3
4
5
6
scaler = StandardScaler().fit(X_train_3)
X_train_3_scaled = scaler.transform(X_train_3)

X_test_3_scaled = scaler.transform(X_test_3)

X_train_3_scaled.shape, X_test_3_scaled.shape
((17000, 3), (3433, 3))
1
2
3
4
5
6
7
8
random_forest_scaled = RandomForestRegressor(n_estimators=50, max_depth=5).fit(
    X_train_3_scaled, y_train
)
random_forest_scaled_test_predictions = random_forest_scaled.predict(
    X_test_3_scaled
)

mean_absolute_error(y_test, random_forest_scaled_test_predictions)
82187.41885158629

Create pipeline with StandardScaler

# Pipeline: standardize -> project onto 2 principal components -> forest.
# NOTE(review): scaling/PCA mainly benefit distance-based models; for a
# random forest this cell is primarily a demonstration of Pipeline chaining.
scale_pca_pipe_forest = Pipeline(
    steps=[
        ("Scaler", StandardScaler()),
        ("PCA", PCA(n_components=2)),
        ("Forest", RandomForestRegressor(n_estimators=50, max_depth=5)),
    ]
)

# Fit on the raw 3-feature matrix; the pipeline applies scaler and PCA
# internally, both at fit time and at predict time.
scale_pca_pipe_forest.fit(X_train_3, y_train)
scale_pca_pipe_forest_test_predictions = scale_pca_pipe_forest.predict(
    X_test_3
)

mean_absolute_error(y_test, scale_pca_pipe_forest_test_predictions)
80198.9302401591

Create pipeline with Normalizer

scale_pca_pipe_forest = Pipeline(
    steps=[
        ("Scaler", Normalizer()),
        ("PCA", PCA(n_components=2)),
        ("Forest", RandomForestRegressor(n_estimators=50, max_depth=5)),
    ]
)

scale_pca_pipe_forest.fit(X_train_3, y_train)
scale_pca_pipe_forest_test_predictions = scale_pca_pipe_forest.predict(
    X_test_3
)

mean_absolute_error(y_test, scale_pca_pipe_forest_test_predictions)
77346.30550039872

Categorical Encoding — One-Hot / Dummy Encoding

Why is important

  • Convert categorical variables into numeric representation without introducing ordinal relationships.
  • Prevent models from inferring false ordering or artificial distances between categories.
  • Preserve categorical independence via binary indicator features.

Example constraint:

  • Label encoding:
  • Red = 1, Blue = 2, Green = 3
    => Implies: Green > Blue > Red, distance(Green, Red) = 2 (invalid)

  • One-hot encoding:

  • Represent each category as independent binary vector:
Red Blue Green
1 0 0

Environmental Context (When)

Apply one-hot encoding when:

  • Model requires numeric input with linear or geometric assumptions:
  • Linear Regression
  • Logistic Regression
  • KNN
  • K-means
  • SVM (especially linear)
  • Model behavior depends on:
  • Distance metrics
  • Linear combinations

Avoid or deprioritize when:

  • Using tree-based models:
  • Random Forest
  • XGBoost
  • LightGBM
    These models:
  • Perform threshold-based splits (e.g., if color == "Red")
  • Do not rely on distance
  • Do not assume linear relationships

Execution Logic (How)

  1. Enumerate categories - Identify all unique values in categorical feature

  2. Instantiate binary columns - Create one column per category

  3. Assign indicator values - Set:

    • 1 => category present
    • 0 => otherwise
  4. Replace original feature - Drop original categorical column - Use binary feature matrix as model input

Comparative Analysis & Trade-offs

Dimension One-Hot Encoding Applied Label Encoding Applied
Ordinal Assumption None Introduced (invalid for nominal)
Distance Semantics Preserved (no artificial order) Distorted
Model Compatibility Linear, distance-based models Tree-based models
Dimensionality Increased (one column/category) Constant (single column)
Interpretability High (explicit categories) Lower (encoded values ambiguous)
Memory Usage Higher Lower

Key Constraints

  • One-hot encoding increases feature dimensionality:
  • Especially problematic with high cardinality (hundreds/thousands of categories)
  • Can negatively impact:
  • Memory usage
  • Training time
  • Model generalization (sparsity)

Usage Constraints

Avoid one-hot encoding when:

  • Feature has high cardinality
  • Using tree-based models
  • Memory or performance constraints are critical
train_dummies = pd.get_dummies(train_df["ocean_proximity"])
train_dummies.head()
<1H OCEAN INLAND ISLAND NEAR BAY NEAR OCEAN
0 False False False True False
1 False True False False False
2 True False False False False
3 False False False True False
4 False True False False False
train_df = pd.concat([train_df, train_dummies], axis=1)
train_df.head()
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity <1H OCEAN INLAND ISLAND NEAR BAY NEAR OCEAN
0 -122.16 37.76 36.0 2781.0 574.0 1438.0 519.0 2.4598 155500.0 NEAR BAY False False False True False
1 -119.95 36.80 30.0 1233.0 214.0 620.0 199.0 3.4297 112500.0 INLAND False True False False False
2 -118.49 34.42 23.0 4166.0 756.0 2082.0 743.0 4.4107 213400.0 <1H OCEAN True False False False False
3 -122.24 37.79 27.0 1632.0 492.0 1171.0 429.0 2.3173 125000.0 NEAR BAY False False False True False
4 -121.45 36.86 11.0 1613.0 335.0 1617.0 342.0 3.1375 146200.0 INLAND False True False False False
train_df["ocean_proximity"].value_counts()
ocean_proximity
<1H OCEAN     7522
INLAND        5408
NEAR OCEAN    2172
NEAR BAY      1895
ISLAND           3
Name: count, dtype: int64

Drop the column with less occurrences

train_df.drop("ISLAND", inplace=True, axis=1, errors="ignore")
train_df.head()
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity <1H OCEAN INLAND NEAR BAY NEAR OCEAN
0 -122.16 37.76 36.0 2781.0 574.0 1438.0 519.0 2.4598 155500.0 NEAR BAY False False True False
1 -119.95 36.80 30.0 1233.0 214.0 620.0 199.0 3.4297 112500.0 INLAND False True False False
2 -118.49 34.42 23.0 4166.0 756.0 2082.0 743.0 4.4107 213400.0 <1H OCEAN True False False False
3 -122.24 37.79 27.0 1632.0 492.0 1171.0 429.0 2.3173 125000.0 NEAR BAY False False True False
4 -121.45 36.86 11.0 1613.0 335.0 1617.0 342.0 3.1375 146200.0 INLAND False True False False
test_dummies = pd.get_dummies(test_df["ocean_proximity"])
test_dummies.head()
<1H OCEAN INLAND ISLAND NEAR BAY NEAR OCEAN
0 True False False False False
1 True False False False False
2 True False False False False
3 False True False False False
4 True False False False False
test_df = pd.concat([test_df, test_dummies], axis=1)
test_df.head()
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity <1H OCEAN INLAND ISLAND NEAR BAY NEAR OCEAN
0 -118.07 33.87 28.0 2399.0 436.0 1613.0 429.0 3.6339 220100.0 <1H OCEAN True False False False False
1 -118.26 34.02 40.0 1259.0 362.0 1499.0 327.0 1.8382 126400.0 <1H OCEAN True False False False False
2 -118.51 34.16 23.0 11154.0 1995.0 4076.0 1809.0 5.4609 500001.0 <1H OCEAN True False False False False
3 -120.04 36.95 36.0 1528.0 347.0 1334.0 304.0 1.3594 48300.0 INLAND False True False False False
4 -117.91 33.65 24.0 1494.0 494.0 814.0 459.0 2.1074 181300.0 <1H OCEAN True False False False False
test_df["ocean_proximity"].value_counts()
ocean_proximity
<1H OCEAN     1512
INLAND        1088
NEAR OCEAN     456
NEAR BAY       375
ISLAND           2
Name: count, dtype: int64
test_df.drop("ISLAND", inplace=True, axis=1, errors="ignore")
test_df.head()
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity <1H OCEAN INLAND NEAR BAY NEAR OCEAN
0 -118.07 33.87 28.0 2399.0 436.0 1613.0 429.0 3.6339 220100.0 <1H OCEAN True False False False
1 -118.26 34.02 40.0 1259.0 362.0 1499.0 327.0 1.8382 126400.0 <1H OCEAN True False False False
2 -118.51 34.16 23.0 11154.0 1995.0 4076.0 1809.0 5.4609 500001.0 <1H OCEAN True False False False
3 -120.04 36.95 36.0 1528.0 347.0 1334.0 304.0 1.3594 48300.0 INLAND False True False False
4 -117.91 33.65 24.0 1494.0 494.0 814.0 459.0 2.1074 181300.0 <1H OCEAN True False False False
X_train_dummies = train_df.to_numpy()[:, -4:]
X_train_dummies.shape
(17000, 4)
X_test_dummies = test_df.to_numpy()[:, -4:]
X_test_dummies.shape
(3433, 4)
1
2
3
4
linear_dummy = LinearRegression().fit(X_train_dummies, y_train)
linear_dummy_test_predictions = linear_dummy.predict(X_test_dummies)

mean_absolute_error(y_test, linear_dummy_test_predictions)
77095.0289334857

Binning (Discretization / Grouping)

Why is important

  • Encode non-linear relationships into discrete intervals for models assuming linearity.
  • Reduce noise by aggregating continuous values into stable groups.
  • Absorb outliers into boundary bins to limit their influence.
  • Improve interpretability via human-readable intervals.

When to Use

Apply binning when:

  • Using models with linear assumptions:
  • Linear Regression
  • Logistic Regression
  • Using models that operate on discrete distributions:
  • Naive Bayes
  • Continuous feature exhibits:
  • Non-linear relationship with target
  • High variance / noise
  • Outliers

Avoid or deprioritize when:

  • Using tree-based models:
  • Random Forest
  • XGBoost
  • LightGBM
    These models:
  • Perform implicit threshold-based splits (dynamic binning)
  • Using neural networks:
  • Learn non-linear patterns directly
  • Using distance-based models:
  • KNN
  • K-means Binning:
  • Destroys distance semantics
  • Makes nearby values appear unrelated

Key Constraint

  • Binning introduces information loss:
  • Replaces continuous variation with discrete intervals
  • May reduce predictive precision

  • Manual binning in tree-based models:

  • Duplicates internal splitting logic
  • Can degrade performance by removing fine-grained thresholds

1
2
3
train_df["housing_median_age"].hist()

plt.show()
output_56_0.png

1
2
3
4
5
train_df["median_age_less_than_30"] = (
    train_df["housing_median_age"] < 30
).astype(int)

train_df.head()
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity <1H OCEAN INLAND NEAR BAY NEAR OCEAN median_age_less_than_30
0 -122.16 37.76 36.0 2781.0 574.0 1438.0 519.0 2.4598 155500.0 NEAR BAY False False True False 0
1 -119.95 36.80 30.0 1233.0 214.0 620.0 199.0 3.4297 112500.0 INLAND False True False False 0
2 -118.49 34.42 23.0 4166.0 756.0 2082.0 743.0 4.4107 213400.0 <1H OCEAN True False False False 1
3 -122.24 37.79 27.0 1632.0 492.0 1171.0 429.0 2.3173 125000.0 NEAR BAY False False True False 1
4 -121.45 36.86 11.0 1613.0 335.0 1617.0 342.0 3.1375 146200.0 INLAND False True False False 1
1
2
3
4
X_train_median_age = (
    train_df["median_age_less_than_30"].to_numpy().reshape(-1, 1)
)
X_train_median_age.shape
(17000, 1)
1
2
3
4
5
test_df["median_age_less_than_30"] = (
    test_df["housing_median_age"] < 30
).astype(int)

test_df.head()
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity <1H OCEAN INLAND NEAR BAY NEAR OCEAN median_age_less_than_30
0 -118.07 33.87 28.0 2399.0 436.0 1613.0 429.0 3.6339 220100.0 <1H OCEAN True False False False 1
1 -118.26 34.02 40.0 1259.0 362.0 1499.0 327.0 1.8382 126400.0 <1H OCEAN True False False False 0
2 -118.51 34.16 23.0 11154.0 1995.0 4076.0 1809.0 5.4609 500001.0 <1H OCEAN True False False False 1
3 -120.04 36.95 36.0 1528.0 347.0 1334.0 304.0 1.3594 48300.0 INLAND False True False False 0
4 -117.91 33.65 24.0 1494.0 494.0 814.0 459.0 2.1074 181300.0 <1H OCEAN True False False False 1
1
2
3
4
X_test_median_age = (
    test_df["median_age_less_than_30"].to_numpy().reshape(-1, 1)
)
X_test_median_age.shape
(3433, 1)
1
2
3
4
5
linear_median_age = LinearRegression().fit(X_train_median_age, y_train)
linear_median_age_test_predictions = linear_median_age.predict(
    X_test_median_age
)
mean_absolute_error(y_test, linear_median_age_test_predictions)
90794.78827572097

Clustering (Unsupervised Grouping)

Why is it important

  • Partition data into groups of similar data points without labels.
  • Discover latent structure not explicitly defined:
  • Customer segments
  • Fraud patterns
  • User behavior
  • Enable feature engineering via cluster-derived features:
  • Cluster ID
  • Distance to cluster centers
  • Support data compression and summarization:
  • Reduce large datasets to representative groups
  • Enable anomaly detection:
  • Identify points far from cluster structure (e.g., fraud, system anomalies)

When to Use

Apply clustering when:

  • Labels are unavailable
  • Data is expected to contain natural group structure
  • Distance/similarity metrics are meaningful
  • Use cases include:
  • Customer behavior analysis
  • Transaction pattern detection
  • System monitoring

Use clustering outputs as features when:

  • Enhancing downstream models:
  • Linear models
  • Tree models
  • Need to introduce non-linear structure into simpler models
  • (Clustering acts similarly to binning for linear models)

Combine with preprocessing when:

  • Applying standardization (ensure valid distance computation)
  • Applying PCA (reduce dimensionality before clustering)

Preprocess data

  • Apply standardization (scale features)
  • Optionally apply PCA (reduce dimensionality)

Key Constraints

  • Clustering assumes meaningful distance/similarity metrics
  • Performance degrades in:
  • High-dimensional spaces (curse of dimensionality)
  • Results depend on:
  • Algorithm choice (K-means, DBSCAN, hierarchical)
  • Data distribution and scaling
  • Often requires:
  • Standardization (mandatory for distance-based clustering)
  • PCA (optional, improves clustering in high dimensions)

1
2
3
plt.scatter(train_df["longitude"], train_df["latitude"])

plt.show()
output_63_0.png

1
2
3
X_train_lat_long = train_df[["longitude", "latitude"]].to_numpy()
kmeans = KMeans(n_clusters=7).fit(X_train_lat_long)
kmeans.labels_
array([0, 3, 5, ..., 2, 5, 5], shape=(17000,), dtype=int32)
1
2
3
px.scatter(
    x=train_df["longitude"], y=train_df["latitude"], color=kmeans.labels_
)

---------------------------------------------------------------------------ValueError Traceback (most recent call last)File ~/.cache/uv/archive-v0/Trc0P-FTOEtFceW2c_C0B/lib/python3.11/site-packages/IPython/core/formatters.py:984, in IPythonDisplayFormatter.call(self, obj) 982 method = get_real_method(obj, self.print_method) 983 if method is not None: --> 984 method() 985 return TrueFile ~/.cache/uv/builds-v0/.tmpAAwKXY/lib/python3.11/site-packages/plotly/basedatatypes.py:850, in BaseFigure.ipython_display(self) 847 import plotly.io as pio 849 if pio.renderers.render_on_display and pio.renderers.default: --> 850 pio.show(self) 851 else: 852 print(repr(self))File ~/.cache/uv/builds-v0/.tmpAAwKXY/lib/python3.11/site-packages/plotly/io/renderers.py:415, in show(fig, renderer, validate, **kwargs) 410 raise ValueError( 411 "Mime type rendering requires ipython but it is not installed" 412 ) 414 if not nbformat or Version(nbformat.__version_) < Version("4.2.0"): --> 415 raise ValueError( 416 "Mime type rendering requires nbformat>=4.2.0 but it is not installed" 417 ) 419 display_jupyter_version_warnings() 421 ipython_display.display(bundle, raw=True)ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

X_train_clustering = pd.get_dummies(pd.Series(kmeans.labels_)).to_numpy()
X_train_clustering
array([[ True, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False,  True, False],
       ...,
       [False, False,  True, ..., False, False, False],
       [False, False, False, ..., False,  True, False],
       [False, False, False, ..., False,  True, False]], shape=(17000, 7))
1
2
3
4
5
X_test_lat_long = test_df[["longitude", "latitude"]].to_numpy()
X_test_clustering = pd.get_dummies(
    pd.Series(kmeans.predict(X_test_lat_long))
).to_numpy()
X_test_clustering.shape
(3433, 7)
1
2
3
4
5
6
linear_clustering = LinearRegression().fit(X_train_clustering, y_train)
linear_clustering_test_predictions = linear_clustering.predict(
    X_test_clustering
)

mean_absolute_error(linear_clustering_test_predictions, y_test)
75498.33350007689

Feature Selection (Subset Selection / Dimensionality Reduction via Selection)

Why is Important

  • Reduce overfitting by eliminating features that enable memorization of noise.
  • Improve model performance by retaining only informative features.
  • Reduce computational cost (training/inference).
  • Improve interpretability by simplifying feature space.
  • Handle multicollinearity by selecting representative features and removing redundant ones.

When to use

Apply feature selection when:

  • Feature space is large (> 50–100 features).
  • Using models sensitive to:
  • Linear relationships:
    • Linear Regression
    • Logistic Regression
  • Distance/similarity metrics:
    • KNN
    • K-Means
  • Using models sensitive to:
  • Noise and dimensionality:
    • SVM
  • Interpretability is required.

Model-specific benefits:

  • Linear / Logistic Regression:
  • Improves stability and coefficient interpretability
  • KNN / K-Means:
  • Improves distance quality (removes irrelevant dimensions)
  • SVM:
  • Reduces noise and accelerates training

Methods

Filter Methods (model-agnostic):

  • Compute statistical metrics per feature:
  • Correlation
  • Variance Threshold
  • Mutual Information
  • Remove features based on thresholds

Wrapper Methods (model-dependent):

  • Iterate over feature subsets
  • Train model and evaluate performance
  • Example:
  • Recursive Feature Elimination (RFE)

Embedded Methods (model-integrated):

  • Perform selection during training:
  • L1 regularization (Lasso): drives coefficients to zero
  • Tree-based feature importance
  • Boruta algorithm
Method What It Does How It Works Strengths Limitations / Constraints Best Use Case
Correlation (Pearson) Removes redundant linear relationships Computes pairwise correlation, drops one of highly correlated features Simple, effective for multicollinearity Only captures linear relationships Linear models, multicollinearity control
Mutual Information Captures dependency with target Measures information gain between feature and target Detects non-linear relationships Slower, less interpretable than correlation General feature relevance (non-linear)
RFE (Recursive Feature Elimination) Selects optimal subset via iterative pruning Trains model, removes least important features iteratively High-quality subset selection Computationally expensive Medium feature sets, high accuracy requirement
L1 (Lasso) Enforces sparsity (zero coefficients) Adds L1 penalty → minimizes loss + λ∑|w| → coefficients shrink to zero Efficient, built-in selection Unstable with correlated features Linear models, high-dimensional data
Tree Feature Importance Ranks features by predictive contribution Aggregates impurity reduction (Gini/MSE) across splits Captures non-linearities, interactions Bias toward high-cardinality features Tree-based models (RF, XGBoost, LightGBM)
Permutation Importance Measures impact on model performance Shuffles feature → measures performance drop Model-agnostic, more reliable importance Computationally expensive Post-training validation of importance
Boruta Identifies all relevant features (all-relevant) Compares real vs shadow features using RF importance Robust, avoids missing weak signals Expensive, slower High-stakes feature selection

Key Constraints

  • Removing features may:
  • Discard useful signal if improperly configured
  • Wrapper methods:
  • Require cross-validation to control overfitting
  • Embedded methods:
  • Depend on model assumptions:
    • L1 => sparsity assumption
    • Trees => split-based importance bias
  • Feature selection must be:
  • Applied consistently in training and inference pipelines
X_train_clustering.shape, X_train_3_scaled.shape, X_train_dummies.shape
((17000, 7), (17000, 3), (17000, 4))
1
2
3
4
X_train_full = np.concatenate(
    [X_train_clustering, X_train_3_scaled, X_train_dummies], axis=1
)
X_train_full.shape
(17000, 14)
X_test_clustering.shape, X_test_3_scaled.shape, X_test_dummies.shape
((3433, 7), (3433, 3), (3433, 4))
1
2
3
4
X_test_full = np.concatenate(
    [X_test_clustering, X_test_3_scaled, X_test_dummies], axis=1
)
X_test_full.shape
(3433, 14)
1
2
3
4
# Train the final forest on the full 14-column feature matrix and score it.
random_forest_full = RandomForestRegressor().fit(X_train_full, y_train)
random_forest_full_test_predictions = random_forest_full.predict(X_test_full)

# BUG FIX: this previously evaluated `random_forest_clustering_test_predictions`,
# a name never defined in this notebook (NameError on a fresh run); score the
# predictions computed just above instead.
mean_absolute_error(y_test, random_forest_full_test_predictions)
67617.36133866961