Skip to content

California Housing - Feature Engineering

Install packages

!uv pip install -q \
    python-dotenv==1.2.1 \
    pandas==2.3.2 \
    pandas-stubs==2.3.2.250827 \
    numpy==2.3.2 \
    matplotlib==3.10.6 \
    seaborn==0.13.2 \
    scikit-learn==1.7.1 \
    tqdm==4.67.1 \
    plotly==6.6.0 \
    nbformat==5.10.4

Append notebooks directory to sys.path

1
2
3
import sys

sys.path.append("../../..")

Import packages

import os
import pathlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from dotenv import load_dotenv
from sklearn.metrics import (
    mean_absolute_error
)
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from notebooks.python.utils.data_extraction.data_extraction import (
    KaggleDataExtractor,
    KaggleExtractionConfig,
)
from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
import plotly.express as px


pd.set_option("display.max_columns", None)

sns.set_style("darkgrid")
sns.set_theme(style="darkgrid")

%matplotlib inline

load_dotenv()  # Root directory .env file
True

Utility scripts:

KaggleDataExtractor:

import base64
import io
import logging
import os
import zipfile
from abc import ABC, abstractmethod
from dataclasses import dataclass
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)-8s | %(name)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

logger = logging.getLogger("KaggleExtractor")


class ExtractionConfig(ABC):
    """Marker base class for extraction configuration objects."""


class DataExtractionStrategy(ABC):
    """Interface for dataset-download strategies."""

    @abstractmethod
    def download_dataset(self, config: ExtractionConfig) -> None:
        """Download and extract the dataset described by *config*."""


@dataclass(frozen=True)
class KaggleExtractionConfig(ExtractionConfig):
    """Immutable settings for downloading one file from a Kaggle dataset."""

    dataset_slug: str  # e.g. "zynicide/wine-reviews"
    file_name: str  # file inside the Kaggle zip
    destination_path: str  # folder to extract to
    output_file_name: str | None = None  # optional rename


class KaggleDataExtractor(DataExtractionStrategy):
    """Downloads and extracts a single file from a Kaggle dataset zip.

    Authenticates against the Kaggle REST API with HTTP Basic auth built
    from the username / API-token pair.
    """

    def __init__(self, username: str, api_token: str) -> None:
        self.username = username
        self.api_token = api_token
        self.auth_header = self._create_auth_header()

    def _create_auth_header(self) -> dict[str, str]:
        """Build the HTTP Basic-Auth header expected by the Kaggle API."""
        token = f"{self.username}:{self.api_token}"
        base64_token = base64.b64encode(token.encode()).decode()
        return {"Authorization": f"Basic {base64_token}"}

    def download_dataset(self, config: ExtractionConfig) -> None:
        """Download the dataset zip and extract the configured file.

        Raises:
            TypeError: if *config* is not a ``KaggleExtractionConfig``.

        Network and zip failures are logged rather than re-raised, so a
        failed download leaves no exception for the caller to handle.
        """
        if not isinstance(config, KaggleExtractionConfig):
            raise TypeError("config must be a KaggleExtractionConfig instance")

        url = f"https://www.kaggle.com/api/v1/datasets/download/{config.dataset_slug}"
        request = Request(url, headers=self.auth_header)

        # Lazy %-style args: the message is only formatted if the record
        # is actually emitted.
        logger.info("Starting download from Kaggle: %s", url)

        try:
            with urlopen(request) as response:
                data = response.read()
            logger.info("Download completed. Extracting zip file...")

            os.makedirs(config.destination_path, exist_ok=True)

            with zipfile.ZipFile(io.BytesIO(data)) as z:
                # extract() returns the actual on-disk path, which is the
                # authoritative source for a subsequent rename.
                extracted_path = z.extract(
                    config.file_name, path=config.destination_path
                )

            if config.output_file_name is not None:
                new_path = os.path.join(
                    config.destination_path, config.output_file_name
                )
                # os.replace overwrites atomically on every platform;
                # os.rename would raise on Windows if new_path exists.
                os.replace(extracted_path, new_path)

            logger.info(
                "Dataset '%s' extracted successfully to: %s",
                config.file_name,
                config.destination_path,
            )

        except HTTPError as e:
            logger.error("HTTP Error %s: %s", e.code, e.reason)
        except URLError as e:
            logger.error("URL Error: %s", e.reason)
        except zipfile.BadZipFile:
            logger.error(
                "Failed to read zip file. Kaggle may have returned HTML instead of a zip."
            )
        except Exception as e:
            logger.exception(f"Unexpected error occurred: {e}")

Create data directory

1
2
3
# Data directory + one-time Kaggle download of the housing CSV.
DATA_DIR = pathlib.Path("data/california-housing-feature-engineering")
DATA_DIR.mkdir(parents=True, exist_ok=True)

username = os.getenv("KAGGLE_USERNAME")
api_token = os.getenv("KAGGLE_API_TOKEN")
file_name = "housing.csv"

extractor = KaggleDataExtractor(username=username, api_token=api_token)

config = KaggleExtractionConfig(
    dataset_slug="camnugent/california-housing-prices",
    file_name=file_name,
    destination_path=DATA_DIR,
    # The file inside the zip already has the desired name, so no rename
    # pass is needed (previously it was redundantly renamed onto itself).
    output_file_name=None,
)

# Download only once; reuse the already-extracted CSV on later runs.
if not (DATA_DIR / file_name).is_file():
    extractor.download_dataset(config)

Load dataset

1
2
3
df = pd.read_csv(DATA_DIR / "housing.csv")

df.head(n=2)
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
0 -122.23 37.88 41.0 880.0 129.0 322.0 126.0 8.3252 452600.0 NEAR BAY
1 -122.22 37.86 21.0 7099.0 1106.0 2401.0 1138.0 8.3014 358500.0 NEAR BAY
df = df.dropna()
df.info()
<class 'pandas.core.frame.DataFrame'>

Index: 20433 entries, 0 to 20639

Data columns (total 10 columns):

 #   Column              Non-Null Count  Dtype  

---  ------              --------------  -----  

 0   longitude           20433 non-null  float64

 1   latitude            20433 non-null  float64

 2   housing_median_age  20433 non-null  float64

 3   total_rooms         20433 non-null  float64

 4   total_bedrooms      20433 non-null  float64

 5   population          20433 non-null  float64

 6   households          20433 non-null  float64

 7   median_income       20433 non-null  float64

 8   median_house_value  20433 non-null  float64

 9   ocean_proximity     20433 non-null  object 

dtypes: float64(9), object(1)

memory usage: 1.7+ MB

Split

len(df) * 0.8
16346.400000000001
1
2
3
4
5
6
7
# Shuffle deterministically, then cut the frame into train/test partitions
# with clean 0-based indices.
df = df.sample(frac=1, random_state=2)
train_df = df.iloc[:17000].reset_index(drop=True)
test_df = df.iloc[17000:].reset_index(drop=True)

train_df.head()
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
0 -122.16 37.76 36.0 2781.0 574.0 1438.0 519.0 2.4598 155500.0 NEAR BAY
1 -119.95 36.80 30.0 1233.0 214.0 620.0 199.0 3.4297 112500.0 INLAND
2 -118.49 34.42 23.0 4166.0 756.0 2082.0 743.0 4.4107 213400.0 <1H OCEAN
3 -122.24 37.79 27.0 1632.0 492.0 1171.0 429.0 2.3173 125000.0 NEAR BAY
4 -121.45 36.86 11.0 1613.0 335.0 1617.0 342.0 3.1375 146200.0 INLAND

Converting target variables as array

1
2
3
4
y_train = train_df["median_house_value"].to_numpy()
y_test = test_df["median_house_value"].to_numpy()

y_train.shape, y_test.shape
((17000,), (3433,))

Baseline model, setting the result for every median_house_value as mean

1
2
3
4
average_median_house_value = train_df["median_house_value"].mean()
baseline_model_test_predictions = [average_median_house_value] * len(test_df)

mean_absolute_error(baseline_model_test_predictions, y_test)
90952.86688447764

Dimensionality Reduction

As dimensions increase, data becomes sparse, and it is difficult for the model to see patterns because there are no dense regions

  • Models overfit more easily because they can fit noise
  • Computational cost increases (more memory usage, training time increases, inference latency)
  • Feature redundancy for correlated features, and irrelevance for features that are just noise (carry no meaning)
  • Model instability: small changes in the data can cause big changes in predictions
  • Harder to debug and interpret

Correlation:

  • When the value is positive, as one variable increases the other also increases
  • When the value is negative, as one variable increases the other decreases

Principal Component Analysis (PCA) — Feature Extraction

Why is Important

  • Reduce dimensionality while preserving variance
  • Remove correlation and redundant features
  • Improve generalization and computational efficiency

When to Use

Apply PCA when:

  • Data is high-dimensional, correlated, or noisy
  • Using models sensitive to distance/geometry:
  • KNN, K-means, SVM
  • Feature count impacts training/inference cost

Avoid when:

  • Using neural networks (learn representations internally)
  • Using tree models (need original feature structure)
  • Interpretability or original feature meaning is required

Precondition

  • Apply standardization:
  • Zero mean, unit variance
  • Prevent dominance of high-variance features

Model Impact

  • KNN / K-means
  • More stable distance calculations
  • Less noise

  • SVM

  • Faster training
  • Less overfitting in high dimensions

  • Neural Networks

  • Usually unnecessary
  • Useful only for very high-dimensional, small datasets

  • Tree Models

  • Can degrade performance
  • Lose interpretability

Trade-offs

Aspect PCA Applied PCA Not Applied
Dimensionality Reduced High
Interpretability Lost Preserved
Noise Reduced Higher
Distance Stability Improved Degraded (high-dim)
Training Time Lower Higher
Information Partial Full
Complexity Higher (extra step) Lower

Key Constraint

  • PCA is linear:
  • Cannot model non-linear relationships
  • Assumes variance ≈ importance
train_df[["total_rooms", "total_bedrooms", "households"]].corr()
total_rooms total_bedrooms households
total_rooms 1.000000 0.931023 0.918161
total_bedrooms 0.931023 1.000000 0.979402
households 0.918161 0.979402 1.000000
1
2
3
4
5
X_train_3 = train_df[
    ["total_rooms", "total_bedrooms", "households"]
].to_numpy()

X_train_3.shape
(17000, 3)
1
2
3
X_test_3 = test_df[["total_rooms", "total_bedrooms", "households"]].to_numpy()

X_test_3.shape
(3433, 3)
1
2
3
4
5
forest_base = RandomForestRegressor(n_estimators=50, max_depth=5).fit(
    X_train_3, y_train
)
forest_base_test_predictions = forest_base.predict(X_test_3)
mean_absolute_error(y_test, forest_base_test_predictions)
82353.62932794042

Transform the 3 columns from X_train in 2 keeping the variance

1
2
3
4
pca = PCA(n_components=2)
pca = pca.fit(X_train_3)

print(pca)
PCA(n_components=2)
1
2
3
4
X_train_pca = pca.transform(X_train_3)
X_test_pca = pca.transform(X_test_3)

X_train_pca.shape, X_test_pca.shape
((17000, 2), (3433, 2))
1
2
3
4
5
6
forest_pca = RandomForestRegressor(n_estimators=50, max_depth=5).fit(
    X_train_pca, y_train
)
forest_pca_test_predictions = forest_pca.predict(X_test_pca)

mean_absolute_error(y_test, forest_pca_test_predictions)
79370.2691703226

Feature Scaling (Normalization / Standardization)

Why is Important

  • Ensure comparable feature magnitudes
  • Prevent one feature from dominating due to scale
  • Stabilize distance computations
  • Improve optimization convergence (neural networks)
  • Ensure consistent L1/L2 regularization

When to Use

Apply scaling when:

  • Using distance-based models:
  • KNN, K-means, SVM, PCA
  • Using gradient-based models:
  • Neural networks
  • Features have different numeric ranges

Avoid when:

  • Using tree-based models (Random Forest, XGBoost)
  • Use threshold splits
  • Do not rely on distance or scale

Key Constraint

  • Scaling changes feature scale only:
  • Does NOT reduce dimensionality
  • Does NOT remove noise
  • Does NOT fix curse of dimensionality

Get each value, subtract by mean and divide by standard deviation

  • Mean = 0
  • Standard deviation = 1

1
2
3
4
5
scaler = StandardScaler().fit(X_train_3)
X_train_3_scaled = scaler.transform(X_train_3)

plt.hist(X_train_3_scaled[:, 0])
plt.show()
output_33_0.png

Min-max style scaling maps values into [0, 1] — useful when a known maximum exists, like RGB (255)

  • Small values in [0, 1] are easier to work with than values in [0, 255]
  • NOTE(review): sklearn's Normalizer (used below) scales each sample (row) to unit norm — it is not min-max scaling; MinMaxScaler is the [0, 1] transformer. Verify which is intended.

1
2
3
4
5
normalizer = Normalizer().fit(X_train_3)
X_train_3_normalized = normalizer.transform(X_train_3)

plt.hist(X_train_3_normalized[:, 0])
plt.show()
output_35_0.png

1
2
3
4
5
6
scaler = StandardScaler().fit(X_train_3)
X_train_3_scaled = scaler.transform(X_train_3)

X_test_3_scaled = scaler.transform(X_test_3)

X_train_3_scaled.shape, X_test_3_scaled.shape
((17000, 3), (3433, 3))
1
2
3
4
5
6
7
8
random_forest_scaled = RandomForestRegressor(n_estimators=50, max_depth=5).fit(
    X_train_3_scaled, y_train
)
random_forest_scaled_test_predictions = random_forest_scaled.predict(
    X_test_3_scaled
)

mean_absolute_error(y_test, random_forest_scaled_test_predictions)
82187.41885158629

Create pipeline with StandardScaler

# Pipeline: standardize -> project onto 2 principal components -> forest.
# NOTE(review): scaling/PCA mainly benefit distance-based models; for a
# random forest this cell is primarily a demonstration of Pipeline chaining.
scale_pca_pipe_forest = Pipeline(
    steps=[
        ("Scaler", StandardScaler()),
        ("PCA", PCA(n_components=2)),
        ("Forest", RandomForestRegressor(n_estimators=50, max_depth=5)),
    ]
)

# Fit on the raw 3-feature matrix; the pipeline applies scaler and PCA
# internally, both at fit time and at predict time.
scale_pca_pipe_forest.fit(X_train_3, y_train)
scale_pca_pipe_forest_test_predictions = scale_pca_pipe_forest.predict(
    X_test_3
)

mean_absolute_error(y_test, scale_pca_pipe_forest_test_predictions)
80198.9302401591

Create pipeline with Normalizer

scale_pca_pipe_forest = Pipeline(
    steps=[
        ("Scaler", Normalizer()),
        ("PCA", PCA(n_components=2)),
        ("Forest", RandomForestRegressor(n_estimators=50, max_depth=5)),
    ]
)

scale_pca_pipe_forest.fit(X_train_3, y_train)
scale_pca_pipe_forest_test_predictions = scale_pca_pipe_forest.predict(
    X_test_3
)

mean_absolute_error(y_test, scale_pca_pipe_forest_test_predictions)
77346.30550039872

Categorical Encoding — One-Hot / Dummy Encoding

Why is important

  • Convert categorical variables into numeric representation without introducing ordinal relationships.
  • Prevent models from inferring false ordering or artificial distances between categories.
  • Preserve categorical independence via binary indicator features.

Example constraint:

  • Label encoding:
  • Red = 1, Blue = 2, Green = 3
    => Implies: Green > Blue > Red, distance(Green, Red) = 2 (invalid)

  • One-hot encoding:

  • Represent each category as independent binary vector:
Red Blue Green
1 0 0

Environmental Context (When)

Apply one-hot encoding when:

  • Model requires numeric input with linear or geometric assumptions:
  • Linear Regression
  • Logistic Regression
  • KNN
  • K-means
  • SVM (especially linear)
  • Model behavior depends on:
  • Distance metrics
  • Linear combinations

Avoid or deprioritize when:

  • Using tree-based models:
  • Random Forest
  • XGBoost
  • LightGBM
    These models:
  • Perform threshold-based splits (e.g., if color == "Red")
  • Do not rely on distance
  • Do not assume linear relationships

Execution Logic (How)

  1. Enumerate categories - Identify all unique values in categorical feature

  2. Instantiate binary columns - Create one column per category

  3. Assign indicator values - Set:

    • 1 => category present
    • 0 => otherwise
  4. Replace original feature - Drop original categorical column - Use binary feature matrix as model input

Comparative Analysis & Trade-offs

Dimension One-Hot Encoding Applied Label Encoding Applied
Ordinal Assumption None Introduced (invalid for nominal)
Distance Semantics Preserved (no artificial order) Distorted
Model Compatibility Linear, distance-based models Tree-based models
Dimensionality Increased (one column/category) Constant (single column)
Interpretability High (explicit categories) Lower (encoded values ambiguous)
Memory Usage Higher Lower

Key Constraints

  • One-hot encoding increases feature dimensionality:
  • Especially problematic with high cardinality (hundreds/thousands of categories)
  • Can negatively impact:
  • Memory usage
  • Training time
  • Model generalization (sparsity)

Usage Constraints

Avoid one-hot encoding when:

  • Feature has high cardinality
  • Using tree-based models
  • Memory or performance constraints are critical
train_dummies = pd.get_dummies(train_df["ocean_proximity"])
train_dummies.head()
<1H OCEAN INLAND ISLAND NEAR BAY NEAR OCEAN
0 False False False True False
1 False True False False False
2 True False False False False
3 False False False True False
4 False True False False False
train_df = pd.concat([train_df, train_dummies], axis=1)
train_df.head()
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity <1H OCEAN INLAND ISLAND NEAR BAY NEAR OCEAN
0 -122.16 37.76 36.0 2781.0 574.0 1438.0 519.0 2.4598 155500.0 NEAR BAY False False False True False
1 -119.95 36.80 30.0 1233.0 214.0 620.0 199.0 3.4297 112500.0 INLAND False True False False False
2 -118.49 34.42 23.0 4166.0 756.0 2082.0 743.0 4.4107 213400.0 <1H OCEAN True False False False False
3 -122.24 37.79 27.0 1632.0 492.0 1171.0 429.0 2.3173 125000.0 NEAR BAY False False False True False
4 -121.45 36.86 11.0 1613.0 335.0 1617.0 342.0 3.1375 146200.0 INLAND False True False False False
train_df["ocean_proximity"].value_counts()
ocean_proximity
<1H OCEAN     7522
INLAND        5408
NEAR OCEAN    2172
NEAR BAY      1895
ISLAND           3
Name: count, dtype: int64

Drop the column with less occurrences

train_df.drop("ISLAND", inplace=True, axis=1, errors="ignore")
train_df.head()
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity <1H OCEAN INLAND NEAR BAY NEAR OCEAN
0 -122.16 37.76 36.0 2781.0 574.0 1438.0 519.0 2.4598 155500.0 NEAR BAY False False True False
1 -119.95 36.80 30.0 1233.0 214.0 620.0 199.0 3.4297 112500.0 INLAND False True False False
2 -118.49 34.42 23.0 4166.0 756.0 2082.0 743.0 4.4107 213400.0 <1H OCEAN True False False False
3 -122.24 37.79 27.0 1632.0 492.0 1171.0 429.0 2.3173 125000.0 NEAR BAY False False True False
4 -121.45 36.86 11.0 1613.0 335.0 1617.0 342.0 3.1375 146200.0 INLAND False True False False
test_dummies = pd.get_dummies(test_df["ocean_proximity"])
test_dummies.head()
<1H OCEAN INLAND ISLAND NEAR BAY NEAR OCEAN
0 True False False False False
1 True False False False False
2 True False False False False
3 False True False False False
4 True False False False False
test_df = pd.concat([test_df, test_dummies], axis=1)
test_df.head()
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity <1H OCEAN INLAND ISLAND NEAR BAY NEAR OCEAN
0 -118.07 33.87 28.0 2399.0 436.0 1613.0 429.0 3.6339 220100.0 <1H OCEAN True False False False False
1 -118.26 34.02 40.0 1259.0 362.0 1499.0 327.0 1.8382 126400.0 <1H OCEAN True False False False False
2 -118.51 34.16 23.0 11154.0 1995.0 4076.0 1809.0 5.4609 500001.0 <1H OCEAN True False False False False
3 -120.04 36.95 36.0 1528.0 347.0 1334.0 304.0 1.3594 48300.0 INLAND False True False False False
4 -117.91 33.65 24.0 1494.0 494.0 814.0 459.0 2.1074 181300.0 <1H OCEAN True False False False False
test_df["ocean_proximity"].value_counts()
ocean_proximity
<1H OCEAN     1512
INLAND        1088
NEAR OCEAN     456
NEAR BAY       375
ISLAND           2
Name: count, dtype: int64
test_df.drop("ISLAND", inplace=True, axis=1, errors="ignore")
test_df.head()
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity <1H OCEAN INLAND NEAR BAY NEAR OCEAN
0 -118.07 33.87 28.0 2399.0 436.0 1613.0 429.0 3.6339 220100.0 <1H OCEAN True False False False
1 -118.26 34.02 40.0 1259.0 362.0 1499.0 327.0 1.8382 126400.0 <1H OCEAN True False False False
2 -118.51 34.16 23.0 11154.0 1995.0 4076.0 1809.0 5.4609 500001.0 <1H OCEAN True False False False
3 -120.04 36.95 36.0 1528.0 347.0 1334.0 304.0 1.3594 48300.0 INLAND False True False False
4 -117.91 33.65 24.0 1494.0 494.0 814.0 459.0 2.1074 181300.0 <1H OCEAN True False False False
X_train_dummies = train_df.to_numpy()[:, -4:]
X_train_dummies.shape
(17000, 4)
X_test_dummies = test_df.to_numpy()[:, -4:]
X_test_dummies.shape
(3433, 4)
1
2
3
4
linear_dummy = LinearRegression().fit(X_train_dummies, y_train)
linear_dummy_test_predictions = linear_dummy.predict(X_test_dummies)

mean_absolute_error(y_test, linear_dummy_test_predictions)
77095.0289334857

Binning (Discretization / Grouping)

Why is important

  • Encode non-linear relationships into discrete intervals for models assuming linearity.
  • Reduce noise by aggregating continuous values into stable groups.
  • Absorb outliers into boundary bins to limit their influence.
  • Improve interpretability via human-readable intervals.

When to Use

Apply binning when:

  • Using models with linear assumptions:
  • Linear Regression
  • Logistic Regression
  • Using models that operate on discrete distributions:
  • Naive Bayes
  • Continuous feature exhibits:
  • Non-linear relationship with target
  • High variance / noise
  • Outliers

Avoid or deprioritize when:

  • Using tree-based models:
  • Random Forest
  • XGBoost
  • LightGBM
    These models:
  • Perform implicit threshold-based splits (dynamic binning)
  • Using neural networks:
  • Learn non-linear patterns directly
  • Using distance-based models:
  • KNN
  • K-means Binning:
  • Destroys distance semantics
  • Makes nearby values appear unrelated

Key Constraint

  • Binning introduces information loss:
  • Replaces continuous variation with discrete intervals
  • May reduce predictive precision

  • Manual binning in tree-based models:

  • Duplicates internal splitting logic
  • Can degrade performance by removing fine-grained thresholds

1
2
3
train_df["housing_median_age"].hist()

plt.show()
output_56_0.png

1
2
3
4
5
train_df["median_age_less_than_30"] = (
    train_df["housing_median_age"] < 30
).astype(int)

train_df.head()
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity <1H OCEAN INLAND NEAR BAY NEAR OCEAN median_age_less_than_30
0 -122.16 37.76 36.0 2781.0 574.0 1438.0 519.0 2.4598 155500.0 NEAR BAY False False True False 0
1 -119.95 36.80 30.0 1233.0 214.0 620.0 199.0 3.4297 112500.0 INLAND False True False False 0
2 -118.49 34.42 23.0 4166.0 756.0 2082.0 743.0 4.4107 213400.0 <1H OCEAN True False False False 1
3 -122.24 37.79 27.0 1632.0 492.0 1171.0 429.0 2.3173 125000.0 NEAR BAY False False True False 1
4 -121.45 36.86 11.0 1613.0 335.0 1617.0 342.0 3.1375 146200.0 INLAND False True False False 1
1
2
3
4
X_train_median_age = (
    train_df["median_age_less_than_30"].to_numpy().reshape(-1, 1)
)
X_train_median_age.shape
(17000, 1)
1
2
3
4
5
test_df["median_age_less_than_30"] = (
    test_df["housing_median_age"] < 30
).astype(int)

test_df.head()
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity <1H OCEAN INLAND NEAR BAY NEAR OCEAN median_age_less_than_30
0 -118.07 33.87 28.0 2399.0 436.0 1613.0 429.0 3.6339 220100.0 <1H OCEAN True False False False 1
1 -118.26 34.02 40.0 1259.0 362.0 1499.0 327.0 1.8382 126400.0 <1H OCEAN True False False False 0
2 -118.51 34.16 23.0 11154.0 1995.0 4076.0 1809.0 5.4609 500001.0 <1H OCEAN True False False False 1
3 -120.04 36.95 36.0 1528.0 347.0 1334.0 304.0 1.3594 48300.0 INLAND False True False False 0
4 -117.91 33.65 24.0 1494.0 494.0 814.0 459.0 2.1074 181300.0 <1H OCEAN True False False False 1
1
2
3
4
X_test_median_age = (
    test_df["median_age_less_than_30"].to_numpy().reshape(-1, 1)
)
X_test_median_age.shape
(3433, 1)
1
2
3
4
5
linear_median_age = LinearRegression().fit(X_train_median_age, y_train)
linear_median_age_test_predictions = linear_median_age.predict(
    X_test_median_age
)
mean_absolute_error(y_test, linear_median_age_test_predictions)
90794.78827572097

Clustering (Unsupervised Grouping)

Why is it important

  • Partition data into groups of similar data points without labels.
  • Discover latent structure not explicitly defined:
  • Customer segments
  • Fraud patterns
  • User behavior
  • Enable feature engineering via cluster-derived features:
  • Cluster ID
  • Distance to cluster centers
  • Support data compression and summarization:
  • Reduce large datasets to representative groups
  • Enable anomaly detection:
  • Identify points far from cluster structure (e.g., fraud, system anomalies)

When to Use

Apply clustering when:

  • Labels are unavailable
  • Data is expected to contain natural group structure
  • Distance/similarity metrics are meaningful
  • Use cases include:
  • Customer behavior analysis
  • Transaction pattern detection
  • System monitoring

Use clustering outputs as features when:

  • Enhancing downstream models:
  • Linear models
  • Tree models
  • Need to introduce non-linear structure into simpler models
  • (Clustering acts similarly to binning for linear models)

Combine with preprocessing when:

  • Applying standardization (ensure valid distance computation)
  • Applying PCA (reduce dimensionality before clustering)

Preprocess data

  • Apply standardization (scale features)
  • Optionally apply PCA (reduce dimensionality)

Key Constraints

  • Clustering assumes meaningful distance/similarity metrics
  • Performance degrades in:
  • High-dimensional spaces (curse of dimensionality)
  • Results depend on:
  • Algorithm choice (K-means, DBSCAN, hierarchical)
  • Data distribution and scaling
  • Often requires:
  • Standardization (mandatory for distance-based clustering)
  • PCA (optional, improves clustering in high dimensions)

1
2
3
plt.scatter(train_df["longitude"], train_df["latitude"])

plt.show()
output_63_0.png

1
2
3
X_train_lat_long = train_df[["longitude", "latitude"]].to_numpy()
kmeans = KMeans(n_clusters=7).fit(X_train_lat_long)
kmeans.labels_
array([0, 3, 5, ..., 2, 5, 5], shape=(17000,), dtype=int32)
1
2
3
px.scatter(
    x=train_df["longitude"], y=train_df["latitude"], color=kmeans.labels_
)

---------------------------------------------------------------------------ValueError Traceback (most recent call last)File ~/.cache/uv/archive-v0/Trc0P-FTOEtFceW2c_C0B/lib/python3.11/site-packages/IPython/core/formatters.py:984, in IPythonDisplayFormatter.call(self, obj) 982 method = get_real_method(obj, self.print_method) 983 if method is not None: --> 984 method() 985 return TrueFile ~/.cache/uv/builds-v0/.tmpAAwKXY/lib/python3.11/site-packages/plotly/basedatatypes.py:850, in BaseFigure.ipython_display(self) 847 import plotly.io as pio 849 if pio.renderers.render_on_display and pio.renderers.default: --> 850 pio.show(self) 851 else: 852 print(repr(self))File ~/.cache/uv/builds-v0/.tmpAAwKXY/lib/python3.11/site-packages/plotly/io/renderers.py:415, in show(fig, renderer, validate, **kwargs) 410 raise ValueError( 411 "Mime type rendering requires ipython but it is not installed" 412 ) 414 if not nbformat or Version(nbformat.__version_) < Version("4.2.0"): --> 415 raise ValueError( 416 "Mime type rendering requires nbformat>=4.2.0 but it is not installed" 417 ) 419 display_jupyter_version_warnings() 421 ipython_display.display(bundle, raw=True)ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

X_train_clustering = pd.get_dummies(pd.Series(kmeans.labels_)).to_numpy()
X_train_clustering
array([[ True, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False,  True, False],
       ...,
       [False, False,  True, ..., False, False, False],
       [False, False, False, ..., False,  True, False],
       [False, False, False, ..., False,  True, False]], shape=(17000, 7))
1
2
3
4
5
X_test_lat_long = test_df[["longitude", "latitude"]].to_numpy()
X_test_clustering = pd.get_dummies(
    pd.Series(kmeans.predict(X_test_lat_long))
).to_numpy()
X_test_clustering.shape
(3433, 7)
1
2
3
4
5
6
linear_clustering = LinearRegression().fit(X_train_clustering, y_train)
linear_clustering_test_predictions = linear_clustering.predict(
    X_test_clustering
)

mean_absolute_error(linear_clustering_test_predictions, y_test)
75498.33350007689

Feature Selection (Subset Selection / Dimensionality Reduction via Selection)

Why is Important

  • Reduce overfitting by eliminating features that enable memorization of noise.
  • Improve model performance by retaining only informative features.
  • Reduce computational cost (training/inference).
  • Improve interpretability by simplifying feature space.
  • Handle multicollinearity by selecting representative features and removing redundant ones.

When to use

Apply feature selection when:

  • Feature space is large (> 50–100 features).
  • Using models sensitive to:
  • Linear relationships:
    • Linear Regression
    • Logistic Regression
  • Distance/similarity metrics:
    • KNN
    • K-Means
  • Using models sensitive to:
  • Noise and dimensionality:
    • SVM
  • Interpretability is required.

Model-specific benefits:

  • Linear / Logistic Regression:
  • Improves stability and coefficient interpretability
  • KNN / K-Means:
  • Improves distance quality (removes irrelevant dimensions)
  • SVM:
  • Reduces noise and accelerates training

Methods

Filter Methods (model-agnostic):

  • Compute statistical metrics per feature:
  • Correlation
  • Variance Threshold
  • Mutual Information
  • Remove features based on thresholds

Wrapper Methods (model-dependent):

  • Iterate over feature subsets
  • Train model and evaluate performance
  • Example:
  • Recursive Feature Elimination (RFE)

Embedded Methods (model-integrated):

  • Perform selection during training:
  • L1 regularization (Lasso): drives coefficients to zero
  • Tree-based feature importance
  • Boruta algorithm
Method What It Does How It Works Strengths Limitations / Constraints Best Use Case
Correlation (Pearson) Removes redundant linear relationships Computes pairwise correlation, drops one of highly correlated features Simple, effective for multicollinearity Only captures linear relationships Linear models, multicollinearity control
Mutual Information Captures dependency with target Measures information gain between feature and target Detects non-linear relationships Slower, less interpretable than correlation General feature relevance (non-linear)
RFE (Recursive Feature Elimination) Selects optimal subset via iterative pruning Trains model, removes least important features iteratively High-quality subset selection Computationally expensive Medium feature sets, high accuracy requirement
L1 (Lasso) Enforces sparsity (zero coefficients) Adds L1 penalty → minimizes loss + λ∑|w| → coefficients shrink to zero Efficient, built-in selection Unstable with correlated features Linear models, high-dimensional data
Tree Feature Importance Ranks features by predictive contribution Aggregates impurity reduction (Gini/MSE) across splits Captures non-linearities, interactions Bias toward high-cardinality features Tree-based models (RF, XGBoost, LightGBM)
Permutation Importance Measures impact on model performance Shuffles feature → measures performance drop Model-agnostic, more reliable importance Computationally expensive Post-training validation of importance
Boruta Identifies all relevant features (all-relevant) Compares real vs shadow features using RF importance Robust, avoids missing weak signals Expensive, slower High-stakes feature selection

Key Constraints

  • Removing features may:
  • Discard useful signal if improperly configured
  • Wrapper methods:
  • Require cross-validation to control overfitting
  • Embedded methods:
  • Depend on model assumptions:
    • L1 => sparsity assumption
    • Trees => split-based importance bias
  • Feature selection must be:
  • Applied consistently in training and inference pipelines
X_train_clustering.shape, X_train_3_scaled.shape, X_train_dummies.shape
((17000, 7), (17000, 3), (17000, 4))
1
2
3
4
X_train_full = np.concatenate(
    [X_train_clustering, X_train_3_scaled, X_train_dummies], axis=1
)
X_train_full.shape
(17000, 14)
X_test_clustering.shape, X_test_3_scaled.shape, X_test_dummies.shape
((3433, 7), (3433, 3), (3433, 4))
1
2
3
4
X_test_full = np.concatenate(
    [X_test_clustering, X_test_3_scaled, X_test_dummies], axis=1
)
X_test_full.shape
(3433, 14)
1
2
3
4
# Train the final forest on the full 14-column feature matrix and score it.
random_forest_full = RandomForestRegressor().fit(X_train_full, y_train)
random_forest_full_test_predictions = random_forest_full.predict(X_test_full)

# BUG FIX: this previously evaluated `random_forest_clustering_test_predictions`,
# a name never defined in this notebook (NameError on a fresh run); score the
# predictions computed just above instead.
mean_absolute_error(y_test, random_forest_full_test_predictions)
67617.36133866961