California Housing Price Regression


1
2
3
4
!uv pip install -q \
    matplotlib==3.10.6 \
    seaborn==0.13.2 \
    scikit-learn==1.7.1
1
2
3
4
5
6
7
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
1
2
3
# Load the California housing dataset via scikit-learn's fetch helper.
housing = fetch_california_housing()

# Bare expression: the notebook displays the (rows, columns) shape of the
# feature matrix.
housing.data.shape
(20640, 8)
# List the fields exposed by the loaded dataset object.
housing.keys()
dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])
1
2
3
4
5
6
# Hold out a test set using train_test_split's default split proportions.
# A fixed random_state pins the shuffle, so the split -- and every score
# computed from it further down -- is repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    housing.data, housing.target, random_state=42
)

# Sanity-check the sizes of the two feature matrices.
print(X_train.shape)
print(X_test.shape)
(15480, 8)

(5160, 8)
def get_r_squared(k=9):
    """Fit unscaled KNN regressors for 1..k neighbours and score each one.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` splits rather than taking them as arguments.

    Args:
        k: Largest neighbour count to try; one model is fitted for every
            value from 1 through ``k`` inclusive.

    Returns:
        A ``(train_score, test_score, models)`` tuple of three lists, each
        ordered by neighbour count (index 0 corresponds to 1 neighbour).
        The scores are whatever ``model.score`` reports on each split.
    """
    # ``fit`` returns the estimator itself, so fitting can be chained here.
    models = [
        KNeighborsRegressor(n_neighbors=n).fit(X_train, y_train)
        for n in range(1, k + 1)
    ]
    train_score = [fitted.score(X_train, y_train) for fitted in models]
    test_score = [fitted.score(X_test, y_test) for fitted in models]
    return train_score, test_score, models
# Run the unscaled sweep with the default k=9 (neighbour counts 1 through 9).
train_score, test_score, models = get_r_squared()
# Bare expression: the notebook displays the per-k training scores.
train_score
[1.0,
 0.6980255655672355,
 0.5691893851080985,
 0.495214071212306,
 0.4450438808723586,
 0.4027039579951145,
 0.370615696342977,
 0.3464775541368713,
 0.3256464737781445]
# Bare expression: the notebook displays the per-k testing scores.
test_score
[-0.19442814226045568,
 0.05248837739648182,
 0.12091507306226734,
 0.15004555800911934,
 0.14974047514813815,
 0.16054533889822697,
 0.16308632333280093,
 0.16460446008323348,
 0.16440605944584585]
1
2
3
# Examining performance for k=6.
# The score lists start at 1 neighbour, so index 5 holds the 6-neighbour model.
print(f"Training r2 for 6 neighbors: {train_score[5]:.2f}")
print(f"Testing r2 for 6 neighbors: {test_score[5]:.2f}")
Training r2 for 6 neighbors: 0.40

Testing r2 for 6 neighbors: 0.16

1
2
3
4
5
6
7
# Plot training and testing scores against the number of neighbours.
# The x-axis is derived from how many scores were recorded (one per
# neighbour count, starting at 1) instead of a hard-coded range, so the plot
# still lines up if the sweep is re-run with a different k.
k = range(1, len(train_score) + 1)
plt.plot(k, train_score, label="Training")
plt.plot(k, test_score, label="Testing")
plt.xlabel("neighbours")
plt.ylabel("$r^2$")
plt.legend()
plt.show()
output_10_0.png

def get_r_squared_scaled(k=9):
    """Fit scaled KNN pipelines for 1..k neighbours and score each one.

    Each model is a pipeline of ``StandardScaler`` followed by
    ``KNeighborsRegressor``. The module-level ``X_train``, ``y_train``,
    ``X_test`` and ``y_test`` splits are read directly.

    Args:
        k: Largest neighbour count to try; one pipeline is fitted for every
            value from 1 through ``k`` inclusive.

    Returns:
        A ``(train_score, test_score, models)`` tuple of three lists, each
        ordered by neighbour count (index 0 corresponds to 1 neighbour).
    """
    models = []
    for n_neighbors in range(1, k + 1):
        knn = KNeighborsRegressor(n_neighbors=n_neighbors)
        pipeline = make_pipeline(StandardScaler(), knn)
        # ``fit`` returns the pipeline itself, so it can be stored directly.
        models.append(pipeline.fit(X_train, y_train))

    train_score = [pipeline.score(X_train, y_train) for pipeline in models]
    test_score = [pipeline.score(X_test, y_test) for pipeline in models]
    return train_score, test_score, models
# Re-run the sweep with standardized features. This rebinds train_score,
# test_score and models, replacing the results of the unscaled sweep.
train_score, test_score, models = get_r_squared_scaled()
1
2
3
# Examining performance for k=6.
# Index 5 is the 6-neighbour entry because the score lists start at k=1; the
# lists read here are whichever sweep was assigned most recently.
print(f"Training r2 for 6 neighbors: {train_score[5]:.2f}")
print(f"Testing r2 for 6 neighbors: {test_score[5]:.2f}")
Training r2 for 6 neighbors: 0.78

Testing r2 for 6 neighbors: 0.69