"""California Housing Price Regression.

Fits k-nearest-neighbours regressors to the California housing dataset for a
range of neighbour counts and compares training and testing R^2 scores.
"""

# Notebook environment setup. This is an IPython shell escape, so it is kept
# as a comment here; run it in a notebook cell (or without the leading "!"
# in a shell) to install the pinned dependencies.
# !uv pip install -q \
#     matplotlib==3.10.6 \
#     seaborn==0.13.2 \
#     scikit-learn==1.7.1

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

housing = fetch_california_housing()
housing.data.shape
# Recorded output: (20640, 8)

housing.keys()
# Recorded output:
# dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])

# NOTE: no random_state is passed, so the split (and therefore every score
# recorded in the output comments below) comes from one particular run.
X_train, X_test, y_train, y_test = train_test_split(
    housing.data, housing.target
)
print(X_train.shape)
print(X_test.shape)
# Recorded output:
# (15480, 8)
# (5160, 8)


def get_r_squared(k=9):
    """Fit one unscaled KNN regressor per neighbour count from 1 to ``k``.

    Uses the module-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``
    arrays produced by the train/test split above.

    Args:
        k: Largest number of neighbours to try (inclusive).

    Returns:
        A tuple ``(train_score, test_score, models)`` of three lists, each
        with ``k`` entries; index ``i`` corresponds to ``i + 1`` neighbours.
        The scores are the values returned by ``model.score`` on the
        training and testing data respectively.
    """
    train_score = []
    test_score = []
    models = []

    for neighbors in range(1, k + 1):
        model = KNeighborsRegressor(n_neighbors=neighbors)
        model.fit(X_train, y_train)

        train_score.append(model.score(X_train, y_train))
        test_score.append(model.score(X_test, y_test))
        models.append(model)

    return train_score, test_score, models


train_score, test_score, models = get_r_squared()

train_score
# Recorded output:
# [1.0,
#  0.6980255655672355,
#  0.5691893851080985,
#  0.495214071212306,
#  0.4450438808723586,
#  0.4027039579951145,
#  0.370615696342977,
#  0.3464775541368713,
#  0.3256464737781445]

test_score
# Recorded output:
# [-0.19442814226045568,
#  0.05248837739648182,
#  0.12091507306226734,
#  0.15004555800911934,
#  0.14974047514813815,
#  0.16054533889822697,
#  0.16308632333280093,
#  0.16460446008323348,
#  0.16440605944584585]

# Examining performance for k=6 (index 5, because the lists start at k=1)
print(f"Training r2 for 6 neighbors: {train_score[5]:.2f}")
print(f"Testing r2 for 6 neighbors: {test_score[5]:.2f}")
# Recorded output:
# Training r2 for 6 neighbors: 0.40
# Testing r2 for 6 neighbors: 0.16

# Derive the x-axis from the number of scores actually computed instead of
# hard-coding range(1, 10); otherwise calling get_r_squared with any k other
# than 9 makes x and y different lengths and the plot call fails.
k = range(1, len(train_score) + 1)
plt.plot(k, train_score, label="Training")
plt.plot(k, test_score, label="Testing")
plt.xlabel("neighbours")
plt.ylabel("$r^2$")
plt.legend()
plt.show()

def get_r_squared_scaled(k=9):
    """Fit standardised KNN pipelines for every neighbour count from 1 to ``k``.

    Each model is a ``StandardScaler`` followed by a ``KNeighborsRegressor``,
    fitted on the module-level ``X_train`` / ``y_train`` and scored on both
    the training data and the module-level ``X_test`` / ``y_test``.

    Args:
        k: Largest number of neighbours to try (inclusive).

    Returns:
        A tuple ``(train_score, test_score, models)`` of three lists, each
        with ``k`` entries; index ``i`` corresponds to ``i + 1`` neighbours.
    """
    # ``fit`` returns the fitted pipeline itself, so the models can be built
    # and trained in a single expression.
    fitted_pipelines = [
        make_pipeline(
            StandardScaler(), KNeighborsRegressor(n_neighbors=n_neighbors)
        ).fit(X_train, y_train)
        for n_neighbors in range(1, k + 1)
    ]

    train_r2 = [pipe.score(X_train, y_train) for pipe in fitted_pipelines]
    test_r2 = [pipe.score(X_test, y_test) for pipe in fitted_pipelines]

    return train_r2, test_r2, fitted_pipelines


train_score, test_score, models = get_r_squared_scaled()

# Examining performance for k=6 (index 5, because the lists start at k=1)
train_r2_at_6 = train_score[5]
test_r2_at_6 = test_score[5]
print(f"Training r2 for 6 neighbors: {train_r2_at_6:.2f}")
print(f"Testing r2 for 6 neighbors: {test_r2_at_6:.2f}")
# Recorded output:
# Training r2 for 6 neighbors: 0.78
# Testing r2 for 6 neighbors: 0.69