Churn Prediction
\[g(x_{i}) \approx y_{i}\]
\[y_{i} \in \{0, 1\}\]
- \(1\): Churn
- \(0\): No Churn
Dataset:
telco-customer-churn
Install packages
| !uv pip install -q \
python-dotenv==1.2.1 \
pandas==2.3.2 \
pandas-stubs==2.3.2.250827 \
numpy==2.3.2 \
matplotlib==3.10.6 \
seaborn==0.13.2 \
scikit-learn==1.7.1 \
tqdm==4.67.1
|
Append notebooks directory to sys.path
| import sys
sys.path.append("../../..")
|
Import packages
| import os
import pathlib
import random
from IPython.display import display
import matplotlib.pyplot as plt
import pandas as pd
from typing import Tuple, Union
import numpy as np
from numpy.typing import NDArray
import seaborn as sns
import datetime
from collections import Counter
import pickle
from dotenv import load_dotenv
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import (
mutual_info_score, accuracy_score, roc_curve, auc, roc_auc_score
)
from tqdm import tqdm
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from notebooks.python.utils.data_extraction.data_extraction import (
KaggleDataExtractor,
KaggleExtractionConfig,
)
pd.set_option("display.max_columns", None)
sns.set_style("darkgrid")
sns.set_theme(style="darkgrid")
%matplotlib inline
load_dotenv() # Root directory .env file
|
Utility scripts:
KaggleDataExtractor:
| import base64
import io
import logging
import os
import zipfile
from abc import ABC, abstractmethod
from dataclasses import dataclass
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s | %(levelname)-8s | %(name)s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger("KaggleExtractor")
class ExtractionConfig(ABC):
pass
class DataExtractionStrategy(ABC):
@abstractmethod
def download_dataset(self, config: ExtractionConfig) -> None:
pass
@dataclass(frozen=True)
class KaggleExtractionConfig(ExtractionConfig):
dataset_slug: str # e.g. "zynicide/wine-reviews"
file_name: str # file inside the Kaggle zip
destination_path: str # folder to extract to
output_file_name: str | None = None # optional rename
class KaggleDataExtractor(DataExtractionStrategy):
def __init__(self, username: str, api_token: str) -> None:
self.username = username
self.api_token = api_token
self.auth_header = self._create_auth_header()
def _create_auth_header(self):
token = f"{self.username}:{self.api_token}"
base64_token = base64.b64encode(token.encode()).decode()
return {"Authorization": f"Basic {base64_token}"}
def download_dataset(self, config: ExtractionConfig) -> None:
if not isinstance(config, KaggleExtractionConfig):
raise TypeError("config must be a KaggleExtractionConfig instance")
url = f"https://www.kaggle.com/api/v1/datasets/download/{config.dataset_slug}"
request = Request(url, headers=self.auth_header)
logger.info(f"Starting download from Kaggle: {url}")
try:
with urlopen(request) as response:
data = response.read()
logger.info("Download completed. Extracting zip file...")
os.makedirs(config.destination_path, exist_ok=True)
with zipfile.ZipFile(io.BytesIO(data)) as z:
extracted_path = z.extract(
config.file_name, path=config.destination_path
)
if config.output_file_name is None:
logger.info(
f"Dataset '{config.file_name}' extracted successfully "
f"to: {config.destination_path}"
)
return
old_path = os.path.join(config.destination_path, config.file_name)
new_path = os.path.join(
config.destination_path, config.output_file_name
)
os.rename(old_path, new_path)
logger.info(
f"Dataset '{config.file_name}' extracted successfully "
f"to: {config.destination_path}"
)
except HTTPError as e:
logger.error(f"HTTP Error {e.code}: {e.reason}")
except URLError as e:
logger.error(f"URL Error: {e.reason}")
except zipfile.BadZipFile:
logger.error(
"Failed to read zip file. Kaggle may have returned HTML instead of a zip."
)
except Exception as e:
logger.exception(f"Unexpected error occurred: {e}")
|
Create data directory
| DATA_DIR = pathlib.Path("data/predicting-customer-churn")
os.makedirs(DATA_DIR, exist_ok=True)
|
Download dataset from Kaggle
| username = os.getenv("KAGGLE_USERNAME")
api_token = os.getenv("KAGGLE_API_TOKEN")
file_name = "WA_Fn-UseC_-Telco-Customer-Churn.csv"
extractor = KaggleDataExtractor(username=username, api_token=api_token)
config = KaggleExtractionConfig(
dataset_slug="blastchar/telco-customer-churn",
file_name=file_name,
destination_path=DATA_DIR,
output_file_name="churn.csv",
)
if not os.path.isfile(DATA_DIR / "churn.csv"):
extractor.download_dataset(config)
|
2026-01-04 11:17:27 | INFO | KaggleExtractor | Starting download from Kaggle: https://www.kaggle.com/api/v1/datasets/download/blastchar/telco-customer-churn
2026-01-04 11:17:28 | INFO | KaggleExtractor | Download completed. Extracting zip file...
2026-01-04 11:17:28 | INFO | KaggleExtractor | Dataset 'WA_Fn-UseC_-Telco-Customer-Churn.csv' extracted successfully to: data/predicting-customer-churn
Data Preparation
Load dataset
| df = pd.read_csv(DATA_DIR / "churn.csv")
df.head(n=2)
|
|
customerID |
gender |
SeniorCitizen |
Partner |
Dependents |
tenure |
PhoneService |
MultipleLines |
InternetService |
OnlineSecurity |
OnlineBackup |
DeviceProtection |
TechSupport |
StreamingTV |
StreamingMovies |
Contract |
PaperlessBilling |
PaymentMethod |
MonthlyCharges |
TotalCharges |
Churn |
| 0 |
7590-VHVEG |
Female |
0 |
Yes |
No |
1 |
No |
No phone service |
DSL |
No |
Yes |
No |
No |
No |
No |
Month-to-month |
Yes |
Electronic check |
29.85 |
29.85 |
No |
| 1 |
5575-GNVDE |
Male |
0 |
No |
No |
34 |
Yes |
No |
DSL |
Yes |
No |
Yes |
No |
No |
No |
One year |
No |
Mailed check |
56.95 |
1889.5 |
No |
Inspect all columns at once
|
0 |
1 |
2 |
| customerID |
7590-VHVEG |
5575-GNVDE |
3668-QPYBK |
| gender |
Female |
Male |
Male |
| SeniorCitizen |
0 |
0 |
0 |
| Partner |
Yes |
No |
No |
| Dependents |
No |
No |
No |
| tenure |
1 |
34 |
2 |
| PhoneService |
No |
Yes |
Yes |
| MultipleLines |
No phone service |
No |
No |
| InternetService |
DSL |
DSL |
DSL |
| OnlineSecurity |
No |
Yes |
Yes |
| OnlineBackup |
Yes |
No |
Yes |
| DeviceProtection |
No |
Yes |
No |
| TechSupport |
No |
No |
No |
| StreamingTV |
No |
No |
No |
| StreamingMovies |
No |
No |
No |
| Contract |
Month-to-month |
One year |
Month-to-month |
| PaperlessBilling |
Yes |
No |
Yes |
| PaymentMethod |
Electronic check |
Mailed check |
Mailed check |
| MonthlyCharges |
29.85 |
56.95 |
53.85 |
| TotalCharges |
29.85 |
1889.5 |
108.15 |
| Churn |
No |
No |
Yes |
Data summary
| df_summary = pd.DataFrame(
{
"column": df.columns,
"dtype": [df[col].dtype for col in df.columns],
"sample_unique": [df[col].unique()[:6] for col in df.columns],
"n_unique": [df[col].nunique() for col in df.columns],
}
)
df_summary
|
|
column |
dtype |
sample_unique |
n_unique |
| 0 |
customerID |
object |
[7590-VHVEG, 5575-GNVDE, 3668-QPYBK, 7795-CFOC... |
7043 |
| 1 |
gender |
object |
[Female, Male] |
2 |
| 2 |
SeniorCitizen |
int64 |
[0, 1] |
2 |
| 3 |
Partner |
object |
[Yes, No] |
2 |
| 4 |
Dependents |
object |
[No, Yes] |
2 |
| 5 |
tenure |
int64 |
[1, 34, 2, 45, 8, 22] |
73 |
| 6 |
PhoneService |
object |
[No, Yes] |
2 |
| 7 |
MultipleLines |
object |
[No phone service, No, Yes] |
3 |
| 8 |
InternetService |
object |
[DSL, Fiber optic, No] |
3 |
| 9 |
OnlineSecurity |
object |
[No, Yes, No internet service] |
3 |
| 10 |
OnlineBackup |
object |
[Yes, No, No internet service] |
3 |
| 11 |
DeviceProtection |
object |
[No, Yes, No internet service] |
3 |
| 12 |
TechSupport |
object |
[No, Yes, No internet service] |
3 |
| 13 |
StreamingTV |
object |
[No, Yes, No internet service] |
3 |
| 14 |
StreamingMovies |
object |
[No, Yes, No internet service] |
3 |
| 15 |
Contract |
object |
[Month-to-month, One year, Two year] |
3 |
| 16 |
PaperlessBilling |
object |
[Yes, No] |
2 |
| 17 |
PaymentMethod |
object |
[Electronic check, Mailed check, Bank transfer... |
4 |
| 18 |
MonthlyCharges |
float64 |
[29.85, 56.95, 53.85, 42.3, 70.7, 99.65] |
1585 |
| 19 |
TotalCharges |
object |
[29.85, 1889.5, 108.15, 1840.75, 151.65, 820.5] |
6531 |
| 20 |
Churn |
object |
[No, Yes] |
2 |
Clean column names
| df.columns = df.columns.str.lower().str.replace(" ", "_")
df.head(n=2)
|
|
customerid |
gender |
seniorcitizen |
partner |
dependents |
tenure |
phoneservice |
multiplelines |
internetservice |
onlinesecurity |
onlinebackup |
deviceprotection |
techsupport |
streamingtv |
streamingmovies |
contract |
paperlessbilling |
paymentmethod |
monthlycharges |
totalcharges |
churn |
| 0 |
7590-VHVEG |
Female |
0 |
Yes |
No |
1 |
No |
No phone service |
DSL |
No |
Yes |
No |
No |
No |
No |
Month-to-month |
Yes |
Electronic check |
29.85 |
29.85 |
No |
| 1 |
5575-GNVDE |
Male |
0 |
No |
No |
34 |
Yes |
No |
DSL |
Yes |
No |
Yes |
No |
No |
No |
One year |
No |
Mailed check |
56.95 |
1889.5 |
No |
Select only object type columns
| object_type_columns = list(df.dtypes[df.dtypes == "object"].index)
object_type_columns
|
['customerid',
'gender',
'partner',
'dependents',
'phoneservice',
'multiplelines',
'internetservice',
'onlinesecurity',
'onlinebackup',
'deviceprotection',
'techsupport',
'streamingtv',
'streamingmovies',
'contract',
'paperlessbilling',
'paymentmethod',
'totalcharges',
'churn']
Clean columns
| object_type_columns = list(df.dtypes[df.dtypes == "object"].index)
for column in object_type_columns:
df[column] = df[column].str.lower().str.replace(" ", "_")
|
Inspect values of total charges, it should numeric
0 29.85
1 1889.5
2 108.15
3 1840.75
4 151.65
Name: totalcharges, dtype: object
Cast total charges to numeric type
| total_charges = pd.to_numeric(df.totalcharges, errors="coerce")
total_charges[:5]
|
0 29.85
1 1889.50
2 108.15
3 1840.75
4 151.65
Name: totalcharges, dtype: float64
Check for null values
| total_charges.loc[total_charges.isnull()][:5]
|
488 NaN
753 NaN
936 NaN
1082 NaN
1340 NaN
Name: totalcharges, dtype: float64
Treat the null values
| df.totalcharges = total_charges.fillna(0)
|
Check churn field values
0 no
1 no
2 yes
3 no
4 yes
Name: churn, dtype: object
Encode churn field to binary
| (df.churn == "yes").astype(int)[:5]
|
0 0
1 0
2 1
3 0
4 1
Name: churn, dtype: int64
Set original churn dataset column to binary
| df.churn = (df.churn == "yes").astype(int)
|
Validation Framework
Set split sizes
- Training dataset: 60%
- Validation dataset: 20%
- Test dataset: 20%
Split dataset into full train (train + validation) and test
| df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
|
Get dataset's sizes
| len(df_full_train), len(df_test)
|
Calculate how train and validation dataset's sizes should be
| print(
f"df_full_train size: {(100 - 20)/100.:.0%}\n"
f"df_test size: {(20)/100.:.0%}\n"
f"df_train size: 60% of 80% = {(60)/80.:.0%}\n"
f"df_validation size: 20% of 80% = {(20)/80.:.0%}\n"
)
|
df_full_train size: 80%
df_test size: 20%
df_train size: 60% of 80% = 75%
df_validation size: 20% of 80% = 25%
Split full train dataset into train and validation datasets
| df_train, df_validation = train_test_split(
df_full_train, test_size=0.25, random_state=1
)
|
Get full length of dataset
| len(df_train), len(df_validation), len(df_test)
|
Reset dataset's indexes
| df_train.reset_index(drop=True, inplace=True)
df_validation.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)
|
Get target variables
| y_train = df_train["churn"]
y_validation = df_validation["churn"]
y_test = df_test["churn"]
|
Remove target variables from original datasets
| df_train.drop(columns=["churn"], inplace=True)
df_validation.drop(columns=["churn"], inplace=True)
df_test.drop(columns=["churn"], inplace=True)
|
Exploratory Data Analysis
Reset full train dataset index
| df_full_train.reset_index(drop=True, inplace=True)
|
Inspect dataset
|
customerid |
gender |
seniorcitizen |
partner |
dependents |
tenure |
phoneservice |
multiplelines |
internetservice |
onlinesecurity |
onlinebackup |
deviceprotection |
techsupport |
streamingtv |
streamingmovies |
contract |
paperlessbilling |
paymentmethod |
monthlycharges |
totalcharges |
churn |
| 0 |
5442-pptjy |
male |
0 |
yes |
yes |
12 |
yes |
no |
no |
no_internet_service |
no_internet_service |
no_internet_service |
no_internet_service |
no_internet_service |
no_internet_service |
two_year |
no |
mailed_check |
19.7 |
258.35 |
0 |
| 1 |
6261-rcvns |
female |
0 |
no |
no |
42 |
yes |
no |
dsl |
yes |
yes |
yes |
yes |
no |
yes |
one_year |
no |
credit_card_(automatic) |
73.9 |
3160.55 |
1 |
Check if null values are present
customerid 0
gender 0
seniorcitizen 0
partner 0
dependents 0
tenure 0
phoneservice 0
multiplelines 0
internetservice 0
onlinesecurity 0
onlinebackup 0
deviceprotection 0
techsupport 0
streamingtv 0
streamingmovies 0
contract 0
paperlessbilling 0
paymentmethod 0
monthlycharges 0
totalcharges 0
churn 0
dtype: int64
Inspect target variable churn
| df_full_train.churn.value_counts()
|
churn
0 4113
1 1521
Name: count, dtype: int64
Get percent of total
| df_full_train.churn.value_counts(normalize=True)
|
churn
0 0.730032
1 0.269968
Name: proportion, dtype: float64
Get mean
| df_full_train.churn.mean() # number of ones divided by total
|
np.float64(0.26996805111821087)
Mean and percent of total for churn is the same because is encoded to binary. So both calculations are number of ones divided by total
| global_churn_rate = df_full_train.churn.mean()
round(global_churn_rate, 2)
|
Inspect columns types
customerid object
gender object
seniorcitizen int64
partner object
dependents object
tenure int64
phoneservice object
multiplelines object
internetservice object
onlinesecurity object
onlinebackup object
deviceprotection object
techsupport object
streamingtv object
streamingmovies object
contract object
paperlessbilling object
paymentmethod object
monthlycharges float64
totalcharges float64
churn int64
dtype: object
Set numerical columns
| numerical_columns = ["tenure", "monthlycharges", "totalcharges"]
|
Set categorical columns
| categorical_columns = [
"gender",
"seniorcitizen",
"partner",
"dependents",
"phoneservice",
"multiplelines",
"internetservice",
"onlinesecurity",
"onlinebackup",
"deviceprotection",
"techsupport",
"streamingtv",
"streamingmovies",
"contract",
"paperlessbilling",
"paymentmethod",
]
|
Inspect categorical columns
| df_full_train[categorical_columns].nunique()
|
gender 2
seniorcitizen 2
partner 2
dependents 2
phoneservice 2
multiplelines 3
internetservice 3
onlinesecurity 3
onlinebackup 3
deviceprotection 3
techsupport 3
streamingtv 3
streamingmovies 3
contract 3
paperlessbilling 2
paymentmethod 4
dtype: int64
Feature Importance
Churn Rate
Difference:
- (global_churn_rate - group_churn_rate) > 0: Less likely to churn
- (global_churn_rate - group_churn_rate) < 0: More likely to churn
Risk Ratio:
- (group_churn_rate / global_churn_rate) > 1: More likely to churn
- (group_churn_rate / global_churn_rate) < 1: Less likely to churn
| df_groups = []
for column in categorical_columns:
global_churn_rate = df_full_train.churn.mean()
df_group = (
df_full_train[[column, "churn"]]
.groupby(column)
.churn.agg(["mean", "count"])
)
df_group["diff"] = df_group["mean"] - global_churn_rate
df_group["risk"] = df_group["mean"] / global_churn_rate
df_group = df_group.reset_index().rename(columns={column: "label"})
df_group.insert(0, "column", column)
df_groups.append(df_group)
result = pd.concat(df_groups, ignore_index=True)
result
|
|
column |
label |
mean |
count |
diff |
risk |
| 0 |
gender |
female |
0.276824 |
2796 |
0.006856 |
1.025396 |
| 1 |
gender |
male |
0.263214 |
2838 |
-0.006755 |
0.974980 |
| 2 |
seniorcitizen |
0 |
0.242270 |
4722 |
-0.027698 |
0.897403 |
| 3 |
seniorcitizen |
1 |
0.413377 |
912 |
0.143409 |
1.531208 |
| 4 |
partner |
no |
0.329809 |
2932 |
0.059841 |
1.221659 |
| 5 |
partner |
yes |
0.205033 |
2702 |
-0.064935 |
0.759472 |
| 6 |
dependents |
no |
0.313760 |
3968 |
0.043792 |
1.162212 |
| 7 |
dependents |
yes |
0.165666 |
1666 |
-0.104302 |
0.613651 |
| 8 |
phoneservice |
no |
0.241316 |
547 |
-0.028652 |
0.893870 |
| 9 |
phoneservice |
yes |
0.273049 |
5087 |
0.003081 |
1.011412 |
| 10 |
multiplelines |
no |
0.257407 |
2700 |
-0.012561 |
0.953474 |
| 11 |
multiplelines |
no_phone_service |
0.241316 |
547 |
-0.028652 |
0.893870 |
| 12 |
multiplelines |
yes |
0.290742 |
2387 |
0.020773 |
1.076948 |
| 13 |
internetservice |
dsl |
0.192347 |
1934 |
-0.077621 |
0.712482 |
| 14 |
internetservice |
fiber_optic |
0.425171 |
2479 |
0.155203 |
1.574895 |
| 15 |
internetservice |
no |
0.077805 |
1221 |
-0.192163 |
0.288201 |
| 16 |
onlinesecurity |
no |
0.420921 |
2801 |
0.150953 |
1.559152 |
| 17 |
onlinesecurity |
no_internet_service |
0.077805 |
1221 |
-0.192163 |
0.288201 |
| 18 |
onlinesecurity |
yes |
0.153226 |
1612 |
-0.116742 |
0.567570 |
| 19 |
onlinebackup |
no |
0.404323 |
2498 |
0.134355 |
1.497672 |
| 20 |
onlinebackup |
no_internet_service |
0.077805 |
1221 |
-0.192163 |
0.288201 |
| 21 |
onlinebackup |
yes |
0.217232 |
1915 |
-0.052736 |
0.804660 |
| 22 |
deviceprotection |
no |
0.395875 |
2473 |
0.125907 |
1.466379 |
| 23 |
deviceprotection |
no_internet_service |
0.077805 |
1221 |
-0.192163 |
0.288201 |
| 24 |
deviceprotection |
yes |
0.230412 |
1940 |
-0.039556 |
0.853480 |
| 25 |
techsupport |
no |
0.418914 |
2781 |
0.148946 |
1.551717 |
| 26 |
techsupport |
no_internet_service |
0.077805 |
1221 |
-0.192163 |
0.288201 |
| 27 |
techsupport |
yes |
0.159926 |
1632 |
-0.110042 |
0.592390 |
| 28 |
streamingtv |
no |
0.342832 |
2246 |
0.072864 |
1.269897 |
| 29 |
streamingtv |
no_internet_service |
0.077805 |
1221 |
-0.192163 |
0.288201 |
| 30 |
streamingtv |
yes |
0.302723 |
2167 |
0.032755 |
1.121328 |
| 31 |
streamingmovies |
no |
0.338906 |
2213 |
0.068938 |
1.255358 |
| 32 |
streamingmovies |
no_internet_service |
0.077805 |
1221 |
-0.192163 |
0.288201 |
| 33 |
streamingmovies |
yes |
0.307273 |
2200 |
0.037305 |
1.138182 |
| 34 |
contract |
month-to-month |
0.431701 |
3104 |
0.161733 |
1.599082 |
| 35 |
contract |
one_year |
0.120573 |
1186 |
-0.149395 |
0.446621 |
| 36 |
contract |
two_year |
0.028274 |
1344 |
-0.241694 |
0.104730 |
| 37 |
paperlessbilling |
no |
0.172071 |
2313 |
-0.097897 |
0.637375 |
| 38 |
paperlessbilling |
yes |
0.338151 |
3321 |
0.068183 |
1.252560 |
| 39 |
paymentmethod |
bank_transfer_(automatic) |
0.168171 |
1219 |
-0.101797 |
0.622928 |
| 40 |
paymentmethod |
credit_card_(automatic) |
0.164339 |
1217 |
-0.105630 |
0.608733 |
| 41 |
paymentmethod |
electronic_check |
0.455890 |
1893 |
0.185922 |
1.688682 |
| 42 |
paymentmethod |
mailed_check |
0.193870 |
1305 |
-0.076098 |
0.718121 |
Measure importance for categorical features
How much We can learn about one variable if we know the value of another
| df_full_train[categorical_columns].apply(
lambda col: mutual_info_score(col, df_full_train["churn"])
).sort_values(ascending=False)
|
contract 0.098320
onlinesecurity 0.063085
techsupport 0.061032
internetservice 0.055868
onlinebackup 0.046923
deviceprotection 0.043453
paymentmethod 0.043210
streamingtv 0.031853
streamingmovies 0.031581
paperlessbilling 0.017589
dependents 0.012346
partner 0.009968
seniorcitizen 0.009410
multiplelines 0.000857
phoneservice 0.000229
gender 0.000117
dtype: float64
Correlation
Measure importance for numerical features
- Positive correlation: both variables increase together
- Negative correlation: When a variable increase other decrease
| df_full_train[numerical_columns].corrwith(df_full_train.churn)
|
tenure -0.351885
monthlycharges 0.196805
totalcharges -0.196353
dtype: float64
One-hot encoding
Encode categorical features
| train_dicts = df_train[categorical_columns + numerical_columns].to_dict(
orient="records"
)
dict_vectorizer = DictVectorizer(sparse=False)
X_train = dict_vectorizer.fit_transform(train_dicts)
X_train.shape
|
| validation_dicts = df_validation[
categorical_columns + numerical_columns
].to_dict(orient="records")
X_validation = dict_vectorizer.transform(validation_dicts)
X_validation.shape
|
Logistic Regression
\[g(x_{i}) = sigmoid(w_{j} + W^{t}x_{i})\]
- Linear regression return a number between -\(\infty\) and +\(\infty\)
- Logistic regression return a number between 0 and 1, due the usage of sigmoid function
Regression
- Linear: gives a score
- Logistic: transform the score into a probability
Classification
Sigmoid function
| def sigmoid(z: NDArray[np.float64]) -> NDArray[np.float64]:
return 1 / (1 + np.exp(-z))
z = np.linspace(-7, 7, 51)
plt.plot(z, sigmoid(z))
plt.show()
|
Linear regression formula
| def linear_regression(xi: NDArray[np.float64]) -> float:
score = weight0
for index in range(len(weights)):
# linear operator (dot product)
score = score + xi[index] * weights[index]
return score
|
Logistic regression formula
| def logistic_regression(xi: NDArray[np.float64]) -> float:
score = weight0
for index in range(len(weights)):
# linear operator (dot product)
score = score + xi[index] * weights[index]
result = sigmoid(score)
return score
|
Training a logistic regression model
| model = LogisticRegression(max_iter=9999)
model.fit(X_train, y_train)
print(
model.intercept_[0], # bias term
model.coef_[0].round(3), # w (weights)
)
|
-0.045323382519725265 [ 0.685 0.039 -0.682 0.056 -0.015 0.114 -0.16 0.087 0.039 0.002
-0.497 0.698 -0.16 -0.018 -0.187 0.066 0.162 0.117 -0.16 0.084
0.285 -0.16 -0.084 -0.161 0.202 -0.045 0.086 -0.052 -0.003 0.106
-0.011 0.066 -0.025 0.194 -0.094 -0.16 0.295 -0.054 -0.16 0.255
0.235 -0.16 -0.034 -0.069 0. ]
Probability of a customer churn without we knowing anything about it
| sigmoid(model.intercept_[0]) # w0 (bias therm)
|
np.float64(0.4886710936321301)
| # model.predict(X_train) # Label
y_validation_pred = model.predict_proba(X_validation)[:, 1] # Probability
y_validation_pred
|
array([0.0066238 , 0.20482253, 0.21781799, ..., 0.15149644, 0.78847856,
0.81190391], shape=(1409,))
| churn_decision = y_validation_pred >= 0.5
|
Customers that may churn
| df_validation[churn_decision].head()
|
|
customerid |
gender |
seniorcitizen |
partner |
dependents |
tenure |
phoneservice |
multiplelines |
internetservice |
onlinesecurity |
onlinebackup |
deviceprotection |
techsupport |
streamingtv |
streamingmovies |
contract |
paperlessbilling |
paymentmethod |
monthlycharges |
totalcharges |
| 3 |
8433-wxgna |
male |
0 |
no |
no |
2 |
yes |
no |
fiber_optic |
yes |
no |
no |
no |
no |
no |
month-to-month |
yes |
electronic_check |
75.70 |
189.20 |
| 8 |
3440-jpscl |
female |
0 |
no |
no |
6 |
yes |
no |
fiber_optic |
no |
no |
yes |
yes |
yes |
yes |
month-to-month |
yes |
mailed_check |
99.95 |
547.65 |
| 12 |
7228-omtpn |
male |
0 |
no |
no |
4 |
yes |
no |
fiber_optic |
no |
no |
no |
no |
yes |
yes |
month-to-month |
yes |
electronic_check |
88.45 |
370.65 |
| 19 |
6711-fldfb |
female |
0 |
no |
no |
7 |
yes |
yes |
fiber_optic |
no |
no |
no |
no |
no |
no |
month-to-month |
yes |
electronic_check |
74.90 |
541.15 |
| 24 |
2612-ranwt |
female |
0 |
no |
no |
12 |
yes |
yes |
fiber_optic |
no |
no |
yes |
no |
yes |
yes |
month-to-month |
yes |
bank_transfer_(automatic) |
100.15 |
1164.30 |
| (y_validation == churn_decision).mean()
|
np.float64(0.8034066713981547)
Doing verification step by step
| df_pred = pd.DataFrame()
df_pred["probability"] = y_validation_pred
df_pred["prediction"] = churn_decision.astype(int)
df_pred["actual"] = y_validation
df_pred["correct"] = df_pred.prediction == df_pred.actual
df_pred.head()
|
|
probability |
prediction |
actual |
correct |
| 0 |
0.006624 |
0 |
0 |
True |
| 1 |
0.204823 |
0 |
0 |
True |
| 2 |
0.217818 |
0 |
0 |
True |
| 3 |
0.563750 |
1 |
1 |
True |
| 4 |
0.218675 |
0 |
0 |
True |
np.float64(0.8034066713981547)
Model interpretation
- Using a smaller model (trained with less features) and multiplying the values for their weights to reach the predicted value is an easier approach
Understand weights for each feature
| pd.DataFrame(
list(
zip(dict_vectorizer.get_feature_names_out(), model.coef_[0].round(3))
),
columns=["feature", "coefficient"],
).sort_values(by=["coefficient"], ascending=False).head(10)
|
|
feature |
coefficient |
| 11 |
internetservice=fiber_optic |
0.698 |
| 0 |
contract=month-to-month |
0.685 |
| 36 |
streamingmovies=yes |
0.295 |
| 20 |
onlinesecurity=no |
0.285 |
| 39 |
streamingtv=yes |
0.255 |
| 40 |
techsupport=no |
0.235 |
| 24 |
paperlessbilling=yes |
0.202 |
| 33 |
seniorcitizen |
0.194 |
| 16 |
multiplelines=yes |
0.162 |
| 17 |
onlinebackup=no |
0.117 |
| small_model_features = ["contract", "tenure", "monthlycharges"]
df_train[small_model_features].head()
|
|
contract |
tenure |
monthlycharges |
| 0 |
two_year |
72 |
115.50 |
| 1 |
month-to-month |
10 |
95.25 |
| 2 |
month-to-month |
5 |
75.55 |
| 3 |
month-to-month |
5 |
80.85 |
| 4 |
two_year |
18 |
20.10 |
Encode features
| dicts_train_small = df_train[small_model_features].to_dict(orient="records")
dicts_validation_small = df_validation[small_model_features].to_dict(
orient="records"
)
dict_vectorizer_small = DictVectorizer(sparse=False)
dict_vectorizer_small.fit(dicts_train_small)
dict_vectorizer_small.get_feature_names_out()
|
array(['contract=month-to-month', 'contract=one_year',
'contract=two_year', 'monthlycharges', 'tenure'], dtype=object)
Train model to get coefficients
| X_train_small = dict_vectorizer_small.transform(dicts_train_small)
model_small = LogisticRegression()
model_small.fit(X_train_small, y_train)
w0 = model_small.intercept_[0]
w = model_small.coef_[0]
w0, w
|
(np.float64(-2.477957595829565),
array([ 0.9711394 , -0.02379507, -0.94828863, 0.02748534, -0.03619005]))
Inspect coefficients
| small_coefficients = dict(
zip(
dict_vectorizer_small.get_feature_names_out(),
model_small.coef_[0].round(3),
)
)
small_coefficients
|
{'contract=month-to-month': np.float64(0.971),
'contract=one_year': np.float64(-0.024),
'contract=two_year': np.float64(-0.948),
'monthlycharges': np.float64(0.027),
'tenure': np.float64(-0.036)}
Calculate the result for a customer:
- month to month contract
- $50 of monthly charges
- 5 months that the customer has been with the company
Probability of churn
| contract = (
1 * small_coefficients["contract=month-to-month"]
+ 0 * small_coefficients["contract=one_year"]
+ 0 * small_coefficients["contract=two_year"]
)
monthly_charges = 50 * small_coefficients["monthlycharges"]
tenure = 5 * small_coefficients["tenure"]
sigmoid(model_small.intercept_[0] + contract + monthly_charges + tenure)
|
np.float64(0.41654870218821455)
Using the model
Train the model with complete dataset
| # Encode full train dataset features
dicts_full_train = df_full_train[
categorical_columns + numerical_columns
].to_dict(orient="records")
dicts_vectorizer_full_train = DictVectorizer(sparse=False)
X_full_train = dicts_vectorizer_full_train.fit_transform(dicts_full_train)
# Train model
y_full_train = df_full_train.churn.values
model = LogisticRegression(max_iter=99999)
model.fit(X_full_train, y_full_train)
# Encode test dataset features
dicts_test = df_test[categorical_columns + numerical_columns].to_dict(
orient="records"
)
X_test = dicts_vectorizer_full_train.transform(dicts_test)
# Predict results
y_test_pred = model.predict_proba(X_test)[:, 1]
churn_decision = y_test_pred >= 0.5
churn_decision[:5]
|
array([False, False, False, False, False])
Model accuracy
| (churn_decision == y_test).mean()
|
np.float64(0.8105039034776437)
| customer_data = dicts_test[10]
customer_data
|
{'gender': 'male',
'seniorcitizen': 1,
'partner': 'yes',
'dependents': 'yes',
'phoneservice': 'yes',
'multiplelines': 'no',
'internetservice': 'fiber_optic',
'onlinesecurity': 'no',
'onlinebackup': 'yes',
'deviceprotection': 'no',
'techsupport': 'no',
'streamingtv': 'yes',
'streamingmovies': 'yes',
'contract': 'month-to-month',
'paperlessbilling': 'yes',
'paymentmethod': 'mailed_check',
'tenure': 32,
'monthlycharges': 93.95,
'totalcharges': 2861.45}
Encode customer features to prediction
| X_customer = dicts_vectorizer_full_train.transform([customer_data])
X_customer.shape # 1 customer, 45 features
|
Predict customer churn
| model.predict_proba(X_customer)[:, 1]
|
when probability < 0.5, is more likely that customer will not churn
Confirm the actual label
Evaluation
Accuracy
- Evaluate the model on different thresholds
| thresholds = np.linspace(0, 1, 21)
scores = []
for threshold in thresholds:
score = accuracy_score(y_validation, y_validation_pred >= threshold)
print(f"{threshold:.2f}", f"{score:.2f}")
scores.append(score)
|
0.00 0.27
0.05 0.51
0.10 0.61
0.15 0.66
0.20 0.70
0.25 0.73
0.30 0.76
0.35 0.77
0.40 0.78
0.45 0.79
0.50 0.80
0.55 0.80
0.60 0.80
0.65 0.79
0.70 0.77
0.75 0.74
0.80 0.73
0.85 0.73
0.90 0.73
0.95 0.73
1.00 0.73
| Counter(y_validation_pred >= 1.0)
|
Counter({np.False_: 1409})
Check class imbalance
| df_full_train.churn.value_counts()
|
churn
0 4113
1 1521
Name: count, dtype: int64
If we have a class with a lot more values than another, this is called class imbalanced, and the accuracy measure can be misleading for this cases because is calculation is:
\[\frac{number\_correct\_predictions}{total\_predictions}\]
We must have a way to identify if our model is good besides class imbalance
Confusion Table
| True Negative |
False Negative |
False Positive |
True Positive |
| No Churn |
No Churn |
Churn |
Churn |
| Customer did not churn |
Customer churned |
Customer did not churn |
Customer Churned |
| Correct |
Wrong |
Wrong |
Correct |
Building a confusion matrix
| actual_positive = y_validation == 1
actual_negative = y_validation == 0
confusion_threshold = 0.5
predict_positive = y_validation_pred >= confusion_threshold
predict_negative = y_validation_pred < confusion_threshold
true_positive = (predict_positive & actual_positive).sum()
true_negative = (predict_negative & actual_negative).sum()
false_positive = (predict_positive & actual_negative).sum()
false_negative = (predict_negative & actual_positive).sum()
confusion_matrix = np.array(
[[true_negative, false_positive], [false_negative, true_positive]]
)
confusion_matrix
|
array([[920, 103],
[174, 212]])
Getting the accuracy from confusion matrix
| accuracy = (true_positive + true_negative) / (
true_positive + true_negative + false_positive + false_negative
)
accuracy
|
np.float64(0.8034066713981547)
Precision and Recall
Precision:
Fraction of positive predictions (customers that will churn) that are correct
\[\frac{true\_positives}{true\_positives + false\_positives}\]
| precision = true_positive / (true_positive + false_positive)
print(
f"Precision: From those we predicted would churn only {precision:.2f} actually would\n",
f"Wrongly said that would churn: {1.0 - precision:.2f}",
)
|
Precision: From those we predicted would churn only 0.67 actually would
Wrongly said that would churn: 0.33
Recall:
Fraction correctly identified positive examples
\[\frac{true\_positives}{true\_positives + false\_negatives}\]
| recall = true_positive / (true_positive + false_negative)
print(
f"Recall: From the customer that would actually churn {recall:.2f} customers were predicted to\n",
f"Failed to identify churning customers: {1.0 - recall:.2f}",
)
|
Recall: From the customer that would actually churn 0.55 customers were predicted to
Failed to identify churning customers: 0.45
ROC Curves
Receiver Operating Characteristics
Is a way to describe the performance of a binary classification model
TPR (true positive rate)
| true_positive_rate = true_positive / (true_positive + false_negative)
true_positive_rate
|
np.float64(0.5492227979274611)
FPR (false positive rate)
| false_positive_rate = false_positive / (true_positive + false_negative)
false_positive_rate
|
np.float64(0.266839378238342)
Evaluate different confusion matrixes for each threshold
| def tpr_fpr_dataframe(
y_validation: NDArray[Union[np.int64, np.float64]],
y_validation_pred: NDArray[np.float64],
) -> pd.DataFrame:
scores = []
thresholds = np.linspace(0, 1, 101)
for threshold in thresholds:
actual_positive = y_validation == 1
actual_negative = y_validation == 0
predict_positive = y_validation_pred >= threshold
predict_negative = y_validation_pred < threshold
true_positive = (predict_positive & actual_positive).sum()
true_negative = (predict_negative & actual_negative).sum()
false_positive = (predict_positive & actual_negative).sum()
false_negative = (predict_negative & actual_positive).sum()
scores.append(
(
threshold,
true_positive,
false_positive,
false_negative,
true_negative,
)
)
df_scores = pd.DataFrame(
scores,
columns=[
"threshold",
"true_positive",
"false_positive",
"false_negative",
"true_negative",
],
)
df_scores["true_positive_rate"] = df_scores.true_positive / (
df_scores.true_positive + df_scores.false_negative
)
df_scores["false_positive_rate"] = df_scores.false_positive / (
df_scores.false_positive + df_scores.true_negative
)
return df_scores
|
Visualize values in a DataFrame
| df_scores = tpr_fpr_dataframe(y_validation, y_validation_pred)
df_scores[::10]
|
|
threshold |
true_positive |
false_positive |
false_negative |
true_negative |
true_positive_rate |
false_positive_rate |
| 0 |
0.0 |
386 |
1023 |
0 |
0 |
1.000000 |
1.000000 |
| 10 |
0.1 |
366 |
533 |
20 |
490 |
0.948187 |
0.521017 |
| 20 |
0.2 |
339 |
372 |
47 |
651 |
0.878238 |
0.363636 |
| 30 |
0.3 |
292 |
247 |
94 |
776 |
0.756477 |
0.241447 |
| 40 |
0.4 |
254 |
175 |
132 |
848 |
0.658031 |
0.171065 |
| 50 |
0.5 |
212 |
103 |
174 |
920 |
0.549223 |
0.100684 |
| 60 |
0.6 |
151 |
53 |
235 |
970 |
0.391192 |
0.051808 |
| 70 |
0.7 |
69 |
13 |
317 |
1010 |
0.178756 |
0.012708 |
| 80 |
0.8 |
4 |
0 |
382 |
1023 |
0.010363 |
0.000000 |
| 90 |
0.9 |
0 |
0 |
386 |
1023 |
0.000000 |
0.000000 |
| 100 |
1.0 |
0 |
0 |
386 |
1023 |
0.000000 |
0.000000 |
Plot true positive rates and true negative rates
| plt.plot(df_scores.threshold, df_scores.true_positive_rate, label="TPR")
plt.plot(df_scores.threshold, df_scores.false_positive_rate, label="FPR")
plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")
plt.legend()
plt.show()
|
Random Model
The probability is almost the same as flipping a coin
| np.random.seed(1)
y_random = np.random.uniform(0, 1, size=len(y_validation))
y_random
|
array([4.17022005e-01, 7.20324493e-01, 1.14374817e-04, ...,
7.73916250e-01, 3.34276405e-01, 8.89982208e-02], shape=(1409,))
| ((y_random >= 0.5) == y_validation).mean()
|
np.float64(0.5017743080198722)
| df_random = tpr_fpr_dataframe(y_validation, y_random)
plt.plot(df_random.threshold, df_random.true_positive_rate, label="TPR")
plt.plot(df_random.threshold, df_random.false_positive_rate, label="FPR")
plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")
plt.legend()
plt.show()
|
Ideal Model
100% accuracy
| number_of_positives = (
y_validation == 1
).sum() # positive means that will churn
number_of_negatives = (
y_validation == 0
).sum() # negative means that will not churn
number_of_negatives, number_of_positives
|
(np.int64(1023), np.int64(386))
| y_ideal = np.repeat([0, 1], [number_of_negatives, number_of_positives])
y_ideal
|
array([0, 0, 0, ..., 1, 1, 1], shape=(1409,))
| y_ideal_pred = np.linspace(0, 1, len(y_validation))
y_ideal_pred
|
array([0.00000000e+00, 7.10227273e-04, 1.42045455e-03, ...,
9.98579545e-01, 9.99289773e-01, 1.00000000e+00], shape=(1409,))
np.float64(0.7260468417317246)
| ((y_ideal_pred >= 0.726) == y_ideal).mean()
|
| df_ideal = tpr_fpr_dataframe(y_ideal, y_ideal_pred)
df_ideal[::10]
|
|
threshold |
true_positive |
false_positive |
false_negative |
true_negative |
true_positive_rate |
false_positive_rate |
| 0 |
0.0 |
386 |
1023 |
0 |
0 |
1.000000 |
1.000000 |
| 10 |
0.1 |
386 |
882 |
0 |
141 |
1.000000 |
0.862170 |
| 20 |
0.2 |
386 |
741 |
0 |
282 |
1.000000 |
0.724340 |
| 30 |
0.3 |
386 |
600 |
0 |
423 |
1.000000 |
0.586510 |
| 40 |
0.4 |
386 |
459 |
0 |
564 |
1.000000 |
0.448680 |
| 50 |
0.5 |
386 |
319 |
0 |
704 |
1.000000 |
0.311828 |
| 60 |
0.6 |
386 |
178 |
0 |
845 |
1.000000 |
0.173998 |
| 70 |
0.7 |
386 |
37 |
0 |
986 |
1.000000 |
0.036168 |
| 80 |
0.8 |
282 |
0 |
104 |
1023 |
0.730570 |
0.000000 |
| 90 |
0.9 |
141 |
0 |
245 |
1023 |
0.365285 |
0.000000 |
| 100 |
1.0 |
1 |
0 |
385 |
1023 |
0.002591 |
0.000000 |
| plt.plot(df_ideal.threshold, df_ideal.true_positive_rate, label="TPR")
plt.plot(df_ideal.threshold, df_ideal.false_positive_rate, label="FPR")
plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")
plt.legend()
plt.show()
|
Plotting all models
| plt.plot(
df_scores.threshold, df_scores.true_positive_rate, label="TPR - scores"
)
plt.plot(
df_scores.threshold, df_scores.false_positive_rate, label="FPR - scores"
)
plt.plot(
df_ideal.threshold,
df_ideal.true_positive_rate,
label="TPR - ideal",
color="black",
)
plt.plot(
df_ideal.threshold,
df_ideal.false_positive_rate,
label="FPR - ideal",
color="black",
)
plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")
plt.legend()
plt.show()
|
Plotting rates
The model must be as close to ideal as possible, if below of random the model performance is really bad
| plt.figure(figsize=(5, 5))
plt.plot(
df_scores.false_positive_rate, df_scores.true_positive_rate, label="model"
)
plt.plot(
df_random.false_positive_rate,
df_random.true_positive_rate,
label="random",
)
plt.plot(
df_ideal.false_positive_rate,
df_ideal.true_positive_rate,
label="ideal",
)
plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")
plt.legend()
plt.show()
|
Using scikit learn
| skl_false_positive_rate, skl_true_positive_rate, skl_threshold = roc_curve(
y_validation, y_validation_pred
)
plt.figure(figsize=(5, 5))
plt.plot(
skl_false_positive_rate,
skl_true_positive_rate,
label="Model",
)
plt.plot([0, 1], [0, 1], label="Random", linestyle="--")
plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")
plt.legend()
plt.show()
|
ROC AUC
Receiver Operating Characteristics - Area Under The Curve
Measuring the area under the curve is possible to understand if te model is close to the ideal or the baseline
| skl_false_positive_rate, skl_true_positive_rate, skl_threshold = roc_curve(
y_validation, y_validation_pred
)
auc(skl_false_positive_rate, skl_true_positive_rate)
|
ROC AUC for scores DataFrame
| auc(df_scores.false_positive_rate, df_scores.true_positive_rate)
|
ROC AUC for ideal DataFrame
| auc(df_ideal.false_positive_rate, df_ideal.true_positive_rate)
|
Or using scikit-learn method
| roc_auc_score(y_validation, y_validation_pred)
|
Interpretation of ROC AUC
AUC: Probability that randomly selected positive example has higher score than randomly selected negative example and how well the model can order or customers
| negative_occurrences = y_validation_pred[y_validation == 0]
positive_occurrences = y_validation_pred[y_validation == 1]
|
Calculating roc auc score
| n = 100000
success = 0
for i in range(n):
positive_index = random.randint(0, len(positive_occurrences) - 1)
negative_index = random.randint(0, len(negative_occurrences) - 1)
if (
positive_occurrences[positive_index]
> negative_occurrences[negative_index]
):
success = success + 1
success / n
|
Or using numpy
| n = 100000
positive_index = np.random.randint(0, len(positive_occurrences), size=n)
negative_index = np.random.randint(0, len(negative_occurrences), size=n)
(
(
positive_occurrences[positive_index]
> negative_occurrences[negative_index]
)
).mean()
|
Cross-Validation
- Evaluating the same model on different subsets of data
- Get the average prediction and the spread within predictions
K-Fold Cross Validation
| Full train dataset |
Test dataset |
| Train + Validation |
Test |
Split the full train dataset in multiple parts (folds)
- Train with 1, 2 and validate with 3 using AUC
- Train with 1, 3 and validate with 2 using AUC
- Train with 2, 3 and validate with 1 using AUC
Define a train function
| def train(
df_train: pd.DataFrame, y_train: NDArray[np.float64], C=1.0
) -> Tuple[DictVectorizer, LogisticRegression]:
"""
Trains a logistic regression model using a DictVectorizer for feature encoding.
The function converts categorical and numerical features into a numeric
feature matrix using ``DictVectorizer`` and fits a logistic regression
classifier with L2 regularization.
Args:
df_train (pd.DataFrame): Training dataset containing both categorical
and numerical feature columns.
y_train (NDArray[np.float64]): Target labels corresponding to the
training data.
C (float, optional): Inverse of regularization strength for the logistic
regression model. Smaller values specify stronger regularization.
Defaults to 1.0.
Returns:
Tuple[DictVectorizer, LogisticRegression]:
- DictVectorizer: Fitted vectorizer used to transform feature
dictionaries into numeric arrays.
- LogisticRegression: Trained logistic regression model.
Raises:
ValueError: If the input data contains incompatible shapes or missing
required columns.
"""
dicts = df_train[categorical_columns + numerical_columns].to_dict(
orient="records"
)
dict_vectorizer = DictVectorizer(sparse=False)
X_train = dict_vectorizer.fit_transform(dicts)
model = LogisticRegression(max_iter=99999, C=C)
model.fit(X_train, y_train)
return dict_vectorizer, model
|
Train the model
| dict_vectorizer, model = train(df_train, y_train)
|
Define a predict function
| def predict(
df: pd.DataFrame,
dict_vectorizer: DictVectorizer,
model: LogisticRegression,
):
dicts = df[categorical_columns + numerical_columns].to_dict(
orient="records"
)
X = dict_vectorizer.transform(dicts)
y_pred = model.predict_proba(X)[:, 1]
return y_pred
|
Predict execution example
| predict(df_validation, dict_vectorizer, model)[:5]
|
array([0.0066238 , 0.20482253, 0.21781799, 0.56375043, 0.21867476])
Run cross validation
| n_splits = 5
for C in tqdm([0.001, 0.01, 0.1, 0.5, 1, 5, 10]):
scores = []
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)
for train_index, validation_index in kfold.split(df_full_train):
df_train = df_full_train.iloc[train_index]
df_validation = df_full_train.iloc[validation_index]
y_train = df_train.churn.values
y_validation = df_validation.churn.values
dict_vectorizer, model = train(df_train, y_train, C=C)
y_pred = predict(df_validation, dict_vectorizer, model)
auc = roc_auc_score(y_validation, y_pred)
scores.append(auc)
print(C, f"{np.mean(scores):.3f}", f"{np.std(scores):.3f}")
|
14%|█▍ | 1/7 [00:08<00:48, 8.04s/it]
29%|██▊ | 2/7 [00:27<01:14, 14.81s/it]
43%|████▎ | 3/7 [01:01<01:33, 23.43s/it]
57%|█████▋ | 4/7 [01:35<01:23, 27.88s/it]
71%|███████▏ | 5/7 [02:02<00:54, 27.43s/it]
86%|████████▌ | 6/7 [02:27<00:26, 26.63s/it]
100%|██████████| 7/7 [02:55<00:00, 25.05s/it]
Measure ROC AUC for the final model
| dict_vectorizer, model = train(
df_full_train, df_full_train.churn.values, C=1.0
)
y_pred = predict(df_test, dict_vectorizer, model)
auc = roc_auc_score(y_test, y_pred)
auc
|
Save the model
Define output dir
| OUTPUT_DIR = pathlib.Path("artifacts/predicting-customer-churn")
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
Define output file
| output_file = OUTPUT_DIR / f"model_C={C}.bin"
output_file
|
PosixPath('artifacts/predicting-customer-churn/model_C=10.bin')
Save model
| with open(output_file, "wb") as f_out:
pickle.dump((dict_vectorizer, model), f_out)
|
Load the model
| with open(output_file, "rb") as f_in:
loaded_dict_vectorizer, loaded_model = pickle.load(f_in)
loaded_dict_vectorizer, loaded_model
|
(DictVectorizer(sparse=False), LogisticRegression(max_iter=99999))
| customer = {
"gender": "female",
"seniorcitizen": 0,
"partner": "yes",
"dependents": "no",
"phoneservice": "no",
"multiplelines": "no_phone_service",
"internetservice": "dsl",
"onlinesecurity": "no",
"onlinebackup": "yes",
"deviceprotection": "no",
"techsupport": "no",
"streamingtv": "no",
"streamingmovies": "no",
"contract": "month-to-month",
"paperlessbilling": "yes",
"paymentmethod": "eletronic_check",
"tenure": 1,
"monthlycharges": 29.85,
"totalcharges": 29.85,
}
|
| X = loaded_dict_vectorizer.transform([customer])
|
| loaded_model.predict_proba(X)[0, 1]
|
np.float64(0.5786968179280463)