Churn Prediction¶

Binary classification

\[g(x_{i}) \approx y_{i}\]

\[y_{i} \in \{0, 1\}\]

$1$: Churn
$0$: No Churn

Dataset:

Install packages

!uv pip install -q \
    python-dotenv==1.2.1 \
    pandas==2.3.2 \
    pandas-stubs==2.3.2.250827 \
    numpy==2.3.2 \
    matplotlib==3.10.6 \
    seaborn==0.13.2 \
    scikit-learn==1.7.1 \
    tqdm==4.67.1

Append notebooks directory to sys.path

import sys

sys.path.append("../../..")

Import packages

import os
import pathlib
import random
from IPython.display import display
import matplotlib.pyplot as plt
import pandas as pd
from typing import Tuple, Union
import numpy as np
from numpy.typing import NDArray
import seaborn as sns
import datetime
from collections import Counter
import pickle
from dotenv import load_dotenv
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import (
    mutual_info_score, accuracy_score, roc_curve, auc, roc_auc_score
)
from tqdm import tqdm
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from notebooks.python.utils.data_extraction.data_extraction import (
    KaggleDataExtractor,
    KaggleExtractionConfig,
)

pd.set_option("display.max_columns", None)

sns.set_style("darkgrid")
sns.set_theme(style="darkgrid")

%matplotlib inline

load_dotenv()  # Root directory .env file

True

Utility scripts:¶

KaggleDataExtractor:

import base64
import io
import logging
import os
import zipfile
from abc import ABC, abstractmethod
from dataclasses import dataclass
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)-8s | %(name)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

logger = logging.getLogger("KaggleExtractor")


class ExtractionConfig(ABC):
    pass


class DataExtractionStrategy(ABC):
    @abstractmethod
    def download_dataset(self, config: ExtractionConfig) -> None:
        pass


@dataclass(frozen=True)
class KaggleExtractionConfig(ExtractionConfig):
    dataset_slug: str  # e.g. "zynicide/wine-reviews"
    file_name: str  # file inside the Kaggle zip
    destination_path: str  # folder to extract to
    output_file_name: str | None = None  # optional rename


class KaggleDataExtractor(DataExtractionStrategy):
    def __init__(self, username: str, api_token: str) -> None:
        self.username = username
        self.api_token = api_token
        self.auth_header = self._create_auth_header()

    def _create_auth_header(self):
        token = f"{self.username}:{self.api_token}"
        base64_token = base64.b64encode(token.encode()).decode()
        return {"Authorization": f"Basic {base64_token}"}

    def download_dataset(self, config: ExtractionConfig) -> None:
        if not isinstance(config, KaggleExtractionConfig):
            raise TypeError("config must be a KaggleExtractionConfig instance")

        url = f"https://www.kaggle.com/api/v1/datasets/download/{config.dataset_slug}"
        request = Request(url, headers=self.auth_header)

        logger.info(f"Starting download from Kaggle: {url}")

        try:
            with urlopen(request) as response:
                data = response.read()
            logger.info("Download completed. Extracting zip file...")

            os.makedirs(config.destination_path, exist_ok=True)

            with zipfile.ZipFile(io.BytesIO(data)) as z:
                extracted_path = z.extract(
                    config.file_name, path=config.destination_path
                )

            if config.output_file_name is None:
                logger.info(
                    f"Dataset '{config.file_name}' extracted successfully "
                    f"to: {config.destination_path}"
                )
                return

            old_path = os.path.join(config.destination_path, config.file_name)
            new_path = os.path.join(
                config.destination_path, config.output_file_name
            )

            os.rename(old_path, new_path)

            logger.info(
                f"Dataset '{config.file_name}' extracted successfully "
                f"to: {config.destination_path}"
            )

        except HTTPError as e:
            logger.error(f"HTTP Error {e.code}: {e.reason}")
        except URLError as e:
            logger.error(f"URL Error: {e.reason}")
        except zipfile.BadZipFile:
            logger.error(
                "Failed to read zip file. Kaggle may have returned HTML instead of a zip."
            )
        except Exception as e:
            logger.exception(f"Unexpected error occurred: {e}")

Create data directory

DATA_DIR = pathlib.Path("data/predicting-customer-churn")

os.makedirs(DATA_DIR, exist_ok=True)

Download dataset from Kaggle

username = os.getenv("KAGGLE_USERNAME")
api_token = os.getenv("KAGGLE_API_TOKEN")
file_name = "WA_Fn-UseC_-Telco-Customer-Churn.csv"

extractor = KaggleDataExtractor(username=username, api_token=api_token)

config = KaggleExtractionConfig(
    dataset_slug="blastchar/telco-customer-churn",
    file_name=file_name,
    destination_path=DATA_DIR,
    output_file_name="churn.csv",
)

if not os.path.isfile(DATA_DIR / "churn.csv"):
    extractor.download_dataset(config)

2026-01-04 11:17:27 | INFO     | KaggleExtractor | Starting download from Kaggle: https://www.kaggle.com/api/v1/datasets/download/blastchar/telco-customer-churn

2026-01-04 11:17:28 | INFO     | KaggleExtractor | Download completed. Extracting zip file...

2026-01-04 11:17:28 | INFO     | KaggleExtractor | Dataset 'WA_Fn-UseC_-Telco-Customer-Churn.csv' extracted successfully to: data/predicting-customer-churn

Data Preparation¶

Load dataset

df = pd.read_csv(DATA_DIR / "churn.csv")

df.head(n=2)

	customerID	gender	SeniorCitizen	Partner	Dependents	tenure	PhoneService	MultipleLines	InternetService	OnlineSecurity	OnlineBackup	DeviceProtection	TechSupport	StreamingTV	StreamingMovies	Contract	PaperlessBilling	PaymentMethod	MonthlyCharges	TotalCharges	Churn
0	7590-VHVEG	Female	0	Yes	No	1	No	No phone service	DSL	No	Yes	No	No	No	No	Month-to-month	Yes	Electronic check	29.85	29.85	No
1	5575-GNVDE	Male	0	No	No	34	Yes	No	DSL	Yes	No	Yes	No	No	No	One year	No	Mailed check	56.95	1889.5	No

Inspect all columns at once

df.head(3).T

	0	1	2
customerID	7590-VHVEG	5575-GNVDE	3668-QPYBK
gender	Female	Male	Male
SeniorCitizen	0	0	0
Partner	Yes	No	No
Dependents	No	No	No
tenure	1	34	2
PhoneService	No	Yes	Yes
MultipleLines	No phone service	No	No
InternetService	DSL	DSL	DSL
OnlineSecurity	No	Yes	Yes
OnlineBackup	Yes	No	Yes
DeviceProtection	No	Yes	No
TechSupport	No	No	No
StreamingTV	No	No	No
StreamingMovies	No	No	No
Contract	Month-to-month	One year	Month-to-month
PaperlessBilling	Yes	No	Yes
PaymentMethod	Electronic check	Mailed check	Mailed check
MonthlyCharges	29.85	56.95	53.85
TotalCharges	29.85	1889.5	108.15
Churn	No	No	Yes

Data summary

df_summary = pd.DataFrame(
    {
        "column": df.columns,
        "dtype": [df[col].dtype for col in df.columns],
        "sample_unique": [df[col].unique()[:6] for col in df.columns],
        "n_unique": [df[col].nunique() for col in df.columns],
    }
)
df_summary

	column	dtype	sample_unique	n_unique
0	customerID	object	[7590-VHVEG, 5575-GNVDE, 3668-QPYBK, 7795-CFOC...	7043
1	gender	object	[Female, Male]	2
2	SeniorCitizen	int64	[0, 1]	2
3	Partner	object	[Yes, No]	2
4	Dependents	object	[No, Yes]	2
5	tenure	int64	[1, 34, 2, 45, 8, 22]	73
6	PhoneService	object	[No, Yes]	2
7	MultipleLines	object	[No phone service, No, Yes]	3
8	InternetService	object	[DSL, Fiber optic, No]	3
9	OnlineSecurity	object	[No, Yes, No internet service]	3
10	OnlineBackup	object	[Yes, No, No internet service]	3
11	DeviceProtection	object	[No, Yes, No internet service]	3
12	TechSupport	object	[No, Yes, No internet service]	3
13	StreamingTV	object	[No, Yes, No internet service]	3
14	StreamingMovies	object	[No, Yes, No internet service]	3
15	Contract	object	[Month-to-month, One year, Two year]	3
16	PaperlessBilling	object	[Yes, No]	2
17	PaymentMethod	object	[Electronic check, Mailed check, Bank transfer...	4
18	MonthlyCharges	float64	[29.85, 56.95, 53.85, 42.3, 70.7, 99.65]	1585
19	TotalCharges	object	[29.85, 1889.5, 108.15, 1840.75, 151.65, 820.5]	6531
20	Churn	object	[No, Yes]	2

Clean column names

df.columns = df.columns.str.lower().str.replace(" ", "_")

df.head(n=2)

	customerid	gender	seniorcitizen	partner	dependents	tenure	phoneservice	multiplelines	internetservice	onlinesecurity	onlinebackup	deviceprotection	techsupport	streamingtv	streamingmovies	contract	paperlessbilling	paymentmethod	monthlycharges	totalcharges	churn
0	7590-VHVEG	Female	0	Yes	No	1	No	No phone service	DSL	No	Yes	No	No	No	No	Month-to-month	Yes	Electronic check	29.85	29.85	No
1	5575-GNVDE	Male	0	No	No	34	Yes	No	DSL	Yes	No	Yes	No	No	No	One year	No	Mailed check	56.95	1889.5	No

Select only object type columns

object_type_columns = list(df.dtypes[df.dtypes == "object"].index)
object_type_columns

['customerid',
 'gender',
 'partner',
 'dependents',
 'phoneservice',
 'multiplelines',
 'internetservice',
 'onlinesecurity',
 'onlinebackup',
 'deviceprotection',
 'techsupport',
 'streamingtv',
 'streamingmovies',
 'contract',
 'paperlessbilling',
 'paymentmethod',
 'totalcharges',
 'churn']

Clean columns

object_type_columns = list(df.dtypes[df.dtypes == "object"].index)
for column in object_type_columns:
    df[column] = df[column].str.lower().str.replace(" ", "_")

Inspect values of total charges, it should numeric

df.totalcharges[:5]

0      29.85
1     1889.5
2     108.15
3    1840.75
4     151.65
Name: totalcharges, dtype: object

Cast total charges to numeric type

total_charges = pd.to_numeric(df.totalcharges, errors="coerce")
total_charges[:5]

0      29.85
1    1889.50
2     108.15
3    1840.75
4     151.65
Name: totalcharges, dtype: float64

Check for null values

total_charges.loc[total_charges.isnull()][:5]

488    NaN
753    NaN
936    NaN
1082   NaN
1340   NaN
Name: totalcharges, dtype: float64

Treat the null values

df.totalcharges = total_charges.fillna(0)

Check churn field values

df.churn[:5]

0     no
1     no
2    yes
3     no
4    yes
Name: churn, dtype: object

Encode churn field to binary

(df.churn == "yes").astype(int)[:5]

0    0
1    0
2    1
3    0
4    1
Name: churn, dtype: int64

Set original churn dataset column to binary

df.churn = (df.churn == "yes").astype(int)

Validation Framework¶

Set split sizes

Training dataset: 60%
Validation dataset: 20%
Test dataset: 20%

Split dataset into full train (train + validation) and test

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

Get dataset's sizes

len(df_full_train), len(df_test)

(5634, 1409)

Calculate how train and validation dataset's sizes should be

print(
    f"df_full_train size: {(100 - 20)/100.:.0%}\n"
    f"df_test size: {(20)/100.:.0%}\n"
    f"df_train size: 60% of 80% = {(60)/80.:.0%}\n"
    f"df_validation size: 20% of 80% = {(20)/80.:.0%}\n"
)

df_full_train size: 80%

df_test size: 20%

df_train size: 60% of 80% = 75%

df_validation size: 20% of 80% = 25%

Split full train dataset into train and validation datasets

df_train, df_validation = train_test_split(
    df_full_train, test_size=0.25, random_state=1
)

Get full length of dataset

len(df_train), len(df_validation), len(df_test)

(4225, 1409, 1409)

Reset dataset's indexes

df_train.reset_index(drop=True, inplace=True)
df_validation.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)

Get target variables

y_train = df_train["churn"]
y_validation = df_validation["churn"]
y_test = df_test["churn"]

Remove target variables from original datasets

df_train.drop(columns=["churn"], inplace=True)
df_validation.drop(columns=["churn"], inplace=True)
df_test.drop(columns=["churn"], inplace=True)

Exploratory Data Analysis¶

Reset full train dataset index

df_full_train.reset_index(drop=True, inplace=True)

Inspect dataset

df_full_train.head(n=2)

	customerid	gender	seniorcitizen	partner	dependents	tenure	phoneservice	multiplelines	internetservice	onlinesecurity	onlinebackup	deviceprotection	techsupport	streamingtv	streamingmovies	contract	paperlessbilling	paymentmethod	monthlycharges	totalcharges	churn
0	5442-pptjy	male	0	yes	yes	12	yes	no	no	no_internet_service	no_internet_service	no_internet_service	no_internet_service	no_internet_service	no_internet_service	two_year	no	mailed_check	19.7	258.35	0
1	6261-rcvns	female	0	no	no	42	yes	no	dsl	yes	yes	yes	yes	no	yes	one_year	no	credit_card_(automatic)	73.9	3160.55	1

Check if null values are present

df.isnull().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

Inspect target variable churn

df_full_train.churn.value_counts()

churn
0    4113
1    1521
Name: count, dtype: int64

Get percent of total

df_full_train.churn.value_counts(normalize=True)

churn
0    0.730032
1    0.269968
Name: proportion, dtype: float64

Get mean

df_full_train.churn.mean()  # number of ones divided by total

np.float64(0.26996805111821087)

Mean and percent of total for churn is the same because is encoded to binary. So both calculations are number of ones divided by total

global_churn_rate = df_full_train.churn.mean()
round(global_churn_rate, 2)

np.float64(0.27)

Inspect columns types

df_full_train.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges        float64
churn                 int64
dtype: object

Set numerical columns

numerical_columns = ["tenure", "monthlycharges", "totalcharges"]

Set categorical columns

categorical_columns = [
    "gender",
    "seniorcitizen",
    "partner",
    "dependents",
    "phoneservice",
    "multiplelines",
    "internetservice",
    "onlinesecurity",
    "onlinebackup",
    "deviceprotection",
    "techsupport",
    "streamingtv",
    "streamingmovies",
    "contract",
    "paperlessbilling",
    "paymentmethod",
]

Inspect categorical columns

df_full_train[categorical_columns].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

Feature Importance¶

Churn Rate¶

Difference:

(global_churn_rate - group_churn_rate) > 0: Less likely to churn
(global_churn_rate - group_churn_rate) < 0: More likely to churn

Risk Ratio:

(group_churn_rate / global_churn_rate) > 1: More likely to churn
(group_churn_rate / global_churn_rate) < 1: Less likely to churn

df_groups = []

for column in categorical_columns:
    global_churn_rate = df_full_train.churn.mean()
    df_group = (
        df_full_train[[column, "churn"]]
        .groupby(column)
        .churn.agg(["mean", "count"])
    )
    df_group["diff"] = df_group["mean"] - global_churn_rate
    df_group["risk"] = df_group["mean"] / global_churn_rate
    df_group = df_group.reset_index().rename(columns={column: "label"})
    df_group.insert(0, "column", column)
    df_groups.append(df_group)

result = pd.concat(df_groups, ignore_index=True)
result

	column	label	mean	count	diff	risk
0	gender	female	0.276824	2796	0.006856	1.025396
1	gender	male	0.263214	2838	-0.006755	0.974980
2	seniorcitizen	0	0.242270	4722	-0.027698	0.897403
3	seniorcitizen	1	0.413377	912	0.143409	1.531208
4	partner	no	0.329809	2932	0.059841	1.221659
5	partner	yes	0.205033	2702	-0.064935	0.759472
6	dependents	no	0.313760	3968	0.043792	1.162212
7	dependents	yes	0.165666	1666	-0.104302	0.613651
8	phoneservice	no	0.241316	547	-0.028652	0.893870
9	phoneservice	yes	0.273049	5087	0.003081	1.011412
10	multiplelines	no	0.257407	2700	-0.012561	0.953474
11	multiplelines	no_phone_service	0.241316	547	-0.028652	0.893870
12	multiplelines	yes	0.290742	2387	0.020773	1.076948
13	internetservice	dsl	0.192347	1934	-0.077621	0.712482
14	internetservice	fiber_optic	0.425171	2479	0.155203	1.574895
15	internetservice	no	0.077805	1221	-0.192163	0.288201
16	onlinesecurity	no	0.420921	2801	0.150953	1.559152
17	onlinesecurity	no_internet_service	0.077805	1221	-0.192163	0.288201
18	onlinesecurity	yes	0.153226	1612	-0.116742	0.567570
19	onlinebackup	no	0.404323	2498	0.134355	1.497672
20	onlinebackup	no_internet_service	0.077805	1221	-0.192163	0.288201
21	onlinebackup	yes	0.217232	1915	-0.052736	0.804660
22	deviceprotection	no	0.395875	2473	0.125907	1.466379
23	deviceprotection	no_internet_service	0.077805	1221	-0.192163	0.288201
24	deviceprotection	yes	0.230412	1940	-0.039556	0.853480
25	techsupport	no	0.418914	2781	0.148946	1.551717
26	techsupport	no_internet_service	0.077805	1221	-0.192163	0.288201
27	techsupport	yes	0.159926	1632	-0.110042	0.592390
28	streamingtv	no	0.342832	2246	0.072864	1.269897
29	streamingtv	no_internet_service	0.077805	1221	-0.192163	0.288201
30	streamingtv	yes	0.302723	2167	0.032755	1.121328
31	streamingmovies	no	0.338906	2213	0.068938	1.255358
32	streamingmovies	no_internet_service	0.077805	1221	-0.192163	0.288201
33	streamingmovies	yes	0.307273	2200	0.037305	1.138182
34	contract	month-to-month	0.431701	3104	0.161733	1.599082
35	contract	one_year	0.120573	1186	-0.149395	0.446621
36	contract	two_year	0.028274	1344	-0.241694	0.104730
37	paperlessbilling	no	0.172071	2313	-0.097897	0.637375
38	paperlessbilling	yes	0.338151	3321	0.068183	1.252560
39	paymentmethod	bank_transfer_(automatic)	0.168171	1219	-0.101797	0.622928
40	paymentmethod	credit_card_(automatic)	0.164339	1217	-0.105630	0.608733
41	paymentmethod	electronic_check	0.455890	1893	0.185922	1.688682
42	paymentmethod	mailed_check	0.193870	1305	-0.076098	0.718121

Mutual information¶

Measure importance for categorical features

How much We can learn about one variable if we know the value of another

df_full_train[categorical_columns].apply(
    lambda col: mutual_info_score(col, df_full_train["churn"])
).sort_values(ascending=False)

contract            0.098320
onlinesecurity      0.063085
techsupport         0.061032
internetservice     0.055868
onlinebackup        0.046923
deviceprotection    0.043453
paymentmethod       0.043210
streamingtv         0.031853
streamingmovies     0.031581
paperlessbilling    0.017589
dependents          0.012346
partner             0.009968
seniorcitizen       0.009410
multiplelines       0.000857
phoneservice        0.000229
gender              0.000117
dtype: float64

Correlation¶

Measure importance for numerical features

Positive correlation: both variables increase together
Negative correlation: When a variable increase other decrease

df_full_train[numerical_columns].corrwith(df_full_train.churn)

tenure           -0.351885
monthlycharges    0.196805
totalcharges     -0.196353
dtype: float64

One-hot encoding¶

Encode categorical features

train_dicts = df_train[categorical_columns + numerical_columns].to_dict(
    orient="records"
)

dict_vectorizer = DictVectorizer(sparse=False)

X_train = dict_vectorizer.fit_transform(train_dicts)
X_train.shape

(4225, 45)

validation_dicts = df_validation[
    categorical_columns + numerical_columns
].to_dict(orient="records")

X_validation = dict_vectorizer.transform(validation_dicts)
X_validation.shape

(1409, 45)

Logistic Regression¶

\[g(x_{i}) = sigmoid(w_{j} + W^{t}x_{i})\]

Linear regression return a number between -$\infty$ and +$\infty$
Logistic regression return a number between 0 and 1, due the usage of sigmoid function

Regression¶

Linear: gives a score
Logistic: transform the score into a probability

Classification¶

Binary
Multiclass

Sigmoid function

def sigmoid(z: NDArray[np.float64]) -> NDArray[np.float64]:
    return 1 / (1 + np.exp(-z))

z = np.linspace(-7, 7, 51)

plt.plot(z, sigmoid(z))
plt.show()

Linear regression formula

def linear_regression(xi: NDArray[np.float64]) -> float:
    score = weight0

    for index in range(len(weights)):
        # linear operator (dot product)
        score = score + xi[index] * weights[index]

    return score

Logistic regression formula

def logistic_regression(xi: NDArray[np.float64]) -> float:
    score = weight0

    for index in range(len(weights)):
        # linear operator (dot product)
        score = score + xi[index] * weights[index]

    result = sigmoid(score)
    return score

Training a logistic regression model¶

model = LogisticRegression(max_iter=9999)
model.fit(X_train, y_train)

print(
    model.intercept_[0],  # bias term
    model.coef_[0].round(3),  # w (weights)
)

-0.045323382519725265 [ 0.685  0.039 -0.682  0.056 -0.015  0.114 -0.16   0.087  0.039  0.002

 -0.497  0.698 -0.16  -0.018 -0.187  0.066  0.162  0.117 -0.16   0.084

  0.285 -0.16  -0.084 -0.161  0.202 -0.045  0.086 -0.052 -0.003  0.106

 -0.011  0.066 -0.025  0.194 -0.094 -0.16   0.295 -0.054 -0.16   0.255

  0.235 -0.16  -0.034 -0.069  0.   ]

Probability of a customer churn without we knowing anything about it

sigmoid(model.intercept_[0])  # w0 (bias therm)

np.float64(0.4886710936321301)

# model.predict(X_train) # Label
y_validation_pred = model.predict_proba(X_validation)[:, 1]  # Probability
y_validation_pred

array([0.0066238 , 0.20482253, 0.21781799, ..., 0.15149644, 0.78847856,
       0.81190391], shape=(1409,))

churn_decision = y_validation_pred >= 0.5

Customers that may churn

df_validation[churn_decision].head()

	customerid	gender	partner	dependents	tenure	phoneservice	multiplelines	internetservice	onlinesecurity	onlinebackup	deviceprotection	techsupport	streamingtv	streamingmovies	contract	paperlessbilling	paymentmethod	monthlycharges	totalcharges
3	8433-wxgna	male	no	no	2	yes	no	fiber_optic	yes	no	no	no	no	no	month-to-month	yes	electronic_check	75.70	189.20
8	3440-jpscl	female	no	no	6	yes	no	fiber_optic	no	no	yes	yes	yes	yes	month-to-month	yes	mailed_check	99.95	547.65
12	7228-omtpn	male	no	no	4	yes	no	fiber_optic	no	no	no	no	yes	yes	month-to-month	yes	electronic_check	88.45	370.65
19	6711-fldfb	female	no	no	7	yes	yes	fiber_optic	no	no	no	no	no	no	month-to-month	yes	electronic_check	74.90	541.15
24	2612-ranwt	female	no	no	12	yes	yes	fiber_optic	no	no	yes	no	yes	yes	month-to-month	yes	bank_transfer_(automatic)	100.15	1164.30

(y_validation == churn_decision).mean()

np.float64(0.8034066713981547)

Doing verification step by step

df_pred = pd.DataFrame()
df_pred["probability"] = y_validation_pred
df_pred["prediction"] = churn_decision.astype(int)
df_pred["actual"] = y_validation
df_pred["correct"] = df_pred.prediction == df_pred.actual
df_pred.head()

	probability	prediction	actual	correct
0	0.006624	0	0	True
1	0.204823	0	0	True
2	0.217818	0	0	True
3	0.563750	1	1	True
4	0.218675	0	0	True

df_pred.correct.mean()

np.float64(0.8034066713981547)

Model interpretation¶

Using a smaller model (trained with less features) and multiplying the values for their weights to reach the predicted value is an easier approach

Understand weights for each feature

pd.DataFrame(
    list(
        zip(dict_vectorizer.get_feature_names_out(), model.coef_[0].round(3))
    ),
    columns=["feature", "coefficient"],
).sort_values(by=["coefficient"], ascending=False).head(10)

	feature	coefficient
11	internetservice=fiber_optic	0.698
0	contract=month-to-month	0.685
36	streamingmovies=yes	0.295
20	onlinesecurity=no	0.285
39	streamingtv=yes	0.255
40	techsupport=no	0.235
24	paperlessbilling=yes	0.202
33	seniorcitizen	0.194
16	multiplelines=yes	0.162
17	onlinebackup=no	0.117

small_model_features = ["contract", "tenure", "monthlycharges"]
df_train[small_model_features].head()

	contract	tenure	monthlycharges
0	two_year	72	115.50
1	month-to-month	10	95.25
2	month-to-month	5	75.55
3	month-to-month	5	80.85
4	two_year	18	20.10

Encode features

dicts_train_small = df_train[small_model_features].to_dict(orient="records")
dicts_validation_small = df_validation[small_model_features].to_dict(
    orient="records"
)

dict_vectorizer_small = DictVectorizer(sparse=False)
dict_vectorizer_small.fit(dicts_train_small)

dict_vectorizer_small.get_feature_names_out()

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'monthlycharges', 'tenure'], dtype=object)

Train model to get coefficients

X_train_small = dict_vectorizer_small.transform(dicts_train_small)

model_small = LogisticRegression()
model_small.fit(X_train_small, y_train)

w0 = model_small.intercept_[0]
w = model_small.coef_[0]

w0, w

(np.float64(-2.477957595829565),
 array([ 0.9711394 , -0.02379507, -0.94828863,  0.02748534, -0.03619005]))

Inspect coefficients

small_coefficients = dict(
    zip(
        dict_vectorizer_small.get_feature_names_out(),
        model_small.coef_[0].round(3),
    )
)

small_coefficients

{'contract=month-to-month': np.float64(0.971),
 'contract=one_year': np.float64(-0.024),
 'contract=two_year': np.float64(-0.948),
 'monthlycharges': np.float64(0.027),
 'tenure': np.float64(-0.036)}

Calculate the result for a customer:

month to month contract
$50 of monthly charges
5 months that the customer has been with the company

Probability of churn

contract = (
    1 * small_coefficients["contract=month-to-month"]
    + 0 * small_coefficients["contract=one_year"]
    + 0 * small_coefficients["contract=two_year"]
)
monthly_charges = 50 * small_coefficients["monthlycharges"]
tenure = 5 * small_coefficients["tenure"]

sigmoid(model_small.intercept_[0] + contract + monthly_charges + tenure)

np.float64(0.41654870218821455)

Using the model¶

Train the model with complete dataset

# Encode full train dataset features
dicts_full_train = df_full_train[
    categorical_columns + numerical_columns
].to_dict(orient="records")

dicts_vectorizer_full_train = DictVectorizer(sparse=False)
X_full_train = dicts_vectorizer_full_train.fit_transform(dicts_full_train)

# Train model
y_full_train = df_full_train.churn.values
model = LogisticRegression(max_iter=99999)
model.fit(X_full_train, y_full_train)

# Encode test dataset features
dicts_test = df_test[categorical_columns + numerical_columns].to_dict(
    orient="records"
)
X_test = dicts_vectorizer_full_train.transform(dicts_test)

# Predict results
y_test_pred = model.predict_proba(X_test)[:, 1]
churn_decision = y_test_pred >= 0.5
churn_decision[:5]

array([False, False, False, False, False])

Model accuracy

(churn_decision == y_test).mean()

np.float64(0.8105039034776437)

customer_data = dicts_test[10]
customer_data

{'gender': 'male',
 'seniorcitizen': 1,
 'partner': 'yes',
 'dependents': 'yes',
 'phoneservice': 'yes',
 'multiplelines': 'no',
 'internetservice': 'fiber_optic',
 'onlinesecurity': 'no',
 'onlinebackup': 'yes',
 'deviceprotection': 'no',
 'techsupport': 'no',
 'streamingtv': 'yes',
 'streamingmovies': 'yes',
 'contract': 'month-to-month',
 'paperlessbilling': 'yes',
 'paymentmethod': 'mailed_check',
 'tenure': 32,
 'monthlycharges': 93.95,
 'totalcharges': 2861.45}

Encode customer features to prediction

X_customer = dicts_vectorizer_full_train.transform([customer_data])
X_customer.shape  # 1 customer, 45 features

(1, 45)

Predict customer churn

model.predict_proba(X_customer)[:, 1]

array([0.47632824])

when probability < 0.5, is more likely that customer will not churn

Confirm the actual label

y_test[10]

np.int64(0)

Evaluation¶

Accuracy¶

Evaluate the model on different thresholds

thresholds = np.linspace(0, 1, 21)

scores = []

for threshold in thresholds:
    score = accuracy_score(y_validation, y_validation_pred >= threshold)
    print(f"{threshold:.2f}", f"{score:.2f}")
    scores.append(score)

Counter(y_validation_pred >= 1.0)

Counter({np.False_: 1409})

Check class imbalance

df_full_train.churn.value_counts()

churn
0    4113
1    1521
Name: count, dtype: int64

If we have a class with a lot more values than another, this is called class imbalanced, and the accuracy measure can be misleading for this cases because is calculation is:

\[\frac{number\_correct\_predictions}{total\_predictions}\]

We must have a way to identify if our model is good besides class imbalance

Confusion Table¶

True Negative	False Negative	False Positive	True Positive
No Churn	No Churn	Churn	Churn
Customer did not churn	Customer churned	Customer did not churn	Customer Churned
Correct	Wrong	Wrong	Correct

Building a confusion matrix

actual_positive = y_validation == 1
actual_negative = y_validation == 0

confusion_threshold = 0.5

predict_positive = y_validation_pred >= confusion_threshold
predict_negative = y_validation_pred < confusion_threshold

true_positive = (predict_positive & actual_positive).sum()
true_negative = (predict_negative & actual_negative).sum()
false_positive = (predict_positive & actual_negative).sum()
false_negative = (predict_negative & actual_positive).sum()

confusion_matrix = np.array(
    [[true_negative, false_positive], [false_negative, true_positive]]
)

confusion_matrix

array([[920, 103],
       [174, 212]])

Getting the accuracy from confusion matrix

accuracy = (true_positive + true_negative) / (
    true_positive + true_negative + false_positive + false_negative
)

accuracy

np.float64(0.8034066713981547)

Precision and Recall¶

Precision:

Fraction of positive predictions (customers that will churn) that are correct

\[\frac{true\_positives}{true\_positives + false\_positives}\]

precision = true_positive / (true_positive + false_positive)
print(
    f"Precision: From those we predicted would churn only {precision:.2f} actually would\n",
    f"Wrongly said that would churn: {1.0 - precision:.2f}",
)

Precision: From those we predicted would churn only 0.67 actually would

 Wrongly said that would churn: 0.33

Recall:

Fraction correctly identified positive examples

\[\frac{true\_positives}{true\_positives + false\_negatives}\]

recall = true_positive / (true_positive + false_negative)
print(
    f"Recall: From the customer that would actually churn {recall:.2f} customers were predicted to\n",
    f"Failed to identify churning customers: {1.0 - recall:.2f}",
)

Recall: From the customer that would actually churn 0.55 customers were predicted to

 Failed to identify churning customers: 0.45

ROC Curves¶

Receiver Operating Characteristics

Is a way to describe the performance of a binary classification model

TPR (true positive rate)

true_positive_rate = true_positive / (true_positive + false_negative)
true_positive_rate

np.float64(0.5492227979274611)

FPR (false positive rate)

false_positive_rate = false_positive / (true_positive + false_negative)
false_positive_rate

np.float64(0.266839378238342)

Evaluate different confusion matrixes for each threshold

def tpr_fpr_dataframe(
    y_validation: NDArray[Union[np.int64, np.float64]],
    y_validation_pred: NDArray[np.float64],
) -> pd.DataFrame:
    scores = []

    thresholds = np.linspace(0, 1, 101)

    for threshold in thresholds:
        actual_positive = y_validation == 1
        actual_negative = y_validation == 0

        predict_positive = y_validation_pred >= threshold
        predict_negative = y_validation_pred < threshold

        true_positive = (predict_positive & actual_positive).sum()
        true_negative = (predict_negative & actual_negative).sum()
        false_positive = (predict_positive & actual_negative).sum()
        false_negative = (predict_negative & actual_positive).sum()

        scores.append(
            (
                threshold,
                true_positive,
                false_positive,
                false_negative,
                true_negative,
            )
        )

        df_scores = pd.DataFrame(
            scores,
            columns=[
                "threshold",
                "true_positive",
                "false_positive",
                "false_negative",
                "true_negative",
            ],
        )

    df_scores["true_positive_rate"] = df_scores.true_positive / (
        df_scores.true_positive + df_scores.false_negative
    )

    df_scores["false_positive_rate"] = df_scores.false_positive / (
        df_scores.false_positive + df_scores.true_negative
    )
    return df_scores

Visualize values in a DataFrame

df_scores = tpr_fpr_dataframe(y_validation, y_validation_pred)

df_scores[::10]

	threshold	true_positive	false_positive	false_negative	true_negative	true_positive_rate	false_positive_rate
0	0.0	386	1023	0	0	1.000000	1.000000
10	0.1	366	533	20	490	0.948187	0.521017
20	0.2	339	372	47	651	0.878238	0.363636
30	0.3	292	247	94	776	0.756477	0.241447
40	0.4	254	175	132	848	0.658031	0.171065
50	0.5	212	103	174	920	0.549223	0.100684
60	0.6	151	53	235	970	0.391192	0.051808
70	0.7	69	13	317	1010	0.178756	0.012708
80	0.8	4	0	382	1023	0.010363	0.000000
90	0.9	0	0	386	1023	0.000000	0.000000
100	1.0	0	0	386	1023	0.000000	0.000000

Plot true positive rates and true negative rates

plt.plot(df_scores.threshold, df_scores.true_positive_rate, label="TPR")
plt.plot(df_scores.threshold, df_scores.false_positive_rate, label="FPR")

plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")

plt.legend()
plt.show()

Random Model¶

The probability is almost the same as flipping a coin

np.random.seed(1)
y_random = np.random.uniform(0, 1, size=len(y_validation))
y_random

array([4.17022005e-01, 7.20324493e-01, 1.14374817e-04, ...,
       7.73916250e-01, 3.34276405e-01, 8.89982208e-02], shape=(1409,))

((y_random >= 0.5) == y_validation).mean()

np.float64(0.5017743080198722)

df_random = tpr_fpr_dataframe(y_validation, y_random)

plt.plot(df_random.threshold, df_random.true_positive_rate, label="TPR")
plt.plot(df_random.threshold, df_random.false_positive_rate, label="FPR")

plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")

plt.legend()
plt.show()

Ideal Model¶

100% accuracy

number_of_positives = (
    y_validation == 1
).sum()  # positive means that will churn
number_of_negatives = (
    y_validation == 0
).sum()  # negative means that will not churn

number_of_negatives, number_of_positives

(np.int64(1023), np.int64(386))

y_ideal = np.repeat([0, 1], [number_of_negatives, number_of_positives])
y_ideal

array([0, 0, 0, ..., 1, 1, 1], shape=(1409,))

y_ideal_pred = np.linspace(0, 1, len(y_validation))
y_ideal_pred

array([0.00000000e+00, 7.10227273e-04, 1.42045455e-03, ...,
       9.98579545e-01, 9.99289773e-01, 1.00000000e+00], shape=(1409,))

1 - y_validation.mean()

np.float64(0.7260468417317246)

((y_ideal_pred >= 0.726) == y_ideal).mean()

np.float64(1.0)

df_ideal = tpr_fpr_dataframe(y_ideal, y_ideal_pred)

df_ideal[::10]

	threshold	true_positive	false_positive	false_negative	true_negative	true_positive_rate	false_positive_rate
0	0.0	386	1023	0	0	1.000000	1.000000
10	0.1	386	882	0	141	1.000000	0.862170
20	0.2	386	741	0	282	1.000000	0.724340
30	0.3	386	600	0	423	1.000000	0.586510
40	0.4	386	459	0	564	1.000000	0.448680
50	0.5	386	319	0	704	1.000000	0.311828
60	0.6	386	178	0	845	1.000000	0.173998
70	0.7	386	37	0	986	1.000000	0.036168
80	0.8	282	0	104	1023	0.730570	0.000000
90	0.9	141	0	245	1023	0.365285	0.000000
100	1.0	1	0	385	1023	0.002591	0.000000

plt.plot(df_ideal.threshold, df_ideal.true_positive_rate, label="TPR")
plt.plot(df_ideal.threshold, df_ideal.false_positive_rate, label="FPR")

plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")
plt.legend()
plt.show()

Plotting all models¶

plt.plot(
    df_scores.threshold, df_scores.true_positive_rate, label="TPR - scores"
)
plt.plot(
    df_scores.threshold, df_scores.false_positive_rate, label="FPR - scores"
)

plt.plot(
    df_ideal.threshold,
    df_ideal.true_positive_rate,
    label="TPR - ideal",
    color="black",
)
plt.plot(
    df_ideal.threshold,
    df_ideal.false_positive_rate,
    label="FPR - ideal",
    color="black",
)

plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")

plt.legend()
plt.show()

Plotting rates

The model must be as close to ideal as possible, if below of random the model performance is really bad

plt.figure(figsize=(5, 5))

plt.plot(
    df_scores.false_positive_rate, df_scores.true_positive_rate, label="model"
)
plt.plot(
    df_random.false_positive_rate,
    df_random.true_positive_rate,
    label="random",
)
plt.plot(
    df_ideal.false_positive_rate,
    df_ideal.true_positive_rate,
    label="ideal",
)

plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")
plt.legend()
plt.show()

Using scikit learn

skl_false_positive_rate, skl_true_positive_rate, skl_threshold = roc_curve(
    y_validation, y_validation_pred
)

plt.figure(figsize=(5, 5))


plt.plot(
    skl_false_positive_rate,
    skl_true_positive_rate,
    label="Model",
)
plt.plot([0, 1], [0, 1], label="Random", linestyle="--")

plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")
plt.legend()
plt.show()

ROC AUC¶

Receiver Operating Characteristics - Area Under The Curve

Measuring the area under the curve is possible to understand if te model is close to the ideal or the baseline

skl_false_positive_rate, skl_true_positive_rate, skl_threshold = roc_curve(
    y_validation, y_validation_pred
)

auc(skl_false_positive_rate, skl_true_positive_rate)

0.8463120254863528

ROC AUC for scores DataFrame

auc(df_scores.false_positive_rate, df_scores.true_positive_rate)

0.8459194991870908

ROC AUC for ideal DataFrame

auc(df_ideal.false_positive_rate, df_ideal.true_positive_rate)

0.9999430203759136

Or using scikit-learn method

roc_auc_score(y_validation, y_validation_pred)

0.8463120254863528

Interpretation of ROC AUC

AUC: Probability that randomly selected positive example has higher score than randomly selected negative example and how well the model can order or customers

negative_occurrences = y_validation_pred[y_validation == 0]
positive_occurrences = y_validation_pred[y_validation == 1]

Calculating roc auc score

n = 100000
success = 0

for i in range(n):
    positive_index = random.randint(0, len(positive_occurrences) - 1)
    negative_index = random.randint(0, len(negative_occurrences) - 1)

    if (
        positive_occurrences[positive_index]
        > negative_occurrences[negative_index]
    ):
        success = success + 1

success / n

0.8461

Or using numpy

n = 100000
positive_index = np.random.randint(0, len(positive_occurrences), size=n)
negative_index = np.random.randint(0, len(negative_occurrences), size=n)

(
    (
        positive_occurrences[positive_index]
        > negative_occurrences[negative_index]
    )
).mean()

np.float64(0.8455)

Cross-Validation¶

Evaluating the same model on different subsets of data
Get the average prediction and the spread within predictions

K-Fold Cross Validation¶

Full train dataset	Test dataset
Train + Validation	Test

Split the full train dataset in multiple parts (folds)

1	2	3

Train with 1, 2 and validate with 3 using AUC
Train with 1, 3 and validate with 2 using AUC
Train with 2, 3 and validate with 1 using AUC

Define a train function

def train(
    df_train: pd.DataFrame, y_train: NDArray[np.float64], C=1.0
) -> Tuple[DictVectorizer, LogisticRegression]:
    """
    Trains a logistic regression model using a DictVectorizer for feature encoding.

    The function converts categorical and numerical features into a numeric
    feature matrix using ``DictVectorizer`` and fits a logistic regression
    classifier with L2 regularization.

    Args:
        df_train (pd.DataFrame): Training dataset containing both categorical
            and numerical feature columns.
        y_train (NDArray[np.float64]): Target labels corresponding to the
            training data.
        C (float, optional): Inverse of regularization strength for the logistic
            regression model. Smaller values specify stronger regularization.
            Defaults to 1.0.

    Returns:
        Tuple[DictVectorizer, LogisticRegression]:
            - DictVectorizer: Fitted vectorizer used to transform feature
              dictionaries into numeric arrays.
            - LogisticRegression: Trained logistic regression model.

    Raises:
        ValueError: If the input data contains incompatible shapes or missing
            required columns.
    """
    dicts = df_train[categorical_columns + numerical_columns].to_dict(
        orient="records"
    )

    dict_vectorizer = DictVectorizer(sparse=False)
    X_train = dict_vectorizer.fit_transform(dicts)

    model = LogisticRegression(max_iter=99999, C=C)
    model.fit(X_train, y_train)

    return dict_vectorizer, model

Train the model

dict_vectorizer, model = train(df_train, y_train)

Define a predict function

def predict(
    df: pd.DataFrame,
    dict_vectorizer: DictVectorizer,
    model: LogisticRegression,
):
    dicts = df[categorical_columns + numerical_columns].to_dict(
        orient="records"
    )
    X = dict_vectorizer.transform(dicts)

    y_pred = model.predict_proba(X)[:, 1]

    return y_pred

Predict execution example

predict(df_validation, dict_vectorizer, model)[:5]

array([0.0066238 , 0.20482253, 0.21781799, 0.56375043, 0.21867476])

Run cross validation

n_splits = 5

for C in tqdm([0.001, 0.01, 0.1, 0.5, 1, 5, 10]):
    scores = []

    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)

    for train_index, validation_index in kfold.split(df_full_train):
        df_train = df_full_train.iloc[train_index]
        df_validation = df_full_train.iloc[validation_index]

        y_train = df_train.churn.values
        y_validation = df_validation.churn.values

        dict_vectorizer, model = train(df_train, y_train, C=C)
        y_pred = predict(df_validation, dict_vectorizer, model)

        auc = roc_auc_score(y_validation, y_pred)
        scores.append(auc)

    print(C, f"{np.mean(scores):.3f}", f"{np.std(scores):.3f}")

 14%|█▍        | 1/7 [00:08<00:48,  8.04s/it]

0.001 0.825 0.009

 29%|██▊       | 2/7 [00:27<01:14, 14.81s/it]

0.01 0.840 0.008

 43%|████▎     | 3/7 [01:01<01:33, 23.43s/it]

0.1 0.842 0.007

 57%|█████▋    | 4/7 [01:35<01:23, 27.88s/it]

0.5 0.842 0.007

 71%|███████▏  | 5/7 [02:02<00:54, 27.43s/it]

1 0.842 0.007

 86%|████████▌ | 6/7 [02:27<00:26, 26.63s/it]

5 0.842 0.007

100%|██████████| 7/7 [02:55<00:00, 25.05s/it]

10 0.842 0.007

Measure ROC AUC for the final model

dict_vectorizer, model = train(
    df_full_train, df_full_train.churn.values, C=1.0
)
y_pred = predict(df_test, dict_vectorizer, model)

auc = roc_auc_score(y_test, y_pred)
auc

0.8584032088573997

Save the model¶

Define output dir

OUTPUT_DIR = pathlib.Path("artifacts/predicting-customer-churn")

os.makedirs(OUTPUT_DIR, exist_ok=True)

Define output file

output_file = OUTPUT_DIR / f"model_C={C}.bin"
output_file

PosixPath('artifacts/predicting-customer-churn/model_C=10.bin')

Save model

with open(output_file, "wb") as f_out:
    pickle.dump((dict_vectorizer, model), f_out)

Load the model

with open(output_file, "rb") as f_in:
    loaded_dict_vectorizer, loaded_model = pickle.load(f_in)

loaded_dict_vectorizer, loaded_model

(DictVectorizer(sparse=False), LogisticRegression(max_iter=99999))

customer = {
    "gender": "female",
    "seniorcitizen": 0,
    "partner": "yes",
    "dependents": "no",
    "phoneservice": "no",
    "multiplelines": "no_phone_service",
    "internetservice": "dsl",
    "onlinesecurity": "no",
    "onlinebackup": "yes",
    "deviceprotection": "no",
    "techsupport": "no",
    "streamingtv": "no",
    "streamingmovies": "no",
    "contract": "month-to-month",
    "paperlessbilling": "yes",
    "paymentmethod": "eletronic_check",
    "tenure": 1,
    "monthlycharges": 29.85,
    "totalcharges": 29.85,
}

X = loaded_dict_vectorizer.transform([customer])

loaded_model.predict_proba(X)[0, 1]

np.float64(0.5786968179280463)