Skip to content

Churn Prediction

  • Binary classification
\[g(x_{i}) \approx y_{i}\]
\[y_{i} \in \{0, 1\}\]
  • \(1\): Churn
  • \(0\): No Churn

Dataset:

telco-customer-churn

Install packages

1
2
3
4
5
6
7
8
9
!uv pip install -q \
    python-dotenv==1.2.1 \
    pandas==2.3.2 \
    pandas-stubs==2.3.2.250827 \
    numpy==2.3.2 \
    matplotlib==3.10.6 \
    seaborn==0.13.2 \
    scikit-learn==1.7.1 \
    tqdm==4.67.1

Append notebooks directory to sys.path

1
2
3
import sys

sys.path.append("../../..")

Import packages

import os
import pathlib
import random
from IPython.display import display
import matplotlib.pyplot as plt
import pandas as pd
from typing import Tuple, Union
import numpy as np
from numpy.typing import NDArray
import seaborn as sns
import datetime
from collections import Counter
import pickle
from dotenv import load_dotenv
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import (
    mutual_info_score, accuracy_score, roc_curve, auc, roc_auc_score
)
from tqdm import tqdm
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from notebooks.python.utils.data_extraction.data_extraction import (
    KaggleDataExtractor,
    KaggleExtractionConfig,
)

pd.set_option("display.max_columns", None)

sns.set_style("darkgrid")
sns.set_theme(style="darkgrid")

%matplotlib inline

load_dotenv()  # Root directory .env file
True

Utility scripts:

KaggleDataExtractor:

import base64
import io
import logging
import os
import zipfile
from abc import ABC, abstractmethod
from dataclasses import dataclass
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)-8s | %(name)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

logger = logging.getLogger("KaggleExtractor")


class ExtractionConfig(ABC):
    """Marker base class for per-source extraction configuration objects."""

    pass


class DataExtractionStrategy(ABC):
    """Interface for dataset-download strategies (strategy pattern)."""

    @abstractmethod
    def download_dataset(self, config: ExtractionConfig) -> None:
        """Fetch the dataset described by *config* and place it on disk."""
        pass


@dataclass(frozen=True)
class KaggleExtractionConfig(ExtractionConfig):
    """Immutable settings for downloading one file from a Kaggle dataset."""

    dataset_slug: str  # e.g. "zynicide/wine-reviews"
    file_name: str  # file inside the Kaggle zip
    destination_path: str  # folder to extract to
    output_file_name: str | None = None  # optional rename


class KaggleDataExtractor(DataExtractionStrategy):
    """Download a Kaggle dataset zip over HTTP and extract a single file.

    Authenticates against the Kaggle API with HTTP Basic auth built from
    the given username and API token.
    """

    def __init__(self, username: str, api_token: str) -> None:
        self.username = username
        self.api_token = api_token
        self.auth_header = self._create_auth_header()

    def _create_auth_header(self) -> dict[str, str]:
        """Build the Basic-auth header expected by the Kaggle API."""
        token = f"{self.username}:{self.api_token}"
        base64_token = base64.b64encode(token.encode()).decode()
        return {"Authorization": f"Basic {base64_token}"}

    def download_dataset(self, config: ExtractionConfig) -> None:
        """Download the dataset zip and extract ``config.file_name``.

        If ``config.output_file_name`` is set, the extracted file is renamed
        to it. Network, zip, and filesystem errors are logged rather than
        raised (best-effort semantics, as in the original design).

        Raises:
            TypeError: if *config* is not a ``KaggleExtractionConfig``.
        """
        if not isinstance(config, KaggleExtractionConfig):
            raise TypeError("config must be a KaggleExtractionConfig instance")

        url = f"https://www.kaggle.com/api/v1/datasets/download/{config.dataset_slug}"
        request = Request(url, headers=self.auth_header)

        logger.info(f"Starting download from Kaggle: {url}")

        try:
            with urlopen(request) as response:
                data = response.read()
            logger.info("Download completed. Extracting zip file...")

            os.makedirs(config.destination_path, exist_ok=True)

            with zipfile.ZipFile(io.BytesIO(data)) as z:
                z.extract(config.file_name, path=config.destination_path)

            final_name = config.file_name
            if config.output_file_name is not None:
                old_path = os.path.join(
                    config.destination_path, config.file_name
                )
                new_path = os.path.join(
                    config.destination_path, config.output_file_name
                )
                os.rename(old_path, new_path)
                final_name = config.output_file_name

            # Bug fix: the original logged the pre-rename file name even
            # after renaming; report the name actually present on disk.
            logger.info(
                f"Dataset '{final_name}' extracted successfully "
                f"to: {config.destination_path}"
            )

        except HTTPError as e:
            logger.error(f"HTTP Error {e.code}: {e.reason}")
        except URLError as e:
            logger.error(f"URL Error: {e.reason}")
        except zipfile.BadZipFile:
            logger.error(
                "Failed to read zip file. Kaggle may have returned HTML instead of a zip."
            )
        except Exception as e:
            logger.exception(f"Unexpected error occurred: {e}")

Create data directory

1
2
3
DATA_DIR = pathlib.Path("data/predicting-customer-churn")

os.makedirs(DATA_DIR, exist_ok=True)

Download dataset from Kaggle

username = os.getenv("KAGGLE_USERNAME")
api_token = os.getenv("KAGGLE_API_TOKEN")
file_name = "WA_Fn-UseC_-Telco-Customer-Churn.csv"

extractor = KaggleDataExtractor(username=username, api_token=api_token)

config = KaggleExtractionConfig(
    dataset_slug="blastchar/telco-customer-churn",
    file_name=file_name,
    destination_path=DATA_DIR,
    output_file_name="churn.csv",
)

if not os.path.isfile(DATA_DIR / "churn.csv"):
    extractor.download_dataset(config)
2026-01-04 11:17:27 | INFO     | KaggleExtractor | Starting download from Kaggle: https://www.kaggle.com/api/v1/datasets/download/blastchar/telco-customer-churn

2026-01-04 11:17:28 | INFO     | KaggleExtractor | Download completed. Extracting zip file...

2026-01-04 11:17:28 | INFO     | KaggleExtractor | Dataset 'WA_Fn-UseC_-Telco-Customer-Churn.csv' extracted successfully to: data/predicting-customer-churn

Data Preparation

Load dataset

1
2
3
df = pd.read_csv(DATA_DIR / "churn.csv")

df.head(n=2)
customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity OnlineBackup DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
0 7590-VHVEG Female 0 Yes No 1 No No phone service DSL No Yes No No No No Month-to-month Yes Electronic check 29.85 29.85 No
1 5575-GNVDE Male 0 No No 34 Yes No DSL Yes No Yes No No No One year No Mailed check 56.95 1889.5 No

Inspect all columns at once

df.head(3).T
0 1 2
customerID 7590-VHVEG 5575-GNVDE 3668-QPYBK
gender Female Male Male
SeniorCitizen 0 0 0
Partner Yes No No
Dependents No No No
tenure 1 34 2
PhoneService No Yes Yes
MultipleLines No phone service No No
InternetService DSL DSL DSL
OnlineSecurity No Yes Yes
OnlineBackup Yes No Yes
DeviceProtection No Yes No
TechSupport No No No
StreamingTV No No No
StreamingMovies No No No
Contract Month-to-month One year Month-to-month
PaperlessBilling Yes No Yes
PaymentMethod Electronic check Mailed check Mailed check
MonthlyCharges 29.85 56.95 53.85
TotalCharges 29.85 1889.5 108.15
Churn No No Yes

Data summary

1
2
3
4
5
6
7
8
9
df_summary = pd.DataFrame(
    {
        "column": df.columns,
        "dtype": [df[col].dtype for col in df.columns],
        "sample_unique": [df[col].unique()[:6] for col in df.columns],
        "n_unique": [df[col].nunique() for col in df.columns],
    }
)
df_summary
column dtype sample_unique n_unique
0 customerID object [7590-VHVEG, 5575-GNVDE, 3668-QPYBK, 7795-CFOC... 7043
1 gender object [Female, Male] 2
2 SeniorCitizen int64 [0, 1] 2
3 Partner object [Yes, No] 2
4 Dependents object [No, Yes] 2
5 tenure int64 [1, 34, 2, 45, 8, 22] 73
6 PhoneService object [No, Yes] 2
7 MultipleLines object [No phone service, No, Yes] 3
8 InternetService object [DSL, Fiber optic, No] 3
9 OnlineSecurity object [No, Yes, No internet service] 3
10 OnlineBackup object [Yes, No, No internet service] 3
11 DeviceProtection object [No, Yes, No internet service] 3
12 TechSupport object [No, Yes, No internet service] 3
13 StreamingTV object [No, Yes, No internet service] 3
14 StreamingMovies object [No, Yes, No internet service] 3
15 Contract object [Month-to-month, One year, Two year] 3
16 PaperlessBilling object [Yes, No] 2
17 PaymentMethod object [Electronic check, Mailed check, Bank transfer... 4
18 MonthlyCharges float64 [29.85, 56.95, 53.85, 42.3, 70.7, 99.65] 1585
19 TotalCharges object [29.85, 1889.5, 108.15, 1840.75, 151.65, 820.5] 6531
20 Churn object [No, Yes] 2

Clean column names

1
2
3
df.columns = df.columns.str.lower().str.replace(" ", "_")

df.head(n=2)
customerid gender seniorcitizen partner dependents tenure phoneservice multiplelines internetservice onlinesecurity onlinebackup deviceprotection techsupport streamingtv streamingmovies contract paperlessbilling paymentmethod monthlycharges totalcharges churn
0 7590-VHVEG Female 0 Yes No 1 No No phone service DSL No Yes No No No No Month-to-month Yes Electronic check 29.85 29.85 No
1 5575-GNVDE Male 0 No No 34 Yes No DSL Yes No Yes No No No One year No Mailed check 56.95 1889.5 No

Select only object type columns

object_type_columns = list(df.dtypes[df.dtypes == "object"].index)
object_type_columns
['customerid',
 'gender',
 'partner',
 'dependents',
 'phoneservice',
 'multiplelines',
 'internetservice',
 'onlinesecurity',
 'onlinebackup',
 'deviceprotection',
 'techsupport',
 'streamingtv',
 'streamingmovies',
 'contract',
 'paperlessbilling',
 'paymentmethod',
 'totalcharges',
 'churn']

Clean columns

1
2
3
object_type_columns = list(df.dtypes[df.dtypes == "object"].index)
for column in object_type_columns:
    df[column] = df[column].str.lower().str.replace(" ", "_")

Inspect the values of total charges; it should be numeric

df.totalcharges[:5]
0      29.85
1     1889.5
2     108.15
3    1840.75
4     151.65
Name: totalcharges, dtype: object

Cast total charges to numeric type

total_charges = pd.to_numeric(df.totalcharges, errors="coerce")
total_charges[:5]
0      29.85
1    1889.50
2     108.15
3    1840.75
4     151.65
Name: totalcharges, dtype: float64

Check for null values

total_charges.loc[total_charges.isnull()][:5]
488    NaN
753    NaN
936    NaN
1082   NaN
1340   NaN
Name: totalcharges, dtype: float64

Treat the null values

df.totalcharges = total_charges.fillna(0)

Check churn field values

df.churn[:5]
0     no
1     no
2    yes
3     no
4    yes
Name: churn, dtype: object

Encode churn field to binary

(df.churn == "yes").astype(int)[:5]
0    0
1    0
2    1
3    0
4    1
Name: churn, dtype: int64

Set original churn dataset column to binary

df.churn = (df.churn == "yes").astype(int)

Validation Framework

Set split sizes

  • Training dataset: 60%
  • Validation dataset: 20%
  • Test dataset: 20%

Split dataset into full train (train + validation) and test

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

Get dataset's sizes

len(df_full_train), len(df_test)
(5634, 1409)

Calculate how train and validation dataset's sizes should be

1
2
3
4
5
6
print(
    f"df_full_train size: {(100 - 20)/100.:.0%}\n"
    f"df_test size: {(20)/100.:.0%}\n"
    f"df_train size: 60% of 80% = {(60)/80.:.0%}\n"
    f"df_validation size: 20% of 80% = {(20)/80.:.0%}\n"
)
df_full_train size: 80%

df_test size: 20%

df_train size: 60% of 80% = 75%

df_validation size: 20% of 80% = 25%

Split full train dataset into train and validation datasets

1
2
3
df_train, df_validation = train_test_split(
    df_full_train, test_size=0.25, random_state=1
)

Get full length of dataset

len(df_train), len(df_validation), len(df_test)
(4225, 1409, 1409)

Reset dataset's indexes

1
2
3
df_train.reset_index(drop=True, inplace=True)
df_validation.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)

Get target variables

1
2
3
y_train = df_train["churn"]
y_validation = df_validation["churn"]
y_test = df_test["churn"]

Remove target variables from original datasets

1
2
3
df_train.drop(columns=["churn"], inplace=True)
df_validation.drop(columns=["churn"], inplace=True)
df_test.drop(columns=["churn"], inplace=True)

Exploratory Data Analysis

Reset full train dataset index

df_full_train.reset_index(drop=True, inplace=True)

Inspect dataset

df_full_train.head(n=2)
customerid gender seniorcitizen partner dependents tenure phoneservice multiplelines internetservice onlinesecurity onlinebackup deviceprotection techsupport streamingtv streamingmovies contract paperlessbilling paymentmethod monthlycharges totalcharges churn
0 5442-pptjy male 0 yes yes 12 yes no no no_internet_service no_internet_service no_internet_service no_internet_service no_internet_service no_internet_service two_year no mailed_check 19.7 258.35 0
1 6261-rcvns female 0 no no 42 yes no dsl yes yes yes yes no yes one_year no credit_card_(automatic) 73.9 3160.55 1

Check if null values are present

df.isnull().sum()
customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

Inspect target variable churn

df_full_train.churn.value_counts()
churn
0    4113
1    1521
Name: count, dtype: int64

Get percent of total

df_full_train.churn.value_counts(normalize=True)
churn
0    0.730032
1    0.269968
Name: proportion, dtype: float64

Get mean

df_full_train.churn.mean()  # number of ones divided by total
np.float64(0.26996805111821087)

The mean and the percent of total for churn are the same because churn is encoded as binary, so both calculations are the number of ones divided by the total

global_churn_rate = df_full_train.churn.mean()
round(global_churn_rate, 2)
np.float64(0.27)

Inspect columns types

df_full_train.dtypes
customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges        float64
churn                 int64
dtype: object

Set numerical columns

numerical_columns = ["tenure", "monthlycharges", "totalcharges"]

Set categorical columns

categorical_columns = [
    "gender",
    "seniorcitizen",
    "partner",
    "dependents",
    "phoneservice",
    "multiplelines",
    "internetservice",
    "onlinesecurity",
    "onlinebackup",
    "deviceprotection",
    "techsupport",
    "streamingtv",
    "streamingmovies",
    "contract",
    "paperlessbilling",
    "paymentmethod",
]

Inspect categorical columns

df_full_train[categorical_columns].nunique()
gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

Feature Importance

Churn Rate

Difference:

  • (global_churn_rate - group_churn_rate) > 0: Less likely to churn
  • (global_churn_rate - group_churn_rate) < 0: More likely to churn

Risk Ratio:

  • (group_churn_rate / global_churn_rate) > 1: More likely to churn
  • (group_churn_rate / global_churn_rate) < 1: Less likely to churn
df_groups = []

for column in categorical_columns:
    global_churn_rate = df_full_train.churn.mean()
    df_group = (
        df_full_train[[column, "churn"]]
        .groupby(column)
        .churn.agg(["mean", "count"])
    )
    df_group["diff"] = df_group["mean"] - global_churn_rate
    df_group["risk"] = df_group["mean"] / global_churn_rate
    df_group = df_group.reset_index().rename(columns={column: "label"})
    df_group.insert(0, "column", column)
    df_groups.append(df_group)

result = pd.concat(df_groups, ignore_index=True)
result
column label mean count diff risk
0 gender female 0.276824 2796 0.006856 1.025396
1 gender male 0.263214 2838 -0.006755 0.974980
2 seniorcitizen 0 0.242270 4722 -0.027698 0.897403
3 seniorcitizen 1 0.413377 912 0.143409 1.531208
4 partner no 0.329809 2932 0.059841 1.221659
5 partner yes 0.205033 2702 -0.064935 0.759472
6 dependents no 0.313760 3968 0.043792 1.162212
7 dependents yes 0.165666 1666 -0.104302 0.613651
8 phoneservice no 0.241316 547 -0.028652 0.893870
9 phoneservice yes 0.273049 5087 0.003081 1.011412
10 multiplelines no 0.257407 2700 -0.012561 0.953474
11 multiplelines no_phone_service 0.241316 547 -0.028652 0.893870
12 multiplelines yes 0.290742 2387 0.020773 1.076948
13 internetservice dsl 0.192347 1934 -0.077621 0.712482
14 internetservice fiber_optic 0.425171 2479 0.155203 1.574895
15 internetservice no 0.077805 1221 -0.192163 0.288201
16 onlinesecurity no 0.420921 2801 0.150953 1.559152
17 onlinesecurity no_internet_service 0.077805 1221 -0.192163 0.288201
18 onlinesecurity yes 0.153226 1612 -0.116742 0.567570
19 onlinebackup no 0.404323 2498 0.134355 1.497672
20 onlinebackup no_internet_service 0.077805 1221 -0.192163 0.288201
21 onlinebackup yes 0.217232 1915 -0.052736 0.804660
22 deviceprotection no 0.395875 2473 0.125907 1.466379
23 deviceprotection no_internet_service 0.077805 1221 -0.192163 0.288201
24 deviceprotection yes 0.230412 1940 -0.039556 0.853480
25 techsupport no 0.418914 2781 0.148946 1.551717
26 techsupport no_internet_service 0.077805 1221 -0.192163 0.288201
27 techsupport yes 0.159926 1632 -0.110042 0.592390
28 streamingtv no 0.342832 2246 0.072864 1.269897
29 streamingtv no_internet_service 0.077805 1221 -0.192163 0.288201
30 streamingtv yes 0.302723 2167 0.032755 1.121328
31 streamingmovies no 0.338906 2213 0.068938 1.255358
32 streamingmovies no_internet_service 0.077805 1221 -0.192163 0.288201
33 streamingmovies yes 0.307273 2200 0.037305 1.138182
34 contract month-to-month 0.431701 3104 0.161733 1.599082
35 contract one_year 0.120573 1186 -0.149395 0.446621
36 contract two_year 0.028274 1344 -0.241694 0.104730
37 paperlessbilling no 0.172071 2313 -0.097897 0.637375
38 paperlessbilling yes 0.338151 3321 0.068183 1.252560
39 paymentmethod bank_transfer_(automatic) 0.168171 1219 -0.101797 0.622928
40 paymentmethod credit_card_(automatic) 0.164339 1217 -0.105630 0.608733
41 paymentmethod electronic_check 0.455890 1893 0.185922 1.688682
42 paymentmethod mailed_check 0.193870 1305 -0.076098 0.718121

Mutual information

Measure importance for categorical features

How much we can learn about one variable if we know the value of another

1
2
3
df_full_train[categorical_columns].apply(
    lambda col: mutual_info_score(col, df_full_train["churn"])
).sort_values(ascending=False)
contract            0.098320
onlinesecurity      0.063085
techsupport         0.061032
internetservice     0.055868
onlinebackup        0.046923
deviceprotection    0.043453
paymentmethod       0.043210
streamingtv         0.031853
streamingmovies     0.031581
paperlessbilling    0.017589
dependents          0.012346
partner             0.009968
seniorcitizen       0.009410
multiplelines       0.000857
phoneservice        0.000229
gender              0.000117
dtype: float64

Correlation

Measure importance for numerical features

  • Positive correlation: both variables increase together
  • Negative correlation: when one variable increases, the other decreases
df_full_train[numerical_columns].corrwith(df_full_train.churn)
tenure           -0.351885
monthlycharges    0.196805
totalcharges     -0.196353
dtype: float64

One-hot encoding

Encode categorical features

1
2
3
4
5
6
7
8
train_dicts = df_train[categorical_columns + numerical_columns].to_dict(
    orient="records"
)

dict_vectorizer = DictVectorizer(sparse=False)

X_train = dict_vectorizer.fit_transform(train_dicts)
X_train.shape
(4225, 45)
1
2
3
4
5
6
validation_dicts = df_validation[
    categorical_columns + numerical_columns
].to_dict(orient="records")

X_validation = dict_vectorizer.transform(validation_dicts)
X_validation.shape
(1409, 45)

Logistic Regression

\[g(x_{i}) = \mathrm{sigmoid}(w_{0} + w^{T}x_{i})\]
  • Linear regression returns a number between -\(\infty\) and +\(\infty\)
  • Logistic regression returns a number between 0 and 1, due to the use of the sigmoid function

Regression

  • Linear: gives a score
  • Logistic: transform the score into a probability

Classification

  • Binary
  • Multiclass

Sigmoid function

1
2
3
4
5
6
7
8
9
def sigmoid(z: NDArray[np.float64]) -> NDArray[np.float64]:
    """Squash real-valued scores elementwise into the (0, 1) range."""
    denominator = 1.0 + np.exp(-z)
    return 1.0 / denominator


z = np.linspace(-7, 7, 51)


plt.plot(z, sigmoid(z))
plt.show()
output_91_0.png Linear regression formula

1
2
3
4
5
6
7
8
def linear_regression(xi: NDArray[np.float64]) -> float:
    """Return the raw linear score: bias plus the dot product of the
    module-level ``weights`` with the feature vector ``xi``."""
    # linear operator (dot product), written as a sum over weight indices
    dot_product = sum(
        xi[j] * weights[j] for j in range(len(weights))
    )
    return weight0 + dot_product

Logistic regression formula

1
2
3
4
5
6
7
8
9
def logistic_regression(xi: NDArray[np.float64]) -> float:
    """Score ``xi`` with the linear model, then map the score to a
    probability with the ``sigmoid`` helper defined earlier.

    Relies on module-level ``weight0`` (bias) and ``weights`` (coefficients).
    """
    score = weight0

    for index in range(len(weights)):
        # linear operator (dot product)
        score = score + xi[index] * weights[index]

    # Bug fix: the original computed the sigmoid but returned the raw
    # linear score, making this identical to linear_regression.
    result = sigmoid(score)
    return result

Training a logistic regression model

1
2
3
4
5
6
7
model = LogisticRegression(max_iter=9999)
model.fit(X_train, y_train)

print(
    model.intercept_[0],  # bias term
    model.coef_[0].round(3),  # w (weights)
)
-0.045323382519725265 [ 0.685  0.039 -0.682  0.056 -0.015  0.114 -0.16   0.087  0.039  0.002

 -0.497  0.698 -0.16  -0.018 -0.187  0.066  0.162  0.117 -0.16   0.084

  0.285 -0.16  -0.084 -0.161  0.202 -0.045  0.086 -0.052 -0.003  0.106

 -0.011  0.066 -0.025  0.194 -0.094 -0.16   0.295 -0.054 -0.16   0.255

  0.235 -0.16  -0.034 -0.069  0.   ]

Probability of a customer churn without we knowing anything about it

sigmoid(model.intercept_[0])  # w0 (bias therm)
np.float64(0.4886710936321301)
1
2
3
# model.predict(X_train) # Label
y_validation_pred = model.predict_proba(X_validation)[:, 1]  # Probability
y_validation_pred
array([0.0066238 , 0.20482253, 0.21781799, ..., 0.15149644, 0.78847856,
       0.81190391], shape=(1409,))
churn_decision = y_validation_pred >= 0.5

Customers that may churn

df_validation[churn_decision].head()
customerid gender seniorcitizen partner dependents tenure phoneservice multiplelines internetservice onlinesecurity onlinebackup deviceprotection techsupport streamingtv streamingmovies contract paperlessbilling paymentmethod monthlycharges totalcharges
3 8433-wxgna male 0 no no 2 yes no fiber_optic yes no no no no no month-to-month yes electronic_check 75.70 189.20
8 3440-jpscl female 0 no no 6 yes no fiber_optic no no yes yes yes yes month-to-month yes mailed_check 99.95 547.65
12 7228-omtpn male 0 no no 4 yes no fiber_optic no no no no yes yes month-to-month yes electronic_check 88.45 370.65
19 6711-fldfb female 0 no no 7 yes yes fiber_optic no no no no no no month-to-month yes electronic_check 74.90 541.15
24 2612-ranwt female 0 no no 12 yes yes fiber_optic no no yes no yes yes month-to-month yes bank_transfer_(automatic) 100.15 1164.30
(y_validation == churn_decision).mean()
np.float64(0.8034066713981547)

Doing verification step by step

1
2
3
4
5
6
df_pred = pd.DataFrame()
df_pred["probability"] = y_validation_pred
df_pred["prediction"] = churn_decision.astype(int)
df_pred["actual"] = y_validation
df_pred["correct"] = df_pred.prediction == df_pred.actual
df_pred.head()
probability prediction actual correct
0 0.006624 0 0 True
1 0.204823 0 0 True
2 0.217818 0 0 True
3 0.563750 1 1 True
4 0.218675 0 0 True
df_pred.correct.mean()
np.float64(0.8034066713981547)

Model interpretation

  • Using a smaller model (trained with fewer features) and multiplying the feature values by their weights to reach the predicted value is an easier approach

Understand weights for each feature

1
2
3
4
5
6
pd.DataFrame(
    list(
        zip(dict_vectorizer.get_feature_names_out(), model.coef_[0].round(3))
    ),
    columns=["feature", "coefficient"],
).sort_values(by=["coefficient"], ascending=False).head(10)
feature coefficient
11 internetservice=fiber_optic 0.698
0 contract=month-to-month 0.685
36 streamingmovies=yes 0.295
20 onlinesecurity=no 0.285
39 streamingtv=yes 0.255
40 techsupport=no 0.235
24 paperlessbilling=yes 0.202
33 seniorcitizen 0.194
16 multiplelines=yes 0.162
17 onlinebackup=no 0.117
small_model_features = ["contract", "tenure", "monthlycharges"]
df_train[small_model_features].head()
contract tenure monthlycharges
0 two_year 72 115.50
1 month-to-month 10 95.25
2 month-to-month 5 75.55
3 month-to-month 5 80.85
4 two_year 18 20.10

Encode features

1
2
3
4
5
6
7
8
9
dicts_train_small = df_train[small_model_features].to_dict(orient="records")
dicts_validation_small = df_validation[small_model_features].to_dict(
    orient="records"
)

dict_vectorizer_small = DictVectorizer(sparse=False)
dict_vectorizer_small.fit(dicts_train_small)

dict_vectorizer_small.get_feature_names_out()
array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'monthlycharges', 'tenure'], dtype=object)

Train model to get coefficients

1
2
3
4
5
6
7
8
9
X_train_small = dict_vectorizer_small.transform(dicts_train_small)

model_small = LogisticRegression()
model_small.fit(X_train_small, y_train)

w0 = model_small.intercept_[0]
w = model_small.coef_[0]

w0, w
(np.float64(-2.477957595829565),
 array([ 0.9711394 , -0.02379507, -0.94828863,  0.02748534, -0.03619005]))

Inspect coefficients

1
2
3
4
5
6
7
8
small_coefficients = dict(
    zip(
        dict_vectorizer_small.get_feature_names_out(),
        model_small.coef_[0].round(3),
    )
)

small_coefficients
{'contract=month-to-month': np.float64(0.971),
 'contract=one_year': np.float64(-0.024),
 'contract=two_year': np.float64(-0.948),
 'monthlycharges': np.float64(0.027),
 'tenure': np.float64(-0.036)}

Calculate the result for a customer:

  • month to month contract
  • $50 of monthly charges
  • 5 months that the customer has been with the company

Probability of churn

1
2
3
4
5
6
7
8
9
contract = (
    1 * small_coefficients["contract=month-to-month"]
    + 0 * small_coefficients["contract=one_year"]
    + 0 * small_coefficients["contract=two_year"]
)
monthly_charges = 50 * small_coefficients["monthlycharges"]
tenure = 5 * small_coefficients["tenure"]

sigmoid(model_small.intercept_[0] + contract + monthly_charges + tenure)
np.float64(0.41654870218821455)

Using the model

Train the model with complete dataset

# Encode full train dataset features
dicts_full_train = df_full_train[
    categorical_columns + numerical_columns
].to_dict(orient="records")

dicts_vectorizer_full_train = DictVectorizer(sparse=False)
X_full_train = dicts_vectorizer_full_train.fit_transform(dicts_full_train)

# Train model
y_full_train = df_full_train.churn.values
model = LogisticRegression(max_iter=99999)
model.fit(X_full_train, y_full_train)

# Encode test dataset features
dicts_test = df_test[categorical_columns + numerical_columns].to_dict(
    orient="records"
)
X_test = dicts_vectorizer_full_train.transform(dicts_test)

# Predict results
y_test_pred = model.predict_proba(X_test)[:, 1]
churn_decision = y_test_pred >= 0.5
churn_decision[:5]
array([False, False, False, False, False])

Model accuracy

(churn_decision == y_test).mean()
np.float64(0.8105039034776437)
customer_data = dicts_test[10]
customer_data
{'gender': 'male',
 'seniorcitizen': 1,
 'partner': 'yes',
 'dependents': 'yes',
 'phoneservice': 'yes',
 'multiplelines': 'no',
 'internetservice': 'fiber_optic',
 'onlinesecurity': 'no',
 'onlinebackup': 'yes',
 'deviceprotection': 'no',
 'techsupport': 'no',
 'streamingtv': 'yes',
 'streamingmovies': 'yes',
 'contract': 'month-to-month',
 'paperlessbilling': 'yes',
 'paymentmethod': 'mailed_check',
 'tenure': 32,
 'monthlycharges': 93.95,
 'totalcharges': 2861.45}

Encode customer features to prediction

X_customer = dicts_vectorizer_full_train.transform([customer_data])
X_customer.shape  # 1 customer, 45 features
(1, 45)

Predict customer churn

model.predict_proba(X_customer)[:, 1]
array([0.47632824])

When the probability is < 0.5, it is more likely that the customer will not churn

Confirm the actual label

y_test[10]
np.int64(0)

Evaluation

Accuracy

  • Evaluate the model on different thresholds
1
2
3
4
5
6
7
8
thresholds = np.linspace(0, 1, 21)

scores = []

for threshold in thresholds:
    score = accuracy_score(y_validation, y_validation_pred >= threshold)
    print(f"{threshold:.2f}", f"{score:.2f}")
    scores.append(score)
0.00 0.27

0.05 0.51

0.10 0.61

0.15 0.66

0.20 0.70

0.25 0.73

0.30 0.76

0.35 0.77

0.40 0.78

0.45 0.79

0.50 0.80

0.55 0.80

0.60 0.80

0.65 0.79

0.70 0.77

0.75 0.74

0.80 0.73

0.85 0.73

0.90 0.73

0.95 0.73

1.00 0.73
Counter(y_validation_pred >= 1.0)
Counter({np.False_: 1409})

Check class imbalance

df_full_train.churn.value_counts()
churn
0    4113
1    1521
Name: count, dtype: int64

If one class has a lot more examples than another, this is called class imbalance, and the accuracy measure can be misleading in these cases because its calculation is:

\[\frac{number\_correct\_predictions}{total\_predictions}\]

We must have a way to identify if our model is good besides class imbalance

Confusion Table

True Negative False Negative False Positive True Positive
No Churn No Churn Churn Churn
Customer did not churn Customer churned Customer did not churn Customer Churned
Correct Wrong Wrong Correct

Building a confusion matrix

actual_positive = y_validation == 1
actual_negative = y_validation == 0

confusion_threshold = 0.5

predict_positive = y_validation_pred >= confusion_threshold
predict_negative = y_validation_pred < confusion_threshold

true_positive = (predict_positive & actual_positive).sum()
true_negative = (predict_negative & actual_negative).sum()
false_positive = (predict_positive & actual_negative).sum()
false_negative = (predict_negative & actual_positive).sum()

confusion_matrix = np.array(
    [[true_negative, false_positive], [false_negative, true_positive]]
)

confusion_matrix
array([[920, 103],
       [174, 212]])

Getting the accuracy from confusion matrix

1
2
3
4
5
accuracy = (true_positive + true_negative) / (
    true_positive + true_negative + false_positive + false_negative
)

accuracy
np.float64(0.8034066713981547)

Precision and Recall

Precision:

Fraction of positive predictions (customers that will churn) that are correct

\[\frac{true\_positives}{true\_positives + false\_positives}\]
1
2
3
4
5
precision = true_positive / (true_positive + false_positive)
print(
    f"Precision: From those we predicted would churn only {precision:.2f} actually would\n",
    f"Wrongly said that would churn: {1.0 - precision:.2f}",
)
Precision: From those we predicted would churn only 0.67 actually would

 Wrongly said that would churn: 0.33

Recall:

Fraction correctly identified positive examples

\[\frac{true\_positives}{true\_positives + false\_negatives}\]
1
2
3
4
5
recall = true_positive / (true_positive + false_negative)
print(
    f"Recall: From the customer that would actually churn {recall:.2f} customers were predicted to\n",
    f"Failed to identify churning customers: {1.0 - recall:.2f}",
)
Recall: From the customer that would actually churn 0.55 customers were predicted to

 Failed to identify churning customers: 0.45

ROC Curves

Receiver Operating Characteristics

Is a way to describe the performance of a binary classification model

TPR (true positive rate)

true_positive_rate = true_positive / (true_positive + false_negative)
true_positive_rate
np.float64(0.5492227979274611)

FPR (false positive rate)

# FPR = FP / (FP + TN): share of actual negatives wrongly flagged as churn.
# Bug fix: the original divided by (TP + FN), the count of actual
# positives, which is the wrong denominator for a false positive *rate*.
false_positive_rate = false_positive / (false_positive + true_negative)
false_positive_rate
np.float64(0.266839378238342)

Evaluate different confusion matrixes for each threshold

def tpr_fpr_dataframe(
    y_validation: NDArray[Union[np.int64, np.float64]],
    y_validation_pred: NDArray[np.float64],
) -> pd.DataFrame:
    """
    Computes confusion-matrix counts and TPR/FPR for 101 decision thresholds.

    For each threshold in [0, 1] (step 0.01) the predicted probabilities are
    binarized with ``y_validation_pred >= threshold`` and compared against the
    true labels to obtain the confusion-matrix counts, from which the true
    positive rate and false positive rate are derived.

    Args:
        y_validation (NDArray): Ground-truth binary labels (0 or 1).
        y_validation_pred (NDArray[np.float64]): Predicted probabilities
            in [0, 1], aligned with ``y_validation``.

    Returns:
        pd.DataFrame: One row per threshold with columns ``threshold``,
            ``true_positive``, ``false_positive``, ``false_negative``,
            ``true_negative``, ``true_positive_rate`` and
            ``false_positive_rate``.
    """
    # The label masks do not depend on the threshold, so compute them once
    # instead of on every loop iteration.
    actual_positive = y_validation == 1
    actual_negative = y_validation == 0

    scores = []
    thresholds = np.linspace(0, 1, 101)

    for threshold in thresholds:
        predict_positive = y_validation_pred >= threshold
        predict_negative = y_validation_pred < threshold

        true_positive = (predict_positive & actual_positive).sum()
        true_negative = (predict_negative & actual_negative).sum()
        false_positive = (predict_positive & actual_negative).sum()
        false_negative = (predict_negative & actual_positive).sum()

        scores.append(
            (
                threshold,
                true_positive,
                false_positive,
                false_negative,
                true_negative,
            )
        )

    # Build the DataFrame once, after the loop. The original rebuilt it on
    # every iteration, which is quadratic in the number of thresholds.
    df_scores = pd.DataFrame(
        scores,
        columns=[
            "threshold",
            "true_positive",
            "false_positive",
            "false_negative",
            "true_negative",
        ],
    )

    # TPR = TP / (TP + FN); FPR = FP / (FP + TN), computed column-wise.
    df_scores["true_positive_rate"] = df_scores.true_positive / (
        df_scores.true_positive + df_scores.false_negative
    )

    df_scores["false_positive_rate"] = df_scores.false_positive / (
        df_scores.false_positive + df_scores.true_negative
    )
    return df_scores

Visualize values in a DataFrame

1
2
3
df_scores = tpr_fpr_dataframe(y_validation, y_validation_pred)

df_scores[::10]
threshold true_positive false_positive false_negative true_negative true_positive_rate false_positive_rate
0 0.0 386 1023 0 0 1.000000 1.000000
10 0.1 366 533 20 490 0.948187 0.521017
20 0.2 339 372 47 651 0.878238 0.363636
30 0.3 292 247 94 776 0.756477 0.241447
40 0.4 254 175 132 848 0.658031 0.171065
50 0.5 212 103 174 920 0.549223 0.100684
60 0.6 151 53 235 970 0.391192 0.051808
70 0.7 69 13 317 1010 0.178756 0.012708
80 0.8 4 0 382 1023 0.010363 0.000000
90 0.9 0 0 386 1023 0.000000 0.000000
100 1.0 0 0 386 1023 0.000000 0.000000

Plot true positive rates and true negative rates

1
2
3
4
5
6
7
8
plt.plot(df_scores.threshold, df_scores.true_positive_rate, label="TPR")
plt.plot(df_scores.threshold, df_scores.false_positive_rate, label="FPR")

plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")

plt.legend()
plt.show()
output_159_0.png

Random Model

The probability is almost the same as flipping a coin

1
2
3
np.random.seed(1)
y_random = np.random.uniform(0, 1, size=len(y_validation))
y_random
array([4.17022005e-01, 7.20324493e-01, 1.14374817e-04, ...,
       7.73916250e-01, 3.34276405e-01, 8.89982208e-02], shape=(1409,))
((y_random >= 0.5) == y_validation).mean()
np.float64(0.5017743080198722)

df_random = tpr_fpr_dataframe(y_validation, y_random)

plt.plot(df_random.threshold, df_random.true_positive_rate, label="TPR")
plt.plot(df_random.threshold, df_random.false_positive_rate, label="FPR")

plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")

plt.legend()
plt.show()
output_163_0.png

Ideal Model

100% accuracy

1
2
3
4
5
6
7
8
number_of_positives = (
    y_validation == 1
).sum()  # positive means that will churn
number_of_negatives = (
    y_validation == 0
).sum()  # negative means that will not churn

number_of_negatives, number_of_positives
(np.int64(1023), np.int64(386))
y_ideal = np.repeat([0, 1], [number_of_negatives, number_of_positives])
y_ideal
array([0, 0, 0, ..., 1, 1, 1], shape=(1409,))
y_ideal_pred = np.linspace(0, 1, len(y_validation))
y_ideal_pred
array([0.00000000e+00, 7.10227273e-04, 1.42045455e-03, ...,
       9.98579545e-01, 9.99289773e-01, 1.00000000e+00], shape=(1409,))
1 - y_validation.mean()
np.float64(0.7260468417317246)
((y_ideal_pred >= 0.726) == y_ideal).mean()
np.float64(1.0)
1
2
3
df_ideal = tpr_fpr_dataframe(y_ideal, y_ideal_pred)

df_ideal[::10]
threshold true_positive false_positive false_negative true_negative true_positive_rate false_positive_rate
0 0.0 386 1023 0 0 1.000000 1.000000
10 0.1 386 882 0 141 1.000000 0.862170
20 0.2 386 741 0 282 1.000000 0.724340
30 0.3 386 600 0 423 1.000000 0.586510
40 0.4 386 459 0 564 1.000000 0.448680
50 0.5 386 319 0 704 1.000000 0.311828
60 0.6 386 178 0 845 1.000000 0.173998
70 0.7 386 37 0 986 1.000000 0.036168
80 0.8 282 0 104 1023 0.730570 0.000000
90 0.9 141 0 245 1023 0.365285 0.000000
100 1.0 1 0 385 1023 0.002591 0.000000

1
2
3
4
5
6
7
plt.plot(df_ideal.threshold, df_ideal.true_positive_rate, label="TPR")
plt.plot(df_ideal.threshold, df_ideal.false_positive_rate, label="FPR")

plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")
plt.legend()
plt.show()
output_171_0.png

Plotting all models

plt.plot(
    df_scores.threshold, df_scores.true_positive_rate, label="TPR - scores"
)
plt.plot(
    df_scores.threshold, df_scores.false_positive_rate, label="FPR - scores"
)

plt.plot(
    df_ideal.threshold,
    df_ideal.true_positive_rate,
    label="TPR - ideal",
    color="black",
)
plt.plot(
    df_ideal.threshold,
    df_ideal.false_positive_rate,
    label="FPR - ideal",
    color="black",
)

plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")

plt.legend()
plt.show()
output_173_0.png Plotting rates

The model must be as close to the ideal as possible; if it falls below the random baseline, the model's performance is really bad

plt.figure(figsize=(5, 5))

plt.plot(
    df_scores.false_positive_rate, df_scores.true_positive_rate, label="model"
)
plt.plot(
    df_random.false_positive_rate,
    df_random.true_positive_rate,
    label="random",
)
plt.plot(
    df_ideal.false_positive_rate,
    df_ideal.true_positive_rate,
    label="ideal",
)

plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")
plt.legend()
plt.show()
output_175_0.png Using scikit learn

skl_false_positive_rate, skl_true_positive_rate, skl_threshold = roc_curve(
    y_validation, y_validation_pred
)

plt.figure(figsize=(5, 5))


plt.plot(
    skl_false_positive_rate,
    skl_true_positive_rate,
    label="Model",
)
plt.plot([0, 1], [0, 1], label="Random", linestyle="--")

plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")
plt.legend()
plt.show()
output_177_0.png

ROC AUC

Receiver Operating Characteristics - Area Under The Curve

Measuring the area under the curve makes it possible to understand whether the model is close to the ideal or to the random baseline

1
2
3
4
5
skl_false_positive_rate, skl_true_positive_rate, skl_threshold = roc_curve(
    y_validation, y_validation_pred
)

auc(skl_false_positive_rate, skl_true_positive_rate)
0.8463120254863528

ROC AUC for scores DataFrame

auc(df_scores.false_positive_rate, df_scores.true_positive_rate)
0.8459194991870908

ROC AUC for ideal DataFrame

auc(df_ideal.false_positive_rate, df_ideal.true_positive_rate)
0.9999430203759136

Or using scikit-learn method

roc_auc_score(y_validation, y_validation_pred)
0.8463120254863528

Interpretation of ROC AUC

AUC: Probability that a randomly selected positive example has a higher score than a randomly selected negative example; it also expresses how well the model can rank the customers

negative_occurrences = y_validation_pred[y_validation == 0]
positive_occurrences = y_validation_pred[y_validation == 1]

Calculating roc auc score

n = 100000
success = 0

for i in range(n):
    positive_index = random.randint(0, len(positive_occurrences) - 1)
    negative_index = random.randint(0, len(negative_occurrences) - 1)

    if (
        positive_occurrences[positive_index]
        > negative_occurrences[negative_index]
    ):
        success = success + 1

success / n
0.8461

Or using numpy

n = 100000
positive_index = np.random.randint(0, len(positive_occurrences), size=n)
negative_index = np.random.randint(0, len(negative_occurrences), size=n)

(
    (
        positive_occurrences[positive_index]
        > negative_occurrences[negative_index]
    )
).mean()
np.float64(0.8455)

Cross-Validation

  • Evaluating the same model on different subsets of data
  • Get the average prediction and the spread within predictions

K-Fold Cross Validation

Full train dataset Test dataset
Train + Validation Test

Split the full train dataset in multiple parts (folds)

1 2 3
  • Train with 1, 2 and validate with 3 using AUC
  • Train with 1, 3 and validate with 2 using AUC
  • Train with 2, 3 and validate with 1 using AUC

Define a train function

def train(
    df_train: pd.DataFrame, y_train: NDArray[np.float64], C=1.0
) -> Tuple[DictVectorizer, LogisticRegression]:
    """
    Fits a logistic regression churn model on one-hot encoded features.

    The categorical and numerical feature columns are turned into a list of
    record dicts, densely encoded with ``DictVectorizer``, and used to fit an
    L2-regularized logistic regression classifier.

    Args:
        df_train (pd.DataFrame): Training data holding the categorical and
            numerical feature columns.
        y_train (NDArray[np.float64]): Binary churn labels aligned with
            ``df_train``.
        C (float, optional): Inverse of the regularization strength; smaller
            values regularize harder. Defaults to 1.0.

    Returns:
        Tuple[DictVectorizer, LogisticRegression]:
            - DictVectorizer: Fitted vectorizer that maps feature dicts to a
              dense numeric matrix.
            - LogisticRegression: Trained classifier.

    Raises:
        ValueError: If the input data contains incompatible shapes or missing
            required columns.
    """
    feature_records = df_train[categorical_columns + numerical_columns].to_dict(
        orient="records"
    )

    vectorizer = DictVectorizer(sparse=False)
    feature_matrix = vectorizer.fit_transform(feature_records)

    classifier = LogisticRegression(max_iter=99999, C=C)
    classifier.fit(feature_matrix, y_train)

    return vectorizer, classifier

Train the model

dict_vectorizer, model = train(df_train, y_train)

Define a predict function

def predict(
    df: pd.DataFrame,
    dict_vectorizer: DictVectorizer,
    model: LogisticRegression,
):
    """
    Returns the predicted churn probability for every row of ``df``.

    The same categorical and numerical columns used at training time are
    converted to record dicts, encoded with the already-fitted vectorizer,
    and scored with the trained model.
    """
    records = df[categorical_columns + numerical_columns].to_dict(
        orient="records"
    )
    features = dict_vectorizer.transform(records)

    # Column 1 of predict_proba holds P(class == 1), i.e. P(churn).
    return model.predict_proba(features)[:, 1]

Predict execution example

predict(df_validation, dict_vectorizer, model)[:5]
array([0.0066238 , 0.20482253, 0.21781799, 0.56375043, 0.21867476])

Run cross validation

n_splits = 5

for C in tqdm([0.001, 0.01, 0.1, 0.5, 1, 5, 10]):
    scores = []

    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)

    for train_index, validation_index in kfold.split(df_full_train):
        df_train = df_full_train.iloc[train_index]
        df_validation = df_full_train.iloc[validation_index]

        y_train = df_train.churn.values
        y_validation = df_validation.churn.values

        dict_vectorizer, model = train(df_train, y_train, C=C)
        y_pred = predict(df_validation, dict_vectorizer, model)

        auc = roc_auc_score(y_validation, y_pred)
        scores.append(auc)

    print(C, f"{np.mean(scores):.3f}", f"{np.std(scores):.3f}")
 14%|█▍        | 1/7 [00:08<00:48,  8.04s/it]
0.001 0.825 0.009
 29%|██▊       | 2/7 [00:27<01:14, 14.81s/it]
0.01 0.840 0.008
 43%|████▎     | 3/7 [01:01<01:33, 23.43s/it]
0.1 0.842 0.007
 57%|█████▋    | 4/7 [01:35<01:23, 27.88s/it]
0.5 0.842 0.007
 71%|███████▏  | 5/7 [02:02<00:54, 27.43s/it]
1 0.842 0.007
 86%|████████▌ | 6/7 [02:27<00:26, 26.63s/it]
5 0.842 0.007
100%|██████████| 7/7 [02:55<00:00, 25.05s/it]
10 0.842 0.007

Measure ROC AUC for the final model

1
2
3
4
5
6
7
dict_vectorizer, model = train(
    df_full_train, df_full_train.churn.values, C=1.0
)
y_pred = predict(df_test, dict_vectorizer, model)

auc = roc_auc_score(y_test, y_pred)
auc
0.8584032088573997

Save the model

Define output dir

1
2
3
OUTPUT_DIR = pathlib.Path("artifacts/predicting-customer-churn")

os.makedirs(OUTPUT_DIR, exist_ok=True)

Define output file

output_file = OUTPUT_DIR / f"model_C={C}.bin"
output_file
PosixPath('artifacts/predicting-customer-churn/model_C=10.bin')

Save model

with open(output_file, "wb") as f_out:
    pickle.dump((dict_vectorizer, model), f_out)

Load the model

1
2
3
4
with open(output_file, "rb") as f_in:
    loaded_dict_vectorizer, loaded_model = pickle.load(f_in)

loaded_dict_vectorizer, loaded_model
(DictVectorizer(sparse=False), LogisticRegression(max_iter=99999))
customer = {
    "gender": "female",
    "seniorcitizen": 0,
    "partner": "yes",
    "dependents": "no",
    "phoneservice": "no",
    "multiplelines": "no_phone_service",
    "internetservice": "dsl",
    "onlinesecurity": "no",
    "onlinebackup": "yes",
    "deviceprotection": "no",
    "techsupport": "no",
    "streamingtv": "no",
    "streamingmovies": "no",
    "contract": "month-to-month",
    "paperlessbilling": "yes",
    "paymentmethod": "eletronic_check",
    "tenure": 1,
    "monthlycharges": 29.85,
    "totalcharges": 29.85,
}
X = loaded_dict_vectorizer.transform([customer])
loaded_model.predict_proba(X)[0, 1]
np.float64(0.5786968179280463)