Skip to content

Credit Risk Score

  • Binary classification
\[g(x_{i}) \approx y_{i}\]
\[y_{i} \in \{0, 1\}\]
  • \(1\): Default
  • \(0\): No default

Dataset:

kaggle-credit-scoring or github-credit-scoring

Install packages

!uv pip install -q \
    python-dotenv==1.2.1 \
    pandas==2.3.2 \
    pandas-stubs==2.3.2.250827 \
    numpy==2.3.2 \
    matplotlib==3.10.6 \
    seaborn==0.13.2 \
    scikit-learn==1.7.1 \
    tqdm==4.67.1 \
    xgboost==3.1.2

Append notebooks directory to sys.path

1
2
3
import sys

sys.path.append("../../..")
import os
import pathlib
import random
from IPython.display import display
import matplotlib.pyplot as plt
import pandas as pd
from typing import Tuple, Union
import numpy as np
from numpy.typing import NDArray
from IPython.utils.capture import capture_output
import seaborn as sns
import datetime
import pickle
import xgboost as xgb
from dotenv import load_dotenv
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from tqdm import tqdm
from notebooks.python.utils.data_extraction.data_extraction import (
    KaggleDataExtractor,
    KaggleExtractionConfig,
)

# Show every DataFrame column when displaying (no horizontal truncation).
pd.set_option("display.max_columns", None)

sns.set_style("darkgrid")
# NOTE(review): set_theme(style=...) already applies the style — the
# set_style call above appears redundant; confirm and drop one of them.
sns.set_theme(style="darkgrid")

%matplotlib inline

load_dotenv()  # Root directory .env file
True
True

Utility scripts:

KaggleDataExtractor:

import base64
import io
import logging
import os
import zipfile
from abc import ABC, abstractmethod
from dataclasses import dataclass
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)-8s | %(name)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

logger = logging.getLogger("KaggleExtractor")


class ExtractionConfig(ABC):
    """Marker base class for strategy-specific extraction configurations."""

    pass


class DataExtractionStrategy(ABC):
    """Interface for dataset-download strategies (Strategy pattern)."""

    @abstractmethod
    def download_dataset(self, config: ExtractionConfig) -> None:
        """Download and extract the dataset described by *config*."""
        pass


@dataclass(frozen=True)
class KaggleExtractionConfig(ExtractionConfig):
    """Immutable settings for downloading one file from a Kaggle dataset."""

    dataset_slug: str  # e.g. "zynicide/wine-reviews"
    file_name: str  # file inside the Kaggle zip
    destination_path: str  # folder to extract to
    output_file_name: str | None = None  # optional rename


class KaggleDataExtractor(DataExtractionStrategy):
    """Downloads a Kaggle dataset zip over HTTP and extracts a single file.

    Authenticates against the Kaggle API with a Basic-auth header built
    from the username / API-token pair.
    """

    def __init__(self, username: str, api_token: str) -> None:
        self.username = username
        self.api_token = api_token
        self.auth_header = self._create_auth_header()

    def _create_auth_header(self) -> dict[str, str]:
        """Build the Basic-auth header expected by the Kaggle API."""
        token = f"{self.username}:{self.api_token}"
        base64_token = base64.b64encode(token.encode()).decode()
        return {"Authorization": f"Basic {base64_token}"}

    def download_dataset(self, config: ExtractionConfig) -> None:
        """Download the configured dataset zip and extract one file from it.

        Network/zip failures are logged and swallowed (best-effort download);
        callers should verify the output file exists afterwards.

        Raises:
            TypeError: if *config* is not a KaggleExtractionConfig.
        """
        if not isinstance(config, KaggleExtractionConfig):
            raise TypeError("config must be a KaggleExtractionConfig instance")

        url = f"https://www.kaggle.com/api/v1/datasets/download/{config.dataset_slug}"
        request = Request(url, headers=self.auth_header)

        # Lazy %-style logging args: no string formatting when the level is off.
        logger.info("Starting download from Kaggle: %s", url)

        try:
            with urlopen(request) as response:
                data = response.read()
            logger.info("Download completed. Extracting zip file...")

            os.makedirs(config.destination_path, exist_ok=True)

            # The whole zip is held in memory; acceptable for small datasets.
            with zipfile.ZipFile(io.BytesIO(data)) as z:
                z.extract(config.file_name, path=config.destination_path)

            if config.output_file_name is not None:
                # Optional rename of the freshly extracted file.
                old_path = os.path.join(config.destination_path, config.file_name)
                new_path = os.path.join(
                    config.destination_path, config.output_file_name
                )
                os.rename(old_path, new_path)

            # Single success log covers both the rename and no-rename paths.
            logger.info(
                "Dataset '%s' extracted successfully to: %s",
                config.file_name,
                config.destination_path,
            )

        except HTTPError as e:
            logger.error("HTTP Error %s: %s", e.code, e.reason)
        except URLError as e:
            logger.error("URL Error: %s", e.reason)
        except zipfile.BadZipFile:
            logger.error(
                "Failed to read zip file. Kaggle may have returned HTML instead of a zip."
            )
        except Exception as e:
            logger.exception("Unexpected error occurred: %s", e)

Create data directory

1
2
3
# Local directory that will hold the downloaded credit-scoring dataset.
DATA_DIR = pathlib.Path("data/credit-risk-score")
DATA_DIR.mkdir(parents=True, exist_ok=True)

Download dataset from Kaggle

# Kaggle credentials come from the root .env file loaded earlier.
username = os.getenv("KAGGLE_USERNAME")
api_token = os.getenv("KAGGLE_API_TOKEN")
file_name = "CreditScoring.csv"

kaggle_extractor = KaggleDataExtractor(username=username, api_token=api_token)

extraction_config = KaggleExtractionConfig(
    dataset_slug="nightcrawler101/creditscoring-csv",
    file_name=file_name,
    destination_path=DATA_DIR,
    output_file_name="credit-scoring.csv",
)

# Download at most once: skip when the renamed file is already on disk.
if not (DATA_DIR / "credit-scoring.csv").is_file():
    kaggle_extractor.download_dataset(extraction_config)

Pass notebook variables to shell command

!head $DATA_DIR/credit-scoring.csv
"Status","Seniority","Home","Time","Age","Marital","Records","Job","Expenses","Income","Assets","Debt","Amount","Price"

1,9,1,60,30,2,1,3,73,129,0,0,800,846

1,17,1,60,58,3,1,1,48,131,0,0,1000,1658

2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985

1,0,1,60,24,1,1,1,63,182,2500,0,900,1325

1,0,1,36,26,1,1,1,46,107,0,0,310,910

1,1,2,60,36,2,1,1,75,214,3500,0,650,1645

1,29,2,60,44,2,1,1,75,125,10000,0,1600,1800

1,9,5,12,27,1,1,1,35,80,0,0,200,1093

1,0,2,60,32,2,1,3,90,107,15000,0,1200,1957

Data Preparation

Load dataset

1
2
3
# Load the renamed CSV downloaded from Kaggle into the working DataFrame.
df = pd.read_csv(DATA_DIR / "credit-scoring.csv")

df.head(n=2)
Status Seniority Home Time Age Marital Records Job Expenses Income Assets Debt Amount Price
0 1 9 1 60 30 2 1 3 73 129 0 0 800 846
1 1 17 1 60 58 3 1 1 48 131 0 0 1000 1658

Inspect all columns at once

df.head(3).T
0 1 2
Status 1 1 2
Seniority 9 17 10
Home 1 1 2
Time 60 60 36
Age 30 58 46
Marital 2 3 2
Records 1 1 2
Job 3 1 3
Expenses 73 48 90
Income 129 131 200
Assets 0 0 3000
Debt 0 0 0
Amount 800 1000 2000
Price 846 1658 2985

Data summary

1
2
3
4
5
6
7
8
9
# One row per column: dtype, a few sample values, and distinct-value count.
summary_records = [
    {
        "column": col,
        "dtype": df[col].dtype,
        "sample_unique": df[col].unique()[:6],
        "n_unique": df[col].nunique(),
    }
    for col in df.columns
]
df_summary = pd.DataFrame(summary_records)
df_summary
column dtype sample_unique n_unique
0 Status int64 [1, 2, 0] 3
1 Seniority int64 [9, 17, 10, 0, 1, 29] 47
2 Home int64 [1, 2, 5, 3, 6, 4] 7
3 Time int64 [60, 36, 12, 48, 18, 24] 11
4 Age int64 [30, 58, 46, 24, 26, 36] 50
5 Marital int64 [2, 3, 1, 4, 5, 0] 6
6 Records int64 [1, 2] 2
7 Job int64 [3, 1, 2, 0, 4] 5
8 Expenses int64 [73, 48, 90, 63, 46, 75] 94
9 Income int64 [129, 131, 200, 182, 107, 214] 353
10 Assets int64 [0, 3000, 2500, 3500, 10000, 15000] 160
11 Debt int64 [0, 2500, 260, 2000, 500, 99999999] 183
12 Amount int64 [800, 1000, 2000, 900, 310, 650] 285
13 Price int64 [846, 1658, 2985, 1325, 910, 1645] 1419

Clean column names

1
2
3
# Normalize column names: lowercase, spaces replaced by underscores.
df.columns = df.columns.str.lower().str.replace(" ", "_")

df.head(n=2)
status seniority home time age marital records job expenses income assets debt amount price
0 1 9 1 60 30 2 1 3 73 129 0 0 800 846
1 1 17 1 60 58 3 1 1 48 131 0 0 1000 1658

Decode number variables

# Human-readable labels for the integer-coded categorical columns.
# In every column, code 0 marks an unknown/undecodable value.
categorical_decodings = {
    "status": {1: "ok", 2: "default", 0: "unknown"},
    "home": {
        1: "rent",
        2: "owner",
        3: "private",
        4: "ignore",
        5: "parents",
        6: "other",
        0: "unknown",
    },
    "marital": {
        1: "single",
        2: "married",
        3: "widow",
        4: "separated",
        5: "divorced",
        0: "unknown",
    },
    "records": {1: "no", 2: "yes", 0: "unknown"},
    "job": {
        1: "fixed",
        2: "partime",
        3: "freelance",
        4: "others",
        0: "unknown",
    },
}

# Replace each integer code with its label, column by column.
for column, decoding in categorical_decodings.items():
    df[column] = df[column].map(decoding)

Inspect decoding results

df.head()
status seniority home time age marital records job expenses income assets debt amount price
0 ok 9 rent 60 30 married no freelance 73 129 0 0 800 846
1 ok 17 rent 60 58 widow no fixed 48 131 0 0 1000 1658
2 default 10 owner 36 46 married yes freelance 90 200 3000 0 2000 2985
3 ok 0 rent 60 24 single no fixed 63 182 2500 0 900 1325
4 ok 0 rent 36 26 single no fixed 46 107 0 0 310 910

Inspect values range

df.describe().round()
seniority time age expenses income assets debt amount price
count 4455.0 4455.0 4455.0 4455.0 4455.0 4455.0 4455.0 4455.0 4455.0
mean 8.0 46.0 37.0 56.0 763317.0 1060341.0 404382.0 1039.0 1463.0
std 8.0 15.0 11.0 20.0 8703625.0 10217569.0 6344253.0 475.0 628.0
min 0.0 6.0 18.0 35.0 0.0 0.0 0.0 100.0 105.0
25% 2.0 36.0 28.0 35.0 80.0 0.0 0.0 700.0 1118.0
50% 5.0 48.0 36.0 51.0 120.0 3500.0 0.0 1000.0 1400.0
75% 12.0 60.0 45.0 72.0 166.0 6000.0 0.0 1300.0 1692.0
max 48.0 72.0 68.0 180.0 99999999.0 99999999.0 99999999.0 5000.0 11140.0

Check series for large numbers

1
2
3
4
5
6
# For each suspect column, compare the raw max with the max once the
# 99999999 sentinel is excluded — confirms 99999999 encodes "missing".
for col_name in ["income", "assets", "debt"]:
    print(
        col_name,
        df[col_name].max(),
        df[df[col_name] < 99999999][col_name].max(),
    )
income 99999999 959

assets 99999999 300000

debt 99999999 30000

Replace values

# 99999999 acts as a missing-value sentinel in this dataset; convert to NaN
# (this also promotes the affected columns from int64 to float64).
for col_name in ["income", "assets", "debt"]:
    df[col_name] = df[col_name].replace(to_replace=99999999, value=np.nan)

Check if values were replaced

df.describe().round()
seniority time age expenses income assets debt amount price
count 4455.0 4455.0 4455.0 4455.0 4421.0 4408.0 4437.0 4455.0 4455.0
mean 8.0 46.0 37.0 56.0 131.0 5403.0 343.0 1039.0 1463.0
std 8.0 15.0 11.0 20.0 86.0 11573.0 1246.0 475.0 628.0
min 0.0 6.0 18.0 35.0 0.0 0.0 0.0 100.0 105.0
25% 2.0 36.0 28.0 35.0 80.0 0.0 0.0 700.0 1118.0
50% 5.0 48.0 36.0 51.0 120.0 3000.0 0.0 1000.0 1400.0
75% 12.0 60.0 45.0 72.0 165.0 6000.0 0.0 1300.0 1692.0
max 48.0 72.0 68.0 180.0 959.0 300000.0 30000.0 5000.0 11140.0

Check for status values

df.status.value_counts()
status
ok         3200
default    1254
unknown       1
Name: count, dtype: int64

Remove unlabeled data

# Drop the single row whose target could not be decoded (status == "unknown").
df = df[df.status != "unknown"].reset_index(drop=True)

Verify removal

df.status.value_counts()
status
ok         3200
default    1254
Name: count, dtype: int64

Split datasets

  • 60% train
  • 20% validation
  • 20% test
# Two-step split: 80/20 into full-train/test, then 75/25 of full-train into
# train/validation — yielding 60/20/20 overall.
# NOTE(review): the splits are not stratified by status — confirm the class
# balance is acceptable in each partition.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=11,
)
df_train, df_validation = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=11,
)

Drop indexes

1
2
3
# Re-number rows after the shuffled split so each frame has a clean 0..n index.
for frame in (df_train, df_validation, df_test):
    frame.reset_index(drop=True, inplace=True)

Encode labels to integer

  • Default: 1
  • Ok: 0
1
2
3
# Binary target arrays: 1 = default, 0 = ok.
y_train = (df_train.status == "default").astype(int).values
y_validation = (df_validation.status == "default").astype(int).values
y_test = (df_test.status == "default").astype(int).values

Drop target column status

1
2
3
# The target must not remain among the features; errors="ignore" makes the
# cell safe to re-run after the column is already gone.
for frame in (df_train, df_validation, df_test):
    frame.drop(columns="status", inplace=True, errors="ignore")

Decision Tree

Example of a simple decision tree

flowchart TD
    A[Start] --> B{Debt > Income?}

    B -- Yes --> C[Default]
    B -- No --> D{Income < 100?}

    D -- Yes --> C
    D -- No --> E[OK]

One Hot Encoding

1
2
3
4
# One-hot encode categorical features (numeric ones pass through) via
# DictVectorizer; NaNs are filled with 0 before converting rows to dicts.
train_dicts = df_train.fillna(0).to_dict(orient="records")
dict_vectorizer = DictVectorizer(sparse=False)

X_train = dict_vectorizer.fit_transform(train_dicts)

Train Decision Tree

1
2
3
# Unconstrained tree (no max_depth): overfits — train AUC reaches 1.0 below
# while validation AUC stays near 0.65.
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
print(decision_tree)
DecisionTreeClassifier()

Validation

validation_dicts = df_validation.fillna(0).to_dict(orient="records")
X_validation = dict_vectorizer.transform(validation_dicts)

Prediction on validation dataset

y_validation_pred = decision_tree.predict_proba(X_validation)[:, 1]
roc_auc_score(y_validation, y_validation_pred)
0.6464392966990385

Prediction on train dataset

y_train_pred = decision_tree.predict_proba(X_train)[:, 1]
roc_auc_score(y_train, y_train_pred)
1.0

Overfitting

Model memorizes the training data, but it fails to generalize. When new data comes, it does not know how to handle it

This can happen when our tree is too deep, so the model can learn almost any possible combination of feature values

Train the tree with hyperparameter of max_depth

1
2
3
decision_tree = DecisionTreeClassifier(max_depth=3)
decision_tree.fit(X_train, y_train)
print(decision_tree)
DecisionTreeClassifier(max_depth=3)

New prediction on validation dataset

y_validation_pred = decision_tree.predict_proba(X_validation)[:, 1]
roc_auc_score(y_validation, y_validation_pred)
0.7389079944782155

New prediction on train dataset

y_train_pred = decision_tree.predict_proba(X_train)[:, 1]
roc_auc_score(y_train, y_train_pred)
0.7761016984958594
1
2
3
4
5
print(
    export_text(
        decision_tree, feature_names=dict_vectorizer.get_feature_names_out()
    )
)
|--- records=no <= 0.50

|   |--- seniority <= 6.50

|   |   |--- amount <= 862.50

|   |   |   |--- class: 0

|   |   |--- amount >  862.50

|   |   |   |--- class: 1

|   |--- seniority >  6.50

|   |   |--- income <= 103.50

|   |   |   |--- class: 1

|   |   |--- income >  103.50

|   |   |   |--- class: 0

|--- records=no >  0.50

|   |--- job=partime <= 0.50

|   |   |--- income <= 74.50

|   |   |   |--- class: 0

|   |   |--- income >  74.50

|   |   |   |--- class: 0

|   |--- job=partime >  0.50

|   |   |--- assets <= 8750.00

|   |   |   |--- class: 1

|   |   |--- assets >  8750.00

|   |   |   |--- class: 0

Decision Trees parameter tuning

  • max_depth
  • min_samples_leaf
1
2
3
4
5
6
7
8
# Sweep max_depth (None = unbounded) and print the validation ROC AUC
# obtained at each setting.
for depth in (1, 2, 3, 4, 5, 6, 10, 15, 20, None):
    decision_tree = DecisionTreeClassifier(max_depth=depth).fit(X_train, y_train)

    y_validation_pred = decision_tree.predict_proba(X_validation)[:, 1]
    auc = roc_auc_score(y_validation, y_validation_pred)

    print("%-4s -> %.3f" % (depth, auc))
1    -> 0.606

2    -> 0.669

3    -> 0.739

4    -> 0.761

5    -> 0.766

6    -> 0.750

10   -> 0.680

15   -> 0.673

20   -> 0.651

None -> 0.647

Both max_depth and min_samples_leaf

# Grid search over max_depth x min_samples_leaf, scored by validation AUC.
scores = []

for depth in [4, 5, 6]:
    for samples in [1, 2, 5, 10, 15, 20, 100, 200, 500]:
        decision_tree = DecisionTreeClassifier(
            max_depth=depth, min_samples_leaf=samples
        )
        decision_tree.fit(X_train, y_train)

        y_validation_pred = decision_tree.predict_proba(X_validation)[:, 1]
        auc = roc_auc_score(y_validation, y_validation_pred)

        scores.append((depth, samples, auc))

# Collect (depth, samples, auc) triples into a tidy frame for pivoting.
df_scores = pd.DataFrame(
    scores,
    columns=[
        "max_depth",
        "min_samples_leaf",
        "roc_auc_score",
    ],
)

df_scores.head()
max_depth min_samples_leaf roc_auc_score
0 4 1 0.761283
1 4 2 0.761283
2 4 5 0.761283
3 4 10 0.761283
4 4 15 0.763726

Improve visualization

1
2
3
4
5
6
7
df_scores_pivot = df_scores.pivot(
    index="min_samples_leaf",
    columns=["max_depth"],
    values="roc_auc_score",
).round(3)

df_scores_pivot
max_depth 4 5 6
min_samples_leaf
1 0.761 0.766 0.751
2 0.761 0.767 0.765
5 0.761 0.768 0.762
10 0.761 0.762 0.778
15 0.764 0.772 0.785
20 0.761 0.774 0.774
100 0.756 0.763 0.776
200 0.747 0.759 0.768
500 0.680 0.680 0.680

Visualize in a heatmap

sns.heatmap(df_scores_pivot, annot=True, fmt=".3f")
plt.show()
output_75_0.png Train with newer parameters

1
2
3
decision_tree = DecisionTreeClassifier(max_depth=6, min_samples_leaf=15)
decision_tree.fit(X_train, y_train)
print(decision_tree)
DecisionTreeClassifier(max_depth=6, min_samples_leaf=15)

Ensembles

Combining multiple models together

Random Forest

1
2
3
4
5
6
7
random_forest = RandomForestClassifier(
    n_estimators=10,
    random_state=1,
    n_jobs=-1,
)
random_forest.fit(X_train, y_train)
print(random_forest)
RandomForestClassifier(n_estimators=10, n_jobs=-1, random_state=1)

Prediction with Random Forest

y_validation_pred = random_forest.predict_proba(X_validation)[:, 1]
roc_auc_score(y_validation, y_validation_pred)
0.7744726453706618

Get scores for Random Forest

# Sweep the number of trees (10..200 step 10) and record validation AUC.
scores = []

for number in range(10, 201, 10):
    random_forest = RandomForestClassifier(
        n_estimators=number,
        random_state=1,
        n_jobs=-1,
    )
    random_forest.fit(X_train, y_train)

    y_validation_pred = random_forest.predict_proba(X_validation)[:, 1]
    auc = roc_auc_score(y_validation, y_validation_pred)

    scores.append((number, auc))

df_rf_scores = pd.DataFrame(
    scores,
    columns=["n_estimators", "roc_auc_score"],
)
df_rf_scores.head()
n_estimators roc_auc_score
0 10 0.774473
1 20 0.803532
2 30 0.815075
3 40 0.815686
4 50 0.817082

Plot parameters

1
2
3
4
plt.plot(df_rf_scores.n_estimators, df_rf_scores.roc_auc_score)
plt.xlabel("n_estimators")
plt.ylabel("roc_auc_score")
plt.show()
output_85_0.png Add depth parameter

scores = []

for depth in [5, 10, 15]:
    for number in range(10, 201, 10):
        random_forest = RandomForestClassifier(
            n_estimators=number,
            max_depth=depth,
            random_state=1,
            n_jobs=-1,
        )
        random_forest.fit(X_train, y_train)

        y_validation_pred = random_forest.predict_proba(X_validation)[:, 1]
        auc = roc_auc_score(y_validation, y_validation_pred)

        scores.append((depth, number, auc))

df_rf_scores = pd.DataFrame(
    scores,
    columns=["max_depth", "n_estimators", "roc_auc_score"],
)
df_rf_scores.head()
max_depth n_estimators roc_auc_score
0 5 10 0.787699
1 5 20 0.797731
2 5 30 0.800305
3 5 40 0.799708
4 5 50 0.799878

Plot parameters

for depth in [5, 10, 15]:
    df_subset = df_rf_scores[df_rf_scores.max_depth == depth]
    plt.plot(
        df_subset.n_estimators,
        df_subset.roc_auc_score,
        label=f"max_depth={depth}",
    )

plt.xlabel("n_estimators")
plt.ylabel("roc_auc_score")
plt.legend()
plt.show()
output_89_0.png Choose better parameter

max_depth = 10

Verifying best value for min_samples_leaf

# With max_depth fixed (chosen above), sweep min_samples_leaf x n_estimators
# and record validation AUC for each combination.
scores = []

for samples in [1, 3, 5, 10, 50]:
    for number in range(10, 201, 10):
        random_forest = RandomForestClassifier(
            n_estimators=number,
            max_depth=max_depth,
            min_samples_leaf=samples,
            random_state=1,
            n_jobs=-1,
        )
        random_forest.fit(X_train, y_train)

        y_validation_pred = random_forest.predict_proba(X_validation)[:, 1]
        auc = roc_auc_score(y_validation, y_validation_pred)

        scores.append((samples, number, auc))

df_rf_scores = pd.DataFrame(
    scores,
    columns=[
        "min_samples_leaf",
        "n_estimators",
        "roc_auc_score",
    ],
)
df_rf_scores.head()
min_samples_leaf n_estimators roc_auc_score
0 1 10 0.791365
1 1 20 0.808496
2 1 30 0.811584
3 1 40 0.817839
4 1 50 0.817058

Plotting min_samples_leaf

colors = ["black", "blue", "orange", "red", "grey"]
min_samples_leaf_values = [1, 3, 5, 10, 50]


for samples, color in zip(min_samples_leaf_values, colors):
    df_subset = df_rf_scores[df_rf_scores.min_samples_leaf == samples]
    plt.plot(
        df_subset.n_estimators,
        df_subset.roc_auc_score,
        color=color,
        label=f"min_samples_leaf={samples}",
    )

plt.xlabel("n_estimators")
plt.ylabel("roc_auc_score")
plt.legend()
plt.show()
output_95_0.png Defining best value for min_samples_leaf

min_samples_leaf = 3

Training the model with new hyperparameter

1
2
3
4
5
6
7
8
9
# Random forest with the tuned max_depth / min_samples_leaf.
# NOTE(review): n_estimators=10 here, although the sweeps above improve with
# more trees and the final comparison model uses 200 — confirm intent.
random_forest = RandomForestClassifier(
    n_estimators=10,
    random_state=1,
    max_depth=max_depth,
    min_samples_leaf=min_samples_leaf,
    n_jobs=-1,
)
random_forest.fit(X_train, y_train)
print(random_forest)
RandomForestClassifier(max_depth=10, min_samples_leaf=3, n_estimators=10,

                       n_jobs=-1, random_state=1)
y_validation_pred = random_forest.predict_proba(X_validation)[:, 1]
roc_auc_score(y_validation, y_validation_pred)
0.8107577922549707

Boosting

Training models sequentially, where each model corrects the errors of the previous one

# Wrap train/validation matrices in XGBoost's DMatrix, keeping the
# DictVectorizer feature names so dumped trees stay readable.
feature_names = list(dict_vectorizer.get_feature_names_out())
d_train = xgb.DMatrix(
    X_train,
    label=y_train,
    feature_names=feature_names,
)
d_validation = xgb.DMatrix(
    X_validation,
    label=y_validation,
    feature_names=feature_names,
)

Train model

# First XGBoost model with default-ish hyperparameters.
xgb_params = {
    "eta": 0.3,  # learning rate
    "max_depth": 6,
    "min_child_weight": 1,
    "objective": "binary:logistic",  # outputs default probabilities
    "nthread": 8,
    "seed": 1,  # reproducibility
    "verbosity": 1,
}

model = xgb.train(xgb_params, d_train, num_boost_round=10)
model
<xgboost.core.Booster at 0x7531eacaa750>

Check ROC AUC

y_xgb_pred = model.predict(d_validation)
roc_auc_score(y_validation, y_xgb_pred)
0.8118506454190986

Watchlist for evaluation

watchlist = [(d_train, "train"), (d_validation, "validation")]

Get evaluation outputs from training

%%capture output

xgb_params = {
    "eta": 0.3, # Learning rate
    "max_depth": 6,
    "min_child_weight": 1,
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "nthread": 8,
    "seed": 1,
    "verbosity": 1,
}

model = xgb.train(
    xgb_params, d_train, evals=watchlist, num_boost_round=200, verbose_eval=10
)
model

Parse xgb training metrics output

def parse_xgb_output(output):
    """Parse captured ``xgb.train`` watchlist logs into a DataFrame.

    Each stdout line looks like ``[10]\ttrain-auc:0.95\tvalidation-auc:0.81``;
    the result has columns num_iter, train_auc and val_auc.
    """
    records = []

    for line in output.stdout.strip().split("\n"):
        iteration_part, train_part, validation_part = line.split("\t")
        records.append(
            (
                int(iteration_part.strip("[]")),
                float(train_part.split(":")[1]),
                float(validation_part.split(":")[1]),
            )
        )

    return pd.DataFrame(records, columns=["num_iter", "train_auc", "val_auc"])

Parse metrics

df_xgb_score = parse_xgb_output(output)
df_xgb_score.head()
num_iter train_auc val_auc
0 0 0.86653 0.77999
1 10 0.95512 0.81115
2 20 0.97648 0.81877
3 30 0.98844 0.81613
4 40 0.99393 0.81407

Plot metrics

1
2
3
4
plt.plot(df_xgb_score.num_iter, df_xgb_score.train_auc, label="train")
plt.plot(df_xgb_score.num_iter, df_xgb_score.val_auc, label="validation")
plt.legend()
plt.show()
output_116_0.png Plot only validation

1
2
3
plt.plot(df_xgb_score.num_iter, df_xgb_score.val_auc, label="validation")
plt.legend()
plt.show()
output_118_0.png

Parameter tuning

Tuning Learning Rate (ETA)

# Sweep the learning rate; capture_output grabs the watchlist logs so they
# can be parsed into AUC curves per eta.
etas = [0.01, 0.05, 0.1, 0.3, 1.0]
scores = {}

for eta in etas:
    with capture_output() as output:
        xgb_params = {
            "eta": eta,
            "max_depth": 6,
            "min_child_weight": 1,
            "objective": "binary:logistic",
            "eval_metric": "auc",
            "nthread": 8,
            "seed": 1,
            "verbosity": 1,
        }

        model = xgb.train(
            xgb_params,
            d_train,
            evals=watchlist,
            num_boost_round=200,
            verbose_eval=10,  # log every 10th round
        )

    key = f"eta={eta}"
    scores[key] = parse_xgb_output(output)

Plotting results

for key, df_xgb_score in scores.items():
    plt.plot(
        df_xgb_score.num_iter,
        df_xgb_score.val_auc,
        label=key,
    )

plt.legend()
plt.xlabel("num_iter")
plt.ylabel("roc_auc_score")
plt.show()
output_123_0.png Tuning Max Depth

# With eta fixed at 0.1 (chosen above), sweep tree depth the same way.
max_depths = [3, 4, 6, 10]
scores = {}

for max_depth in max_depths:
    with capture_output() as output:
        xgb_params = {
            "eta": 0.1,
            "max_depth": max_depth,
            "min_child_weight": 1,
            "objective": "binary:logistic",
            "eval_metric": "auc",
            "nthread": 8,
            "seed": 1,
            "verbosity": 1,
        }

        model = xgb.train(
            xgb_params,
            d_train,
            evals=watchlist,
            num_boost_round=200,
            verbose_eval=10,
        )

    key = f"max_depth={max_depth}"
    scores[key] = parse_xgb_output(output)

Plot scores

for key, df_xgb_score in scores.items():
    plt.plot(
        df_xgb_score.num_iter,
        df_xgb_score.val_auc,
        label=key,
    )

plt.legend()
plt.xlabel("num_iter")
plt.ylabel("roc_auc_score")
plt.show()
output_127_0.png Plotting filtered scores

for key, df_xgb_score in scores.items():
    if key == "max_depth=10":
        continue
    plt.plot(
        df_xgb_score.num_iter,
        df_xgb_score.val_auc,
        label=key,
    )

plt.ylim(0.8, 0.84)

plt.legend()
plt.xlabel("num_iter")
plt.ylabel("roc_auc_score")
plt.show()
output_129_0.png Fixing best max_depth

max_depth = 3

Tuning Min child weight

# With eta=0.1 and max_depth=3 fixed, sweep min_child_weight.
min_child_weights = [1, 10, 30]
scores = {}

for min_child_weight in min_child_weights:
    with capture_output() as output:
        xgb_params = {
            "eta": 0.1,
            "max_depth": 3,
            "min_child_weight": min_child_weight,
            "objective": "binary:logistic",
            "eval_metric": "auc",
            "nthread": 8,
            "seed": 1,
            "verbosity": 1,
        }

        model = xgb.train(
            xgb_params,
            d_train,
            evals=watchlist,
            num_boost_round=200,
            verbose_eval=10,
        )

    key = f"min_child_weight={min_child_weight}"
    scores[key] = parse_xgb_output(output)

Plot scores

for key, df_xgb_score in scores.items():
    plt.plot(
        df_xgb_score.num_iter,
        df_xgb_score.val_auc,
        label=key,
    )

plt.ylim(0.8, 0.84)

plt.legend()
plt.xlabel("num_iter")
plt.ylabel("roc_auc_score")
plt.show()
output_135_0.png Setting best min_child_weight

min_child_weight = 30
# NOTE(review): max_depth is 30 here, but the depth sweep above settled on
# max_depth = 3 — confirm whether 30 is intentional or a typo for 3.
xgb_params = {
    "eta": 0.1,
    "max_depth": 30,
    "min_child_weight": 30,
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "nthread": 8,
    "seed": 1,
    "verbosity": 1,
}

model = xgb.train(
    xgb_params,
    d_train,
    num_boost_round=175,
)

Selecting the final model

Decision Tree

1
2
3
decision_tree = DecisionTreeClassifier(max_depth=6, min_samples_leaf=15)
decision_tree.fit(X_train, y_train)
print(decision_tree)
DecisionTreeClassifier(max_depth=6, min_samples_leaf=15)

Evaluate Decision Tree

y_dt_pred = decision_tree.predict_proba(X_validation)[:, 1]
roc_auc_score(y_validation, y_dt_pred)
0.7853194400716863

Random forest

1
2
3
4
5
6
7
8
9
random_forest = RandomForestClassifier(
    n_estimators=200,
    random_state=1,
    max_depth=10,
    min_samples_leaf=3,
    n_jobs=-1,
)
random_forest.fit(X_train, y_train)
print(random_forest)
RandomForestClassifier(max_depth=10, min_samples_leaf=3, n_estimators=200,

                       n_jobs=-1, random_state=1)

Evaluate Random Forest

y_rf_pred = random_forest.predict_proba(X_validation)[:, 1]
roc_auc_score(y_validation, y_rf_pred)
0.8246258264512848

XGBoost

# XGBoost candidate for the final model comparison.
# NOTE(review): same max_depth=30 vs tuned max_depth=3 discrepancy as in the
# previous parameter cell — confirm which value is intended.
xgb_params = {
    "eta": 0.1,
    "max_depth": 30,
    "min_child_weight": 30,
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "nthread": 8,
    "seed": 1,
    "verbosity": 1,
}

model = xgb.train(
    xgb_params,
    d_train,
    num_boost_round=175,
)

Evaluate XGBoost

y_xgb_pred = model.predict(d_validation)
roc_auc_score(y_validation, y_xgb_pred)
0.831425128962728

Train best model with all training data

1
2
3
# Prepare the combined train+validation frame the same way as the earlier
# splits: clean index, binary target, label column removed from features.
df_full_train = df_full_train.reset_index(drop=True)
y_full_train = (df_full_train.status == "default").astype(int).values
df_full_train.drop(columns="status", inplace=True, errors="ignore")

Transform full training and test datasets

1
2
3
4
5
6
7
# Fit a fresh DictVectorizer on the full training data and reuse it to
# transform the test set, so both share one feature space.
dicts_full_train = df_full_train.fillna(0).to_dict(orient="records")

dict_vectorizer_full = DictVectorizer(sparse=False)
X_full_train = dict_vectorizer_full.fit_transform(dicts_full_train)

dicts_test = df_test.fillna(0).to_dict(orient="records")
X_test = dict_vectorizer_full.transform(dicts_test)

Create DMatrix for XGBoost

feature_names = list(dict_vectorizer_full.get_feature_names_out())

d_full_train = xgb.DMatrix(
    X_full_train,
    label=y_full_train,
    feature_names=feature_names,
)

d_test = xgb.DMatrix(
    X_test,
    feature_names=feature_names,
)

Train model

# Train the chosen XGBoost model on the full training data.
# NOTE(review): min_child_weight=1 here although the sweep selected 30, and
# max_depth=3 although the compared model used 30 — confirm which
# hyperparameters the final model should use.
xgb_params = {
    "eta": 0.1,
    "max_depth": 3,
    "min_child_weight": 1,
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "nthread": 8,
    "seed": 1,
    "verbosity": 1,
}

model = xgb.train(xgb_params, d_full_train, num_boost_round=175)

Evaluate chosen model

y_pred = model.predict(d_test)
roc_auc_score(y_test, y_pred)
0.8310871902644055