stability

Population Stability Index (PSI) and Event Stability Index (ESI) for monitoring feature and target drift over time.

PSI

PSI(
    n_quantile_bins: int = 10, missing_value: float = 0.0001
)

Bases: BaseEstimator

Population Stability Index.

Measures distributional shift between a reference dataset and a monitoring dataset. Fit the estimator on the reference dataset, then call `score` on any subsequent snapshot.

Parameters:

Name	Type	Description	Default
`n_quantile_bins`	`int`	Number of quantile bins for numeric features.	`10`
`missing_value`	`float`	Frequency floor applied to empty bins to avoid log(0).	`0.0001`

Attributes:

Name	Type	Description
`bin_breaks_`		Quantile cut points fitted on the reference (numeric only).
`ref_dist_`		Reference frequency distribution as a DataFrame.

Source code in datasci_toolkit/stability.py

def __init__(self, n_quantile_bins: int = 10, missing_value: float = 0.0001):
    """Store the PSI hyperparameters on the instance.

    No fitting or validation happens here; both arguments are kept
    unchanged under attributes of the same name.

    Args:
        n_quantile_bins: Number of quantile bins for numeric features.
        missing_value: Frequency floor applied to empty bins to avoid
            log(0).
    """
    # NOTE(review): plain same-name assignments, presumably so the
    # BaseEstimator parameter introspection keeps working -- confirm
    # before adding any logic here.
    self.n_quantile_bins = n_quantile_bins
    self.missing_value = missing_value

ESI

Event Stability Index.

Measures rank stability of a model score across time periods. Returns two variants: V1 (rank-correlation based) and V2 (event-rate-ratio based).

StabilityMonitor

StabilityMonitor(
    features: list,
    n_quantile_bins: int = 10,
    missing_value: float = 0.0001,
    col_weight: str | None = None,
)

Bases: BaseEstimator

Monitors PSI for a set of features over time.

Fits one PSI instance per feature on a reference DataFrame and exposes three scoring modes: against a fixed reference, between consecutive period pairs, or between subsets selected by arbitrary boolean masks.

Parameters:

Name	Type	Description	Default
`features`	`list`	Column names to monitor.	required
`n_quantile_bins`	`int`	Quantile bins for numeric features (passed to `PSI`).	`10`
`missing_value`	`float`	Frequency floor for empty bins (passed to `PSI`).	`0.0001`
`col_weight`	`str \| None`	Optional weight column in the input DataFrame.	`None`

Attributes:

Name	Type	Description
`psis_`		Dict mapping feature name to fitted `PSI` instance.

Source code in datasci_toolkit/stability.py

def __init__(self, features: list, n_quantile_bins: int = 10, missing_value: float = 0.0001, col_weight: str | None = None):
    """Store the monitor configuration on the instance.

    No fitting or validation happens here; every argument is kept
    unchanged under an attribute of the same name.

    Args:
        features: Column names to monitor.
        n_quantile_bins: Quantile bins for numeric features (passed to
            ``PSI``).
        missing_value: Frequency floor for empty bins (passed to ``PSI``).
        col_weight: Optional weight column in the input DataFrame.
    """
    # NOTE(review): plain same-name assignments, presumably so the
    # BaseEstimator parameter introspection keeps working -- confirm
    # before adding any logic here.
    self.features = features
    self.n_quantile_bins = n_quantile_bins
    self.missing_value = missing_value
    self.col_weight = col_weight

plot_psi_comparison

plot_psi_comparison(
    months: list,
    psi_values: list,
    labels: list,
    title: str = "PSI",
    size: tuple = (12, 8),
    output_folder: str | None = None,
    show: bool = True,
) -> None

Source code in datasci_toolkit/stability.py

def plot_psi_comparison(months: list, psi_values: list, labels: list, title: str = "PSI", size: tuple = (12, 8), output_folder: str | None = None, show: bool = True) -> None:
    """Draw a grouped bar chart comparing PSI series month by month.

    Every entry of ``psi_values`` becomes one bar series. Series ``i`` is
    shifted left of the month's tick position by ``i`` bar widths, so the
    bars of one month sit side by side. Horizontal reference lines are
    drawn at PSI = 0.1 (black) and PSI = 0.25 (red).

    Args:
        months: Tick labels for the x-axis, one per bar group.
        psi_values: One sequence of PSI values per series. Each sequence is
            plotted against ``len(months)`` x positions, so it is expected
            to hold one value per month.
        labels: Legend label for each entry of ``psi_values``; indexed in
            step with it, so it must be at least as long.
        title: Chart title.
        size: Figure size forwarded to ``plt.figure(figsize=...)``.
        output_folder: When truthy, the chart is also saved as
            ``psi_comparison_chart.png`` inside this folder.
        show: When true, the figure is displayed with ``plt.show()``.

    Returns:
        None. The figure is closed before returning.
    """
    n_models = len(psi_values)
    n_months = len(months)
    # Guard the width so an empty ``psi_values`` still renders an empty chart
    # instead of dividing by zero below.
    bar_width = 1 / n_models if n_models else 1.0
    bar_positions = np.linspace(0, n_months, n_months)
    # The last series is centred (n_models - 1) widths left of the first tick
    # (x = 0) and reaches half a width further. Starting the axis at that edge
    # keeps the first month's bars from being clipped.
    left_edge = min(0.0, -(n_models - 0.5) * bar_width)

    plt.figure(figsize=size)
    try:
        plt.grid(zorder=0)
        for model_index, arr in enumerate(psi_values):
            plt.bar(bar_positions - bar_width * model_index, arr, width=bar_width, label=labels[model_index], zorder=3)
        # axhline spans the whole axis, so the reference lines also cover the
        # region left of x = 0 that the shifted bars occupy.
        plt.axhline(0.1, color="black")
        plt.axhline(0.25, color="r")
        plt.title(title, fontsize=18)
        plt.xticks(bar_positions, months, rotation=45)
        plt.xlim(left_edge, n_months + 0.5)
        # Fixed y-range: both reference lines (0.1 and 0.25) stay in view.
        plt.ylim(0, 0.3)
        plt.xlabel("Months", fontsize=13)
        plt.ylabel("PSI", fontsize=13)
        plt.legend()
        if output_folder:
            plt.savefig(f"{output_folder}/psi_comparison_chart.png", format="png", dpi=72, bbox_inches="tight")
        if show:
            plt.show()
    finally:
        # Release the figure even if saving or showing fails.
        plt.close()

psi_hist

psi_hist(
    data: DataFrame,
    scores: list,
    months: list,
    month_col: str,
    pivot: int = 0,
    score_names: list | None = None,
    title: str = "PSI",
    bins: int = 10,
    output_folder: str | None = None,
    show: bool = True,
) -> None

Source code in datasci_toolkit/stability.py

def psi_hist(data: pl.DataFrame, scores: list, months: list, month_col: str, pivot: int = 0, score_names: list | None = None, title: str = "PSI", bins: int = 10, output_folder: str | None = None, show: bool = True) -> None:
    """Plot per-month PSI of several score columns against a pivot month.

    For each column in ``scores`` a ``PSI`` estimator is fitted on the
    non-null values of the pivot month (``months[pivot]``) and then used to
    score the non-null values of every month in ``months``. The resulting
    series are handed to ``plot_psi_comparison``.

    Args:
        data: Frame holding the score columns and the month column.
        scores: Names of the score columns to evaluate.
        months: Month values to score, in plotting order.
        month_col: Name of the column that identifies the month of a row.
        pivot: Index into ``months`` of the reference month.
        score_names: Legend labels; when falsy, the positional indices of
            ``scores`` (as strings) are used.
        title: Chart title.
        bins: Number of quantile bins given to ``PSI``.
        output_folder: Forwarded to ``plot_psi_comparison``.
        show: Forwarded to ``plot_psi_comparison``.
    """
    estimator = PSI(n_quantile_bins=bins)
    reference_month = months[pivot]
    series_per_score = []

    for column in scores:

        def values_for(month):
            # Non-null values of the current score column for one month.
            keep = (pl.col(month_col) == month) & pl.col(column).is_not_null()
            return data.filter(keep)[column]

        # Refit on the pivot month before scoring this column's months.
        estimator.fit(values_for(reference_month))
        monthly_psi = [estimator.score(values_for(month)) for month in months]
        series_per_score.append(monthly_psi)

    legend_labels = score_names or [str(position) for position in range(len(scores))]
    plot_psi_comparison(
        months,
        series_per_score,
        legend_labels,
        title,
        output_folder=output_folder,
        show=show,
    )