Metrics

Binary classification metrics with Polars-native inputs and period-level breakdowns.

Point-in-time metrics

import numpy as np
import polars as pl
from datasci_toolkit.metrics import gini, ks, lift, iv, feature_power

# Synthetic, reproducible data: a uniform score plus a binary target that is 1
# when the score, perturbed by Gaussian noise (sd 0.25), exceeds 0.5.
rng = np.random.default_rng(0)
N = 1000

score = rng.uniform(0, 1, N)
noisy_score = score + rng.normal(0, 0.25, N)
y = (noisy_score > 0.5).astype(float)

# Wrap the NumPy arrays as Polars Series before handing them to the metrics.
y_s = pl.Series(y.tolist())
sc_s = pl.Series(score.tolist())

gini_value = gini(y_s, sc_s)
print(f"Gini  = {gini_value:.4f}")

ks_value = ks(y_s, sc_s)
print(f"KS    = {ks_value:.4f}")

# NOTE(review): lift() is given the negated score -- presumably it expects
# lower values to indicate the target class; confirm against lift()'s docs.
lift_at_10 = lift(y_s, -sc_s, perc=10.0)
print(f"Lift@10% = {lift_at_10:.4f}")

Information Value

# Two-bin feature: 1 where the score exceeds 0.5, otherwise 0.
above_half = sc_s > 0.5
binned = above_half.cast(pl.Int32)
iv_value = iv(y_s, binned)
print(f"IV = {iv_value:.4f}")

Feature power — scan a whole DataFrame

Returns Gini and IV for every column, sorted descending.

# Three candidate features with deliberately different predictive strength.
X = pl.DataFrame({
    # The negated model score itself: the strongest signal in this example.
    "strong": (-sc_s).to_list(),
    # Negated score blurred with Gaussian noise (sd 0.5): still related to the
    # target, but weaker than "strong". Scaled noise alone would carry no
    # signal and would be indistinguishable from the "noise" column.
    "medium": (-score + rng.normal(0, 1, N) * 0.5).tolist(),
    # Independent Gaussian noise: unrelated to the target.
    "noise":  rng.normal(0, 1, N).tolist(),
})
print(feature_power(X, y_s))

Bootstrap confidence interval

Estimates the sampling variability of Gini via bootstrap resampling.

from datasci_toolkit.metrics import BootstrapGini

# Configure the bootstrap (500 resamples, 95% interval, fixed seed), then fit.
bootstrap = BootstrapGini(n_iter=500, ci_level=95.0, seed=42)
bg = bootstrap.fit(y_s, sc_s)

# ci_ is indexed as (lower, upper) in the report below.
ci_low = bg.ci_[0]
ci_high = bg.ci_[1]

print(f"Mean Gini = {bg.mean_:.4f}")
print(f"Std       = {bg.std_:.4f}")
print(f"95% CI    = [{ci_low:.4f}, {ci_high:.4f}]")

Period metrics

Evaluate performance sliced by a time or cohort column.

from datasci_toolkit.metrics import gini_by_period, lift_by_period

# Assign the N records to five equally sized consecutive periods (0..4).
period_ids = np.repeat(np.arange(5), N // 5)
periods = pl.Series(period_ids.tolist())

gini_df = gini_by_period(y_s, sc_s, periods)
lift_df = lift_by_period(y_s, -sc_s, periods, perc=10.0)

# Show both metrics side by side, matched on the period column.
lift_columns = lift_df.select(["period", "lift"])
combined = gini_df.join(lift_columns, on="period")
print(combined)

Periods where all records belong to a single class are skipped automatically (they can't produce a meaningful Gini).

With a population mask

Restrict scoring to a sub-population — e.g., only approved applications.

# Random boolean mask that is True for roughly 70% of the records.
in_population = rng.uniform(size=N) > 0.3
mask = pl.Series(in_population.tolist())
gini_approved = gini_by_period(y_s, sc_s, periods, mask=mask)

Weighted metrics

# Per-record weights drawn uniformly from [0.5, 1.5).
raw_weights = rng.uniform(0.5, 1.5, N)
weights = pl.Series(raw_weights.tolist())
gini_weighted = gini_by_period(y_s, sc_s, periods, sample_weight=weights)

Plotting period metrics

from datasci_toolkit.metrics import plot_metric_by_period

# Pull the plot inputs out of the per-period frames as plain lists.
period_labels = gini_df["period"].to_list()
metric_series = [gini_df["gini"].to_list(), lift_df["lift"].to_list()]
record_counts = gini_df["count"].to_list()

# One label per entry in metric_series, in the same order.
plot_metric_by_period(
    period_labels,
    metric_series,
    record_counts,
    labels=["Gini", "Lift@10%"],
    ylabel="Score",
    title="Performance over time",
)

Multi-series overlay: the primary y-axis shows a bar chart of record counts, and the secondary y-axis shows one line per metric series.

AUC Stability

Measures how stable a model's AUC is across time periods. Useful for production model monitoring, A/B testing model versions, or detecting performance drift.

When to use

Model monitoring: score a model monthly and track whether AUC is declining, stable, or volatile -- a single scalar summarizes the trend
Model comparison: two models with similar average AUC but different stability profiles -- the stability score picks the one that won't degrade in production
Regulatory reporting: credit risk models require evidence of ongoing performance -- AUCStability quantifies "the model is still working" in one number

Basic usage

import numpy as np
from datasci_toolkit import AUCStability

rng = np.random.default_rng(0)
n_per_month = 500
y_true_all, y_pred_all, period_all = [], [], []

# Six monthly batches, each generated the same way.
# NOTE: class 0 scores fall in [0, 0.3) and class 1 scores in [0.7, 1.0), so
# the two classes never overlap within a month.
for month in range(1, 7):
    labels = rng.integers(0, 2, n_per_month).astype(float)
    jitter = rng.uniform(0, 0.3, n_per_month)
    scores = labels * 0.7 + jitter
    y_true_all += labels.tolist()
    y_pred_all += scores.tolist()
    period_all += [month] * n_per_month

stability = AUCStability()
m = stability.fit(
    np.array(y_true_all),
    np.array(y_pred_all),
    np.array(period_all),
)

# Headline numbers first, then the per-period AUC table.
print(f"Stability score: {m.stability_score_:.4f}")
print(f"Mean AUC:        {m.mean_auc_:.4f}")
print(f"Slope:           {m.slope_:.6f}")
print(m.period_aucs_)

Visualizing AUC stability

Plot the per-period AUCs with the fitted trend line to see how stable the model is.

import matplotlib.pyplot as plt

aucs = m.period_aucs_
periods = aucs["period"].to_list()
auc_vals = aucs["auc"].to_list()

fig, ax = plt.subplots(figsize=(8, 4))

# Bars: one AUC per period. Dashed line: the mean AUC across periods.
ax.bar(periods, auc_vals, color="steelblue", alpha=0.7, label="Period AUC")
mean_label = f"Mean AUC = {m.mean_auc_:.4f}"
ax.axhline(m.mean_auc_, color="black", linestyle="--", linewidth=1, label=mean_label)

# Trend line: anchored at the mean AUC over the mean period, with the fitted slope.
x_arr = np.array(periods, dtype=float)
centered = x_arr - x_arr.mean()
trend = m.mean_auc_ + m.slope_ * centered
trend_label = f"Trend (slope={m.slope_:.4f})"
ax.plot(periods, trend, color="red", linewidth=2, label=trend_label)

ax.set(
    xlabel="Period",
    ylabel="AUC",
    title=f"AUC Stability (score={m.stability_score_:.4f})",
)
ax.legend()
ax.set_ylim(0.5, 1.0)
plt.tight_layout()
plt.show()

Comparing stable vs declining models

# Stable model: reuse the metric fitted in the basic-usage example.
m_stable = m

# Declining model: each month the label's contribution to the prediction
# shrinks (floored at 0.2) while the range of the added noise widens.
y_true_dec, y_pred_dec, period_dec = [], [], []
for month in range(1, 7):
    labels = rng.integers(0, 2, n_per_month).astype(float)
    jitter = rng.uniform(0, 0.3 + month * 0.1, n_per_month)
    signal_strength = max(0.8 - month * 0.08, 0.2)
    scores = labels * signal_strength + jitter
    y_true_dec += labels.tolist()
    y_pred_dec += scores.tolist()
    period_dec += [month] * n_per_month

declining = AUCStability()
m_declining = declining.fit(
    np.array(y_true_dec),
    np.array(y_pred_dec),
    np.array(period_dec),
)

# One panel per model on a shared y-axis so the two are directly comparable.
fig, axes = plt.subplots(1, 2, figsize=(14, 5), sharey=True)
panels = zip(axes, (m_stable, m_declining), ("Stable model", "Declining model"))
for ax, model, title in panels:
    aucs = model.period_aucs_
    period_labels = aucs["period"].to_list()
    auc_values = aucs["auc"].to_list()

    ax.bar(period_labels, auc_values, color="steelblue", alpha=0.7)
    ax.axhline(model.mean_auc_, color="black", linestyle="--", linewidth=1)

    # Trend line anchored at the mean AUC over the mean period.
    x_arr = np.array(period_labels, dtype=float)
    centered = x_arr - x_arr.mean()
    trend = model.mean_auc_ + model.slope_ * centered
    ax.plot(period_labels, trend, color="red", linewidth=2)

    heading = f"{title}\nscore={model.stability_score_:.4f}, slope={model.slope_:.4f}"
    ax.set_title(heading)
    ax.set_xlabel("Period")
    ax.set_ylim(0.4, 1.0)

# Only the left panel carries the y-label because the axis is shared.
axes[0].set_ylabel("AUC")
plt.tight_layout()
plt.show()

Plain mean AUC (no penalties)

# Both penalty weights are set to zero for this fit.
plain_mean_metric = AUCStability(slope_weight=0.0, std_weight=0.0)
m = plain_mean_metric.fit(
    np.array(y_true_all),
    np.array(y_pred_all),
    np.array(period_all),
)

Worst-N mode

# Fit with worst_n=3 on the same six months of data.
worst_three_metric = AUCStability(worst_n=3)
m = worst_three_metric.fit(
    np.array(y_true_all),
    np.array(y_pred_all),
    np.array(period_all),
)

Parameters

Parameter	Default	Description
`slope_weight`	`88.0`	Multiplier for `min(0, slope)` penalty. Higher = more penalty for declining AUC.
`std_weight`	`0.5`	Multiplier for residual std penalty. Higher = more penalty for volatile AUC.
`worst_n`	`None`	If set, the linear formula is ignored and the score is the mean of the N lowest period AUCs.

How it works

Computes AUC per unique period value (skips periods with <2 samples or single class)
Fits a linear trend through the period AUCs
stability_score = mean_auc + slope_weight * min(0, slope) - std_weight * std(residuals)
With worst_n: returns mean of the N lowest period AUCs instead