SoccerPredictAI
  • Home
  • Reports
    • 01 · EDA & Preprocessing
    • 02 · Feature Engineering
    • 03 · Experiment Studies v1.01–v1.05
    • 04 · Model Analysis
    • 05 · Holdout Analysis
    • 06 · Live Inference & Odds
    • 07 · Live Betting Strategy
  • Back to Docs (MkDocs)

On this page

  • 1. Champion model snapshot
  • 2. Hyperparameter tuning
  • 2.5 Model selection — select_model
  • 3. Baseline comparison
  • 4. Calibration analysis
  • 5. Feature importance & SHAP
    • 5.1 Built-in XGBoost gain importance
    • 5.2 SHAP values

Model Analysis — Calibration, SHAP & Tuning

Champion model deep-dive: hyperparameter search, calibration quality, feature attributions, and baseline comparison

Author

Dima Ivanov

Published

May 28, 2026

Show code
import sys
from pathlib import Path

project_root = Path().resolve().parent.parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

import warnings
warnings.filterwarnings("ignore")

import os
import json
import yaml
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
from IPython.display import display, HTML, Markdown

with open(project_root / "params.yaml") as _f:
    PARAMS = yaml.safe_load(_f)

# ── MLflow setup ────────────────────────────────────────────────────────────
from src.pipelines._config import get_pipeline_config
_cfg = get_pipeline_config()
os.environ.setdefault("MLFLOW_S3_ENDPOINT_URL", _cfg.minio_endpoint_url)
os.environ.setdefault("AWS_ACCESS_KEY_ID",       _cfg.minio_access_key)
os.environ.setdefault("AWS_SECRET_ACCESS_KEY",   _cfg.minio_secret_key)

import mlflow
import mlflow.sklearn
mlflow.set_tracking_uri(_cfg.mlflow_tracking_uri)

_client     = mlflow.tracking.MlflowClient()
_model_name = PARAMS["inference"]["model_name"]
_alias      = PARAMS["inference"]["model_stage"]
_mv         = _client.get_model_version_by_alias(_model_name, _alias)
_champion   = _client.get_run(_mv.run_id)

RUN_ID      = _mv.run_id
MODEL_URI   = f"models:/{_model_name}@{_alias}"

1. Champion model snapshot

Show code
p = _champion.data.params
m = _champion.data.metrics

display(Markdown(f"""
| Property | Value |
|---|---|
| Model | `{p.get('model.name')}` |
| Run ID | `{RUN_ID[:12]}…` |
| Registered as | `{_model_name}` @ `{_alias}` |
| Features | {p.get('model.feature_count')} (num: {p.get('features.num')}, cat: {p.get('features.cat')}) |
| Calibration | `{p.get('calibration.method')}` (frac={p.get('calibration.calib_frac')}) |
| Hyperparams source | `{p.get('model.hyperparams_source')}` |
| **Holdout logloss** | **{m.get('final.logloss', float('nan')):.4f}** |
| Holdout accuracy | {m.get('final.accuracy', float('nan')):.3f} |
| ECE (calibrated) | {m.get('final.ece', m.get('final.ece_calibrated', float('nan'))):.4f} |
| ECE (raw, before calib) | {m.get('final.ece_raw', float('nan')):.4f} |
"""))
Property Value
Model xgb
Run ID 9e156226cffa…
Registered as soccer-match-outcome @ champion
Features 508 (num: 507, cat: 1)
Calibration isotonic (frac=0.15)
Hyperparams source tuned
Holdout logloss 1.0043
Holdout accuracy 0.504
ECE (calibrated) 0.0036
ECE (raw, before calib) 0.0307

2. Hyperparameter tuning

200 Optuna trials across XGB, HGB, LogReg. Objective: minimise CV log-loss on a stratified 10 % data fraction.

Show code
_exp_name = PARAMS["tuning"]["experiment_name"]
_exp      = _client.get_experiment_by_name(_exp_name)
_runs     = _client.search_runs(
    _exp.experiment_id,
    order_by=["start_time ASC"],
    max_results=500,
)

rows = []
for r in _runs:
    p_ = r.data.params
    m_ = r.data.metrics
    model_key = next(
        (k.split(".")[0] for k in p_ if k in ("xgb.n_estimators", "hgb.max_iter", "logreg.C")),
        p_.get("tuning.study_name", "unknown").split("_")[0],
    )
    rows.append({
        "model":      model_key,
        "trial":      int(float(m_.get("trial.number", len(rows)))),
        "cv_logloss": float(m_.get("cv.logloss_mean", float("nan"))),
        "study":      p_.get("tuning.study_name", ""),
        **{k: v for k, v in p_.items() if "." in k and k.split(".")[0] in ("xgb", "hgb", "logreg")},
    })

df_trials = pd.DataFrame(rows).dropna(subset=["cv_logloss"])
df_trials = df_trials.sort_values(["model", "trial"]).reset_index(drop=True)

print(f"Loaded {len(df_trials)} tuning trials")
print(df_trials.groupby("model")["cv_logloss"].agg(["count", "min", "mean"]).round(4))
Loaded 247 tuning trials
        count     min    mean
model                        
hgb       100  1.0074  1.0266
logreg     20  1.0053  1.0103
xgb       127  1.0044  1.0285
Show code
MODEL_COLORS = {"xgb": "#3498db", "hgb": "#2ecc71", "logreg": "#e67e22"}

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

for model, grp in df_trials.groupby("model"):
    grp  = grp.sort_values("trial")
    color = MODEL_COLORS.get(model, "#95a5a6")

    axes[0].scatter(grp["trial"], grp["cv_logloss"], s=14, alpha=0.5, color=color, label=model)

    best_so_far = grp["cv_logloss"].cummin()
    axes[1].plot(grp["trial"].values, best_so_far.values, color=color, linewidth=2, label=model)

axes[0].set_xlabel("Trial")
axes[0].set_ylabel("CV Log-loss")
axes[0].set_title("All trials")
axes[0].legend(fontsize=9)

axes[1].set_xlabel("Trial")
axes[1].set_ylabel("Best CV Log-loss (cumulative min)")
axes[1].set_title("Convergence")
axes[1].legend(fontsize=9)

plt.tight_layout()
plt.show()

Optuna convergence: best CV logloss per trial (cumulative minimum)
Show code
best_per_model = (
    df_trials
    .sort_values("cv_logloss")
    .drop_duplicates(subset="model")
    [["model", "cv_logloss", "trial"]]
    .copy()
)
best_per_model["cv_logloss"] = best_per_model["cv_logloss"].map("{:.4f}".format)

display(HTML(best_per_model.to_html(index=False)))

_xgb_best  = df_trials[df_trials["model"] == "xgb"].sort_values("cv_logloss").iloc[0]
_xgb_hp_cols = [c for c in _xgb_best.index if c.startswith("xgb.")]
if _xgb_hp_cols:
    display(Markdown("**Best XGB hyperparameters (from tuning):**"))
    hp_df = pd.DataFrame({"param": _xgb_hp_cols, "value": [_xgb_best[c] for c in _xgb_hp_cols]})
    display(HTML(hp_df.to_html(index=False)))
model cv_logloss trial
xgb 1.0044 21
logreg 1.0053 35
hgb 1.0074 56

Best XGB hyperparameters (from tuning):

param value
xgb.n_estimators 450
xgb.max_depth 7
xgb.learning_rate 0.020703311978941263
xgb.subsample 0.5759004834558378
xgb.colsample_bytree 0.6582673603217585
xgb.min_child_weight 11
xgb.reg_alpha 0.0015821341467571611
xgb.reg_lambda 1.5593203026677782

2.5 Model selection — select_model

After the three Optuna studies complete, select_model reads xgb_best_params.json, logreg_best_params.json, and hgb_best_params.json and picks the candidate with the lowest CV log-loss. Result is written to data/models/best_model.json and feeds final_train.

Show code
import json

_best_model_path = project_root / "data/models/best_model.json"
try:
    _bm = json.loads(_best_model_path.read_text())

    _all_scores = _bm.get("all_scores", {})
    _rows = []
    for _name, _score in _all_scores.items():
        _rows.append({
            "Model": _name,
            "CV log-loss": f"{_score:.4f}" if _score != float('inf') else "∞ (disabled)",
            "Selected": "✅ winner" if _name == _bm["model_name"] else "",
        })
    _sel_df = pd.DataFrame(_rows).sort_values("CV log-loss")
    display(HTML(_sel_df.to_html(index=False)))

    display(Markdown(f"""
**Selected model:** `{_bm['model_name']}`  |  **CV log-loss:** `{_bm['cv_logloss']:.4f}`
**Experiment:** `{_bm.get('experiment_name', 'matches_clf_v1.0_select')}`
"""))
except FileNotFoundError:
    display(Markdown("> `data/models/best_model.json` not available in this environment (DVC data not pulled)."))
Model CV log-loss Selected
xgb 1.0092 ✅ winner
hgb_numonly 1.0115
logreg ∞ (disabled)

Selected model: xgb | CV log-loss: 1.0092 Experiment: matches_clf_v1.0_select


3. Baseline comparison

Three baselines evaluated on the same holdout set:

  • Uniform prior — always predict (0.333, 0.333, 0.333)
  • Historical prior — training-set home/draw/away frequencies
  • Always home — P(home win) = 1.0
  • Champion XGB — calibrated model
Show code
_holdout_path = project_root / "data" / "predictions" / "holdout_predictions.parquet"
df_preds = None
results  = None
try:
    df_preds = pd.read_parquet(_holdout_path)
except FileNotFoundError:
    display(Markdown("> `data/predictions/holdout_predictions.parquet` not available in this environment (DVC data not pulled)."))

if df_preds is not None:
    df_preds = df_preds  # keep reference for downstream cells
y_true       = df_preds["y_true"].values if df_preds is not None else None
n            = len(y_true) if y_true is not None else 0
labels       = [0, 1, 2]
proba_model  = df_preds[["proba_0", "proba_1", "proba_2"]].values if df_preds is not None else None

if y_true is not None:
    home_freq, draw_freq, away_freq = (
        np.mean(y_true == 0),
        np.mean(y_true == 1),
        np.mean(y_true == 2),
    )
    proba_prior = np.column_stack([
        np.full(n, home_freq),
        np.full(n, draw_freq),
        np.full(n, away_freq),
    ])

    proba_uniform = np.full((n, 3), 1 / 3)
    proba_home = np.column_stack([np.ones(n), np.zeros(n), np.zeros(n)])

    def _logloss(y, p):
        p = np.clip(p, 1e-7, 1 - 1e-7)
        p = p / p.sum(axis=1, keepdims=True)
        return float(-np.mean(np.log(p[np.arange(len(y)), y])))

    def _brier(y, p):
        from sklearn.preprocessing import label_binarize
        y_bin = label_binarize(y, classes=labels)
        return float(np.mean(np.sum((p - y_bin) ** 2, axis=1)))

    def _accuracy(y, p):
        return float(np.mean(np.argmax(p, axis=1) == y))

    results = []
    for name, proba in [
        ("Always home", proba_home),
        ("Uniform (0.33/0.33/0.33)", proba_uniform),
        ("Historical prior", proba_prior),
        ("Champion XGB (calibrated)", proba_model),
    ]:
        results.append({
            "Model":    name,
            "Log-loss": _logloss(y_true, proba),
            "Brier":    _brier(y_true, proba),
            "Accuracy": _accuracy(y_true, proba),
        })

    df_base = pd.DataFrame(results)
    xgb_ll  = df_base.loc[df_base["Model"].str.startswith("Champion"), "Log-loss"].values[0]

    df_base["Δ vs model"] = df_base["Log-loss"].apply(
        lambda x: f"+{x - xgb_ll:.4f}" if x != xgb_ll else "—"
    )
    df_base["Log-loss"] = df_base["Log-loss"].map("{:.4f}".format)
    df_base["Brier"]    = df_base["Brier"].map("{:.4f}".format)
    df_base["Accuracy"] = df_base["Accuracy"].map("{:.3f}".format)

    display(HTML(df_base.to_html(index=False)))
Model Log-loss Brier Accuracy Δ vs model
Always home 9.0530 1.1233 0.438 +8.0478
Uniform (0.33/0.33/0.33) 1.0986 0.6667 0.438 +0.0934
Historical prior 1.0711 0.6481 0.438 +0.0659
Champion XGB (calibrated) 1.0052 0.6011 0.504 —
Show code
if results is None:
    display(Markdown("> Baseline chart not available — holdout predictions not loaded."))
else:
    df_plot = pd.DataFrame(results)
    colors  = ["#95a5a6", "#bdc3c7", "#7f8c8d", "#3498db"]

    fig, ax = plt.subplots(figsize=(8, 3))
    bars    = ax.barh(df_plot["Model"], df_plot["Log-loss"], color=colors)
    ax.set_xlabel("Log-loss (lower is better)")
    ax.set_title("Model vs baselines on holdout set")
    ax.invert_yaxis()

    for bar, val in zip(bars, df_plot["Log-loss"]):
        ax.text(val + 0.005, bar.get_y() + bar.get_height() / 2,
                f"{val:.4f}", va="center", fontsize=9)

    plt.tight_layout()
    plt.show()

Log-loss: Champion model vs baselines

4. Calibration analysis

Isotonic calibration is applied post-training on a 15 % temporal holdout.

Show code
import tempfile
_tmpdir  = tempfile.mkdtemp()
ece_raw  = _champion.data.metrics.get("final.ece_raw", float("nan"))
ece_cal  = _champion.data.metrics.get("final.ece_calibrated",
           _champion.data.metrics.get("final.ece", float("nan")))

# Download pre-generated calibration figure from MLflow
try:
    _artifact_path = _client.download_artifacts(RUN_ID, "calibration_curves.png", _tmpdir)
    _has_artifact  = True
except Exception:
    _has_artifact  = False
Show code
if _has_artifact:
    from IPython.display import Image as IPImage
    IPImage(_artifact_path, width=700)
Show code
from sklearn.calibration import calibration_curve

CLASS_NAMES = {0: "Home win", 1: "Draw", 2: "Away win"}

if df_preds is None:
    display(Markdown("> Reliability diagrams not available — holdout predictions not loaded."))
else:
    proba_cal_arr = df_preds[["proba_0", "proba_1", "proba_2"]].values
    y_true_arr    = df_preds["y_true"].values

    fig, axes = plt.subplots(1, 3, figsize=(13, 4))

    for i, ax in enumerate(axes):
        frac_pos, mean_pred = calibration_curve(
            (y_true_arr == i).astype(int),
            proba_cal_arr[:, i],
            n_bins=10,
            strategy="quantile",
        )
        ax.plot([0, 1], [0, 1], "k--", linewidth=1, label="Perfect calibration")
        ax.plot(mean_pred, frac_pos, "o-", color="#3498db", linewidth=2,
                markersize=5, label="Calibrated model")
        ax.fill_between(mean_pred, frac_pos, mean_pred,
                        alpha=0.15, color="#e74c3c", label="Calibration gap")
        ax.set_xlabel("Mean predicted probability")
        ax.set_ylabel("Observed frequency")
        ax.set_title(CLASS_NAMES[i])
        ax.legend(fontsize=8)
        ax.set_xlim(0, 1)
        ax.set_ylim(0, 1)

    plt.suptitle(f"Reliability diagrams (calibrated model)  |  ECE = {ece_cal:.4f}", y=1.01)
    plt.tight_layout()
    plt.show()

Reliability diagrams — calibrated probabilities per outcome class
Show code
display(Markdown(f"""
| Stage | ECE |
|---|---|
| Raw model (before isotonic calibration) | {ece_raw:.4f} |
| After isotonic calibration | {ece_cal:.4f} |
| Reduction | {ece_raw - ece_cal:.4f} ({(ece_raw - ece_cal) / ece_raw * 100:.1f}% improvement) |

> **Interpretation:** ECE close to 0 means predicted probabilities match observed
> frequencies well. An ECE of {ece_cal:.4f} indicates very good calibration —
> predicted 60 % probability outcomes happen approximately 60 % of the time.
"""))
Stage ECE
Raw model (before isotonic calibration) 0.0307
After isotonic calibration 0.0036
Reduction 0.0271 (88.3% improvement)

Interpretation: ECE close to 0 means predicted probabilities match observed frequencies well. An ECE of 0.0036 indicates very good calibration — predicted 60 % probability outcomes happen approximately 60 % of the time.


5. Feature importance & SHAP

5.1 Built-in XGBoost gain importance

Show code
try:
    _fi_path      = _client.download_artifacts(RUN_ID, "feature_importances.csv", _tmpdir)
    df_fi         = pd.read_csv(_fi_path)
    _fi_available = not df_fi.empty
except Exception:
    _fi_available = False
    df_fi         = pd.DataFrame()

if _fi_available:
    print(f"Feature importances loaded: {len(df_fi)} features, columns: {df_fi.columns.tolist()}")
else:
    print("feature_importances.csv not in MLflow artifacts — will derive from model")
feature_importances.csv not in MLflow artifacts — will derive from model
Show code
if not _fi_available:
    _model_loaded = mlflow.sklearn.load_model(MODEL_URI)
    _inner = getattr(_model_loaded, "estimator", _model_loaded)
    _clf   = getattr(_inner, "named_steps", {}).get("clf", _inner)
    if hasattr(_clf, "feature_importances_"):
        imp_vals = _clf.feature_importances_
        feat_names_imp = (
            _inner.named_steps["preprocessor"].get_feature_names_out()
            if hasattr(getattr(_inner, "named_steps", {}).get("preprocessor"), "get_feature_names_out")
            else [f"f{i}" for i in range(len(imp_vals))]
        )
        df_fi         = pd.DataFrame({"feature": feat_names_imp, "importance": imp_vals})
        _fi_available = True

if _fi_available and not df_fi.empty:
    imp_col  = next((c for c in ["importance", "gain", "weight"] if c in df_fi.columns), df_fi.columns[1])
    name_col = df_fi.columns[0]

    # ELO detection: feature names may be generic f{i} from sklearn pipeline;
    # map back via features_meta index so f506 → diff_elo_pre → orange.
    try:
        _meta_color = pd.read_parquet(project_root / "data" / "features" / "features_meta.parquet")
        _elo_names  = set(_meta_color.loc[_meta_color["metric"] == "elo", "name"])
        _feat_order = _meta_color["name"].tolist()
    except Exception:
        _elo_names, _feat_order = set(), []

    def _is_elo_feat(n):
        s = str(n)
        if "elo" in s.lower():
            return True
        if s.startswith("f") and s[1:].isdigit():
            idx = int(s[1:])
            return idx < len(_feat_order) and _feat_order[idx] in _elo_names
        return False

    df_top30 = df_fi.nlargest(30, imp_col).sort_values(imp_col, ascending=True)
    colors   = ["#e67e22" if _is_elo_feat(n) else "#3498db" for n in df_top30[name_col]]

    fig, ax = plt.subplots(figsize=(10, 9))
    ax.barh(df_top30[name_col], df_top30[imp_col], color=colors)
    ax.set_xlabel(f"Importance ({imp_col})")
    ax.set_title("Top-30 feature importance (XGBoost gain)")

    import matplotlib.patches as mpatches
    ax.legend(handles=[
        mpatches.Patch(facecolor="#e67e22", label="ELO features"),
        mpatches.Patch(facecolor="#3498db", label="Rolling stats / other"),
    ], fontsize=9)
    plt.tight_layout()
    plt.show()
else:
    display(Markdown("> Feature importances not available for this model configuration."))

Top-30 features by XGBoost gain importance (orange = ELO, blue = rolling stats)

5.2 SHAP values

Computed on a 2 000-row stratified sample of the holdout set.

Show code
try:
    import shap
    _shap_available = True
except ImportError:
    _shap_available = False

if not _shap_available:
    display(Markdown("> **shap** package not installed — run `pip install shap` to enable this section."))
Show code
if _shap_available:
    from src.features.select import select_model_features
    from src.data.params import load_params

    _params       = load_params()
    _feat_params  = _params.get("features_selected", _params.get("classification"))
    _df_meta      = pd.read_parquet(project_root / "data" / "features" / "features_meta.parquet")
    _num_cols     = select_model_features(
        _df_meta,
        side=_feat_params["side"],
        window_sizes=_feat_params["window_sizes"],
        include_elo=_feat_params.get("include_elo", True),
        include_rest_days=_feat_params.get("include_rest_days", False),
        include_h2h=_feat_params.get("include_h2h", False),
    )
    _cat_cols = _feat_params["cat_cols"]
    _X_cols   = _num_cols + [c for c in _cat_cols if c not in _num_cols]

    _df_ds   = pd.read_parquet(project_root / "data" / "processed" / "dataset.parquet")
    _df_test = pd.read_parquet(project_root / "data" / "splits" / "test_ids.parquet")
    _df_hold = _df_ds[_df_ds["id"].isin(_df_test["id"])].copy()
    _X_hold  = _df_hold[[c for c in _X_cols if c in _df_hold.columns]].copy()

    # Stratified 2 000-row sample (≤667 per class)
    _sample_idx = (
        _df_hold.groupby("outcome_1x2")
        .apply(lambda g: g.sample(min(len(g), 667), random_state=42))
        .index.get_level_values(1)
    )
    _X_sample = _X_hold.loc[_sample_idx].reset_index(drop=True)

    # Load model and extract raw XGBoost estimator
    _model_full = mlflow.sklearn.load_model(MODEL_URI)
    _inner_m    = getattr(_model_full, "estimator", _model_full)
    _clf_m      = getattr(_inner_m, "named_steps", {}).get("clf", _inner_m)
    _pre_m      = getattr(_inner_m, "named_steps", {}).get("preprocessor", None)

    if _pre_m is not None:
        _X_transformed  = _pre_m.transform(_X_sample)
        _feat_names_out = (
            _pre_m.get_feature_names_out().tolist()
            if hasattr(_pre_m, "get_feature_names_out")
            else [f"f{i}" for i in range(_X_transformed.shape[1])]
        )
    else:
        _X_transformed  = _X_sample.values
        _feat_names_out = _X_sample.columns.tolist()

    print(f"Computing SHAP on {len(_X_sample)} samples × {_X_transformed.shape[1]} features …")
    try:
        # shap.Explainer uses the new API compatible with XGBoost 3.x
        # TreeExplainer.shap_values() has a shape mismatch bug in XGBoost 3.x multiclass
        _explainer  = shap.Explainer(_clf_m, _X_transformed[:100])
        _shap_obj   = _explainer(_X_transformed)
        _sv_raw     = _shap_obj.values  # (n_samples, n_features) or (n_samples, n_features, n_classes)
        if _sv_raw.ndim == 3:
            # multiclass: split into per-class list for consistent downstream use
            _sv_class = [_sv_raw[:, :, i] for i in range(_sv_raw.shape[2])]
        elif _sv_raw.ndim == 2:
            _sv_class = [_sv_raw]
        else:
            raise ValueError(f"Unexpected SHAP array ndim: {_sv_raw.ndim}")
        _shap_ok = True
        print(f"Done — SHAP values shape: {_sv_raw.shape}")
    except Exception as _shap_err:
        _shap_ok = False
        print(f"SHAP computation failed: {_shap_err}")
Computing SHAP on 2001 samples × 508 features …
Done — SHAP values shape: (2001, 508, 3)
Show code
if _shap_available and "_shap_ok" in dir() and _shap_ok:
    fig, axes = plt.subplots(1, len(_sv_class), figsize=(6 * len(_sv_class), 7))
    if len(_sv_class) == 1:
        axes = [axes]

    import matplotlib.patches as mpatches

    for cls_idx, (ax, sv_cls) in enumerate(zip(axes, _sv_class)):
        mean_abs = np.abs(sv_cls).mean(axis=0)
        top20    = np.argsort(mean_abs)[::-1][:20]
        names    = [_feat_names_out[i] for i in top20]
        vals     = mean_abs[top20]
        elo_col  = ["#e67e22" if _is_elo_feat(n) else "#3498db" for n in names]

        ax.barh(names[::-1], vals[::-1], color=elo_col[::-1])
        ax.set_xlabel("Mean |SHAP value|")
        ax.set_title(f"Class {cls_idx}: {CLASS_NAMES.get(cls_idx, str(cls_idx))}")

    fig.legend(handles=[
        mpatches.Patch(facecolor="#e67e22", label="ELO features"),
        mpatches.Patch(facecolor="#3498db", label="Rolling stats / other"),
    ], loc="upper right", fontsize=9)
    plt.suptitle("Top-20 features by mean |SHAP value| per class", y=1.01)
    plt.tight_layout()
    plt.show()

Top-20 features by mean |SHAP value| per outcome class
Show code
if _shap_available and "_shap_ok" in dir() and _shap_ok:
    mean_abs_all = np.abs(np.concatenate(_sv_class, axis=0)).mean(axis=0)

    elo_mask    = np.array([_is_elo_feat(n) for n in _feat_names_out])
    elo_share   = mean_abs_all[elo_mask].sum()
    stats_share = mean_abs_all[~elo_mask].sum()
    total       = elo_share + stats_share

    fig, ax = plt.subplots(figsize=(6, 3))
    ax.barh(
        ["ELO features", "Rolling stats + other"],
        [elo_share / total * 100, stats_share / total * 100],
        color=["#e67e22", "#3498db"],
    )
    ax.set_xlabel("Share of total mean |SHAP| (%)")
    ax.set_title("Feature group importance (SHAP)")
    for bar, val in zip(ax.patches, [elo_share / total * 100, stats_share / total * 100]):
        ax.text(val + 0.3, bar.get_y() + bar.get_height() / 2,
                f"{val:.1f}%", va="center", fontsize=10, fontweight="bold")
    plt.tight_layout()
    plt.show()

    display(Markdown(f"""
**Why do rolling stats add little given ELO?**

| Feature group | Share of mean |SHAP| |
|---|---|
| ELO features | {elo_share / total * 100:.1f}% |
| Rolling stats + categorical | {stats_share / total * 100:.1f}% |

> ELO dominates feature attribution across all three outcome classes because it
> already encodes long-run team strength. Rolling window statistics contribute a
> non-zero share — they are *used* by the model — but carry redundant signal when
> ELO is present, which is why ablation studies show minimal marginal gain.
"""))

ELO vs rolling stats: share of total SHAP importance

Why do rolling stats add little given ELO?

Feature group Share of mean
ELO features 29.2%
Rolling stats + categorical 70.8%

ELO dominates feature attribution across all three outcome classes because it already encodes long-run team strength. Rolling window statistics contribute a non-zero share — they are used by the model — but carry redundant signal when ELO is present, which is why ablation studies show minimal marginal gain.