p = _champion.data.paramsm = _champion.data.metricsdisplay(Markdown(f"""| Property | Value ||---|---|| Model | `{p.get('model.name')}` || Run ID | `{RUN_ID[:12]}…` || Registered as | `{_model_name}` @ `{_alias}` || Features | {p.get('model.feature_count')} (num: {p.get('features.num')}, cat: {p.get('features.cat')}) || Calibration | `{p.get('calibration.method')}` (frac={p.get('calibration.calib_frac')}) || Hyperparams source | `{p.get('model.hyperparams_source')}` || **Holdout logloss** | **{m.get('final.logloss', float('nan')):.4f}** || Holdout accuracy | {m.get('final.accuracy', float('nan')):.3f} || ECE (calibrated) | {m.get('final.ece', m.get('final.ece_calibrated', float('nan'))):.4f} || ECE (raw, before calib) | {m.get('final.ece_raw', float('nan')):.4f} |"""))
Property
Value
Model
xgb
Run ID
9e156226cffa…
Registered as
soccer-match-outcome @ champion
Features
508 (num: 507, cat: 1)
Calibration
isotonic (frac=0.15)
Hyperparams source
tuned
Holdout logloss
1.0043
Holdout accuracy
0.504
ECE (calibrated)
0.0036
ECE (raw, before calib)
0.0307
2. Hyperparameter tuning
200 Optuna trials across XGB, HGB, LogReg. Objective: minimise CV log-loss on a stratified 10 % data fraction.
Show code
_exp_name = PARAMS["tuning"]["experiment_name"]_exp = _client.get_experiment_by_name(_exp_name)_runs = _client.search_runs( _exp.experiment_id, order_by=["start_time ASC"], max_results=500,)rows = []for r in _runs: p_ = r.data.params m_ = r.data.metrics model_key =next( (k.split(".")[0] for k in p_ if k in ("xgb.n_estimators", "hgb.max_iter", "logreg.C")), p_.get("tuning.study_name", "unknown").split("_")[0], ) rows.append({"model": model_key,"trial": int(float(m_.get("trial.number", len(rows)))),"cv_logloss": float(m_.get("cv.logloss_mean", float("nan"))),"study": p_.get("tuning.study_name", ""),**{k: v for k, v in p_.items() if"."in k and k.split(".")[0] in ("xgb", "hgb", "logreg")}, })df_trials = pd.DataFrame(rows).dropna(subset=["cv_logloss"])df_trials = df_trials.sort_values(["model", "trial"]).reset_index(drop=True)print(f"Loaded {len(df_trials)} tuning trials")print(df_trials.groupby("model")["cv_logloss"].agg(["count", "min", "mean"]).round(4))
Loaded 247 tuning trials
count min mean
model
hgb 100 1.0074 1.0266
logreg 20 1.0053 1.0103
xgb 127 1.0044 1.0285
Optuna convergence: best CV logloss per trial (cumulative minimum)
Show code
best_per_model = ( df_trials .sort_values("cv_logloss") .drop_duplicates(subset="model") [["model", "cv_logloss", "trial"]] .copy())best_per_model["cv_logloss"] = best_per_model["cv_logloss"].map("{:.4f}".format)display(HTML(best_per_model.to_html(index=False)))_xgb_best = df_trials[df_trials["model"] =="xgb"].sort_values("cv_logloss").iloc[0]_xgb_hp_cols = [c for c in _xgb_best.index if c.startswith("xgb.")]if _xgb_hp_cols: display(Markdown("**Best XGB hyperparameters (from tuning):**")) hp_df = pd.DataFrame({"param": _xgb_hp_cols, "value": [_xgb_best[c] for c in _xgb_hp_cols]}) display(HTML(hp_df.to_html(index=False)))
model
cv_logloss
trial
xgb
1.0044
21
logreg
1.0053
35
hgb
1.0074
56
Best XGB hyperparameters (from tuning):
param
value
xgb.n_estimators
450
xgb.max_depth
7
xgb.learning_rate
0.020703311978941263
xgb.subsample
0.5759004834558378
xgb.colsample_bytree
0.6582673603217585
xgb.min_child_weight
11
xgb.reg_alpha
0.0015821341467571611
xgb.reg_lambda
1.5593203026677782
2.5 Model selection — select_model
After the three Optuna studies complete, select_model reads xgb_best_params.json, logreg_best_params.json, and hgb_best_params.json and picks the candidate with the lowest CV log-loss. Result is written to data/models/best_model.json and feeds final_train.
Show code
import json_best_model_path = project_root /"data/models/best_model.json"try: _bm = json.loads(_best_model_path.read_text()) _all_scores = _bm.get("all_scores", {}) _rows = []for _name, _score in _all_scores.items(): _rows.append({"Model": _name,"CV log-loss": f"{_score:.4f}"if _score !=float('inf') else"∞ (disabled)","Selected": "✅ winner"if _name == _bm["model_name"] else"", }) _sel_df = pd.DataFrame(_rows).sort_values("CV log-loss") display(HTML(_sel_df.to_html(index=False))) display(Markdown(f"""**Selected model:** `{_bm['model_name']}` | **CV log-loss:** `{_bm['cv_logloss']:.4f}`**Experiment:** `{_bm.get('experiment_name', 'matches_clf_v1.0_select')}`"""))exceptFileNotFoundError: display(Markdown("> `data/models/best_model.json` not available in this environment (DVC data not pulled)."))
Reliability diagrams — calibrated probabilities per outcome class
Show code
display(Markdown(f"""| Stage | ECE ||---|---|| Raw model (before isotonic calibration) | {ece_raw:.4f} || After isotonic calibration | {ece_cal:.4f} || Reduction | {ece_raw - ece_cal:.4f} ({(ece_raw - ece_cal) / ece_raw *100:.1f}% improvement) |> **Interpretation:** ECE close to 0 means predicted probabilities match observed> frequencies well. An ECE of {ece_cal:.4f} indicates very good calibration —> predicted 60 % probability outcomes happen approximately 60 % of the time."""))
Stage
ECE
Raw model (before isotonic calibration)
0.0307
After isotonic calibration
0.0036
Reduction
0.0271 (88.3% improvement)
Interpretation: ECE close to 0 means predicted probabilities match observed frequencies well. An ECE of 0.0036 indicates very good calibration — predicted 60 % probability outcomes happen approximately 60 % of the time.
5. Feature importance & SHAP
5.1 Built-in XGBoost gain importance
Show code
try: _fi_path = _client.download_artifacts(RUN_ID, "feature_importances.csv", _tmpdir) df_fi = pd.read_csv(_fi_path) _fi_available =not df_fi.emptyexceptException: _fi_available =False df_fi = pd.DataFrame()if _fi_available:print(f"Feature importances loaded: {len(df_fi)} features, columns: {df_fi.columns.tolist()}")else:print("feature_importances.csv not in MLflow artifacts — will derive from model")
feature_importances.csv not in MLflow artifacts — will derive from model
Show code
ifnot _fi_available: _model_loaded = mlflow.sklearn.load_model(MODEL_URI) _inner =getattr(_model_loaded, "estimator", _model_loaded) _clf =getattr(_inner, "named_steps", {}).get("clf", _inner)ifhasattr(_clf, "feature_importances_"): imp_vals = _clf.feature_importances_ feat_names_imp = ( _inner.named_steps["preprocessor"].get_feature_names_out()ifhasattr(getattr(_inner, "named_steps", {}).get("preprocessor"), "get_feature_names_out")else [f"f{i}"for i inrange(len(imp_vals))] ) df_fi = pd.DataFrame({"feature": feat_names_imp, "importance": imp_vals}) _fi_available =Trueif _fi_available andnot df_fi.empty: imp_col =next((c for c in ["importance", "gain", "weight"] if c in df_fi.columns), df_fi.columns[1]) name_col = df_fi.columns[0]# ELO detection: feature names may be generic f{i} from sklearn pipeline;# map back via features_meta index so f506 → diff_elo_pre → orange.try: _meta_color = pd.read_parquet(project_root /"data"/"features"/"features_meta.parquet") _elo_names =set(_meta_color.loc[_meta_color["metric"] =="elo", "name"]) _feat_order = _meta_color["name"].tolist()exceptException: _elo_names, _feat_order =set(), []def _is_elo_feat(n): s =str(n)if"elo"in s.lower():returnTrueif s.startswith("f") and s[1:].isdigit(): idx =int(s[1:])return idx <len(_feat_order) and _feat_order[idx] in _elo_namesreturnFalse df_top30 = df_fi.nlargest(30, imp_col).sort_values(imp_col, ascending=True) colors = ["#e67e22"if _is_elo_feat(n) else"#3498db"for n in df_top30[name_col]] fig, ax = plt.subplots(figsize=(10, 9)) ax.barh(df_top30[name_col], df_top30[imp_col], color=colors) ax.set_xlabel(f"Importance ({imp_col})") ax.set_title("Top-30 feature importance (XGBoost gain)")import matplotlib.patches as mpatches ax.legend(handles=[ mpatches.Patch(facecolor="#e67e22", label="ELO features"), mpatches.Patch(facecolor="#3498db", label="Rolling stats / other"), ], fontsize=9) plt.tight_layout() plt.show()else: display(Markdown("> Feature importances not available for this model configuration."))
Top-30 features by XGBoost gain importance (orange = ELO, blue = rolling stats)
5.2 SHAP values
Computed on a 2 000-row stratified sample of the holdout set.
Show code
try:import shap _shap_available =TrueexceptImportError: _shap_available =Falseifnot _shap_available: display(Markdown("> **shap** package not installed — run `pip install shap` to enable this section."))
Show code
if _shap_available:from src.features.select import select_model_featuresfrom src.data.params import load_params _params = load_params() _feat_params = _params.get("features_selected", _params.get("classification")) _df_meta = pd.read_parquet(project_root /"data"/"features"/"features_meta.parquet") _num_cols = select_model_features( _df_meta, side=_feat_params["side"], window_sizes=_feat_params["window_sizes"], include_elo=_feat_params.get("include_elo", True), include_rest_days=_feat_params.get("include_rest_days", False), include_h2h=_feat_params.get("include_h2h", False), ) _cat_cols = _feat_params["cat_cols"] _X_cols = _num_cols + [c for c in _cat_cols if c notin _num_cols] _df_ds = pd.read_parquet(project_root /"data"/"processed"/"dataset.parquet") _df_test = pd.read_parquet(project_root /"data"/"splits"/"test_ids.parquet") _df_hold = _df_ds[_df_ds["id"].isin(_df_test["id"])].copy() _X_hold = _df_hold[[c for c in _X_cols if c in _df_hold.columns]].copy()# Stratified 2 000-row sample (≤667 per class) _sample_idx = ( _df_hold.groupby("outcome_1x2") .apply(lambda g: g.sample(min(len(g), 667), random_state=42)) .index.get_level_values(1) ) _X_sample = _X_hold.loc[_sample_idx].reset_index(drop=True)# Load model and extract raw XGBoost estimator _model_full = mlflow.sklearn.load_model(MODEL_URI) _inner_m =getattr(_model_full, "estimator", _model_full) _clf_m =getattr(_inner_m, "named_steps", {}).get("clf", _inner_m) _pre_m =getattr(_inner_m, "named_steps", {}).get("preprocessor", None)if _pre_m isnotNone: _X_transformed = _pre_m.transform(_X_sample) _feat_names_out = ( _pre_m.get_feature_names_out().tolist()ifhasattr(_pre_m, "get_feature_names_out")else [f"f{i}"for i inrange(_X_transformed.shape[1])] )else: _X_transformed = _X_sample.values _feat_names_out = _X_sample.columns.tolist()print(f"Computing SHAP on {len(_X_sample)} samples × {_X_transformed.shape[1]} features …")try:# shap.Explainer uses the new API compatible with XGBoost 3.x# TreeExplainer.shap_values() has a shape mismatch bug in XGBoost 3.x multiclass _explainer = shap.Explainer(_clf_m, _X_transformed[:100]) _shap_obj = _explainer(_X_transformed) _sv_raw = _shap_obj.values # (n_samples, n_features) or (n_samples, n_features, n_classes)if _sv_raw.ndim ==3:# multiclass: split into per-class list for consistent downstream use _sv_class = [_sv_raw[:, :, i] for i inrange(_sv_raw.shape[2])]elif _sv_raw.ndim ==2: _sv_class = [_sv_raw]else:raiseValueError(f"Unexpected SHAP array ndim: {_sv_raw.ndim}") _shap_ok =Trueprint(f"Done — SHAP values shape: {_sv_raw.shape}")exceptExceptionas _shap_err: _shap_ok =Falseprint(f"SHAP computation failed: {_shap_err}")
Computing SHAP on 2001 samples × 508 features …
Done — SHAP values shape: (2001, 508, 3)
Show code
if _shap_available and"_shap_ok"indir() and _shap_ok: fig, axes = plt.subplots(1, len(_sv_class), figsize=(6*len(_sv_class), 7))iflen(_sv_class) ==1: axes = [axes]import matplotlib.patches as mpatchesfor cls_idx, (ax, sv_cls) inenumerate(zip(axes, _sv_class)): mean_abs = np.abs(sv_cls).mean(axis=0) top20 = np.argsort(mean_abs)[::-1][:20] names = [_feat_names_out[i] for i in top20] vals = mean_abs[top20] elo_col = ["#e67e22"if _is_elo_feat(n) else"#3498db"for n in names] ax.barh(names[::-1], vals[::-1], color=elo_col[::-1]) ax.set_xlabel("Mean |SHAP value|") ax.set_title(f"Class {cls_idx}: {CLASS_NAMES.get(cls_idx, str(cls_idx))}") fig.legend(handles=[ mpatches.Patch(facecolor="#e67e22", label="ELO features"), mpatches.Patch(facecolor="#3498db", label="Rolling stats / other"), ], loc="upper right", fontsize=9) plt.suptitle("Top-20 features by mean |SHAP value| per class", y=1.01) plt.tight_layout() plt.show()
Top-20 features by mean |SHAP value| per outcome class
Show code
if _shap_available and"_shap_ok"indir() and _shap_ok: mean_abs_all = np.abs(np.concatenate(_sv_class, axis=0)).mean(axis=0) elo_mask = np.array([_is_elo_feat(n) for n in _feat_names_out]) elo_share = mean_abs_all[elo_mask].sum() stats_share = mean_abs_all[~elo_mask].sum() total = elo_share + stats_share fig, ax = plt.subplots(figsize=(6, 3)) ax.barh( ["ELO features", "Rolling stats + other"], [elo_share / total *100, stats_share / total *100], color=["#e67e22", "#3498db"], ) ax.set_xlabel("Share of total mean |SHAP| (%)") ax.set_title("Feature group importance (SHAP)")for bar, val inzip(ax.patches, [elo_share / total *100, stats_share / total *100]): ax.text(val +0.3, bar.get_y() + bar.get_height() /2,f"{val:.1f}%", va="center", fontsize=10, fontweight="bold") plt.tight_layout() plt.show() display(Markdown(f"""**Why do rolling stats add little given ELO?**| Feature group | Share of mean |SHAP| ||---|---|| ELO features | {elo_share / total *100:.1f}% || Rolling stats + categorical | {stats_share / total *100:.1f}% |> ELO dominates feature attribution across all three outcome classes because it> already encodes long-run team strength. Rolling window statistics contribute a> non-zero share — they are *used* by the model — but carry redundant signal when> ELO is present, which is why ablation studies show minimal marginal gain."""))
ELO vs rolling stats: share of total SHAP importance
Why do rolling stats add little given ELO?
Feature group
Share of mean
ELO features
29.2%
Rolling stats + categorical
70.8%
ELO dominates feature attribution across all three outcome classes because it already encodes long-run team strength. Rolling window statistics contribute a non-zero share — they are used by the model — but carry redundant signal when ELO is present, which is why ablation studies show minimal marginal gain.