metrics函数移动到bet tools

2025-10-25 22:17:28 +08:00
parent 22bede007e
commit 332284293f
3 changed files with 117 additions and 137 deletions
--- a/common/bet_tools.py
+++ b/common/bet_tools.py
@@ -1,5 +1,11 @@
 import math
+
+import numpy as np
+import pandas as pd
 from scipy.optimize import fsolve  # 导入 fsolve 函数用于数值求解
+from scipy.special import logit as sp_logit
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import log_loss


 def moneyline_to_prob(moneyline_odds: int) -> float:
@@ -154,6 +160,107 @@ def calculate_no_vig_moneyline_power(moneyline_odds_list: list[int]) -> list[int
    return no_vig_moneyline_odds


+def compute_metrics(
+    df: pd.DataFrame,
+    n_bins: int = 10,
+    bin_strategy: str = "uniform",  # 'uniform' or 'quantile'
+    include_draws: bool = True,
+    eps: float = 1e-6,
+) -> dict:
+    """Compute prediction-quality metrics and fit a calibration regression.
+
+    Args:
+        df: Needs columns 'win_prob' (predicted win probability) and 'res';
+            only rows whose 'res' is 'won', 'refunded' or 'lost' are scored.
+        n_bins: Number of bins used for the ECE.
+        bin_strategy: 'quantile' (equal-frequency); otherwise equal-width.
+        include_draws: If True, 'refunded' rows are kept and counted as
+            non-wins (y=0). If False, 'refunded' rows are dropped.
+        eps: Probabilities are clipped to [eps, 1 - eps] for stability.
+
+    Returns:
+        dict with keys logloss, brier, ece, accuracy, reg_alpha, reg_beta and
+        n_samples. reg_alpha / reg_beta are NaN when the labels hold fewer
+        than two classes (including empty input), as no fit is possible.
+    """
+    # Select the rows to score; 'refunded' is kept only when requested.
+    if include_draws:
+        mask = df["res"].isin(["won", "refunded", "lost"])
+    else:
+        mask = df["res"].isin(["won", "lost"])
+    df = df[mask].copy()
+
+    # Labels: won=1, everything else (including refunded)=0.
+    y = df["res"].map({"won": 1, "refunded": 0, "lost": 0}).astype(int).values
+    p = df["win_prob"].astype(float).values
+
+    # Clip probabilities so log() and logit() stay finite.
+    p_clip = np.clip(p, eps, 1 - eps)
+
+    # Log loss via sklearn; fall back to the direct formula if it raises.
+    try:
+        logloss = float(log_loss(y, p_clip, labels=[0, 1]))
+    except Exception:
+        logloss = float(-np.mean(y * np.log(p_clip) + (1 - y) * np.log(1 - p_clip)))
+
+    # Brier score (computed on the clipped probabilities).
+    brier = float(np.mean((p_clip - y) ** 2))
+
+    # ECE bin assignment ('uniform' or 'quantile').
+    if bin_strategy == "quantile":
+        # Quantile bin edges; duplicate edges are merged by np.unique.
+        try:
+            edges = np.unique(np.percentile(p_clip, np.linspace(0, 100, n_bins + 1)))
+            if len(edges) - 1 <= 0:
+                # No usable quantile bins: fall back to uniform bins.
+                bin_idxs = np.minimum((p_clip * n_bins).astype(int), n_bins - 1)
+            else:
+                bin_idxs = np.clip(
+                    np.searchsorted(edges, p_clip, side="right") - 1, 0, len(edges) - 2
+                )
+        except Exception:
+            bin_idxs = np.minimum((p_clip * n_bins).astype(int), n_bins - 1)
+    else:
+        bin_idxs = np.minimum((p_clip * n_bins).astype(int), n_bins - 1)
+
+    # ECE: count-weighted mean of |mean prediction - empirical frequency|.
+    ece = 0.0
+    total = len(y)
+    for b in range(n_bins):
+        idx = bin_idxs == b
+        count = int(idx.sum())
+        if count == 0:
+            continue  # empty bins contribute nothing
+        mean_pred = float(p_clip[idx].mean())
+        emp_freq = float(y[idx].mean())
+        ece += abs(mean_pred - emp_freq) * count
+    ece = float(ece / total) if total > 0 else float("nan")
+
+    # Accuracy at a 0.5 decision threshold.
+    acc = float(np.mean((p_clip >= 0.5) == (y == 1)))
+
+    # Calibration fit: logit(E[y]) = alpha + beta * logit(p). The fit needs
+    # both classes present, so report NaN coefficients instead of raising.
+    if np.unique(y).size < 2:
+        alpha = beta = float("nan")
+    else:
+        X = sp_logit(p_clip).reshape(-1, 1)
+        clf = LogisticRegression(C=1e6, solver="lbfgs", max_iter=200)
+        clf.fit(X, y)
+        alpha = float(clf.intercept_[0])
+        beta = float(clf.coef_[0][0])
+
+    return {
+        "logloss": logloss,
+        "brier": brier,
+        "ece": ece,
+        "accuracy": acc,
+        "reg_alpha": alpha,
+        "reg_beta": beta,
+        "n_samples": int(total),
+    }
+
+
 # 示例
 if __name__ == "__main__":
    odds_list = [+150, -200, +300, -120]