282 lines
11 KiB
Python
282 lines
11 KiB
Python
import math
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
from scipy.optimize import fsolve # 导入 fsolve 函数用于数值求解
|
|
from scipy.special import logit as sp_logit
|
|
from sklearn.linear_model import LogisticRegression
|
|
from sklearn.metrics import log_loss
|
|
|
|
|
|
def moneyline_to_prob(moneyline_odds: int) -> float:
    """Convert American (moneyline) odds into the implied win probability.

    Positive odds ``+X`` map to ``100 / (X + 100)`` and negative odds ``-X``
    map to ``X / (X + 100)``.

    Args:
        moneyline_odds: The moneyline price; must be non-zero.

    Returns:
        The implied probability as a float.

    Raises:
        ValueError: If ``moneyline_odds`` is 0.
    """
    if moneyline_odds == 0:
        raise ValueError("Moneyline odds cannot be 0")

    if moneyline_odds > 0:
        # Positive line +X: implied probability = 100 / (100 + X).
        return 100 / (moneyline_odds + 100)

    # Negative line -X: implied probability = X / (X + 100).
    stake = abs(moneyline_odds)
    return stake / (stake + 100)
|
|
|
|
|
|
def prob_to_moneyline(probability: float) -> float:
    """Convert a win probability into American (moneyline) odds.

    The result is a float rounded to two decimal places (not an integer).

    Args:
        probability: Win probability, expected in the closed interval [0, 1].

    Returns:
        A positive line ``(1 / p - 1) * 100`` when ``p <= 0.5`` and a negative
        line ``-100 / (1 / p - 1)`` when ``p > 0.5``. The degenerate inputs
        return the mathematical limits: ``float("inf")`` for probability 0
        and ``float("-inf")`` for probability 1.

    Raises:
        ValueError: If ``probability`` is outside [0, 1] (or is NaN).
    """
    if not 0 < probability < 1:
        # With the default tolerances (abs_tol=0) this only matches exact 0.
        if math.isclose(probability, 0):
            return float("inf")
        # Tolerates tiny float overshoot above 1 (e.g. from normalisation).
        # A certain outcome has an unboundedly negative line; -100 would be
        # wrong here because -100 is the even-money price (p = 0.5).
        if math.isclose(probability, 1):
            return float("-inf")
        raise ValueError("Probability must be between 0 and 1 (exclusive)")

    # Decimal odds are 1 / p, so (1 / p - 1) is the net payout per unit staked.
    net_payout = 1 / probability - 1
    if probability <= 0.5:
        # Underdog / even money: the line is the profit on a 100 stake.
        return round(net_payout * 100, 2)
    # Favourite: the line is the (negated) stake needed to win 100.
    return round(-100 / net_payout, 2)
|
|
|
|
|
|
def calculate_no_vig_moneyline_multir(moneyline_odds_list: list[int]) -> list[int]:
    """Remove the vig from a set of moneyline odds by multiplicative rescaling.

    Steps:
        1. Convert every moneyline price to its implied probability
           (these still contain the vig).
        2. Sum the implied probabilities; with vig the sum is usually > 1.
        3. Divide each probability by that sum so they add up to 1.
        4. Convert each rescaled probability back to a moneyline price.

    Example:
        Input ``[+120, -150]`` gives implied probabilities
        ``[100/220, 150/250] = [0.4545, 0.6]`` with sum ``1.0545``; the
        no-vig probabilities are ``[0.4545/1.0545, 0.6/1.0545]``, which are
        then converted back to moneyline prices.

    Args:
        moneyline_odds_list: Original moneyline prices, one per outcome.

    Returns:
        The no-vig moneyline prices in the same order, as produced by
        ``prob_to_moneyline``. An empty input yields an empty list.
    """
    if not moneyline_odds_list:
        return []

    # Implied probabilities, vig included.
    raw_probs = [moneyline_to_prob(line) for line in moneyline_odds_list]

    # Total booked probability; anything above 1 is the bookmaker's margin.
    overround = sum(raw_probs)

    # Normalise each probability by the total and convert back to a price.
    return [prob_to_moneyline(raw / overround) for raw in raw_probs]
|
|
|
|
|
|
def calculate_no_vig_moneyline_power(moneyline_odds_list: list[int]) -> list[int]:
    """Remove the vig from a set of moneyline odds using the power method.

    The method looks for an exponent ``k`` such that
    ``sum(implied_prob ** k) == 1`` and uses ``implied_prob ** k`` as the
    no-vig probability of each outcome.

    Args:
        moneyline_odds_list: Moneyline prices covering every possible outcome
            (for example ``[+116, -156]``).

    Returns:
        The no-vig moneyline prices in the same order, as produced by
        ``prob_to_moneyline``. An empty input yields an empty list. If the
        implied probabilities already sum to 1 or less, a warning is printed
        and the input list itself is returned unchanged.

    Raises:
        ValueError: If any implied probability is not strictly positive.
    """
    if not moneyline_odds_list:
        return []

    # Step 1: implied probabilities (vig included).
    raw_probs = [moneyline_to_prob(line) for line in moneyline_odds_list]

    # Powers are only meaningful here for strictly positive probabilities.
    if any(prob <= 0 for prob in raw_probs):
        raise ValueError("All implied probabilities must be positive.")

    # No margin to remove when the book sums to 1 or less.
    if sum(raw_probs) <= 1:
        print(
            "Warning: Input odds already have little or no vig. Returning original odds."
        )
        return moneyline_odds_list

    # Step 2: residual f(k) = sum(p_i ** k) - 1, whose root is the exponent.
    # At k = 1 the sum exceeds 1 and it shrinks as k grows (each p_i < 1),
    # so the root is expected above 1.
    def overround_residual(exponent):
        # fsolve calls this with a NumPy array; the power broadcasts over it.
        return sum(prob**exponent for prob in raw_probs) - 1

    # Step 3: solve numerically, starting slightly above 1. fsolve takes an
    # array-like initial guess and returns an array, hence the [0].
    exponent = fsolve(overround_residual, [1.1])[0]

    # Step 4: no-vig probabilities are the implied ones raised to the exponent.
    fair_probs = [prob**exponent for prob in raw_probs]

    # The root definition implies a total of 1; report any numerical drift
    # but do not re-normalise, to stay faithful to the method.
    fair_total = sum(fair_probs)
    if not math.isclose(fair_total, 1.0, abs_tol=1e-9):
        print(
            f"Warning: Final no-vig probabilities sum to {fair_total:.6f}, expected 1.0. Sum may need slight re-normalization."
        )

    # Step 5: back to moneyline prices.
    return [prob_to_moneyline(prob) for prob in fair_probs]
|
|
|
|
|
|
def compute_metrics(
    df: pd.DataFrame,
    n_bins: int = 10,
    bin_strategy: str = "uniform",  # 'uniform' or 'quantile'
    include_draws: bool = True,
    eps: float = 1e-6,
) -> dict:
    """Compute forecast-quality metrics and fit a logistic calibration line.

    Args:
        df: Frame with at least the columns ``win_prob`` (predicted win
            probability) and ``res`` (``'won'``, ``'refunded'`` or
            ``'lost'``). Rows with any other ``res`` value are dropped.
        n_bins: Number of bins used for the expected calibration error (ECE).
        bin_strategy: ``'quantile'`` bins on percentile edges of the
            predictions; any other value uses equal-width bins on [0, 1].
        include_draws: If True, ``'refunded'`` rows are kept and labelled as
            non-wins (y = 0). If False, ``'refunded'`` rows are dropped.
        eps: Probabilities are clipped to ``[eps, 1 - eps]`` before every
            metric (including Brier score and accuracy) for numerical
            stability.

    Returns:
        A dict with the keys ``logloss``, ``brier``, ``ece``, ``accuracy``,
        ``reg_alpha``, ``reg_beta`` and ``n_samples``. ``reg_alpha`` and
        ``reg_beta`` are the intercept and slope of
        ``logit(E[y]) = alpha + beta * logit(p)``; they are NaN when the
        labels contain fewer than two classes, because the regression cannot
        be fitted. When no rows remain after filtering, every metric is NaN
        and ``n_samples`` is 0. Per-bin ECE statistics are not returned.
    """
    nan = float("nan")

    # Keep only the recognised outcomes; 'refunded' is optional.
    kept_results = ["won", "refunded", "lost"] if include_draws else ["won", "lost"]
    df = df[df["res"].isin(kept_results)].copy()

    # Labels: won = 1, everything else (including refunded) = 0.
    y = df["res"].map({"won": 1, "refunded": 0, "lost": 0}).astype(int).values
    p = df["win_prob"].astype(float).values
    total = len(y)

    if total == 0:
        # Nothing to score: report NaN metrics instead of failing in the fit.
        return {
            "logloss": nan,
            "brier": nan,
            "ece": nan,
            "accuracy": nan,
            "reg_alpha": nan,
            "reg_beta": nan,
            "n_samples": 0,
        }

    # Clip so that log() and logit() stay finite.
    p_clip = np.clip(p, eps, 1 - eps)

    # Log loss: prefer sklearn's implementation, fall back to the closed form
    # if it fails for any reason (deliberate best-effort).
    try:
        logloss = float(log_loss(y, p_clip, labels=[0, 1]))
    except Exception:
        logloss = float(-np.mean(y * np.log(p_clip) + (1 - y) * np.log(1 - p_clip)))

    # Brier score.
    brier = float(np.mean((p_clip - y) ** 2))

    def _uniform_bins():
        # Equal-width bins on [0, 1]; the top edge falls into the last bin.
        return np.minimum((p_clip * n_bins).astype(int), n_bins - 1)

    # Bin assignment for the ECE.
    if bin_strategy == "quantile":
        try:
            edges = np.unique(np.percentile(p_clip, np.linspace(0, 100, n_bins + 1)))
            if len(edges) > 1:
                # Index of the interval each prediction falls into, clamped so
                # the maximum value lands in the last interval.
                bin_idxs = np.clip(
                    np.searchsorted(edges, p_clip, side="right") - 1, 0, len(edges) - 2
                )
            else:
                # All predictions identical: no usable quantile edges.
                bin_idxs = _uniform_bins()
        except Exception:
            # Best-effort: any failure in quantile binning uses uniform bins.
            bin_idxs = _uniform_bins()
    else:
        bin_idxs = _uniform_bins()

    # ECE: count-weighted mean |mean prediction - empirical frequency| per bin.
    weighted_gap = 0.0
    for b in range(n_bins):
        in_bin = bin_idxs == b
        count = int(in_bin.sum())
        if count == 0:
            continue
        mean_pred = float(p_clip[in_bin].mean())
        emp_freq = float(y[in_bin].mean())
        weighted_gap += abs(mean_pred - emp_freq) * count
    ece = float(weighted_gap / total)

    # Accuracy of the thresholded prediction (win predicted when p >= 0.5).
    acc = float(np.mean((p_clip >= 0.5) == (y == 1)))

    # Calibration fit: logit(E[y]) = alpha + beta * logit(p). A large C makes
    # the fit effectively unregularised. The fit needs both classes present.
    if np.unique(y).size < 2:
        alpha = nan
        beta = nan
    else:
        X = sp_logit(p_clip).reshape(-1, 1)
        clf = LogisticRegression(C=1e6, solver="lbfgs", max_iter=200)
        clf.fit(X, y)
        alpha = float(clf.intercept_[0])
        beta = float(clf.coef_[0][0])

    return {
        "logloss": logloss,
        "brier": brier,
        "ece": ece,
        "accuracy": acc,
        "reg_alpha": alpha,
        "reg_beta": beta,
        "n_samples": int(total),
    }
|
|
|
|
|
|
# Example usage
if __name__ == "__main__":
    # Implied probability for a few sample moneyline prices.
    for sample_line in [+150, -200, +300, -120]:
        print(f"赔率 {sample_line}: 概率 {moneyline_to_prob(sample_line):.4f}")

    # Remove the vig from a two-way market with the power method.
    vig_lines = [+116, -156]
    fair_lines = calculate_no_vig_moneyline_power(vig_lines)

    print(f"原始 Moneyline 赔率: {vig_lines}")
    print(f"无 Vig Moneyline 赔率 (Power Method): {fair_lines}")

    # Optional check: the probabilities implied by the no-vig prices should
    # sum to approximately 1.
    if fair_lines:
        round_trip_total = sum(moneyline_to_prob(line) for line in fair_lines)
        print(f"无 Vig 概率之和 (基于计算出的赔率): {round_trip_total:.6f}")
|