格式化代码
This commit is contained in:
@@ -5,11 +5,13 @@ from sklearn.linear_model import LogisticRegression
|
|||||||
from sklearn.metrics import log_loss
|
from sklearn.metrics import log_loss
|
||||||
|
|
||||||
|
|
||||||
def compute_metrics(df: pd.DataFrame,
|
def compute_metrics(
|
||||||
|
df: pd.DataFrame,
|
||||||
n_bins: int = 10,
|
n_bins: int = 10,
|
||||||
bin_strategy: str = 'uniform', # 'uniform' or 'quantile'
|
bin_strategy: str = "uniform", # 'uniform' or 'quantile'
|
||||||
include_draws: bool = True,
|
include_draws: bool = True,
|
||||||
eps: float = 1e-6) -> dict:
|
eps: float = 1e-6,
|
||||||
|
) -> dict:
|
||||||
"""
|
"""
|
||||||
计算预测评估指标并拟合校准关系。
|
计算预测评估指标并拟合校准关系。
|
||||||
|
|
||||||
@@ -25,14 +27,14 @@ def compute_metrics(df: pd.DataFrame,
|
|||||||
"""
|
"""
|
||||||
# 处理 refunded
|
# 处理 refunded
|
||||||
if include_draws:
|
if include_draws:
|
||||||
mask = df['res'].isin(['won', 'refunded', 'lost'])
|
mask = df["res"].isin(["won", "refunded", "lost"])
|
||||||
else:
|
else:
|
||||||
mask = df['res'].isin(['won', 'lost'])
|
mask = df["res"].isin(["won", "lost"])
|
||||||
df = df[mask].copy()
|
df = df[mask].copy()
|
||||||
|
|
||||||
# 标签: won=1, others=0 (包括 refunded)
|
# 标签: won=1, others=0 (包括 refunded)
|
||||||
y = df['res'].map({'won': 1, 'refunded': 0, 'lost': 0}).astype(int).values
|
y = df["res"].map({"won": 1, "refunded": 0, "lost": 0}).astype(int).values
|
||||||
p = df['win_prob'].astype(float).values
|
p = df["win_prob"].astype(float).values
|
||||||
|
|
||||||
# 裁剪概率以保证数值稳定
|
# 裁剪概率以保证数值稳定
|
||||||
p_clip = np.clip(p, eps, 1 - eps)
|
p_clip = np.clip(p, eps, 1 - eps)
|
||||||
@@ -48,7 +50,7 @@ def compute_metrics(df: pd.DataFrame,
|
|||||||
brier = float(np.mean((p_clip - y) ** 2))
|
brier = float(np.mean((p_clip - y) ** 2))
|
||||||
|
|
||||||
# ECE 计算(支持 uniform 或 quantile)
|
# ECE 计算(支持 uniform 或 quantile)
|
||||||
if bin_strategy == 'quantile':
|
if bin_strategy == "quantile":
|
||||||
# quantile bin edges
|
# quantile bin edges
|
||||||
try:
|
try:
|
||||||
edges = np.unique(np.percentile(p_clip, np.linspace(0, 100, n_bins + 1)))
|
edges = np.unique(np.percentile(p_clip, np.linspace(0, 100, n_bins + 1)))
|
||||||
@@ -57,7 +59,9 @@ def compute_metrics(df: pd.DataFrame,
|
|||||||
bin_idxs = np.minimum((p_clip * n_bins).astype(int), n_bins - 1)
|
bin_idxs = np.minimum((p_clip * n_bins).astype(int), n_bins - 1)
|
||||||
else:
|
else:
|
||||||
# searchsorted to assign bins
|
# searchsorted to assign bins
|
||||||
bin_idxs = np.clip(np.searchsorted(edges, p_clip, side='right') - 1, 0, len(edges) - 2)
|
bin_idxs = np.clip(
|
||||||
|
np.searchsorted(edges, p_clip, side="right") - 1, 0, len(edges) - 2
|
||||||
|
)
|
||||||
except Exception:
|
except Exception:
|
||||||
bin_idxs = np.minimum((p_clip * n_bins).astype(int), n_bins - 1)
|
bin_idxs = np.minimum((p_clip * n_bins).astype(int), n_bins - 1)
|
||||||
else:
|
else:
|
||||||
@@ -70,39 +74,40 @@ def compute_metrics(df: pd.DataFrame,
|
|||||||
idx = bin_idxs == b
|
idx = bin_idxs == b
|
||||||
count = int(idx.sum())
|
count = int(idx.sum())
|
||||||
if count == 0:
|
if count == 0:
|
||||||
bin_stats.append({'count': 0, 'mean_pred': float('nan'), 'emp_freq': float('nan')})
|
bin_stats.append(
|
||||||
|
{"count": 0, "mean_pred": float("nan"), "emp_freq": float("nan")}
|
||||||
|
)
|
||||||
continue
|
continue
|
||||||
mean_pred = float(p_clip[idx].mean())
|
mean_pred = float(p_clip[idx].mean())
|
||||||
emp_freq = float(y[idx].mean())
|
emp_freq = float(y[idx].mean())
|
||||||
ece += abs(mean_pred - emp_freq) * count
|
ece += abs(mean_pred - emp_freq) * count
|
||||||
bin_stats.append({'count': count, 'mean_pred': mean_pred, 'emp_freq': emp_freq})
|
bin_stats.append({"count": count, "mean_pred": mean_pred, "emp_freq": emp_freq})
|
||||||
ece = float(ece / total) if total > 0 else float('nan')
|
ece = float(ece / total) if total > 0 else float("nan")
|
||||||
|
|
||||||
# accuracy
|
# accuracy
|
||||||
acc = float(np.mean((p_clip >= 0.5) == (y == 1)))
|
acc = float(np.mean((p_clip >= 0.5) == (y == 1)))
|
||||||
|
|
||||||
# 校准拟合: 使用 LogisticRegression 拟合 logit(E[y]) = alpha + beta * logit(p)
|
# 校准拟合: 使用 LogisticRegression 拟合 logit(E[y]) = alpha + beta * logit(p)
|
||||||
X = sp_logit(p_clip).reshape(-1, 1)
|
X = sp_logit(p_clip).reshape(-1, 1)
|
||||||
clf = LogisticRegression(C=1e6, solver='lbfgs', max_iter=200)
|
clf = LogisticRegression(C=1e6, solver="lbfgs", max_iter=200)
|
||||||
clf.fit(X, y)
|
clf.fit(X, y)
|
||||||
alpha = float(clf.intercept_[0])
|
alpha = float(clf.intercept_[0])
|
||||||
beta = float(clf.coef_[0][0])
|
beta = float(clf.coef_[0][0])
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'logloss': logloss,
|
"logloss": logloss,
|
||||||
'brier': brier,
|
"brier": brier,
|
||||||
'ece': ece,
|
"ece": ece,
|
||||||
'accuracy': acc,
|
"accuracy": acc,
|
||||||
'reg_alpha': alpha,
|
"reg_alpha": alpha,
|
||||||
'reg_beta': beta,
|
"reg_beta": beta,
|
||||||
# 'ece_bins': bin_stats,
|
# 'ece_bins': bin_stats,
|
||||||
'n_samples': int(total)
|
"n_samples": int(total),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
if __name__ == '__main__':
|
|
||||||
df = pd.read_feather("data/p_res.feather")
|
df = pd.read_feather("data/p_res.feather")
|
||||||
df['win_prob'] = df['power_p']
|
df["win_prob"] = df["power_p"]
|
||||||
res = compute_metrics(df)
|
res = compute_metrics(df)
|
||||||
print(res)
|
print(res)
|
||||||
|
|||||||
Reference in New Issue
Block a user