Files
factorhack/factor_mining/validator.py
2025-11-09 14:00:58 +08:00

238 lines
6.1 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
因子有效性检验模块:整合所有检验方案
"""
import numpy as np
import pandas as pd
from typing import Dict, List, Optional
from dataclasses import dataclass
from statsmodels.regression.linear_model import OLS
from validation import (
compute_ic,
compute_rolling_ic,
group_backtest,
factor_span_regression
)
@dataclass
class ValidationConfig:
    """Thresholds and window settings consumed by ``FactorValidator``."""

    # Passed as ``window`` to the rolling-IC computation.
    ic_window: int = 30
    # Correlation flavour for the IC: "spearman" or "pearson".
    ic_method: str = "spearman"
    # Passed as ``n_groups`` to the group backtest.
    n_groups: int = 3
    # Passed as ``group_period`` to the group backtest.
    group_period: int = 180
    # Minimum |mean IC| for the IC check to pass.
    min_ic: float = 0.01
    # Minimum |t-stat| for the group-backtest and regression checks to pass.
    min_tstat: float = 1.5
    # Minimum ``r2_change`` for the regression check to pass.
    min_r2_change: float = 0.05
class FactorValidator:
    """Run IC, group-backtest and regression checks on a factor.

    Thresholds and window sizes are read from ``self.config``; the statistics
    themselves are computed by the helpers imported from ``validation``.
    """

    def __init__(self, config: Optional["ValidationConfig"] = None):
        """Store the configuration.

        Parameters
        ----------
        config : ValidationConfig, optional
            Thresholds and window parameters. When omitted, a default
            ``ValidationConfig()`` is created.
        """
        self.config = config if config is not None else ValidationConfig()

    def validate_ic(
        self,
        factor: pd.Series,
        forward_return: pd.Series
    ) -> Dict:
        """Information-coefficient (IC) check.

        Parameters
        ----------
        factor : Series
            Factor under test.
        forward_return : Series
            Forward returns.

        Returns
        -------
        dict
            ``mean_ic``, ``ic_std``, ``ic_ir`` (mean IC divided by its std),
            ``ic_series`` (the rolling IC returned by ``compute_rolling_ic``)
            and ``is_valid`` (``True`` when ``|mean_ic| >= config.min_ic``).
        """
        rolling_ic = compute_rolling_ic(
            factor,
            forward_return,
            window=self.config.ic_window,
            method=self.config.ic_method
        )
        mean_ic = rolling_ic.mean()
        ic_std = rolling_ic.std()
        # The epsilon avoids a division by zero when the IC series is constant.
        ic_ir = mean_ic / (ic_std + 1e-8)  # IC information ratio
        return {
            "mean_ic": mean_ic,
            "ic_std": ic_std,
            "ic_ir": ic_ir,
            "ic_series": rolling_ic,
            # bool() yields a plain Python bool instead of a NumPy boolean;
            # a NaN mean IC compares False and therefore fails the check.
            "is_valid": bool(abs(mean_ic) >= self.config.min_ic)
        }

    def validate_group_backtest(
        self,
        factor: pd.Series,
        forward_return: pd.Series
    ) -> Dict:
        """Group-backtest check.

        Returns
        -------
        dict
            Everything returned by ``group_backtest`` plus ``is_valid``, which
            is ``True`` when ``|mean_h_l_tstat| >= config.min_tstat``. A missing
            ``mean_h_l_tstat`` key is treated as 0.
        """
        result = group_backtest(
            factor,
            forward_return,
            n_groups=self.config.n_groups,
            group_period=self.config.group_period
        )
        is_valid = bool(
            abs(result.get('mean_h_l_tstat', 0)) >= self.config.min_tstat
        )
        return {
            **result,
            "is_valid": is_valid
        }

    def validate_regression(
        self,
        factor: pd.Series,
        forward_return: pd.Series,
        other_factors: Optional[pd.DataFrame] = None
    ) -> Dict:
        """Factor-spanning regression check.

        Parameters
        ----------
        factor : Series
            Factor under test.
        forward_return : Series
            Forward returns.
        other_factors : DataFrame, optional
            Other factors used as control variables.

        Returns
        -------
        dict
            Everything returned by ``factor_span_regression`` plus ``is_valid``,
            which requires both ``|tstat| >= config.min_tstat`` and
            ``r2_change >= config.min_r2_change``. Missing keys count as 0.
        """
        if other_factors is None:
            other_factors = pd.DataFrame()
        # Choose a label for the tested factor that cannot collide with a
        # control column; duplicate labels would make the target ambiguous.
        target_name = 'target'
        while target_name in other_factors.columns:
            target_name = f'_{target_name}'
        # Combine the controls and the tested factor column-wise.
        factors_df = pd.concat(
            [other_factors, factor.to_frame(name=target_name)],
            axis=1
        )
        result = factor_span_regression(
            factors_df,
            forward_return,
            target_factor=target_name
        )
        is_valid = bool(
            abs(result.get('tstat', 0)) >= self.config.min_tstat and
            result.get('r2_change', 0) >= self.config.min_r2_change
        )
        return {
            **result,
            "is_valid": is_valid
        }

    def validate_all(
        self,
        factor: pd.Series,
        forward_return: pd.Series,
        other_factors: Optional[pd.DataFrame] = None
    ) -> Dict:
        """Run every check and combine the outcomes.

        Returns
        -------
        dict
            ``ic``, ``group_backtest`` and ``regression`` hold the individual
            results; ``is_valid`` is ``True`` only when all three checks pass;
            ``score`` is the weighted score from ``_calculate_score``.
        """
        ic_result = self.validate_ic(factor, forward_return)
        group_result = self.validate_group_backtest(factor, forward_return)
        reg_result = self.validate_regression(
            factor, forward_return, other_factors
        )
        return {
            'ic': ic_result,
            'group_backtest': group_result,
            'regression': reg_result,
            # Overall verdict: every individual check has to pass.
            'is_valid': bool(
                ic_result['is_valid'] and
                group_result['is_valid'] and
                reg_result['is_valid']
            ),
            'score': self._calculate_score(ic_result, group_result, reg_result)
        }

    @staticmethod
    def _nan_to_zero(value) -> float:
        """Return ``value`` as a float, mapping None/NaN/non-numeric to 0.0."""
        try:
            number = float(value)
        except (TypeError, ValueError):
            return 0.0
        return 0.0 if np.isnan(number) else number

    def _calculate_score(
        self,
        ic_result: Dict,
        group_result: Dict,
        reg_result: Dict
    ) -> float:
        """Compute the weighted composite score.

        The score is ``0.3 * ic + 0.4 * tstat + 0.3 * r2`` where

        * ``ic`` is ``|mean_ic| * 10``,
        * ``tstat`` is ``|mean_h_l_tstat| / 3`` capped at 1,
        * ``r2`` is ``r2_change / 0.1`` clamped to ``[0, 1]``.

        Missing, ``None`` or NaN statistics contribute 0 so that one
        unavailable statistic cannot turn the whole score into NaN.
        """
        mean_ic = self._nan_to_zero(ic_result.get('mean_ic', 0))
        tstat = self._nan_to_zero(group_result.get('mean_h_l_tstat', 0))
        r2_change = self._nan_to_zero(reg_result.get('r2_change', 0))

        # IC component, weight 0.3.
        ic_score = abs(mean_ic) * 10
        # Group-backtest component, weight 0.4, normalised to [0, 1].
        tstat_score = min(abs(tstat) / 3.0, 1.0)
        # Regression component, weight 0.3; the lower clamp keeps a negative
        # r2_change from producing a negative component.
        r2_score = min(max(r2_change / 0.1, 0.0), 1.0)
        return ic_score * 0.3 + tstat_score * 0.4 + r2_score * 0.3

    def filter_factors(
        self,
        factors: pd.DataFrame,
        forward_return: pd.Series
    ) -> pd.DataFrame:
        """Keep only the factor columns that pass every check.

        Each column is validated with all remaining columns supplied as
        control factors.

        Returns
        -------
        DataFrame
            The valid columns of ``factors``, or an empty ``DataFrame`` when
            no column passes.
        """
        valid_factors = [
            col
            for col in factors.columns
            if self.validate_all(
                factors[col], forward_return, factors.drop(columns=[col])
            )['is_valid']
        ]
        return factors[valid_factors] if valid_factors else pd.DataFrame()
def create_validator(
    ic_window: int = 30,
    min_ic: float = 0.01,
    min_tstat: float = 1.5,
    **config_overrides
) -> FactorValidator:
    """Build a ``FactorValidator`` (convenience function).

    Parameters
    ----------
    ic_window : int
        Forwarded to ``ValidationConfig.ic_window``.
    min_ic : float
        Forwarded to ``ValidationConfig.min_ic``.
    min_tstat : float
        Forwarded to ``ValidationConfig.min_tstat``.
    **config_overrides
        Any other ``ValidationConfig`` field (``ic_method``, ``n_groups``,
        ``group_period``, ``min_r2_change``) given by keyword. An unknown
        name raises ``TypeError`` from the ``ValidationConfig`` constructor.

    Returns
    -------
    FactorValidator
        Validator configured with the given settings.
    """
    config = ValidationConfig(
        ic_window=ic_window,
        min_ic=min_ic,
        min_tstat=min_tstat,
        **config_overrides
    )
    return FactorValidator(config)