factor miner

This commit is contained in:
2025-11-09 14:00:58 +08:00
parent a66e42a8ae
commit dc3d41d6e5
5 changed files with 1072 additions and 0 deletions

236
factor_mining/gp_miner.py Normal file
View File

@@ -0,0 +1,236 @@
"""
DEAP遗传编程挖掘器实现
"""
import random
import operator
from typing import List, Tuple, Optional
from dataclasses import dataclass
import numpy as np
import pandas as pd
from deap import algorithms, base, creator, gp, tools
from factor_mining.operators import FactorFormula, get_registry, get_operator
from factor_mining.mining import FactorMiner, MiningConfig
from data import compute_forward_returns
@dataclass
class GPConfig(MiningConfig):
    """Hyper-parameters for the DEAP genetic-programming miner.

    Adds the GP-specific settings on top of the fields inherited from
    ``MiningConfig``. Every field has a default.
    """

    # Evolution schedule.
    population_size: int = 200
    generations: int = 30

    # Selection and variation.
    tournament_size: int = 5
    crossover_prob: float = 0.9
    mutation_prob: float = 0.05
    elitism: int = 5

    # Tree-shape limits: depth of the initial trees and the hard depth cap.
    max_depth_init: int = 1
    max_depth: int = 8

    # Amount subtracted from the fitness for every node in the tree.
    complexity_penalty: float = 0.001
class GPMiner(FactorMiner):
    """Factor miner that evolves expression trees with DEAP genetic programming.

    Construct it with a ``GPConfig`` and call :meth:`mine`; the best
    individuals of the run are returned as ``FactorFormula`` objects.
    """

    # Fitness handed to individuals that cannot be compiled or evaluated.
    _FAILURE_FITNESS = -1e6

    def __init__(self, config: GPConfig):
        super().__init__(config)
        self.config: GPConfig = config
        # The three attributes below are populated by mine().
        self.toolbox: Optional[base.Toolbox] = None
        self.pset: Optional[gp.PrimitiveSetTyped] = None
        self.features: Optional[List[pd.Series]] = None

    def get_name(self) -> str:
        """Return the name under which this miner is registered."""
        return "gp"

    @staticmethod
    def _random_constant() -> float:
        """Draw one ephemeral constant from U(-2, 2).

        A plain Python float is returned on purpose: DEAP renders ephemeral
        constants with ``repr``, and the repr of a NumPy array
        (``array(...)``) cannot be evaluated by ``gp.compile`` or by
        ``FactorFormula``. Using a stable, importable function (instead of
        a closure re-created per call) also keeps the object identity
        constant between runs, which some DEAP versions check.
        """
        return random.uniform(-2.0, 2.0)

    def _build_pset(self, feature_names: List[str]) -> gp.PrimitiveSetTyped:
        """Build the typed primitive set for the given features.

        Parameters
        ----------
        feature_names : List[str]
            One input argument is created (and named) per feature.

        Returns
        -------
        gp.PrimitiveSetTyped
            Primitive set containing every registered unary/binary operator
            plus a random ephemeral constant.
        """
        registry = get_registry()
        pset = gp.PrimitiveSetTyped(
            "MAIN", [np.ndarray for _ in feature_names], np.ndarray
        )

        # Give the positional arguments the feature names.
        pset.renameArguments(
            **{f"ARG{i}": name for i, name in enumerate(feature_names)}
        )

        # Register operators. Arity is the number of *required* positional
        # parameters: parameters with defaults (such as the ``w=w`` binding
        # used by windowed operators) are configuration, not tree inputs.
        for op_name in registry.list_all():
            op = registry.get(op_name)
            if op is None:
                continue
            required = [
                p
                for p in op.get_signature().parameters.values()
                if p.default is p.empty
                and p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)
            ]
            if len(required) in (1, 2):
                pset.addPrimitive(
                    op.func,
                    [np.ndarray] * len(required),
                    np.ndarray,
                    name=op_name,
                )

        # The declared type stays np.ndarray so the constant can be plugged
        # into any operator slot; NumPy broadcasting handles the float.
        pset.addEphemeralConstant("const", GPMiner._random_constant, np.ndarray)
        return pset

    def _evaluate_individual(
        self,
        individual,
        target: pd.Series
    ) -> Tuple[float]:
        """Score one individual by mean rolling IC minus a size penalty.

        Parameters
        ----------
        individual
            DEAP primitive tree to evaluate.
        target : pd.Series
            Forward returns; its index defines the evaluation sample.

        Returns
        -------
        Tuple[float]
            One-element fitness tuple. ``_FAILURE_FITNESS`` is returned when
            the tree cannot be compiled or evaluated, when the output is not
            a 1-D array aligned with ``target``, or when the IC is not finite.
        """
        failure = (self._FAILURE_FITNESS,)

        idx = target.index
        inputs = [f.reindex(idx).to_numpy() for f in self.features]

        # Evolved expressions are arbitrary, so any failure (including a
        # compile error) only costs the individual its fitness.
        try:
            func = self.toolbox.compile(expr=individual)
            with np.errstate(all="ignore"):
                raw = func(*inputs)
        except Exception:
            return failure

        # A constant-only tree yields a scalar or 0-d array; reject anything
        # that is not a 1-D vector of the expected length.
        if (
            not isinstance(raw, np.ndarray)
            or raw.ndim != 1
            or raw.shape[0] != len(idx)
        ):
            return failure

        factor = pd.Series(raw, index=idx).replace([np.inf, -np.inf], np.nan)
        # NOTE(review): bfill copies later values into leading gaps, which
        # looks like a look-ahead in the factor - confirm this is intended.
        factor = factor.ffill().bfill()

        window = self.config.ic_window
        if len(factor) < window + 10:
            return failure

        from validation import compute_rolling_ic
        ic_series = compute_rolling_ic(
            factor, target, window=window, method=self.config.ic_method
        )
        mean_ic = ic_series.mean()
        if not np.isfinite(mean_ic):
            return failure

        # Complexity penalty: every node in the tree costs a fixed amount.
        fitness = mean_ic - self.config.complexity_penalty * len(individual)
        if not np.isfinite(fitness):
            return failure
        return (fitness,)

    def _individual_to_formula(
        self,
        individual,
        feature_names: List[str]
    ) -> FactorFormula:
        """Convert a GP individual into a ``FactorFormula``.

        ``str(individual)`` is a nested call expression such as
        ``mul(add(close, open), 0.5)``.
        """
        expr_str = str(individual)
        # Presumably the arguments were already renamed in _build_pset, so
        # this is only a fallback. Highest index first, so "ARG1" can never
        # rewrite the prefix of "ARG10".
        for i in reversed(range(len(feature_names))):
            expr_str = expr_str.replace(f"ARG{i}", feature_names[i])
        return FactorFormula(expr_str, feature_names)

    def mine(
        self,
        data: pd.DataFrame,
        feature_cols: List[str],
        price_col: str = "close"
    ) -> List[FactorFormula]:
        """Run the GP search and return the hall-of-fame formulas.

        Parameters
        ----------
        data : pd.DataFrame
            Input data containing ``feature_cols`` and ``price_col``.
        feature_cols : List[str]
            Columns exposed to the evolved expressions as inputs.
        price_col : str
            Column used to compute the forward-return target.

        Returns
        -------
        List[FactorFormula]
            One formula per hall-of-fame individual.
        """
        if self.config.seed is not None:
            random.seed(self.config.seed)
            np.random.seed(self.config.seed)

        # Prepare the target and the feature series.
        price = data[price_col].astype(float)
        target = compute_forward_returns(price, self.config.ret_horizon)
        self.features = [data[c].astype(float) for c in feature_cols]

        self.pset = self._build_pset(feature_cols)

        # DEAP keeps creator classes in a global namespace; create them once.
        if not hasattr(creator, "FitnessMax"):
            creator.create("FitnessMax", base.Fitness, weights=(1.0,))
        if not hasattr(creator, "Individual"):
            creator.create(
                "Individual", gp.PrimitiveTree, fitness=creator.FitnessMax
            )

        toolbox = base.Toolbox()
        self.toolbox = toolbox
        toolbox.register(
            "expr",
            gp.genHalfAndHalf,
            pset=self.pset,
            min_=1,
            # The generator needs max_ >= min_; guard against a config < 1.
            max_=max(1, self.config.max_depth_init),
        )
        toolbox.register(
            "individual", tools.initIterate, creator.Individual, toolbox.expr
        )
        toolbox.register(
            "population", tools.initRepeat, list, toolbox.individual
        )
        toolbox.register("compile", gp.compile, pset=self.pset)
        toolbox.register("evaluate", self._evaluate_individual, target=target)

        # Genetic operators.
        toolbox.register(
            "select", tools.selTournament, tournsize=self.config.tournament_size
        )
        toolbox.register("mate", gp.cxOnePoint)
        toolbox.register("expr_mut", gp.genFull, min_=0, max_=2)
        toolbox.register(
            "mutate", gp.mutUniform, expr=toolbox.expr_mut, pset=self.pset
        )

        # Cap the tree height of offspring produced by both operators.
        for variation in ("mate", "mutate"):
            toolbox.decorate(
                variation,
                gp.staticLimit(
                    key=operator.attrgetter("height"),
                    max_value=self.config.max_depth,
                ),
            )

        # Run the evolution.
        pop = toolbox.population(n=self.config.population_size)
        hof = tools.HallOfFame(maxsize=max(5, self.config.elitism))

        stats_fit = tools.Statistics(lambda ind: ind.fitness.values[0])
        stats_size = tools.Statistics(len)
        mstats = tools.MultiStatistics(fitness=stats_fit, size=stats_size)
        mstats.register("avg", np.nanmean)
        mstats.register("std", np.nanstd)
        mstats.register("min", np.nanmin)
        mstats.register("max", np.nanmax)

        pop, logbook = algorithms.eaSimple(
            pop,
            toolbox,
            cxpb=self.config.crossover_prob,
            mutpb=self.config.mutation_prob,
            ngen=self.config.generations,
            stats=mstats,
            halloffame=hof,
            verbose=True,
        )

        # Convert the best individuals into factor formulas.
        return [
            self._individual_to_formula(individual, feature_cols)
            for individual in hof
        ]

123
factor_mining/mining.py Normal file
View File

@@ -0,0 +1,123 @@
"""
因子挖掘抽象层支持多种挖掘算法DEAP、DL、RL等
"""
from abc import ABC, abstractmethod
from typing import List, Dict, Optional, Any
import pandas as pd
from dataclasses import dataclass
from factor_mining.operators import FactorFormula
@dataclass
class MiningConfig:
    """Base configuration shared by every factor miner."""

    # Horizon used when computing forward returns.
    ret_horizon: int = 1
    # Window length of the rolling IC.
    ic_window: int = 30
    # Correlation method for the IC.
    ic_method: str = "spearman"  # "spearman" or "pearson"
    # Random seed; None leaves the generators unseeded.
    seed: Optional[int] = None
class FactorMiner(ABC):
    """Abstract base class that every factor miner implements."""

    def __init__(self, config: MiningConfig):
        self.config = config

    @abstractmethod
    def mine(
        self,
        data: pd.DataFrame,
        feature_cols: List[str],
        price_col: str = "close"
    ) -> List[FactorFormula]:
        """Mine factors from ``data``.

        Parameters
        ----------
        data : DataFrame
            Input data.
        feature_cols : List[str]
            Names of the feature columns.
        price_col : str
            Name of the price column.

        Returns
        -------
        List[FactorFormula]
            The mined factor formulas.
        """

    @abstractmethod
    def get_name(self) -> str:
        """Return the name of this miner."""
class MiningPipeline:
    """Keeps miners by name and dispatches mining runs to them."""

    def __init__(self):
        self.miners: Dict[str, FactorMiner] = {}

    def register_miner(self, miner: FactorMiner):
        """Add ``miner`` under its own name.

        Raises
        ------
        ValueError
            If a miner with the same name is already registered.
        """
        name = miner.get_name()
        if name not in self.miners:
            self.miners[name] = miner
            return
        raise ValueError(f"挖掘器 '{name}' 已存在")

    def get_miner(self, name: str) -> Optional[FactorMiner]:
        """Return the miner registered as ``name``, or None."""
        return self.miners.get(name)

    def list_miners(self) -> List[str]:
        """Return the names of all registered miners."""
        return [*self.miners]

    def mine(
        self,
        miner_name: str,
        data: pd.DataFrame,
        feature_cols: List[str],
        price_col: str = "close"
    ) -> List[FactorFormula]:
        """Run the miner registered as ``miner_name``.

        Parameters
        ----------
        miner_name : str
            Name of the miner to use.
        data : DataFrame
            Input data.
        feature_cols : List[str]
            Names of the feature columns.
        price_col : str
            Name of the price column.

        Returns
        -------
        List[FactorFormula]
            The mined factor formulas.

        Raises
        ------
        ValueError
            If no miner is registered under ``miner_name``.
        """
        selected = self.get_miner(miner_name)
        if selected is not None:
            return selected.mine(data, feature_cols, price_col)
        raise ValueError(f"挖掘器 '{miner_name}' 不存在")
# Module-wide pipeline instance shared by register_miner() and get_pipeline().
_pipeline = MiningPipeline()
def register_miner(miner: FactorMiner):
    """Register ``miner`` with the module-wide pipeline.

    Delegates to ``MiningPipeline.register_miner`` on the global instance.
    """
    _pipeline.register_miner(miner)
def get_pipeline() -> MiningPipeline:
    """Return the module-wide ``MiningPipeline`` instance."""
    return _pipeline

297
factor_mining/operators.py Normal file
View File

@@ -0,0 +1,297 @@
"""
算子系统:基础数学算子和技术指标算子的注册与管理
支持算子的注册、查询、反射调用
"""
import numpy as np
import pandas as pd
from typing import Dict, Callable, List, Optional, Any
from abc import ABC, abstractmethod
import inspect
class Operator(ABC):
    """A named, described wrapper around an operator function."""

    def __init__(self, name: str, func: Callable, description: str = ""):
        """Store the operator and cache its call signature.

        Parameters
        ----------
        name : str
            Operator name (unique identifier).
        func : Callable
            Function implementing the operator.
        description : str
            Human-readable description.
        """
        self.name, self.func, self.description = name, func, description
        # Cached once so arity can be inspected without re-introspection.
        self._signature = inspect.signature(func)

    def __call__(self, *args, **kwargs):
        """Forward the call to the wrapped function."""
        wrapped = self.func
        return wrapped(*args, **kwargs)

    def get_signature(self):
        """Return the ``inspect.Signature`` of the wrapped function."""
        return self._signature

    def __repr__(self):
        return f"Operator(name='{self.name}', description='{self.description}')"
class OperatorRegistry:
    """Name-keyed collection of ``Operator`` objects."""

    def __init__(self):
        self._operators: Dict[str, Operator] = {}

    def register(self, operator: Operator):
        """Add ``operator``; raise ValueError if its name is already taken."""
        if operator.name not in self._operators:
            self._operators[operator.name] = operator
            return
        raise ValueError(f"算子 '{operator.name}' 已存在")

    def register_function(self, name: str, func: Callable, description: str = ""):
        """Wrap ``func`` in an ``Operator`` and register it."""
        self.register(Operator(name, func, description))

    def get(self, name: str) -> Optional[Operator]:
        """Return the operator called ``name``, or None if unknown."""
        return self._operators.get(name)

    def has(self, name: str) -> bool:
        """Tell whether an operator called ``name`` is registered."""
        known = self._operators
        return name in known

    def list_all(self) -> List[str]:
        """Return the names of all registered operators."""
        return [*self._operators]

    def get_all(self) -> Dict[str, Operator]:
        """Return a shallow copy of the name -> operator mapping."""
        return dict(self._operators)
# Module-wide operator registry used by the decorators and helpers below.
_registry = OperatorRegistry()
def register_operator(name: str, description: str = ""):
    """Decorator factory that registers the decorated function as an operator.

    The function is stored in the global registry under ``name`` and is
    returned unchanged, so it stays directly callable.
    """
    def _register(target: Callable):
        _registry.register_function(name, target, description)
        return target

    return _register
def get_operator(name: str) -> Optional[Operator]:
    """Look up ``name`` in the global registry; None when it is unknown."""
    found = _registry.get(name)
    return found
def get_registry() -> OperatorRegistry:
    """Return the module-wide ``OperatorRegistry`` instance."""
    return _registry
# ==================== 基础数学算子 ====================
@register_operator("add", "加法: x + y")
def _add(x: np.ndarray, y: np.ndarray) -> np.ndarray:
    """Addition operator: return ``x + y``."""
    total = x + y
    return total
@register_operator("sub", "减法: x - y")
def _sub(x: np.ndarray, y: np.ndarray) -> np.ndarray:
    """Subtraction operator: return ``x - y``."""
    difference = x - y
    return difference
@register_operator("mul", "乘法: x * y")
def _mul(x: np.ndarray, y: np.ndarray) -> np.ndarray:
    """Multiplication operator: return ``x * y``."""
    product = x * y
    return product
@register_operator("div", "除法: x / y (安全除法)")
def _div(x: np.ndarray, y: np.ndarray) -> np.ndarray:
    """Safe division: ``x / y`` with NaN wherever ``|y| < 1e-12``."""
    near_zero = np.abs(y) < 1e-12
    # Replace tiny denominators by NaN so the quotient is NaN, not +/-inf.
    return x / np.where(near_zero, np.nan, y)
@register_operator("neg", "取负: -x")
def _neg(x: np.ndarray) -> np.ndarray:
    """Negation operator: return ``-x`` via ``np.negative``."""
    negated = np.negative(x)
    return negated
@register_operator("abs", "绝对值: |x|")
def _abs(x: np.ndarray) -> np.ndarray:
    """Absolute-value operator: return ``|x|``."""
    magnitude = np.abs(x)
    return magnitude
@register_operator("log", "对数: log(|x|)")
def _log(x: np.ndarray) -> np.ndarray:
    """Natural log of ``|x|``, with ``|x|`` floored at 1e-12."""
    # The floor keeps zeros from producing -inf.
    floored = np.clip(np.abs(x), 1e-12, None)
    return np.log(floored)
@register_operator("sqrt", "平方根: sqrt(x)")
def _sqrt(x: np.ndarray) -> np.ndarray:
    """Square root of ``x`` after clipping negative values to 0."""
    non_negative = np.clip(x, 0.0, None)
    return np.sqrt(non_negative)
@register_operator("pow", "幂运算: x^y (限制范围)")
def _pow(x: np.ndarray, y: np.ndarray) -> np.ndarray:
    """Bounded power operator: ``clip(x, +/-1e6) ** clip(y, +/-3)``.

    Both inputs are converted to float arrays first, so integer arrays and
    plain scalars are accepted. Every non-finite result (overflow, division
    by zero, negative base with a fractional exponent) is returned as NaN.
    """
    base = np.clip(np.asarray(x, dtype=float), -1e6, 1e6)
    exponent = np.clip(np.asarray(y, dtype=float), -3.0, 3.0)
    # 0 ** negative raises a "divide" warning, so silence it as well.
    with np.errstate(over="ignore", invalid="ignore", divide="ignore"):
        out = np.power(base, exponent)
    # np.where (instead of in-place assignment) also works for 0-d results.
    return np.where(np.isfinite(out), out, np.nan)
# ==================== 时间序列算子 ====================
def _rolling_mean(x: np.ndarray, window: int) -> np.ndarray:
    """Rolling mean of ``x`` over ``window`` observations.

    A value is produced once ``max(2, window // 2)`` observations are
    available; that threshold is capped at ``window`` so a window of 1 is
    accepted instead of being rejected by pandas.
    """
    min_periods = min(window, max(2, window // 2))
    return pd.Series(x).rolling(window, min_periods=min_periods).mean().to_numpy()
def _rolling_std(x: np.ndarray, window: int) -> np.ndarray:
    """Rolling standard deviation of ``x`` over ``window`` observations.

    A value is produced once ``max(2, window // 2)`` observations are
    available; that threshold is capped at ``window`` so a window of 1 is
    accepted (and yields NaN) instead of being rejected by pandas.
    """
    min_periods = min(window, max(2, window // 2))
    return pd.Series(x).rolling(window, min_periods=min_periods).std().to_numpy()
def _ts_delta(x: np.ndarray, period: int) -> np.ndarray:
    """Difference between ``x`` and ``x`` shifted by ``period`` rows."""
    return pd.Series(x).diff(period).to_numpy()
def _last_pct_rank(values: np.ndarray) -> float:
    """Percentile rank (ties averaged) of the last value among the non-NaN values."""
    last = values[-1]
    if np.isnan(last):
        return np.nan
    valid = np.count_nonzero(~np.isnan(values))
    below = np.count_nonzero(values < last)
    tied = np.count_nonzero(values == last)  # includes the last value itself
    # Average rank of a tie group starting after `below` smaller values.
    return (below + (tied + 1) / 2.0) / valid


def _ts_rank(x: np.ndarray, window: int) -> np.ndarray:
    """Rolling percentile rank of the latest value within its window.

    Each output is the rank (ties averaged, NaN ignored) of the current
    observation among the observations in the trailing ``window``, divided
    by the number of non-NaN observations. The rank is computed on raw
    arrays, which avoids building a pandas Series per window. The
    minimum-observation threshold ``max(2, window // 2)`` is capped at
    ``window`` so a window of 1 is accepted.
    """
    min_periods = min(window, max(2, window // 2))
    return (
        pd.Series(x)
        .rolling(window, min_periods=min_periods)
        .apply(_last_pct_rank, raw=True)
        .to_numpy()
    )
def _delay(x: np.ndarray, period: int) -> np.ndarray:
    """Return ``x`` shifted by ``period`` rows; vacated slots become NaN."""
    return pd.Series(x).shift(period).to_numpy()
# Register the time-series operators, one variant per window length.
def _bind_window(func: Callable, window: int) -> Callable:
    """Return a one-argument operator applying ``func`` with a fixed window.

    A real closure is used instead of a ``lambda x, w=w`` default so the
    registered callable's signature has exactly one parameter; consumers
    that infer arity from the signature then see a unary operator.
    """
    def windowed(x: np.ndarray) -> np.ndarray:
        return func(x, window)

    return windowed


for w in (3, 6, 12, 24, 48, 96):
    _registry.register_function(
        f"sma{w}", _bind_window(_rolling_mean, w), f"简单移动平均: SMA(x, {w})"
    )
    _registry.register_function(
        f"std{w}", _bind_window(_rolling_std, w), f"滚动标准差: STD(x, {w})"
    )
    _registry.register_function(
        f"rank{w}", _bind_window(_ts_rank, w), f"滚动排名: RANK(x, {w})"
    )
    _registry.register_function(
        f"delta{w}", _bind_window(_ts_delta, w), f"差分: DELTA(x, {w})"
    )
    _registry.register_function(
        f"delay{w}", _bind_window(_delay, w), f"延迟: DELAY(x, {w})"
    )
# ==================== 因子公式解析与计算 ====================
class FactorFormula:
    """A factor expression plus the feature names it refers to.

    Supports evaluation against feature arrays and (de)serialisation to a
    plain dictionary.
    """

    def __init__(self, expression: str, feature_names: List[str]):
        """
        Parameters
        ----------
        expression : str
            Factor expression written with registered operator names.
        feature_names : List[str]
            Names of the features the expression may reference.
        """
        self.expression = expression
        self.feature_names = feature_names

    def compute(self, features: Dict[str, np.ndarray]) -> np.ndarray:
        """Evaluate the expression on the given features.

        Parameters
        ----------
        features : Dict[str, np.ndarray]
            Feature arrays keyed by feature name.

        Returns
        -------
        np.ndarray
            Factor values, with the same length as the first feature.

        Raises
        ------
        KeyError
            If a name in ``feature_names`` is missing from ``features``.
        RuntimeError
            If evaluation fails or the result length does not match; the
            original exception is attached as ``__cause__``.
        """
        # Evaluation namespace: features first, then operators (an operator
        # wins if a feature shares its name), then numpy/pandas.
        env = {}
        for name in self.feature_names:
            if name not in features:
                raise KeyError(f"特征 '{name}' 不存在")
            env[name] = features[name]
        for op_name in _registry.list_all():
            op = _registry.get(op_name)
            if op:
                env[op_name] = op.func
        env["np"] = np
        env["pd"] = pd

        # SECURITY NOTE: eval() runs arbitrary code. Limiting the builtins is
        # not a sandbox (``np``/``pd`` are exposed), so only evaluate
        # expressions that come from a trusted source.
        safe_builtins = {
            "abs": abs,
            "min": min,
            "max": max,
            "sum": sum,
            "len": len,
        }
        try:
            result = eval(self.expression, {"__builtins__": safe_builtins}, env)
            expected_len = len(features[self.feature_names[0]])

            # Unwrap 0-d arrays so they are handled like any other scalar.
            if isinstance(result, np.ndarray) and result.ndim == 0:
                result = result[()]
            if isinstance(result, (int, float, np.number)):
                # Broadcast a scalar result to the feature length.
                result = np.full(expected_len, result)
            elif not isinstance(result, np.ndarray):
                result = np.array(result)

            if len(result) != expected_len:
                raise ValueError(
                    f"表达式结果长度 {len(result)} 与特征长度 {expected_len} 不匹配"
                )
            return result
        except Exception as e:
            raise RuntimeError(
                f"计算因子表达式失败: {e}\n表达式: {self.expression}"
            ) from e

    def to_dict(self) -> Dict:
        """Serialise to a dictionary with ``expression`` and ``feature_names``."""
        return {"expression": self.expression, "feature_names": self.feature_names}

    @classmethod
    def from_dict(cls, data: Dict) -> "FactorFormula":
        """Rebuild a formula from the dictionary produced by :meth:`to_dict`."""
        return cls(data["expression"], data["feature_names"])

    def __repr__(self):
        return f"FactorFormula(expression='{self.expression}', features={self.feature_names})"

237
factor_mining/validator.py Normal file
View File

@@ -0,0 +1,237 @@
"""
因子有效性检验模块:整合所有检验方案
"""
import numpy as np
import pandas as pd
from typing import Dict, List, Optional
from dataclasses import dataclass
from statsmodels.regression.linear_model import OLS
from validation import (
compute_ic,
compute_rolling_ic,
group_backtest,
factor_span_regression
)
@dataclass
class ValidationConfig:
    """Thresholds and windows used by ``FactorValidator``."""

    # Rolling-IC settings.
    ic_window: int = 30
    ic_method: str = "spearman"  # "spearman" or "pearson"

    # Group back-test settings.
    n_groups: int = 3
    group_period: int = 180

    # Minimum values a factor must reach to count as valid.
    min_ic: float = 0.01
    min_tstat: float = 1.5
    min_r2_change: float = 0.05
class FactorValidator:
    """Runs the IC, group back-test and regression checks on a factor."""

    def __init__(self, config: ValidationConfig):
        self.config = config

    @staticmethod
    def _finite_or_zero(value) -> float:
        """Return ``value`` as a float, or 0.0 if it is missing, NaN or infinite."""
        try:
            number = float(value)
        except (TypeError, ValueError):
            return 0.0
        return number if np.isfinite(number) else 0.0

    def validate_ic(
        self,
        factor: pd.Series,
        forward_return: pd.Series
    ) -> Dict:
        """Rolling-IC check.

        Returns
        -------
        dict
            ``mean_ic``, ``ic_std``, ``ic_ir``, ``ic_series`` and
            ``is_valid`` (``|mean_ic| >= config.min_ic``).
        """
        rolling_ic = compute_rolling_ic(
            factor,
            forward_return,
            window=self.config.ic_window,
            method=self.config.ic_method
        )
        mean_ic = rolling_ic.mean()
        ic_std = rolling_ic.std()
        # IC information ratio; the epsilon avoids division by zero.
        ic_ir = mean_ic / (ic_std + 1e-8)
        return {
            "mean_ic": mean_ic,
            "ic_std": ic_std,
            "ic_ir": ic_ir,
            "ic_series": rolling_ic,
            # A NaN mean IC compares False, i.e. the factor is not valid.
            "is_valid": bool(abs(mean_ic) >= self.config.min_ic)
        }

    def validate_group_backtest(
        self,
        factor: pd.Series,
        forward_return: pd.Series
    ) -> Dict:
        """Group back-test check.

        Returns
        -------
        dict
            Everything returned by ``group_backtest`` plus ``is_valid``
            (``|mean_h_l_tstat| >= config.min_tstat``).
        """
        result = group_backtest(
            factor,
            forward_return,
            n_groups=self.config.n_groups,
            group_period=self.config.group_period
        )
        is_valid = abs(result.get('mean_h_l_tstat', 0)) >= self.config.min_tstat
        return {
            **result,
            "is_valid": bool(is_valid)
        }

    def validate_regression(
        self,
        factor: pd.Series,
        forward_return: pd.Series,
        other_factors: Optional[pd.DataFrame] = None
    ) -> Dict:
        """Factor spanning-regression check.

        Parameters
        ----------
        factor : Series
            Factor under test.
        forward_return : Series
            Forward returns.
        other_factors : DataFrame, optional
            Other factors used as control variables.

        Returns
        -------
        dict
            Everything returned by ``factor_span_regression`` plus
            ``is_valid`` (both the t-stat and the R2-change thresholds met).
        """
        if other_factors is None:
            other_factors = pd.DataFrame()
        # The tested factor is appended as the column named 'target'.
        factors_df = pd.concat(
            [other_factors, factor.to_frame(name='target')], axis=1
        )
        result = factor_span_regression(
            factors_df,
            forward_return,
            target_factor='target'
        )
        is_valid = (
            abs(result.get('tstat', 0)) >= self.config.min_tstat and
            result.get('r2_change', 0) >= self.config.min_r2_change
        )
        return {
            **result,
            "is_valid": bool(is_valid)
        }

    def validate_all(
        self,
        factor: pd.Series,
        forward_return: pd.Series,
        other_factors: Optional[pd.DataFrame] = None
    ) -> Dict:
        """Run every check and combine the verdicts.

        Returns
        -------
        dict
            ``ic``, ``group_backtest`` and ``regression`` sub-results,
            ``is_valid`` (all three checks passed) and ``score``.
        """
        ic_result = self.validate_ic(factor, forward_return)
        group_result = self.validate_group_backtest(factor, forward_return)
        reg_result = self.validate_regression(
            factor, forward_return, other_factors
        )
        return {
            'ic': ic_result,
            'group_backtest': group_result,
            'regression': reg_result,
            'is_valid': bool(
                ic_result['is_valid']
                and group_result['is_valid']
                and reg_result['is_valid']
            ),
            'score': self._calculate_score(ic_result, group_result, reg_result),
        }

    def _calculate_score(
        self,
        ic_result: Dict,
        group_result: Dict,
        reg_result: Dict
    ) -> float:
        """Combine the three checks into one score.

        Weights: IC 0.3, group back-test 0.4, regression 0.3. Missing or
        non-finite inputs contribute 0 instead of turning the whole score
        into NaN, and a negative R2 change is floored at 0 so the regression
        component stays inside [0, 1].
        """
        mean_ic = self._finite_or_zero(ic_result.get('mean_ic', 0))
        tstat = self._finite_or_zero(group_result.get('mean_h_l_tstat', 0))
        r2_change = self._finite_or_zero(reg_result.get('r2_change', 0))

        ic_score = abs(mean_ic) * 10
        tstat_score = min(abs(tstat) / 3.0, 1.0)  # normalised to [0, 1]
        r2_score = min(max(r2_change, 0.0) / 0.1, 1.0)  # normalised to [0, 1]
        return ic_score * 0.3 + tstat_score * 0.4 + r2_score * 0.3

    def filter_factors(
        self,
        factors: pd.DataFrame,
        forward_return: pd.Series
    ) -> pd.DataFrame:
        """Keep only the factor columns that pass every check.

        Each column is validated with all remaining columns as controls.

        Returns
        -------
        DataFrame
            The valid factor columns, or an empty DataFrame if none pass.
        """
        valid_factors = [
            col
            for col in factors.columns
            if self.validate_all(
                factors[col], forward_return, factors.drop(columns=[col])
            )['is_valid']
        ]
        return factors[valid_factors] if valid_factors else pd.DataFrame()
def create_validator(
    ic_window: int = 30,
    min_ic: float = 0.01,
    min_tstat: float = 1.5
) -> FactorValidator:
    """Convenience constructor for a ``FactorValidator``.

    Only the three given settings are overridden; every other
    ``ValidationConfig`` field keeps its default.
    """
    return FactorValidator(
        ValidationConfig(
            ic_window=ic_window,
            min_ic=min_ic,
            min_tstat=min_tstat,
        )
    )