factor minner

2025-11-09 14:00:58 +08:00
parent a66e42a8ae
commit dc3d41d6e5
5 changed files with 1072 additions and 0 deletions
--- a/factor_mining/operators.py
+++ b/factor_mining/operators.py
@@ -0,0 +1,297 @@
+"""
+算子系统：基础数学算子和技术指标算子的注册与管理
+支持算子的注册、查询、反射调用
+"""
+
+import numpy as np
+import pandas as pd
+from typing import Dict, Callable, List, Optional, Any
+from abc import ABC, abstractmethod
+import inspect
+
+
+class Operator(ABC):
+    """算子基类"""
+
+    def __init__(self, name: str, func: Callable, description: str = ""):
+        """
+        Parameters:
+        -----------
+        name : str
+            算子名称（唯一标识）
+        func : Callable
+            算子函数
+        description : str
+            算子描述
+        """
+        self.name = name
+        self.func = func
+        self.description = description
+        self._signature = inspect.signature(func)
+
+    def __call__(self, *args, **kwargs):
+        """调用算子函数"""
+        return self.func(*args, **kwargs)
+
+    def get_signature(self):
+        """获取函数签名"""
+        return self._signature
+
+    def __repr__(self):
+        return f"Operator(name='{self.name}', description='{self.description}')"
+
+
+class OperatorRegistry:
+    """算子注册表"""
+
+    def __init__(self):
+        self._operators: Dict[str, Operator] = {}
+
+    def register(self, operator: Operator):
+        """注册算子"""
+        if operator.name in self._operators:
+            raise ValueError(f"算子 '{operator.name}' 已存在")
+        self._operators[operator.name] = operator
+
+    def register_function(self, name: str, func: Callable, description: str = ""):
+        """直接注册函数为算子"""
+        operator = Operator(name, func, description)
+        self.register(operator)
+
+    def get(self, name: str) -> Optional[Operator]:
+        """获取算子"""
+        return self._operators.get(name)
+
+    def has(self, name: str) -> bool:
+        """检查算子是否存在"""
+        return name in self._operators
+
+    def list_all(self) -> List[str]:
+        """列出所有算子名称"""
+        return list(self._operators.keys())
+
+    def get_all(self) -> Dict[str, Operator]:
+        """获取所有算子"""
+        return self._operators.copy()
+
+
+# 全局算子注册表
+_registry = OperatorRegistry()
+
+
+def register_operator(name: str, description: str = ""):
+    """装饰器：注册算子"""
+
+    def decorator(func: Callable):
+        _registry.register_function(name, func, description)
+        return func
+
+    return decorator
+
+
+def get_operator(name: str) -> Optional[Operator]:
+    """获取算子"""
+    return _registry.get(name)
+
+
+def get_registry() -> OperatorRegistry:
+    """获取全局注册表"""
+    return _registry
+
+
+# ==================== 基础数学算子 ====================
+
+
+@register_operator("add", "加法: x + y")
+def _add(x: np.ndarray, y: np.ndarray) -> np.ndarray:
+    return x + y
+
+
+@register_operator("sub", "减法: x - y")
+def _sub(x: np.ndarray, y: np.ndarray) -> np.ndarray:
+    return x - y
+
+
+@register_operator("mul", "乘法: x * y")
+def _mul(x: np.ndarray, y: np.ndarray) -> np.ndarray:
+    return x * y
+
+
+@register_operator("div", "除法: x / y (安全除法)")
+def _div(x: np.ndarray, y: np.ndarray) -> np.ndarray:
+    denom = np.where(np.abs(y) < 1e-12, np.nan, y)
+    return x / denom
+
+
+@register_operator("neg", "取负: -x")
+def _neg(x: np.ndarray) -> np.ndarray:
+    return np.negative(x)
+
+
+@register_operator("abs", "绝对值: |x|")
+def _abs(x: np.ndarray) -> np.ndarray:
+    return np.abs(x)
+
+
+@register_operator("log", "对数: log(|x|)")
+def _log(x: np.ndarray) -> np.ndarray:
+    return np.log(np.clip(np.abs(x), 1e-12, None))
+
+
+@register_operator("sqrt", "平方根: sqrt(x)")
+def _sqrt(x: np.ndarray) -> np.ndarray:
+    return np.sqrt(np.clip(x, 0.0, None))
+
+
+@register_operator("pow", "幂运算: x^y (限制范围)")
+def _pow(x: np.ndarray, y: np.ndarray) -> np.ndarray:
+    y_clip = np.clip(y, -3.0, 3.0)
+    with np.errstate(over="ignore", invalid="ignore"):
+        out = np.power(np.clip(x, -1e6, 1e6), y_clip)
+    out[~np.isfinite(out)] = np.nan
+    return out
+
+
+# ==================== 时间序列算子 ====================
+
+
+def _rolling_mean(x: np.ndarray, window: int) -> np.ndarray:
+    s = pd.Series(x)
+    return s.rolling(window, min_periods=max(2, window // 2)).mean().to_numpy()
+
+
+def _rolling_std(x: np.ndarray, window: int) -> np.ndarray:
+    s = pd.Series(x)
+    return s.rolling(window, min_periods=max(2, window // 2)).std().to_numpy()
+
+
+def _ts_delta(x: np.ndarray, period: int) -> np.ndarray:
+    s = pd.Series(x)
+    return s.diff(period).to_numpy()
+
+
+def _ts_rank(x: np.ndarray, window: int) -> np.ndarray:
+    s = pd.Series(x)
+    return (
+        s.rolling(window, min_periods=max(2, window // 2))
+        .apply(lambda a: pd.Series(a).rank(pct=True).iloc[-1], raw=False)
+        .to_numpy()
+    )
+
+
+def _delay(x: np.ndarray, period: int) -> np.ndarray:
+    s = pd.Series(x)
+    return s.shift(period).to_numpy()
+
+
+# 注册时间序列算子（带不同窗口）
+for w in (3, 6, 12, 24, 48, 96):
+    _registry.register_function(
+        f"sma{w}", lambda x, w=w: _rolling_mean(x, w), f"简单移动平均: SMA(x, {w})"
+    )
+    _registry.register_function(
+        f"std{w}", lambda x, w=w: _rolling_std(x, w), f"滚动标准差: STD(x, {w})"
+    )
+    _registry.register_function(
+        f"rank{w}", lambda x, w=w: _ts_rank(x, w), f"滚动排名: RANK(x, {w})"
+    )
+    _registry.register_function(
+        f"delta{w}", lambda x, w=w: _ts_delta(x, w), f"差分: DELTA(x, {w})"
+    )
+    _registry.register_function(
+        f"delay{w}", lambda x, w=w: _delay(x, w), f"延迟: DELAY(x, {w})"
+    )
+
+
+# ==================== 因子公式解析与计算 ====================
+
+
+class FactorFormula:
+    """因子公式：支持序列化和反序列化"""
+
+    def __init__(self, expression: str, feature_names: List[str]):
+        """
+        Parameters:
+        -----------
+        expression : str
+            因子表达式（使用算子名称）
+        feature_names : List[str]
+            特征名称列表
+        """
+        self.expression = expression
+        self.feature_names = feature_names
+
+    def compute(self, features: Dict[str, np.ndarray]) -> np.ndarray:
+        """
+        计算因子值
+
+        Parameters:
+        -----------
+        features : Dict[str, np.ndarray]
+            特征字典，key为特征名称
+
+        Returns:
+        --------
+        np.ndarray: 因子值
+        """
+        # 构建计算环境
+        env = {}
+
+        # 添加特征
+        for name in self.feature_names:
+            if name not in features:
+                raise KeyError(f"特征 '{name}' 不存在")
+            env[name] = features[name]
+
+        # 添加算子
+        for op_name in _registry.list_all():
+            op = _registry.get(op_name)
+            if op:
+                env[op_name] = op.func
+
+        # 添加numpy和pandas（用于某些表达式）
+        env["np"] = np
+        env["pd"] = pd
+
+        # 执行表达式
+        try:
+            # 限制可用的内置函数
+            safe_builtins = {
+                "abs": abs,
+                "min": min,
+                "max": max,
+                "sum": sum,
+                "len": len,
+            }
+            result = eval(self.expression, {"__builtins__": safe_builtins}, env)
+
+            # 确保结果是numpy数组
+            if not isinstance(result, np.ndarray):
+                if isinstance(result, (int, float)):
+                    # 标量转换为数组（广播）
+                    result = np.full(len(features[self.feature_names[0]]), result)
+                else:
+                    result = np.array(result)
+
+            # 确保长度一致
+            expected_len = len(features[self.feature_names[0]])
+            if len(result) != expected_len:
+                raise ValueError(
+                    f"表达式结果长度 {len(result)} 与特征长度 {expected_len} 不匹配"
+                )
+
+            return result
+        except Exception as e:
+            raise RuntimeError(f"计算因子表达式失败: {e}\n表达式: {self.expression}")
+
+    def to_dict(self) -> Dict:
+        """序列化为字典"""
+        return {"expression": self.expression, "feature_names": self.feature_names}
+
+    @classmethod
+    def from_dict(cls, data: Dict) -> "FactorFormula":
+        """从字典反序列化"""
+        return cls(data["expression"], data["feature_names"])
+
+    def __repr__(self):
+        return f"FactorFormula(expression='{self.expression}', features={self.feature_names})"