Files
factorhack/pipeline.py
2025-11-08 13:39:02 +08:00

288 lines
9.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
主流程:时间序列因子挖掘、检验、回测、信号生成
"""
import pandas as pd
from typing import Dict, List, Optional
import warnings
warnings.filterwarnings('ignore')
from data import load_data, compute_technical_indicators, preprocess_data, compute_forward_returns
from factors import FactorMiner, create_default_factors
from validation import validate_factor, factor_span_regression
from combination import MultiFactorModel
from backtest import BacktestEngine
from signal import generate_signals
class FactorPipeline:
    """Chain the factor-research steps: load, mine, validate, combine, backtest.

    Every step stores its output on the instance. ``load_and_preprocess``,
    ``mine_factors``, ``validate_factors`` and ``combine_factors`` return
    ``self`` so they can be chained; ``generate_signals`` and ``backtest``
    return their results. Progress is reported with ``print``.
    """

    def __init__(
        self,
        ret_horizon: int = 1,
        ic_window: int = 30,
        commission: float = 0.001,
        slippage: float = 0.0005
    ):
        """
        Parameters:
        -----------
        ret_horizon : int
            Forward-return horizon, passed as ``horizon`` to
            ``compute_forward_returns``.
        ic_window : int
            IC calculation window, passed as ``ic_window`` to
            ``validate_factor``.
        commission : float
            Commission rate handed to ``BacktestEngine``.
        slippage : float
            Slippage handed to ``BacktestEngine``.
        """
        self.ret_horizon = ret_horizon
        self.ic_window = ic_window
        self.commission = commission
        self.slippage = slippage

        # Outputs of the individual steps; each stays None (or empty) until
        # the corresponding step has run.
        self.data: Optional[pd.DataFrame] = None
        self.factors: Optional[pd.DataFrame] = None
        self.forward_return: Optional[pd.Series] = None
        self.factor_miner: Optional[FactorMiner] = None
        self.validation_results: Dict = {}
        self.model: Optional[MultiFactorModel] = None
        self.score: Optional[pd.Series] = None
        self.backtest_results: Optional[Dict] = None

    @staticmethod
    def _banner(title: str, leading_blank: bool = True) -> None:
        """Print ``title`` framed by two 50-character ``=`` rules."""
        rule = "=" * 50
        print(("\n" + rule) if leading_blank else rule)
        print(title)
        print(rule)

    @staticmethod
    def _passes_thresholds(result: Dict, min_ic: float, min_tstat: float) -> bool:
        """Return True when |mean IC| and |mean H-L t-stat| both reach their minimum.

        A NaN statistic compares False against any threshold, so a factor
        with an undefined statistic is rejected.
        """
        return bool(
            abs(result['mean_ic']) >= min_ic
            and abs(result['mean_h_l_tstat']) >= min_tstat
        )

    def load_and_preprocess(self, file_path: str) -> 'FactorPipeline':
        """Step 1: load the data, add indicators, preprocess, compute forward returns.

        Populates ``self.data`` and ``self.forward_return`` (computed from the
        ``'close'`` column with ``horizon=self.ret_horizon``).

        Returns:
            ``self``, for chaining.
        """
        self._banner("步骤1加载和预处理数据", leading_blank=False)

        # Load the raw data.
        self.data = load_data(file_path)
        print(f"加载数据: {len(self.data)} 条记录")

        # Add technical indicators.
        self.data = compute_technical_indicators(self.data)
        print("计算技术指标完成")

        # Preprocess.
        self.data = preprocess_data(self.data)
        print("数据预处理完成")

        # Forward returns over the configured horizon.
        self.forward_return = compute_forward_returns(
            self.data['close'],
            horizon=self.ret_horizon
        )
        # The original message was missing its closing parenthesis.
        print(f"计算未来收益率完成(周期={self.ret_horizon})")
        return self

    def mine_factors(self, custom_miner: Optional["FactorMiner"] = None) -> 'FactorPipeline':
        """Step 2: compute every factor of the miner on ``self.data``.

        Parameters:
            custom_miner: miner to use; when ``None`` the one returned by
                ``create_default_factors()`` is used.

        Returns:
            ``self``, for chaining. ``self.factors`` holds the computed factors.

        Raises:
            ValueError: if no data has been loaded yet.
        """
        self._banner("步骤2因子挖掘")
        if self.data is None:
            raise ValueError("请先加载数据")

        # Use the caller's miner, or fall back to the default factor set.
        if custom_miner is None:
            self.factor_miner = create_default_factors()
        else:
            self.factor_miner = custom_miner

        self.factors = self.factor_miner.compute_all_factors(self.data)
        print(f"计算因子完成: {list(self.factors.columns)}")
        return self

    def validate_factors(self, min_ic: float = 0.01, min_tstat: float = 1.5) -> 'FactorPipeline':
        """Step 3: test every factor and keep only those that pass.

        Each column of ``self.factors`` is passed to ``validate_factor``
        together with ``self.forward_return``; the result is stored in
        ``self.validation_results`` under the factor name. A factor passes
        when ``abs(mean_ic) >= min_ic`` and ``abs(mean_h_l_tstat) >= min_tstat``.

        ``self.factors`` is always replaced by the passing subset. When no
        factor passes it becomes a frame with zero columns, so that
        ``combine_factors`` refuses to run instead of silently combining
        factors that were just rejected.

        Returns:
            ``self``, for chaining.

        Raises:
            ValueError: if factors or forward returns are not available yet.
        """
        self._banner("步骤3因子检验")
        if self.factors is None or self.forward_return is None:
            raise ValueError("请先完成因子挖掘")

        valid_factors = []
        self.validation_results = {}
        for factor_name in self.factors.columns:
            result = validate_factor(
                self.factors[factor_name],
                self.forward_return,
                ic_window=self.ic_window
            )
            self.validation_results[factor_name] = result

            if not self._passes_thresholds(result, min_ic, min_tstat):
                print(f"\n因子 {factor_name} 未通过检验 (IC={result['mean_ic']:.4f}, t={result['mean_h_l_tstat']:.4f})")
                continue

            valid_factors.append(factor_name)
            print(f"\n因子 {factor_name}:")
            print(f" 平均IC: {result['mean_ic']:.4f}")
            print(f" IC信息比率: {result['ic_ir']:.4f}")
            print(f" H-L收益差: {result['mean_h_l_return']:.4f}")
            print(f" H-L t统计量: {result['mean_h_l_tstat']:.4f}")

        # Filter unconditionally: an empty selection must leave zero columns
        # rather than the full, unvalidated factor set.
        self.factors = self.factors[valid_factors]
        if valid_factors:
            print(f"\n有效因子: {valid_factors}")
        else:
            print("\n警告:没有因子通过检验!")
        return self

    def combine_factors(
        self,
        weight_method: str = 'risk_parity',
        window: Optional[int] = None
    ) -> 'FactorPipeline':
        """Step 4: fit a ``MultiFactorModel`` and compute the combined score.

        Parameters:
            weight_method: forwarded to ``MultiFactorModel(weight_method=...)``.
            window: forwarded to ``MultiFactorModel.fit(window=...)``.

        Returns:
            ``self``, for chaining. ``self.model`` and ``self.score`` are set.

        Raises:
            ValueError: if ``self.factors`` is missing or has no columns.
        """
        self._banner("步骤4因子组合")
        if self.factors is None or len(self.factors.columns) == 0:
            raise ValueError("没有有效因子可组合")

        # Build and fit the multi-factor model.
        self.model = MultiFactorModel(weight_method=weight_method)
        self.model.fit(
            self.factors,
            forward_return=self.forward_return,
            window=window
        )

        # Combined score on the same factor frame the model was fitted on.
        self.score = self.model.predict(self.factors)

        # Report the weights and basic score statistics.
        weights = self.model.get_weights()
        print("因子权重:")
        for name, weight in weights.items():
            print(f" {name}: {weight:.4f}")
        print(f"\n综合得分统计:")
        print(f" 均值: {self.score.mean():.4f}")
        print(f" 标准差: {self.score.std():.4f}")
        return self

    def generate_signals(
        self,
        buy_threshold: float = 0.8,
        sell_threshold: float = -0.8,
        window: int = 30
    ) -> pd.Series:
        """Step 5: turn the combined score into trading signals.

        All three parameters are forwarded unchanged to the module-level
        ``generate_signals`` function together with ``self.score``.

        Raises:
            ValueError: if the combined score has not been computed yet.
        """
        if self.score is None:
            raise ValueError("请先完成因子组合")
        # Inside a method body this name resolves to the imported module-level
        # function, not to this method, so the call is not recursive.
        return generate_signals(
            self.score,
            buy_threshold=buy_threshold,
            sell_threshold=sell_threshold,
            window=window
        )

    def backtest(
        self,
        signals: Optional[pd.Series] = None,
        buy_threshold: float = 0.8,
        sell_threshold: float = -0.8,
        window: int = 30
    ) -> Dict:
        """Step 6: run the backtest and print its headline metrics.

        Parameters:
            signals: precomputed signals; when ``None`` they are produced by
                ``self.generate_signals(buy_threshold, sell_threshold, window)``.
            buy_threshold, sell_threshold, window: used only when ``signals``
                is ``None``.

        Returns:
            The dict returned by ``BacktestEngine.run``, also stored in
            ``self.backtest_results``. Its ``'metrics'`` entry is read for
            the printed summary; metrics that are absent print as 0.

        Raises:
            ValueError: if no data has been loaded, or if signals must be
                generated and no combined score exists.
        """
        self._banner("步骤6回测")
        if self.data is None:
            raise ValueError("请先加载数据")
        if signals is None:
            signals = self.generate_signals(buy_threshold, sell_threshold, window)

        engine = BacktestEngine(
            commission=self.commission,
            slippage=self.slippage
        )
        # self.score is passed through as-is; it is None when the caller
        # supplied signals without running combine_factors first.
        self.backtest_results = engine.run(
            signals,
            self.data['close'],
            score=self.score
        )

        metrics = self.backtest_results['metrics']
        print("\n回测结果:")
        print(f" 总收益率: {metrics.get('total_return', 0)*100:.2f}%")
        print(f" 年化收益率: {metrics.get('annual_return', 0)*100:.2f}%")
        print(f" 年化波动率: {metrics.get('annual_volatility', 0)*100:.2f}%")
        print(f" 夏普比率: {metrics.get('sharpe_ratio', 0):.2f}")
        print(f" 最大回撤: {metrics.get('max_drawdown', 0)*100:.2f}%")
        print(f" 胜率: {metrics.get('win_rate', 0)*100:.2f}%")
        print(f" 盈亏比: {metrics.get('profit_loss_ratio', 0):.2f}")
        print(f" 交易次数: {metrics.get('total_trades', 0)}")
        return self.backtest_results

    def run_full_pipeline(
        self,
        file_path: str,
        custom_miner: Optional["FactorMiner"] = None,
        min_ic: float = 0.01,
        min_tstat: float = 1.5,
        weight_method: str = 'risk_parity',
        buy_threshold: float = 0.8,
        sell_threshold: float = -0.8
    ) -> Dict:
        """Run steps 1-6 in order and collect the main outputs.

        Returns:
            Dict with keys ``'factors'``, ``'score'``, ``'validation'`` and
            ``'backtest'`` holding the corresponding instance attributes.

        Raises:
            ValueError: propagated from the individual steps, e.g. from
                ``combine_factors`` when no factor passed validation.
        """
        self.load_and_preprocess(file_path)
        self.mine_factors(custom_miner)
        self.validate_factors(min_ic, min_tstat)
        self.combine_factors(weight_method)
        self.backtest(buy_threshold=buy_threshold, sell_threshold=sell_threshold)
        return {
            'factors': self.factors,
            'score': self.score,
            'validation': self.validation_results,
            'backtest': self.backtest_results
        }
if __name__ == "__main__":
    # Example run: execute the whole pipeline on the bundled hourly
    # ETH/USDT file, then dump the surviving factors and the combined score.
    run_options = {
        "file_path": "ETH_USDT-1h.feather",
        "min_ic": 0.01,
        "min_tstat": 1.5,
        "weight_method": 'risk_parity',
        "buy_threshold": 0.8,
        "sell_threshold": -0.8,
    }
    pipeline = FactorPipeline(ret_horizon=1, ic_window=30)
    results = pipeline.run_full_pipeline(**run_options)

    # (result key, output file, confirmation message), written in this order.
    exports = (
        ('factors', "factors.csv", "\n因子数据已保存到 factors.csv"),
        ('score', "score.csv", "综合得分已保存到 score.csv"),
    )
    for result_key, csv_path, done_message in exports:
        output = results[result_key]
        if output is None:
            # Nothing was produced for this entry; skip the export.
            continue
        output.to_csv(csv_path)
        print(done_message)