## 核心功能 - CrossMarketAligner: 跨市场数据对齐(解决 ffill 陷阱) - Pydantic Schema: 数据结构验证(OHLCVInputSchema, AlignedFactorSchema 等) - 验证装饰器: @validate_factor_after_align, @validate_returns_after_align ## 解决的问题 - 跨市场交易日历不同(美股/港股/A股) - ffill 收益率陷阱(休市日复制非零收益率) - NaN 传播问题 - 日期不一致问题 ## 测试验证 - 5/5 测试通过(因子对齐、收益率对齐、多标的对齐、信号验证、ffill陷阱) - 休市日收益率 = 0%(正确) - 无 NaN 传播 ## 架构设计 - shared/data/alignment.py - 对齐器实现 - shared/data/schemas.py - Pydantic Schema 定义 - tests/test_alignment.py - 完整测试套件
335 lines
11 KiB
Python
335 lines
11 KiB
Python
"""
|
||
跨市场数据对齐器
|
||
|
||
核心原则:
|
||
1. 因子在原始交易日历计算,再对齐到目标日历(A股)
|
||
2. 价格先对齐到目标日历,再计算收益率
|
||
3. 显式标记 ffill 填充的值
|
||
4. 严格验证对齐结果(Pydantic Schema + 内置验证)
|
||
|
||
解决的问题:
|
||
- 跨市场交易日历不同(美股/港股/A股假日不同)
|
||
- ffill 陷阱(收益率 vs 价格)
|
||
- NaN 传播
|
||
- 日期不一致
|
||
"""
|
||
|
||
import pandas as pd
|
||
import numpy as np
|
||
from typing import Dict, List, Optional, Tuple
|
||
import warnings
|
||
from functools import wraps
|
||
|
||
# 导入 Schema 验证
|
||
from framework_v2.shared.data.schemas import (
|
||
OHLCVInputSchema,
|
||
AlignedFactorSchema,
|
||
AlignedReturnsSchema,
|
||
MultiAssetReturnsSchema,
|
||
AlignmentValidationResult,
|
||
validate_ohlcv_before_align,
|
||
validate_factor_after_align,
|
||
validate_returns_after_align
|
||
)
|
||
|
||
|
||
class CrossMarketAligner:
|
||
"""
|
||
跨市场数据对齐器
|
||
|
||
使用示例:
|
||
>>> aligner = CrossMarketAligner(target_calendar=a_share_dates)
|
||
>>>
|
||
>>> # 对齐因子值
|
||
>>> aligned = aligner.align_factor(factor_series, source_calendar=us_dates)
|
||
>>>
|
||
>>> # 对齐收益率
|
||
>>> returns = aligner.align_returns(close_series, code='^GSPC')
|
||
>>>
|
||
>>> # 对齐多标的
|
||
>>> returns_df = aligner.align_multi_asset(close_dict)
|
||
"""
|
||
|
||
def __init__(
|
||
self,
|
||
target_calendar: pd.Index,
|
||
max_nan_ratio: float = 0.1,
|
||
max_single_day_return: float = 0.5
|
||
):
|
||
"""
|
||
初始化
|
||
|
||
Args:
|
||
target_calendar: 目标交易日历(A股)
|
||
max_nan_ratio: 最大允许 NaN 比例(默认 10%)
|
||
max_single_day_return: 最大单日收益率(默认 50%,用于检测异常)
|
||
"""
|
||
self.target_calendar = target_calendar
|
||
self.max_nan_ratio = max_nan_ratio
|
||
self.max_single_day_return = max_single_day_return
|
||
|
||
# 统计信息
|
||
self._stats = {
|
||
'aligned_factors': 0,
|
||
'aligned_returns': 0,
|
||
'warnings': []
|
||
}
|
||
|
||
@validate_factor_after_align # ← Pydantic Schema 验证
|
||
def align_factor(
|
||
self,
|
||
factor_series: pd.Series,
|
||
source_calendar: pd.Index,
|
||
code: str = ''
|
||
) -> pd.DataFrame:
|
||
"""
|
||
对齐因子值到目标日历
|
||
|
||
规则:
|
||
- 因子在 source_calendar 计算
|
||
- 对齐到 target_calendar(ffill)
|
||
- 标记哪些是填充值(is_filled 列)
|
||
|
||
Args:
|
||
factor_series: 因子值序列(source_calendar 索引)
|
||
source_calendar: 原始交易日历
|
||
code: 标的代码(用于日志)
|
||
|
||
Returns:
|
||
DataFrame with columns:
|
||
- value: 对齐后的因子值
|
||
- is_filled: 是否为 ffill 填充值
|
||
"""
|
||
# 1. reindex + ffill
|
||
aligned = factor_series.reindex(self.target_calendar, method='ffill')
|
||
|
||
# 2. 标记填充值(不在 source_calendar 中的日期)
|
||
is_filled = ~aligned.index.isin(source_calendar)
|
||
|
||
# 3. 验证
|
||
self._validate_factor_alignment(aligned, is_filled, code)
|
||
|
||
# 4. 统计
|
||
self._stats['aligned_factors'] += 1
|
||
|
||
return pd.DataFrame({
|
||
'value': aligned,
|
||
'is_filled': is_filled
|
||
}, index=self.target_calendar)
|
||
|
||
@validate_returns_after_align # ← Pydantic Schema 验证
|
||
def align_returns(
|
||
self,
|
||
close_series: pd.Series,
|
||
code: str
|
||
) -> pd.Series:
|
||
"""
|
||
对齐收益率到目标日历
|
||
|
||
规则:
|
||
- 价格先 ffill 到 target_calendar
|
||
- 再计算 pct_change
|
||
- 休市日收益率 = 0%(价格不变)
|
||
|
||
重要:
|
||
❌ 错误:先计算收益率,再 ffill(会复制非零收益率)
|
||
✅ 正确:先 ffill 价格,再计算收益率(休市日收益率 = 0%)
|
||
|
||
Args:
|
||
close_series: 收盘价序列(原始日历)
|
||
code: 标的代码(用于日志和错误信息)
|
||
|
||
Returns:
|
||
收益率序列(target_calendar 索引)
|
||
"""
|
||
# 1. 价格对齐到目标日历
|
||
close_aligned = close_series.reindex(
|
||
self.target_calendar,
|
||
method='ffill'
|
||
)
|
||
|
||
# 2. 计算收益率(关键:fill_method=None,不填充 NaN)
|
||
returns = close_aligned.pct_change(fill_method=None)
|
||
|
||
# 3. 填充首日 NaN(首日无前一日,收益率 = 0)
|
||
if len(returns) > 0:
|
||
returns.iloc[0] = 0.0
|
||
|
||
# 4. 填充剩余 NaN(如果价格全 NaN,收益率也全 NaN)
|
||
nan_ratio = returns.isna().sum() / len(returns)
|
||
if nan_ratio > 0:
|
||
# 用 0 填充(表示"无数据,收益率为 0")
|
||
returns = returns.fillna(0.0)
|
||
warnings.warn(
|
||
f"{code}: 收益率 NaN 比例 {nan_ratio:.1%},已填充为 0"
|
||
)
|
||
|
||
# 5. 验证
|
||
self._validate_returns(returns, code)
|
||
|
||
# 6. 统计
|
||
self._stats['aligned_returns'] += 1
|
||
|
||
return returns
|
||
|
||
def align_multi_asset(
|
||
self,
|
||
close_dict: Dict[str, pd.Series]
|
||
) -> pd.DataFrame:
|
||
"""
|
||
对齐多标的收益率
|
||
|
||
Args:
|
||
close_dict: {标的代码: 收盘价序列}
|
||
|
||
Returns:
|
||
收益率 DataFrame(所有标的同索引 = target_calendar)
|
||
"""
|
||
returns_dict = {}
|
||
|
||
for code, close_series in close_dict.items():
|
||
try:
|
||
returns_dict[code] = self.align_returns(close_series, code)
|
||
except Exception as e:
|
||
warnings.warn(f"{code}: 收益率对齐失败 - {e}")
|
||
# 填充全 0
|
||
returns_dict[code] = pd.Series(
|
||
0.0,
|
||
index=self.target_calendar,
|
||
name=code
|
||
)
|
||
|
||
# 合并为 DataFrame
|
||
returns_df = pd.DataFrame(returns_dict, index=self.target_calendar)
|
||
|
||
# 最终验证:不能有 NaN
|
||
if returns_df.isna().any().any():
|
||
nan_cols = returns_df.columns[returns_df.isna().any()]
|
||
raise ValueError(
|
||
f"多标的收益率对齐后仍包含 NaN\n"
|
||
f"NaN 列: {list(nan_cols)}\n"
|
||
f"这不应该发生,请检查 align_returns 逻辑"
|
||
)
|
||
|
||
return returns_df
|
||
|
||
def validate_alignment(
|
||
self,
|
||
signals: pd.DataFrame,
|
||
returns_df: pd.DataFrame
|
||
) -> Tuple[pd.DataFrame, pd.DataFrame]:
|
||
"""
|
||
验证信号与收益率对齐,并返回对齐后的结果
|
||
|
||
Args:
|
||
signals: 信号 DataFrame
|
||
returns_df: 收益率 DataFrame
|
||
|
||
Returns:
|
||
(aligned_signals, aligned_returns)
|
||
|
||
Raises:
|
||
ValueError: 如果对齐后日期太少
|
||
"""
|
||
# 1. 找共同日期
|
||
common_dates = signals.index.intersection(returns_df.index)
|
||
|
||
# 2. 检查丢失的日期
|
||
lost_signals = len(signals) - len(common_dates)
|
||
lost_returns = len(returns_df) - len(common_dates)
|
||
|
||
if lost_signals > 0 or lost_returns > 0:
|
||
warnings.warn(
|
||
f"信号与收益率对齐丢失日期\n"
|
||
f"信号: {len(signals)} → {len(common_dates)} (丢失 {lost_signals})\n"
|
||
f"收益: {len(returns_df)} → {len(common_dates)} (丢失 {lost_returns})"
|
||
)
|
||
|
||
# 3. 检查对齐后日期是否太少
|
||
if len(common_dates) < 10:
|
||
raise ValueError(
|
||
f"对齐后日期太少: {len(common_dates)} 天\n"
|
||
f"信号和收益率可能使用了不同的日历"
|
||
)
|
||
|
||
# 4. 裁剪到共同日期
|
||
aligned_signals = signals.loc[common_dates]
|
||
aligned_returns = returns_df.loc[common_dates]
|
||
|
||
# 5. 使用 Pydantic Schema 验证结果
|
||
validation_result = AlignmentValidationResult(
|
||
signals_aligned=True,
|
||
returns_aligned=True,
|
||
common_dates_count=len(common_dates),
|
||
lost_signals=lost_signals,
|
||
lost_returns=lost_returns
|
||
)
|
||
|
||
# 6. 如果验证失败,会抛出异常
|
||
# (Pydantic 自动验证 field_validator)
|
||
|
||
return aligned_signals, aligned_returns
|
||
|
||
def _validate_factor_alignment(
|
||
self,
|
||
aligned: pd.Series,
|
||
is_filled: pd.Series,
|
||
code: str
|
||
):
|
||
"""验证因子对齐结果"""
|
||
# 1. 检查 NaN 比例
|
||
nan_ratio = aligned.isna().sum() / len(aligned)
|
||
if nan_ratio > self.max_nan_ratio:
|
||
warnings.warn(
|
||
f"{code}: 因子 NaN 比例过高 ({nan_ratio:.1%} > {self.max_nan_ratio:.1%})"
|
||
)
|
||
|
||
# 2. 检查填充比例
|
||
fill_ratio = is_filled.sum() / len(is_filled)
|
||
if fill_ratio > 0.3:
|
||
warnings.warn(
|
||
f"{code}: 因子填充比例过高 ({fill_ratio:.1%})\n"
|
||
f"可能源日历与目标日历差异太大"
|
||
)
|
||
|
||
def _validate_returns(
|
||
self,
|
||
returns: pd.Series,
|
||
code: str
|
||
):
|
||
"""验证收益率数据"""
|
||
# 1. 检查 NaN 比例
|
||
nan_ratio = returns.isna().sum() / len(returns)
|
||
if nan_ratio > self.max_nan_ratio:
|
||
raise ValueError(
|
||
f"{code}: 收益率 NaN 比例过高 ({nan_ratio:.1%} > {self.max_nan_ratio:.1%})"
|
||
)
|
||
|
||
# 2. 检查异常值
|
||
max_return = returns.abs().max()
|
||
if max_return > self.max_single_day_return:
|
||
warnings.warn(
|
||
f"{code}: 发现异常收益率 ({max_return:.1%} > {self.max_single_day_return:.1%})\n"
|
||
f"可能数据有问题"
|
||
)
|
||
|
||
# 3. 检查索引是否匹配目标日历
|
||
if not returns.index.equals(self.target_calendar):
|
||
raise ValueError(
|
||
f"{code}: 收益率索引与目标日历不匹配\n"
|
||
f"收益率长度: {len(returns)}\n"
|
||
f"目标日历长度: {len(self.target_calendar)}"
|
||
)
|
||
|
||
def get_stats(self) -> dict:
|
||
"""获取对齐统计信息"""
|
||
return self._stats.copy()
|
||
|
||
def reset_stats(self):
|
||
"""重置统计信息"""
|
||
self._stats = {
|
||
'aligned_factors': 0,
|
||
'aligned_returns': 0,
|
||
'warnings': []
|
||
}
|