消除回测前视偏差(Look-Ahead Bias): - 新增 ETFDataCache 本地缓存系统,预下载全量ETF(含已退市)基础信息和日线数据 - 改造 ETFUniverseBuilder 支持纯历史模式,每个时间点只使用当时可获得的数据 - 动量.py 新增 dynamic 模式,回测中每60交易日动态重建ETF候选池 - momentum_experiment.py 同步支持动态重建 - 新增 ETF筛选引擎文档和动态池方案文档 无前视偏差实验结果(6组对比,2015-2026): A: 全仓1只 CAGR=3.32%, MaxDD=-63.19%, Sharpe=0.26 B: 等权3只 CAGR=3.40%, MaxDD=-49.72%, Sharpe=0.30 ← 最优 C: 反波动率3只 CAGR=1.73%, MaxDD=-38.59%, Sharpe=0.21 D: 等权5只 CAGR=2.77%, MaxDD=-42.39%, Sharpe=0.29 E: 反波动率5只 CAGR=-0.37%, MaxDD=-19.56%, Sharpe=-0.03 F: 动量>0全选等权 CAGR=2.02%, MaxDD=-43.27%, Sharpe=0.24 最优方案: B(等权3只)夏普、Calmar、CAGR三项均最高
745 lines
31 KiB
Python
745 lines
31 KiB
Python
"""
|
||
动态ETF池自动化筛选引擎
|
||
=========================
|
||
多层漏斗筛选,从全市场ETF中选出低相关、高流动性、覆盖多资产类别的最优轮动候选池。
|
||
|
||
参考文献:
|
||
- TrendFolios (arxiv:2506.09330): 资产标签化 + 无前视偏差
|
||
- AEGIS (arxiv:2604.09060): 流动性硬门槛 + 定期重建
|
||
- HRP (SSRN:2708678): 层次聚类相关性优化
|
||
- Faber GTAA (SSRN:962461): 风险因子覆盖设计
|
||
- Antonacci Dual Momentum (SSRN:2042750): 跨资产分散化
|
||
|
||
用法:
|
||
python scripts/build_etf_universe.py # 当前日期构建
|
||
python scripts/build_etf_universe.py --date 20240101 # 指定日期构建
|
||
"""
|
||
|
||
import os
|
||
import sys
|
||
import time
|
||
import argparse
|
||
import logging
|
||
from pathlib import Path
|
||
from datetime import datetime, timedelta
|
||
|
||
import numpy as np
|
||
import pandas as pd
|
||
|
||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||
from dotenv import load_dotenv
|
||
load_dotenv()
|
||
|
||
import tushare as ts
|
||
|
||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')
|
||
logger = logging.getLogger(__name__)
|
||
|
||
# ============================================================
|
||
# 配置
|
||
# ============================================================
|
||
DEFAULT_CONFIG = {
|
||
'min_list_days': 365, # 上市满1年
|
||
'min_daily_amount': 5000, # 日均成交额(万元)
|
||
'lookback_amount_days': 60, # 计算日均成交额的窗口
|
||
'n_select': 'auto', # 最终池大小: 'auto'=ENB驱动, 或整数固定
|
||
'candidate_multiplier': 3.0, # Layer4 候选池 = ENB估计 * 此倍数
|
||
'min_per_class': 2, # 每类最少保留数
|
||
'max_corr': 0.85, # 最大允许相关系数
|
||
'corr_lookback_days': 120, # 相关性计算窗口
|
||
'max_equity_ratio': 0.5, # A股行业占比上限
|
||
'enb_fallback': 12, # ENB计算失败时的回退值
|
||
}
|
||
|
||
# ============================================================
|
||
# Layer 3: 大类资产分类配置
|
||
# ============================================================
|
||
# 分类优先级: fund_type/invest_type(官方字段) > benchmark(跟踪指数) > name(名称关键词兜底)
|
||
|
||
# Layer 4: 大类资产类别列表 (保留数量由数据驱动计算)
|
||
ASSET_CLASSES = ['A股宽基', 'A股行业', 'A股主题', '港股', '美股',
|
||
'全球/其他', '商品', '债券', 'REITs', '货币/现金']
|
||
|
||
# --- 以下为分类规则(仅作名称兜底时使用) ---
|
||
_BROAD_KW = ['沪深300', '中证500', '中证1000', '创业板', '上证50', '科创50',
|
||
'上证180', '深证100', '中证100', 'A50', 'A500', '中证800',
|
||
'万得全A', '富时A50', 'MSCI中国A']
|
||
_HK_KW = ['恒生', '港股', 'H股', '港股通']
|
||
_US_KW = ['纳斯达克', '纳指', '标普500', '美股', 'S&P500', '道琼斯']
|
||
_GLOBAL_KW = ['日经', '德国', '法国', '越南', '印度', '东南亚',
|
||
'沙特', '韩国', '英国', '全球', '亚太']
|
||
_THEME_KW = ['红利', '央企', '国企', 'ESG', '碳中和', '数字经济',
|
||
'人工智能', 'AI', '机器人', '信创', '北证50',
|
||
'一带一路', '养老', '价值', '成长', '质量',
|
||
'现金流', '低波']
|
||
|
||
|
||
class ETFUniverseBuilder:
|
||
"""动态ETF池筛选引擎"""
|
||
|
||
def __init__(self, config: dict = None, ref_date: str = None, data_cache=None):
|
||
"""
|
||
Args:
|
||
config: 配置字典,缺省用 DEFAULT_CONFIG
|
||
ref_date: 参考日期 YYYYMMDD,缺省为当天
|
||
data_cache: ETFDataCache 实例,传入则使用本地缓存(无前视偏差模式)
|
||
"""
|
||
self.cfg = {**DEFAULT_CONFIG, **(config or {})}
|
||
self.ref_date = ref_date or datetime.now().strftime('%Y%m%d')
|
||
self.ref_dt = pd.Timestamp(self.ref_date)
|
||
self.data_cache = data_cache
|
||
|
||
if data_cache is None:
|
||
token = os.getenv('TUSHARE_TOKEN')
|
||
if not token:
|
||
raise ValueError("请设置环境变量 TUSHARE_TOKEN")
|
||
self.pro = ts.pro_api(token)
|
||
else:
|
||
self.pro = None # 缓存模式不需要 API
|
||
|
||
self.output_dir = Path(__file__).parent.parent / 'data' / 'etf_universe'
|
||
self.output_dir.mkdir(parents=True, exist_ok=True)
|
||
|
||
# 管线日志
|
||
self._log_lines = []
|
||
|
||
def _log(self, msg: str):
|
||
logger.info(msg)
|
||
self._log_lines.append(msg)
|
||
|
||
def _api_call(self, func, **kwargs):
|
||
"""带重试和限流的 API 调用"""
|
||
for attempt in range(3):
|
||
try:
|
||
result = func(**kwargs)
|
||
time.sleep(0.35)
|
||
return result
|
||
except Exception as e:
|
||
if attempt < 2:
|
||
time.sleep(2)
|
||
else:
|
||
raise e
|
||
|
||
# ============================================================
|
||
# Layer 0: 获取全量 ETF 基础数据
|
||
# ============================================================
|
||
def fetch_etf_universe(self) -> pd.DataFrame:
|
||
"""获取全量上市ETF基础信息"""
|
||
self._log("=" * 60)
|
||
self._log("Layer 0: 获取全量ETF基础信息")
|
||
self._log("=" * 60)
|
||
|
||
if self.data_cache is not None:
|
||
# 缓存模式: 从本地读取,只保留 ref_date 时已上市且未退市的
|
||
df = self.data_cache.load_basic().copy()
|
||
df['list_date'] = pd.to_datetime(df['list_date'])
|
||
|
||
# 只保留 ref_date 时已上市的
|
||
mask = df['list_date'] <= self.ref_dt
|
||
|
||
# 排除 ref_date 之前已退市的
|
||
if 'delist_date' in df.columns:
|
||
delist = pd.to_datetime(df['delist_date'], errors='coerce')
|
||
mask = mask & (delist.isna() | (delist > self.ref_dt))
|
||
|
||
# 只保留 market='E' 的(缓存可能包含场外基金)
|
||
if 'type' in df.columns:
|
||
# fund_basic 的 type 字段区分 ETF 类型
|
||
pass # 缓存已经是 market='E' 的
|
||
|
||
df = df[mask].copy()
|
||
self._log(f" 缓存模式: 截至 {self.ref_date} 已上市ETF: {len(df)} 只")
|
||
else:
|
||
# 在线模式: 调用 API
|
||
df = self._api_call(
|
||
self.pro.fund_basic,
|
||
market='E',
|
||
status='L',
|
||
fields='ts_code,name,management,list_date,fund_type,invest_type,benchmark,type,trustee'
|
||
)
|
||
|
||
if df is None or df.empty:
|
||
raise RuntimeError("获取ETF列表失败,请检查Tushare权限")
|
||
|
||
self._log(f" 全量上市ETF: {len(df)} 只")
|
||
df['list_date'] = pd.to_datetime(df['list_date'])
|
||
|
||
return df
|
||
|
||
# ============================================================
|
||
# Layer 1: 基础过滤
|
||
# ============================================================
|
||
def basic_filter(self, df: pd.DataFrame) -> pd.DataFrame:
|
||
"""硬性门槛过滤"""
|
||
self._log("\n" + "=" * 60)
|
||
self._log("Layer 1: 基础过滤")
|
||
self._log("=" * 60)
|
||
|
||
before = len(df)
|
||
|
||
# 1. 上市时间过滤
|
||
cutoff = self.ref_dt - timedelta(days=self.cfg['min_list_days'])
|
||
df = df[df['list_date'] <= cutoff].copy()
|
||
self._log(f" 上市满1年: {before} -> {len(df)}")
|
||
|
||
# 2. 排除货币型、QDII中的债券型
|
||
# fund_type: 股票型/混合型/债券型/货币型/其他
|
||
if 'fund_type' in df.columns:
|
||
exclude_types = ['货币型']
|
||
mask = ~df['fund_type'].str.contains('|'.join(exclude_types), na=False)
|
||
df = df[mask]
|
||
self._log(f" 排除货币型: -> {len(df)}")
|
||
|
||
# 3. 排除杠杆/反向 ETF
|
||
leverage_kw = ['杠杆', '反向', '两倍', '三倍', '2X', '3X', '-1X', '分级']
|
||
mask = ~df['name'].str.contains('|'.join(leverage_kw), na=False, case=False)
|
||
df = df[mask]
|
||
self._log(f" 排除杠杆/反向: -> {len(df)}")
|
||
|
||
# 4. 获取流动性数据(日均成交额)
|
||
self._log(f"\n 获取近{self.cfg['lookback_amount_days']}日成交额数据...")
|
||
amount_start = (self.ref_dt - timedelta(days=self.cfg['lookback_amount_days'] * 2)).strftime('%Y%m%d')
|
||
|
||
amounts = {}
|
||
total = len(df)
|
||
for idx, (_, row) in enumerate(df.iterrows()):
|
||
code = row['ts_code']
|
||
if idx % 50 == 0:
|
||
self._log(f" 进度: {idx}/{total}")
|
||
try:
|
||
if self.data_cache is not None:
|
||
# 缓存模式
|
||
daily_df = self.data_cache.load_cached_daily(code, self.ref_date)
|
||
if not daily_df.empty:
|
||
daily_df = daily_df[daily_df['trade_date'] >= amount_start]
|
||
if not daily_df.empty and 'amount' in daily_df.columns:
|
||
avg_amount = daily_df['amount'].astype(float).mean() / 10
|
||
amounts[code] = avg_amount
|
||
else:
|
||
# 在线模式
|
||
daily = self._api_call(
|
||
self.pro.fund_daily,
|
||
ts_code=code,
|
||
start_date=amount_start,
|
||
end_date=self.ref_date,
|
||
fields='ts_code,trade_date,amount'
|
||
)
|
||
if daily is not None and not daily.empty:
|
||
# amount 单位是千元,转成万元
|
||
avg_amount = daily['amount'].astype(float).mean() / 10
|
||
amounts[code] = avg_amount
|
||
except Exception:
|
||
pass
|
||
|
||
df['avg_daily_amount'] = df['ts_code'].map(amounts)
|
||
df = df.dropna(subset=['avg_daily_amount'])
|
||
df = df[df['avg_daily_amount'] >= self.cfg['min_daily_amount']]
|
||
self._log(f" 日均成交额>={self.cfg['min_daily_amount']}万: -> {len(df)}")
|
||
|
||
self._log(f"\nLayer 1 结果: {before} -> {len(df)}")
|
||
return df
|
||
|
||
# ============================================================
|
||
# Layer 2: 同指数去重
|
||
# ============================================================
|
||
def dedup_by_index(self, df: pd.DataFrame) -> pd.DataFrame:
|
||
"""同一跟踪指数只保留最优的一只ETF"""
|
||
self._log("\n" + "=" * 60)
|
||
self._log("Layer 2: 同指数去重")
|
||
self._log("=" * 60)
|
||
|
||
before = len(df)
|
||
|
||
# 尝试获取指数信息做去重
|
||
# 先从 name 中提取隐含的指数信息
|
||
# 用名称相似度进行分组: 去掉 ETF/联接/LOF 等后缀
|
||
import re
|
||
|
||
def extract_index_name(name: str) -> str:
|
||
"""从ETF名称提取核心指数名"""
|
||
# 去掉常见后缀
|
||
for suffix in ['ETF', 'LOF', '联接', '基金', 'A', 'C', '(', '(']:
|
||
name = name.split(suffix)[0]
|
||
# 去掉基金公司前缀 (通常是2-4个汉字 + 核心名)
|
||
# 常见基金公司
|
||
companies = ['华夏', '易方达', '南方', '华安', '嘉实', '富国', '广发',
|
||
'博时', '工银', '招商', '华宝', '天弘', '中银', '建信',
|
||
'汇添富', '鹏华', '国泰', '银华', '大成', '景顺', '长城',
|
||
'中欧', '交银', '兴全', '平安', '万家', '泰康', '诺安',
|
||
'华泰柏瑞', '华泰', '浦银安盛', '国金', '长信', '东方',
|
||
'中证', '方正富邦', '前海开源', '申万菱信', '融通']
|
||
for c in companies:
|
||
if name.startswith(c):
|
||
name = name[len(c):]
|
||
break
|
||
return name.strip()
|
||
|
||
df = df.copy()
|
||
df['index_name'] = df['name'].apply(extract_index_name)
|
||
|
||
# 按 index_name 分组,每组选日均成交额最大的
|
||
df = df.sort_values('avg_daily_amount', ascending=False)
|
||
df = df.drop_duplicates(subset='index_name', keep='first')
|
||
|
||
self._log(f" 同名去重: {before} -> {len(df)}")
|
||
return df
|
||
|
||
# ============================================================
|
||
# Layer 3: 大类资产标签化
|
||
# ============================================================
|
||
def label_asset_class(self, df: pd.DataFrame) -> pd.DataFrame:
|
||
"""
|
||
三级分类链:
|
||
1. fund_type / invest_type (官方字段,最可靠)
|
||
2. benchmark (跟踪指数名称)
|
||
3. name (关键词兜底)
|
||
"""
|
||
self._log("\n" + "=" * 60)
|
||
self._log("Layer 3: 大类资产标签化 (官方字段优先)")
|
||
self._log("=" * 60)
|
||
|
||
def _name_has(text: str, keywords: list) -> bool:
|
||
"""text 中是否包含任一 keyword"""
|
||
t = text.lower()
|
||
return any(kw.lower() in t for kw in keywords)
|
||
|
||
def classify_row(row) -> str:
|
||
ft = str(row.get('fund_type', '') or '')
|
||
it = str(row.get('invest_type', '') or '')
|
||
bm = str(row.get('benchmark', '') or '')
|
||
name = str(row.get('name', '') or '')
|
||
combined = f"{name} {bm}" # 名称 + 跟踪指数拼接
|
||
|
||
# ---- 第1级: fund_type 硬判断 ----
|
||
if ft == 'REITs':
|
||
return 'REITs'
|
||
if ft == '货币市场型':
|
||
return '货币/现金'
|
||
if ft == '商品型':
|
||
return '商品'
|
||
|
||
# ---- 第2级: invest_type 细分 ----
|
||
if it in ('黄金现货合约', '白银期货型', '有色金属期货型',
|
||
'能源化工期货型', '豆粕期货型', '原油主题基金'):
|
||
return '商品'
|
||
|
||
# 债券型
|
||
if ft == '债券型':
|
||
return '债券'
|
||
|
||
# ---- 第3级: 商品类优先判断 (油气/石油/能源类本质是商品,即使QDII包装) ----
|
||
if _name_has(combined, ['油气', '原油', '石油', '能源行业']):
|
||
return '商品'
|
||
|
||
# ---- 第4级: 地域判断 (从 benchmark + name) ----
|
||
# 港股
|
||
if _name_has(combined, _HK_KW):
|
||
return '港股'
|
||
# 美股
|
||
if _name_has(combined, _US_KW):
|
||
return '美股'
|
||
# 全球/其他
|
||
if _name_has(combined, _GLOBAL_KW):
|
||
return '全球/其他'
|
||
|
||
# ---- 第5级: A股内部细分 (fund_type=股票型/混合型) ----
|
||
if ft in ('股票型', '混合型') or it in ('被动指数型', '增强指数型'):
|
||
# 宽基指数
|
||
if _name_has(combined, _BROAD_KW):
|
||
return 'A股宽基'
|
||
# 主题策略
|
||
if _name_has(combined, _THEME_KW):
|
||
return 'A股主题'
|
||
# 剩余股票型默认为行业
|
||
return 'A股行业'
|
||
|
||
# ---- 兜底 ----
|
||
# 还有一些“另类投资型”等少数类别
|
||
if _name_has(name, ['日利', '添益', '货币']):
|
||
return '货币/现金'
|
||
if _name_has(name, ['债', '短融', '利率']):
|
||
return '债券'
|
||
|
||
return '未分类'
|
||
|
||
df = df.copy()
|
||
df['asset_class'] = df.apply(classify_row, axis=1)
|
||
|
||
# 统计每类数量
|
||
class_counts = df['asset_class'].value_counts()
|
||
self._log("\n 分类结果:")
|
||
for cls, cnt in class_counts.items():
|
||
self._log(f" {cls}: {cnt} 只")
|
||
|
||
# 未分类检查
|
||
n_unclassified = (df['asset_class'] == '未分类').sum()
|
||
total = len(df)
|
||
coverage = (total - n_unclassified) / total * 100 if total > 0 else 0
|
||
self._log(f"\n 分类覆盖率: {coverage:.1f}% ({total - n_unclassified}/{total})")
|
||
|
||
if n_unclassified > 0:
|
||
self._log(f" 未分类 {n_unclassified} 只:")
|
||
unclassified = df[df['asset_class'] == '未分类'].nlargest(10, 'avg_daily_amount')
|
||
for _, row in unclassified.iterrows():
|
||
self._log(f" {row['ts_code']} {row['name']} "
|
||
f"[ft={row.get('fund_type','')}, it={row.get('invest_type','')}] "
|
||
f"(日均{row['avg_daily_amount']:.0f}万)")
|
||
|
||
return df
|
||
|
||
# ============================================================
|
||
# Layer 4: 类内预筛选
|
||
# ============================================================
|
||
@staticmethod
|
||
def _compute_enb(corr_matrix) -> float:
|
||
"""计算 Effective Number of Bets (Meucci 2009)
|
||
ENB = exp(- sum(p_i * ln(p_i))), p_i = λ_i / sum(λ)
|
||
"""
|
||
import numpy as np
|
||
eigenvalues = np.linalg.eigvalsh(corr_matrix.values)
|
||
eigenvalues = eigenvalues[eigenvalues > 1e-10] # 只取正特征值
|
||
p = eigenvalues / eigenvalues.sum()
|
||
return float(np.exp(-np.sum(p * np.log(p))))
|
||
|
||
def _compute_class_limits(self, df: pd.DataFrame) -> dict:
|
||
"""数据驱动的类内保留数量: max(min_per_class, round(class_ratio * budget))
|
||
budget = candidate_multiplier * ENB估计 (首次用 enb_fallback)
|
||
"""
|
||
class_counts = df['asset_class'].value_counts().to_dict()
|
||
total = sum(class_counts.get(c, 0) for c in ASSET_CLASSES)
|
||
if total == 0:
|
||
return {c: self.cfg['min_per_class'] for c in ASSET_CLASSES}
|
||
|
||
# 预估 budget
|
||
n_classes_present = sum(1 for c in ASSET_CLASSES if class_counts.get(c, 0) > 0)
|
||
enb_est = self.cfg.get('enb_fallback', 12)
|
||
budget = int(enb_est * self.cfg['candidate_multiplier'])
|
||
|
||
limits = {}
|
||
for cls in ASSET_CLASSES:
|
||
cnt = class_counts.get(cls, 0)
|
||
if cnt == 0:
|
||
limits[cls] = 0
|
||
continue
|
||
ratio = cnt / total
|
||
raw = ratio * budget
|
||
limits[cls] = min(cnt, max(self.cfg['min_per_class'], round(raw)))
|
||
|
||
self._log(f" 候选预算: budget={budget} (ENB估计={enb_est}, 倍数={self.cfg['candidate_multiplier']})")
|
||
self._log(f" 等比分配: {limits}")
|
||
return limits
|
||
|
||
def intra_class_select(self, df: pd.DataFrame) -> pd.DataFrame:
|
||
"""数据驱动类内预筛选: 按各类占比等比分配名额"""
|
||
self._log("\n" + "=" * 60)
|
||
self._log("Layer 4: 类内预筛选 (等比分配)")
|
||
self._log("=" * 60)
|
||
|
||
before = len(df)
|
||
limits = self._compute_class_limits(df)
|
||
selected = []
|
||
|
||
for cls_name in ASSET_CLASSES:
|
||
limit = limits.get(cls_name, 0)
|
||
if limit == 0:
|
||
continue
|
||
cls_df = df[df['asset_class'] == cls_name]
|
||
if cls_df.empty:
|
||
continue
|
||
top = cls_df.nlargest(limit, 'avg_daily_amount')
|
||
selected.append(top)
|
||
self._log(f" {cls_name}: {len(cls_df)} -> {len(top)} 只")
|
||
for _, row in top.iterrows():
|
||
self._log(f" {row['ts_code']} {row['name']} (日均{row['avg_daily_amount']:.0f}万)")
|
||
|
||
# 未分类中流动性特别好的保留少量
|
||
unclassified = df[df['asset_class'] == '未分类']
|
||
if not unclassified.empty:
|
||
top_unc = unclassified.nlargest(2, 'avg_daily_amount')
|
||
top_unc = top_unc[top_unc['avg_daily_amount'] >= self.cfg['min_daily_amount'] * 10]
|
||
if not top_unc.empty:
|
||
selected.append(top_unc)
|
||
self._log(f" 未分类(超高流动): {len(top_unc)} 只")
|
||
|
||
result = pd.concat(selected, ignore_index=True) if selected else pd.DataFrame()
|
||
self._log(f"\nLayer 4 结果: {before} -> {len(result)}")
|
||
return result
|
||
|
||
# ============================================================
|
||
# Layer 5: 相关性优化选择
|
||
# ============================================================
|
||
def correlation_optimize(self, df: pd.DataFrame) -> pd.DataFrame:
|
||
"""ENB驱动 + 贪心最大分散化选择"""
|
||
self._log("\n" + "=" * 60)
|
||
self._log("Layer 5: 相关性优化选择 (ENB驱动)")
|
||
self._log("=" * 60)
|
||
|
||
# 1. 获取收益率数据计算相关性
|
||
self._log(f" 获取{self.cfg['corr_lookback_days']}日收益率数据...")
|
||
corr_start = (self.ref_dt - timedelta(days=self.cfg['corr_lookback_days'] * 2)).strftime('%Y%m%d')
|
||
|
||
returns_dict = {}
|
||
for _, row in df.iterrows():
|
||
code = row['ts_code']
|
||
try:
|
||
if self.data_cache is not None:
|
||
# 缓存模式
|
||
daily = self.data_cache.load_cached_daily(code, self.ref_date)
|
||
if not daily.empty and len(daily) >= 60:
|
||
daily = daily[daily['trade_date'] >= corr_start]
|
||
daily = daily.sort_values('trade_date')
|
||
daily['ret'] = daily['close'].astype(float).pct_change()
|
||
returns_dict[code] = daily.set_index('trade_date')['ret'].tail(self.cfg['corr_lookback_days'])
|
||
else:
|
||
# 在线模式
|
||
daily = self._api_call(
|
||
self.pro.fund_daily,
|
||
ts_code=code,
|
||
start_date=corr_start,
|
||
end_date=self.ref_date,
|
||
fields='ts_code,trade_date,close'
|
||
)
|
||
if daily is not None and len(daily) >= 60:
|
||
daily = daily.sort_values('trade_date')
|
||
daily['ret'] = daily['close'].astype(float).pct_change()
|
||
returns_dict[code] = daily.set_index('trade_date')['ret'].tail(self.cfg['corr_lookback_days'])
|
||
except Exception:
|
||
pass
|
||
|
||
if len(returns_dict) < 5:
|
||
self._log(" 收益率数据不足,跳过相关性优化")
|
||
df = df.copy()
|
||
df['selected'] = True
|
||
return df
|
||
|
||
ret_df = pd.DataFrame(returns_dict).dropna(axis=1, thresh=60)
|
||
corr_matrix = ret_df.corr()
|
||
|
||
self._log(f" 有效相关性矩阵: {len(corr_matrix)} x {len(corr_matrix)}")
|
||
|
||
# 2. 确定目标池大小
|
||
n_select_cfg = self.cfg['n_select']
|
||
if n_select_cfg == 'auto':
|
||
# 用候选池相关性矩阵的 ENB 确定自然池大小
|
||
enb = self._compute_enb(corr_matrix)
|
||
n_select = max(6, min(int(round(enb)), len(corr_matrix)))
|
||
self._log(f" 候选池 ENB = {enb:.2f} -> 目标池大小 = {n_select}")
|
||
else:
|
||
n_select = int(n_select_cfg)
|
||
self._log(f" 固定目标池大小 = {n_select}")
|
||
|
||
if len(df) <= n_select:
|
||
self._log(f" 候选 {len(df)} <= 目标 {n_select},全部保留")
|
||
df = df.copy()
|
||
df['selected'] = True
|
||
return df
|
||
|
||
# 3. 贪心选择
|
||
available_codes = set(corr_matrix.columns) & set(df['ts_code'].values)
|
||
df_indexed = df.set_index('ts_code')
|
||
|
||
# Step A: 每个大类先选入流动性最好的1只(确保覆盖)
|
||
selected = []
|
||
for cls_name in ASSET_CLASSES:
|
||
cls_codes = df_indexed[df_indexed['asset_class'] == cls_name].index
|
||
cls_available = [c for c in cls_codes if c in available_codes]
|
||
if cls_available:
|
||
# 按流动性排序
|
||
best = max(cls_available, key=lambda c: df_indexed.loc[c, 'avg_daily_amount'])
|
||
selected.append(best)
|
||
available_codes.discard(best)
|
||
|
||
self._log(f" 类别覆盖: 已选 {len(selected)} 只")
|
||
|
||
# Step B: 贪心填充剩余名额
|
||
remaining = n_select - len(selected)
|
||
candidates = list(available_codes)
|
||
|
||
for _ in range(remaining):
|
||
if not candidates:
|
||
break
|
||
|
||
best_candidate = None
|
||
best_max_corr = 2.0 # 越小越好
|
||
|
||
for c in candidates:
|
||
if c not in corr_matrix.columns:
|
||
continue
|
||
# 计算与已选集合的最大相关系数
|
||
if selected:
|
||
selected_in_corr = [s for s in selected if s in corr_matrix.columns]
|
||
if selected_in_corr:
|
||
max_corr = corr_matrix.loc[c, selected_in_corr].abs().max()
|
||
else:
|
||
max_corr = 0
|
||
else:
|
||
max_corr = 0
|
||
|
||
if max_corr < best_max_corr:
|
||
best_max_corr = max_corr
|
||
best_candidate = c
|
||
|
||
if best_candidate is None:
|
||
break
|
||
|
||
# 检查相关系数阈值
|
||
if best_max_corr > self.cfg['max_corr']:
|
||
self._log(f" 剩余候选相关性均>{self.cfg['max_corr']:.2f},停止选择")
|
||
break
|
||
|
||
selected.append(best_candidate)
|
||
candidates.remove(best_candidate)
|
||
|
||
# 检查 A股行业占比约束
|
||
selected_df = df_indexed.loc[[s for s in selected if s in df_indexed.index]]
|
||
equity_count = (selected_df['asset_class'] == 'A股行业').sum()
|
||
total_count = len(selected_df)
|
||
if total_count > 0 and equity_count / total_count > self.cfg['max_equity_ratio']:
|
||
self._log(f" A股行业占比 {equity_count}/{total_count} 超限,需裁剪")
|
||
# 从A股行业中移除相关性最高的
|
||
equity_codes = selected_df[selected_df['asset_class'] == 'A股行业'].index.tolist()
|
||
max_equity = int(total_count * self.cfg['max_equity_ratio'])
|
||
while len(equity_codes) > max_equity:
|
||
# 找出与其他A股行业相关性最高的
|
||
worst = None
|
||
worst_avg_corr = -1
|
||
for ec in equity_codes:
|
||
others = [c for c in equity_codes if c != ec and c in corr_matrix.columns]
|
||
if others and ec in corr_matrix.columns:
|
||
avg_corr = corr_matrix.loc[ec, others].abs().mean()
|
||
if avg_corr > worst_avg_corr:
|
||
worst_avg_corr = avg_corr
|
||
worst = ec
|
||
if worst:
|
||
selected.remove(worst)
|
||
equity_codes.remove(worst)
|
||
self._log(f" 移除高相关A股行业: {worst}")
|
||
else:
|
||
break
|
||
|
||
# 3. 标记结果
|
||
df = df.copy()
|
||
df['selected'] = df['ts_code'].isin(selected)
|
||
|
||
self._log(f"\nLayer 5 最终选出: {df['selected'].sum()} 只")
|
||
final = df[df['selected']].copy()
|
||
for _, row in final.iterrows():
|
||
self._log(f" {row['ts_code']} {row['name']} [{row['asset_class']}] 日均{row['avg_daily_amount']:.0f}万")
|
||
|
||
# 保存相关性矩阵
|
||
final_codes = [c for c in final['ts_code'] if c in corr_matrix.columns]
|
||
if final_codes:
|
||
final_corr = corr_matrix.loc[final_codes, final_codes]
|
||
corr_path = self.output_dir / f'corr_matrix_{self.ref_date}.csv'
|
||
final_corr.to_csv(corr_path, float_format='%.3f')
|
||
self._log(f"\n 相关性矩阵已保存: {corr_path}")
|
||
|
||
return df
|
||
|
||
# ============================================================
|
||
# 保存结果
|
||
# ============================================================
|
||
def save_results(self, df: pd.DataFrame):
|
||
"""保存筛选结果和日志"""
|
||
# 保存最终池
|
||
final = df[df['selected'] == True].copy()
|
||
cols = ['ts_code', 'name', 'asset_class', 'avg_daily_amount']
|
||
cols = [c for c in cols if c in final.columns]
|
||
universe_path = self.output_dir / f'universe_{self.ref_date}.csv'
|
||
final[cols].to_csv(universe_path, index=False, encoding='utf-8-sig')
|
||
self._log(f"\n最终ETF池已保存: {universe_path}")
|
||
|
||
# 保存 latest 软链接/副本
|
||
latest_path = self.output_dir / 'universe_latest.csv'
|
||
final[cols].to_csv(latest_path, index=False, encoding='utf-8-sig')
|
||
|
||
# 保存管线日志
|
||
log_path = self.output_dir / f'pipeline_log_{self.ref_date}.txt'
|
||
with open(log_path, 'w', encoding='utf-8') as f:
|
||
f.write('\n'.join(self._log_lines))
|
||
self._log(f"管线日志已保存: {log_path}")
|
||
|
||
# 打印最终汇总
|
||
self._log("\n" + "=" * 60)
|
||
self._log("筛选完成!")
|
||
self._log("=" * 60)
|
||
self._log(f"最终池: {len(final)} 只ETF")
|
||
class_dist = final['asset_class'].value_counts()
|
||
for cls, cnt in class_dist.items():
|
||
self._log(f" {cls}: {cnt}")
|
||
|
||
# ============================================================
|
||
# 主运行入口
|
||
# ============================================================
|
||
def run(self) -> pd.DataFrame:
|
||
"""执行完整筛选管线"""
|
||
self._log(f"参考日期: {self.ref_date}")
|
||
self._log(f"配置: {self.cfg}")
|
||
|
||
raw = self.fetch_etf_universe() # Layer 0
|
||
filtered = self.basic_filter(raw) # Layer 1
|
||
deduped = self.dedup_by_index(filtered) # Layer 2
|
||
labeled = self.label_asset_class(deduped) # Layer 3
|
||
shortlist = self.intra_class_select(labeled) # Layer 4
|
||
final = self.correlation_optimize(shortlist) # Layer 5
|
||
self.save_results(final)
|
||
|
||
return final
|
||
|
||
|
||
# ============================================================
|
||
# 便捷函数:供动量策略回测调用
|
||
# ============================================================
|
||
def build_universe(ref_date: str = None, config: dict = None, data_cache=None) -> dict:
|
||
"""
|
||
构建ETF池并返回 {ts_code: name} 字典,可直接用于动量策略 CONFIG['etf_pool']
|
||
|
||
Args:
|
||
ref_date: 参考日期 YYYYMMDD
|
||
config: 覆盖默认配置
|
||
data_cache: ETFDataCache 实例(缓存模式,无前视偏差)
|
||
|
||
Returns:
|
||
dict: {ts_code: name}
|
||
"""
|
||
builder = ETFUniverseBuilder(config=config, ref_date=ref_date, data_cache=data_cache)
|
||
result = builder.run()
|
||
final = result[result['selected'] == True]
|
||
return dict(zip(final['ts_code'], final['name']))
|
||
|
||
|
||
def load_latest_universe() -> dict:
|
||
"""
|
||
加载最近一次构建的ETF池
|
||
|
||
Returns:
|
||
dict: {ts_code: name}
|
||
"""
|
||
latest_path = Path(__file__).parent.parent / 'data' / 'etf_universe' / 'universe_latest.csv'
|
||
if not latest_path.exists():
|
||
raise FileNotFoundError(f"未找到ETF池文件: {latest_path}\n请先运行 build_etf_universe.py")
|
||
df = pd.read_csv(latest_path)
|
||
return dict(zip(df['ts_code'], df['name']))
|
||
|
||
|
||
# ============================================================
|
||
# CLI 入口
|
||
# ============================================================
|
||
if __name__ == '__main__':
|
||
parser = argparse.ArgumentParser(description='动态ETF池筛选引擎')
|
||
parser.add_argument('--date', type=str, default=None,
|
||
help='参考日期 YYYYMMDD (默认: 当天)')
|
||
parser.add_argument('--n-select', type=str, default='auto',
|
||
help='最终池大小: auto=ENB驱动, 或整数 (默认: auto)')
|
||
parser.add_argument('--min-amount', type=float, default=5000,
|
||
help='最低日均成交额(万) (默认: 5000)')
|
||
args = parser.parse_args()
|
||
|
||
cfg = {
|
||
'n_select': args.n_select if args.n_select == 'auto' else int(args.n_select),
|
||
'min_daily_amount': args.min_amount,
|
||
}
|
||
|
||
builder = ETFUniverseBuilder(config=cfg, ref_date=args.date)
|
||
builder.run()
|