perf(http): 并行获取数据加速数据加载

使用 ThreadPoolExecutor 并行获取多个标的的数据:
- 信号源 (index): 11个标的并行获取
- 交易源 (ETF): 4个标的并行获取
- 溢价率数据: 4个标的并行获取

性能提升:5个标的从 ~15s 串行 → ~4.6s 并行(约 3x 加速)

修改:
- 增大 urllib3 连接池 maxsize=16 支持并行连接
- 使用 concurrent.futures.ThreadPoolExecutor
This commit is contained in:
2026-06-02 22:29:59 +08:00
parent 81045f9d85
commit e29f57749d
2 changed files with 49 additions and 20 deletions

View File

@@ -24,7 +24,10 @@ load_dotenv()
# HTTP client (urllib3 替代 requests修复 SSL EOF 问题) # HTTP client (urllib3 替代 requests修复 SSL EOF 问题)
# ============================================================ # ============================================================
_http_pool = urllib3.PoolManager() _http_pool = urllib3.PoolManager(
maxsize=16, # 支持并行连接
timeout=urllib3.Timeout(connect=10, read=120)
)
def _http_get(url: str, params: dict = None, timeout: int = 120) -> urllib3.HTTPResponse: def _http_get(url: str, params: dict = None, timeout: int = 120) -> urllib3.HTTPResponse:
"""使用 urllib3 发起 GET 请求(替代 requests.get修复 OpenSSL 3.5 + Caddy 的 SSL EOF 问题)""" """使用 urllib3 发起 GET 请求(替代 requests.get修复 OpenSSL 3.5 + Caddy 的 SSL EOF 问题)"""

View File

@@ -21,6 +21,7 @@ import pandas as pd
from pathlib import Path from pathlib import Path
from datetime import datetime, timedelta from datetime import datetime, timedelta
from typing import Dict, List, Optional, Tuple from typing import Dict, List, Optional, Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed
PROJECT_ROOT = Path(__file__).parent.parent PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT)) sys.path.insert(0, str(PROJECT_ROOT))
@@ -31,7 +32,10 @@ from rotation.config_loader import load_rotation_config, RotationStrategyConfig
# HTTP client (urllib3 替代 requests修复 SSL EOF 问题) # HTTP client (urllib3 替代 requests修复 SSL EOF 问题)
# ============================================================ # ============================================================
_http_pool = urllib3.PoolManager(timeout=urllib3.Timeout(connect=10, read=120)) _http_pool = urllib3.PoolManager(
maxsize=16, # 支持并行连接
timeout=urllib3.Timeout(connect=10, read=120)
)
class _HttpResponse: class _HttpResponse:
"""urllib3 响应包装,提供 requests 兼容接口""" """urllib3 响应包装,提供 requests 兼容接口"""
@@ -364,32 +368,54 @@ class SimpleRotationStrategy:
self.trading_calendar: Optional[pd.DatetimeIndex] = None self.trading_calendar: Optional[pd.DatetimeIndex] = None
def _preload_data(self): def _preload_data(self):
"""Preload all historical data""" """Preload all historical data (parallel fetching)"""
start_date = self.config.backtest.start_date start_date = self.config.backtest.start_date
end_date = self.config.backtest.end_date or datetime.now().strftime('%Y-%m-%d') end_date = self.config.backtest.end_date or datetime.now().strftime('%Y-%m-%d')
preload_start = (pd.Timestamp(start_date) - timedelta(days=self.n_days * 2)).strftime('%Y-%m-%d') preload_start = (pd.Timestamp(start_date) - timedelta(days=self.n_days * 2)).strftime('%Y-%m-%d')
print("\n[1/4] Preloading signal sources (index raw)...") print(f"\n[1/4] Preloading signal sources (index raw) [{len(self.signal_codes)} codes, parallel]...")
for code in self.signal_codes: # Parallel fetch signal sources
df = self.data_cache.preload(code, preload_start, end_date, adj='raw') with ThreadPoolExecutor(max_workers=8) as executor:
if df is not None: futures = {executor.submit(self.data_cache.preload, code, preload_start, end_date, 'raw'): code for code in self.signal_codes}
self.index_data[code] = df for future in as_completed(futures):
code = futures[future]
try:
df = future.result()
if df is not None:
self.index_data[code] = df
except Exception as e:
print(f" x {code}: {e}")
print(f"\n Signal: {len(self.index_data)}/{len(self.signal_codes)} OK") print(f"\n Signal: {len(self.index_data)}/{len(self.signal_codes)} OK")
print("\n[2/4] Preloading trade sources (ETF hfq)...") print(f"\n[2/4] Preloading trade sources (ETF hfq) [{len(set(self.signal_to_trade.values()))} codes, parallel]...")
trade_codes = set(self.signal_to_trade.values()) trade_codes = set(self.signal_to_trade.values())
# Determine adj for each trade code
trade_adj_map = {}
for code in trade_codes: for code in trade_codes:
is_bond = any( is_bond = any(a.trade_source == code and a.group == 'BOND' for a in self.config.asset_pools.assets.values())
a.trade_source == code and a.group == 'BOND' trade_adj_map[code] = 'raw' if is_bond else 'hfq'
for a in self.config.asset_pools.assets.values()
) # Parallel fetch trade sources
adj = 'raw' if is_bond else 'hfq' with ThreadPoolExecutor(max_workers=8) as executor:
df = self.data_cache.preload(code, preload_start, end_date, adj=adj) futures = {executor.submit(self.data_cache.preload, code, preload_start, end_date, trade_adj_map[code]): code for code in trade_codes}
if df is not None: for future in as_completed(futures):
self.etf_data[code] = df code = futures[future]
# Load premium data cache for all ETF trade codes try:
for code in trade_codes: df = future.result()
self.data_cache.preload_premium(code, end_date=end_date) if df is not None:
self.etf_data[code] = df
except Exception as e:
print(f" x {code}: {e}")
# Parallel fetch premium data
with ThreadPoolExecutor(max_workers=8) as executor:
futures = {executor.submit(self.data_cache.preload_premium, code, end_date): code for code in trade_codes}
for future in as_completed(futures):
code = futures[future]
try:
future.result()
except Exception:
pass
print(f"\n Trade: {len(self.etf_data)}/{len(trade_codes)} OK, premium: {len(self.data_cache.premium_data)} loaded") print(f"\n Trade: {len(self.etf_data)}/{len(trade_codes)} OK, premium: {len(self.data_cache.premium_data)} loaded")
# Load benchmark # Load benchmark