使用playwright代替akshare爬指数数据
This commit is contained in:
@@ -3,6 +3,41 @@ FROM python:3.12-slim
|
|||||||
# 设置工作目录
|
# 设置工作目录
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
|
# 安装系统依赖
|
||||||
|
RUN apt-get update && apt-get install -y \
|
||||||
|
wget \
|
||||||
|
gnupg \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# 安装Playwright的依赖
|
||||||
|
RUN apt-get update && apt-get install -y \
|
||||||
|
libnss3 \
|
||||||
|
libnspr4 \
|
||||||
|
libatk1.0-0 \
|
||||||
|
libatk-bridge2.0-0 \
|
||||||
|
libcups2 \
|
||||||
|
libdrm2 \
|
||||||
|
libdbus-1-3 \
|
||||||
|
libxkbcommon0 \
|
||||||
|
libxcomposite1 \
|
||||||
|
libxdamage1 \
|
||||||
|
libxfixes3 \
|
||||||
|
libxrandr2 \
|
||||||
|
libgbm1 \
|
||||||
|
libasound2 \
|
||||||
|
libpango-1.0-0 \
|
||||||
|
libpangocairo-1.0-0 \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# 配置pip源
|
||||||
|
RUN mkdir -p /root/.pip && \
|
||||||
|
echo "[global]" > /root/.pip/pip.conf && \
|
||||||
|
echo "index-url = https://pypi.tuna.tsinghua.edu.cn/simple" >> /root/.pip/pip.conf && \
|
||||||
|
echo "trusted-host = pypi.tuna.tsinghua.edu.cn" >> /root/.pip/pip.conf
|
||||||
|
|
||||||
|
# 更新pip
|
||||||
|
RUN python -m pip install --upgrade pip
|
||||||
|
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y --no-install-recommends tzdata && \
|
apt-get install -y --no-install-recommends tzdata && \
|
||||||
ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && \
|
ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && \
|
||||||
@@ -23,6 +58,16 @@ COPY requirements.txt .
|
|||||||
|
|
||||||
RUN uv pip install --system -r requirements.txt
|
RUN uv pip install --system -r requirements.txt
|
||||||
|
|
||||||
|
# 预先安装 Playwright 的系统依赖
|
||||||
|
RUN apt-get update && apt-get install -y \
|
||||||
|
xvfb \
|
||||||
|
libopengl0 \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# 安装Playwright浏览器
|
||||||
|
RUN playwright install chromium
|
||||||
|
RUN playwright install-deps
|
||||||
|
|
||||||
|
|
||||||
# 暴露端口
|
# 暴露端口
|
||||||
EXPOSE 80
|
EXPOSE 80
|
||||||
|
|||||||
1
em_browser_state.json
Normal file
1
em_browser_state.json
Normal file
@@ -0,0 +1 @@
|
|||||||
|
{"cookies": [{"name": "fullscreengg", "value": "1", "domain": ".eastmoney.com", "path": "/", "expires": -1, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "fullscreengg2", "value": "1", "domain": ".eastmoney.com", "path": "/", "expires": -1, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "qgqp_b_id", "value": "97fe41278894bed48362b5a789967a07", "domain": ".eastmoney.com", "path": "/", "expires": 1796300946.515741, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "st_nvi", "value": "i2BBcnTi2AO9bsuYZ0j6d8247", "domain": ".eastmoney.com", "path": "/", "expires": 1793276946, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "st_si", "value": "35941754435197", "domain": ".eastmoney.com", "path": "/", "expires": -1, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "st_asi", "value": "delete", "domain": ".eastmoney.com", "path": "/", "expires": -1, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "nid", "value": "0019f2c6f761ce5374eb765f94801d3e", "domain": ".eastmoney.com", "path": "/", "expires": 1769516947, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "nid_create_time", "value": "1761740947152", "domain": ".eastmoney.com", "path": "/", "expires": 1769516947, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "gvi", "value": "qLqK7zZ_mgPbeKzwrmP5z1c45", "domain": ".eastmoney.com", "path": "/", "expires": 1769516947, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "gvi_create_time", "value": "1761740947153", "domain": ".eastmoney.com", "path": "/", "expires": 1769516947, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "wsc_checkuser_ok", "value": "1", "domain": ".eastmoney.com", "path": "/", "expires": -1, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "st_pvi", "value": "54550357370034", "domain": ".eastmoney.com", "path": "/", "expires": 1796300966.64546, "httpOnly": false, "secure": 
false, "sameSite": "Lax"}, {"name": "st_sp", "value": "2025-10-29%2020%3A29%3A06", "domain": ".eastmoney.com", "path": "/", "expires": 1796300966.645772, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "st_inirUrl", "value": "", "domain": ".eastmoney.com", "path": "/", "expires": 1796300966.646048, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "st_sn", "value": "3", "domain": ".eastmoney.com", "path": "/", "expires": -1, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "st_psi", "value": "20251029202926644-113200313003-0775966632", "domain": ".eastmoney.com", "path": "/", "expires": -1, "httpOnly": false, "secure": false, "sameSite": "Lax"}], "origins": [{"origin": "https://quote.eastmoney.com", "localStorage": [{"name": "st_pvi", "value": "54550357370034"}, {"name": "st_sp", "value": "2025-10-29 20:29:06"}, {"name": "st_inirUrl", "value": ""}]}]}
|
||||||
102
em_index_sport.py
Normal file
102
em_index_sport.py
Normal file
@@ -0,0 +1,102 @@
|
|||||||
|
import time
|
||||||
|
from playwright.sync_api import sync_playwright
|
||||||
|
from loguru import logger
|
||||||
|
import datetime
|
||||||
|
import re
|
||||||
|
import json
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
def index_data_scraper(index_code: str, data_file_path: str):
    """Page through an Eastmoney index list and save the raw API responses.

    Opens ``https://quote.eastmoney.com/center/gridlist.html#<index_code>`` in
    headless Chromium, repeatedly clicks the ">" (next page) link, and appends
    the body of every successful ``push2.eastmoney.com/api/qt/clist/get``
    response to ``data_file_path``, one response per line.

    Args:
        index_code: URL fragment selecting the list to open (e.g. "index_sh").
        data_file_path: Text file the raw response bodies are appended to.

    Returns:
        None. The captured data is only written to ``data_file_path``.
    """
    # Browser storage state is loaded from this file, relative to the current
    # working directory.
    browser_state_file_path = "./em_browser_state.json"

    def on_response(response):
        """Append the body of a successful list-API response to the data file."""
        if "push2.eastmoney.com/api/qt/clist/get" not in response.url:
            return
        try:
            if response.request.failure is None and response.status == 200:
                data = response.text()
                # Append mode: every page of every index list lands in the
                # same file, one raw response per line.
                with open(data_file_path, "a", encoding="utf-8") as f:
                    f.write(data)
                    f.write("\n")
        except Exception as e:
            # A single bad response must not abort the whole scrape.
            logger.error(f"处理响应数据失败: {e}")

    with sync_playwright() as p:
        browser = p.chromium.launch(args=["--start-maximized"], headless=True)
        try:
            context = browser.new_context(
                storage_state=browser_state_file_path, no_viewport=True
            )
            page = context.new_page()
            page.on("response", on_response)
            url = f"https://quote.eastmoney.com/center/gridlist.html#{index_code}"
            page.goto(url)
            # Upper bound on page turns so a misbehaving page cannot loop forever.
            for i in range(1, 500):
                logger.info(f"第{i}次点击")
                try:
                    page.get_by_role("link", name=">", exact=True).click(timeout=30000)
                except Exception as e:
                    # The next-page link could not be clicked within the
                    # timeout; treat this as the end of the list.
                    logger.info(f"无法继续翻页, 停止抓取 {index_code}: {e}")
                    break
                # Use Playwright's own wait instead of time.sleep so that
                # queued response events keep being dispatched while waiting.
                page.wait_for_timeout(10000)
        finally:
            # Release the browser even if navigation or paging raised.
            browser.close()
|
||||||
|
|
||||||
|
|
||||||
|
def parse_data(data_file_path: str):
    """Parse captured list-API responses into one ranked DataFrame.

    Each non-empty line of ``data_file_path`` is expected to hold one response
    body, either JSONP (``callback({...});``) or plain JSON. The records under
    ``data.diff`` of every response are concatenated, sorted by ``f3``
    (renamed to 涨跌幅) in descending order, numbered from 1, and the ``f*``
    fields are renamed to the Chinese column names in ``col_name_map``.

    Args:
        data_file_path: Path of the text file written by the scraper.

    Returns:
        A DataFrame with the columns listed in ``col_name_map``. All columns
        except 代码 and 名称 are coerced to numeric (unparseable values become
        NaN); 代码 and 名称 are left as they appear in the response.

    Raises:
        ValueError: If no line of the file yields any records.
        KeyError: If an expected ``f*`` field is missing from the records.
    """
    jsonp_pattern = re.compile(r"\((\{.*\})\);?$")
    df_list = []
    with open(data_file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            match = jsonp_pattern.search(line)
            # Fall back to the whole line so plain-JSON responses also parse.
            json_str = match.group(1) if match else line
            try:
                data = json.loads(json_str)
            except json.JSONDecodeError as e:
                logger.warning(f"跳过无法解析的响应: {e}")
                continue
            # "data" may be null or missing; skip such responses.
            payload = data.get("data") if isinstance(data, dict) else None
            records = payload.get("diff") if isinstance(payload, dict) else None
            if not records:
                continue
            inner_temp_df = pd.DataFrame(records)
            df_list.append(inner_temp_df)
            logger.info(inner_temp_df)

    if not df_list:
        raise ValueError(f"no index data could be parsed from {data_file_path}")

    temp_df = pd.concat(df_list, ignore_index=True)
    temp_df["f3"] = pd.to_numeric(temp_df["f3"], errors="coerce")
    temp_df.sort_values(by=["f3"], ascending=False, inplace=True, ignore_index=True)
    # Materialise the post-sort position as a 1-based rank column.
    temp_df.reset_index(inplace=True)
    temp_df["index"] = temp_df["index"].astype(int) + 1
    col_name_map = {
        "index": "序号",
        "f12": "代码",
        "f14": "名称",
        "f2": "最新价",
        "f3": "涨跌幅",
        "f4": "涨跌额",
        "f5": "成交量",
        "f6": "成交额",
        "f7": "振幅",
        "f15": "最高",
        "f16": "最低",
        "f17": "今开",
        "f18": "昨收",
        "f10": "量比",
    }
    temp_df.rename(
        columns=col_name_map,
        inplace=True,
    )
    new_cols = list(col_name_map.values())
    # Copy so the assignments below write to an independent frame.
    temp_df = temp_df[new_cols].copy()
    # Code and name are text: coercing them would blank the names and strip
    # leading zeros from the codes.
    text_cols = {"代码", "名称"}
    for col in new_cols:
        if col in text_cols:
            continue
        temp_df[col] = pd.to_numeric(temp_df[col], errors="coerce")
    return temp_df
|
||||||
|
|
||||||
|
|
||||||
|
def get_index_latest_data():
    """Scrape all configured index lists and return them as one DataFrame.

    Raw responses for every list in ``index_code_list`` are collected into
    ``./<YYYYMMDD>.txt`` (today's date) and then parsed with ``parse_data``.

    Returns:
        The DataFrame produced by ``parse_data`` for today's data file.
    """
    today = datetime.datetime.now().strftime("%Y%m%d")
    data_file_path = f"./{today}.txt"
    index_code_list = ["index_sh", "index_sz", "index_components", "index_zzzs"]

    # The scraper only appends, so start from an empty file; otherwise a
    # second run on the same day would parse the earlier run's rows again.
    with open(data_file_path, "w", encoding="utf-8"):
        pass

    for index_code in index_code_list:
        logger.info(f"开始更新指数数据: {index_code}")
        index_data_scraper(index_code=index_code, data_file_path=data_file_path)
    df = parse_data(data_file_path)
    return df
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: run one scrape-and-parse pass when executed directly.
if __name__ == "__main__":
    get_index_latest_data()
|
||||||
@@ -17,9 +17,12 @@ urllib3>=1.26.0
|
|||||||
# 进度条
|
# 进度条
|
||||||
tqdm>=4.65.0
|
tqdm>=4.65.0
|
||||||
|
|
||||||
|
|
||||||
# 时间处理
|
# 时间处理
|
||||||
python-dateutil>=2.8.0
|
python-dateutil>=2.8.0
|
||||||
schedule
|
schedule
|
||||||
akshare
|
akshare
|
||||||
TA-Lib
|
TA-Lib
|
||||||
tabulate
|
tabulate
|
||||||
|
|
||||||
|
playwright>=1.45.1
|
||||||
@@ -3,7 +3,8 @@ from db_config import DatabaseManager, DatabaseConfig
|
|||||||
from loguru import logger
|
from loguru import logger
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import akshare as ak
|
import akshare as ak
|
||||||
from index_downloader import get_all_stock_index
|
# from index_downloader import get_all_stock_index
|
||||||
|
from em_index_sport import get_index_latest_data
|
||||||
import schedule
|
import schedule
|
||||||
import time
|
import time
|
||||||
import traceback
|
import traceback
|
||||||
@@ -24,7 +25,8 @@ def get_latest_index_kline_date():
|
|||||||
# "/Users/aszer/Documents/vscode/etf/data/index_history_data/000001.csv",
|
# "/Users/aszer/Documents/vscode/etf/data/index_history_data/000001.csv",
|
||||||
# encoding="utf-8-sig",
|
# encoding="utf-8-sig",
|
||||||
# )
|
# )
|
||||||
df = get_all_stock_index()
|
# df = get_all_stock_index()
|
||||||
|
df = get_index_latest_data()
|
||||||
column_mapping = {
|
column_mapping = {
|
||||||
"date": "date",
|
"date": "date",
|
||||||
"代码": "code",
|
"代码": "code",
|
||||||
@@ -74,7 +76,7 @@ def main():
|
|||||||
return
|
return
|
||||||
|
|
||||||
try:
|
try:
|
||||||
db_config = DatabaseConfig()
|
db_config = DatabaseConfig(env="daily")
|
||||||
logger.info(f"数据库连接: {db_config.connection_string}")
|
logger.info(f"数据库连接: {db_config.connection_string}")
|
||||||
|
|
||||||
# 如果只是测试连接
|
# 如果只是测试连接
|
||||||
@@ -104,6 +106,7 @@ def main():
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
# main()
|
||||||
logger.info(datetime.now())
|
logger.info(datetime.now())
|
||||||
PULL_SCHEDULE: str = os.getenv("PULL_SCHEDULE", "16:00")
|
PULL_SCHEDULE: str = os.getenv("PULL_SCHEDULE", "16:00")
|
||||||
logger.info(f"PULL_SCHEDULE: {PULL_SCHEDULE}")
|
logger.info(f"PULL_SCHEDULE: {PULL_SCHEDULE}")
|
||||||
|
|||||||
Reference in New Issue
Block a user