From af4ac038847d637e7dbe14f8fbe249169ef2be5d Mon Sep 17 00:00:00 2001 From: aszerW Date: Wed, 29 Oct 2025 22:14:31 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BD=BF=E7=94=A8playwright=E4=BB=A3=E6=9B=BFa?= =?UTF-8?q?kshare=E7=88=AC=E6=8C=87=E6=95=B0=E6=95=B0=E6=8D=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile_base | 47 ++++++++++++++++++- em_browser_state.json | 1 + em_index_sport.py | 102 ++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 5 ++- update_data.py | 9 ++-- 5 files changed, 159 insertions(+), 5 deletions(-) create mode 100644 em_browser_state.json create mode 100644 em_index_sport.py diff --git a/Dockerfile_base b/Dockerfile_base index 0716331..7cb5bf8 100644 --- a/Dockerfile_base +++ b/Dockerfile_base @@ -3,6 +3,41 @@ FROM python:3.12-slim # 设置工作目录 WORKDIR /app +# 安装系统依赖 +RUN apt-get update && apt-get install -y \ + wget \ + gnupg \ + && rm -rf /var/lib/apt/lists/* + +# 安装Playwright的依赖 +RUN apt-get update && apt-get install -y \ + libnss3 \ + libnspr4 \ + libatk1.0-0 \ + libatk-bridge2.0-0 \ + libcups2 \ + libdrm2 \ + libdbus-1-3 \ + libxkbcommon0 \ + libxcomposite1 \ + libxdamage1 \ + libxfixes3 \ + libxrandr2 \ + libgbm1 \ + libasound2 \ + libpango-1.0-0 \ + libpangocairo-1.0-0 \ + && rm -rf /var/lib/apt/lists/* + +# 配置pip源 +RUN mkdir -p /root/.pip && \ + echo "[global]" > /root/.pip/pip.conf && \ + echo "index-url = https://pypi.tuna.tsinghua.edu.cn/simple" >> /root/.pip/pip.conf && \ + echo "trusted-host = pypi.tuna.tsinghua.edu.cn" >> /root/.pip/pip.conf + +# 更新pip +RUN python -m pip install --upgrade pip + RUN apt-get update && \ apt-get install -y --no-install-recommends tzdata && \ ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && \ @@ -10,7 +45,7 @@ RUN apt-get update && \ rm -rf /var/lib/apt/lists/* RUN pip install uv \ - && rm -rf /root/.cache/pip + && rm -rf /root/.cache/pip ENV UV_INDEX_URL=https://pypi.tuna.tsinghua.edu.cn/simple @@ -23,6 +58,16 @@ COPY requirements.txt . RUN uv pip install --system -r requirements.txt +# 预先安装 Playwright 的系统依赖 +RUN apt-get update && apt-get install -y \ + xvfb \ + libopengl0 \ + && rm -rf /var/lib/apt/lists/* + +# 安装Playwright浏览器 +RUN playwright install chromium +RUN playwright install-deps + # 暴露端口 EXPOSE 80 diff --git a/em_browser_state.json b/em_browser_state.json new file mode 100644 index 0000000..a94fd24 --- /dev/null +++ b/em_browser_state.json @@ -0,0 +1 @@ +{"cookies": [{"name": "fullscreengg", "value": "1", "domain": ".eastmoney.com", "path": "/", "expires": -1, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "fullscreengg2", "value": "1", "domain": ".eastmoney.com", "path": "/", "expires": -1, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "qgqp_b_id", "value": "97fe41278894bed48362b5a789967a07", "domain": ".eastmoney.com", "path": "/", "expires": 1796300946.515741, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "st_nvi", "value": "i2BBcnTi2AO9bsuYZ0j6d8247", "domain": ".eastmoney.com", "path": "/", "expires": 1793276946, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "st_si", "value": "35941754435197", "domain": ".eastmoney.com", "path": "/", "expires": -1, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "st_asi", "value": "delete", "domain": ".eastmoney.com", "path": "/", "expires": -1, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "nid", "value": "0019f2c6f761ce5374eb765f94801d3e", "domain": ".eastmoney.com", "path": "/", "expires": 1769516947, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "nid_create_time", "value": "1761740947152", "domain": ".eastmoney.com", "path": "/", "expires": 1769516947, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "gvi", "value": "qLqK7zZ_mgPbeKzwrmP5z1c45", "domain": ".eastmoney.com", "path": "/", "expires": 1769516947, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "gvi_create_time", "value": "1761740947153", "domain": ".eastmoney.com", "path": "/", "expires": 1769516947, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "wsc_checkuser_ok", "value": "1", "domain": ".eastmoney.com", "path": "/", "expires": -1, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "st_pvi", "value": "54550357370034", "domain": ".eastmoney.com", "path": "/", "expires": 1796300966.64546, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "st_sp", "value": "2025-10-29%2020%3A29%3A06", "domain": ".eastmoney.com", "path": "/", "expires": 1796300966.645772, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "st_inirUrl", "value": "", "domain": ".eastmoney.com", "path": "/", "expires": 1796300966.646048, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "st_sn", "value": "3", "domain": ".eastmoney.com", "path": "/", "expires": -1, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "st_psi", "value": "20251029202926644-113200313003-0775966632", "domain": ".eastmoney.com", "path": "/", "expires": -1, "httpOnly": false, "secure": false, "sameSite": "Lax"}], "origins": [{"origin": "https://quote.eastmoney.com", "localStorage": [{"name": "st_pvi", "value": "54550357370034"}, {"name": "st_sp", "value": "2025-10-29 20:29:06"}, {"name": "st_inirUrl", "value": ""}]}]} \ No newline at end of file diff --git a/em_index_sport.py b/em_index_sport.py new file mode 100644 index 0000000..1dfe9b7 --- /dev/null +++ b/em_index_sport.py @@ -0,0 +1,102 @@ +import time +from playwright.sync_api import sync_playwright +from loguru import logger +import datetime +import re +import json +import pandas as pd + + +def index_data_scraper(index_code: str, data_file_path: str): + + with sync_playwright() as p: + + def on_response(response): + if "push2.eastmoney.com/api/qt/clist/get" in response.url: + try: + if response.request.failure is None and response.status == 200: + data = response.text() + # logger.info(f"最新数据: \n{data}") + + # 保存响应数据 + with open(data_file_path, "a", encoding="utf-8") as f: + f.write(data) + f.write("\n") + except Exception as e: + logger.error(f"处理响应数据失败: {e}") + + browser_state_file_path = "./em_browser_state.json" + browser = p.chromium.launch(args=["--start-maximized"], headless=True) + context = browser.new_context( + storage_state=browser_state_file_path, no_viewport=True + ) + page = context.new_page() + page.on("response", on_response) + url = f"https://quote.eastmoney.com/center/gridlist.html#{index_code}" + page.goto(url) + # page.pause() + for i in range(1, 500): + logger.info(f"第{i}次点击") + try: + page.get_by_role("link", name=">", exact=True).click(timeout=30000) + except Exception as e: + break + time.sleep(10) + + +def parse_data(data_file_path: str): + df_list = [] + with open(data_file_path, "r", encoding="utf-8") as f: + for line in f: + match = re.search(r"\((\{.*\})\);?$", line) + json_str = match.group(1) + data = json.loads(json_str) + inner_temp_df = pd.DataFrame(data["data"]["diff"]) + df_list.append(inner_temp_df) + logger.info(inner_temp_df) + temp_df = pd.concat(df_list, ignore_index=True) + temp_df["f3"] = pd.to_numeric(temp_df["f3"], errors="coerce") + temp_df.sort_values(by=["f3"], ascending=False, inplace=True, ignore_index=True) + temp_df.reset_index(inplace=True) + temp_df["index"] = temp_df["index"].astype(int) + 1 + col_name_map = { + "index": "序号", + "f12": "代码", + "f14": "名称", + "f2": "最新价", + "f3": "涨跌幅", + "f4": "涨跌额", + "f5": "成交量", + "f6": "成交额", + "f7": "振幅", + "f15": "最高", + "f16": "最低", + "f17": "今开", + "f18": "昨收", + "f10": "量比", + } + temp_df.rename( + columns=col_name_map, + inplace=True, + ) + new_cols = col_name_map.values() + temp_df = temp_df[new_cols] + for col in new_cols: + temp_df[col] = pd.to_numeric(temp_df[col], errors="coerce") + return temp_df + + +def get_index_latest_data(): + today = datetime.datetime.now().strftime("%Y%m%d") + data_file_path = f"./{today}.txt" + index_code = "index_sh" + index_code_list = ["index_sh", "index_sz", "index_components", "index_zzzs"] + for index_code in index_code_list: + logger.info(f"开始更新指数数据: {index_code}") + index_data_scraper(index_code=index_code, data_file_path=data_file_path) + df = parse_data(data_file_path) + return df + + +if __name__ == "__main__": + get_index_latest_data() diff --git a/requirements.txt b/requirements.txt index 9d67dfc..ca0b3b2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,9 +17,12 @@ urllib3>=1.26.0 # 进度条 tqdm>=4.65.0 + # 时间处理 python-dateutil>=2.8.0 schedule akshare TA-Lib -tabulate \ No newline at end of file +tabulate + +playwright>=1.45.1 \ No newline at end of file diff --git a/update_data.py b/update_data.py index 0f1ba97..8dcc8f1 100644 --- a/update_data.py +++ b/update_data.py @@ -3,7 +3,8 @@ from db_config import DatabaseManager, DatabaseConfig from loguru import logger from datetime import datetime import akshare as ak -from index_downloader import get_all_stock_index +# from index_downloader import get_all_stock_index +from em_index_sport import get_index_latest_data import schedule import time import traceback @@ -24,7 +25,8 @@ def get_latest_index_kline_date(): # "/Users/aszer/Documents/vscode/etf/data/index_history_data/000001.csv", # encoding="utf-8-sig", # ) - df = get_all_stock_index() + # df = get_all_stock_index() + df = get_index_latest_data() column_mapping = { "date": "date", "代码": "code", @@ -74,7 +76,7 @@ def main(): return try: - db_config = DatabaseConfig() + db_config = DatabaseConfig(env="daily") logger.info(f"数据库连接: {db_config.connection_string}") # 如果只是测试连接 @@ -104,6 +106,7 @@ def main(): if __name__ == "__main__": + # main() logger.info(datetime.now()) PULL_SCHEDULE: str = os.getenv("PULL_SCHEDULE", "16:00") logger.info(f"PULL_SCHEDULE: {PULL_SCHEDULE}")