Files
qoder-config/skills/opencli-websearch/scripts/download_content.py
aszerW c3ea38c045 feat(repo): 整理 Qoder Skills 和 MCP 配置到仓库
- 添加 5 个用户级别 Skills:
  - auto-commit: 自动 Git 提交
  - karpathy-guidelines: 编码规范指南
  - opencli-websearch: 多源网络搜索
  - pdf-reader: PDF 内容提取
  - repo-analyzer: 项目深度分析

- 添加 Playwright MCP 配置 (21 个浏览器自动化工具)
- 创建完整的 README.md 文档说明
2026-04-18 11:17:41 +08:00

194 lines
5.4 KiB
Python

#!/usr/bin/env python3
"""
使用 OpenCLI web read 下载文档内容
"""
import os
import sys
import json
import hashlib
import subprocess
import argparse
from pathlib import Path
from typing import Optional, List
from urllib.parse import urlparse
def download_with_opencli(url: str, output_dir: str, timeout: int = 60) -> Optional[str]:
    """Download the content of *url* to a Markdown file via ``opencli web read``.

    The output file is named ``<host>_<hash>.md``. ``<host>`` is the URL's
    network location with every character other than letters, digits, ``-``
    and ``_`` replaced by ``_``; ``<hash>`` is the first 12 hex digits of the
    MD5 digest of the URL. Progress and error messages are printed to stdout.

    Args:
        url: The URL to download.
        output_dir: Directory the output file is placed in. It is not created
            by this function.
        timeout: Maximum number of seconds to wait for the ``opencli`` process.

    Returns:
        The local path of the downloaded file, or ``None`` when the command
        exits with a non-zero status, produces no output file, times out, or
        cannot be run at all.
    """
    # Build a per-URL file name. MD5 is used only to derive a short,
    # stable suffix, not for any security purpose.
    url_hash = hashlib.md5(url.encode()).hexdigest()[:12]
    netloc = urlparse(url).netloc
    # The network location may contain a port (":"), credentials ("@") or
    # IPv6 brackets, which are not valid in file names on every platform,
    # so anything that is not alphanumeric, "-" or "_" becomes "_".
    domain = "".join(ch if ch.isalnum() or ch in "-_" else "_" for ch in netloc)
    filename = f"{domain}_{url_hash}.md"
    output_path = os.path.join(output_dir, filename)

    # Arguments are passed as a list (no shell), so the URL is never
    # interpreted by a shell.
    cmd = ["opencli", "web", "read", "--url", url, "--output", output_path]

    print(f"下载: {url}")
    print(f"输出: {output_path}")

    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        print("✗ 下载超时")
        return None
    except Exception as e:
        # Best-effort boundary: e.g. the opencli executable is missing.
        print(f"✗ 错误: {e}")
        return None

    if result.returncode != 0:
        # Only show the beginning of stderr to keep the log readable.
        print(f"✗ 下载失败: {result.stderr[:200]}")
        return None

    # A zero exit status alone is not trusted; the file must exist.
    if not os.path.exists(output_path):
        print("✗ 文件未生成")
        return None

    file_size = os.path.getsize(output_path)
    print(f"✓ 成功下载 ({file_size} bytes)")
    return output_path
def batch_download(urls: List[str], output_dir: str, max_workers: int = 3) -> dict:
"""
批量下载多个 URL
Args:
urls: URL 列表
output_dir: 输出目录
max_workers: 最大并行数
Returns:
下载结果字典 {url: local_path or None}
"""
from concurrent.futures import ThreadPoolExecutor, as_completed
results = {}
with ThreadPoolExecutor(max_workers=max_workers) as executor:
future_to_url = {
executor.submit(download_with_opencli, url, output_dir): url
for url in urls
}
for future in as_completed(future_to_url):
url = future_to_url[future]
try:
results[url] = future.result()
except Exception as e:
print(f"{url} 异常: {str(e)}")
results[url] = None
return results
def _strip_url_trailing_punctuation(url: str) -> str:
    """Remove sentence/markup punctuation that was captured at the end of *url*."""
    while url:
        last = url[-1]
        if last in ".,;:!?":
            url = url[:-1]
        elif last == ")" and url.count(")") > url.count("("):
            # Unbalanced ")" comes from surrounding text such as "[x](url)";
            # a balanced one (e.g. ".../Foo_(bar)") is part of the URL.
            url = url[:-1]
        elif last == "]" and url.count("]") > url.count("["):
            url = url[:-1]
        else:
            break
    return url


def load_results_from_search(search_dir: str) -> List[str]:
    """Collect the URLs mentioned in ``results.json`` of a search directory.

    The file is expected to hold a JSON object whose values are objects with
    a truthy ``"success"`` flag and a textual ``"output"``. Every
    ``http://`` / ``https://`` URL found in the output of a successful entry
    is collected; entries of any other shape are skipped.

    Args:
        search_dir: Directory containing ``results.json``.

    Returns:
        The unique URLs in order of first appearance. An empty list is
        returned (after printing a message) when the file is missing,
        unreadable, not valid JSON, or its root is not a JSON object.
    """
    import re

    results_file = os.path.join(search_dir, "results.json")
    if not os.path.exists(results_file):
        print(f"未找到结果文件: {results_file}")
        return []

    try:
        with open(results_file, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both invalid JSON and undecodable bytes.
        print(f"无法读取结果文件: {results_file} ({e})")
        return []

    if not isinstance(data, dict):
        print(f"结果文件格式无效: {results_file}")
        return []

    # The pattern excludes whitespace, so a match never spans lines and the
    # whole output can be scanned at once.
    url_pattern = re.compile(r'https?://[^\s\'"<>]+')

    urls = []
    for result in data.values():
        if not isinstance(result, dict) or not result.get("success"):
            continue
        output = result.get("output")
        if not isinstance(output, str):
            continue
        for raw_url in url_pattern.findall(output):
            urls.append(_strip_url_trailing_punctuation(raw_url))

    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(urls))
def main(argv: Optional[List[str]] = None) -> int:
    """Command-line entry point: gather URLs, download them, write a record.

    URLs are taken from ``--url``, ``--urls`` and/or the ``results.json`` of
    the directory given by ``--from-search``. After downloading, a
    ``download_record.json`` mapping each URL to its local path (or null) is
    written into the output directory.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Returns:
        ``1`` when no URL was provided, otherwise ``0`` (regardless of how
        many individual downloads succeeded).
    """
    parser = argparse.ArgumentParser(description="使用 OpenCLI 下载文档内容")
    parser.add_argument("--url", help="单个 URL 下载")
    parser.add_argument("--urls", nargs="+", help="多个 URL 下载")
    parser.add_argument("--from-search", help="从搜索结果目录加载 URL")
    parser.add_argument("--output-dir", required=True, help="输出目录")
    parser.add_argument("--max-workers", type=int, default=3, help="最大并行数")
    args = parser.parse_args(argv)

    # A thread pool cannot be created with fewer than one worker; reject the
    # value up front with a usage error instead of a traceback later.
    if args.max_workers < 1:
        parser.error("--max-workers 必须大于 0")

    # Make sure the output directory exists.
    os.makedirs(args.output_dir, exist_ok=True)

    # Gather URLs from every source that was supplied.
    urls = []
    if args.url:
        urls.append(args.url)
    if args.urls:
        urls.extend(args.urls)
    if args.from_search:
        search_urls = load_results_from_search(args.from_search)
        urls.extend(search_urls)
        print(f"从搜索结果加载 {len(search_urls)} 个 URL")

    if not urls:
        print("错误: 未提供 URL")
        return 1

    # Deduplicate while preserving the order in which URLs were given.
    urls = list(dict.fromkeys(urls))
    print(f"\n{len(urls)} 个唯一 URL 待下载\n")

    results = batch_download(urls, args.output_dir, args.max_workers)

    # Summary: a download counts as successful when it produced a path.
    success_count = sum(1 for path in results.values() if path is not None)
    print(f"\n{'='*60}")
    print(f"下载完成: {success_count}/{len(urls)} 成功")
    print(f"{'='*60}")

    # Persist the URL -> local path mapping next to the downloads.
    record_file = os.path.join(args.output_dir, "download_record.json")
    with open(record_file, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"下载记录: {record_file}")

    return 0
# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    sys.exit(main())