feat(repo): 整理 Qoder Skills 和 MCP 配置到仓库
- 添加 5 个用户级别 Skills:
  - auto-commit: 自动 Git 提交
  - karpathy-guidelines: 编码规范指南
  - opencli-websearch: 多源网络搜索
  - pdf-reader: PDF 内容提取
  - repo-analyzer: 项目深度分析
- 添加 Playwright MCP 配置 (21 个浏览器自动化工具)
- 创建完整的 README.md 文档说明
This commit is contained in:
193
skills/opencli-websearch/scripts/download_content.py
Normal file
193
skills/opencli-websearch/scripts/download_content.py
Normal file
@@ -0,0 +1,193 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
使用 OpenCLI web read 下载文档内容
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import hashlib
|
||||
import subprocess
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from typing import Optional, List
|
||||
from urllib.parse import urlparse
|
||||
|
||||
|
||||
def download_with_opencli(url: str, output_dir: str, timeout: int = 60) -> Optional[str]:
    """
    Download the content of a URL with ``opencli web read``.

    The output file is named ``<sanitised-host>_<12-hex-digest>.md`` and is
    written inside ``output_dir`` (created if it does not exist yet).

    Args:
        url: URL to download.
        output_dir: Directory that receives the downloaded file.
        timeout: Maximum run time of the ``opencli`` process, in seconds.

    Returns:
        Local path of the downloaded file, or None on any failure
        (non-zero exit code, missing output file, timeout, launch error).
    """
    import re  # local import: ``re`` is not part of the file-level imports

    # MD5 is only used to derive a short, stable file name - not for security.
    url_hash = hashlib.md5(url.encode()).hexdigest()[:12]
    # The netloc may carry a port, userinfo or IPv6 brackets (":", "@", "[").
    # Replace everything that is not a word character or "-" so the name is a
    # valid file name on every platform; plain hosts still map "." -> "_".
    domain = re.sub(r"[^\w-]", "_", urlparse(url).netloc)
    filename = f"{domain}_{url_hash}.md"
    output_path = os.path.join(output_dir, filename)

    # Argument list (no shell), so the URL is never interpreted by a shell.
    cmd = ["opencli", "web", "read", "--url", url, "--output", output_path]

    print(f"下载: {url}")
    print(f"输出: {output_path}")

    try:
        # Do not rely on the caller having created the directory.
        os.makedirs(output_dir, exist_ok=True)

        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            # Decode leniently: undecodable bytes in the tool's output must
            # not turn a successful download into a reported failure.
            encoding="utf-8",
            errors="replace",
            timeout=timeout,
        )

        if result.returncode != 0:
            # Some tools report errors on stdout; fall back to it when
            # stderr is empty so the message is never blank.
            detail = result.stderr or result.stdout or ""
            print(f"✗ 下载失败: {detail[:200]}")
            return None

        if not os.path.exists(output_path):
            print("✗ 文件未生成")
            return None

        file_size = os.path.getsize(output_path)
        print(f"✓ 成功下载 ({file_size} bytes)")
        return output_path

    except subprocess.TimeoutExpired:
        print("✗ 下载超时")
        return None
    except Exception as e:
        # Covers e.g. a missing ``opencli`` executable or filesystem errors;
        # this function reports failure through its return value.
        print(f"✗ 错误: {str(e)}")
        return None
|
||||
|
||||
|
||||
def batch_download(urls: List[str], output_dir: str, max_workers: int = 3, timeout: int = 60) -> dict:
    """
    Download several URLs in parallel.

    Args:
        urls: URLs to download. Duplicates are downloaded only once.
        output_dir: Directory that receives the downloaded files.
        max_workers: Maximum number of parallel downloads; values below 1
            are treated as 1.
        timeout: Per-download timeout in seconds, forwarded to
            ``download_with_opencli``.

    Returns:
        Dict ``{url: local_path or None}`` with keys in first-seen input order.
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed

    # Pre-seed the result in input order: this de-duplicates the URLs (two
    # workers must never write the same output file concurrently) and keeps
    # the returned dict deterministic regardless of completion order.
    results = {url: None for url in urls}
    if not results:
        return results

    # ThreadPoolExecutor raises ValueError for max_workers <= 0.
    workers = max_workers if max_workers is None else max(1, max_workers)

    with ThreadPoolExecutor(max_workers=workers) as executor:
        future_to_url = {
            executor.submit(download_with_opencli, url, output_dir, timeout): url
            for url in results
        }

        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                results[url] = future.result()
            except Exception as e:
                # One failing download must not abort the batch; the entry
                # simply stays None.
                print(f"✗ {url} 异常: {str(e)}")
                results[url] = None

    return results
|
||||
|
||||
|
||||
def _trim_url(url: str) -> str:
    """Strip punctuation that the greedy URL pattern swallowed at the end."""
    closers = {")": "(", "]": "[", "}": "{"}
    while url:
        last = url[-1]
        if last in ".,;:!?":
            url = url[:-1]
        elif last in closers and url.count(last) > url.count(closers[last]):
            # Unbalanced closer, e.g. the ")" of a markdown link; balanced
            # ones such as ".../Foo_(bar)" are part of the URL and are kept.
            url = url[:-1]
        else:
            break
    return url


def load_results_from_search(search_dir: str) -> List[str]:
    """
    Load the URLs found in a previous search run.

    Reads ``results.json`` inside ``search_dir``. The file is expected to map
    a source name to an object with ``success`` and ``output`` entries; every
    http(s) URL in the ``output`` text of a successful entry is collected.

    Args:
        search_dir: Directory containing ``results.json``.

    Returns:
        Unique URLs in first-seen order; an empty list when the file is
        missing, unreadable or not of the expected shape.
    """
    import re  # local import: ``re`` is not part of the file-level imports

    results_file = os.path.join(search_dir, "results.json")

    if not os.path.exists(results_file):
        print(f"未找到结果文件: {results_file}")
        return []

    try:
        with open(results_file, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"无法读取结果文件: {results_file} ({e})")
        return []

    if not isinstance(data, dict):
        print(f"无法读取结果文件: {results_file} (unexpected format)")
        return []

    url_pattern = re.compile(r'https?://[^\s\'"<>]+')

    # A dict used as an ordered set keeps the first-seen order of the URLs.
    found = {}
    for result in data.values():
        if not isinstance(result, dict):
            continue
        output = result.get("output")
        if not result.get("success") or not isinstance(output, str):
            continue
        # The pattern cannot span whitespace, so scanning the whole text
        # finds every URL on every line.
        for match in url_pattern.finditer(output):
            url = _trim_url(match.group())
            # Skip a bare scheme left over after trimming (e.g. "https://").
            if url.split("://", 1)[-1]:
                found[url] = None

    return list(found)
|
||||
|
||||
|
||||
def main(argv: Optional[List[str]] = None) -> int:
    """
    Command-line entry point: collect URLs, download them, write a record.

    URLs come from ``--url``, ``--urls`` and/or ``--from-search``; they are
    de-duplicated in first-seen order, downloaded into ``--output-dir`` and
    the ``{url: path or None}`` mapping is saved as ``download_record.json``
    in that directory.

    Args:
        argv: Argument list to parse; None means ``sys.argv[1:]``.

    Returns:
        Process exit code: 1 when no URL was supplied, otherwise 0.
    """
    parser = argparse.ArgumentParser(description="使用 OpenCLI 下载文档内容")
    parser.add_argument("--url", help="单个 URL 下载")
    parser.add_argument("--urls", nargs="+", help="多个 URL 下载")
    parser.add_argument("--from-search", help="从搜索结果目录加载 URL")
    parser.add_argument("--output-dir", required=True, help="输出目录")
    parser.add_argument("--max-workers", type=int, default=3, help="最大并行数")

    args = parser.parse_args(argv)

    # Reject an unusable worker count up front with a normal usage error
    # (exit code 2) instead of a traceback from the thread pool later on.
    if args.max_workers < 1:
        parser.error("--max-workers 必须 >= 1")

    # Make sure the output directory exists.
    os.makedirs(args.output_dir, exist_ok=True)

    # Collect the URLs from every source that was given.
    urls = []

    if args.url:
        urls.append(args.url)

    if args.urls:
        urls.extend(args.urls)

    if args.from_search:
        search_urls = load_results_from_search(args.from_search)
        urls.extend(search_urls)
        print(f"从搜索结果加载 {len(search_urls)} 个 URL")

    if not urls:
        print("错误: 未提供 URL")
        return 1

    # De-duplicate while keeping first-seen order, so the download order and
    # the record file are reproducible between runs.
    urls = list(dict.fromkeys(urls))
    print(f"\n共 {len(urls)} 个唯一 URL 待下载\n")

    # Download everything in parallel.
    results = batch_download(urls, args.output_dir, args.max_workers)

    # Summary.
    success_count = sum(1 for v in results.values() if v is not None)
    print(f"\n{'='*60}")
    print(f"下载完成: {success_count}/{len(urls)} 成功")
    print(f"{'='*60}")

    # Persist the url -> local path mapping next to the downloads.
    record_file = os.path.join(args.output_dir, "download_record.json")
    with open(record_file, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f"下载记录: {record_file}")

    return 0
|
||||
|
||||
|
||||
# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user