From d25cfa5a2a2e58b2578ffb4b9671b6164223bebd Mon Sep 17 00:00:00 2001
From: Liang Jiaqing
Date: Mon, 20 Apr 2026 23:22:36 +0800
Subject: [PATCH] =?UTF-8?q?refactor:=20vision=5Fsop=E7=B2=BE=E7=AE=80?=
 =?UTF-8?q?=E6=94=B9=E5=86=99=20+=20=E6=96=B0=E5=A2=9Evision=5Fapi.templat?=
 =?UTF-8?q?e.py?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore                    |   1 +
 memory/vision_api.template.py | 186 ++++++++++++++++++++++++++++++++++
 memory/vision_sop.md          |  38 ++-----
 3 files changed, 196 insertions(+), 29 deletions(-)
 create mode 100644 memory/vision_api.template.py

diff --git a/.gitignore b/.gitignore
index b0d27dd..3f2dbe7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -80,6 +80,7 @@ memory/L4_raw_sessions/*
 !memory/ocr_utils.py
 !memory/vision_sop.md
 !memory/ui_detect.py
+!memory/vision_api.template.py
 
 # Memory management
 !memory/memory_cleanup_sop.md
diff --git a/memory/vision_api.template.py b/memory/vision_api.template.py
new file mode 100644
index 0000000..58286bd
--- /dev/null
+++ b/memory/vision_api.template.py
@@ -0,0 +1,186 @@
+"""Vision API helper: send one image plus a text prompt to a vision model.
+
+Copy this template to ``vision_api.py`` and edit only the user configuration
+section below. Backends: a Claude-style ``/v1/messages`` endpoint, an
+OpenAI-compatible ``/v1/chat/completions`` endpoint, and ModelScope (called
+through the OpenAI-compatible path).
+"""
+import base64
+import os
+import sys
+from io import BytesIO
+from pathlib import Path
+
+import requests
+
+# ===== User configuration section ("用户配置区"): the only part to edit =====
+CLAUDE_CONFIG_KEY = 'claude_config141'  # variable name of the Claude config in mykey.py
+OPENAI_CONFIG_KEY = 'oai_config1'  # variable name of the OpenAI config in mykey.py
+MODELSCOPE_API_KEY = ''  # paste your ModelScope token here
+DEFAULT_BACKEND = 'claude'  # default backend: 'claude' / 'openai' / 'modelscope'
+# ==========================================================================
+
+MODELSCOPE_API_BASE = 'https://api-inference.modelscope.cn'
+MODELSCOPE_MODEL = 'Qwen/Qwen3-VL-235B-A22B-Instruct'
+
+# Make mykey.py importable from the parent or grandparent directory.
+_DIR = os.path.dirname(os.path.abspath(__file__))
+for _p in [os.path.join(_DIR, '..'), os.path.join(_DIR, '../..')]:
+    if _p not in sys.path:
+        sys.path.insert(0, _p)
+
+
+def ask_vision(image_input, prompt="详细描述这张图片的内容", timeout=60, max_pixels=1440000, backend=DEFAULT_BACKEND):
+    """Ask a vision model about one image and return its reply as text.
+
+    Failures are reported as a string starting with ``"Error: "`` instead of
+    an exception, so callers can branch on the prefix.
+
+    Args:
+        image_input: File path (str or Path) or a PIL Image.
+        prompt: Text sent along with the image.
+        timeout: Request timeout in seconds, passed to ``requests``.
+        max_pixels: Images with more pixels than this are downscaled first.
+        backend: 'claude', 'openai' or 'modelscope'.
+
+    Returns:
+        The model reply, or an ``"Error: ..."`` message.
+    """
+    try:
+        b64 = _prepare_image(image_input, max_pixels)
+    except Exception as e:
+        return f"Error: 图片处理失败 - {type(e).__name__}: {e}"
+    try:
+        if backend == 'claude':
+            return _call_claude(b64, prompt, timeout)
+        elif backend == 'openai':
+            cfg = getattr(_load_config(), OPENAI_CONFIG_KEY)
+            return _call_openai_compat(
+                b64, prompt, timeout,
+                apibase=cfg['apibase'], apikey=cfg['apikey'], model=cfg['model'], proxy=cfg.get('proxy')
+            )
+        elif backend == 'modelscope':
+            return _call_openai_compat(
+                b64, prompt, timeout,
+                apibase=MODELSCOPE_API_BASE, apikey=MODELSCOPE_API_KEY, model=MODELSCOPE_MODEL, proxy=None
+            )
+        else:
+            return f"Error: 未知backend '{backend}',可选: claude, openai, modelscope"
+    except (ImportError, AttributeError) as e:
+        # mykey.py is missing, or lacks the configured variable name.
+        return f"Error: 配置加载失败 - {type(e).__name__}: {e}"
+    except requests.exceptions.Timeout:
+        return f"Error: 请求超时 (>{timeout}s)"
+    except requests.exceptions.RequestException as e:
+        return f"Error: API请求失败 - {type(e).__name__}: {e}"
+    except (KeyError, IndexError, TypeError, ValueError) as e:
+        # Missing config key or a response body without the expected shape.
+        return f"Error: 响应解析失败 - {e}"
+
+
+# ===================== Internal implementation below =====================
+
+def _prepare_image(image_input, max_pixels=1440000):
+    """Load, downscale and JPEG-encode an image; return it as base64 text.
+
+    Args:
+        image_input: File path (str or Path) or a PIL Image.
+        max_pixels: Upper bound on width * height; larger images are shrunk
+            with the aspect ratio preserved.
+
+    Raises:
+        TypeError: If ``image_input`` is neither a path nor a PIL Image.
+    """
+    from PIL import Image  # local import: Pillow is only needed when called
+    if isinstance(image_input, Image.Image):
+        img = image_input
+    elif isinstance(image_input, (str, Path)):
+        # copy() reads the pixels, so the file handle is released right away.
+        with Image.open(image_input) as opened:
+            img = opened.copy()
+    else:
+        raise TypeError(f"image_input 必须是文件路径或PIL Image,实际: {type(image_input).__name__}")
+    w, h = img.size
+    if w * h > max_pixels:
+        scale = (max_pixels / (w * h)) ** 0.5
+        # Clamp to 1 so an extreme aspect ratio never yields a zero-sized side.
+        new_w, new_h = max(1, int(w * scale)), max(1, int(h * scale))
+        img = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
+        print(f" 📐 缩放: {w}×{h} → {new_w}×{new_h}")
+    if img.mode in ('RGBA', 'LA') or (img.mode == 'P' and 'transparency' in img.info):
+        # JPEG has no alpha channel: flatten onto white using the real alpha.
+        rgba = img.convert('RGBA')
+        rgb = Image.new('RGB', rgba.size, (255, 255, 255))
+        rgb.paste(rgba, mask=rgba.split()[-1])
+        img = rgb
+    elif img.mode not in ('RGB', 'L'):
+        # Opaque palette, CMYK, 16-bit, ... : normalise before JPEG encoding.
+        img = img.convert('RGB')
+    buf = BytesIO()
+    img.save(buf, format='JPEG', quality=80, optimize=True)
+    jpeg_bytes = buf.getvalue()
+    print(f" 📦 Base64: {len(jpeg_bytes)/1024:.1f}KB")
+    return base64.b64encode(jpeg_bytes).decode('utf-8')
+
+
+def _load_config():
+    """Import and return the ``mykey`` module that holds the API configs."""
+    import mykey
+    return mykey
+
+
+def _call_claude(b64, prompt, timeout, max_tokens=1024):
+    """POST the image and prompt to a Claude-style ``/v1/messages`` endpoint.
+
+    Reads 'apibase', 'apikey', 'model' and an optional 'proxy' from the mykey
+    config named by ``CLAUDE_CONFIG_KEY``. Returns the text of the first
+    content item; ``requests`` exceptions propagate to the caller.
+    """
+    cfg = getattr(_load_config(), CLAUDE_CONFIG_KEY)
+    proxy = cfg.get('proxy')
+    proxies = {'https': proxy, 'http': proxy} if proxy else None
+    resp = requests.post(
+        # rstrip('/') keeps a trailing slash in the config from doubling up.
+        cfg['apibase'].rstrip('/') + '/v1/messages',
+        json={'model': cfg['model'], 'max_tokens': max_tokens, 'messages': [{
+            'role': 'user',
+            'content': [
+                {'type': 'image', 'source': {'type': 'base64', 'media_type': 'image/jpeg', 'data': b64}},
+                {'type': 'text', 'text': prompt}
+            ]
+        }]},
+        headers={'x-api-key': cfg['apikey'], 'anthropic-version': '2023-06-01', 'content-type': 'application/json'},
+        proxies=proxies, timeout=timeout
+    )
+    resp.raise_for_status()
+    return resp.json()['content'][0]['text']
+
+
+def _call_openai_compat(b64, prompt, timeout, *, apibase, apikey, model, proxy=None):
+    """POST the image and prompt to an OpenAI-compatible chat endpoint.
+
+    The image travels as a ``data:image/jpeg;base64,...`` URL. Returns the
+    first choice's message content; ``requests`` exceptions propagate.
+    """
+    proxies = {'https': proxy, 'http': proxy} if proxy else None
+    resp = requests.post(
+        apibase.rstrip('/') + '/v1/chat/completions',
+        json={'model': model, 'messages': [{
+            'role': 'user',
+            'content': [
+                {'type': 'text', 'text': prompt},
+                {'type': 'image_url', 'image_url': {'url': f'data:image/jpeg;base64,{b64}'}}
+            ]
+        }]},
+        headers={'Authorization': f"Bearer {apikey}", 'Content-Type': 'application/json'},
+        proxies=proxies, timeout=timeout
+    )
+    resp.raise_for_status()
+    return resp.json()['choices'][0]['message']['content']
+
+
+if __name__ == '__main__':
+    # Smoke test: python vision_api.py <image_path> [prompt]
+    if len(sys.argv) < 2:
+        sys.exit(f"Usage: python {os.path.basename(__file__)} <image_path> [prompt]")
+    print(ask_vision(sys.argv[1], *sys.argv[2:3]))
diff --git a/memory/vision_sop.md b/memory/vision_sop.md
index 59de51c..2619a34 100644
--- a/memory/vision_sop.md
+++ b/memory/vision_sop.md
@@ -3,41 +3,21 @@
 ## ⚠️ 前置规则(必须遵守)
 
 1. **先枚举窗口**:调用 vision 前必须先用 `pygetwindow` 枚举窗口标题,确认目标窗口存在且已激活到前台。窗口不存在就不要截图。
-2. **🚫 禁止全屏截图**:必须先 `win32gui.GetWindowRect` 获取目标窗口坐标,再 `ImageGrab.grab(bbox=...)` 截窗口区域。能截局部(如标题栏)就不截整窗口,能截窗口就绝不全屏。全屏截图在任何场景下都不允许。
+2. **🚫 禁止全屏截图**:必须先利用ljqCtrl截取窗口区域。能截局部(如标题栏)就不截整窗口,能截窗口就绝不全屏。全屏截图在任何场景下都不允许。
 3. **能不用 vision 就不用**:如果窗口标题/本地 OCR(`ocr_utils.py`)能获取所需信息,就不要调用 vision API,省 token 且更可靠。Vision 是最后手段。
 
 ## 快速用法
 
-### 函数签名
-```python
-ask_vision(
-    image_input,
-    prompt: str | None = None,
-    timeout: int = 60,
-    max_pixels: int = 1_440_000,
-) -> str
-```
-
-### 示例
 ```python
 from vision_api import ask_vision
-result = ask_vision("image.png", prompt="描述图片内容") # 路径或PIL Image均可
+result = ask_vision(image, prompt="描述图片内容", backend="claude", timeout=60, max_pixels=1_440_000)
+# image: 文件路径(str/Path) 或 PIL Image
+# backend: 'claude'(默认) | 'openai' | 'modelscope'
+# 返回 str:成功为模型回复,失败为 'Error: ...'
 ```
-返回 `str`:成功为模型回复,失败为 `Error: ...`。
 
-## 核心参数
-- `image_input`: 文件路径(str/Path) 或 PIL Image 对象
-- `prompt`: 提示词(默认:详细描述这张图片的内容)
-- `max_pixels`: 最大像素数(默认1440000,超则自动缩放)
-- `timeout`: 超时秒数(默认60)
+## 如果没有 `vision_api.py`,初次构建vision能力
 
-## 故障排除
-| 问题 | 解决方案 |
-|------|--------|
-| 导入失败 | 可检查 `mykey.py` 文件是否存在(仅检查存在性,不读取内容) |
-| 超时 | 提高 timeout 或降低 max_pixels |
-| 格式错误 | 确保使用 PIL 支持的格式(PNG/JPG/GIF等) |
-
-## 关键风险与坑点 (L3 Caveats)
-- **无重试机制**: `vision_api.py` 内部未实现 API 错误重试(如 503、超时)。在自动化流程中使用时,**必须在上层代码手动实现重试逻辑**(建议指数退避),否则偶发网络波动会导致任务直接崩溃中断。
-- **API Config**: 失效时直接改 `vision_api.py` 中的 `cfg = mk.claude_configXXX`。
\ No newline at end of file
+1. 复制 `memory/vision_api.template.py` → `memory/vision_api.py`
+2. 只改头部"用户配置区":去 `mykey.py` 里扫描变量名(⚠️ 只看名字,禁止输出 apikey 值),尝试找能用配置名填入 `CLAUDE_CONFIG_KEY` / `OPENAI_CONFIG_KEY`,`DEFAULT_BACKEND` 选后端,并测试
+3. 保底:没有可用 config 时去 `https://modelscope.cn/my/myaccesstoken` 申请 token 填入 `MODELSCOPE_API_KEY`