From d25cfa5a2a2e58b2578ffb4b9671b6164223bebd Mon Sep 17 00:00:00 2001
From: Liang Jiaqing <l.j.q.light@gmail.com>
Date: Mon, 20 Apr 2026 23:22:36 +0800
Subject: [PATCH] =?UTF-8?q?refactor:=20vision=5Fsop=E7=B2=BE=E7=AE=80?=
 =?UTF-8?q?=E6=94=B9=E5=86=99=20+=20=E6=96=B0=E5=A2=9Evision=5Fapi.templat?=
 =?UTF-8?q?e.py?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore                    |   1 +
 memory/vision_api.template.py | 113 ++++++++++++++++++++++++++++++++++
 memory/vision_sop.md          |  38 +++---------
 3 files changed, 123 insertions(+), 29 deletions(-)
 create mode 100644 memory/vision_api.template.py

diff --git a/.gitignore b/.gitignore
index b0d27dd..3f2dbe7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -80,6 +80,7 @@ memory/L4_raw_sessions/*
 !memory/ocr_utils.py
 !memory/vision_sop.md
 !memory/ui_detect.py
+!memory/vision_api.template.py
 
 # Memory management
 !memory/memory_cleanup_sop.md
diff --git a/memory/vision_api.template.py b/memory/vision_api.template.py
new file mode 100644
index 0000000..58286bd
--- /dev/null
+++ b/memory/vision_api.template.py
@@ -0,0 +1,113 @@
+import base64, requests, sys, os
+from io import BytesIO
+from pathlib import Path
+
+# ============ User configuration (after copying from the template, edit only this section) ============
+CLAUDE_CONFIG_KEY = 'claude_config141'   # name of the Claude config variable in mykey.py
+OPENAI_CONFIG_KEY = 'oai_config1'        # name of the OpenAI config variable in mykey.py
+MODELSCOPE_API_KEY = ''                  # paste your ModelScope token here
+DEFAULT_BACKEND = 'claude'               # default backend: 'claude' / 'openai' / 'modelscope'
+# =================================================================
+
+MODELSCOPE_API_BASE = 'https://api-inference.modelscope.cn'  # base URL; '/v1/chat/completions' is appended at call time
+MODELSCOPE_MODEL = 'Qwen/Qwen3-VL-235B-A22B-Instruct'  # model name sent when backend == 'modelscope'
+
+_DIR = os.path.dirname(os.path.abspath(__file__))  # directory containing this file
+for _p in [os.path.join(_DIR, '..'), os.path.join(_DIR, '../..')]:  # parent and grandparent directories
+    if _p not in sys.path: sys.path.insert(0, _p)  # presumably so `import mykey` resolves from there - TODO confirm
+
+def ask_vision(image_input, prompt="详细描述这张图片的内容", timeout=60, max_pixels=1440000, backend=DEFAULT_BACKEND):
+    """Ask a vision model about an image; return its reply, or an 'Error: ...' string on failure.
+
+    image_input: file path (str/Path) or PIL Image; backend: 'claude' | 'openai' | 'modelscope'."""
+    try:
+        b64 = _prepare_image(image_input, max_pixels)
+    except Exception as e:
+        return f"Error: 图片处理失败 - {type(e).__name__}: {e}"
+    try:
+        if backend == 'claude':
+            return _call_claude(b64, prompt, timeout)
+        if backend == 'openai':
+            cfg = getattr(_load_config(), OPENAI_CONFIG_KEY)
+            return _call_openai_compat(b64, prompt, timeout, apibase=cfg['apibase'],
+                                       apikey=cfg['apikey'], model=cfg['model'], proxy=cfg.get('proxy'))
+        if backend == 'modelscope':
+            return _call_openai_compat(b64, prompt, timeout, apibase=MODELSCOPE_API_BASE,
+                                       apikey=MODELSCOPE_API_KEY, model=MODELSCOPE_MODEL, proxy=None)
+        return f"Error: 未知backend '{backend}'，可选: claude, openai, modelscope"
+    except requests.exceptions.Timeout:
+        return f"Error: 请求超时 (>{timeout}s)"
+    except requests.exceptions.RequestException as e:
+        return f"Error: API请求失败 - {type(e).__name__}: {e}"
+    except (ImportError, AttributeError) as e:  # mykey.py not importable, or the configured variable name is absent
+        return f"Error: 配置加载失败 - {type(e).__name__}: {e}"
+    except (KeyError, IndexError, TypeError, ValueError) as e:  # missing config key or unexpected response shape
+        return f"Error: 响应解析失败 - {e}"
+
+# ===================== 以下为内部实现 =====================
+
+def _prepare_image(image_input, max_pixels=1440000):
+    """Load a path or PIL Image, downscale to at most max_pixels, flatten onto white, return base64 JPEG text."""
+    from PIL import Image
+    if isinstance(image_input, Image.Image):
+        img = image_input
+    elif isinstance(image_input, (str, Path)):
+        img = Image.open(image_input)
+    else:
+        raise TypeError(f"image_input 必须是文件路径或PIL Image，实际: {type(image_input).__name__}")
+    w, h = img.size
+    if w * h > max_pixels:
+        scale = (max_pixels / (w * h)) ** 0.5
+        new_w, new_h = max(1, int(w * scale)), max(1, int(h * scale))  # extreme aspect ratios must not round a side to 0
+        img = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
+        print(f"  📐 缩放: {w}×{h} → {new_w}×{new_h}")
+    if img.mode in ('RGBA', 'LA', 'PA', 'P'):  # may carry transparency: normalise to RGBA, then composite onto white
+        rgba, img = img.convert('RGBA'), Image.new('RGB', img.size, (255, 255, 255))
+        img.paste(rgba, mask=rgba.split()[-1])  # last band of an RGBA image is its alpha channel
+    elif img.mode not in ('RGB', 'L'):
+        img = img.convert('RGB')  # JPEG cannot store modes such as I / F; CMYK is normalised to RGB as well
+    buf = BytesIO()
+    img.save(buf, format='JPEG', quality=80, optimize=True)
+    print(f"  📦 Base64: {len(buf.getvalue())/1024:.1f}KB")
+    return base64.b64encode(buf.getvalue()).decode('utf-8')
+
+def _load_config():
+    import mykey  # imported inside the function, so mykey is only loaded when a caller actually needs a config
+    return mykey
+
+def _call_claude(b64, prompt, timeout, max_tokens=1024):
+    """POST one image+prompt user message to the Anthropic Messages API; return the first content block's text."""
+    cfg = getattr(_load_config(), CLAUDE_CONFIG_KEY)
+    proxy = cfg.get('proxy')  # optional config entry, honoured the same way as on the OpenAI-compatible path
+    resp = requests.post(
+        cfg['apibase'].rstrip('/') + '/v1/messages',  # tolerate a trailing slash, as _call_openai_compat does
+        json={'model': cfg['model'], 'max_tokens': max_tokens, 'messages': [{
+            'role': 'user', 'content': [
+                {'type': 'image', 'source': {'type': 'base64', 'media_type': 'image/jpeg', 'data': b64}},
+                {'type': 'text', 'text': prompt}
+            ]
+        }]},
+        headers={'x-api-key': cfg['apikey'], 'anthropic-version': '2023-06-01', 'content-type': 'application/json'},
+        proxies={'https': proxy, 'http': proxy} if proxy else None, timeout=timeout
+    )
+    resp.raise_for_status()
+    return resp.json()['content'][0]['text']
+
+def _call_openai_compat(b64, prompt, timeout, *, apibase, apikey, model, proxy=None):
+    """POST one image+prompt user message to an OpenAI-compatible chat-completions endpoint; return the reply text.
+
+    Propagates whatever ``requests`` raises, including the HTTPError from ``raise_for_status``."""
+    url = apibase.rstrip('/') + '/v1/chat/completions'
+    headers = {'Authorization': f"Bearer {apikey}", 'Content-Type': 'application/json'}
+    # One proxy URL is used for both schemes; a falsy proxy means no explicit proxies are passed.
+    proxies = {'https': proxy, 'http': proxy} if proxy else None
+    parts = [
+        {'type': 'text', 'text': prompt},
+        {'type': 'image_url', 'image_url': {'url': f'data:image/jpeg;base64,{b64}'}},
+    ]
+    payload = {'model': model, 'messages': [{'role': 'user', 'content': parts}]}
+    resp = requests.post(url, json=payload, headers=headers, proxies=proxies, timeout=timeout)
+    resp.raise_for_status()
+    return resp.json()['choices'][0]['message']['content']
+
+if __name__ == '__main__': print(ask_vision(sys.argv[1]) if len(sys.argv) > 1 else 'Usage: python vision_api.py <image_path>')  # minimal CLI smoke test
diff --git a/memory/vision_sop.md b/memory/vision_sop.md
index 59de51c..2619a34 100644
--- a/memory/vision_sop.md
+++ b/memory/vision_sop.md
@@ -3,41 +3,21 @@
 ## ⚠️ 前置规则（必须遵守）
 
 1. **先枚举窗口**：调用 vision 前必须先用 `pygetwindow` 枚举窗口标题，确认目标窗口存在且已激活到前台。窗口不存在就不要截图。
-2. **🚫 禁止全屏截图**：必须先 `win32gui.GetWindowRect` 获取目标窗口坐标，再 `ImageGrab.grab(bbox=...)` 截窗口区域。能截局部（如标题栏）就不截整窗口，能截窗口就绝不全屏。全屏截图在任何场景下都不允许。
+2. **🚫 禁止全屏截图**：必须利用 `ljqCtrl` 只截取目标窗口区域。能截局部（如标题栏）就不截整窗口，能截窗口就绝不全屏。全屏截图在任何场景下都不允许。
 3. **能不用 vision 就不用**：如果窗口标题/本地 OCR（`ocr_utils.py`）能获取所需信息，就不要调用 vision API，省 token 且更可靠。Vision 是最后手段。
 
 ## 快速用法
 
-### 函数签名
-```python
-ask_vision(
-    image_input,
-    prompt: str | None = None,
-    timeout: int = 60,
-    max_pixels: int = 1_440_000,
-) -> str
-```
-
-### 示例
 ```python
 from vision_api import ask_vision
-result = ask_vision("image.png", prompt="描述图片内容")  # 路径或PIL Image均可
+result = ask_vision(image, prompt="描述图片内容", backend="claude", timeout=60, max_pixels=1_440_000)
+# image: 文件路径(str/Path) 或 PIL Image
+# backend: 'claude' | 'openai' | 'modelscope'；省略时取 vision_api.py 中的 DEFAULT_BACKEND（模板默认 'claude'）
+# 返回 str：成功为模型回复，失败为 'Error: ...'
 ```
-返回 `str`：成功为模型回复，失败为 `Error: ...`。
 
-## 核心参数
-- `image_input`: 文件路径(str/Path) 或 PIL Image 对象
-- `prompt`: 提示词（默认：详细描述这张图片的内容）
-- `max_pixels`: 最大像素数（默认1440000，超则自动缩放）
-- `timeout`: 超时秒数（默认60）
+## 如果没有 `vision_api.py`，初次构建vision能力
 
-## 故障排除
-| 问题 | 解决方案 |
-|------|--------|
-| 导入失败 | 可检查 `mykey.py` 文件是否存在（仅检查存在性，不读取内容） |
-| 超时 | 提高 timeout 或降低 max_pixels |
-| 格式错误 | 确保使用 PIL 支持的格式（PNG/JPG/GIF等） |
-
-## 关键风险与坑点 (L3 Caveats)
-- **无重试机制**: `vision_api.py` 内部未实现 API 错误重试（如 503、超时）。在自动化流程中使用时，**必须在上层代码手动实现重试逻辑**（建议指数退避），否则偶发网络波动会导致任务直接崩溃中断。
-- **API Config**: 失效时直接改 `vision_api.py` 中的 `cfg = mk.claude_configXXX`。
\ No newline at end of file
+1. 复制 `memory/vision_api.template.py` → `memory/vision_api.py`
+2. 只改头部"用户配置区"：去 `mykey.py` 里扫描变量名（⚠️ 只看名字，禁止输出 apikey 值），找出可用的配置变量名填入 `CLAUDE_CONFIG_KEY` / `OPENAI_CONFIG_KEY`，用 `DEFAULT_BACKEND` 选择默认后端，然后实际调用一次 `ask_vision` 测试
+3. 保底：没有可用 config 时去 `https://modelscope.cn/my/myaccesstoken` 申请 token 填入 `MODELSCOPE_API_KEY`