# Reserved language tag; the RapidOCR backend currently ignores it.
_LANG = 'zh-Hans-CN'
# Shared RapidOCR instance, created on first use by _get_rapid().
_rapid_engine = None

# Whitespace sitting between two CJK ideographs (U+4E00..U+9FFF).
_CJK_GAP = re.compile(r'(?<=[\u4e00-\u9fff])\s+(?=[\u4e00-\u9fff])')


def _get_rapid():
    """Return the shared RapidOCR engine, creating it on the first call."""
    global _rapid_engine
    if _rapid_engine is None:
        # The import is deferred until an OCR call actually needs the engine.
        from rapidocr_onnxruntime import RapidOCR
        _rapid_engine = RapidOCR()
    return _rapid_engine


def _preprocess(img, scale=3, contrast=3.0):
    """Boost contrast, then upscale ``img`` by an integer factor.

    :param img: PIL Image to enhance
    :param scale: multiplier applied to both width and height
    :param contrast: factor passed to ``ImageEnhance.Contrast.enhance``
    :return: the enhanced, enlarged PIL Image
    """
    img = ImageEnhance.Contrast(img).enhance(contrast)
    return img.resize((img.width * scale, img.height * scale))


def _strip_cjk_spaces(t):
    """Remove whitespace that is surrounded by CJK ideographs on both sides."""
    return _CJK_GAP.sub('', t)


def _ocr_rapid(img):
    """Run RapidOCR on ``img`` and normalise the result.

    :param img: anything ``numpy.array`` can convert (normally a PIL Image)
    :return: dict with ``text`` (cleaned lines joined by newlines), ``lines``
        (cleaned text of each detected line) and ``details`` (one dict per
        line holding ``bbox``, the raw ``text`` and a float ``conf``)
    """
    import numpy as np

    result, _elapse = _get_rapid()(np.array(img))
    # RapidOCR returns None rather than an empty list when nothing is found.
    if not result:
        return {'text': '', 'lines': [], 'details': []}

    # The engine may report the confidence as a string, hence float().
    details = [
        {'bbox': item[0], 'text': item[1], 'conf': float(item[2])}
        for item in result
    ]
    lines = [_strip_cjk_spaces(item[1]) for item in result]
    # Join the already-cleaned lines: cleaning the joined text instead would
    # also swallow the newline between two adjacent CJK lines and merge them.
    return {'text': '\n'.join(lines), 'lines': lines, 'details': details}


def ocr_image(image_input, lang=_LANG, enhance=False, engine=None):
    """OCR a PIL Image or an image file.

    :param image_input: PIL Image object, or a file path (str / os.PathLike)
    :param lang: reserved, currently unused
    :param enhance: apply the contrast + upscale preprocessing first; it
        hurts recognition of already-sharp text, so it is off by default
    :param engine: reserved; only ``None`` or ``'rapid'`` is accepted
    :return: dict {'text': full text, 'lines': [line text],
        'details': [bbox + conf]}
    :raises ValueError: if ``engine`` names an unsupported backend
    """
    import os

    # Reject a bad engine before opening or preprocessing anything.
    if engine not in (None, 'rapid'):
        raise ValueError("Only rapid OCR is supported")
    if isinstance(image_input, (str, os.PathLike)):
        image_input = Image.open(image_input)
    if enhance:
        image_input = _preprocess(image_input)
    return _ocr_rapid(image_input)


def ocr_screen(bbox=None, lang=_LANG, enhance=False, engine=None):
    """Grab a screen region and OCR it.

    Screen grabs come back black once a remote-desktop session is
    disconnected; use ``ocr_window`` in that situation.

    :param bbox: (x1, y1, x2, y2) in pixels, ``None`` for the full screen
    :param lang: reserved, currently unused
    :param enhance: forwarded to ``ocr_image``
    :param engine: forwarded to ``ocr_image``
    :return: dict {'text': full text, 'lines': [line text],
        'details': [bbox + conf]}
    """
    return ocr_image(ImageGrab.grab(bbox=bbox), lang, enhance, engine)


def ocr_window(hwnd, lang=_LANG, enhance=False, engine=None):
    """Capture a window through the PrintWindow API and OCR it.

    Unlike a screen grab this keeps working after a remote-desktop session
    has been disconnected.

    :param hwnd: window handle (int)
    :param lang: reserved, currently unused
    :param enhance: forwarded to ``ocr_image``
    :param engine: forwarded to ``ocr_image``
    :return: dict {'text': full text, 'lines': [line text],
        'details': [bbox + conf]}
    """
    import contextlib
    import win32gui, win32ui
    from ctypes import windll

    left, top, right, bottom = win32gui.GetWindowRect(hwnd)
    width, height = right - left, bottom - top

    # Every GDI object is registered for release as soon as it exists, so a
    # failure half-way through can no longer leak handles. ExitStack unwinds
    # in reverse order: bitmap, memory DC, wrapped DC, window DC.
    with contextlib.ExitStack() as cleanup:
        hwnd_dc = win32gui.GetWindowDC(hwnd)
        cleanup.callback(win32gui.ReleaseDC, hwnd, hwnd_dc)
        mfc_dc = win32ui.CreateDCFromHandle(hwnd_dc)
        cleanup.callback(mfc_dc.DeleteDC)
        save_dc = mfc_dc.CreateCompatibleDC()
        cleanup.callback(save_dc.DeleteDC)
        bitmap = win32ui.CreateBitmap()
        bitmap.CreateCompatibleBitmap(mfc_dc, width, height)
        cleanup.callback(lambda: win32gui.DeleteObject(bitmap.GetHandle()))
        save_dc.SelectObject(bitmap)
        # Flags 3 = PW_CLIENTONLY (0x1) | PW_RENDERFULLCONTENT (0x2).
        windll.user32.PrintWindow(hwnd, save_dc.GetSafeHdc(), 3)
        info = bitmap.GetInfo()
        pixels = bitmap.GetBitmapBits(True)

    # The bitmap bytes are laid out as BGRX; decode them into an RGB image.
    img = Image.frombuffer(
        'RGB', (info['bmWidth'], info['bmHeight']), pixels, 'raw', 'BGRX', 0, 1
    )
    return ocr_image(img, lang, enhance, engine)


if __name__ == "__main__":
    # Smoke test: OCR the top-left 400x100 pixels of the screen.
    r = ocr_screen((0, 0, 400, 100))
    print(f"识别结果: {r['text']}")
    for line in r['lines']:
        print(f"  行: {line}")
    if 'details' in r:
        for d in r['details']:
            print(f"  [{d['conf']:.3f}] {d['text']}")
# Default YOLO weights: <parent of this file's directory>/temp/weights/icon_detect/model.pt
DEFAULT_MODEL = str(Path(__file__).resolve().parent.parent / 'temp' / 'weights' / 'icon_detect' / 'model.pt')

# Optional dependency: OCR is skipped when rapidocr is not installed.
try:
    from rapidocr_onnxruntime import RapidOCR
    ocr_engine = RapidOCR()
    HAS_OCR = True
except ImportError:
    HAS_OCR = False
    print("警告: rapidocr未安装,跳过OCR功能")


def detect_ui_elements(image_path, model_path=None, conf_threshold=0.25):
    """Detect UI elements with the YOLO model and return their boxes.

    :param image_path: image handed to the YOLO model
    :param model_path: weights file; falls back to ``DEFAULT_MODEL``
    :param conf_threshold: confidence threshold passed to the model
    :return: list of dicts with ``bbox`` ([x1, y1, x2, y2] as ints),
        ``confidence`` (float) and ``class`` (int)
    """
    model = YOLO(model_path or DEFAULT_MODEL)
    results = model(image_path, conf=conf_threshold, verbose=False)

    detections = []
    for result in results:
        for box in result.boxes:
            x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
            detections.append({
                'bbox': [int(x1), int(y1), int(x2), int(y2)],
                'confidence': float(box.conf[0]),
                'class': int(box.cls[0]),
            })
    return detections


def ocr_text(image_path):
    """Recognise text in an image with RapidOCR.

    :param image_path: image handed to the OCR engine
    :return: list of dicts with ``text``, ``bbox`` and a float
        ``confidence``; empty when OCR is unavailable or nothing is found
    """
    if not HAS_OCR:
        return []

    result, _ = ocr_engine(image_path)
    # The engine yields None, not an empty list, when no text is found.
    if not result:
        return []

    texts = []
    for bbox, text, conf in result:
        texts.append({
            'text': text,
            'bbox': bbox,
            # The engine may report the score as a string; callers format it
            # with ':.3f', which requires a real number.
            'confidence': float(conf),
        })
    return texts


def visualize(image_path, detections, ocr_results=None, output_path=None):
    """Draw detection and OCR boxes on the image.

    :param image_path: image to annotate
    :param detections: dicts as returned by ``detect_ui_elements``
    :param ocr_results: optional dicts as returned by ``ocr_text``
    :param output_path: when given, the annotated image is saved there
    :return: the annotated PIL Image
    """
    img = Image.open(image_path)
    draw = ImageDraw.Draw(img)

    # UI element boxes in red, labelled with their confidence.
    for det in detections:
        x1, y1, x2, y2 = det['bbox']
        draw.rectangle([x1, y1, x2, y2], outline='red', width=2)
        draw.text((x1, y1 - 10), f"{det['confidence']:.2f}", fill='red')

    # OCR text outlines in blue, labelled with the first 10 characters.
    for ocr in ocr_results or []:
        points = [(pt[0], pt[1]) for pt in ocr['bbox'][:4]]
        draw.polygon(points, outline='blue')
        draw.text((points[0][0], points[0][1] - 10), ocr['text'][:10], fill='blue')

    if output_path:
        img.save(output_path)
    return img


def _json_path_for(output_path):
    """Return ``output_path`` with its final extension replaced by ``.json``."""
    import os

    # Swapping only the trailing extension keeps the JSON from overwriting
    # the image when the output is not a .png file.
    return os.path.splitext(output_path)[0] + '.json'


def main():
    """Command-line entry point: detect, OCR, draw, and dump JSON.

    Arguments: image path (required), model path (optional, defaults to
    ``DEFAULT_MODEL``), output image path (optional, defaults to
    ``output.png``). The JSON result is written next to the output image.
    """
    import json

    if len(sys.argv) < 2:
        print("用法: python ui_detect.py <图片路径> [模型路径] [输出路径]")
        print("示例: python ui_detect.py screenshot.png weights/icon_detect/model.pt output.png")
        sys.exit(1)

    image_path = sys.argv[1]
    model_path = sys.argv[2] if len(sys.argv) > 2 else DEFAULT_MODEL
    output_path = sys.argv[3] if len(sys.argv) > 3 else "output.png"

    print(f"检测图片: {image_path}")
    print(f"使用模型: {model_path}")

    # Step 1: YOLO detection of UI elements.
    print("\n[1/2] YOLO检测UI元素...")
    detections = detect_ui_elements(image_path, model_path)
    print(f"检测到 {len(detections)} 个UI元素")
    for i, det in enumerate(detections, 1):
        print(f"  {i}. bbox={det['bbox']}, conf={det['confidence']:.3f}")

    # Step 2: OCR, only when the optional engine is available.
    ocr_results = None
    if HAS_OCR:
        print("\n[2/2] OCR识别文本...")
        ocr_results = ocr_text(image_path)
        print(f"识别到 {len(ocr_results)} 个文本区域")
        for i, ocr in enumerate(ocr_results, 1):
            print(f"  {i}. text='{ocr['text']}', conf={ocr['confidence']:.3f}")

    # Annotated image.
    print(f"\n保存结果到: {output_path}")
    visualize(image_path, detections, ocr_results, output_path)

    # Machine-readable copy of the same results.
    result = {
        'ui_elements': detections,
        'ocr_texts': ocr_results or [],
    }
    json_path = _json_path_for(output_path)
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    print(f"JSON结果: {json_path}")


if __name__ == "__main__":
    main()
**能不用 vision 就不用**:如果窗口标题/本地 OCR(`ocr_utils.py`)能获取所需信息,就不要调用 vision API,省 token 且更可靠。Vision 是最后手段。 + +## 快速用法 + +### 函数签名 +```python +ask_vision( + image_input, + prompt: str | None = None, + timeout: int = 60, + max_pixels: int = 1_440_000, +) -> str +``` + +### 示例 +```python +from vision_api import ask_vision +result = ask_vision("image.png", prompt="描述图片内容") # 路径或PIL Image均可 +``` +返回 `str`:成功为模型回复,失败为 `Error: ...`。 + +## 核心参数 +- `image_input`: 文件路径(str/Path) 或 PIL Image 对象 +- `prompt`: 提示词(默认:详细描述这张图片的内容) +- `max_pixels`: 最大像素数(默认1440000,超则自动缩放) +- `timeout`: 超时秒数(默认60) + +## 故障排除 +| 问题 | 解决方案 | +|------|--------| +| 导入失败 | 可检查 `../../mykey.py` 文件是否存在(仅检查存在性,不读取内容) | +| 超时 | 提高 timeout 或降低 max_pixels | +| 格式错误 | 确保使用 PIL 支持的格式(PNG/JPG/GIF等) | + +## 关键风险与坑点 (L3 Caveats) +- **无重试机制**: `vision_api.py` 内部未实现 API 错误重试(如 503、超时)。在自动化流程中使用时,**必须在上层代码手动实现重试逻辑**(建议指数退避),否则偶发网络波动会导致任务直接崩溃中断。 +- **API Config**: 当前使用 `claude_config141`(ncode.vkm2.com, 已验证)。备选可用: `native_claude_config2/84/5535`。失效时直接改 `vision_api.py` 中的 `cfg = mk.claude_configXXX`。 + +--- +更新: 2025-07-18 | 修复oai_config导入+返回值统一str +更新: 2026-02-18 | 默认后端改为Claude原生API | SOP精简(删废话/水段/合并示例) +更新: 2026-07 | 修复config(原claude_config8不存在)→改为claude_config141