feat: publish ocr_utils, vision_sop, ui_detect to public repo

This commit is contained in:
Liang Jiaqing
2026-04-15 00:00:14 +08:00
parent d0a9b5de08
commit 97560a4fba
4 changed files with 289 additions and 0 deletions

101
memory/ocr_utils.py Normal file
View File

@@ -0,0 +1,101 @@
"""
本地 OCR 工具
- OCR引擎: rapidocr-onnxruntime (~1s/次, 中英文准确率高, 带bbox)
- 坑(rapid): result[i][2] conf 是 str 不是 float
- 坑(rapid): 无文字时 result 返回 None 而非空列表
- 坑: enhance 放大+高对比度处理,对清晰文字有害,默认关闭
- 坑(远程桌面): ImageGrab/mss 在 RDP 断开后截图全黑,用 ocr_window(hwnd) 代替
"""
import re
from PIL import ImageGrab, Image, ImageEnhance
_LANG = 'zh-Hans-CN'
_rapid_engine = None
def _get_rapid():
    """Return the module-wide RapidOCR engine, creating it on first use.

    The ``rapidocr_onnxruntime`` import is deferred to the first call, so
    importing this module does not import the OCR package. The instance is
    cached in the module global ``_rapid_engine`` and reused afterwards.
    """
    global _rapid_engine
    if _rapid_engine is not None:
        return _rapid_engine
    from rapidocr_onnxruntime import RapidOCR
    _rapid_engine = RapidOCR()
    return _rapid_engine
def _preprocess(img, scale=3, contrast=3.0):
    """Boost contrast, then enlarge the image.

    :param img: PIL Image to process
    :param scale: multiplier applied to both width and height
    :param contrast: factor handed to ``ImageEnhance.Contrast(...).enhance``
    :return: the contrast-enhanced, resized image
    """
    boosted = ImageEnhance.Contrast(img).enhance(contrast)
    target_size = (boosted.width * scale, boosted.height * scale)
    return boosted.resize(target_size)
def _strip_cjk_spaces(t):
return re.sub(r'(?<=[\u4e00-\u9fff])\s+(?=[\u4e00-\u9fff])', '', t)
def _ocr_rapid(img):
    """Run the shared RapidOCR engine on an image and normalise its output.

    :param img: image object convertible with ``numpy.array`` (a PIL Image here)
    :return: dict with
        'text'    - all recognised lines joined by newlines, then passed
                    through ``_strip_cjk_spaces``
        'lines'   - each recognised line passed through ``_strip_cjk_spaces``
        'details' - one dict per hit: {'bbox', 'text' (raw), 'conf' (float)}
        All three are empty when the engine reports no result.
    """
    import numpy as np

    recognise = _get_rapid()
    result, _elapsed = recognise(np.array(img))
    if not result:
        # The engine returns None rather than an empty list when nothing is found.
        return {'text': '', 'lines': [], 'details': []}

    raw_lines = [entry[1] for entry in result]
    details = []
    for entry in result:
        # The confidence may arrive as a string, hence the float() conversion.
        details.append({'bbox': entry[0], 'text': entry[1], 'conf': float(entry[2])})
    full_text = _strip_cjk_spaces('\n'.join(raw_lines))
    cleaned_lines = [_strip_cjk_spaces(raw) for raw in raw_lines]
    return {'text': full_text, 'lines': cleaned_lines, 'details': details}
def ocr_image(image_input, lang=_LANG, enhance=False, engine=None):
    """Run OCR on a PIL image or on an image file.

    :param image_input: PIL Image object, or a file path (str or os.PathLike)
    :param lang: reserved, currently unused
    :param enhance: when true, run ``_preprocess`` (contrast boost + upscale)
        before recognition
    :param engine: reserved; only 'rapid' or None is accepted
    :return: dict {'text': full text, 'lines': [line texts],
        'details': [bbox + conf]}
    :raises ValueError: if ``engine`` is anything other than None or 'rapid'
    """
    import os

    # Reject an unsupported engine up front, before opening a file or
    # spending time on preprocessing.
    if engine not in (None, 'rapid'):
        raise ValueError("Only rapid OCR is supported")
    # Accept pathlib.Path (any os.PathLike) in addition to plain strings.
    if isinstance(image_input, (str, os.PathLike)):
        image_input = Image.open(image_input)
    if enhance:
        image_input = _preprocess(image_input)
    return _ocr_rapid(image_input)
def ocr_screen(bbox=None, lang=_LANG, enhance=False, engine=None):
    """Capture a screen region with ``ImageGrab.grab`` and OCR it.

    :param bbox: (x1, y1, x2, y2) in pixels; None means the whole screen
    :param lang: forwarded to ``ocr_image``
    :param enhance: forwarded to ``ocr_image``
    :param engine: forwarded to ``ocr_image``
    :return: dict {'text': full text, 'lines': [line texts],
        'details': [bbox + conf]}
    """
    screenshot = ImageGrab.grab(bbox=bbox)
    result = ocr_image(screenshot, lang, enhance, engine)
    return result
def ocr_window(hwnd, lang=_LANG, enhance=False, engine=None):
    """Capture a window through the PrintWindow API and OCR it.

    Windows only (requires pywin32). Intended for cases where a regular
    screen grab is unusable, e.g. a disconnected remote-desktop session.

    :param hwnd: window handle (int)
    :param lang: forwarded to ``ocr_image``
    :param enhance: forwarded to ``ocr_image``
    :param engine: forwarded to ``ocr_image``
    :return: dict {'text': full text, 'lines': [line texts],
        'details': [bbox + conf]}
    """
    from contextlib import ExitStack
    from ctypes import windll

    import win32gui
    import win32ui

    left, top, right, bottom = win32gui.GetWindowRect(hwnd)
    width, height = right - left, bottom - top

    # Each GDI resource registers its release as soon as it is acquired, so
    # a failure part-way through no longer leaks handles. ExitStack runs the
    # callbacks in reverse order: bitmap, memory DC, MFC DC, window DC.
    with ExitStack() as cleanup:
        hwnd_dc = win32gui.GetWindowDC(hwnd)
        cleanup.callback(win32gui.ReleaseDC, hwnd, hwnd_dc)

        mfc_dc = win32ui.CreateDCFromHandle(hwnd_dc)
        cleanup.callback(mfc_dc.DeleteDC)

        save_dc = mfc_dc.CreateCompatibleDC()
        cleanup.callback(save_dc.DeleteDC)

        bitmap = win32ui.CreateBitmap()
        bitmap.CreateCompatibleBitmap(mfc_dc, width, height)
        cleanup.callback(lambda: win32gui.DeleteObject(bitmap.GetHandle()))
        save_dc.SelectObject(bitmap)

        # NOTE(review): flag 3 is presumably PW_CLIENTONLY (1) |
        # PW_RENDERFULLCONTENT (2) - confirm against WinUser.h.
        # The return value is ignored, as before.
        windll.user32.PrintWindow(hwnd, save_dc.GetSafeHdc(), 3)

        info = bitmap.GetInfo()
        pixels = bitmap.GetBitmapBits(True)
        # Bitmap bits are decoded with the 'BGRX' raw mode into an RGB image.
        img = Image.frombuffer(
            'RGB', (info['bmWidth'], info['bmHeight']), pixels, 'raw', 'BGRX', 0, 1
        )
    return ocr_image(img, lang, enhance, engine)
if __name__ == "__main__":
    # Manual smoke test: OCR a 400x100 region at the top-left of the screen
    # and print the full text, each line, and each hit with its confidence.
    demo = ocr_screen((0, 0, 400, 100))
    print(f"识别结果: {demo['text']}")
    for text_line in demo['lines']:
        print(f" 行: {text_line}")
    if 'details' in demo:
        for hit in demo['details']:
            print(f" [{hit['conf']:.3f}] {hit['text']}")

135
memory/ui_detect.py Normal file
View File

@@ -0,0 +1,135 @@
#!/usr/bin/env python3
"""
极简UI元素检测脚本 - 基于OmniParser的YOLO模型
依赖: ultralytics, rapidocr-onnxruntime, pillow, numpy
"""
import sys
from pathlib import Path
from ultralytics import YOLO
from PIL import Image, ImageDraw
import numpy as np
# Default weights file: <two directories above this file>/temp/weights/icon_detect/model.pt
DEFAULT_MODEL = str(Path(__file__).resolve().parent.parent / 'temp' / 'weights' / 'icon_detect' / 'model.pt')

# OCR via rapidocr is optional. ``ocr_engine`` is always bound (None when
# the package cannot be imported) so that code touching it without checking
# HAS_OCR does not fail with a NameError.
ocr_engine = None
try:
    from rapidocr_onnxruntime import RapidOCR
    ocr_engine = RapidOCR()
    HAS_OCR = True
except ImportError:
    HAS_OCR = False
    print("警告: rapidocr未安装跳过OCR功能")
def detect_ui_elements(image_path, model_path=None, conf_threshold=0.25):
    """Detect UI elements with a YOLO model and return their bounding boxes.

    :param image_path: image handed directly to the YOLO model call
    :param model_path: weights file; DEFAULT_MODEL is used when this is falsy
    :param conf_threshold: passed to the model as ``conf``
    :return: list of dicts, one per detected box:
        {'bbox': [x1, y1, x2, y2] as ints, 'confidence': float, 'class': int}
    """
    # The weights are loaded on every call; callers running many images may
    # want to keep that in mind.
    model = YOLO(model_path or DEFAULT_MODEL)
    results = model(image_path, conf=conf_threshold, verbose=False)

    detections = []
    for result in results:
        for box in result.boxes:
            x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
            detections.append({
                'bbox': [int(x1), int(y1), int(x2), int(y2)],
                'confidence': float(box.conf[0]),
                'class': int(box.cls[0]),
            })
    return detections
def ocr_text(image_path):
    """Recognise text in an image with the optional RapidOCR engine.

    :param image_path: image handed directly to the OCR engine
    :return: list of dicts {'text', 'bbox', 'confidence' (float)}; an empty
        list when OCR is unavailable (HAS_OCR is false) or nothing is found
    """
    if not HAS_OCR:
        return []
    result, _ = ocr_engine(image_path)
    if not result:
        return []
    # RapidOCR may deliver the confidence as a string. Convert it so that
    # numeric formatting (e.g. ':.3f') of the value works.
    return [
        {'text': text, 'bbox': bbox, 'confidence': float(conf)}
        for bbox, text, conf in result
    ]
def visualize(image_path, detections, ocr_results=None, output_path=None):
    """Draw detection boxes and OCR regions onto the image.

    :param image_path: image to open and annotate
    :param detections: dicts with 'bbox' ([x1, y1, x2, y2]) and 'confidence'
    :param ocr_results: optional dicts with 'bbox' (four corner points) and 'text'
    :param output_path: when truthy, the annotated image is saved there
    :return: the annotated PIL Image
    """
    canvas = Image.open(image_path)
    pen = ImageDraw.Draw(canvas)

    # UI elements: red rectangle with the confidence printed above it.
    for det in detections:
        left, top, right, bottom = det['bbox']
        pen.rectangle([left, top, right, bottom], outline='red', width=2)
        pen.text((left, top - 10), f"{det['confidence']:.2f}", fill='red')

    # OCR regions: blue quadrilateral labelled with the first 10 characters.
    for region in (ocr_results or []):
        quad = region['bbox']
        corners = [(quad[idx][0], quad[idx][1]) for idx in range(4)]
        pen.polygon(corners, outline='blue')
        first_x, first_y = corners[0]
        pen.text((first_x, first_y - 10), region['text'][:10], fill='blue')

    if output_path:
        canvas.save(output_path)
    return canvas
def main():
    """Command-line entry point.

    Usage: ``ui_detect.py <image> [model] [output]``. Runs YOLO detection,
    then OCR when available, saves the annotated image to ``output``
    (default "output.png") and writes the results as JSON next to it, using
    the same file name with a ``.json`` extension.
    """
    import json
    import os

    if len(sys.argv) < 2:
        # The model path is optional (DEFAULT_MODEL is the fallback), so it
        # is shown in brackets.
        print("用法: python ui_detect.py <图片路径> [模型路径] [输出路径]")
        print("示例: python ui_detect.py screenshot.png weights/icon_detect/model.pt output.png")
        sys.exit(1)

    image_path = sys.argv[1]
    model_path = sys.argv[2] if len(sys.argv) > 2 else DEFAULT_MODEL
    output_path = sys.argv[3] if len(sys.argv) > 3 else "output.png"
    print(f"检测图片: {image_path}")
    print(f"使用模型: {model_path}")

    # Step 1: UI element detection
    print("\n[1/2] YOLO检测UI元素...")
    detections = detect_ui_elements(image_path, model_path)
    print(f"检测到 {len(detections)} 个UI元素")
    for i, det in enumerate(detections, 1):
        print(f" {i}. bbox={det['bbox']}, conf={det['confidence']:.3f}")

    # Step 2: OCR (skipped when rapidocr is not installed)
    ocr_results = None
    if HAS_OCR:
        print("\n[2/2] OCR识别文本...")
        ocr_results = ocr_text(image_path)
        print(f"识别到 {len(ocr_results)} 个文本区域")
        for i, ocr in enumerate(ocr_results, 1):
            print(f" {i}. text='{ocr['text']}', conf={ocr['confidence']:.3f}")

    # Annotated image
    print(f"\n保存结果到: {output_path}")
    visualize(image_path, detections, ocr_results, output_path)

    # JSON dump of the results
    result = {
        'ui_elements': detections,
        'ocr_texts': ocr_results or []
    }
    # Swap the extension instead of str.replace('.png', ...): for an output
    # that is not a .png, the old code produced the image path itself and
    # the JSON overwrote the image that had just been saved.
    json_path = os.path.splitext(output_path)[0] + '.json'
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    print(f"JSON结果: {json_path}")


if __name__ == "__main__":
    main()

48
memory/vision_sop.md Normal file
View File

@@ -0,0 +1,48 @@
# Vision API SOP
## ⚠️ 前置规则(必须遵守)
1. **先枚举窗口**:调用 vision 前必须先用 `pygetwindow` 枚举窗口标题,确认目标窗口存在且已激活到前台。窗口不存在就不要截图。
2. **🚫 禁止全屏截图**:必须先 `win32gui.GetWindowRect` 获取目标窗口坐标,再 `ImageGrab.grab(bbox=...)` 截窗口区域。能截局部(如标题栏)就不截整窗口,能截窗口就绝不全屏。全屏截图在任何场景下都不允许。
3. **能不用 vision 就不用**：如果窗口标题/本地 OCR（`ocr_utils.py`）能获取所需信息，就不要调用 vision API，省 token 且更可靠。Vision 是最后手段。
## 快速用法
### 函数签名
```python
ask_vision(
image_input,
prompt: str | None = None,
timeout: int = 60,
max_pixels: int = 1_440_000,
) -> str
```
### 示例
```python
from vision_api import ask_vision
result = ask_vision("image.png", prompt="描述图片内容") # 路径或PIL Image均可
```
返回 `str`:成功为模型回复,失败为 `Error: ...`
## 核心参数
- `image_input`: 文件路径(str/Path) 或 PIL Image 对象
- `prompt`: 提示词(默认:详细描述这张图片的内容)
- `max_pixels`: 最大像素数，默认1440000，超则自动缩放
- `timeout`: 超时秒数，默认60
## 故障排除
| 问题 | 解决方案 |
|------|--------|
| 导入失败 | 可检查 `../../mykey.py` 文件是否存在(仅检查存在性,不读取内容) |
| 超时 | 提高 timeout 或降低 max_pixels |
| 格式错误 | 确保使用 PIL 支持的格式（PNG/JPG/GIF等） |
## 关键风险与坑点 (L3 Caveats)
- **无重试机制**: `vision_api.py` 内部未实现 API 错误重试（如 503、超时）。在自动化流程中使用时**必须在上层代码手动实现重试逻辑**（建议指数退避），否则偶发网络波动会导致任务直接崩溃中断。
- **API Config**: 当前使用 `claude_config141`(ncode.vkm2.com, 已验证)。备选可用: `native_claude_config2/84/5535`。失效时直接改 `vision_api.py` 中的 `cfg = mk.claude_configXXX`
---
更新: 2025-07-18 | 修复oai_config导入+返回值统一str
更新: 2026-02-18 | 默认后端改为Claude原生API | SOP精简(删废话/水段/合并示例)
更新: 2026-07 | 修复config(原claude_config8不存在)→改为claude_config141