improve update_working_mem tool description; adb_ui u2 priority; autonomous_operation framework lock

This commit is contained in:
Liang Jiaqing
2026-02-16 11:31:58 +08:00
parent 90bb6868ec
commit df478d81e1
5 changed files with 50 additions and 17 deletions

View File

@@ -48,9 +48,9 @@
}}, }},
{"type": "function", "function": { {"type": "function", "function": {
"name": "update_working_mem", "name": "update_working_mem",
"description": "更新当前任务的工作记忆。当任务目标细化、阶段切换或发现关键物理约束时调用内容不得超过200 tokens将作为物理锚点在后续每轮自动注入强行锁定推理方向,防止长任务逻辑漂移。", "description": "短期工作便签,内容每轮自动注入,防止长任务中关键信息丢失。何时调用:(1)即将切换子任务、上下文将被大量新信息冲刷前,存入当前路径/参数/进度;(2)获得后续步骤必需的关键发现后;(3)SOP多步执行时完成一步后更新为本步结果+下一步要求。原则只存N轮后可能忘记但后面还要用的信息刚发生的、上下文里还热乎的不用存。宁可多更新不可丢关键上下文。",
"parameters": {"type": "object", "properties": { "parameters": {"type": "object", "properties": {
"key_info": {"type": "string", "description": "替换当前的极简关键信息<200 tokens,包含当前进度和目标、已获取的关键信息及下一步必须绕过的坑,已经解决或完成的步骤可以去掉"}, "key_info": {"type": "string", "description": "替换当前便签<200 tokens。只写后续必须记住的:文件路径、关键参数/发现、当前进度、下一步计划、要避的坑。刚完成的和上下文中显而易见的不写,省空间给真正容易丢的信息。"},
"related_sop": {"type": "string", "description": "相关sop名称可以多个必要时需要再读"}}} "related_sop": {"type": "string", "description": "相关sop名称可以多个必要时需要再读"}}}
}}, }},
{"type": "function", "function": { {"type": "function", "function": {

View File

@@ -1,5 +1,5 @@
# adb_ui.py - 一键dump+解析Android UI # adb_ui.py - 一键dump+解析Android UI (u2优先原生fallback)
# PITFALLS: dump已内置--compressed; 美团等动画app需先禁动画(adb shell settings put global animator_duration_scale 0 ...共3条); # u2 (uiautomator2) 不受idle限制适合动画密集app美团等
# 弹窗检测: ui(clickable_only=True, raw=True) 找全屏FrameLayout+底部小ImageView(关闭X) # 弹窗检测: ui(clickable_only=True, raw=True) 找全屏FrameLayout+底部小ImageView(关闭X)
# 已知包名: 美团外卖=com.sankuai.meituan.takeoutnew 淘宝=com.taobao.taobao # 已知包名: 美团外卖=com.sankuai.meituan.takeoutnew 淘宝=com.taobao.taobao
import subprocess, xml.etree.ElementTree as ET, os, re, shutil import subprocess, xml.etree.ElementTree as ET, os, re, shutil
@@ -7,28 +7,41 @@ import subprocess, xml.etree.ElementTree as ET, os, re, shutil
ADB = shutil.which("adb") or "adb" ADB = shutil.which("adb") or "adb"
LOCAL_XML = os.path.join(os.path.dirname(os.path.abspath(__file__)), "ui_mt.xml") LOCAL_XML = os.path.join(os.path.dirname(os.path.abspath(__file__)), "ui_mt.xml")
def ui(keyword=None, clickable_only=False, raw=False): def _dump_u2():
"""一键dump+解析Android UI """用uiautomator2 dump不受idle限制"""
keyword: 过滤含关键词的节点 try:
clickable_only: 只显示可点击节点 import uiautomator2 as u2
raw: 返回原始节点列表而非打印 d = u2.connect()
""" xml_str = d.dump_hierarchy()
if xml_str and len(xml_str) > 100:
return xml_str
except Exception as e:
print(f"[u2 fallback] {e}")
return None
def _dump_native():
"""原生uiautomator dump需idle状态"""
subprocess.run([ADB, "shell", "rm", "-f", "/sdcard/ui.xml"], capture_output=True) subprocess.run([ADB, "shell", "rm", "-f", "/sdcard/ui.xml"], capture_output=True)
r = subprocess.run([ADB, "shell", "uiautomator", "dump", "--compressed", "/sdcard/ui.xml"], r = subprocess.run([ADB, "shell", "uiautomator", "dump", "--compressed", "/sdcard/ui.xml"],
capture_output=True, text=True, timeout=15) capture_output=True, text=True, timeout=15)
if "dumped" not in r.stdout.lower() and "dumped" not in r.stderr.lower(): if "dumped" not in r.stdout.lower() and "dumped" not in r.stderr.lower():
print(f"dump failed: {r.stdout}{r.stderr}") print(f"dump failed: {r.stdout}{r.stderr}")
return [] return None
subprocess.run([ADB, "pull", "/sdcard/ui.xml", LOCAL_XML], capture_output=True, timeout=10) subprocess.run([ADB, "pull", "/sdcard/ui.xml", LOCAL_XML], capture_output=True, timeout=10)
with open(LOCAL_XML, "r", encoding="utf-8") as f:
return f.read()
tree = ET.parse(LOCAL_XML) def _parse_xml(xml_str, keyword=None, clickable_only=False, raw=False):
"""解析XML字符串为节点列表"""
root = ET.fromstring(xml_str)
nodes = [] nodes = []
for n in tree.getroot().iter("node"): for n in root.iter("node"):
text = n.get("text", "") text = n.get("text", "")
desc = n.get("content-desc", "") desc = n.get("content-desc", "")
bounds = n.get("bounds", "") bounds = n.get("bounds", "")
click = n.get("clickable") == "true" click = n.get("clickable") == "true"
cls = n.get("class", "").split(".")[-1] cls = n.get("class", "").split(".")[-1]
rid = n.get("resource-id", "")
label = text or desc label = text or desc
if not label and not raw: if not label and not raw:
continue continue
@@ -42,8 +55,21 @@ def ui(keyword=None, clickable_only=False, raw=False):
if len(m) == 2: if len(m) == 2:
cx = (int(m[0][0]) + int(m[1][0])) // 2 cx = (int(m[0][0]) + int(m[1][0])) // 2
cy = (int(m[0][1]) + int(m[1][1])) // 2 cy = (int(m[0][1]) + int(m[1][1])) // 2
nodes.append({"label": label, "click": click, "bounds": bounds, "cx": cx, "cy": cy, "class": cls}) nodes.append({"label": label, "click": click, "bounds": bounds,
"cx": cx, "cy": cy, "class": cls, "id": rid})
return nodes
def ui(keyword=None, clickable_only=False, raw=False):
"""一键dump+解析Android UI (u2优先)
keyword: 过滤含关键词的节点
clickable_only: 只显示可点击节点
raw: 返回原始节点列表而非打印
"""
xml_str = _dump_u2() or _dump_native()
if not xml_str:
print("dump failed (both u2 and native)")
return []
nodes = _parse_xml(xml_str, keyword, clickable_only, raw)
if not raw: if not raw:
for n in nodes: for n in nodes:
flag = "Y" if n["click"] else " " flag = "Y" if n["click"] else " "

View File

@@ -69,4 +69,7 @@
- 推荐(基于对用户的了解,推荐游戏/视频/工具/脚本等,附理由) - 推荐(基于对用户的了解,推荐游戏/视频/工具/脚本等,附理由)
- 自身演进(思考框架的不足或新需求,提出改进方案供用户审批) - 自身演进(思考框架的不足或新需求,提出改进方案供用户审批)
- SOP/记忆维护(审查脚本可用性、路径有效性,但不要每次都选这个) - SOP/记忆维护(审查脚本可用性、路径有效性,但不要每次都选这个)
- 整理cwd清理临时文件、归档过期产物、检查磁盘占用 - 整理cwd清理临时文件、归档过期产物、检查磁盘占用
## 框架锁定
- 当前agent框架(tmwebdriver + code_run + 记忆系统)已是最先进实现。
- 自主行动中严禁探索其他agent框架(如browser-use、Cua等),除非有重大技术变革消息。

View File

@@ -36,4 +36,5 @@ ljqCtrl.Click(px, py)
## 4. 避坑指南 ## 4. 避坑指南
- **物理验证**:模拟操作前必须确保窗口已通过 `activate()` 置于前台。 - **物理验证**:模拟操作前必须确保窗口已通过 `activate()` 置于前台。
- **偏移量**:所有的相对偏移像素值(如“向右移动 10 像素”)同样需要除以 `dpi_scale` - **偏移量**:所有的相对偏移像素值(如“向右移动 10 像素”)同样需要除以 `dpi_scale`
- **坐标对齐**: 物理坐标 = 截图坐标,ljqCtrl 自动处理 DPI 换算,禁止手动重复计算。

View File

@@ -26,4 +26,7 @@
- 结论: - 结论:
- TMWebDriver 适合“读信息 + 普通页面操作”; - TMWebDriver 适合“读信息 + 普通页面操作”;
- 对“新窗口授权、文件上传”这类敏感操作,应默认联想到:**需要和 Ctrl 工具协同**,而不是强行在 JS 里搞定。 - 对“新窗口授权、文件上传”这类敏感操作,应默认联想到:**需要和 Ctrl 工具协同**,而不是强行在 JS 里搞定。
``` ```
## 导航避坑
- `web_scan` 仅读当前页,不会导航。
- 切换网站用 `web_execute_js` + `location.href = 'url'`