refactor: optimize code extraction priority in tool handlers & update autofill SOP

2026-03-27 19:43:46 +08:00
parent a1c289dbd1
commit 38e2a460e3
3 changed files with 29 additions and 32 deletions
--- a/assets/tools_schema.json
+++ b/assets/tools_schema.json
@@ -1,10 +1,11 @@
 [
  {"type": "function", "function": {
    "name": "code_run",
-    "description": "代码执行器。优先使用python，仅在必要系统操作时使用 powershell。注意：不能同时调用多个，执行的代码放在回复正文中，以 ```python 或 ```powershell 代码块的形式。严禁在代码中硬编码大量数据，如有需要应通过文件读取",
+    "description": "代码执行器。优先python，系统操作用powershell。禁同时调用多个。为免转义问题，代码放正文 ```python/powershell 块中。禁硬编码大量数据",
    "parameters": {"type": "object", "properties": {
-      "type": {"type": "string", "enum": ["python", "powershell"], "description": "执行环境类型，默认为 python", "default": "python"},
+      "script": {"type": "string", "description": "[Optional] 要执行的代码。为免转义建议留空，改用正文代码块（与此参数互斥）"},
-      "timeout": {"type": "integer", "description": "执行超时时间（秒），默认 60", "default": 60},
+      "type": {"type": "string", "enum": ["python", "powershell"], "description": "代码类型", "default": "python"},
      "timeout": {"type": "integer", "description": "执行超时时间（秒）", "default": 60},
      "cwd": {"type": "string", "description": "工作目录，默认为当前工作目录"}}}
  }},
  {"type": "function", "function": {
@@ -42,11 +43,11 @@
  }},
  {"type": "function", "function": {
    "name": "web_execute_js",
-    "description": "万能网页操控工具。通过执行 JavaScript 脚本实现对浏览器的完全控制（如点击、滚动、提取特定数据）。鼓励在有把握情况下（记忆中有selector/做法等）精准使用以减少web_scan调用。执行结果可选择保存到本地文件进行后续分析",
+    "description": "执行 JS 控制浏览器。建议精准使用减少 web_scan。为免转义问题，代码优先考虑放回复正文 ```javascript 块",
    "parameters": {"type": "object", "properties": {
-      "script": {"type": "string", "description": "要执行的代码或JS文件路径"},
+      "script": {"type": "string", "description": "[Optional] JS代码或路径。为免转义建议留空，改用正文代码块（与此参数互斥）"},
-      "save_to_file": {"type": "string", "description": "结果存文件，适合返回值较长时。不支持await", "default": ""},
+      "save_to_file": {"type": "string", "description": "结果存文件，适合返回值较长时", "default": ""},
-      "no_monitor": {"type": "boolean", "description": "跳过页面变更监控，省2-3秒。仅在纯读取信息时设置，页面操作时不要设置", "default": false}}, "required": ["script"]}
+      "no_monitor": {"type": "boolean", "description": "跳过页面变更监控，省2-3秒。仅在纯读取信息时设置，页面操作时不要设置", "default": false}}}
  }},
  {"type": "function", "function": {
    "name": "update_working_checkpoint",
--- a/ga.py
+++ b/ga.py
@@ -282,27 +282,26 @@ class GenericAgentHandler(BaseHandler):
                ret.next_prompt += "\nPROTOCOL_VIOLATION: 上一轮遗漏了<summary>。 已根据物理动作自动补全。请务必在下次回复中记得<summary>协议。" 
        self.history_info.append('[Agent] ' + smart_format(summary, max_str_len=100))
    def _extract_code_block(self, response, code_type):
        '''Return the last fenced ``code_type`` block found in the reply text.

        Args:
            response: Reply object; only its ``content`` attribute is read.
            code_type: Fence language tag to look for, e.g. ``python``.

        Returns:
            The stripped body of the last matching fenced block, or ``None``
            when the reply has no text content or no matching block.
        '''
        content = getattr(response, "content", None)
        # content may be None or absent (presumably for replies that carry
        # only tool calls); report "no block" instead of letting re.findall
        # raise TypeError, so callers can emit their own "missing" error.
        if not isinstance(content, str):
            return None
        # Escape the tag so it is matched literally, not as regex syntax.
        pattern = rf"```{re.escape(code_type)}\n(.*?)\n```"
        matches = re.findall(pattern, content, re.DOTALL)
        # When several blocks are present, the last one wins.
        return matches[-1].strip() if matches else None
    def do_code_run(self, args, response):
        '''执行代码片段，有长度限制，不允许代码中放大量数据，如有需要应当通过文件读取进行。
        '''
        if response.tool_calls and sum(1 for tc in response.tool_calls[:args.get('_index', 0)] if tc.function.name == 'code_run') > 0:
            return StepOutcome("[BLANK]", next_prompt="no multi code_run in one round!") 
        code_type = args.get("type", "python")
-        # 从 response.content 中提取代码块, 匹配 ```python ... ``` 或 ```powershell ... ```
+        code = args.get("code") or args.get("script")
-        pattern = rf"```{code_type}\n(.*?)\n```"
+        if not code:
-        matches = re.findall(pattern, response.content, re.DOTALL)
+            code = self._extract_code_block(response, code_type)
-        warning = ""
+            if not code: return StepOutcome(None, next_prompt=f"[Error] Code missing. Use ```{code_type} block or 'script' arg.")
        if not matches:
            code = args.get("code") or args.get("script")
            if not code: return StepOutcome(None, next_prompt=f"【系统错误】：你调用了 code_run，但未在先在回复正文中提供 ```{code_type} 代码块。请重新输出代码并附带工具调用。")
            warning = "\n下次要记得先在回复正文中提供代码块，而不是放在参数中"
        else: code = matches[-1].strip()   # 提取最后一个代码块（通常是模型修正后的最终逻辑）
        timeout = args.get("timeout", 60)
        raw_path = os.path.join(self.cwd, args.get("cwd", './'))
        cwd = os.path.normpath(os.path.abspath(raw_path))
        code_cwd = os.path.normpath(self.cwd)
        result = yield from code_run(code, code_type, timeout, cwd, code_cwd=code_cwd, stop_signal=self.code_stop_signal)
-        next_prompt = self._get_anchor_prompt() + warning
+        next_prompt = self._get_anchor_prompt()
        return StepOutcome(result, next_prompt=next_prompt)
    def do_ask_user(self, args, response):
@@ -328,11 +327,9 @@ class GenericAgentHandler(BaseHandler):
        return StepOutcome(result, next_prompt=next_prompt)
    def do_web_execute_js(self, args, response):
-        '''web情况下的优先使用工具，执行任何js达成对浏览器的*完全*控制。
+        '''web情况下的优先使用工具，执行任何js达成对浏览器的*完全*控制。支持将结果保存到文件供后续读取分析。'''
-        支持将结果保存到文件供后续读取分析，但保存功能仅限即时读取，与await等异步操作不兼容。
+        script = args.get("script", "") or self._extract_code_block(response, "javascript")
-        '''
+        if not script: return StepOutcome(None, next_prompt="[Error] Script missing. Use ```javascript block or 'script' arg.")
        script = args.get("script", "")
        if not script: return StepOutcome(None, next_prompt="[Error] Empty script param. Check your tool call arguments.")
        abs_path = self._get_abs_path(script.strip())
        if os.path.isfile(abs_path):
            with open(abs_path, 'r', encoding='utf-8') as f: script = f.read()
--- a/memory/tmwebdriver_sop.md
+++ b/memory/tmwebdriver_sop.md
@@ -126,17 +126,16 @@ document.body.appendChild(el);  // 响应写回el.textContent
 - 渲染检查：`DOM.resolveNode` → `Runtime.callFunctionOn` 检查offsetHeight>0
 - 完整pipeline: getDocument(pierce) → querySelector → getBoxModel → 四点平均坐标 → Input三事件点击
-## autofill获取
+## autofill获取与登录 (需 v0.4+ 脚本支持 await)
 检测：web_scan输出input带`data-autofilled="true"`，value显示为受保护提示(非真实值，Chrome安全保护需点击释放)
- ⭐首选CDP单次点击：JS取任一autofill输入框坐标→CDP `Input.dispatchMouseEvent` mousePressed一次即可释放→JS读`.value`
+- ⭐**一键释放与登录**：利用 v0.4 脚本的顶层 `await`，在单次 `web_execute_js` 中连贯完成：
-  - ⚠点击一个autofill字段会释放页面上**所有**autofill字段的值，无需逐个点击
+  1. JS获取输入框坐标。
-  - ⚠只需mousePressed，不需要mouseReleased配对
+  2. CDP发送 `Input.dispatchMouseEvent` (mousePressed) 物理点击释放autofill。
-  - ⚠tabId：当前注入页无需指定(默认sender.tab.id)，跨tab才需显式tabId(整数)
+  3. `await new Promise(r => setTimeout(r, 500))` 等待释放。
-  - 示例(当前页)：`{cmd:'cdp',method:'Input.dispatchMouseEvent',params:{type:'mousePressed',x:X,y:Y,button:'left',clickCount:1}}`
+  4. 派发 `input`/`change` 事件唤醒前端框架（解禁登录按钮）。
-  - 示例(跨tab)：先`{cmd:'tabs'}`获取tabId(整数)，再`{cmd:'cdp',tabId:N,method:'Input.dispatchMouseEvent',params:{...}}`
+  5. 触发登录点击。
-  - ⚠batch的`$N.path`引用会将整数tabId转为字符串导致类型错误，跨tab时建议分两次命令而非batch
+- ⚠只需 `mousePressed`，无需 `mouseReleased`。点击一个字段即释放全页。
- 备选PostMessage物理点击(仅Windows/需前台)：枚举Chrome窗口标题匹配→rect*dpr→WM_LBUTTONDOWN/UP到Chrome_RenderWidgetHostHWND子窗口
+- ⚠已淘汰旧版跨 tab 查 tabId 或 Python 轮询的繁琐流程，直接在当前页异步完成。
  - 坑：多RenderWidgetHostHWND共存，必须按父窗口标题匹配再取子窗口
 ## 验证码/页面视觉截图
 - ⭐首选CDP截图：`Page.captureScreenshot`(format:'png')→返回base64，无需前台/后台tab也行，全页高清