refactor: optimize code extraction priority in tool handlers & update autofill SOP

This commit is contained in:
Liang Jiaqing
2026-03-27 19:43:46 +08:00
parent a1c289dbd1
commit 38e2a460e3
3 changed files with 29 additions and 32 deletions

View File

@@ -1,10 +1,11 @@
[ [
{"type": "function", "function": { {"type": "function", "function": {
"name": "code_run", "name": "code_run",
"description": "代码执行器。优先使用python仅在必要系统操作时使用 powershell。注意:不能同时调用多个,执行的代码放在回复正文中,以 ```python 或 ```powershell 代码块的形式。严禁在代码中硬编码大量数据,如有需要应通过文件读取", "description": "代码执行器。优先python系统操作用powershell。同时调用多个。为免转义问题,代码放正文 ```python/powershell 块中。禁硬编码大量数据",
"parameters": {"type": "object", "properties": { "parameters": {"type": "object", "properties": {
"type": {"type": "string", "enum": ["python", "powershell"], "description": "执行环境类型,默认为 python", "default": "python"}, "script": {"type": "string", "description": "[Optional] 要执行的代码。为免转义建议留空,改用正文代码块(与此参数互斥)"},
"timeout": {"type": "integer", "description": "执行超时时间(秒),默认 60", "default": 60}, "type": {"type": "string", "enum": ["python", "powershell"], "description": "代码类型", "default": "python"},
"timeout": {"type": "integer", "description": "执行超时时间(秒)", "default": 60},
"cwd": {"type": "string", "description": "工作目录,默认为当前工作目录"}}} "cwd": {"type": "string", "description": "工作目录,默认为当前工作目录"}}}
}}, }},
{"type": "function", "function": { {"type": "function", "function": {
@@ -42,11 +43,11 @@
}}, }},
{"type": "function", "function": { {"type": "function", "function": {
"name": "web_execute_js", "name": "web_execute_js",
"description": "万能网页操控工具。通过执行 JavaScript 脚本实现对浏览器的完全控制如点击、滚动、提取特定数据。鼓励在有把握情况下记忆中有selector/做法等)精准使用减少web_scan调用。执行结果可选择保存到本地文件进行后续分析", "description": "执行 JS 控制浏览器。建议精准使用减少 web_scan。为免转义问题,代码优先考虑放回复正文 ```javascript 块",
"parameters": {"type": "object", "properties": { "parameters": {"type": "object", "properties": {
"script": {"type": "string", "description": "要执行的代码或JS文件路径"}, "script": {"type": "string", "description": "[Optional] JS代码或路径。为免转义建议留空改用正文代码块与此参数互斥"},
"save_to_file": {"type": "string", "description": "结果存文件,适合返回值较长时。不支持await", "default": ""}, "save_to_file": {"type": "string", "description": "结果存文件,适合返回值较长时", "default": ""},
"no_monitor": {"type": "boolean", "description": "跳过页面变更监控省2-3秒。仅在纯读取信息时设置页面操作时不要设置", "default": false}}, "required": ["script"]} "no_monitor": {"type": "boolean", "description": "跳过页面变更监控省2-3秒。仅在纯读取信息时设置页面操作时不要设置", "default": false}}}
}}, }},
{"type": "function", "function": { {"type": "function", "function": {
"name": "update_working_checkpoint", "name": "update_working_checkpoint",

27
ga.py
View File

@@ -282,27 +282,26 @@ class GenericAgentHandler(BaseHandler):
ret.next_prompt += "\nPROTOCOL_VIOLATION: 上一轮遗漏了<summary>。 已根据物理动作自动补全。请务必在下次回复中记得<summary>协议。" ret.next_prompt += "\nPROTOCOL_VIOLATION: 上一轮遗漏了<summary>。 已根据物理动作自动补全。请务必在下次回复中记得<summary>协议。"
self.history_info.append('[Agent] ' + smart_format(summary, max_str_len=100)) self.history_info.append('[Agent] ' + smart_format(summary, max_str_len=100))
def _extract_code_block(self, response, code_type):
    """Return the last fenced ``code_type`` block found in the reply body.

    Args:
        response: Object whose ``content`` attribute holds the reply text.
            ``content`` may be ``None`` or empty (e.g. a reply that only
            carries tool calls); that is treated as "no block".
        code_type: Fence language tag to look for, e.g. ``"python"``,
            ``"powershell"`` or ``"javascript"``.

    Returns:
        The stripped text of the last matching block, or ``None`` when the
        reply contains no such block.
    """
    content = response.content
    if not content or not isinstance(content, str):
        # Nothing to scan; let the caller report the missing code instead of
        # crashing inside re.findall.
        return None
    # Escape the tag so regex metacharacters in it are matched literally, and
    # tolerate trailing blanks after the tag as well as CRLF line endings.
    tag = re.escape(str(code_type))
    pattern = rf"```{tag}[ \t]*\r?\n(.*?)\r?\n```"
    matches = re.findall(pattern, content, re.DOTALL)
    # The last block wins: callers treat it as the final version of the code.
    return matches[-1].strip() if matches else None
def do_code_run(self, args, response): def do_code_run(self, args, response):
'''执行代码片段,有长度限制,不允许代码中放大量数据,如有需要应当通过文件读取进行。 '''执行代码片段,有长度限制,不允许代码中放大量数据,如有需要应当通过文件读取进行。
''' '''
if response.tool_calls and sum(1 for tc in response.tool_calls[:args.get('_index', 0)] if tc.function.name == 'code_run') > 0: if response.tool_calls and sum(1 for tc in response.tool_calls[:args.get('_index', 0)] if tc.function.name == 'code_run') > 0:
return StepOutcome("[BLANK]", next_prompt="no multi code_run in one round!") return StepOutcome("[BLANK]", next_prompt="no multi code_run in one round!")
code_type = args.get("type", "python") code_type = args.get("type", "python")
# 从 response.content 中提取代码块, 匹配 ```python ... ``` 或 ```powershell ... ``` code = args.get("code") or args.get("script")
pattern = rf"```{code_type}\n(.*?)\n```" if not code:
matches = re.findall(pattern, response.content, re.DOTALL) code = self._extract_code_block(response, code_type)
warning = "" if not code: return StepOutcome(None, next_prompt=f"[Error] Code missing. Use ```{code_type} block or 'script' arg.")
if not matches:
code = args.get("code") or args.get("script")
if not code: return StepOutcome(None, next_prompt=f"【系统错误】:你调用了 code_run但未在先在回复正文中提供 ```{code_type} 代码块。请重新输出代码并附带工具调用。")
warning = "\n下次要记得先在回复正文中提供代码块,而不是放在参数中"
else: code = matches[-1].strip() # 提取最后一个代码块(通常是模型修正后的最终逻辑)
timeout = args.get("timeout", 60) timeout = args.get("timeout", 60)
raw_path = os.path.join(self.cwd, args.get("cwd", './')) raw_path = os.path.join(self.cwd, args.get("cwd", './'))
cwd = os.path.normpath(os.path.abspath(raw_path)) cwd = os.path.normpath(os.path.abspath(raw_path))
code_cwd = os.path.normpath(self.cwd) code_cwd = os.path.normpath(self.cwd)
result = yield from code_run(code, code_type, timeout, cwd, code_cwd=code_cwd, stop_signal=self.code_stop_signal) result = yield from code_run(code, code_type, timeout, cwd, code_cwd=code_cwd, stop_signal=self.code_stop_signal)
next_prompt = self._get_anchor_prompt() + warning next_prompt = self._get_anchor_prompt()
return StepOutcome(result, next_prompt=next_prompt) return StepOutcome(result, next_prompt=next_prompt)
def do_ask_user(self, args, response): def do_ask_user(self, args, response):
@@ -328,11 +327,9 @@ class GenericAgentHandler(BaseHandler):
return StepOutcome(result, next_prompt=next_prompt) return StepOutcome(result, next_prompt=next_prompt)
def do_web_execute_js(self, args, response): def do_web_execute_js(self, args, response):
'''web情况下的优先使用工具执行任何js达成对浏览器的*完全*控制。 '''web情况下的优先使用工具执行任何js达成对浏览器的*完全*控制。支持将结果保存到文件供后续读取分析。'''
支持将结果保存到文件供后续读取分析但保存功能仅限即时读取与await等异步操作不兼容。 script = args.get("script", "") or self._extract_code_block(response, "javascript")
''' if not script: return StepOutcome(None, next_prompt="[Error] Script missing. Use ```javascript block or 'script' arg.")
script = args.get("script", "")
if not script: return StepOutcome(None, next_prompt="[Error] Empty script param. Check your tool call arguments.")
abs_path = self._get_abs_path(script.strip()) abs_path = self._get_abs_path(script.strip())
if os.path.isfile(abs_path): if os.path.isfile(abs_path):
with open(abs_path, 'r', encoding='utf-8') as f: script = f.read() with open(abs_path, 'r', encoding='utf-8') as f: script = f.read()

View File

@@ -126,17 +126,16 @@ document.body.appendChild(el); // 响应写回el.textContent
- 渲染检查:`DOM.resolveNode` → `Runtime.callFunctionOn` 检查offsetHeight>0 - 渲染检查:`DOM.resolveNode` → `Runtime.callFunctionOn` 检查offsetHeight>0
- 完整pipeline: getDocument(pierce) → querySelector → getBoxModel → 四点平均坐标 → Input三事件点击 - 完整pipeline: getDocument(pierce) → querySelector → getBoxModel → 四点平均坐标 → Input三事件点击
## autofill获取 ## autofill获取与登录 (需 v0.4+ 脚本支持 await)
检测:web_scan输出input带`data-autofilled="true"`,value显示为受保护提示(非真实值,Chrome安全保护,需点击释放) 检测:web_scan输出input带`data-autofilled="true"`,value显示为受保护提示(非真实值,Chrome安全保护,需点击释放)
- ⭐首选CDP单次点击JS取任一autofill输入框坐标→CDP `Input.dispatchMouseEvent` mousePressed一次即可释放→JS读`.value` - ⭐**一键释放与登录**:利用 v0.4 脚本的顶层 `await`,在单次 `web_execute_js` 中连贯完成:
- ⚠点击一个autofill字段会释放页面上**所有**autofill字段的值无需逐个点击 1. JS获取输入框坐标。
- ⚠只需mousePressed不需要mouseReleased配对 2. CDP发送 `Input.dispatchMouseEvent` (mousePressed) 物理点击释放autofill。
- ⚠tabId当前注入页无需指定(默认sender.tab.id)跨tab才需显式tabId(整数) 3. `await new Promise(r => setTimeout(r, 500))` 等待释放。
- 示例(当前页)`{cmd:'cdp',method:'Input.dispatchMouseEvent',params:{type:'mousePressed',x:X,y:Y,button:'left',clickCount:1}}` 4. 派发 `input`/`change` 事件唤醒前端框架(解禁登录按钮)。
- 示例(跨tab):先`{cmd:'tabs'}`获取tabId(整数),再`{cmd:'cdp',tabId:N,method:'Input.dispatchMouseEvent',params:{...}}` 5. 触发登录点击。
- ⚠batch的`$N.path`引用会将整数tabId转为字符串导致类型错误跨tab时建议分两次命令而非batch - ⚠只需 `mousePressed`,无需 `mouseReleased`。点击一个字段即释放全页。
- 备选PostMessage物理点击(仅Windows/需前台)枚举Chrome窗口标题匹配→rect*dpr→WM_LBUTTONDOWN/UP到Chrome_RenderWidgetHostHWND子窗口 - ⚠已淘汰旧版跨 tab 查 tabId 或 Python 轮询的繁琐流程,直接在当前页异步完成。
- 坑多RenderWidgetHostHWND共存必须按父窗口标题匹配再取子窗口
## 验证码/页面视觉截图 ## 验证码/页面视觉截图
- ⭐首选:CDP截图,`Page.captureScreenshot`(format:'png')→返回base64,无需前台/后台tab也行,全页高清 - ⭐首选:CDP截图,`Page.captureScreenshot`(format:'png')→返回base64,无需前台/后台tab也行,全页高清