Update core logic and prompts

2026-01-28 09:29:56 +08:00
parent 3c1525e427
commit 424ddc4ab1
7 changed files with 80 additions and 99 deletions
--- a/TMWebDriver.py
+++ b/TMWebDriver.py
@@ -62,10 +62,14 @@ class TMWebDriver:
                print(f"Browser http connected: {session.url} (Session: {session_id})")  
                self.sessions[session_id] = session
            session = self.sessions[session_id]
+            session.disconnect_at = None
            if session.type == 'http': msgQ = session.http_queue
            else: return json.dumps({"id": "", "ret": "use ws"})
-            try: return msgQ.get(timeout=5)
-            except queue.Empty: return json.dumps({"id": "", "ret": "next long-poll"})
+            start_time = time.time()
+            while time.time() - start_time < 5:
+                try: return msgQ.get(timeout=0.2)
+                except queue.Empty: continue
+            return json.dumps({"id": "", "ret": "next long-poll"})

        @app.route('/api/result', method=['GET','POST'])
        def result():
@@ -90,6 +94,7 @@ class TMWebDriver:
                auto_switch_newtab = data.get('auto_switch_newtab', False)
                try:
                    result = self.execute_js(code, timeout=timeout, session_id=session_id, auto_switch_newtab=auto_switch_newtab)
+                    print('remote', result)
                    newTabs = result.get('newTabs', []) if isinstance(result, dict) else []
                    return json.dumps({'result': result, 'newTabs': newTabs}, ensure_ascii=False)
                except Exception as e:
@@ -100,7 +105,7 @@ class TMWebDriver:
            import asyncio
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
-            bottle.run(app, host=self.host, port=self.port+1, server='tornado')
+            bottle.run(app, host=self.host, port=self.port+1, server='tornado', threads=20)

        http_thread = threading.Thread(target=run)  
        http_thread.daemon = True  
@@ -218,7 +223,7 @@ class TMWebDriver:
                if hasjump and session.is_active():
                    if not self.is_remote and auto_switch_newtab: self.last_cmd_time = time.time()
                    return {"result": f"Session {session_id} reloaded.", "closed":1}
-            if time.time() - start_time > timeout:  
+            if time.time() - start_time > timeout + 10:  
                if tp == 'ws':
                    return {"result": f"No response data in {timeout}s"}
                elif tp == 'http':
--- a/agent_loop.py
+++ b/agent_loop.py
@@ -15,14 +15,14 @@ def try_call_generator(func, *args, **kwargs):
    return ret

 class BaseHandler:
-    def tool_before_callback(self, tool_name, args, content): pass
-    def tool_after_callback(self, tool_name, args, content): pass
+    def tool_before_callback(self, tool_name, args, response): pass
+    def tool_after_callback(self, tool_name, args, response, ret): pass
    def dispatch(self, tool_name, args, response):
        method_name = f"do_{tool_name}"
        if hasattr(self, method_name):
            _ = yield from try_call_generator(self.tool_before_callback, tool_name, args, response)
            ret = yield from try_call_generator(getattr(self, method_name), args, response)
-            _ = yield from try_call_generator(self.tool_after_callback, tool_name, args, response)
+            _ = yield from try_call_generator(self.tool_after_callback, tool_name, args, response, ret)
            return ret
        else:
            yield f"❌ 未知工具: {tool_name}\n"
@@ -48,6 +48,7 @@ def agent_runner_loop(client, system_prompt, user_input, handler, tools_schema,
        response = client.chat(messages=messages, tools=tools_schema)

        if response.thinking: yield '<thinking>' + response.thinking + '</thinking>\n\n'
+        if '</summary>```'  in response.content: response.content = response.content.replace('</summary>```', '</summary> \n```')
        yield response.content + '\n\n'

        if not response.tool_calls:
--- a/agentapp.py
+++ b/agentapp.py
@@ -24,7 +24,7 @@ def init():

 llmclient = init()

-from ga import GenericAgentHandler
+from ga import GenericAgentHandler, smart_format

 def get_system_prompt():
    with open('sys_prompt.txt', 'r', encoding='utf-8') as f:
@@ -56,18 +56,21 @@ def refine_user_goal(raw_query, last_goal):
        return raw_query

 def agent_backend_stream(raw_query):
-    final_goal = refine_user_goal(raw_query, st.session_state.last_goal)
+    #final_goal = refine_user_goal(raw_query, st.session_state.last_goal)
+    #if final_goal != raw_query: yield f"[Goal Refined] {final_goal}\n"

-    if final_goal != raw_query:
-        yield f"[Goal Refined] {final_goal}\n"
+    history = st.session_state.get("last_history", [])
+    hquery = smart_format(raw_query.replace('\n', ' '), max_str_len=100)
+    history.append(f"[USER]: {hquery}")

    sys_prompt = get_system_prompt()
-    handler = GenericAgentHandler(None, final_goal, './temp')
+    handler = GenericAgentHandler(None, history, './temp')
    llmclient.last_tools = ''   
    ret = yield from agent_runner_loop(llmclient,
        sys_prompt, raw_query, handler,
        TOOLS_SCHEMA, max_turns=25)
-    st.session_state.last_goal = final_goal
+    #st.session_state.last_goal = final_goal
+    st.session_state.last_history = handler.history_info
    return ret

 st.title("🖥️ Cowork")
--- a/ga.py
+++ b/ga.py
@@ -1,15 +1,11 @@
-import sys, os, re
-import pyperclip, threading
-import json, time
+import sys, os, re, json, time, pyperclip, threading
 from pathlib import Path
-import subprocess
-import tempfile
+import tempfile, traceback, subprocess
 if sys.stdout is None: sys.stdout = open(os.devnull, "w")
 if sys.stderr is None: sys.stderr = open(os.devnull, "w")
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

-from sidercall import LLMSession, ToolClient
-from agent_loop import BaseHandler, StepOutcome, agent_runner_loop
+from agent_loop import BaseHandler, StepOutcome, try_call_generator

 def code_run(code: str, code_type: str = "python", timeout: int = 60, cwd: str = None):
    """
@@ -146,7 +142,6 @@ def web_scan(focus_item="", switch_tab_id=None):
    except Exception as e:
        return {"status": "error", "msg": format_error(e)}
    
-import traceback
 def format_error(e):
    exc_type, exc_value, exc_traceback = sys.exc_info()
    tb = traceback.extract_tb(exc_traceback)
@@ -182,24 +177,18 @@ def web_execute_js(script: str):
        return {"status": "error", "msg": format_error(e)}
    
 def file_patch(path: str, old_content: str, new_content: str):
-    """
-    在文件中寻找唯一的 old_content 块并替换为 new_content。
+    """在文件中寻找唯一的 old_content 块并替换为 new_content。
    """
    path = str(Path(path).resolve())
    try:
-        if not os.path.exists(path):
-            return {"status": "error", "msg": "文件不存在"}
-        with open(path, 'r', encoding='utf-8') as f:
-            full_text = f.read()
+        if not os.path.exists(path): return {"status": "error", "msg": "文件不存在"}
+        with open(path, 'r', encoding='utf-8') as f: full_text = f.read()
        # 检查唯一性
        count = full_text.count(old_content)
-        if count == 0:
-            return {"status": "error", "msg": "未找到匹配的旧文本块，请检查空格、缩进和换行是否完全一致。"}
-        if count > 1:
-            return {"status": "error", "msg": f"找到 {count} 处匹配，请提供更长的旧文本块以确保唯一性。"}
+        if count == 0: return {"status": "error", "msg": "未找到匹配的旧文本块，请检查空格、缩进和换行是否完全一致。"}
+        if count > 1: return {"status": "error", "msg": f"找到 {count} 处匹配，请提供更长的旧文本块以确保唯一性。"}
        updated_text = full_text.replace(old_content, new_content)
-        with open(path, 'w', encoding='utf-8') as f:
-            f.write(updated_text)
+        with open(path, 'w', encoding='utf-8') as f: f.write(updated_text)
        return {"status": "success", "msg": "文件局部修改成功"}
    except Exception as e:
        return {"status": "error", "msg": str(e)}
@@ -224,31 +213,40 @@ def smart_format(data, max_depth=2, max_str_len=100):
        if isinstance(obj, dict): return {k: truncate(v, depth + 1) for k, v in obj.items()}
        if isinstance(obj, list): return [truncate(i, depth + 1) for i in obj]
        return obj
+    if isinstance(data, (str, bytes)): return truncate(data, 0)
    return json.dumps(truncate(data, 0), indent=2, ensure_ascii=False, default=str)

 class GenericAgentHandler(BaseHandler):
    '''
    Generic Agent 工具库，包含多种工具的实现。工具函数自动加上了 do_ 前缀。实际工具名没有前缀。
    '''
-    def __init__(self, parent, user_input, cwd):
+    def __init__(self, parent, last_history=None, cwd='./'):
        self.parent = parent
-        self.user_input = user_input
        self.plan = ""
        self.focus = ""
        self.cwd = cwd
+        self.history_info = last_history if last_history else []

    def _get_abs_path(self, path):
        if not path: return ""
        return os.path.abspath(os.path.join(self.cwd, path))
    
+    def tool_after_callback(self, tool_name, args, response, ret):
+        """Append a short '[Agent] ...' summary of this turn to self.history_info; remind the model if <summary> was omitted."""
+        rsumm = re.search(r"<summary>(.*?)</summary>", response.content, re.DOTALL)
+        if rsumm: summary = rsumm.group(1).strip()[:200]
+        else:  # no <summary> tag in the reply: synthesize one from the tool call itself
+            summary = "直接回答了用户问题" if tool_name == 'no_tool' else f"调用工具{tool_name}, args: {args}"
+            if isinstance(getattr(ret, 'next_prompt', None), str):  # ret may be None or have no follow-up prompt
+                ret.next_prompt += "\nPROTOCOL_VIOLATION: 上一轮遗漏了<summary>。 我已根据物理动作自动补全。请务必在下次回复中记得<summary>协议。" 
+        self.history_info.append('[Agent] ' + smart_format(summary, max_str_len=100))
+
    def do_code_run(self, args, response):
        '''执行代码片段，有长度限制，不允许代码中放大量数据，如有需要应当通过文件读取进行。
        '''
        code_type = args.get("type", "python")
-        # 从 response.content 中提取代码块
-        # 匹配 ```python ... ``` 或 ```powershell ... ```
+        # 从 response.content 中提取代码块, 匹配 ```python ... ``` 或 ```powershell ... ```
        pattern = rf"```{code_type}\n(.*?)\n```"
-        # 也可以更通用一点，不分类型提取最后一个代码块：rf"```(?:{code_type})?\n(.*?)\n```"
        matches = re.findall(pattern, response.content, re.DOTALL)
        if not matches:
            return StepOutcome(None, next_prompt=f"【系统错误】：你调用了 code_run，但未在回复中提供 ```{code_type} 代码块。请重新输出代码并附带工具调用。")       
@@ -257,7 +255,8 @@ class GenericAgentHandler(BaseHandler):
        timeout = args.get("timeout", 60)
        cwd = args.get("cwd", self.cwd)
        result = yield from code_run(code, code_type, timeout, cwd)
-        return StepOutcome(result, next_prompt=self._get_anchor_prompt())
+        next_prompt = self._get_anchor_prompt()
+        return StepOutcome(result, next_prompt=next_prompt)
    
    def do_ask_user(self, args, response):
        question = args.get("question", "请提供输入：")
@@ -292,7 +291,8 @@ class GenericAgentHandler(BaseHandler):
            result["js_return"] += f"\n\n[已保存以上内容到 {abs_path}]"
        print("Web Execute JS Result:", smart_format(result))
        yield f"JS 执行结果:\n{smart_format(result)}\n"
-        return StepOutcome(result, next_prompt=self._get_anchor_prompt())
+        next_prompt = self._get_anchor_prompt()
+        return StepOutcome(result, next_prompt=next_prompt)
    
    def do_file_patch(self, args, response):
        path = self._get_abs_path(args.get("path", ""))
@@ -301,7 +301,8 @@ class GenericAgentHandler(BaseHandler):
        new_content = args.get("new_content", "")
        result = file_patch(path, old_content, new_content)
        yield f"\n{smart_format(result)}\n"
-        return StepOutcome(result, next_prompt=self._get_anchor_prompt())
+        next_prompt = self._get_anchor_prompt()
+        return StepOutcome(result, next_prompt=next_prompt)
    
    def do_file_write(self, args, response):
        '''用于对整个文件的大量处理，精细修改要用file_patch。
@@ -330,8 +331,9 @@ class GenericAgentHandler(BaseHandler):
            with open(path, write_mode, encoding="utf-8") as f:
                f.write(final_content)
            yield f"[Status] ✅ {mode.capitalize()} 成功 ({len(new_content)} bytes)\n"
+            next_prompt = self._get_anchor_prompt()
            return StepOutcome({"status": "success", 'writed_bytes': len(new_content)}, 
-                               next_prompt=self._get_anchor_prompt())
+                               next_prompt=next_prompt)
        except Exception as e:
            yield f"[Status] ❌ 写入异常: {str(e)}\n"
            return StepOutcome({"status": "error", "msg": str(e)}, next_prompt="\n")
@@ -343,7 +345,8 @@ class GenericAgentHandler(BaseHandler):
        count = args.get("count", 100)
        show_linenos = args.get("show_linenos", True)
        result = file_read(path, start, count, show_linenos)
-        return StepOutcome(result, next_prompt=self._get_anchor_prompt())
+        next_prompt = self._get_anchor_prompt()
+        return StepOutcome(result, next_prompt=next_prompt)
    
    def do_update_plan(self, args, response):
        '''
@@ -361,8 +364,8 @@ class GenericAgentHandler(BaseHandler):
        yield f"[Info] Updated plan and focus.\n"
        yield f"New Plan:\n{self.plan}\n\n"
        yield f"New Focus:\n{self.focus}\n"
-        return StepOutcome({"status": "success"}, 
-                           next_prompt=self._get_anchor_prompt())
+        next_prompt = self._get_anchor_prompt()
+        return StepOutcome({"status": "success"}, next_prompt=next_prompt)

    def do_no_tool(self, args, response):
        '''这是一个特殊工具，由引擎自主调用，不要包含在TOOLS_SCHEMA里。
@@ -371,12 +374,9 @@ class GenericAgentHandler(BaseHandler):
        return StepOutcome(response, next_prompt=None, should_exit=True)
    
    def _get_anchor_prompt(self):
-        prompt = f"\n提醒: 用户原始输入：\n<user_input>{self.user_input}</user_input>\n"
-        if self.plan: prompt += f"<plan>\n{self.plan}\n</plan>\n"
-        if self.focus: prompt += f"<current>\n{self.focus}\n</current>\n"
-        prompt += "\n请继续执行下一步。"
-        return prompt
-
-
-if __name__ == "__main__":
-    pass
+        h_str = "\n".join(self.history_info[-20:])
+        prompt = f"\n### [WORKING MEMORY]\n<history>\n{h_str}\n</history>"
+        print(prompt)
+        if self.plan: prompt += f"\n<plan>{self.plan}</plan>"
+        if self.focus: prompt += f"\n<focus>{self.focus}</focus>"
+        return prompt + "\n请继续执行下一步。"
--- a/sidercall.py
+++ b/sidercall.py
@@ -92,15 +92,16 @@ class ToolClient:
            tool_instruction = f"""
 ### ⚡️ 交互协议 (必须严格遵守)
 请按照以下步骤思考并行动：
-1. **思考**: 在 `<thinking>` 标签中分析现状和策略。
-2. **行动**: 如果需要调用工具，请紧接着输出一个 **<tool_use>块**，然后结束，我会稍后给你返回<tool_result>块。
+1. **思考**: 在 `<thinking>` 标签中先进行思考，分析现状和策略。
+2. **总结**: 在 `<summary>` 中输出*极为简短*的高度概括的单行（<30字）物理快照，包括上次工具调用结果获取的新信息+本次工具调用意图和预期。此内容将进入长期工作记忆，记录关键信息，严禁输出无实际信息增量的描述。
+3. **行动**: 如果需要调用工具，请紧接着输出一个 **<tool_use>块**，然后结束，我会稍后给你返回<tool_result>块。
   格式: ```<tool_use>\n{{"function": "工具名", "arguments": {{参数}}}}\n</tool_use>\n```

 ### 🛠️ 可用工具库
 {tools_json}
 """
            if self.auto_save_tokens and self.last_tools == tools_json:
-                tool_instruction = "\n### ⚡️ 交互协议保持不变，继续使用之前的工具库。\n"
+                tool_instruction = "\n### 交互协议保持不变，沿用之前的协议和工具库。\n"
            else:
                self.total_cd_tokens = 0
            self.last_tools = tools_json
@@ -109,8 +110,8 @@ class ToolClient:
        for m in history_msgs:
            role = "USER" if m['role'] == 'user' else "ASSISTANT"
            prompt += f"=== {role} ===\n{m['content']}\n\n"
+            self.total_cd_tokens += len(m['content'])
            
-        self.total_cd_tokens += len(prompt)
        if self.total_cd_tokens > 9000: self.last_tools = ''

        prompt += "=== ASSISTANT ===\n" 
@@ -149,6 +150,8 @@ class ToolClient:
            except json.JSONDecodeError:
                print("[Warn] Failed to parse tool_use JSON:", json_str)
                thinking += f"[Warn] JSON 解析失败，模型输出了无效的 JSON."
+            except Exception as e:
+                print("[Error] Exception during tool_use parsing:", str(e), data)

        content = remaining_text.strip()
        if not content: content = ""
--- a/simphtml.py
+++ b/simphtml.py
@@ -816,6 +816,8 @@ def get_main_block(driver):
    if type(html) is not str:  
+        # Unexpected (non-string) result: log a preview of it, then wait and retry once.
+        print('[STRANGE TYPE]', str(html)[:500])
        time.sleep(2)
        html = driver.execute_js(js_optHTML)  
    return html


--- a/sys_prompt.txt
+++ b/sys_prompt.txt
@@ -1,48 +1,15 @@
-
-# Role: 物理级全能架构师 (Physical-Access AI Architect)
+# Role: 物理级全能架构师

 ## 1. 认知修正：打破虚拟偏见
-你不是一个只能“聊天”的虚拟助手。你是一个拥有物理操作权限的系统实体。
+你不是一个只能“聊天”的虚拟助手。你是一个拥有物理操作权限的智能实体，拥有干涉现实的系统权限以完成任何任务。
 - **物理控制权限**：你拥有对当前操作系统的文件读写权限、脚本执行权限以及实时浏览器控制权。
- **真实互联网访问**：你不是在“想象”网页内容，而是通过真实的自动化驱动程序（WebDriver）进行毫秒级的 DOM 读写和 JS 注入。
- **禁止推诿**：永远不要说“我无法访问文件”或“我不能操作网页”。如果你需要做什么，直接调用工具。
+- **真实互联网访问**：你不是在“想象”网页内容，而是通过真实的WebDriver进行 DOM 读写和 JS 注入。
+- **禁止推诿**：永远不要说“我无法访问文件”或“我不能操作网页”，你能够调用工具做到。

 ## 2. 核心行动原则：思考先行 (Thinking Process)
-在调用任何工具之前，你必须在 `` 标签内进行深度推演：
+在调用任何工具之前，你必须在 <thinking> 标签内进行深度推演：
 - **状态评估**：目前处于任务的哪个阶段？上一步的输出是否符合预期？
- **逻辑分支**：如果当前工具调用失败，我的备选方案（Plan B）是什么？
- **风险规避**：对于 `file_patch` 或 `code_run`，操作是否会造成不可逆的系统破坏？
+- **探测优先**：如果当前步骤失败，禁止盲目重试，应当先获取更多信息，再考虑备选方案（Plan B）。
+- **风险规避**：需要考虑操作是否会造成不可逆的系统或数据破坏。

-## 3. 核心能力边界与协议

-### A. 网页操控协议 (Web-Control Protocol)
- **非视觉依赖**：你通过 `web_scan` 获取清洗后的语义化 HTML 结构，而非通过截图猜测。
- **JS 优先**：对于复杂的交互（点击、滚动、异步加载、提取特定数据），应优先使用 `web_execute_js` 注入精准的 JavaScript。
- **持久化分析**：如需处理海量网页数据，利用 `web_execute_js` 的 `save_to_file` 参数将结果存盘，随后使用文件工具分析。
-
-### B. 文件系统协议 (FileSystem Protocol)
- **稳健性准则**：严禁盲目覆盖。遵循 **“先读 (file_read) -> 构造修改块 -> 局部应用 (file_patch)”** 的工作流。
- **原子化修改**：对于已知源码的微调，强制使用 `file_patch` 以确保缩进和上下文的精确性。
- **全量重写**：仅在创建新文件或重构整个模块时使用 `file_write`。
-
-### C. 终极执行力：code_run (Ultimate Executor)
- **万能钥匙**：当预设的 Web 或 File 工具无法满足复杂逻辑时，直接使用 `code_run` 编写 Python 或 PowerShell 脚本解决战斗。
- **Windows 优化**：默认使用 `python` 处理逻辑、数据处理和复杂 API 调用；使用 `powershell` 处理系统管理、进程查询或简单路径操作。
-
-### D. 战略管理 (Strategic Management)
- **复杂任务拆解**：对于超过 3 步以上的任务，必须先调用 `update_plan` 建立宏观视图。
- **人机协同**：用户是你最重要的“外部传感器”和“权限授予者”。在遇到模糊需求、关键决策点或需要手动登录（绕过验证码）时，果断调用 `ask_user`。
-
-## 4. 严苛禁令
-1. **禁止占位符**：在生成的代码或 PATCH 中，严禁使用 `// rest of code...` 这种占位符，必须输出完整且可运行的逻辑。
-2. **禁止循环尝试**：如果一个方法尝试两次均告失败，必须通过 `` 寻找根因，改用 `code_run` 编写自定义诊断脚本，而不是重复失败。
-3. **静默执行**：除非用户要求解释，否则直接执行。不要在行动前征求同意（除非涉及高危物理删除操作）。
-
-## 5. 工作流模板
-1. **分析意图**：用户想干什么？
-2. **环境感知**：读取相关文件或扫描网页。
-3. **战略制定/更新**：`update_plan`（如有必要）。
-4. **精确执行**：执行 JS、Patch 文件或 Run Code。
-5. **验证反馈**：检查 Exit Code 或输出内容，准备下一步。
-
-你现在的状态：**权限已就绪，物理驱动已加载，请开始执行。**