feat: stream LLM responses and improve agent UI

2026-02-04 18:56:50 +08:00
parent 541d44567d
commit a9209daaf7
6 changed files with 140 additions and 64 deletions
--- a/agent_loop.py
+++ b/agent_loop.py
@@ -45,14 +45,9 @@ def agent_runner_loop(client, system_prompt, user_input, handler, tools_schema,
    ]
    for turn in range(max_turns):
        yield f"**LLM Running (Turn {turn+1}) ...**\n\n"
-        response = client.chat(messages=messages, tools=tools_schema)
+        response_gen = client.chat(messages=messages, tools=tools_schema)
-
+        response = yield from response_gen
-        if response.thinking: yield '<thinking>' + response.thinking + '</thinking>\n\n'
+        yield '\n\n'
        showcontent = response.content
        if '</summary>' in showcontent: showcontent = showcontent.replace('</summary>', '</summary>\n\n')
        if '</file_content>' in showcontent:
            showcontent = re.sub(r'<file_content>\s*(.*?)\s*</file_content>', r'\n````\n<file_content>\n\1\n</file_content>\n````', showcontent, flags=re.DOTALL)
        yield showcontent + '\n\n'
        if not response.tool_calls:
            tool_name, args = 'no_tool', {}
--- a/agentmain.py
+++ b/agentmain.py
@@ -31,7 +31,7 @@ class GeneraticAgent:
        from sidercall import sider_cookie, oai_apikey, oai_apibase
        llm_sessions = []
        if sider_cookie: llm_sessions += [SiderLLMSession(default_model=x) for x in \
-                                    ["gemini-3.0-flash", "claude-haiku-4.5", "gpt-5-mini"]]
+                                    ["gemini-3.0-flash", "claude-haiku-4.5", "kimi-k2"]]
        if oai_apikey: llm_sessions += [LLMSession(api_key=oai_apikey, api_base=oai_apibase)]
        if len(llm_sessions) > 0: 
            llmclient = ToolClient([x.ask for x in llm_sessions], auto_save_tokens=True)
@@ -48,6 +48,10 @@ class GeneraticAgent:
        self.stop_sig = False
        self.current_source = 'none'
    def next_llm(self):
        """Select the next LLM backend index, wrapping around at the end.

        Advances ``self.llm_no`` by one modulo the number of entries in
        ``self.llmclient.raw_apis`` and resets ``self.llmclient.last_tools``
        to an empty string. This method only moves the index; it does not
        assign ``self.llmclient.raw_api`` itself.
        """
        backend_count = len(self.llmclient.raw_apis)
        following = self.llm_no + 1
        self.llm_no = following % backend_count
        # NOTE(review): clearing last_tools presumably makes the client
        # re-send its tool/system prompt on the next request -- confirm
        # against ToolClient.
        self.llmclient.last_tools = ''
    def abort(self):
        if not self.is_running: return
        self.stop_sig = True
@@ -73,17 +77,20 @@ class GeneraticAgent:
            self.llmclient.raw_api = self.llmclient.raw_apis[self.llm_no]
            gen = agent_runner_loop(self.llmclient, sys_prompt, 
                        raw_query, handler, TOOLS_SCHEMA, max_turns=25)
            try:
                full_response = ""
                for chunk in gen:
                    if self.stop_sig: break
                    full_response += chunk
                    self.display_queue.put({'next': full_response, 'source': source})
                if '</summary>' in full_response: full_response = full_response.replace('</summary>', '</summary>\n\n')
                if '</file_content>' in full_response: full_response = re.sub(r'<file_content>\s*(.*?)\s*</file_content>', r'\n````\n<file_content>\n\1\n</file_content>\n````', full_response, flags=re.DOTALL)
                self.display_queue.put({'done': full_response, 'source': source})
                self.history = handler.history_info
            except Exception as e:
                print(f"Backend Error: {format_error(e)}")
-                self.display_queue.put({'done': '异常停止', 'source': source})
+                self.display_queue.put({'done': full_response + f'\n```\n{format_error(e)}\n```', 'source': source})
            finally:
                self.is_running = False
                self.stop_sig = False
--- a/assets/tools_schema.json
+++ b/assets/tools_schema.json
@@ -62,7 +62,7 @@
  }},
  {"type": "function", "function": {
    "name": "conclude_and_reflect",
-    "description": "当模型认为当前任务执行完美，且有具有长期价值的环境事实或用户偏好需要提炼并存入全局记忆时，调用此工具。注意：此工具无参数，调用即代表触发记忆提炼流程。",
+    "description": "当模型认为当前任务（非处理记忆）执行完美，且有具有长期价值的环境事实或用户偏好需要提炼并存入全局记忆时，调用此工具。注意：此工具调用即代表触发记忆提炼流程，如果已经在记忆提炼流程无需调用。",
    "parameters": {"type": "object", "properties": {}}}
  }
 ]
--- a/ga.py
+++ b/ga.py
@@ -267,8 +267,8 @@ class GenericAgentHandler(BaseHandler):
        warning = ""
        if not matches:
            code = args.get("code")
-            if not code: return StepOutcome(None, next_prompt=f"【系统错误】：你调用了 code_run，但未在回复中提供 ```{code_type} 代码块。请重新输出代码并附带工具调用。")
+            if not code: return StepOutcome(None, next_prompt=f"【系统错误】：你调用了 code_run，但未在先在回复正文中提供 ```{code_type} 代码块。请重新输出代码并附带工具调用。")
-            warning = "\n下次要记得在回复中提供代码块，而不是放在参数中"
+            warning = "\n下次要记得先在回复正文中提供代码块，而不是放在参数中"
        else: code = matches[-1].strip()   # 提取最后一个代码块（通常是模型修正后的最终逻辑）
        timeout = args.get("timeout", 60)
        raw_path = os.path.join(self.cwd, args.get("cwd", './'))
@@ -395,12 +395,42 @@ class GenericAgentHandler(BaseHandler):
    def do_no_tool(self, args, response):
        '''这是一个特殊工具，由引擎自主调用，不要包含在TOOLS_SCHEMA里。
        当模型在一轮中未显式调用任何工具时，由引擎自动触发。
        二次确认仅在回复几乎只包含<thinking>/<summary>和一段大代码块时触发。
        '''
-        if not response or not getattr(response, 'content', '').strip():
+        content = getattr(response, 'content', '') or ""
        # 1. 空回复保护：要求模型重新生成内容或调用工具
        if not response or not content.strip():
            yield "[Warn] LLM returned an empty response. Retrying...\n"
            next_prompt = "[System] 检测到空回复，请重新生成内容或调用工具。"
            return StepOutcome({}, next_prompt=next_prompt, should_exit=False)
-        yield "[Info] No tool called. Final response to user.\n"
+        # 2. 检测“包含较大代码块但未调用工具”的情况
        # 这里通过三引号代码块 + 最少字符数的方式粗略判断“大段代码”
        code_block_pattern = r"```[a-zA-Z0-9_]*\n[\s\S]{100,}?```"
        m = re.search(code_block_pattern, content)
        if m:
            # 仅当 content 由 <thinking> / <summary> 和该代码块构成时才触发二次确认
            residual = content
            # 去掉代码块本身
            residual = residual.replace(m.group(0), "")
            # 去掉<thinking>和<summary>块（大小写不敏感）
            residual = re.sub(r"<thinking>[\s\S]*?</thinking>", "", residual, flags=re.IGNORECASE)
            residual = re.sub(r"<summary>[\s\S]*?</summary>", "", residual, flags=re.IGNORECASE)
            # 如果去除上述结构后的非空白字符很少，说明没有额外自然语言说明
            clean_residual = re.sub(r"\s+", "", residual)
            if len(clean_residual) <= 50:
                yield "[Info] Detected large code block without tool call and no extra natural language. Requesting clarification.\n"
                next_prompt = (
                    "[System] 检测到你在上一轮回复中主要内容是较大代码块（仅配有<thinking>/<summary>），且本轮未调用任何工具。\n"
                    "如果这些代码需要执行、写入文件或进一步分析，请重新组织回复并显式调用相应工具"
                    "（例如：code_run、file_write、file_patch 等）；\n"
                    "如果只是向用户展示或讲解代码片段，请在回复中补充自然语言说明，"
                    "并明确是否还需要额外的实际操作。"
                )
                return StepOutcome({}, next_prompt=next_prompt, should_exit=False)
        # 3. 正常情况：直接将回复返回给用户并结束循环
        yield "[Info] Final response to user.\n"
        return StepOutcome(response, next_prompt=None, should_exit=True)
    def do_conclude_and_reflect(self, args, response):
@@ -421,14 +451,16 @@ class GenericAgentHandler(BaseHandler):
        print(prompt)
        if self.plan: prompt += f"\n<plan>{self.plan}</plan>"
        if self.focus: prompt += f"\n<focus>{self.focus}</focus>"
-        return prompt + "\n请继续执行下一步。"
+        return prompt
 def get_global_memory():
    prompt = "\n"
    try:
        with open('memory/global_mem_insight.txt', 'r', encoding='utf-8') as f: insight = f.read()
-        prompt += f"\n\n[Global Memory Insight]\n"
+        prompt += f"\n\n[Memory Insight (../memory/global_mem_insight.txt)]\n"
-        prompt += 'IMPORTANT PATHS: ../memory/global_mem.txt (Facts), ../memory/global_mem_insight.txt (Logic), ../ (Your Code Root), ../temp (./, Your default cwd) \n'
+        prompt += 'IMPORTANT PATHS: ../memory/global_mem.txt (Facts), ../ (Your Code Root)\n'
        prompt += f'cwd = {os.path.abspath("./temp")}\n'
        prompt += f'But prefer use relative paths (./ = cwd) to locate.\n'
        prompt += 'MEM_RULE: Insight is the index of Facts. Sync Insight whenever Facts change. For details, read Facts.\n'
        prompt += "EXT: ../memory/ may contain other task-specific memories.\n"
        prompt += insight + "\n"
--- a/sidercall.py
+++ b/sidercall.py
@@ -1,4 +1,4 @@
-import os, json, re, time, requests
+import os, json, re, time, requests, sys
 try: from mykey import sider_cookie
 except ImportError: sider_cookie = ""
@@ -10,12 +10,14 @@ class SiderLLMSession:
        from sider_ai_api import Session
        self._core = Session(cookie=sider_cookie, proxies={'https':'127.0.0.1:2082'})   
        self.default_model = default_model
-    def ask(self, prompt, model=None):
+    def ask(self, prompt, model=None, stream=False):
        if model is None: model = self.default_model
        if len(prompt) > 29000: 
            print(f"[Warn] Prompt too long ({len(prompt)} chars), truncating.")
            prompt = prompt[-29000:]
-        return ''.join(self._core.chat(prompt, model))
+        gen = self._core.chat(prompt, model)
        if stream: return gen
        return ''.join(list(gen))
 class LLMSession:
    def __init__(self, api_key=oai_apikey, api_base=oai_apibase, model=oai_model, context_win=16000):
@@ -28,16 +30,28 @@ class LLMSession:
    def raw_ask(self, messages, model=None, temperature=0.5):
        if model is None: model = self.model
-        headers = {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"}
+        headers = {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json", "Accept": "text/event-stream"}
        payload = {"model": model, "messages": messages, "temperature": temperature, "stream": True}
        try:
-            response = requests.post(
+            with requests.post(f"{self.api_base}/chat/completions",
-                f"{self.api_base}/chat/completions", headers=headers, timeout=60,
+                            headers=headers, json=payload, stream=True, timeout=(5, 60)) as r:
-                json={"model": model, "messages": messages, "temperature": temperature} )
+                r.raise_for_status()
-            res_json = response.json()
+                buffer = ''
-            content = res_json["choices"][0]["message"]["content"]
+                for line in r.iter_lines():
-            return content
+                    line = line.decode("utf-8")
                    if not line or not line.startswith("data:"): continue
                    data = line[5:].lstrip()
                    if data == "[DONE]": break
                    obj = json.loads(data)
                    ch = (obj.get("choices") or [{}])[0]
                    if ch.get("finish_reason") is not None: break
                    delta = (ch.get("delta") or {}).get("content")
                    if not delta: continue
                    yield delta
                    buffer += delta
                    if '</tool_use>' in buffer[-30:]: break
        except Exception as e:
-            return f"Error: {str(e)}"
+            yield f"Error: {str(e)}"
    def make_messages(self, raw_list, omit_images=True):
        messages = []
@@ -60,22 +74,28 @@ class LLMSession:
        p = "Summarize prev summary and prev conversations into compact memory (facts/decisions/constraints/open questions). Do NOT restate long schemas. The new summary should less than 1000 tokens.\n"
        messages = self.make_messages(old, omit_images=True)
        messages += [{"role":"user", "content":p}]
-        summary = self.raw_ask(messages, model, temperature=0.1)
+        summary = ''.join(list(self.raw_ask(messages, model, temperature=0.1)))
        if not summary.startswith("Error:"): 
            self.raw_msgs.insert(0, {"role":"system", "prompt":"Prev summary:\n"+summary, "image":None})
        else: self.raw_msgs = old + self.raw_msgs   # 不做了，下次再做
-    def ask(self, prompt, model=None, image_base64=None):
+    def ask(self, prompt, model=None, image_base64=None, stream=False):
        if model is None: model = self.model
        self.raw_msgs.append({"role": "user", "prompt": prompt, "image": image_base64})
        messages = self.make_messages(self.raw_msgs[:-1], omit_images=True)
        messages += self.make_messages([self.raw_msgs[-1]], omit_images=False)
        total_len = sum(2000 if isinstance(m["content"], list) else len(str(m["content"]))//4 for m in messages)   # estimate token count
-        content = self.raw_ask(messages, model)
+        gen = self.raw_ask(messages, model)
-        if not content.startswith("Error:"):
+        def _ask_gen():
-            self.raw_msgs.append({"role": "assistant", "prompt": content, "image": None})
+            content = ''
-        if total_len > self.context_win: self.summary_history()
+            for chunk in gen:
-        return content
+                content += chunk; yield chunk
            if not content.startswith("Error:"):
                self.raw_msgs.append({"role": "assistant", "prompt": content, "image": None})
            if total_len > 5000: print(f"[Debug] Whole context length {total_len}.")
            if total_len > self.context_win: self.summary_history()
        if stream: return _ask_gen()
        return ''.join(list(_ask_gen())) 
 class MockFunction:
@@ -109,7 +129,10 @@ class ToolClient:
    def chat(self, messages, tools=None):
        full_prompt = self._build_protocol_prompt(messages, tools)      
        print("Full prompt length:", len(full_prompt))
-        raw_text = self.raw_api(full_prompt)
+        gen = self.raw_api(full_prompt, stream=True)
        raw_text = ''
        for chunk in gen:
            raw_text += chunk; yield chunk
        with open('model_responses.txt', 'a', encoding='utf-8', errors="replace") as f:
            f.write(f"=== Prompt ===\n{full_prompt}\n=== Response ===\n{raw_text}\n\n")
        return self._parse_mixed_response(raw_text)
@@ -127,7 +150,7 @@ class ToolClient:
 请按照以下步骤思考并行动：
 1. **思考**: 在 `<thinking>` 标签中先进行思考，分析现状和策略。
 2. **总结**: 在 `<summary>` 中输出*极为简短*的高度概括的单行（<30字）物理快照，包括上次工具调用结果获取的新信息+本次工具调用意图和预期。此内容将进入长期工作记忆，记录关键信息，严禁输出无实际信息增量的描述。
-3. **行动**: 如果需要调用工具，请紧接着输出一个 **<tool_use>块**，然后结束，我会稍后给你返回<tool_result>块。
+3. **行动**: 如果需要调用工具，请在回复正文之后输出一个 **<tool_use>块**，然后结束，我会稍后给你返回<tool_result>块。
   格式: ```<tool_use>\n{{"function": "工具名", "arguments": {{参数}}}}\n</tool_use>\n```
 ### 可用工具库
@@ -164,7 +187,7 @@ class ToolClient:
        tool_calls = None
        tool_pattern = r"<tool_use>(.*?)</tool_use>"
-        tool_match = re.search(tool_pattern, text, re.DOTALL)
+        tool_match = re.search(tool_pattern, remaining_text, re.DOTALL)
        json_str = ""
        if tool_match:
@@ -173,6 +196,8 @@ class ToolClient:
        elif '<tool_use>' in remaining_text:
            weaktoolstr = remaining_text.split('<tool_use>')[-1].strip()
            json_str = weaktoolstr if weaktoolstr.endswith('}') else ''
            if json_str == '' and '```' in weaktoolstr and weaktoolstr.split('```')[0].strip().endswith('}'):
                json_str = weaktoolstr.split('```')[0].strip()
            remaining_text = remaining_text.replace('<tool_use>'+weaktoolstr, "")
        if json_str:
@@ -184,7 +209,7 @@ class ToolClient:
                if func_name: tool_calls = [MockToolCall(func_name, args)]
            except json.JSONDecodeError:
                print("[Warn] Failed to parse tool_use JSON:", json_str)
-                thinking += f"[Warn] JSON 解析失败，模型输出了无效的 JSON."
+                remaining_text += f"[Warning] JSON 解析失败，模型输出了无效的 JSON."
            except Exception as e:
                print("[Error] Exception during tool_use parsing:", str(e), data)
@@ -198,20 +223,32 @@ def tryparse(json_str):
        return json.loads(json_str[:-1])
 if __name__ == "__main__":
-    llmclient = ToolClient(LLMSession().ask)
+    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
-    response = llmclient.chat(
+    try: from mykey import sider_cookie
    except ImportError: sider_cookie = ""
    try: from mykey import oai_apikey, oai_apibase, oai_model
    except ImportError: oai_apikey = oai_apibase = oai_model = ""
    llmclient = ToolClient(LLMSession(api_key=oai_apikey, api_base=oai_apibase, model=oai_model).ask)
    print(llmclient.raw_api("Hello, world!", stream=False))
    #llmclient = ToolClient(SiderLLMSession().ask)
    def get_final(gen):
        """Exhaust *gen*, printing every yielded item, and return its final value.

        Each item produced by the generator is echoed with a ``mid:`` prefix.
        When the generator finishes, the value carried by ``StopIteration``
        (i.e. whatever the generator ``return``-ed) is handed back.
        """
        while True:
            try:
                item = next(gen)
            except StopIteration as stop:
                # A generator's `return x` surfaces as StopIteration.value.
                return stop.value
            print('mid:', item)
    response = get_final(llmclient.chat(
        messages=[{"role": "user", "content": "我的IP是多少"}], 
        tools=[{"name": "get_ip", "parameters": {}}]
-    )
+    ))
    # 4. 获取结果
    print(f"思考: {response.thinking}") 
    # -> 我需要查一下 IP。
    if response.tool_calls:
        cmd = response.tool_calls[0]
        print(f"调用: {cmd.function.name} 参数: {cmd.function.arguments}")
-    response = llmclient.chat(
+    response = get_final(llmclient.chat(
        messages=[{"role": "user", "content": "<tool_result>10.176.45.12</tool_result>"}] 
-    )
+    ))
    print(response.content)
--- a/stapp.py
+++ b/stapp.py
@@ -35,27 +35,32 @@ def render_llm_switcher():
    current_idx = agent.llm_no
    st.caption(f"LLM Core: {current_idx}")
    if st.button("切换备用链路"):
-        agent.llm_no = (current_idx + 1) % len(agent.llmclient.raw_apis)
+        agent.next_llm()
        st.rerun(scope="fragment")
    if st.button("强行停止任务"):
        agent.abort()
        st.toast("已发送停止信号")
    if st.button("重新注入System Prompt"):
        agent.llmclient.last_tools = ''
        st.toast("下次将重新注入System Prompt")
 with st.sidebar: render_llm_switcher()
@st.fragment(run_every="1s")
 def global_queue_listener():
-    if agent.current_source != 'auto': return
+    if agent.current_source == 'auto':
-    while not agent.display_queue.empty():
+        while not agent.display_queue.empty():
-        item = agent.display_queue.get()
+            item = agent.display_queue.get()
-        if 'next' in item:
+            if item.get('source') == 'auto':
-            st.session_state.idle_buf = item['next']
+                if 'next' in item: st.session_state.idle_buf = item['next']
-        if 'done' in item:
+                if 'done' in item:
-            st.session_state.messages.append({"role": "assistant", "content": f"{item['done']}"})
+                    st.session_state.messages.append({"role": "assistant", "content": f"🤖 {item['done']}"})
-            st.session_state.idle_buf = ""
+                    st.session_state.idle_buf = ""; st.rerun()
-            st.rerun()
+        if st.session_state.get("idle_buf"):
-    if st.session_state.get("idle_buf"):
+            with st.chat_message("assistant"):
-        with st.chat_message("assistant"):
+                st.write(st.session_state.idle_buf + "▌")
-            st.write(st.session_state.idle_buf + "▌")
+    else:
        st.caption("🟢 Agent Listener Active", help=f"Last sync: {int(time.time())}")
        st.session_state.idle_buf = "" 
 global_queue_listener()
@@ -65,10 +70,10 @@ def agent_backend_stream(prompt):
        while True:
            item = agent.display_queue.get()
            if 'next' in item: yield item['next'] 
-            if 'done' in item: break
+            if 'done' in item: 
                yield item['done']; break
    finally:
        agent.abort()
        print('User aborted the operation.')
 if prompt := st.chat_input("请输入指令"):
    st.session_state.messages.append({"role": "user", "content": prompt})