From a9209daaf71969ab27a6fee96d9681e9c391977d Mon Sep 17 00:00:00 2001
From: Liang Jiaqing <l.j.q.light@gmail.com>
Date: Wed, 4 Feb 2026 18:56:50 +0800
Subject: [PATCH] feat: stream LLM responses and improve agent UI

---
 agent_loop.py            | 11 ++---
 agentmain.py             | 11 ++++-
 assets/tools_schema.json |  2 +-
 ga.py                    | 46 ++++++++++++++++---
 sidercall.py             | 99 +++++++++++++++++++++++++++-------------
 stapp.py                 | 35 ++++++++------
 6 files changed, 140 insertions(+), 64 deletions(-)
diff --git a/agent_loop.py b/agent_loop.py
index 5d02ff0..bc5ba77 100644
--- a/agent_loop.py
+++ b/agent_loop.py
@@ -45,14 +45,9 @@ def agent_runner_loop(client, system_prompt, user_input, handler, tools_schema,
     ]
     for turn in range(max_turns):
         yield f"**LLM Running (Turn {turn+1}) ...**\n\n"
-        response = client.chat(messages=messages, tools=tools_schema)
-
-        if response.thinking: yield '<thinking>' + response.thinking + '</thinking>\n\n'
-        showcontent = response.content
-        if '</summary>' in showcontent: showcontent = showcontent.replace('</summary>', '</summary>\n\n')
-        if '</file_content>' in showcontent:
-            showcontent = re.sub(r'<file_content>\s*(.*?)\s*</file_content>', r'\n````\n<file_content>\n\1\n</file_content>\n````', showcontent, flags=re.DOTALL)
-        yield showcontent + '\n\n'
+        response_gen = client.chat(messages=messages, tools=tools_schema)
+        response = yield from response_gen
+        yield '\n\n'
 
         if not response.tool_calls:
             tool_name, args = 'no_tool', {}
diff --git a/agentmain.py b/agentmain.py
index 29edfb0..303daf8 100644
--- a/agentmain.py
+++ b/agentmain.py
@@ -31,7 +31,7 @@ class GeneraticAgent:
         from sidercall import sider_cookie, oai_apikey, oai_apibase
         llm_sessions = []
         if sider_cookie: llm_sessions += [SiderLLMSession(default_model=x) for x in \
-                                    ["gemini-3.0-flash", "claude-haiku-4.5", "gpt-5-mini"]]
+                                    ["gemini-3.0-flash", "claude-haiku-4.5", "kimi-k2"]]
         if oai_apikey: llm_sessions += [LLMSession(api_key=oai_apikey, api_base=oai_apibase)]
         if len(llm_sessions) > 0: 
             llmclient = ToolClient([x.ask for x in llm_sessions], auto_save_tokens=True)
@@ -48,6 +48,10 @@ class GeneraticAgent:
         self.stop_sig = False
         self.current_source = 'none'
 
+    def next_llm(self):
+        self.llm_no = (self.llm_no + 1) % len(self.llmclient.raw_apis)
+        self.llmclient.last_tools = ''
+
     def abort(self):
         if not self.is_running: return
         self.stop_sig = True
@@ -73,17 +77,20 @@ class GeneraticAgent:
             self.llmclient.raw_api = self.llmclient.raw_apis[self.llm_no]
             gen = agent_runner_loop(self.llmclient, sys_prompt, 
                         raw_query, handler, TOOLS_SCHEMA, max_turns=25)
+                        
             try:
                 full_response = ""
                 for chunk in gen:
                     if self.stop_sig: break
                     full_response += chunk
                     self.display_queue.put({'next': full_response, 'source': source})
+                if '</summary>' in full_response: full_response = full_response.replace('</summary>', '</summary>\n\n')
+                if '</file_content>' in full_response: full_response = re.sub(r'<file_content>\s*(.*?)\s*</file_content>', r'\n````\n<file_content>\n\1\n</file_content>\n````', full_response, flags=re.DOTALL)
                 self.display_queue.put({'done': full_response, 'source': source})
                 self.history = handler.history_info
             except Exception as e:
                 print(f"Backend Error: {format_error(e)}")
-                self.display_queue.put({'done': '异常停止', 'source': source})
+                self.display_queue.put({'done': full_response + f'\n```\n{format_error(e)}\n```', 'source': source})
             finally:
                 self.is_running = False
                 self.stop_sig = False
diff --git a/assets/tools_schema.json b/assets/tools_schema.json
index 971ee40..2f183a9 100644
--- a/assets/tools_schema.json
+++ b/assets/tools_schema.json
@@ -62,7 +62,7 @@
   }},
   {"type": "function", "function": {
     "name": "conclude_and_reflect",
-    "description": "当模型认为当前任务执行完美，且有具有长期价值的环境事实或用户偏好需要提炼并存入全局记忆时，调用此工具。注意：此工具无参数，调用即代表触发记忆提炼流程。",
+    "description": "当模型认为当前任务（非处理记忆）执行完美，且有具有长期价值的环境事实或用户偏好需要提炼并存入全局记忆时，调用此工具。注意：此工具调用即代表触发记忆提炼流程，如果已经在记忆提炼流程无需调用。",
     "parameters": {"type": "object", "properties": {}}}
   }
 ]
\ No newline at end of file
diff --git a/ga.py b/ga.py
index 97d15d0..539b6db 100644
--- a/ga.py
+++ b/ga.py
@@ -267,8 +267,8 @@ class GenericAgentHandler(BaseHandler):
         warning = ""
         if not matches:
             code = args.get("code")
-            if not code: return StepOutcome(None, next_prompt=f"【系统错误】：你调用了 code_run，但未在回复中提供 ```{code_type} 代码块。请重新输出代码并附带工具调用。")
-            warning = "\n下次要记得在回复中提供代码块，而不是放在参数中"
+            if not code: return StepOutcome(None, next_prompt=f"【系统错误】：你调用了 code_run，但未在先在回复正文中提供 ```{code_type} 代码块。请重新输出代码并附带工具调用。")
+            warning = "\n下次要记得先在回复正文中提供代码块，而不是放在参数中"
         else: code = matches[-1].strip()   # 提取最后一个代码块（通常是模型修正后的最终逻辑）
         timeout = args.get("timeout", 60)
         raw_path = os.path.join(self.cwd, args.get("cwd", './'))
@@ -395,12 +395,42 @@ class GenericAgentHandler(BaseHandler):
 
     def do_no_tool(self, args, response):
         '''这是一个特殊工具，由引擎自主调用，不要包含在TOOLS_SCHEMA里。
+        当模型在一轮中未显式调用任何工具时，由引擎自动触发。
+        二次确认仅在回复几乎只包含<thinking>/<summary>和一段大代码块时触发。
         '''
-        if not response or not getattr(response, 'content', '').strip():
+        content = getattr(response, 'content', '') or ""
+
+        # 1. 空回复保护：要求模型重新生成内容或调用工具
+        if not response or not content.strip():
             yield "[Warn] LLM returned an empty response. Retrying...\n"
             next_prompt = "[System] 检测到空回复，请重新生成内容或调用工具。"
             return StepOutcome({}, next_prompt=next_prompt, should_exit=False)
-        yield "[Info] No tool called. Final response to user.\n"
+        # 2. 检测“包含较大代码块但未调用工具”的情况
+        # 这里通过三引号代码块 + 最少字符数的方式粗略判断“大段代码”
+        code_block_pattern = r"```[a-zA-Z0-9_]*\n[\s\S]{100,}?```"
+        m = re.search(code_block_pattern, content)
+        if m:
+            # 仅当 content 由 <thinking> / <summary> 和该代码块构成时才触发二次确认
+            residual = content
+            # 去掉代码块本身
+            residual = residual.replace(m.group(0), "")
+            # 去掉<thinking>和<summary>块（大小写不敏感）
+            residual = re.sub(r"<thinking>[\s\S]*?</thinking>", "", residual, flags=re.IGNORECASE)
+            residual = re.sub(r"<summary>[\s\S]*?</summary>", "", residual, flags=re.IGNORECASE)
+            # 如果去除上述结构后的非空白字符很少，说明没有额外自然语言说明
+            clean_residual = re.sub(r"\s+", "", residual)
+            if len(clean_residual) <= 50:
+                yield "[Info] Detected large code block without tool call and no extra natural language. Requesting clarification.\n"
+                next_prompt = (
+                    "[System] 检测到你在上一轮回复中主要内容是较大代码块（仅配有<thinking>/<summary>），且本轮未调用任何工具。\n"
+                    "如果这些代码需要执行、写入文件或进一步分析，请重新组织回复并显式调用相应工具"
+                    "（例如：code_run、file_write、file_patch 等）；\n"
+                    "如果只是向用户展示或讲解代码片段，请在回复中补充自然语言说明，"
+                    "并明确是否还需要额外的实际操作。"
+                )
+                return StepOutcome({}, next_prompt=next_prompt, should_exit=False)
+        # 3. 正常情况：直接将回复返回给用户并结束循环
+        yield "[Info] Final response to user.\n"
         return StepOutcome(response, next_prompt=None, should_exit=True)
     
     def do_conclude_and_reflect(self, args, response):
@@ -421,14 +451,16 @@ class GenericAgentHandler(BaseHandler):
         print(prompt)
         if self.plan: prompt += f"\n<plan>{self.plan}</plan>"
         if self.focus: prompt += f"\n<focus>{self.focus}</focus>"
-        return prompt + "\n请继续执行下一步。"
+        return prompt
 
 def get_global_memory():
     prompt = "\n"
     try:
         with open('memory/global_mem_insight.txt', 'r', encoding='utf-8') as f: insight = f.read()
-        prompt += f"\n\n[Global Memory Insight]\n"
-        prompt += 'IMPORTANT PATHS: ../memory/global_mem.txt (Facts), ../memory/global_mem_insight.txt (Logic), ../ (Your Code Root), ../temp (./, Your default cwd) \n'
+        prompt += f"\n\n[Memory Insight (../memory/global_mem_insight.txt)]\n"
+        prompt += 'IMPORTANT PATHS: ../memory/global_mem.txt (Facts), ../ (Your Code Root)\n'
+        prompt += f'cwd = {os.path.abspath("./temp")}\n'
+        prompt += f'But prefer use relative paths (./ = cwd) to locate.\n'
         prompt += 'MEM_RULE: Insight is the index of Facts. Sync Insight whenever Facts change. For details, read Facts.\n'
         prompt += "EXT: ../memory/ may contain other task-specific memories.\n"
         prompt += insight + "\n"
diff --git a/sidercall.py b/sidercall.py
index 017780a..3f88c7b 100644
--- a/sidercall.py
+++ b/sidercall.py
@@ -1,4 +1,4 @@
-import os, json, re, time, requests
+import os, json, re, time, requests, sys
 
 try: from mykey import sider_cookie
 except ImportError: sider_cookie = ""
@@ -10,12 +10,14 @@ class SiderLLMSession:
         from sider_ai_api import Session
         self._core = Session(cookie=sider_cookie, proxies={'https':'127.0.0.1:2082'})   
         self.default_model = default_model
-    def ask(self, prompt, model=None):
+    def ask(self, prompt, model=None, stream=False):
         if model is None: model = self.default_model
         if len(prompt) > 29000: 
             print(f"[Warn] Prompt too long ({len(prompt)} chars), truncating.")
             prompt = prompt[-29000:]
-        return ''.join(self._core.chat(prompt, model))
+        gen = self._core.chat(prompt, model)
+        if stream: return gen
+        return ''.join(list(gen))
   
 class LLMSession:
     def __init__(self, api_key=oai_apikey, api_base=oai_apibase, model=oai_model, context_win=16000):
@@ -28,17 +30,29 @@ class LLMSession:
 
     def raw_ask(self, messages, model=None, temperature=0.5):
         if model is None: model = self.model
-        headers = {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"}
+        headers = {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json", "Accept": "text/event-stream"}
+        payload = {"model": model, "messages": messages, "temperature": temperature, "stream": True}
         try:
-            response = requests.post(
-                f"{self.api_base}/chat/completions", headers=headers, timeout=60,
-                json={"model": model, "messages": messages, "temperature": temperature} )
-            res_json = response.json()
-            content = res_json["choices"][0]["message"]["content"]
-            return content
+            with requests.post(f"{self.api_base}/chat/completions",
+                            headers=headers, json=payload, stream=True, timeout=(5, 60)) as r:
+                r.raise_for_status()
+                buffer = ''
+                for line in r.iter_lines():
+                    line = line.decode("utf-8")
+                    if not line or not line.startswith("data:"): continue
+                    data = line[5:].lstrip()
+                    if data == "[DONE]": break
+                    obj = json.loads(data)
+                    ch = (obj.get("choices") or [{}])[0]
+                    if ch.get("finish_reason") is not None: break
+                    delta = (ch.get("delta") or {}).get("content")
+                    if not delta: continue
+                    yield delta
+                    buffer += delta
+                    if '</tool_use>' in buffer[-30:]: break
         except Exception as e:
-            return f"Error: {str(e)}"
-        
+            yield f"Error: {str(e)}"
+
     def make_messages(self, raw_list, omit_images=True):
         messages = []
         for msg in raw_list:
@@ -60,22 +74,28 @@ class LLMSession:
         p = "Summarize prev summary and prev conversations into compact memory (facts/decisions/constraints/open questions). Do NOT restate long schemas. The new summary should less than 1000 tokens.\n"
         messages = self.make_messages(old, omit_images=True)
         messages += [{"role":"user", "content":p}]
-        summary = self.raw_ask(messages, model, temperature=0.1)
+        summary = ''.join(list(self.raw_ask(messages, model, temperature=0.1)))
         if not summary.startswith("Error:"): 
             self.raw_msgs.insert(0, {"role":"system", "prompt":"Prev summary:\n"+summary, "image":None})
         else: self.raw_msgs = old + self.raw_msgs   # 不做了，下次再做
 
-    def ask(self, prompt, model=None, image_base64=None):
+    def ask(self, prompt, model=None, image_base64=None, stream=False):
         if model is None: model = self.model
         self.raw_msgs.append({"role": "user", "prompt": prompt, "image": image_base64})
         messages = self.make_messages(self.raw_msgs[:-1], omit_images=True)
         messages += self.make_messages([self.raw_msgs[-1]], omit_images=False)
         total_len = sum(2000 if isinstance(m["content"], list) else len(str(m["content"]))//4 for m in messages)   # estimate token count
-        content = self.raw_ask(messages, model)
-        if not content.startswith("Error:"):
-            self.raw_msgs.append({"role": "assistant", "prompt": content, "image": None})
-        if total_len > self.context_win: self.summary_history()
-        return content
+        gen = self.raw_ask(messages, model)
+        def _ask_gen():
+            content = ''
+            for chunk in gen:
+                content += chunk; yield chunk
+            if not content.startswith("Error:"):
+                self.raw_msgs.append({"role": "assistant", "prompt": content, "image": None})
+            if total_len > 5000: print(f"[Debug] Whole context length {total_len}.")
+            if total_len > self.context_win: self.summary_history()
+        if stream: return _ask_gen()
+        return ''.join(list(_ask_gen())) 
         
 
 class MockFunction:
@@ -109,7 +129,10 @@ class ToolClient:
     def chat(self, messages, tools=None):
         full_prompt = self._build_protocol_prompt(messages, tools)      
         print("Full prompt length:", len(full_prompt))
-        raw_text = self.raw_api(full_prompt)
+        gen = self.raw_api(full_prompt, stream=True)
+        raw_text = ''
+        for chunk in gen:
+            raw_text += chunk; yield chunk
         with open('model_responses.txt', 'a', encoding='utf-8', errors="replace") as f:
             f.write(f"=== Prompt ===\n{full_prompt}\n=== Response ===\n{raw_text}\n\n")
         return self._parse_mixed_response(raw_text)
@@ -127,7 +150,7 @@ class ToolClient:
 请按照以下步骤思考并行动：
 1. **思考**: 在 `<thinking>` 标签中先进行思考，分析现状和策略。
 2. **总结**: 在 `<summary>` 中输出*极为简短*的高度概括的单行（<30字）物理快照，包括上次工具调用结果获取的新信息+本次工具调用意图和预期。此内容将进入长期工作记忆，记录关键信息，严禁输出无实际信息增量的描述。
-3. **行动**: 如果需要调用工具，请紧接着输出一个 **<tool_use>块**，然后结束，我会稍后给你返回<tool_result>块。
+3. **行动**: 如果需要调用工具，请在回复正文之后输出一个 **<tool_use>块**，然后结束，我会稍后给你返回<tool_result>块。
    格式: ```<tool_use>\n{{"function": "工具名", "arguments": {{参数}}}}\n</tool_use>\n```
 
 ### 可用工具库
@@ -164,7 +187,7 @@ class ToolClient:
         
         tool_calls = None
         tool_pattern = r"<tool_use>(.*?)</tool_use>"
-        tool_match = re.search(tool_pattern, text, re.DOTALL)
+        tool_match = re.search(tool_pattern, remaining_text, re.DOTALL)
         
         json_str = ""
         if tool_match:
@@ -173,6 +196,8 @@ class ToolClient:
         elif '<tool_use>' in remaining_text:
             weaktoolstr = remaining_text.split('<tool_use>')[-1].strip()
             json_str = weaktoolstr if weaktoolstr.endswith('}') else ''
+            if json_str == '' and '```' in weaktoolstr and weaktoolstr.split('```')[0].strip().endswith('}'):
+                json_str = weaktoolstr.split('```')[0].strip()
             remaining_text = remaining_text.replace('<tool_use>'+weaktoolstr, "")
 
         if json_str:
@@ -184,7 +209,7 @@ class ToolClient:
                 if func_name: tool_calls = [MockToolCall(func_name, args)]
             except json.JSONDecodeError:
                 print("[Warn] Failed to parse tool_use JSON:", json_str)
-                thinking += f"[Warn] JSON 解析失败，模型输出了无效的 JSON."
+                remaining_text += f"[Warning] JSON 解析失败，模型输出了无效的 JSON."
             except Exception as e:
                 print("[Error] Exception during tool_use parsing:", str(e), data)
 
@@ -198,20 +223,32 @@ def tryparse(json_str):
         return json.loads(json_str[:-1])
 
 if __name__ == "__main__":
-    llmclient = ToolClient(LLMSession().ask)
-    response = llmclient.chat(
+    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+    try: from mykey import sider_cookie
+    except ImportError: sider_cookie = ""
+    try: from mykey import oai_apikey, oai_apibase, oai_model
+    except ImportError: oai_apikey = oai_apibase = oai_model = ""
+
+    llmclient = ToolClient(LLMSession(api_key=oai_apikey, api_base=oai_apibase, model=oai_model).ask)
+    print(llmclient.raw_api("Hello, world!", stream=False))
+    #llmclient = ToolClient(SiderLLMSession().ask)
+    def get_final(gen):
+        try:
+            while True: 
+                print('mid:', next(gen))
+        except StopIteration as e:
+            return e.value
+        
+    response = get_final(llmclient.chat(
         messages=[{"role": "user", "content": "我的IP是多少"}], 
         tools=[{"name": "get_ip", "parameters": {}}]
-    )
-    # 4. 获取结果
+    ))
     print(f"思考: {response.thinking}") 
-    # -> 我需要查一下 IP。
-
     if response.tool_calls:
         cmd = response.tool_calls[0]
         print(f"调用: {cmd.function.name} 参数: {cmd.function.arguments}")
 
-    response = llmclient.chat(
+    response = get_final(llmclient.chat(
         messages=[{"role": "user", "content": "<tool_result>10.176.45.12</tool_result>"}] 
-    )
+    ))
     print(response.content)
\ No newline at end of file
diff --git a/stapp.py b/stapp.py
index 9607f95..826c548 100644
--- a/stapp.py
+++ b/stapp.py
@@ -35,27 +35,32 @@ def render_llm_switcher():
     current_idx = agent.llm_no
     st.caption(f"LLM Core: {current_idx}")
     if st.button("切换备用链路"):
-        agent.llm_no = (current_idx + 1) % len(agent.llmclient.raw_apis)
+        agent.next_llm()
         st.rerun(scope="fragment")
     if st.button("强行停止任务"):
         agent.abort()
         st.toast("已发送停止信号")
+    if st.button("重新注入System Prompt"):
+        agent.llmclient.last_tools = ''
+        st.toast("下次将重新注入System Prompt")
 with st.sidebar: render_llm_switcher()
 
 @st.fragment(run_every="1s")
 def global_queue_listener():
-    if agent.current_source != 'auto': return
-    while not agent.display_queue.empty():
-        item = agent.display_queue.get()
-        if 'next' in item:
-            st.session_state.idle_buf = item['next']
-        if 'done' in item:
-            st.session_state.messages.append({"role": "assistant", "content": f"{item['done']}"})
-            st.session_state.idle_buf = ""
-            st.rerun()
-    if st.session_state.get("idle_buf"):
-        with st.chat_message("assistant"):
-            st.write(st.session_state.idle_buf + "▌")
+    if agent.current_source == 'auto':
+        while not agent.display_queue.empty():
+            item = agent.display_queue.get()
+            if item.get('source') == 'auto':
+                if 'next' in item: st.session_state.idle_buf = item['next']
+                if 'done' in item:
+                    st.session_state.messages.append({"role": "assistant", "content": f"🤖 {item['done']}"})
+                    st.session_state.idle_buf = ""; st.rerun()
+        if st.session_state.get("idle_buf"):
+            with st.chat_message("assistant"):
+                st.write(st.session_state.idle_buf + "▌")
+    else:
+        st.caption("🟢 Agent Listener Active", help=f"Last sync: {int(time.time())}")
+        st.session_state.idle_buf = "" 
 
 global_queue_listener()
 
@@ -65,10 +70,10 @@ def agent_backend_stream(prompt):
         while True:
             item = agent.display_queue.get()
             if 'next' in item: yield item['next'] 
-            if 'done' in item: break
+            if 'done' in item: 
+                yield item['done']; break
     finally:
         agent.abort()
-        print('User aborted the operation.')
 
 if prompt := st.chat_input("请输入指令"):
     st.session_state.messages.append({"role": "user", "content": prompt})