refactor: restructure memory hierarchy and fix ljqCtrl coordinate logic

This commit is contained in:
Jiaqing Liang
2026-02-11 12:54:09 +08:00
parent 8f593c4eaf
commit 1276021422
7 changed files with 33 additions and 22 deletions

View File

@@ -75,7 +75,7 @@ def agent_runner_loop(client, system_prompt, user_input, handler, tools_schema,
next_prompt += outcome.next_prompt next_prompt += outcome.next_prompt
if (turn+1) % 5 == 0: if (turn+1) % 5 == 0:
next_prompt += f"\n\n[DANGER] 已连续执行第 {turn+1} 轮。禁止无效重试。若无有效进展必须切换策略1. 探测物理边界 2. 请求用户协助。" next_prompt += f"\n\n[DANGER] 已连续执行第 {turn+1} 轮。禁止无效重试。若无有效进展必须切换策略1. 探测物理边界 2. 请求用户协助。"
if turn == 23: if (turn+1) % 25 == 0:
next_prompt += f"\n\n[DANGER] 已连续执行第 {turn+1} 轮。你必须总结情况进行ask_user不允许继续重试。" next_prompt += f"\n\n### [DANGER] 已连续执行第 {turn+1} 轮。你必须总结情况进行ask_user不允许继续重试。"
messages = [{"role": "user", "content": next_prompt}] messages = [{"role": "user", "content": next_prompt}]
return {'result': 'MAX_TURNS_EXCEEDED'} return {'result': 'MAX_TURNS_EXCEEDED'}

View File

@@ -81,7 +81,7 @@ class GeneraticAgent:
self.handler = handler self.handler = handler
self.llmclient.backend = self.llmclient.backends[self.llm_no] self.llmclient.backend = self.llmclient.backends[self.llm_no]
gen = agent_runner_loop(self.llmclient, sys_prompt, gen = agent_runner_loop(self.llmclient, sys_prompt,
raw_query, handler, TOOLS_SCHEMA, max_turns=25) raw_query, handler, TOOLS_SCHEMA, max_turns=40)
try: try:
full_response = ""; last_pos = 0 full_response = ""; last_pos = 0

View File

@@ -34,23 +34,23 @@
}}, }},
{"type": "function", "function": { {"type": "function", "function": {
"name": "web_scan", "name": "web_scan",
"description": "获取当前网页的清洗后内容,并列出所有已打开的标签页。支持切换标签页。在长页面中,可以使用 focus_item 进行语义过滤以提取关键信息。", "description": "获取当前网页的清洗后内容,并列出所有已打开的标签页。支持切换标签页。切换页面后一般应先调用查看。",
"parameters": {"type": "object", "properties": { "parameters": {"type": "object", "properties": {
"focus_item": {"type": "string", "description": "语义过滤指令,用于在长列表中优先保留与该关键词相关的项。"}, "focus_item": {"type": "string", "description": "语义过滤指令,用于在长列表中优先保留与该关键词相关的项。"},
"switch_tab_id": {"type": "string", "description": "可选的标签页 ID。如果提供系统将在扫描前切换到该标签页。"}}} "switch_tab_id": {"type": "string", "description": "可选的标签页 ID。如果提供系统将在扫描前切换到该标签页。"}}}
}}, }},
{"type": "function", "function": { {"type": "function", "function": {
"name": "web_execute_js", "name": "web_execute_js",
"description": "万能网页操控工具。通过执行 JavaScript 脚本实现对浏览器的完全控制(如点击、滚动、提取特定数据)。这是 Web 场景下的首选工具。执行结果可选择保存到本地文件进行后续分析。", "description": "万能网页操控工具。通过执行 JavaScript 脚本实现对浏览器的完全控制(如点击、滚动、提取特定数据)。鼓励在有把握情况下记忆中有selector/做法等精准使用以减少web_scan调用。执行结果可选择保存到本地文件进行后续分析。",
"parameters": {"type": "object", "properties": { "parameters": {"type": "object", "properties": {
"script": {"type": "string", "description": "要执行的 JavaScript 代码。"}, "script": {"type": "string", "description": "要执行的 JavaScript 代码。"},
"save_to_file": {"type": "string", "description": "可选。将 JS 执行结果js_return保存到的文件路径。注意:该功能不支持 await 等异步结果。"}}, "required": ["script"]} "save_to_file": {"type": "string", "description": "可选。将 JS 执行结果js_return保存到的文件路径。该功能不支持 await 等异步结果。"}}, "required": ["script"]}
}}, }},
{"type": "function", "function": { {"type": "function", "function": {
"name": "update_working_mem", "name": "update_working_mem",
"description": "更新当前任务的工作记忆。当任务目标细化、阶段切换或发现关键物理约束(坑)时调用,内容将作为物理锚点在后续每轮自动注入,强行锁定推理方向,防止长程任务逻辑漂移。", "description": "更新当前任务的工作记忆。当任务目标细化、阶段切换或发现关键物理约束(坑)时调用,内容不得超过200 tokens将作为物理锚点在后续每轮自动注入,强行锁定推理方向,防止长程任务逻辑漂移。",
"parameters": {"type": "object", "properties": { "parameters": {"type": "object", "properties": {
"key_info": {"type": "string", "description": "替换掉当前的极简关键信息200tokens以内),包含当前进度和目标、已获取的关键信息及下一步必须绕过的坑,已经解决或完成的步骤可以去掉"}, "key_info": {"type": "string", "description": "替换掉当前的极简关键信息(<200 tokens包含当前进度和目标、已获取的关键信息及下一步必须绕过的坑已经解决或完成的步骤可以去掉"},
"related_sop": {"type": "string", "description": "相关sop名称可以多个必要时需要再读"}}} "related_sop": {"type": "string", "description": "相关sop名称可以多个必要时需要再读"}}}
}}, }},
{"type": "function", "function": { {"type": "function", "function": {

2
ga.py
View File

@@ -458,7 +458,7 @@ class GenericAgentHandler(BaseHandler):
def _get_anchor_prompt(self):
    """Assemble the working-memory anchor text from the handler's state.

    The result always contains a ``<history>`` section holding the last 20
    entries of ``self.history_info`` joined by newlines. A ``<key_info>``
    section is appended when ``self.key_info`` is truthy, and a reminder to
    re-read the related SOP is appended when ``self.related_sop`` is truthy.

    The assembled text is echoed to stdout before being returned.

    Returns:
        str: the anchor prompt.
    """
    recent_entries = self.history_info[-20:]
    # Collect the pieces in order and join once at the end.
    sections = [
        "\n### [WORKING MEMORY]\n<history>\n"
        + "\n".join(recent_entries)
        + "\n</history>"
    ]
    if self.key_info:
        sections.append(f"\n<key_info>{self.key_info}</key_info>")
    if self.related_sop:
        sections.append(f"\n有不清晰的地方请再次读取{self.related_sop}")
    anchor = "".join(sections)
    print(anchor)
    return anchor

View File

@@ -1,11 +1,12 @@
# coding=utf-8 # coding=utf-8
""" """
CRITICAL: 严禁在此工具链中 import pyautogui (会污染 win32api 导致逻辑冲突)。
ljqCtrl Quick Reference: ljqCtrl Quick Reference:
- dpi_scale: float (Physical = Logical / dpi_scale) - dpi_scale: float (Logical = Physical * dpi_scale)
- Click(x, y=None): Click logical/physical coordinates - Click(x, y): Use Physical Coordinates (from screenshots)
- SetCursorPos(z): Move mouse to logical coordinate z=(x, y) - SetCursorPos(z): Use Physical Coordinates z=(x, y)
- Press(cmd, staytime=0): Keyboard shortcuts (e.g. 'ctrl+c') - Press(cmd, staytime=0): Keyboard shortcuts (e.g. 'ctrl+v')
- FindBlock(fn, wrect=None, threshold=0.8) -> (obj_center, is_found) - FindBlock(fn, wrect=None, threshold=0.8) -> (obj_center_phys, is_found)
- MouseDClick(staytime=0.05), MouseClick(staytime=0.05) - MouseDClick(staytime=0.05), MouseClick(staytime=0.05)
""" """

View File

@@ -12,7 +12,7 @@
--- ---
## 记忆层级架构 ## 记忆层级架构
``` ```
L1: global_mem_insight.txt (极简索引层 - 严格控制 ≤50 行) L1: global_mem_insight.txt (极简索引层 - 严格控制 ≤30 行)
↓ 导航指向 (Pointer) ↓ 导航指向 (Pointer)
L2: global_mem.txt (事实库层 - 现短但会膨胀) L2: global_mem.txt (事实库层 - 现短但会膨胀)
↓ 详细引用 (Reference) ↓ 详细引用 (Reference)

View File

@@ -56,7 +56,7 @@ class GeminiSession:
return iter([full_text]) if stream else full_text return iter([full_text]) if stream else full_text
class LLMSession: class LLMSession:
def __init__(self, api_key=oai_apikey, api_base=oai_apibase, model=oai_model, context_win=12000): def __init__(self, api_key=oai_apikey, api_base=oai_apibase, model=oai_model, context_win=16000):
self.api_key = api_key self.api_key = api_key
self.api_base = api_base self.api_base = api_base
self.raw_msgs = [] self.raw_msgs = []
@@ -202,18 +202,18 @@ class ToolClient:
if tools: if tools:
tools_json = json.dumps(tools, ensure_ascii=False, separators=(',', ':')) tools_json = json.dumps(tools, ensure_ascii=False, separators=(',', ':'))
tool_instruction = f""" tool_instruction = f"""
### 交互协议 (必须严格遵守) ### 交互协议 (必须严格遵守,持续有效)
请按照以下步骤思考并行动,标签之间需要回车换行: 请按照以下步骤思考并行动,标签之间需要回车换行:
1. **思考**: 在 `<thinking>` 标签中先进行思考,分析现状和策略。 1. **思考**: 在 `<thinking>` 标签中先进行思考,分析现状和策略。
2. **总结**: 在 `<summary>` 中输出*极为简短*的高度概括的单行(<30字物理快照包括上次工具调用结果获取的新信息+本次工具调用意图和预期。此内容将进入长期工作记忆,记录关键信息,严禁输出无实际信息增量的描述。 2. **总结**: 在 `<summary>` 中输出*极为简短*的高度概括的单行(<30字物理快照包括上次工具调用结果获取的新信息+本次工具调用意图和预期。此内容将进入长期工作记忆,记录关键信息,严禁输出无实际信息增量的描述。
3. **行动**: 如果需要调用工具,请在回复正文之后输出一个 **<tool_use>块**,然后结束,我会稍后给你返回<tool_result>块。 3. **行动**: 如果需要调用工具,请在回复正文之后输出一个 **<tool_use>块**,然后结束,我会稍后给你返回<tool_result>块。
格式: ```<tool_use>\n{{"name": "工具名", "arguments": {{参数}}}}\n</tool_use>\n``` 格式: ```<tool_use>\n{{"name": "工具名", "arguments": {{参数}}}}\n</tool_use>\n```
### 可用工具库 ### 可用工具库(已挂载,持续有效)
{tools_json} {tools_json}
""" """
if self.auto_save_tokens and self.last_tools == tools_json: if self.auto_save_tokens and self.last_tools == tools_json:
tool_instruction = "\n### 交互协议保持不变,沿用之前的协议和工具库\n" tool_instruction = "\n### 工具库状态持续有效code_run/file_read等**可正常调用**。调用协议沿用\n"
else: else:
self.total_cd_tokens = 0 self.total_cd_tokens = 0
self.last_tools = tools_json self.last_tools = tools_json
@@ -255,7 +255,11 @@ class ToolClient:
if json_str == '' and '```' in weaktoolstr and weaktoolstr.split('```')[0].strip().endswith('}'): if json_str == '' and '```' in weaktoolstr and weaktoolstr.split('```')[0].strip().endswith('}'):
json_str = weaktoolstr.split('```')[0].strip() json_str = weaktoolstr.split('```')[0].strip()
remaining_text = remaining_text.replace('<tool_use>'+weaktoolstr, "") remaining_text = remaining_text.replace('<tool_use>'+weaktoolstr, "")
elif '"name":' in remaining_text and '"arguments":' in remaining_text:
json_match = re.search(r"(\{.*\"name\":.*?\})", remaining_text, re.DOTALL | re.MULTILINE)
if json_match:
json_str = json_match.group(1).strip()
remaining_text = remaining_text.replace(json_str, "").strip()
if json_str: if json_str:
try: try:
data = tryparse(json_str) data = tryparse(json_str)
@@ -275,8 +279,14 @@ class ToolClient:
def tryparse(json_str):
    """Parse a JSON payload, tolerating common wrapping noise.

    Attempts, in order, and returns the first successful result:

    1. the input as given;
    2. the input with surrounding whitespace and backticks stripped and the
       first ``json`` language tag (any case, optional trailing spaces/tabs,
       LF or CRLF line ending) removed;
    3. that cleaned text minus its last character (one stray trailing char);
    4. that cleaned text truncated after its last ``}``.

    Args:
        json_str: the candidate JSON text.

    Returns:
        The decoded Python object.

    Raises:
        json.JSONDecodeError: when every attempt fails (raised by the final
            attempt, so the error refers to the cleaned/truncated text).
    """
    # Only decoding failures trigger a fallback. JSONDecodeError subclasses
    # ValueError, so KeyboardInterrupt/SystemExit are no longer swallowed.
    try:
        return json.loads(json_str)
    except ValueError:
        pass

    cleaned = json_str.strip().strip('`')
    # Drop the code-fence language tag. Unlike a literal 'json\n' replace this
    # also handles 'JSON', 'json \n' and Windows line endings.
    cleaned = re.sub(r'json[ \t]*\r?\n', '', cleaned, count=1,
                     flags=re.IGNORECASE).strip()

    for candidate in (cleaned, cleaned[:-1]):
        try:
            return json.loads(candidate)
        except ValueError:
            continue

    # Last resort: discard anything after the final closing brace.
    last_brace = cleaned.rfind('}')
    if last_brace != -1:
        cleaned = cleaned[:last_brace + 1]
    return json.loads(cleaned)
if __name__ == "__main__": if __name__ == "__main__":
import sys, os import sys, os