From 33f2df4270f0ac38af9039a8525a994ce39f591a Mon Sep 17 00:00:00 2001 From: Jiaqing Liang Date: Wed, 11 Mar 2026 14:48:19 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0web=5Fscan=E7=9A=84te?= =?UTF-8?q?xt=5Fonly=E5=8F=82=E6=95=B0=E6=94=AF=E6=8C=81Shadow=20DOM?= =?UTF-8?q?=E6=96=87=E6=9C=AC=E6=8F=90=E5=8F=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- assets/tools_schema.json | 3 ++- ga.py | 13 ++++++------- simphtml.py | 20 ++++++++++++++------ 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/assets/tools_schema.json b/assets/tools_schema.json index 6c73195..0d34f01 100644 --- a/assets/tools_schema.json +++ b/assets/tools_schema.json @@ -37,7 +37,8 @@ "description": "获取当前页面的简化HTML内容和标签页列表。注意:简化会过滤边栏、浮动元素等非主体内容,如需查看被过滤内容请用execute_js。切换页面后一般应先调用查看。", "parameters": {"type": "object", "properties": { "tabs_only": {"type": "boolean", "description": "仅返回标签页列表和当前标签信息,不获取HTML内容。", "default": false}, - "switch_tab_id": {"type": "string", "description": "可选的标签页 ID。如果提供,系统将在扫描前切换到该标签页。"}}} + "switch_tab_id": {"type": "string", "description": "可选的标签页 ID。如果提供,系统将在扫描前切换到该标签页。"}, + "text_only": {"type": "boolean", "description": "只要纯文本不要HTML信息。", "default": false}}} }}, {"type": "function", "function": { "name": "web_execute_js", diff --git a/ga.py b/ga.py index 20dfa79..0ffeb12 100644 --- a/ga.py +++ b/ga.py @@ -109,7 +109,7 @@ def first_init_driver(): #driver.newtab() time.sleep(3) -def web_scan(tabs_only=False, switch_tab_id=None): +def web_scan(tabs_only=False, switch_tab_id=None, text_only=False): """ 获取当前页面的简化HTML内容和标签页列表。注意:简化过程会过滤边栏、浮动元素等非主体内容。 tabs_only: 仅返回标签页列表,不获取HTML内容(节省token)。 @@ -135,7 +135,7 @@ def web_scan(tabs_only=False, switch_tab_id=None): "active_tab": driver.default_session_id } } - if not tabs_only: result["content"] = get_html(driver, cutlist=True, maxchars=28000) + if not tabs_only: result["content"] = get_html(driver, cutlist=True, maxchars=28000, text_only=text_only) return result except Exception as e: return {"status": "error", "msg": format_error(e)} @@ -243,8 +243,7 @@ def smart_format(data, max_depth=2, max_str_len=100, omit_str=' ... '): return json.dumps(truncate(data, 0), indent=2, ensure_ascii=False, default=str) class GenericAgentHandler(BaseHandler): - '''Generic Agent 工具库,包含多种工具的实现。工具函数自动加上了 do_ 前缀。实际工具名没有前缀。 - ''' + '''Generic Agent 工具库,包含多种工具的实现。工具函数自动加上了 do_ 前缀。实际工具名没有前缀。''' def __init__(self, parent, last_history=None, cwd='./'): self.parent = parent self.key_info = "" @@ -302,7 +301,8 @@ class GenericAgentHandler(BaseHandler): ''' tabs_only = args.get("tabs_only", False) switch_tab_id = args.get("switch_tab_id", None) - result = web_scan(tabs_only=tabs_only, switch_tab_id=switch_tab_id) + text_only = args.get("text_only", False) + result = web_scan(tabs_only=tabs_only, switch_tab_id=switch_tab_id, text_only=text_only) content = result.pop("content", None) yield f'[Info] {str(result)}\n' if content: next_prompt = f"\n```html\n{content}\n```\n" @@ -373,8 +373,7 @@ class GenericAgentHandler(BaseHandler): old = open(path, 'r', encoding="utf-8").read() if os.path.exists(path) else "" open(path, 'w', encoding="utf-8").write(new_content + old) else: - with open(path, 'a' if mode == "append" else 'w', encoding="utf-8") as f: - f.write(new_content) + with open(path, 'a' if mode == "append" else 'w', encoding="utf-8") as f: f.write(new_content) yield f"[Status] ✅ {mode.capitalize()} 成功 ({len(new_content)} bytes)\n" next_prompt = self._get_anchor_prompt() return StepOutcome({"status": "success", 'writed_bytes': len(new_content)}, diff --git a/simphtml.py b/simphtml.py index d68aafd..76e9534 100644 --- a/simphtml.py +++ b/simphtml.py @@ -1,7 +1,7 @@ try: from bs4 import BeautifulSoup except ImportError: print("[Error] BeautifulSoup4 未安装,请叫Agent安装BeautifulSoup4,再使用web相关工具。") -js_optHTML = r'''function optHTML() { +js_optHTML = r'''function optHTML(text_only=false) { function createEnhancedDOMCopy() { const nodeInfo = new WeakMap(); const ignoreTags = ['SCRIPT', 'STYLE', 'NOSCRIPT', 'META', 'LINK', 'COLGROUP', 'COL', 'TEMPLATE', 'PARAM', 'SOURCE']; @@ -43,6 +43,12 @@ function createEnhancedDOMCopy() { } } catch(e) {} } + if (sourceNode.shadowRoot) { + for (const shadowChild of sourceNode.shadowRoot.childNodes) { + const shadowClone = cloneNode(shadowChild, keep); + if (shadowClone) childNodes.push(shadowClone); + } + } const rect = sourceNode.getBoundingClientRect(); const style = window.getComputedStyle(sourceNode); @@ -90,7 +96,8 @@ function createEnhancedDOMCopy() { } }; } -const { domCopy, getNodeInfo, isVisible } = createEnhancedDOMCopy(); +const { domCopy, getNodeInfo, isVisible } = createEnhancedDOMCopy(); +if (text_only) return domCopy.innerText; const viewportArea = window.innerWidth * window.innerHeight; function analyzeNode(node, pPathType='main') { @@ -823,8 +830,8 @@ def get_temp_texts(driver): return [] import time -def get_main_block(driver, extra_js=""): - return driver.execute_js(extra_js+'\n'+js_optHTML).get('data', '') +def get_main_block(driver, extra_js="", text_only=False): + return driver.execute_js(f"{extra_js}\n{js_optHTML}\nreturn optHTML({str(text_only).lower()});").get('data', '') def find_changed_elements(before_html, after_html): before_soup = BeautifulSoup(before_html, 'html.parser') @@ -855,8 +862,9 @@ def find_changed_elements(before_html, after_html): result["top_change"] = h if len(h) <= 2000 else h[:2000] + '...[TRUNCATED]' return result -def get_html(driver, cutlist=False, maxchars=38000, instruction="", extra_js=""): - page = get_main_block(driver, extra_js=extra_js) +def get_html(driver, cutlist=False, maxchars=38000, instruction="", extra_js="", text_only=False): + page = get_main_block(driver, extra_js=extra_js, text_only=text_only) + if text_only: return page soup = optimize_html_for_tokens(page) html = str(soup) if not cutlist or len(html) <= maxchars: return html