feat: 为web_scan添加text_only参数,并支持Shadow DOM文本提取
This commit is contained in:
@@ -37,7 +37,8 @@
|
|||||||
"description": "获取当前页面的简化HTML内容和标签页列表。注意:简化会过滤边栏、浮动元素等非主体内容,如需查看被过滤内容请用execute_js。切换页面后一般应先调用查看。",
|
"description": "获取当前页面的简化HTML内容和标签页列表。注意:简化会过滤边栏、浮动元素等非主体内容,如需查看被过滤内容请用execute_js。切换页面后一般应先调用查看。",
|
||||||
"parameters": {"type": "object", "properties": {
|
"parameters": {"type": "object", "properties": {
|
||||||
"tabs_only": {"type": "boolean", "description": "仅返回标签页列表和当前标签信息,不获取HTML内容。", "default": false},
|
"tabs_only": {"type": "boolean", "description": "仅返回标签页列表和当前标签信息,不获取HTML内容。", "default": false},
|
||||||
"switch_tab_id": {"type": "string", "description": "可选的标签页 ID。如果提供,系统将在扫描前切换到该标签页。"}}}
|
"switch_tab_id": {"type": "string", "description": "可选的标签页 ID。如果提供,系统将在扫描前切换到该标签页。"},
|
||||||
|
"text_only": {"type": "boolean", "description": "只要纯文本不要HTML信息。", "default": false}}}
|
||||||
}},
|
}},
|
||||||
{"type": "function", "function": {
|
{"type": "function", "function": {
|
||||||
"name": "web_execute_js",
|
"name": "web_execute_js",
|
||||||
|
|||||||
13
ga.py
13
ga.py
@@ -109,7 +109,7 @@ def first_init_driver():
|
|||||||
#driver.newtab()
|
#driver.newtab()
|
||||||
time.sleep(3)
|
time.sleep(3)
|
||||||
|
|
||||||
def web_scan(tabs_only=False, switch_tab_id=None):
|
def web_scan(tabs_only=False, switch_tab_id=None, text_only=False):
|
||||||
"""
|
"""
|
||||||
获取当前页面的简化HTML内容和标签页列表。注意:简化过程会过滤边栏、浮动元素等非主体内容。
|
获取当前页面的简化HTML内容和标签页列表。注意:简化过程会过滤边栏、浮动元素等非主体内容。
|
||||||
tabs_only: 仅返回标签页列表,不获取HTML内容(节省token)。
|
tabs_only: 仅返回标签页列表,不获取HTML内容(节省token)。
|
||||||
@@ -135,7 +135,7 @@ def web_scan(tabs_only=False, switch_tab_id=None):
|
|||||||
"active_tab": driver.default_session_id
|
"active_tab": driver.default_session_id
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if not tabs_only: result["content"] = get_html(driver, cutlist=True, maxchars=28000)
|
if not tabs_only: result["content"] = get_html(driver, cutlist=True, maxchars=28000, text_only=text_only)
|
||||||
return result
|
return result
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return {"status": "error", "msg": format_error(e)}
|
return {"status": "error", "msg": format_error(e)}
|
||||||
@@ -243,8 +243,7 @@ def smart_format(data, max_depth=2, max_str_len=100, omit_str=' ... '):
|
|||||||
return json.dumps(truncate(data, 0), indent=2, ensure_ascii=False, default=str)
|
return json.dumps(truncate(data, 0), indent=2, ensure_ascii=False, default=str)
|
||||||
|
|
||||||
class GenericAgentHandler(BaseHandler):
|
class GenericAgentHandler(BaseHandler):
|
||||||
'''Generic Agent 工具库,包含多种工具的实现。工具函数自动加上了 do_ 前缀。实际工具名没有前缀。
|
'''Generic Agent 工具库,包含多种工具的实现。工具函数自动加上了 do_ 前缀。实际工具名没有前缀。'''
|
||||||
'''
|
|
||||||
def __init__(self, parent, last_history=None, cwd='./'):
|
def __init__(self, parent, last_history=None, cwd='./'):
|
||||||
self.parent = parent
|
self.parent = parent
|
||||||
self.key_info = ""
|
self.key_info = ""
|
||||||
@@ -302,7 +301,8 @@ class GenericAgentHandler(BaseHandler):
|
|||||||
'''
|
'''
|
||||||
tabs_only = args.get("tabs_only", False)
|
tabs_only = args.get("tabs_only", False)
|
||||||
switch_tab_id = args.get("switch_tab_id", None)
|
switch_tab_id = args.get("switch_tab_id", None)
|
||||||
result = web_scan(tabs_only=tabs_only, switch_tab_id=switch_tab_id)
|
text_only = args.get("text_only", False)
|
||||||
|
result = web_scan(tabs_only=tabs_only, switch_tab_id=switch_tab_id, text_only=text_only)
|
||||||
content = result.pop("content", None)
|
content = result.pop("content", None)
|
||||||
yield f'[Info] {str(result)}\n'
|
yield f'[Info] {str(result)}\n'
|
||||||
if content: next_prompt = f"<tool_result>\n```html\n{content}\n```\n</tool_result>"
|
if content: next_prompt = f"<tool_result>\n```html\n{content}\n```\n</tool_result>"
|
||||||
@@ -373,8 +373,7 @@ class GenericAgentHandler(BaseHandler):
|
|||||||
old = open(path, 'r', encoding="utf-8").read() if os.path.exists(path) else ""
|
old = open(path, 'r', encoding="utf-8").read() if os.path.exists(path) else ""
|
||||||
open(path, 'w', encoding="utf-8").write(new_content + old)
|
open(path, 'w', encoding="utf-8").write(new_content + old)
|
||||||
else:
|
else:
|
||||||
with open(path, 'a' if mode == "append" else 'w', encoding="utf-8") as f:
|
with open(path, 'a' if mode == "append" else 'w', encoding="utf-8") as f: f.write(new_content)
|
||||||
f.write(new_content)
|
|
||||||
yield f"[Status] ✅ {mode.capitalize()} 成功 ({len(new_content)} bytes)\n"
|
yield f"[Status] ✅ {mode.capitalize()} 成功 ({len(new_content)} bytes)\n"
|
||||||
next_prompt = self._get_anchor_prompt()
|
next_prompt = self._get_anchor_prompt()
|
||||||
return StepOutcome({"status": "success", 'writed_bytes': len(new_content)},
|
return StepOutcome({"status": "success", 'writed_bytes': len(new_content)},
|
||||||
|
|||||||
18
simphtml.py
18
simphtml.py
@@ -1,7 +1,7 @@
|
|||||||
try: from bs4 import BeautifulSoup
|
try: from bs4 import BeautifulSoup
|
||||||
except ImportError: print("[Error] BeautifulSoup4 未安装,请叫Agent安装BeautifulSoup4,再使用web相关工具。")
|
except ImportError: print("[Error] BeautifulSoup4 未安装,请叫Agent安装BeautifulSoup4,再使用web相关工具。")
|
||||||
|
|
||||||
js_optHTML = r'''function optHTML() {
|
js_optHTML = r'''function optHTML(text_only=false) {
|
||||||
function createEnhancedDOMCopy() {
|
function createEnhancedDOMCopy() {
|
||||||
const nodeInfo = new WeakMap();
|
const nodeInfo = new WeakMap();
|
||||||
const ignoreTags = ['SCRIPT', 'STYLE', 'NOSCRIPT', 'META', 'LINK', 'COLGROUP', 'COL', 'TEMPLATE', 'PARAM', 'SOURCE'];
|
const ignoreTags = ['SCRIPT', 'STYLE', 'NOSCRIPT', 'META', 'LINK', 'COLGROUP', 'COL', 'TEMPLATE', 'PARAM', 'SOURCE'];
|
||||||
@@ -43,6 +43,12 @@ function createEnhancedDOMCopy() {
|
|||||||
}
|
}
|
||||||
} catch(e) {}
|
} catch(e) {}
|
||||||
}
|
}
|
||||||
|
if (sourceNode.shadowRoot) {
|
||||||
|
for (const shadowChild of sourceNode.shadowRoot.childNodes) {
|
||||||
|
const shadowClone = cloneNode(shadowChild, keep);
|
||||||
|
if (shadowClone) childNodes.push(shadowClone);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const rect = sourceNode.getBoundingClientRect();
|
const rect = sourceNode.getBoundingClientRect();
|
||||||
const style = window.getComputedStyle(sourceNode);
|
const style = window.getComputedStyle(sourceNode);
|
||||||
@@ -91,6 +97,7 @@ function createEnhancedDOMCopy() {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
const { domCopy, getNodeInfo, isVisible } = createEnhancedDOMCopy();
|
const { domCopy, getNodeInfo, isVisible } = createEnhancedDOMCopy();
|
||||||
|
if (text_only) return domCopy.innerText;
|
||||||
const viewportArea = window.innerWidth * window.innerHeight;
|
const viewportArea = window.innerWidth * window.innerHeight;
|
||||||
|
|
||||||
function analyzeNode(node, pPathType='main') {
|
function analyzeNode(node, pPathType='main') {
|
||||||
@@ -823,8 +830,8 @@ def get_temp_texts(driver):
|
|||||||
return []
|
return []
|
||||||
|
|
||||||
import time
|
import time
|
||||||
def get_main_block(driver, extra_js=""):
|
def get_main_block(driver, extra_js="", text_only=False):
|
||||||
return driver.execute_js(extra_js+'\n'+js_optHTML).get('data', '')
|
return driver.execute_js(f"{extra_js}\n{js_optHTML}\nreturn optHTML({str(text_only).lower()});").get('data', '')
|
||||||
|
|
||||||
def find_changed_elements(before_html, after_html):
|
def find_changed_elements(before_html, after_html):
|
||||||
before_soup = BeautifulSoup(before_html, 'html.parser')
|
before_soup = BeautifulSoup(before_html, 'html.parser')
|
||||||
@@ -855,8 +862,9 @@ def find_changed_elements(before_html, after_html):
|
|||||||
result["top_change"] = h if len(h) <= 2000 else h[:2000] + '...[TRUNCATED]'
|
result["top_change"] = h if len(h) <= 2000 else h[:2000] + '...[TRUNCATED]'
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def get_html(driver, cutlist=False, maxchars=38000, instruction="", extra_js=""):
|
def get_html(driver, cutlist=False, maxchars=38000, instruction="", extra_js="", text_only=False):
|
||||||
page = get_main_block(driver, extra_js=extra_js)
|
page = get_main_block(driver, extra_js=extra_js, text_only=text_only)
|
||||||
|
if text_only: return page
|
||||||
soup = optimize_html_for_tokens(page)
|
soup = optimize_html_for_tokens(page)
|
||||||
html = str(soup)
|
html = str(soup)
|
||||||
if not cutlist or len(html) <= maxchars: return html
|
if not cutlist or len(html) <= maxchars: return html
|
||||||
|
|||||||
Reference in New Issue
Block a user