From eb8834b9dffe644269471a7f1461595a68077ad9 Mon Sep 17 00:00:00 2001
From: Jiaqing Liang
Date: Mon, 23 Mar 2026 15:26:05 +0800
Subject: [PATCH] feat: enhance text_only mode - add form element annotations,
 block-level line breaks, and whitespace cleanup

---
 assets/insight_fixed_structure.txt | 10 +++++-----
 simphtml.py                        | 34 +++++++++++++++++++++++++++++++---
 2 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/assets/insight_fixed_structure.txt b/assets/insight_fixed_structure.txt
index d90f480..d420aa3 100644
--- a/assets/insight_fixed_structure.txt
+++ b/assets/insight_fixed_structure.txt
@@ -2,8 +2,8 @@ Facts(L2): ../memory/global_mem.txt | Code: ../ | SOPs(L3): ../memory/*.md or *.
 Insight是极简索引,L2/L3变更时同步Insight,索引必须极简。写记忆前先读META-SOP(L0)。
 
 [CONSTITUTION]
-1. 改自身源码先请示;./内可自主实验,允许装包和portable工具。
-2. 决策前查记忆库;未查证不断言。
-3. 分步执行,控制粒度,限制失败半径;3次失败请求干预。
-4. 密钥文件仅引用,不读取/移动。
-5. 写任何记忆前读META-SOP核验,memory下文件只能patch修改(除非新建)。
+1. 改自身源码先请示;./内可自主实验,允许装包和portable工具
+2. 决策前查记忆库;未查证不断言
+3. 分步执行,控制粒度,限制失败半径;3次失败请求干预
+4. 密钥文件仅引用,不读取/移动
+5. 写任何记忆前读META-SOP核验,memory下文件只能patch修改(除非新建)
diff --git a/simphtml.py b/simphtml.py
index b5294da..ff18f11 100644
--- a/simphtml.py
+++ b/simphtml.py
@@ -99,7 +99,18 @@ function createEnhancedDOMCopy() {
     };
 }
 const { domCopy, getNodeInfo, isVisible } = createEnhancedDOMCopy();
-if (text_only) return domCopy.innerText;
+if (text_only) {
+    const blocks = new Set(['DIV','P','H1','H2','H3','H4','H5','H6','LI','TR','SECTION','ARTICLE','HEADER','FOOTER','NAV','BLOCKQUOTE','PRE','HR','BR','DT','DD','FIGCAPTION','DETAILS','SUMMARY']);
+    domCopy.querySelectorAll('*').forEach(el => {
+        if (blocks.has(el.tagName)) el.insertAdjacentText('beforebegin', '\n');
+    });
+    domCopy.querySelectorAll('input:not([type=hidden]),textarea,select').forEach(el=>{
+        const p=[el.tagName,el.id&&'#'+el.id,el.getAttribute('name')&&'name='+el.getAttribute('name'),el.tagName==='INPUT'&&'type='+(el.getAttribute('type')||'text'),el.getAttribute('placeholder')&&'"'+el.getAttribute('placeholder')+'"',el.getAttribute('data-autofilled')&&'autofilled',el.disabled&&'disabled',el.tagName==='SELECT'&&el.getAttribute('data-selected')&&'="'+el.getAttribute('data-selected')+'"'].filter(Boolean).join(' ');
+        el.insertAdjacentText('beforebegin','\n['+p+']\n');
+    });
+    domCopy.querySelectorAll('button[disabled]').forEach(el=>el.insertAdjacentText('beforebegin','[DISABLED] '));
+    return domCopy.textContent;
+}
 const viewportArea = window.innerWidth * window.innerHeight;
 
 function analyzeNode(node, pPathType='main') {
@@ -768,9 +779,26 @@ def get_temp_texts(driver):
         print(e)
         return []
 
-import time
+import time, re
 def get_main_block(driver, extra_js="", text_only=False):
-    return driver.execute_js(f"{extra_js}\n{js_optHTML}\nreturn optHTML({str(text_only).lower()});").get('data', '')
+    """Run optHTML in the page and return the 'data' field of the result.
+
+    extra_js is executed ahead of the optHTML script. With text_only set and
+    a string result, the text is tidied before returning: runs of spaces
+    collapse to one, spaces at the start of a line are dropped, three or
+    more consecutive line breaks shrink to one blank line, and the ends
+    are stripped.
+    """
+    page = driver.execute_js(f"{extra_js}\n{js_optHTML}\nreturn optHTML({str(text_only).lower()});").get('data', '')
+    # NOTE(review): execute_js lives outside this patch; should it ever store
+    # a non-string under 'data' (None, for instance), skip the cleanup rather
+    # than let re.sub raise TypeError.
+    if text_only and isinstance(page, str):
+        page = re.sub(r' {2,}', ' ', page)  # runs of spaces -> one space
+        page = re.sub(r'^ +', '', page, flags=re.M)  # drop leading spaces on each line
+        page = re.sub(r'(\n\s*){3,}', '\n\n', page)  # 3+ line breaks -> one blank line
+        return page.strip()
+    return page
 
 def find_changed_elements(before_html, after_html):
     before_soup = BeautifulSoup(before_html, 'html.parser')