From 555eeabf56a6882997c7cf5e2921d074a2f30e45 Mon Sep 17 00:00:00 2001 From: Liang Jiaqing Date: Fri, 3 Apr 2026 22:34:41 +0800 Subject: [PATCH] simphtml: SVG className fix, CSS.escape, DOMRect compat, cutlist tuning; tmwebdriver_sop update --- memory/tmwebdriver_sop.md | 13 +++++++++++-- simphtml.py | 35 +++++++++++++++++++++++------------ 2 files changed, 34 insertions(+), 14 deletions(-) diff --git a/memory/tmwebdriver_sop.md b/memory/tmwebdriver_sop.md index 320013c..19589ac 100644 --- a/memory/tmwebdriver_sop.md +++ b/memory/tmwebdriver_sop.md @@ -151,10 +151,19 @@ web_execute_js script='{"cmd": "batch", "commands": [...]}' - 验证码canvas/img:JS `canvas.toDataURL()` 直接拿base64最干净 - 备选:`window.open(location.href,'_blank')` 前台开新标签→win32截图→完后close -## 直接import(仅作调试使用) -- `sys.path.insert(0, GenericAgent根目录)`, `from TMWebDriver import TMWebDriver` +## simphtml与TMWebDriver调试 +- ⭐**simphtml唯一调试方法**:必须通过 `code_run` 注入JS到真实浏览器执行,因为Python端无法完全模拟DOM行为。 + ```python + import sys + sys.path.append('../') + from TMWebDriver import * + from simphtml import * + driver = TMWebDriver() + res = driver.execute_js(js_optHTML) # js_optHTML为simphtml中注入的JS代码 + ``` - `d=TMWebDriver()`, `d.set_session('url_pattern')`, `d.execute_js('code')` → 返回`{'data': value}`(非裸值) - 配合simphtml:`str(simphtml.optimize_html_for_tokens(html))` → 注意返回BS4 Tag需str() +- ⚠**DOMRect坑点(hasOverlap)**:`DOMRect` 对象在某些浏览器/上下文中可能缺少 `x` 和 `y` 属性(只有 `left`/`top`),直接访问 `rect.x` 会得到 `undefined`,导致数学计算(如重叠判定)变成 `NaN`,从而引发逻辑错误(如错误判定为重叠导致元素被误删)。必须兼容:`const x = rect.x !== undefined ? rect.x : rect.left;` ## 跨域iframe操控(postMessage中继) - 跨域iframe的contentDocument不可访问,web_execute_js只在顶层执行 diff --git a/simphtml.py b/simphtml.py index 1b9adff..9af8e22 100644 --- a/simphtml.py +++ b/simphtml.py @@ -102,6 +102,10 @@ function createEnhancedDOMCopy() { if (sourceNode.nodeType === 1 && sourceNode.tagName === 'DIV') { if (!hasValidChildren && !sourceNode.textContent.trim()) return null; } + // aria-hidden + not visible = truly hidden (e.g. mobile menus), remove even if has children + if (sourceNode.getAttribute && sourceNode.getAttribute('aria-hidden') === 'true' && !info.isVisible) { + return null; + } if (info.isVisible || hasValidChildren || keep) { childNodes.forEach(child => clone.appendChild(child)); return clone; @@ -197,9 +201,10 @@ function analyzeNode(node, pPathType='main') { if (child.node.textContent.trim().length > 200) isSecondary = true; // P3: 有实质文本内容则保留 if (child.style.visibility === 'hidden') isSecondary = false; if (isSecondary) child.node.dataset.mark = 'K:secondary'; - else child.node.dataset.mark = 'R:nonEssential'; + else child.node.dataset.mark = 'K:nonEssential'; } } else { + return; // relaxed: skip equalmany filtering, list truncation handles token budget const uniqueClassNames = new Set(childrenInfo.map(item => item.node.getAttribute('class') || '')).size; const highClassNameVariety = uniqueClassNames >= childrenInfo.length * 0.8; if (pathType !== 'main' && highClassNameVariety && childrenInfo.length > 5) { @@ -267,8 +272,12 @@ function analyzeNode(node, pPathType='main') { const r1 = a.rect, r2 = b.rect; if (!r1.width || !r2.width || !r1.height || !r2.height) {return false;} const epsilon = 1; - return !(r1.x + r1.width <= r2.x + epsilon || r1.x >= r2.x + r2.width - epsilon || - r1.y + r1.height <= r2.y + epsilon || r1.y >= r2.y + r2.height - epsilon + const x1 = r1.x !== undefined ? r1.x : r1.left; + const y1 = r1.y !== undefined ? r1.y : r1.top; + const x2 = r2.x !== undefined ? r2.x : r2.left; + const y2 = r2.y !== undefined ? r2.y : r2.top; + return !(x1 + r1.width <= x2 + epsilon || x1 >= x2 + r2.width - epsilon || + y1 + r1.height <= y2 + epsilon || y1 >= y2 + r2.height - epsilon ); }) ); @@ -309,6 +318,7 @@ js_findMainList = r'''function findMainList(startElement = null) { const containers = []; const allEls = root.querySelectorAll('*'); for (const node of allEls) { + if (node.closest('svg')) continue; if (node.children.length >= MIN_CHILDREN) { containers.push(node); } @@ -363,9 +373,9 @@ js_findMainList = r'''function findMainList(startElement = null) { }; let prefix = ''; if (cId) { - prefix = '#' + cId; + prefix = '#' + CSS.escape(cId); } else if (cClass) { - prefix = (cTag || '').toLowerCase() + cClass.split(/\s+/).slice(0, 3).map(c => '.' + c).join(''); + prefix = (cTag || '').toLowerCase() + cClass.split(/\s+/).slice(0, 3).map(c => '.' + CSS.escape(c)).join(''); } if (selector) result.selector = prefix ? (prefix + ' > ' + selector) : selector; if (score !== undefined) result.score = score; @@ -382,7 +392,7 @@ js_findMainList = r'''function findMainList(startElement = null) { } function findTopGroups(container, limit) { - const children = Array.from(container.children); + const children = Array.from(container.children).filter(c => !c.closest('svg')); const totalChildren = children.length; if (totalChildren < 3) return []; @@ -437,7 +447,7 @@ js_findMainList = r'''function findMainList(startElement = null) { // 添加类组 Object.keys(classFreq).forEach(cls => { if (classFreq[cls] >= minGroupSize) { - const selector = '.' + cls; + const selector = '.' + CSS.escape(cls); groups.push({ selector, elements: classMap[cls], @@ -458,7 +468,7 @@ js_findMainList = r'''function findMainList(startElement = null) { ); if (elements.length >= minGroupSize) { - const selector = tag + '.' + cls; + const selector = tag + '.' + CSS.escape(cls); groups.push({selector, elements, score: scoreGroup(selector, elements)}); } }); @@ -471,7 +481,7 @@ js_findMainList = r'''function findMainList(startElement = null) { el.className && el.className.split(/\s+/).includes(topClasses[i]) && el.className.split(/\s+/).includes(topClasses[j])); if (elements.length >= minGroupSize) { - const selector = '.' + topClasses[i] + '.' + topClasses[j]; + const selector = '.' + CSS.escape(topClasses[i]) + '.' + CSS.escape(topClasses[j]); groups.push({selector, elements,score: scoreGroup(selector, elements)}); } } @@ -692,12 +702,13 @@ def get_html(driver, cutlist=False, maxchars=38000, instruction="", extra_js="", for entry in lists: sel = entry.get('selector') if isinstance(entry, dict) else None if not sel: continue - items = soup.select(sel) + try: items = soup.select(sel) + except Exception: print(f'[cutlist] skip invalid selector: {sel}'); continue if len(items) < 5: continue total_len = sum(len(str(it)) for it in items) avg_len = total_len / len(items) - if avg_len < 500 or (avg_len < 1000 and total_len < len(html) * 0.3): continue - print(f"[cutlist] '{sel}': {len(items)} items, avg {avg_len:.0f} chars, total {total_len}, keep 3, save ~{total_len - 3 * avg_len:.0f} chars") + print(f"[cutlist] '{sel}': {len(items)} items, avg {avg_len:.0f} chars, total {total_len}, if keep 3, save ~{total_len - 3 * avg_len:.0f} chars") + if avg_len < 400 or (avg_len < 800 and total_len < 4000): continue hit = [it for it in items if instruction and instruction.strip() and instruction in it.get_text(" ",strip=True)] keep = hit[:6] if hit else items[:3] removed = [it for it in items if it not in keep]