diff --git a/simphtml.py b/simphtml.py index ca810ba..3f5af71 100644 --- a/simphtml.py +++ b/simphtml.py @@ -160,8 +160,9 @@ function analyzeNode(node, pPathType='main') { const childrenInfo = children.map(child => { const info = getNodeInfo(child) || { rect: {}, style: {} }; return { node: child, rect: info.rect, style: info.style, - area: info.area, zIndex: info.zIndex, isVisible: info.isVisible }; - }).sort((a, b) => b.area - a.area); + area: info.area, zIndex: (info.zIndex || 0), isVisible: info.isVisible }; + }); + childrenInfo.sort((a, b) => b.area - a.area); // 检测是划分还是覆盖 const isOverlay = hasOverlap(childrenInfo); @@ -223,6 +224,9 @@ function analyzeNode(node, pPathType='main') { } function handleOverlayContainer(childrenInfo, pathType) { + // elementFromPoint ground truth: 让浏览器告诉我们谁在视觉最上层 + const _efp = document.elementFromPoint(window.innerWidth/2, window.innerHeight/2); + if (_efp) { let _el = _efp; while (_el) { const _h = childrenInfo.find(c => c.node.id && c.node.id === _el.id); if (_h) { _h.zIndex = 9999; break; } _el = _el.parentElement; } } const sorted = [...childrenInfo].sort((a, b) => b.zIndex - a.zIndex); console.log('排序后的子元素:', sorted); if (sorted.length === 0) return; @@ -324,22 +328,20 @@ js_findMainList = r'''function findMainList(startElement = null) { const MIN_CHILDREN = 8; const MAX_CONTAINERS = 20; - // 全局扫描:收集所有子元素足够多的容器 - const containers = []; + // 全局扫描:收集候选容器,按 l1 + l2*0.1 排序(l2=孙子元素数,捕获表格等多层结构) + const candidates = []; const allEls = root.querySelectorAll('*'); for (const node of allEls) { if (node.closest('svg')) continue; - if (node.children.length >= MIN_CHILDREN) { - containers.push(node); - } + const l1 = node.children.length; + if (l1 < 5) continue; + let l2 = 0; + for (const child of node.children) l2 += child.children.length; + const score = l1 + l2 * 0.1; + if (score >= MIN_CHILDREN) candidates.push({node, score}); } - if (root.children.length >= MIN_CHILDREN && !containers.includes(root)) { - containers.unshift(root); - } 
- - // 按子元素数量降序,取前 MAX_CONTAINERS 个 - containers.sort((a, b) => b.children.length - a.children.length); - const toProcess = containers.slice(0, MAX_CONTAINERS); + candidates.sort((a, b) => b.score - a.score); + const toProcess = candidates.slice(0, MAX_CONTAINERS).map(c => c.node); // 对每个容器找候选组并评分 let allCandidates = []; @@ -374,6 +376,7 @@ js_findMainList = r'''function findMainList(startElement = null) { } function describeResult(container, items, selector, score) { + if(container&&!container.id)container.id='_ljq'+(window._lci=(window._lci||0)+1); const cTag = container ? container.tagName : null; const cId = container ? (container.id || '') : ''; const cClass = container ? (String(container.className || '').trim()) : ''; @@ -382,11 +385,7 @@ js_findMainList = r'''function findMainList(startElement = null) { itemCount: items.length, }; let prefix = ''; - if (cId) { - prefix = '#' + CSS.escape(cId); - } else if (cClass) { - prefix = (cTag || '').toLowerCase() + cClass.split(/\s+/).slice(0, 3).map(c => '.' + CSS.escape(c)).join(''); - } + if (cId) prefix = '#' + CSS.escape(cId); if (selector) result.selector = prefix ? 
(prefix + ' > ' + selector) : selector; if (score !== undefined) result.score = score; if (items.length > 0) { @@ -701,6 +700,7 @@ def find_changed_elements(before_html, after_html): return result def get_html(driver, cutlist=False, maxchars=35000, instruction="", extra_js="", text_only=False): + if cutlist: rr = driver.execute_js(js_findMainList + "return findMainList(document.body);").get('data', []) page = get_main_block(driver, extra_js=extra_js, text_only=text_only) if text_only: return page soup = optimize_html_for_tokens(page) @@ -708,7 +708,6 @@ def get_html(driver, cutlist=False, maxchars=35000, instruction="", extra_js="", div.name = 'iframe'; del div['data-tag'] html = str(soup) if not cutlist: return html - rr = driver.execute_js(js_findMainList + """return findMainList(document.body);""").get('data', []) lists = rr if isinstance(rr, list) else ([rr] if isinstance(rr, dict) and rr.get('selector') else []) if lists: print(f"[cutlist] Found {len(lists)} list(s): {[e.get('selector','?') if isinstance(e,dict) else '?' 
def smart_truncate(soup, budget, _depth=0):
    """Truncate ``soup`` in place so its serialized length approaches ``budget``.

    Strategy: descend through single-child wrappers until a node that actually
    branches is found.  At that node, if the three largest children can absorb
    the overshoot, each gives up a share proportional to its size; otherwise
    children are removed from the end until the overshoot is covered.  A node
    that is over budget but has no eligible element children (a text-only
    leaf) is hard-cut directly.

    Args:
        soup: BeautifulSoup tree or Tag to shrink; it is modified in place.
        budget: Target upper bound for ``len(str(soup))``, in characters.
        _depth: Recursion depth; only used to indent the debug output.

    Returns:
        The same ``soup`` object that was passed in.

    Note:
        The result is approximate.  Shares are rounded down, and when a node
        has element children only those children are shrunk -- the node's own
        direct text is left alone, so the output may still exceed ``budget``.
    """
    # Below this size an element is hard-cut; above it we recurse to look for
    # a better branching point further down.
    CUT_THRESHOLD = 8000
    indent = ' ' * _depth

    def cut(ele, keep):
        """Hard-truncate the inner HTML of ``ele`` to roughly ``keep`` chars."""
        from bs4 import NavigableString
        if len(str(ele)) <= keep:
            return
        # Pull the '[FAKE ELEMENT]' hint tags out first so the slice below
        # cannot destroy them; they are re-attached at the end of ``ele``.
        protected = [
            c.extract()
            for c in ele.find_all(lambda tag: tag.string and '[FAKE ELEMENT]' in tag.string)
        ]
        s = str(ele)
        over = len(s) - keep
        if over <= 0:
            # Removing the hint tags alone was enough; put them back.
            for p in protected:
                ele.append(p)
            return
        marker = f' [TRUNCATED {over//1000}k chars]'
        inner = ele.decode_contents()
        # Characters spent on the element's own opening/closing tags.
        tag_overhead = len(s) - len(inner)
        inner_keep = max(keep - tag_overhead - len(marker), 0)
        ele.clear()
        if inner_keep > 0:
            # Re-parse the sliced markup so a tag cut in half is repaired.
            ele.append(BeautifulSoup(inner[:inner_keep], 'html.parser'))
        ele.append(NavigableString(marker))
        for p in protected:
            ele.append(p)

    total = len(str(soup))
    if total <= budget:
        return soup

    # Element children only; '[FAKE ELEMENT]' hint tags are never candidates.
    kids = [
        (c, len(str(c)))
        for c in soup.children
        if c.name and not (c.string and '[FAKE ELEMENT]' in c.string)
    ]
    if not kids:
        # Text-only leaf that is still over budget: there is no child to
        # delegate to, so truncate its contents directly.  Returning here
        # without cutting would leave the whole oversized text in place.
        cut(soup, budget)
        return soup

    kids_total = sum(size for _, size in kids)
    # Everything in this node that is not an eligible child (own tags, text).
    selflen = total - kids_total
    remaining_budget = max(budget - selflen, 0)
    tag = getattr(soup, 'name', '?')
    print(f'{indent}[smart_truncate] <{tag}> total={total} budget={budget} selflen={selflen} kids={len(kids)}')

    # === single child: pass straight through the wrapper ===
    if len(kids) == 1:
        print(f'{indent} -> single child, recurse into <{kids[0][0].name}>')
        smart_truncate(kids[0][0], remaining_budget, _depth)
        return soup

    over = kids_total - remaining_budget
    if over <= 0:
        return soup

    # Can the three largest children absorb the overshoot on their own?
    ranked = sorted(range(len(kids)), key=lambda i: kids[i][1], reverse=True)
    tops = list(ranked[:3])
    top_total = sum(kids[i][1] for i in tops)
    if top_total < over:
        # === they cannot: drop children from the end instead ===
        removed = 0
        removed_count = 0
        while kids and removed < over:
            child, size = kids.pop()
            child.decompose()
            removed += size
            removed_count += 1
        print(f'{indent} -> tail-cut: removed {removed_count} children ({removed//1000}k chars) from end')
        return soup

    # === the top 2-3 children share the overshoot proportionally ===
    # Ignore children smaller than 10% of the largest so the big ones carry
    # the whole cut -- but only if the big ones alone can cover it.
    max_size = kids[ranked[0]][1]
    filtered = [i for i in tops if kids[i][1] >= max_size * 0.1]
    filtered_total = sum(kids[i][1] for i in filtered)
    if filtered_total >= over:
        tops, top_total = filtered, filtered_total

    # Print the complete plan first ...
    actions = []
    for i in tops:
        child, size = kids[i]
        share = int(over * size / top_total)
        new_keep = size - share
        print(f'{indent} -> <{child.name}> {size} -> {new_keep} (share={share})')
        actions.append((child, new_keep))

    # ... then apply it in one pass.
    for child, new_keep in actions:
        if new_keep <= 0:
            child.decompose()
        elif new_keep > CUT_THRESHOLD:
            smart_truncate(child, new_keep, _depth + 1)
        else:
            cut(child, new_keep)
    return soup