simphtml: elementFromPoint弹窗修复 + findMainList评分优化 + _ljq ID注入 + smart_truncate重写

This commit is contained in:
Liang Jiaqing
2026-04-07 10:10:27 +08:00
parent b4741a9a39
commit 054674baa1

View File

@@ -160,8 +160,9 @@ function analyzeNode(node, pPathType='main') {
const childrenInfo = children.map(child => { const childrenInfo = children.map(child => {
const info = getNodeInfo(child) || { rect: {}, style: {} }; const info = getNodeInfo(child) || { rect: {}, style: {} };
return { node: child, rect: info.rect, style: info.style, return { node: child, rect: info.rect, style: info.style,
area: info.area, zIndex: info.zIndex, isVisible: info.isVisible }; area: info.area, zIndex: (info.zIndex || 0), isVisible: info.isVisible };
}).sort((a, b) => b.area - a.area); });
childrenInfo.sort((a, b) => b.area - a.area);
// 检测是划分还是覆盖 // 检测是划分还是覆盖
const isOverlay = hasOverlap(childrenInfo); const isOverlay = hasOverlap(childrenInfo);
@@ -223,6 +224,9 @@ function analyzeNode(node, pPathType='main') {
} }
function handleOverlayContainer(childrenInfo, pathType) { function handleOverlayContainer(childrenInfo, pathType) {
// elementFromPoint ground truth: 让浏览器告诉我们谁在视觉最上层
const _efp = document.elementFromPoint(window.innerWidth/2, window.innerHeight/2);
if (_efp) { let _el = _efp; while (_el) { const _h = childrenInfo.find(c => c.node.id && c.node.id === _el.id); if (_h) { _h.zIndex = 9999; break; } _el = _el.parentElement; } }
const sorted = [...childrenInfo].sort((a, b) => b.zIndex - a.zIndex); const sorted = [...childrenInfo].sort((a, b) => b.zIndex - a.zIndex);
console.log('排序后的子元素:', sorted); console.log('排序后的子元素:', sorted);
if (sorted.length === 0) return; if (sorted.length === 0) return;
@@ -324,22 +328,20 @@ js_findMainList = r'''function findMainList(startElement = null) {
const MIN_CHILDREN = 8; const MIN_CHILDREN = 8;
const MAX_CONTAINERS = 20; const MAX_CONTAINERS = 20;
// 全局扫描:收集所有子元素足够多的容器 // 全局扫描:收集候选容器,按 l1 + l2*0.1 排序(l2=孙子元素数,捕获表格等多层结构)
const containers = []; const candidates = [];
const allEls = root.querySelectorAll('*'); const allEls = root.querySelectorAll('*');
for (const node of allEls) { for (const node of allEls) {
if (node.closest('svg')) continue; if (node.closest('svg')) continue;
if (node.children.length >= MIN_CHILDREN) { const l1 = node.children.length;
containers.push(node); if (l1 < 5) continue;
} let l2 = 0;
for (const child of node.children) l2 += child.children.length;
const score = l1 + l2 * 0.1;
if (score >= MIN_CHILDREN) candidates.push({node, score});
} }
if (root.children.length >= MIN_CHILDREN && !containers.includes(root)) { candidates.sort((a, b) => b.score - a.score);
containers.unshift(root); const toProcess = candidates.slice(0, MAX_CONTAINERS).map(c => c.node);
}
// 按子元素数量降序,取前 MAX_CONTAINERS 个
containers.sort((a, b) => b.children.length - a.children.length);
const toProcess = containers.slice(0, MAX_CONTAINERS);
// 对每个容器找候选组并评分 // 对每个容器找候选组并评分
let allCandidates = []; let allCandidates = [];
@@ -374,6 +376,7 @@ js_findMainList = r'''function findMainList(startElement = null) {
} }
function describeResult(container, items, selector, score) { function describeResult(container, items, selector, score) {
if(container&&!container.id)container.id='_ljq'+(window._lci=(window._lci||0)+1);
const cTag = container ? container.tagName : null; const cTag = container ? container.tagName : null;
const cId = container ? (container.id || '') : ''; const cId = container ? (container.id || '') : '';
const cClass = container ? (String(container.className || '').trim()) : ''; const cClass = container ? (String(container.className || '').trim()) : '';
@@ -382,11 +385,7 @@ js_findMainList = r'''function findMainList(startElement = null) {
itemCount: items.length, itemCount: items.length,
}; };
let prefix = ''; let prefix = '';
if (cId) { if (cId) prefix = '#' + CSS.escape(cId);
prefix = '#' + CSS.escape(cId);
} else if (cClass) {
prefix = (cTag || '').toLowerCase() + cClass.split(/\s+/).slice(0, 3).map(c => '.' + CSS.escape(c)).join('');
}
if (selector) result.selector = prefix ? (prefix + ' > ' + selector) : selector; if (selector) result.selector = prefix ? (prefix + ' > ' + selector) : selector;
if (score !== undefined) result.score = score; if (score !== undefined) result.score = score;
if (items.length > 0) { if (items.length > 0) {
@@ -701,6 +700,7 @@ def find_changed_elements(before_html, after_html):
return result return result
def get_html(driver, cutlist=False, maxchars=35000, instruction="", extra_js="", text_only=False): def get_html(driver, cutlist=False, maxchars=35000, instruction="", extra_js="", text_only=False):
if cutlist: rr = driver.execute_js(js_findMainList + "return findMainList(document.body);").get('data', [])
page = get_main_block(driver, extra_js=extra_js, text_only=text_only) page = get_main_block(driver, extra_js=extra_js, text_only=text_only)
if text_only: return page if text_only: return page
soup = optimize_html_for_tokens(page) soup = optimize_html_for_tokens(page)
@@ -708,7 +708,6 @@ def get_html(driver, cutlist=False, maxchars=35000, instruction="", extra_js="",
div.name = 'iframe'; del div['data-tag'] div.name = 'iframe'; del div['data-tag']
html = str(soup) html = str(soup)
if not cutlist: return html if not cutlist: return html
rr = driver.execute_js(js_findMainList + """return findMainList(document.body);""").get('data', [])
lists = rr if isinstance(rr, list) else ([rr] if isinstance(rr, dict) and rr.get('selector') else []) lists = rr if isinstance(rr, list) else ([rr] if isinstance(rr, dict) and rr.get('selector') else [])
if lists: print(f"[cutlist] Found {len(lists)} list(s): {[e.get('selector','?') if isinstance(e,dict) else '?' for e in lists]}") if lists: print(f"[cutlist] Found {len(lists)} list(s): {[e.get('selector','?') if isinstance(e,dict) else '?' for e in lists]}")
for entry in lists: for entry in lists:
@@ -720,7 +719,7 @@ def get_html(driver, cutlist=False, maxchars=35000, instruction="", extra_js="",
total_len = sum(len(str(it)) for it in items) total_len = sum(len(str(it)) for it in items)
avg_len = total_len / len(items) avg_len = total_len / len(items)
print(f"[cutlist] '{sel}': {len(items)} items, avg {avg_len:.0f} chars, total {total_len}, if keep 3, save ~{total_len - 3 * avg_len:.0f} chars") print(f"[cutlist] '{sel}': {len(items)} items, avg {avg_len:.0f} chars, total {total_len}, if keep 3, save ~{total_len - 3 * avg_len:.0f} chars")
if avg_len < 400 or (avg_len < 800 and total_len < 4000): continue if avg_len < 200 or (avg_len < 700 and total_len < 2500): continue
hit = [it for it in items if instruction and instruction.strip() and instruction in it.get_text(" ",strip=True)] hit = [it for it in items if instruction and instruction.strip() and instruction in it.get_text(" ",strip=True)]
keep = hit[:6] if hit else items[:3] keep = hit[:6] if hit else items[:3]
removed = [it for it in items if it not in keep] removed = [it for it in items if it not in keep]
@@ -736,24 +735,83 @@ def get_html(driver, cutlist=False, maxchars=35000, instruction="", extra_js="",
for it in removed: it.decompose() for it in removed: it.decompose()
ss = str(optimize_html_for_tokens(soup)) if lists else html ss = str(optimize_html_for_tokens(soup)) if lists else html
if lists: print(f"[cutlist] Result: {len(html)} -> {len(ss)} chars ({100-len(ss)*100//len(html)}% saved)") if lists: print(f"[cutlist] Result: {len(html)} -> {len(ss)} chars ({100-len(ss)*100//len(html)}% saved)")
if len(ss) > maxchars: ss = str(truncate_biggest(soup, maxchars)) if len(ss) > maxchars: ss = str(smart_truncate(soup, maxchars))
return ss return ss
def smart_truncate(soup, budget, _depth=0):
    """Truncate ``soup`` in place so its serialized length approaches ``budget``.

    Strategy:
      * Walk through single-child wrappers until a node with several element
        children (a fork) is reached.
      * At a fork, if the three largest children together can absorb the
        overflow, spread the overflow across them in proportion to their
        size; otherwise remove children from the end until enough characters
        have been dropped.
      * A node that is over budget but has no eligible element children
        (for example a single huge text node) is cut directly, so the
        overflow is not silently left in place.

    Element children whose ``.string`` contains ``[FAKE ELEMENT]`` are never
    chosen as truncation candidates.

    Args:
        soup: BeautifulSoup document or Tag to shrink (mutated in place).
        budget: Target maximum length of ``str(soup)`` in characters.
        _depth: Recursion depth, used only to indent the log output.

    Returns:
        The same ``soup`` object that was passed in.
    """
    # Children at or below this size are tail-cut directly; larger ones are
    # recursed into so the cut lands at a deeper fork.
    CUT_THRESHOLD = 8000
    indent = ' ' * _depth

    total = len(str(soup))
    if total <= budget:
        return soup

    kids = [(c, len(str(c))) for c in soup.children
            if c.name and not (c.string and '[FAKE ELEMENT]' in c.string)]
    tag = getattr(soup, 'name', '?')
    if not kids:
        # Over budget with nothing to recurse into: the bulk is text (or
        # protected hints), so trim this node's own content.
        print(f'{indent}[smart_truncate] <{tag}> total={total} budget={budget} no element children, cut directly')
        _cut_element(soup, budget)
        return soup

    kids_total = sum(l for _, l in kids)
    # Characters owned by this node itself (its tag markup, text, skipped kids).
    selflen = total - kids_total
    remaining_budget = max(budget - selflen, 0)
    print(f'{indent}[smart_truncate] <{tag}> total={total} budget={budget} selflen={selflen} kids={len(kids)}')

    # === single child: pass through to it ===
    if len(kids) == 1:
        print(f'{indent} -> single child, recurse into <{kids[0][0].name}>')
        smart_truncate(kids[0][0], remaining_budget, _depth)
        return soup

    over = kids_total - remaining_budget
    if over <= 0:
        return soup

    # Can the three largest children absorb the overflow?
    ranked = sorted(range(len(kids)), key=lambda i: kids[i][1], reverse=True)
    tops = list(ranked[:min(3, len(ranked))])
    top_total = sum(kids[i][1] for i in tops)
    if top_total < over:
        # === they cannot: remove children from the end ===
        removed = 0
        removed_count = 0
        while kids and removed < over:
            c, l = kids.pop()
            c.decompose()
            removed += l
            removed_count += 1
        print(f'{indent} -> tail-cut: removed {removed_count} children ({removed//1000}k chars) from end')
        return soup

    # === share the overflow proportionally among the top children ===
    # Ignore children smaller than 10% of the largest so the big ones carry
    # the cut, but only if the remaining ones can still absorb the overflow.
    max_size = kids[ranked[0]][1]
    filtered = [i for i in tops if kids[i][1] >= max_size * 0.1]
    filtered_total = sum(kids[i][1] for i in filtered)
    if filtered_total >= over:
        tops, top_total = filtered, filtered_total

    # Log the whole plan first ...
    actions = []
    for i in tops:
        c, l = kids[i]
        share = int(over * l / top_total)
        new_keep = l - share
        print(f'{indent} -> <{c.name}> {l} -> {new_keep} (share={share})')
        actions.append((c, l, new_keep))
    # ... then apply it, so sizes in the log are measured before any mutation.
    for c, l, new_keep in actions:
        if new_keep <= 0:
            c.decompose()
        elif new_keep > CUT_THRESHOLD:
            smart_truncate(c, new_keep, _depth + 1)
        else:
            _cut_element(c, new_keep)
    return soup


def _cut_element(ele, keep):
    """Trim the tail of ``ele``'s content so ``str(ele)`` is roughly ``keep`` chars.

    Descendant tags whose ``.string`` contains ``[FAKE ELEMENT]`` are detached
    before cutting and appended back afterwards so those hints survive.
    A ``[TRUNCATED Nk chars]`` marker is appended where content was removed.
    """
    from bs4 import NavigableString
    if len(str(ele)) - keep <= 0:
        return
    # Protect the FAKE ELEMENT hint tags from being cut away.
    protected = [c.extract() for c in ele.find_all(lambda tag: tag.string and '[FAKE ELEMENT]' in tag.string)]
    s = str(ele)
    over = len(s) - keep
    if over > 0:
        marker = f' [TRUNCATED {over//1000}k chars]'
        inner = ele.decode_contents()
        # Length of the element's own opening/closing markup.
        tag_overhead = len(s) - len(inner)
        inner_keep = max(keep - tag_overhead - len(marker), 0)
        ele.clear()
        if inner_keep > 0:
            # Re-parse the kept prefix so tags sliced mid-way are closed.
            ele.append(BeautifulSoup(inner[:inner_keep], 'html.parser'))
        ele.append(NavigableString(marker))
    for p in protected:
        ele.append(p)
def execute_js_rich(script, driver, no_monitor=False): def execute_js_rich(script, driver, no_monitor=False):