simphtml: elementFromPoint弹窗修复 + findMainList评分优化 + _ljq ID注入 + smart_truncate重写
This commit is contained in:
130
simphtml.py
130
simphtml.py
@@ -160,8 +160,9 @@ function analyzeNode(node, pPathType='main') {
|
|||||||
const childrenInfo = children.map(child => {
|
const childrenInfo = children.map(child => {
|
||||||
const info = getNodeInfo(child) || { rect: {}, style: {} };
|
const info = getNodeInfo(child) || { rect: {}, style: {} };
|
||||||
return { node: child, rect: info.rect, style: info.style,
|
return { node: child, rect: info.rect, style: info.style,
|
||||||
area: info.area, zIndex: info.zIndex, isVisible: info.isVisible };
|
area: info.area, zIndex: (info.zIndex || 0), isVisible: info.isVisible };
|
||||||
}).sort((a, b) => b.area - a.area);
|
});
|
||||||
|
childrenInfo.sort((a, b) => b.area - a.area);
|
||||||
|
|
||||||
// 检测是划分还是覆盖
|
// 检测是划分还是覆盖
|
||||||
const isOverlay = hasOverlap(childrenInfo);
|
const isOverlay = hasOverlap(childrenInfo);
|
||||||
@@ -223,6 +224,9 @@ function analyzeNode(node, pPathType='main') {
|
|||||||
}
|
}
|
||||||
|
|
||||||
function handleOverlayContainer(childrenInfo, pathType) {
|
function handleOverlayContainer(childrenInfo, pathType) {
|
||||||
|
// elementFromPoint ground truth: 让浏览器告诉我们谁在视觉最上层
|
||||||
|
const _efp = document.elementFromPoint(window.innerWidth/2, window.innerHeight/2);
|
||||||
|
if (_efp) { let _el = _efp; while (_el) { const _h = childrenInfo.find(c => c.node.id && c.node.id === _el.id); if (_h) { _h.zIndex = 9999; break; } _el = _el.parentElement; } }
|
||||||
const sorted = [...childrenInfo].sort((a, b) => b.zIndex - a.zIndex);
|
const sorted = [...childrenInfo].sort((a, b) => b.zIndex - a.zIndex);
|
||||||
console.log('排序后的子元素:', sorted);
|
console.log('排序后的子元素:', sorted);
|
||||||
if (sorted.length === 0) return;
|
if (sorted.length === 0) return;
|
||||||
@@ -324,22 +328,20 @@ js_findMainList = r'''function findMainList(startElement = null) {
|
|||||||
const MIN_CHILDREN = 8;
|
const MIN_CHILDREN = 8;
|
||||||
const MAX_CONTAINERS = 20;
|
const MAX_CONTAINERS = 20;
|
||||||
|
|
||||||
// 全局扫描:收集所有子元素足够多的容器
|
// 全局扫描:收集候选容器,按 l1 + l2*0.1 排序(l2=孙子元素数,捕获表格等多层结构)
|
||||||
const containers = [];
|
const candidates = [];
|
||||||
const allEls = root.querySelectorAll('*');
|
const allEls = root.querySelectorAll('*');
|
||||||
for (const node of allEls) {
|
for (const node of allEls) {
|
||||||
if (node.closest('svg')) continue;
|
if (node.closest('svg')) continue;
|
||||||
if (node.children.length >= MIN_CHILDREN) {
|
const l1 = node.children.length;
|
||||||
containers.push(node);
|
if (l1 < 5) continue;
|
||||||
}
|
let l2 = 0;
|
||||||
|
for (const child of node.children) l2 += child.children.length;
|
||||||
|
const score = l1 + l2 * 0.1;
|
||||||
|
if (score >= MIN_CHILDREN) candidates.push({node, score});
|
||||||
}
|
}
|
||||||
if (root.children.length >= MIN_CHILDREN && !containers.includes(root)) {
|
candidates.sort((a, b) => b.score - a.score);
|
||||||
containers.unshift(root);
|
const toProcess = candidates.slice(0, MAX_CONTAINERS).map(c => c.node);
|
||||||
}
|
|
||||||
|
|
||||||
// 按子元素数量降序,取前 MAX_CONTAINERS 个
|
|
||||||
containers.sort((a, b) => b.children.length - a.children.length);
|
|
||||||
const toProcess = containers.slice(0, MAX_CONTAINERS);
|
|
||||||
|
|
||||||
// 对每个容器找候选组并评分
|
// 对每个容器找候选组并评分
|
||||||
let allCandidates = [];
|
let allCandidates = [];
|
||||||
@@ -374,6 +376,7 @@ js_findMainList = r'''function findMainList(startElement = null) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
function describeResult(container, items, selector, score) {
|
function describeResult(container, items, selector, score) {
|
||||||
|
if(container&&!container.id)container.id='_ljq'+(window._lci=(window._lci||0)+1);
|
||||||
const cTag = container ? container.tagName : null;
|
const cTag = container ? container.tagName : null;
|
||||||
const cId = container ? (container.id || '') : '';
|
const cId = container ? (container.id || '') : '';
|
||||||
const cClass = container ? (String(container.className || '').trim()) : '';
|
const cClass = container ? (String(container.className || '').trim()) : '';
|
||||||
@@ -382,11 +385,7 @@ js_findMainList = r'''function findMainList(startElement = null) {
|
|||||||
itemCount: items.length,
|
itemCount: items.length,
|
||||||
};
|
};
|
||||||
let prefix = '';
|
let prefix = '';
|
||||||
if (cId) {
|
if (cId) prefix = '#' + CSS.escape(cId);
|
||||||
prefix = '#' + CSS.escape(cId);
|
|
||||||
} else if (cClass) {
|
|
||||||
prefix = (cTag || '').toLowerCase() + cClass.split(/\s+/).slice(0, 3).map(c => '.' + CSS.escape(c)).join('');
|
|
||||||
}
|
|
||||||
if (selector) result.selector = prefix ? (prefix + ' > ' + selector) : selector;
|
if (selector) result.selector = prefix ? (prefix + ' > ' + selector) : selector;
|
||||||
if (score !== undefined) result.score = score;
|
if (score !== undefined) result.score = score;
|
||||||
if (items.length > 0) {
|
if (items.length > 0) {
|
||||||
@@ -701,6 +700,7 @@ def find_changed_elements(before_html, after_html):
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
def get_html(driver, cutlist=False, maxchars=35000, instruction="", extra_js="", text_only=False):
|
def get_html(driver, cutlist=False, maxchars=35000, instruction="", extra_js="", text_only=False):
|
||||||
|
if cutlist: rr = driver.execute_js(js_findMainList + "return findMainList(document.body);").get('data', [])
|
||||||
page = get_main_block(driver, extra_js=extra_js, text_only=text_only)
|
page = get_main_block(driver, extra_js=extra_js, text_only=text_only)
|
||||||
if text_only: return page
|
if text_only: return page
|
||||||
soup = optimize_html_for_tokens(page)
|
soup = optimize_html_for_tokens(page)
|
||||||
@@ -708,7 +708,6 @@ def get_html(driver, cutlist=False, maxchars=35000, instruction="", extra_js="",
|
|||||||
div.name = 'iframe'; del div['data-tag']
|
div.name = 'iframe'; del div['data-tag']
|
||||||
html = str(soup)
|
html = str(soup)
|
||||||
if not cutlist: return html
|
if not cutlist: return html
|
||||||
rr = driver.execute_js(js_findMainList + """return findMainList(document.body);""").get('data', [])
|
|
||||||
lists = rr if isinstance(rr, list) else ([rr] if isinstance(rr, dict) and rr.get('selector') else [])
|
lists = rr if isinstance(rr, list) else ([rr] if isinstance(rr, dict) and rr.get('selector') else [])
|
||||||
if lists: print(f"[cutlist] Found {len(lists)} list(s): {[e.get('selector','?') if isinstance(e,dict) else '?' for e in lists]}")
|
if lists: print(f"[cutlist] Found {len(lists)} list(s): {[e.get('selector','?') if isinstance(e,dict) else '?' for e in lists]}")
|
||||||
for entry in lists:
|
for entry in lists:
|
||||||
@@ -720,7 +719,7 @@ def get_html(driver, cutlist=False, maxchars=35000, instruction="", extra_js="",
|
|||||||
total_len = sum(len(str(it)) for it in items)
|
total_len = sum(len(str(it)) for it in items)
|
||||||
avg_len = total_len / len(items)
|
avg_len = total_len / len(items)
|
||||||
print(f"[cutlist] '{sel}': {len(items)} items, avg {avg_len:.0f} chars, total {total_len}, if keep 3, save ~{total_len - 3 * avg_len:.0f} chars")
|
print(f"[cutlist] '{sel}': {len(items)} items, avg {avg_len:.0f} chars, total {total_len}, if keep 3, save ~{total_len - 3 * avg_len:.0f} chars")
|
||||||
if avg_len < 400 or (avg_len < 800 and total_len < 4000): continue
|
if avg_len < 200 or (avg_len < 700 and total_len < 2500): continue
|
||||||
hit = [it for it in items if instruction and instruction.strip() and instruction in it.get_text(" ",strip=True)]
|
hit = [it for it in items if instruction and instruction.strip() and instruction in it.get_text(" ",strip=True)]
|
||||||
keep = hit[:6] if hit else items[:3]
|
keep = hit[:6] if hit else items[:3]
|
||||||
removed = [it for it in items if it not in keep]
|
removed = [it for it in items if it not in keep]
|
||||||
@@ -736,24 +735,83 @@ def get_html(driver, cutlist=False, maxchars=35000, instruction="", extra_js="",
|
|||||||
for it in removed: it.decompose()
|
for it in removed: it.decompose()
|
||||||
ss = str(optimize_html_for_tokens(soup)) if lists else html
|
ss = str(optimize_html_for_tokens(soup)) if lists else html
|
||||||
if lists: print(f"[cutlist] Result: {len(html)} -> {len(ss)} chars ({100-len(ss)*100//len(html)}% saved)")
|
if lists: print(f"[cutlist] Result: {len(html)} -> {len(ss)} chars ({100-len(ss)*100//len(html)}% saved)")
|
||||||
if len(ss) > maxchars: ss = str(truncate_biggest(soup, maxchars))
|
if len(ss) > maxchars: ss = str(smart_truncate(soup, maxchars))
|
||||||
return ss
|
return ss
|
||||||
|
|
||||||
def truncate_biggest(soup, budget):
|
def smart_truncate(soup, budget, _depth=0):
|
||||||
over = len(str(soup)) - budget
|
"""原地截断 soup 使其接近 budget 字符。
|
||||||
|
策略:穿透单子元素找分叉点;top3 能扛住 over 则按比例分担,否则从尾部删子元素。"""
|
||||||
|
CUT_THRESHOLD = 8000 # 小于此值直接去尾,大于则继续递归找分叉点
|
||||||
|
indent = ' ' * _depth
|
||||||
|
def cut(ele, keep):
|
||||||
|
from bs4 import NavigableString
|
||||||
|
s = str(ele)
|
||||||
|
over = len(s) - keep
|
||||||
|
if over <= 0: return
|
||||||
|
# 保护 FAKE ELEMENT 提示标签
|
||||||
|
protected = [c.extract() for c in ele.find_all(lambda tag: tag.string and '[FAKE ELEMENT]' in tag.string)]
|
||||||
|
s = str(ele)
|
||||||
|
over = len(s) - keep
|
||||||
|
if over <= 0:
|
||||||
|
for p in protected: ele.append(p)
|
||||||
|
return
|
||||||
|
marker = f' [TRUNCATED {over//1000}k chars]'
|
||||||
|
inner = ele.decode_contents()
|
||||||
|
tag_overhead = len(s) - len(inner)
|
||||||
|
inner_keep = max(keep - tag_overhead - len(marker), 0)
|
||||||
|
ele.clear()
|
||||||
|
if inner_keep > 0:
|
||||||
|
ele.append(BeautifulSoup(inner[:inner_keep], 'html.parser'))
|
||||||
|
ele.append(NavigableString(marker))
|
||||||
|
for p in protected: ele.append(p)
|
||||||
|
total = len(str(soup))
|
||||||
|
if total <= budget: return soup
|
||||||
|
kids = [(c, len(str(c))) for c in soup.children if c.name and not (c.string and '[FAKE ELEMENT]' in c.string)]
|
||||||
|
if not kids: return soup
|
||||||
|
selflen = total - sum(l for _, l in kids)
|
||||||
|
remaining_budget = max(budget - selflen, 0)
|
||||||
|
tag = getattr(soup, 'name', '?')
|
||||||
|
print(f'{indent}[smart_truncate] <{tag}> total={total} budget={budget} selflen={selflen} kids={len(kids)}')
|
||||||
|
# === 1 kid: 穿透 ===
|
||||||
|
if len(kids) == 1:
|
||||||
|
print(f'{indent} -> single child, recurse into <{kids[0][0].name}>')
|
||||||
|
smart_truncate(kids[0][0], remaining_budget, _depth)
|
||||||
|
return soup
|
||||||
|
over = sum(l for _, l in kids) - remaining_budget
|
||||||
if over <= 0: return soup
|
if over <= 0: return soup
|
||||||
t = max((t for t in soup.find_all(True) if len(str(t)) > over and t.parent), key=lambda x: len(str(x)))
|
# 看 top 3 能否承担 over
|
||||||
# 穿透空壳: 最胖子Tag占>70%就深入, 跳过html/body/单传div
|
ranked = sorted(range(len(kids)), key=lambda i: kids[i][1], reverse=True)
|
||||||
while (kids := [(c, len(str(c))) for c in t.children if c.name]) and max(kids, key=lambda x: x[1])[1] > len(str(t)) * .7:
|
tops = list(ranked[:min(3, len(ranked))])
|
||||||
t = max(kids, key=lambda x: x[1])[0]
|
top_total = sum(kids[i][1] for i in tops)
|
||||||
acc, keep = 0, len(str(t)) - over
|
if top_total < over:
|
||||||
for c in list(t.children):
|
# === top 3 扛不住,从尾部删子元素 ===
|
||||||
if (acc := acc + len(str(c))) > keep:
|
removed = 0
|
||||||
tail = list(c.next_siblings)
|
removed_count = 0
|
||||||
c.replace_with(BeautifulSoup(str(c)[:max(0, keep-acc+len(str(c)))] + f' [TRUNCATED {over//1000}k chars]', 'html.parser'))
|
while kids and removed < over:
|
||||||
for s in tail: s.extract()
|
c, l = kids.pop(); c.decompose()
|
||||||
break
|
removed += l; removed_count += 1
|
||||||
print(f"[truncate] finally {len(str(soup))} chars, cut <{t.name}> by {over}")
|
print(f'{indent} -> tail-cut: removed {removed_count} children ({removed//1000}k chars) from end')
|
||||||
|
return soup
|
||||||
|
# === top 2-3 按比例分担 ===
|
||||||
|
# 过滤掉太小的 kid(不到最大的 10%),让大的全扛
|
||||||
|
max_size = kids[ranked[0]][1]
|
||||||
|
filtered = [i for i in tops if kids[i][1] >= max_size * 0.1]
|
||||||
|
filtered_total = sum(kids[i][1] for i in filtered)
|
||||||
|
if filtered_total >= over:
|
||||||
|
tops, top_total = filtered, filtered_total
|
||||||
|
# 先打印所有分配计划
|
||||||
|
actions = []
|
||||||
|
for i in tops:
|
||||||
|
c, l = kids[i]
|
||||||
|
share = int(over * l / top_total)
|
||||||
|
new_keep = l - share
|
||||||
|
print(f'{indent} -> <{c.name}> {l} -> {new_keep} (share={share})')
|
||||||
|
actions.append((c, l, new_keep))
|
||||||
|
# 再统一执行
|
||||||
|
for c, l, new_keep in actions:
|
||||||
|
if new_keep <= 0: c.decompose()
|
||||||
|
elif new_keep > CUT_THRESHOLD: smart_truncate(c, new_keep, _depth + 1)
|
||||||
|
else: cut(c, new_keep)
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
def execute_js_rich(script, driver, no_monitor=False):
|
def execute_js_rich(script, driver, no_monitor=False):
|
||||||
|
|||||||
Reference in New Issue
Block a user