refactor(simphtml): rewrite list detection & cutlist for multi-list support

- simphtml: replace center-point ancestor-chain approach with global container scan;
  support multiple lists per page; add container-scoped selector prefixes;
  inline FAKE ELEMENT hints with hidden item previews; remove findMainContent
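- ga: hot-reload simphtml (importlib.reload) on each web_scan; fix file_read total_lines for
  keyword search (count from the first returned line's number instead of `start`);
  read global_mem_insight.txt with errors='replace' so undecodable bytes do not raise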
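- llmcore: stabilize NativeClaude session/account/device IDs across requests (generated once in __init__);
  rename no_system_prompt to fake_cc_system_prompt; copy each message's content list
  (one-level copy, not a deep copy) before raw_ask mutates it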
- launch: adjust window width 700->600
This commit is contained in:
Liang Jiaqing
2026-04-03 19:38:51 +08:00
parent 4a5043a658
commit 97abc43a40
4 changed files with 120 additions and 282 deletions

18
ga.py
View File

@@ -1,4 +1,4 @@
import sys, os, re, json, time, threading import sys, os, re, json, time, threading, importlib
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
import tempfile, traceback, subprocess, itertools, collections import tempfile, traceback, subprocess, itertools, collections
@@ -94,10 +94,8 @@ def ask_user(question: str, candidates: list = None):
return {"status": "INTERRUPT", "intent": "HUMAN_INTERVENTION", return {"status": "INTERRUPT", "intent": "HUMAN_INTERVENTION",
"data": {"question": question, "candidates": candidates or []}} "data": {"question": question, "candidates": candidates or []}}
from simphtml import execute_js_rich, get_html import simphtml
driver = None driver = None
def first_init_driver(): def first_init_driver():
global driver global driver
from TMWebDriver import TMWebDriver from TMWebDriver import TMWebDriver
@@ -137,7 +135,8 @@ def web_scan(tabs_only=False, switch_tab_id=None, text_only=False):
"active_tab": driver.default_session_id "active_tab": driver.default_session_id
} }
} }
if not tabs_only: result["content"] = get_html(driver, cutlist=True, maxchars=38000, text_only=text_only) if not tabs_only:
importlib.reload(simphtml); result["content"] = simphtml.get_html(driver, cutlist=True, maxchars=38000, text_only=text_only)
return result return result
except Exception as e: except Exception as e:
return {"status": "error", "msg": format_error(e)} return {"status": "error", "msg": format_error(e)}
@@ -183,7 +182,7 @@ def web_execute_js(script, switch_tab_id=None, no_monitor=False):
if driver is None: first_init_driver() if driver is None: first_init_driver()
if len(driver.get_all_sessions()) == 0: return {"status": "error", "msg": "没有可用的浏览器标签页查L3记忆分析原因。"} if len(driver.get_all_sessions()) == 0: return {"status": "error", "msg": "没有可用的浏览器标签页查L3记忆分析原因。"}
if switch_tab_id: driver.default_session_id = switch_tab_id if switch_tab_id: driver.default_session_id = switch_tab_id
result = execute_js_rich(script, driver, no_monitor=no_monitor) result = simphtml.execute_js_rich(script, driver, no_monitor=no_monitor)
return result return result
except Exception as e: except Exception as e:
return {"status": "error", "msg": format_error(e)} return {"status": "error", "msg": format_error(e)}
@@ -236,14 +235,13 @@ def file_read(path, start=1, keyword=None, count=200, show_linenos=True):
else: res = list(itertools.islice(stream, count)) else: res = list(itertools.islice(stream, count))
realcnt = len(res); L_MAX = max(100, 512000//realcnt); TAG = " ... [TRUNCATED]" realcnt = len(res); L_MAX = max(100, 512000//realcnt); TAG = " ... [TRUNCATED]"
remaining = sum(1 for _ in itertools.islice(stream, 5000)) remaining = sum(1 for _ in itertools.islice(stream, 5000))
total_lines = (start - 1) + realcnt + remaining total_lines = (res[0][0] - 1 if res else start - 1) + realcnt + remaining
total_tag = "[FILE] Total " + (f"{total_lines}+" if remaining >= 5000 else str(total_lines)) + ' lines\n' total_tag = "[FILE] Total " + (f"{total_lines}+" if remaining >= 5000 else str(total_lines)) + ' lines\n'
res = [(i, l if len(l) <= L_MAX else l[:L_MAX] + TAG) for i, l in res] res = [(i, l if len(l) <= L_MAX else l[:L_MAX] + TAG) for i, l in res]
result = "\n".join(f"{i}|{l}" if show_linenos else l for i, l in res) result = "\n".join(f"{i}|{l}" if show_linenos else l for i, l in res)
if show_linenos: result = total_tag + result if show_linenos: result = total_tag + result
return result return result
except Exception as e: except Exception as e: return f"Error: {str(e)}"
return f"Error: {str(e)}"
def smart_format(data, max_depth=2, max_str_len=100, omit_str=' ... '): def smart_format(data, max_depth=2, max_str_len=100, omit_str=' ... '):
def truncate(obj, depth): def truncate(obj, depth):
@@ -516,7 +514,7 @@ def get_global_memory():
prompt = "\n" prompt = "\n"
try: try:
script_dir = os.path.dirname(os.path.abspath(__file__)) script_dir = os.path.dirname(os.path.abspath(__file__))
with open(os.path.join(script_dir, 'memory/global_mem_insight.txt'), 'r', encoding='utf-8') as f: insight = f.read() with open(os.path.join(script_dir, 'memory/global_mem_insight.txt'), 'r', encoding='utf-8', errors='replace') as f: insight = f.read()
with open(os.path.join(script_dir, 'assets/insight_fixed_structure.txt'), 'r', encoding='utf-8') as f: structure = f.read() with open(os.path.join(script_dir, 'assets/insight_fixed_structure.txt'), 'r', encoding='utf-8') as f: structure = f.read()
prompt += f'cwd = {os.path.abspath("./temp")} (用./引用)\n' prompt += f'cwd = {os.path.abspath("./temp")} (用./引用)\n'
prompt += f"\n[Memory] (../memory)\n" prompt += f"\n[Memory] (../memory)\n"

View File

@@ -1,6 +1,6 @@
import webview, threading, subprocess, sys, time, os, ctypes, atexit, socket, random import webview, threading, subprocess, sys, time, os, ctypes, atexit, socket, random
WINDOW_WIDTH, WINDOW_HEIGHT, RIGHT_PADDING, TOP_PADDING = 700, 900, 0, 100 WINDOW_WIDTH, WINDOW_HEIGHT, RIGHT_PADDING, TOP_PADDING = 600, 900, 0, 100
script_dir = os.path.dirname(os.path.abspath(__file__)) script_dir = os.path.dirname(os.path.abspath(__file__))
frontends_dir = os.path.join(script_dir, "frontends") frontends_dir = os.path.join(script_dir, "frontends")

View File

@@ -481,7 +481,10 @@ class NativeClaudeSession(BaseSession):
def __init__(self, cfg): def __init__(self, cfg):
super().__init__(cfg) super().__init__(cfg)
self.context_win = cfg.get("context_win", 28000) self.context_win = cfg.get("context_win", 28000)
self.no_system_prompt = cfg.get("no_system_prompt", False) self.fake_cc_system_prompt = cfg.get("fake_cc_system_prompt", False)
self._session_id = str(uuid.uuid4())
self._account_uuid = str(uuid.uuid4())
self._device_id = uuid.uuid4().hex + uuid.uuid4().hex[:32]
def raw_ask(self, messages, tools=None, system=None, model=None, temperature=0.5, max_tokens=6144): def raw_ask(self, messages, tools=None, system=None, model=None, temperature=0.5, max_tokens=6144):
model = model or self.default_model model = model or self.default_model
@@ -490,14 +493,14 @@ class NativeClaudeSession(BaseSession):
if self.api_key.startswith("cr_"): headers["authorization"] = f"Bearer {self.api_key}" if self.api_key.startswith("cr_"): headers["authorization"] = f"Bearer {self.api_key}"
else: headers["x-api-key"] = self.api_key else: headers["x-api-key"] = self.api_key
payload = {"model": model, "messages": messages, "temperature": temperature, "max_tokens": max_tokens, "stream": True} payload = {"model": model, "messages": messages, "temperature": temperature, "max_tokens": max_tokens, "stream": True}
payload["metadata"] = {"user_id": json.dumps({"device_id":uuid.uuid4().hex+uuid.uuid4().hex[:32],"account_uuid":"","session_id":str(uuid.uuid4())},separators=(',',':'))} payload["metadata"] = {"user_id": json.dumps({"device_id": self._device_id, "account_uuid": self._account_uuid, "session_id": self._session_id}, separators=(',', ':'))}
if tools: if tools:
tools = [dict(t) for t in tools]; tools[-1]["cache_control"] = {"type": "ephemeral"} tools = [dict(t) for t in tools]; tools[-1]["cache_control"] = {"type": "ephemeral"}
payload["tools"] = tools payload["tools"] = tools
payload['system'] = [] payload['system'] = [{"type": "text", "text": "You are Claude Code, Anthropic's official CLI for Claude.", "cache_control": {"type": "ephemeral"}}]
if system: if system:
if self.no_system_prompt: messages[0]["content"].insert(0, {"type": "text", "text": f"{system}\n"}) if self.fake_cc_system_prompt: messages[0]["content"].insert(0, {"type": "text", "text": system})
else: payload["system"] = [{"type": "text", "text": system, "cache_control": {"type": "ephemeral"}}] else: payload["system"] = [{"type": "text", "text": system}]
messages[-1] = {**messages[-1], "content": list(messages[-1]["content"])} messages[-1] = {**messages[-1], "content": list(messages[-1]["content"])}
messages[-1]["content"][-1] = dict(messages[-1]["content"][-1], cache_control={"type": "ephemeral"}) messages[-1]["content"][-1] = dict(messages[-1]["content"][-1], cache_control={"type": "ephemeral"})
try: try:
@@ -518,7 +521,7 @@ class NativeClaudeSession(BaseSession):
with self.lock: with self.lock:
self.history.append(msg) self.history.append(msg)
trim_messages_history(self.history, self.context_win) trim_messages_history(self.history, self.context_win)
messages = list(self.history) messages = [{"role": m["role"], "content": list(m["content"])} for m in self.history]
content_blocks = None content_blocks = None
gen = self.raw_ask(messages, tools, self.system, model) gen = self.raw_ask(messages, tools, self.system, model)

View File

@@ -301,73 +301,84 @@ return root.outerHTML;
optHTML()''' optHTML()'''
js_findMainList = r'''function findMainList(startElement = null) { js_findMainList = r'''function findMainList(startElement = null) {
const containerElement = startElement || document.body; const root = startElement || document.body;
const rect = containerElement.getBoundingClientRect(); const MIN_CHILDREN = 8;
const centerX = startElement ? (rect.left + rect.width/2) : (window.innerWidth/2); const MAX_CONTAINERS = 20;
const centerY = startElement ? (rect.top + rect.height/2) : (window.innerHeight/2);
// 获取中心元素 // 全局扫描:收集所有子元素足够多的容器
const centerElement = document.elementFromPoint(centerX, centerY) || containerElement; const containers = [];
if (!centerElement) return { container: null, items: [] }; const allEls = root.querySelectorAll('*');
for (const node of allEls) {
// 收集祖先链 if (node.children.length >= MIN_CHILDREN) {
const ancestors = []; containers.push(node);
for (let current = centerElement; current && ancestors.length < 10; current = current.parentElement) { }
ancestors.push(current); }
if (current === containerElement) break; if (root.children.length >= MIN_CHILDREN && !containers.includes(root)) {
if (containerElement !== document.body && !containerElement.contains(current)) break; containers.unshift(root);
} }
if (!ancestors.includes(containerElement)) ancestors.push(containerElement);
let groupCandidates = []; // 按子元素数量降序,取前 MAX_CONTAINERS 个
ancestors.forEach(ancestor => { containers.sort((a, b) => b.children.length - a.children.length);
const topGroups = findTopGroups(ancestor, 3); const toProcess = containers.slice(0, MAX_CONTAINERS);
groupCandidates = groupCandidates.concat(topGroups);
});
console.log(groupCandidates); // 对每个容器找候选组并评分
let allCandidates = [];
let candidates = []; for (const container of toProcess) {
ancestors.forEach(container => { const topGroups = findTopGroups(container, 3);
groupCandidates.forEach(groupInfo => { for (const groupInfo of topGroups) {
// 尝试将组应用到当前容器
const items = findMatchingElements(container, groupInfo.selector); const items = findMatchingElements(container, groupInfo.selector);
// 只考虑足够大的组 if (items.length >= 5) {
if (items.length >= 3) { const score = scoreContainer(container, items) + groupInfo.score;
candidates.push({ if (score >= 30) {
container: container, allCandidates.push({ container, selector: groupInfo.selector, items, score });
selector: groupInfo.selector, }
items: items, }
gscore: groupInfo.score
});
} }
});
});
candidates = candidates.map(candidate => {
const score = scoreContainer(candidate.container, candidate.items) + candidate.gscore;
return {...candidate, score};
});
if (candidates.length === 0) {
return { container: centerElement, items: [] };
} }
// 3. 选择得分最高的容器 // 按分数降序排列
const bestCandidate = candidates.sort((a, b) => b.score - a.score)[0]; allCandidates.sort((a, b) => b.score - a.score);
console.log(candidates);
// 如果最高分仍然很低,退回到中心元素 // 去重移除与更高分候选重叠超50%的结果
if (bestCandidate.score < 30) { const kept = [];
return { container: centerElement, items: [] }; for (const cand of allCandidates) {
let dominated = false;
for (const k of kept) {
if (k.container.contains(cand.container) || cand.container.contains(k.container)) {
const kSet = new Set(k.items);
const overlap = cand.items.filter(it => kSet.has(it)).length;
if (overlap > cand.items.length * 0.5) { dominated = true; break; }
}
}
if (!dominated) kept.push(cand);
} }
return { function describeResult(container, items, selector, score) {
container: bestCandidate.container, const cTag = container ? container.tagName : null;
items: bestCandidate.items, const cId = container ? (container.id || '') : '';
selector: bestCandidate.selector, const cClass = container ? (String(container.className || '').trim()) : '';
score: bestCandidate.score const result = {
containerTag: cTag, containerId: cId, containerClass: cClass,
itemCount: items.length,
}; };
let prefix = '';
if (cId) {
prefix = '#' + cId;
} else if (cClass) {
prefix = (cTag || '').toLowerCase() + cClass.split(/\s+/).slice(0, 3).map(c => '.' + c).join('');
}
if (selector) result.selector = prefix ? (prefix + ' > ' + selector) : selector;
if (score !== undefined) result.score = score;
if (items.length > 0) {
result.firstItemPreview = items[0].outerHTML.substring(0, 200);
result.itemTags = items.slice(0, 10).map(el => el.tagName + (el.className ? '.' + String(el.className).trim().split(/\s+/)[0] : ''));
}
return result;
}
if (kept.length === 0) return [];
return kept.map(c => describeResult(c.container, c.items, c.selector, c.score));
} }
function findTopGroups(container, limit) { function findTopGroups(container, limit) {
@@ -435,14 +446,8 @@ js_findMainList = r'''function findMainList(startElement = null) {
} }
}); });
// 添加标签+类组合 // 添加标签+类组合
const topTags = Object.keys(tagFreq) const topTags = Object.keys(tagFreq).filter(t => tagFreq[t] >= minGroupSize).slice(0, 3);
.filter(t => tagFreq[t] >= minGroupSize) const topClasses = Object.keys(classFreq).filter(c => classFreq[c] >= minGroupSize).sort((a, b) => classFreq[b] - classFreq[a]).slice(0, 3);
.slice(0, 3);
const topClasses = Object.keys(classFreq)
.filter(c => classFreq[c] >= minGroupSize)
.sort((a, b) => classFreq[b] - classFreq[a])
.slice(0, 3);
// 标签+类 // 标签+类
topTags.forEach(tag => { topTags.forEach(tag => {
@@ -454,11 +459,7 @@ js_findMainList = r'''function findMainList(startElement = null) {
if (elements.length >= minGroupSize) { if (elements.length >= minGroupSize) {
const selector = tag + '.' + cls; const selector = tag + '.' + cls;
groups.push({ groups.push({selector, elements, score: scoreGroup(selector, elements)});
selector,
elements,
score: scoreGroup(selector, elements)
});
} }
}); });
}); });
@@ -467,25 +468,16 @@ js_findMainList = r'''function findMainList(startElement = null) {
for (let i = 0; i < topClasses.length; i++) { for (let i = 0; i < topClasses.length; i++) {
for (let j = i + 1; j < topClasses.length; j++) { for (let j = i + 1; j < topClasses.length; j++) {
const elements = children.filter(el => const elements = children.filter(el =>
el.className && el.className && el.className.split(/\s+/).includes(topClasses[i]) && el.className.split(/\s+/).includes(topClasses[j]));
el.className.split(/\s+/).includes(topClasses[i]) &&
el.className.split(/\s+/).includes(topClasses[j])
);
if (elements.length >= minGroupSize) { if (elements.length >= minGroupSize) {
const selector = '.' + topClasses[i] + '.' + topClasses[j]; const selector = '.' + topClasses[i] + '.' + topClasses[j];
groups.push({ groups.push({selector, elements,score: scoreGroup(selector, elements)});
selector,
elements,
score: scoreGroup(selector, elements)
});
} }
} }
} }
// 返回得分最高的N个组 // 返回得分最高的N个组
return groups return groups.sort((a, b) => b.score - a.score).slice(0, limit);
.sort((a, b) => b.score - a.score)
.slice(0, limit);
} }
function findMatchingElements(container, selector) { function findMatchingElements(container, selector) {
@@ -500,7 +492,6 @@ js_findMainList = r'''function findMainList(startElement = null) {
function scoreContainer(container, items) { function scoreContainer(container, items) {
if (!container || items.length < 3) return 0; if (!container || items.length < 3) return 0;
// 1. 计算基础面积数据 // 1. 计算基础面积数据
const containerRect = container.getBoundingClientRect(); const containerRect = container.getBoundingClientRect();
const containerArea = containerRect.width * containerRect.height; const containerArea = containerRect.width * containerRect.height;
@@ -520,14 +511,11 @@ js_findMainList = r'''function findMainList(startElement = null) {
visibleItems++; visibleItems++;
} }
}); });
// 如果可见项太少,返回低分 // 如果可见项太少,返回低分
if (visibleItems < 3) return 0; if (visibleItems < 3) return 0;
// 防止异常值:确保面积不超过容器 // 防止异常值:确保面积不超过容器
totalItemArea = Math.min(totalItemArea, containerArea * 0.98); totalItemArea = Math.min(totalItemArea, containerArea * 0.98);
const areaRatio = totalItemArea / containerArea; const areaRatio = totalItemArea / containerArea;
// 3. 计算各项评分 - 使用线性插值而非阶梯 // 3. 计算各项评分 - 使用线性插值而非阶梯
// 3.2 面积比评分 - 最多40分连续曲线 // 3.2 面积比评分 - 最多40分连续曲线
// 使用sigmoid函数让评分更平滑 // 使用sigmoid函数让评分更平滑
@@ -539,7 +527,6 @@ js_findMainList = r'''function findMainList(startElement = null) {
const mean = itemAreas.reduce((sum, area) => sum + area, 0) / itemAreas.length; const mean = itemAreas.reduce((sum, area) => sum + area, 0) / itemAreas.length;
const variance = itemAreas.reduce((sum, area) => sum + Math.pow(area - mean, 2), 0) / itemAreas.length; const variance = itemAreas.reduce((sum, area) => sum + Math.pow(area - mean, 2), 0) / itemAreas.length;
const cv = mean > 0 ? Math.sqrt(variance) / mean : 1; const cv = mean > 0 ? Math.sqrt(variance) / mean : 1;
// 指数衰减函数cv越小分数越高 // 指数衰减函数cv越小分数越高
uniformityScore = 20 * Math.exp(-2.5 * cv); uniformityScore = 20 * Math.exp(-2.5 * cv);
} }
@@ -558,10 +545,8 @@ js_findMainList = r'''function findMainList(startElement = null) {
// 坐标分组并计算行列数 // 坐标分组并计算行列数
const uniqueRows = new Set(items.map(item => Math.round(item.getBoundingClientRect().top / 5) * 5)).size; const uniqueRows = new Set(items.map(item => Math.round(item.getBoundingClientRect().top / 5) * 5)).size;
const uniqueCols = new Set(items.map(item => Math.round(item.getBoundingClientRect().left / 5) * 5)).size; const uniqueCols = new Set(items.map(item => Math.round(item.getBoundingClientRect().left / 5) * 5)).size;
// 如果是单行或单列,直接给满分;否则评估网格质量 // 如果是单行或单列,直接给满分;否则评估网格质量
if (uniqueRows === 1 || uniqueCols === 1) { if (uniqueRows === 1 || uniqueCols === 1) { layoutScore = 20;
layoutScore = 20;
} else { } else {
const coverage = Math.min(1, items.length / (uniqueRows * uniqueCols)); const coverage = Math.min(1, items.length / (uniqueRows * uniqueCols));
const efficiency = Math.max(0, 1 - (uniqueRows + uniqueCols) / (2 * items.length)); const efficiency = Math.max(0, 1 - (uniqueRows + uniqueCols) / (2 * items.length));
@@ -586,168 +571,6 @@ js_findMainList = r'''function findMainList(startElement = null) {
return totalScore; return totalScore;
}''' }'''
js_findMainContent = '''
function isLikelyOperationMenu(element) {
// 基础尺寸和位置检查
const rect = element.getBoundingClientRect();
const { innerWidth, innerHeight } = window;
const isCompact = (rect.width * rect.height) < (innerWidth * innerHeight * 0.15);
if (!isCompact) return false;
// 边缘检测
const edgeProximity = {
top: rect.top < 100,
left: rect.left < 50,
right: innerWidth - rect.right < 50,
bottom: innerHeight - rect.bottom < 100
};
const isAtEdge = Object.values(edgeProximity).some(Boolean);
// 交互元素分析
const links = [...element.querySelectorAll('a')];
const buttons = [...element.querySelectorAll('button, [role="button"]')];
const allInteractive = [...links, ...buttons];
// 快速排除: 边缘较大元素通常是导航
if (isAtEdge && rect.width > 150 && rect.height > 50 && links.length > 3) {
return false;
}
// 链接类型分析
const linkTypes = links.reduce((types, link) => {
const href = link.getAttribute('href') || '';
if (href.startsWith('#')) types.hash++;
else if (href.startsWith('javascript:')) types.js++;
else if (href.includes('://') && !href.includes(location.hostname)) types.external++;
else types.internal++;
return types;
}, { hash: 0, js: 0, external: 0, internal: 0 });
// 特征评分
const operationFeatures = [
linkTypes.hash > 0 || linkTypes.js > 0, // 页内操作链接
buttons.length > 0, // 有按钮
buttons.length > 1,
rect.width > rect.height * 1.5 && allInteractive.length <= 6, // 水平排列且元素适量
element.querySelectorAll('svg, img, i, [class*="icon"]').length > 0, // 有图标
getComputedStyle(element).position !== 'static' && !isAtEdge // 定位但不在边缘
];
const navigationFeatures = [
isAtEdge, // 在页面边缘
linkTypes.internal > 3, // 多个内部页面链接
links.length === allInteractive.length && links.length > 3 // 全是链接且数量多
];
const opScore = operationFeatures.filter(Boolean).length;
const navScore = navigationFeatures.filter(Boolean).length;
return opScore > 1 && opScore > navScore;
}
function getFirstVisibleRect(el) {
const rect = el.getBoundingClientRect();
if (rect.width > 0 && rect.height > 0) {
return {
left: rect.left, top: rect.top, right: rect.right, bottom: rect.bottom,
width: rect.width, height: rect.height, x: rect.x, y: rect.y,
zIndex: parseInt(getComputedStyle(el).zIndex) || 0
};
}
if (!el.querySelector('button, a, input') || !el.innerText.trim()) return rect;
const visibleChild = Array.from(el.children)
.find(child => {
const hasContent = child.querySelector('button, a, input') && child.innerText.trim();
return hasContent && (
child.getBoundingClientRect().width > 0 ||
getFirstVisibleRect(child).width > 0
);
});
if (!visibleChild) return rect;
const childRect = visibleChild.getBoundingClientRect();
return childRect.width > 0 ?
{
left: childRect.left, top: childRect.top, right: childRect.right, bottom: childRect.bottom,
width: childRect.width, height: childRect.height, x: childRect.x, y: childRect.y,
zIndex: parseInt(getComputedStyle(visibleChild).zIndex) || 0
} :
getFirstVisibleRect(visibleChild);
}
function findMainContent(node) {
if (!node?.children?.length) return node;
const rectn = node.getBoundingClientRect();
const viewportArea = window.innerWidth * window.innerHeight;
if (rectn.width * rectn.height < viewportArea * 0.4) return node;
// 过滤可见元素
const children = [...node.children].filter(child => {
const style = window.getComputedStyle(child);
const hasTextContent = child.textContent.trim().length > 5;
return style.display !== 'none' && style.visibility !== 'hidden' && style.opacity !== '0' && hasTextContent;
});
if (!children.length) return node;
if (children.length === 1) return findMainContent(children[0]);
if (children.length > 10) return node;
if (children.length == 2 && (isLikelyOperationMenu(children[0]) || isLikelyOperationMenu(children[1]))) return node;
// 计算元素信息
const elemInfo = children.map(child => {
const rect = getFirstVisibleRect(child);
const style = window.getComputedStyle(child);
return {
element: child, area: rect.width * rect.height, rect, style,
zIndex: rect.zIndex || 0, position: style.position
};
}).sort((a, b) => b.area - a.area);
// 检测重叠
function isOverlapping(r1, r2) {
return !(r1.right <= r2.left || r1.left >= r2.right || r1.bottom <= r2.top || r1.top >= r2.bottom);
}
// 检查是否有任何重叠的元素对
const hasOverlap = elemInfo.some((e1, i) =>
elemInfo.slice(i + 1).some(e2 => isOverlapping(e1.rect, e2.rect))
);
console.log(hasOverlap, elemInfo);
// 无重叠情况: 面积比例判断
if (!hasOverlap) {
const totalArea = elemInfo.reduce((sum, item) => sum + item.area, 0);
const [main, second] = elemInfo;
return (main.area / totalArea > 0.6 && (!second || main.area > second.area * 2))
? findMainContent(main.element) : node;
}
// 1. 按z-index和定位方式排序
const sorted = [...elemInfo].sort((a, b) => {
// 非静态定位优先
if (a.position !== 'static' && b.position === 'static') return -1;
if (a.position === 'static' && b.position !== 'static') return 1;
// 其次按z-index排序
return b.zIndex - a.zIndex;
});
// 2. 在排序后的列表中找到第一个符合条件的元素
const suitable = sorted.find(x => {
const el = x.element, rect = x.rect, style = x.style;
return Math.abs((rect.left + rect.width/2) - window.innerWidth/2) < window.innerWidth*0.3 &&
parseFloat(style.opacity) > 0.1 &&
(parseInt(rect.zIndex) > 30 || style.boxShadow !== 'none') &&
el.querySelector('button, a, input') !== null;
});
// 3. 找到合适元素则使用它,否则返回面积最大的元素
if (suitable) {
return findMainContent(suitable.element);
} else {
const byArea = [...elemInfo].sort((a, b) => b.area - a.area);
return findMainContent(byArea[0].element);
}
} '''
def optimize_html_for_tokens(html): def optimize_html_for_tokens(html):
if type(html) is str: soup = BeautifulSoup(html, 'html.parser') if type(html) is str: soup = BeautifulSoup(html, 'html.parser')
else: soup = html else: soup = html
@@ -860,24 +683,38 @@ def find_changed_elements(before_html, after_html):
def get_html(driver, cutlist=False, maxchars=38000, instruction="", extra_js="", text_only=False): def get_html(driver, cutlist=False, maxchars=38000, instruction="", extra_js="", text_only=False):
page = get_main_block(driver, extra_js=extra_js, text_only=text_only) page = get_main_block(driver, extra_js=extra_js, text_only=text_only)
if text_only: return page if text_only: return page
warning = ''
soup = optimize_html_for_tokens(page) soup = optimize_html_for_tokens(page)
html = str(soup) html = str(soup)
if not cutlist or len(html) <= maxchars: return html if not cutlist: return html
rr = driver.execute_js(js_findMainList + js_findMainContent + """ rr = driver.execute_js(js_findMainList + """return findMainList(document.body);""").get('data', [])
return findMainList(findMainContent(document.body));""").get('data', {}) lists = rr if isinstance(rr, list) else ([rr] if isinstance(rr, dict) and rr.get('selector') else [])
sel = rr.get("selector", None) if isinstance(rr, dict) else None if lists: print(f"[cutlist] Found {len(lists)} list(s): {[e.get('selector','?') if isinstance(e,dict) else '?' for e in lists]}")
if sel: for entry in lists:
s = BeautifulSoup(str(soup), "html.parser"); items = s.select(sel) sel = entry.get('selector') if isinstance(entry, dict) else None
if not sel: continue
items = soup.select(sel)
if len(items) < 5: continue
total_len = sum(len(str(it)) for it in items)
avg_len = total_len / len(items)
if avg_len < 500 or (avg_len < 1000 and total_len < len(html) * 0.3): continue
print(f"[cutlist] '{sel}': {len(items)} items, avg {avg_len:.0f} chars, total {total_len}, keep 3, save ~{total_len - 3 * avg_len:.0f} chars")
hit = [it for it in items if instruction and instruction.strip() and instruction in it.get_text(" ",strip=True)] hit = [it for it in items if instruction and instruction.strip() and instruction in it.get_text(" ",strip=True)]
keep = hit[:6] if hit else items[:3] keep = hit[:6] if hit else items[:3]
for it in items: removed = [it for it in items if it not in keep]
if it not in keep: it.decompose() sample_texts = []
ss = str(optimize_html_for_tokens(s)) for rm in removed[:5]:
warning = '[SYSTEM] Found item list, only show some items ...\n' txt = rm.get_text(" ", strip=True)[:40]
else: ss = html if txt: sample_texts.append(txt)
hint_parts = [f'[FAKE ELEMENT] {len(removed)} more items hidden, selector: "{sel}"']
if sample_texts: hint_parts.append('Hidden items: ' + ','.join(f'"{t}"' for t in sample_texts))
hint_tag = soup.new_tag("div")
hint_tag.string = ' '.join(hint_parts)
if keep: keep[-1].insert_after(hint_tag)
for it in removed: it.decompose()
ss = str(optimize_html_for_tokens(soup)) if lists else html
if lists: print(f"[cutlist] Result: {len(html)} -> {len(ss)} chars ({100-len(ss)*100//len(html)}% saved)")
if len(ss) > maxchars: ss = ss[:maxchars] + ' ... [TRUNCATED]' if len(ss) > maxchars: ss = ss[:maxchars] + ' ... [TRUNCATED]'
return warning + str(ss) return ss
def execute_js_rich(script, driver, no_monitor=False): def execute_js_rich(script, driver, no_monitor=False):
last_html = None last_html = None