refactor(simphtml): rewrite list detection & cutlist for multi-list support
- simphtml: replace the center-point ancestor-chain approach with a global container scan; support multiple lists per page; add container-scoped selector prefixes; insert inline [FAKE ELEMENT] hints with previews of the hidden items; remove findMainContent
- ga: hot-reload simphtml on each web_scan; fix file_read total_lines for keyword search; add errors='replace' for global_mem encoding safety
- llmcore: stabilize NativeClaude session/account/device IDs across requests; rename no_system_prompt to fake_cc_system_prompt; copy each message's content list before sending (previously the history objects were shared)
- launch: adjust window width 700->600
This commit is contained in:
18
ga.py
18
ga.py
@@ -1,4 +1,4 @@
|
|||||||
import sys, os, re, json, time, threading
|
import sys, os, re, json, time, threading, importlib
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import tempfile, traceback, subprocess, itertools, collections
|
import tempfile, traceback, subprocess, itertools, collections
|
||||||
@@ -94,10 +94,8 @@ def ask_user(question: str, candidates: list = None):
|
|||||||
return {"status": "INTERRUPT", "intent": "HUMAN_INTERVENTION",
|
return {"status": "INTERRUPT", "intent": "HUMAN_INTERVENTION",
|
||||||
"data": {"question": question, "candidates": candidates or []}}
|
"data": {"question": question, "candidates": candidates or []}}
|
||||||
|
|
||||||
from simphtml import execute_js_rich, get_html
|
import simphtml
|
||||||
|
|
||||||
driver = None
|
driver = None
|
||||||
|
|
||||||
def first_init_driver():
|
def first_init_driver():
|
||||||
global driver
|
global driver
|
||||||
from TMWebDriver import TMWebDriver
|
from TMWebDriver import TMWebDriver
|
||||||
@@ -137,7 +135,8 @@ def web_scan(tabs_only=False, switch_tab_id=None, text_only=False):
|
|||||||
"active_tab": driver.default_session_id
|
"active_tab": driver.default_session_id
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if not tabs_only: result["content"] = get_html(driver, cutlist=True, maxchars=38000, text_only=text_only)
|
if not tabs_only:
|
||||||
|
importlib.reload(simphtml); result["content"] = simphtml.get_html(driver, cutlist=True, maxchars=38000, text_only=text_only)
|
||||||
return result
|
return result
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return {"status": "error", "msg": format_error(e)}
|
return {"status": "error", "msg": format_error(e)}
|
||||||
@@ -183,7 +182,7 @@ def web_execute_js(script, switch_tab_id=None, no_monitor=False):
|
|||||||
if driver is None: first_init_driver()
|
if driver is None: first_init_driver()
|
||||||
if len(driver.get_all_sessions()) == 0: return {"status": "error", "msg": "没有可用的浏览器标签页,查L3记忆分析原因。"}
|
if len(driver.get_all_sessions()) == 0: return {"status": "error", "msg": "没有可用的浏览器标签页,查L3记忆分析原因。"}
|
||||||
if switch_tab_id: driver.default_session_id = switch_tab_id
|
if switch_tab_id: driver.default_session_id = switch_tab_id
|
||||||
result = execute_js_rich(script, driver, no_monitor=no_monitor)
|
result = simphtml.execute_js_rich(script, driver, no_monitor=no_monitor)
|
||||||
return result
|
return result
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return {"status": "error", "msg": format_error(e)}
|
return {"status": "error", "msg": format_error(e)}
|
||||||
@@ -236,14 +235,13 @@ def file_read(path, start=1, keyword=None, count=200, show_linenos=True):
|
|||||||
else: res = list(itertools.islice(stream, count))
|
else: res = list(itertools.islice(stream, count))
|
||||||
realcnt = len(res); L_MAX = max(100, 512000//realcnt); TAG = " ... [TRUNCATED]"
|
realcnt = len(res); L_MAX = max(100, 512000//realcnt); TAG = " ... [TRUNCATED]"
|
||||||
remaining = sum(1 for _ in itertools.islice(stream, 5000))
|
remaining = sum(1 for _ in itertools.islice(stream, 5000))
|
||||||
total_lines = (start - 1) + realcnt + remaining
|
total_lines = (res[0][0] - 1 if res else start - 1) + realcnt + remaining
|
||||||
total_tag = "[FILE] Total " + (f"{total_lines}+" if remaining >= 5000 else str(total_lines)) + ' lines\n'
|
total_tag = "[FILE] Total " + (f"{total_lines}+" if remaining >= 5000 else str(total_lines)) + ' lines\n'
|
||||||
res = [(i, l if len(l) <= L_MAX else l[:L_MAX] + TAG) for i, l in res]
|
res = [(i, l if len(l) <= L_MAX else l[:L_MAX] + TAG) for i, l in res]
|
||||||
result = "\n".join(f"{i}|{l}" if show_linenos else l for i, l in res)
|
result = "\n".join(f"{i}|{l}" if show_linenos else l for i, l in res)
|
||||||
if show_linenos: result = total_tag + result
|
if show_linenos: result = total_tag + result
|
||||||
return result
|
return result
|
||||||
except Exception as e:
|
except Exception as e: return f"Error: {str(e)}"
|
||||||
return f"Error: {str(e)}"
|
|
||||||
|
|
||||||
def smart_format(data, max_depth=2, max_str_len=100, omit_str=' ... '):
|
def smart_format(data, max_depth=2, max_str_len=100, omit_str=' ... '):
|
||||||
def truncate(obj, depth):
|
def truncate(obj, depth):
|
||||||
@@ -516,7 +514,7 @@ def get_global_memory():
|
|||||||
prompt = "\n"
|
prompt = "\n"
|
||||||
try:
|
try:
|
||||||
script_dir = os.path.dirname(os.path.abspath(__file__))
|
script_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
with open(os.path.join(script_dir, 'memory/global_mem_insight.txt'), 'r', encoding='utf-8') as f: insight = f.read()
|
with open(os.path.join(script_dir, 'memory/global_mem_insight.txt'), 'r', encoding='utf-8', errors='replace') as f: insight = f.read()
|
||||||
with open(os.path.join(script_dir, 'assets/insight_fixed_structure.txt'), 'r', encoding='utf-8') as f: structure = f.read()
|
with open(os.path.join(script_dir, 'assets/insight_fixed_structure.txt'), 'r', encoding='utf-8') as f: structure = f.read()
|
||||||
prompt += f'cwd = {os.path.abspath("./temp")} (用./引用)\n'
|
prompt += f'cwd = {os.path.abspath("./temp")} (用./引用)\n'
|
||||||
prompt += f"\n[Memory] (../memory)\n"
|
prompt += f"\n[Memory] (../memory)\n"
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import webview, threading, subprocess, sys, time, os, ctypes, atexit, socket, random
|
import webview, threading, subprocess, sys, time, os, ctypes, atexit, socket, random
|
||||||
|
|
||||||
WINDOW_WIDTH, WINDOW_HEIGHT, RIGHT_PADDING, TOP_PADDING = 700, 900, 0, 100
|
WINDOW_WIDTH, WINDOW_HEIGHT, RIGHT_PADDING, TOP_PADDING = 600, 900, 0, 100
|
||||||
|
|
||||||
script_dir = os.path.dirname(os.path.abspath(__file__))
|
script_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
frontends_dir = os.path.join(script_dir, "frontends")
|
frontends_dir = os.path.join(script_dir, "frontends")
|
||||||
|
|||||||
15
llmcore.py
15
llmcore.py
@@ -481,7 +481,10 @@ class NativeClaudeSession(BaseSession):
|
|||||||
def __init__(self, cfg):
|
def __init__(self, cfg):
|
||||||
super().__init__(cfg)
|
super().__init__(cfg)
|
||||||
self.context_win = cfg.get("context_win", 28000)
|
self.context_win = cfg.get("context_win", 28000)
|
||||||
self.no_system_prompt = cfg.get("no_system_prompt", False)
|
self.fake_cc_system_prompt = cfg.get("fake_cc_system_prompt", False)
|
||||||
|
self._session_id = str(uuid.uuid4())
|
||||||
|
self._account_uuid = str(uuid.uuid4())
|
||||||
|
self._device_id = uuid.uuid4().hex + uuid.uuid4().hex[:32]
|
||||||
|
|
||||||
def raw_ask(self, messages, tools=None, system=None, model=None, temperature=0.5, max_tokens=6144):
|
def raw_ask(self, messages, tools=None, system=None, model=None, temperature=0.5, max_tokens=6144):
|
||||||
model = model or self.default_model
|
model = model or self.default_model
|
||||||
@@ -490,14 +493,14 @@ class NativeClaudeSession(BaseSession):
|
|||||||
if self.api_key.startswith("cr_"): headers["authorization"] = f"Bearer {self.api_key}"
|
if self.api_key.startswith("cr_"): headers["authorization"] = f"Bearer {self.api_key}"
|
||||||
else: headers["x-api-key"] = self.api_key
|
else: headers["x-api-key"] = self.api_key
|
||||||
payload = {"model": model, "messages": messages, "temperature": temperature, "max_tokens": max_tokens, "stream": True}
|
payload = {"model": model, "messages": messages, "temperature": temperature, "max_tokens": max_tokens, "stream": True}
|
||||||
payload["metadata"] = {"user_id": json.dumps({"device_id":uuid.uuid4().hex+uuid.uuid4().hex[:32],"account_uuid":"","session_id":str(uuid.uuid4())},separators=(',',':'))}
|
payload["metadata"] = {"user_id": json.dumps({"device_id": self._device_id, "account_uuid": self._account_uuid, "session_id": self._session_id}, separators=(',', ':'))}
|
||||||
if tools:
|
if tools:
|
||||||
tools = [dict(t) for t in tools]; tools[-1]["cache_control"] = {"type": "ephemeral"}
|
tools = [dict(t) for t in tools]; tools[-1]["cache_control"] = {"type": "ephemeral"}
|
||||||
payload["tools"] = tools
|
payload["tools"] = tools
|
||||||
payload['system'] = []
|
payload['system'] = [{"type": "text", "text": "You are Claude Code, Anthropic's official CLI for Claude.", "cache_control": {"type": "ephemeral"}}]
|
||||||
if system:
|
if system:
|
||||||
if self.no_system_prompt: messages[0]["content"].insert(0, {"type": "text", "text": f"{system}\n"})
|
if self.fake_cc_system_prompt: messages[0]["content"].insert(0, {"type": "text", "text": system})
|
||||||
else: payload["system"] = [{"type": "text", "text": system, "cache_control": {"type": "ephemeral"}}]
|
else: payload["system"] = [{"type": "text", "text": system}]
|
||||||
messages[-1] = {**messages[-1], "content": list(messages[-1]["content"])}
|
messages[-1] = {**messages[-1], "content": list(messages[-1]["content"])}
|
||||||
messages[-1]["content"][-1] = dict(messages[-1]["content"][-1], cache_control={"type": "ephemeral"})
|
messages[-1]["content"][-1] = dict(messages[-1]["content"][-1], cache_control={"type": "ephemeral"})
|
||||||
try:
|
try:
|
||||||
@@ -518,7 +521,7 @@ class NativeClaudeSession(BaseSession):
|
|||||||
with self.lock:
|
with self.lock:
|
||||||
self.history.append(msg)
|
self.history.append(msg)
|
||||||
trim_messages_history(self.history, self.context_win)
|
trim_messages_history(self.history, self.context_win)
|
||||||
messages = list(self.history)
|
messages = [{"role": m["role"], "content": list(m["content"])} for m in self.history]
|
||||||
|
|
||||||
content_blocks = None
|
content_blocks = None
|
||||||
gen = self.raw_ask(messages, tools, self.system, model)
|
gen = self.raw_ask(messages, tools, self.system, model)
|
||||||
|
|||||||
363
simphtml.py
363
simphtml.py
@@ -301,73 +301,84 @@ return root.outerHTML;
|
|||||||
optHTML()'''
|
optHTML()'''
|
||||||
|
|
||||||
js_findMainList = r'''function findMainList(startElement = null) {
|
js_findMainList = r'''function findMainList(startElement = null) {
|
||||||
const containerElement = startElement || document.body;
|
const root = startElement || document.body;
|
||||||
const rect = containerElement.getBoundingClientRect();
|
const MIN_CHILDREN = 8;
|
||||||
const centerX = startElement ? (rect.left + rect.width/2) : (window.innerWidth/2);
|
const MAX_CONTAINERS = 20;
|
||||||
const centerY = startElement ? (rect.top + rect.height/2) : (window.innerHeight/2);
|
|
||||||
|
|
||||||
// 获取中心元素
|
// 全局扫描:收集所有子元素足够多的容器
|
||||||
const centerElement = document.elementFromPoint(centerX, centerY) || containerElement;
|
const containers = [];
|
||||||
if (!centerElement) return { container: null, items: [] };
|
const allEls = root.querySelectorAll('*');
|
||||||
|
for (const node of allEls) {
|
||||||
// 收集祖先链
|
if (node.children.length >= MIN_CHILDREN) {
|
||||||
const ancestors = [];
|
containers.push(node);
|
||||||
for (let current = centerElement; current && ancestors.length < 10; current = current.parentElement) {
|
}
|
||||||
ancestors.push(current);
|
}
|
||||||
if (current === containerElement) break;
|
if (root.children.length >= MIN_CHILDREN && !containers.includes(root)) {
|
||||||
if (containerElement !== document.body && !containerElement.contains(current)) break;
|
containers.unshift(root);
|
||||||
}
|
}
|
||||||
if (!ancestors.includes(containerElement)) ancestors.push(containerElement);
|
|
||||||
|
|
||||||
let groupCandidates = [];
|
// 按子元素数量降序,取前 MAX_CONTAINERS 个
|
||||||
ancestors.forEach(ancestor => {
|
containers.sort((a, b) => b.children.length - a.children.length);
|
||||||
const topGroups = findTopGroups(ancestor, 3);
|
const toProcess = containers.slice(0, MAX_CONTAINERS);
|
||||||
groupCandidates = groupCandidates.concat(topGroups);
|
|
||||||
});
|
|
||||||
|
|
||||||
console.log(groupCandidates);
|
// 对每个容器找候选组并评分
|
||||||
|
let allCandidates = [];
|
||||||
let candidates = [];
|
for (const container of toProcess) {
|
||||||
ancestors.forEach(container => {
|
const topGroups = findTopGroups(container, 3);
|
||||||
groupCandidates.forEach(groupInfo => {
|
for (const groupInfo of topGroups) {
|
||||||
// 尝试将组应用到当前容器
|
|
||||||
const items = findMatchingElements(container, groupInfo.selector);
|
const items = findMatchingElements(container, groupInfo.selector);
|
||||||
// 只考虑足够大的组
|
if (items.length >= 5) {
|
||||||
if (items.length >= 3) {
|
const score = scoreContainer(container, items) + groupInfo.score;
|
||||||
candidates.push({
|
if (score >= 30) {
|
||||||
container: container,
|
allCandidates.push({ container, selector: groupInfo.selector, items, score });
|
||||||
selector: groupInfo.selector,
|
}
|
||||||
items: items,
|
}
|
||||||
gscore: groupInfo.score
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
});
|
|
||||||
});
|
|
||||||
|
|
||||||
candidates = candidates.map(candidate => {
|
|
||||||
const score = scoreContainer(candidate.container, candidate.items) + candidate.gscore;
|
|
||||||
return {...candidate, score};
|
|
||||||
});
|
|
||||||
|
|
||||||
if (candidates.length === 0) {
|
|
||||||
return { container: centerElement, items: [] };
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// 3. 选择得分最高的容器
|
// 按分数降序排列
|
||||||
const bestCandidate = candidates.sort((a, b) => b.score - a.score)[0];
|
allCandidates.sort((a, b) => b.score - a.score);
|
||||||
console.log(candidates);
|
|
||||||
|
|
||||||
// 如果最高分仍然很低,退回到中心元素
|
// 去重:移除与更高分候选重叠超50%的结果
|
||||||
if (bestCandidate.score < 30) {
|
const kept = [];
|
||||||
return { container: centerElement, items: [] };
|
for (const cand of allCandidates) {
|
||||||
|
let dominated = false;
|
||||||
|
for (const k of kept) {
|
||||||
|
if (k.container.contains(cand.container) || cand.container.contains(k.container)) {
|
||||||
|
const kSet = new Set(k.items);
|
||||||
|
const overlap = cand.items.filter(it => kSet.has(it)).length;
|
||||||
|
if (overlap > cand.items.length * 0.5) { dominated = true; break; }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!dominated) kept.push(cand);
|
||||||
}
|
}
|
||||||
|
|
||||||
return {
|
function describeResult(container, items, selector, score) {
|
||||||
container: bestCandidate.container,
|
const cTag = container ? container.tagName : null;
|
||||||
items: bestCandidate.items,
|
const cId = container ? (container.id || '') : '';
|
||||||
selector: bestCandidate.selector,
|
const cClass = container ? (String(container.className || '').trim()) : '';
|
||||||
score: bestCandidate.score
|
const result = {
|
||||||
|
containerTag: cTag, containerId: cId, containerClass: cClass,
|
||||||
|
itemCount: items.length,
|
||||||
};
|
};
|
||||||
|
let prefix = '';
|
||||||
|
if (cId) {
|
||||||
|
prefix = '#' + cId;
|
||||||
|
} else if (cClass) {
|
||||||
|
prefix = (cTag || '').toLowerCase() + cClass.split(/\s+/).slice(0, 3).map(c => '.' + c).join('');
|
||||||
|
}
|
||||||
|
if (selector) result.selector = prefix ? (prefix + ' > ' + selector) : selector;
|
||||||
|
if (score !== undefined) result.score = score;
|
||||||
|
if (items.length > 0) {
|
||||||
|
result.firstItemPreview = items[0].outerHTML.substring(0, 200);
|
||||||
|
result.itemTags = items.slice(0, 10).map(el => el.tagName + (el.className ? '.' + String(el.className).trim().split(/\s+/)[0] : ''));
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (kept.length === 0) return [];
|
||||||
|
|
||||||
|
return kept.map(c => describeResult(c.container, c.items, c.selector, c.score));
|
||||||
}
|
}
|
||||||
|
|
||||||
function findTopGroups(container, limit) {
|
function findTopGroups(container, limit) {
|
||||||
@@ -435,14 +446,8 @@ js_findMainList = r'''function findMainList(startElement = null) {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
// 添加标签+类组合
|
// 添加标签+类组合
|
||||||
const topTags = Object.keys(tagFreq)
|
const topTags = Object.keys(tagFreq).filter(t => tagFreq[t] >= minGroupSize).slice(0, 3);
|
||||||
.filter(t => tagFreq[t] >= minGroupSize)
|
const topClasses = Object.keys(classFreq).filter(c => classFreq[c] >= minGroupSize).sort((a, b) => classFreq[b] - classFreq[a]).slice(0, 3);
|
||||||
.slice(0, 3);
|
|
||||||
|
|
||||||
const topClasses = Object.keys(classFreq)
|
|
||||||
.filter(c => classFreq[c] >= minGroupSize)
|
|
||||||
.sort((a, b) => classFreq[b] - classFreq[a])
|
|
||||||
.slice(0, 3);
|
|
||||||
|
|
||||||
// 标签+类
|
// 标签+类
|
||||||
topTags.forEach(tag => {
|
topTags.forEach(tag => {
|
||||||
@@ -454,11 +459,7 @@ js_findMainList = r'''function findMainList(startElement = null) {
|
|||||||
|
|
||||||
if (elements.length >= minGroupSize) {
|
if (elements.length >= minGroupSize) {
|
||||||
const selector = tag + '.' + cls;
|
const selector = tag + '.' + cls;
|
||||||
groups.push({
|
groups.push({selector, elements, score: scoreGroup(selector, elements)});
|
||||||
selector,
|
|
||||||
elements,
|
|
||||||
score: scoreGroup(selector, elements)
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
@@ -467,25 +468,16 @@ js_findMainList = r'''function findMainList(startElement = null) {
|
|||||||
for (let i = 0; i < topClasses.length; i++) {
|
for (let i = 0; i < topClasses.length; i++) {
|
||||||
for (let j = i + 1; j < topClasses.length; j++) {
|
for (let j = i + 1; j < topClasses.length; j++) {
|
||||||
const elements = children.filter(el =>
|
const elements = children.filter(el =>
|
||||||
el.className &&
|
el.className && el.className.split(/\s+/).includes(topClasses[i]) && el.className.split(/\s+/).includes(topClasses[j]));
|
||||||
el.className.split(/\s+/).includes(topClasses[i]) &&
|
|
||||||
el.className.split(/\s+/).includes(topClasses[j])
|
|
||||||
);
|
|
||||||
|
|
||||||
if (elements.length >= minGroupSize) {
|
if (elements.length >= minGroupSize) {
|
||||||
const selector = '.' + topClasses[i] + '.' + topClasses[j];
|
const selector = '.' + topClasses[i] + '.' + topClasses[j];
|
||||||
groups.push({
|
groups.push({selector, elements,score: scoreGroup(selector, elements)});
|
||||||
selector,
|
|
||||||
elements,
|
|
||||||
score: scoreGroup(selector, elements)
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// 返回得分最高的N个组
|
// 返回得分最高的N个组
|
||||||
return groups
|
return groups.sort((a, b) => b.score - a.score).slice(0, limit);
|
||||||
.sort((a, b) => b.score - a.score)
|
|
||||||
.slice(0, limit);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function findMatchingElements(container, selector) {
|
function findMatchingElements(container, selector) {
|
||||||
@@ -500,7 +492,6 @@ js_findMainList = r'''function findMainList(startElement = null) {
|
|||||||
|
|
||||||
function scoreContainer(container, items) {
|
function scoreContainer(container, items) {
|
||||||
if (!container || items.length < 3) return 0;
|
if (!container || items.length < 3) return 0;
|
||||||
|
|
||||||
// 1. 计算基础面积数据
|
// 1. 计算基础面积数据
|
||||||
const containerRect = container.getBoundingClientRect();
|
const containerRect = container.getBoundingClientRect();
|
||||||
const containerArea = containerRect.width * containerRect.height;
|
const containerArea = containerRect.width * containerRect.height;
|
||||||
@@ -520,14 +511,11 @@ js_findMainList = r'''function findMainList(startElement = null) {
|
|||||||
visibleItems++;
|
visibleItems++;
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
// 如果可见项太少,返回低分
|
// 如果可见项太少,返回低分
|
||||||
if (visibleItems < 3) return 0;
|
if (visibleItems < 3) return 0;
|
||||||
|
|
||||||
// 防止异常值:确保面积不超过容器
|
// 防止异常值:确保面积不超过容器
|
||||||
totalItemArea = Math.min(totalItemArea, containerArea * 0.98);
|
totalItemArea = Math.min(totalItemArea, containerArea * 0.98);
|
||||||
const areaRatio = totalItemArea / containerArea;
|
const areaRatio = totalItemArea / containerArea;
|
||||||
|
|
||||||
// 3. 计算各项评分 - 使用线性插值而非阶梯
|
// 3. 计算各项评分 - 使用线性插值而非阶梯
|
||||||
// 3.2 面积比评分 - 最多40分,连续曲线
|
// 3.2 面积比评分 - 最多40分,连续曲线
|
||||||
// 使用sigmoid函数让评分更平滑
|
// 使用sigmoid函数让评分更平滑
|
||||||
@@ -539,7 +527,6 @@ js_findMainList = r'''function findMainList(startElement = null) {
|
|||||||
const mean = itemAreas.reduce((sum, area) => sum + area, 0) / itemAreas.length;
|
const mean = itemAreas.reduce((sum, area) => sum + area, 0) / itemAreas.length;
|
||||||
const variance = itemAreas.reduce((sum, area) => sum + Math.pow(area - mean, 2), 0) / itemAreas.length;
|
const variance = itemAreas.reduce((sum, area) => sum + Math.pow(area - mean, 2), 0) / itemAreas.length;
|
||||||
const cv = mean > 0 ? Math.sqrt(variance) / mean : 1;
|
const cv = mean > 0 ? Math.sqrt(variance) / mean : 1;
|
||||||
|
|
||||||
// 指数衰减函数,cv越小分数越高
|
// 指数衰减函数,cv越小分数越高
|
||||||
uniformityScore = 20 * Math.exp(-2.5 * cv);
|
uniformityScore = 20 * Math.exp(-2.5 * cv);
|
||||||
}
|
}
|
||||||
@@ -558,10 +545,8 @@ js_findMainList = r'''function findMainList(startElement = null) {
|
|||||||
// 坐标分组并计算行列数
|
// 坐标分组并计算行列数
|
||||||
const uniqueRows = new Set(items.map(item => Math.round(item.getBoundingClientRect().top / 5) * 5)).size;
|
const uniqueRows = new Set(items.map(item => Math.round(item.getBoundingClientRect().top / 5) * 5)).size;
|
||||||
const uniqueCols = new Set(items.map(item => Math.round(item.getBoundingClientRect().left / 5) * 5)).size;
|
const uniqueCols = new Set(items.map(item => Math.round(item.getBoundingClientRect().left / 5) * 5)).size;
|
||||||
|
|
||||||
// 如果是单行或单列,直接给满分;否则评估网格质量
|
// 如果是单行或单列,直接给满分;否则评估网格质量
|
||||||
if (uniqueRows === 1 || uniqueCols === 1) {
|
if (uniqueRows === 1 || uniqueCols === 1) { layoutScore = 20;
|
||||||
layoutScore = 20;
|
|
||||||
} else {
|
} else {
|
||||||
const coverage = Math.min(1, items.length / (uniqueRows * uniqueCols));
|
const coverage = Math.min(1, items.length / (uniqueRows * uniqueCols));
|
||||||
const efficiency = Math.max(0, 1 - (uniqueRows + uniqueCols) / (2 * items.length));
|
const efficiency = Math.max(0, 1 - (uniqueRows + uniqueCols) / (2 * items.length));
|
||||||
@@ -586,168 +571,6 @@ js_findMainList = r'''function findMainList(startElement = null) {
|
|||||||
return totalScore;
|
return totalScore;
|
||||||
}'''
|
}'''
|
||||||
|
|
||||||
js_findMainContent = '''
|
|
||||||
function isLikelyOperationMenu(element) {
|
|
||||||
// 基础尺寸和位置检查
|
|
||||||
const rect = element.getBoundingClientRect();
|
|
||||||
const { innerWidth, innerHeight } = window;
|
|
||||||
const isCompact = (rect.width * rect.height) < (innerWidth * innerHeight * 0.15);
|
|
||||||
if (!isCompact) return false;
|
|
||||||
|
|
||||||
// 边缘检测
|
|
||||||
const edgeProximity = {
|
|
||||||
top: rect.top < 100,
|
|
||||||
left: rect.left < 50,
|
|
||||||
right: innerWidth - rect.right < 50,
|
|
||||||
bottom: innerHeight - rect.bottom < 100
|
|
||||||
};
|
|
||||||
const isAtEdge = Object.values(edgeProximity).some(Boolean);
|
|
||||||
|
|
||||||
// 交互元素分析
|
|
||||||
const links = [...element.querySelectorAll('a')];
|
|
||||||
const buttons = [...element.querySelectorAll('button, [role="button"]')];
|
|
||||||
const allInteractive = [...links, ...buttons];
|
|
||||||
|
|
||||||
// 快速排除: 边缘较大元素通常是导航
|
|
||||||
if (isAtEdge && rect.width > 150 && rect.height > 50 && links.length > 3) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 链接类型分析
|
|
||||||
const linkTypes = links.reduce((types, link) => {
|
|
||||||
const href = link.getAttribute('href') || '';
|
|
||||||
if (href.startsWith('#')) types.hash++;
|
|
||||||
else if (href.startsWith('javascript:')) types.js++;
|
|
||||||
else if (href.includes('://') && !href.includes(location.hostname)) types.external++;
|
|
||||||
else types.internal++;
|
|
||||||
return types;
|
|
||||||
}, { hash: 0, js: 0, external: 0, internal: 0 });
|
|
||||||
|
|
||||||
// 特征评分
|
|
||||||
const operationFeatures = [
|
|
||||||
linkTypes.hash > 0 || linkTypes.js > 0, // 页内操作链接
|
|
||||||
buttons.length > 0, // 有按钮
|
|
||||||
buttons.length > 1,
|
|
||||||
rect.width > rect.height * 1.5 && allInteractive.length <= 6, // 水平排列且元素适量
|
|
||||||
element.querySelectorAll('svg, img, i, [class*="icon"]').length > 0, // 有图标
|
|
||||||
getComputedStyle(element).position !== 'static' && !isAtEdge // 定位但不在边缘
|
|
||||||
];
|
|
||||||
const navigationFeatures = [
|
|
||||||
isAtEdge, // 在页面边缘
|
|
||||||
linkTypes.internal > 3, // 多个内部页面链接
|
|
||||||
links.length === allInteractive.length && links.length > 3 // 全是链接且数量多
|
|
||||||
];
|
|
||||||
const opScore = operationFeatures.filter(Boolean).length;
|
|
||||||
const navScore = navigationFeatures.filter(Boolean).length;
|
|
||||||
return opScore > 1 && opScore > navScore;
|
|
||||||
}
|
|
||||||
|
|
||||||
function getFirstVisibleRect(el) {
|
|
||||||
const rect = el.getBoundingClientRect();
|
|
||||||
|
|
||||||
if (rect.width > 0 && rect.height > 0) {
|
|
||||||
return {
|
|
||||||
left: rect.left, top: rect.top, right: rect.right, bottom: rect.bottom,
|
|
||||||
width: rect.width, height: rect.height, x: rect.x, y: rect.y,
|
|
||||||
zIndex: parseInt(getComputedStyle(el).zIndex) || 0
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!el.querySelector('button, a, input') || !el.innerText.trim()) return rect;
|
|
||||||
|
|
||||||
const visibleChild = Array.from(el.children)
|
|
||||||
.find(child => {
|
|
||||||
const hasContent = child.querySelector('button, a, input') && child.innerText.trim();
|
|
||||||
return hasContent && (
|
|
||||||
child.getBoundingClientRect().width > 0 ||
|
|
||||||
getFirstVisibleRect(child).width > 0
|
|
||||||
);
|
|
||||||
});
|
|
||||||
|
|
||||||
if (!visibleChild) return rect;
|
|
||||||
|
|
||||||
const childRect = visibleChild.getBoundingClientRect();
|
|
||||||
return childRect.width > 0 ?
|
|
||||||
{
|
|
||||||
left: childRect.left, top: childRect.top, right: childRect.right, bottom: childRect.bottom,
|
|
||||||
width: childRect.width, height: childRect.height, x: childRect.x, y: childRect.y,
|
|
||||||
zIndex: parseInt(getComputedStyle(visibleChild).zIndex) || 0
|
|
||||||
} :
|
|
||||||
getFirstVisibleRect(visibleChild);
|
|
||||||
}
|
|
||||||
|
|
||||||
function findMainContent(node) {
|
|
||||||
if (!node?.children?.length) return node;
|
|
||||||
const rectn = node.getBoundingClientRect();
|
|
||||||
const viewportArea = window.innerWidth * window.innerHeight;
|
|
||||||
if (rectn.width * rectn.height < viewportArea * 0.4) return node;
|
|
||||||
|
|
||||||
// 过滤可见元素
|
|
||||||
const children = [...node.children].filter(child => {
|
|
||||||
const style = window.getComputedStyle(child);
|
|
||||||
const hasTextContent = child.textContent.trim().length > 5;
|
|
||||||
return style.display !== 'none' && style.visibility !== 'hidden' && style.opacity !== '0' && hasTextContent;
|
|
||||||
});
|
|
||||||
if (!children.length) return node;
|
|
||||||
if (children.length === 1) return findMainContent(children[0]);
|
|
||||||
if (children.length > 10) return node;
|
|
||||||
if (children.length == 2 && (isLikelyOperationMenu(children[0]) || isLikelyOperationMenu(children[1]))) return node;
|
|
||||||
|
|
||||||
// 计算元素信息
|
|
||||||
const elemInfo = children.map(child => {
|
|
||||||
const rect = getFirstVisibleRect(child);
|
|
||||||
const style = window.getComputedStyle(child);
|
|
||||||
return {
|
|
||||||
element: child, area: rect.width * rect.height, rect, style,
|
|
||||||
zIndex: rect.zIndex || 0, position: style.position
|
|
||||||
};
|
|
||||||
}).sort((a, b) => b.area - a.area);
|
|
||||||
// 检测重叠
|
|
||||||
function isOverlapping(r1, r2) {
|
|
||||||
return !(r1.right <= r2.left || r1.left >= r2.right || r1.bottom <= r2.top || r1.top >= r2.bottom);
|
|
||||||
}
|
|
||||||
// 检查是否有任何重叠的元素对
|
|
||||||
const hasOverlap = elemInfo.some((e1, i) =>
|
|
||||||
elemInfo.slice(i + 1).some(e2 => isOverlapping(e1.rect, e2.rect))
|
|
||||||
);
|
|
||||||
|
|
||||||
console.log(hasOverlap, elemInfo);
|
|
||||||
|
|
||||||
// 无重叠情况: 面积比例判断
|
|
||||||
if (!hasOverlap) {
|
|
||||||
const totalArea = elemInfo.reduce((sum, item) => sum + item.area, 0);
|
|
||||||
const [main, second] = elemInfo;
|
|
||||||
return (main.area / totalArea > 0.6 && (!second || main.area > second.area * 2))
|
|
||||||
? findMainContent(main.element) : node;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 1. 按z-index和定位方式排序
|
|
||||||
const sorted = [...elemInfo].sort((a, b) => {
|
|
||||||
// 非静态定位优先
|
|
||||||
if (a.position !== 'static' && b.position === 'static') return -1;
|
|
||||||
if (a.position === 'static' && b.position !== 'static') return 1;
|
|
||||||
// 其次按z-index排序
|
|
||||||
return b.zIndex - a.zIndex;
|
|
||||||
});
|
|
||||||
|
|
||||||
// 2. 在排序后的列表中找到第一个符合条件的元素
|
|
||||||
const suitable = sorted.find(x => {
|
|
||||||
const el = x.element, rect = x.rect, style = x.style;
|
|
||||||
return Math.abs((rect.left + rect.width/2) - window.innerWidth/2) < window.innerWidth*0.3 &&
|
|
||||||
parseFloat(style.opacity) > 0.1 &&
|
|
||||||
(parseInt(rect.zIndex) > 30 || style.boxShadow !== 'none') &&
|
|
||||||
el.querySelector('button, a, input') !== null;
|
|
||||||
});
|
|
||||||
|
|
||||||
// 3. 找到合适元素则使用它,否则返回面积最大的元素
|
|
||||||
if (suitable) {
|
|
||||||
return findMainContent(suitable.element);
|
|
||||||
} else {
|
|
||||||
const byArea = [...elemInfo].sort((a, b) => b.area - a.area);
|
|
||||||
return findMainContent(byArea[0].element);
|
|
||||||
}
|
|
||||||
} '''
|
|
||||||
|
|
||||||
def optimize_html_for_tokens(html):
|
def optimize_html_for_tokens(html):
|
||||||
if type(html) is str: soup = BeautifulSoup(html, 'html.parser')
|
if type(html) is str: soup = BeautifulSoup(html, 'html.parser')
|
||||||
else: soup = html
|
else: soup = html
|
||||||
@@ -860,24 +683,38 @@ def find_changed_elements(before_html, after_html):
|
|||||||
def get_html(driver, cutlist=False, maxchars=38000, instruction="", extra_js="", text_only=False):
|
def get_html(driver, cutlist=False, maxchars=38000, instruction="", extra_js="", text_only=False):
|
||||||
page = get_main_block(driver, extra_js=extra_js, text_only=text_only)
|
page = get_main_block(driver, extra_js=extra_js, text_only=text_only)
|
||||||
if text_only: return page
|
if text_only: return page
|
||||||
warning = ''
|
|
||||||
soup = optimize_html_for_tokens(page)
|
soup = optimize_html_for_tokens(page)
|
||||||
html = str(soup)
|
html = str(soup)
|
||||||
if not cutlist or len(html) <= maxchars: return html
|
if not cutlist: return html
|
||||||
rr = driver.execute_js(js_findMainList + js_findMainContent + """
|
rr = driver.execute_js(js_findMainList + """return findMainList(document.body);""").get('data', [])
|
||||||
return findMainList(findMainContent(document.body));""").get('data', {})
|
lists = rr if isinstance(rr, list) else ([rr] if isinstance(rr, dict) and rr.get('selector') else [])
|
||||||
sel = rr.get("selector", None) if isinstance(rr, dict) else None
|
if lists: print(f"[cutlist] Found {len(lists)} list(s): {[e.get('selector','?') if isinstance(e,dict) else '?' for e in lists]}")
|
||||||
if sel:
|
for entry in lists:
|
||||||
s = BeautifulSoup(str(soup), "html.parser"); items = s.select(sel)
|
sel = entry.get('selector') if isinstance(entry, dict) else None
|
||||||
|
if not sel: continue
|
||||||
|
items = soup.select(sel)
|
||||||
|
if len(items) < 5: continue
|
||||||
|
total_len = sum(len(str(it)) for it in items)
|
||||||
|
avg_len = total_len / len(items)
|
||||||
|
if avg_len < 500 or (avg_len < 1000 and total_len < len(html) * 0.3): continue
|
||||||
|
print(f"[cutlist] '{sel}': {len(items)} items, avg {avg_len:.0f} chars, total {total_len}, keep 3, save ~{total_len - 3 * avg_len:.0f} chars")
|
||||||
hit = [it for it in items if instruction and instruction.strip() and instruction in it.get_text(" ",strip=True)]
|
hit = [it for it in items if instruction and instruction.strip() and instruction in it.get_text(" ",strip=True)]
|
||||||
keep = hit[:6] if hit else items[:3]
|
keep = hit[:6] if hit else items[:3]
|
||||||
for it in items:
|
removed = [it for it in items if it not in keep]
|
||||||
if it not in keep: it.decompose()
|
sample_texts = []
|
||||||
ss = str(optimize_html_for_tokens(s))
|
for rm in removed[:5]:
|
||||||
warning = '[SYSTEM] Found item list, only show some items ...\n'
|
txt = rm.get_text(" ", strip=True)[:40]
|
||||||
else: ss = html
|
if txt: sample_texts.append(txt)
|
||||||
|
hint_parts = [f'[FAKE ELEMENT] {len(removed)} more items hidden, selector: "{sel}"']
|
||||||
|
if sample_texts: hint_parts.append('Hidden items: ' + ','.join(f'"{t}"' for t in sample_texts))
|
||||||
|
hint_tag = soup.new_tag("div")
|
||||||
|
hint_tag.string = ' '.join(hint_parts)
|
||||||
|
if keep: keep[-1].insert_after(hint_tag)
|
||||||
|
for it in removed: it.decompose()
|
||||||
|
ss = str(optimize_html_for_tokens(soup)) if lists else html
|
||||||
|
if lists: print(f"[cutlist] Result: {len(html)} -> {len(ss)} chars ({100-len(ss)*100//len(html)}% saved)")
|
||||||
if len(ss) > maxchars: ss = ss[:maxchars] + ' ... [TRUNCATED]'
|
if len(ss) > maxchars: ss = ss[:maxchars] + ' ... [TRUNCATED]'
|
||||||
return warning + str(ss)
|
return ss
|
||||||
|
|
||||||
def execute_js_rich(script, driver, no_monitor=False):
|
def execute_js_rich(script, driver, no_monitor=False):
|
||||||
last_html = None
|
last_html = None
|
||||||
|
|||||||
Reference in New Issue
Block a user