- background.js: Add management API, auto-return for JS, refactor new tab capture
- simphtml.py: Fix falsy return value bug, flatten return structure
- agent_loop.py: Simplify UI icons
- manifest.json: Add management permission
- tools_schema: Optimize prompts for web tools
- tmwebdriver_sop.md: Update documentation
871 lines
41 KiB
Python
871 lines
41 KiB
Python
try: from bs4 import BeautifulSoup
|
||
except ImportError: print("[Error] BeautifulSoup4 未安装,请叫Agent安装BeautifulSoup4,再使用web相关工具。")
|
||
|
||
js_optHTML = r'''function optHTML(text_only=false) {
|
||
function createEnhancedDOMCopy() {
    // Layout facts (rect, area, visibility, z-index, key styles) for every cloned element,
    // keyed by the clone so later passes can query them without touching the live DOM.
    const nodeInfo = new WeakMap();
    const ignoreTags = ['SCRIPT', 'STYLE', 'NOSCRIPT', 'META', 'LINK', 'COLGROUP', 'COL', 'TEMPLATE', 'PARAM', 'SOURCE'];
    const ignoreIds = ['ljq-ind'];

    // Alpha of a computed background-color string. Only the four-component rgba() form
    // carries an alpha; an opaque "rgb(r, g, b)" must not have its blue channel read as
    // alpha (otherwise e.g. "rgb(0, 0, 0)" would be treated as fully transparent).
    function backgroundAlpha(color) {
        if (!color || color === 'transparent') return 0;
        const m = color.match(/^rgba\(\s*[^,()]+,\s*[^,()]+,\s*[^,()]+,\s*([\d.]+)\s*\)$/);
        return m ? parseFloat(m[1]) : 1;
    }

    // Recursively clone sourceNode, dropping comments, ignored tags/ids and invisible
    // subtrees. `keep` forces descendants to be retained (used for small dropdowns).
    // Returns the clone, or null when the node is dropped.
    function cloneNode(sourceNode, keep=false) {
        if (sourceNode.nodeType === 8 ||
            (sourceNode.nodeType === 1 && (
                ignoreTags.includes(sourceNode.tagName) ||
                (sourceNode.id && ignoreIds.includes(sourceNode.id))
            ))) {
            return null;
        }
        if (sourceNode.nodeType === 3) return sourceNode.cloneNode(false);
        const clone = sourceNode.cloneNode(false);

        // Persist live form state as attributes, since cloneNode copies attributes only.
        if ((sourceNode.tagName === 'INPUT' || sourceNode.tagName === 'TEXTAREA') && sourceNode.value) clone.setAttribute('value', sourceNode.value);
        if (sourceNode.tagName === 'INPUT' && (sourceNode.type === 'radio' || sourceNode.type === 'checkbox') && sourceNode.checked) clone.setAttribute('checked', '');
        else if (sourceNode.tagName === 'SELECT' && sourceNode.value) clone.setAttribute('data-selected', sourceNode.value);
        // Flag autofilled fields; when the value is not readable, substitute a hint string.
        try { if (sourceNode.matches && sourceNode.matches(':-webkit-autofill')) { clone.setAttribute('data-autofilled', 'true'); if (!sourceNode.value) clone.setAttribute('value', '⚠️受保护-读tmwebdriver_sop的autofill章节提取'); } } catch(e) {}

        // Small dropdown/menu containers are kept even when currently hidden.
        const isDropdown = sourceNode.classList?.contains('dropdown-menu') ||
            /dropdown|menu/i.test(sourceNode.className) || sourceNode.getAttribute('role') === 'menu';
        const _ddItems = isDropdown ? sourceNode.querySelectorAll('a, button, [role="menuitem"], li').length : 0;
        const isSmallDropdown = _ddItems > 0 && _ddItems <= 7 && sourceNode.textContent.length < 500;

        const childNodes = [];
        for (const child of sourceNode.childNodes) {
            const childClone = cloneNode(child, keep || isSmallDropdown);
            if (childClone) childNodes.push(childClone);
        }
        if (sourceNode.tagName === 'IFRAME') {
            // Inline the frame's body under a wrapper div when its document is reachable;
            // any access error is ignored and the frame is left without inlined content.
            try {
                const iDoc = sourceNode.contentDocument || sourceNode.contentWindow?.document;
                if (iDoc && iDoc.body && iDoc.body.children.length > 0) {
                    const wrapper = document.createElement('div');
                    wrapper.setAttribute('data-iframe-content', sourceNode.src || '');
                    for (const ch of iDoc.body.childNodes) {
                        const c = cloneNode(ch, keep);
                        if (c) wrapper.appendChild(c);
                    }
                    if (wrapper.childNodes.length) childNodes.push(wrapper);
                }
            } catch(e) {}
        }
        if (sourceNode.shadowRoot) {
            // Flatten open shadow-root content into the clone's children.
            for (const shadowChild of sourceNode.shadowRoot.childNodes) {
                const shadowClone = cloneNode(shadowChild, keep);
                if (shadowClone) childNodes.push(shadowClone);
            }
        }

        const rect = sourceNode.getBoundingClientRect();
        const style = window.getComputedStyle(sourceNode);
        const area = (style.display === 'none' || style.visibility === 'hidden' || parseFloat(style.opacity) <= 0)?0:rect.width * rect.height;
        const isVisible = (rect.width > 1 && rect.height > 1 &&
            style.display !== 'none' && style.visibility !== 'hidden' &&
            parseFloat(style.opacity) > 0 &&
            Math.abs(rect.left) < 5000 && Math.abs(rect.top) < 5000)
            || isSmallDropdown;
        // z-index only counts for positioned elements; 'auto' parses to NaN and becomes 0.
        const zIndex = style.position !== 'static' ? (parseInt(style.zIndex, 10) || 0) : 0;

        let info = {
            rect, area, isVisible, isSmallDropdown, zIndex,
            style: {
                display: style.display, visibility: style.visibility,
                opacity: style.opacity, position: style.position
            }};

        const nonTextChildren = childNodes.filter(child => child.nodeType !== 3);
        const hasValidChildren = nonTextChildren.length > 0;

        if (hasValidChildren) {
            const childrenInfos = nonTextChildren.map(c => nodeInfo.get(c)).filter(i => i && i.rect && i.rect.width > 0 && i.rect.height > 0);
            const bgAlpha = backgroundAlpha(style.backgroundColor);
            const hasVisualBg = bgAlpha > 0.1 || style.backgroundImage !== 'none' || (style.backdropFilter && style.backdropFilter !== 'none') || style.boxShadow !== 'none';

            if (!hasVisualBg && childrenInfos.length > 0) {
                // A container that paints nothing itself is only as large as the union of its children.
                let minL = Infinity, minT = Infinity, maxR = -Infinity, maxB = -Infinity;
                for (const cInfo of childrenInfos) {
                    minL = Math.min(minL, cInfo.rect.left);
                    minT = Math.min(minT, cInfo.rect.top);
                    maxR = Math.max(maxR, cInfo.rect.right);
                    maxB = Math.max(maxB, cInfo.rect.bottom);
                }
                info.rect = { left: minL, top: minT, right: maxR, bottom: maxB, width: maxR - minL, height: maxB - minT };
                info.area = info.rect.width * info.rect.height;
            } else {
                // Otherwise adopt the largest visible child's info when it dwarfs this node.
                const maxC = childrenInfos.filter(i => i.isVisible).sort((a, b) => b.area - a.area)[0];
                if (maxC && maxC.area > 10000 && (!isVisible || maxC.area > info.area * 5)) info = maxC;
            }
        }
        nodeInfo.set(clone, info);

        if (sourceNode.nodeType === 1 && sourceNode.tagName === 'DIV') {
            if (!hasValidChildren && !sourceNode.textContent.trim()) return null;
        }
        // aria-hidden + not visible = truly hidden (e.g. mobile menus), remove even if has children
        if (sourceNode.getAttribute && sourceNode.getAttribute('aria-hidden') === 'true' && !info.isVisible) {
            return null;
        }
        if (info.isVisible || hasValidChildren || keep) {
            childNodes.forEach(child => clone.appendChild(child));
            return clone;
        }
        return null;
    }

    return {
        domCopy: cloneNode(document.body),
        getNodeInfo: node => nodeInfo.get(node),
        isVisible: node => {
            const info = nodeInfo.get(node);
            return info && info.isVisible;
        }
    };
}
|
||
const { domCopy, getNodeInfo, isVisible } = createEnhancedDOMCopy();
|
||
if (text_only) {
|
||
const blocks = new Set(['DIV','P','H1','H2','H3','H4','H5','H6','LI','TR','SECTION','ARTICLE','HEADER','FOOTER','NAV','BLOCKQUOTE','PRE','HR','BR','DT','DD','FIGCAPTION','DETAILS','SUMMARY']);
|
||
domCopy.querySelectorAll('*').forEach(el => {
|
||
if (blocks.has(el.tagName)) el.insertAdjacentText('beforebegin', '\n');
|
||
});
|
||
domCopy.querySelectorAll('input:not([type=hidden]),textarea,select').forEach(el=>{
|
||
const p=[el.tagName,el.id&&'#'+el.id,el.getAttribute('name')&&'name='+el.getAttribute('name'),el.tagName==='INPUT'&&'type='+(el.getAttribute('type')||'text'),el.getAttribute('placeholder')&&'"'+el.getAttribute('placeholder')+'"',el.getAttribute('data-autofilled')&&'autofilled',el.disabled&&'disabled',el.tagName==='SELECT'&&el.getAttribute('data-selected')&&'="'+el.getAttribute('data-selected')+'"'].filter(Boolean).join(' ');
|
||
el.insertAdjacentText('beforebegin','\n['+p+']\n');
|
||
});
|
||
domCopy.querySelectorAll('button[disabled]').forEach(el=>el.insertAdjacentText('beforebegin','[DISABLED] '));
|
||
return domCopy.textContent;
|
||
}
|
||
const viewportArea = window.innerWidth * window.innerHeight;
|
||
|
||
function analyzeNode(node, pPathType='main') {
    // Non-element nodes are ignored; childless elements are tagged as leaves.
    if (node.nodeType !== 1 || !node.children.length) {
        if (node.nodeType === 1) node.dataset.mark = 'K:leaf';
        return;
    }
    // A subtree previously marked secondary switches the path type for its descendants.
    const pathType = (node.dataset.mark === 'K:secondary') ? 'second' : pPathType;

    const layout = getNodeInfo(node);
    if (!layout || !layout.rect) return;
    // Only containers spanning at least 80% of the viewport in some dimension are split further.
    const box = layout.rect;
    const isSmall = box.width < window.innerWidth * 0.8 && box.height < window.innerHeight * 0.8;
    if (isSmall) return node;
    if (node.tagName === 'TABLE') return;

    const kids = Array.from(node.children);
    if (kids.length === 1) {
        // Single-child wrapper: mark it and descend.
        node.dataset.mark = 'K:container';
        return analyzeNode(kids[0], pathType);
    }
    if (kids.length > 10) return;

    // Collect per-child layout info, largest area first.
    const childrenInfo = kids.map(kid => {
        const kidInfo = getNodeInfo(kid) || { rect: {}, style: {} };
        return {
            node: kid,
            rect: kidInfo.rect,
            style: kidInfo.style,
            area: kidInfo.area,
            zIndex: (kidInfo.zIndex || 0),
            isVisible: kidInfo.isVisible
        };
    });
    childrenInfo.sort((a, b) => b.area - a.area);

    // Decide whether the children partition the container or overlay each other.
    const isOverlay = hasOverlap(childrenInfo);
    if (isOverlay) {
        node.dataset.mark = 'K:overlayParent';
        handleOverlayContainer(childrenInfo, pathType);
    } else {
        node.dataset.mark = 'K:partitionParent';
        handlePartitionContainer(childrenInfo, pathType);
    }

    console.log(`${isOverlay ? '覆盖' : '划分'}容器:`, node, `子元素数量: ${kids.length}`);
    console.log('子元素及标记:', kids.map(kid => ({
        element: kid,
        mark: kid.dataset.mark || '无',
        info: getNodeInfo ? getNodeInfo(kid) : undefined
    })));

    // Recurse into every child that was not marked for removal ("R:..." marks).
    for (const kid of kids) {
        const mark = kid.dataset.mark;
        if (!mark || mark[0] !== 'R') analyzeNode(kid, pathType);
    }
}
|
||
|
||
// 处理划分容器
|
||
// Mark the children of a container whose children tile it without overlapping.
// When one child dominates (more than half of the summed area and more than twice the
// runner-up) it is marked 'K:main' and every other child becomes 'K:secondary' or
// 'K:nonEssential'. Without a dominant child nothing is marked.
//   childrenInfo: array of {node, rect, style, area, ...}; sorted in place by area, descending.
//   pathType: accepted for signature compatibility; not used by the active logic.
function handlePartitionContainer(childrenInfo, pathType) {
    // Guard first: the ratio below dereferences the largest child.
    if (childrenInfo.length === 0) return;

    childrenInfo.sort((a, b) => b.area - a.area);
    const totalArea = childrenInfo.reduce((sum, item) => sum + item.area, 0);
    const largest = childrenInfo[0];
    console.log(largest.area / totalArea);

    const hasMainElement = (largest.area / totalArea > 0.5) &&
        (childrenInfo.length === 1 || largest.area > childrenInfo[1].area * 2);
    // relaxed: skip equalmany filtering, list truncation handles token budget
    if (!hasMainElement) return;

    largest.node.dataset.mark = 'K:main';
    for (let i = 1; i < childrenInfo.length; i++) {
        const child = childrenInfo[i];
        const className = (child.node.getAttribute('class') || '').toLowerCase();
        let isSecondary = containsButton(child.node);
        if (className.includes('nav')) isSecondary = true;
        if (className.includes('breadcrumbs')) isSecondary = true;
        if (className.includes('header') && className.includes('table')) isSecondary = true;
        // Small markup is cheap to keep.
        if (child.node.innerHTML.trim().replace(/\s+/g, '').length < 500) isSecondary = true;
        // Keep anything with substantial text content.
        if (child.node.textContent.trim().length > 200) isSecondary = true;
        // Hidden siblings are never worth keeping, whatever the rules above said.
        if (child.style.visibility === 'hidden') isSecondary = false;
        child.node.dataset.mark = isSecondary ? 'K:secondary' : 'K:nonEssential';
    }
}
|
||
|
||
// True when `container` has a descendant that looks like a button: first by
// tag/role, then by common button-like class naming.
function containsButton(container) {
    const buttonSelectors = [
        'button, input[type="button"], input[type="submit"], [role="button"]',
        '[class*="-btn"], [class*="-button"], .button, .btn, [class*="btn-"]'
    ];
    return buttonSelectors.some(sel => container.querySelector(sel) !== null);
}
|
||
|
||
// Mark children of a container whose children overlap each other. The child with the
// highest z-index is classified (main interactive layer, top bar, message, or floating
// ad); the remaining children are either marked as covered or classified recursively.
function handleOverlayContainer(childrenInfo, pathType) {
    // Ground truth from the browser: whichever child owns the element at the viewport
    // centre (matched by id while walking up its ancestors) is promoted to the top.
    const hit = document.elementFromPoint(window.innerWidth/2, window.innerHeight/2);
    for (let el = hit; el; el = el.parentElement) {
        const owner = childrenInfo.find(c => c.node.id && c.node.id === el.id);
        if (owner) {
            owner.zIndex = 9999;
            break;
        }
    }

    const sorted = [...childrenInfo].sort((a, b) => b.zIndex - a.zIndex);
    console.log('排序后的子元素:', sorted);
    if (sorted.length === 0) return;

    const top = sorted[0];
    const others = sorted.slice(1);
    const topNode = top.node;
    const rect = top.rect;

    // Content heuristics for the top layer.
    const isComplex = topNode.querySelectorAll('input, select, textarea, button, a, [role="button"]').length >= 1;
    const textLength = (topNode.textContent?.trim() || '').length;
    const hasLinks = topNode.querySelectorAll('a').length > 0;
    const isMostlyText = textLength > 7 && !hasLinks;

    // Geometry heuristics relative to the viewport.
    const widthRatio = rect.width / window.innerWidth;
    const heightRatio = rect.height / window.innerHeight;
    const centerDiff = Math.abs((rect.left + rect.width/2) - window.innerWidth/2) / window.innerWidth;
    const minDimensionRatio = Math.min(widthRatio, heightRatio);
    const maxDimensionRatio = Math.max(widthRatio, heightRatio);
    const isNearTop = rect.top < 50;
    const isDialog = (topNode.querySelector('iframe') || topNode.querySelector('button') || topNode.querySelector('input')) && centerDiff < 0.3;

    const isCenteredPanel = isComplex && centerDiff < 0.2 &&
        ((minDimensionRatio > 0.2 && widthRatio < 0.98) || minDimensionRatio > 0.95);

    if (isCenteredPanel) {
        topNode.dataset.mark = 'K:mainInteractive';
        const topZ = parseInt(top.zIndex) || 0;
        for (const e of others) {
            const z = parseInt(e.zIndex) || 0;
            e.node.dataset.mark = z <= topZ ? 'R:covered' : 'K:noncovered';
        }
        return;
    }

    if (isComplex && isNearTop && maxDimensionRatio > 0.4 && top.isVisible) {
        topNode.dataset.mark = 'K:topBar';
    } else if (isMostlyText || isComplex || isDialog) {
        topNode.dataset.mark = 'K:messageContent';
    } else {
        topNode.dataset.mark = 'R:floatingAd';
    }

    // Classify whatever sits beneath the top layer with the matching strategy.
    if (others.length) {
        if (hasOverlap(others)) handleOverlayContainer(others, pathType);
        else handlePartitionContainer(others, pathType);
    }
}
|
||
|
||
// True when any two items' rects intersect by more than a 1px tolerance.
// Rects with a zero or missing width/height never count as overlapping.
function hasOverlap(items) {
    const epsilon = 1;
    const originX = r => (r.x !== undefined ? r.x : r.left);
    const originY = r => (r.y !== undefined ? r.y : r.top);

    for (let i = 0; i < items.length; i++) {
        for (let j = i + 1; j < items.length; j++) {
            const r1 = items[i].rect;
            const r2 = items[j].rect;
            if (!r1.width || !r2.width || !r1.height || !r2.height) continue;

            const x1 = originX(r1), y1 = originY(r1);
            const x2 = originX(r2), y2 = originY(r2);
            const apartHorizontally = x1 + r1.width <= x2 + epsilon || x1 >= x2 + r2.width - epsilon;
            const apartVertically = y1 + r1.height <= y2 + epsilon || y1 >= y2 + r2.height - epsilon;
            if (!(apartHorizontally || apartVertically)) return true;
        }
    }
    return false;
}
|
||
|
||
// Hoist top 1-2 deep fixed dialogs to body level for overlay detection
|
||
const _fc = [...domCopy.querySelectorAll('*')].filter(el => {
|
||
if (el.parentNode === domCopy) return false;
|
||
const info = getNodeInfo(el);
|
||
if (!info?.rect || info.style.position !== 'fixed') return false;
|
||
const r = info.rect, cover = (r.width * r.height) / viewportArea;
|
||
const cd = Math.abs((r.left + r.width/2) - window.innerWidth/2) / window.innerWidth;
|
||
return cover > 0.15 && cover < 1.0 && cd < 0.3 && el.querySelector('button, input, a, [role="button"], iframe');
|
||
}).filter((el, _, arr) => !arr.some(o => o !== el && o.contains(el)))
|
||
.sort((a, b) => (getNodeInfo(b).rect.width * getNodeInfo(b).rect.height) - (getNodeInfo(a).rect.width * getNodeInfo(a).rect.height))
|
||
.slice(0, 2);
|
||
_fc.forEach(el => { const r = getNodeInfo(el).rect; console.log('[simphtml] Hoisted fixed dialog:', el.tagName + (el.id ? '#'+el.id : '') + (el.className ? '.'+String(el.className).split(' ')[0] : ''), Math.round(r.width)+'x'+Math.round(r.height), Math.round(100*r.width*r.height/viewportArea)+'%'); el.parentNode.removeChild(el); domCopy.appendChild(el); });
|
||
const result = analyzeNode(domCopy);
|
||
domCopy.querySelectorAll('[data-mark^="R:"]').forEach(el=>el.parentNode?.removeChild(el));
|
||
let root = domCopy;
|
||
while (root.children.length === 1) {
|
||
root = root.children[0];
|
||
}
|
||
for (let ii = 0; ii < 3; ii++) {
|
||
root.querySelectorAll('div').forEach(div => (!div.textContent.trim() && div.children.length === 0) && div.remove());
|
||
}
|
||
root.querySelectorAll('[data-mark]').forEach(e => e.removeAttribute('data-mark'));
|
||
root.removeAttribute('data-mark');
|
||
root.querySelectorAll('iframe').forEach(f => {
|
||
if (f.children.length) {
|
||
const d = document.createElement('div');
|
||
for (const a of f.attributes) d.setAttribute(a.name, a.value);
|
||
d.setAttribute('data-tag', 'iframe');
|
||
while (f.firstChild) d.appendChild(f.firstChild);
|
||
f.parentNode.replaceChild(d, f);
|
||
}
|
||
});
|
||
return root.outerHTML;
|
||
}
|
||
optHTML()'''
|
||
|
||
js_findMainList = r'''function findMainList(startElement = null) {
|
||
const root = startElement || document.body;
|
||
const MIN_CHILDREN = 8;
|
||
const MAX_CONTAINERS = 20;
|
||
|
||
// 全局扫描:收集候选容器,按 l1 + l2*0.1 排序(l2=孙子元素数,捕获表格等多层结构)
|
||
const candidates = [];
|
||
const allEls = root.querySelectorAll('*');
|
||
for (const node of allEls) {
|
||
if (node.closest('svg')) continue;
|
||
const l1 = node.children.length;
|
||
if (l1 < 5) continue;
|
||
let l2 = 0;
|
||
for (const child of node.children) l2 += child.children.length;
|
||
const score = l1 + l2 * 0.1;
|
||
if (score >= MIN_CHILDREN) candidates.push({node, score});
|
||
}
|
||
candidates.sort((a, b) => b.score - a.score);
|
||
const toProcess = candidates.slice(0, MAX_CONTAINERS).map(c => c.node);
|
||
|
||
// 对每个容器找候选组并评分
|
||
let allCandidates = [];
|
||
for (const container of toProcess) {
|
||
const topGroups = findTopGroups(container, 3);
|
||
for (const groupInfo of topGroups) {
|
||
const items = findMatchingElements(container, groupInfo.selector);
|
||
if (items.length >= 5) {
|
||
const score = scoreContainer(container, items) + groupInfo.score;
|
||
if (score >= 30) {
|
||
allCandidates.push({ container, selector: groupInfo.selector, items, score });
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// 按分数降序排列
|
||
allCandidates.sort((a, b) => b.score - a.score);
|
||
|
||
// 去重:移除与更高分候选重叠超50%的结果
|
||
const kept = [];
|
||
for (const cand of allCandidates) {
|
||
let dominated = false;
|
||
for (const k of kept) {
|
||
if (k.container.contains(cand.container) || cand.container.contains(k.container)) {
|
||
const kSet = new Set(k.items);
|
||
const overlap = cand.items.filter(it => kSet.has(it)).length;
|
||
if (overlap > cand.items.length * 0.5) { dominated = true; break; }
|
||
}
|
||
}
|
||
if (!dominated) kept.push(cand);
|
||
}
|
||
|
||
function describeResult(container, items, selector, score) {
|
||
if(container&&!container.id)container.id='_ljq'+(window._lci=(window._lci||0)+1);
|
||
const cTag = container ? container.tagName : null;
|
||
const cId = container ? (container.id || '') : '';
|
||
const cClass = container ? (String(container.className || '').trim()) : '';
|
||
const result = {
|
||
containerTag: cTag, containerId: cId, containerClass: cClass,
|
||
itemCount: items.length,
|
||
};
|
||
let prefix = '';
|
||
if (cId) prefix = '#' + CSS.escape(cId);
|
||
if (selector) result.selector = prefix ? (prefix + ' > ' + selector) : selector;
|
||
if (score !== undefined) result.score = score;
|
||
if (items.length > 0) {
|
||
result.firstItemPreview = items[0].outerHTML.substring(0, 200);
|
||
result.itemTags = items.slice(0, 10).map(el => el.tagName + (el.className ? '.' + String(el.className).trim().split(/\s+/)[0] : ''));
|
||
}
|
||
return result;
|
||
}
|
||
|
||
if (kept.length === 0) return [];
|
||
|
||
return kept.map(c => describeResult(c.container, c.items, c.selector, c.score));
|
||
}
|
||
|
||
// Find the `limit` best selector groups (tag, class, tag+class, class+class) among the
// direct children of `container`. Each group is {selector, elements, score}.
function findTopGroups(container, limit) {
    const children = Array.from(container.children).filter(c => !c.closest('svg'));
    const totalChildren = children.length;
    if (totalChildren < 3) return [];

    const minGroupSize = Math.max(3, Math.floor(totalChildren * 0.2));

    // Bucket children by tag name and by each class token (td children are skipped).
    const byTag = {};
    const byClass = {};
    for (const child of children) {
        const tag = child.tagName.toLowerCase();
        if (tag === "td") continue;
        (byTag[tag] = byTag[tag] || []).push(child);
        if (!child.className) continue;
        for (const cls of child.className.trim().split(/\s+/)) {
            if (cls) (byClass[cls] = byClass[cls] || []).push(child);
        }
    }

    // Score = half coverage of the children, half selector specificity.
    const scoreGroup = (selector, elements) => {
        const coverage = elements.length / totalChildren;
        let specificity;
        if (selector.startsWith('.')) {
            specificity = 0.6 + (selector.match(/\./g).length - 1) * 0.1;   // class selector
        } else if (selector.includes('.')) {
            specificity = 0.7 + (selector.match(/\./g).length) * 0.1;       // tag + class
        } else {
            specificity = 0.3;                                              // bare tag
        }
        return (coverage * 0.5) + (specificity * 0.5);
    };

    const frequentKeys = buckets => Object.keys(buckets).filter(k => buckets[k].length >= minGroupSize);
    const hasClass = (el, cls) => el.className && el.className.split(/\s+/).includes(cls);
    const groups = [];

    // Tag groups (bare div is too generic; other tags are penalised by 0.5).
    for (const tag of frequentKeys(byTag)) {
        if (tag === "div") continue;
        groups.push({
            selector: tag,
            elements: byTag[tag],
            score: scoreGroup(tag, byTag[tag]) - 0.5
        });
    }

    // Class groups.
    for (const cls of frequentKeys(byClass)) {
        const selector = '.' + CSS.escape(cls);
        groups.push({
            selector,
            elements: byClass[cls],
            score: scoreGroup(selector, byClass[cls])
        });
    }

    // Combinations are built from at most three frequent tags and the three most frequent classes.
    const topTags = frequentKeys(byTag).slice(0, 3);
    const topClasses = frequentKeys(byClass)
        .sort((a, b) => byClass[b].length - byClass[a].length)
        .slice(0, 3);

    // Tag + class.
    for (const tag of topTags) {
        for (const cls of topClasses) {
            const elements = children.filter(el => el.tagName.toLowerCase() === tag && hasClass(el, cls));
            if (elements.length >= minGroupSize) {
                const selector = tag + '.' + CSS.escape(cls);
                groups.push({selector, elements, score: scoreGroup(selector, elements)});
            }
        }
    }

    // Class + class.
    for (let i = 0; i < topClasses.length; i++) {
        for (let j = i + 1; j < topClasses.length; j++) {
            const elements = children.filter(el => hasClass(el, topClasses[i]) && hasClass(el, topClasses[j]));
            if (elements.length >= minGroupSize) {
                const selector = '.' + CSS.escape(topClasses[i]) + '.' + CSS.escape(topClasses[j]);
                groups.push({selector, elements, score: scoreGroup(selector, elements)});
            }
        }
    }

    // Highest-scoring groups first.
    return groups.sort((a, b) => b.score - a.score).slice(0, limit);
}
|
||
|
||
// All descendants of `container` matching `selector`, as an array.
// An invalid selector is logged and yields an empty array instead of throwing.
function findMatchingElements(container, selector) {
    try {
        const found = container.querySelectorAll(selector);
        return Array.from(found);
    } catch (err) {
        console.error('Invalid selector:', selector, err);
        return [];
    }
}
|
||
|
||
function scoreContainer(container, items) {
|
||
if (!container || items.length < 3) return 0;
|
||
// 1. 计算基础面积数据
|
||
const containerRect = container.getBoundingClientRect();
|
||
const containerArea = containerRect.width * containerRect.height;
|
||
if (containerArea < 10000) return 0; // 容器太小
|
||
|
||
// 收集列表项面积数据
|
||
const itemAreas = [];
|
||
let totalItemArea = 0;
|
||
let visibleItems = 0;
|
||
|
||
items.forEach(item => {
|
||
const rect = item.getBoundingClientRect();
|
||
const area = rect.width * rect.height;
|
||
if (area > 0) {
|
||
totalItemArea += area;
|
||
itemAreas.push(area);
|
||
visibleItems++;
|
||
}
|
||
});
|
||
// 如果可见项太少,返回低分
|
||
if (visibleItems < 3) return 0;
|
||
// 防止异常值:确保面积不超过容器
|
||
totalItemArea = Math.min(totalItemArea, containerArea * 0.98);
|
||
const areaRatio = totalItemArea / containerArea;
|
||
// 3. 计算各项评分 - 使用线性插值而非阶梯
|
||
// 3.2 面积比评分 - 最多40分,连续曲线
|
||
// 使用sigmoid函数让评分更平滑
|
||
const areaScore = 40 / (1 + Math.exp(-12 * (areaRatio - 0.4)));
|
||
|
||
// 3.3 均匀性评分 - 最多20分,连续曲线
|
||
let uniformityScore = 0;
|
||
if (itemAreas.length >= 3) {
|
||
const mean = itemAreas.reduce((sum, area) => sum + area, 0) / itemAreas.length;
|
||
const variance = itemAreas.reduce((sum, area) => sum + Math.pow(area - mean, 2), 0) / itemAreas.length;
|
||
const cv = mean > 0 ? Math.sqrt(variance) / mean : 1;
|
||
// 指数衰减函数,cv越小分数越高
|
||
uniformityScore = 20 * Math.exp(-2.5 * cv);
|
||
}
|
||
|
||
const baseScore = Math.log2(visibleItems) * 5 + Math.floor(visibleItems / 5) * 0.25;
|
||
const rawCountScore = Math.min(40, baseScore);
|
||
const countScore = rawCountScore * Math.max(0.1, uniformityScore / 20);
|
||
|
||
// 3.4 容器尺寸评分 - 最多15分,连续曲线
|
||
const viewportArea = window.innerWidth * window.innerHeight;
|
||
const containerViewportRatio = containerArea / viewportArea;
|
||
const sizeScore = 2 * (1 - 1/(1 + Math.exp(-10 * (containerViewportRatio - 0.25))));
|
||
|
||
let layoutScore = 0;
|
||
if (items.length >= 3) {
|
||
// 坐标分组并计算行列数
|
||
const uniqueRows = new Set(items.map(item => Math.round(item.getBoundingClientRect().top / 5) * 5)).size;
|
||
const uniqueCols = new Set(items.map(item => Math.round(item.getBoundingClientRect().left / 5) * 5)).size;
|
||
// 如果是单行或单列,直接给满分;否则评估网格质量
|
||
if (uniqueRows === 1 || uniqueCols === 1) { layoutScore = 20;
|
||
} else {
|
||
const coverage = Math.min(1, items.length / (uniqueRows * uniqueCols));
|
||
const efficiency = Math.max(0, 1 - (uniqueRows + uniqueCols) / (2 * items.length));
|
||
layoutScore = 20 * (0.7 * coverage + 0.3 * efficiency);
|
||
}
|
||
}
|
||
|
||
// 总分 - 仍然保持100分左右的总分
|
||
const totalScore = countScore + areaScore + uniformityScore + layoutScore + sizeScore;
|
||
|
||
if (totalScore > 100)
|
||
console.log(container, {
|
||
total: totalScore.toFixed(2),
|
||
count: countScore.toFixed(2),
|
||
areaRatio: areaRatio.toFixed(2),
|
||
area: areaScore.toFixed(2),
|
||
uniformity: uniformityScore.toFixed(2),
|
||
size: sizeScore.toFixed(2),
|
||
layout: layoutScore.toFixed(2)
|
||
});
|
||
|
||
return totalScore;
|
||
}'''
|
||
|
||
def optimize_html_for_tokens(html):
    """Strip token-heavy markup from an HTML tree, in place.

    Args:
        html: an HTML string (parsed with BeautifulSoup's 'html.parser') or an
            already-parsed soup/tag object, which is modified directly.

    Returns:
        The soup object after these reductions:
        - every <svg> is emptied and loses all attributes;
        - ``src`` data URIs become '__img__'; ``src``/``action`` values longer than
          30 chars become '__url__' and long ``href`` values become '__link__';
        - ``value``/``title``/``alt`` longer than 100 chars are cut to 50 chars + ' ...';
        - attributes outside the allow-list are dropped, except ``data-*`` ones, which
          are kept (``data-v*`` is dropped; string values over 20 chars become '__data__').
    """
    kept_attrs = frozenset((
        'id', 'class', 'name', 'src', 'href', 'alt', 'value', 'type', 'placeholder',
        'disabled', 'checked', 'selected', 'readonly', 'required', 'multiple',
        'role', 'aria-label', 'aria-expanded', 'aria-hidden', 'contenteditable',
        'title', 'for', 'action', 'method', 'target', 'colspan', 'rowspan',
    ))

    soup = BeautifulSoup(html, 'html.parser') if isinstance(html, str) else html

    for svg in soup.find_all('svg'):
        svg.clear()
        svg.attrs = {}

    for tag in soup.find_all(True):
        # Collapse long or inline URLs into short placeholders.
        if tag.has_attr('src') and isinstance(tag['src'], str):
            if tag['src'].startswith('data:'):
                tag['src'] = '__img__'
            elif len(tag['src']) > 30:
                tag['src'] = '__url__'
        if tag.has_attr('href') and isinstance(tag['href'], str) and len(tag['href']) > 30:
            tag['href'] = '__link__'
        if tag.has_attr('action') and isinstance(tag['action'], str) and len(tag['action']) > 30:
            tag['action'] = '__url__'

        # Truncate long human-readable attribute values.
        for name in ('value', 'title', 'alt'):
            if tag.has_attr(name) and isinstance(tag[name], str) and len(tag[name]) > 100:
                tag[name] = tag[name][:50] + ' ...'

        # Drop everything outside the allow-list; 'style' goes here too, so no
        # separate style-stripping pass is needed.
        for attr in list(tag.attrs):
            if attr in kept_attrs:
                continue
            if attr.startswith('data-v'):
                tag.attrs.pop(attr, None)
            elif attr.startswith('data-'):
                if isinstance(tag[attr], str) and len(tag[attr]) > 20:
                    tag[attr] = '__data__'
            else:
                tag.attrs.pop(attr, None)
    return soup
|
||
|
||
|
||
# Page-side script: samples the first 20 characters of every text node longer than
# 10 characters (skipping samples that contain '_') every `interval` ms, recording the
# initial snapshot in window._tm.init and everything seen afterwards in window._tm.all.
temp_monitor_js = """function startStrMonitor(interval) {
    // Stop a previously started monitor so only one timer is active.
    if (window._tm && window._tm.id) clearInterval(window._tm.id);

    const collectSamples = () => {
        const samples = new Set();
        const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT);
        for (let node = walker.nextNode(); node; node = walker.nextNode()) {
            const text = node.textContent.trim();
            if (!text || text.length <= 10) continue;
            const head = text.substring(0, 20);
            if (!head.includes('_')) samples.add(head);
        }
        return samples;
    };

    window._tm = {extract: collectSamples};
    window._tm.init = window._tm.extract();
    window._tm.all = new Set();
    window._tm.id = setInterval(() => window._tm.extract().forEach(t => window._tm.all.add(t)), interval);
}
startStrMonitor(450);
"""
|
||
def start_temp_monitor(driver):
    """Inject the transient-text monitor script into the current page.

    Best effort: a failure to run the script is reported and otherwise ignored so the
    caller's flow continues. Only ``Exception`` is caught, so KeyboardInterrupt and
    SystemExit still propagate.

    Args:
        driver: object exposing ``execute_js(script)``.
    """
    try:
        driver.execute_js(temp_monitor_js)
    except Exception as e:
        print(f"[temp_monitor] start failed: {e}")
|
||
|
||
def get_temp_texts(driver):
    """Stop the page-side text monitor and return the transient text samples it saw.

    The script returns the samples observed after monitoring started that were not in
    the initial snapshot; when there are 8 or more of those, only the ones no longer
    present on the page are returned. The monitor state (window._tm) is deleted.

    Args:
        driver: object exposing ``execute_js(script)`` returning a dict whose 'data'
            entry holds the script result.

    Returns:
        list: unique samples in first-seen order; [] when nothing was collected, the
        result is missing/null, or the call fails (the error is printed).
    """
    js = """function stopStrMonitor() {
    if (!window._tm) return [];
    clearInterval(window._tm.id);
    const final = window._tm.extract();
    const newlySeen = [...window._tm.all].filter(t => !window._tm.init.has(t));
    let result;
    if (newlySeen.length < 8) {
        result = newlySeen;
    } else {
        result = newlySeen.filter(t => !final.has(t));
    }
    delete window._tm;
    return result;
}
stopStrMonitor();
"""
    try:
        # `or []` also covers a 'data' key that is present but null.
        data = driver.execute_js(js).get('data') or []
        # dict.fromkeys de-duplicates while keeping a deterministic order.
        return list(dict.fromkeys(data))
    except Exception as e:
        print(e)
        return []
|
||
|
||
import time, re, os
|
||
def get_main_block(driver, extra_js="", text_only=False):
    """Run the page-simplification script and return its result as a string.

    Args:
        driver: object exposing ``execute_js(script)`` returning a dict whose 'data'
            entry holds the script result.
        extra_js: JavaScript source executed before the simplification script.
        text_only: when True, request the text rendering and normalise its whitespace
            (collapse space runs, strip leading spaces per line, squeeze 3+ blank
            lines to one, strip the ends).

    Returns:
        str: simplified HTML, or normalised text when ``text_only`` is set. A missing
        or null result yields '' instead of None so callers can always treat the
        value as text.
    """
    script = f"{extra_js}\n{js_optHTML}\nreturn optHTML({str(text_only).lower()});"
    page = driver.execute_js(script).get('data')
    if page is None:
        page = ''
    elif not isinstance(page, str):
        page = str(page)
    if not text_only:
        return page
    page = re.sub(r' {2,}', ' ', page)             # runs of spaces -> one space
    page = re.sub(r'^ +', '', page, flags=re.M)    # drop leading spaces on each line
    page = re.sub(r'(\n\s*){3,}', '\n\n', page)    # 3+ consecutive line breaks -> one blank line
    return page.strip()
|
||
|
||
def find_changed_elements(before_html, after_html):
    """Summarise which elements of ``after_html`` are new or changed versus ``before_html``.

    Elements are compared by a signature built from tag name, attributes (ignoring
    'data-track-id') and the element's own direct text. If no signature is new but the
    serialised documents differ, elements are compared position by position instead.

    Returns:
        dict: {"changed": <count>} plus "top_change", the serialised largest changed
        element whose parent is not itself changed (cut to 2000 chars with a
        '...[TRUNCATED]' suffix), when there is one.
    """
    soup_before = BeautifulSoup(before_html, 'html.parser')
    soup_after = BeautifulSoup(after_html, 'html.parser')

    def signature(el):
        own_text = ''.join(t.strip() for t in el.find_all(string=True, recursive=False)).strip()
        attrs = {k: v for k, v in el.attrs.items() if k != 'data-track-id'}
        return f"{el.name}:{attrs}:{own_text}"

    def index_by_signature(soup):
        index = {}
        for el in soup.find_all(True):
            index.setdefault(signature(el), []).append(el)
        return index

    old_index = index_by_signature(soup_before)
    new_index = index_by_signature(soup_after)

    # An element counts as changed when its signature appears more often than before.
    changed = []
    for sig, els in new_index.items():
        surplus = len(els) - len(old_index.get(sig, ()))
        if surplus > 0:
            changed.extend(els[:surplus])

    # Fallback: same signature multiset but different documents -> positional diff.
    if not changed and str(soup_before) != str(soup_after):
        pairs = zip(soup_before.find_all(True), soup_after.find_all(True))
        changed.extend(new for old, new in pairs if signature(old) != signature(new))

    # Change boundaries: changed elements whose parent is not itself changed.
    changed_ids = {id(el) for el in changed}
    boundaries = [el for el in changed if el.parent is None or id(el.parent) not in changed_ids]
    top = max(boundaries, key=lambda el: len(str(el))) if boundaries else None

    result = {"changed": len(changed)}
    if top:
        markup = str(top)
        if len(markup) > 2000:
            markup = markup[:2000] + '...[TRUNCATED]'
        result["top_change"] = markup
    return result
|
||
|
||
def get_html(driver, cutlist=False, maxchars=35000, instruction="", extra_js="", text_only=False):
    """Return the current page as token-optimized HTML, or as plain text.

    Args:
        driver: object whose ``execute_js(script)`` returns a dict; payloads
            are read from its ``'data'`` key.
        cutlist: when True, long repeated lists reported by the in-page
            ``findMainList`` helper are collapsed to a few items followed by a
            ``[FAKE ELEMENT]`` hint, and the result is limited to ``maxchars``.
        maxchars: size limit; only enforced on the ``cutlist`` path.
        instruction: optional text. List items whose visible text contains it
            are kept (at most 6) instead of the first 3 items.
        extra_js: JavaScript forwarded to ``get_main_block``.
        text_only: when True, the output of ``get_main_block`` is returned as is.

    Returns:
        str: page text or HTML.
    """
    # List detection is executed before the page snapshot, as it always was.
    found = None
    if cutlist:
        found = driver.execute_js(js_findMainList + "return findMainList(document.body);").get('data', [])
    page = get_main_block(driver, extra_js=extra_js, text_only=text_only)
    if text_only: return page

    soup = optimize_html_for_tokens(page)
    # Placeholder divs tagged as iframes are turned back into <iframe> elements.
    for div in soup.select('div[data-tag="iframe"]'):
        div.name = 'iframe'
        del div['data-tag']
    html = str(soup)
    if not cutlist: return html

    # findMainList may answer with a list of entries or a single entry dict.
    if isinstance(found, list): lists = found
    elif isinstance(found, dict) and found.get('selector'): lists = [found]
    else: lists = []
    if lists: print(f"[cutlist] Found {len(lists)} list(s): {[e.get('selector','?') if isinstance(e,dict) else '?' for e in lists]}")

    wanted = instruction if instruction and instruction.strip() else ""
    for entry in lists:
        sel = entry.get('selector') if isinstance(entry, dict) else None
        if not sel: continue
        try: items = soup.select(sel)
        except Exception: print(f'[cutlist] skip invalid selector: {sel}'); continue
        if len(items) < 5: continue
        total_len = sum(len(str(it)) for it in items)
        avg_len = total_len / len(items)
        print(f"[cutlist] '{sel}': {len(items)} items, avg {avg_len:.0f} chars, total {total_len}, if keep 3, save ~{total_len - 3 * avg_len:.0f} chars")
        # Lists made of small items are not worth collapsing.
        if avg_len < 200 or (avg_len < 700 and total_len < 2500): continue

        hit = [it for it in items if wanted and wanted in it.get_text(" ", strip=True)]
        keep = hit[:6] if hit else items[:3]
        # Membership must be by identity: bs4 tags compare equal when their
        # markup is identical, so `it not in keep` would spare every duplicate
        # of a kept item and under-report the number of hidden items.
        keep_ids = {id(it) for it in keep}
        removed = [it for it in items if id(it) not in keep_ids]

        sample_texts = []
        for rm in removed[:5]:
            txt = rm.get_text(" ", strip=True)[:40]
            if txt: sample_texts.append(txt)
        hint_parts = [f'[FAKE ELEMENT] {len(removed)} more items hidden, selector: "{sel}"']
        if sample_texts: hint_parts.append('Hidden items: ' + ','.join(f'"{t}"' for t in sample_texts))
        hint_tag = soup.new_tag("div")
        hint_tag.string = ' '.join(hint_parts)
        if keep: keep[-1].insert_after(hint_tag)
        for it in removed: it.decompose()

    ss = str(optimize_html_for_tokens(soup)) if lists else html
    if lists:
        # Guard the percentage against an empty baseline document.
        saved_pct = (100 - len(ss) * 100 // len(html)) if html else 0
        print(f"[cutlist] Result: {len(html)} -> {len(ss)} chars ({saved_pct}% saved)")
    if len(ss) > maxchars: ss = str(smart_truncate(soup, maxchars))
    return ss
|
||
|
||
def smart_truncate(soup, budget, _depth=0):
    """Truncate ``soup`` in place so that its serialized size approaches ``budget`` characters.

    Strategy: descend through single-child elements to find a branching point;
    if the three largest children can absorb the overage, share it among them
    proportionally; otherwise delete children starting from the tail.

    Args:
        soup: a BeautifulSoup document or tag; it is modified in place.
        budget: target size in characters of ``str(soup)``.
        _depth: recursion depth, used only to indent the log output.

    Returns:
        The same ``soup`` object that was passed in.
    """
    CUT_THRESHOLD = 8000 # below this size the tail is cut directly; above it recursion continues to find a branching point
    indent = ' ' * _depth
    def cut(ele, keep):
        # Hard-truncate the inner markup of `ele` so that str(ele) is about
        # `keep` characters, appending a "[TRUNCATED ...]" marker.
        from bs4 import NavigableString
        s = str(ele)
        over = len(s) - keep
        if over <= 0: return
        # Protect the FAKE ELEMENT hint tags: detach them before cutting
        protected = [c.extract() for c in ele.find_all(lambda tag: tag.string and '[FAKE ELEMENT]' in tag.string)]
        # Re-measure now that the hint tags are detached
        s = str(ele)
        over = len(s) - keep
        if over <= 0:
            for p in protected: ele.append(p)
            return
        marker = f' [TRUNCATED {over//1000}k chars]'
        inner = ele.decode_contents()
        tag_overhead = len(s) - len(inner)
        inner_keep = max(keep - tag_overhead - len(marker), 0)
        ele.clear()
        if inner_keep > 0:
            # The sliced markup is re-parsed, so a tag cut in the middle is repaired by the parser
            ele.append(BeautifulSoup(inner[:inner_keep], 'html.parser'))
        ele.append(NavigableString(marker))
        # Detached hint tags are re-attached at the end of the element
        for p in protected: ele.append(p)
    total = len(str(soup))
    if total <= budget: return soup
    # Candidate children: element nodes only, excluding FAKE ELEMENT hint tags
    kids = [(c, len(str(c))) for c in soup.children if c.name and not (c.string and '[FAKE ELEMENT]' in c.string)]
    # NOTE(review): a node without eligible element children is returned unchanged even when it is over budget
    if not kids: return soup
    # Characters not attributable to the candidate children (own tag, text nodes, hint tags)
    selflen = total - sum(l for _, l in kids)
    remaining_budget = max(budget - selflen, 0)
    tag = getattr(soup, 'name', '?')
    print(f'{indent}[smart_truncate] <{tag}> total={total} budget={budget} selflen={selflen} kids={len(kids)}')
    # === 1 kid: pass straight through ===
    if len(kids) == 1:
        print(f'{indent} -> single child, recurse into <{kids[0][0].name}>')
        smart_truncate(kids[0][0], remaining_budget, _depth)
        return soup
    over = sum(l for _, l in kids) - remaining_budget
    if over <= 0: return soup
    # Check whether the top 3 children can absorb the overage
    ranked = sorted(range(len(kids)), key=lambda i: kids[i][1], reverse=True)
    tops = list(ranked[:min(3, len(ranked))])
    top_total = sum(kids[i][1] for i in tops)
    if top_total < over:
        # === the top 3 cannot absorb it: delete children from the tail ===
        removed = 0
        removed_count = 0
        while kids and removed < over:
            c, l = kids.pop(); c.decompose()
            removed += l; removed_count += 1
        print(f'{indent} -> tail-cut: removed {removed_count} children ({removed//1000}k chars) from end')
        return soup
    # === top 2-3 share the overage proportionally ===
    # Drop kids that are too small (under 10% of the largest) so the big ones carry everything
    max_size = kids[ranked[0]][1]
    filtered = [i for i in tops if kids[i][1] >= max_size * 0.1]
    filtered_total = sum(kids[i][1] for i in filtered)
    if filtered_total >= over:
        tops, top_total = filtered, filtered_total
    # First print the whole allocation plan
    actions = []
    for i in tops:
        c, l = kids[i]
        share = int(over * l / top_total)
        new_keep = l - share
        print(f'{indent} -> <{c.name}> {l} -> {new_keep} (share={share})')
        actions.append((c, l, new_keep))
    # Then execute it in one pass
    for c, l, new_keep in actions:
        if new_keep <= 0: c.decompose()
        elif new_keep > CUT_THRESHOLD: smart_truncate(c, new_keep, _depth + 1)
        else: cut(c, new_keep)
    return soup
|
||
|
||
def execute_js_rich(script, driver, no_monitor=False):
    """Execute ``script`` through ``driver`` and report what happened.

    Args:
        script: JavaScript source passed to ``driver.execute_js``.
        driver: object providing ``execute_js(script)`` (returns a dict),
            ``get_session_dict()`` (returns a mapping of session id to URL)
            and ``default_session_id``.
        no_monitor: when True, no page snapshots are taken and the result
            carries neither ``transients`` nor ``diff``.

    Returns:
        dict with:
            ``status``: ``"success"`` or ``"failed"``.
            ``js_return``: the response's ``'data'`` value (``'result'`` when
                ``'data'`` is absent); ``None`` when execution raised.
            ``tab_id``: ``driver.default_session_id``.
            ``reloaded``: present (True) when the response has ``closed == 1``.
            ``newTabs``: tabs reported by the response, or sessions that
                appeared in ``get_session_dict()`` during execution.
            ``suggestion``: hint text, present in some situations.
            ``error``: error text, present when execution raised.
            ``transients`` / ``diff``: monitoring output, only without
                ``no_monitor`` and when the page was not reloaded.

    Exceptions raised by ``driver.execute_js`` are caught and reported via
    ``status`` / ``error`` rather than propagated.
    """
    last_html = None
    if not no_monitor:
        # Baseline snapshot; monitoring is best effort, so a failure only disables the diff.
        try: last_html = get_html(driver, cutlist=False, extra_js=temp_monitor_js, maxchars=9999999)
        except Exception: pass

    result = None; error_msg = None; reloaded = False; newTabs = []
    before_sids = set(driver.get_session_dict().keys()); response = {}
    try:
        print(f"Executing: {script[:250]} ...")
        response = driver.execute_js(script)
        # An explicit 'data' key wins even when its value is falsy (0, '', False).
        result = response['data'] if 'data' in response else response.get('result')
        if response.get('closed', 0) == 1: reloaded = True
        time.sleep(1)
    except Exception as e:
        error = e.args[0] if e.args else str(e)
        if isinstance(error, dict): error.pop('stack', None)
        error_msg = str(error)
        print(f"Error: {error_msg}")

    rr = {
        "status": "failed" if error_msg else "success",
        "js_return": result,
        "tab_id": driver.default_session_id
    }
    if reloaded: rr['reloaded'] = reloaded
    if response.get('newTabs'):
        rr['newTabs'] = response['newTabs']
    else:
        # Fall back to comparing the session list before and after execution.
        after = driver.get_session_dict()
        new_sids = {k: v for k, v in after.items() if k not in before_sids}
        if new_sids:
            newTabs = [{'id': k, 'url': v} for k, v in new_sids.items()]
            # The result dict is flat: writing to rr['environment'] raised
            # KeyError because that key is never created.
            rr['newTabs'] = newTabs
            rr['suggestion'] = "页面已刷新,以上新标签页在执行期间连接。"
    if error_msg: rr['error'] = error_msg
    if no_monitor: return rr

    if not reloaded:
        try: rr['transients'] = get_temp_texts(driver)
        except Exception: rr['transients'] = []

    if not reloaded and len(newTabs) == 0:
        try:
            current_html = get_html(driver, cutlist=False, maxchars=9999999)
            if last_html is None: raise Exception("no baseline")
            diff_data = find_changed_elements(last_html, current_html)
            change_count = diff_data.get('changed', 0)
            top_change = diff_data.get('top_change', '')
            diff_summary = f"DOM变化量: {change_count}"
            if top_change: diff_summary += f"\n最显著变化:\n{top_change}"
            transients = rr.get('transients', [])
            # newTabs is known to be empty inside this branch.
            if change_count == 0 and not transients:
                diff_summary += " (页面无变化)"
                rr['suggestion'] = "页面无明显变化"
        except Exception:
            diff_summary = "页面变化监控不可用"
        rr['diff'] = diff_summary
    return rr
|