smart truncation, iframe preservation, streamlit filter
- simphtml: truncate_biggest replaces the naive string cut; it targets the fattest content block via a 70% drill-down
- simphtml: iframe children are preserved through a div proxy (bypasses a BS4 limitation)
- background/content.js: filter out streamlit tabs and skip content-script injection on them
- ga.py: file_content tag regex now allows attributes; maxchars 38k -> 35k
This commit is contained in:
@@ -341,7 +341,7 @@ chrome.runtime.onInstalled.addListener(() => connectWS());
|
|||||||
// Sync tab list on changes
|
// Sync tab list on changes
|
||||||
async function sendTabsUpdate() {
|
async function sendTabsUpdate() {
|
||||||
if (!ws || ws.readyState !== WebSocket.OPEN) return;
|
if (!ws || ws.readyState !== WebSocket.OPEN) return;
|
||||||
const tabs = (await chrome.tabs.query({})).filter(t => isScriptable(t.url));
|
const tabs = (await chrome.tabs.query({})).filter(t => isScriptable(t.url) && !/streamlit/i.test(t.title));
|
||||||
ws.send(JSON.stringify({
|
ws.send(JSON.stringify({
|
||||||
type: 'tabs_update',
|
type: 'tabs_update',
|
||||||
tabs: tabs.map(t => ({ id: t.id, url: t.url, title: t.title }))
|
tabs: tabs.map(t => ({ id: t.id, url: t.url, title: t.title }))
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
;(function(){ if (/streamlit/i.test(document.title)) return;
|
||||||
|
|
||||||
// Remove meta CSP tags
|
// Remove meta CSP tags
|
||||||
document.querySelectorAll('meta[http-equiv="Content-Security-Policy"]').forEach(e => e.remove());
|
document.querySelectorAll('meta[http-equiv="Content-Security-Policy"]').forEach(e => e.remove());
|
||||||
@@ -42,4 +43,5 @@ async function handle(el) {
|
|||||||
} catch (e) {
|
} catch (e) {
|
||||||
el.textContent = JSON.stringify({ ok: false, error: e.message });
|
el.textContent = JSON.stringify({ ok: false, error: e.message });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
})();
|
||||||
4
ga.py
4
ga.py
@@ -136,7 +136,7 @@ def web_scan(tabs_only=False, switch_tab_id=None, text_only=False):
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if not tabs_only:
|
if not tabs_only:
|
||||||
importlib.reload(simphtml); result["content"] = simphtml.get_html(driver, cutlist=True, maxchars=38000, text_only=text_only)
|
importlib.reload(simphtml); result["content"] = simphtml.get_html(driver, cutlist=True, maxchars=35000, text_only=text_only)
|
||||||
return result
|
return result
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return {"status": "error", "msg": format_error(e)}
|
return {"status": "error", "msg": format_error(e)}
|
||||||
@@ -378,7 +378,7 @@ class GenericAgentHandler(BaseHandler):
|
|||||||
yield f"[Action] {action_str} file: {os.path.basename(path)}\n"
|
yield f"[Action] {action_str} file: {os.path.basename(path)}\n"
|
||||||
|
|
||||||
def extract_robust_content(text):
|
def extract_robust_content(text):
|
||||||
tag = re.search(r"<file_content>(.*)</file_content>", text, re.DOTALL)
|
tag = re.search(r"<file_content[^>]*>(.*)</file_content>", text, re.DOTALL)
|
||||||
if tag: return tag.group(1).strip()
|
if tag: return tag.group(1).strip()
|
||||||
s, e = text.find("```"), text.rfind("```")
|
s, e = text.find("```"), text.rfind("```")
|
||||||
if -1 < s < e: return text[text.find("\n", s)+1 : e].strip()
|
if -1 < s < e: return text[text.find("\n", s)+1 : e].strip()
|
||||||
|
|||||||
37
simphtml.py
37
simphtml.py
@@ -301,10 +301,20 @@ let root = domCopy;
|
|||||||
while (root.children.length === 1) {
|
while (root.children.length === 1) {
|
||||||
root = root.children[0];
|
root = root.children[0];
|
||||||
}
|
}
|
||||||
for (let ii = 0; ii < 3; ii++)
|
for (let ii = 0; ii < 3; ii++) {
|
||||||
root.querySelectorAll('div').forEach(div => (!div.textContent.trim() && div.children.length === 0) && div.remove());
|
root.querySelectorAll('div').forEach(div => (!div.textContent.trim() && div.children.length === 0) && div.remove());
|
||||||
|
}
|
||||||
root.querySelectorAll('[data-mark]').forEach(e => e.removeAttribute('data-mark'));
|
root.querySelectorAll('[data-mark]').forEach(e => e.removeAttribute('data-mark'));
|
||||||
root.removeAttribute('data-mark');
|
root.removeAttribute('data-mark');
|
||||||
|
root.querySelectorAll('iframe').forEach(f => {
|
||||||
|
if (f.children.length) {
|
||||||
|
const d = document.createElement('div');
|
||||||
|
for (const a of f.attributes) d.setAttribute(a.name, a.value);
|
||||||
|
d.setAttribute('data-tag', 'iframe');
|
||||||
|
while (f.firstChild) d.appendChild(f.firstChild);
|
||||||
|
f.parentNode.replaceChild(d, f);
|
||||||
|
}
|
||||||
|
});
|
||||||
return root.outerHTML;
|
return root.outerHTML;
|
||||||
}
|
}
|
||||||
optHTML()'''
|
optHTML()'''
|
||||||
@@ -690,10 +700,12 @@ def find_changed_elements(before_html, after_html):
|
|||||||
result["top_change"] = h if len(h) <= 2000 else h[:2000] + '...[TRUNCATED]'
|
result["top_change"] = h if len(h) <= 2000 else h[:2000] + '...[TRUNCATED]'
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def get_html(driver, cutlist=False, maxchars=38000, instruction="", extra_js="", text_only=False):
|
def get_html(driver, cutlist=False, maxchars=35000, instruction="", extra_js="", text_only=False):
|
||||||
page = get_main_block(driver, extra_js=extra_js, text_only=text_only)
|
page = get_main_block(driver, extra_js=extra_js, text_only=text_only)
|
||||||
if text_only: return page
|
if text_only: return page
|
||||||
soup = optimize_html_for_tokens(page)
|
soup = optimize_html_for_tokens(page)
|
||||||
|
for div in soup.select('div[data-tag="iframe"]'):
|
||||||
|
div.name = 'iframe'; del div['data-tag']
|
||||||
html = str(soup)
|
html = str(soup)
|
||||||
if not cutlist: return html
|
if not cutlist: return html
|
||||||
rr = driver.execute_js(js_findMainList + """return findMainList(document.body);""").get('data', [])
|
rr = driver.execute_js(js_findMainList + """return findMainList(document.body);""").get('data', [])
|
||||||
@@ -724,9 +736,26 @@ def get_html(driver, cutlist=False, maxchars=38000, instruction="", extra_js="",
|
|||||||
for it in removed: it.decompose()
|
for it in removed: it.decompose()
|
||||||
ss = str(optimize_html_for_tokens(soup)) if lists else html
|
ss = str(optimize_html_for_tokens(soup)) if lists else html
|
||||||
if lists: print(f"[cutlist] Result: {len(html)} -> {len(ss)} chars ({100-len(ss)*100//len(html)}% saved)")
|
if lists: print(f"[cutlist] Result: {len(html)} -> {len(ss)} chars ({100-len(ss)*100//len(html)}% saved)")
|
||||||
if len(ss) > maxchars: ss = ss[:maxchars] + ' ... [TRUNCATED]'
|
if len(ss) > maxchars: ss = str(truncate_biggest(soup, maxchars))
|
||||||
return ss
|
return ss
|
||||||
|
|
||||||
|
def truncate_biggest(soup, budget):
    """Shrink *soup* in place so its serialization is roughly *budget* chars.

    Instead of cutting the serialized string at an arbitrary offset, the
    overflow is removed from the single largest content block: the biggest
    tag is located, wrapper tags whose weight sits almost entirely in one
    child are drilled through, and the trailing children of the chosen tag
    are dropped and replaced by a ``[TRUNCATED ...]`` marker.

    Args:
        soup: parsed BeautifulSoup document; it is modified in place.
        budget: target maximum length of ``str(soup)`` in characters.

    Returns:
        The same *soup* object. The final length is approximate because
        re-parsing the cut fragment may add closing tags.
    """
    over = len(str(soup)) - budget
    if over <= 0:
        return soup

    def size(node):
        return len(str(node))

    # Start from the largest tag able to absorb the whole overflow. When no
    # single tag is that large (e.g. many small top-level siblings), fall
    # back to the document itself instead of letting max() raise ValueError.
    target = max(
        (tag for tag in soup.find_all(True) if size(tag) > over and tag.parent),
        key=size,
        default=soup,
    )

    # Drill through hollow shells (html/body/single-child divs): while the
    # fattest child tag holds more than 70% of its parent, descend into it.
    # Never descend into a child that is not larger than the overflow, since
    # emptying it completely would still leave the document over budget.
    while True:
        kids = [(child, size(child)) for child in target.children if child.name]
        if not kids:
            break
        fattest, fattest_size = max(kids, key=lambda pair: pair[1])
        if fattest_size <= size(target) * .7 or fattest_size <= over:
            break
        target = fattest

    marker = f' [TRUNCATED {over//1000}k chars]'
    # Serialize each child once; the budget for the kept children is their
    # combined length minus the overflow and the room the marker needs.
    pieces = [(child, str(child)) for child in target.children]
    keep = max(0, sum(len(text) for _, text in pieces) - over - len(marker))

    used = 0
    for child, text in pieces:
        if used + len(text) > keep:
            # Snapshot the tail before replacing: replace_with detaches
            # `child`, after which its next_siblings are no longer reachable.
            tail = list(child.next_siblings)
            child.replace_with(BeautifulSoup(text[:keep - used] + marker, 'html.parser'))
            for sibling in tail:
                sibling.extract()
            break
        used += len(text)

    print(f"[truncate] finally {len(str(soup))} chars, cut <{target.name}> by {over}")
    return soup
|
||||||
|
|
||||||
def execute_js_rich(script, driver, no_monitor=False):
|
def execute_js_rich(script, driver, no_monitor=False):
|
||||||
last_html = None
|
last_html = None
|
||||||
if not no_monitor:
|
if not no_monitor:
|
||||||
|
|||||||
Reference in New Issue
Block a user