smart truncation, iframe preservation, streamlit filter

- simphtml: truncate_biggest replaces the naive string cut; it targets the largest content block via a 70% drill-down
- simphtml: iframe children are preserved through a div proxy (works around a BeautifulSoup 4 limitation)
- background/content.js: filter streamlit tabs and skip content script injection
- ga.py: the file_content tag regex now allows attributes on the opening tag; web_scan maxchars lowered from 38000 to 35000
This commit is contained in:
Liang Jiaqing
2026-04-06 12:19:20 +08:00
parent aae6d810cd
commit 8e829730a9
4 changed files with 39 additions and 8 deletions

4
ga.py
View File

@@ -136,7 +136,7 @@ def web_scan(tabs_only=False, switch_tab_id=None, text_only=False):
}
}
if not tabs_only:
importlib.reload(simphtml); result["content"] = simphtml.get_html(driver, cutlist=True, maxchars=38000, text_only=text_only)
importlib.reload(simphtml); result["content"] = simphtml.get_html(driver, cutlist=True, maxchars=35000, text_only=text_only)
return result
except Exception as e:
return {"status": "error", "msg": format_error(e)}
@@ -378,7 +378,7 @@ class GenericAgentHandler(BaseHandler):
yield f"[Action] {action_str} file: {os.path.basename(path)}\n"
def extract_robust_content(text):
tag = re.search(r"<file_content>(.*)</file_content>", text, re.DOTALL)
tag = re.search(r"<file_content[^>]*>(.*)</file_content>", text, re.DOTALL)
if tag: return tag.group(1).strip()
s, e = text.find("```"), text.rfind("```")
if -1 < s < e: return text[text.find("\n", s)+1 : e].strip()