From f14ad0c69394f4187c8eff713b836545fcf9f4d6 Mon Sep 17 00:00:00 2001 From: song <130889452+mrsxs@users.noreply.github.com> Date: Tue, 21 Apr 2026 12:07:14 +0800 Subject: [PATCH] fix(llmcore): preserve thinking block signature in streaming SSE parser (#123) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Anthropic's extended thinking streaming protocol emits two delta types for a single thinking block: `thinking_delta` (the textual reasoning) and `signature_delta` (an opaque base64 signature sent at the end of the block). Both must be accumulated into the same `content_block`. The current code only handles `thinking_delta`, so `signature_delta` events are silently dropped. When the assistant's reply (with thinking) is echoed back on the next turn, Anthropic's server validates the signature and rejects the request with 400: "Invalid `signature` in `thinking` block". Downstream effects observed in production (via sub2api relay logs): - Every request whose history contains a thinking block triggers a 400 signature error - The relay strips thinking blocks and retries, which changes the cache prefix and invalidates prompt caching, forcing a full rebuild of cache_creation_tokens (~20k-30k per affected request) - Measured in a 5h window: 5/25 requests suffered cache invalidation, accounting for 53.5% of total spend, all of it avoidable Fix: 1. Initialize `current_block` with an empty `signature` field when a thinking block starts, so the dict shape matches Anthropic's spec (`{type, thinking, signature}`). 2. Handle `signature_delta` events by appending `delta.signature` to `current_block["signature"]`. Appending (rather than overwriting) mirrors how `thinking_delta` is accumulated and is robust against future chunked signatures; reading the old value with `.get("signature", "")` also tolerates a block dict that lacks the key. No behavior change for clients that disable extended thinking; for upstreams that emit thinking blocks without `signature_delta`, the only difference is that the block now carries an empty `signature` field. 
For `tool_use` threads that require valid thinking signatures to preserve reasoning context, this fix is required — the previous behavior silently corrupted them. Verification: - Before fix: upstream returns 400 + retry; cache_creation_tokens spike to ~25k on every 4th-5th request in a conversation - After fix: upstream accepts the first attempt; cache_read_tokens dominate, cache_creation_tokens only appear on the first request of a fresh 5m prompt-cache window --- llmcore.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llmcore.py b/llmcore.py index c8d9970..604f97c 100644 --- a/llmcore.py +++ b/llmcore.py @@ -115,7 +115,7 @@ def _parse_claude_sse(resp_lines): elif evt_type == "content_block_start": block = evt.get("content_block", {}) if block.get("type") == "text": current_block = {"type": "text", "text": ""} - elif block.get("type") == "thinking": current_block = {"type": "thinking", "thinking": ""} + elif block.get("type") == "thinking": current_block = {"type": "thinking", "thinking": "", "signature": ""} elif block.get("type") == "tool_use": current_block = {"type": "tool_use", "id": block.get("id", ""), "name": block.get("name", ""), "input": {}} tool_json_buf = "" @@ -127,6 +127,9 @@ def _parse_claude_sse(resp_lines): if text: yield text elif delta.get("type") == "thinking_delta": if current_block and current_block.get("type") == "thinking": current_block["thinking"] += delta.get("thinking", "") + elif delta.get("type") == "signature_delta": + if current_block and current_block.get("type") == "thinking": + current_block["signature"] = current_block.get("signature", "") + delta.get("signature", "") elif delta.get("type") == "input_json_delta": tool_json_buf += delta.get("partial_json", "") elif evt_type == "content_block_stop": if current_block: