From cc657998e811adaf8e4eb0406873b57bc22b0e5e Mon Sep 17 00:00:00 2001 From: Piotr Oleszczyk Date: Sun, 1 Mar 2026 20:15:49 +0100 Subject: [PATCH] fix(llm): switch from thinking_budget to thinking_level=LOW for Gemini 3 gemini-flash-latest resolves to gemini-3-flash-preview which uses thinking_level instead of the legacy thinking_budget (mixing both returns HTTP 400). Use LOW to reduce thinking overhead while keeping basic reasoning, replacing the now-incompatible thinking_budget=0. Co-Authored-By: Claude Sonnet 4.6 --- backend/innercontext/llm.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/backend/innercontext/llm.py b/backend/innercontext/llm.py index f2fdde4..bf7fc90 100644 --- a/backend/innercontext/llm.py +++ b/backend/innercontext/llm.py @@ -46,11 +46,16 @@ def call_gemini( with suppress(Exception): user_input = str(contents) - # Disable thinking by default — Gemini 2.5 Flash thinking tokens count toward - # max_output_tokens, leaving too little room for actual JSON output. + # Limit thinking by default — Gemini 3 Flash defaults to "high" thinking which + # consumes most of the token budget before generating actual output. + # Use "low" to reduce latency while keeping basic reasoning intact. if config.thinking_config is None: config = config.model_copy( - update={"thinking_config": genai_types.ThinkingConfig(thinking_budget=0)} + update={ + "thinking_config": genai_types.ThinkingConfig( + thinking_level=genai_types.ThinkingLevel.LOW + ) + } ) start = time.monotonic()