From ada5f2a93ba7490d0adabb9985798cba08068d6a Mon Sep 17 00:00:00 2001 From: Piotr Oleszczyk Date: Sun, 1 Mar 2026 20:12:31 +0100 Subject: [PATCH] fix(llm): disable Gemini thinking to prevent MAX_TOKENS on structured output Gemini 2.5 Flash (gemini-flash-latest) enables thinking by default. Thinking tokens count toward max_output_tokens, leaving ~150 tokens for actual JSON output and causing MAX_TOKENS truncation. Disable thinking centrally in call_gemini via ThinkingConfig(thinking_budget=0). Co-Authored-By: Claude Sonnet 4.6 --- backend/innercontext/llm.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/backend/innercontext/llm.py b/backend/innercontext/llm.py index 5bc3b9d..f2fdde4 100644 --- a/backend/innercontext/llm.py +++ b/backend/innercontext/llm.py @@ -46,6 +46,13 @@ def call_gemini( with suppress(Exception): user_input = str(contents) + # Disable thinking by default — Gemini 2.5 Flash thinking tokens count toward + # max_output_tokens, leaving too little room for actual JSON output. + if config.thinking_config is None: + config = config.model_copy( + update={"thinking_config": genai_types.ThinkingConfig(thinking_budget=0)} + ) + start = time.monotonic() success, error_detail, response, finish_reason = True, None, None, None try: