From ada5f2a93ba7490d0adabb9985798cba08068d6a Mon Sep 17 00:00:00 2001
From: Piotr Oleszczyk <piotr@oleszczyk.eu>
Date: Sun, 1 Mar 2026 20:12:31 +0100
Subject: [PATCH] fix(llm): disable Gemini thinking to prevent MAX_TOKENS on
 structured output

Gemini 2.5 Flash (gemini-flash-latest) enables thinking by default.
Thinking tokens count toward max_output_tokens, leaving ~150 tokens for
actual JSON output and causing MAX_TOKENS truncation. Disable thinking
by default, centrally in call_gemini, via ThinkingConfig(thinking_budget=0);
a thinking_config explicitly set by the caller is left untouched.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 backend/innercontext/llm.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/backend/innercontext/llm.py b/backend/innercontext/llm.py
index 5bc3b9d..f2fdde4 100644
--- a/backend/innercontext/llm.py
+++ b/backend/innercontext/llm.py
@@ -46,6 +46,13 @@ def call_gemini(
         with suppress(Exception):
             user_input = str(contents)
 
+    # Disable thinking by default — Gemini 2.5 Flash thinking tokens count toward
+    # max_output_tokens, leaving too little room for actual JSON output.
+    if config.thinking_config is None:
+        config = config.model_copy(
+            update={"thinking_config": genai_types.ThinkingConfig(thinking_budget=0)}
+        )
+
     start = time.monotonic()
     success, error_detail, response, finish_reason = True, None, None, None
     try: