From cc657998e811adaf8e4eb0406873b57bc22b0e5e Mon Sep 17 00:00:00 2001
From: Piotr Oleszczyk <piotr@oleszczyk.eu>
Date: Sun, 1 Mar 2026 20:15:49 +0100
Subject: [PATCH] fix(llm): switch from thinking_budget to thinking_level=LOW
 for Gemini 3

gemini-flash-latest resolves to gemini-3-flash-preview, which uses
thinking_level instead of the legacy thinking_budget (mixing both
returns HTTP 400). Use LOW to reduce thinking overhead while keeping
basic reasoning, replacing the now-incompatible thinking_budget=0.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 backend/innercontext/llm.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/backend/innercontext/llm.py b/backend/innercontext/llm.py
index f2fdde4..bf7fc90 100644
--- a/backend/innercontext/llm.py
+++ b/backend/innercontext/llm.py
@@ -46,11 +46,16 @@ def call_gemini(
         with suppress(Exception):
             user_input = str(contents)
 
-    # Disable thinking by default — Gemini 2.5 Flash thinking tokens count toward
-    # max_output_tokens, leaving too little room for actual JSON output.
+    # Limit thinking by default — Gemini 3 Flash defaults to "high" thinking, which
+    # consumes most of the token budget before generating actual output.
+    # Use "low" to reduce latency while keeping basic reasoning intact.
     if config.thinking_config is None:
         config = config.model_copy(
-            update={"thinking_config": genai_types.ThinkingConfig(thinking_budget=0)}
+            update={
+                "thinking_config": genai_types.ThinkingConfig(
+                    thinking_level=genai_types.ThinkingLevel.LOW
+                )
+            }
         )
 
     start = time.monotonic()