refactor(llm): optimize Gemini config profiles for extraction and creativity

Introduces `get_extraction_config` and `get_creative_config` to standardize Gemini API calls.

* Defines explicit config profiles with appropriate `temperature` and `thinking_level` for Gemini 3 Flash.
* Extraction tasks use minimal thinking and temp=0.0 to reduce latency and token usage.
* Creative tasks use low thinking, temp=0.4, and top_p=0.8 to balance naturalness and safety.
* Applies these helpers across products, routines, and skincare endpoints.
* Also updates default model to `gemini-3-flash-preview`.
This commit is contained in:
Piotr Oleszczyk 2026-03-03 21:24:23 +01:00
parent 78df7322a9
commit ba1f10d99f
5 changed files with 72 additions and 33 deletions

View file

@ -4,14 +4,13 @@ from typing import Optional
from uuid import UUID, uuid4
from fastapi import APIRouter, Depends, HTTPException, Query
from google.genai import types as genai_types
from pydantic import BaseModel as PydanticBase
from pydantic import ValidationError
from sqlmodel import Session, SQLModel, col, select
from db import get_session
from innercontext.api.utils import get_or_404
from innercontext.llm import call_gemini
from innercontext.llm import call_gemini, get_creative_config, get_extraction_config
from innercontext.models import (
Product,
ProductBase,
@ -422,12 +421,10 @@ def parse_product_text(data: ProductParseRequest) -> ProductParseResponse:
response = call_gemini(
endpoint="products/parse-text",
contents=f"Extract product data from this text:\n\n{data.text}",
config=genai_types.GenerateContentConfig(
config=get_extraction_config(
system_instruction=_product_parse_system_prompt(),
response_mime_type="application/json",
response_schema=ProductParseLLMResponse,
max_output_tokens=16384,
temperature=0.0,
),
user_input=data.text,
)
@ -637,12 +634,10 @@ def suggest_shopping(session: Session = Depends(get_session)):
response = call_gemini(
endpoint="products/suggest",
contents=prompt,
config=genai_types.GenerateContentConfig(
config=get_creative_config(
system_instruction=_SHOPPING_SYSTEM_PROMPT,
response_mime_type="application/json",
response_schema=_ShoppingSuggestionsOut,
max_output_tokens=4096,
temperature=0.4,
),
user_input=prompt,
)

View file

@ -4,13 +4,12 @@ from typing import Optional
from uuid import UUID, uuid4
from fastapi import APIRouter, Depends, HTTPException
from google.genai import types as genai_types
from pydantic import BaseModel as PydanticBase
from sqlmodel import Session, SQLModel, col, select
from db import get_session
from innercontext.api.utils import get_or_404
from innercontext.llm import call_gemini
from innercontext.llm import call_gemini, get_creative_config
from innercontext.models import (
GroomingSchedule,
Product,
@ -522,12 +521,10 @@ def suggest_routine(
response = call_gemini(
endpoint="routines/suggest",
contents=prompt,
config=genai_types.GenerateContentConfig(
config=get_creative_config(
system_instruction=_ROUTINES_SYSTEM_PROMPT,
response_mime_type="application/json",
response_schema=_SuggestionOut,
max_output_tokens=4096,
temperature=0.4,
),
user_input=prompt,
)
@ -600,12 +597,10 @@ def suggest_batch(
response = call_gemini(
endpoint="routines/suggest-batch",
contents=prompt,
config=genai_types.GenerateContentConfig(
config=get_creative_config(
system_instruction=_ROUTINES_SYSTEM_PROMPT,
response_mime_type="application/json",
response_schema=_BatchOut,
max_output_tokens=8192,
temperature=0.4,
),
user_input=prompt,
)

View file

@ -11,7 +11,7 @@ from sqlmodel import Session, SQLModel, select
from db import get_session
from innercontext.api.utils import get_or_404
from innercontext.llm import call_gemini
from innercontext.llm import call_gemini, get_extraction_config
from innercontext.models import (
SkinConditionSnapshot,
SkinConditionSnapshotBase,
@ -171,12 +171,10 @@ async def analyze_skin_photos(
response = call_gemini(
endpoint="skincare/analyze-photos",
contents=parts,
config=genai_types.GenerateContentConfig(
config=get_extraction_config(
system_instruction=_skin_photo_system_prompt(),
response_mime_type="application/json",
response_schema=_SkinAnalysisOut,
max_output_tokens=2048,
temperature=0.0,
),
user_input=image_summary,
)

View file

@ -3,12 +3,50 @@
import os
import time
from contextlib import suppress
from typing import Any
from fastapi import HTTPException
from google import genai
from google.genai import types as genai_types
_DEFAULT_MODEL = "gemini-flash-latest"
_DEFAULT_MODEL = "gemini-3-flash-preview"
def get_extraction_config(
    system_instruction: str,
    response_schema: Any,
    max_output_tokens: int = 8192,
) -> genai_types.GenerateContentConfig:
    """Build a Gemini config for deterministic structured extraction.

    Pins temperature to 0.0 and requests MINIMAL thinking so the token
    budget goes to the JSON payload rather than reasoning traces, and
    forces JSON output validated against *response_schema*.
    """
    # MINIMAL thinking: extraction tasks need no extended reasoning.
    minimal_thinking = genai_types.ThinkingConfig(
        thinking_level=genai_types.ThinkingLevel.MINIMAL
    )
    return genai_types.GenerateContentConfig(
        system_instruction=system_instruction,
        response_mime_type="application/json",
        response_schema=response_schema,
        max_output_tokens=max_output_tokens,
        temperature=0.0,
        thinking_config=minimal_thinking,
    )
def get_creative_config(
    system_instruction: str,
    response_schema: Any,
    max_output_tokens: int = 4096,
) -> genai_types.GenerateContentConfig:
    """Build a Gemini config for suggestion/recommendation tasks.

    temperature=0.4 combined with top_p=0.8 allows some variety in the
    output while staying bounded; LOW thinking keeps latency and token
    usage down. Output is JSON validated against *response_schema*.
    """
    settings: dict[str, Any] = {
        "system_instruction": system_instruction,
        "response_mime_type": "application/json",
        "response_schema": response_schema,
        "max_output_tokens": max_output_tokens,
        "temperature": 0.4,
        "top_p": 0.8,
        # LOW (not MINIMAL): suggestions benefit from a little reasoning.
        "thinking_config": genai_types.ThinkingConfig(
            thinking_level=genai_types.ThinkingLevel.LOW
        ),
    }
    return genai_types.GenerateContentConfig(**settings)
def get_gemini_client() -> tuple[genai.Client, str]:
@ -46,18 +84,6 @@ def call_gemini(
with suppress(Exception):
user_input = str(contents)
# Limit thinking by default — Gemini 3 Flash defaults to "high" thinking which
# consumes most of the token budget before generating actual output.
# Use "low" to reduce latency while keeping basic reasoning intact.
if config.thinking_config is None:
config = config.model_copy(
update={
"thinking_config": genai_types.ThinkingConfig(
thinking_level=genai_types.ThinkingLevel.LOW
)
}
)
start = time.monotonic()
success, error_detail, response, finish_reason = True, None, None, None
try:

25
backend/test_query.py Normal file
View file

@ -0,0 +1,25 @@
from datetime import date, timedelta
from sqlmodel import select
from db import get_session
from innercontext.models import Routine, RoutineStep
def run() -> None:
    """Print the count of routine-step product usages in the last 7 days.

    Ad-hoc sanity script for the recent-usage query — presumably mirrors
    the query used by the suggestion endpoints (verify against callers).
    """
    # get_session() is a generator-style dependency: keep a handle to the
    # generator so its teardown runs (releasing the DB session) even if
    # the query raises. The original `next(get_session())` leaked it.
    session_gen = get_session()
    session = next(session_gen)
    try:
        ref_date = date.today()
        cutoff = ref_date - timedelta(days=7)
        # Product ids referenced by any routine dated in [cutoff, ref_date].
        recent_usage = session.exec(
            select(RoutineStep.product_id)
            .join(Routine, Routine.id == RoutineStep.routine_id)
            .where(Routine.routine_date >= cutoff)
            .where(Routine.routine_date <= ref_date)
        ).all()
        print("Found:", len(recent_usage))
    finally:
        # Resume/close the generator so its cleanup code executes.
        session_gen.close()


if __name__ == "__main__":
    run()