refactor(llm): optimize Gemini config profiles for extraction and creativity

Introduces `get_extraction_config` and `get_creative_config` to standardize Gemini API calls.

* Defines explicit config profiles with appropriate `temperature` and `thinking_level` for Gemini 3 Flash.
* Extraction tasks use minimal thinking and temp=0.0 to reduce latency and token usage.
* Creative tasks use low thinking, temp=0.4, and top_p=0.8 to balance naturalness and safety.
* Applies these helpers across products, routines, and skincare endpoints.
* Also updates default model to `gemini-3-flash-preview`.
This commit is contained in:
Piotr Oleszczyk 2026-03-03 21:24:23 +01:00
parent 78df7322a9
commit ba1f10d99f
5 changed files with 72 additions and 33 deletions

View file

@@ -4,14 +4,13 @@ from typing import Optional
from uuid import UUID, uuid4 from uuid import UUID, uuid4
from fastapi import APIRouter, Depends, HTTPException, Query from fastapi import APIRouter, Depends, HTTPException, Query
from google.genai import types as genai_types
from pydantic import BaseModel as PydanticBase from pydantic import BaseModel as PydanticBase
from pydantic import ValidationError from pydantic import ValidationError
from sqlmodel import Session, SQLModel, col, select from sqlmodel import Session, SQLModel, col, select
from db import get_session from db import get_session
from innercontext.api.utils import get_or_404 from innercontext.api.utils import get_or_404
from innercontext.llm import call_gemini from innercontext.llm import call_gemini, get_creative_config, get_extraction_config
from innercontext.models import ( from innercontext.models import (
Product, Product,
ProductBase, ProductBase,
@@ -422,12 +421,10 @@ def parse_product_text(data: ProductParseRequest) -> ProductParseResponse:
response = call_gemini( response = call_gemini(
endpoint="products/parse-text", endpoint="products/parse-text",
contents=f"Extract product data from this text:\n\n{data.text}", contents=f"Extract product data from this text:\n\n{data.text}",
config=genai_types.GenerateContentConfig( config=get_extraction_config(
system_instruction=_product_parse_system_prompt(), system_instruction=_product_parse_system_prompt(),
response_mime_type="application/json",
response_schema=ProductParseLLMResponse, response_schema=ProductParseLLMResponse,
max_output_tokens=16384, max_output_tokens=16384,
temperature=0.0,
), ),
user_input=data.text, user_input=data.text,
) )
@@ -637,12 +634,10 @@ def suggest_shopping(session: Session = Depends(get_session)):
response = call_gemini( response = call_gemini(
endpoint="products/suggest", endpoint="products/suggest",
contents=prompt, contents=prompt,
config=genai_types.GenerateContentConfig( config=get_creative_config(
system_instruction=_SHOPPING_SYSTEM_PROMPT, system_instruction=_SHOPPING_SYSTEM_PROMPT,
response_mime_type="application/json",
response_schema=_ShoppingSuggestionsOut, response_schema=_ShoppingSuggestionsOut,
max_output_tokens=4096, max_output_tokens=4096,
temperature=0.4,
), ),
user_input=prompt, user_input=prompt,
) )

View file

@@ -4,13 +4,12 @@ from typing import Optional
from uuid import UUID, uuid4 from uuid import UUID, uuid4
from fastapi import APIRouter, Depends, HTTPException from fastapi import APIRouter, Depends, HTTPException
from google.genai import types as genai_types
from pydantic import BaseModel as PydanticBase from pydantic import BaseModel as PydanticBase
from sqlmodel import Session, SQLModel, col, select from sqlmodel import Session, SQLModel, col, select
from db import get_session from db import get_session
from innercontext.api.utils import get_or_404 from innercontext.api.utils import get_or_404
from innercontext.llm import call_gemini from innercontext.llm import call_gemini, get_creative_config
from innercontext.models import ( from innercontext.models import (
GroomingSchedule, GroomingSchedule,
Product, Product,
@@ -522,12 +521,10 @@ def suggest_routine(
response = call_gemini( response = call_gemini(
endpoint="routines/suggest", endpoint="routines/suggest",
contents=prompt, contents=prompt,
config=genai_types.GenerateContentConfig( config=get_creative_config(
system_instruction=_ROUTINES_SYSTEM_PROMPT, system_instruction=_ROUTINES_SYSTEM_PROMPT,
response_mime_type="application/json",
response_schema=_SuggestionOut, response_schema=_SuggestionOut,
max_output_tokens=4096, max_output_tokens=4096,
temperature=0.4,
), ),
user_input=prompt, user_input=prompt,
) )
@@ -600,12 +597,10 @@ def suggest_batch(
response = call_gemini( response = call_gemini(
endpoint="routines/suggest-batch", endpoint="routines/suggest-batch",
contents=prompt, contents=prompt,
config=genai_types.GenerateContentConfig( config=get_creative_config(
system_instruction=_ROUTINES_SYSTEM_PROMPT, system_instruction=_ROUTINES_SYSTEM_PROMPT,
response_mime_type="application/json",
response_schema=_BatchOut, response_schema=_BatchOut,
max_output_tokens=8192, max_output_tokens=8192,
temperature=0.4,
), ),
user_input=prompt, user_input=prompt,
) )

View file

@@ -11,7 +11,7 @@ from sqlmodel import Session, SQLModel, select
from db import get_session from db import get_session
from innercontext.api.utils import get_or_404 from innercontext.api.utils import get_or_404
from innercontext.llm import call_gemini from innercontext.llm import call_gemini, get_extraction_config
from innercontext.models import ( from innercontext.models import (
SkinConditionSnapshot, SkinConditionSnapshot,
SkinConditionSnapshotBase, SkinConditionSnapshotBase,
@@ -171,12 +171,10 @@ async def analyze_skin_photos(
response = call_gemini( response = call_gemini(
endpoint="skincare/analyze-photos", endpoint="skincare/analyze-photos",
contents=parts, contents=parts,
config=genai_types.GenerateContentConfig( config=get_extraction_config(
system_instruction=_skin_photo_system_prompt(), system_instruction=_skin_photo_system_prompt(),
response_mime_type="application/json",
response_schema=_SkinAnalysisOut, response_schema=_SkinAnalysisOut,
max_output_tokens=2048, max_output_tokens=2048,
temperature=0.0,
), ),
user_input=image_summary, user_input=image_summary,
) )

View file

@@ -3,12 +3,50 @@
import os import os
import time import time
from contextlib import suppress from contextlib import suppress
from typing import Any
from fastapi import HTTPException from fastapi import HTTPException
from google import genai from google import genai
from google.genai import types as genai_types from google.genai import types as genai_types
_DEFAULT_MODEL = "gemini-flash-latest" _DEFAULT_MODEL = "gemini-3-flash-preview"
def get_extraction_config(
    system_instruction: str,
    response_schema: Any,
    max_output_tokens: int = 8192,
) -> genai_types.GenerateContentConfig:
    """Build a Gemini config for strict structured-data extraction.

    Uses deterministic decoding (temperature 0.0) and minimal thinking to
    keep extraction calls fast and cheap, while forcing a JSON response
    that conforms to *response_schema*.
    """
    settings: dict[str, Any] = {
        "system_instruction": system_instruction,
        "response_mime_type": "application/json",
        "response_schema": response_schema,
        "max_output_tokens": max_output_tokens,
        "temperature": 0.0,
        # Gemini 3 Flash defaults to heavy thinking; extraction needs none.
        "thinking_config": genai_types.ThinkingConfig(
            thinking_level=genai_types.ThinkingLevel.MINIMAL
        ),
    }
    return genai_types.GenerateContentConfig(**settings)
def get_creative_config(
    system_instruction: str,
    response_schema: Any,
    max_output_tokens: int = 4096,
) -> genai_types.GenerateContentConfig:
    """Build a Gemini config for creative tasks such as recommendations.

    Moderate temperature (0.4) with nucleus sampling (top_p 0.8) allows
    varied suggestions without drifting off-schema; low thinking keeps
    latency bounded. The response is constrained to JSON matching
    *response_schema*.
    """
    # Keep a touch of reasoning for creative tasks, but stay well below
    # the model's default thinking budget.
    low_thinking = genai_types.ThinkingConfig(
        thinking_level=genai_types.ThinkingLevel.LOW
    )
    return genai_types.GenerateContentConfig(
        system_instruction=system_instruction,
        response_mime_type="application/json",
        response_schema=response_schema,
        max_output_tokens=max_output_tokens,
        temperature=0.4,
        top_p=0.8,
        thinking_config=low_thinking,
    )
def get_gemini_client() -> tuple[genai.Client, str]: def get_gemini_client() -> tuple[genai.Client, str]:
@@ -46,18 +84,6 @@ def call_gemini(
with suppress(Exception): with suppress(Exception):
user_input = str(contents) user_input = str(contents)
# Limit thinking by default — Gemini 3 Flash defaults to "high" thinking which
# consumes most of the token budget before generating actual output.
# Use "low" to reduce latency while keeping basic reasoning intact.
if config.thinking_config is None:
config = config.model_copy(
update={
"thinking_config": genai_types.ThinkingConfig(
thinking_level=genai_types.ThinkingLevel.LOW
)
}
)
start = time.monotonic() start = time.monotonic()
success, error_detail, response, finish_reason = True, None, None, None success, error_detail, response, finish_reason = True, None, None, None
try: try:

25
backend/test_query.py Normal file
View file

@@ -0,0 +1,25 @@
from datetime import date, timedelta
from sqlmodel import select
from db import get_session
from innercontext.models import Routine, RoutineStep
def run():
    """Ad-hoc sanity check: count product usages in routine steps from the last 7 days."""
    # get_session() is a generator-style dependency (yields a session, then
    # runs cleanup). Keep a handle on the generator so its teardown actually
    # executes — `next(get_session())` alone abandons the generator and the
    # session is never closed.
    session_gen = get_session()
    session = next(session_gen)
    try:
        ref_date = date.today()
        cutoff = ref_date - timedelta(days=7)
        # Product ids referenced by any routine step whose routine date
        # falls within the [cutoff, ref_date] window (inclusive).
        recent_usage = session.exec(
            select(RoutineStep.product_id)
            .join(Routine, Routine.id == RoutineStep.routine_id)
            .where(Routine.routine_date >= cutoff)
            .where(Routine.routine_date <= ref_date)
        ).all()
        print("Found:", len(recent_usage))
    finally:
        # Closing the generator triggers the dependency's post-yield cleanup.
        session_gen.close()
# Allow running this ad-hoc query script directly: `python test_query.py`.
if __name__ == "__main__":
    run()