refactor(llm): optimize Gemini config profiles for extraction and creativity

Introduces `get_extraction_config` and `get_creative_config` to standardize Gemini API calls.

* Defines explicit config profiles with appropriate `temperature` and `thinking_level` for Gemini 3 Flash.
* Extraction tasks use minimal thinking and temp=0.0 to reduce latency and token usage.
* Creative tasks use low thinking, temp=0.4, and top_p=0.8 to balance naturalness and safety.
* Applies these helpers across products, routines, and skincare endpoints.
* Also updates default model to `gemini-3-flash-preview`.
This commit is contained in:
Piotr Oleszczyk 2026-03-03 21:24:23 +01:00
parent 78df7322a9
commit ba1f10d99f
5 changed files with 72 additions and 33 deletions

View file

@@ -4,14 +4,13 @@ from typing import Optional
from uuid import UUID, uuid4 from uuid import UUID, uuid4
from fastapi import APIRouter, Depends, HTTPException, Query from fastapi import APIRouter, Depends, HTTPException, Query
from google.genai import types as genai_types
from pydantic import BaseModel as PydanticBase from pydantic import BaseModel as PydanticBase
from pydantic import ValidationError from pydantic import ValidationError
from sqlmodel import Session, SQLModel, col, select from sqlmodel import Session, SQLModel, col, select
from db import get_session from db import get_session
from innercontext.api.utils import get_or_404 from innercontext.api.utils import get_or_404
from innercontext.llm import call_gemini from innercontext.llm import call_gemini, get_creative_config, get_extraction_config
from innercontext.models import ( from innercontext.models import (
Product, Product,
ProductBase, ProductBase,
@@ -422,12 +421,10 @@ def parse_product_text(data: ProductParseRequest) -> ProductParseResponse:
response = call_gemini( response = call_gemini(
endpoint="products/parse-text", endpoint="products/parse-text",
contents=f"Extract product data from this text:\n\n{data.text}", contents=f"Extract product data from this text:\n\n{data.text}",
config=genai_types.GenerateContentConfig( config=get_extraction_config(
system_instruction=_product_parse_system_prompt(), system_instruction=_product_parse_system_prompt(),
response_mime_type="application/json",
response_schema=ProductParseLLMResponse, response_schema=ProductParseLLMResponse,
max_output_tokens=16384, max_output_tokens=16384,
temperature=0.0,
), ),
user_input=data.text, user_input=data.text,
) )
@@ -637,12 +634,10 @@ def suggest_shopping(session: Session = Depends(get_session)):
response = call_gemini( response = call_gemini(
endpoint="products/suggest", endpoint="products/suggest",
contents=prompt, contents=prompt,
config=genai_types.GenerateContentConfig( config=get_creative_config(
system_instruction=_SHOPPING_SYSTEM_PROMPT, system_instruction=_SHOPPING_SYSTEM_PROMPT,
response_mime_type="application/json",
response_schema=_ShoppingSuggestionsOut, response_schema=_ShoppingSuggestionsOut,
max_output_tokens=4096, max_output_tokens=4096,
temperature=0.4,
), ),
user_input=prompt, user_input=prompt,
) )

View file

@@ -4,13 +4,12 @@ from typing import Optional
from uuid import UUID, uuid4 from uuid import UUID, uuid4
from fastapi import APIRouter, Depends, HTTPException from fastapi import APIRouter, Depends, HTTPException
from google.genai import types as genai_types
from pydantic import BaseModel as PydanticBase from pydantic import BaseModel as PydanticBase
from sqlmodel import Session, SQLModel, col, select from sqlmodel import Session, SQLModel, col, select
from db import get_session from db import get_session
from innercontext.api.utils import get_or_404 from innercontext.api.utils import get_or_404
from innercontext.llm import call_gemini from innercontext.llm import call_gemini, get_creative_config
from innercontext.models import ( from innercontext.models import (
GroomingSchedule, GroomingSchedule,
Product, Product,
@@ -522,12 +521,10 @@ def suggest_routine(
response = call_gemini( response = call_gemini(
endpoint="routines/suggest", endpoint="routines/suggest",
contents=prompt, contents=prompt,
config=genai_types.GenerateContentConfig( config=get_creative_config(
system_instruction=_ROUTINES_SYSTEM_PROMPT, system_instruction=_ROUTINES_SYSTEM_PROMPT,
response_mime_type="application/json",
response_schema=_SuggestionOut, response_schema=_SuggestionOut,
max_output_tokens=4096, max_output_tokens=4096,
temperature=0.4,
), ),
user_input=prompt, user_input=prompt,
) )
@@ -600,12 +597,10 @@ def suggest_batch(
response = call_gemini( response = call_gemini(
endpoint="routines/suggest-batch", endpoint="routines/suggest-batch",
contents=prompt, contents=prompt,
config=genai_types.GenerateContentConfig( config=get_creative_config(
system_instruction=_ROUTINES_SYSTEM_PROMPT, system_instruction=_ROUTINES_SYSTEM_PROMPT,
response_mime_type="application/json",
response_schema=_BatchOut, response_schema=_BatchOut,
max_output_tokens=8192, max_output_tokens=8192,
temperature=0.4,
), ),
user_input=prompt, user_input=prompt,
) )

View file

@@ -11,7 +11,7 @@ from sqlmodel import Session, SQLModel, select
from db import get_session from db import get_session
from innercontext.api.utils import get_or_404 from innercontext.api.utils import get_or_404
from innercontext.llm import call_gemini from innercontext.llm import call_gemini, get_extraction_config
from innercontext.models import ( from innercontext.models import (
SkinConditionSnapshot, SkinConditionSnapshot,
SkinConditionSnapshotBase, SkinConditionSnapshotBase,
@@ -171,12 +171,10 @@ async def analyze_skin_photos(
response = call_gemini( response = call_gemini(
endpoint="skincare/analyze-photos", endpoint="skincare/analyze-photos",
contents=parts, contents=parts,
config=genai_types.GenerateContentConfig( config=get_extraction_config(
system_instruction=_skin_photo_system_prompt(), system_instruction=_skin_photo_system_prompt(),
response_mime_type="application/json",
response_schema=_SkinAnalysisOut, response_schema=_SkinAnalysisOut,
max_output_tokens=2048, max_output_tokens=2048,
temperature=0.0,
), ),
user_input=image_summary, user_input=image_summary,
) )

View file

@@ -3,12 +3,50 @@
import os import os
import time import time
from contextlib import suppress from contextlib import suppress
from typing import Any
from fastapi import HTTPException from fastapi import HTTPException
from google import genai from google import genai
from google.genai import types as genai_types from google.genai import types as genai_types
_DEFAULT_MODEL = "gemini-flash-latest" _DEFAULT_MODEL = "gemini-3-flash-preview"
def get_extraction_config(
    system_instruction: str,
    response_schema: Any,
    max_output_tokens: int = 8192,
) -> genai_types.GenerateContentConfig:
    """Build a Gemini config for strict structured-data extraction.

    Uses deterministic decoding (temperature 0.0) and minimal thinking to
    keep extraction calls fast and cheap, while forcing a JSON response
    that conforms to *response_schema*.
    """
    settings: dict[str, Any] = {
        "system_instruction": system_instruction,
        "response_mime_type": "application/json",
        "response_schema": response_schema,
        "max_output_tokens": max_output_tokens,
        "temperature": 0.0,
        # Gemini 3 Flash defaults to heavy thinking; extraction needs none.
        "thinking_config": genai_types.ThinkingConfig(
            thinking_level=genai_types.ThinkingLevel.MINIMAL
        ),
    }
    return genai_types.GenerateContentConfig(**settings)
def get_creative_config(
    system_instruction: str,
    response_schema: Any,
    max_output_tokens: int = 4096,
) -> genai_types.GenerateContentConfig:
    """Build a Gemini config for creative tasks such as recommendations.

    Moderate temperature (0.4) with nucleus sampling (top_p 0.8) allows
    varied suggestions without drifting off-schema; low thinking keeps
    latency bounded. The response is constrained to JSON matching
    *response_schema*.
    """
    # Keep a touch of reasoning for creative tasks, but stay well below
    # the model's default thinking budget.
    low_thinking = genai_types.ThinkingConfig(
        thinking_level=genai_types.ThinkingLevel.LOW
    )
    return genai_types.GenerateContentConfig(
        system_instruction=system_instruction,
        response_mime_type="application/json",
        response_schema=response_schema,
        max_output_tokens=max_output_tokens,
        temperature=0.4,
        top_p=0.8,
        thinking_config=low_thinking,
    )
def get_gemini_client() -> tuple[genai.Client, str]: def get_gemini_client() -> tuple[genai.Client, str]:
@@ -46,18 +84,6 @@ def call_gemini(
with suppress(Exception): with suppress(Exception):
user_input = str(contents) user_input = str(contents)
# Limit thinking by default — Gemini 3 Flash defaults to "high" thinking which
# consumes most of the token budget before generating actual output.
# Use "low" to reduce latency while keeping basic reasoning intact.
if config.thinking_config is None:
config = config.model_copy(
update={
"thinking_config": genai_types.ThinkingConfig(
thinking_level=genai_types.ThinkingLevel.LOW
)
}
)
start = time.monotonic() start = time.monotonic()
success, error_detail, response, finish_reason = True, None, None, None success, error_detail, response, finish_reason = True, None, None, None
try: try:

25
backend/test_query.py Normal file
View file

@@ -0,0 +1,25 @@
from datetime import date, timedelta
from sqlmodel import select
from db import get_session
from innercontext.models import Routine, RoutineStep
def run():
    """Ad-hoc sanity check: count product usages in routine steps from the last 7 days."""
    # get_session() is a generator-style dependency (yields a session, then
    # runs cleanup). Keep a handle on the generator so its teardown actually
    # executes — `next(get_session())` alone abandons the generator and the
    # session is never closed.
    session_gen = get_session()
    session = next(session_gen)
    try:
        ref_date = date.today()
        cutoff = ref_date - timedelta(days=7)
        # Product ids referenced by any routine step whose routine date
        # falls within the [cutoff, ref_date] window (inclusive).
        recent_usage = session.exec(
            select(RoutineStep.product_id)
            .join(Routine, Routine.id == RoutineStep.routine_id)
            .where(Routine.routine_date >= cutoff)
            .where(Routine.routine_date <= ref_date)
        ).all()
        print("Found:", len(recent_usage))
    finally:
        # Closing the generator triggers the dependency's post-yield cleanup.
        session_gen.close()
# Allow running this ad-hoc query script directly: `python test_query.py`.
if __name__ == "__main__":
    run()