From 3c3248c2eac9038775c1e54224a5599742d86af8 Mon Sep 17 00:00:00 2001
From: Piotr Oleszczyk <piotr@oleszczyk.eu>
Date: Fri, 6 Mar 2026 15:50:28 +0100
Subject: [PATCH] feat(api): add Phase 3 observability - expose validation
 warnings and metadata to frontend

Backend changes:
- Create ResponseMetadata, TokenMetrics and EnrichedResponse models for API responses
- Modify call_gemini() and call_gemini_with_function_tools() to return a (response, log_id) tuple
- Add _build_response_metadata() helper (in both api/products.py and api/routines.py) to build ResponseMetadata from an AICallLog row
- Update routines API (/suggest, /suggest-batch) to populate validation_warnings, auto_fixes_applied, and metadata
- Update products API (/suggest) to populate observability fields
- Update skincare API to handle new return signature

Frontend changes:
- Add TypeScript types: TokenMetrics, ResponseMetadata
- Update RoutineSuggestion, BatchSuggestion, ShoppingSuggestionResponse with observability fields

Next: Create UI components to display warnings, reasoning chains, and token metrics
---
 backend/innercontext/api/products.py        | 66 +++++++++++---
 backend/innercontext/api/routines.py        | 96 ++++++++++++++++-----
 backend/innercontext/api/skincare.py        |  2 +-
 backend/innercontext/llm.py                 | 33 +++++--
 backend/innercontext/models/api_metadata.py | 29 +++++++
 frontend/src/lib/types.ts                   | 27 ++++++
 6 files changed, 213 insertions(+), 40 deletions(-)
 create mode 100644 backend/innercontext/models/api_metadata.py

diff --git a/backend/innercontext/api/products.py b/backend/innercontext/api/products.py
index d6bc82e..eb4840b 100644
--- a/backend/innercontext/api/products.py
+++ b/backend/innercontext/api/products.py
@@ -1,7 +1,7 @@
 import json
 import logging
 from datetime import date
-from typing import Literal, Optional
+from typing import Any, Literal, Optional
 from uuid import UUID, uuid4
 
 from fastapi import APIRouter, Depends, HTTPException, Query
@@ -42,6 +42,8 @@ from innercontext.models import (
     SkinConcern,
     SkinConditionSnapshot,
 )
+from innercontext.models.ai_log import AICallLog
+from innercontext.models.api_metadata import ResponseMetadata, TokenMetrics
 from innercontext.models.enums import (
     AbsorptionSpeed,
     DayTime,
@@ -63,6 +65,37 @@ logger = logging.getLogger(__name__)
 
 router = APIRouter()
 
+
+def _build_response_metadata(session: Session, log_id: Any) -> ResponseMetadata | None:
+    """Map the AICallLog row for log_id to ResponseMetadata (None if missing)."""
+    if not log_id:  # no log id supplied, so there is nothing to look up
+        return None
+    # NOTE(review): this patch adds an identical helper to api/routines.py; consider sharing one.
+    log = session.get(AICallLog, log_id)
+    if not log:  # no AICallLog row was found for this id
+        return None
+    # Token metrics are attached only when prompt, completion and total counts are all set.
+    token_metrics = None
+    if (
+        log.prompt_tokens is not None
+        and log.completion_tokens is not None
+        and log.total_tokens is not None
+    ):
+        token_metrics = TokenMetrics(
+            prompt_tokens=log.prompt_tokens,
+            completion_tokens=log.completion_tokens,
+            thoughts_tokens=log.thoughts_tokens,  # passed through as-is; may be None
+            total_tokens=log.total_tokens,
+        )
+
+    return ResponseMetadata(
+        model_used=log.model,
+        duration_ms=log.duration_ms or 0,  # a falsy duration (None or 0) is reported as 0
+        reasoning_chain=log.reasoning_chain,
+        token_metrics=token_metrics,
+    )
+
+
 PricingSource = Literal["category", "fallback", "insufficient_data"]
 PricingOutput = tuple[PriceTier | None, float | None, PricingSource | None]
 PricingOutputs = dict[UUID, PricingOutput]
@@ -234,6 +267,10 @@ class ProductSuggestion(PydanticBase):
 class ShoppingSuggestionResponse(PydanticBase):
     suggestions: list[ProductSuggestion]
     reasoning: str
+    # Phase 3 observability: validator warnings, applied auto-fixes, LLM call metadata (all optional)
+    validation_warnings: list[str] | None = None
+    auto_fixes_applied: list[str] | None = None
+    metadata: ResponseMetadata | None = None  # ResponseMetadata is imported above; no forward ref needed
 
 
 class _ProductSuggestionOut(PydanticBase):
@@ -609,7 +646,7 @@ def parse_product_text(data: ProductParseRequest) -> ProductParseResponse:
     # Phase 1: Sanitize input text
     sanitized_text = sanitize_user_input(data.text, max_length=10000)
 
-    response = call_gemini(
+    response, log_id = call_gemini(
         endpoint="products/parse-text",
         contents=f"Extract product data from this text:\n\n{sanitized_text}",
         config=get_extraction_config(
@@ -997,7 +1034,7 @@ def suggest_shopping(session: Session = Depends(get_session)):
     }
 
     try:
-        response = call_gemini_with_function_tools(
+        response, log_id = call_gemini_with_function_tools(
             endpoint="products/suggest",
             contents=prompt,
             config=config,
@@ -1020,7 +1057,7 @@ def suggest_shopping(session: Session = Depends(get_session)):
             "- Zasugeruj tylko najbardziej bezpieczne i realistyczne typy produktow do uzupelnienia brakow,"
             " unikaj agresywnych aktywnych przy niepelnych danych.\n"
         )
-        response = call_gemini(
+        response, log_id = call_gemini(
             endpoint="products/suggest",
             contents=conservative_prompt,
             config=get_creative_config(
@@ -1044,12 +1081,6 @@ def suggest_shopping(session: Session = Depends(get_session)):
     except json.JSONDecodeError as e:
         raise HTTPException(status_code=502, detail=f"LLM returned invalid JSON: {e}")
 
-    shopping_response = ShoppingSuggestionResponse(
-        suggestions=[ProductSuggestion(**s) for s in parsed.get("suggestions", [])],
-        reasoning=parsed.get("reasoning", ""),
-    )
-
-    # Phase 1: Validate the shopping suggestions
     # Get products with inventory (those user already owns)
     products_with_inventory = session.exec(
         select(Product).join(ProductInventory).distinct()
@@ -1061,7 +1092,15 @@ def suggest_shopping(session: Session = Depends(get_session)):
         valid_targets=set(SkinConcern),
     )
 
+    # Phase 1: Validate the shopping suggestions
     validator = ShoppingValidator()
+
+    # Build initial shopping response without metadata
+    shopping_response = ShoppingSuggestionResponse(
+        suggestions=[ProductSuggestion(**s) for s in parsed.get("suggestions", [])],
+        reasoning=parsed.get("reasoning", ""),
+    )
+
     validation_result = validator.validate(shopping_response, shopping_context)
 
     if not validation_result.is_valid:
@@ -1073,7 +1112,14 @@ def suggest_shopping(session: Session = Depends(get_session)):
             detail=f"Generated shopping suggestions failed validation: {'; '.join(validation_result.errors)}",
         )
 
+    # Phase 3: Add warnings, auto-fixes, and metadata to response
     if validation_result.warnings:
         logger.warning(f"Shopping suggestion warnings: {validation_result.warnings}")
+        shopping_response.validation_warnings = validation_result.warnings
+
+    if validation_result.auto_fixes:
+        shopping_response.auto_fixes_applied = validation_result.auto_fixes
+
+    shopping_response.metadata = _build_response_metadata(session, log_id)
 
     return shopping_response
diff --git a/backend/innercontext/api/routines.py b/backend/innercontext/api/routines.py
index 7134beb..2de0ae4 100644
--- a/backend/innercontext/api/routines.py
+++ b/backend/innercontext/api/routines.py
@@ -2,7 +2,7 @@ import json
 import logging
 import math
 from datetime import date, timedelta
-from typing import Optional
+from typing import Any, Optional
 from uuid import UUID, uuid4
 
 from fastapi import APIRouter, Depends, HTTPException
@@ -40,6 +40,8 @@ from innercontext.models import (
     RoutineStep,
     SkinConditionSnapshot,
 )
+from innercontext.models.ai_log import AICallLog
+from innercontext.models.api_metadata import ResponseMetadata, TokenMetrics
 from innercontext.models.enums import GroomingAction, PartOfDay
 from innercontext.validators import BatchValidator, RoutineSuggestionValidator
 from innercontext.validators.batch_validator import BatchValidationContext
@@ -47,6 +49,37 @@ from innercontext.validators.routine_validator import RoutineValidationContext
 
 logger = logging.getLogger(__name__)
 
+
+def _build_response_metadata(session: Session, log_id: Any) -> ResponseMetadata | None:
+    """Map the AICallLog row for log_id to ResponseMetadata (None if missing)."""
+    if not log_id:  # no log id supplied, so there is nothing to look up
+        return None
+    # NOTE(review): this patch adds an identical helper to api/products.py; consider sharing one.
+    log = session.get(AICallLog, log_id)
+    if not log:  # no AICallLog row was found for this id
+        return None
+    # Token metrics are attached only when prompt, completion and total counts are all set.
+    token_metrics = None
+    if (
+        log.prompt_tokens is not None
+        and log.completion_tokens is not None
+        and log.total_tokens is not None
+    ):
+        token_metrics = TokenMetrics(
+            prompt_tokens=log.prompt_tokens,
+            completion_tokens=log.completion_tokens,
+            thoughts_tokens=log.thoughts_tokens,  # passed through as-is; may be None
+            total_tokens=log.total_tokens,
+        )
+
+    return ResponseMetadata(
+        model_used=log.model,
+        duration_ms=log.duration_ms or 0,  # a falsy duration (None or 0) is reported as 0
+        reasoning_chain=log.reasoning_chain,
+        token_metrics=token_metrics,
+    )
+
+
 router = APIRouter()
 
 
@@ -124,6 +157,10 @@ class RoutineSuggestion(SQLModel):
     steps: list[SuggestedStep]
     reasoning: str
     summary: Optional[RoutineSuggestionSummary] = None
+    # Phase 3: Observability fields
+    validation_warnings: Optional[list[str]] = None
+    auto_fixes_applied: Optional[list[str]] = None
+    metadata: Optional[ResponseMetadata] = None
 
 
 class SuggestBatchRequest(SQLModel):
@@ -144,6 +181,10 @@ class DayPlan(SQLModel):
 class BatchSuggestion(SQLModel):
     days: list[DayPlan]
     overall_reasoning: str
+    # Phase 3 observability fields; all default to None and are filled in by suggest_batch
+    validation_warnings: Optional[list[str]] = None  # set only when the validator reported warnings
+    auto_fixes_applied: Optional[list[str]] = None  # set only when the validator reported auto-fixes
+    metadata: Optional[ResponseMetadata] = None  # result of _build_response_metadata; may stay None
 
 
 # ---------------------------------------------------------------------------
@@ -674,7 +715,7 @@ def suggest_routine(
     }
 
     try:
-        response = call_gemini_with_function_tools(
+        response, log_id = call_gemini_with_function_tools(
             endpoint="routines/suggest",
             contents=prompt,
             config=config,
@@ -698,7 +739,7 @@ def suggest_routine(
             " preferujac lagodne produkty wspierajace bariere i fotoprotekcje.\n"
             "- Gdy masz watpliwosci, pomijaj ryzykowne aktywne kroki.\n"
         )
-        response = call_gemini(
+        response, log_id = call_gemini(
             endpoint="routines/suggest",
             contents=conservative_prompt,
             config=get_creative_config(
@@ -760,13 +801,6 @@ def suggest_routine(
         confidence=confidence,
     )
 
-    # Phase 1: Validate the response
-    suggestion = RoutineSuggestion(
-        steps=steps,
-        reasoning=parsed.get("reasoning", ""),
-        summary=summary,
-    )
-
     # Get skin snapshot for barrier state
     stmt = select(SkinConditionSnapshot).order_by(
         col(SkinConditionSnapshot.snapshot_date).desc()
@@ -790,8 +824,16 @@ def suggest_routine(
         just_shaved=False,  # Could be enhanced with grooming context
     )
 
-    # Validate
+    # Phase 1: Validate the response
     validator = RoutineSuggestionValidator()
+
+    # Build initial suggestion without metadata
+    suggestion = RoutineSuggestion(
+        steps=steps,
+        reasoning=parsed.get("reasoning", ""),
+        summary=summary,
+    )
+
     validation_result = validator.validate(suggestion, validation_context)
 
     if not validation_result.is_valid:
@@ -805,10 +847,15 @@ def suggest_routine(
             detail=f"Generated routine failed safety validation: {'; '.join(validation_result.errors)}",
         )
 
-    # Add warnings to response if any
+    # Phase 3: Add warnings, auto-fixes, and metadata to response
     if validation_result.warnings:
         logger.warning(f"Routine suggestion warnings: {validation_result.warnings}")
-        # Note: We'll add warnings field to RoutineSuggestion model in a moment
+        suggestion.validation_warnings = validation_result.warnings
+
+    if validation_result.auto_fixes:
+        suggestion.auto_fixes_applied = validation_result.auto_fixes
+
+    suggestion.metadata = _build_response_metadata(session, log_id)
 
     return suggestion
 
@@ -878,7 +925,7 @@ def suggest_batch(
         "\nZwróć JSON zgodny ze schematem."
     )
 
-    response = call_gemini(
+    response, log_id = call_gemini(
         endpoint="routines/suggest-batch",
         contents=prompt,
         config=get_creative_config(
@@ -936,11 +983,6 @@ def suggest_batch(
             )
         )
 
-    batch_suggestion = BatchSuggestion(
-        days=days, overall_reasoning=parsed.get("overall_reasoning", "")
-    )
-
-    # Phase 1: Validate the batch response
     # Get skin snapshot for barrier state
     stmt = select(SkinConditionSnapshot).order_by(
         col(SkinConditionSnapshot.snapshot_date).desc()
@@ -964,8 +1006,14 @@ def suggest_batch(
         last_used_dates=last_used_dates_by_uuid,
     )
 
-    # Validate
+    # Phase 1: Validate the batch response
     batch_validator = BatchValidator()
+
+    # Build initial batch suggestion without metadata
+    batch_suggestion = BatchSuggestion(
+        days=days, overall_reasoning=parsed.get("overall_reasoning", "")
+    )
+
     validation_result = batch_validator.validate(batch_suggestion, batch_context)
 
     if not validation_result.is_valid:
@@ -977,9 +1025,15 @@ def suggest_batch(
             detail=f"Generated batch plan failed safety validation: {'; '.join(validation_result.errors)}",
         )
 
-    # Log warnings if any
+    # Phase 3: Add warnings, auto-fixes, and metadata to response
     if validation_result.warnings:
         logger.warning(f"Batch routine warnings: {validation_result.warnings}")
+        batch_suggestion.validation_warnings = validation_result.warnings
+
+    if validation_result.auto_fixes:
+        batch_suggestion.auto_fixes_applied = validation_result.auto_fixes
+
+    batch_suggestion.metadata = _build_response_metadata(session, log_id)
 
     return batch_suggestion
 
diff --git a/backend/innercontext/api/skincare.py b/backend/innercontext/api/skincare.py
index bfa0b6b..730db1e 100644
--- a/backend/innercontext/api/skincare.py
+++ b/backend/innercontext/api/skincare.py
@@ -179,7 +179,7 @@ async def analyze_skin_photos(
     )
 
     image_summary = f"{len(photos)} image(s): {', '.join((p.content_type or 'unknown') for p in photos)}"
-    response = call_gemini(
+    response, log_id = call_gemini(
         endpoint="skincare/analyze-photos",
         contents=parts,
         config=get_extraction_config(
diff --git a/backend/innercontext/llm.py b/backend/innercontext/llm.py
index 89c19b0..2c08fbb 100644
--- a/backend/innercontext/llm.py
+++ b/backend/innercontext/llm.py
@@ -109,8 +109,12 @@ def call_gemini(
     config: genai_types.GenerateContentConfig,
     user_input: str | None = None,
     tool_trace: dict[str, Any] | None = None,
-):
-    """Call Gemini, log full request + response to DB, return response unchanged."""
+) -> tuple[Any, Any]:
+    """Call Gemini, log full request + response to DB.
+
+    Returns:
+        Tuple of (response, log_id) where log_id is the AICallLog.id (UUID) or None if logging failed.
+    """
     from sqlmodel import Session
 
     from db import engine  # deferred to avoid circular import at module load
@@ -127,7 +131,13 @@ def call_gemini(
             user_input = str(contents)
 
     start = time.monotonic()
-    success, error_detail, response, finish_reason = True, None, None, None
+    success, error_detail, response, finish_reason, log_id = (
+        True,
+        None,
+        None,
+        None,
+        None,
+    )
     try:
         response = client.models.generate_content(
             model=model, contents=contents, config=config
@@ -199,7 +209,9 @@ def call_gemini(
             with Session(engine) as s:
                 s.add(log)
                 s.commit()
-    return response
+                s.refresh(log)
+                log_id = log.id
+    return response, log_id
 
 
 def call_gemini_with_function_tools(
@@ -210,17 +222,22 @@ def call_gemini_with_function_tools(
     function_handlers: dict[str, Callable[[dict[str, Any]], dict[str, Any]]],
     user_input: str | None = None,
     max_tool_roundtrips: int = 2,
-):
-    """Call Gemini with function-calling loop until final response text is produced."""
+) -> tuple[Any, Any]:
+    """Call Gemini with function-calling loop until final response text is produced.
+
+    Returns:
+        Tuple of (response, log_id) where log_id is the AICallLog.id (UUID) of the final call.
+    """
     if max_tool_roundtrips < 0:
         raise ValueError("max_tool_roundtrips must be >= 0")
 
     history = list(contents) if isinstance(contents, list) else [contents]
     rounds = 0
     trace_events: list[dict[str, Any]] = []
+    log_id = None
 
     while True:
-        response = call_gemini(
+        response, log_id = call_gemini(
             endpoint=endpoint,
             contents=history,
             config=config,
@@ -233,7 +250,7 @@ def call_gemini_with_function_tools(
         )
         function_calls = list(getattr(response, "function_calls", None) or [])
         if not function_calls:
-            return response
+            return response, log_id
 
         if rounds >= max_tool_roundtrips:
             raise HTTPException(
diff --git a/backend/innercontext/models/api_metadata.py b/backend/innercontext/models/api_metadata.py
new file mode 100644
index 0000000..639d6cb
--- /dev/null
+++ b/backend/innercontext/models/api_metadata.py
@@ -0,0 +1,29 @@
+"""Models for API response metadata (Phase 3: UI/UX Observability)."""
+
+from pydantic import BaseModel
+
+
+class TokenMetrics(BaseModel):
+    """Token usage counters reported for an LLM call."""
+
+    prompt_tokens: int
+    completion_tokens: int
+    thoughts_tokens: int | None = None  # the only optional counter; defaults to None
+    total_tokens: int
+
+
+class ResponseMetadata(BaseModel):
+    """Observability metadata describing the LLM call behind an API response."""
+
+    model_used: str  # the API helpers in this patch fill this from AICallLog.model
+    duration_ms: int  # the API helpers in this patch pass 0 when the log has no duration
+    reasoning_chain: str | None = None
+    token_metrics: TokenMetrics | None = None  # left None when a required count is missing
+
+
+class EnrichedResponse(BaseModel):
+    """Optional validation and metadata fields intended as a base for API responses."""
+    # NOTE(review): no response model in this patch subclasses this; they redeclare the fields.
+    validation_warnings: list[str] | None = None
+    auto_fixes_applied: list[str] | None = None
+    metadata: ResponseMetadata | None = None
diff --git a/frontend/src/lib/types.ts b/frontend/src/lib/types.ts
index 77758d5..a96230f 100644
--- a/frontend/src/lib/types.ts
+++ b/frontend/src/lib/types.ts
@@ -241,10 +241,29 @@ export interface RoutineSuggestionSummary {
   confidence: number;
 }
 
+// Phase 3: Observability metadata types
+export interface TokenMetrics {
+  prompt_tokens: number;
+  completion_tokens: number;
+  thoughts_tokens?: number; // NOTE(review): backend field defaults to None; confirm whether null can arrive
+  total_tokens: number;
+}
+
+export interface ResponseMetadata {
+  model_used: string;
+  duration_ms: number; // backend helper reports 0 when no duration was logged
+  reasoning_chain?: string; // NOTE(review): backend field defaults to None; confirm whether null can arrive
+  token_metrics?: TokenMetrics; // backend leaves this unset when a required token count is missing
+}
+
 export interface RoutineSuggestion {
   steps: SuggestedStep[];
   reasoning: string;
   summary?: RoutineSuggestionSummary;
+  // Phase 3: Observability fields (optional; backend defaults are None, so confirm null handling)
+  validation_warnings?: string[]; // backend sets this only when the validator reported warnings
+  auto_fixes_applied?: string[]; // backend sets this only when the validator reported auto-fixes
+  metadata?: ResponseMetadata;
 }
 
 export interface DayPlan {
@@ -257,6 +276,10 @@ export interface DayPlan {
 export interface BatchSuggestion {
   days: DayPlan[];
   overall_reasoning: string;
+  // Phase 3: Observability fields (optional; backend defaults are None, so confirm null handling)
+  validation_warnings?: string[]; // backend sets this only when the validator reported warnings
+  auto_fixes_applied?: string[]; // backend sets this only when the validator reported auto-fixes
+  metadata?: ResponseMetadata;
 }
 
 // ─── Shopping suggestion types ───────────────────────────────────────────────
@@ -274,6 +297,10 @@ export interface ProductSuggestion {
 export interface ShoppingSuggestionResponse {
   suggestions: ProductSuggestion[];
   reasoning: string;
+  // Phase 3: Observability fields (optional; backend defaults are None, so confirm null handling)
+  validation_warnings?: string[]; // backend sets this only when the validator reported warnings
+  auto_fixes_applied?: string[]; // backend sets this only when the validator reported auto-fixes
+  metadata?: ResponseMetadata;
 }
 
 // ─── Health types ────────────────────────────────────────────────────────────