diff --git a/backend/alembic/versions/7e6f73d1cc95_add_enhanced_token_metrics_to_ai_call_.py b/backend/alembic/versions/7e6f73d1cc95_add_enhanced_token_metrics_to_ai_call_.py
new file mode 100644
index 0000000..204d2fe
--- /dev/null
+++ b/backend/alembic/versions/7e6f73d1cc95_add_enhanced_token_metrics_to_ai_call_.py
@@ -0,0 +1,44 @@
+"""add enhanced token metrics to ai_call_logs
+
+Revision ID: 7e6f73d1cc95
+Revises: 27b2c306b0c6
+Create Date: 2026-03-06 12:15:42.003323
+
+"""
+
+from typing import Sequence, Union
+
+import sqlalchemy as sa
+
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision: str = "7e6f73d1cc95"
+down_revision: Union[str, Sequence[str], None] = "27b2c306b0c6"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    """Upgrade schema.
+
+    Add enhanced token metrics to ai_call_logs for detailed Gemini API analysis.
+    Captures thoughts_tokens, tool_use_prompt_tokens, and cached_content_tokens
+    to understand token usage breakdown and verify max_output_tokens behavior.
+    """
+    op.add_column(
+        "ai_call_logs", sa.Column("thoughts_tokens", sa.Integer(), nullable=True)
+    )
+    op.add_column(
+        "ai_call_logs", sa.Column("tool_use_prompt_tokens", sa.Integer(), nullable=True)
+    )
+    op.add_column(
+        "ai_call_logs", sa.Column("cached_content_tokens", sa.Integer(), nullable=True)
+    )
+
+
+def downgrade() -> None:
+    """Downgrade schema."""
+    op.drop_column("ai_call_logs", "cached_content_tokens")
+    op.drop_column("ai_call_logs", "tool_use_prompt_tokens")
+    op.drop_column("ai_call_logs", "thoughts_tokens")
diff --git a/backend/innercontext/llm.py b/backend/innercontext/llm.py
index 40635cf..89c19b0 100644
--- a/backend/innercontext/llm.py
+++ b/backend/innercontext/llm.py
@@ -158,6 +158,13 @@ def call_gemini(
     # Phase 2: Extract reasoning chain for observability
     reasoning_chain = _extract_thinking_content(response)
 
+    # Extract enhanced token metadata from Gemini API
+    usage = (
+        response.usage_metadata
+        if response and response.usage_metadata
+        else None
+    )
+
     log = AICallLog(
         endpoint=endpoint,
         model=model,
@@ -165,19 +172,22 @@
         user_input=user_input,
         response_text=response.text if response else None,
         tool_trace=tool_trace,
-        prompt_tokens=(
-            response.usage_metadata.prompt_token_count
-            if response and response.usage_metadata
-            else None
+        # Core token counts
+        prompt_tokens=usage.prompt_token_count if usage else None,
+        completion_tokens=usage.candidates_token_count if usage else None,
+        total_tokens=usage.total_token_count if usage else None,
+        # Enhanced token breakdown (Phase 2)
+        thoughts_tokens=(
+            getattr(usage, "thoughts_token_count", None) if usage else None
+        ),
+        tool_use_prompt_tokens=(
+            getattr(usage, "tool_use_prompt_token_count", None)
+            if usage else None
         ),
-        completion_tokens=(
-            response.usage_metadata.candidates_token_count
-            if response and response.usage_metadata
-            else None
-        ),
-        total_tokens=(
-            response.usage_metadata.total_token_count
-            if response and response.usage_metadata
-            else None
+        cached_content_tokens=(
+            getattr(usage, "cached_content_token_count", None)
+            if usage else None
         ),
         duration_ms=duration_ms,
diff --git a/backend/innercontext/models/ai_log.py b/backend/innercontext/models/ai_log.py
index 0b1b41b..7dfa457 100644
--- a/backend/innercontext/models/ai_log.py
+++ b/backend/innercontext/models/ai_log.py
@@ -48,3 +48,17 @@ class AICallLog(SQLModel, table=True):
         default=None,
         description="LLM reasoning/thinking process (MEDIUM thinking level)",
     )
+
+    # Enhanced token metrics (Phase 2 - Gemini API detailed breakdown)
+    thoughts_tokens: int | None = Field(
+        default=None,
+        description="Thinking tokens (thoughtsTokenCount) - separate from output budget",
+    )
+    tool_use_prompt_tokens: int | None = Field(
+        default=None,
+        description="Tool use prompt tokens (toolUsePromptTokenCount)",
+    )
+    cached_content_tokens: int | None = Field(
+        default=None,
+        description="Cached content tokens (cachedContentTokenCount)",
+    )
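Reviewer note (not part of the diff): a minimal sketch of how the three new columns could be queried once the migration has run, for example to check whether thinking tokens are billed outside the output budget. Only AICallLog and its fields come from this change; the SQLite DSN, the per-model aggregation, and running with the backend package on PYTHONPATH are assumptions for illustration.

from sqlalchemy import func
from sqlmodel import Session, create_engine, select

from innercontext.models.ai_log import AICallLog

# Assumed DSN; point this at the real backend database.
engine = create_engine("sqlite:///innercontext.db")

with Session(engine) as session:
    # Average token breakdown per model, using the columns added above.
    # SQL AVG ignores NULLs, so rows logged before the migration don't skew it.
    stmt = select(
        AICallLog.model,
        func.avg(AICallLog.thoughts_tokens).label("avg_thoughts"),
        func.avg(AICallLog.completion_tokens).label("avg_completion"),
        func.avg(AICallLog.cached_content_tokens).label("avg_cached"),
    ).group_by(AICallLog.model)
    for model, avg_thoughts, avg_completion, avg_cached in session.exec(stmt):
        print(f"{model}: thoughts={avg_thoughts} completion={avg_completion} cached={avg_cached}")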