feat(api): add enhanced token metrics logging for Gemini API
Add comprehensive token breakdown logging to understand MAX_TOKENS behavior and verify documentation claims about thinking tokens. New Fields Added to ai_call_logs: - thoughts_tokens: Thinking tokens (thoughtsTokenCount) - documented as separate from output budget - tool_use_prompt_tokens: Tool use overhead (toolUsePromptTokenCount) - cached_content_tokens: Cached content tokens (cachedContentTokenCount) Purpose: Investigate token counting mystery from production logs where: prompt_tokens: 4400 completion_tokens: 589 total_tokens: 8489 ← Should be 4400 + 589 = 4989, missing 3500! According to Gemini API docs (Polish translation): totalTokenCount = promptTokenCount + candidatesTokenCount (thoughts NOT included in total) But production logs show 3500 token gap. New logging will reveal: 1. Are thinking tokens actually separate from max_output_tokens limit? 2. Where did the 3500 missing tokens go? 3. Does MEDIUM thinking level consume output budget despite docs? 4. Are tool use tokens included in total but not shown separately? Changes: - Added 3 new integer columns to ai_call_logs (nullable) - Enhanced llm.py to capture all usage_metadata fields - Used getattr() for safe access (fields may not exist in all responses) - Database migration: 7e6f73d1cc95 This will provide complete data for future LLM calls to diagnose: - MAX_TOKENS failures - Token budget behavior - Thinking token costs - Tool use overhead
This commit is contained in:
parent
5bb2ea5f08
commit
3bf19d8acb
3 changed files with 79 additions and 11 deletions
|
|
@ -0,0 +1,44 @@
|
||||||
|
"""add enhanced token metrics to ai_call_logs
|
||||||
|
|
||||||
|
Revision ID: 7e6f73d1cc95
|
||||||
|
Revises: 27b2c306b0c6
|
||||||
|
Create Date: 2026-03-06 12:15:42.003323
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import Sequence, Union
|
||||||
|
|
||||||
|
import sqlalchemy as sa
|
||||||
|
|
||||||
|
from alembic import op
|
||||||
|
|
||||||
|
# revision identifiers, used by Alembic.
|
||||||
|
revision: str = "7e6f73d1cc95"
|
||||||
|
down_revision: Union[str, Sequence[str], None] = "27b2c306b0c6"
|
||||||
|
branch_labels: Union[str, Sequence[str], None] = None
|
||||||
|
depends_on: Union[str, Sequence[str], None] = None
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
    """Upgrade schema.

    Add enhanced token metrics to ai_call_logs for detailed Gemini API analysis.
    Captures thoughts_tokens, tool_use_prompt_tokens, and cached_content_tokens
    to understand token usage breakdown and verify max_output_tokens behavior.
    """
    # All three metrics are nullable integers: older log rows (and responses
    # that omit these usage_metadata fields) simply store NULL.
    new_metric_columns = (
        "thoughts_tokens",
        "tool_use_prompt_tokens",
        "cached_content_tokens",
    )
    for column_name in new_metric_columns:
        op.add_column(
            "ai_call_logs",
            sa.Column(column_name, sa.Integer(), nullable=True),
        )
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
    """Downgrade schema."""
    # Drop in reverse order of creation, mirroring upgrade().
    removed_metric_columns = (
        "cached_content_tokens",
        "tool_use_prompt_tokens",
        "thoughts_tokens",
    )
    for column_name in removed_metric_columns:
        op.drop_column("ai_call_logs", column_name)
|
||||||
|
|
@ -158,6 +158,13 @@ def call_gemini(
|
||||||
# Phase 2: Extract reasoning chain for observability
|
# Phase 2: Extract reasoning chain for observability
|
||||||
reasoning_chain = _extract_thinking_content(response)
|
reasoning_chain = _extract_thinking_content(response)
|
||||||
|
|
||||||
|
# Extract enhanced token metadata from Gemini API
|
||||||
|
usage = (
|
||||||
|
response.usage_metadata
|
||||||
|
if response and response.usage_metadata
|
||||||
|
else None
|
||||||
|
)
|
||||||
|
|
||||||
log = AICallLog(
|
log = AICallLog(
|
||||||
endpoint=endpoint,
|
endpoint=endpoint,
|
||||||
model=model,
|
model=model,
|
||||||
|
|
@ -165,19 +172,22 @@ def call_gemini(
|
||||||
user_input=user_input,
|
user_input=user_input,
|
||||||
response_text=response.text if response else None,
|
response_text=response.text if response else None,
|
||||||
tool_trace=tool_trace,
|
tool_trace=tool_trace,
|
||||||
prompt_tokens=(
|
# Core token counts
|
||||||
response.usage_metadata.prompt_token_count
|
prompt_tokens=usage.prompt_token_count if usage else None,
|
||||||
if response and response.usage_metadata
|
completion_tokens=usage.candidates_token_count if usage else None,
|
||||||
|
total_tokens=usage.total_token_count if usage else None,
|
||||||
|
# Enhanced token breakdown (Phase 2)
|
||||||
|
thoughts_tokens=(
|
||||||
|
getattr(usage, "thoughts_token_count", None) if usage else None
|
||||||
|
),
|
||||||
|
tool_use_prompt_tokens=(
|
||||||
|
getattr(usage, "tool_use_prompt_token_count", None)
|
||||||
|
if usage
|
||||||
else None
|
else None
|
||||||
),
|
),
|
||||||
completion_tokens=(
|
cached_content_tokens=(
|
||||||
response.usage_metadata.candidates_token_count
|
getattr(usage, "cached_content_token_count", None)
|
||||||
if response and response.usage_metadata
|
if usage
|
||||||
else None
|
|
||||||
),
|
|
||||||
total_tokens=(
|
|
||||||
response.usage_metadata.total_token_count
|
|
||||||
if response and response.usage_metadata
|
|
||||||
else None
|
else None
|
||||||
),
|
),
|
||||||
duration_ms=duration_ms,
|
duration_ms=duration_ms,
|
||||||
|
|
|
||||||
|
|
@ -48,3 +48,17 @@ class AICallLog(SQLModel, table=True):
|
||||||
default=None,
|
default=None,
|
||||||
description="LLM reasoning/thinking process (MEDIUM thinking level)",
|
description="LLM reasoning/thinking process (MEDIUM thinking level)",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Enhanced token metrics (Phase 2 - Gemini API detailed breakdown)
|
||||||
|
thoughts_tokens: int | None = Field(
|
||||||
|
default=None,
|
||||||
|
description="Thinking tokens (thoughtsTokenCount) - separate from output budget",
|
||||||
|
)
|
||||||
|
tool_use_prompt_tokens: int | None = Field(
|
||||||
|
default=None,
|
||||||
|
description="Tool use prompt tokens (toolUsePromptTokenCount)",
|
||||||
|
)
|
||||||
|
cached_content_tokens: int | None = Field(
|
||||||
|
default=None,
|
||||||
|
description="Cached content tokens (cachedContentTokenCount)",
|
||||||
|
)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue