
What to Monitor in LLM Systems

Car dashboards show speed, fuel, and engine temperature. Not because drivers love data, but because these numbers predict whether you'll reach your destination. Miss a warning light and you're stranded.

LLM monitoring serves the same purpose. The right metrics predict incidents before users feel them. The wrong metrics give false confidence while problems brew.

The Core Four

def core_metrics():
    return {
        "latency": {
            "what": "How long requests take",
            "measure": "P50, P95, P99 by endpoint",
            "why": "User experience, SLA compliance",
            "alert_if": "P99 > 5s (or your SLA)",
        },
        "error_rate": {
            "what": "Percentage of failed requests",
            "measure": "Errors / Total requests",
            "why": "System health, user impact",
            "alert_if": "> 1%",
        },
        "throughput": {
            "what": "Requests handled per second",
            "measure": "RPS, tokens per second",
            "why": "Capacity utilization, trend analysis",
            "alert_if": "Sudden drop > 50%",
        },
        "cost": {
            "what": "Spend rate",
            "measure": "$/hour, $/request, $/user",
            "why": "Budget management, anomaly detection",
            "alert_if": "> 2x normal rate",
        },
    }
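
As a rough illustration, here is how the core four might be derived from raw request records. In practice your metrics backend (Prometheus, Datadog, CloudWatch) does this aggregation for you; the RequestRecord shape and window handling below are invented for the sketch.

import statistics
from dataclasses import dataclass

@dataclass
class RequestRecord:
    latency_s: float
    ok: bool
    cost_usd: float

def core_snapshot(window: list[RequestRecord], window_seconds: float) -> dict:
    latencies = sorted(r.latency_s for r in window)
    # quantiles(n=100) returns 99 cut points: index 49 ~ P50, 94 ~ P95, 98 ~ P99
    q = statistics.quantiles(latencies, n=100)
    return {
        "p50_s": q[49],
        "p95_s": q[94],
        "p99_s": q[98],
        "error_rate": sum(not r.ok for r in window) / len(window),
        "rps": len(window) / window_seconds,
        "cost_per_hour": sum(r.cost_usd for r in window) / window_seconds * 3600,
    }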

LLM-Specific Metrics

def llm_specific_metrics():
    return {
        "time_to_first_token": {
            "what": "Delay before generation starts",
            "measure": "Milliseconds from request to first token",
            "why": "User perceives this as 'thinking time'",
            "target": "< 500ms for interactive",
        },
        "inter_token_latency": {
            "what": "Time between tokens during streaming",
            "measure": "Average ms between tokens",
            "why": "Smooth streaming experience",
            "target": "< 50ms for natural reading speed",
        },
        "tokens_per_second": {
            "what": "Generation speed",
            "measure": "Output tokens / generation time",
            "why": "Throughput and efficiency",
            "target": "Model and hardware dependent",
        },
        "queue_depth": {
            "what": "Requests waiting to be processed",
            "measure": "Number of pending requests",
            "why": "Leading indicator of capacity issues",
            "alert_if": "Growing continuously",
        },
        "kv_cache_utilization": {
            "what": "Memory used for active requests",
            "measure": "Current / Max KV cache capacity",
            "why": "Predict OOM before it happens",
            "alert_if": "> 90%",
        },
        "batch_size_average": {
            "what": "Requests batched together",
            "measure": "Average concurrent requests per batch",
            "why": "GPU efficiency indicator",
            "target": "> 8 for good utilization",
        },
    }
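
Streaming metrics like TTFT and inter-token latency are easiest to capture right where tokens leave the model. A minimal sketch, assuming an async token stream and metric-recording callbacks supplied by the caller (both are placeholders for your serving stack and metrics client):

import time
from typing import AsyncIterator, Callable

async def instrumented_stream(
    token_stream: AsyncIterator[str],
    record_ttft: Callable[[float], None],
    record_inter_token: Callable[[float], None],
) -> AsyncIterator[str]:
    start = time.perf_counter()
    last = None
    async for token in token_stream:
        now = time.perf_counter()
        if last is None:
            record_ttft(now - start)          # time to first token
        else:
            record_inter_token(now - last)    # gap between consecutive tokens
        last = now
        yield token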

Quality Metrics

def quality_metrics():
    return {
        "user_regeneration_rate": {
            "what": "How often users click 'regenerate'",
            "measure": "Regenerations / Total requests",
            "why": "Proxy for response quality",
            "alert_if": "> 10% and increasing",
        },
        "response_length_distribution": {
            "what": "How long responses are",
            "measure": "Histogram of output token counts",
            "why": "Detect truncation or verbosity issues",
            "alert_if": "Significant distribution shift",
        },
        "safety_classifier_triggers": {
            "what": "Content flagged by safety systems",
            "measure": "Flags / Total requests",
            "why": "Model behavior monitoring",
            "alert_if": "> baseline + 2 std dev",
        },
        "empty_response_rate": {
            "what": "Responses with no content",
            "measure": "Empty / Total responses",
            "why": "Indicates model issues",
            "alert_if": "> 0.1%",
        },
    }
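
The "baseline + 2 std dev" check for safety triggers is simple to express. A sketch, assuming you can pull recent trigger rates (say, hourly values from the past week) out of your metrics store:

import statistics

def safety_rate_alert(recent_trigger_rates: list[float], current_rate: float) -> bool:
    """True if the current flag rate is more than 2 std devs above baseline."""
    baseline = statistics.mean(recent_trigger_rates)
    spread = statistics.stdev(recent_trigger_rates)
    return current_rate > baseline + 2 * spread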

The Monitoring Stack

def monitoring_stack():
    return {
        "metrics_collection": {
            "tool": "Prometheus, Datadog, CloudWatch",
            "what": "Time-series numerical data",
            "retention": "15 days high-res, 1 year aggregated",
        },
        "logging": {
            "tool": "ELK, Splunk, CloudWatch Logs",
            "what": "Request/response details, errors",
            "retention": "30 days",
            "caution": "Don't log full prompts in production (privacy)",
        },
        "tracing": {
            "tool": "Jaeger, Zipkin, Datadog APM",
            "what": "Request flow through system",
            "use_for": "Debugging latency issues",
        },
        "dashboards": {
            "tool": "Grafana, Datadog, custom",
            "what": "Visual representation of metrics",
            "types": ["Operational (current health)", "Analytical (trends)"],
        },
        "alerting": {
            "tool": "PagerDuty, Opsgenie, native",
            "what": "Notify when thresholds breached",
            "principle": "Alert on symptoms, not causes",
        },
    }
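
The privacy caution above is worth making concrete. One option is to log a hash and a length instead of the prompt text itself, so requests stay traceable without the content being stored; the field names here are illustrative:

import hashlib
import json
import logging

logger = logging.getLogger("llm.requests")

def log_request(request_id: str, model: str, prompt: str,
                output_tokens: int, latency_s: float, status: str) -> None:
    logger.info(json.dumps({
        "request_id": request_id,
        "model": model,
        # a hash and a length, not the prompt itself
        "prompt_sha256": hashlib.sha256(prompt.encode()).hexdigest(),
        "prompt_chars": len(prompt),
        "output_tokens": output_tokens,
        "latency_s": round(latency_s, 3),
        "status": status,
    }))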

Dashboard Layout

def operational_dashboard():
    """
    What to show for at-a-glance health
    """
    return {
        "top_row": {
            "widgets": [
                "Current RPS",
                "Error rate (last 5 min)",
                "P99 latency (last 5 min)",
                "Active alerts",
            ],
            "purpose": "Instant health assessment",
        },
        "middle_row": {
            "widgets": [
                "Latency over time (P50, P95, P99)",
                "Error rate over time",
                "Throughput over time",
            ],
            "purpose": "Trend identification",
        },
        "bottom_row": {
            "widgets": [
                "GPU utilization",
                "Queue depth",
                "Cost burn rate",
                "Model version distribution",
            ],
            "purpose": "Infrastructure health",
        },
    }
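
If the metrics come from the Prometheus setup sketched in the implementation section below, the top-row widgets might be backed by queries like these. The label names are assumptions (the example only records a success status, so adapt the error filter to however you record failures):

TOP_ROW_QUERIES = {
    "current_rps": 'sum(rate(llm_requests_total[1m]))',
    "error_rate_5m": (
        'sum(rate(llm_requests_total{status="error"}[5m]))'
        ' / sum(rate(llm_requests_total[5m]))'
    ),
    "p99_latency_5m": (
        'histogram_quantile(0.99, '
        'sum(rate(llm_request_latency_seconds_bucket[5m])) by (le))'
    ),
}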

Alerting Strategy

def alerting_strategy():
    return {
        "severity_levels": {
            "critical": {
                "criteria": "Service down or severely degraded",
                "examples": ["Error rate > 20%", "P99 > 30s", "OOM events"],
                "action": "Page on-call immediately",
            },
            "warning": {
                "criteria": "Degradation likely without intervention",
                "examples": ["Error rate > 5%", "Queue growing", "GPU > 90%"],
                "action": "Slack alert, investigate soon",
            },
            "info": {
                "criteria": "Notable but not urgent",
                "examples": ["Unusual traffic pattern", "Cost spike"],
                "action": "Log for review",
            },
        },
        "alert_fatigue_prevention": [
            "Alert on symptoms, not every metric",
            "Group related alerts",
            "Auto-resolve when condition clears",
            "Review and tune thresholds monthly",
        ],
    }
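
"Auto-resolve when condition clears" is worth building in from the start. A small sketch of a threshold alert that fires once and resolves itself when the metric recovers; the notify callback stands in for PagerDuty, Slack, or whatever you route through:

from typing import Callable

class ThresholdAlert:
    """Fires once when the metric crosses the threshold, resolves on recovery."""

    def __init__(self, name: str, threshold: float, notify: Callable[[str], None]):
        self.name = name
        self.threshold = threshold
        self.notify = notify
        self.firing = False

    def observe(self, value: float) -> None:
        if value > self.threshold and not self.firing:
            self.firing = True
            self.notify(f"[FIRING] {self.name}: {value:.3f} > {self.threshold}")
        elif value <= self.threshold and self.firing:
            self.firing = False
            self.notify(f"[RESOLVED] {self.name}: {value:.3f}")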

Implementation Example

def metrics_implementation():
    return """
    import time

    from prometheus_client import Counter, Histogram, Gauge

    # Request metrics
    requests_total = Counter(
        'llm_requests_total',
        'Total requests',
        ['model', 'endpoint', 'status']
    )

    latency_histogram = Histogram(
        'llm_request_latency_seconds',
        'Request latency',
        ['model', 'endpoint'],
        buckets=[0.1, 0.5, 1, 2, 5, 10, 30]
    )

    ttft_histogram = Histogram(
        'llm_time_to_first_token_seconds',
        'Time to first token',
        ['model'],
        buckets=[0.05, 0.1, 0.2, 0.5, 1, 2, 5]
    )

    # Resource metrics
    gpu_utilization = Gauge(
        'gpu_utilization_percent',
        'GPU utilization',
        ['gpu_id']
    )

    queue_depth = Gauge(
        'llm_queue_depth',
        'Requests waiting in queue'
    )

    # Usage in a streaming request handler. model_name, endpoint, and
    # generate() are assumed to come from the surrounding serving code.
    async def handle_request(request):
        with latency_histogram.labels(model=model_name, endpoint=endpoint).time():
            start = time.time()
            first_token = True
            async for token in generate(request):
                if first_token:
                    ttft_histogram.labels(model=model_name).observe(time.time() - start)
                    first_token = False
                yield token
        requests_total.labels(model=model_name, endpoint=endpoint, status='success').inc()
    """

The metrics you choose shape what you notice. Monitor what predicts user experience and system health. Everything else is noise that makes real signals harder to find.