Finding the KV Cache Problem Before Your Bill Does

Doctors don't guess—they run tests. A patient complaining of fatigue might have a hundred causes. Blood tests, imaging, and vitals narrow it down systematically.

Memory problems in LLM serving need the same diagnostic discipline. "It's running slow" could be model weights, KV cache, fragmentation, or something else entirely. Here's how to find the actual problem.

The Memory Breakdown

def memory_audit(gpu_memory_gb: float = 80):
    """
    Where does memory actually go?
    Example: 70B model on H100 with tensor parallelism 4
    """
    breakdown = {
        "model_weights": {
            "total": 140,  # 70B × 2 bytes FP16
            "per_gpu": 35,  # 140 / 4 GPUs
            "status": "Fixed, predictable",
        },
        "kv_cache": {
            "per_token_per_request": 0.0025,  # 2.5 MB
            "at_1k_context_1_request": 2.5,
            "at_8k_context_10_requests": 200,  # Often the killer
            "status": "Variable, grows with usage",
        },
        "activations": {
            "per_gpu_during_forward": 4,  # Varies with batch
            "status": "Temporary, freed after forward pass",
        },
        "cuda_context": {
            "overhead": 1.5,
            "status": "Fixed per GPU",
        },
        "allocator_overhead": {
            "typical": 2,
            "status": "Fragmentation and buffers",
        },
    }

    available = gpu_memory_gb - breakdown["model_weights"]["per_gpu"] - \
                breakdown["cuda_context"]["overhead"] - \
                breakdown["allocator_overhead"]["typical"]

    breakdown["available_for_kv_cache"] = available
    # 80 - 35 - 1.5 - 2 = 41.5 GB per GPU for KV cache

    return breakdown

The Diagnostic Steps

class KVCacheDiagnosis:
    """
    Step-by-step diagnosis
    """

    step_1_baseline = """
    1. Measure baseline memory (no requests)

    nvidia-smi --query-gpu=memory.used --format=csv -l 1

    Expected: model_weights + cuda_overhead + allocator
    If higher: Something else consuming memory
    """

    step_2_single_request = """
    2. Send single request, measure growth

    # Before request: X GB
    # After request starts: X + KV_cache GB
    # After request completes: Should return to ~X

    If it doesn't return: Memory leak
    """

    step_3_concurrent_load = """
    3. Test concurrent requests

    Send 10 concurrent requests with 2K context each.
    Expected growth: 10 × 2K × cache_per_token

    If growth >> expected: Something wrong
    """

    step_4_long_context = """
    4. Test long context specifically

    Send 1 request with 16K context.
    Monitor memory continuously.

    Growth should be linear with tokens.
    If it's faster: Possible duplication or leak
    """

Monitoring Commands

def monitoring_commands():
    """
    Practical commands for diagnosis
    """
    return {
        "continuous_memory": {
            "command": "nvidia-smi dmon -s m -d 1",
            "output": "Memory utilization every second",
        },
        "detailed_breakdown": {
            "command": "python -c 'import torch; print(torch.cuda.memory_stats())'",
            "output": "Allocated, reserved, active memory",
        },
        "vllm_specific": {
            "info": "Check vLLM logs for KV cache stats",
            "look_for": [
                "num_batched_tokens",
                "num_running_reqs",
                "gpu_cache_usage",
            ],
        },
        "custom_tracking": """
        # Add to your serving code
        import torch

        def log_memory():
            allocated = torch.cuda.memory_allocated() / 1e9
            reserved = torch.cuda.memory_reserved() / 1e9
            print(f"Allocated: {allocated:.2f}GB, Reserved: {reserved:.2f}GB")
        """,
    }

Common Problems and Solutions

def common_issues():
    return {
        "memory_grows_continuously": {
            "symptom": "Memory increases over time, never releases",
            "likely_cause": "Memory leak or unbounded cache",
            "diagnosis": "Track memory during idle period",
            "solution": "Update framework, check for cache clearing",
        },
        "oom_with_few_requests": {
            "symptom": "OOM crash with 5 concurrent requests",
            "likely_cause": "Long context not accounted for",
            "diagnosis": "Check max_tokens in requests",
            "solution": "Lower max_num_seqs or max_model_len",
        },
        "fragmentation": {
            "symptom": "nvidia-smi shows memory available, still OOM",
            "likely_cause": "Memory fragmentation",
            "diagnosis": "torch.cuda.memory_summary()",
            "solution": "Restart worker, use paged allocation",
        },
        "baseline_too_high": {
            "symptom": "Fresh start uses more memory than expected",
            "likely_cause": "Multiple model copies or cached data",
            "diagnosis": "Check for duplicate model loads",
            "solution": "Verify tensor parallelism config",
        },
    }

The Calculation Worksheet

def kv_cache_calculator(
    model_name: str,
    context_length: int,
    concurrent_requests: int,
    gpu_memory_gb: float,
    tensor_parallel: int,
) -> dict:
    """
    Calculate if your config will fit
    """
    # Model-specific KV cache sizes (approximate)
    kv_per_token = {
        "llama_7b": 0.0005,   # GB
        "llama_13b": 0.001,
        "llama_70b": 0.0025,
        "mixtral_8x7b": 0.002,
    }

    cache_per_token = kv_per_token.get(model_name, 0.001)

    # Total KV cache needed
    total_kv_cache = (
        cache_per_token *
        context_length *
        concurrent_requests
    )

    # Memory per GPU (distributed)
    kv_per_gpu = total_kv_cache / tensor_parallel

    # Available memory (rough estimate)
    model_sizes = {
        "llama_7b": 14,
        "llama_13b": 26,
        "llama_70b": 140,
        "mixtral_8x7b": 90,
    }
    model_per_gpu = model_sizes.get(model_name, 50) / tensor_parallel
    overhead = 4  # CUDA + allocator

    available = gpu_memory_gb - model_per_gpu - overhead

    return {
        "kv_cache_needed_per_gpu": kv_per_gpu,
        "available_per_gpu": available,
        "will_fit": kv_per_gpu < available,
        "headroom_gb": available - kv_per_gpu,
        "max_concurrent": int(available / (cache_per_token * context_length)),
    }

The Alert Thresholds

def recommended_alerts():
    return {
        "memory_utilization": {
            "warning": 80,  # %
            "critical": 90,
            "why": "Leave headroom for bursts",
        },
        "kv_cache_fragmentation": {
            "metric": "reserved - allocated",
            "warning": "2GB",
            "action": "Consider worker restart",
        },
        "concurrent_requests": {
            "track": "requests_in_flight vs max_concurrent",
            "warning": "80% of max",
            "why": "Approaching memory limit",
        },
        "context_length_spikes": {
            "track": "max context_length over 5 minutes",
            "warning": "2x average",
            "why": "Long requests consume disproportionate memory",
        },
    }

The KV cache is often the limiting factor in production. Knowing how to diagnose it means knowing when to add memory, when to reduce concurrency, and when the problem is actually something else entirely.