Compressing the Cache, Not Just the Model
Photographers compress their raw images for storage but forget about thumbnails and previews. Multiplied across thousands of photos, those small files can end up consuming more space than the raw files themselves.
LLM memory management has a similar blind spot. Teams optimize model weights but ignore the KV cache, and for long-context serving the cache often dominates memory usage.
The Memory Split
def memory_comparison(model_params_b: float, context_length: int, batch_size: int):
    """Where does memory actually go?"""
    # Model weights (FP16): 2 bytes per parameter
    model_gb = model_params_b * 2
    # KV cache (FP16)
    # Approximate: 2 bytes × 2 (K, V) × layers × heads × head_dim × context × batch
    kv_per_token = 0.0025  # GB per token for a 70B-class model
    kv_cache_gb = context_length * batch_size * kv_per_token
    return {
        "model_weights": model_gb,
        "kv_cache": kv_cache_gb,
        "which_is_larger": "kv_cache" if kv_cache_gb > model_gb else "model",
    }

# Example: 70B model, 8K context, 10 concurrent requests
result = memory_comparison(70, 8000, 10)
# Model: 140 GB
# KV cache: 200 GB
# KV cache is larger!
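The hard-coded 0.0025 GB/token is just a convenient round number. If you want to derive it, the per-token figure follows from the architecture; a minimal sketch, assuming a 70B-class model with 80 layers, 64 KV heads, and head dimension 128 (models with grouped-query attention, e.g. 8 KV heads, cut this by 8×):

def kv_bytes_per_token(num_layers: int, num_kv_heads: int, head_dim: int,
                       bytes_per_element: int = 2) -> int:
    # Two tensors (K and V) per layer, one vector per KV head.
    return 2 * num_layers * num_kv_heads * head_dim * bytes_per_element

per_token = kv_bytes_per_token(num_layers=80, num_kv_heads=64, head_dim=128)
print(per_token / 1e9)                 # ~0.0026 GB per token in FP16
print(per_token * 8000 * 10 / 1e9)     # ~210 GB for 8K context × 10 requests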
KV Cache Quantization
class KVCacheQuantization:
    """Reduce KV cache precision from 16 bits to 8 bits per element."""

    implementation = """
    # vLLM (uses an FP8 cache dtype: 1 byte per element, same saving as INT8)
    vllm serve model_name --kv-cache-dtype fp8
    # That's it. One flag for 2x cache capacity.
    """

    before_and_after = {
        "fp16": {
            "bytes_per_element": 2,
            "8K_context_10_requests": "200 GB",
        },
        "int8": {
            "bytes_per_element": 1,
            "8K_context_10_requests": "100 GB",
            "savings": "100 GB",
        },
    }

    quality_impact = """
    KV cache quantization is less sensitive than weight quantization.
    Why:
    - KV values are computed fresh for each request
    - They're intermediate activations, not learned parameters
    - Small errors in K/V have a muted effect after softmax
    - Each token's K/V entries are quantized independently, so errors don't accumulate in the cache itself
    Typical quality degradation: < 1%
    Often undetectable in practical benchmarks.
    """
Why It's Often Overlooked
def why_overlooked():
    return {
        "reason_1": {
            "misconception": "Model weights are the memory consumer",
            "reality": "For long context, KV cache dominates",
        },
        "reason_2": {
            "misconception": "Quantizing the cache would hurt quality",
            "reality": "Quality impact is minimal and well-tested",
        },
        "reason_3": {
            "misconception": "It's complex to implement",
            "reality": "One config flag in modern frameworks",
        },
        "reason_4": {
            "misconception": "Only matters at extreme scale",
            "reality": "Matters whenever context length × batch size is significant",
        },
    }
Combined Optimization
def combined_memory_optimization():
    """Stack multiple techniques for maximum effect."""
    baseline = {
        "model": "70B FP16 = 140 GB",
        "kv_cache": "8K × 10 requests = 200 GB",
        "total": "340 GB",
        "gpus_needed": "5× H100 80GB",
    }
    with_weight_quantization = {
        "model": "70B INT8 = 70 GB",
        "kv_cache": "8K × 10 requests = 200 GB",
        "total": "270 GB",
        "gpus_needed": "4× H100 80GB",
    }
    with_both = {
        "model": "70B INT8 = 70 GB",
        "kv_cache": "8K × 10 requests, INT8 = 100 GB",
        "total": "170 GB",
        "gpus_needed": "3× H100 80GB",
        "savings_vs_baseline": "2 GPUs saved",
    }
    return baseline, with_weight_quantization, with_both
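For concreteness, here's what "both" can look like with vLLM's offline API, under the assumption that you have a pre-quantized INT8 (W8A8) checkpoint available; the checkpoint name below is a placeholder, and argument names can differ between versions.

from vllm import LLM, SamplingParams

llm = LLM(
    model="your-org/llama-70b-int8-w8a8",  # hypothetical pre-quantized checkpoint;
                                           # vLLM reads the weight-quantization scheme
                                           # from the checkpoint's config
    kv_cache_dtype="fp8",                  # 8-bit KV cache: 1 byte per element
    tensor_parallel_size=4,                # adjust to however many GPUs you use
)

outputs = llm.generate(["The KV cache is"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)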
Quality Testing for KV Quantization
def kv_quantization_test_protocol():
    return {
        "test_1": {
            "name": "Perplexity comparison",
            "method": "Compare PPL on a validation set",
            "acceptable_increase": "< 0.1",
        },
        "test_2": {
            "name": "Long-context coherence",
            "method": "Verify that references to early context still resolve",
            "why": "KV cache errors could compound over length",
        },
        "test_3": {
            # A sketch automating this check follows below.
            "name": "Output diff at various lengths",
            "method": "Compare outputs at 1K, 4K, 8K, 16K context",
            "acceptable": "> 95% token match",
        },
        "test_4": {
            "name": "Task-specific benchmarks",
            "method": "Run your actual use-case evaluation",
            "threshold": "No statistically significant degradation",
        },
    }
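Test 3 is the easiest one to automate. A minimal sketch, assuming a generate_tokens(prompt, server=...) helper (hypothetical; wire it to your own FP16-cache and 8-bit-cache deployments) that returns greedy-decoded token IDs:

def token_match_rate(baseline: list[int], quantized: list[int]) -> float:
    # Fraction of positions where greedy outputs agree, over the shorter output.
    n = min(len(baseline), len(quantized))
    if n == 0:
        return 0.0
    return sum(a == b for a, b in zip(baseline[:n], quantized[:n])) / n

def length_sweep(prompts_by_length: dict[int, str], generate_tokens) -> dict[int, float]:
    # Compare the two deployments at each context length (1K, 4K, 8K, 16K, ...).
    results = {}
    for ctx_len, prompt in prompts_by_length.items():
        base = generate_tokens(prompt, server="fp16-kv-cache")    # hypothetical servers
        quant = generate_tokens(prompt, server="int8-kv-cache")
        results[ctx_len] = token_match_rate(base, quant)
    return results

# Flag anything below 0.95, and watch for match rates that decay as context grows.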
When KV Cache Quantization Helps Most
def high_impact_scenarios():
    return {
        "long_context_serving": {
            "context_length": "> 4K tokens",
            "impact": "High - cache is large",
            "recommendation": "Definitely enable",
        },
        "high_concurrency": {
            "concurrent_requests": "> 10",
            "impact": "High - many caches add up",
            "recommendation": "Definitely enable",
        },
        "memory_constrained_gpu": {
            "setup": "Model barely fits",
            "impact": "Critical - enables serving",
            "recommendation": "Required for feasibility",
        },
        "short_context_low_concurrency": {
            "context_length": "< 1K",
            "concurrent_requests": "< 5",
            "impact": "Low - cache is small anyway",
            "recommendation": "Optional, but free performance",
        },
    }
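If you want to turn that table into a quick go/no-go check, here is a rough heuristic of my own; the thresholds mirror the table above, and the default kv_gb_per_token is the same 70B-class approximation used earlier.

def should_quantize_kv_cache(context_length: int, concurrent_requests: int,
                             free_gpu_memory_gb: float,
                             kv_gb_per_token: float = 0.0025) -> str:
    # Estimate the FP16 cache footprint for this workload.
    est_cache_gb = context_length * concurrent_requests * kv_gb_per_token
    if est_cache_gb > free_gpu_memory_gb:
        return "required: the cache won't fit at full precision"
    if context_length > 4000 or concurrent_requests > 10:
        return "definitely enable: the cache is a major memory consumer"
    return "optional: the cache is small, but quantizing it is essentially free"

print(should_quantize_kv_cache(8000, 10, free_gpu_memory_gb=160))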
Implementation Checklist
def kv_quantization_checklist():
    return [
        {
            "step": 1,
            "action": "Measure current KV cache memory usage",
            "command": "Monitor GPU memory while varying context length (see the sketch after this list)",
        },
        {
            "step": 2,
            "action": "Enable 8-bit KV cache",
            "command": "--kv-cache-dtype fp8 (vLLM; other frameworks expose equivalent INT8/FP8 options)",
        },
        {
            "step": 3,
            "action": "Run quality benchmarks",
            "command": "Compare outputs against the FP16 baseline on a test set",
        },
        {
            "step": 4,
            "action": "Measure memory improvement",
            "command": "Same test, observe the memory reduction",
        },
        {
            "step": 5,
            "action": "Deploy with monitoring",
            "command": "Track quality metrics in production",
        },
    ]
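For step 1, a quick way to sample GPU memory while you replay requests at different context lengths. This assumes the pynvml (nvidia-ml-py) bindings and a single GPU; note that engines which pre-allocate the KV cache up front (vLLM does, via gpu_memory_utilization) will show the reservation rather than per-request growth.

import time
import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)   # GPU 0; adjust for multi-GPU setups

def used_gpu_memory_gb() -> float:
    # Total used memory on the device, in GB (weights + cache + workspace).
    return pynvml.nvmlDeviceGetMemoryInfo(handle).used / 1e9

# Sample while replaying requests at 1K, 4K, 8K, ... context lengths.
for _ in range(12):
    print(f"{used_gpu_memory_gb():.1f} GB used")
    time.sleep(5)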
KV cache quantization is one of the highest-ROI optimizations available. Minimal quality impact, significant memory savings, and trivial to enable. If you're not using it, you're leaving performance on the table.