
The GPU Memory Techniques That Actually Scale

Compression algorithms compete on benchmarks, but they prove themselves in production. A 2% improvement in compression ratio is noise; a 2x improvement changes what's possible. The same applies to GPU memory techniques: some move the needle, others are incremental.

Here's what actually scales.

The Techniques Ranked

def memory_techniques_ranked():
    """
    Ranked by practical impact on concurrent requests
    """
    return {
        "tier_1_transformative": [
            {
                "technique": "Paged KV Cache (PagedAttention)",
                "impact": "2-4x more concurrent requests",
                "complexity": "Use vLLM (automatic)",
                "why": "Eliminates fragmentation waste",
            },
            {
                "technique": "Prefix Caching",
                "impact": "2-10x for shared prefixes",
                "complexity": "Enable in vLLM/SGLang",
                "why": "N requests share 1 prefix cache",
            },
            {
                "technique": "Tensor Parallelism",
                "impact": "Linear memory scaling",
                "complexity": "Framework support",
                "why": "Split model across GPUs",
            },
        ],
        "tier_2_significant": [
            {
                "technique": "KV Cache Quantization (INT8)",
                "impact": "2x cache capacity",
                "complexity": "One config flag",
                "why": "Half the bytes per token",
            },
            {
                "technique": "Model Quantization (INT8/INT4)",
                "impact": "2-4x model memory reduction",
                "complexity": "Quantize and validate",
                "why": "Smaller weights, more room for cache",
            },
        ],
        "tier_3_incremental": [
            {
                "technique": "Dynamic Batching",
                "impact": "10-30% better utilization",
                "complexity": "Framework support",
                "why": "Pack requests efficiently",
            },
            {
                "technique": "Speculative Decoding",
                "impact": "Latency improvement, not memory",
                "complexity": "Need draft model",
                "why": "Trades compute for latency",
            },
        ],
    }

Paged KV Cache Deep Dive

class PagedKVCacheExplained:
    """
    Why this is transformative
    """

    problem = """
    Traditional allocation: Reserve max_tokens for each request.
    Request says max_tokens=2048? Allocate 2048 tokens of cache.
    Actual output: 150 tokens.
    Waste: 1898 tokens × cache_per_token, per request.

    That's ~93% of the allocation wasted, and with 100 concurrent
    requests the absolute waste is 100x larger.
    """

    solution = """
    Paged allocation: Allocate in pages (e.g., 16 tokens each).
    Request generates 150 tokens? Allocate 10 pages.
    No waste from over-allocation.

    Additional benefit: Pages can be non-contiguous.
    No fragmentation from requests of different sizes.
    """

    implementation = """
    # vLLM handles this automatically
    # Key parameters:
    # --block-size 16 (tokens per page, default works well)
    # --gpu-memory-utilization 0.9 (leave 10% headroom)

    # The magic is in the scheduler:
    # - Tracks which pages are used by which requests
    # - Allocates pages as tokens are generated
    # - Frees pages immediately when requests complete
    # - Supports copy-on-write for prefix sharing
    """

    results = {
        "before_paging": "8 concurrent requests (32K context, 70B model)",
        "after_paging": "30+ concurrent requests (same config)",
        "improvement": "3-4x",
    }
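
To make the arithmetic concrete, here is a minimal sketch of page-based allocation. This is not vLLM's implementation, just the bookkeeping; the 320 KB/token figure is an assumption (roughly a 70B-class model with GQA in FP16), so swap in your own model's number.

import math

BLOCK_SIZE = 16                     # tokens per page
KV_BYTES_PER_TOKEN = 320 * 1024     # assumed: ~320 KB/token (70B-class model, GQA, FP16)

def allocated_bytes_traditional(max_tokens: int) -> int:
    # Reserve the full max_tokens worth of cache up front, regardless of actual output.
    return max_tokens * KV_BYTES_PER_TOKEN

def allocated_bytes_paged(actual_tokens: int) -> int:
    # Allocate whole pages only as tokens are actually generated.
    pages = math.ceil(actual_tokens / BLOCK_SIZE)
    return pages * BLOCK_SIZE * KV_BYTES_PER_TOKEN

max_tokens, actual = 2048, 150
trad, paged = allocated_bytes_traditional(max_tokens), allocated_bytes_paged(actual)
print(f"traditional: {trad / 2**20:.0f} MiB, paged: {paged / 2**20:.0f} MiB")
print(f"over-allocation eliminated: {1 - paged / trad:.0%}")   # ~92% for this request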

Prefix Caching Deep Dive

class PrefixCachingExplained:
    """
    When you have shared prefixes (system prompts)
    """

    scenario = """
    All requests start with same 2000-token system prompt.
    Without prefix caching:
    - Each request stores 2000 tokens of KV cache
    - 100 requests = 100 × 2000 = 200,000 tokens

    With prefix caching:
    - System prompt KV cache stored once
    - Each request stores only unique tokens
    - 100 requests = 1 × 2000 + 100 × unique = much less
    """

    implementation = """
    # vLLM prefix caching
    # Enable: --enable-prefix-caching

    # SGLang RadixAttention
    # Enabled by default, more sophisticated caching

    # Key insight:
    # Cache is keyed by token sequence
    # Exact match required for reuse
    # Works for system prompt, few-shot examples, etc.
    """

    when_it_helps = {
        "high_impact": [
            "Chatbots with fixed system prompts",
            "Few-shot prompting with same examples",
            "Document Q&A with same document",
        ],
        "low_impact": [
            "Every request is unique",
            "System prompts vary per user",
            "No common prefix patterns",
        ],
    }
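
A quick sketch of the memory math behind that scenario, using the same assumed ~320 KB/token FP16 cache and an illustrative 300 unique tokens per request:

KV_BYTES_PER_TOKEN = 320 * 1024     # assumed: ~320 KB/token (70B-class model, FP16)

def cache_bytes(num_requests: int, prefix_tokens: int, unique_tokens: int,
                prefix_caching: bool) -> int:
    # Without prefix caching, every request re-stores the shared prefix;
    # with it, the prefix KV cache is stored once and shared by all requests.
    if prefix_caching:
        return (prefix_tokens + num_requests * unique_tokens) * KV_BYTES_PER_TOKEN
    return num_requests * (prefix_tokens + unique_tokens) * KV_BYTES_PER_TOKEN

without = cache_bytes(100, prefix_tokens=2000, unique_tokens=300, prefix_caching=False)
shared = cache_bytes(100, prefix_tokens=2000, unique_tokens=300, prefix_caching=True)
print(f"{without / 2**30:.1f} GiB -> {shared / 2**30:.1f} GiB "
      f"({without / shared:.1f}x less KV cache)")

The ratio scales with how long the shared prefix is relative to each request's unique suffix, which is why the listed impact spans 2-10x.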

KV Cache Quantization

class KVCacheQuantization:
    """
    Simple technique, big impact
    """

    explanation = """
    Default KV cache: FP16 (2 bytes per element)
    Quantized: INT8 (1 byte per element)
    Result: 2x more tokens fit in same memory
    """

    implementation = """
    # vLLM
    # --kv-cache-dtype int8

    # That's it. One flag.
    """

    quality_impact = """
    Surprisingly minimal for most use cases.
    KV cache quantization is less sensitive than weight quantization.

    Why: KV values are attention keys/values, not learned weights.
    They have different statistical properties.

    Test on your workload, but expect <1% quality degradation.
    """

    combined_effect = """
    Paged allocation + KV quantization:
    - 3x from paging (no fragmentation)
    - 2x from quantization (half the bytes)
    - Combined: ~5-6x improvement in practice

    Going from 8 to 40+ concurrent requests.
    """

What Doesn't Scale

def overrated_techniques():
    return {
        "flash_attention": {
            "claim": "Huge memory savings",
            "reality": "Saves activation memory, not KV cache",
            "useful_for": "Training more than inference",
            "memory_impact": "Minimal for serving",
        },
        "gradient_checkpointing": {
            "claim": "Trade compute for memory",
            "reality": "Training technique, not inference",
            "memory_impact": "None for serving",
        },
        "cpu_offloading_for_active_cache": {
            "claim": "Infinite effective memory",
            "reality": "PCIe bandwidth kills latency",
            "useful_for": "Batch processing only",
            "memory_impact": "Technically yes, practically no",
        },
    }
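
The PCIe point is easy to quantify. A rough sketch, assuming ~25 GB/s effective PCIe 4.0 x16 bandwidth, ~2 TB/s HBM bandwidth, and the same ~320 KB/token cache; all three numbers are ballpark assumptions:

KV_BYTES_PER_TOKEN = 320 * 1024     # assumed: ~320 KB/token (70B-class model, FP16)
PCIE_BYTES_PER_S = 25e9             # assumed effective PCIe 4.0 x16 bandwidth
HBM_BYTES_PER_S = 2e12              # assumed GPU memory bandwidth (A100/H100-class)

context_tokens = 8000
cache_bytes = context_tokens * KV_BYTES_PER_TOKEN

# Every decode step has to read this request's entire KV cache once.
pcie_ms = cache_bytes / PCIE_BYTES_PER_S * 1000
hbm_ms = cache_bytes / HBM_BYTES_PER_S * 1000
print(f"per-token cache read: {pcie_ms:.0f} ms over PCIe vs {hbm_ms:.1f} ms from HBM")
# ~80x slower per generated token: fine for offline batch jobs, fatal for interactive latency.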

Implementation Checklist

def scaling_checklist():
    return [
        {
            "step": 1,
            "action": "Use vLLM or SGLang",
            "reason": "Get paged attention automatically",
            "impact": "2-4x",
        },
        {
            "step": 2,
            "action": "Enable prefix caching if applicable",
            "reason": "Shared prefixes = shared cache",
            "impact": "2-10x (workload dependent)",
        },
        {
            "step": 3,
            "action": "Enable KV cache INT8",
            "reason": "2x cache capacity, minimal quality loss",
            "impact": "2x",
        },
        {
            "step": 4,
            "action": "Quantize model if needed",
            "reason": "Smaller model = more room for cache",
            "impact": "2-4x on model memory",
        },
        {
            "step": 5,
            "action": "Add tensor parallelism for large models",
            "reason": "Scale across GPUs",
            "impact": "Linear with GPU count",
        },
    ]
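
Pulled together, the checklist maps to a handful of launch flags. Here is a sketch of one possible vLLM invocation; flag availability and supported KV-cache dtypes vary by vLLM version, and the model name is only an example.

def vllm_launch_command(model: str = "meta-llama/Llama-3.1-70B-Instruct") -> list[str]:
    """
    One way to combine the checklist into a single vLLM launch (illustrative only).
    """
    return [
        "vllm", "serve", model,
        "--tensor-parallel-size", "4",       # step 5: split weights across 4 GPUs
        "--enable-prefix-caching",           # step 2: share the system-prompt cache
        "--kv-cache-dtype", "fp8",           # step 3: 8-bit KV cache
        "--gpu-memory-utilization", "0.9",   # leave headroom for activations
        # step 1 (paged attention) is automatic in vLLM; step 4 would add a
        # pre-quantized checkpoint, e.g. an AWQ or GPTQ build of the model.
    ]

print(" ".join(vllm_launch_command()))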

The techniques that matter most are the ones that address the actual bottleneck—usually KV cache for inference. Start with paging and prefix caching, add quantization if needed, scale with parallelism for large models.