What Senior Engineers Know About GPU Memory
Closet space is misleading. A 10-cubic-foot closet doesn't hold 10 cubic feet of stuff. Hangers, shelves, and awkward shapes mean you get maybe 6. GPU memory works the same way—theoretical capacity and practical capacity are different numbers.
Senior engineers learn this through expensive production incidents. Here's what they know that junior engineers don't.
Where GPU Memory Actually Goes
def gpu_memory_breakdown(
    model_params_b: float,
    context_length: int,
    batch_size: int,
    num_layers: int = 80,   # Llama-2-70B-class depth
    num_heads: int = 64,    # attention heads; assumes no KV-head sharing (worst case)
    head_dim: int = 128,
):
    """
    Rough memory breakdown for a typical FP16 LLM serving setup.
    """
    bytes_per_param = 2  # FP16

    # Model weights (fixed, always present)
    model_weights_gb = model_params_b * bytes_per_param

    # KV cache (grows with context × batch):
    # per token, each layer stores a K and a V vector of
    # num_heads × head_dim elements at 2 bytes each
    kv_per_token_gb = 2 * num_layers * num_heads * head_dim * bytes_per_param / 1e9
    kv_cache_gb = context_length * batch_size * kv_per_token_gb

    # Activation memory (during the forward pass),
    # roughly proportional to batch size and model size
    activations_gb = model_params_b * 0.02 * batch_size

    # Framework overhead (CUDA context, allocator buffers)
    framework_overhead_gb = 2.0

    return {
        "model_weights": model_weights_gb,  # ~140 GB for 70B FP16
        "kv_cache": kv_cache_gb,
        "activations": activations_gb,
        "framework": framework_overhead_gb,
        "total": model_weights_gb + kv_cache_gb + activations_gb + framework_overhead_gb,
    }

# Example: 70B model, 32K context, batch size 8, full multi-head attention
#   Model weights: 140 GB
#   KV cache:      ~690 GB (varies hugely by architecture; grouped-query attention needs far less)
#   Activations:   ~11 GB
#   Framework:     2 GB
#   Total:         ~840 GB, well beyond a single 8x H100 80GB node, for this one batch
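To get a feel for where your own deployment lands, run the breakdown with your numbers. A minimal usage sketch (the batch sizes below are just illustrative):

# Minimal usage sketch for the breakdown above (illustrative numbers only)
if __name__ == "__main__":
    for batch in (1, 4, 8):
        usage = gpu_memory_breakdown(
            model_params_b=70,       # 70B parameters
            context_length=32_768,   # 32K tokens
            batch_size=batch,
        )
        print(f"batch={batch}:")
        for part, gb in usage.items():
            print(f"  {part:>14}: {gb:7.1f} GB")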
The KV Cache Problem
class KVCacheReality:
    """
    KV cache is the silent memory killer
    """

    explanation = """
    For each token in the context, every layer stores:
      - a Key vector   (hidden_dim elements)
      - a Value vector (hidden_dim elements)
    and this happens separately for every request in the batch.

    32K tokens × 80 layers × 8192 hidden × 2 (K and V) × 2 bytes (FP16)
      = ~86 GB per request!
    """

    why_its_worse_than_expected = [
        "Memory is per-request, not shared",
        "Naive allocators reserve space for max_tokens, not actual length",
        "Fragmentation wastes additional memory",
        "Can't easily swap to CPU (too slow)",
    ]

    def estimate_max_concurrent(
        self,
        gpu_memory_gb: float,
        model_shard_gb: float,
        kv_per_request_gb: float = 86.0,  # 32K context, 70B-class MHA model, FP16
        tp_degree: int = 4,
    ) -> int:
        """How many concurrent requests fit on one GPU of a tensor-parallel group?"""
        available = gpu_memory_gb - model_shard_gb - 2  # framework overhead
        kv_shard_per_request = kv_per_request_gb / tp_degree  # KV cache is sharded too
        return int(available / kv_shard_per_request)

# On an 80GB H100 holding one TP4 shard of a 70B FP16 model:
#   Available: 80 - 35 (weight shard) - 2 = 43 GB
#   KV per 32K request: ~86 GB total, ~21.5 GB per GPU shard
#   Max concurrent 32K requests: about 2 per 4-GPU replica
# That's why long context is expensive
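The per-request cost also swings by an order of magnitude with the attention layout, which is why "varies by architecture" matters so much. A small sketch for computing it for the exact model you serve; the layer and head counts below are illustrative, not tied to any specific checkpoint:

def kv_gb_per_request(context_len: int, num_layers: int, num_kv_heads: int,
                      head_dim: int, bytes_per_elem: int = 2) -> float:
    """KV cache for one request: a K and a V per layer, per KV head, per token."""
    return 2 * num_layers * num_kv_heads * head_dim * bytes_per_elem * context_len / 1e9

# Full multi-head attention (64 KV heads) vs. grouped-query attention (8 KV heads)
# at 32K context in FP16: roughly 86 GB vs. 11 GB per request.
print(kv_gb_per_request(32_768, num_layers=80, num_kv_heads=64, head_dim=128))
print(kv_gb_per_request(32_768, num_layers=80, num_kv_heads=8, head_dim=128))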
Why "Just Add VRAM" Doesn't Work
def vram_scaling_reality():
    return {
        "myth": "More VRAM = more concurrent long-context requests",
        "reality": {
            "problem_1": {
                "issue": "Memory bandwidth doesn't scale",
                "impact": "More memory to read = slower decode",
            },
            "problem_2": {
                "issue": "Attention is O(n²)",
                "impact": "2x context = 4x compute for prefill",
            },
            "problem_3": {
                "issue": "PCIe bottleneck",
                "impact": "Moving data to/from GPU is slow",
            },
        },
        "what_actually_helps": [
            "Tensor parallelism (split across GPUs)",
            "KV cache quantization (INT8 instead of FP16)",
            "Sliding window attention (trade quality for memory)",
            "Smart context management (summarize old context)",
        ],
    }
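That O(n²) point deserves a number. A back-of-the-envelope sketch of attention prefill compute; the constants are rough and ignore everything outside the attention matmuls:

def attention_prefill_flops(context_len: int, num_layers: int = 80,
                            num_heads: int = 64, head_dim: int = 128) -> float:
    """Rough FLOPs for the attention score/value matmuls during prefill.

    QKᵀ and attention × V are each ~2 · n² · d FLOPs per head per layer,
    so attention work grows with the square of the context length.
    """
    d = head_dim
    per_layer = 2 * (2 * context_len**2 * d) * num_heads  # QKᵀ plus A·V
    return per_layer * num_layers

# Doubling the context roughly quadruples attention prefill compute:
print(attention_prefill_flops(16_384) / 1e12, "TFLOPs of attention at 16K")
print(attention_prefill_flops(32_768) / 1e12, "TFLOPs of attention at 32K")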
Memory Management Techniques
class MemoryManagementStrategies:
    """
    What production systems actually do
    """

    paged_attention = {
        "technique": "Allocate KV cache in pages, not contiguously",
        "benefit": "Eliminates fragmentation waste",
        "implementation": "vLLM does this automatically",
        "savings": "2-4x more concurrent requests",
    }

    kv_cache_quantization = {
        "technique": "Store KV cache in 8-bit (FP8 or INT8) instead of FP16",
        "benefit": "2x memory reduction",
        "quality_impact": "Minimal for most tasks",
        "implementation": "vLLM: --kv-cache-dtype fp8; INT8 KV cache in e.g. TensorRT-LLM, LMDeploy",
    }

    prefix_caching = {
        "technique": "Share KV cache for common prefixes",
        "benefit": "N requests with the same system prompt share one KV cache",
        "savings": "Huge for chatbots with fixed system prompts",
        "implementation": "SGLang RadixAttention, vLLM prefix caching",
    }

    dynamic_context = {
        "technique": "Truncate or summarize old context",
        "benefit": "Controls memory growth in long conversations",
        "tradeoff": "May lose important context",
        "implementation": "Application-level logic",
    }
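For reference, here is roughly how those knobs map onto a vLLM engine configuration. This is a sketch assuming a recent vLLM release; the model name is a placeholder, and argument support varies by version, so check the docs for whatever you have installed:

from vllm import LLM, SamplingParams

# Sketch: a memory-conscious vLLM engine for a 70B model on a 4-GPU node.
llm = LLM(
    model="your-org/your-70b-model",  # placeholder: any 70B-class model that supports 32K context
    tensor_parallel_size=4,           # split weights and KV cache across 4 GPUs
    gpu_memory_utilization=0.90,      # leave headroom for CUDA context and allocator buffers
    max_model_len=32_768,             # cap context so the KV budget is predictable
    kv_cache_dtype="fp8",             # 8-bit KV cache: roughly 2x more cache per GB
    enable_prefix_caching=True,       # share KV blocks for identical prompt prefixes
    max_num_seqs=8,                   # bound concurrent sequences to avoid OOM
)

params = SamplingParams(max_tokens=512)
print(llm.generate(["Summarize why long context is memory-hungry."], params))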
Diagnosing Memory Issues
def memory_debugging():
    return {
        "symptoms_and_causes": {
            "OOM during prefill": "Prompt too long for available memory",
            "OOM during decode": "KV cache grew beyond allocation",
            "OOM after many requests": "Memory fragmentation or a leak",
            "Slow performance, high memory": "Thrashing; preemption and eviction happening",
        },
        "diagnostic_commands": {
            "current_usage": "nvidia-smi --query-gpu=memory.used --format=csv",
            "detailed_breakdown": "torch.cuda.memory_stats()",
            "allocation_history": "torch.cuda.memory_snapshot()",
        },
        "vllm_specific": {
            "check_config": "Log the gpu_memory_utilization setting",
            "monitor_kv": "Watch 'GPU KV cache usage' in vLLM's periodic stats log",
            "adjust": "Lower max_num_seqs (or max_model_len) if OOMing",
        },
    }
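On the PyTorch side, a small helper built on the standard torch.cuda APIs makes the allocated-versus-reserved gap, which is where fragmentation hides, visible at a glance:

import torch

def log_gpu_memory(tag: str = "") -> None:
    """Print allocated vs. reserved memory per GPU; a large gap suggests
    allocator caching or fragmentation rather than live tensors."""
    if not torch.cuda.is_available():
        print("no CUDA device visible")
        return
    for dev in range(torch.cuda.device_count()):
        allocated = torch.cuda.memory_allocated(dev) / 1e9
        reserved = torch.cuda.memory_reserved(dev) / 1e9
        total = torch.cuda.get_device_properties(dev).total_memory / 1e9
        print(f"[{tag}] cuda:{dev} allocated={allocated:.1f} GB "
              f"reserved={reserved:.1f} GB total={total:.1f} GB")

# Call before/after suspicious phases, e.g. log_gpu_memory("after prefill")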
The Practical Limits
def practical_limits_table():
    """
    Rough concurrent 70B-model requests by context length.
    Assumes one 4x H100 80GB replica (4-way TP), FP16 weights and KV cache,
    and full multi-head attention; treat these as order-of-magnitude numbers.
    """
    return {
        "2K_context": {"concurrent": 30, "note": "Comfortable"},
        "8K_context": {"concurrent": 8, "note": "Still workable"},
        "16K_context": {"concurrent": 4, "note": "Getting tight"},
        "32K_context": {"concurrent": 2, "note": "Memory-constrained"},
        "64K_context": {"concurrent": 1, "note": "Only with KV quantization or similar tricks"},
        "128K_context": {"concurrent": 0, "note": "Need more GPUs"},
    }

# Key insight: long context is a different regime.
# Short context: optimize for throughput.
# Long context: optimize for memory (or don't serve it).
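One way teams act on that split is to route by request length at the gateway: short requests go to a throughput-tuned pool, long ones to a small memory-tuned pool, and anything beyond what you've provisioned gets rejected. A hypothetical sketch; the pool names, endpoints, and thresholds are made up for illustration:

from dataclasses import dataclass

@dataclass
class Pool:
    name: str
    max_context: int   # longest context this pool is provisioned for
    endpoint: str      # hypothetical backend URL

# Hypothetical deployment: a big throughput pool and a small long-context pool.
POOLS = [
    Pool("throughput", max_context=8_192, endpoint="http://llm-short.internal"),
    Pool("long-context", max_context=32_768, endpoint="http://llm-long.internal"),
]

def route(prompt_tokens: int, max_new_tokens: int) -> Pool:
    """Pick the cheapest pool whose context budget covers the request."""
    needed = prompt_tokens + max_new_tokens
    for pool in POOLS:                      # ordered cheapest-first
        if needed <= pool.max_context:
            return pool
    raise ValueError(f"context of {needed} tokens exceeds what we serve")

print(route(2_000, 500).name)     # -> throughput
print(route(20_000, 2_000).name)  # -> long-context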
Senior engineers know GPU memory isn't about total VRAM—it's about how the memory is used. Long context changes everything. Plan for it specifically, or watch your system OOM under production load.