
Memory Planning for Multi-GPU Deployments

Adding lanes to a highway doesn't eliminate traffic. Merges create bottlenecks, interchanges slow everyone down, and coordination between lanes takes time. Four lanes isn't four times the capacity of one lane.

Multi-GPU inference follows the same pattern. Four GPUs don't give you 4x memory or 4x throughput. Communication overhead, synchronization costs, and activation memory eat into the gains. Understanding where memory actually goes is the difference between capacity planning and capacity guessing.

Memory Breakdown in Multi-GPU Inference

def multi_gpu_memory_breakdown():
    """
    Where memory goes with tensor parallelism
    """
    # Example: 70B model on 4x A100-80GB

    return {
        "model_weights": {
            "total": "140 GB (FP16)",
            "per_gpu": "35 GB (sharded across 4 GPUs)",
            "note": "Weights distribute evenly",
        },

        "kv_cache": {
            "single_gpu_equivalent": "Up to 20 GB",
            "with_4_gpus": "~5 GB per GPU (sharded)",
            "catch": "But need all-reduce communication buffers",
        },

        "activation_memory": {
            "single_gpu": "~10 GB peak",
            "with_tensor_parallel": "~10 GB per GPU",
            "note": "Activations don't shard as well as weights",
        },

        "communication_buffers": {
            "all_reduce_buffers": "2-4 GB per GPU",
            "p2p_buffers": "1-2 GB per GPU",
            "total_overhead": "~5 GB per GPU",
        },

        "summary": {
            "naive_expectation": "4x 80GB = 320GB usable",
            "reality": "Effective ~240GB after overhead",
            "efficiency": "75% of theoretical",
        },
    }
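
To turn that summary into something you can recompute for your own cluster, here is a minimal sketch. The helper name and the default per-GPU overhead are assumptions (roughly activations + communication buffers + system overhead from the breakdown above), not measurements:

def effective_capacity(num_gpus: int,
                       gpu_memory_gb: float = 80.0,
                       per_gpu_overhead_gb: float = 20.0) -> dict:
    """Rough effective memory after per-GPU overhead (activations + comm buffers + system)."""
    raw = num_gpus * gpu_memory_gb
    effective = raw - num_gpus * per_gpu_overhead_gb
    return {
        "raw_gb": raw,
        "effective_gb": effective,
        "efficiency": round(effective / raw, 2),
    }

# 4x A100-80GB: {'raw_gb': 320.0, 'effective_gb': 240.0, 'efficiency': 0.75}
print(effective_capacity(num_gpus=4))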

Communication Costs

def communication_overhead():
    return {
        "all_reduce_operations": {
            "when": "After every attention and FFN block",
            "what": "Sum partial results across GPUs",
            "cost_per_layer": {
                "latency": "50-200 microseconds (NVLink)",
                "latency_pcie": "500-2000 microseconds",
                "bandwidth": "Hidden dimension * batch size * 2 bytes",
            },
        },

        "total_per_forward_pass": {
            "70b_model": "~160 all-reduce operations",
            "latency_contribution": "8-30ms depending on interconnect",
            "as_fraction_of_decode": "20-50% of decode time",
        },

        "memory_for_communication": """
        # Communication requires staging buffers
        buffer_size = hidden_dim * batch_size * dtype_size

        # Double buffering for overlap
        total_comm_memory = buffer_size * 2 * num_layers

        # For 70B with batch=32:
        # 8192 * 32 * 2 * 2 * 80 = ~84 MB per GPU
        # Plus NCCL internal buffers: ~2-4 GB
        """,
    }
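
As a sanity check on those numbers, here is a back-of-envelope sketch. The per-op latency and bandwidth defaults are assumptions taken from the NVLink ranges above:

def all_reduce_time_per_decode_step(num_layers: int = 80,
                                    hidden_dim: int = 8192,
                                    batch_size: int = 32,
                                    dtype_bytes: int = 2,
                                    per_op_latency_s: float = 50e-6,
                                    bandwidth_bytes_per_s: float = 300e9) -> float:
    """Estimate total all-reduce time for one decode step (2 ops per layer)."""
    num_ops = 2 * num_layers                       # after attention and after FFN
    payload_bytes = hidden_dim * batch_size * dtype_bytes
    per_op = per_op_latency_s + payload_bytes / bandwidth_bytes_per_s
    return num_ops * per_op

# ~8 ms per decode step with NVLink-like numbers; much higher over PCIe
print(f"{all_reduce_time_per_decode_step() * 1e3:.1f} ms")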

Planning KV Cache with Tensor Parallelism

class TPKVCachePlanner:
    """
    Plan KV cache capacity with tensor parallelism
    """

    def __init__(self, model_config: dict, tp_degree: int):
        self.num_layers = model_config["num_layers"]
        self.num_heads = model_config["num_attention_heads"]
        self.head_dim = model_config["hidden_size"] // self.num_heads
        self.tp_degree = tp_degree

        # Heads are split across GPUs
        self.heads_per_gpu = self.num_heads // tp_degree

    def kv_cache_per_token_per_gpu(self, dtype_bytes: int = 2) -> int:
        """Memory per token per GPU"""
        # K and V for each layer, for heads on this GPU
        return (
            2 *  # K and V
            self.num_layers *
            self.heads_per_gpu *
            self.head_dim *
            dtype_bytes
        )

    def max_tokens_per_gpu(self, available_memory_gb: float) -> int:
        """Maximum tokens given available memory"""
        available_bytes = available_memory_gb * 1024**3
        bytes_per_token = self.kv_cache_per_token_per_gpu()
        return int(available_bytes / bytes_per_token)

    def recommend(self, kv_gb_per_gpu: float, budget_gb_per_gpu: float = 28.0) -> str:
        """Simple sizing hint against a per-GPU KV cache budget (default is illustrative)."""
        if kv_gb_per_gpu <= 0.85 * budget_gb_per_gpu:
            return "Fits with headroom"
        if kv_gb_per_gpu <= budget_gb_per_gpu:
            return "Fits, but leave headroom for bursts"
        return "Exceeds budget: reduce batch/sequence length or raise TP degree"

    def plan_deployment(self, target_batch: int, target_seq_len: int) -> dict:
        """Plan memory for target workload"""
        tokens_needed = target_batch * target_seq_len
        memory_needed_per_gpu = (
            tokens_needed * self.kv_cache_per_token_per_gpu()
        ) / (1024**3)

        return {
            "kv_cache_per_gpu_gb": memory_needed_per_gpu,
            "total_kv_cache_gb": memory_needed_per_gpu * self.tp_degree,
            "tokens_supported": tokens_needed,
            "recommendation": self.recommend(memory_needed_per_gpu),
        }
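
A quick usage sketch with rough 70B-style numbers (80 layers, 64 attention heads, hidden size 8192; treat these as assumptions). Note the planner assumes full multi-head KV; models using grouped-query attention store far fewer KV heads per token and therefore fit many more tokens:

planner = TPKVCachePlanner(
    model_config={"num_layers": 80, "num_attention_heads": 64, "hidden_size": 8192},
    tp_degree=4,
)

print(planner.kv_cache_per_token_per_gpu())   # 655,360 bytes (~640 KB) per token per GPU
print(planner.max_tokens_per_gpu(28.0))       # ~46K tokens in a 28 GB per-GPU budget
print(planner.plan_deployment(target_batch=32, target_seq_len=1024))
# {'kv_cache_per_gpu_gb': 20.0, 'total_kv_cache_gb': 80.0, 'tokens_supported': 32768, ...}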

Memory Allocation Strategy

def memory_allocation_strategy():
    """
    How to allocate memory across GPUs
    """
    return {
        "static_allocation": {
            "model_weights": {
                "allocation": "At load time",
                "pattern": "Each GPU gets 1/TP_DEGREE of each layer",
                "memory": "Fixed, predictable",
            },
            "communication_buffers": {
                "allocation": "At initialization",
                "size": "Based on max batch size",
                "memory": "Fixed, small",
            },
        },

        "dynamic_allocation": {
            "kv_cache": {
                "allocation": "As requests arrive",
                "pattern": "Paged or block allocation",
                "memory": "Variable, main consumer",
            },
            "activations": {
                "allocation": "Each forward pass",
                "pattern": "Reused across layers",
                "memory": "Peak during compute",
            },
        },

        "reservation_strategy": """
        Per GPU budget (80GB A100):

        Model weights:     35 GB (fixed)
        Comm buffers:       4 GB (fixed)
        Activation peak:   10 GB (reserved)
        System overhead:    3 GB (reserved)
        -------------------------
        For KV cache:      28 GB (usable)

        With a 70B model, TP=4, FP16 KV
        (assumes grouped-query attention with 8 KV heads):
        ~400K tokens total capacity
        = ~12,500 tokens per concurrent request at batch 32
        """,
    }
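
The same budgeting arithmetic as a small helper (a hypothetical name, with the reservation numbers above as assumed defaults); the result plugs straight into TPKVCachePlanner.max_tokens_per_gpu:

def kv_budget_per_gpu_gb(total_gb: float = 80.0,
                         weights_gb: float = 35.0,
                         comm_gb: float = 4.0,
                         activation_gb: float = 10.0,
                         system_gb: float = 3.0) -> float:
    """Per-GPU memory left for KV cache after fixed reservations (illustrative defaults)."""
    return total_gb - weights_gb - comm_gb - activation_gb - system_gb

print(kv_budget_per_gpu_gb())   # 28.0 GB left for KV cache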

Scaling Expectations

def scaling_expectations():
    """
    What to actually expect from adding GPUs
    """
    return {
        "memory_scaling": {
            "model_capacity": "Near-linear (can fit larger models)",
            "kv_cache_total": "~0.8x linear (communication overhead)",
            "effective_batch": "~0.7x linear (synchronization costs)",
        },

        "throughput_scaling": {
            "tokens_per_second": {
                "2_gpus": "1.7x (not 2x)",
                "4_gpus": "3.0x (not 4x)",
                "8_gpus": "5.5x (not 8x)",
            },
            "reason": "Communication time doesn't parallelize",
        },

        "latency_impact": {
            "single_token": "Slightly worse (communication overhead)",
            "full_sequence": "Better (parallel compute > overhead)",
            "sweet_spot": "Longer sequences benefit more",
        },

        "cost_efficiency": """
        Adding GPUs has diminishing returns:

        1 GPU:  1.0x cost, 1.0x throughput = 1.0x efficiency
        2 GPUs: 2.0x cost, 1.7x throughput = 0.85x efficiency
        4 GPUs: 4.0x cost, 3.0x throughput = 0.75x efficiency
        8 GPUs: 8.0x cost, 5.5x throughput = 0.69x efficiency

        Use more GPUs when you need to, not when you can.
        """,
    }
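
The efficiency column is just measured speedup divided by GPU count (cost assumed linear in GPU count). A small helper makes it easy to redo with your own benchmarks; the speedups below are the illustrative ones from the table above:

def scaling_efficiency(speedups: dict) -> dict:
    """Throughput speedup per GPU, assuming cost scales linearly with GPU count."""
    return {n: round(s / n, 2) for n, s in speedups.items()}

print(scaling_efficiency({1: 1.0, 2: 1.7, 4: 3.0, 8: 5.5}))
# {1: 1.0, 2: 0.85, 4: 0.75, 8: 0.69}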

Interconnect Impact

def interconnect_comparison():
    """
    How GPU interconnect affects multi-GPU performance
    """
    return {
        "nvlink": {
            "bandwidth": "600-900 GB/s (bidirectional)",
            "latency": "~5 microseconds",
            "all_reduce_time": "50-100 microseconds for typical tensor",
            "use_case": "Production inference, required for 8+ GPUs",
        },

        "pcie_gen4": {
            "bandwidth": "32 GB/s per GPU",
            "latency": "~10 microseconds",
            "all_reduce_time": "500-2000 microseconds",
            "use_case": "Budget setups, TP <= 2",
        },

        "practical_impact": """
        70B model, batch size 8, 1000 token generation:

        NVLink:
        - Communication overhead: ~5% of total time
        - Decode rate: 40 tokens/sec

        PCIe Gen4:
        - Communication overhead: ~30% of total time
        - Decode rate: 25 tokens/sec

        In this example, NVLink delivers 60% higher decode throughput.
        """,
    }
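
Here is a small sketch for making the same back-of-envelope comparison yourself. The per-token times passed in are hypothetical values chosen to roughly reproduce the figures above, not measurements:

def decode_profile(compute_ms_per_token: float, comm_ms_per_token: float) -> dict:
    """Decode throughput and communication share for one set of assumed per-token times."""
    total_ms = compute_ms_per_token + comm_ms_per_token
    return {
        "tokens_per_sec": round(1000 / total_ms, 1),
        "comm_fraction": round(comm_ms_per_token / total_ms, 2),
    }

print(decode_profile(compute_ms_per_token=23.8, comm_ms_per_token=1.25))  # NVLink-like: ~40 tok/s, ~5% comm
print(decode_profile(compute_ms_per_token=28.0, comm_ms_per_token=12.0))  # PCIe-like: ~25 tok/s, ~30% comm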

Troubleshooting Memory Issues

def troubleshooting_memory():
    return {
        "oom_on_one_gpu": {
            "symptom": "GPU 0 OOMs while others have free memory",
            "causes": [
                "Activation memory concentrated on first GPU",
                "Uneven KV cache allocation",
                "Communication buffers on master GPU",
            ],
            "fixes": [
                "Enable activation checkpointing",
                "Use paged attention for even KV distribution",
                "Check NCCL buffer placement",
            ],
        },

        "lower_than_expected_batch": {
            "symptom": "Can't reach expected batch size",
            "causes": [
                "Forgot communication buffer overhead",
                "Activation memory higher than estimated",
                "Memory fragmentation",
            ],
            "fixes": [
                "Reserve 10-15% headroom",
                "Profile actual memory usage",
                "Enable memory defragmentation",
            ],
        },

        "performance_degradation": {
            "symptom": "Multi-GPU slower than expected",
            "causes": [
                "PCIe bottleneck (not NVLink)",
                "Batch size too small for overhead",
                "Synchronization bugs",
            ],
            "fixes": [
                "Verify interconnect bandwidth",
                "Increase batch size",
                "Profile communication patterns",
            ],
        },
    }
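
For the "profile actual memory usage" fix, a minimal PyTorch sketch that reports per-GPU allocation; it also makes uneven usage (the GPU 0 OOM case) easy to spot. The helper name is hypothetical and assumes PyTorch with CUDA available:

import torch

def per_gpu_memory_report() -> None:
    """Print allocated/reserved/free memory for each visible GPU."""
    for i in range(torch.cuda.device_count()):
        free_bytes, total_bytes = torch.cuda.mem_get_info(i)
        allocated = torch.cuda.memory_allocated(i)
        reserved = torch.cuda.memory_reserved(i)
        print(
            f"GPU {i}: allocated={allocated / 1e9:.1f} GB, "
            f"reserved={reserved / 1e9:.1f} GB, "
            f"free={free_bytes / 1e9:.1f} GB / total={total_bytes / 1e9:.1f} GB"
        )

# Run between batches, or from a debug endpoint, to spot skew across GPUs
per_gpu_memory_report()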

Multi-GPU inference is about trading coordination overhead for total capacity. The overhead is real and unavoidable. Plan for it explicitly, measure it carefully, and scale only when the benefits outweigh the costs.