Adding GPUs Without Linear Speedup

Adding more cooks in a kitchen doesn't proportionally increase meals served. At some point, they're bumping into each other, waiting for equipment, and coordinating who does what. The kitchen becomes the bottleneck, not the cooking.

GPU scaling follows the same curve. Two GPUs might give you 1.85x. Four might give 3.4x. Eight might give 6x. The gap between theoretical and actual throughput widens with each GPU you add. Understanding this curve determines whether scaling is worth the cost.

The Scaling Reality

def scaling_reality():
    """
    Actual throughput scaling vs linear expectation
    """
    measurements = {
        "70b_model": {
            "1_gpu": {"throughput": 1.0, "efficiency": 1.00},
            "2_gpu": {"throughput": 1.85, "efficiency": 0.92},
            "4_gpu": {"throughput": 3.4, "efficiency": 0.85},
            "8_gpu": {"throughput": 6.0, "efficiency": 0.75},
        },
        "breakdown": {
            "lost_to_communication": "15-20%",
            "lost_to_synchronization": "5-10%",
            "lost_to_load_imbalance": "2-5%",
        },
    }

    return {
        "measurements": measurements,
        "takeaway": """
        Scaling efficiency drops with GPU count:
        - 2 GPUs: 90%+ efficiency (good)
        - 4 GPUs: 85% efficiency (acceptable)
        - 8 GPUs: 75% efficiency (expensive)
        - 16 GPUs: 60% efficiency (only if you must)
        """,
    }
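
The efficiency column is just measured speedup divided by the ideal linear speedup (the GPU count). A quick check against the numbers above:

def check_efficiency():
    # Efficiency = measured speedup / ideal speedup (ideal = GPU count)
    measured_speedup = {1: 1.0, 2: 1.85, 4: 3.4, 8: 6.0}
    for gpus, speedup in measured_speedup.items():
        efficiency = speedup / gpus
        print(f"{gpus} GPUs: {speedup:.2f}x speedup, {efficiency:.1%} efficiency")
    # Prints 100.0%, 92.5%, 85.0%, 75.0% -- the efficiency column above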

Where Efficiency Goes

def efficiency_breakdown():
    return {
        "communication_overhead": {
            "what": "Time spent sending data between GPUs",
            "when": "After every attention and FFN layer",
            "magnitude": "10-25% of total time (with NVLink)",
            "scales_with": "Number of GPUs, hidden dimension",
        },

        "synchronization_barriers": {
            "what": "Waiting for all GPUs to reach same point",
            "when": "Before and after collective operations",
            "magnitude": "5-10% of total time",
            "scales_with": "Load variance, GPU count",
        },

        "load_imbalance": {
            "what": "Some GPUs finish earlier, wait for others",
            "when": "Variable-length sequences in same batch",
            "magnitude": "2-10% depending on workload",
            "scales_with": "Sequence length variance",
        },

        "memory_bandwidth_contention": {
            "what": "GPUs competing for memory access",
            "when": "Large batch sizes, memory-bound operations",
            "magnitude": "5-15% in decode phase",
            "scales_with": "Batch size per GPU",
        },
    }
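
As a rough sketch of how these losses compound, treat each one as removing a slice of useful time. The fractions below are illustrative midpoints of the ranges above, not measurements:

def estimate_efficiency(comm=0.15, sync=0.07, imbalance=0.03, contention=0.05):
    """Multiplicative model: each overhead removes a slice of useful time."""
    efficiency = 1.0
    for overhead in (comm, sync, imbalance, contention):
        efficiency *= (1.0 - overhead)
    return efficiency

# ~0.73 with these illustrative midpoints -- the same ballpark as the
# 75% efficiency measured at 8 GPUs above.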

Amdahl's Law in Practice

def amdahls_law_for_inference():
    """
    Sequential portions limit parallel speedup
    """
    return {
        "formula": "Speedup = 1 / (S + P/N)",
        "where": {
            "S": "Sequential fraction (cannot parallelize)",
            "P": "Parallel fraction (can parallelize)",
            "N": "Number of GPUs",
        },

        "inference_breakdown": {
            "parallelizable": {
                "matrix_multiplies": "95% of compute",
                "attention_scores": "Mostly parallel",
            },
            "sequential": {
                "all_reduce": "Must synchronize results",
                "tokenization": "Single-threaded prep",
                "sampling": "Sequential token selection",
            },
        },

        "example": """
        If 10% of work is sequential (S=0.1):

        1 GPU:  Speedup = 1.0x
        2 GPUs: Speedup = 1 / (0.1 + 0.9/2) = 1.82x
        4 GPUs: Speedup = 1 / (0.1 + 0.9/4) = 3.08x
        8 GPUs: Speedup = 1 / (0.1 + 0.9/8) = 4.71x
        ∞ GPUs: Speedup = 1 / 0.1 = 10x (maximum possible)

        Can never exceed 10x no matter how many GPUs.
        """,
    }
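
A minimal helper that reproduces the numbers in the example above:

def amdahl_speedup(sequential_fraction: float, num_gpus: int) -> float:
    """Speedup = 1 / (S + P/N), where P = 1 - S."""
    parallel_fraction = 1.0 - sequential_fraction
    return 1.0 / (sequential_fraction + parallel_fraction / num_gpus)

# With S = 0.1: 2 GPUs -> 1.82x, 4 -> 3.08x, 8 -> 4.71x.
# The limit as N grows is 1 / S = 10x, no matter how many GPUs.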

Batch Size and Scaling

def batch_size_scaling_interaction():
    return {
        "small_batch_penalty": {
            "problem": "Communication overhead dominates small batches",
            "example": """
                Batch=1: Communication takes 5ms, compute takes 5ms
                50% overhead!

                Batch=32: Communication takes 5ms, compute takes 100ms
                5% overhead
            """,
            "conclusion": "Multi-GPU needs larger batches to amortize overhead",
        },

        "minimum_efficient_batch": {
            "2_gpus": "Batch >= 4",
            "4_gpus": "Batch >= 8",
            "8_gpus": "Batch >= 16",
            "rule": "~2x batch per doubling of GPUs",
        },

        "practical_implication": """
        If your workload is low-latency single requests:
        - Multi-GPU adds overhead without benefit
        - Single powerful GPU may be better

        If your workload is high-throughput batch:
        - Multi-GPU can amortize overhead
        - Scaling makes sense
        """,
    }
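
A back-of-the-envelope version of the amortization effect, assuming a fixed per-step communication cost and compute that grows linearly with batch size (a simplification; real compute scales sublinearly, as in the example above):

def communication_overhead(batch_size: int,
                           comm_ms: float = 5.0,
                           compute_ms_per_seq: float = 5.0) -> float:
    """Fraction of each step spent communicating rather than computing."""
    compute_ms = compute_ms_per_seq * batch_size
    return comm_ms / (comm_ms + compute_ms)

# batch=1:  5 / (5 + 5)   = 50% of the step is communication
# batch=32: 5 / (5 + 160) = ~3%; the fixed cost is amortized away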

Scaling Decision Framework

def when_to_scale():
    return {
        "scale_vertically_first": {
            "action": "Use bigger/faster single GPU",
            "when": [
                "Model fits on one GPU",
                "Latency-sensitive workload",
                "Batch size < 8",
            ],
            "why": "No communication overhead",
        },

        "scale_to_2_gpus": {
            "action": "Tensor parallel across 2 GPUs",
            "when": [
                "Model needs > 80GB",
                "Need 1.8x throughput",
                "Have NVLink",
            ],
            "efficiency": "90%+ achievable",
        },

        "scale_to_4_gpus": {
            "action": "TP=4 on single node",
            "when": [
                "Model needs > 160GB",
                "Need ~3x throughput",
                "Batch size >= 8",
            ],
            "efficiency": "85% achievable",
        },

        "scale_to_8_gpus": {
            "action": "TP=8 or TP=4 x PP=2",
            "when": [
                "Largest models (400B+)",
                "Need maximum throughput",
                "Cost secondary to capability",
            ],
            "efficiency": "70-75% typical",
        },

        "beyond_8_gpus": {
            "action": "Multi-node with pipeline parallelism",
            "when": [
                "Models > 600GB",
                "Extreme throughput requirements",
            ],
            "efficiency": "50-60% typical",
            "warning": "Seriously diminishing returns",
        },
    }
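
A hypothetical sketch of this decision tree in code. The function name and memory thresholds are illustrative assumptions (80GB per GPU, matching an H100), not a real library API:

def choose_parallelism(model_memory_gb: float,
                       gpu_memory_gb: float = 80.0,
                       latency_sensitive: bool = False,
                       typical_batch: int = 1) -> str:
    """Rough heuristic mirroring the framework above."""
    if model_memory_gb <= gpu_memory_gb:
        # Fits on one GPU: no communication overhead to pay
        if latency_sensitive or typical_batch < 8:
            return "single GPU"
        return "single GPU, replicate for throughput"
    if model_memory_gb <= 2 * gpu_memory_gb:
        return "TP=2 (expect 90%+ efficiency with NVLink)"
    if model_memory_gb <= 4 * gpu_memory_gb:
        return "TP=4 on one node (expect ~85% efficiency)"
    if model_memory_gb <= 8 * gpu_memory_gb:
        return "TP=8 or TP=4 x PP=2 (expect 70-75% efficiency)"
    return "multi-node pipeline parallelism (expect 50-60% efficiency)"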

Measuring Scaling Efficiency

class ScalingBenchmark:
    """
    Measure actual scaling efficiency
    """

    def measure_throughput(self, num_gpus: int, batch_size: int) -> float:
        """Run the model at this configuration and return tokens/sec.

        Placeholder: implement against your serving stack.
        """
        raise NotImplementedError

    def benchmark_configuration(self, num_gpus: int) -> list:
        """Benchmark throughput across batch sizes at a given GPU count"""
        results = []

        for batch_size in [1, 4, 8, 16, 32]:
            throughput = self.measure_throughput(num_gpus, batch_size)
            results.append({
                "batch_size": batch_size,
                "throughput_tokens_per_sec": throughput,
            })

        return results

    def compute_efficiency(self, baseline: float, scaled: float, num_gpus: int) -> float:
        """Efficiency = actual speedup / theoretical speedup"""
        actual_speedup = scaled / baseline
        theoretical_speedup = num_gpus
        return actual_speedup / theoretical_speedup

    def scaling_report(self) -> dict:
        """Generate full scaling report"""
        baseline = self.benchmark_configuration(1)

        report = {"baseline_1_gpu": baseline}

        for num_gpus in [2, 4, 8]:
            results = self.benchmark_configuration(num_gpus)
            efficiency = self.compute_efficiency(
                baseline[3]["throughput_tokens_per_sec"],  # batch=16
                results[3]["throughput_tokens_per_sec"],
                num_gpus
            )

            report[f"{num_gpus}_gpus"] = {
                "results": results,
                "efficiency": f"{efficiency*100:.0f}%",
            }

        return report
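
Hypothetical usage once measure_throughput is wired up to a real deployment:

# bench = ScalingBenchmark()
# report = bench.scaling_report()
# print(report["4_gpus"]["efficiency"])   # e.g. "85%"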

Cost-Effectiveness Analysis

def cost_effectiveness():
    """
    Is scaling worth the money?
    """
    # Example: H100 at $3/hour
    gpu_cost_per_hour = 3.00

    scenarios = {
        "1_gpu": {
            "cost": 3.00,
            "throughput": 50,  # tokens/sec
            "cost_per_million_tokens": 3.00 / (50 * 3600 / 1_000_000),
        },
        "2_gpus": {
            "cost": 6.00,
            "throughput": 92,  # 1.85x
            "cost_per_million_tokens": 6.00 / (92 * 3600 / 1_000_000),
        },
        "4_gpus": {
            "cost": 12.00,
            "throughput": 170,  # 3.4x
            "cost_per_million_tokens": 12.00 / (170 * 3600 / 1_000_000),
        },
        "8_gpus": {
            "cost": 24.00,
            "throughput": 300,  # 6x
            "cost_per_million_tokens": 24.00 / (300 * 3600 / 1_000_000),
        },
    }

    return {
        "scenarios": scenarios,
        "analysis": """
        Cost per million tokens:
        1 GPU:  $16.67
        2 GPUs: $18.12 (~9% more expensive per token)
        4 GPUs: $19.61 (~18% more expensive per token)
        8 GPUs: $22.22 (~33% more expensive per token)

        Scale for throughput, not cost efficiency.
        Single GPU is most cost-effective per token.
        Multi-GPU is for when you need the throughput.
        """,
    }

Multi-GPU scaling trades cost efficiency for total throughput. Each additional GPU buys less incremental performance than the last. Scale when throughput requirements demand it, not because more GPUs feel better. The single-GPU baseline is your most cost-efficient option.