Back to Blog

Knowing If You're Memory or Compute Limited

Trying to make a highway faster by building faster cars doesn't help if the bottleneck is lane capacity. More horsepower doesn't reduce congestion. You need to identify the actual constraint before choosing a solution.

LLM inference oscillates between two fundamentally different bottlenecks: memory bandwidth and compute. The prefill phase is usually compute-bound (processing input tokens). The decode phase is usually memory-bound (reading KV cache). Different bottlenecks require different optimizations.

The Two Bottlenecks

def bottleneck_characteristics():
    return {
        "compute_bound": {
            "definition": "Waiting for math to finish",
            "symptoms": [
                "GPU compute utilization near 100%",
                "Memory bandwidth utilization < 50%",
                "Adding more FLOPs makes it slower",
            ],
            "typical_during": "Prefill phase (processing input)",
            "what_helps": [
                "Faster GPU (more TFLOPS)",
                "Quantization (fewer FLOPs per operation)",
                "Kernel fusion (more operations per memory access)",
            ],
        },

        "memory_bound": {
            "definition": "Waiting for data to move",
            "symptoms": [
                "GPU compute utilization < 50%",
                "Memory bandwidth near saturation",
                "GPU sitting idle between operations",
            ],
            "typical_during": "Decode phase (generating tokens)",
            "what_helps": [
                "Higher memory bandwidth (H100 > A100)",
                "Quantization (smaller values to move)",
                "Batching (amortize memory access)",
            ],
        },
    }
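
A quick way to see the two regimes on a real GPU is to time one op from each end of the spectrum and compare achieved rates against your card's peak TFLOPS and bandwidth. A minimal sketch (the matrix size, iteration count, and the 3x-traffic estimate for the elementwise add are illustrative assumptions, not measurements):

import time
import torch

def achieved_rates(fn, flops, bytes_moved, iters=50):
    """Time a CUDA op and report rough achieved TFLOPS and GB/s."""
    fn()  # warmup, triggers lazy init and caching allocator
    torch.cuda.synchronize()
    start = time.perf_counter()
    for _ in range(iters):
        fn()
    torch.cuda.synchronize()
    dt = (time.perf_counter() - start) / iters
    return flops / dt / 1e12, bytes_moved / dt / 1e9

n = 8192
a = torch.randn(n, n, device="cuda", dtype=torch.float16)
b = torch.randn(n, n, device="cuda", dtype=torch.float16)

# Large GEMM: thousands of FLOPs per byte, should sit near the compute ceiling
t, g = achieved_rates(lambda: a @ b, flops=2 * n**3, bytes_moved=3 * n * n * 2)
print(f"matmul: {t:7.1f} TFLOPS, {g:7.0f} GB/s  (high intensity -> compute ceiling)")

# Elementwise add: ~0.17 FLOPs per byte, should sit near the bandwidth ceiling
t, g = achieved_rates(lambda: a + b, flops=n * n, bytes_moved=3 * n * n * 2)
print(f"add:    {t:7.1f} TFLOPS, {g:7.0f} GB/s  (low intensity  -> bandwidth ceiling)")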

The Prefill vs Decode Split

def prefill_vs_decode():
    return {
        "prefill_phase": {
            "what": "Processing all input tokens at once",
            "characteristics": {
                "batch_size": "Large (all input tokens)",
                "operation": "Attention over full input sequence",
                "bottleneck": "Usually compute-bound",
            },
            "arithmetic_intensity": "High (many FLOPs per byte)",
            "why_compute_bound": """
                Processing N tokens together allows parallel computation.
                Matrix multiplies are large and efficient.
                GPU cores stay busy.
            """,
        },

        "decode_phase": {
            "what": "Generating one token at a time",
            "characteristics": {
                "batch_size": "Small (1 token per request)",
                "operation": "Attention over full KV cache",
                "bottleneck": "Usually memory-bound",
            },
            "arithmetic_intensity": "Low (few FLOPs per byte)",
            "why_memory_bound": """
                Each token requires reading entire KV cache.
                Computation per byte read is minimal.
                Memory bandwidth limits throughput.
            """,
        },
    }
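
To make the decode story concrete, here is a back-of-envelope estimate of the attention part alone: how many bytes the KV cache forces you to read per generated token, and how few FLOPs you get per byte. The layer count, KV-head count, head dimension, and context length below are illustrative 70B-class-with-GQA shapes, not measurements:

def decode_kv_traffic(n_layers=80, n_kv_heads=8, head_dim=128,
                      context_len=4096, bytes_per_elem=2):
    """Bytes read from the KV cache per generated token (per sequence),
    and the resulting FLOPs per byte for the attention step."""
    # K and V each hold layers * kv_heads * head_dim * context_len elements
    kv_elems = 2 * n_layers * n_kv_heads * head_dim * context_len
    kv_bytes = kv_elems * bytes_per_elem
    # Roughly one multiply-add (2 FLOPs) per cached element per new token
    attn_flops = 2 * kv_elems
    return kv_bytes, attn_flops / kv_bytes

bytes_read, intensity = decode_kv_traffic()
print(f"~{bytes_read / 1e9:.2f} GB of KV cache read per token, "
      f"~{intensity:.1f} FLOPs/byte")
# At ~3.35 TB/s that is ~0.4 ms per token just streaming the cache,
# far below the ~590 FLOPs/byte ridge point, hence memory-bound.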

Profiling to Identify Bottleneck

def profiling_approach():
    return {
        "nvidia_nsight": {
            "command": "nsys profile python serve.py",
            "what_to_look_for": {
                "compute_bound": "High sm_efficiency, low mem_utilization",
                "memory_bound": "Low sm_efficiency, high mem_utilization",
            },
        },

        "simple_profiling": """
            import torch
            from torch.profiler import profile, ProfilerActivity

            with profile(
                activities=[ProfilerActivity.CUDA],
                record_shapes=True,
            ) as prof:
                model.generate(input_ids, max_new_tokens=100)

            # Analyze results: look at which kernels dominate GPU time.
            # Mostly large GEMM kernels -> likely compute-bound.
            # Mostly attention-over-KV-cache, elementwise, and copy kernels,
            # or idle gaps between kernels -> likely memory-bound.
            events = prof.key_averages()
            for event in events:
                print(f"{event.key}: {event.cuda_time_total/1000:.1f}ms")
        """,

        "roofline_analysis": {
            "what": "Plot performance vs arithmetic intensity",
            "horizontal_part": "Memory-bound (bandwidth limited)",
            "diagonal_part": "Compute-bound (FLOPS limited)",
            "your_position": "Where your workload lands",
        },
    }
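
As a follow-up to the simple_profiling snippet, two calls on the same prof object give a quicker read than looping over events yourself: the sorted kernel table (GEMM kernels dominating points at compute; attention-over-KV and elementwise/copy kernels, or idle gaps, point at bandwidth) and a Chrome trace for the timeline. Both are standard torch.profiler APIs; decode_trace.json is just an example filename.

# Continues from the `prof` object created in the snippet above.

# Kernel summary sorted by GPU time -- eyeball which kernels dominate.
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=15))

# Timeline for chrome://tracing or Perfetto; gaps between kernels during
# decode are a telltale sign of memory or launch stalls.
prof.export_chrome_trace("decode_trace.json")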

The Roofline Model

def roofline_model():
    """
    Framework for understanding performance limits
    """
    # H100 SXM specifications
    peak_flops = 1979e12  # FP16 tensor FLOP/s (~1979 TFLOPS, sparsity peak)
    peak_bandwidth = 3.35e12  # HBM3 bandwidth, bytes/sec (3.35 TB/s)

    def compute_ceiling() -> float:
        """Attainable FLOP/s when compute-bound"""
        return peak_flops

    def memory_ceiling(intensity: float) -> float:
        """Attainable FLOP/s when memory-bound"""
        return intensity * peak_bandwidth

    def arithmetic_intensity(flops: float, bytes_moved: float) -> float:
        """FLOPs per byte moved"""
        return flops / bytes_moved

    def achieved_performance(intensity: float) -> float:
        """Attainable FLOP/s given arithmetic intensity (roofline)"""
        if intensity < peak_flops / peak_bandwidth:
            # Memory-bound: limited by bandwidth
            return memory_ceiling(intensity)
        else:
            # Compute-bound: limited by peak FLOP/s
            return compute_ceiling()

    return {
        "ridge_point": peak_flops / peak_bandwidth,  # ~590 FLOPs/byte
        "interpretation": """
            Arithmetic intensity < 590: Memory-bound
            Arithmetic intensity > 590: Compute-bound

            Prefill (large batch attention): ~1000 FLOPs/byte -> Compute-bound
            Decode (single token): ~10 FLOPs/byte -> Memory-bound
        """,
    }
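
A small worked example of the ridge point in action, comparing a prefill-style GEMM against a decode-style matrix-vector product over the same fp16 weight matrix. The hidden size and prompt length are illustrative assumptions:

PEAK_FLOPS = 1979e12          # H100 SXM FP16 tensor FLOP/s (sparsity peak)
PEAK_BW = 3.35e12             # H100 SXM HBM3 bytes/s
RIDGE = PEAK_FLOPS / PEAK_BW  # ~590 FLOPs/byte

def attainable(flops, bytes_moved):
    """Roofline: attainable FLOP/s and arithmetic intensity for a kernel."""
    intensity = flops / bytes_moved
    return min(PEAK_FLOPS, intensity * PEAK_BW), intensity

d = 8192  # hidden size (illustrative)

# Prefill: [2048, d] x [d, d] GEMM in fp16
n = 2048
perf, ai = attainable(2 * n * d * d, 2 * (n * d + d * d + n * d))
print(f"prefill GEMM: {ai:.0f} FLOPs/byte, attainable {perf / 1e12:.0f} TFLOPS "
      f"-> {'compute' if ai > RIDGE else 'memory'}-bound")

# Decode: [1, d] x [d, d] GEMV in fp16, dominated by streaming the weights
perf, ai = attainable(2 * d * d, 2 * (d + d * d + d))
print(f"decode GEMV:  {ai:.1f} FLOPs/byte, attainable {perf / 1e12:.1f} TFLOPS "
      f"-> {'compute' if ai > RIDGE else 'memory'}-bound")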

Optimizations by Bottleneck

def optimization_by_bottleneck():
    return {
        "compute_bound_optimizations": {
            "effective": [
                ("Kernel fusion", "Reduce kernel launch overhead"),
                ("Flash Attention", "Fuse attention operations"),
                ("Quantization", "INT8/INT4 reduces compute"),
                ("Tensor cores", "Use dedicated matrix units"),
            ],
            "ineffective": [
                ("Larger batch size", "Already GPU-saturated"),
                ("Higher bandwidth GPU", "Not the bottleneck"),
            ],
        },

        "memory_bound_optimizations": {
            "effective": [
                ("Batching", "Amortize KV cache reads"),
                ("Quantization", "Smaller values = less bandwidth"),
                ("KV cache compression", "Less to read"),
                ("Higher bandwidth GPU", "H100 > A100"),
                ("PagedAttention", "Better memory access patterns"),
            ],
            "ineffective": [
                ("More TFLOPS", "Compute not the bottleneck"),
                ("Kernel fusion", "Already memory-limited"),
            ],
        },
    }
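
For the memory-bound column, quantization's payoff can be bounded from first principles: if decode is limited by streaming weights plus KV cache, the best-case tokens per second is bandwidth divided by bytes touched per step. A rough sketch with illustrative sizes (an 8B-class model that fits on one card; all shapes are assumptions, not measured throughput):

PEAK_BW = 3.35e12  # H100 SXM HBM3, bytes/s

def max_decode_tps(weight_bytes, kv_bytes_per_token, batch_size=1):
    """Bandwidth ceiling on decode throughput: each step streams the
    weights once plus every sequence's KV cache."""
    bytes_per_step = weight_bytes + batch_size * kv_bytes_per_token
    return batch_size * PEAK_BW / bytes_per_step

# Illustrative 8B-class model: 8B params, 32 layers, 8 KV heads,
# head_dim 128, 4096 tokens of context.
weights_fp16 = 8e9 * 2
kv_fp16 = 2 * 32 * 8 * 128 * 4096 * 2  # K and V, fp16

print(f"fp16 weights + fp16 KV: {max_decode_tps(weights_fp16, kv_fp16):.0f} tok/s ceiling")
print(f"int8 weights + int8 KV: {max_decode_tps(weights_fp16 / 2, kv_fp16 / 2):.0f} tok/s ceiling")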

Batch Size and Bottleneck

def batch_size_impact():
    """
    How batch size shifts the bottleneck
    """
    return {
        "small_batch": {
            "typical": "Batch size 1-4",
            "prefill": "Memory-bound (not enough work)",
            "decode": "Severely memory-bound",
            "recommendation": "Increase batch if throughput matters",
        },

        "medium_batch": {
            "typical": "Batch size 8-32",
            "prefill": "Compute-bound (sweet spot)",
            "decode": "Memory-bound but better amortized",
            "recommendation": "Good balance for most workloads",
        },

        "large_batch": {
            "typical": "Batch size 64+",
            "prefill": "Compute-bound",
            "decode": "May become compute-bound",
            "recommendation": "Watch for latency increase",
        },

        "practical_guidance": """
            For latency-sensitive: Keep batch small, accept memory-bound decode
            For throughput: Increase batch until decode becomes compute-bound

            The inflection point varies by model and hardware.
            Profile to find your sweet spot.
        """,
    }
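
One way to reason about where that inflection point sits: during decode the weights are streamed once per step no matter how many sequences share it, so the arithmetic intensity of the weight GEMMs grows roughly linearly with batch size. The sketch below uses that simplification and ignores KV-cache traffic (which still scales with batch), so treat the crossover it prints as optimistic:

def decode_weight_intensity(batch_size, bytes_per_weight=2):
    """Approximate FLOPs per byte for decode-phase weight GEMMs: each weight
    element is read once per step and used in one multiply-add per sequence."""
    return 2 * batch_size / bytes_per_weight

RIDGE = 1979e12 / 3.35e12  # ~590 FLOPs/byte on H100 SXM (fp16, sparsity peak)

for b in (1, 8, 64, 256, 1024):
    ai = decode_weight_intensity(b)
    regime = "compute" if ai > RIDGE else "memory"
    print(f"batch {b:4d}: ~{ai:5.0f} FLOPs/byte ({regime}-bound on the weight GEMMs)")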

Measuring in Production

class BottleneckMonitor:
    """
    Continuous bottleneck identification
    """

    def sample_metrics(self) -> dict:
        """Collect GPU utilization metrics"""
        return {
            "sm_utilization": self.get_sm_utilization(),
            "memory_utilization": self.get_memory_bandwidth_utilization(),
            "memory_throughput_gb": self.get_memory_throughput(),
        }

    def classify_bottleneck(self, metrics: dict) -> str:
        """Determine current bottleneck"""
        sm = metrics["sm_utilization"]
        mem = metrics["memory_utilization"]

        if sm > 80 and mem < 50:
            return "compute_bound"
        elif sm < 50 and mem > 80:
            return "memory_bound"
        elif sm > 70 and mem > 70:
            return "balanced"
        else:
            return "underutilized"  # Neither saturated

    def recommend_optimization(self, bottleneck: str) -> list:
        """Suggest optimizations based on bottleneck"""
        recommendations = {
            "compute_bound": [
                "Enable INT8 quantization",
                "Use Flash Attention if not already",
                "Consider smaller model",
            ],
            "memory_bound": [
                "Increase batch size",
                "Enable KV cache quantization",
                "Consider H100 (3.35 TB/s) vs A100 (2 TB/s)",
            ],
            "balanced": [
                "Good configuration, monitor for changes",
            ],
            "underutilized": [
                "Increase load or reduce resources",
                "Check for CPU bottleneck",
            ],
        }
        return recommendations.get(bottleneck, [])
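
The monitor assumes helpers like get_sm_utilization exist. One way to back them is NVML via the pynvml package, sketched below; note that nvmlDeviceGetUtilizationRates reports coarse busy percentages (the memory figure is time the memory controller was active, not a fraction of peak GB/s), so treat these as rough signals and fall back to Nsight when they are ambiguous:

import pynvml

class NvmlBottleneckMonitor(BottleneckMonitor):
    """Hypothetical NVML-backed implementation of the helpers above."""

    def __init__(self, device_index: int = 0):
        pynvml.nvmlInit()
        self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)

    def get_sm_utilization(self) -> float:
        # Percent of the sample window during which kernels were executing
        return pynvml.nvmlDeviceGetUtilizationRates(self.handle).gpu

    def get_memory_bandwidth_utilization(self) -> float:
        # Percent of time the memory controller was busy (not % of peak GB/s)
        return pynvml.nvmlDeviceGetUtilizationRates(self.handle).memory

    def get_memory_throughput(self) -> float:
        # NVML does not expose achieved GB/s directly; DCGM or Nsight can.
        return float("nan")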

Common Misdiagnoses

def common_mistakes():
    return {
        "assuming_always_compute_bound": {
            "mistake": "Buying more TFLOPS for memory-bound workload",
            "reality": "Decode phase dominates, extra compute sits idle",
            "fix": "Profile first, buy bandwidth if memory-bound",
        },

        "ignoring_phase_differences": {
            "mistake": "Optimizing for average utilization",
            "reality": "Prefill and decode have different bottlenecks",
            "fix": "Profile each phase separately",
        },

        "batch_size_mismatch": {
            "mistake": "Benchmarking with batch=1, serving with batch=32",
            "reality": "Bottleneck shifts with batch size",
            "fix": "Profile at production batch sizes",
        },

        "confusing_utilization_metrics": {
            "mistake": "nvidia-smi shows 100%, must be compute-bound",
            "reality": "nvidia-smi GPU-Util includes memory activity",
            "fix": "Use nsight or torch profiler for accurate breakdown",
        },
    }

Profile before you optimize. Memory-bound workloads need bandwidth improvements. Compute-bound workloads need FLOPS improvements. Getting this wrong means wasted time and money on optimizations that don't address the actual constraint.