Knowing If You're Memory or Compute Limited
Trying to make a highway faster by building faster cars doesn't help if the bottleneck is lane capacity. More horsepower doesn't reduce congestion. You need to identify the actual constraint before choosing a solution.
LLM inference oscillates between two fundamentally different bottlenecks: memory bandwidth and compute. The prefill phase is usually compute-bound (processing input tokens). The decode phase is usually memory-bound (reading KV cache). Different bottlenecks require different optimizations.
The Two Bottlenecks
def bottleneck_characteristics():
return {
"compute_bound": {
"definition": "Waiting for math to finish",
"symptoms": [
"GPU compute utilization near 100%",
"Memory bandwidth utilization < 50%",
"Adding more FLOPs makes it slower",
],
"typical_during": "Prefill phase (processing input)",
"what_helps": [
"Faster GPU (more TFLOPS)",
"Quantization (fewer FLOPs per operation)",
"Kernel fusion (more operations per memory access)",
],
},
"memory_bound": {
"definition": "Waiting for data to move",
"symptoms": [
"GPU compute utilization < 50%",
"Memory bandwidth near saturation",
"GPU sitting idle between operations",
],
"typical_during": "Decode phase (generating tokens)",
"what_helps": [
"Higher memory bandwidth (H100 > A100)",
"Quantization (smaller values to move)",
"Batching (amortize memory access)",
],
},
}
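A quick way to watch these symptoms live is to poll NVML. A minimal sketch, assuming pynvml is installed and one GPU; note that NVML's counters are duty cycles (fraction of sample time the unit was active), not fraction of peak throughput, so treat them as a first-pass signal only:

import time
import pynvml

# Poll coarse utilization counters for GPU 0.
pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
for _ in range(10):
    util = pynvml.nvmlDeviceGetUtilizationRates(handle)
    # util.gpu: % of time any kernel was running
    # util.memory: % of time DRAM was being read or written
    print(f"kernel-active: {util.gpu}%  dram-active: {util.memory}%")
    time.sleep(1)
pynvml.nvmlShutdown()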
The Prefill vs Decode Split
def prefill_vs_decode():
return {
"prefill_phase": {
"what": "Processing all input tokens at once",
"characteristics": {
"batch_size": "Large (all input tokens)",
"operation": "Attention over full input sequence",
"bottleneck": "Usually compute-bound",
},
"arithmetic_intensity": "High (many FLOPs per byte)",
"why_compute_bound": """
Processing N tokens together allows parallel computation.
Matrix multiplies are large and efficient.
GPU cores stay busy.
""",
},
"decode_phase": {
"what": "Generating one token at a time",
"characteristics": {
"batch_size": "Small (1 token per request)",
"operation": "Attention over full KV cache",
"bottleneck": "Usually memory-bound",
},
"arithmetic_intensity": "Low (few FLOPs per byte)",
"why_memory_bound": """
Each new token requires reading the model weights and the entire KV cache.
Computation per byte read is minimal.
Memory bandwidth limits throughput.
""",
},
}
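The decode claim is easy to sanity-check with arithmetic: every generated token must stream the weights, plus the KV cache, from HBM at least once. A back-of-envelope sketch, assuming a hypothetical 7B-parameter FP16 model with Llama-7B-like shapes on H100-class bandwidth; the numbers are illustrative, not measurements:

# Decode throughput ceiling at batch size 1, from bandwidth alone.
params = 7e9
weight_bytes = params * 2                  # FP16: 2 bytes per parameter
# KV cache per token: 2 (K and V) * layers * heads * head_dim * 2 bytes.
# Assumed Llama-7B-like shapes: 32 layers, 32 heads, head_dim 128.
kv_bytes_per_token = 2 * 32 * 32 * 128 * 2
seq_len = 4096
bandwidth = 3.35e12                        # H100 SXM, bytes/sec

bytes_per_step = weight_bytes + kv_bytes_per_token * seq_len
print(f"~{bandwidth / bytes_per_step:.0f} tokens/sec upper bound at batch 1")
# Roughly 200 tokens/sec -- and no amount of extra FLOPS raises it.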
Profiling to Identify Bottleneck
def profiling_approach():
return {
"nvidia_nsight": {
"command": "nsys profile python serve.py",
"what_to_look_for": {
"compute_bound": "High sm_efficiency, low mem_utilization",
"memory_bound": "Low sm_efficiency, high mem_utilization",
},
},
"simple_profiling": """
import torch
from torch.profiler import profile, ProfilerActivity
with profile(
activities=[ProfilerActivity.CUDA],
record_shapes=True,
) as prof:
model.generate(input_ids, max_new_tokens=100)
# Rough heuristic: look at which kernels dominate GPU time.
# Large GEMM kernels dominating suggests compute-bound;
# elementwise/copy/decode-attention kernels suggest memory-bound.
for event in prof.key_averages():
    print(f"{event.key}: {event.cuda_time_total / 1000:.1f}ms")
""",
"roofline_analysis": {
"what": "Plot performance vs arithmetic intensity",
"horizontal_part": "Memory-bound (bandwidth limited)",
"diagonal_part": "Compute-bound (FLOPS limited)",
"your_position": "Where your workload lands",
},
}
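To see the split without a full profiler, a rough microbenchmark can compare a decode-like matrix-vector product against a prefill-like matrix multiply. A sketch assuming PyTorch and a CUDA GPU; the shapes are arbitrary:

import torch

def time_op(fn, iters=50):
    # Time a CUDA op with events; fn is called once to warm up.
    fn()
    torch.cuda.synchronize()
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(iters):
        fn()
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / iters / 1e3  # seconds per call

w = torch.randn(8192, 8192, dtype=torch.float16, device="cuda")
for name, m in [("decode-like (M=1)", 1), ("prefill-like (M=4096)", 4096)]:
    x = torch.randn(m, 8192, dtype=torch.float16, device="cuda")
    t = time_op(lambda: x @ w)
    flops = 2 * m * 8192 * 8192
    bytes_moved = 2 * (x.numel() + w.numel() + m * 8192)  # FP16 in + out
    print(f"{name}: {flops / t / 1e12:.1f} TFLOPS, {bytes_moved / t / 1e9:.0f} GB/s")

On recent hardware the M=1 case should report a small fraction of peak TFLOPS but GB/s near the bandwidth spec, and the M=4096 case the reverse.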
The Roofline Model
def roofline_model():
"""
Framework for understanding performance limits
"""
    # H100 SXM specifications (dense FP16; the often-quoted ~1979 TFLOPS
    # figure assumes 2:4 sparsity, which inference generally doesn't use)
    peak_flops = 989e12       # ~989 TFLOPS
    peak_bandwidth = 3.35e12  # ~3.35 TB/s

    def arithmetic_intensity(flops: float, bytes_moved: float) -> float:
        """FLOPs performed per byte moved to/from HBM"""
        return flops / bytes_moved

    def achieved_performance(intensity: float) -> float:
        """Roofline: capped by whichever ceiling is lower"""
        # Below the ridge point, bandwidth limits you; above it, FLOPS do.
        return min(peak_flops, intensity * peak_bandwidth)

    return {
        "ridge_point": peak_flops / peak_bandwidth,  # ~295 FLOPs/byte
        "interpretation": """
        Arithmetic intensity < ~295: Memory-bound
        Arithmetic intensity > ~295: Compute-bound
        Prefill (large batch attention): ~1000 FLOPs/byte -> Compute-bound
        Decode (single token): ~10 FLOPs/byte -> Memory-bound
        """,
}
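Plugging the two matmul shapes from the benchmark above into the roofline formula predicts the same split before you ever run a kernel. A standalone sketch using the H100 numbers from the model above:

# Roofline check for a decode-like GEMV vs a prefill-like GEMM.
PEAK_FLOPS = 989e12   # dense FP16, H100 SXM
PEAK_BW = 3.35e12     # bytes/sec

def attainable(intensity):
    # min(compute roof, bandwidth roof at this intensity)
    return min(PEAK_FLOPS, intensity * PEAK_BW)

for name, m in [("decode-like (M=1)", 1), ("prefill-like (M=4096)", 4096)]:
    k = n = 8192
    flops = 2 * m * k * n
    bytes_moved = 2 * (m * k + k * n + m * n)  # FP16 inputs + output
    ai = flops / bytes_moved
    print(f"{name}: {ai:.0f} FLOPs/byte -> ceiling {attainable(ai) / 1e12:.1f} TFLOPS")
# M=1 lands far left of the ridge point (~295 FLOPs/byte); M=4096 far right.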
Optimizations by Bottleneck
def optimization_by_bottleneck():
return {
"compute_bound_optimizations": {
"effective": [
("Kernel fusion", "Reduce kernel launch overhead"),
("Flash Attention", "Fuse attention operations"),
("Quantization", "INT8/INT4 reduces compute"),
("Tensor cores", "Use dedicated matrix units"),
],
"ineffective": [
("Larger batch size", "Already GPU-saturated"),
("Higher bandwidth GPU", "Not the bottleneck"),
],
},
"memory_bound_optimizations": {
"effective": [
("Batching", "Amortize KV cache reads"),
("Quantization", "Smaller values = less bandwidth"),
("KV cache compression", "Less to read"),
("Higher bandwidth GPU", "H100 > A100"),
("PagedAttention", "Better memory access patterns"),
],
"ineffective": [
("More TFLOPS", "Compute not the bottleneck"),
("Kernel fusion", "Already memory-limited"),
],
},
}
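The quantization entry in the memory-bound column pays off in direct proportion to bytes saved. An illustrative sketch, reusing the hypothetical 7B FP16 model from earlier:

# Why quantization helps memory-bound decode: fewer bytes to stream per step.
# Illustrative numbers only (hypothetical 7B model, H100-class bandwidth).
bandwidth = 3.35e12
params = 7e9
for fmt, bytes_per_param in [("fp16", 2.0), ("int8", 1.0), ("int4", 0.5)]:
    ceiling = bandwidth / (params * bytes_per_param)
    print(f"{fmt}: <= {ceiling:.0f} tokens/sec per sequence (weights only)")
# fp16 ~239, int8 ~479, int4 ~957: the ceiling scales inversely with bytes.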
Batch Size and Bottleneck
def batch_size_impact():
"""
How batch size shifts the bottleneck
"""
return {
"small_batch": {
"typical": "Batch size 1-4",
"prefill": "Memory-bound (not enough work)",
"decode": "Severely memory-bound",
"recommendation": "Increase batch if throughput matters",
},
"medium_batch": {
"typical": "Batch size 8-32",
"prefill": "Compute-bound (sweet spot)",
"decode": "Memory-bound but better amortized",
"recommendation": "Good balance for most workloads",
},
"large_batch": {
"typical": "Batch size 64+",
"prefill": "Compute-bound",
"decode": "May become compute-bound",
"recommendation": "Watch for latency increase",
},
"practical_guidance": """
For latency-sensitive: Keep batch small, accept memory-bound decode
For throughput: Increase batch until decode becomes compute-bound
The inflection point varies by model and hardware.
Profile to find your sweet spot.
""",
}
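That inflection point can be modeled with the same roofline arithmetic: per decode step, weight reads are shared across the batch while compute grows linearly with it. A toy sketch under those simplifications (KV-cache reads, which also grow with batch, are ignored, so the real crossover comes later than this suggests):

# Toy model of how batch size shifts decode from memory- to compute-bound.
PEAK_FLOPS = 989e12
PEAK_BW = 3.35e12
weight_bytes = 7e9 * 2          # hypothetical 7B model, FP16
flops_per_token = 2 * 7e9       # ~2 FLOPs per parameter per token

for batch in [1, 8, 32, 128, 512]:
    mem_floor = weight_bytes / PEAK_BW               # shared across batch
    compute_floor = batch * flops_per_token / PEAK_FLOPS  # grows with batch
    step_time = max(mem_floor, compute_floor)
    bound = "memory" if mem_floor > compute_floor else "compute"
    print(f"batch {batch:>3}: {batch / step_time:>8.0f} tokens/sec ({bound}-bound)")
# Throughput scales almost linearly with batch until the compute floor wins.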
Measuring in Production
class BottleneckMonitor:
"""
Continuous bottleneck identification
"""
def sample_metrics(self) -> dict:
"""Collect GPU utilization metrics"""
return {
"sm_utilization": self.get_sm_utilization(),
"memory_utilization": self.get_memory_bandwidth_utilization(),
"memory_throughput_gb": self.get_memory_throughput(),
}
def classify_bottleneck(self, metrics: dict) -> str:
"""Determine current bottleneck"""
sm = metrics["sm_utilization"]
mem = metrics["memory_utilization"]
if sm > 80 and mem < 50:
return "compute_bound"
elif sm < 50 and mem > 80:
return "memory_bound"
elif sm > 70 and mem > 70:
return "balanced"
else:
return "underutilized" # Neither saturated
def recommend_optimization(self, bottleneck: str) -> list:
"""Suggest optimizations based on bottleneck"""
recommendations = {
"compute_bound": [
"Enable INT8 quantization",
"Use Flash Attention if not already",
"Consider smaller model",
],
"memory_bound": [
"Increase batch size",
"Enable KV cache quantization",
"Consider H100 (3.35 TB/s) vs A100 (2 TB/s)",
],
"balanced": [
"Good configuration, monitor for changes",
],
"underutilized": [
"Increase load or reduce resources",
"Check for CPU bottleneck",
],
}
return recommendations.get(bottleneck, [])
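A usage sketch; the metric values are invented for illustration, and in practice sample_metrics would poll the NVML counters shown earlier:

# Hypothetical reading: low SM activity, saturated memory traffic.
monitor = BottleneckMonitor()
metrics = {"sm_utilization": 35, "memory_utilization": 92, "memory_throughput_gb": 2900}
state = monitor.classify_bottleneck(metrics)
print(state)                                  # -> "memory_bound"
print(monitor.recommend_optimization(state))  # -> ["Increase batch size", ...]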
Common Misdiagnoses
def common_mistakes():
return {
"assuming_always_compute_bound": {
"mistake": "Buying more TFLOPS for memory-bound workload",
"reality": "Decode phase dominates, extra compute sits idle",
"fix": "Profile first, buy bandwidth if memory-bound",
},
"ignoring_phase_differences": {
"mistake": "Optimizing for average utilization",
"reality": "Prefill and decode have different bottlenecks",
"fix": "Profile each phase separately",
},
"batch_size_mismatch": {
"mistake": "Benchmarking with batch=1, serving with batch=32",
"reality": "Bottleneck shifts with batch size",
"fix": "Profile at production batch sizes",
},
"confusing_utilization_metrics": {
"mistake": "nvidia-smi shows 100%, must be compute-bound",
"reality": "nvidia-smi GPU-Util includes memory activity",
"fix": "Use nsight or torch profiler for accurate breakdown",
},
}
Profile before you optimize. Memory-bound workloads need bandwidth improvements. Compute-bound workloads need FLOPS improvements. Getting this wrong means wasted time and money on optimizations that don't address the actual constraint.