Why Your GPU Utilization Numbers Lie
A factory running all machines doesn't mean all machines are doing useful work. Machines could be waiting for materials, producing defects, or running at half speed. Utilization measures activity, not productivity.
GPU "utilization" from nvidia-smi has the same problem. 90% utilization can mean 90% of time with active kernels—not 90% of compute capacity used. The gap between these numbers is where money burns.
What nvidia-smi Actually Measures
class NvidiaSmiReality:
    """
    nvidia-smi 'GPU Utilization' is misleading.
    """

    what_it_measures = """
    Percentage of time over the last sample period
    during which one or more GPU kernels were executing.

    NOT:
    - Percentage of compute units active
    - Percentage of memory bandwidth used
    - Percentage of theoretical FLOPS achieved
    """

    example = """
    A kernel that uses 1% of GPU compute
    but runs continuously shows as 100% utilization.

    A perfectly optimized kernel that uses 95% of compute
    but has 10% idle time shows as 90% utilization.

    Utilization and efficiency can move in opposite directions.
    """
The Real Metrics That Matter
def meaningful_metrics():
    return {
        "MFU": {
            "name": "Model FLOPS Utilization",
            "definition": "Actual FLOPS / Theoretical peak FLOPS",
            "good_value": ">50% for prefill / compute-bound phases (decode MFU is naturally much lower)",
            "how_to_measure": "Profile with Nsight Compute",
        },
        "Memory_Bandwidth_Utilization": {
            "name": "HBM bandwidth usage",
            "definition": "Actual GB/s / Peak GB/s",
            "good_value": ">60% for decode-bound workloads",
            "relevance": "Decode phase is memory-bound",
        },
        "SM_Efficiency": {
            "name": "Streaming multiprocessors active",
            "definition": "Percentage of SMs actively computing",
            "good_value": ">80%",
            "issue_if_low": "Kernel launch overhead or small batches",
        },
        "Compute_Throughput": {
            "name": "Actual vs. peak Tensor Core usage",
            "definition": "TFLOPS achieved / TFLOPS theoretical",
            "good_value": ">60% for compute-bound phases",
        },
    }
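As a rough sketch of how these ratios come out of numbers you can actually measure, here is a small helper that converts profiler-reported throughput into MFU and bandwidth utilization. The peak figures are assumptions for an H100 SXM; substitute your GPU's datasheet values:

# Sketch: turning measured throughput into the ratios above.
# Peak numbers are placeholders; use your GPU's datasheet values.
def efficiency_ratios(
    measured_tflops: float,      # achieved TFLOPS, e.g. from Nsight Compute
    measured_gbps: float,        # achieved HBM bandwidth, GB/s
    peak_tflops: float = 990.0,  # assumed: H100 SXM dense FP16/BF16
    peak_gbps: float = 3350.0,   # assumed: H100 SXM HBM3 bandwidth
) -> dict:
    return {
        "MFU_percent": 100 * measured_tflops / peak_tflops,
        "bandwidth_util_percent": 100 * measured_gbps / peak_gbps,
    }

# Example: a decode-heavy workload achieving 7 TFLOPS and 2,400 GB/s
# looks terrible by MFU (~0.7%) but respectable by bandwidth (~72%).
print(efficiency_ratios(measured_tflops=7, measured_gbps=2400))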
Why The Gap Exists
def utilization_gap_causes():
    return {
        "kernel_launch_overhead": {
            "description": "Time between kernels",
            "nvidia_smi_view": "Shows as 100% if kernels run continuously",
            "actual_impact": "Can be 10-30% of time for small ops",
        },
        "memory_bound_operations": {
            "description": "Waiting for memory, not computing",
            "nvidia_smi_view": "Shows as utilized (kernel is running)",
            "actual_impact": "Compute units idle waiting for data",
        },
        "small_batch_sizes": {
            "description": "Not enough parallelism to fill the GPU",
            "nvidia_smi_view": "Shows high utilization",
            "actual_impact": "Many SMs sitting idle",
        },
        "synchronization_points": {
            "description": "Waiting for all threads to finish",
            "nvidia_smi_view": "Hidden in the utilization number",
            "actual_impact": "Serialization bottlenecks",
        },
    }
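The first two causes are easy to demonstrate. The sketch below, assuming PyTorch on a CUDA device, runs the same total FLOPs once as a single large matmul and once as thousands of tiny ones; nvidia-smi reports both as fully utilized, but the launch-bound version is dramatically slower per FLOP:

# Sketch: kernel launch overhead vs. a GPU-filling kernel (assumes PyTorch + CUDA).
import torch

def time_ms(fn, iters=20):
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    fn()                        # warm-up
    torch.cuda.synchronize()
    start.record()
    for _ in range(iters):
        fn()
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / iters

big = torch.randn(4096, 4096, device="cuda", dtype=torch.float16)
small = torch.randn(256, 256, device="cuda", dtype=torch.float16)

def one_big():
    _ = big @ big               # one large kernel: fills the SMs

def many_small():
    for _ in range(4096):       # same total FLOPs as one_big,
        _ = small @ small       # but 4096 tiny, launch-bound kernels

print("one big matmul :", time_ms(one_big), "ms per call")
print("many small ones:", time_ms(many_small), "ms per call")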
How to Measure Real Efficiency
def profiling_approaches():
    return {
        "quick_check": {
            "tool": "nvidia-smi dmon",
            "command": "nvidia-smi dmon -s pucvmet",
            "metrics": ["SM%", "Mem%", "Enc%", "Dec%"],
            "limitation": "Still high-level",
        },
        "application_level": {
            "tool": "torch.profiler or nsys",
            "approach": "Profile your actual serving code",
            "what_to_look_for": [
                "Time in forward pass vs. overhead",
                "Memory transfer time",
                "Kernel duration distribution",
            ],
        },
        "deep_dive": {
            "tool": "NVIDIA Nsight Compute",
            "approach": "Profile individual kernels",
            "what_to_look_for": [
                "Memory throughput as % of peak",
                "Compute throughput as % of peak",
                "Warp occupancy",
                "Instruction mix",
            ],
        },
        "inference_specific": {
            "tool": "Framework profilers (vLLM, etc.)",
            "approach": "Built-in metrics",
            "what_to_look_for": [
                "Tokens/second",
                "Time per iteration step",
                "KV cache utilization",
                "Batch efficiency",
            ],
        },
    }
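For the application-level pass, a minimal torch.profiler sketch looks like this; the model and batch below are stand-ins for your actual serving path:

# Sketch: application-level profiling with torch.profiler.
# `model` and `batch` are placeholders for your real serving code.
import torch
from torch.profiler import profile, ProfilerActivity

model = torch.nn.Linear(4096, 4096).half().cuda()
batch = torch.randn(8, 4096, device="cuda", dtype=torch.float16)

with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    record_shapes=True,
) as prof:
    with torch.no_grad():
        for _ in range(20):
            model(batch)
    torch.cuda.synchronize()

# Sort by CUDA time: look for how much is real compute vs. launch gaps,
# memcpy, and swarms of tiny kernels that dominate count but not useful work.
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))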
Tokens Per Second: The Real Metric
class InferenceEfficiency:
    """
    For inference, tokens/second is the ground truth.
    """

    def calculate_efficiency(
        self,
        tokens_per_second: float,
        model_params_billions: float,
        gpu_tflops: float,
    ) -> float:
        # Rough estimate: ~2 FLOPs per parameter per token (forward pass)
        flops_per_token = model_params_billions * 2e9

        # Actual TFLOPS achieved
        actual_tflops = tokens_per_second * flops_per_token / 1e12

        # Efficiency as a percentage of peak
        efficiency = actual_tflops / gpu_tflops * 100
        return efficiency

    def example(self):
        # Llama-70B on an H100
        efficiency = self.calculate_efficiency(
            tokens_per_second=50,
            model_params_billions=70,
            gpu_tflops=990,  # dense FP16
        )
        # = (50 * 140e9 / 1e12) / 990 * 100
        # = 7 / 990 * 100
        # ≈ 0.7%
        #
        # That looks terrible, but it is expected for decode:
        # decode is memory-bound, not compute-bound, so
        # memory bandwidth efficiency is the real metric.
        return efficiency
Memory Bandwidth Is The Real Bottleneck
def memory_efficiency_for_decode():
    """
    The decode phase is memory-bandwidth limited.
    """
    # H100 SXM specs
    h100_bandwidth_tbps = 3.35  # TB/s

    # Llama-70B memory traffic per token:
    # all weights must be read for every decode step
    model_bytes = 70e9 * 2  # 70B params × 2 bytes (FP16)

    # Theoretical max tokens/second for a single request (memory-limited)
    theoretical_tps = h100_bandwidth_tbps * 1e12 / model_bytes
    # = 3.35e12 / 140e9 ≈ 24 tokens/second

    # If you're getting 50 tokens/second with batching, you're exceeding
    # the single-request ceiling: batching amortizes weight loading
    # across requests.
    return {
        "theoretical_single_request": 24,
        "with_batching": "Can be higher because weights are reused across the batch",
        "target": "Measure memory bandwidth utilization, not compute",
    }
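A back-of-the-envelope roofline makes the batching point concrete: weights are streamed once per decode step regardless of batch size, while KV-cache traffic grows with it. The KV-cache figure below is an assumed average for illustration:

# Sketch: roofline-style estimate of decode throughput vs. batch size.
# Assumes weights are streamed once per step and KV cache once per sequence.
def decode_tokens_per_second(
    batch_size: int,
    bandwidth_bytes_per_s: float = 3.35e12,  # H100 SXM HBM3
    weight_bytes: float = 140e9,             # 70B params × 2 bytes (FP16)
    kv_bytes_per_seq: float = 5e9,           # assumed average KV cache per sequence
) -> float:
    bytes_per_step = weight_bytes + batch_size * kv_bytes_per_seq
    steps_per_second = bandwidth_bytes_per_s / bytes_per_step
    return batch_size * steps_per_second     # one token per sequence per step

for bs in (1, 8, 32):
    print(bs, round(decode_tokens_per_second(bs), 1), "tokens/s")
# batch 1  -> ~23 tokens/s (weight streaming dominates)
# batch 8  -> ~149 tokens/s
# batch 32 -> ~357 tokens/s (weight reads amortized across sequences)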
What To Actually Monitor
def production_monitoring():
    return {
        "primary_metrics": [
            "Tokens per second (throughput)",
            "Latency percentiles (P50, P99)",
            "Requests per GPU per hour",
            "Cost per million tokens",
        ],
        "efficiency_indicators": [
            "Average batch size",
            "Queue depth (should be >0 but not huge)",
            "Memory utilization % (should be high)",
        ],
        "warning_signs": [
            "nvidia-smi: 90%, tokens/sec: low → inefficient serving",
            "High memory util, low throughput → need optimization",
            "Low batch sizes → not enough traffic or bad batching",
        ],
    }
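Cost per million tokens falls straight out of throughput and your GPU's hourly price. A quick sketch, with the $/hour figure as an assumption to replace with your actual rate:

# Sketch: cost per million tokens from throughput and GPU price.
# The $/hour figure is an assumption; plug in your real rate.
def cost_per_million_tokens(
    tokens_per_second: float,
    gpu_dollars_per_hour: float = 4.0,  # assumed hourly GPU price
) -> float:
    tokens_per_hour = tokens_per_second * 3600
    return gpu_dollars_per_hour / tokens_per_hour * 1e6

# 50 tokens/s at $4.00/hr          -> ~$22.22 per million tokens
# 500 tokens/s (batched) at $4.00  -> ~$2.22 per million tokens
print(cost_per_million_tokens(50), cost_per_million_tokens(500))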
Don't trust nvidia-smi utilization. Trust tokens per second, cost per token, and latency percentiles. Those metrics tell you if the GPU is doing useful work—not just that it's busy.