Why Your GPU Utilization Numbers Lie

A factory running all machines doesn't mean all machines are doing useful work. Machines could be waiting for materials, producing defects, or running at half speed. Utilization measures activity, not productivity.

GPU "utilization" from nvidia-smi has the same problem. 90% utilization can mean 90% of time with active kernels—not 90% of compute capacity used. The gap between these numbers is where money burns.

What nvidia-smi Actually Measures

class NvidiaSmiReality:
    """
    nvidia-smi 'GPU Utilization' is misleading
    """
    what_it_measures = """
    Percentage of time over the last sample period
    that one or more GPU kernels were executing.

    NOT:
    - Percentage of compute units active
    - Percentage of memory bandwidth used
    - Percentage of theoretical FLOPS achieved
    """

    example = """
    A kernel that uses 1% of GPU compute
    but runs continuously shows as 100% utilization.

    A perfectly optimized kernel that uses 95% of compute
    but has 10% idle time shows as 90% utilization.

    Reported utilization can move in the opposite direction of efficiency.
    """

The Real Metrics That Matter

def meaningful_metrics():
    return {
        "MFU": {
            "name": "Model FLOPS Utilization",
            "definition": "Actual FLOPS / Theoretical Peak FLOPS",
            "good_value": ">50% for inference",
            "how_to_measure": "Profile with NSight Compute",
        },
        "Memory_Bandwidth_Utilization": {
            "name": "HBM Bandwidth Usage",
            "definition": "Actual GB/s / Peak GB/s",
            "good_value": ">60% for decode-bound workloads",
            "relevance": "Decode phase is memory-bound",
        },
        "SM_Efficiency": {
            "name": "Streaming Multiprocessor Active",
            "definition": "Percentage of SMs actively computing",
            "good_value": ">80%",
            "issue_if_low": "Kernel launch overhead or small batches",
        },
        "Compute_Throughput": {
            "name": "Actual vs Peak Tensor Core Usage",
            "definition": "TFLOPS achieved / TFLOPS theoretical",
            "good_value": ">60% for compute-bound phases",
        },
    }
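
None of these per-unit numbers come from plain nvidia-smi. Here's a rough sketch of where to pull them from with DCGM, in the same notes-as-code style; the profiling field IDs are my assumption from DCGM's documented metric set, so verify them against your DCGM version.

def deeper_counters():
    return {
        "tool": "dcgmi dmon (NVIDIA DCGM)",
        "command": "dcgmi dmon -e 1002,1003,1004,1005",
        "fields": {
            1002: "SM active: fraction of time any SM had work resident",
            1003: "SM occupancy: resident warps vs. the maximum possible",
            1004: "Tensor pipe active: how busy the tensor cores actually were",
            1005: "DRAM active: pressure on HBM bandwidth",
        },
        "why_it_matters": "Separates 'a kernel was running' from 'the hardware was busy'",
    }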

Why The Gap Exists

def utilization_gap_causes():
    return {
        "kernel_launch_overhead": {
            "description": "Time between kernels",
            "nvidia_smi_view": "Shows as 100% if kernels run continuously",
            "actual_impact": "Can be 10-30% of time for small ops",
        },
        "memory_bound_operations": {
            "description": "Waiting for memory, not computing",
            "nvidia_smi_view": "Shows as utilized (kernel is running)",
            "actual_impact": "Compute units idle waiting for data",
        },
        "small_batch_sizes": {
            "description": "Not enough parallelism to fill GPU",
            "nvidia_smi_view": "Shows high utilization",
            "actual_impact": "Many SMs sitting idle",
        },
        "synchronization_points": {
            "description": "Waiting for all threads to finish",
            "nvidia_smi_view": "Hidden in utilization",
            "actual_impact": "Serialization bottlenecks",
        },
    }
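
The small-batch cause is easy to reproduce. A rough sketch (assuming PyTorch on a CUDA device) that drives the same FP16 weight matrix at batch 1 and at batch 256: nvidia-smi would read high in both runs, while the achieved TFLOPS differ enormously.

import torch

def achieved_tflops(batch: int, dim: int = 4096, iters: int = 200) -> float:
    x = torch.randn(batch, dim, device="cuda", dtype=torch.float16)
    w = torch.randn(dim, dim, device="cuda", dtype=torch.float16)
    _ = x @ w                               # warm-up so timing skips one-time setup
    torch.cuda.synchronize()
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(iters):
        x @ w                               # 2 * batch * dim^2 FLOPs per matmul
    end.record()
    torch.cuda.synchronize()
    seconds = start.elapsed_time(end) / 1000
    return 2 * batch * dim * dim * iters / seconds / 1e12

print(achieved_tflops(batch=1))    # a handful of TFLOPS: most SMs have nothing to do
print(achieved_tflops(batch=256))  # a large fraction of peak, from the same weights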

How to Measure Real Efficiency

def profiling_approaches():
    return {
        "quick_check": {
            "tool": "nvidia-smi dmon",
            "command": "nvidia-smi dmon -s pucvmet",
            "metrics": ["SM%", "Mem%", "Enc%", "Dec%"],
            "limitation": "Still high-level",
        },
        "application_level": {
            "tool": "torch.profiler or nsys",
            "approach": "Profile your actual serving code",
            "what_to_look_for": [
                "Time in forward pass vs overhead",
                "Memory transfer time",
                "Kernel duration distribution",
            ],
        },
        "deep_dive": {
            "tool": "NVIDIA NSight Compute",
            "approach": "Profile individual kernels",
            "what_to_look_for": [
                "Memory throughput as % of peak",
                "Compute throughput as % of peak",
                "Warp occupancy",
                "Instruction mix",
            ],
        },
        "inference_specific": {
            "tool": "Framework profilers (vLLM, etc.)",
            "approach": "Built-in metrics",
            "what_to_look_for": [
                "Tokens/second",
                "Time per iteration step",
                "KV cache utilization",
                "Batch efficiency",
            ],
        },
    }
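
For the "application_level" row, here's a minimal torch.profiler sketch; the Linear layer and input tensor are stand-ins for your real serving model and request batch.

import torch
from torch.profiler import profile, ProfilerActivity

# Placeholders: swap in your actual model and a representative batch
model = torch.nn.Linear(4096, 4096).cuda().half()
x = torch.randn(8, 4096, device="cuda", dtype=torch.float16)

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
             record_shapes=True) as prof:
    with torch.no_grad():
        model(x)                           # your forward pass / serving step

# Sort by GPU time: swarms of tiny kernels or fat memcpy rows are exactly the
# overhead that a single nvidia-smi percentage hides.
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20))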

Tokens Per Second: The Real Metric

class InferenceEfficiency:
    """
    For inference, tokens/second is the truth
    """
    def calculate_efficiency(
        self,
        tokens_per_second: float,
        model_params_billions: float,
        gpu_tflops: float,
    ) -> float:
        # Rough estimate of FLOPS per token
        # ~2 FLOPS per parameter per token (forward pass)
        flops_per_token = model_params_billions * 2e9

        # Actual TFLOPS achieved
        actual_tflops = tokens_per_second * flops_per_token / 1e12

        # Efficiency
        efficiency = actual_tflops / gpu_tflops * 100

        return efficiency

    def example(self):
        # Llama-70B on H100
        efficiency = self.calculate_efficiency(
            tokens_per_second=50,
            model_params_billions=70,
            gpu_tflops=990,  # FP16
        )
        # = (50 * 140e9 / 1e12) / 990 * 100
        # = 7 / 990 * 100
        # = 0.7%

        # This seems terrible, but it's expected for decode:
        # Decode is memory-bound, not compute-bound
        # Memory bandwidth efficiency is the real metric

Memory Bandwidth Is The Real Bottleneck

def memory_efficiency_for_decode():
    """
    Decode phase is memory-bandwidth limited
    """
    # H100 specs
    h100_bandwidth_tbps = 3.35  # TB/s

    # Llama-70B memory access per token
    model_bytes = 70e9 * 2  # 70B params × 2 bytes (FP16)
    # Must read all weights for each token

    # Theoretical max tokens/second (memory limited)
    theoretical_tps = h100_bandwidth_tbps * 1e12 / model_bytes
    # = 3.35e12 / 140e9 = 24 tokens/second

    # If you're getting 50 tokens/second with batching:
    # You're actually exceeding single-request theoretical!
    # Batching amortizes weight loading across requests

    return {
        "theoretical_single_request": 24,
        "with_batching": "Can be higher due to weight reuse",
        "target": "Measure memory bandwidth utilization, not compute",
    }
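
A companion sketch turns a measured decode rate into a bandwidth-utilization figure, using the same 140 GB weight read and 3.35 TB/s peak as above. The batch size and tokens/second are example inputs, and KV-cache traffic is ignored to keep the arithmetic simple.

def decode_bandwidth_utilization(
    tokens_per_second: float,
    batch_size: int,
    model_bytes: float = 140e9,       # Llama-70B weights in FP16
    peak_bandwidth: float = 3.35e12,  # H100 SXM HBM3, bytes/s
) -> float:
    # Each decode step reads the full weights once, shared across the batch
    steps_per_second = tokens_per_second / batch_size
    achieved_bytes_per_second = steps_per_second * model_bytes
    return achieved_bytes_per_second / peak_bandwidth * 100

# 50 tokens/s at batch 4 -> 12.5 steps/s -> ~1.75 TB/s -> ~52% of peak bandwidth
print(decode_bandwidth_utilization(tokens_per_second=50, batch_size=4))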

What To Actually Monitor

def production_monitoring():
    return {
        "primary_metrics": [
            "Tokens per second (throughput)",
            "Latency percentiles (P50, P99)",
            "Requests per GPU per hour",
            "Cost per million tokens",
        ],
        "efficiency_indicators": [
            "Batch size average",
            "Queue depth (should be >0 but not huge)",
            "Memory utilization % (should be high)",
        ],
        "warning_signs": [
            "nvidia-smi: 90%, tokens/sec: low → inefficient serving",
            "High memory util, low throughput → need optimization",
            "Low batch sizes → not enough traffic or bad batching",
        ],
    }
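
For the cost-per-million-tokens line, the arithmetic is short enough to write down; the hourly price below is an illustrative assumption, not a quote.

def cost_per_million_tokens(gpu_dollars_per_hour: float, tokens_per_second: float) -> float:
    tokens_per_hour = tokens_per_second * 3600
    return gpu_dollars_per_hour / tokens_per_hour * 1e6

# e.g. a $4/hour GPU sustaining 1,000 tokens/s across its batch:
# 3.6M tokens/hour -> about $1.11 per million tokens
print(cost_per_million_tokens(gpu_dollars_per_hour=4.0, tokens_per_second=1000))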

Don't trust nvidia-smi utilization. Trust tokens per second, cost per token, and latency percentiles. Those metrics tell you if the GPU is doing useful work—not just that it's busy.