When to Use FP8 for Inference
Hybrid cars represented a transition technology—better than pure gas, not quite electric. FP8 occupies a similar position in the precision spectrum: better memory efficiency than FP16, better quality than INT8.
Unlike hybrid cars, FP8 might be the final destination for many inference workloads.
What FP8 Is
class FP8Explanation:
    """
    8-bit floating point: the best of both worlds
    """

    format = {
        "e4m3": {
            "exponent_bits": 4,
            "mantissa_bits": 3,
            "range": "±448",
            "use_case": "Weights and activations",
        },
        "e5m2": {
            "exponent_bits": 5,
            "mantissa_bits": 2,
            "range": "±57344",
            "use_case": "Gradients (training)",
        },
    }

    comparison = {
        "fp16": {
            "bits": 16,
            "range": "±65504",
            "precision": "Good",
        },
        "fp8_e4m3": {
            "bits": 8,
            "range": "±448",
            "precision": "Moderate (but sufficient)",
        },
        "int8": {
            "bits": 8,
            "range": "[-128, 127]",
            "precision": "Discrete steps",
        },
    }

    key_insight = """
    FP8 preserves floating-point semantics (exponent + mantissa)
    while using only 8 bits.
    INT8 forces everything into fixed-point representation,
    which can be lossy for values with high dynamic range.
    Result: FP8 quality ≈ FP16, FP8 speed ≈ INT8
    """
Hardware Support
def hardware_support():
    return {
        "h100": {
            "native_support": True,
            "fp8_tflops": 1979,
            "fp16_tflops": 990,
            "speedup": "2x theoretical over FP16",
            "availability": "Good",
        },
        "a100": {
            "native_support": False,
            "workaround": "Software emulation (slow)",
            "recommendation": "Use INT8 instead on A100",
        },
        "h200": {
            "native_support": True,
            "performance": "Similar to H100 per chip",
        },
        "consumer_gpus": {
            "native_support": "RTX 40 series (limited)",
            "recommendation": "Check specific model",
        },
    }
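If you want to check this at runtime rather than by GPU name, a small sketch using torch.cuda.get_device_capability works: FP8 tensor cores start at compute capability 8.9 (Ada / RTX 40 series) and 9.0 (Hopper / H100, H200).

# Sketch of a runtime capability check for FP8 tensor-core support.
import torch

def supports_fp8(device: int = 0) -> bool:
    if not torch.cuda.is_available():
        return False
    major, minor = torch.cuda.get_device_capability(device)
    # 8.9 = Ada (RTX 40 series), 9.0 = Hopper (H100/H200)
    return (major, minor) >= (8, 9)

if __name__ == "__main__":
    print("FP8-capable GPU detected:", supports_fp8())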
Quality Comparison
def fp8_quality_data():
    """
    Real benchmark data (approximate)
    """
    return {
        "perplexity": {
            "fp16_baseline": 5.00,
            "fp8": 5.02,
            "int8": 5.08,
            "degradation_fp8": "0.4%",
            "degradation_int8": "1.6%",
        },
        "mmlu_accuracy": {
            "fp16_baseline": 68.0,
            "fp8": 67.7,
            "int8": 67.2,
            "degradation_fp8": "0.4%",
            "degradation_int8": "1.2%",
        },
        "code_pass_rate": {
            "fp16_baseline": 45.0,
            "fp8": 44.5,
            "int8": 43.2,
            "degradation_fp8": "1.1%",
            "degradation_int8": "4.0%",
        },
        "conclusion": """
        FP8 quality is consistently closer to FP16.
        For sensitive tasks like code, FP8 > INT8.
        For tolerant tasks, both are acceptable.
        """,
    }
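The degradation columns are just relative deltas against the FP16 baseline. A hypothetical helper (not part of any library) makes the arithmetic explicit; note that perplexity is lower-is-better, so its sign flips.

# Hypothetical helper: relative degradation vs. the FP16 baseline.
# For lower-is-better metrics (perplexity): (quantized - baseline) / baseline.
# For accuracy-style metrics: (baseline - quantized) / baseline.
def degradation(baseline: float, quantized: float, lower_is_better: bool = False) -> float:
    delta = (quantized - baseline) if lower_is_better else (baseline - quantized)
    return delta / baseline

print(f"{degradation(5.00, 5.02, lower_is_better=True):.1%}")  # FP8 perplexity -> 0.4%
print(f"{degradation(68.0, 67.2):.1%}")                        # INT8 MMLU      -> 1.2%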
When to Use FP8
def fp8_decision_matrix():
    return {
        "use_fp8": [
            {
                "condition": "Have H100 GPUs",
                "reason": "Native support, best performance",
            },
            {
                "condition": "Quality-sensitive tasks",
                "reason": "Better quality than INT8",
            },
            {
                "condition": "Want simple setup",
                "reason": "No complex quantization process",
            },
            {
                "condition": "Mixed precision workflow",
                "reason": "Works well with FP16 components",
            },
        ],
        "use_int8_instead": [
            {
                "condition": "A100 or older GPUs",
                "reason": "No FP8 hardware support",
            },
            {
                "condition": "Extreme memory constraints",
                "reason": "INT4 available, FP4 is not",
            },
            {
                "condition": "Using pre-quantized models",
                "reason": "Most available models are INT8",
            },
        ],
        "use_fp16": [
            {
                "condition": "Maximum quality required",
                "reason": "No precision loss",
            },
            {
                "condition": "Memory not constrained",
                "reason": "No reason to optimize",
            },
        ],
    }
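The matrix collapses into a simple picker if you only care about the common cases. This is a hypothetical sketch, not an exhaustive policy; the GPU names and boolean inputs are assumptions for illustration.

# Hypothetical sketch: collapse the decision matrix above into a simple picker.
def pick_precision(gpu: str, memory_constrained: bool, quality_sensitive: bool) -> str:
    fp8_gpus = {"h100", "h200"}  # native FP8 tensor cores
    if gpu.lower() not in fp8_gpus:
        return "int8"            # A100 or older: no FP8 hardware support
    if not memory_constrained and quality_sensitive:
        return "fp16"            # nothing to optimize, keep full precision
    return "fp8"                 # default on Hopper-class hardware

assert pick_precision("a100", True, True) == "int8"
assert pick_precision("h100", True, False) == "fp8"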
Implementation
class FP8Implementation:
    vllm = """
    # vLLM with FP8
    # Requires an FP8-capable GPU (H100 / Ada) and a recent vLLM release

    # Option 1: Serve a pre-quantized FP8 checkpoint
    # (vLLM picks up the quantization config from the model itself)
    vllm serve model-name-fp8

    # Option 2: Dynamic FP8 quantization of an FP16/BF16 checkpoint
    vllm serve model-name --quantization fp8
    """

    tensorrt_llm = """
    # TensorRT-LLM with FP8
    # Build an engine with FP8 (exact flags vary by TensorRT-LLM version/example)
    python build.py \\
        --model_dir ./model \\
        --dtype float16 \\
        --use_fp8_context_fmha \\
        --fp8_kv_cache
    """

    transformers = """
    # Native transformers (limited support)
    # Most FP8 workflows go through vLLM or TensorRT-LLM
    """
Performance Numbers
def fp8_performance():
    """
    Typical performance comparison on H100
    """
    return {
        "throughput_tokens_per_second": {
            "fp16": 50,
            "fp8": 85,
            "int8": 90,
            "improvement_fp8_over_fp16": "70%",
        },
        "memory_usage_gb": {
            "fp16": 140,  # 70B model
            "fp8": 70,
            "int8": 70,
            "reduction": "50%",
        },
        "latency_ms_ttft": {
            "fp16": 100,
            "fp8": 70,
            "int8": 65,
            "improvement": "30%",
        },
        "note": """
        FP8 and INT8 have similar performance.
        FP8 typically has better quality.
        Choose FP8 when quality matters, INT8 when every ms counts.
        """,
    }
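The memory row follows directly from parameter count times bytes per weight. A back-of-the-envelope helper (illustrative only; it ignores KV cache, activations, and framework overhead):

# Back-of-the-envelope weight memory: parameter count x bytes per parameter.
# Ignores KV cache, activations, and framework overhead.
BYTES_PER_PARAM = {"fp16": 2.0, "fp8": 1.0, "int8": 1.0}

def weight_memory_gb(num_params_billion: float, precision: str) -> float:
    # 1B params at 1 byte each ~= 1 GB
    return num_params_billion * BYTES_PER_PARAM[precision]

print(weight_memory_gb(70, "fp16"))  # ~140 GB, matches the table above
print(weight_memory_gb(70, "fp8"))   # ~70 GB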
Migration Path
def fp8_migration():
    return [
        {
            "step": 1,
            "action": "Verify hardware support",
            "command": "nvidia-smi --query-gpu=compute_cap --format=csv",
            "check": "FP8 needs compute capability >= 8.9 (Ada) or 9.0 (Hopper)",
        },
        {
            "step": 2,
            "action": "Benchmark FP16 baseline",
            "metrics": ["Throughput", "Latency", "Quality scores"],
        },
        {
            "step": 3,
            "action": "Enable FP8 in serving framework",
            "command": "Pass --quantization fp8 (vLLM) or build an FP8 engine (TensorRT-LLM)",
        },
        {
            "step": 4,
            "action": "Run quality benchmarks",
            "check": "Verify < 1% degradation",
        },
        {
            "step": 5,
            "action": "Deploy with monitoring",
            "watch": ["Quality metrics", "User feedback"],
        },
    ]
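Step 4's "< 1% degradation" gate is easy to automate. A hypothetical helper is sketched below; the metric values would come from your own eval harness, and the function name is an assumption.

# Hypothetical gate for step 4: fail the rollout if any metric degrades more than 1%
# relative to the FP16 baseline. Metric values come from your own eval harness.
def passes_quality_gate(baseline: dict, candidate: dict, threshold: float = 0.01) -> bool:
    for metric, base_value in baseline.items():
        drop = (base_value - candidate[metric]) / base_value
        if drop > threshold:
            print(f"FAIL: {metric} degraded {drop:.2%}")
            return False
    return True

fp16_scores = {"mmlu_accuracy": 68.0}
fp8_scores = {"mmlu_accuracy": 67.7}
print(passes_quality_gate(fp16_scores, fp8_scores))  # True: 0.4% is under the 1% gate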
FP8 on H100 is becoming the default choice for production inference. It offers the memory efficiency of INT8 with quality much closer to FP16. If you have H100s, there's rarely a reason not to use it.