When to Use FP8 for Inference
Hybrid cars represented a transition technology—better than pure gas, not quite electric. FP8 occupies a similar position in the precision spectrum: better memory efficiency than FP16, better quality than INT8.
Unlike hybrid cars, FP8 might be the final destination for many inference workloads.
What FP8 Is
class FP8Explanation:
    """
    8-bit floating point: the best of both worlds
    """

    format = {
        "e4m3": {
            "exponent_bits": 4,
            "mantissa_bits": 3,
            "range": "±448",
            "use_case": "Weights and activations",
        },
        "e5m2": {
            "exponent_bits": 5,
            "mantissa_bits": 2,
            "range": "±57344",
            "use_case": "Gradients (training)",
        },
    }

    comparison = {
        "fp16": {
            "bits": 16,
            "range": "±65504",
            "precision": "Good",
        },
        "fp8_e4m3": {
            "bits": 8,
            "range": "±448",
            "precision": "Moderate (but sufficient)",
        },
        "int8": {
            "bits": 8,
            "range": "[-128, 127]",
            "precision": "Discrete steps",
        },
    }

    key_insight = """
    FP8 preserves floating-point semantics (exponent + mantissa)
    while using only 8 bits.
    INT8 forces everything into fixed-point representation,
    which can be lossy for values with high dynamic range.
    Result: FP8 quality ≈ FP16, FP8 speed ≈ INT8
    """
Hardware Support
def hardware_support():
    return {
        "h100": {
            "native_support": True,
            "fp8_tflops": 1979,
            "fp16_tflops": 990,
            "speedup": "2x theoretical over FP16",
            "availability": "Good",
        },
        "a100": {
            "native_support": False,
            "workaround": "Software emulation (slow)",
            "recommendation": "Use INT8 instead on A100",
        },
        "h200": {
            "native_support": True,
            "performance": "Similar to H100 per chip",
        },
        "consumer_gpus": {
            "native_support": "RTX 40 series (limited)",
            "recommendation": "Check specific model",
        },
    }
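If you want to check this at runtime rather than by GPU name, a small sketch using torch.cuda.get_device_capability works: FP8 tensor cores start at compute capability 8.9 (Ada / RTX 40 series) and 9.0 (Hopper / H100, H200).

# Sketch of a runtime capability check for FP8 tensor-core support.
import torch

def supports_fp8(device: int = 0) -> bool:
    if not torch.cuda.is_available():
        return False
    major, minor = torch.cuda.get_device_capability(device)
    # 8.9 = Ada (RTX 40 series), 9.0 = Hopper (H100/H200)
    return (major, minor) >= (8, 9)

if __name__ == "__main__":
    print("FP8-capable GPU detected:", supports_fp8())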
Quality Comparison
def fp8_quality_data():
    """
    Real benchmark data (approximate)
    """
    return {
        "perplexity": {
            "fp16_baseline": 5.00,
            "fp8": 5.02,
            "int8": 5.08,
            "degradation_fp8": "0.4%",
            "degradation_int8": "1.6%",
        },
        "mmlu_accuracy": {
            "fp16_baseline": 68.0,
            "fp8": 67.7,
            "int8": 67.2,
            "degradation_fp8": "0.4%",
            "degradation_int8": "1.2%",
        },
        "code_pass_rate": {
            "fp16_baseline": 45.0,
            "fp8": 44.5,
            "int8": 43.2,
            "degradation_fp8": "1.1%",
            "degradation_int8": "4.0%",
        },
        "conclusion": """
        FP8 quality is consistently closer to FP16.
        For sensitive tasks like code, FP8 > INT8.
        For tolerant tasks, both are acceptable.
        """,
    }
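The degradation columns are just relative deltas against the FP16 baseline. A hypothetical helper (not part of any library) makes the arithmetic explicit; note that perplexity is lower-is-better, so its sign flips.

# Hypothetical helper: relative degradation vs. the FP16 baseline.
# For lower-is-better metrics (perplexity): (quantized - baseline) / baseline.
# For accuracy-style metrics: (baseline - quantized) / baseline.
def degradation(baseline: float, quantized: float, lower_is_better: bool = False) -> float:
    delta = (quantized - baseline) if lower_is_better else (baseline - quantized)
    return delta / baseline

print(f"{degradation(5.00, 5.02, lower_is_better=True):.1%}")  # FP8 perplexity -> 0.4%
print(f"{degradation(68.0, 67.2):.1%}")                        # INT8 MMLU      -> 1.2%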
When to Use FP8
def fp8_decision_matrix():
    return {
        "use_fp8": [
            {
                "condition": "Have H100 GPUs",
                "reason": "Native support, best performance",
            },
            {
                "condition": "Quality-sensitive tasks",
                "reason": "Better quality than INT8",
            },
            {
                "condition": "Want simple setup",
                "reason": "No complex quantization process",
            },
            {
                "condition": "Mixed precision workflow",
                "reason": "Works well with FP16 components",
            },
        ],
        "use_int8_instead": [
            {
                "condition": "A100 or older GPUs",
                "reason": "No FP8 hardware support",
            },
            {
                "condition": "Extreme memory constraints",
                "reason": "INT4 available, FP4 is not",
            },
            {
                "condition": "Using pre-quantized models",
                "reason": "Most available models are INT8",
            },
        ],
        "use_fp16": [
            {
                "condition": "Maximum quality required",
                "reason": "No precision loss",
            },
            {
                "condition": "Memory not constrained",
                "reason": "No reason to optimize",
            },
        ],
    }
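The matrix collapses into a simple picker if you only care about the common cases. This is a hypothetical sketch, not an exhaustive policy; the GPU names and boolean inputs are assumptions for illustration.

# Hypothetical sketch: collapse the decision matrix above into a simple picker.
def pick_precision(gpu: str, memory_constrained: bool, quality_sensitive: bool) -> str:
    fp8_gpus = {"h100", "h200"}  # native FP8 tensor cores
    if gpu.lower() not in fp8_gpus:
        return "int8"            # A100 or older: no FP8 hardware support
    if not memory_constrained and quality_sensitive:
        return "fp16"            # nothing to optimize, keep full precision
    return "fp8"                 # default on Hopper-class hardware

assert pick_precision("a100", True, True) == "int8"
assert pick_precision("h100", True, False) == "fp8"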
Implementation
class FP8Implementation:
    vllm = """
    # vLLM with FP8
    # Requires an FP8-capable GPU (H100 / Ada) and a recent vLLM release

    # Option 1: Serve a pre-quantized FP8 checkpoint
    # (vLLM picks up the quantization config from the model itself)
    vllm serve model-name-fp8

    # Option 2: Dynamic FP8 quantization of an FP16/BF16 checkpoint
    vllm serve model-name --quantization fp8
    """

    tensorrt_llm = """
    # TensorRT-LLM with FP8
    # Build an engine with FP8 (exact flags vary by TensorRT-LLM version/example)
    python build.py \\
        --model_dir ./model \\
        --dtype float16 \\
        --use_fp8_context_fmha \\
        --fp8_kv_cache
    """

    transformers = """
    # Native transformers (limited support)
    # Most FP8 workflows go through vLLM or TensorRT-LLM
    """
Performance Numbers
def fp8_performance():
    """
    Typical performance comparison on H100
    """
    return {
        "throughput_tokens_per_second": {
            "fp16": 50,
            "fp8": 85,
            "int8": 90,
            "improvement_fp8_over_fp16": "70%",
        },
        "memory_usage_gb": {
            "fp16": 140,  # 70B model
            "fp8": 70,
            "int8": 70,
            "reduction": "50%",
        },
        "latency_ms_ttft": {
            "fp16": 100,
            "fp8": 70,
            "int8": 65,
            "improvement": "30%",
        },
        "note": """
        FP8 and INT8 have similar performance.
        FP8 typically has better quality.
        Choose FP8 when quality matters, INT8 when every ms counts.
        """,
    }
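The memory row follows directly from parameter count times bytes per weight. A back-of-the-envelope helper (illustrative only; it ignores KV cache, activations, and framework overhead):

# Back-of-the-envelope weight memory: parameter count x bytes per parameter.
# Ignores KV cache, activations, and framework overhead.
BYTES_PER_PARAM = {"fp16": 2.0, "fp8": 1.0, "int8": 1.0}

def weight_memory_gb(num_params_billion: float, precision: str) -> float:
    # 1B params at 1 byte each ~= 1 GB
    return num_params_billion * BYTES_PER_PARAM[precision]

print(weight_memory_gb(70, "fp16"))  # ~140 GB, matches the table above
print(weight_memory_gb(70, "fp8"))   # ~70 GB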
Migration Path
def fp8_migration():
    return [
        {
            "step": 1,
            "action": "Verify hardware support",
            "command": "nvidia-smi --query-gpu=compute_cap --format=csv",
            "check": "FP8 needs compute capability >= 8.9 (Ada) or 9.0 (Hopper)",
        },
        {
            "step": 2,
            "action": "Benchmark FP16 baseline",
            "metrics": ["Throughput", "Latency", "Quality scores"],
        },
        {
            "step": 3,
            "action": "Enable FP8 in serving framework",
            "command": "Pass --quantization fp8 (vLLM) or build an FP8 engine (TensorRT-LLM)",
        },
        {
            "step": 4,
            "action": "Run quality benchmarks",
            "check": "Verify < 1% degradation",
        },
        {
            "step": 5,
            "action": "Deploy with monitoring",
            "watch": ["Quality metrics", "User feedback"],
        },
    ]
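Step 4's "< 1% degradation" gate is easy to automate. A hypothetical helper is sketched below; the metric values would come from your own eval harness, and the function name is an assumption.

# Hypothetical gate for step 4: fail the rollout if any metric degrades more than 1%
# relative to the FP16 baseline. Metric values come from your own eval harness.
def passes_quality_gate(baseline: dict, candidate: dict, threshold: float = 0.01) -> bool:
    for metric, base_value in baseline.items():
        drop = (base_value - candidate[metric]) / base_value
        if drop > threshold:
            print(f"FAIL: {metric} degraded {drop:.2%}")
            return False
    return True

fp16_scores = {"mmlu_accuracy": 68.0}
fp8_scores = {"mmlu_accuracy": 67.7}
print(passes_quality_gate(fp16_scores, fp8_scores))  # True: 0.4% is under the 1% gate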
FP8 on H100 is becoming the default choice for production inference. It offers the memory efficiency of INT8 with quality much closer to FP16. If you have H100s, there's rarely a reason not to use it.