H100 vs A100: Which One for Inference
Fuel economy matters differently for commuters and truckers. A commuter might accept lower mpg for comfort. A trucker calculates fuel cost per mile, because fuel is one of the biggest expenses in the business.
LLM inference is like trucking. The hardware cost per token directly impacts unit economics. Understanding H100 vs A100 differences determines whether your margins are healthy.
The Spec Sheet
class GPUSpecs:
    a100_80gb = {
        "memory": "80 GB HBM2e",
        "memory_bandwidth": "2.0 TB/s",
        "fp16_compute": "312 TFLOPS",
        "fp8_compute": "N/A",  # Not supported
        "price_per_hour": "$2.00-2.50",
        "availability": "Good",
    }

    h100_80gb = {
        "memory": "80 GB HBM3",
        "memory_bandwidth": "3.35 TB/s",  # 67% more
        "fp16_compute": "990 TFLOPS",     # ~3x more
        "fp8_compute": "1979 TFLOPS",
        "price_per_hour": "$4.00-5.00",
        "availability": "Improving",
    }

    key_difference = """
    Memory bandwidth: 2.0 → 3.35 TB/s (67% increase).
    For decode (memory-bound), this is the key metric.
    """
Why Memory Bandwidth Matters
def decode_is_memory_bound():
    """
    During decode (token generation), each new token requires:
    - loading all model weights from memory
    - one forward pass

    Memory bandwidth is the bottleneck, not compute.

    Math for a 70B model:
    - Weights: 70B params × 2 bytes (FP16) = 140 GB
    - Each token: read 140 GB

    A100: 140 GB / 2.0 TB/s  = 70 ms minimum per token
    H100: 140 GB / 3.35 TB/s = 42 ms minimum per token

    H100 per-token latency is ~40% lower (~67% more tokens/sec),
    just from bandwidth.
    """
    pass
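The same arithmetic as a runnable sketch, so you can plug in your own model size and precision. It only counts weight reads, so treat it as a floor; KV-cache traffic and kernel overhead push real numbers higher:

def min_decode_latency_ms(
    params_b: float,         # parameters in billions
    bytes_per_param: float,  # 2 for FP16, 1 for FP8/INT8
    bandwidth_tbs: float,    # memory bandwidth in TB/s
) -> float:
    """Bandwidth-imposed floor on per-token decode latency (weight reads only)."""
    weight_gb = params_b * bytes_per_param
    return weight_gb / (bandwidth_tbs * 1000) * 1000  # GB / (GB/s) = s, then to ms

print(min_decode_latency_ms(70, 2, 2.0))   # A100: ~70 ms/token
print(min_decode_latency_ms(70, 2, 3.35))  # H100: ~42 ms/token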
When H100 Wins
def h100_advantages() -> dict:
    return {
        "high_volume_serving": {
            "reason": "More tokens per GPU-hour",
            "savings": "Cost per token can be 20-30% lower",
        },
        "latency_critical": {
            "reason": "Faster decode = lower per-token latency",
            "impact": "Better user experience",
        },
        "large_models": {
            "reason": "Same memory, faster access",
            "note": "Still need same number of GPUs for model size",
        },
        "fp8_quantization": {
            "reason": "Native FP8 support",
            "benefit": "Near-FP16 quality at INT8-like speed",
        },
    }
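To see why native FP8 matters for decode, here's the same bandwidth math at 1 byte per weight. It ignores the KV cache and any quality impact, so treat it as an upper bound on the speedup:

# FP8 halves the bytes read per token, so the bandwidth floor roughly halves too
h100_bandwidth_gbs = 3350
fp16_weights_gb = 70 * 2  # 140 GB
fp8_weights_gb = 70 * 1   # 70 GB

print(fp16_weights_gb / h100_bandwidth_gbs * 1000)  # ~42 ms/token floor at FP16
print(fp8_weights_gb / h100_bandwidth_gbs * 1000)   # ~21 ms/token floor at FP8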
When A100 Wins
def a100_advantages() -> dict:
    return {
        "cost_constrained": {
            "reason": "Half the price per hour",
            "when": "Throughput per dollar matters more than per GPU",
        },
        "prefill_heavy": {
            "reason": "Prefill is compute-bound, so the bandwidth advantage matters less",
            "when": "Long prompts, short outputs",
        },
        "availability": {
            "reason": "Easier to get A100 capacity",
            "when": "H100 waitlists are months long",
        },
        "experimentation": {
            "reason": "Cheaper to iterate",
            "when": "Don't need production performance yet",
        },
    }
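The prefill point deserves one more step of reasoning: during prefill the weights are read once and reused across every prompt token in the batch, while during decode they are re-read for every generated token. A rough sketch, assuming ~2 FLOPs per parameter per token and FP16 weights (attention's quadratic term is ignored):

def arithmetic_intensity(tokens_per_weight_load: int, params: float = 70e9) -> float:
    """FLOPs performed per byte of weights read from memory."""
    flops = 2 * params * tokens_per_weight_load
    bytes_read = params * 2  # FP16
    return flops / bytes_read

# For reference, the A100 needs ~156 FLOPs/byte to be compute-bound (312 TFLOPS / 2.0 TB/s)
print(arithmetic_intensity(1))     # decode: ~1 FLOP/byte -> memory-bound
print(arithmetic_intensity(4000))  # prefill of a 4K prompt: ~4000 FLOPs/byte -> compute-bound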
The Cost-Per-Token Calculation
def cost_per_million_tokens(
    hourly_cost: float,
    tokens_per_second: float,
) -> float:
    tokens_per_hour = tokens_per_second * 3600
    return hourly_cost / tokens_per_hour * 1_000_000


# Realistic numbers for a 70B model
a100_cost = cost_per_million_tokens(
    hourly_cost=2.50,
    tokens_per_second=30,  # Typical with optimizations
)  # ≈ $23 per 1M tokens ($0.023 per 1K)

h100_cost = cost_per_million_tokens(
    hourly_cost=4.50,
    tokens_per_second=50,  # ~67% faster from bandwidth
)  # ≈ $25 per 1M tokens ($0.025 per 1K)

# Surprise: cost per token is often similar!
# The H100 advantage is throughput per GPU, not cost per token.
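A useful way to frame this is the break-even speedup: the throughput gain the H100 must deliver just to match the A100's cost per token (a minimal sketch reusing the illustrative prices above):

# Cost per token is equal when the H100's speedup equals its price ratio
a100_hourly, h100_hourly = 2.50, 4.50
a100_tps = 30

break_even_speedup = h100_hourly / a100_hourly  # 1.8x
break_even_tps = a100_tps * break_even_speedup  # 54 tokens/sec

# Below ~54 tok/s the H100 costs more per token; above it, less.
# Bandwidth alone gets you to ~50 tok/s, so FP8 or better batching has to close the gap.
print(break_even_tps)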
The Real Decision Factors
def decision_factors() -> dict:
    return {
        "optimize_for_latency": {
            "choice": "H100",
            "reason": "Faster decode = lower P99",
        },
        "optimize_for_cost_per_token": {
            "choice": "Depends on utilization",
            "note": "Well-utilized A100 can match H100 cost/token",
        },
        "optimize_for_throughput_per_gpu": {
            "choice": "H100",
            "reason": "More tokens/sec = fewer GPUs needed",
        },
        "optimize_for_availability": {
            "choice": "A100",
            "reason": "Easier to procure",
        },
        "using_fp8": {
            "choice": "H100",
            "reason": "Native FP8 support; A100 has none",
        },
    }
Practical Recommendation
def practical_recommendation(context: dict) -> str:
    # New deployment with budget
    if context.get("starting_fresh") and context.get("can_afford_h100"):
        return "H100 - better investment for the future"

    # Existing A100 fleet that isn't fully utilized yet
    if context.get("have_a100_fleet") and context.get("utilization", 1.0) < 0.7:
        return "Optimize A100 usage before upgrading"

    # Spot instances
    if context.get("can_use_spot"):
        return "Mix - H100 spot when available, A100 fallback"

    return "A100 for experiments, H100 for production"
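A quick usage sketch with a hypothetical deployment context:

# Hypothetical deployment: existing A100 fleet running at 55% utilization
print(practical_recommendation({
    "have_a100_fleet": True,
    "utilization": 0.55,
}))
# -> "Optimize A100 usage before upgrading"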
The H100 isn't always worth 2x the price. But for high-volume, latency-sensitive inference, the memory bandwidth advantage compounds. Run the numbers for your specific workload; the general answer might not apply to it.