Cost Per Token Across Hardware Options
Logistics companies track cost per mile, not total truck cost. A $200K truck that runs efficiently beats a $100K truck with high fuel costs. Unit economics matter more than purchase price.
LLM serving follows the same logic. Cost per token is your unit economics. Different hardware options produce wildly different cost per token numbers.
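The arithmetic behind that number is one line: dollars per hour divided by tokens served in that hour. A minimal sketch in Python (the helper name is mine, for illustration):

def cost_per_million_tokens(hourly_cost: float, tokens_per_second: float) -> float:
    """Dollars per 1M tokens for hardware billed by the hour."""
    tokens_per_hour = tokens_per_second * 3600
    return hourly_cost / tokens_per_hour * 1_000_000

# An H100 at $2.50/hr serving ~5,000 tokens/s (batched) lands near $0.14/1M
print(round(cost_per_million_tokens(2.50, 5000), 2))  # 0.14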
The Hardware Landscape
def hardware_options():
    """
    Approximate costs for Llama-70B inference.
    Throughput is aggregate across batched requests
    and varies by optimization and workload.
    """
    return {
        "h100_spot": {
            "hourly_cost": 2.50,
            "tokens_per_second": 5000,  # batched aggregate throughput
            "cost_per_million": 2.50 / (5000 * 3600) * 1_000_000,  # $0.14
            "availability": "Variable, can be interrupted",
            "best_for": "Batch processing",
        },
        "h100_on_demand": {
            "hourly_cost": 4.50,
            "tokens_per_second": 5000,
            "cost_per_million": 4.50 / (5000 * 3600) * 1_000_000,  # $0.25
            "availability": "Guaranteed",
            "best_for": "Production serving",
        },
        "h100_reserved": {
            "hourly_cost": 3.00,
            "tokens_per_second": 5000,
            "cost_per_million": 3.00 / (5000 * 3600) * 1_000_000,  # $0.17
            "availability": "1-3 year commitment",
            "best_for": "Predictable workloads",
        },
        "a100_spot": {
            "hourly_cost": 1.50,
            "tokens_per_second": 3000,
            "cost_per_million": 1.50 / (3000 * 3600) * 1_000_000,  # $0.14
            "availability": "Variable",
            "best_for": "Cost-sensitive batch",
        },
        "a100_on_demand": {
            "hourly_cost": 2.50,
            "tokens_per_second": 3000,
            "cost_per_million": 2.50 / (3000 * 3600) * 1_000_000,  # $0.23
            "availability": "Good",
            "best_for": "General purpose",
        },
        "api_providers": {
            "hourly_cost": "N/A",  # billed per token, not per hour
            "cost_per_million": 0.80,  # Typical Llama-70B API pricing
            "availability": "Excellent",
            "best_for": "Simplicity, variable load",
        },
    }
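Sorting that table by unit cost makes the spread obvious. A quick usage sketch against the function above:

# Rank every option by raw cost per million tokens
for cost, name in sorted(
    (spec["cost_per_million"], name) for name, spec in hardware_options().items()
):
    print(f"{name}: ${cost:.2f}/1M tokens")
# The spot options tie near $0.14; the API sits at $0.80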
The Real Comparison
def cost_per_million_tokens_comparison():
    """
    Normalized comparison including overhead
    """
    # Self-hosted costs need overhead added
    overhead_multiplier = 1.4  # Egress, logging, eng time
    comparison = {
        "h100_spot": {
            "raw": 0.14,
            "with_overhead": 0.14 * overhead_multiplier,  # $0.20
            "utilization_assumption": "80%",
            "at_50pct_util": 0.14 * overhead_multiplier * (100 / 50),  # $0.39
        },
        "h100_on_demand": {
            "raw": 0.25,
            "with_overhead": 0.25 * overhead_multiplier,  # $0.35
        },
        "a100_on_demand": {
            "raw": 0.23,
            "with_overhead": 0.23 * overhead_multiplier,  # $0.32
        },
        "api": {
            "raw": 0.80,
            "with_overhead": 0.80,  # No hidden overhead
            "utilization_assumption": "Pay exactly for what you use",
        },
    }
    return comparison
When Each Option Wins
def optimal_choice_by_scenario():
    return {
        "high_volume_steady": {
            "volume": ">100M tokens/day",
            "pattern": "Consistent, predictable",
            "winner": "H100 reserved",
            "cost": "$0.17/1M tokens",
            "reason": "Volume amortizes fixed costs",
        },
        "high_volume_variable": {
            "volume": ">100M tokens/day",
            "pattern": "Spiky, 5x peak to trough",
            "winner": "Mix: reserved baseline + API burst",
            "cost": "$0.25/1M tokens (blended)",  # see the blended-cost sketch below
            "reason": "Don't pay for unused capacity",
        },
        "medium_volume": {
            "volume": "10-100M tokens/day",
            "pattern": "Business hours heavy",
            "winner": "On-demand + spot batch",
            "cost": "$0.30/1M tokens",
            "reason": "Flexibility without commitment",
        },
        "low_volume": {
            "volume": "<10M tokens/day",
            "pattern": "Variable",
            "winner": "API",
            "cost": "$0.80/1M tokens",
            "reason": "Overhead not worth it",
        },
        "batch_processing": {
            "volume": "Any",
            "pattern": "Scheduled, interruptible",
            "winner": "Spot instances",
            "cost": "$0.14/1M tokens",
            "reason": "Lowest cost, tolerate interruption",
        },
    }
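The blended figure in the spiky scenario is just a weighted average: the reserved baseline serves most tokens at $0.17/1M while the API absorbs bursts at $0.80/1M. A sketch, assuming roughly 87% of traffic fits the baseline (that split is my illustration, not a fixed rule):

def blended_cost(reserved_share: float,
                 reserved_cost: float = 0.17,
                 api_cost: float = 0.80) -> float:
    """Weighted cost per 1M tokens for a reserved-baseline + API-burst mix."""
    return reserved_share * reserved_cost + (1 - reserved_share) * api_cost

print(round(blended_cost(0.87), 2))  # ~0.25, the blended rate quoted above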
The Break-Even Calculation
def break_even_vs_api(
    api_cost_per_million: float,
    self_hosted_cost_per_million: float,
    monthly_setup_overhead: float,
) -> float:
    """
    Monthly token volume (in millions) needed for self-hosting to beat the API
    """
    # Savings per million tokens
    savings_per_million = api_cost_per_million - self_hosted_cost_per_million
    if savings_per_million <= 0:
        return float('inf')  # Never break even
    # Token volume needed to cover the fixed overhead
    break_even_millions = monthly_setup_overhead / savings_per_million
    return break_even_millions

# Example:
# API: $0.80/1M
# Self-hosted: $0.35/1M (with overhead)
# Monthly overhead: $5,000 (engineering time, etc.)
break_even = break_even_vs_api(0.80, 0.35, 5000)
# = 5000 / 0.45 = 11.1 million tokens/month
# Below 11M tokens/month: API is cheaper
# Above 11M tokens/month: Self-hosted is cheaper
The Utilization Trap
def utilization_impact():
    """
    Low utilization kills self-hosted economics
    """
    h100_hourly = 4.50
    max_tokens_per_hour = 5000 * 3600  # 18M tokens at full batched throughput

    def cost_at_utilization(util_pct: float) -> float:
        actual_tokens = max_tokens_per_hour * (util_pct / 100)
        if actual_tokens == 0:
            return float('inf')
        return h100_hourly / actual_tokens * 1_000_000

    return {
        "100%_utilization": cost_at_utilization(100),  # $0.25
        "80%_utilization": cost_at_utilization(80),    # $0.31
        "50%_utilization": cost_at_utilization(50),    # $0.50
        "20%_utilization": cost_at_utilization(20),    # $1.25 (worse than API!)
    }

# Key insight: at <50% utilization, the API is often cheaper
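Rather than eyeballing the table, you can solve for the utilization where a GPU's effective rate crosses the API price. With the 1.4x overhead multiplier from earlier, the crossover lands just under 50%, which is where that insight comes from. A sketch:

def break_even_utilization(hourly_cost: float,
                           max_tokens_per_hour: float,
                           api_cost_per_million: float,
                           overhead: float = 1.0) -> float:
    """Utilization (%) below which the API beats this hardware."""
    cost_at_full_util = hourly_cost / max_tokens_per_hour * 1_000_000 * overhead
    return cost_at_full_util / api_cost_per_million * 100

# H100 on-demand vs. a $0.80/1M API
print(break_even_utilization(4.50, 5000 * 3600, 0.80))       # 31.25 (raw)
print(break_even_utilization(4.50, 5000 * 3600, 0.80, 1.4))  # 43.75 (with overhead)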
The Decision Matrix
def recommendation_matrix():
    return {
        "question_1": {
            "question": "Monthly token volume?",
            "if_under_10M": "Use API, don't self-host",
            "if_10M_to_100M": "Analyze carefully, might break even",
            "if_over_100M": "Self-hosting likely wins",
        },
        "question_2": {
            "question": "Utilization expectation?",
            "if_under_50pct": "API, unless latency-critical",
            "if_50_to_80pct": "Self-host with on-demand",
            "if_over_80pct": "Self-host with reserved",
        },
        "question_3": {
            "question": "Traffic pattern?",
            "if_steady": "Reserved instances",
            "if_predictable_spikes": "On-demand, autoscale",
            "if_random_spikes": "API or on-demand + API burst",
        },
    }
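Those three questions collapse into a rough first-pass recommender. A sketch using this post's thresholds (the function itself is illustrative, not a substitute for the break-even math above):

def recommend(monthly_tokens_millions: float, expected_utilization_pct: float) -> str:
    """First-pass call from the two quantitative questions above."""
    if monthly_tokens_millions < 10:
        return "Use API, don't self-host"
    if expected_utilization_pct < 50:
        return "API, unless latency-critical"
    hosting = "reserved" if expected_utilization_pct >= 80 else "on-demand"
    if monthly_tokens_millions <= 100:
        return f"Might break even: analyze carefully, then self-host with {hosting}"
    return f"Self-host with {hosting}"

print(recommend(monthly_tokens_millions=150, expected_utilization_pct=85))
# -> Self-host with reserved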
Cost per token is the number that determines your unit economics. Track it, optimize it, and make hardware decisions based on it—not on hourly rates or specs.