Cost Per Token Across Hardware Options
Logistics companies track cost per mile, not total truck cost. A $200K truck that runs efficiently beats a $100K truck with high fuel costs. Unit economics matter more than purchase price.
LLM serving follows the same logic. Cost per token is your unit economics. Different hardware options produce wildly different cost per token numbers.
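The arithmetic behind that number is one line: dollars per hour divided by tokens served in that hour. A minimal sketch in Python (the helper name is mine, for illustration):

def cost_per_million_tokens(hourly_cost: float, tokens_per_second: float) -> float:
    """Dollars per 1M tokens for hardware billed by the hour."""
    tokens_per_hour = tokens_per_second * 3600
    return hourly_cost / tokens_per_hour * 1_000_000

# An H100 at $2.50/hr serving ~5,000 tokens/s (batched) lands near $0.14/1M
print(round(cost_per_million_tokens(2.50, 5000), 2))  # 0.14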
The Hardware Landscape
def hardware_options():
    """
    Approximate costs for Llama-70B inference.
    Throughput is aggregate across batched requests
    and varies by optimization and workload.
    """
    return {
        "h100_spot": {
            "hourly_cost": 2.50,
            "tokens_per_second": 5000,  # batched aggregate throughput
            "cost_per_million": 2.50 / (5000 * 3600) * 1_000_000,  # $0.14
            "availability": "Variable, can be interrupted",
            "best_for": "Batch processing",
        },
        "h100_on_demand": {
            "hourly_cost": 4.50,
            "tokens_per_second": 5000,
            "cost_per_million": 4.50 / (5000 * 3600) * 1_000_000,  # $0.25
            "availability": "Guaranteed",
            "best_for": "Production serving",
        },
        "h100_reserved": {
            "hourly_cost": 3.00,
            "tokens_per_second": 5000,
            "cost_per_million": 3.00 / (5000 * 3600) * 1_000_000,  # $0.17
            "availability": "1-3 year commitment",
            "best_for": "Predictable workloads",
        },
        "a100_spot": {
            "hourly_cost": 1.50,
            "tokens_per_second": 3000,
            "cost_per_million": 1.50 / (3000 * 3600) * 1_000_000,  # $0.14
            "availability": "Variable",
            "best_for": "Cost-sensitive batch",
        },
        "a100_on_demand": {
            "hourly_cost": 2.50,
            "tokens_per_second": 3000,
            "cost_per_million": 2.50 / (3000 * 3600) * 1_000_000,  # $0.23
            "availability": "Good",
            "best_for": "General purpose",
        },
        "api_providers": {
            "hourly_cost": "N/A",  # billed per token, not per hour
            "cost_per_million": 0.80,  # Typical Llama-70B API pricing
            "availability": "Excellent",
            "best_for": "Simplicity, variable load",
        },
    }
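Sorting that table by unit cost makes the spread obvious. A quick usage sketch against the function above:

# Rank every option by raw cost per million tokens
for cost, name in sorted(
    (spec["cost_per_million"], name) for name, spec in hardware_options().items()
):
    print(f"{name}: ${cost:.2f}/1M tokens")
# The spot options tie near $0.14; the API sits at $0.80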
The Real Comparison
def cost_per_million_tokens_comparison():
    """
    Normalized comparison including overhead
    """
    # Self-hosted costs need overhead added
    overhead_multiplier = 1.4  # Egress, logging, eng time
    comparison = {
        "h100_spot": {
            "raw": 0.14,
            "with_overhead": 0.14 * overhead_multiplier,  # $0.20
            "utilization_assumption": "80%",
            "at_50pct_util": 0.14 * overhead_multiplier * (100 / 50),  # $0.39
        },
        "h100_on_demand": {
            "raw": 0.25,
            "with_overhead": 0.25 * overhead_multiplier,  # $0.35
        },
        "a100_on_demand": {
            "raw": 0.23,
            "with_overhead": 0.23 * overhead_multiplier,  # $0.32
        },
        "api": {
            "raw": 0.80,
            "with_overhead": 0.80,  # No hidden overhead
            "utilization_assumption": "Pay exactly for what you use",
        },
    }
    return comparison
When Each Option Wins
def optimal_choice_by_scenario():
    return {
        "high_volume_steady": {
            "volume": ">100M tokens/day",
            "pattern": "Consistent, predictable",
            "winner": "H100 reserved",
            "cost": "$0.17/1M tokens",
            "reason": "Volume amortizes fixed costs",
        },
        "high_volume_variable": {
            "volume": ">100M tokens/day",
            "pattern": "Spiky, 5x peak to trough",
            "winner": "Mix: reserved baseline + API burst",
            "cost": "$0.25/1M tokens (blended)",  # see the blended-cost sketch below
            "reason": "Don't pay for unused capacity",
        },
        "medium_volume": {
            "volume": "10-100M tokens/day",
            "pattern": "Business hours heavy",
            "winner": "On-demand + spot batch",
            "cost": "$0.30/1M tokens",
            "reason": "Flexibility without commitment",
        },
        "low_volume": {
            "volume": "<10M tokens/day",
            "pattern": "Variable",
            "winner": "API",
            "cost": "$0.80/1M tokens",
            "reason": "Overhead not worth it",
        },
        "batch_processing": {
            "volume": "Any",
            "pattern": "Scheduled, interruptible",
            "winner": "Spot instances",
            "cost": "$0.14/1M tokens",
            "reason": "Lowest cost, tolerate interruption",
        },
    }
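The blended figure in the spiky scenario is just a weighted average: the reserved baseline serves most tokens at $0.17/1M while the API absorbs bursts at $0.80/1M. A sketch, assuming roughly 87% of traffic fits the baseline (that split is my illustration, not a fixed rule):

def blended_cost(reserved_share: float,
                 reserved_cost: float = 0.17,
                 api_cost: float = 0.80) -> float:
    """Weighted cost per 1M tokens for a reserved-baseline + API-burst mix."""
    return reserved_share * reserved_cost + (1 - reserved_share) * api_cost

print(round(blended_cost(0.87), 2))  # ~0.25, the blended rate quoted above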
The Break-Even Calculation
def break_even_vs_api(
    api_cost_per_million: float,
    self_hosted_cost_per_million: float,
    monthly_setup_overhead: float,
) -> float:
    """
    Monthly token volume (in millions) needed for self-hosting to beat the API
    """
    # Savings per million tokens
    savings_per_million = api_cost_per_million - self_hosted_cost_per_million
    if savings_per_million <= 0:
        return float('inf')  # Never break even
    # Token volume needed to cover the fixed overhead
    break_even_millions = monthly_setup_overhead / savings_per_million
    return break_even_millions

# Example:
# API: $0.80/1M
# Self-hosted: $0.35/1M (with overhead)
# Monthly overhead: $5,000 (engineering time, etc.)
break_even = break_even_vs_api(0.80, 0.35, 5000)
# = 5000 / 0.45 = 11.1 million tokens/month
# Below 11M tokens/month: API is cheaper
# Above 11M tokens/month: Self-hosted is cheaper
The Utilization Trap
def utilization_impact():
    """
    Low utilization kills self-hosted economics
    """
    h100_hourly = 4.50
    max_tokens_per_hour = 5000 * 3600  # 18M tokens at full batched throughput

    def cost_at_utilization(util_pct: float) -> float:
        actual_tokens = max_tokens_per_hour * (util_pct / 100)
        if actual_tokens == 0:
            return float('inf')
        return h100_hourly / actual_tokens * 1_000_000

    return {
        "100%_utilization": cost_at_utilization(100),  # $0.25
        "80%_utilization": cost_at_utilization(80),    # $0.31
        "50%_utilization": cost_at_utilization(50),    # $0.50
        "20%_utilization": cost_at_utilization(20),    # $1.25 (worse than API!)
    }

# Key insight: at <50% utilization, the API is often cheaper
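Rather than eyeballing the table, you can solve for the utilization where a GPU's effective rate crosses the API price. With the 1.4x overhead multiplier from earlier, the crossover lands just under 50%, which is where that insight comes from. A sketch:

def break_even_utilization(hourly_cost: float,
                           max_tokens_per_hour: float,
                           api_cost_per_million: float,
                           overhead: float = 1.0) -> float:
    """Utilization (%) below which the API beats this hardware."""
    cost_at_full_util = hourly_cost / max_tokens_per_hour * 1_000_000 * overhead
    return cost_at_full_util / api_cost_per_million * 100

# H100 on-demand vs. a $0.80/1M API
print(break_even_utilization(4.50, 5000 * 3600, 0.80))       # 31.25 (raw)
print(break_even_utilization(4.50, 5000 * 3600, 0.80, 1.4))  # 43.75 (with overhead)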
The Decision Matrix
def recommendation_matrix():
    return {
        "question_1": {
            "question": "Monthly token volume?",
            "if_under_10M": "Use API, don't self-host",
            "if_10M_to_100M": "Analyze carefully, might break even",
            "if_over_100M": "Self-hosting likely wins",
        },
        "question_2": {
            "question": "Utilization expectation?",
            "if_under_50pct": "API, unless latency-critical",
            "if_50_to_80pct": "Self-host with on-demand",
            "if_over_80pct": "Self-host with reserved",
        },
        "question_3": {
            "question": "Traffic pattern?",
            "if_steady": "Reserved instances",
            "if_predictable_spikes": "On-demand, autoscale",
            "if_random_spikes": "API or on-demand + API burst",
        },
    }
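Those three questions collapse into a rough first-pass recommender. A sketch using this post's thresholds (the function itself is illustrative, not a substitute for the break-even math above):

def recommend(monthly_tokens_millions: float, expected_utilization_pct: float) -> str:
    """First-pass call from the two quantitative questions above."""
    if monthly_tokens_millions < 10:
        return "Use API, don't self-host"
    if expected_utilization_pct < 50:
        return "API, unless latency-critical"
    hosting = "reserved" if expected_utilization_pct >= 80 else "on-demand"
    if monthly_tokens_millions <= 100:
        return f"Might break even: analyze carefully, then self-host with {hosting}"
    return f"Self-host with {hosting}"

print(recommend(monthly_tokens_millions=150, expected_utilization_pct=85))
# -> Self-host with reserved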
Cost per token is the number that determines your unit economics. Track it, optimize it, and make hardware decisions based on it—not on hourly rates or specs.