Back to Blog

Understanding Inference Platform Economics

Restaurant margins look obvious: food cost, labor, rent. But restaurateurs know the hidden killers: waste, breakage, credit card fees, insurance. The successful ones budget for everything, not just the visible costs.

LLM platform economics follow the same pattern. GPU hours are the food cost—obvious and large. But egress, storage, logging, and engineering time compound into surprisingly large numbers.

The Visible vs Hidden Split

def cost_breakdown_reality():
    """Typical self-hosted LLM platform costs"""

    visible = {
        "gpu_compute": 10000,  # Monthly H100 cost
        "cpu_instances": 1500,  # API servers, load balancers
        "storage": 500,  # Model weights, checkpoints
    }
    visible_total = sum(visible.values())  # $12,000

    hidden = {
        "egress": 3000,  # Data transfer out
        "logging": 2000,  # Observability stack
        "networking": 1000,  # VPC, load balancers, DNS
        "security": 500,  # Scans, certificates, WAF
        "backups": 300,  # Disaster recovery
    }
    hidden_total = sum(hidden.values())  # $6,800

    engineering = {
        "on_call": 4000,  # Engineer time for incidents
        "maintenance": 3000,  # Updates, patches, optimization
        "capacity_planning": 1000,  # Monitoring, forecasting
    }
    eng_total = sum(engineering.values())  # $8,000

    return {
        "visible": visible_total,  # $12,000
        "hidden": hidden_total,    # $6,800
        "engineering": eng_total,   # $8,000
        "true_total": 26800,        # 2.2x the visible cost
    }
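
To see how the pieces roll up, a quick usage sketch (nothing new here, just calling the function above with its built-in numbers):

costs = cost_breakdown_reality()
multiplier = costs["true_total"] / costs["visible"]
print(f"Visible:    ${costs['visible']:,}")      # $12,000
print(f"True total: ${costs['true_total']:,}")   # $26,800
print(f"Multiplier: {multiplier:.1f}x")          # 2.2x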

The Egress Surprise

def egress_calculation():
    """
    Cloud providers charge for data leaving their network
    """
    # 100 concurrent users
    # Average response: 500 tokens = ~2KB
    # Requests per user per day: 50

    daily_egress_gb = (
        100 * 50 * 2 / 1024 / 1024  # Users × requests × KB / conversion
    )  # ~0.01 GB per day

    # That's tiny. But add:
    # - Logging all requests/responses: 10x
    # - Debug mode with full context: 5x
    # - Metrics with high cardinality: 2x

    realistic_egress = daily_egress_gb * 17 * 30  # ~5 GB/month
    # Still small, but at scale:

    # Scaling that 17x-inflated figure linearly with user count,
    # at roughly $0.10/GB egress pricing:
    at_scale = {
        "10K_daily_users": "~500 GB/month = ~$50/month",
        "100K_daily_users": "~5 TB/month = ~$500/month",
        "1M_daily_users": "~50 TB/month = ~$5,000/month",
    }

    return at_scale
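
To make the scaling explicit, here is a small parameterized sketch of the same arithmetic. The function name, the 17x overhead multiplier, and the $0.10/GB price are assumptions for illustration, not anyone's actual price sheet; check your provider's egress rates.

def monthly_egress_cost(
    daily_users: int,
    requests_per_user: int = 50,
    kb_per_response: float = 2.0,
    overhead_multiplier: float = 17.0,  # logging + debug + metrics, as above
    price_per_gb: float = 0.10,         # assumed rate; varies by provider and region
) -> dict:
    """Rough monthly egress estimate. A sketch, not a quote."""
    daily_gb = daily_users * requests_per_user * kb_per_response / 1024 / 1024
    monthly_gb = daily_gb * overhead_multiplier * 30
    return {
        "monthly_gb": round(monthly_gb, 1),
        "monthly_cost": round(monthly_gb * price_per_gb, 2),
    }

# monthly_egress_cost(10_000)    -> roughly 490 GB/month, about $49
# monthly_egress_cost(1_000_000) -> roughly 50 TB/month, about $4,900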

The Logging Trap

class LoggingCostAnalysis:
    """Observability costs more than you expect"""

    def __init__(self, requests_per_day: int):
        self.requests = requests_per_day

    def calculate_logging_cost(self) -> dict:
        # Per request logging
        bytes_per_request = {
            "structured_log": 500,  # JSON with metadata
            "request_body": 2000,  # Average prompt
            "response_body": 2000,  # Average completion
            "metrics": 200,  # Latency, tokens, etc.
        }
        total_per_request = sum(bytes_per_request.values())  # 4.7 KB

        daily_volume_gb = self.requests * total_per_request / 1e9

        # Cloud logging pricing (varies)
        ingestion_per_gb = 0.50
        storage_per_gb_month = 0.03
        query_per_gb = 0.01

        monthly_cost = (
            daily_volume_gb * 30 * ingestion_per_gb +
            daily_volume_gb * 30 * storage_per_gb_month +
            daily_volume_gb * 30 * 10 * query_per_gb  # Assume 10 queries per day's data
        )

        return {
            "daily_volume_gb": daily_volume_gb,
            "monthly_cost": monthly_cost,
            "at_1M_requests_per_day": f"${monthly_cost * (1_000_000 / self.requests):.0f}",
        }
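
Plugging in a hypothetical 100K requests per day (the request count and unit prices are illustrative; actual vendor pricing varies):

analysis = LoggingCostAnalysis(requests_per_day=100_000)
print(analysis.calculate_logging_cost())
# roughly: {'daily_volume_gb': 0.47, 'monthly_cost': 8.88, 'at_1M_requests_per_day': '$89'}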

Engineering Time Is Real Cost

def engineering_overhead() -> dict:
    """
    Engineers maintaining the platform have a cost
    """
    fully_loaded_eng_monthly = 15000  # Salary + benefits + overhead

    time_allocation = {
        "incidents": {
            "hours_per_month": 20,
            "cost": fully_loaded_eng_monthly * 20 / 160,  # $1,875
        },
        "maintenance": {
            "hours_per_month": 16,
            "cost": fully_loaded_eng_monthly * 16 / 160,  # $1,500
        },
        "optimization": {
            "hours_per_month": 12,
            "cost": fully_loaded_eng_monthly * 12 / 160,  # $1,125
        },
        "capacity_planning": {
            "hours_per_month": 8,
            "cost": fully_loaded_eng_monthly * 8 / 160,  # $750
        },
    }

    total_eng_cost = sum(item["cost"] for item in time_allocation.values())
    # $5,250/month = 0.35 FTE dedicated to platform

    return {
        "breakdown": time_allocation,
        "total": total_eng_cost,
        "fte_equivalent": 0.35,
    }
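
Annualized, that partial FTE is real money. A quick back-of-the-envelope using the function above:

eng = engineering_overhead()
print(f"Monthly: ${eng['total']:,.0f}")        # $5,250
print(f"Annual:  ${eng['total'] * 12:,.0f}")   # $63,000
print(f"FTE:     {eng['fte_equivalent']}")     # 0.35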

The Full Platform Cost Model

def platform_tco(
    gpu_monthly: float,
    requests_per_day: int,
    eng_salary: float = 15000,
) -> dict:
    """Total cost of ownership for LLM platform"""

    # Direct compute (visible)
    compute = {
        "gpu": gpu_monthly,
        "supporting_infra": gpu_monthly * 0.15,  # ~15% for API servers, etc.
    }

    # Infrastructure overhead
    infra_overhead = {
        "egress": requests_per_day * 30 * 0.00001,  # Rough estimate
        "logging": requests_per_day * 30 * 0.00005,
        "networking": gpu_monthly * 0.05,
        "storage": gpu_monthly * 0.05,
        "security": 500,  # Base cost
    }

    # People cost
    people = {
        "on_call": eng_salary * 0.15,
        "maintenance": eng_salary * 0.10,
        "optimization": eng_salary * 0.08,
    }

    visible_total = sum(compute.values())
    hidden_total = sum(infra_overhead.values()) + sum(people.values())

    return {
        "visible_cost": visible_total,
        "hidden_cost": hidden_total,
        "total_monthly": visible_total + hidden_total,
        "multiplier": (visible_total + hidden_total) / visible_total,
        # Expect 1.4-1.6x multiplier
    }
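
Running the model with a hypothetical $10K/month GPU bill and 100K requests per day (all inputs illustrative):

tco = platform_tco(gpu_monthly=10_000, requests_per_day=100_000)
print(f"Visible:    ${tco['visible_cost']:,.0f}")   # $11,500
print(f"Hidden:     ${tco['hidden_cost']:,.0f}")    # ~$6,630
print(f"Total:      ${tco['total_monthly']:,.0f}")  # ~$18,130
print(f"Multiplier: {tco['multiplier']:.2f}x")      # ~1.58x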

The 40% Rule

A reasonable planning heuristic:

def budget_estimate(gpu_cost: float) -> dict:
    return {
        "compute": gpu_cost,
        "hidden_overhead": gpu_cost * 0.40,
        "budget_total": gpu_cost * 1.40,
        "reality_range": f"${gpu_cost * 1.3:.0f} - ${gpu_cost * 1.6:.0f}",
    }

# $10K GPU budget → budget $14K total
# If you only have $10K, get $7K worth of GPUs
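
If you are working backwards from a fixed budget instead, the same heuristic runs in reverse. A minimal sketch (the helper name is ours, not a library function):

def affordable_gpu_spend(total_budget: float, overhead: float = 0.40) -> float:
    """How much GPU you can actually buy once hidden costs are budgeted."""
    return total_budget / (1 + overhead)

# affordable_gpu_spend(10_000) -> ~$7,143 of GPU, matching the $7K rule of thumb above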

The 40% isn't arbitrary. It's the accumulation of all the things you forget to count until the bill arrives. Budget for them upfront and you won't be surprised.