Understanding Inference Platform Economics
Restaurant margins look obvious: food cost, labor, rent. But restaurateurs know the hidden killers: waste, breakage, credit card fees, insurance. The successful ones budget for everything, not just the visible costs.
LLM platform economics follow the same pattern. GPU hours are the food cost: obvious and large. But egress, storage, logging, and engineering time compound into surprisingly large numbers.
The Visible vs Hidden Split
def cost_breakdown_reality():
    """Typical self-hosted LLM platform costs (illustrative monthly figures)"""
    visible = {
        "gpu_compute": 10_000,   # Monthly H100 cost
        "cpu_instances": 1_500,  # API servers, load balancers
        "storage": 500,          # Model weights, checkpoints
    }
    visible_total = sum(visible.values())  # $12,000

    hidden = {
        "egress": 3_000,      # Data transfer out
        "logging": 2_000,     # Observability stack
        "networking": 1_000,  # VPC, load balancers, DNS
        "security": 500,      # Scans, certificates, WAF
        "backups": 300,       # Disaster recovery
    }
    hidden_total = sum(hidden.values())  # $6,800

    engineering = {
        "on_call": 4_000,            # Engineer time for incidents
        "maintenance": 3_000,        # Updates, patches, optimization
        "capacity_planning": 1_000,  # Monitoring, forecasting
    }
    eng_total = sum(engineering.values())  # $8,000

    return {
        "visible": visible_total,   # $12,000
        "hidden": hidden_total,     # $6,800
        "engineering": eng_total,   # $8,000
        "true_total": visible_total + hidden_total + eng_total,  # $26,800, 2.2x the visible cost
    }
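Calling it makes the headline number explicit (same illustrative figures as above):

costs = cost_breakdown_reality()
multiplier = costs["true_total"] / costs["visible"]
print(f"Visible:    ${costs['visible']:,}")     # Visible:    $12,000
print(f"True total: ${costs['true_total']:,}")  # True total: $26,800
print(f"Multiplier: {multiplier:.1f}x")         # Multiplier: 2.2x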
The Egress Surprise
def egress_calculation():
    """Cloud providers charge for data leaving their network"""
    # 100 concurrent users
    # Average response: 500 tokens = ~2 KB
    # Requests per user per day: 50
    daily_egress_gb = (
        100 * 50 * 2 / 1024 / 1024  # users × requests × KB, converted to GB
    )  # ~0.01 GB per day

    # That's tiny. But count the copies that also leave the network:
    # - Logging all requests/responses: 10x
    # - Debug mode with full context: 5x
    # - Metrics with high cardinality: 2x
    realistic_egress = daily_egress_gb * 17 * 30  # combined ~17x multiplier, ~5 GB/month

    # Still small. But the same per-user volume at scale
    # (assuming ~$0.10/GB egress and that the 17x multiplier holds;
    # most teams start sampling logs before the bottom row):
    at_scale = {
        "10K_daily_users": "~500 GB/month = ~$50/month",
        "100K_daily_users": "~5 TB/month = ~$500/month",
        "1M_daily_users": "~50 TB/month = ~$5,000/month",
    }
    return at_scale
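To place your own deployment on that curve, the table generalizes into a small helper. This is a sketch under the same assumptions (50 requests per user per day, ~2 KB responses, the 17x observability multiplier, and a nominal $0.10/GB rate; real cloud egress pricing is tiered and provider-specific, and the function name is mine):

def monthly_egress_cost(daily_users: int, price_per_gb: float = 0.10) -> float:
    """Rough monthly egress bill under the assumptions above."""
    kb_per_user_day = 50 * 2  # 50 requests × 2 KB responses
    multiplier = 17           # logging + debug + metrics copies
    gb_per_month = daily_users * kb_per_user_day * multiplier * 30 / 1024 / 1024
    return gb_per_month * price_per_gb

# monthly_egress_cost(10_000) -> ~$49/month (~486 GB), in line with the ~$50 row above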
The Logging Trap
class LoggingCostAnalysis:
    """Observability costs more than you expect"""

    def __init__(self, requests_per_day: int):
        self.requests = requests_per_day

    def calculate_logging_cost(self) -> dict:
        # Per-request logging volume
        bytes_per_request = {
            "structured_log": 500,  # JSON with metadata
            "request_body": 2000,   # Average prompt
            "response_body": 2000,  # Average completion
            "metrics": 200,         # Latency, tokens, etc.
        }
        total_per_request = sum(bytes_per_request.values())  # 4,700 bytes = ~4.7 KB

        daily_volume_gb = self.requests * total_per_request / 1e9

        # Cloud logging pricing (varies by provider)
        ingestion_per_gb = 0.50
        storage_per_gb_month = 0.03
        query_per_gb = 0.01

        monthly_cost = (
            daily_volume_gb * 30 * ingestion_per_gb
            + daily_volume_gb * 30 * storage_per_gb_month
            + daily_volume_gb * 30 * 10 * query_per_gb  # assume 10 queries over each day's data
        )
        return {
            "daily_volume_gb": daily_volume_gb,
            "monthly_cost": monthly_cost,
            "at_1M_requests_per_day": f"${monthly_cost * (1_000_000 / self.requests):.0f}",
        }
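Plugging in a mid-sized deployment shows the shape of the curve (the per-GB prices above are placeholders, so treat the output as directional):

analysis = LoggingCostAnalysis(requests_per_day=100_000)
print(analysis.calculate_logging_cost())
# {'daily_volume_gb': 0.47, 'monthly_cost': 8.88..., 'at_1M_requests_per_day': '$89'}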
Engineering Time Is a Real Cost
def engineering_overhead() -> dict:
    """Engineers maintaining the platform have a cost"""
    fully_loaded_eng_monthly = 15_000  # Salary + benefits + overhead
    time_allocation = {
        "incidents": {
            "hours_per_month": 20,
            "cost": fully_loaded_eng_monthly * 20 / 160,  # $1,875
        },
        "maintenance": {
            "hours_per_month": 16,
            "cost": fully_loaded_eng_monthly * 16 / 160,  # $1,500
        },
        "optimization": {
            "hours_per_month": 12,
            "cost": fully_loaded_eng_monthly * 12 / 160,  # $1,125
        },
        "capacity_planning": {
            "hours_per_month": 8,
            "cost": fully_loaded_eng_monthly * 8 / 160,  # $750
        },
    }
    total_eng_cost = sum(item["cost"] for item in time_allocation.values())
    # $5,250/month = 0.35 FTE dedicated to the platform
    return {
        "breakdown": time_allocation,
        "total": total_eng_cost,
        "fte_equivalent": 0.35,
    }
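The per-line arithmetic is just loaded cost prorated over a 160-hour working month. A tiny helper makes that assumption explicit (the helper is mine, not part of the model above):

def prorated_eng_cost(hours: float, monthly_loaded_cost: float = 15_000,
                      hours_per_month: float = 160) -> float:
    """Engineer time priced at a fully loaded monthly rate."""
    return monthly_loaded_cost * hours / hours_per_month

# prorated_eng_cost(20) -> 1875.0, matching the incidents line above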
The Full Platform Cost Model
def platform_tco(
    gpu_monthly: float,
    requests_per_day: int,
    eng_salary: float = 15_000,
) -> dict:
    """Total cost of ownership for an LLM platform"""
    # Direct compute (visible)
    compute = {
        "gpu": gpu_monthly,
        "supporting_infra": gpu_monthly * 0.15,  # ~15% for API servers, etc.
    }
    # Infrastructure overhead
    infra_overhead = {
        "egress": requests_per_day * 30 * 0.00001,  # rough per-request estimate
        "logging": requests_per_day * 30 * 0.00005,
        "networking": gpu_monthly * 0.05,
        "storage": gpu_monthly * 0.05,
        "security": 500,  # base cost
    }
    # People cost
    people = {
        "on_call": eng_salary * 0.15,
        "maintenance": eng_salary * 0.10,
        "optimization": eng_salary * 0.08,
    }
    visible_total = sum(compute.values())
    hidden_total = sum(infra_overhead.values()) + sum(people.values())
    return {
        "visible_cost": visible_total,
        "hidden_cost": hidden_total,
        "total_monthly": visible_total + hidden_total,
        "multiplier": (visible_total + hidden_total) / visible_total,
        # Expect a 1.4-1.6x multiplier
    }
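Running the model with the numbers from the opening breakdown shows where the 1.4-1.6x range comes from (the egress and logging coefficients are rough fits, not provider quotes):

tco = platform_tco(gpu_monthly=10_000, requests_per_day=100_000)
print(f"Visible:    ${tco['visible_cost']:,.0f}")  # $11,500
print(f"Hidden:     ${tco['hidden_cost']:,.0f}")   # $6,630
print(f"Multiplier: {tco['multiplier']:.2f}x")     # 1.58x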
The 40% Rule
A reasonable planning heuristic:
def budget_estimate(gpu_cost: float) -> dict:
    return {
        "compute": gpu_cost,
        "hidden_overhead": gpu_cost * 0.40,
        "budget_total": gpu_cost * 1.40,
        "reality_range": f"${gpu_cost * 1.3:.0f} - ${gpu_cost * 1.6:.0f}",
    }

# $10K GPU budget → plan for $14K total
# If you only have $10K, get $7K worth of GPUs
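The rule inverts cleanly when the total budget is fixed. A minimal sketch, assuming the same 40% overhead:

def max_gpu_spend(total_budget: float, overhead: float = 0.40) -> float:
    """How much of a fixed budget can go to GPUs under the 40% rule."""
    return total_budget / (1 + overhead)

# max_gpu_spend(10_000) -> ~$7,143, i.e. the "$7K worth of GPUs" above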
The 40% isn't arbitrary. It's the accumulation of all the things you forget to count until the bill arrives. Budget for them upfront and you won't be surprised.