The Math on Self-Hosting vs API

Accountants have a concept called "payback period"—how long until an investment returns its cost. Every self-hosting decision should start with this calculation.

Most teams don't do it. They assume "cheaper per token = cheaper overall." That assumption ignores fixed costs, migration effort, and opportunity cost.

The Break-Even Formula

def break_even_months(
    monthly_api_cost: float,
    gpu_monthly: float,
    eng_salary_monthly: float,
    eng_fraction: float,  # Share of an engineer's time spent on upkeep
    setup_months: float,
    setup_eng_fraction: float,  # Share of an engineer's time during setup
) -> float:
    """Return how many months of savings it takes to repay the setup cost.

    The setup cost counts engineering time only
    (``setup_months * eng_salary_monthly * setup_eng_fraction``). The
    recurring self-hosting cost is the GPU bill plus the ongoing share of
    an engineer's salary. The returned figure is measured from the end of
    setup; it does not include ``setup_months`` itself.

    Returns:
        ``setup cost / monthly savings``, or ``float('inf')`` when
        self-hosting saves nothing per month (savings are zero or negative).
    """
    # One-time engineering investment to get the system running.
    one_time_cost = setup_months * eng_salary_monthly * setup_eng_fraction

    # What self-hosting costs every month once it is live.
    recurring_cost = gpu_monthly + (eng_salary_monthly * eng_fraction)

    # What is saved each month by no longer paying the API bill.
    net_gain = monthly_api_cost - recurring_cost

    # With no positive monthly gain the investment is never recovered.
    return float("inf") if net_gain <= 0 else one_time_cost / net_gain

# Example:
# $40K/month API cost
# $10K/month GPU cost
# $15K/month engineer salary, 30% time on infra
# 3 months setup at 50% engineer time
#
# Worked through:
#   setup cost      = 3 * $15K * 0.50    = $22.5K
#   self-host cost  = $10K + $15K * 0.30 = $14.5K/month
#   monthly savings = $40K - $14.5K      = $25.5K/month

break_even = break_even_months(
    monthly_api_cost=40000,
    gpu_monthly=10000,
    eng_salary_monthly=15000,
    eng_fraction=0.30,
    setup_months=3,
    setup_eng_fraction=0.50,
)
# Result: $22.5K / $25.5K = ~0.9 months to break even after setup
# Total time to ROI: ~3.9 months (3 months of setup + ~0.9 months)

The Variables That Change Everything

class BreakEvenVariables:
    """Qualitative inputs that tend to distort a break-even estimate.

    Every attribute is a plain dict mapping a short label to a descriptive
    string; nothing here is computed.
    """

    def __init__(self):
        # Often underestimated: how long setup takes...
        setup = {
            "optimistic": "2 weeks",
            "realistic": "2-3 months",
            "with_production_hardening": "4-6 months",
        }
        # ...and how much engineer time upkeep keeps consuming afterwards.
        upkeep = {
            "if_everything_works": "10%",
            "reality": "20-40%",
            "during_incidents": "100%",
        }
        # Often ignored: what the same engineering time could have bought.
        tradeoffs = {
            "what_else_could_eng_build": "Features, not infra",
            "hiring_difficulty": "ML infra people are scarce",
            "context_switching": "On-call fragments focus",
        }

        self.setup_time = setup
        self.ongoing_eng_time = upkeep
        self.opportunity_cost = tradeoffs

The 6-Month Reality Check

def six_month_projection(scenario: str) -> dict:
    """Return the illustrative six-month timeline for one named scenario.

    Args:
        scenario: One of ``"optimistic"``, ``"realistic"`` or
            ``"pessimistic"``.

    Returns:
        A freshly built dict of month labels to short descriptions, plus a
        ``"total_savings"`` figure (negative means a net loss).

    Raises:
        KeyError: If ``scenario`` is not one of the three known names.
    """
    # Everything goes to plan: live by month 2, saving from month 3.
    best_case = {
        "month_1": "Setup complete, running in dev",
        "month_2": "Production deployment",
        "month_3": "Stable, start saving",
        "month_4_6": "Smooth sailing, savings accumulate",
        "total_savings": 60000,
    }

    # Savings only begin in the final month.
    typical_case = {
        "month_1": "Environment setup, dependency hell",
        "month_2": "First models running, performance issues",
        "month_3": "Optimization, still slower than API",
        "month_4": "Production deployment, incidents",
        "month_5": "Stability, but engineer burned out",
        "month_6": "Finally stable, savings start",
        "total_savings": 10000,
    }

    # The project is abandoned and ends at a net loss.
    worst_case = {
        "month_1": "Procurement delays for GPUs",
        "month_2": "Setup, security review",
        "month_3": "Performance never matches API",
        "month_4": "Fall back to API for high traffic",
        "month_5": "Hybrid mode, complexity increases",
        "month_6": "Engineer quits, back to API",
        "total_savings": -80000,
    }

    by_name = {
        "optimistic": best_case,
        "realistic": typical_case,
        "pessimistic": worst_case,
    }
    return by_name[scenario]

The Hidden Line Items

def true_cost_self_hosting() -> dict:
    """Return an illustrative monthly cost breakdown for self-hosting.

    Line items are grouped into the obvious infrastructure bill, the
    infrastructure costs that are easy to overlook, and engineering time
    expressed in dollars.

    Returns:
        A dict with the subtotal of each group (``"obvious_total"``,
        ``"hidden_total"``, ``"eng_total"``) and ``"true_monthly"``, the
        sum of all three subtotals.
    """
    obvious = {
        "gpu_instances": 10000,  # per month
        "storage": 500,
        "networking": 300,
    }

    hidden = {
        "egress": 3000,  # Data transfer out
        "logging": 2000,  # Observability stack
        "redundancy": 5000,  # Failover instances
        "security": 1000,  # Scans, compliance
    }

    eng_time = {
        "maintenance": 4000,  # 0.25 FTE
        "on_call": 2000,  # Incident response
        "optimization": 2000,  # Keeping it fast
    }

    obvious_total = sum(obvious.values())  # $10,800
    hidden_total = sum(hidden.values())    # $11,000
    eng_total = sum(eng_time.values())     # $8,000

    return {
        "obvious_total": obvious_total,
        "hidden_total": hidden_total,
        "eng_total": eng_total,
        # Derived from the subtotals rather than hard-coded, so it cannot
        # drift out of sync when a line item changes.
        "true_monthly": obvious_total + hidden_total + eng_total,  # $29,800, not $10,800
    }

When the Math Actually Works

def scenarios_that_work() -> list:
    """Return the situations in which self-hosting tends to pay off.

    Returns:
        A new list of dicts. Each has a ``"scenario"`` name, a
        ``"why_works"`` explanation, and one scenario-specific qualifier
        key (``"monthly_tokens"``, ``"team"``, ``"requirement"`` or
        ``"constraint"``).
    """
    # Scale: enough traffic to spread the fixed costs thin.
    high_volume = {
        "scenario": "High volume, stable traffic",
        "monthly_tokens": "500M+",
        "why_works": "Fixed costs amortized over huge volume",
    }

    # Staffing: the infra team already exists.
    platform_team = {
        "scenario": "Existing ML platform team",
        "team": "3+ dedicated ML infra engineers",
        "why_works": "Marginal cost to add LLM serving",
    }

    # Performance: a latency target an API cannot meet.
    latency_bound = {
        "scenario": "Latency-critical, regional",
        "requirement": "<50ms P99",
        "why_works": "API can't match, value exceeds cost",
    }

    # Compliance: an API is not an option at all.
    regulated = {
        "scenario": "Regulatory requirement",
        "constraint": "Data can't leave premises",
        "why_works": "No API option exists",
    }

    return [high_volume, platform_team, latency_bound, regulated]

The break-even formula is simple. The inputs are hard to estimate honestly.

When in doubt, multiply your setup time estimate by 2 and your ongoing maintenance estimate by 1.5. If the project still breaks even within a payback period you can accept, self-hosting is worth considering.