Using Spot Instances for Inference Workloads

Airlines overbook flights knowing some passengers won't show. Spot instances are cloud computing's overbooking—providers sell unused capacity cheap, but can reclaim it anytime.

For LLM inference, this creates an opportunity. Batch jobs don't care about interruptions. Real-time serving does. Mixing both can cut costs dramatically.

The Spot Economics

def spot_savings():
    """
    Real spot vs on-demand pricing
    """
    gpu_pricing = {
        "h100": {
            "on_demand": 4.50,
            "spot": 1.80,  # 60% discount
            "spot_availability": "Variable by region",
        },
        "a100": {
            "on_demand": 2.50,
            "spot": 0.90,  # 64% discount
            "spot_availability": "Generally better than H100",
        },
    }

    savings_example = {
        "100_gpu_hours_on_demand": 450.00,
        "100_gpu_hours_spot": 180.00,
        "savings": 270.00,
        "savings_percent": 60,
    }

    return gpu_pricing, savings_example

Where Spot Works for Inference

def spot_appropriate_workloads():
    """
    Spot works when interruption is tolerable
    """
    return {
        "good_for_spot": [
            {
                "workload": "Batch summarization",
                "why": "Can restart interrupted jobs",
                "handling": "Checkpoint progress, resume",
            },
            {
                "workload": "Embedding generation",
                "why": "Stateless, idempotent",
                "handling": "Retry failed batches",
            },
            {
                "workload": "Evaluation runs",
                "why": "Not time-sensitive",
                "handling": "Queue and retry",
            },
            {
                "workload": "Fine-tuning data prep",
                "why": "Can wait for capacity",
                "handling": "Checkpoint frequently",
            },
            {
                "workload": "Low-priority background requests",
                "why": "Users not waiting",
                "handling": "Queue with fallback",
            },
        ],
        "bad_for_spot": [
            {
                "workload": "Real-time chat",
                "why": "User waiting, can't retry mid-response",
            },
            {
                "workload": "Streaming responses",
                "why": "Interruption breaks stream",
            },
            {
                "workload": "Low-latency API serving",
                "why": "Interruption = failed request",
            },
            {
                "workload": "SLA-bound requests",
                "why": "Can't guarantee availability",
            },
        ],
    }
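
For the batch-side workloads, the handling column above boils down to one pattern: key results by item, checkpoint the cursor, and make reruns safe. Here is a minimal sketch of a resumable batch loop, assuming a local checkpoint file and a caller-supplied `infer` function (in practice the checkpoint would live in object storage, not on the instance's disk):

import json
import time

CHECKPOINT_PATH = "batch_checkpoint.json"  # assumption: in practice an S3/GCS object, not local disk

def load_next_index() -> int:
    """Where to resume from; 0 on a fresh start."""
    try:
        with open(CHECKPOINT_PATH) as f:
            return json.load(f)["next_item_index"]
    except FileNotFoundError:
        return 0

def save_next_index(next_index: int) -> None:
    with open(CHECKPOINT_PATH, "w") as f:
        json.dump({"next_item_index": next_index, "timestamp": time.time()}, f)

def run_batch(items, infer, checkpoint_every=100):
    """
    Resumable, idempotent batch loop: results are keyed by item index,
    so rerunning after an interruption overwrites instead of duplicating.
    """
    results = {}
    for i in range(load_next_index(), len(items)):
        results[i] = infer(items[i])
        if (i + 1) % checkpoint_every == 0:
            save_next_index(i + 1)  # a replacement instance resumes here
    save_next_index(len(items))
    return results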

The Hybrid Architecture

class HybridInferenceCluster:
    """
    Mix spot and on-demand for cost optimization
    """
    def __init__(self):
        self.on_demand_pool = {
            "purpose": "Real-time, guaranteed capacity",
            "size": "Minimum needed for baseline",
            "autoscale": "Up to 2x for peaks",
        }
        self.spot_pool = {
            "purpose": "Batch jobs, overflow",
            "size": "As much as available",
            "fallback": "Queue or use on-demand",
        }

    def spot_available(self) -> bool:
        """Stub: in practice, ask the pool manager or the provider's capacity API."""
        return bool(self.spot_pool.get("instances"))

    def route_request(self, request: dict) -> str:
        if request["type"] == "batch":
            if self.spot_available():
                return "spot"
            return "queue"  # Wait for spot capacity

        if request["type"] == "real_time":
            return "on_demand"  # Always on-demand

        if request.get("priority") == "low":
            if self.spot_available():
                return "spot"
            return "on_demand"  # Fallback

        return "on_demand"  # Default safe choice

Handling Interruptions

import asyncio
import time

class SpotInterruptionHandler:
    """
    Spot instances give a short warning before termination
    (about 2 minutes on AWS EC2, as little as 30 seconds elsewhere)
    """
    def __init__(self):
        self.checkpoint_interval = 30  # seconds
        self.warning_received = False

    async def monitor_interruption(self):
        """Poll cloud metadata for interruption notice"""
        while True:
            if await self.check_interruption_notice():
                self.warning_received = True
                await self.graceful_shutdown()
                return  # Stop polling once shutdown has started
            await asyncio.sleep(5)

    async def graceful_shutdown(self):
        """
        ~2 minutes to shut down gracefully
        """
        # 1. Stop accepting new requests
        self.stop_accepting_requests()

        # 2. Complete in-flight requests if possible
        for request in self.in_flight_requests:
            if request.estimated_completion < 90:  # seconds
                await request.complete()
            else:
                await self.checkpoint_request(request)

        # 3. Checkpoint batch job progress
        if self.current_batch_job:
            self.checkpoint_batch_progress()

        # 4. Notify orchestrator
        await self.notify_pool_manager("shutting_down")

    def checkpoint_batch_progress(self):
        """Save progress so another instance can resume"""
        checkpoint = {
            "job_id": self.current_batch_job.id,
            "items_processed": self.current_batch_job.processed_count,
            "next_item_index": self.current_batch_job.current_index,
            "timestamp": time.time(),
        }
        self.save_checkpoint(checkpoint)
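
What `check_interruption_notice` looks like is provider-specific. On AWS EC2, the instance metadata service exposes `spot/instance-action`, which returns 404 until an interruption is scheduled; GCP and Azure have their own metadata endpoints with shorter notice windows. A minimal EC2-flavored sketch of what that check might do, standalone here and using only the standard library:

import asyncio
import urllib.error
import urllib.request

IMDS = "http://169.254.169.254/latest"

def _spot_interruption_pending() -> bool:
    """True once EC2 has scheduled this spot instance for reclamation."""
    try:
        token_req = urllib.request.Request(
            f"{IMDS}/api/token",
            method="PUT",
            headers={"X-aws-ec2-metadata-token-ttl-seconds": "60"},
        )
        with urllib.request.urlopen(token_req, timeout=2) as resp:
            token = resp.read().decode()

        notice_req = urllib.request.Request(
            f"{IMDS}/meta-data/spot/instance-action",
            headers={"X-aws-ec2-metadata-token": token},
        )
        with urllib.request.urlopen(notice_req, timeout=2):
            return True  # 200 with an action payload: interruption scheduled
    except urllib.error.URLError:
        return False  # 404 (no notice yet) or metadata service unreachable

async def check_interruption_notice() -> bool:
    # Keep the blocking HTTP call off the event loop
    return await asyncio.to_thread(_spot_interruption_pending)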

The Cost Math

def spot_cost_analysis():
    """
    Calculate actual savings with interruptions
    """
    # Assumptions
    spot_hourly = 1.80
    on_demand_hourly = 4.50
    interruption_rate = 0.05  # 5% of spot hours interrupted
    restart_overhead_minutes = 10  # Time lost per interruption

    # 1000 GPU hours on spot
    spot_hours = 1000
    expected_interruptions = spot_hours * interruption_rate  # 50
    wasted_compute_hours = expected_interruptions * (restart_overhead_minutes / 60)  # 8.3 hours

    effective_hours = spot_hours - wasted_compute_hours
    effective_cost = (spot_hours * spot_hourly)

    # Cost of buying the same useful hours on-demand (no interruptions)
    on_demand_cost = effective_hours * on_demand_hourly

    return {
        "spot_cost": effective_cost,  # $1,800
        "effective_spot_hours": effective_hours,  # ~991.7
        "cost_per_effective_hour": effective_cost / effective_hours,  # ~$1.82
        "on_demand_equivalent": on_demand_cost,  # ~$4,462
        "savings": on_demand_cost - effective_cost,  # ~$2,662
        "savings_percent": round(100 * (on_demand_cost - effective_cost) / on_demand_cost, 1),  # ~59.7
    }
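
A useful sanity check on those assumptions: how bad would interruptions have to get before spot stopped paying off? With the pricing above, spot stays cheaper as long as the fraction of hours lost to interruptions is smaller than the discount:

def spot_break_even(spot_hourly=1.80, on_demand_hourly=4.50):
    """
    Effective spot cost per useful hour = spot_hourly / (1 - rate * hours_lost).
    It crosses on-demand when rate * hours_lost_per_interruption = 1 - spot/on_demand.
    """
    discount = 1 - spot_hourly / on_demand_hourly  # 0.6

    # At a 5% interruption rate, each interruption could waste up to
    # 0.6 / 0.05 = 12 hours of work before spot broke even on cost.
    max_hours_lost_at_5pct = discount / 0.05

    # With 10 minutes lost per interruption, the break-even interruption
    # rate is 0.6 / (10/60) = 3.6 interruptions per spot hour. At realistic
    # overheads, the risk of ignoring interruptions is reliability, not cost.
    break_even_rate_at_10min = discount / (10 / 60)

    return {
        "max_hours_lost_per_interruption_at_5pct": max_hours_lost_at_5pct,  # 12.0
        "break_even_interruption_rate_at_10min": break_even_rate_at_10min,  # 3.6
    }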

Best Practices

def spot_best_practices():
    return [
        {
            "practice": "Design for idempotency",
            "why": "Retrying interrupted work should be safe",
            "how": "Unique job IDs, checkpoint before processing",
        },
        {
            "practice": "Checkpoint frequently",
            "why": "Minimize work lost on interruption",
            "how": "Save state every N items or M seconds",
        },
        {
            "practice": "Use multiple availability zones",
            "why": "Spot availability varies by zone",
            "how": "Spread spot fleet across 3+ zones",
        },
        {
            "practice": "Keep on-demand baseline",
            "why": "Guaranteed capacity for critical work",
            "how": "Size for minimum viable throughput",
        },
        {
            "practice": "Monitor spot pricing",
            "why": "Prices spike when capacity is low",
            "how": "Set max price, fall back to on-demand",
        },
        {
            "practice": "Test interruption handling",
            "why": "Find bugs before production",
            "how": "Manually terminate spot instances",
        },
    ]
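
Chaos-testing the real thing, actually reclaiming an instance, is the final check, but the shutdown path can be exercised much more cheaply first. A sketch of a unit test against the SpotInterruptionHandler above, stubbing the collaborators that class assumes exist:

import asyncio
from unittest.mock import AsyncMock

def test_graceful_shutdown_on_interruption_notice():
    handler = SpotInterruptionHandler()

    # Stub the hooks the handler expects from the serving stack
    handler.check_interruption_notice = AsyncMock(return_value=True)
    handler.stop_accepting_requests = lambda: None
    handler.in_flight_requests = []
    handler.current_batch_job = None
    handler.notify_pool_manager = AsyncMock()

    asyncio.run(handler.monitor_interruption())

    assert handler.warning_received
    handler.notify_pool_manager.assert_awaited_with("shutting_down")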

Spot instances aren't free money—they're cheap capacity with strings attached. Design around the strings, and you get 50-70% cost reduction. Ignore the strings, and you get reliability problems.