Using Spot Instances for Inference Workloads
Airlines overbook flights knowing some passengers won't show. Spot instances are cloud computing's overbooking—providers sell unused capacity cheap, but can reclaim it anytime.
For LLM inference, this creates an opportunity. Batch jobs don't care about interruptions. Real-time serving does. Mixing both deliberately can cut GPU costs by 50-70%.
The Spot Economics
def spot_savings():
    """
    Real spot vs on-demand pricing (illustrative hourly rates)
    """
    gpu_pricing = {
        "h100": {
            "on_demand": 4.50,
            "spot": 1.80,  # 60% discount
            "spot_availability": "Variable by region",
        },
        "a100": {
            "on_demand": 2.50,
            "spot": 0.90,  # 64% discount
            "spot_availability": "Generally better than H100",
        },
    }
    savings_example = {
        "100_gpu_hours_on_demand": 450.00,
        "100_gpu_hours_spot": 180.00,
        "savings": 270.00,
        "savings_percent": 60,
    }
    return gpu_pricing, savings_example
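Those per-hour numbers compound once you mix pools. As a rough sketch using the same illustrative H100 rates, and an assumed 30/70 on-demand/spot split (an assumption for illustration, not a recommendation), the blended hourly rate of a hybrid fleet looks like this:

def blended_hourly_rate(on_demand_fraction: float = 0.3,
                        on_demand_price: float = 4.50,
                        spot_price: float = 1.80) -> float:
    """Weighted-average GPU-hour cost for a mixed fleet."""
    spot_fraction = 1.0 - on_demand_fraction
    return on_demand_fraction * on_demand_price + spot_fraction * spot_price

# 0.3 * 4.50 + 0.7 * 1.80 = 2.61/hour, roughly a 42% discount vs. all on-demand
print(blended_hourly_rate())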
Where Spot Works for Inference
def spot_appropriate_workloads():
    """
    Spot works when interruption is tolerable
    """
    return {
        "good_for_spot": [
            {
                "workload": "Batch summarization",
                "why": "Can restart interrupted jobs",
                "handling": "Checkpoint progress, resume",
            },
            {
                "workload": "Embedding generation",
                "why": "Stateless, idempotent",
                "handling": "Retry failed batches",
            },
            {
                "workload": "Evaluation runs",
                "why": "Not time-sensitive",
                "handling": "Queue and retry",
            },
            {
                "workload": "Fine-tuning data prep",
                "why": "Can wait for capacity",
                "handling": "Checkpoint frequently",
            },
            {
                "workload": "Low-priority background requests",
                "why": "Users not waiting",
                "handling": "Queue with fallback",
            },
        ],
        "bad_for_spot": [
            {
                "workload": "Real-time chat",
                "why": "User waiting, can't retry mid-response",
            },
            {
                "workload": "Streaming responses",
                "why": "Interruption breaks stream",
            },
            {
                "workload": "Low-latency API serving",
                "why": "Interruption = failed request",
            },
            {
                "workload": "SLA-bound requests",
                "why": "Can't guarantee availability",
            },
        ],
    }
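The "handling" entries above all reduce to the same pattern: make each unit of work idempotent and record progress somewhere durable, so a replacement instance resumes instead of restarting from zero. A minimal sketch of that pattern for an embedding job (embed_batch is a placeholder for your model call, and the local JSON file stands in for object storage):

import json
import os

def process_batches(job_id: str, batches: list, checkpoint_path: str):
    """Resume a batch job from the last completed batch after an interruption."""
    start = 0
    if os.path.exists(checkpoint_path):
        with open(checkpoint_path) as f:
            start = json.load(f)["next_batch_index"]

    for i in range(start, len(batches)):
        embed_batch(batches[i])  # placeholder; idempotent: same input -> same output
        with open(checkpoint_path, "w") as f:
            json.dump({"job_id": job_id, "next_batch_index": i + 1}, f)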
The Hybrid Architecture
class HybridInferenceCluster:
    """
    Mix spot and on-demand for cost optimization
    """
    def __init__(self):
        self.on_demand_pool = {
            "purpose": "Real-time, guaranteed capacity",
            "size": "Minimum needed for baseline",
            "autoscale": "Up to 2x for peaks",
        }
        self.spot_pool = {
            "purpose": "Batch jobs, overflow",
            "size": "As much as available",
            "fallback": "Queue or use on-demand",
        }
        self.spot_capacity_free = 0  # updated by the pool manager

    def spot_available(self) -> bool:
        # Placeholder for a real capacity check against the spot pool
        return self.spot_capacity_free > 0

    def route_request(self, request: dict) -> str:
        if request["type"] == "batch":
            if self.spot_available():
                return "spot"
            return "queue"  # Wait for spot capacity
        if request["type"] == "real_time":
            return "on_demand"  # Always on-demand
        if request.get("priority") == "low":
            if self.spot_available():
                return "spot"
            return "on_demand"  # Fallback
        return "on_demand"  # Default safe choice
Handling Interruptions
import asyncio
import time


class SpotInterruptionHandler:
    """
    Spot instances give roughly a 2-minute warning before termination
    """
    def __init__(self):
        self.checkpoint_interval = 30  # seconds
        self.warning_received = False

    async def monitor_interruption(self):
        """Poll cloud metadata for an interruption notice"""
        while True:
            if await self.check_interruption_notice():
                self.warning_received = True
                await self.graceful_shutdown()
                return  # No point polling once shutdown has started
            await asyncio.sleep(5)

    async def graceful_shutdown(self):
        """
        ~2 minutes to shut down gracefully
        """
        # 1. Stop accepting new requests
        self.stop_accepting_requests()

        # 2. Complete in-flight requests if possible
        for request in self.in_flight_requests:
            if request.estimated_completion < 90:  # seconds
                await request.complete()
            else:
                await self.checkpoint_request(request)

        # 3. Checkpoint batch job progress
        if self.current_job:
            self.checkpoint_batch_progress()

        # 4. Notify the orchestrator
        await self.notify_pool_manager("shutting_down")

    def checkpoint_batch_progress(self):
        """Save progress so another instance can resume"""
        checkpoint = {
            "job_id": self.current_job.id,
            "items_processed": self.current_job.processed_count,
            "next_item_index": self.current_job.current_index,
            "timestamp": time.time(),
        }
        self.save_checkpoint(checkpoint)
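The check_interruption_notice call above is cloud-specific. On AWS EC2, for example, the instance metadata service exposes a spot interruption notice at a well-known path; a minimal polling sketch with aiohttp might back that method (other clouds have similar metadata endpoints, and IMDSv2 token handling is omitted here):

import asyncio
import aiohttp

SPOT_ACTION_URL = "http://169.254.169.254/latest/meta-data/spot/instance-action"

async def check_interruption_notice() -> bool:
    """Return True if this instance has been marked for reclamation."""
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(SPOT_ACTION_URL,
                                   timeout=aiohttp.ClientTimeout(total=1)) as resp:
                # 200 with a JSON body means an interruption is scheduled;
                # 404 means no notice has been issued yet.
                return resp.status == 200
    except (aiohttp.ClientError, asyncio.TimeoutError):
        return False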
The Cost Math
def spot_cost_analysis():
    """
    Calculate actual savings with interruptions
    """
    # Assumptions
    spot_hourly = 1.80
    on_demand_hourly = 4.50
    interruption_rate = 0.05       # 5% of spot hours interrupted
    restart_overhead_minutes = 10  # Time lost per interruption

    # 1000 GPU hours on spot
    spot_hours = 1000
    expected_interruptions = spot_hours * interruption_rate  # 50
    wasted_compute_hours = expected_interruptions * (restart_overhead_minutes / 60)  # ~8.3 hours
    effective_hours = spot_hours - wasted_compute_hours

    effective_cost = spot_hours * spot_hourly

    # Compare to on-demand delivering the same effective hours (no interruptions)
    on_demand_cost = effective_hours * on_demand_hourly

    return {
        "spot_cost": spot_hours * spot_hourly,                        # $1,800
        "effective_spot_hours": effective_hours,                      # 991.7
        "cost_per_effective_hour": effective_cost / effective_hours,  # ~$1.82
        "on_demand_equivalent": on_demand_cost,                       # ~$4,462
        "savings": on_demand_cost - effective_cost,                   # ~$2,662
        "savings_percent": 59.6,
    }
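Under the same assumptions, it's worth checking how sensitive those savings are to the interruption rate; a small sketch:

def savings_percent(interruption_rate: float,
                    restart_overhead_minutes: float = 10,
                    spot_hourly: float = 1.80,
                    on_demand_hourly: float = 4.50) -> float:
    """Savings vs. on-demand delivering the same effective compute."""
    wasted_fraction = interruption_rate * restart_overhead_minutes / 60
    effective_fraction = 1 - wasted_fraction
    on_demand_equivalent = effective_fraction * on_demand_hourly
    return 100 * (on_demand_equivalent - spot_hourly) / on_demand_equivalent

# Savings stay near 60% even at much higher interruption rates, because a
# 10-minute restart wastes little compute relative to the discount.
for rate in (0.05, 0.10, 0.20):
    print(rate, round(savings_percent(rate), 1))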
Best Practices
def spot_best_practices():
    return [
        {
            "practice": "Design for idempotency",
            "why": "Retrying interrupted work should be safe",
            "how": "Unique job IDs, checkpoint before processing",
        },
        {
            "practice": "Checkpoint frequently",
            "why": "Minimize work lost on interruption",
            "how": "Save state every N items or M seconds",
        },
        {
            "practice": "Use multiple availability zones",
            "why": "Spot availability varies by zone",
            "how": "Spread spot fleet across 3+ zones",
        },
        {
            "practice": "Keep on-demand baseline",
            "why": "Guaranteed capacity for critical work",
            "how": "Size for minimum viable throughput",
        },
        {
            "practice": "Monitor spot pricing",
            "why": "Prices spike when capacity is low",
            "how": "Set max price, fall back to on-demand",
        },
        {
            "practice": "Test interruption handling",
            "why": "Find bugs before production",
            "how": "Manually terminate spot instances",
        },
    ]
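The last practice is also easy to rehearse locally before you terminate a real instance: fake the metadata check so the monitoring loop fires. A sketch built on the SpotInterruptionHandler above, where both the notice and the shutdown path are stubbed out purely to exercise the polling loop:

import asyncio

class RehearsalHandler(SpotInterruptionHandler):
    """Fake the notice and the shutdown so the loop can be tested locally."""
    def __init__(self):
        super().__init__()
        self.polls = 0
        self.shutdowns = 0

    async def check_interruption_notice(self) -> bool:
        self.polls += 1
        return self.polls >= 2  # pretend a notice appears on the second poll

    async def graceful_shutdown(self):
        self.shutdowns += 1     # stand-in for the real drain/checkpoint path

async def rehearse():
    handler = RehearsalHandler()
    await handler.monitor_interruption()
    assert handler.warning_received and handler.shutdowns == 1

asyncio.run(rehearse())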
Spot instances aren't free money—they're cheap capacity with strings attached. Design around the strings, and you get 50-70% cost reduction. Ignore the strings, and you get reliability problems.