What Production LLM Systems Need to Survive
A prototype car only needs to drive. A production car needs seatbelts, airbags, crumple zones, and a hundred safety systems you never think about until you need them. The engine is the same. Everything else is different.
The same distinction applies to production LLM systems. The model is the engine. Timeouts, retries, fallbacks, and rate limits are the safety systems that determine whether the car survives the real road.
The Production Survival Kit
class ProductionSurvivalKit:
    """
    What demo code doesn't have
    """
    components = {
        "timeouts": {
            "purpose": "Prevent stuck requests from consuming resources",
            "layer": "Apply at every network boundary",
            "default": "Never wait forever",
        },
        "retries": {
            "purpose": "Handle transient failures",
            "pattern": "Exponential backoff with jitter",
            "limit": "Don't retry forever",
        },
        "circuit_breakers": {
            "purpose": "Stop calling failing services",
            "pattern": "Open after N failures, half-open to test recovery",
            "why": "Prevent cascade failures",
        },
        "rate_limits": {
            "purpose": "Protect from overload",
            "granularity": "Per user, per endpoint, global",
            "why": "One bad actor shouldn't take down everyone",
        },
        "fallbacks": {
            "purpose": "Degrade gracefully",
            "options": "Cached response, smaller model, error message",
            "why": "Something is better than nothing",
        },
        "monitoring": {
            "purpose": "Know when things break",
            "metrics": "Latency, errors, throughput, saturation",
            "alerting": "Page before users complain",
        },
    }
Timeout Architecture
def timeout_strategy():
    return {
        "principle": "Timeouts at every layer, coordinated",
        "layers": {
            "client": {
                "timeout": "60s",
                "reason": "User patience limit",
            },
            "load_balancer": {
                "timeout": "55s",
                "reason": "Must be less than client",
            },
            "api_gateway": {
                "timeout": "50s",
                "reason": "Time for processing after LB",
            },
            "application": {
                "timeout": "45s",
                "reason": "Time for cleanup on timeout",
            },
            "model_inference": {
                "timeout": "40s",
                "reason": "Allow app layer to handle timeout",
            },
        },
        "implementation": """
        # Coordinated timeouts
        async def handle_request(request):
            try:
                async with asyncio.timeout(45):
                    result = await generate(request)
                return result
            except asyncio.TimeoutError:
                log.warning(f"Request timed out: {request.id}")
                return error_response("Request timed out", 504)
        """,
    }
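To make the coordination concrete, here is a minimal sketch of the "timeouts tighten inward" idea: the inference call gets a strictly smaller budget than the application layer, so the application is still alive to log and return a proper error. The names call_model, APP_TIMEOUT, and INFERENCE_TIMEOUT are illustrative, and asyncio.timeout requires Python 3.11+.

import asyncio

APP_TIMEOUT = 45        # application-layer budget (seconds)
INFERENCE_TIMEOUT = 40  # strictly smaller, so the app layer wins the race

async def call_model(prompt: str) -> str:
    # Placeholder for the real model client call.
    await asyncio.sleep(1)
    return f"response to: {prompt}"

async def handle_request(prompt: str) -> str:
    async with asyncio.timeout(APP_TIMEOUT):
        try:
            # The inner timeout fires first, leaving ~5s for cleanup and logging.
            async with asyncio.timeout(INFERENCE_TIMEOUT):
                return await call_model(prompt)
        except TimeoutError:
            # Inference timed out, but the app layer still gets to respond.
            return "Request timed out, please retry."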
Retry Strategy
import asyncio
import random


class RetryStrategy:
    """
    How to retry without making things worse
    """
    def __init__(self):
        self.max_retries = 3
        self.base_delay = 1.0   # seconds
        self.max_delay = 30.0   # cap on backoff
        self.jitter = 0.1       # fraction of the delay added as random jitter

    def should_retry(self, error: Exception, attempt: int) -> bool:
        if attempt >= self.max_retries:
            return False
        # Retry only on transient errors
        retryable = [
            "Connection refused",
            "Timeout",
            "503 Service Unavailable",
            "429 Too Many Requests",
        ]
        return any(r in str(error) for r in retryable)

    def delay(self, attempt: int) -> float:
        # Exponential backoff, capped at max_delay
        delay = min(
            self.base_delay * (2 ** attempt),
            self.max_delay
        )
        # Add jitter to prevent thundering herd
        jitter = delay * self.jitter * random.random()
        return delay + jitter

    async def execute_with_retry(self, func, *args):
        last_error = None
        for attempt in range(self.max_retries + 1):
            try:
                return await func(*args)
            except Exception as e:
                last_error = e
                if self.should_retry(e, attempt):
                    await asyncio.sleep(self.delay(attempt))
                else:
                    raise
        raise last_error  # defensive; the loop always returns or raises first
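Usage is a thin wrapper around whatever call might fail transiently. A minimal sketch, where fetch_completion stands in for your real model client:

retry = RetryStrategy()

async def fetch_completion(prompt: str) -> str:
    ...  # call the model provider here

async def answer(prompt: str) -> str:
    # Transient connection errors, 429s, and 503s get retried with backoff;
    # everything else surfaces immediately.
    return await retry.execute_with_retry(fetch_completion, prompt)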
Circuit Breaker
import time


class CircuitBreakerOpen(Exception):
    """Raised when a call is rejected because the circuit is open."""


class CircuitBreaker:
    """
    Stop calling failing services
    """
    def __init__(self, failure_threshold: int = 5, recovery_time: float = 60):
        self.failure_threshold = failure_threshold
        self.recovery_time = recovery_time
        self.failures = 0
        self.state = "closed"  # closed, open, half-open
        self.last_failure_time = None

    async def call(self, func, *args):
        if self.state == "open":
            if time.time() - self.last_failure_time > self.recovery_time:
                # Let one probe request through to test recovery
                self.state = "half-open"
            else:
                raise CircuitBreakerOpen()
        try:
            result = await func(*args)
            self.on_success()
            return result
        except Exception:
            self.on_failure()
            raise

    def on_success(self):
        self.failures = 0
        self.state = "closed"

    def on_failure(self):
        self.failures += 1
        self.last_failure_time = time.time()
        # A failed probe while half-open, or too many failures, opens the circuit
        if self.state == "half-open" or self.failures >= self.failure_threshold:
            self.state = "open"
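Wiring it in front of a dependency might look like the sketch below: one breaker per external service, with the open-circuit case routed straight to the fallback path instead of waiting on a dead service. call_provider and the error message are placeholders, not a specific client API.

# Hypothetical wiring: one breaker per external dependency.
breaker = CircuitBreaker(failure_threshold=5, recovery_time=60)

async def call_provider(prompt: str) -> str:
    ...  # real model/provider call goes here

async def generate(prompt: str) -> str:
    try:
        return await breaker.call(call_provider, prompt)
    except CircuitBreakerOpen:
        # Don't queue up behind a failing service; degrade immediately.
        return "Service temporarily unavailable. Please retry."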
Rate Limiting
def rate_limiting_strategy():
    return {
        "per_user": {
            "purpose": "Prevent single user abuse",
            "limit": "100 requests/minute or 1M tokens/day",
            "response": "429 Too Many Requests with Retry-After header",
        },
        "per_endpoint": {
            "purpose": "Protect expensive endpoints",
            "example": "/generate has lower limit than /health",
            "implementation": "Different limits by path",
        },
        "global": {
            "purpose": "Protect infrastructure",
            "limit": "Based on actual capacity",
            "trigger": "When approaching capacity",
        },
        "implementation": """
        # Token bucket rate limiter
        class RateLimiter:
            def __init__(self, rate: float, capacity: int):
                self.rate = rate  # Tokens per second
                self.capacity = capacity
                self.tokens = capacity
                self.last_update = time.time()

            def allow(self) -> bool:
                now = time.time()
                elapsed = now - self.last_update
                self.tokens = min(
                    self.capacity,
                    self.tokens + elapsed * self.rate
                )
                self.last_update = now
                if self.tokens >= 1:
                    self.tokens -= 1
                    return True
                return False
        """,
    }
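The token bucket above is a single global bucket; per-user limiting is just one bucket per user ID. A minimal sketch under that assumption, with the bucket repeated so the snippet stands alone and the 100 requests/minute figure taken from the table above:

import time
from collections import defaultdict

class RateLimiter:
    # Same token bucket as the snippet above, repeated for self-containment.
    def __init__(self, rate: float, capacity: int):
        self.rate = rate          # tokens per second
        self.capacity = capacity
        self.tokens = capacity
        self.last_update = time.time()

    def allow(self) -> bool:
        now = time.time()
        self.tokens = min(self.capacity, self.tokens + (now - self.last_update) * self.rate)
        self.last_update = now
        if self.tokens >= 1:
            self.tokens -= 1
            return True
        return False

# One bucket per user: 100 requests/minute = rate of 100/60 tokens per second.
user_buckets: dict[str, RateLimiter] = defaultdict(
    lambda: RateLimiter(rate=100 / 60, capacity=100)
)

def check_request(user_id: str) -> bool:
    # Returns False when the caller should respond 429 with a Retry-After header.
    return user_buckets[user_id].allow()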
Fallback Strategy
def fallback_strategy():
    return {
        "cascade": {
            "tier_1": "Primary model/provider",
            "tier_2": "Backup model/provider",
            "tier_3": "Cached response (if available)",
            "tier_4": "Graceful error message",
        },
        "implementation": """
        async def generate_with_fallback(request):
            # Try primary
            try:
                return await primary_model.generate(request)
            except Exception as e:
                log.warning(f"Primary failed: {e}")

            # Try backup
            try:
                return await backup_model.generate(request)
            except Exception as e:
                log.warning(f"Backup failed: {e}")

            # Try cache
            cached = await cache.get(request.prompt_hash)
            if cached:
                return cached.with_warning("Using cached response")

            # Graceful failure
            return error_response(
                "Service temporarily unavailable. Please retry.",
                503
            )
        """,
    }
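Tier 3 only works if something writes to the cache on the happy path. A minimal sketch of that write side; prompt_hash, cache.set, and the one-hour TTL are assumptions, not a specific cache API:

import hashlib

def prompt_hash(prompt: str) -> str:
    # Deterministic key for "same prompt" lookups; normalize however suits your traffic.
    return hashlib.sha256(prompt.strip().lower().encode()).hexdigest()

async def generate_and_cache(request, cache, primary_model):
    response = await primary_model.generate(request)
    # Populate the fallback cache on success so tier 3 has something to serve later.
    # A TTL keeps stale answers from living forever; 3600s is illustrative.
    await cache.set(prompt_hash(request.prompt), response, ttl=3600)
    return response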
Monitoring and Alerting
def monitoring_essentials():
    return {
        "metrics": {
            "latency": {
                "what": "P50, P95, P99 response time",
                "alert": "P99 > 5s",
            },
            "error_rate": {
                "what": "Percentage of failed requests",
                "alert": "> 1%",
            },
            "throughput": {
                "what": "Requests per second",
                "alert": "Sudden drop > 50%",
            },
            "saturation": {
                "what": "GPU utilization, queue depth",
                "alert": "> 90% sustained",
            },
        },
        "logs": {
            "structured": "JSON logs with request_id, latency, status",
            "correlation": "Trace requests across services",
            "retention": "Keep for debugging (14-30 days)",
        },
        "dashboards": {
            "operational": "Current health at a glance",
            "analytical": "Trends over time",
            "debugging": "Drill-down capability",
        },
    }
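The structured-log half of this is cheap to add from day one. A minimal sketch using only the standard library; the field names request_id, latency_ms, and status are conventions, not a required schema:

import json
import logging
import time
import uuid

log = logging.getLogger("llm_service")

def log_request(handler):
    # Decorator that emits one structured JSON line per request,
    # whether the handler succeeds or raises.
    async def wrapper(request, *args, **kwargs):
        request_id = str(uuid.uuid4())
        start = time.monotonic()
        status = "ok"
        try:
            return await handler(request, *args, **kwargs)
        except Exception:
            status = "error"
            raise
        finally:
            log.info(json.dumps({
                "request_id": request_id,
                "latency_ms": round((time.monotonic() - start) * 1000, 1),
                "status": status,
            }))
    return wrapper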
The Checklist
def production_readiness_checklist():
    return [
        "[ ] Timeouts configured at all layers",
        "[ ] Retry logic with exponential backoff",
        "[ ] Circuit breakers for external dependencies",
        "[ ] Rate limits per user and global",
        "[ ] Fallback strategy defined and tested",
        "[ ] Health check endpoint (separate from inference)",
        "[ ] Metrics for latency, errors, throughput",
        "[ ] Alerts before user impact",
        "[ ] Graceful shutdown on SIGTERM (sketched below)",
        "[ ] Log correlation across requests",
        "[ ] Runbooks for common failures",
        "[ ] Chaos testing performed",
    ]
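The graceful-shutdown item deserves one concrete note: catch SIGTERM, stop accepting new work, and let in-flight requests drain before exiting. A minimal asyncio sketch under those assumptions; the 30-second drain window and the handle body are placeholders for your serving stack:

import asyncio
import signal

# Track in-flight requests so shutdown can wait for them to drain.
in_flight: set[asyncio.Task] = set()
shutting_down = asyncio.Event()

async def handle(request):
    ...  # real request handling

async def accept(request):
    if shutting_down.is_set():
        raise RuntimeError("shutting down")  # or return 503 in your framework
    task = asyncio.create_task(handle(request))
    in_flight.add(task)
    task.add_done_callback(in_flight.discard)
    return await task

async def main():
    loop = asyncio.get_running_loop()
    # On SIGTERM (e.g. from Kubernetes), stop accepting and drain
    # instead of dying mid-request.
    loop.add_signal_handler(signal.SIGTERM, shutting_down.set)
    await shutting_down.wait()
    if in_flight:
        # Give in-flight requests up to 30s to finish; match this to your timeout budget.
        await asyncio.wait(in_flight, timeout=30)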
Production systems survive not because nothing goes wrong, but because everything has a plan for when it does. The model is the easy part. The resilience infrastructure is what keeps it running.