What Production LLM Systems Need to Survive

A prototype car only needs to drive. A production car needs seatbelts, airbags, crumple zones, and a hundred safety systems you never think about until you need them. The engine is the same. Everything else is different.

The same distinction holds for production LLM systems. The model is the engine. Timeouts, retries, fallbacks, and rate limits are the safety systems that determine whether it survives the real road.

The Production Survival Kit

class ProductionSurvivalKit:
    """
    What demo code doesn't have
    """
    components = {
        "timeouts": {
            "purpose": "Prevent stuck requests from consuming resources",
            "layer": "Apply at every network boundary",
            "default": "Never wait forever",
        },
        "retries": {
            "purpose": "Handle transient failures",
            "pattern": "Exponential backoff with jitter",
            "limit": "Don't retry forever",
        },
        "circuit_breakers": {
            "purpose": "Stop calling failing services",
            "pattern": "Open after N failures, half-open to test recovery",
            "why": "Prevent cascade failures",
        },
        "rate_limits": {
            "purpose": "Protect from overload",
            "granularity": "Per user, per endpoint, global",
            "why": "One bad actor shouldn't take down everyone",
        },
        "fallbacks": {
            "purpose": "Degrade gracefully",
            "options": "Cached response, smaller model, error message",
            "why": "Something is better than nothing",
        },
        "monitoring": {
            "purpose": "Know when things break",
            "metrics": "Latency, errors, throughput, saturation",
            "alerting": "Page before users complain",
        },
    }

Timeout Architecture

def timeout_strategy():
    return {
        "principle": "Timeouts at every layer, coordinated",

        "layers": {
            "client": {
                "timeout": "60s",
                "reason": "User patience limit",
            },
            "load_balancer": {
                "timeout": "55s",
                "reason": "Must be less than client",
            },
            "api_gateway": {
                "timeout": "50s",
                "reason": "Time for processing after LB",
            },
            "application": {
                "timeout": "45s",
                "reason": "Time for cleanup on timeout",
            },
            "model_inference": {
                "timeout": "40s",
                "reason": "Allow app layer to handle timeout",
            },
        },

        "implementation": """
        # Coordinated timeouts (asyncio.timeout requires Python 3.11+)
        async def handle_request(request):
            try:
                async with asyncio.timeout(45):
                    result = await generate(request)
                    return result
            except asyncio.TimeoutError:
                log.warning(f"Request timed out: {request.id}")
                return error_response("Request timed out", 504)
        """,
    }
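
The snippet above leans on names like generate and error_response that aren't defined anywhere. Here is a self-contained sketch of the same application-layer pattern; the stub generate and the 45-second budget are illustrative, and asyncio.timeout needs Python 3.11+:

import asyncio

APP_TIMEOUT_S = 45  # must stay below the gateway timeout above it

async def generate(prompt: str) -> str:
    await asyncio.sleep(1)  # stand-in for model inference
    return f"response to {prompt!r}"

async def handle_request(prompt: str) -> tuple[int, str]:
    try:
        async with asyncio.timeout(APP_TIMEOUT_S):
            return 200, await generate(prompt)
    except TimeoutError:
        # Fail with time left for the gateway to relay the 504
        return 504, "Request timed out"

print(asyncio.run(handle_request("hello")))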

Retry Strategy

import asyncio
import random


class RetryStrategy:
    """
    How to retry without making things worse
    """

    def __init__(self):
        self.max_retries = 3
        self.base_delay = 1.0
        self.max_delay = 30.0
        self.jitter = 0.1

    def should_retry(self, error: Exception, attempt: int) -> bool:
        if attempt >= self.max_retries:
            return False

        # Retry only transient errors; a 400 or 401 will fail the same way again
        retryable = [
            "Connection refused",
            "Timeout",
            "503 Service Unavailable",
            "429 Too Many Requests",
        ]
        return any(r in str(error) for r in retryable)

    def delay(self, attempt: int) -> float:
        # Exponential backoff, capped at max_delay
        delay = min(
            self.base_delay * (2 ** attempt),
            self.max_delay
        )
        # Add jitter so synchronized clients don't retry in lockstep
        jitter = delay * self.jitter * random.random()
        return delay + jitter

    async def execute_with_retry(self, func, *args):
        for attempt in range(self.max_retries + 1):
            try:
                return await func(*args)
            except Exception as e:
                if not self.should_retry(e, attempt):
                    raise
                await asyncio.sleep(self.delay(attempt))
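
A quick usage sketch, with a hypothetical flaky_generate standing in for the model call:

import asyncio

retry = RetryStrategy()

async def flaky_generate(prompt: str) -> str:
    # Simulates a transient upstream failure on every attempt
    raise ConnectionError("503 Service Unavailable")

async def main():
    try:
        await retry.execute_with_retry(flaky_generate, "hello")
    except ConnectionError as e:
        # Raised after max_retries attempts, with backoff between them
        print(f"gave up: {e}")

asyncio.run(main())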

Circuit Breaker

import time


class CircuitBreakerOpen(Exception):
    """Raised when the breaker is open and calls are rejected fast."""


class CircuitBreaker:
    """
    Stop calling failing services
    """

    def __init__(self, failure_threshold: int = 5, recovery_time: float = 60):
        self.failure_threshold = failure_threshold
        self.recovery_time = recovery_time
        self.failures = 0
        self.state = "closed"  # closed, open, half-open
        self.last_failure_time = None

    async def call(self, func, *args):
        if self.state == "open":
            if time.time() - self.last_failure_time > self.recovery_time:
                # Recovery window elapsed: let one request through as a probe
                self.state = "half-open"
            else:
                raise CircuitBreakerOpen()

        try:
            result = await func(*args)
            self.on_success()
            return result
        except Exception:
            self.on_failure()
            raise

    def on_success(self):
        self.failures = 0
        self.state = "closed"

    def on_failure(self):
        self.failures += 1
        self.last_failure_time = time.time()
        if self.failures >= self.failure_threshold:
            self.state = "open"

Rate Limiting

def rate_limiting_strategy():
    return {
        "per_user": {
            "purpose": "Prevent single user abuse",
            "limit": "100 requests/minute or 1M tokens/day",
            "response": "429 Too Many Requests with Retry-After header",
        },
        "per_endpoint": {
            "purpose": "Protect expensive endpoints",
            "example": "/generate has lower limit than /health",
            "implementation": "Different limits by path",
        },
        "global": {
            "purpose": "Protect infrastructure",
            "limit": "Based on actual capacity",
            "trigger": "When approaching capacity",
        },
        "implementation": """
        # Token bucket: tokens refill continuously at `rate` per second
        import time

        class RateLimiter:
            def __init__(self, rate: float, capacity: int):
                self.rate = rate  # Tokens per second
                self.capacity = capacity
                self.tokens = capacity
                self.last_update = time.time()

            def allow(self) -> bool:
                now = time.time()
                elapsed = now - self.last_update
                self.tokens = min(
                    self.capacity,
                    self.tokens + elapsed * self.rate
                )
                self.last_update = now

                if self.tokens >= 1:
                    self.tokens -= 1
                    return True
                return False
        """,
    }
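
The bucket above is a single global limiter. A per-user variant keeps one bucket per user id, created lazily. This sketch assumes the RateLimiter class from the snippet above is available, and the defaults match the 100 requests/minute per_user figure in the table:

class PerUserRateLimiter:
    def __init__(self, rate: float = 100 / 60, capacity: int = 100):
        self.rate = rate
        self.capacity = capacity
        self.buckets: dict[str, RateLimiter] = {}

    def allow(self, user_id: str) -> bool:
        # Lazily create one token bucket per user
        bucket = self.buckets.setdefault(
            user_id, RateLimiter(self.rate, self.capacity)
        )
        return bucket.allow()

# limiter.allow("user-42") returns True until that user's bucket drains;
# other users' buckets are unaffected.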

Fallback Strategy

def fallback_strategy():
    return {
        "cascade": {
            "tier_1": "Primary model/provider",
            "tier_2": "Backup model/provider",
            "tier_3": "Cached response (if available)",
            "tier_4": "Graceful error message",
        },
        "implementation": """
        async def generate_with_fallback(request):
            # Try primary
            try:
                return await primary_model.generate(request)
            except Exception as e:
                log.warning(f"Primary failed: {e}")

            # Try backup
            try:
                return await backup_model.generate(request)
            except Exception as e:
                log.warning(f"Backup failed: {e}")

            # Try cache
            cached = await cache.get(request.prompt_hash)
            if cached:
                return cached.with_warning("Using cached response")

            # Graceful failure
            return error_response(
                "Service temporarily unavailable. Please retry.",
                503
            )
        """,
    }
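
A runnable sketch of the same cascade with stub providers (all names hypothetical) makes the degradation order concrete: both models fail, so the request is served from cache:

import asyncio

async def primary(prompt: str) -> str:
    raise ConnectionError("primary down")  # simulated outage

async def backup(prompt: str) -> str:
    raise ConnectionError("backup down")  # simulated outage

CACHE = {"hello": "cached answer"}

async def generate_with_fallback(prompt: str) -> tuple[int, str]:
    for tier in (primary, backup):
        try:
            return 200, await tier(prompt)
        except Exception as e:
            print(f"{tier.__name__} failed: {e}")
    if prompt in CACHE:
        return 200, CACHE[prompt]  # stale, but better than nothing
    return 503, "Service temporarily unavailable. Please retry."

print(asyncio.run(generate_with_fallback("hello")))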

Monitoring and Alerting

def monitoring_essentials():
    return {
        "metrics": {
            "latency": {
                "what": "P50, P95, P99 response time",
                "alert": "P99 > 5s",
            },
            "error_rate": {
                "what": "Percentage of failed requests",
                "alert": "> 1%",
            },
            "throughput": {
                "what": "Requests per second",
                "alert": "Sudden drop > 50%",
            },
            "saturation": {
                "what": "GPU utilization, queue depth",
                "alert": "> 90% sustained",
            },
        },
        "logs": {
            "structured": "JSON logs with request_id, latency, status",
            "correlation": "Trace requests across services",
            "retention": "Keep for debugging (14-30 days)",
        },
        "dashboards": {
            "operational": "Current health at a glance",
            "analytical": "Trends over time",
            "debugging": "Drill-down capability",
        },
    }
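
For the structured-logs point, a minimal stdlib-only sketch (field names are illustrative) emits one JSON object per request, which is what makes cross-service correlation by request_id possible:

import json
import logging
import time
import uuid

logging.basicConfig(level=logging.INFO, format="%(message)s")
log = logging.getLogger("llm.requests")

def log_request(request_id: str, status: int, latency_ms: float) -> None:
    # One JSON object per line: trivial to ship, parse, and join on request_id
    log.info(json.dumps({
        "request_id": request_id,
        "status": status,
        "latency_ms": round(latency_ms, 1),
        "ts": time.time(),
    }))

log_request(str(uuid.uuid4()), 200, 412.3)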

The Checklist

def production_readiness_checklist():
    return [
        "[ ] Timeouts configured at all layers",
        "[ ] Retry logic with exponential backoff",
        "[ ] Circuit breakers for external dependencies",
        "[ ] Rate limits per user and global",
        "[ ] Fallback strategy defined and tested",
        "[ ] Health check endpoint (separate from inference)",
        "[ ] Metrics for latency, errors, throughput",
        "[ ] Alerts before user impact",
        "[ ] Graceful shutdown on SIGTERM",
        "[ ] Log correlation across requests",
        "[ ] Runbooks for common failures",
        "[ ] Chaos testing performed",
    ]
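
One item on that list deserves a sketch because it is so often skipped: graceful shutdown. A minimal asyncio version (the drain step is a placeholder, and add_signal_handler is Unix-only) stops taking new work on SIGTERM and lets in-flight requests finish:

import asyncio
import signal

async def serve():
    stop = asyncio.Event()
    loop = asyncio.get_running_loop()
    # On SIGTERM (e.g. a Kubernetes pod eviction), begin shutdown
    loop.add_signal_handler(signal.SIGTERM, stop.set)

    await stop.wait()
    # Placeholder: stop accepting requests, then drain in-flight work
    await asyncio.sleep(5)
    print("drained, exiting cleanly")

asyncio.run(serve())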

Production systems survive not because nothing goes wrong, but because everything has a plan for when it does. The model is the easy part. The resilience infrastructure is what keeps it running.