Degrading Gracefully Under Load
During blackouts, hospitals don't shut down. They switch to generators, dim non-essential lights, and prioritize critical care. Reduced service is better than no service.
LLM systems should degrade similarly. When capacity is strained, serve something rather than nothing. A shorter response is better than a timeout. A cached answer beats an error page.
The Degradation Ladder
class DegradationLevels:
    """
    Progressive degradation as load increases
    """

    levels = {
        "normal": {
            "load_threshold": 0.7,  # Below 70% capacity
            "actions": "Full service",
            "quality": "100%",
        },
        "elevated": {
            "load_threshold": 0.85,  # 70-85% capacity
            "actions": [
                "Reduce max_tokens to 2K",
                "Disable expensive features",
            ],
            "quality": "90%",
        },
        "high": {
            "load_threshold": 0.95,  # 85-95% capacity
            "actions": [
                "Use smaller model for low-priority",
                "Reduce context window",
                "Enable aggressive caching",
            ],
            "quality": "75%",
        },
        "critical": {
            "load_threshold": 1.0,  # 95%+ capacity
            "actions": [
                "All requests use small model",
                "Return cached responses where possible",
                "Queue non-urgent requests",
            ],
            "quality": "50%",
        },
        "overload": {
            "load_threshold": float("inf"),  # Beyond capacity
            "actions": [
                "Reject low-priority requests",
                "Serve only from cache",
                "Emergency mode messaging",
            ],
            "quality": "Emergency only",
        },
    }
Implementing Degradation
class GracefulDegradation:
    def __init__(self, config):
        self.config = config

    def current_load(self) -> float:
        """Calculate current system load as a value in [0, 1]."""
        # get_gpu_util / get_queue_depth / get_concurrent are expected to
        # read from whatever metrics source the serving stack exposes.
        metrics = {
            "gpu_utilization": self.get_gpu_util(),
            "queue_depth": self.get_queue_depth(),
            "concurrent_requests": self.get_concurrent(),
        }
        # Weighted average: GPU utilization dominates, queue depth and
        # concurrency are normalized against configured maximums
        return (
            metrics["gpu_utilization"] * 0.4
            + min(1.0, metrics["queue_depth"] / self.config.max_queue) * 0.3
            + min(1.0, metrics["concurrent_requests"] / self.config.max_concurrent) * 0.3
        )

    def get_degradation_level(self) -> str:
        load = self.current_load()
        for level, config in DegradationLevels.levels.items():
            if load < config["load_threshold"]:
                return level
        return "overload"

    def apply_degradation(self, request: Request) -> Request:
        """Modify request based on degradation level"""
        level = self.get_degradation_level()

        if level == "normal":
            return request

        if level == "elevated":
            request.max_tokens = min(request.max_tokens, 2000)
            return request

        if level == "high":
            request.max_tokens = min(request.max_tokens, 1000)
            if request.priority == "low":
                request.model = "small"
            return request

        if level in ["critical", "overload"]:
            request.model = "small"
            request.max_tokens = min(request.max_tokens, 500)
            return request

        return request
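In practice the degradation check sits at the front of the request path. Here is a minimal sketch of that wiring; `dispatch_to_model` is a hypothetical coroutine standing in for the actual model call, and `Request`/`Response` are the same assumed types used above:

async def handle(request: Request, degradation: GracefulDegradation) -> Response:
    # Record the level before modifying the request so it can be
    # reported back to the client (see "User Communication" below)
    level = degradation.get_degradation_level()
    request = degradation.apply_degradation(request)

    response = await dispatch_to_model(request)  # hypothetical model call
    response.degradation_level = level
    return response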
Degradation Strategies
def degradation_strategies():
    return {
        "reduce_output_length": {
            "how": "Lower max_tokens",
            "impact": "Shorter responses",
            "user_perception": "Truncated answers",
            "implementation": "request.max_tokens = min(request.max_tokens, limit)",
        },
        "smaller_model": {
            "how": "Route to less capable model",
            "impact": "Faster but lower quality",
            "user_perception": "Simpler answers",
            "implementation": "Route to haiku instead of opus",
        },
        "shorter_context": {
            "how": "Truncate input context",
            "impact": "Less context-aware responses",
            "user_perception": "May miss earlier conversation",
            "implementation": "Summarize or truncate history",
        },
        "cached_responses": {
            "how": "Return cached answers for common queries",
            "impact": "Instant but potentially stale",
            "user_perception": "Fast but may not be tailored",
            "implementation": "Check cache before model call",
        },
        "queue_with_estimate": {
            "how": "Queue request, give ETA",
            "impact": "Delayed but eventual service",
            "user_perception": "Waiting but informed",
            "implementation": "Return 202 with estimated wait time",
        },
        "feature_reduction": {
            "how": "Disable non-essential features",
            "impact": "Core function works, extras don't",
            "user_perception": "Limited but functional",
            "implementation": "Check feature flags based on load",
        },
    }
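As one concrete example, the cached-responses strategy can be sketched like this. The cache object, its get/set methods, and `call_model` are stand-ins for whatever your stack provides, and the key here is a naive exact-match hash of the prompt:

import hashlib

def answer_with_cache(prompt: str, cache, call_model, degraded: bool) -> str:
    """Exact-match cache check before the model call (sketch)."""
    key = hashlib.sha256(prompt.encode("utf-8")).hexdigest()

    if degraded:
        cached = cache.get(key)
        if cached is not None:
            # Under load, a fast, possibly stale answer beats a timeout
            return cached

    answer = call_model(prompt)
    cache.set(key, answer)
    return answer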
User Communication
def communicate_degradation():
    """
    Users should know when they're getting degraded service
    """
    return {
        "headers": {
            "X-Service-Level": "degraded | normal",
            "X-Service-Reason": "high_load | capacity_limit",
        },
        "response_metadata": {
            "service_status": "degraded",
            "reason": "High demand, shorter response",
            "full_service_eta": "~5 minutes",
        },
        "user_message": """
            [Response may be shorter than usual due to high demand]
        """,
        "why_communicate": [
            "Sets expectations",
            "Reduces support tickets",
            "Allows user to retry later",
            "Builds trust through transparency",
        ],
    }
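A framework-agnostic sketch of attaching these signals to an outgoing response; the response object with a headers dict is an assumption about your serving layer, and the field names follow the structure above:

from typing import Optional

def annotate_response(response, level: str, reason: str, eta: Optional[str] = None):
    """Attach degradation headers and metadata to a response (sketch)."""
    degraded = level != "normal"
    response.headers["X-Service-Level"] = "degraded" if degraded else "normal"
    if degraded:
        response.headers["X-Service-Reason"] = reason
        response.metadata = {
            "service_status": "degraded",
            "reason": reason,
            "full_service_eta": eta or "unknown",
        }
    return response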
Priority-Based Degradation
class PriorityDegradation:
    """
    Degrade low-priority first, protect high-priority
    """

    def handle_request(self, request: Request) -> Response:
        load = self.current_load()

        # High priority: never degrade until critical
        if request.priority == "high":
            if load < 0.95:
                return self.full_service(request)
            return self.minimal_degradation(request)

        # Medium priority: degrade under elevated load
        if request.priority == "medium":
            if load < 0.85:
                return self.full_service(request)
            return self.moderate_degradation(request)

        # Low priority: degrade early, protect capacity
        if load < 0.7:
            return self.full_service(request)
        if load < 0.9:
            return self.aggressive_degradation(request)
        return self.queue_or_reject(request)
Automatic Recovery
def automatic_recovery():
    return {
        "detect_recovery": """
            # Monitor load, restore service levels automatically
            async def recovery_monitor():
                while True:
                    load = get_current_load()
                    if load < 0.6 and get_degradation_level() != "normal":
                        # Load dropped, restore service
                        await gradual_restore()
                    await asyncio.sleep(10)
        """,
        "gradual_restore": """
            # Don't restore all at once (could spike load again)
            async def gradual_restore():
                current = get_degradation_level()
                if current == "critical":
                    set_degradation_level("high")
                elif current == "high":
                    set_degradation_level("elevated")
                elif current == "elevated":
                    set_degradation_level("normal")
        """,
        "hysteresis": """
            # Don't oscillate between levels.
            # Require sustained improvement before upgrading (pseudocode):
            #   if load has stayed below threshold for 5 minutes:
            #       upgrade_service_level()
        """,
    }
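The hysteresis snippet above is pseudocode; one runnable way to express it is to track how long load has stayed below the restore threshold before upgrading. The 0.6 threshold and five-minute window are illustrative defaults:

import time

class HysteresisRestorer:
    """Upgrade the service level only after load stays low for a sustained window."""

    def __init__(self, threshold: float = 0.6, hold_seconds: float = 300.0):
        self.threshold = threshold
        self.hold_seconds = hold_seconds
        self._below_since = None

    def should_upgrade(self, load: float) -> bool:
        if load >= self.threshold:
            # Load is still high (or spiked back up): reset the timer
            self._below_since = None
            return False
        if self._below_since is None:
            self._below_since = time.monotonic()
        return time.monotonic() - self._below_since >= self.hold_seconds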
Monitoring Degradation
def degradation_monitoring():
    return {
        "metrics": [
            "Time spent at each degradation level",
            "Requests affected by degradation",
            "Quality impact (user feedback during degradation)",
            "Recovery time after load spikes",
        ],
        "alerts": [
            "Degradation level elevated for > 30 minutes",
            "Critical level reached",
            "Quality feedback negative during degradation",
        ],
        "post_incident": """
            After degradation event:
            1. How long were we degraded?
            2. What triggered it?
            3. Did degradation prevent worse outcomes?
            4. Should we add capacity?
        """,
    }
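A minimal way to get the first metric, time spent at each level, is to accumulate wall-clock time on every level change. This sketch assumes something calls record_level whenever the degradation level is set:

import time
from collections import defaultdict

class DegradationMetrics:
    """Accumulate seconds spent at each degradation level (sketch)."""

    def __init__(self):
        self.seconds_at_level = defaultdict(float)
        self._current_level = "normal"
        self._since = time.monotonic()

    def record_level(self, level: str) -> None:
        now = time.monotonic()
        self.seconds_at_level[self._current_level] += now - self._since
        self._current_level = level
        self._since = now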
Graceful degradation is the difference between "site down" and "site slow." When you can't serve everyone fully, serve everyone partially. Users prefer a slower response to an error message.