Back to Blog
Starting Cheap and Escalating When Needed
Tech support has tiers. Level 1 handles password resets. Unsolved issues escalate to Level 2. Complex problems reach Level 3 engineers. Most issues never leave Level 1, keeping costs low while ensuring hard problems get expert attention.
Cascade routing applies this pattern to models. Start with the cheap model. If it can't solve the problem confidently, escalate to the expensive one. Most requests never escalate.
The Cascade Pattern
class CascadeRouter:
    """
    Route each request to a cheap model first, escalating to the
    expensive model only when the cheap model is not confident.
    """

    def __init__(self):
        # Cheap first-line model and expensive escalation target.
        self.small_model = load_model("haiku")
        self.large_model = load_model("opus")
        # Minimum confidence the small model must report to avoid escalation.
        self.confidence_threshold = 0.85

    async def generate(self, prompt: str) -> dict:
        """Answer *prompt* via the cascade.

        Returns a dict with the generated text, which model produced it,
        a relative cost label, and whether escalation occurred.
        """
        # First attempt: the cheap model, with a self-reported confidence.
        attempt = await self.small_model.generate_with_confidence(prompt)
        if attempt.confidence >= self.confidence_threshold:
            # Confident enough -- serve the cheap answer.
            return {
                "output": attempt.text,
                "model": "small",
                "cost": "low",
                "escalated": False,
            }

        # Confidence too low: hand the prompt to the expensive model.
        fallback = await self.large_model.generate(prompt)
        return {
            "output": fallback.text,
            "model": "large",
            "cost": "high",
            "escalated": True,
        }
Measuring Confidence
class ConfidenceMeasures:
    """
    Different ways to determine when to escalate.

    Each method maps a small-model interaction to a confidence score in
    [0, 1]; the cascade escalates when the score falls below threshold.
    """

    def log_probability_confidence(self, logprobs: list) -> float:
        """Use model's own confidence via log probabilities.

        Averages the token log-probabilities and squashes the average
        into [0, 1]. An empty list yields 0.0 (no evidence means no
        confidence) instead of raising ZeroDivisionError.
        """
        if not logprobs:
            return 0.0
        avg_logprob = sum(logprobs) / len(logprobs)
        # Convert to 0-1 scale (roughly): an average of -2 or worse maps
        # to 0.0, an average of 0 maps to 1.0.
        return min(1.0, max(0.0, (avg_logprob + 2) / 2))

    def self_reported_confidence(self, prompt: str, response: str) -> float:
        """Ask model to rate its confidence.

        The model's 0-100 rating is clamped into [0, 1] so an
        out-of-range answer cannot produce an out-of-range score.
        Raises ValueError if the model returns a non-integer rating.
        """
        confidence_prompt = f"""
You answered: "{response[:200]}"
to the question: "{prompt[:200]}"
Rate your confidence 0-100 that this is correct: """
        score = self.model.generate(confidence_prompt)
        # Clamp in case the model reports a value outside 0-100.
        return min(1.0, max(0.0, int(score) / 100))

    def consistency_confidence(self, prompt: str, n: int = 3) -> float:
        """Generate multiple times, check consistency.

        Confidence is 1.0 when all n responses agree and decreases as
        more distinct answers appear (note the floor is 1/n, not 0).
        """
        responses = [self.model.generate(prompt) for _ in range(n)]
        # Number of distinct answers is the disagreement signal.
        unique_responses = len(set(responses))
        consistency = 1 - (unique_responses - 1) / n
        return consistency

    def heuristic_confidence(self, response: str) -> float:
        """Simple heuristics for confidence.

        Signals are checked strongest-first: an outright refusal is a
        stronger low-confidence signal than hedging language, which in
        turn outranks mere brevity. (Checking refusal last was a bug:
        refusals are usually shorter than 50 characters, so the
        short-response rule fired first and reported 0.7 instead of 0.3.)
        """
        signals = {
            "uncertainty_phrases": ["I'm not sure", "I think", "might be", "possibly"],
            "short_response": len(response) < 50,
            "refusal": "I cannot" in response or "I don't know" in response,
        }
        if signals["refusal"]:
            return 0.3
        if any(phrase in response for phrase in signals["uncertainty_phrases"]):
            return 0.6
        if signals["short_response"]:
            return 0.7
        return 0.9  # Default high confidence
Cost-Quality Tradeoff
def cascade_economics():
    """
    The math on cascade routing.

    Computes the blended cost of a cascade at several escalation rates
    and compares it to sending all traffic to a medium-priced model.

    Returns a dict keyed by escalation rate, each value holding the
    total cost and the savings versus the baseline, plus an "insight"
    summary entry.

    Bug fix: the original computed ``results`` in the loop and then
    discarded it, returning hard-coded figures that contradicted its
    own stated prices.
    """
    # Assumptions: per-1M-token prices; volume expressed in tokens.
    small_model_cost = 0.25   # per 1M tokens
    large_model_cost = 15.00  # per 1M tokens
    baseline_cost = 3.00      # per 1M tokens, all traffic on a medium model

    volume = 1_000_000  # total tokens routed through the cascade
    escalation_rates = [0.10, 0.20, 0.30, 0.40]  # Different thresholds

    # Cost if everything went to the medium model instead.
    baseline = volume * baseline_cost / 1_000_000

    results = {}
    for escalation_rate in escalation_rates:
        small_volume = volume * (1 - escalation_rate)
        large_volume = volume * escalation_rate
        small_cost = small_volume * small_model_cost / 1_000_000
        large_cost = large_volume * large_model_cost / 1_000_000
        total_cost = small_cost + large_cost
        results[escalation_rate] = {
            "total_cost": total_cost,
            "savings_vs_baseline": f"{(1 - total_cost / baseline) * 100:.0f}%",
        }
    results["insight"] = "Escalation rate must be low for savings"
    return results
Calibrating the Threshold
def calibrate_threshold():
    """
    Find the right confidence threshold.

    Returns a calibration recipe: the process to follow, a reference
    implementation sketch, and the typically observed outcome.
    """
    # Step-by-step calibration procedure.
    process = """
1. Run evaluation set through small model with confidence scores
2. For each potential threshold:
- Calculate what % would escalate
- Calculate quality of non-escalated responses
- Calculate cost
3. Pick threshold that maximizes: quality * (1 - cost_ratio)
"""
    # Reference implementation for sweeping candidate thresholds.
    implementation = """
def find_optimal_threshold(eval_set):
results = []
for threshold in [0.7, 0.75, 0.8, 0.85, 0.9, 0.95]:
escalation_rate = sum(
1 for item in eval_set
if item.small_confidence < threshold
) / len(eval_set)
# Quality of non-escalated
non_escalated = [
item for item in eval_set
if item.small_confidence >= threshold
]
quality = evaluate_quality(non_escalated)
results.append({
"threshold": threshold,
"escalation_rate": escalation_rate,
"quality": quality,
})
return results
"""
    # What the sweep usually lands on in practice.
    typical_outcome = """
Threshold 0.85 often works well:
- 15-25% of requests escalate
- 95%+ quality on non-escalated
- 60-70% cost savings overall
"""
    return {
        "process": process,
        "implementation": implementation,
        "typical_outcome": typical_outcome,
    }
Quality Safeguards
def cascade_quality_safeguards():
    """Return the catalog of safeguards that keep cascade quality high."""
    # Certain task types are too risky to ever serve from the small model.
    task_type_override = {
        "description": "Some tasks always go to large model",
        "example": "Code generation, legal review",
        "implementation": """
if task_type in ["code", "legal", "medical"]:
return await large_model.generate(prompt)
# else cascade logic
""",
    }
    # Paying tiers buy a guarantee of the best model.
    user_tier_override = {
        "description": "Premium users always get large model",
        "example": "Enterprise customers",
        "implementation": """
if user.tier == "enterprise":
return await large_model.generate(prompt)
""",
    }
    # Continuous comparison of the two paths feeds threshold tuning.
    quality_monitoring = {
        "description": "Track quality of small vs large outputs",
        "alert": "If small model quality drops, lower threshold",
    }
    # A regeneration is an implicit thumbs-down on the previous answer.
    user_feedback_loop = {
        "description": "If users regenerate, consider escalating",
        "implementation": """
if request.is_regeneration and previous.model == "small":
return await large_model.generate(prompt)
""",
    }
    return {
        "task_type_override": task_type_override,
        "user_tier_override": user_tier_override,
        "quality_monitoring": quality_monitoring,
        "user_feedback_loop": user_feedback_loop,
    }
Implementation Pattern
class CascadeWithFallback:
    """
    Full cascade implementation with safeguards: request overrides, a
    small-model timeout, confidence gating, and a final quality check
    before accepting the small model's answer.
    """

    async def generate(self, request: Request) -> Response:
        """Serve *request* through the cascade, escalating to the large
        model on override, timeout, low confidence, or low quality."""
        # Overrides (enterprise users, sensitive task types,
        # regenerations) bypass the cascade entirely.
        if self.should_skip_cascade(request):
            return await self.large_model.generate(request)

        # Give the small model a bounded window to respond.
        try:
            candidate = await asyncio.wait_for(
                self.small_model.generate_with_confidence(request),
                timeout=5.0,
            )
        except asyncio.TimeoutError:
            # Small model too slow -- escalate rather than keep waiting.
            return await self.large_model.generate(request)

        # Not confident enough: record the miss for later analysis,
        # then escalate.
        if candidate.confidence < self.threshold:
            self.log_escalation(request, candidate)
            return await self.large_model.generate(request)

        # Confident but still low quality (e.g. a refusal) escalates too.
        if self.is_low_quality(candidate):
            return await self.large_model.generate(request)

        return candidate

    def should_skip_cascade(self, request: Request) -> bool:
        """True when this request must go straight to the large model."""
        if request.user.tier == "enterprise":
            return True
        if request.task_type in self.always_large_tasks:
            return True
        return request.is_regeneration
Monitoring Cascade Performance
def cascade_metrics():
    """Return the metrics, alerts, and dashboard spec for monitoring a cascade."""
    # Core time-series to collect on every request.
    tracked = [
        "Escalation rate by task type",
        "Escalation rate by time of day",
        "Quality score by model used",
        "Cost savings vs baseline",
        "User regeneration rate by model",
    ]
    # Conditions that should page someone.
    alerts = [
        "Escalation rate > 40% (cost savings eroding)",
        "Small model quality dropping",
        "User complaints correlated with small model",
    ]
    dashboard = """
Cascade Health Dashboard:
- Escalation funnel
- Quality comparison: small vs large
- Cost per request by path
- Threshold calibration suggestions
"""
    return {
        "track": tracked,
        "alerts": alerts,
        "dashboard": dashboard,
    }
Cascade routing works when the small model handles most requests well. If escalation rate creeps above 30-40%, the savings disappear. Monitor the ratio and adjust the threshold, or improve the small model's coverage.