Deciding Which Model Handles Each Request
Hospitals triage patients. A sprained ankle doesn't need the chief surgeon. A complex trauma does. Matching care level to need is how hospitals serve everyone efficiently.
Model routing applies the same principle. A yes/no classification doesn't need a 400B-parameter model. A complex reasoning task might. Routing correctly can cut the cost of a simple request by 10x or more while preserving quality where it matters.
The Routing Decision
class ModelRouter:
    """
    Route each request to the cheapest model that can handle it
    """
    def __init__(self):
        self.models = {
            "small": "haiku",    # Fast, cheap, good for simple tasks
            "medium": "sonnet",  # Balanced
            "large": "opus",     # Best quality, expensive
        }

    def route(self, request: dict) -> str:
        task_type = self.classify_task(request)
        routing_table = {
            "classification": "small",
            "extraction": "small",
            "summarization": "medium",
            "question_answering": "medium",
            "code_generation": "large",
            "complex_reasoning": "large",
            "creative_writing": "large",
        }
        # Unknown task types fall back to the medium tier
        return self.models[routing_table.get(task_type, "medium")]

    def classify_task(self, request: dict) -> str:
        # Placeholder: see "Task Classification" below for real approaches
        return "general"
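In practice you plug a real classifier into that placeholder. A minimal usage sketch, assuming the classify_by_keywords helper defined in the next section:

# Hypothetical wiring: subclass the router and inject the keyword classifier
class KeywordRouter(ModelRouter):
    def classify_task(self, request: dict) -> str:
        return classify_by_keywords(request["prompt"])

router = KeywordRouter()
router.route({"prompt": "Classify this review as positive or negative."})
# -> "haiku": the keyword match routes it to the small tier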
Task Classification
def task_classification_approaches():
    return {
        "keyword_based": {
            "approach": "Check for task keywords in prompt",
            "implementation": """
def classify_by_keywords(prompt: str) -> str:
    lowered = prompt.lower()
    if any(kw in lowered for kw in ["classify", "yes or no", "true or false"]):
        return "classification"
    if any(kw in lowered for kw in ["extract", "find", "list all"]):
        return "extraction"
    if any(kw in lowered for kw in ["summarize", "tl;dr"]):
        return "summarization"
    if "write code" in lowered or "```" in prompt:
        return "code_generation"
    return "general"
""",
            "pros": ["Fast", "No model calls"],
            "cons": ["Brittle", "Misses nuance"],
        },
        "classifier_model": {
            "approach": "Use a small model to classify the task",
            "implementation": """
async def classify_with_model(prompt: str) -> str:
    classification = await haiku.generate(
        f"Classify this task type: {prompt[:500]}\n"
        "Options: classification, extraction, summarization, "
        "question_answering, code_generation, reasoning, creative"
    )
    return classification.strip().lower()
""",
            "pros": ["Accurate", "Handles edge cases"],
            "cons": ["Adds latency", "Extra cost"],
        },
        "hybrid": {
            "approach": "Keywords first, classifier for ambiguous (sketch below)",
            "pros": ["Fast for obvious cases", "Accurate for hard cases"],
        },
    }
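The hybrid row has no implementation above; here is a minimal sketch, assuming the classify_by_keywords and classify_with_model helpers from the other two approaches:

async def classify_hybrid(prompt: str) -> str:
    # Cheap pass first: keywords settle the obvious cases in microseconds
    task_type = classify_by_keywords(prompt)
    if task_type != "general":
        return task_type
    # Ambiguous prompt: spend one small-model call to decide
    return await classify_with_model(prompt)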
Cost Impact
def routing_cost_impact():
    """
    Illustrative impact of smart routing
    """
    # Example pricing (approximate), $ per million tokens
    pricing_per_million = {
        "haiku": 0.25,
        "sonnet": 3.00,
        "opus": 15.00,
    }
    # Example workload: 1M requests/day, ~1,000 tokens each
    requests_per_day = 1_000_000
    tokens_per_request = 1_000
    total_tokens = requests_per_day * tokens_per_request  # 1B tokens/day

    workload = {
        "classification": 0.30,  # 30% of requests
        "extraction": 0.20,
        "summarization": 0.15,
        "qa": 0.15,
        "code": 0.10,
        "reasoning": 0.10,
    }

    def cost(share: float, model: str) -> float:
        return total_tokens * share / 1_000_000 * pricing_per_million[model]

    # Without routing: everything goes to the large model
    no_routing_cost = cost(1.0, "opus")  # $15,000/day

    # With routing
    routed_cost = (
        cost(workload["classification"], "haiku")
        + cost(workload["extraction"], "haiku")
        + cost(workload["summarization"], "sonnet")
        + cost(workload["qa"], "sonnet")
        + cost(workload["code"], "opus")
        + cost(workload["reasoning"], "opus")
    )  # $4,025/day

    return {
        "without_routing": f"${no_routing_cost:,.0f}/day",
        "with_routing": f"${routed_cost:,.0f}/day",
        "savings": f"{(1 - routed_cost / no_routing_cost) * 100:.0f}%",
    }
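With these assumptions the function reports $15,000/day without routing, $4,025/day with it, a 73% saving. The absolute figures depend on your token volume and baseline model, but the shape generalizes: the small tier absorbs half the traffic for pennies, and the expensive tier is reserved for the 20% of requests that actually need it.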
Quality Validation
def validate_routing_quality():
    """
    Ensure small models don't hurt quality
    """
    return {
        "approach": """
For each task type:
1. Run 100 samples through intended model
2. Run same samples through larger model
3. Compare outputs
4. If quality gap > threshold, route to larger model
""",
        "thresholds": {
            "classification": "99% accuracy required",
            "extraction": "95% F1 required",
            "summarization": "ROUGE > 0.85 vs large model",
            "qa": "90% answer equivalence",
            "code": "98% test pass rate required",
        },
        "monitoring": """
# Track quality by task type
for request in production_sample:
    actual_output = model.generate(request)
    large_output = opus.generate(request)
    quality = compute_quality(actual_output, large_output)
    metrics.record("routing_quality", quality, tags={
        "task_type": request.task_type,
        "model": request.routed_model,
    })
""",
    }
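The approach above translates directly into an offline harness. A minimal sketch, assuming async generate methods on each model client and a task-appropriate compute_quality(candidate, reference) scorer, both hypothetical:

async def validate_task_routing(samples, routed_model, large_model, threshold):
    """Decide whether the cheap route holds up against the large model."""
    scores = []
    for prompt in samples:  # e.g. 100 representative prompts per task type
        routed_out = await routed_model.generate(prompt)
        large_out = await large_model.generate(prompt)
        scores.append(compute_quality(routed_out, large_out))
    avg_quality = sum(scores) / len(scores)
    # Keep the cheap route only if it clears the task's threshold
    return "keep" if avg_quality >= threshold else "upgrade"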
Dynamic Routing
import logging

log = logging.getLogger(__name__)

class DynamicRouter:
    """
    Adjust routing based on observed quality
    """
    def __init__(self):
        self.quality_scores = {}  # (task_type, model) -> recent quality scores
        self.routing_table = self.default_routing()

    def default_routing(self) -> dict:
        return {
            "classification": "small",
            "extraction": "small",
            "summarization": "medium",
            "question_answering": "medium",
            "code_generation": "large",
            "complex_reasoning": "large",
        }

    def quality_threshold(self, task_type: str) -> float:
        # Per-task minimums, matching the validation thresholds above
        return {"classification": 0.99, "extraction": 0.95}.get(task_type, 0.90)

    def update_routing(self, task_type: str, model: str, quality: float):
        """Update routing based on observed quality"""
        key = (task_type, model)
        self.quality_scores.setdefault(key, []).append(quality)
        # Check if we should upgrade the model for this task,
        # averaging over the most recent 100 observations
        recent_scores = self.quality_scores[key][-100:]
        avg_quality = sum(recent_scores) / len(recent_scores)
        if avg_quality < self.quality_threshold(task_type):
            # Quality too low, upgrade to a larger model
            self.upgrade_routing(task_type)

    def upgrade_routing(self, task_type: str):
        """Move task type to larger model"""
        current = self.routing_table[task_type]
        upgrade_path = {"small": "medium", "medium": "large"}
        if current in upgrade_path:
            self.routing_table[task_type] = upgrade_path[current]
            log.warning(f"Upgraded {task_type} to {self.routing_table[task_type]}")
Latency Considerations
def routing_latency_considerations():
    return {
        "small_model_advantage": {
            "ttft": "50ms vs 200ms time to first token",
            "total_latency": "200ms vs 800ms for short outputs",
            "user_experience": "Feels instant vs feels slow",
        },
        "routing_overhead": {
            "keyword_classification": "< 1ms",
            "model_classification": "50-100ms",
            "consideration": "Model classification adds latency before generation starts",
        },
        "optimization": """
# Classify task type asynchronously while preparing the request
async def handle_request(request):
    # Start classification immediately
    classification_task = asyncio.create_task(
        classify_task(request.prompt)
    )
    # Prepare the request (tokenization, etc.) in the meantime
    prepared = await prepare_request(request)
    # Get the classification result
    task_type = await classification_task
    # Route to the appropriate model
    model = router.get_model(task_type)
    return await model.generate(prepared)
""",
    }
Implementation Checklist
def routing_checklist():
    return [
        "[ ] Task types identified for your workload",
        "[ ] Quality thresholds set per task type",
        "[ ] Small model validated for simple tasks",
        "[ ] Routing logic implemented (keywords or classifier)",
        "[ ] Quality monitoring by task type + model",
        "[ ] Cost tracking by task type + model",
        "[ ] Fallback to larger model on quality issues",
        "[ ] A/B test routing vs no routing",
    ]
The goal isn't to use the smallest model everywhere. It's to use the right model everywhere. Smart routing matches task complexity to model capability, saving money on simple tasks while preserving quality where it matters.