
Deciding Which Model Handles Each Request

Hospitals triage patients. A sprained ankle doesn't need the chief surgeon. A complex trauma does. Matching care level to need is how hospitals serve everyone efficiently.

Model routing applies the same principle. A yes/no classification doesn't need a 400B-parameter model. A complex reasoning task might. Done well, routing cuts the cost of simple requests by 10x or more while preserving quality where it matters.

The Routing Decision

class ModelRouter:
    """
    Route each request to the cheapest model tier that can handle it
    """

    def __init__(self):
        self.models = {
            "small": "haiku",      # Fast, cheap, good for simple tasks
            "medium": "sonnet",    # Balanced
            "large": "opus",       # Best quality, expensive
        }

    def route(self, request: dict) -> str:
        # classify_task can be any of the strategies in the next section
        task_type = self.classify_task(request)

        routing_table = {
            "classification": "small",
            "extraction": "small",
            "summarization": "medium",
            "question_answering": "medium",
            "code_generation": "large",
            "complex_reasoning": "large",
            "creative_writing": "large",
        }

        return self.models[routing_table.get(task_type, "medium")]

Task Classification

def task_classification_approaches():
    return {
        "keyword_based": {
            "approach": "Check for task keywords in prompt",
            "implementation": """
            def classify_by_keywords(prompt: str) -> str:
                if any(kw in prompt.lower() for kw in ["classify", "yes or no", "true or false"]):
                    return "classification"
                if any(kw in prompt.lower() for kw in ["extract", "find", "list all"]):
                    return "extraction"
                if any(kw in prompt.lower() for kw in ["summarize", "tl;dr"]):
                    return "summarization"
                if "write code" in prompt.lower() or "```" in prompt:
                    return "code_generation"
                return "general"
            """,
            "pros": ["Fast", "No model calls"],
            "cons": ["Brittle", "Misses nuance"],
        },
        "classifier_model": {
            "approach": "Use small model to classify task",
            "implementation": """
            async def classify_with_model(prompt: str) -> str:
                classification = await haiku.generate(
                    f"Classify this task type: {prompt[:500]}"
                    "Options: classification, extraction, summarization, "
                    "question_answering, code_generation, reasoning, creative"
                )
                return classification.strip().lower()
            """,
            "pros": ["Accurate", "Handles edge cases"],
            "cons": ["Adds latency", "Extra cost"],
        },
        "hybrid": {
            "approach": "Keywords first, classifier for ambiguous",
            "pros": ["Fast for obvious cases", "Accurate for hard cases"],
        },
    }
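
The hybrid path is a few lines once the two classifiers exist. A minimal sketch, reusing classify_by_keywords and classify_with_model from the snippets above:

async def classify_hybrid(prompt: str) -> str:
    # Cheap keyword pass first; only pay for a model call when ambiguous
    task_type = classify_by_keywords(prompt)
    if task_type != "general":
        return task_type
    return await classify_with_model(prompt)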

Cost Impact

def routing_cost_impact():
    """
    Illustrative impact of smart routing
    """
    # Example pricing per million tokens (approximate)
    pricing_per_million = {
        "haiku": 0.25,
        "sonnet": 3.00,
        "opus": 15.00,
    }

    # Example workload: 1M requests/day, assuming ~1,000 tokens per request
    requests_per_day = 1_000_000
    tokens_per_request = 1_000
    million_tokens_per_day = requests_per_day * tokens_per_request / 1e6  # 1,000

    workload = {
        "classification": 0.30,  # 30% of requests
        "extraction": 0.20,
        "summarization": 0.15,
        "qa": 0.15,
        "code": 0.10,
        "reasoning": 0.10,
    }

    # Without routing: everything goes to the largest model
    no_routing_cost = million_tokens_per_day * pricing_per_million["opus"]  # $15,000/day

    # With routing
    routed_cost = million_tokens_per_day * (
        workload["classification"] * pricing_per_million["haiku"] +
        workload["extraction"] * pricing_per_million["haiku"] +
        workload["summarization"] * pricing_per_million["sonnet"] +
        workload["qa"] * pricing_per_million["sonnet"] +
        workload["code"] * pricing_per_million["opus"] +
        workload["reasoning"] * pricing_per_million["opus"]
    )  # $4,025/day

    return {
        "without_routing": f"${no_routing_cost:,.0f}/day",
        "with_routing": f"${routed_cost:,.0f}/day",
        "savings": f"{(1 - routed_cost/no_routing_cost)*100:.0f}%",
    }
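
Under those assumptions the numbers work out to roughly a 73% saving:

print(routing_cost_impact())
# {'without_routing': '$15,000/day', 'with_routing': '$4,025/day', 'savings': '73%'}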

Quality Validation

def validate_routing_quality():
    """
    Ensure small models don't hurt quality
    """
    return {
        "approach": """
        For each task type:
        1. Run 100 samples through intended model
        2. Run same samples through larger model
        3. Compare outputs
        4. If quality gap > threshold, route to larger model
        """,

        "thresholds": {
            "classification": "99% accuracy required",
            "extraction": "95% F1 required",
            "summarization": "ROUGE > 0.85 vs large model",
            "qa": "90% answer equivalence",
            "code": "98% test pass rate required",
        },

        "monitoring": """
        # Track quality by task type
        for request in production_sample:
            actual_output = model.generate(request)
            large_output = opus.generate(request)

            quality = compute_quality(actual_output, large_output)
            metrics.record("routing_quality", quality, tags={
                "task_type": request.task_type,
                "model": request.routed_model,
            })
        """,
    }
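
The four-step procedure above is easy to script. A minimal sketch, assuming hypothetical small_model and large_model clients plus a compute_quality metric of your choosing:

async def validate_task_type(samples, small_model, large_model,
                             compute_quality, threshold):
    """Offline check: does the small model track the large one closely enough?"""
    scores = []
    for prompt in samples[:100]:
        small_out = await small_model.generate(prompt)
        large_out = await large_model.generate(prompt)
        scores.append(compute_quality(small_out, large_out))
    avg_quality = sum(scores) / len(scores)
    # Keep the cheap route only if quality clears the task's threshold
    return "small" if avg_quality >= threshold else "large"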

Dynamic Routing

import logging

log = logging.getLogger(__name__)


class DynamicRouter:
    """
    Adjust routing based on observed quality
    """

    def __init__(self):
        self.quality_scores = {}  # (task_type, model) -> recent quality scores
        self.routing_table = self.default_routing()

    def default_routing(self) -> dict:
        # Starting assignments; mirrors the static routing table above
        return {
            "classification": "small",
            "extraction": "small",
            "summarization": "medium",
            "question_answering": "medium",
            "code_generation": "large",
            "complex_reasoning": "large",
            "creative_writing": "large",
        }

    def quality_threshold(self, task_type: str) -> float:
        # Minimum acceptable average quality; values are illustrative
        return {"classification": 0.99, "code_generation": 0.98}.get(task_type, 0.90)

    def update_routing(self, task_type: str, model: str, quality: float):
        """Record an observed quality score and upgrade routing if needed"""
        key = (task_type, model)
        self.quality_scores.setdefault(key, []).append(quality)

        # Check if we should upgrade the model for this task
        recent_scores = self.quality_scores[key][-100:]
        avg_quality = sum(recent_scores) / len(recent_scores)

        if avg_quality < self.quality_threshold(task_type):
            # Quality too low, upgrade to a larger model
            self.upgrade_routing(task_type)

    def upgrade_routing(self, task_type: str):
        """Move task type to a larger model"""
        current = self.routing_table[task_type]
        upgrade_path = {"small": "medium", "medium": "large"}
        if current in upgrade_path:
            self.routing_table[task_type] = upgrade_path[current]
            log.warning(f"Upgraded {task_type} to {self.routing_table[task_type]}")
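
Feeding observed quality back in is one call per scored request. With the illustrative thresholds above:

router = DynamicRouter()

# Scores would come from the monitoring loop earlier (hypothetical values)
for score in [0.95, 0.88, 0.79]:
    router.update_routing("summarization", "medium", quality=score)

print(router.routing_table["summarization"])  # "large": average fell below 0.90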

Latency Considerations

def routing_latency_considerations():
    return {
        "small_model_advantage": {
            "ttft": "50ms vs 200ms",
            "total_latency": "200ms vs 800ms for short outputs",
            "user_experience": "Feels instant vs feels slow",
        },
        "routing_overhead": {
            "keyword_classification": "< 1ms",
            "model_classification": "50-100ms",
            "consideration": "Model classification adds latency",
        },
        "optimization": """
        # Classify task type asynchronously while preparing request
        async def handle_request(request):
            # Start classification immediately
            classification_task = asyncio.create_task(
                classify_task(request.prompt)
            )

            # Prepare request (tokenization, etc.)
            prepared = await prepare_request(request)

            # Get classification result
            task_type = await classification_task

            # Route to appropriate model
            model = router.get_model(task_type)
            return await model.generate(prepared)
        """,
    }

Implementation Checklist

def routing_checklist():
    return [
        "[ ] Task types identified for your workload",
        "[ ] Quality thresholds set per task type",
        "[ ] Small model validated for simple tasks",
        "[ ] Routing logic implemented (keywords or classifier)",
        "[ ] Quality monitoring by task type + model",
        "[ ] Cost tracking by task type + model",
        "[ ] Fallback to larger model on quality issues",
        "[ ] A/B test routing vs no routing",
    ]
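
For the fallback item, a per-request escalation complements the DynamicRouter's slower feedback loop. A minimal sketch, assuming hypothetical generate and is_acceptable helpers:

TIERS = ["small", "medium", "large"]

async def generate_with_fallback(prompt, start_tier, generate, is_acceptable):
    """Escalate one tier at a time until the output passes a cheap check"""
    output = None
    for tier in TIERS[TIERS.index(start_tier):]:
        output = await generate(tier, prompt)
        if is_acceptable(output):
            return output
    return output  # best effort: the largest tier's output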

The goal isn't to use the smallest model everywhere. It's to use the right model everywhere. Smart routing matches task complexity to model capability, saving money on simple tasks while preserving quality where it matters.