Matching the Right Model to Each Task

You wouldn't hire a surgeon to change a lightbulb or an electrician to perform surgery. Both are skilled professionals, but their skills match different tasks. Overskilling wastes money. Underskilling produces poor results.

Model selection follows the same logic. Classification doesn't need 70B parameters. Novel reasoning does. Code generation benefits from code-specialized training. Matching model capabilities to task requirements optimizes both cost and quality.

Task-Model Mapping

def task_model_mapping():
    return {
        "classification": {
            "task": "Categorize into fixed labels",
            "examples": ["Sentiment analysis", "Intent detection", "Spam filtering"],
            "recommended_model": "Small (1-8B)",
            "why": "Fixed output space, pattern matching",
            "overkill_risk": "Large models offer little quality gain",
        },

        "extraction": {
            "task": "Pull structured data from text",
            "examples": ["NER", "Date extraction", "Form parsing"],
            "recommended_model": "Small to Medium (3-13B)",
            "why": "Constrained output, pattern recognition",
            "overkill_risk": "Medium models usually sufficient",
        },

        "summarization": {
            "task": "Condense text preserving meaning",
            "examples": ["Document summaries", "Meeting notes", "TL;DR"],
            "recommended_model": "Medium to Large (13-70B)",
            "why": "Requires understanding nuance, importance",
            "underskill_risk": "Small models miss key points",
        },

        "question_answering": {
            "task": "Answer questions from context",
            "examples": ["RAG responses", "FAQ bots", "Document Q&A"],
            "recommended_model": "Medium (7-13B) for simple, Large for complex",
            "why": "Depends on reasoning depth required",
        },

        "code_generation": {
            "task": "Write functional code",
            "examples": ["Autocomplete", "Function implementation", "Bug fixes"],
            "recommended_model": "Code-specialized (any size)",
            "why": "Code-trained models dramatically outperform generalists",
            "specific_models": ["CodeLlama", "StarCoder", "DeepSeek-Coder"],
        },

        "reasoning": {
            "task": "Multi-step logical deduction",
            "examples": ["Math problems", "Logic puzzles", "Planning"],
            "recommended_model": "Large (70B+)",
            "why": "Emergent capability, needs scale",
            "underskill_risk": "Small models fail complex reasoning",
        },

        "creative_writing": {
            "task": "Generate novel, engaging content",
            "examples": ["Stories", "Marketing copy", "Poetry"],
            "recommended_model": "Large (70B+)",
            "why": "Quality, style, and coherence improve with scale",
        },
    }

Building a Routing System

class TaskAwareRouter:
    """
    Route requests to appropriate models based on task
    """

    def __init__(self):
        self.models = {
            "small": "llama-8b",
            "medium": "llama-13b",
            "large": "llama-70b",
            "code": "codellama-34b",
        }

        self.task_routes = {
            "classification": "small",
            "extraction": "small",
            "summarization": "medium",
            "qa_simple": "small",
            "qa_complex": "large",
            "code": "code",
            "reasoning": "large",
            "creative": "large",
        }

    def route(self, request: dict) -> str:
        """Determine the best model for a request"""
        # Use the explicit task type if provided (unknown types fall back to simple Q&A)
        if task_type := request.get("task_type"):
            return self.models[self.task_routes.get(task_type, "qa_simple")]

        # Otherwise infer the task type from the prompt text
        inferred_task = self.infer_task_type(request)
        return self.models[self.task_routes[inferred_task]]

    def infer_task_type(self, request: dict) -> str:
        """Infer task type from request content"""
        prompt = request.get("prompt", "").lower()

        # Simple heuristics
        if "classify" in prompt or "label" in prompt:
            return "classification"
        if "extract" in prompt or "find all" in prompt:
            return "extraction"
        if "summarize" in prompt or "summary" in prompt:
            return "summarization"
        if "```" in prompt or "code" in prompt or "function" in prompt:
            return "code"
        if "why" in prompt or "explain" in prompt or "reason" in prompt:
            return "reasoning"
        if "write" in prompt or "create" in prompt or "story" in prompt:
            return "creative"

        # Default
        return "qa_simple"

Specialized vs General Models

def specialized_vs_general():
    return {
        "when_specialized_wins": {
            "code": {
                "specialized": "CodeLlama-34B",
                "general": "LLaMA-70B",
                "performance_gap": "Specialized is 20-40% better on code tasks",
                "cost_gap": "Specialized is cheaper (smaller)",
            },
            "math": {
                "specialized": "Llemma-34B",
                "general": "LLaMA-70B",
                "performance_gap": "Specialized is 15-30% better",
            },
        },

        "when_general_wins": {
            "mixed_tasks": {
                "scenario": "Code + explanation + reasoning",
                "general_advantage": "Handles all tasks adequately",
                "specialized_disadvantage": "May struggle outside specialty",
            },
            "edge_cases": {
                "scenario": "Unusual requests, creative prompts",
                "general_advantage": "Broader training coverage",
            },
        },

        "hybrid_approach": """
            Route code-specific tasks to code model.
            Route general tasks to general model.
            Use specialized for specialties, general for everything else.
        """,
    }
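
In code, the hybrid approach is little more than a lookup: keep a small map of specialties and fall back to the general model for everything else (model names here are illustrative):

def hybrid_route(task_type: str) -> str:
    """Specialized model for its specialty, general model for the rest"""
    specialists = {"code": "codellama-34b", "math": "llemma-34b"}
    return specialists.get(task_type, "llama-70b")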

Quality vs Cost Trade-offs

def quality_cost_by_task():
    # quality: rough 0-1 score; cost: $ per million input tokens
    return {
        "classification": {
            "haiku": {"quality": 0.95, "cost": 0.25},
            "sonnet": {"quality": 0.96, "cost": 3.00},
            "opus": {"quality": 0.97, "cost": 15.00},
            "recommendation": "Haiku - minimal quality gain from larger",
        },

        "summarization": {
            "haiku": {"quality": 0.80, "cost": 0.25},
            "sonnet": {"quality": 0.90, "cost": 3.00},
            "opus": {"quality": 0.95, "cost": 15.00},
            "recommendation": "Sonnet - good balance",
        },

        "complex_reasoning": {
            "haiku": {"quality": 0.60, "cost": 0.25},
            "sonnet": {"quality": 0.75, "cost": 3.00},
            "opus": {"quality": 0.92, "cost": 15.00},
            "recommendation": "Opus - only large model handles well",
        },

        "decision_framework": """
            Quality gap > 20%? Use larger model.
            Quality gap < 5%? Use smaller model.
            In between? Consider volume and importance.
        """,
    }
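
The decision framework above translates into a small helper. It treats the gap as an absolute difference on the 0-1 quality scale and leaves the in-between case to a simple volume-versus-importance tie-breaker, a sketch rather than a production rule:

def choose_by_quality_gap(small_quality: float, large_quality: float,
                          high_volume: bool = False, high_importance: bool = False) -> str:
    """Gap > 20%: use the larger model. Gap < 5%: use the smaller one.
    In between: weigh volume (cost pressure) against importance."""
    gap = large_quality - small_quality
    if gap > 0.20:
        return "larger"
    if gap < 0.05:
        return "smaller"
    return "larger" if high_importance and not high_volume else "smaller"

# Summarization from the table above: 0.95 - 0.80 = 0.15, so volume decides
print(choose_by_quality_gap(0.80, 0.95, high_volume=True))  # smaller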

Dynamic Model Selection

class DynamicModelSelector:
    """
    Select model based on request characteristics
    """

    def __init__(self):
        self.model_performance = {}  # task_type -> {model: quality score}
        self.model_costs = {}  # model -> cost per token
        self.default_model = "llama-13b"  # fallback when no model meets the constraints (illustrative choice)

    def select_model(self, request: dict, constraints: dict) -> str:
        """Select best model given constraints"""
        task_type = self.infer_task(request)
        max_cost = constraints.get("max_cost_per_request")
        min_quality = constraints.get("min_quality")

        # Get viable models for this task (skip tasks we have no data for)
        viable = []
        for model, quality in self.model_performance.get(task_type, {}).items():
            cost = self.estimate_cost(request, model)

            if max_cost and cost > max_cost:
                continue
            if min_quality and quality < min_quality:
                continue

            viable.append({
                "model": model,
                "quality": quality,
                "cost": cost,
                "efficiency": quality / cost if cost > 0 else float("inf"),
            })

        if not viable:
            return self.default_model  # Fallback

        # Select based on strategy
        if constraints.get("strategy") == "max_quality":
            return max(viable, key=lambda x: x["quality"])["model"]
        elif constraints.get("strategy") == "min_cost":
            return min(viable, key=lambda x: x["cost"])["model"]
        else:  # best efficiency
            return max(viable, key=lambda x: x["efficiency"])["model"]

    def infer_task(self, request: dict) -> str:
        """Placeholder: trust an explicit task_type if given, otherwise fall back
        to simple Q&A (in practice, reuse TaskAwareRouter.infer_task_type)"""
        return request.get("task_type", "qa_simple")

    def estimate_cost(self, request: dict, model: str) -> float:
        """Rough estimate: prompt length in tokens times per-token price
        (assumes roughly 4 characters per token)"""
        approx_tokens = len(request.get("prompt", "")) / 4
        return approx_tokens * self.model_costs.get(model, 0.0)
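
A minimal usage sketch, assuming the placeholder helpers above and made-up quality scores and per-token costs:

selector = DynamicModelSelector()
selector.model_performance = {
    "summarization": {"llama-8b": 0.80, "llama-13b": 0.90, "llama-70b": 0.95},
}
selector.model_costs = {"llama-8b": 2e-7, "llama-13b": 4e-7, "llama-70b": 2e-6}

request = {"task_type": "summarization", "prompt": "Summarize this meeting transcript"}
print(selector.select_model(request, {"min_quality": 0.85, "strategy": "min_cost"}))  # llama-13b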

Measuring and Updating

import statistics
from datetime import datetime, timedelta

class ModelSelectionOptimizer:
    """
    Continuously optimize model selection
    """

    def __init__(self):
        self.performance_log = []

    def log_outcome(self, task_type: str, model: str, quality: float, cost: float):
        """Log outcome for analysis"""
        self.performance_log.append({
            "task_type": task_type,
            "model": model,
            "quality": quality,
            "cost": cost,
            "timestamp": datetime.now(),
        })

    def analyze_selection(self, window_days: int = 7) -> dict:
        """Analyze selection performance"""
        recent = [
            log for log in self.performance_log
            if log["timestamp"] > datetime.now() - timedelta(days=window_days)
        ]

        by_task = {}
        for log in recent:
            task = log["task_type"]
            if task not in by_task:
                by_task[task] = []
            by_task[task].append(log)

        recommendations = {}
        for task, logs in by_task.items():
            # Find model with best quality/cost ratio
            model_stats = {}
            for log in logs:
                model = log["model"]
                if model not in model_stats:
                    model_stats[model] = {"quality": [], "cost": []}
                model_stats[model]["quality"].append(log["quality"])
                model_stats[model]["cost"].append(log["cost"])

            best = max(
                model_stats.items(),
                key=lambda x: (
                    statistics.mean(x[1]["quality"]) /
                    statistics.mean(x[1]["cost"])
                )
            )
            recommendations[task] = best[0]

        return recommendations
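
For example, with two logged outcomes (numbers illustrative), the optimizer recommends the model with the best quality per dollar:

optimizer = ModelSelectionOptimizer()
optimizer.log_outcome("summarization", "llama-13b", quality=0.88, cost=0.004)
optimizer.log_outcome("summarization", "llama-70b", quality=0.93, cost=0.020)

print(optimizer.analyze_selection())  # {'summarization': 'llama-13b'}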

Model selection is task matching. Classification is a small-model task. Complex reasoning is a large-model task. Code is a code-model task. Match the model to the task, measure the results, and adjust based on data. The right model for the task is cheaper and better than the wrong model scaled up.