Starting Cheap and Escalating When Needed

Tech support has tiers. Level 1 handles password resets. Unresolved issues escalate to Level 2. Complex problems reach Level 3 engineers. Most issues never leave Level 1, which keeps costs low while ensuring hard problems get expert attention.

Cascade routing applies this pattern to models. Start with the cheap model. If it can't solve the problem confidently, escalate to the expensive one. Most requests never escalate.

The Cascade Pattern

class CascadeRouter:
    """
    Try cheap model first, escalate if needed
    """

    def __init__(self):
        # load_model stands in for whatever client wraps your models
        self.small_model = load_model("haiku")
        self.large_model = load_model("opus")
        self.confidence_threshold = 0.85  # tune via calibration (see below)

    async def generate(self, prompt: str) -> dict:
        # Try small model first
        small_result = await self.small_model.generate_with_confidence(prompt)

        if small_result.confidence >= self.confidence_threshold:
            return {
                "output": small_result.text,
                "model": "small",
                "cost": "low",
                "escalated": False,
            }

        # Escalate to large model
        large_result = await self.large_model.generate(prompt)
        return {
            "output": large_result.text,
            "model": "large",
            "cost": "high",
            "escalated": True,
        }
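
Used from an async context, the router returns enough metadata to track escalations. A hypothetical call, where load_model and generate_with_confidence stand in for whatever client you actually use:

router = CascadeRouter()
result = await router.generate("Summarize this support ticket: ...")
if result["escalated"]:
    print("Small model wasn't confident; the large model answered")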

Measuring Confidence

class ConfidenceMeasures:
    """
    Different ways to determine when to escalate
    """

    def log_probability_confidence(self, logprobs: list) -> float:
        """Use the model's own confidence via token log probabilities"""
        avg_logprob = sum(logprobs) / len(logprobs)
        # Rough linear squash: an average logprob of 0 (certain) maps
        # to 1.0, and -2 or lower maps to 0.0
        return min(1.0, max(0.0, (avg_logprob + 2) / 2))

    def self_reported_confidence(self, prompt: str, response: str) -> float:
        """Ask the model to rate its own confidence"""
        confidence_prompt = f"""
        You answered: "{response[:200]}"
        to the question: "{prompt[:200]}"

        Rate your confidence 0-100 that this is correct.
        Reply with only the number: """

        score = self.model.generate(confidence_prompt)
        try:
            return min(100, max(0, int(score.strip()))) / 100
        except ValueError:
            return 0.5  # unparseable rating: treat as uncertain

    def consistency_confidence(self, prompt: str, n: int = 3) -> float:
        """Generate multiple times, check consistency"""
        if n < 2:
            return 1.0  # can't measure agreement with a single sample

        responses = [self.model.generate(prompt) for _ in range(n)]

        # Exact-match agreement is crude but cheap; fuzzy or semantic
        # matching works better in practice
        unique_responses = len(set(responses))
        # 1.0 when all n responses agree, 0.0 when all n differ
        return 1 - (unique_responses - 1) / (n - 1)

    def heuristic_confidence(self, response: str) -> float:
        """Simple heuristics for confidence"""
        uncertainty_phrases = ["I'm not sure", "I think", "might be", "possibly"]

        # Check the strongest signal first: refusals are usually short,
        # so the length check would otherwise score them too high
        if "I cannot" in response or "I don't know" in response:
            return 0.3
        if any(phrase in response for phrase in uncertainty_phrases):
            return 0.6
        if len(response) < 50:
            return 0.7

        return 0.9  # Default high confidence
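
These signals aren't mutually exclusive. A weighted blend is a reasonable default; the sketch below assumes the ConfidenceMeasures class above, and the weights are illustrative, not tuned:

def combined_confidence(measures: ConfidenceMeasures, response: str,
                        logprobs: list) -> float:
    # Weights are illustrative assumptions; calibrate them on an eval set
    scores = {
        "logprob": measures.log_probability_confidence(logprobs),
        "heuristic": measures.heuristic_confidence(response),
    }
    weights = {"logprob": 0.7, "heuristic": 0.3}
    return sum(weights[name] * score for name, score in scores.items())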

Cost-Quality Tradeoff

def cascade_economics():
    """
    The math on cascade routing. Assumes ~1,000 tokens per request and
    that escalated requests pay for both the failed small attempt and
    the large retry.
    """
    small_model_cost = 0.25   # $ per 1M tokens
    large_model_cost = 15.00  # $ per 1M tokens
    baseline_cost = 3.00      # $ per 1M tokens: a mid-tier model for everything

    # Scenario: 1M requests at ~1,000 tokens each = 1B tokens
    total_tokens = 1_000_000 * 1_000
    baseline = total_tokens * baseline_cost / 1_000_000  # $3,000

    results = {}
    for escalation_rate in [0.10, 0.20, 0.30, 0.40]:
        # Every request hits the small model first...
        small_cost = total_tokens * small_model_cost / 1_000_000
        # ...and escalated requests are retried on the large model
        large_cost = total_tokens * escalation_rate * large_model_cost / 1_000_000
        total_cost = small_cost + large_cost

        savings = 1 - total_cost / baseline
        results[f"{escalation_rate:.0%} escalation"] = (
            f"${total_cost:,.0f} ({savings:+.0%} vs the ${baseline:,.0f} baseline)"
        )

    # 10% escalation: $1,750 (42% savings)
    # 20% escalation: $3,250 (already 8% more expensive)
    # 30% escalation: $4,750 (58% more expensive)
    # Insight: the escalation rate must stay low for the cascade to pay off
    return results
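
The break-even point falls straight out of those prices. Solving for the escalation rate at which the cascade matches the mid-tier baseline (same illustrative numbers as above):

# small + r * large = baseline, per 1M tokens:
#   0.25 + r * 15.00 = 3.00
break_even_rate = (3.00 - 0.25) / 15.00  # ≈ 0.18

# Above roughly 18% escalation, the cascade costs more than just
# running the mid-tier model for everything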

Calibrating the Threshold

def calibrate_threshold():
    """
    Find the right confidence threshold
    """
    return {
        "process": """
        1. Run evaluation set through small model with confidence scores
        2. For each potential threshold:
           - Calculate what % would escalate
           - Calculate quality of non-escalated responses
           - Calculate cost
        3. Pick threshold that maximizes: quality * (1 - cost_ratio)
        """,

        "implementation": """
        def find_optimal_threshold(eval_set):
            results = []
            for threshold in [0.7, 0.75, 0.8, 0.85, 0.9, 0.95]:
                escalation_rate = sum(
                    1 for item in eval_set
                    if item.small_confidence < threshold
                ) / len(eval_set)

                # Quality of non-escalated
                non_escalated = [
                    item for item in eval_set
                    if item.small_confidence >= threshold
                ]
                quality = evaluate_quality(non_escalated)

                results.append({
                    "threshold": threshold,
                    "escalation_rate": escalation_rate,
                    "quality": quality,
                })

            return results
        """,

        "typical_outcome": """
        Threshold 0.85 often works well:
        - 15-25% of requests escalate
        - 95%+ quality on non-escalated
        - 60-70% cost savings overall
        """,
    }
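
Step 3 of the process above maximizes quality * (1 - cost_ratio). A minimal sketch of that selection step, reusing the illustrative per-token prices from the economics section:

def pick_threshold(results, small_cost=0.25, large_cost=15.00):
    """Pick the calibration result maximizing quality * (1 - cost_ratio)."""
    def cost_ratio(r):
        # Cascade cost per token relative to sending everything large
        cascade = small_cost + r["escalation_rate"] * large_cost
        return cascade / large_cost

    return max(results, key=lambda r: r["quality"] * (1 - cost_ratio(r)))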

Quality Safeguards

def cascade_quality_safeguards():
    return {
        "task_type_override": {
            "description": "Some tasks always go to large model",
            "example": "Code generation, legal review",
            "implementation": """
            if task_type in ["code", "legal", "medical"]:
                return await large_model.generate(prompt)
            # else cascade logic
            """,
        },
        "user_tier_override": {
            "description": "Premium users always get large model",
            "example": "Enterprise customers",
            "implementation": """
            if user.tier == "enterprise":
                return await large_model.generate(prompt)
            """,
        },
        "quality_monitoring": {
            "description": "Track quality of small vs large outputs",
            "alert": "If small model quality drops, lower threshold",
        },
        "user_feedback_loop": {
            "description": "If users regenerate, consider escalating",
            "implementation": """
            if request.is_regeneration and previous.model == "small":
                return await large_model.generate(prompt)
            """,
        },
    }

Implementation Pattern

import asyncio


class CascadeWithFallback:
    """
    Full cascade implementation with safeguards
    """

    def __init__(self, small_model, large_model, threshold=0.85,
                 always_large_tasks=("code", "legal", "medical")):
        self.small_model = small_model
        self.large_model = large_model
        self.threshold = threshold
        self.always_large_tasks = set(always_large_tasks)
    async def generate(self, request: Request) -> Response:
        # Check for overrides
        if self.should_skip_cascade(request):
            return await self.large_model.generate(request)

        # Try small model
        try:
            small_response = await asyncio.wait_for(
                self.small_model.generate_with_confidence(request),
                timeout=5.0
            )
        except asyncio.TimeoutError:
            # Small model slow, go to large
            return await self.large_model.generate(request)

        # Check confidence
        if small_response.confidence < self.threshold:
            # Log for analysis
            self.log_escalation(request, small_response)
            return await self.large_model.generate(request)

        # Verify not a refusal or low-quality response
        if self.is_low_quality(small_response):
            return await self.large_model.generate(request)

        return small_response

    def should_skip_cascade(self, request: Request) -> bool:
        return (
            request.user.tier == "enterprise" or
            request.task_type in self.always_large_tasks or
            request.is_regeneration
        )
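
Wiring it up might look like this; the client factory and request object are placeholders rather than a specific SDK:

router = CascadeWithFallback(
    small_model=client_for("haiku"),   # hypothetical client factory
    large_model=client_for("opus"),
    threshold=0.85,
)
response = await router.generate(request)  # called from an async context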

Monitoring Cascade Performance

def cascade_metrics():
    return {
        "track": [
            "Escalation rate by task type",
            "Escalation rate by time of day",
            "Quality score by model used",
            "Cost savings vs baseline",
            "User regeneration rate by model",
        ],
        "alerts": [
            "Escalation rate > 40% (cost savings eroding)",
            "Small model quality dropping",
            "User complaints correlated with small model",
        ],
        "dashboard": """
        Cascade Health Dashboard:
        - Escalation funnel
        - Quality comparison: small vs large
        - Cost per request by path
        - Threshold calibration suggestions
        """,
    }
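
A minimal sketch of the first tracking item, escalation rate by task type; the event shape is an assumption about what your request logging already captures:

from collections import Counter

def escalation_rates(events: list[dict]) -> dict[str, float]:
    # Each event is assumed to look like:
    #   {"task_type": "chat", "escalated": True}
    totals, escalated = Counter(), Counter()
    for event in events:
        totals[event["task_type"]] += 1
        escalated[event["task_type"]] += event["escalated"]
    return {task: escalated[task] / totals[task] for task in totals}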

Cascade routing works when the small model handles most requests well. With the prices above, break-even lands just under 20% escalation; let the rate creep past that and the savings disappear. Monitor the ratio and adjust the threshold, or improve the small model's coverage.