Back to Blog

Where Speculative Decoding Actually Helps

Weather prediction works well for tomorrow, poorly for next month. The chaotic nature of weather means small uncertainties compound. Speculative decoding has similar predictability constraints. When outputs follow patterns, prediction works. When outputs are creative or unexpected, prediction fails.

Understanding where speculative decoding helps requires understanding what makes outputs predictable. Code follows syntax rules. JSON follows schemas. Templates follow formats. Creative writing follows only the author's imagination.

Predictability Is Everything

def predictability_determines_success():
    """Map predictability tiers to task profiles, acceptance rates, and speedups."""
    highly_predictable = {
        "code_completion": {
            "why_predictable": "Syntax rules, common patterns, context clues",
            "example": "'def calculate_total(' → 'self, items):'",
            "acceptance_rate": "70-85%",
            "speedup": "2-3x",
        },
        "json_generation": {
            "why_predictable": "Schema is known, values often from context",
            "example": '{"name": "John", "age": 30}',
            "acceptance_rate": "75-90%",
            "speedup": "2-4x",
        },
        "template_filling": {
            "why_predictable": "Fixed format, variable content from context",
            "example": "Dear [NAME], Thank you for your order of [ITEM]...",
            "acceptance_rate": "80-90%",
            "speedup": "2-4x",
        },
    }

    moderately_predictable = {
        "technical_documentation": {
            "why_moderate": "Structure predictable, details less so",
            "acceptance_rate": "50-70%",
            "speedup": "1.5-2x",
        },
        "translation": {
            "why_moderate": "Grammar predictable, word choice less so",
            "acceptance_rate": "40-60%",
            "speedup": "1.3-1.8x",
        },
    }

    barely_predictable = {
        "creative_writing": {
            "why_unpredictable": "Novel ideas, unexpected turns",
            "acceptance_rate": "20-40%",
            "speedup": "1.0-1.2x (marginal)",
        },
        "brainstorming": {
            "why_unpredictable": "Deliberately divergent thinking",
            "acceptance_rate": "15-30%",
            "speedup": "Often no benefit",
        },
    }

    return {
        "high_predictability": highly_predictable,
        "medium_predictability": moderately_predictable,
        "low_predictability": barely_predictable,
    }

Task-by-Task Analysis

def task_specific_results():
    """Per-task acceptance-rate data and guidance for speculative decoding."""
    code_generation = {
        "acceptance_rates": {
            "boilerplate": "85-95%",
            "standard_patterns": "70-85%",
            "novel_algorithms": "40-60%",
        },
        "best_for": [
            "Function signatures",
            "Import statements",
            "Common patterns (loops, conditionals)",
            "Closing brackets/braces",
        ],
        "worst_for": [
            "Novel algorithm logic",
            "Domain-specific implementations",
            "Complex mathematical expressions",
        ],
    }

    structured_output = {
        "acceptance_rates": {
            "json_with_schema": "80-95%",
            "xml_well_defined": "75-90%",
            "markdown_tables": "70-85%",
        },
        "why_high": "Structure is constrained, vocabulary limited",
    }

    customer_support = {
        "acceptance_rates": {
            "canned_responses": "85-95%",
            "templated_replies": "70-85%",
            "custom_explanations": "40-60%",
        },
        "optimization": "Use speculative for templates, skip for custom",
    }

    summarization = {
        "acceptance_rates": {
            "extractive": "60-75%",
            "abstractive": "35-55%",
        },
        "note": "Extractive summarization is more predictable",
    }

    return {
        "code_generation": code_generation,
        "structured_output": structured_output,
        "customer_support": customer_support,
        "summarization": summarization,
    }

Measuring for Your Workload

class WorkloadAnalyzer:
    """
    Measure speculative decoding benefit for your specific use case.

    NOTE(review): `measure_acceptance` and `expected_speedup` are called by
    `analyze_workload` but are not defined on this class — presumably they
    are supplied by a subclass or elsewhere in the original project. Confirm
    before running.
    """

    def analyze_workload(
        self,
        draft_model,
        target_model,
        sample_prompts: list,
        gamma: int = 5
    ) -> dict:
        """Analyze potential speculative-decoding benefit for a workload.

        Args:
            draft_model: small model that proposes draft tokens.
            target_model: large model that verifies drafts.
            sample_prompts: representative prompts to probe (must be non-empty).
            gamma: speculation length (draft tokens per verification step).

        Returns:
            dict with average acceptance rate, average expected speedup,
            acceptance variance, a recommendation string, and up to the
            first 10 per-prompt sample results.

        Raises:
            ValueError: if `sample_prompts` is empty.
        """
        if not sample_prompts:
            # Fail with a clear message instead of an opaque StatisticsError.
            raise ValueError("sample_prompts must contain at least one prompt")

        results = []
        for prompt in sample_prompts:
            acceptance = self.measure_acceptance(
                draft_model, target_model, prompt, gamma
            )
            results.append({
                "prompt": prompt[:100],  # truncate for readable reports
                "acceptance_rate": acceptance,
                "expected_speedup": self.expected_speedup(acceptance, gamma),
            })

        # Aggregate. Materialize the rate list once rather than rebuilding
        # generators for each statistic.
        rates = [r["acceptance_rate"] for r in results]
        speedups = [r["expected_speedup"] for r in results]
        avg_acceptance = statistics.mean(rates)
        avg_speedup = statistics.mean(speedups)
        # statistics.variance requires >= 2 samples; treat one sample as 0.0
        # variance instead of raising StatisticsError.
        variance = statistics.variance(rates) if len(rates) > 1 else 0.0

        return {
            "avg_acceptance_rate": avg_acceptance,
            "avg_expected_speedup": avg_speedup,
            "acceptance_variance": variance,
            "recommendation": self.recommend(avg_acceptance, variance),
            "sample_results": results[:10],
        }

    def recommend(self, avg_acceptance: float, variance: float) -> str:
        """Translate aggregate acceptance statistics into a deployment verdict."""
        if avg_acceptance > 0.7 and variance < 0.05:
            return "HIGHLY_RECOMMENDED: Consistent high acceptance"
        elif avg_acceptance > 0.5:
            return "RECOMMENDED: Moderate benefit expected"
        elif avg_acceptance > 0.3:
            return "CONDITIONAL: May help for some prompts"
        else:
            return "NOT_RECOMMENDED: Low acceptance rate"

When to Skip Speculative Decoding

def when_to_skip():
    """Conditions under which speculative decoding should be disabled, with remedies."""
    skip_conditions = {}

    skip_conditions["low_acceptance_rate"] = {
        "threshold": "< 40%",
        "impact": "Overhead exceeds benefit",
        "action": "Use standard decoding",
    }
    skip_conditions["high_variance"] = {
        "symptom": "Some prompts 90%, others 20%",
        "impact": "Unpredictable performance",
        "action": "Route selectively based on prompt type",
    }
    skip_conditions["memory_constrained"] = {
        "symptom": "Can't fit both models comfortably",
        "impact": "Reduced batch size, lower throughput",
        "action": "Consider smaller draft or skip entirely",
    }
    skip_conditions["throughput_priority"] = {
        "symptom": "Need max tokens/second, not min latency",
        "impact": "Speculative decoding helps latency, not throughput",
        "action": "Use batching optimizations instead",
    }
    skip_conditions["short_outputs"] = {
        "symptom": "Most outputs < 20 tokens",
        "impact": "Speculation overhead not amortized",
        "action": "Skip for short generations",
    }

    return skip_conditions

Adaptive Speculation

class AdaptiveSpeculation:
    """
    Adjust speculation strategy based on real-time acceptance.

    Tracks a sliding window of per-round acceptance rates and tunes the
    speculation length (gamma) up when drafts are being accepted, down
    when they are not.
    """

    def __init__(self):
        self.acceptance_history = []  # recent per-round acceptance rates, oldest first
        self.gamma = 5  # Initial speculation length
        self.min_gamma = 2
        self.max_gamma = 10

    def update(self, accepted: int, proposed: int):
        """Record one round's acceptance and retune gamma.

        Args:
            accepted: number of drafted tokens the target model accepted.
            proposed: number of tokens the draft model proposed this round.
        """
        if proposed <= 0:
            # Nothing was speculated this round; recording it would divide
            # by zero and tell us nothing about draft quality.
            return
        rate = accepted / proposed
        self.acceptance_history.append(rate)

        # Bound memory: keep only the most recent 100 samples.
        if len(self.acceptance_history) > 100:
            self.acceptance_history.pop(0)

        # Retune gamma from the last 10 samples: speculate more aggressively
        # when acceptance is high, back off when it is low.
        recent_rate = statistics.mean(self.acceptance_history[-10:])

        if recent_rate > 0.8 and self.gamma < self.max_gamma:
            self.gamma += 1  # More speculation
        elif recent_rate < 0.5 and self.gamma > self.min_gamma:
            self.gamma -= 1  # Less speculation

    def should_speculate(self) -> bool:
        """Decide whether to use speculation for the next request."""
        if len(self.acceptance_history) < 10:
            return True  # Not enough data, try it

        recent_rate = statistics.mean(self.acceptance_history[-10:])
        return recent_rate > 0.3  # Only if acceptance reasonable


class PromptRoutedSpeculation:
    """
    Route to speculation based on prompt characteristics.

    Keeps a per-task-type acceptance table and only enables speculation
    for task types whose observed acceptance makes it worthwhile.
    """

    def __init__(self):
        # Maps task_type -> observed acceptance rate for that task type.
        self.task_acceptance = {}

    def should_speculate(self, prompt: str) -> bool:
        """Return whether speculation is expected to pay off for this prompt."""
        category = self.classify_task(prompt)
        try:
            # Known task type: speculate only above the acceptance cutoff.
            return self.task_acceptance[category] > 0.4
        except KeyError:
            # No history yet for this task type — optimistically try it.
            return True

    def classify_task(self, prompt: str) -> str:
        """Bucket a prompt into a coarse task category via keyword heuristics."""
        lowered = prompt.lower()
        looks_like_code = "```" in prompt or "def " in prompt or "function" in prompt
        if looks_like_code:
            return "code"
        if "{" in prompt and ":" in prompt:
            return "json"
        if "summarize" in lowered:
            return "summary"
        if "write a story" in lowered or "create" in lowered:
            return "creative"
        return "general"

Production Configuration

def production_configuration():
    """Example per-deployment speculative-decoding configurations."""
    configs = {}

    configs["code_assistant"] = {
        "enable_speculation": True,
        "draft_model": "codellama-7b",
        "target_model": "codellama-34b",
        "gamma": 8,
        "expected_speedup": "2-3x",
    }
    configs["json_api"] = {
        "enable_speculation": True,
        "draft_model": "llama-3b",
        "target_model": "llama-70b",
        "gamma": 10,
        "expected_speedup": "2-4x",
    }
    configs["chat_assistant"] = {
        "enable_speculation": "conditional",
        "condition": "prompt_type != 'creative'",
        "draft_model": "llama-7b",
        "target_model": "llama-70b",
        "gamma": 5,
        "expected_speedup": "1.3-2x",
    }
    configs["creative_writing"] = {
        "enable_speculation": False,
        "reason": "Low acceptance rate makes it counterproductive",
    }

    return configs

Speculative decoding is not universally beneficial. It accelerates predictable generation dramatically and provides minimal benefit for unpredictable generation. Measure acceptance rates on your actual workload before committing to the complexity. Route adaptively if your workload is mixed.