The Techniques That Actually Cut Costs
Home energy audits reveal that some improvements pay for themselves in months while others take decades. Sealing air leaks costs $100 and saves $200/year. Replacing windows costs $10,000 and saves $150/year. Both are "improvements," but the returns are wildly different.
LLM cost optimization has the same dynamic. Some techniques deliver 50% savings with minimal effort. Others require months of work for 5% gains. Knowing which optimizations actually move the needle prevents wasted effort.
High-Impact Techniques
def high_impact_techniques():
    return {
        "prefix_caching": {
            "savings": "30-60% on repeated prefixes",
            "effort": "Low (configuration change)",
            "when_it_helps": "Same system prompt across requests",
            "implementation": """
                # vLLM automatic prefix caching
                llm = LLM(model, enable_prefix_caching=True)
                # All requests with the same prefix share computation
            """,
            "real_example": """
                System prompt: 1000 tokens
                Requests/day: 100,000
                Without caching: 100M system prompt tokens computed/day
                With caching: ~1M computed/day (only cache misses recompute the prefix)
            """,
        },
        "smart_routing": {
            "savings": "50-80% by using the right model for the task",
            "effort": "Medium (routing logic)",
            "when_it_helps": "Mixed workload with varying complexity",
            "implementation": """
                if task_complexity(request) < 0.3:
                    return haiku(request)   # $0.25/M tokens
                elif task_complexity(request) < 0.7:
                    return sonnet(request)  # $3/M tokens
                else:
                    return opus(request)    # $15/M tokens
            """,
            "real_example": """
                60% of requests are simple: Haiku @ $0.25/M
                30% are medium: Sonnet @ $3/M
                10% are complex: Opus @ $15/M
                Blended: $2.55/M -> ~15% savings vs all-Sonnet ($3/M),
                ~83% savings vs all-Opus ($15/M)
            """,
        },
        "quantization": {
            "savings": "30-50% GPU memory, 20-40% cost",
            "effort": "Low to Medium",
            "when_it_helps": "Self-hosted inference",
            "implementation": """
                # INT8 quantization
                model = AutoModelForCausalLM.from_pretrained(
                    model_name,
                    load_in_8bit=True
                )
                # Or use vLLM with quantized weights
                llm = LLM(model, quantization="awq")
            """,
            "real_example": """
                70B model FP16: needs 2x A100-80GB ($6/hr)
                70B model INT8: fits on 1x A100-80GB ($3/hr)
                50% hardware cost reduction
            """,
        },
        "batching": {
            "savings": "3-10x throughput improvement",
            "effort": "Medium (requires infrastructure)",
            "when_it_helps": "High volume, latency-tolerant workloads",
            "implementation": """
                # Use continuous batching
                llm = LLM(model, max_num_seqs=32)
                # Batch requests together
                responses = llm.generate(batch_of_prompts)
            """,
            "real_example": """
                Sequential: 10 tokens/second
                Batched (32): 80 tokens/second
                8x throughput = 8x better cost efficiency
            """,
        },
    }
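The routing snippet above is pseudocode. Below is a minimal end-to-end sketch of the same idea; estimate_complexity is a deliberately crude stand-in for your own heuristic or classifier, and the model IDs and thresholds are illustrative placeholders rather than recommendations.

import anthropic

client = anthropic.Anthropic()

# (upper complexity bound, model to use) - cheapest tier that clears the bar wins
MODEL_TIERS = [
    (0.3, "claude-3-haiku-20240307"),     # cheap tier for simple requests
    (0.7, "claude-3-5-sonnet-20240620"),  # mid tier
    (1.01, "claude-3-opus-20240229"),     # expensive tier for hard requests
]

def estimate_complexity(prompt: str) -> float:
    """Crude stand-in: longer prompts with reasoning keywords score higher."""
    score = min(len(prompt) / 4000, 1.0)
    if any(k in prompt.lower() for k in ("analyze", "prove", "multi-step", "plan")):
        score = max(score, 0.8)
    return score

def route(prompt: str, max_tokens: int = 512) -> str:
    """Pick the cheapest model tier whose threshold covers the estimated complexity."""
    complexity = estimate_complexity(prompt)
    model = next(m for threshold, m in MODEL_TIERS if complexity < threshold)
    response = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.content[0].text

In practice the heuristic is the hard part; many teams start with prompt length plus a few keyword rules, then graduate to a small classifier trained on labeled requests.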
Medium-Impact Techniques
def medium_impact_techniques():
    return {
        "context_compression": {
            "savings": "20-40% on context-heavy workloads",
            "effort": "Medium",
            "approach": "Summarize or retrieve instead of stuffing the context window",
            "tradeoff": "Some quality loss possible",
        },
        "output_length_optimization": {
            "savings": "10-30%",
            "effort": "Low",
            "approach": "Set appropriate max_tokens, use stop sequences",
            "example": "Many apps set max_tokens=4096 but average ~200 output tokens",
        },
        "kv_cache_quantization": {
            "savings": "~50% KV-cache memory, supports more concurrent users",
            "effort": "Low (configuration)",
            "implementation": "kv_cache_dtype='fp8'",
        },
        "prompt_optimization": {
            "savings": "10-40% on prompt-heavy workloads",
            "effort": "Medium",
            "approach": "Shorter prompts, better instructions",
            "example": "1000-token prompt -> 400 tokens with the same quality",
        },
    }
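Output-length optimization is usually the fastest of these to try. A hedged sketch with the Anthropic client: the model ID, cap, and stop sequence are illustrative and should be sized from your own completion-length distribution.

import anthropic

client = anthropic.Anthropic()

# If the p99 of observed completions is ~300 tokens, a 4096-token cap only
# guards against runaway generations. A tighter cap plus a stop sequence
# keeps the model from padding answers it has already finished.
response = client.messages.create(
    model="claude-3-5-sonnet-20240620",   # illustrative model ID
    max_tokens=350,                        # sized to observed p99, not the framework default
    stop_sequences=["\n\n###"],            # assumes the prompt asks the model to end with ###
    messages=[{"role": "user", "content": "Summarize this ticket in two sentences: ..."}],
)
print(response.content[0].text)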
Lower-Impact Techniques
def lower_impact_techniques():
    return {
        "kernel_fusion": {
            "savings": "5-15% latency",
            "effort": "Already done by frameworks (vLLM, TGI)",
            "action": "Use a modern serving framework",
        },
        "flash_attention": {
            "savings": "10-20% on long contexts",
            "effort": "Already included in modern frameworks",
            "action": "Ensure you're using it (usually the default)",
        },
        "speculative_decoding": {
            "savings": "1.5-3x latency improvement",
            "effort": "High (requires a draft model)",
            "caveat": "Helps latency, not throughput",
        },
    }
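Serving frameworks generally handle these for you; the one case worth a manual check is loading a model directly with Hugging Face transformers. A minimal sketch, assuming a recent transformers release with the flash-attn package installed and a placeholder model name:

import torch
from transformers import AutoModelForCausalLM

# Requesting flash_attention_2 explicitly fails loudly if the package or GPU
# doesn't support it, which makes "are we actually using it?" easy to answer.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B-Instruct",   # placeholder model name
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)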
Cost Reduction Playbook
def cost_reduction_playbook():
    return {
        "week_1_quick_wins": [
            {
                "action": "Enable prefix caching",
                "expected_savings": "20-50%",
                "effort": "1 hour",
            },
            {
                "action": "Audit max_tokens settings",
                "expected_savings": "10-20%",
                "effort": "2 hours",
            },
            {
                "action": "Check for retry storms",
                "expected_savings": "5-20%",
                "effort": "2 hours",
            },
        ],
        "week_2_medium_effort": [
            {
                "action": "Implement smart model routing",
                "expected_savings": "30-60%",
                "effort": "1 week",
            },
            {
                "action": "Enable quantization (if self-hosted)",
                "expected_savings": "30-50%",
                "effort": "1 day",
            },
        ],
        "month_1_infrastructure": [
            {
                "action": "Implement proper batching",
                "expected_savings": "3-8x throughput",
                "effort": "1-2 weeks",
            },
            {
                "action": "Context compression system",
                "expected_savings": "20-40%",
                "effort": "2 weeks",
            },
        ],
        "ongoing": [
            "Monitor cost per request by feature",
            "Review and optimize top cost drivers monthly",
            "Evaluate new models and pricing",
        ],
    }
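The retry-storm item from week 1 deserves a concrete shape: every retry re-bills the full prompt, so uncapped retries during a downstream outage can silently multiply spend by the retry count of every layer that wraps the call. A minimal sketch of a bounded, jittered retry wrapper; call_model stands in for whatever function actually hits your LLM API.

import random
import time

def call_with_bounded_retries(call_model, prompt, max_attempts=3):
    """Retry transient failures with a hard cap and jittered backoff.

    Without the cap, a flaky dependency turns one request into
    max_attempts * (retries in every wrapping layer) billed requests.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            return call_model(prompt)
        except Exception:
            if attempt == max_attempts:
                raise
            # Exponential backoff with jitter, capped at 10 seconds.
            time.sleep(min(2 ** attempt + random.random(), 10.0))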
Measuring Impact
class CostImpactMeasurement:
    """Measure actual savings from optimizations."""

    def __init__(self):
        self.baseline_metrics = {}
        self.current_metrics = {}

    def establish_baseline(self, window_days: int = 7):
        """Capture baseline metrics before the optimization ships."""
        self.baseline_metrics = {
            "total_cost": self.get_total_cost(window_days),
            "cost_per_request": self.get_avg_cost_per_request(window_days),
            "tokens_per_dollar": self.get_tokens_per_dollar(window_days),
            "requests_volume": self.get_request_count(window_days),
        }

    def measure_impact(self, optimization_name: str, window_days: int = 7):
        """Compare current metrics against the stored baseline."""
        self.current_metrics = {
            "total_cost": self.get_total_cost(window_days),
            "cost_per_request": self.get_avg_cost_per_request(window_days),
            "tokens_per_dollar": self.get_tokens_per_dollar(window_days),
            "requests_volume": self.get_request_count(window_days),
        }
        return {
            "optimization": optimization_name,
            "cost_reduction": 1 - (
                self.current_metrics["cost_per_request"]
                / self.baseline_metrics["cost_per_request"]
            ),
            "efficiency_improvement": (
                self.current_metrics["tokens_per_dollar"]
                / self.baseline_metrics["tokens_per_dollar"]
            ) - 1,
            "quality_maintained": self.quality_check(),
        }

    # The helpers below are placeholders: wire them to your billing export,
    # request logs, and eval harness before using this class.
    def get_total_cost(self, window_days: int) -> float:
        raise NotImplementedError

    def get_avg_cost_per_request(self, window_days: int) -> float:
        raise NotImplementedError

    def get_tokens_per_dollar(self, window_days: int) -> float:
        raise NotImplementedError

    def get_request_count(self, window_days: int) -> int:
        raise NotImplementedError

    def quality_check(self) -> bool:
        raise NotImplementedError
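A usage sketch, assuming the get_* helpers and quality_check have been wired to your billing export and eval harness:

measurement = CostImpactMeasurement()
measurement.establish_baseline(window_days=7)

# ... roll out the optimization and let it run for a comparable window ...

report = measurement.measure_impact("prefix_caching", window_days=7)
print(f"Cost per request down {report['cost_reduction']:.0%}, "
      f"tokens per dollar up {report['efficiency_improvement']:.0%}")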
Common Mistakes
def cost_optimization_mistakes():
    return {
        "optimizing_wrong_thing": {
            "mistake": "Spending weeks on a 5% saving while ignoring a 50% opportunity",
            "example": "Tuning kernels while prefix caching is still disabled",
            "fix": "Measure the cost breakdown, optimize the biggest drivers first",
        },
        "ignoring_quality": {
            "mistake": "Cutting costs in ways that degrade quality",
            "example": "Routing everything to the smallest model",
            "fix": "Measure quality alongside cost",
        },
        "over_engineering": {
            "mistake": "Building complex systems for small gains",
            "example": "A custom kernel for a 3% improvement",
            "fix": "Use existing frameworks, focus on high-impact changes",
        },
        "not_measuring": {
            "mistake": "Assuming optimizations work",
            "example": "Enabling a feature without verifying the savings",
            "fix": "A/B test and measure the actual cost impact",
        },
    }
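For the last mistake, the cheapest credible A/B test is a traffic split that tags each request with its arm and tracks cost per request. A sketch under the assumption that both handlers return a (response, cost_usd) pair; swap the in-memory accumulator for whatever metrics pipeline you already run.

import random
from collections import defaultdict

costs_by_arm = defaultdict(list)

def ab_route(request, baseline_handler, optimized_handler, treatment_share=0.1):
    """Send a slice of traffic through the optimized path and tag the arm."""
    arm = "treatment" if random.random() < treatment_share else "control"
    handler = optimized_handler if arm == "treatment" else baseline_handler
    response, cost_usd = handler(request)
    costs_by_arm[arm].append(cost_usd)
    return response

def cost_report():
    """Average cost per request by arm, once both arms have traffic."""
    return {
        arm: sum(costs) / len(costs)
        for arm, costs in costs_by_arm.items() if costs
    }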
Prioritization Matrix
def prioritization_matrix():
    return """
    Optimization         | Effort | Savings | Priority
    ---------------------|--------|---------|---------------------------
    Prefix caching       | Low    | High    | Do first
    Smart routing        | Medium | High    | Do second
    Max tokens audit     | Low    | Medium  | Do first
    Quantization         | Low    | High    | Do first (if self-hosted)
    Batching             | Medium | High    | Do second
    Context compression  | Medium | Medium  | Do third
    Prompt optimization  | Medium | Medium  | Do third
    Speculative decoding | High   | Medium  | Consider later
    Custom kernels       | High   | Low     | Usually skip
    """
Start with the techniques that combine low effort with high savings: prefix caching, quantization, and model routing. Then move to medium-effort items like batching and context compression. Leave high-effort, lower-impact optimizations for when the big wins are already captured.