The Techniques That Actually Cut Costs
Home energy audits reveal that some improvements pay for themselves in months while others take decades. Sealing air leaks costs $100 and saves $200/year. Replacing windows costs $10,000 and saves $150/year. Both are "improvements," but the returns are wildly different.
LLM cost optimization has the same dynamic. Some techniques deliver 50% savings with minimal effort. Others require months of work for 5% gains. Knowing which optimizations actually move the needle prevents wasted effort.
High-Impact Techniques
def high_impact_techniques():
    return {
        "prefix_caching": {
            "savings": "30-60% on repeated prefixes",
            "effort": "Low (configuration change)",
            "when_it_helps": "Same system prompt across requests",
            "implementation": """
                # vLLM automatic prefix caching
                llm = LLM(model, enable_prefix_caching=True)
                # All requests with the same prefix share computation
            """,
            "real_example": """
                System prompt: 1000 tokens
                Requests/day: 100,000
                Without caching: 100M system prompt tokens computed/day
                With caching: ~1M computed/day (only cache misses recompute the prefix)
            """,
        },
        "smart_routing": {
            "savings": "50-80% by using the right model for the task",
            "effort": "Medium (routing logic)",
            "when_it_helps": "Mixed workload with varying complexity",
            "implementation": """
                if task_complexity(request) < 0.3:
                    return haiku(request)   # $0.25/M tokens
                elif task_complexity(request) < 0.7:
                    return sonnet(request)  # $3/M tokens
                else:
                    return opus(request)    # $15/M tokens
            """,
            "real_example": """
                60% of requests are simple: Haiku @ $0.25/M
                30% are medium: Sonnet @ $3/M
                10% are complex: Opus @ $15/M
                Blended: $2.55/M -> ~15% savings vs all-Sonnet ($3/M),
                ~83% savings vs all-Opus ($15/M)
            """,
        },
        "quantization": {
            "savings": "30-50% GPU memory, 20-40% cost",
            "effort": "Low to Medium",
            "when_it_helps": "Self-hosted inference",
            "implementation": """
                # INT8 quantization
                model = AutoModelForCausalLM.from_pretrained(
                    model_name,
                    load_in_8bit=True
                )
                # Or use vLLM with quantized weights
                llm = LLM(model, quantization="awq")
            """,
            "real_example": """
                70B model FP16: needs 2x A100-80GB ($6/hr)
                70B model INT8: fits on 1x A100-80GB ($3/hr)
                50% hardware cost reduction
            """,
        },
        "batching": {
            "savings": "3-10x throughput improvement",
            "effort": "Medium (requires infrastructure)",
            "when_it_helps": "High volume, latency-tolerant workloads",
            "implementation": """
                # Use continuous batching
                llm = LLM(model, max_num_seqs=32)
                # Batch requests together
                responses = llm.generate(batch_of_prompts)
            """,
            "real_example": """
                Sequential: 10 tokens/second
                Batched (32): 80 tokens/second
                8x throughput = 8x better cost efficiency
            """,
        },
    }
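The routing snippet above is pseudocode. Below is a minimal end-to-end sketch of the same idea; estimate_complexity is a deliberately crude stand-in for your own heuristic or classifier, and the model IDs and thresholds are illustrative placeholders rather than recommendations.

import anthropic

client = anthropic.Anthropic()

# (upper complexity bound, model to use) - cheapest tier that clears the bar wins
MODEL_TIERS = [
    (0.3, "claude-3-haiku-20240307"),     # cheap tier for simple requests
    (0.7, "claude-3-5-sonnet-20240620"),  # mid tier
    (1.01, "claude-3-opus-20240229"),     # expensive tier for hard requests
]

def estimate_complexity(prompt: str) -> float:
    """Crude stand-in: longer prompts with reasoning keywords score higher."""
    score = min(len(prompt) / 4000, 1.0)
    if any(k in prompt.lower() for k in ("analyze", "prove", "multi-step", "plan")):
        score = max(score, 0.8)
    return score

def route(prompt: str, max_tokens: int = 512) -> str:
    """Pick the cheapest model tier whose threshold covers the estimated complexity."""
    complexity = estimate_complexity(prompt)
    model = next(m for threshold, m in MODEL_TIERS if complexity < threshold)
    response = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.content[0].text

In practice the heuristic is the hard part; many teams start with prompt length plus a few keyword rules, then graduate to a small classifier trained on labeled requests.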
Medium-Impact Techniques
def medium_impact_techniques():
    return {
        "context_compression": {
            "savings": "20-40% on context-heavy workloads",
            "effort": "Medium",
            "approach": "Summarize or retrieve instead of stuffing the context window",
            "tradeoff": "Some quality loss possible",
        },
        "output_length_optimization": {
            "savings": "10-30%",
            "effort": "Low",
            "approach": "Set appropriate max_tokens, use stop sequences",
            "example": "Many apps set max_tokens=4096 but average ~200 output tokens",
        },
        "kv_cache_quantization": {
            "savings": "~50% KV-cache memory, supports more concurrent users",
            "effort": "Low (configuration)",
            "implementation": "kv_cache_dtype='fp8'",
        },
        "prompt_optimization": {
            "savings": "10-40% on prompt-heavy workloads",
            "effort": "Medium",
            "approach": "Shorter prompts, better instructions",
            "example": "1000-token prompt -> 400 tokens with the same quality",
        },
    }
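Output-length optimization is usually the fastest of these to try. A hedged sketch with the Anthropic client: the model ID, cap, and stop sequence are illustrative and should be sized from your own completion-length distribution.

import anthropic

client = anthropic.Anthropic()

# If the p99 of observed completions is ~300 tokens, a 4096-token cap only
# guards against runaway generations. A tighter cap plus a stop sequence
# keeps the model from padding answers it has already finished.
response = client.messages.create(
    model="claude-3-5-sonnet-20240620",   # illustrative model ID
    max_tokens=350,                        # sized to observed p99, not the framework default
    stop_sequences=["\n\n###"],            # assumes the prompt asks the model to end with ###
    messages=[{"role": "user", "content": "Summarize this ticket in two sentences: ..."}],
)
print(response.content[0].text)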
Lower-Impact Techniques
def lower_impact_techniques():
    return {
        "kernel_fusion": {
            "savings": "5-15% latency",
            "effort": "Already done by frameworks (vLLM, TGI)",
            "action": "Use a modern serving framework",
        },
        "flash_attention": {
            "savings": "10-20% on long contexts",
            "effort": "Already included in modern frameworks",
            "action": "Ensure you're using it (usually the default)",
        },
        "speculative_decoding": {
            "savings": "1.5-3x latency improvement",
            "effort": "High (requires a draft model)",
            "caveat": "Helps latency, not throughput",
        },
    }
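Serving frameworks generally handle these for you; the one case worth a manual check is loading a model directly with Hugging Face transformers. A minimal sketch, assuming a recent transformers release with the flash-attn package installed and a placeholder model name:

import torch
from transformers import AutoModelForCausalLM

# Requesting flash_attention_2 explicitly fails loudly if the package or GPU
# doesn't support it, which makes "are we actually using it?" easy to answer.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B-Instruct",   # placeholder model name
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)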
Cost Reduction Playbook
def cost_reduction_playbook():
    return {
        "week_1_quick_wins": [
            {
                "action": "Enable prefix caching",
                "expected_savings": "20-50%",
                "effort": "1 hour",
            },
            {
                "action": "Audit max_tokens settings",
                "expected_savings": "10-20%",
                "effort": "2 hours",
            },
            {
                "action": "Check for retry storms",
                "expected_savings": "5-20%",
                "effort": "2 hours",
            },
        ],
        "week_2_medium_effort": [
            {
                "action": "Implement smart model routing",
                "expected_savings": "30-60%",
                "effort": "1 week",
            },
            {
                "action": "Enable quantization (if self-hosted)",
                "expected_savings": "30-50%",
                "effort": "1 day",
            },
        ],
        "month_1_infrastructure": [
            {
                "action": "Implement proper batching",
                "expected_savings": "3-8x throughput",
                "effort": "1-2 weeks",
            },
            {
                "action": "Context compression system",
                "expected_savings": "20-40%",
                "effort": "2 weeks",
            },
        ],
        "ongoing": [
            "Monitor cost per request by feature",
            "Review and optimize top cost drivers monthly",
            "Evaluate new models and pricing",
        ],
    }
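The retry-storm item from week 1 deserves a concrete shape: every retry re-bills the full prompt, so uncapped retries during a downstream outage can silently multiply spend by the retry count of every layer that wraps the call. A minimal sketch of a bounded, jittered retry wrapper; call_model stands in for whatever function actually hits your LLM API.

import random
import time

def call_with_bounded_retries(call_model, prompt, max_attempts=3):
    """Retry transient failures with a hard cap and jittered backoff.

    Without the cap, a flaky dependency turns one request into
    max_attempts * (retries in every wrapping layer) billed requests.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            return call_model(prompt)
        except Exception:
            if attempt == max_attempts:
                raise
            # Exponential backoff with jitter, capped at 10 seconds.
            time.sleep(min(2 ** attempt + random.random(), 10.0))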
Measuring Impact
class CostImpactMeasurement:
    """Measure actual savings from optimizations."""

    def __init__(self):
        self.baseline_metrics = {}
        self.current_metrics = {}

    def establish_baseline(self, window_days: int = 7):
        """Capture baseline metrics before the optimization ships."""
        self.baseline_metrics = {
            "total_cost": self.get_total_cost(window_days),
            "cost_per_request": self.get_avg_cost_per_request(window_days),
            "tokens_per_dollar": self.get_tokens_per_dollar(window_days),
            "requests_volume": self.get_request_count(window_days),
        }

    def measure_impact(self, optimization_name: str, window_days: int = 7):
        """Compare current metrics against the stored baseline."""
        self.current_metrics = {
            "total_cost": self.get_total_cost(window_days),
            "cost_per_request": self.get_avg_cost_per_request(window_days),
            "tokens_per_dollar": self.get_tokens_per_dollar(window_days),
            "requests_volume": self.get_request_count(window_days),
        }
        return {
            "optimization": optimization_name,
            "cost_reduction": 1 - (
                self.current_metrics["cost_per_request"]
                / self.baseline_metrics["cost_per_request"]
            ),
            "efficiency_improvement": (
                self.current_metrics["tokens_per_dollar"]
                / self.baseline_metrics["tokens_per_dollar"]
            ) - 1,
            "quality_maintained": self.quality_check(),
        }

    # The helpers below are placeholders: wire them to your billing export,
    # request logs, and eval harness before using this class.
    def get_total_cost(self, window_days: int) -> float:
        raise NotImplementedError

    def get_avg_cost_per_request(self, window_days: int) -> float:
        raise NotImplementedError

    def get_tokens_per_dollar(self, window_days: int) -> float:
        raise NotImplementedError

    def get_request_count(self, window_days: int) -> int:
        raise NotImplementedError

    def quality_check(self) -> bool:
        raise NotImplementedError
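A usage sketch, assuming the get_* helpers and quality_check have been wired to your billing export and eval harness:

measurement = CostImpactMeasurement()
measurement.establish_baseline(window_days=7)

# ... roll out the optimization and let it run for a comparable window ...

report = measurement.measure_impact("prefix_caching", window_days=7)
print(f"Cost per request down {report['cost_reduction']:.0%}, "
      f"tokens per dollar up {report['efficiency_improvement']:.0%}")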
Common Mistakes
def cost_optimization_mistakes():
    return {
        "optimizing_wrong_thing": {
            "mistake": "Spending weeks on a 5% saving while ignoring a 50% opportunity",
            "example": "Tuning kernels while prefix caching is still disabled",
            "fix": "Measure the cost breakdown, optimize the biggest drivers first",
        },
        "ignoring_quality": {
            "mistake": "Cutting costs in ways that degrade quality",
            "example": "Routing everything to the smallest model",
            "fix": "Measure quality alongside cost",
        },
        "over_engineering": {
            "mistake": "Building complex systems for small gains",
            "example": "A custom kernel for a 3% improvement",
            "fix": "Use existing frameworks, focus on high-impact changes",
        },
        "not_measuring": {
            "mistake": "Assuming optimizations work",
            "example": "Enabling a feature without verifying the savings",
            "fix": "A/B test and measure the actual cost impact",
        },
    }
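For the last mistake, the cheapest credible A/B test is a traffic split that tags each request with its arm and tracks cost per request. A sketch under the assumption that both handlers return a (response, cost_usd) pair; swap the in-memory accumulator for whatever metrics pipeline you already run.

import random
from collections import defaultdict

costs_by_arm = defaultdict(list)

def ab_route(request, baseline_handler, optimized_handler, treatment_share=0.1):
    """Send a slice of traffic through the optimized path and tag the arm."""
    arm = "treatment" if random.random() < treatment_share else "control"
    handler = optimized_handler if arm == "treatment" else baseline_handler
    response, cost_usd = handler(request)
    costs_by_arm[arm].append(cost_usd)
    return response

def cost_report():
    """Average cost per request by arm, once both arms have traffic."""
    return {
        arm: sum(costs) / len(costs)
        for arm, costs in costs_by_arm.items() if costs
    }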
Prioritization Matrix
def prioritization_matrix():
    return """
    Optimization         | Effort | Savings | Priority
    ---------------------|--------|---------|---------------------------
    Prefix caching       | Low    | High    | Do first
    Smart routing        | Medium | High    | Do second
    Max tokens audit     | Low    | Medium  | Do first
    Quantization         | Low    | High    | Do first (if self-hosted)
    Batching             | Medium | High    | Do second
    Context compression  | Medium | Medium  | Do third
    Prompt optimization  | Medium | Medium  | Do third
    Speculative decoding | High   | Medium  | Consider later
    Custom kernels       | High   | Low     | Usually skip
    """
Start with the techniques that combine low effort with high savings: prefix caching, quantization, and model routing. Then move to medium-effort items like batching and context compression. Leave high-effort, lower-impact optimizations for when the big wins are already captured.