Getting 95% Quality at 12% Cost
Book indexes exist because nobody reads entire books to find one fact. The index directs you to the relevant 3 pages out of 300. You get the answer without processing the irrelevant 99%.
LLM context works similarly. Most queries don't need the full conversation history or the complete document. The relevant 12% often provides 95% of the answer quality. Finding that 12% efficiently is cheaper than processing everything.
The Empirical Finding
def the_95_12_observation():
return {
"experiment": """
Test query quality with varying context lengths:
- Full context (100K tokens): 100% quality baseline
- Random 12K sample: ~80% quality
- Best 12K (retrieval): ~95% quality
- Best 4K (retrieval): ~88% quality
""",
"key_insight": """
Quality doesn't scale linearly with context.
Doubling context does not double quality.
The marginal value of additional context decreases.
Most value is in the most relevant portions.
""",
"cost_implication": """
If 12K context gives 95% quality of 100K:
- Cost: 12% of full context
        - Latency: ~15% of full context (O(n²) attention shrinks prefill sharply; decode time is unchanged)
- Quality: 95% of full context
        5% quality loss for an 88% cost reduction.
""",
}
Identifying the Relevant 12%
class ContextSelector:
"""
Select the most relevant context for each query
"""
def select(self, full_context: list, query: str, budget_tokens: int) -> list:
"""
Select most relevant context within token budget
"""
scored = []
for chunk in full_context:
score = self.relevance_score(chunk, query)
scored.append((score, chunk))
        # Sort by relevance score, highest first (sort on the score alone,
        # so ties never fall back to comparing the chunks themselves)
        scored.sort(key=lambda pair: pair[0], reverse=True)
# Select top chunks within budget
selected = []
tokens_used = 0
for score, chunk in scored:
chunk_tokens = self.count_tokens(chunk)
if tokens_used + chunk_tokens <= budget_tokens:
selected.append(chunk)
tokens_used += chunk_tokens
return selected
def relevance_score(self, chunk: str, query: str) -> float:
"""Calculate relevance of chunk to query"""
scores = {
"semantic": self.embedding_similarity(chunk, query),
"lexical": self.keyword_overlap(chunk, query),
"recency": self.recency_bonus(chunk),
"position": self.position_bonus(chunk), # Start/end more valuable
}
# Weighted combination
return (
scores["semantic"] * 0.5 +
scores["lexical"] * 0.2 +
scores["recency"] * 0.2 +
scores["position"] * 0.1
)
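The scoring helpers above are deliberately abstract. What follows is one possible set of implementations, not the only one: a minimal sketch assuming sentence-transformers for embeddings and tiktoken for token counting (both library choices are assumptions). Recency and position are omitted because they need chunk metadata (a timestamp or an index) that a raw string doesn't carry.
import numpy as np
import tiktoken  # assumed tokenizer; any tokenizer matching your model works
from sentence_transformers import SentenceTransformer  # assumed embedder

_embedder = SentenceTransformer("all-MiniLM-L6-v2")
_encoding = tiktoken.get_encoding("cl100k_base")

def embedding_similarity(chunk: str, query: str) -> float:
    """Cosine similarity of chunk and query embeddings, rescaled to [0, 1]."""
    a, b = _embedder.encode([chunk, query], normalize_embeddings=True)
    return (float(np.dot(a, b)) + 1) / 2

def keyword_overlap(chunk: str, query: str) -> float:
    """Jaccard overlap of lowercased word sets: a cheap lexical signal."""
    c, q = set(chunk.lower().split()), set(query.lower().split())
    return len(c & q) / len(c | q) if c | q else 0.0

def count_tokens(text: str) -> int:
    """Exact token count under the assumed encoding."""
    return len(_encoding.encode(text))
In practice, chunks tend to be dicts ({"text": ..., "timestamp": ..., "index": ...}) precisely so the recency and position bonuses have something to read.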
Chunking Strategies
def chunking_strategies():
return {
"fixed_size": {
"approach": "Split into fixed-token chunks (e.g., 512 tokens)",
"pros": ["Simple", "Predictable sizing"],
"cons": ["May split mid-sentence", "Context lost at boundaries"],
},
"semantic": {
"approach": "Split at natural boundaries (paragraphs, sections)",
"pros": ["Preserves meaning", "Natural units"],
"cons": ["Variable sizes", "May be too large"],
},
"sliding_window": {
"approach": "Overlapping chunks for continuity",
"pros": ["No lost context at boundaries"],
"cons": ["Redundant content", "More to store"],
},
"hierarchical": {
"approach": "Summary chunks + detail chunks",
"pros": ["Multi-resolution retrieval", "Efficient for long docs"],
"cons": ["Requires summarization", "More complex"],
},
"recommended": """
For conversations:
- Chunk by message/turn
- Keep recent N turns always
- Retrieve older turns by relevance
For documents:
- Chunk by paragraph/section
- Include document summary always
- Retrieve detail chunks by relevance
""",
}
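To make the recommended document strategy concrete, here is a minimal chunker sketch: it splits on paragraph boundaries and falls back to an overlapping sliding window for oversized paragraphs. The 512/64 sizes and the whitespace word count (a crude token proxy) are assumptions, not tuned values.
def chunk_document(text: str, max_tokens: int = 512, overlap: int = 64) -> list:
    """Paragraph-first chunking with a sliding-window fallback."""
    def approx_tokens(s: str) -> int:
        return len(s.split())  # crude proxy; swap in a real tokenizer

    chunks = []
    for para in text.split("\n\n"):
        if approx_tokens(para) <= max_tokens:
            chunks.append(para)
        else:
            # Overlapping windows preserve continuity across the splits
            words = para.split()
            step = max_tokens - overlap
            for i in range(0, len(words), step):
                chunks.append(" ".join(words[i:i + max_tokens]))
    return [c for c in chunks if c.strip()]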
The Retrieval Pipeline
class RetrievalPipeline:
"""
Efficient context retrieval for 95/12 optimization
"""
def __init__(self, config):
self.budget_tokens = config["budget_tokens"] # 12K
self.always_include = config["always_include"] # System prompt, recent
self.retriever = config["retriever"]
def build_context(self, query: str, history: list) -> str:
"""Build optimal context for query"""
context_parts = []
tokens_used = 0
# Always include: system prompt (high value, start position)
system = self.always_include["system_prompt"]
context_parts.append(system)
tokens_used += self.count_tokens(system)
# Always include: recent messages (high value, end position)
recent = history[-3:] # Last 3 turns
for msg in recent:
context_parts.append(msg)
tokens_used += self.count_tokens(msg)
# Fill remaining budget with retrieved context
remaining_budget = self.budget_tokens - tokens_used
older_history = history[:-3]
if older_history and remaining_budget > 0:
relevant = self.retriever.retrieve(
query=query,
candidates=older_history,
max_tokens=remaining_budget
)
# Insert retrieved context in middle (less prime position)
context_parts.insert(1, "\n".join(relevant))
return "\n".join(context_parts)
Measuring the Tradeoff
import statistics

class QualityCostMeasurement:
"""
Measure actual quality vs cost tradeoff
"""
def measure_curve(self, model, test_cases: list) -> dict:
"""
Measure quality at different context budgets
"""
budgets = [0.05, 0.10, 0.15, 0.25, 0.50, 1.0] # Fractions of full
results = {}
for budget_ratio in budgets:
qualities = []
for case in test_cases:
# Full context quality
full_quality = self.evaluate(
model, case["full_context"], case["query"]
)
# Budget context quality
budget_context = self.select_context(
case["full_context"],
case["query"],
ratio=budget_ratio
)
budget_quality = self.evaluate(
model, budget_context, case["query"]
)
qualities.append(budget_quality / full_quality)
results[budget_ratio] = {
"avg_quality_retention": statistics.mean(qualities),
"cost_ratio": budget_ratio,
}
return results
def find_optimal_budget(self, results: dict, quality_threshold: float = 0.95) -> float:
"""Find minimum budget that achieves quality threshold"""
for budget_ratio, metrics in sorted(results.items()):
if metrics["avg_quality_retention"] >= quality_threshold:
return budget_ratio
return 1.0 # Need full context
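In use, the sweep runs once offline against a representative test set, and the production budget gets pinned at the knee of the curve. model and test_cases here are your own artifacts:
meter = QualityCostMeasurement()
curve = meter.measure_curve(model, test_cases)
budget_ratio = meter.find_optimal_budget(curve, quality_threshold=0.95)
print(f"Minimum budget for 95% quality: {budget_ratio:.0%} of full context")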
When 95/12 Works
def when_95_12_works():
return {
"works_well": [
{
"use_case": "Customer support with history",
"why": "Most queries relate to recent issue",
"savings": "80% cost reduction typical",
},
{
"use_case": "Document Q&A",
"why": "Answers usually in specific sections",
"savings": "90% cost reduction for long docs",
},
{
"use_case": "Code assistance with codebase",
"why": "Relevant files usually identifiable",
"savings": "95% cost reduction for large repos",
},
],
"works_poorly": [
{
"use_case": "Summarization of full document",
"why": "Need to read everything",
"mitigation": "Hierarchical summarization",
},
{
"use_case": "Complex multi-step reasoning",
"why": "May need to connect distant facts",
"mitigation": "Chain of retrieval",
},
{
"use_case": "Compliance checking",
"why": "Any section could have violation",
"mitigation": "Full scan required",
},
],
}
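For the summarization case, the hierarchical mitigation is map-reduce style: summarize chunks, then summarize the summaries until one text remains. A minimal sketch, where summarize is a hypothetical single-call LLM summarizer you supply:
def hierarchical_summary(chunks: list, summarize, fan_in: int = 8) -> str:
    """Map-reduce summarization: each pass condenses groups of fan_in texts."""
    layer = chunks
    while len(layer) > 1:
        layer = [
            summarize("\n\n".join(layer[i:i + fan_in]))
            for i in range(0, len(layer), fan_in)
        ]
    return layer[0] if layer else ""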
Implementation Checklist
def implementation_checklist():
return [
{
"step": "Chunk your context appropriately",
"how": "By message for chat, by paragraph for docs",
"tool": "LangChain text splitters, custom logic",
},
{
"step": "Build retrieval index",
"how": "Embed chunks, store in vector database",
"tool": "Pinecone, Chroma, FAISS",
},
{
"step": "Define always-include content",
"how": "System prompt, recent messages",
"rationale": "Some context always relevant",
},
{
"step": "Set token budget",
"how": "Start at 15% of max context",
"tuning": "Adjust based on quality measurements",
},
{
"step": "Measure quality retention",
"how": "Compare full vs budget responses",
"threshold": "95% quality retention acceptable",
},
{
"step": "Monitor in production",
"how": "Track quality scores, user feedback",
"alert": "If quality drops, increase budget",
},
]
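For the index-building step, here is a minimal FAISS sketch. The embedding model, the flat inner-product index, and the sentence-transformers dependency are all assumptions; Pinecone or Chroma would replace build_index and retrieve wholesale.
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer  # assumed embedder

embedder = SentenceTransformer("all-MiniLM-L6-v2")

def build_index(chunks: list) -> faiss.IndexFlatIP:
    """Embed chunks and store them in an exact inner-product index."""
    vecs = embedder.encode(chunks, normalize_embeddings=True)
    index = faiss.IndexFlatIP(vecs.shape[1])  # inner product == cosine on unit vectors
    index.add(np.asarray(vecs, dtype="float32"))
    return index

def retrieve(index, chunks: list, query: str, k: int = 10) -> list:
    """Return the k chunks most similar to the query."""
    q = embedder.encode([query], normalize_embeddings=True)
    _, ids = index.search(np.asarray(q, dtype="float32"), k)
    return [chunks[i] for i in ids[0] if i != -1]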
Cost Impact Calculator
def cost_calculator():
"""
Calculate savings from 95/12 optimization
"""
scenarios = {
"baseline": {
"context_tokens": 100_000,
"requests_per_day": 1_000_000,
"cost_per_1m_tokens": 3.00, # Example Sonnet pricing
},
}
baseline = scenarios["baseline"]
daily_baseline_cost = (
baseline["context_tokens"] / 1_000_000 *
baseline["cost_per_1m_tokens"] *
baseline["requests_per_day"]
)
optimized = {
"context_tokens": 12_000, # 12% of baseline
"requests_per_day": baseline["requests_per_day"],
"cost_per_1m_tokens": baseline["cost_per_1m_tokens"],
}
daily_optimized_cost = (
optimized["context_tokens"] / 1_000_000 *
optimized["cost_per_1m_tokens"] *
optimized["requests_per_day"]
)
return {
"baseline_daily": f"${daily_baseline_cost:,.0f}",
"optimized_daily": f"${daily_optimized_cost:,.0f}",
"savings_daily": f"${daily_baseline_cost - daily_optimized_cost:,.0f}",
"savings_monthly": f"${(daily_baseline_cost - daily_optimized_cost) * 30:,.0f}",
"savings_percent": f"{(1 - daily_optimized_cost/daily_baseline_cost)*100:.0f}%",
}
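Run as written, the calculator reports $300,000 baseline daily, $36,000 optimized daily, $264,000 in daily savings ($7,920,000 monthly), and 88% saved: the flip side of the 12% in the title.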
The 95/12 rule isn't about cutting corners. It's about recognizing that context has diminishing returns. The first 12% of relevant context does most of the work. Finding that 12% efficiently lets you deliver near-full quality at a fraction of the cost.