Compressing the Cache, Not Just the Model
Photographers compress their raw images for storage but forget about thumbnails and previews. Multiplied across thousands of photos, those small files can end up consuming more space than the raw files themselves.
LLM memory management has a similar blind spot. Teams optimize model weights but ignore the KV cache, and for long-context serving the cache often dominates memory usage.
The Memory Split
def memory_comparison(model_params_b: float, context_length: int, batch_size: int):
    """Where does memory actually go?"""
    # Model weights (FP16): 2 bytes per parameter
    model_gb = model_params_b * 2
    # KV cache (FP16)
    # Approximate: 2 bytes × 2 (K, V) × layers × heads × head_dim × context × batch
    kv_per_token = 0.0025  # GB per token for a 70B-class model
    kv_cache_gb = context_length * batch_size * kv_per_token
    return {
        "model_weights": model_gb,
        "kv_cache": kv_cache_gb,
        "which_is_larger": "kv_cache" if kv_cache_gb > model_gb else "model",
    }

# Example: 70B model, 8K context, 10 concurrent requests
result = memory_comparison(70, 8000, 10)
# Model: 140 GB
# KV cache: 200 GB
# KV cache is larger!
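The hard-coded 0.0025 GB/token is just a convenient round number. If you want to derive it, the per-token figure follows from the architecture; a minimal sketch, assuming a 70B-class model with 80 layers, 64 KV heads, and head dimension 128 (models with grouped-query attention, e.g. 8 KV heads, cut this by 8×):

def kv_bytes_per_token(num_layers: int, num_kv_heads: int, head_dim: int,
                       bytes_per_element: int = 2) -> int:
    # Two tensors (K and V) per layer, one vector per KV head.
    return 2 * num_layers * num_kv_heads * head_dim * bytes_per_element

per_token = kv_bytes_per_token(num_layers=80, num_kv_heads=64, head_dim=128)
print(per_token / 1e9)                 # ~0.0026 GB per token in FP16
print(per_token * 8000 * 10 / 1e9)     # ~210 GB for 8K context × 10 requests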
KV Cache Quantization
class KVCacheQuantization:
    """Reduce KV cache precision from 16 bits to 8 bits per element."""

    implementation = """
    # vLLM (uses an FP8 cache dtype: 1 byte per element, same saving as INT8)
    vllm serve model_name --kv-cache-dtype fp8
    # That's it. One flag for 2x cache capacity.
    """

    before_and_after = {
        "fp16": {
            "bytes_per_element": 2,
            "8K_context_10_requests": "200 GB",
        },
        "int8": {
            "bytes_per_element": 1,
            "8K_context_10_requests": "100 GB",
            "savings": "100 GB",
        },
    }

    quality_impact = """
    KV cache quantization is less sensitive than weight quantization.
    Why:
    - KV values are computed fresh for each request
    - They're intermediate activations, not learned parameters
    - Small errors in K/V have a muted effect after softmax
    - Each token's K/V entries are quantized independently, so errors don't accumulate in the cache itself
    Typical quality degradation: < 1%
    Often undetectable in practical benchmarks.
    """
Why It's Often Overlooked
def why_overlooked():
    return {
        "reason_1": {
            "misconception": "Model weights are the memory consumer",
            "reality": "For long context, KV cache dominates",
        },
        "reason_2": {
            "misconception": "Quantizing the cache would hurt quality",
            "reality": "Quality impact is minimal and well-tested",
        },
        "reason_3": {
            "misconception": "It's complex to implement",
            "reality": "One config flag in modern frameworks",
        },
        "reason_4": {
            "misconception": "Only matters at extreme scale",
            "reality": "Matters whenever context length × batch size is significant",
        },
    }
Combined Optimization
def combined_memory_optimization():
    """Stack multiple techniques for maximum effect."""
    baseline = {
        "model": "70B FP16 = 140 GB",
        "kv_cache": "8K × 10 requests = 200 GB",
        "total": "340 GB",
        "gpus_needed": "5× H100 80GB",
    }
    with_weight_quantization = {
        "model": "70B INT8 = 70 GB",
        "kv_cache": "8K × 10 requests = 200 GB",
        "total": "270 GB",
        "gpus_needed": "4× H100 80GB",
    }
    with_both = {
        "model": "70B INT8 = 70 GB",
        "kv_cache": "8K × 10 requests, INT8 = 100 GB",
        "total": "170 GB",
        "gpus_needed": "3× H100 80GB",
        "savings_vs_baseline": "2 GPUs saved",
    }
    return baseline, with_weight_quantization, with_both
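For concreteness, here's what "both" can look like with vLLM's offline API, under the assumption that you have a pre-quantized INT8 (W8A8) checkpoint available; the checkpoint name below is a placeholder, and argument names can differ between versions.

from vllm import LLM, SamplingParams

llm = LLM(
    model="your-org/llama-70b-int8-w8a8",  # hypothetical pre-quantized checkpoint;
                                           # vLLM reads the weight-quantization scheme
                                           # from the checkpoint's config
    kv_cache_dtype="fp8",                  # 8-bit KV cache: 1 byte per element
    tensor_parallel_size=4,                # adjust to however many GPUs you use
)

outputs = llm.generate(["The KV cache is"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)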
Quality Testing for KV Quantization
def kv_quantization_test_protocol():
    return {
        "test_1": {
            "name": "Perplexity comparison",
            "method": "Compare PPL on a validation set",
            "acceptable_increase": "< 0.1",
        },
        "test_2": {
            "name": "Long-context coherence",
            "method": "Verify that references to early context still resolve",
            "why": "KV cache errors could compound over length",
        },
        "test_3": {
            # A sketch automating this check follows below.
            "name": "Output diff at various lengths",
            "method": "Compare outputs at 1K, 4K, 8K, 16K context",
            "acceptable": "> 95% token match",
        },
        "test_4": {
            "name": "Task-specific benchmarks",
            "method": "Run your actual use-case evaluation",
            "threshold": "No statistically significant degradation",
        },
    }
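Test 3 is the easiest one to automate. A minimal sketch, assuming a generate_tokens(prompt, server=...) helper (hypothetical; wire it to your own FP16-cache and 8-bit-cache deployments) that returns greedy-decoded token IDs:

def token_match_rate(baseline: list[int], quantized: list[int]) -> float:
    # Fraction of positions where greedy outputs agree, over the shorter output.
    n = min(len(baseline), len(quantized))
    if n == 0:
        return 0.0
    return sum(a == b for a, b in zip(baseline[:n], quantized[:n])) / n

def length_sweep(prompts_by_length: dict[int, str], generate_tokens) -> dict[int, float]:
    # Compare the two deployments at each context length (1K, 4K, 8K, 16K, ...).
    results = {}
    for ctx_len, prompt in prompts_by_length.items():
        base = generate_tokens(prompt, server="fp16-kv-cache")    # hypothetical servers
        quant = generate_tokens(prompt, server="int8-kv-cache")
        results[ctx_len] = token_match_rate(base, quant)
    return results

# Flag anything below 0.95, and watch for match rates that decay as context grows.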
When KV Cache Quantization Helps Most
def high_impact_scenarios():
    return {
        "long_context_serving": {
            "context_length": "> 4K tokens",
            "impact": "High - cache is large",
            "recommendation": "Definitely enable",
        },
        "high_concurrency": {
            "concurrent_requests": "> 10",
            "impact": "High - many caches add up",
            "recommendation": "Definitely enable",
        },
        "memory_constrained_gpu": {
            "setup": "Model barely fits",
            "impact": "Critical - enables serving",
            "recommendation": "Required for feasibility",
        },
        "short_context_low_concurrency": {
            "context_length": "< 1K",
            "concurrent_requests": "< 5",
            "impact": "Low - cache is small anyway",
            "recommendation": "Optional, but free performance",
        },
    }
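If you want to turn that table into a quick go/no-go check, here is a rough heuristic of my own; the thresholds mirror the table above, and the default kv_gb_per_token is the same 70B-class approximation used earlier.

def should_quantize_kv_cache(context_length: int, concurrent_requests: int,
                             free_gpu_memory_gb: float,
                             kv_gb_per_token: float = 0.0025) -> str:
    # Estimate the FP16 cache footprint for this workload.
    est_cache_gb = context_length * concurrent_requests * kv_gb_per_token
    if est_cache_gb > free_gpu_memory_gb:
        return "required: the cache won't fit at full precision"
    if context_length > 4000 or concurrent_requests > 10:
        return "definitely enable: the cache is a major memory consumer"
    return "optional: the cache is small, but quantizing it is essentially free"

print(should_quantize_kv_cache(8000, 10, free_gpu_memory_gb=160))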
Implementation Checklist
def kv_quantization_checklist():
    return [
        {
            "step": 1,
            "action": "Measure current KV cache memory usage",
            "command": "Monitor GPU memory while varying context length (see the sketch after this list)",
        },
        {
            "step": 2,
            "action": "Enable 8-bit KV cache",
            "command": "--kv-cache-dtype fp8 (vLLM; other frameworks expose equivalent INT8/FP8 options)",
        },
        {
            "step": 3,
            "action": "Run quality benchmarks",
            "command": "Compare outputs against the FP16 baseline on a test set",
        },
        {
            "step": 4,
            "action": "Measure memory improvement",
            "command": "Same test, observe the memory reduction",
        },
        {
            "step": 5,
            "action": "Deploy with monitoring",
            "command": "Track quality metrics in production",
        },
    ]
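For step 1, a quick way to sample GPU memory while you replay requests at different context lengths. This assumes the pynvml (nvidia-ml-py) bindings and a single GPU; note that engines which pre-allocate the KV cache up front (vLLM does, via gpu_memory_utilization) will show the reservation rather than per-request growth.

import time
import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)   # GPU 0; adjust for multi-GPU setups

def used_gpu_memory_gb() -> float:
    # Total used memory on the device, in GB (weights + cache + workspace).
    return pynvml.nvmlDeviceGetMemoryInfo(handle).used / 1e9

# Sample while replaying requests at 1K, 4K, 8K, ... context lengths.
for _ in range(12):
    print(f"{used_gpu_memory_gb():.1f} GB used")
    time.sleep(5)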
KV cache quantization is one of the highest-ROI optimizations available. Minimal quality impact, significant memory savings, and trivial to enable. If you're not using it, you're leaving performance on the table.