Finding the KV Cache Problem Before Your Bill Does
Doctors don't guess—they run tests. A patient complaining of fatigue might have a hundred causes. Blood tests, imaging, and vitals narrow it down systematically.
Memory problems in LLM serving need the same diagnostic discipline. "It's running slow" could be model weights, KV cache, fragmentation, or something else entirely. Here's how to find the actual problem.
The Memory Breakdown
def memory_audit(gpu_memory_gb: float = 80):
"""
Where does memory actually go?
Example: 70B model on H100 with tensor parallelism 4
"""
breakdown = {
"model_weights": {
"total": 140, # 70B × 2 bytes FP16
"per_gpu": 35, # 140 / 4 GPUs
"status": "Fixed, predictable",
},
"kv_cache": {
"per_token_per_request": 0.0025, # 2.5 MB
"at_1k_context_1_request": 2.5,
"at_8k_context_10_requests": 200, # Often the killer
"status": "Variable, grows with usage",
},
"activations": {
"per_gpu_during_forward": 4, # Varies with batch
"status": "Temporary, freed after forward pass",
},
"cuda_context": {
"overhead": 1.5,
"status": "Fixed per GPU",
},
"allocator_overhead": {
"typical": 2,
"status": "Fragmentation and buffers",
},
}
    available = (
        gpu_memory_gb
        - breakdown["model_weights"]["per_gpu"]
        - breakdown["cuda_context"]["overhead"]
        - breakdown["allocator_overhead"]["typical"]
    )
    breakdown["available_for_kv_cache"] = available
    # 80 - 35 - 1.5 - 2 = 41.5 GB per GPU for KV cache
    # (activations still need a few GB of headroom at peak, so budget conservatively)
return breakdown
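Where does a per-token figure like 2.5 MB come from? A minimal sketch, assuming a 70B-class model with full multi-head attention (80 layers, hidden size 8192, FP16); models using grouped-query attention store far less per token.

def kv_bytes_per_token(
    num_layers: int = 80,     # assumption: 70B-class model
    hidden_size: int = 8192,  # assumption: full multi-head attention, K and V each store hidden_size values
    dtype_bytes: int = 2,     # FP16/BF16
) -> int:
    """KV cache per token = 2 (K and V) x layers x hidden size x bytes per value."""
    return 2 * num_layers * hidden_size * dtype_bytes

print(f"{kv_bytes_per_token() / 1e6:.2f} MB per token")  # ~2.62 MB, close to the 2.5 MB used above
# With grouped-query attention, replace hidden_size with num_kv_heads * head_dim.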
The Diagnostic Steps
class KVCacheDiagnosis:
"""
Step-by-step diagnosis
"""
step_1_baseline = """
1. Measure baseline memory (no requests)
nvidia-smi --query-gpu=memory.used --format=csv -l 1
Expected: model_weights + cuda_overhead + allocator
If higher: Something else consuming memory
"""
step_2_single_request = """
2. Send single request, measure growth
# Before request: X GB
# After request starts: X + KV_cache GB
# After request completes: Should return to ~X
If it doesn't return: Memory leak
"""
step_3_concurrent_load = """
3. Test concurrent requests
Send 10 concurrent requests with 2K context each.
Expected growth: 10 × 2K × cache_per_token
If growth >> expected: Something wrong
"""
step_4_long_context = """
4. Test long context specifically
Send 1 request with 16K context.
Monitor memory continuously.
Growth should be linear with tokens.
If it's faster: Possible duplication or leak
"""
Monitoring Commands
def monitoring_commands():
"""
Practical commands for diagnosis
"""
return {
"continuous_memory": {
"command": "nvidia-smi dmon -s m -d 1",
"output": "Memory utilization every second",
},
"detailed_breakdown": {
"command": "python -c 'import torch; print(torch.cuda.memory_stats())'",
"output": "Allocated, reserved, active memory",
},
"vllm_specific": {
"info": "Check vLLM logs for KV cache stats",
"look_for": [
"num_batched_tokens",
"num_running_reqs",
"gpu_cache_usage",
],
},
"custom_tracking": """
# Add to your serving code
import torch
def log_memory():
allocated = torch.cuda.memory_allocated() / 1e9
reserved = torch.cuda.memory_reserved() / 1e9
print(f"Allocated: {allocated:.2f}GB, Reserved: {reserved:.2f}GB")
""",
}
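The custom tracking above is more useful when it runs continuously. A minimal sketch, assuming it is started from inside the serving process: a daemon thread appends allocated/reserved to a CSV every second, so leaks (allocated climbing) and fragmentation (reserved pulling away from allocated) show up as trends rather than single snapshots.

import csv
import threading
import time

import torch

def start_memory_logger(path: str = "gpu_memory_log.csv", interval_s: float = 1.0):
    """Append allocated/reserved memory to a CSV once per interval (call from the serving process)."""
    def _loop():
        with open(path, "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(["timestamp", "allocated_gb", "reserved_gb"])
            while True:
                writer.writerow([
                    round(time.time(), 1),
                    round(torch.cuda.memory_allocated() / 1e9, 2),
                    round(torch.cuda.memory_reserved() / 1e9, 2),
                ])
                f.flush()
                time.sleep(interval_s)
    threading.Thread(target=_loop, daemon=True).start()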
Common Problems and Solutions
def common_issues():
return {
"memory_grows_continuously": {
"symptom": "Memory increases over time, never releases",
"likely_cause": "Memory leak or unbounded cache",
"diagnosis": "Track memory during idle period",
"solution": "Update framework, check for cache clearing",
},
"oom_with_few_requests": {
"symptom": "OOM crash with 5 concurrent requests",
"likely_cause": "Long context not accounted for",
"diagnosis": "Check max_tokens in requests",
"solution": "Lower max_num_seqs or max_model_len",
},
"fragmentation": {
"symptom": "nvidia-smi shows memory available, still OOM",
"likely_cause": "Memory fragmentation",
"diagnosis": "torch.cuda.memory_summary()",
"solution": "Restart worker, use paged allocation",
},
"baseline_too_high": {
"symptom": "Fresh start uses more memory than expected",
"likely_cause": "Multiple model copies or cached data",
"diagnosis": "Check for duplicate model loads",
"solution": "Verify tensor parallelism config",
},
}
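For the fragmentation case specifically, the number to watch is the gap between reserved and allocated memory. A small sketch using PyTorch's allocator counters:

import torch

def fragmentation_report() -> dict:
    """Reserved-but-unallocated memory held by PyTorch's caching allocator."""
    allocated = torch.cuda.memory_allocated() / 1e9
    reserved = torch.cuda.memory_reserved() / 1e9
    return {
        "allocated_gb": round(allocated, 2),
        "reserved_gb": round(reserved, 2),
        "held_by_allocator_gb": round(reserved - allocated, 2),  # a large, growing gap points to fragmentation
    }

torch.cuda.empty_cache() hands unused cached blocks back to the driver and is sometimes enough to avoid a restart, though it cannot defragment memory that is still allocated.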
The Calculation Worksheet
def kv_cache_calculator(
model_name: str,
context_length: int,
concurrent_requests: int,
gpu_memory_gb: float,
tensor_parallel: int,
) -> dict:
"""
Calculate if your config will fit
"""
# Model-specific KV cache sizes (approximate)
kv_per_token = {
"llama_7b": 0.0005, # GB
"llama_13b": 0.001,
"llama_70b": 0.0025,
"mixtral_8x7b": 0.002,
}
cache_per_token = kv_per_token.get(model_name, 0.001)
# Total KV cache needed
total_kv_cache = (
cache_per_token *
context_length *
concurrent_requests
)
# Memory per GPU (distributed)
kv_per_gpu = total_kv_cache / tensor_parallel
# Available memory (rough estimate)
model_sizes = {
"llama_7b": 14,
"llama_13b": 26,
"llama_70b": 140,
"mixtral_8x7b": 90,
}
model_per_gpu = model_sizes.get(model_name, 50) / tensor_parallel
overhead = 4 # CUDA + allocator
available = gpu_memory_gb - model_per_gpu - overhead
return {
"kv_cache_needed_per_gpu": kv_per_gpu,
"available_per_gpu": available,
"will_fit": kv_per_gpu < available,
"headroom_gb": available - kv_per_gpu,
"max_concurrent": int(available / (cache_per_token * context_length)),
}
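A quick sanity check with the same numbers as the memory audit above (70B model, 4×80 GB GPUs, 8K context, 10 concurrent requests):

result = kv_cache_calculator(
    model_name="llama_70b",
    context_length=8_000,
    concurrent_requests=10,
    gpu_memory_gb=80,
    tensor_parallel=4,
)
print(result)
# Needs ~50 GB of KV cache per GPU against ~41 GB available: will_fit is False,
# headroom is about -9 GB, and max_concurrent works out to 8 requests at 8K context.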
The Alert Thresholds
def recommended_alerts():
return {
"memory_utilization": {
"warning": 80, # %
"critical": 90,
"why": "Leave headroom for bursts",
},
"kv_cache_fragmentation": {
"metric": "reserved - allocated",
"warning": "2GB",
"action": "Consider worker restart",
},
"concurrent_requests": {
"track": "requests_in_flight vs max_concurrent",
"warning": "80% of max",
"why": "Approaching memory limit",
},
"context_length_spikes": {
"track": "max context_length over 5 minutes",
"warning": "2x average",
"why": "Long requests consume disproportionate memory",
},
}
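These thresholds are easy to wire into whatever alerting you already run. A minimal sketch; the input values are assumed to come from your own metrics pipeline (NVML for utilization, the allocator counters above, your server's request gauge). The context-length spike check needs a rolling window, so it is left out here.

def evaluate_alerts(
    memory_util_pct: float,
    allocated_gb: float,
    reserved_gb: float,
    requests_in_flight: int,
    max_concurrent: int,
) -> list:
    """Return alert messages based on the thresholds above."""
    alerts = []
    if memory_util_pct >= 90:
        alerts.append("CRITICAL: GPU memory above 90%")
    elif memory_util_pct >= 80:
        alerts.append("WARNING: GPU memory above 80%, little headroom for bursts")
    if reserved_gb - allocated_gb > 2:
        alerts.append("WARNING: >2 GB held by the allocator, consider a worker restart")
    if max_concurrent and requests_in_flight >= 0.8 * max_concurrent:
        alerts.append("WARNING: running at 80% of the concurrency limit")
    return alerts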
The KV cache is often the limiting factor in production. Knowing how to diagnose it means knowing when to add memory, when to reduce concurrency, and when the problem is actually something else entirely.