When to Move Data Off the GPU
Warehouses use tiered storage: fast-access shelves near shipping, slower racks in the back, cold storage offsite. Moving items between tiers costs time, but keeping everything in the fast tier is impossible.
GPU memory follows the same economics. Moving data to the CPU is slow, but keeping everything on the GPU is expensive. The question is what to move, and when.
The Offloading Decision
def should_offload(
    data_size_gb: float,       # GPU memory occupied if the data stays resident
    access_frequency: float,   # Times accessed per second
    transfer_time_ms: float,   # Time to move to/from GPU
    recompute_time_ms: float,  # Time to regenerate the data
) -> str:
    """
    Offload decision framework: keep on GPU, offload to CPU, or recompute.
    """
    # Option 1: Keep on GPU. Holds data_size_gb of memory but pays no
    # transfer cost -- this is the baseline the other options must beat.

    # Option 2: Offload to CPU, transfer back when needed
    transfer_cost_per_access = transfer_time_ms / 1000  # Seconds
    total_transfer_time = access_frequency * transfer_cost_per_access  # Seconds of transfer per second
    offload_viable = total_transfer_time < 0.1  # Less than 10% overhead

    # Option 3: Don't store it at all, recompute on demand
    recompute_viable = recompute_time_ms < transfer_time_ms

    if not offload_viable and not recompute_viable:
        return "keep_on_gpu"  # No good alternative
    elif recompute_viable:
        return "recompute"  # Faster to regenerate than to transfer
    else:
        return "offload_to_cpu"  # Transfer cost acceptable
What To Offload
class OffloadingCandidates:
    """
    Different data types have different offloading characteristics
    """

    good_candidates = {
        "inactive_kv_cache": {
            "description": "KV cache for paused/waiting requests",
            "access_pattern": "Infrequent (request is waiting)",
            "transfer_cost": "High (large data)",
            "benefit": "Frees GPU for active requests",
        },
        "inactive_model_layers": {
            "description": "Layers not currently executing (e.g. other pipeline-parallel stages)",
            "access_pattern": "Periodic (during a specific phase)",
            "transfer_cost": "Medium",
            "benefit": "Fit larger models on smaller GPUs",
        },
        "quantization_buffers": {
            "description": "Dequantization lookup tables",
            "access_pattern": "Sparse",
            "transfer_cost": "Low (small data)",
            "benefit": "Minor memory savings",
        },
    }

    bad_candidates = {
        "active_kv_cache": {
            "description": "KV cache for a currently generating request",
            "why_bad": "Accessed every token; transfer would kill latency",
        },
        "model_weights": {
            "description": "The actual model parameters",
            "why_bad": "Needed for every forward pass",
        },
        "current_activations": {
            "description": "Intermediate computation results",
            "why_bad": "Used immediately by the next layer",
        },
    }
The Transfer Cost Reality
def transfer_cost_analysis():
    """
    PCIe bandwidth limits what's practical
    """
    # PCIe Gen4 x16: ~25 GB/s theoretical, ~20 GB/s practical
    # PCIe Gen5 x16: ~50 GB/s theoretical, ~40 GB/s practical
    pcie_bandwidth_gb_per_s = 20  # Conservative Gen4 estimate

    def transfer_time_ms(data_gb: float) -> float:
        return (data_gb / pcie_bandwidth_gb_per_s) * 1000

    examples = {
        "1GB_kv_cache": transfer_time_ms(1),       # 50 ms
        "10GB_kv_cache": transfer_time_ms(10),     # 500 ms
        "40GB_model_shard": transfer_time_ms(40),  # 2000 ms
    }

    insight = """
    10 GB takes 500 ms to transfer.
    That's 500 ms of added latency per request if you offload its KV cache.

    Offloading only makes sense if:
    - The request will be paused longer than the transfer time anyway
    - The alternative is OOM (no choice)
    - It's batch processing, where latency doesn't matter
    """
    return examples, insight
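One way to soften that cost is to overlap transfers with computation. Here is a minimal sketch of the standard PyTorch pattern, assuming a CUDA device; the tensor shape, buffer names, and stream handling are illustrative, and a real engine would also guard against the source tensor being overwritten mid-copy:

import torch

kv_cache = torch.randn(1024, 1024, 128, device="cuda")  # Illustrative ~512 MB fp32 tensor
copy_stream = torch.cuda.Stream()                        # Side stream dedicated to transfers

# Pinned (page-locked) CPU memory is required for truly asynchronous device-to-host copies.
cpu_buffer = torch.empty(kv_cache.shape, dtype=kv_cache.dtype, pin_memory=True)

with torch.cuda.stream(copy_stream):
    # Runs on the side stream, so compute launched on the default stream can overlap with it.
    cpu_buffer.copy_(kv_cache, non_blocking=True)

# ... launch other work on the default stream here ...

copy_stream.synchronize()  # Make sure the copy finished before reading cpu_buffer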
Offloading Strategies
class OffloadingStrategies:
    """
    Different approaches for different scenarios
    """

    preemptive_offload = {
        "description": "Move data off the GPU before memory is actually needed",
        "when": "You can predict which requests will be paused",
        "implementation": "Track request priority, offload low priority first",
        "benefit": "Transfer overlaps with computation",
    }

    reactive_offload = {
        "description": "Move data off only when GPU memory gets tight",
        "when": "Memory pressure triggers the offload",
        "implementation": "LRU eviction to CPU",
        "benefit": "Simple, works for unpredictable workloads",
    }

    hybrid_offload = {
        "description": "Keep frequently accessed data on GPU, the rest on CPU",
        "when": "Mixed workload with varying access patterns",
        "implementation": "Track access frequency, tier accordingly (see the sketch after this block)",
        "benefit": "Balances latency and memory efficiency",
    }

    no_offload = {
        "description": "Just limit concurrency instead",
        "when": "Latency is critical, memory is secondary",
        "implementation": "Queue requests rather than offloading",
        "benefit": "Predictable latency",
    }
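To make the hybrid strategy concrete, here is a minimal, hypothetical access-frequency tracker; the class name, thresholds, and sliding window are illustrative choices, not part of any particular serving framework:

import time
from collections import defaultdict


class AccessTracker:
    """Tracks how often each cache entry is touched and suggests a memory tier."""

    def __init__(self, hot_threshold_per_s: float = 1.0, window_s: float = 30.0):
        self.hot_threshold_per_s = hot_threshold_per_s  # Access rate that counts as "hot"
        self.window_s = window_s                        # Sliding window for counting accesses
        self.access_times = defaultdict(list)           # request_id -> recent access timestamps

    def record_access(self, request_id: str) -> None:
        now = time.monotonic()
        recent = [t for t in self.access_times[request_id] if now - t <= self.window_s]
        recent.append(now)
        self.access_times[request_id] = recent

    def suggested_tier(self, request_id: str) -> str:
        rate = len(self.access_times[request_id]) / self.window_s
        return "gpu" if rate >= self.hot_threshold_per_s else "cpu"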
Implementation Pattern
import torch


class KVCacheOffloader:
    """
    Practical KV cache offloading: GPU is the hot tier, CPU RAM the cold tier.
    """

    def __init__(self, gpu_limit_gb: float, cpu_limit_gb: float):
        self.gpu_cache = {}    # request_id -> tensor (on GPU)
        self.cpu_cache = {}    # request_id -> tensor (on CPU)
        self.priorities = {}   # request_id -> priority (higher = keep on GPU longer)
        self.gpu_usage = 0.0
        self.gpu_limit = gpu_limit_gb
        self.cpu_limit = cpu_limit_gb  # Not enforced here; a real implementation should check it

    def store(self, request_id: str, kv_cache: torch.Tensor, priority: int):
        cache_size = kv_cache.numel() * kv_cache.element_size() / 1e9  # GB
        if self.gpu_usage + cache_size > self.gpu_limit:
            self._evict_to_cpu(cache_size)
        self.gpu_cache[request_id] = kv_cache
        self.priorities[request_id] = priority
        self.gpu_usage += cache_size

    def _sorted_by_priority(self):
        """GPU-resident request IDs, lowest priority first (evicted first)."""
        return sorted(self.gpu_cache, key=lambda rid: self.priorities.get(rid, 0))

    def _evict_to_cpu(self, needed_gb: float):
        """Evict lowest-priority caches to CPU until there is room."""
        for request_id in self._sorted_by_priority():
            if self.gpu_usage <= self.gpu_limit - needed_gb:
                break
            cache = self.gpu_cache.pop(request_id)
            cache_size = cache.numel() * cache.element_size() / 1e9
            self.gpu_usage -= cache_size
            # Note: non_blocking only overlaps if the destination is pinned memory;
            # into pageable CPU memory this copy is effectively synchronous.
            self.cpu_cache[request_id] = cache.to("cpu", non_blocking=True)

    def retrieve(self, request_id: str) -> torch.Tensor:
        if request_id in self.gpu_cache:
            return self.gpu_cache[request_id]
        if request_id in self.cpu_cache:
            # Move back to GPU and wait for the transfer before using it
            cache = self.cpu_cache.pop(request_id)
            gpu_cache = cache.to("cuda", non_blocking=True)
            torch.cuda.synchronize()
            # Re-register on the GPU tier (a real implementation would also
            # re-check the limit and evict here if necessary)
            self.gpu_cache[request_id] = gpu_cache
            self.gpu_usage += gpu_cache.numel() * gpu_cache.element_size() / 1e9
            return gpu_cache
        raise KeyError(f"Cache not found for {request_id}")
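Usage might look something like this; the request IDs, tensor shapes, and priorities are made up for illustration:

offloader = KVCacheOffloader(gpu_limit_gb=40.0, cpu_limit_gb=200.0)

# An active, high-priority request keeps its cache on the GPU.
offloader.store("req-123",
                torch.randn(32, 1024, 128, device="cuda", dtype=torch.float16),
                priority=10)

# A low-priority background request is the first to be evicted under memory pressure.
offloader.store("req-456",
                torch.randn(32, 1024, 128, device="cuda", dtype=torch.float16),
                priority=1)

# When the low-priority request resumes, its cache is pulled back onto the GPU.
kv = offloader.retrieve("req-456")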
The Decision Matrix
def offloading_decision_matrix():
    return {
        "latency_sensitive": {
            "strategy": "Don't offload; limit concurrency",
            "reason": "Transfer latency is unacceptable",
        },
        "batch_processing": {
            "strategy": "Aggressive offloading",
            "reason": "Throughput matters, not latency",
        },
        "mixed_priority_workload": {
            "strategy": "Offload low-priority request caches",
            "reason": "High-priority stays fast, low-priority waits",
        },
        "memory_constrained": {
            "strategy": "Offload everything possible",
            "reason": "The alternative is OOM or rejecting requests",
        },
    }
Offloading is a tool, not a solution. The right approach depends on your latency requirements, workload patterns, and hardware. Sometimes the best "offloading strategy" is simply running fewer concurrent requests.
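For completeness, that "no offload" route can be as simple as an admission gate in front of the engine. A minimal sketch with asyncio; the limit and the run_inference stub are placeholders for a real serving loop:

import asyncio

MAX_CONCURRENT_REQUESTS = 16  # Placeholder: sized so that all KV caches fit in GPU memory
admission = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)


async def run_inference(prompt: str) -> str:
    await asyncio.sleep(0.1)  # Stand-in for actual generation
    return f"response to: {prompt}"


async def handle_request(prompt: str) -> str:
    # Requests beyond the limit simply wait here, instead of forcing KV caches off the GPU.
    async with admission:
        return await run_inference(prompt)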