When to Move Data Off the GPU

Warehouses use tiered storage: fast-access shelves near shipping, slower racks in the back, cold storage offsite. Moving items between tiers costs time, but keeping everything in the fast tier is impossible.

GPU memory follows similar economics. Moving data to CPU is slow, but keeping everything on GPU is expensive. The question is what to move and when.

The Offloading Decision

def should_offload(
    data_size_gb: float,
    access_frequency: float,   # Times accessed per second
    transfer_time_ms: float,   # Time to move to/from GPU
    recompute_time_ms: float,  # Time to regenerate the data
) -> str:
    """
    Offload decision framework: keep on GPU, offload to CPU, or recompute.
    """
    # Option 1: Keep on GPU. Always works, but permanently ties up
    # data_size_gb of capacity. This is the fallback.

    # Option 2: Offload to CPU, transfer back when needed.
    # Viable if transfers consume less than 10% of wall-clock time.
    transfer_time_s = transfer_time_ms / 1000
    transfer_fraction = access_frequency * transfer_time_s
    offload_viable = transfer_fraction < 0.1

    # Option 3: Don't store it; regenerate it on demand.
    # Viable if recomputing is faster than a PCIe round trip.
    recompute_viable = recompute_time_ms < transfer_time_ms

    if recompute_viable:
        return "recompute"        # Faster to regenerate than to move
    elif offload_viable:
        return "offload_to_cpu"   # Transfer overhead is acceptable
    else:
        return "keep_on_gpu"      # No good alternative; pay for the memory

What To Offload

class OffloadingCandidates:
    """
    Different data types have different offloading characteristics
    """

    good_candidates = {
        "inactive_kv_cache": {
            "description": "KV cache for paused/waiting requests",
            "access_pattern": "Infrequent (request is waiting)",
            "transfer_cost": "High (large data)",
            "benefit": "Frees GPU for active requests",
        },
        "model_layers_for_small_models": {
            "description": "Unused layers during pipeline parallelism",
            "access_pattern": "Periodic (during specific phase)",
            "transfer_cost": "Medium",
            "benefit": "Fit larger models on smaller GPUs",
        },
        "quantization_buffers": {
            "description": "Dequantization lookup tables",
            "access_pattern": "Sparse",
            "transfer_cost": "Low (small data)",
            "benefit": "Minor memory savings",
        },
    }

    bad_candidates = {
        "active_kv_cache": {
            "description": "KV cache for currently generating request",
            "why_bad": "Accessed every token, transfer would kill latency",
        },
        "model_weights": {
            "description": "The actual model parameters",
            "why_bad": "Needed for every forward pass",
        },
        "current_activations": {
            "description": "Intermediate computation results",
            "why_bad": "Used immediately in next layer",
        },
    }
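
The "accessed every token" entry deserves a number. A back-of-envelope sketch, assuming roughly 20 GB/s of effective PCIe bandwidth and a ~25 ms decode step (both are assumptions, not measurements):

# Why active KV cache is a bad offloading candidate
kv_cache_gb = 1.0       # Cache for one long-context request
pcie_gb_per_s = 20      # Assumed effective PCIe bandwidth
decode_step_ms = 25     # Assumed time to generate one token

fetch_ms = kv_cache_gb / pcie_gb_per_s * 1000    # ~50 ms to pull it back
slowdown = (decode_step_ms + fetch_ms) / decode_step_ms
print(f"{slowdown:.0f}x slower per token")       # ~3x, paid on every token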

The Transfer Cost Reality

def transfer_cost_analysis():
    """
    PCIe bandwidth limits what's practical
    """
    # PCIe Gen4 x16: ~32 GB/s theoretical, ~25 GB/s in practice
    # PCIe Gen5 x16: ~64 GB/s theoretical, ~50 GB/s in practice

    pcie_bandwidth_gbps = 20  # GB/s; deliberately conservative

    def transfer_time_ms(data_gb: float) -> float:
        return (data_gb / pcie_bandwidth_gbps) * 1000

    examples = {
        "1GB_kv_cache": transfer_time_ms(1),    # 50ms
        "10GB_kv_cache": transfer_time_ms(10),  # 500ms
        "40GB_model_shard": transfer_time_ms(40),  # 2000ms
    }

    insight = """
    10GB takes 500ms to transfer.
    That's 500ms added latency per request if you offload KV cache.

    Offloading only makes sense if:
    - Request will be paused longer than transfer time anyway
    - Alternative is OOM (no choice)
    - Batch processing where latency doesn't matter
    """

    return examples, insight
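
The 20 GB/s figure above is a deliberately conservative constant. If you want the real number for your hardware, something like the following PyTorch sketch works; it assumes a CUDA GPU and uses a pinned 1 GB buffer (the buffer size and iteration count are arbitrary):

import time
import torch

def measure_d2h_bandwidth(size_gb: float = 1.0, iters: int = 5) -> float:
    """Measure device-to-host copy bandwidth in GB/s using pinned memory."""
    n = int(size_gb * 1e9 // 4)  # Number of float32 elements
    src = torch.empty(n, dtype=torch.float32, device='cuda')
    dst = torch.empty(n, dtype=torch.float32, pin_memory=True)

    torch.cuda.synchronize()
    start = time.perf_counter()
    for _ in range(iters):
        dst.copy_(src, non_blocking=True)
    torch.cuda.synchronize()
    elapsed = time.perf_counter() - start

    return size_gb * iters / elapsed

print(f"D2H bandwidth: {measure_d2h_bandwidth():.1f} GB/s")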

Offloading Strategies

class OffloadingStrategies:
    """
    Different approaches for different scenarios
    """

    preemptive_offload = {
        "description": "Move data before it's needed off",
        "when": "Predict which requests will be paused",
        "implementation": "Track request priority, offload low priority",
        "benefit": "Overlap transfer with computation",
    }

    reactive_offload = {
        "description": "Move data when GPU memory is tight",
        "when": "Memory pressure triggers offload",
        "implementation": "LRU eviction to CPU",
        "benefit": "Simple, works for unpredictable workloads",
    }

    hybrid_offload = {
        "description": "Keep frequently accessed on GPU, rest on CPU",
        "when": "Mixed workload with varying access patterns",
        "implementation": "Track access frequency, tier accordingly",
        "benefit": "Balances latency and memory efficiency",
    }

    no_offload = {
        "description": "Just limit concurrency instead",
        "when": "Latency is critical, memory is secondary",
        "implementation": "Queue requests instead of offloading",
        "benefit": "Predictable latency",
    }
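
The "overlaps transfer with computation" benefit of preemptive offload comes from issuing the copy on a separate CUDA stream while the default stream keeps computing. A minimal sketch of that pattern in PyTorch; offload_async is a hypothetical helper, not part of any serving framework:

import torch

copy_stream = torch.cuda.Stream()  # Dedicated stream for host transfers

def offload_async(gpu_tensor: torch.Tensor) -> torch.Tensor:
    """Start a GPU->CPU copy that overlaps with work on the default stream."""
    host_buf = torch.empty(gpu_tensor.shape, dtype=gpu_tensor.dtype,
                           pin_memory=True)
    # Make the copy stream wait for whatever produced gpu_tensor
    copy_stream.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(copy_stream):
        host_buf.copy_(gpu_tensor, non_blocking=True)
    return host_buf

# Typical usage:
#   host_copy = offload_async(low_priority_kv_cache)   # Copy starts
#   ... keep running forward passes for active requests ...
#   copy_stream.synchronize()  # Only now is the GPU tensor safe to free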

Implementation Pattern

import torch


class KVCacheOffloader:
    """
    Practical KV cache offloading: keep active caches on GPU,
    spill the lowest-priority ones to CPU under memory pressure.
    """
    def __init__(self, gpu_limit_gb: float, cpu_limit_gb: float):
        self.gpu_cache = {}   # request_id -> tensor on GPU
        self.cpu_cache = {}   # request_id -> tensor on CPU
        self.priorities = {}  # request_id -> priority (lower evicts first)
        self.gpu_usage = 0.0
        self.gpu_limit = gpu_limit_gb
        self.cpu_limit = cpu_limit_gb  # Bound for cpu_cache (not enforced here)

    def store(self, request_id: str, kv_cache: torch.Tensor, priority: int):
        cache_size = kv_cache.numel() * kv_cache.element_size() / 1e9

        if self.gpu_usage + cache_size > self.gpu_limit:
            self._evict_to_cpu(cache_size)

        self.gpu_cache[request_id] = kv_cache
        self.priorities[request_id] = priority
        self.gpu_usage += cache_size

    def _evict_to_cpu(self, needed_gb: float):
        """Evict lowest-priority caches to CPU until needed_gb is free"""
        for request_id in sorted(self.gpu_cache, key=lambda r: self.priorities[r]):
            if self.gpu_usage <= self.gpu_limit - needed_gb:
                break

            cache = self.gpu_cache.pop(request_id)
            cache_size = cache.numel() * cache.element_size() / 1e9
            self.gpu_usage -= cache_size

            # Move to CPU. non_blocking only overlaps with compute when the
            # destination is pinned memory; with pageable memory it blocks.
            self.cpu_cache[request_id] = cache.to('cpu', non_blocking=True)

    def retrieve(self, request_id: str) -> torch.Tensor:
        if request_id in self.gpu_cache:
            return self.gpu_cache[request_id]

        if request_id in self.cpu_cache:
            # Move back to GPU, evicting others if memory is still tight
            cache = self.cpu_cache.pop(request_id)
            cache_size = cache.numel() * cache.element_size() / 1e9
            if self.gpu_usage + cache_size > self.gpu_limit:
                self._evict_to_cpu(cache_size)

            gpu_cache = cache.to('cuda', non_blocking=True)
            torch.cuda.synchronize()  # Wait for the transfer to finish
            self.gpu_cache[request_id] = gpu_cache
            self.gpu_usage += cache_size
            return gpu_cache

        raise KeyError(f"Cache not found for {request_id}")
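
A usage sketch for the offloader above. The shapes, priorities, and 1.5 GB budget are made up for illustration, and a CUDA device is assumed:

offloader = KVCacheOffloader(gpu_limit_gb=1.5, cpu_limit_gb=16.0)

# Two ~0.5 GB fp32 caches; the batch job gets the lowest priority
interactive_kv = torch.zeros(128, 1024, 1024, device='cuda')
batch_kv = torch.zeros(128, 1024, 1024, device='cuda')
offloader.store("interactive-1", interactive_kv, priority=10)
offloader.store("batch-7", batch_kv, priority=1)

# A third cache overflows the 1.5 GB budget, so "batch-7" is spilled to CPU
offloader.store("interactive-2",
                torch.zeros(128, 1024, 1024, device='cuda'), priority=10)

kv = offloader.retrieve("batch-7")  # Pays the PCIe transfer on the way back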

The Decision Matrix

def offloading_decision_matrix():
    return {
        "high_latency_sensitive": {
            "strategy": "Don't offload, limit concurrency",
            "reason": "Transfer latency unacceptable",
        },
        "batch_processing": {
            "strategy": "Aggressive offloading",
            "reason": "Throughput matters, not latency",
        },
        "mixed_priority_workload": {
            "strategy": "Offload low-priority request caches",
            "reason": "High-priority stays fast, low-priority waits",
        },
        "memory_constrained": {
            "strategy": "Offload everything possible",
            "reason": "Alternative is OOM or rejecting requests",
        },
    }

Offloading is a tool, not a solution. The right approach depends on your latency requirements, workload patterns, and hardware. Sometimes the best "offloading strategy" is simply running fewer concurrent requests.