The Formula for Offloading Decisions

Warehouses balance storage cost against retrieval time. Store rarely-needed items in cheap offsite storage, accepting longer retrieval. Store frequently-needed items nearby, paying higher rent. The decision is economic: access frequency times retrieval cost versus storage cost.

GPU memory offloading follows the same economics. Move data to CPU RAM or disk, accepting transfer latency. Keep data on GPU, paying with precious GPU memory. The decision is mathematical: transfer cost versus recompute cost versus memory value.

The Basic Formula

def offload_decision_formula():
    return {
        "formula": """
            Offload if: transfer_time < recompute_time + memory_pressure_cost

            Where:
            - transfer_time = data_size / transfer_bandwidth
            - recompute_time = flops_to_recompute / gpu_flops
            - memory_pressure_cost = opportunity cost of occupied memory
        """,

        "simplified": """
            For most cases, ignore memory_pressure_cost initially:

            Offload if: transfer_time < recompute_time

            data_size / bandwidth < flops / gpu_speed
        """,

        "example": """
            Offload 1GB of KV cache?

            Transfer: 1GB / 32 GB/s (PCIe 4) = 31ms
            Recompute: Depends on what the KV cache represents

            For KV cache: Recomputing means re-running all prior tokens
            Recompute cost >> Transfer cost for long contexts

            Decision: Offload is worth it
        """,
    }
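
To put numbers on it, here is a minimal sketch of the simplified rule applied to the 1GB KV cache example. The bandwidth and FLOPS figures are illustrative assumptions (PCIe 4 and roughly A100-class throughput), not measurements.

def should_offload(data_size_gb: float,
                   recompute_flops: float,
                   transfer_bandwidth_gbs: float = 32.0,   # assumed PCIe 4 x16
                   gpu_flops: float = 300e12) -> bool:      # assumed ~300 TFLOPS effective
    """Return True if transferring the data back is cheaper than recomputing it."""
    transfer_time = data_size_gb / transfer_bandwidth_gbs
    recompute_time = recompute_flops / gpu_flops
    return transfer_time < recompute_time

# 1GB of KV cache whose recomputation would cost ~50 TFLOPs
# (roughly re-running a few thousand tokens through a 7B model)
print(should_offload(1.0, recompute_flops=50e12))  # ~31ms transfer vs ~167ms recompute -> True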

Transfer Bandwidth Reality

def transfer_bandwidths():
    return {
        "gpu_to_cpu": {
            "pcie_3_x16": "~16 GB/s",
            "pcie_4_x16": "~32 GB/s",
            "pcie_5_x16": "~64 GB/s",
            "nvlink_to_cpu": "~50-100 GB/s",
        },

        "cpu_to_disk": {
            "nvme_ssd": "~3-7 GB/s",
            "sata_ssd": "~0.5 GB/s",
            "ram_disk": "~20 GB/s",
        },

        "gpu_to_gpu": {
            "nvlink_same_node": "~300-600 GB/s",
            "pcie_same_node": "~32 GB/s",
            "network_cross_node": "~10-50 GB/s",
        },

        "practical_implications": """
            PCIe is one to two orders of magnitude slower than GPU memory
            bandwidth (~32 GB/s vs ~1-3 TB/s of HBM).
            Offloading has real latency cost.

            1GB offload:
            - PCIe 4: 31ms
            - NVMe: 140-330ms

            These are significant for interactive inference.
        """,
    }
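
The numbers above are nominal; real links often deliver less. A quick sketch for measuring what your own PCIe link actually sustains, using a pinned buffer (the 1 GiB size is an arbitrary choice):

import time
import torch

def measure_pcie_bandwidth(size_bytes: int = 1 << 30) -> dict:
    """Time H2D and D2H copies of a pinned buffer and report GB/s."""
    host = torch.empty(size_bytes, dtype=torch.uint8, pin_memory=True)
    device = torch.empty(size_bytes, dtype=torch.uint8, device="cuda")

    torch.cuda.synchronize()
    t0 = time.perf_counter()
    device.copy_(host, non_blocking=True)
    torch.cuda.synchronize()
    h2d = size_bytes / (time.perf_counter() - t0) / 1e9

    t0 = time.perf_counter()
    host.copy_(device, non_blocking=True)
    torch.cuda.synchronize()
    d2h = size_bytes / (time.perf_counter() - t0) / 1e9

    return {"host_to_device_GBps": h2d, "device_to_host_GBps": d2h}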

What to Offload

def what_to_offload():
    return {
        "good_candidates": {
            "kv_cache_old_tokens": {
                "why": "Old tokens rarely accessed until needed",
                "access_pattern": "Read once when that position is attended",
                "offload_to": "CPU RAM (fast enough for decode)",
            },
            "model_weights_unused_layers": {
                "why": "Layer weights only needed when layer runs",
                "access_pattern": "Sequential, predictable",
                "offload_to": "CPU RAM, prefetch next layer",
            },
            "activations_for_gradient": {
                "why": "Only needed during backward pass",
                "access_pattern": "Write once, read once",
                "offload_to": "CPU RAM (training only)",
            },
        },

        "bad_candidates": {
            "current_layer_weights": {
                "why": "Needed right now for computation",
                "result": "Would stall GPU waiting for transfer",
            },
            "recent_kv_cache": {
                "why": "Likely to be accessed in next few tokens",
                "result": "Transfer overhead exceeds benefit",
            },
            "frequently_accessed_data": {
                "why": "Transfer cost paid repeatedly",
                "result": "Faster to keep on GPU",
            },
        },
    }
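
The "frequently accessed" case is really about how often the transfer cost gets re-paid. A rough break-even sketch, assuming a PCIe 4 link and that each access means pulling the data back to the GPU:

def offload_break_even(data_size_gb: float,
                       accesses_per_second: float,
                       transfer_bandwidth_gbs: float = 32.0) -> float:
    """Fraction of wall-clock time spent re-fetching offloaded data.

    Near 0.0: transfers are negligible, offloading is cheap.
    Near 1.0 or above: the link is saturated just moving this data back.
    """
    seconds_per_fetch = data_size_gb / transfer_bandwidth_gbs
    return accesses_per_second * seconds_per_fetch

print(offload_break_even(1.0, accesses_per_second=0.1))   # ~0.003 -> offload freely
print(offload_break_even(1.0, accesses_per_second=50.0))  # ~1.56  -> keep it on the GPU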

KV Cache Offloading

import torch


class KVCacheOffloader:
    """
    Offload old KV cache entries to CPU, keeping recent tokens on GPU.

    Cache tensors are assumed to be [batch, heads, seq_len, head_dim],
    so the sequence dimension is dim=2.
    """

    def __init__(self, config: dict):
        self.keep_on_gpu = config["keep_tokens"]  # Recent tokens to keep on GPU
        self.cpu_cache = {}  # layer_id -> CPU tensor of offloaded (old) tokens
        self.gpu_cache = {}  # layer_id -> GPU tensor of recent tokens

    def manage_cache(self, layer_id: int, new_kv: torch.Tensor):
        """Append this step's KV entries; spill the oldest tokens to CPU when over budget."""
        if layer_id in self.gpu_cache:
            full = torch.cat([self.gpu_cache[layer_id], new_kv], dim=2)
        else:
            full = new_kv

        total_tokens = full.shape[2]
        if total_tokens <= self.keep_on_gpu:
            # Everything still fits on GPU
            self.gpu_cache[layer_id] = full
            return

        # Split: oldest tokens go to CPU, the most recent keep_on_gpu stay on GPU
        old_count = total_tokens - self.keep_on_gpu
        old_kv = full[:, :, :old_count, :].contiguous()
        recent_kv = full[:, :, old_count:, :]

        # Transfer to CPU (truly async only if the destination is pinned;
        # with pageable memory the copy is effectively synchronous)
        old_cpu = old_kv.to("cpu", non_blocking=True)
        if layer_id in self.cpu_cache:
            self.cpu_cache[layer_id] = torch.cat([self.cpu_cache[layer_id], old_cpu], dim=2)
        else:
            self.cpu_cache[layer_id] = old_cpu
        self.gpu_cache[layer_id] = recent_kv

    def get_full_cache(self, layer_id: int) -> torch.Tensor:
        """Reconstruct the full cache on GPU when old positions must be attended."""
        gpu_part = self.gpu_cache[layer_id]
        if layer_id not in self.cpu_cache:
            return gpu_part
        cpu_part = self.cpu_cache[layer_id].to(gpu_part.device, non_blocking=True)
        torch.cuda.synchronize()  # Ensure the host-to-device copy finished before use
        return torch.cat([cpu_part, gpu_part], dim=2)
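
A quick usage sketch for a single layer during decode; the shapes and the 2-token GPU budget are invented purely to exercise the spill path:

offloader = KVCacheOffloader({"keep_tokens": 2})

for _ in range(3):  # three decode steps, one new token each: [batch=1, heads=8, 1, head_dim=64]
    step_kv = torch.randn(1, 8, 1, 64, device="cuda")
    offloader.manage_cache(layer_id=0, new_kv=step_kv)

full_kv = offloader.get_full_cache(layer_id=0)  # reassembled on GPU only when old tokens matter
print(full_kv.shape)  # torch.Size([1, 8, 3, 64]): 1 token came back from CPU, 2 stayed on GPU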

Layer-wise Offloading

def layer_offloading():
    """
    Offload model layers when not in use
    """
    return {
        "concept": """
            Only keep current layer(s) on GPU.
            Prefetch next layer while current runs.
            Offload previous layer when done.
        """,

        "implementation": """
            class LayerOffloader:
                def __init__(self, model, gpu_layers=2):
                    self.model = model
                    self.gpu_layers = gpu_layers  # Layers on GPU at once

                def forward(self, x):
                    # Prefetch the initial window of layers
                    for j in range(self.gpu_layers):
                        self.prefetch_layer(j)

                    for i, layer in enumerate(self.model.layers):
                        # Wait for this layer to be on GPU
                        self.wait_for_layer(i)

                        # Run layer
                        x = layer(x)

                        # Start prefetch of next layer(s)
                        if i + self.gpu_layers < len(self.model.layers):
                            self.prefetch_layer(i + self.gpu_layers)

                        # Offload layer no longer needed
                        if i >= self.gpu_layers:
                            self.offload_layer(i - self.gpu_layers)

                    return x
        """,

        "overlap_efficiency": """
            Key: Prefetch overlaps with compute.

            If transfer_time < layer_compute_time:
                Zero visible overhead (fully hidden)

            If transfer_time > layer_compute_time:
                Overhead = transfer_time - layer_compute_time
        """,
    }
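
The prefetch, wait, and offload helpers above are left abstract. A minimal concrete sketch using a dedicated transfer stream, pinned CPU weights, and per-layer events might look like the following; it assumes the model exposes a .layers list of modules and is used for inference only.

import torch

class SimpleLayerOffloader:
    """Minimal sketch: sliding window of layers on GPU, prefetched on a side stream."""

    def __init__(self, model, gpu_layers: int = 2):
        self.model = model
        self.gpu_layers = gpu_layers
        self.transfer_stream = torch.cuda.Stream()
        self.events = {}  # layer index -> event that fires once its weights are on GPU
        for layer in self.model.layers:
            layer.to("cpu")
            for p in layer.parameters():
                p.data = p.data.pin_memory()  # pinned so non_blocking copies can overlap compute

    def _prefetch_layer(self, idx: int):
        with torch.cuda.stream(self.transfer_stream):
            self.model.layers[idx].to("cuda", non_blocking=True)
            self.events[idx] = torch.cuda.Event()
            self.events[idx].record(self.transfer_stream)

    def _offload_layer(self, idx: int):
        # Move weights back to CPU; a production version would keep a persistent
        # pinned CPU master copy and just free the GPU tensors instead of copying back
        layer = self.model.layers[idx]
        layer.to("cpu")
        for p in layer.parameters():
            p.data = p.data.pin_memory()

    @torch.no_grad()
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        n = len(self.model.layers)
        for j in range(min(self.gpu_layers, n)):
            self._prefetch_layer(j)

        for i, layer in enumerate(self.model.layers):
            # Compute waits only for this layer's transfer, not the whole transfer stream
            torch.cuda.current_stream().wait_event(self.events[i])
            x = layer(x)

            if i + self.gpu_layers < n:
                self._prefetch_layer(i + self.gpu_layers)
            self._offload_layer(i)  # done with this layer for this forward pass

        return x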

Async Transfer Patterns

def async_transfer():
    return {
        "cuda_streams": """
            # Use separate stream for data transfer
            compute_stream = torch.cuda.Stream()
            transfer_stream = torch.cuda.Stream()

            with torch.cuda.stream(transfer_stream):
                # Start async transfer
                data_gpu = data_cpu.to('cuda', non_blocking=True)

            with torch.cuda.stream(compute_stream):
                # Do compute while transfer happens
                result = model(other_data)

            # Synchronize when transfer needed
            transfer_stream.synchronize()
            # Now data_gpu is ready
        """,

        "double_buffering": """
            # Two buffers: one computing, one transferring
            buffer_a = torch.empty(size, device='cuda')
            buffer_b = torch.empty(size, device='cuda')

            for i, batch in enumerate(data):
                if i % 2 == 0:
                    compute_buffer, transfer_buffer = buffer_a, buffer_b
                else:
                    compute_buffer, transfer_buffer = buffer_b, buffer_a

                # Compute on compute_buffer
                # Transfer into transfer_buffer
                # Overlap hides latency
        """,
    }
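
Both snippets gloss over two details: the host buffer must be pinned for non_blocking to actually overlap with compute, and a device-side event wait avoids blocking the host the way stream.synchronize() does. A self-contained sketch (buffer and matmul sizes are arbitrary):

import torch

def overlapped_transfer_demo():
    """Overlap a host-to-device copy with compute on the default stream."""
    transfer_stream = torch.cuda.Stream()

    # Pinned source buffer: required for the copy to be truly asynchronous
    data_cpu = torch.randn(16 * 1024 * 1024, pin_memory=True)  # 64 MB of float32
    other = torch.randn(4096, 4096, device="cuda")

    with torch.cuda.stream(transfer_stream):
        data_gpu = data_cpu.to("cuda", non_blocking=True)
    copy_done = torch.cuda.Event()
    copy_done.record(transfer_stream)

    # Matmul runs on the default stream while the copy is in flight
    result = other @ other

    # Device-side dependency: no host blocking, unlike transfer_stream.synchronize()
    torch.cuda.current_stream().wait_event(copy_done)
    return result, data_gpu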

Decision Matrix

def offload_decision_matrix():
    return {
        "matrix": """
            Data Type       | Access Freq | Size  | Offload? | To Where
            ----------------|-------------|-------|----------|--------------------
            Model weights   | Every layer | Large | Maybe    | CPU, prefetch
            Recent KV cache | High        | Med   | No       | Stay on GPU
            Old KV cache    | Low         | Large | Yes      | CPU RAM
            Activations     | Once        | Med   | Yes      | CPU RAM (training)
            Optimizer state | Once/step   | Large | Yes      | CPU RAM (training)
        """,

        "simple_rule": """
            Memory pressure low? Don't offload, complexity not worth it.
            Memory pressure high + data accessed rarely? Offload.
            Memory pressure high + data accessed often? Need bigger GPU.
        """,

        "benchmarking": """
            Always benchmark your specific workload:

            1. Measure baseline (no offloading)
            2. Add offloading for candidate data
            3. Compare throughput and latency
            4. If improvement, keep. If not, revert.

            Theory says when to offload.
            Benchmarks confirm it works.
        """,
    }
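
A minimal harness for steps 1-3 of that checklist might look like this; run_inference and requests are placeholders for your own serving function and workload:

import time
import torch

def benchmark(run_inference, requests, label=""):
    """Throughput and median latency for one configuration of the same workload."""
    torch.cuda.synchronize()
    latencies = []
    start = time.perf_counter()
    for req in requests:
        t0 = time.perf_counter()
        run_inference(req)
        torch.cuda.synchronize()  # count the full GPU time, not just kernel launch
        latencies.append(time.perf_counter() - t0)
    total = time.perf_counter() - start
    p50 = sorted(latencies)[len(latencies) // 2]
    print(f"{label}: {len(requests) / total:.2f} req/s, p50 latency {p50 * 1000:.1f} ms")

# benchmark(baseline_step, requests, "no offloading")       # hypothetical callables
# benchmark(offloaded_step, requests, "KV cache offloaded")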

The offload decision is economic. If transfer cost is less than the value of freed memory (measured by what else you could do with it), offload. If transfer cost exceeds the benefit, keep the data on GPU. Run the numbers for your specific workload.