Finding Memory Leaks in LLM Serving

A slow leak in a pipe wastes gallons before you notice the water bill. By the time the ceiling stains, you've lost hundreds of dollars. Finding the leak early means looking before there's visible damage.

GPU memory leaks work the same way. Usage grows request by request, invisible until an out-of-memory (OOM) crash. Finding them requires profiling while things still look fine, not after they're already broken.

The Symptoms

def memory_leak_symptoms():
    return {
        "obvious": [
            "OOM after N hours of serving",
            "nvidia-smi shows memory creeping up",
            "Performance degrades over time",
        ],
        "subtle": [
            "Occasional OOM under normal load",
            "Memory doesn't return to baseline after traffic spike",
            "Inconsistent behavior between restarts",
        ],
        "not_actually_leaks": [
            "KV cache growth with active requests (normal)",
            "Memory fragmentation (not a leak, but similar effect)",
            "Batch size variation causing memory fluctuation",
        ],
    }
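
One way to separate a real leak from the "not actually leaks" above is to look at the trend rather than individual readings. A minimal sketch, assuming you can sample memory periodically while the server handles traffic (the sampling interval and the 50 MB/hour threshold are arbitrary choices):

import time
import torch

def sample_allocated_bytes(num_samples: int = 60, interval_s: float = 5.0) -> list:
    """Sample torch.cuda.memory_allocated() periodically while the server handles traffic."""
    samples = []
    for _ in range(num_samples):
        samples.append(torch.cuda.memory_allocated())
        time.sleep(interval_s)
    return samples

def memory_trend(samples_bytes: list, interval_s: float = 5.0) -> dict:
    """Least-squares slope of memory over time: sustained growth suggests a leak."""
    n = len(samples_bytes)
    if n < 2:
        return {"slope_mb_per_hour": 0.0, "suspicious": False}
    xs = [i * interval_s for i in range(n)]
    mean_x, mean_y = sum(xs) / n, sum(samples_bytes) / n
    cov = sum((x - mean_x) * (y - mean_y) for x, y in zip(xs, samples_bytes))
    var = sum((x - mean_x) ** 2 for x in xs)
    mb_per_hour = (cov / var) * 3600 / 1e6 if var else 0.0
    return {"slope_mb_per_hour": mb_per_hour, "suspicious": mb_per_hour > 50}  # arbitrary threshold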

The Diagnostic Commands

def memory_diagnostics():
    return {
        "basic_monitoring": {
            "nvidia_smi": """
            # Continuous monitoring
            nvidia-smi -l 1

            # Log to file
            nvidia-smi --query-gpu=timestamp,memory.used,memory.free --format=csv -l 5 > memory_log.csv
            """,
        },
        "pytorch_stats": {
            "current_state": """
            import torch

            print(f"Allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
            print(f"Reserved: {torch.cuda.memory_reserved() / 1e9:.2f} GB")
            print(f"Max allocated: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB")
            """,
            "detailed_stats": """
            stats = torch.cuda.memory_stats()
            print(f"Active: {stats['active_bytes.all.current'] / 1e9:.2f} GB")
            print(f"Inactive: {stats['inactive_split_bytes.all.current'] / 1e9:.2f} GB")
            """,
        },
        "memory_snapshot": {
            "capture": """
            # Capture detailed allocation history
            torch.cuda.memory._record_memory_history()

            # ... run your workload ...

            torch.cuda.memory._dump_snapshot("memory_snapshot.pickle")
            """,
            "analyze": "Use torch.cuda.memory._snapshot() for detailed breakdown",
        },
    }
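
To make the snapshot workflow repeatable, it can be wrapped in a context manager. A minimal sketch built on PyTorch's private _record_memory_history / _dump_snapshot hooks (the output path is an assumption, and the private API can shift between versions):

import contextlib
import torch

@contextlib.contextmanager
def memory_snapshot(path: str = "memory_snapshot.pickle"):
    """Record CUDA allocation history around a block and dump it to `path`."""
    torch.cuda.memory._record_memory_history()          # start recording allocations
    try:
        yield
    finally:
        torch.cuda.memory._dump_snapshot(path)          # write the pickle for analysis
        torch.cuda.memory._record_memory_history(enabled=None)  # stop recording

# Usage:
# with memory_snapshot("after_load_test.pickle"):
#     ...  # run the suspect workload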

Common Leak Sources

def common_leak_sources():
    return {
        "reference_holding": {
            "symptom": "Tensors not freed after request completes",
            "cause": "References kept in global structures",
            "check": """
            # After request completes, force garbage collection
            import gc
            gc.collect()
            torch.cuda.empty_cache()
            # If memory doesn't drop, something is holding references
            """,
        },
        "kv_cache_not_freed": {
            "symptom": "Memory grows with each request",
            "cause": "KV cache not released when request completes",
            "check": "Verify cache cleanup in request lifecycle",
        },
        "cuda_graph_accumulation": {
            "symptom": "Memory grows as new input shapes are seen",
            "cause": "CUDA graphs cached for each shape",
            "fix": "Limit shape variations or disable CUDA graphs",
        },
        "profiler_artifacts": {
            "symptom": "Memory grows when profiling is enabled",
            "cause": "Profiler storing history",
            "fix": "Disable profiling in production",
        },
        "logging_large_tensors": {
            "symptom": "Memory grows with log volume",
            "cause": "Accidentally logging tensor values",
            "fix": "Log shapes and dtypes, not values",
        },
    }
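
The first entry, reference holding, is the one I run into most. A minimal sketch of what it looks like and the fix (the cache and handler names are made up for illustration):

import torch

# Leaky pattern: a module-level dict keeps every request's logits alive,
# so the caching allocator can never hand that memory back.
_debug_cache: dict = {}

def handle_request_leaky(request_id: str, logits: torch.Tensor):
    _debug_cache[request_id] = logits        # GPU tensor referenced forever
    return logits.argmax(dim=-1)

# Fixed pattern: keep only small metadata, never the tensor itself.
def handle_request_fixed(request_id: str, logits: torch.Tensor):
    token_ids = logits.argmax(dim=-1)
    log_entry = {"request_id": request_id, "shape": tuple(logits.shape), "dtype": str(logits.dtype)}
    return token_ids, log_entry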

Isolation Testing

import torch


class MemoryLeakTester:
    """
    Systematically test for leaks.

    Subclass this and implement the three request hooks for your serving stack.
    """

    def __init__(self, model):
        self.model = model

    # Hooks to implement for the serving stack under test.
    def run_single_request(self):
        raise NotImplementedError("issue one request against the model")

    def run_concurrent_requests(self, n: int):
        raise NotImplementedError("issue n requests concurrently")

    def wait_for_completion(self):
        raise NotImplementedError("block until all in-flight requests finish")

    def baseline_memory(self) -> int:
        """Allocated bytes after flushing the CUDA caching allocator."""
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
        return torch.cuda.memory_allocated()
    def test_single_request_leak(self, num_requests: int = 100) -> dict:
        """Does memory grow with sequential requests?"""
        initial = self.baseline_memory()
        memory_samples = [initial]

        for i in range(num_requests):
            self.run_single_request()

            # Force cleanup
            torch.cuda.synchronize()
            torch.cuda.empty_cache()

            current = torch.cuda.memory_allocated()
            memory_samples.append(current)

        final = memory_samples[-1]
        growth = final - initial

        return {
            "initial_gb": initial / 1e9,
            "final_gb": final / 1e9,
            "growth_gb": growth / 1e9,
            "per_request_bytes": growth / num_requests if growth > 0 else 0,
            "verdict": "LEAK" if growth > 10_000_000 else "OK",  # 10MB threshold
        }

    def test_concurrent_requests_leak(self, cycles: int = 10) -> dict:
        """Does memory grow with concurrent request cycles?"""
        initial = self.baseline_memory()

        for cycle in range(cycles):
            # Simulate load
            self.run_concurrent_requests(n=10)

            # Complete all requests
            self.wait_for_completion()

            # Cleanup
            torch.cuda.synchronize()
            torch.cuda.empty_cache()

        final = self.baseline_memory()
        growth = final - initial

        return {
            "cycles": cycles,
            "growth_gb": growth / 1e9,
            "verdict": "LEAK" if growth > 50_000_000 else "OK",  # 50MB threshold
        }
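
A usage sketch, assuming a hypothetical subclass that wires the hooks to your serving client (model.generate and the sequential stand-in for concurrency are assumptions):

class MyServerTester(MemoryLeakTester):
    def run_single_request(self):
        self.model.generate("Hello", max_new_tokens=32)   # assumed generate() API

    def run_concurrent_requests(self, n: int):
        for _ in range(n):                                 # sequential stand-in for real concurrency
            self.run_single_request()

    def wait_for_completion(self):
        torch.cuda.synchronize()

# model = <your loaded model or engine handle>
tester = MyServerTester(model)
print(tester.test_single_request_leak(num_requests=100))
print(tester.test_concurrent_requests_leak(cycles=10))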

The Memory Lifecycle

def healthy_memory_lifecycle():
    return {
        "request_start": {
            "allocate": "KV cache for new request",
            "expected": "Memory increases by context × cache_per_token",
        },
        "during_generation": {
            "behavior": "KV cache grows as tokens generate",
            "expected": "Linear growth until max_tokens or completion",
        },
        "request_complete": {
            "cleanup": "Release KV cache for this request",
            "expected": "Memory returns to pre-request level",
        },
        "idle_state": {
            "behavior": "No active requests",
            "expected": "Memory stable at model weights + framework overhead",
        },
        "leak_indicator": {
            "symptom": "Idle memory higher than at startup",
            "action": "Investigate reference holding",
        },
    }
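
The "expected" increments above follow directly from KV cache geometry. A back-of-the-envelope sketch with hypothetical model dimensions (standard multi-head attention, fp16, no cache quantization):

def kv_cache_bytes(num_layers: int, num_kv_heads: int, head_dim: int,
                   seq_len: int, dtype_bytes: int = 2) -> int:
    """Per-sequence KV cache: 2 tensors (K and V) per layer, per head, per token."""
    return 2 * num_layers * num_kv_heads * head_dim * seq_len * dtype_bytes

# Hypothetical 7B-class model: 32 layers, 32 KV heads, head_dim 128, 4096-token context
print(f"{kv_cache_bytes(32, 32, 128, 4096) / 1e9:.1f} GB per request")  # ≈ 2.1 GB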

Prevention Strategies

def leak_prevention():
    return {
        "design_patterns": [
            "Explicit cleanup in request handler finally block",
            "Context managers for temporary allocations",
            "Weak references for caches that should be droppable",
        ],
        "operational": [
            "Periodic worker restarts (every N hours)",
            "Memory alerts at 80% threshold",
            "Automated profiling in staging",
        ],
        "monitoring": [
            "Track memory baseline after each restart",
            "Alert if baseline drifts upward",
            "Log memory stats per request",
        ],
        "testing": [
            "Memory leak tests in CI",
            "Long-running soak tests before release",
            "Compare memory profiles across versions",
        ],
    }
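
The first design pattern, explicit cleanup in a finally block, looks roughly like this in a request handler. The allocate/release calls are stand-ins for whatever your serving stack actually provides:

def handle_request(request, kv_cache_manager, generate_fn):
    """Release per-request cache blocks no matter how generation exits."""
    block_ids = kv_cache_manager.allocate(request)   # hypothetical API
    try:
        return generate_fn(request, block_ids)
    finally:
        # Runs on success, exceptions, and cancellation alike,
        # so the KV cache blocks never outlive the request.
        kv_cache_manager.release(block_ids)          # hypothetical API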

The Quick Check

def quick_leak_check():
    """
    Run this before investigating deeper
    """
    return """
    1. Note memory before any requests
    2. Run 100 sequential requests
    3. Wait 30 seconds, run gc.collect(), torch.cuda.empty_cache()
    4. Note memory after

    If after > before by more than 10MB:
        You probably have a leak

    If after ≈ before:
        Probably not a leak (might be fragmentation)

    This takes 5 minutes and catches most leaks.
    """

Memory leaks are slow-motion crashes. By the time you see the OOM, you've lost the context of what caused it. Profile early, profile regularly, and add memory assertions to your tests.