
The Performance Wins from Fusing Kernels

A chef preparing dinner doesn't wash their hands between every vegetable they cut. They batch the prep, then wash once. The handwashing is necessary, but doing it per-vegetable would make cooking take forever.

CUDA kernel launches work the same way. Each launch has overhead: dispatch latency, memory synchronization, scheduler work. Fusing multiple operations into one kernel eliminates that repeated overhead and, just as importantly, keeps intermediate results on-chip instead of writing them back to HBM between steps.

The Kernel Launch Problem

def kernel_launch_overhead():
    """
    Each kernel launch isn't free
    """
    return {
        "per_launch_cost": {
            "kernel_dispatch": "~5-10 microseconds",
            "memory_sync": "Variable, can be significant",
            "scheduler": "Queue management",
        },
        "example_unfused": {
            "operations": [
                "Load Q, K, V from memory",
                "Compute Q @ K^T",
                "Apply scaling",
                "Apply mask",
                "Softmax",
                "Dropout",
                "Multiply by V",
            ],
            "kernel_launches": 7,
            "memory_trips": "Each intermediate result written to HBM",
        },
        "example_fused": {
            "operations": ["Single fused attention kernel"],
            "kernel_launches": 1,
            "memory_trips": "Input and output only",
        },
        "savings": "6 kernel launches, 5 memory round trips",
    }
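To make the difference concrete, here is a minimal PyTorch sketch of the two paths above. The function names are illustrative, and it assumes PyTorch 2.x, where F.scaled_dot_product_attention dispatches to a fused (Flash-style) kernel when the backend supports it.

import math
import torch
import torch.nn.functional as F

def attention_unfused(q, k, v, mask=None, dropout_p=0.0):
    # Each line is at least one kernel launch, and every intermediate
    # tensor (scores, weights) is materialized in GPU memory.
    scores = q @ k.transpose(-2, -1)                       # Q @ K^T
    scores = scores / math.sqrt(q.size(-1))                # scaling
    if mask is not None:
        scores = scores.masked_fill(mask, float("-inf"))   # mask
    weights = torch.softmax(scores, dim=-1)                # softmax
    weights = F.dropout(weights, p=dropout_p)              # dropout
    return weights @ v                                     # multiply by V

def attention_fused(q, k, v, dropout_p=0.0):
    # One call, one (or very few) kernel launches; intermediates stay
    # on-chip and only the output is written back to HBM.
    return F.scaled_dot_product_attention(q, k, v, dropout_p=dropout_p)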

Where Fusion Helps Most

def fusion_opportunities():
    return {
        "qkv_projection": {
            "unfused": [
                "Q = input @ W_q",
                "K = input @ W_k",
                "V = input @ W_v",
            ],
            "fused": "QKV = input @ W_qkv (single larger matmul)",
            "speedup": "~1.2x (batch efficiency)",
        },
        "attention_block": {
            "unfused": [
                "scores = Q @ K^T",
                "scores = scores / sqrt(d)",
                "scores = mask(scores)",
                "weights = softmax(scores)",
                "output = weights @ V",
            ],
            "fused": "output = flash_attention(Q, K, V)",
            "speedup": "2-4x (Flash Attention)",
        },
        "mlp_block": {
            "unfused": [
                "x = linear1(input)",
                "x = gelu(x)",
                "x = linear2(x)",
            ],
            "fused": "Partial: gelu fused with linear1",
            "speedup": "~1.1-1.2x",
        },
        "layernorm_residual": {
            "unfused": [
                "residual = x + attention_out",
                "x = layernorm(residual)",
            ],
            "fused": "x = fused_layernorm_residual(x, attention_out)",
            "speedup": "~1.1x",
        },
    }
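The QKV projection entry is the easiest one to picture. A rough sketch, assuming the three projection weights share the same input dimension and can be concatenated along the output dimension:

import torch

def qkv_unfused(x, W_q, W_k, W_v):
    # Three separate matmuls: three kernel launches, three reads of x.
    return x @ W_q, x @ W_k, x @ W_v

def qkv_fused(x, W_qkv):
    # W_qkv is W_q, W_k, W_v concatenated along the output dimension,
    # e.g. W_qkv = torch.cat([W_q, W_k, W_v], dim=-1).
    # One larger matmul, then a cheap split of the result.
    return (x @ W_qkv).chunk(3, dim=-1)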

How to Get Fusion

def getting_fusion():
    """
    Most users don't write CUDA kernels. Here's how to benefit anyway.
    """
    return {
        "use_framework_defaults": {
            "description": "Modern frameworks fuse automatically",
            "vllm": "Fused attention, fused RoPE, etc.",
            "pytorch_compile": "torch.compile() finds fusion opportunities",
        },
        "torch_compile": {
            "description": "JIT compilation with automatic fusion",
            "code": """
            model = torch.compile(model, mode="reduce-overhead")
            # or mode="max-autotune" for more aggressive optimization
            """,
            "what_it_does": "Traces execution, finds fusion opportunities, generates optimized kernels",
        },
        "tensorrt": {
            "description": "NVIDIA's optimizer aggressively fuses",
            "tradeoff": "Slower build time, faster inference",
            "best_for": "Production with stable models",
        },
        "custom_kernels": {
            "description": "Hand-written for specific patterns",
            "examples": ["Flash Attention", "FasterTransformer kernels"],
            "when": "Framework fusion isn't enough",
        },
    }
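As a small illustration of what torch.compile does (the function below is illustrative), a chain of elementwise ops that eager PyTorch runs as separate kernels typically gets fused into a single generated kernel by TorchInductor:

import torch
import torch.nn.functional as F

def bias_gelu_residual(x, bias, residual):
    # In eager mode this is three elementwise ops, each its own kernel.
    return F.gelu(x + bias) + residual

# After compilation, TorchInductor typically emits one fused kernel for
# this pointwise chain; the first call pays the compilation cost.
bias_gelu_residual_fused = torch.compile(bias_gelu_residual)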

Measuring Fusion Impact

def measure_fusion():
    return {
        "profiling_approach": """
        # Use NVIDIA Nsight Systems to see kernel launches
        nsys profile python your_inference.py

        # Look for:
        # - Number of kernel launches
        # - Time between kernels (idle GPU)
        # - Memory transfer patterns
        """,

        "before_after_comparison": """
        # Without torch.compile
        model = load_model()
        time_unfused = benchmark(model)

        # With torch.compile
        model_compiled = torch.compile(model, mode="reduce-overhead")
        time_fused = benchmark(model_compiled)

        # First run is slower (compilation)
        # Subsequent runs show fusion benefits
        """,

        "what_to_look_for": [
            "Reduction in kernel count",
            "Reduction in memory transfer",
            "Increased compute utilization",
            "Reduced idle time between kernels",
        ],
    }
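The comparison above leaves benchmark() undefined. One way it could look, plus a torch.profiler helper for counting kernels in-process, is sketched below; both assume a CUDA device and are illustrative rather than a prescribed harness.

import time
import torch
from torch.profiler import profile, ProfilerActivity

def benchmark(model, inputs, warmup=10, iters=50):
    # Warm up first so torch.compile's one-time compilation isn't timed,
    # then measure steady-state latency with explicit synchronization.
    with torch.no_grad():
        for _ in range(warmup):
            model(inputs)
        torch.cuda.synchronize()
        start = time.perf_counter()
        for _ in range(iters):
            model(inputs)
        torch.cuda.synchronize()
    return (time.perf_counter() - start) / iters

def profile_kernels(fn, *args):
    # Complement to nsys: run once under torch.profiler and inspect the
    # per-kernel table; fewer rows after fusion means fewer launches.
    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
        fn(*args)
        torch.cuda.synchronize()
    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20))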

The Tradeoffs

def fusion_tradeoffs():
    return {
        "compilation_time": {
            "issue": "Fused kernels need compilation",
            "torch_compile": "Minutes on first run",
            "tensorrt": "Can be hours for large models",
            "mitigation": "Cache compiled models",
        },
        "debugging": {
            "issue": "Fused code harder to debug",
            "problem": "Stack traces point to generated code",
            "mitigation": "Debug unfused first, then enable fusion",
        },
        "flexibility": {
            "issue": "Fused kernels are specialized",
            "problem": "Changes may require recompilation",
            "mitigation": "Separate dev (unfused) and prod (fused) configs",
        },
        "memory": {
            "issue": "Some fusions trade memory for speed",
            "problem": "Might not help if memory-constrained",
            "mitigation": "Profile memory impact, not just speed",
        },
    }
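The debugging and flexibility mitigations often reduce to a single switch, along these lines (COMPILE_MODEL is an illustrative environment variable, not a standard one):

import os
import torch

def maybe_compile(model):
    # Develop and debug in eager mode; enable fusion only when the flag
    # is set, e.g. in the production deployment config.
    if os.environ.get("COMPILE_MODEL", "0") == "1":
        return torch.compile(model, mode="reduce-overhead")
    return model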

Practical Recommendations

def fusion_recommendations():
    return {
        "start_here": {
            "action": "Use vLLM or similar serving framework",
            "why": "Automatic fusion for common patterns",
            "effort": "Zero",
        },
        "easy_win": {
            "action": "Enable torch.compile for custom code",
            "code": "model = torch.compile(model)",
            "speedup": "10-30% typical",
        },
        "advanced": {
            "action": "Use TensorRT-LLM for maximum performance",
            "when": "Stable model, need every millisecond",
            "effort": "Days of work",
        },
        "avoid": {
            "action": "Writing custom CUDA kernels",
            "when": "Unless frameworks genuinely can't help",
            "why": "Maintenance burden is high",
        },
    }
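For the "start here" recommendation, a minimal vLLM example looks roughly like this (the model name and sampling settings are placeholders):

from vllm import LLM, SamplingParams

# vLLM ships fused attention, fused RoPE, and other fused kernels;
# there is nothing extra to enable.
llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct")
params = SamplingParams(temperature=0.7, max_tokens=128)
outputs = llm.generate(["Explain kernel fusion in one sentence."], params)
print(outputs[0].outputs[0].text)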

Kernel fusion is an optimization you get for free from good frameworks. Don't write fused kernels yourself unless you've exhausted the framework options. The maintenance cost of custom CUDA is high, and the frameworks keep getting better.