Taking PyTorch Models to Production
Test kitchens and production kitchens operate differently. A recipe that works for one dish fails when you need to make 200. Equipment, workflow, and preparation all change.
PyTorch models work the same way. The training code that produced your model is not the serving code that should run it. The gap between research code and production serving is typically a 3-5x difference in throughput.
The Performance Gap
def performance_gap_analysis():
    """
    Why raw PyTorch is slow for serving
    """
    return {
        "python_overhead": {
            "issue": "Python interpreter in the loop",
            "impact": "Microseconds per operation add up",
            "solution": "Compilation (TorchScript, torch.compile)",
        },
        "no_batching": {
            "issue": "Processing one request at a time",
            "impact": "GPU utilization of 10-20%",
            "solution": "Dynamic batching",
        },
        "naive_attention": {
            "issue": "Standard attention implementation",
            "impact": "Quadratic memory, no kernel fusion",
            "solution": "FlashAttention, xformers",
        },
        "no_kv_cache_management": {
            "issue": "Recreating the cache on each forward pass",
            "impact": "Per-token cost of O(n²) instead of O(n)",
            "solution": "Proper KV cache handling (timed in the sketch below)",
        },
        "eager_execution": {
            "issue": "Each operation launches a separate kernel",
            "impact": "Kernel launch overhead",
            "solution": "CUDA graphs, operator fusion",
        },
    }
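To make the no_kv_cache_management row concrete, here is a minimal timing sketch. It assumes transformers is installed and uses "gpt2" purely as a stand-in model; the prompt and token counts are illustrative.

# Minimal timing sketch: the cost of recomputing the KV cache every step.
# Assumes transformers is installed; "gpt2" is a placeholder model.
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"  # illustrative stand-in, not a recommendation
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).eval()

inputs = tokenizer("The quick brown fox", return_tensors="pt")

def timed_generate(use_cache: bool) -> float:
    start = time.perf_counter()
    with torch.no_grad():
        model.generate(
            **inputs,
            max_new_tokens=128,
            use_cache=use_cache,
            pad_token_id=tokenizer.eos_token_id,
        )
    return time.perf_counter() - start

print(f"with KV cache:    {timed_generate(True):.2f}s")
print(f"without KV cache: {timed_generate(False):.2f}s")

On any causal LM the second call should be dramatically slower, because every step re-runs attention over the full prefix instead of reusing cached keys and values.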
The Optimization Stack
class OptimizationStack:
    """
    Layers of optimization, from quick wins to deep optimization
    """

    level_1_easy = {
        "name": "Use a serving framework",
        "effort": "Minutes",
        "impact": "3-5x",
        "how": "Deploy with vLLM instead of raw PyTorch (client sketch after this class)",
        "code": """
            # Instead of:
            model = AutoModelForCausalLM.from_pretrained("...")
            outputs = model.generate(...)

            # Use:
            # vllm serve model_name --tensor-parallel-size 2
        """,
    }

    level_2_moderate = {
        "name": "torch.compile",
        "effort": "Hours",
        "impact": "1.5-2x",
        "how": "Compile the model for faster execution",
        "code": """
            model = torch.compile(model, mode="reduce-overhead")
            # or mode="max-autotune" for more aggressive optimization
        """,
    }

    level_3_involved = {
        "name": "Quantization",
        "effort": "Days",
        "impact": "2-4x memory, 1.5-2x speed",
        "how": "Convert weights to INT8 or INT4",
        "code": """
            # Using transformers
            model = AutoModelForCausalLM.from_pretrained(
                "...",
                load_in_8bit=True,  # or load_in_4bit=True
            )
        """,
    }

    level_4_advanced = {
        "name": "TensorRT-LLM",
        "effort": "Weeks",
        "impact": "Additional 10-20%",
        "how": "Convert to TensorRT for maximum speed",
        "when": "When the last 10% matters",
    }
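To round out level 1: once `vllm serve` is running it exposes an OpenAI-compatible HTTP API, so the client side can be as small as the sketch below. The port (8000 is the vLLM default), model name, and prompt are assumptions for illustration.

# Minimal client sketch against a local vLLM OpenAI-compatible server.
# Assumes `vllm serve ./my_model` is already running on the default port 8000.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")  # key is ignored locally

response = client.completions.create(
    model="./my_model",  # must match the name vLLM was launched with
    prompt="Summarize the benefits of dynamic batching:",
    max_tokens=128,
    temperature=0.7,
)
print(response.choices[0].text)

Because the server owns batching and the KV cache, concurrent callers share the GPU instead of queueing behind a single generate() call.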
Practical Migration Path
def migration_steps():
    return [
        {
            "step": 1,
            "action": "Export the model to a standard format",
            "code": """
                # Ensure the model is in HuggingFace format
                model.save_pretrained("./my_model")
                tokenizer.save_pretrained("./my_model")
            """,
        },
        {
            "step": 2,
            "action": "Test with a serving framework",
            "code": """
                # vLLM
                from vllm import LLM, SamplingParams

                llm = LLM(model="./my_model")
                outputs = llm.generate(prompts, SamplingParams())
            """,
        },
        {
            "step": 3,
            "action": "Benchmark and compare (see the sketch after this list)",
            "code": """
                # Compare:
                # - Tokens per second
                # - Latency (TTFT, P99)
                # - Memory usage
                # - Concurrent requests supported
            """,
        },
        {
            "step": 4,
            "action": "Add quantization if needed",
            "code": """
                # vLLM with quantization
                llm = LLM(
                    model="./my_model",
                    quantization="awq",  # or "gptq", "squeezellm"
                )
            """,
        },
        {
            "step": 5,
            "action": "Deploy with proper configuration",
            "code": """
                # Production vLLM config
                vllm serve ./my_model \\
                    --tensor-parallel-size 2 \\
                    --max-num-seqs 256 \\
                    --gpu-memory-utilization 0.9
            """,
        },
    ]
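For step 3, raw throughput can be measured with a few lines against vLLM's offline engine, as in the sketch below. The model path, prompt set, and output length are placeholders; latency percentiles (TTFT, P99) need a real load generator and are not covered here.

# Rough throughput benchmark sketch for the offline vLLM engine.
# Run the same prompts through the raw PyTorch baseline for a fair comparison.
import time
from vllm import LLM, SamplingParams

llm = LLM(model="./my_model")                              # placeholder path
prompts = ["Explain KV caching in one paragraph."] * 64    # simulate concurrent load
params = SamplingParams(max_tokens=128, temperature=0.0)   # greedy for repeatability

start = time.perf_counter()
outputs = llm.generate(prompts, params)
elapsed = time.perf_counter() - start

generated = sum(len(o.outputs[0].token_ids) for o in outputs)
print(f"{generated / elapsed:.1f} generated tokens/s over {len(prompts)} prompts")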
Common Migration Issues
def migration_issues():
    return {
        "custom_attention": {
            "problem": "Model has a custom attention implementation",
            "symptom": "Serving framework doesn't support it",
            "solution": "Port to standard attention or use raw PyTorch + torch.compile",
        },
        "custom_layers": {
            "problem": "Non-standard layers in the model",
            "symptom": "Framework can't optimize them",
            "solution": "Register custom ops or accept reduced optimization",
        },
        "different_outputs": {
            "problem": "Outputs differ between PyTorch and the serving framework",
            "symptom": "Quality regression after deployment",
            "solution": "Check that sampling parameters match exactly (see the mapping sketch below)",
        },
        "memory_mismatch": {
            "problem": "OOM in production but not in testing",
            "symptom": "Crash under load",
            "solution": "Test with production batch sizes and context lengths",
        },
    }
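For the different_outputs issue, the first thing to check is that sampling settings are mapped one-to-one. Here is a sketch of what that mapping might look like between transformers generate() kwargs and vLLM's SamplingParams; the values are illustrative, not recommendations.

# Keep sampling settings identical on both sides when comparing outputs.
# Values below are examples only; use whatever your baseline actually ran with.
from vllm import SamplingParams

hf_generate_kwargs = dict(
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    top_k=50,
    max_new_tokens=256,
    repetition_penalty=1.1,
)

vllm_params = SamplingParams(
    temperature=hf_generate_kwargs["temperature"],
    top_p=hf_generate_kwargs["top_p"],
    top_k=hf_generate_kwargs["top_k"],
    max_tokens=hf_generate_kwargs["max_new_tokens"],
    repetition_penalty=hf_generate_kwargs["repetition_penalty"],
)

For a strict comparison, set temperature to 0 (greedy) on both sides first; sampled outputs will never match token-for-token even when the parameters do.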
The Numbers
def typical_improvements():
    """
    Real-world improvements from optimization.
    Utilization is in percent; see the sampling sketch below for measuring it.
    """
    return {
        "baseline_raw_pytorch": {
            "tokens_per_second": 15,
            "concurrent_requests": 1,
            "gpu_utilization": 20,
        },
        "with_torch_compile": {
            "tokens_per_second": 25,
            "concurrent_requests": 1,
            "gpu_utilization": 30,
            "improvement": "1.7x",
        },
        "with_vllm": {
            "tokens_per_second": 80,
            "concurrent_requests": 50,
            "gpu_utilization": 80,
            "improvement": "5x",
        },
        "with_vllm_quantized": {
            "tokens_per_second": 100,
            "concurrent_requests": 100,
            "gpu_utilization": 85,
            "improvement": "7x",
        },
    }
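To verify figures like these on your own hardware, sample GPU utilization while a load test runs. A minimal sketch using the NVML bindings (the pynvml package, assumed installed); the device index and sampling window are arbitrary choices.

# Sample GPU utilization while a load test runs in another process.
# Assumes pynvml (nvidia-ml-py) is installed and GPU 0 is the serving device.
import time
import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)

samples = []
for _ in range(30):  # roughly 30 seconds of sampling
    util = pynvml.nvmlDeviceGetUtilizationRates(handle)
    samples.append(util.gpu)  # percent of time the GPU was busy
    time.sleep(1.0)

print(f"mean GPU utilization: {sum(samples) / len(samples):.0f}%")
pynvml.nvmlShutdown()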
What Not To Do
def antipatterns():
    return [
        {
            "antipattern": "model.generate() inside a web server",
            "why_bad": "No batching, no optimization, GIL contention",
            "do_instead": "Use vLLM or a similar serving framework (see the sketch after this list)",
        },
        {
            "antipattern": "Loading the model per request",
            "why_bad": "Model loading takes seconds",
            "do_instead": "Load once, serve many",
        },
        {
            "antipattern": "Using CPU when a GPU is available",
            "why_bad": "10-100x slower",
            "do_instead": "Always use a GPU for LLM inference",
        },
        {
            "antipattern": "Ignoring the KV cache",
            "why_bad": "Quadratic generation time",
            "do_instead": "Use a framework with proper cache handling",
        },
    ]
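The pattern the first two antipatterns point toward is a thin web layer that never touches the weights and simply forwards to a vLLM server, which owns loading, batching, and the KV cache. A hedged sketch, reusing the assumed localhost:8000 endpoint and ./my_model name from earlier:

# Thin web layer: the app forwards to vLLM instead of calling model.generate().
# Endpoint, port, and model name are assumptions matching the earlier client sketch.
from fastapi import FastAPI
from openai import AsyncOpenAI

app = FastAPI()
vllm_client = AsyncOpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

@app.post("/generate")
async def generate(prompt: str, max_tokens: int = 128) -> dict:
    result = await vllm_client.completions.create(
        model="./my_model", prompt=prompt, max_tokens=max_tokens
    )
    return {"text": result.choices[0].text}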
The gap between research code and production code is real. Don't try to close it manually—use tools built for the job. A day spent setting up vLLM beats weeks of hand-optimization.