Deploying and Serving Fine-tuned Models
A restaurant with one menu is simple. Print it, hand it out, everyone orders from the same options. A restaurant with personalized menus for each table needs a different system: track preferences, generate menus on demand, handle updates efficiently.
Serving fine-tuned models has the same fork. One customization? Merge it into the base model and serve like normal. Many customizations? Keep adapters separate and switch dynamically. The architecture follows the use case.
Single-Tenant: Merge and Serve
def single_tenant_deployment():
    """
    One fine-tuned model for your entire application
    """
    return {
        "approach": "Merge adapter weights into base model",
        "merge_process": """
            from peft import PeftModel
            from transformers import AutoModelForCausalLM

            # Load base model
            base_model = AutoModelForCausalLM.from_pretrained("base_model")

            # Load adapter
            peft_model = PeftModel.from_pretrained(base_model, "adapter_path")

            # Merge into base weights
            merged_model = peft_model.merge_and_unload()

            # Save merged model
            merged_model.save_pretrained("merged_model_path")
        """,
        "serving": """
            # Serve like any other model
            from vllm import LLM

            llm = LLM(model="merged_model_path")
            output = llm.generate(prompt)
        """,
        "advantages": [
            "Zero runtime overhead",
            "Same latency as base model",
            "Simple deployment",
            "No adapter management",
        ],
        "disadvantages": [
            "Can't switch adapters",
            "Need separate deployment per customization",
            "Storage cost for full model per variant",
        ],
    }
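Before shipping a merged model, it is worth confirming the merge did not change behavior. A minimal sanity check, assuming placeholder paths "base_model" and "adapter_path" and a representative prompt: run the adapter on the fly, then merge, and compare logits (tolerance depends on dtype).

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base = AutoModelForCausalLM.from_pretrained("base_model", torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained("base_model")
adapted = PeftModel.from_pretrained(base, "adapter_path")

inputs = tokenizer("Summarize our refund policy in one sentence.", return_tensors="pt")

with torch.no_grad():
    before = adapted(**inputs).logits      # adapter applied on the fly

merged = adapted.merge_and_unload()        # folds LoRA into the base weights

with torch.no_grad():
    after = merged(**inputs).logits        # merged weights, no adapter machinery

# The two should agree up to floating-point noise (looser tolerance for fp16)
assert torch.allclose(before, after, atol=1e-2)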
Multi-Tenant: Dynamic Adapter Switching
def multi_tenant_deployment():
    """
    Many customizations, one deployment
    """
    return {
        "approach": "Base model + switchable adapters",
        "architecture": """
            ┌──────────────────────────────────┐
            │       Base Model (frozen)        │
            │          Loaded once             │
            └──────────────────────────────────┘
                             │
              ┌──────────────┼──────────────┐
              │              │              │
              ▼              ▼              ▼
          Adapter A      Adapter B      Adapter C
         (Customer 1)   (Customer 2)   (Customer 3)
        """,
        "implementation": """
            # With vLLM LoRA support
            from vllm import LLM, SamplingParams
            from vllm.lora.request import LoRARequest

            llm = LLM(
                model="base_model",
                enable_lora=True,
                max_loras=10,       # Max concurrent adapters
                max_lora_rank=64,   # Max rank to support
            )

            # Request with specific adapter
            output = llm.generate(
                prompt,
                sampling_params=SamplingParams(max_tokens=100),
                lora_request=LoRARequest(
                    lora_name="customer_1",
                    lora_int_id=1,  # stable integer ID used for adapter caching
                    lora_path="/adapters/customer_1",
                ),
            )
        """,
        "advantages": [
            "Single deployment serves all customers",
            "Low storage per customization (~500MB)",
            "Easy to add new customizations",
            "Base model memory shared",
        ],
        "disadvantages": [
            "Small latency overhead (~10%)",
            "Adapter switching has latency (~10ms)",
            "More complex serving infrastructure",
            "Need adapter management system",
        ],
    }
Choosing the Architecture
def architecture_decision():
    return {
        "merge_when": [
            "One or few (<5) customizations",
            "Latency is critical",
            "Simple deployment preferred",
            "Each variant has dedicated resources",
        ],
        "dynamic_adapters_when": [
            "Many (>5) customizations",
            "Customizations added frequently",
            "Resource efficiency matters",
            "Multi-tenant SaaS product",
        ],
        "decision_matrix": """
            Customizations | Resources | Latency Req | Recommendation
            ---------------|-----------|-------------|-------------------------
            1-2            | Flexible  | Strict      | Merge
            1-2            | Limited   | Flexible    | Merge
            3-10           | Flexible  | Strict      | Merge (separate deploys)
            3-10           | Limited   | Flexible    | Dynamic adapters
            10+            | Any       | Any         | Dynamic adapters
        """,
    }
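If you want the matrix as a routing rule in code, a minimal sketch (the thresholds mirror the matrix above and are rough guidelines, not hard limits):

def choose_serving_architecture(
    num_customizations: int,
    latency_strict: bool,
    resources_limited: bool,
) -> str:
    """Rough encoding of the decision matrix above."""
    if num_customizations > 10:
        return "dynamic_adapters"
    if num_customizations <= 2:
        return "merge"
    # 3-10 customizations: depends on resources and latency requirements
    if resources_limited and not latency_strict:
        return "dynamic_adapters"
    return "merge (separate deploys)"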
Adapter Management System
import os
from collections import OrderedDict
from datetime import datetime
from typing import Optional


class AdapterManager:
    """
    Manage adapters for multi-tenant serving
    """

    def __init__(self, storage_path: str, cache_size: int = 10):
        self.storage_path = storage_path
        self.cache_size = cache_size
        self.adapter_cache = OrderedDict()  # adapter_id -> last use (LRU order)
        self.registry = {}                  # adapter_id -> metadata

    def validate_adapter(self, adapter_path: str) -> None:
        """Check that the adapter directory exists (simplified validation)."""
        if not os.path.isdir(adapter_path):
            raise ValueError(f"Adapter path does not exist: {adapter_path}")

    def register_adapter(self, adapter_id: str, adapter_path: str, metadata: dict):
        """Register a new adapter"""
        # Validate adapter
        self.validate_adapter(adapter_path)

        # Store metadata
        self.registry[adapter_id] = {
            "path": adapter_path,
            "created": datetime.now(),
            "version": metadata.get("version", "1.0"),
            "customer": metadata.get("customer"),
        }

    def get_adapter(self, adapter_id: str) -> str:
        """Get adapter path, tracking usage for cache management"""
        if adapter_id not in self.registry:
            raise ValueError(f"Unknown adapter: {adapter_id}")

        # Track usage: mark most-recently-used, evict least-recently-used
        self.adapter_cache[adapter_id] = datetime.now()
        self.adapter_cache.move_to_end(adapter_id)
        while len(self.adapter_cache) > self.cache_size:
            self.adapter_cache.popitem(last=False)

        return self.registry[adapter_id]["path"]

    def route_request(self, request: dict) -> Optional[str]:
        """Determine which adapter to use for a request"""
        customer_id = request.get("customer_id")
        task_type = request.get("task_type")

        # Customer-specific adapter
        if f"{customer_id}_adapter" in self.registry:
            return f"{customer_id}_adapter"

        # Task-specific default
        if f"default_{task_type}" in self.registry:
            return f"default_{task_type}"

        # No adapter (use base model)
        return None
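Wiring the manager into the vLLM path from earlier might look like the sketch below. The customer name, adapter IDs, and paths are illustrative; in production you would keep a persistent mapping from adapter ID to a stable integer, since vLLM caches adapters by lora_int_id.

from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

manager = AdapterManager(storage_path="/adapters", cache_size=10)
manager.register_adapter(
    "acme_adapter", "/adapters/acme", {"version": "1.2", "customer": "acme"}
)

llm = LLM(model="base_model", enable_lora=True, max_loras=10)

request = {"customer_id": "acme", "task_type": "support"}
adapter_id = manager.route_request(request)

lora_request = None
if adapter_id is not None:
    lora_request = LoRARequest(
        lora_name=adapter_id,
        lora_int_id=abs(hash(adapter_id)) % (2**31),  # stable within the process
        lora_path=manager.get_adapter(adapter_id),
    )

output = llm.generate(
    "How do I reset my password?",
    sampling_params=SamplingParams(max_tokens=100),
    lora_request=lora_request,  # None falls back to the base model
)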
S-LoRA for High-Throughput Multi-Tenant Serving
def slora_serving():
    """
    S-LoRA: Scalable Serving of LoRA Adapters
    """
    return {
        "what_it_is": """
            Research approach for efficient multi-adapter serving.
            Key idea: batch requests across different adapters.
        """,
        "how_it_works": {
            "unified_paging": "Manage adapter weights like KV cache",
            "heterogeneous_batching": "Same batch, different adapters",
            "custom_cuda_kernels": "Efficient adapter application",
        },
        "performance": {
            "throughput": "Near single-adapter performance",
            "adapters": "Thousands simultaneously",
            "switching": "<1ms overhead",
        },
        "practical_status": """
            S-LoRA concepts now in vLLM:
            - enable_lora=True
            - max_loras for concurrent adapters
            - Automatic batching across adapters

            For production:
            - Use vLLM with LoRA support
            - Configure max_loras based on expected concurrency
            - Monitor adapter cache hit rate
        """,
    }
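In practice, most multi-tenant deployments run vLLM's OpenAI-compatible server rather than the offline LLM class. A minimal sketch, assuming placeholder model and adapter paths; clients select an adapter by passing its registered name as the model field.

# Launch (shell):
#   vllm serve base_model --enable-lora \
#       --lora-modules acme=/adapters/acme beta=/adapters/beta \
#       --max-loras 10 --max-lora-rank 64

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.completions.create(
    model="acme",  # adapter name registered via --lora-modules
    prompt="Draft a reply to this support ticket: ...",
    max_tokens=100,
)
print(response.choices[0].text)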
Resource Planning
def resource_planning():
    """
    Plan GPU memory for adapter serving
    """
    return {
        "merged_model": {
            "formula": "base_model_memory + kv_cache",
            "example_70b": "140GB + 20GB = 160GB",
            "per_variant": "Full model memory each",
        },
        "dynamic_adapters": {
            "base": "Base model memory (shared)",
            "per_adapter": "r * hidden_dim * 2 * target_layers * dtype_bytes",
            "example_70b_r16": """
                16 * 8192 * 2 * 80 * 2 bytes = ~42MB per adapter
                (one target module per layer; targeting more modules scales this up)
                100 adapters = ~4.2GB
            """,
            "kv_cache": "Shared across requests",
        },
        "planning_formula": """
            Total GPU memory needed:
              = base_model
              + max_concurrent_adapters * adapter_size
              + kv_cache_for_batch_size

            Example (70B, 10 concurrent adapters, batch 32):
              = 140GB + 0.4GB + 20GB
              = 160.4GB
              = slightly over 2x 80GB GPUs; plan for 4x A100-80GB,
                or 2x H100-80GB with a smaller batch / KV cache budget
        """,
    }
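The same arithmetic as a small helper, with the 70B numbers from above as defaults. All figures are rough fp16 estimates and assume one LoRA target module per layer.

def estimate_gpu_memory_gb(
    params_billion: float = 70,
    kv_cache_gb: float = 20,
    num_adapters: int = 10,
    lora_rank: int = 16,
    hidden_dim: int = 8192,
    num_layers: int = 80,
    bytes_per_param: int = 2,   # fp16/bf16
) -> float:
    """Rough total GPU memory: base model + resident adapters + KV cache."""
    base_gb = params_billion * bytes_per_param           # 70B * 2 bytes = 140GB
    adapter_bytes = lora_rank * hidden_dim * 2 * num_layers * bytes_per_param
    adapters_gb = num_adapters * adapter_bytes / 1e9     # ~0.42GB for 10 adapters
    return base_gb + adapters_gb + kv_cache_gb

print(f"{estimate_gpu_memory_gb():.1f} GB")  # ~160.4 GB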
Monitoring Adapter Performance
import statistics
from datetime import datetime, timedelta


class AdapterMonitor:
    """
    Monitor fine-tuned model performance in production
    """

    def __init__(self):
        self.records = []  # one dict per tracked request

    def track_request(
        self,
        adapter_id: str,
        request: dict,
        response: dict,
        latency_ms: float,
    ):
        """Track individual request metrics"""
        self.records.append({
            "adapter_id": adapter_id,
            "latency_ms": latency_ms,
            "input_tokens": request["input_tokens"],
            "output_tokens": response["output_tokens"],
            "error": response.get("error"),
            "timestamp": datetime.now(),
        })

    def get_requests(self, adapter_id: str, window_hours: int) -> list:
        """Requests for one adapter within the time window"""
        cutoff = datetime.now() - timedelta(hours=window_hours)
        return [
            r for r in self.records
            if r["adapter_id"] == adapter_id and r["timestamp"] >= cutoff
        ]

    def adapter_health(self, adapter_id: str, window_hours: int = 1) -> dict:
        """Get health metrics for an adapter"""
        requests = self.get_requests(adapter_id, window_hours)
        if not requests:
            return {"request_count": 0}

        latencies = sorted(r["latency_ms"] for r in requests)
        return {
            "request_count": len(requests),
            "avg_latency_ms": statistics.mean(latencies),
            "p99_latency_ms": latencies[int(0.99 * (len(latencies) - 1))],
            "error_rate": sum(1 for r in requests if r.get("error")) / len(requests),
        }

    def compare_to_baseline(self, adapter_id: str) -> dict:
        """Compare adapter performance to base model"""
        adapter_metrics = self.adapter_health(adapter_id)
        base_metrics = self.adapter_health("base_model")

        return {
            "latency_overhead": (
                adapter_metrics["avg_latency_ms"] /
                base_metrics["avg_latency_ms"] - 1
            ),
            "error_rate_diff": (
                adapter_metrics["error_rate"] -
                base_metrics["error_rate"]
            ),
        }
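A sketch of how the monitor plugs into the request path. The timing wrapper and adapter ID are illustrative; it assumes the vLLM LLM object and LoRARequest from the multi-tenant example.

import time

monitor = AdapterMonitor()

def serve_with_monitoring(llm, prompt: str, adapter_id: str, lora_request=None):
    start = time.perf_counter()
    outputs = llm.generate(prompt, lora_request=lora_request)
    latency_ms = (time.perf_counter() - start) * 1000

    completion = outputs[0].outputs[0]
    monitor.track_request(
        adapter_id=adapter_id,
        request={"input_tokens": len(outputs[0].prompt_token_ids)},
        response={"output_tokens": len(completion.token_ids)},
        latency_ms=latency_ms,
    )
    return completion.text

# Later, compare an adapter's window against base-model traffic
print(monitor.compare_to_baseline("acme_adapter"))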
Deployment Checklist
def deployment_checklist():
    return {
        "single_tenant_merge": [
            "[ ] Adapter trained and validated",
            "[ ] Merged model created and tested",
            "[ ] Merged model passes base capability tests",
            "[ ] Deployment infrastructure sized for merged model",
            "[ ] Monitoring configured",
            "[ ] Rollback plan documented",
        ],
        "multi_tenant_dynamic": [
            "[ ] Base model deployed with LoRA support",
            "[ ] Adapter registry populated",
            "[ ] Routing logic implemented",
            "[ ] Adapter cache sized appropriately",
            "[ ] Per-adapter monitoring configured",
            "[ ] Adapter hot-reload tested",
            "[ ] Fallback to base model implemented",
            "[ ] Customer isolation verified",
        ],
    }
The serving architecture follows the customization pattern. Single customization means simple merged deployment. Many customizations mean shared infrastructure with dynamic switching. Choose based on your multi-tenancy requirements, then build the monitoring to match.