Switching LoRA Adapters at Runtime
Hotel chains use master key systems. Every room has a unique lock, but housekeeping doesn't carry 500 keys: a single master key opens every door. Each lock adds its own room-specific configuration on top of the shared master mechanism, while the base system never changes.
Runtime adapter switching works the same way. The base model is the master infrastructure; adapters are the unique per-customer or per-task configurations. Switching adapters takes milliseconds, not the minutes required to load a new model, so one deployment serves all customizations.
How Runtime Switching Works
def adapter_switching_mechanism():
    return {
        "conceptual_model": """
            Base model weights: W (frozen, loaded once)
            Adapter i weights:  A_i, B_i (small, swappable)

            For request with adapter i:
                W_effective = W + A_i @ B_i
            Switch to adapter j:
                W_effective = W + A_j @ B_j

            Base W never changes, just which adapter is applied.
        """,
        "why_fast": {
            "no_base_reload": "Base model stays in GPU memory",
            "small_adapters": "Adapter weights are ~0.1% of base",
            "memory_operations": "Just pointer swaps + small copies",
        },
        "timing": {
            "load_full_model": "30-120 seconds",
            "load_adapter": "100-500ms (first time)",
            "switch_adapter": "5-15ms (if cached)",
        },
    }
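To make the arithmetic concrete, here is a minimal NumPy sketch of the same mechanism (the shapes and the two hypothetical adapters are illustrative). The "switch" is nothing more than choosing which low-rank pair (A, B) participates in the forward pass; W is never copied or modified.

import numpy as np

d_out, d_in, r = 1024, 1024, 16

# Base weights: loaded once, never modified.
W = np.random.randn(d_out, d_in).astype(np.float32)

# Two illustrative adapters: each low-rank pair is a small fraction of W's size.
adapters = {
    "customer_1": (np.random.randn(d_out, r).astype(np.float32) * 0.01,
                   np.random.randn(r, d_in).astype(np.float32) * 0.01),
    "customer_2": (np.random.randn(d_out, r).astype(np.float32) * 0.01,
                   np.random.randn(r, d_in).astype(np.float32) * 0.01),
}

def forward(x, adapter_id):
    """Apply base weights plus the selected adapter's low-rank delta."""
    A, B = adapters[adapter_id]
    # Equivalent to (W + A @ B) @ x without materializing W + A @ B:
    # switching adapters is just picking a different (A, B) pair.
    return W @ x + A @ (B @ x)

x = np.random.randn(d_in).astype(np.float32)
y1 = forward(x, "customer_1")  # same base W, adapter 1's delta
y2 = forward(x, "customer_2")  # same base W, adapter 2's delta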
Implementation with vLLM
def vllm_lora_serving():
    """
    Production adapter switching with vLLM
    """
    return {
        "setup": """
            from vllm import LLM, SamplingParams
            from vllm.lora.request import LoRARequest

            # Initialize with LoRA support
            llm = LLM(
                model="meta-llama/Llama-2-70b-hf",
                enable_lora=True,
                max_loras=10,      # Max concurrent adapters in memory
                max_lora_rank=64,  # Max rank across all adapters
            )
        """,
        "serving_requests": """
            # Request without adapter (base model)
            base_output = llm.generate("Hello, world")

            # Request with customer-specific adapter
            customer_1_output = llm.generate(
                "Hello, world",
                lora_request=LoRARequest(
                    lora_name="customer_1",
                    lora_int_id=1,  # Unique ID for this adapter
                    lora_path="/adapters/customer_1",
                ),
            )

            # Different customer, different adapter
            customer_2_output = llm.generate(
                "Hello, world",
                lora_request=LoRARequest(
                    lora_name="customer_2",
                    lora_int_id=2,
                    lora_path="/adapters/customer_2",
                ),
            )
        """,
        "batching": """
            # Requests with different adapters can batch together.
            # LoRARequest's positional order is (name, int_id, path).
            prompts = ["Hello", "Hi there", "Greetings"]
            adapters = [
                LoRARequest("cust_1", 1, "/adapters/1"),
                LoRARequest("cust_2", 2, "/adapters/2"),
                LoRARequest("cust_1", 1, "/adapters/1"),  # Same as first
            ]
            outputs = llm.generate(prompts, lora_request=adapters)
            # All processed efficiently in the same batch
        """,
    }
Adapter Caching Strategy
from collections import Counter, OrderedDict


class AdapterCache:
    """
    Manage which adapters are loaded in GPU memory
    """

    def __init__(self, max_adapters: int = 10):
        self.max_adapters = max_adapters
        self.loaded = OrderedDict()    # adapter_id -> loaded adapter
        self.access_count = Counter()  # adapter_id -> access count
        self.misses = 0                # cold loads, for hit-rate accounting

    def get_or_load(self, adapter_id: str, adapter_path: str):
        """Get adapter, loading if necessary"""
        self.access_count[adapter_id] += 1

        # Already loaded: touch for LRU and return
        if adapter_id in self.loaded:
            self.loaded.move_to_end(adapter_id)
            return self.loaded[adapter_id]

        # Need to load; evict first if at capacity
        self.misses += 1
        if len(self.loaded) >= self.max_adapters:
            self.evict_one()
        adapter = self.load_adapter(adapter_path)
        self.loaded[adapter_id] = adapter
        return adapter

    def evict_one(self):
        """Evict the least recently used adapter"""
        oldest_id, _ = self.loaded.popitem(last=False)  # LRU eviction
        self.unload_adapter(oldest_id)
        return oldest_id

    def cache_metrics(self) -> dict:
        """Cache performance metrics"""
        total_requests = sum(self.access_count.values())
        hits = total_requests - self.misses
        return {
            "loaded_adapters": len(self.loaded),
            "max_adapters": self.max_adapters,
            "total_requests": total_requests,
            "hit_rate": hits / total_requests if total_requests > 0 else 0,
        }

    # Backend-specific hooks: subclasses implement actual weight movement.
    def load_adapter(self, adapter_path: str):
        """Load adapter weights from storage (backend-specific)."""
        raise NotImplementedError

    def unload_adapter(self, adapter_id: str):
        """Free the adapter's GPU memory (backend-specific)."""
        raise NotImplementedError
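A usage sketch, assuming a hypothetical subclass that fills in the two backend-specific stubs:

class DiskAdapterCache(AdapterCache):
    def load_adapter(self, adapter_path):
        return f"weights-from:{adapter_path}"  # stand-in for real loading

    def unload_adapter(self, adapter_id):
        pass  # a real backend would free GPU memory here

cache = DiskAdapterCache(max_adapters=2)
cache.get_or_load("cust_1", "/adapters/1")
cache.get_or_load("cust_2", "/adapters/2")
cache.get_or_load("cust_1", "/adapters/1")  # hit: no load
cache.get_or_load("cust_3", "/adapters/3")  # evicts cust_2 (LRU)
print(cache.cache_metrics())                # 4 requests, 3 misses, hit_rate 0.25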
Request Routing
from vllm.lora.request import LoRARequest


class AdapterRouter:
    """
    Route requests to appropriate adapters
    """

    def __init__(self):
        self.customer_adapters = {}  # customer_id -> adapter_config
        self.task_adapters = {}      # task_type -> adapter_config
        self.default_adapter = None

    def register_customer_adapter(
        self,
        customer_id: str,
        adapter_path: str,
        adapter_id: int,
    ):
        """Register a customer-specific adapter"""
        self.customer_adapters[customer_id] = {
            "path": adapter_path,
            "id": adapter_id,
            "name": f"customer_{customer_id}",
        }

    def route(self, request: dict) -> LoRARequest | None:
        """Determine which adapter to use for a request"""
        customer_id = request.get("customer_id")
        task_type = request.get("task_type")

        # Customer-specific adapter takes priority
        if customer_id in self.customer_adapters:
            config = self.customer_adapters[customer_id]
            return LoRARequest(
                lora_name=config["name"],
                lora_int_id=config["id"],
                lora_path=config["path"],
            )

        # Task-specific adapter
        if task_type in self.task_adapters:
            config = self.task_adapters[task_type]
            return LoRARequest(
                lora_name=config["name"],
                lora_int_id=config["id"],
                lora_path=config["path"],
            )

        # Default adapter or None (base model)
        return self.default_adapter
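Putting the pieces together, a hypothetical wiring of the router into the vLLM serving path from earlier (llm is the LLM instance from the setup block; a None route falls through to the base model):

router = AdapterRouter()
router.register_customer_adapter("acme", "/adapters/acme", adapter_id=1)

request = {"customer_id": "acme", "prompt": "Summarize this ticket: ..."}
lora_request = router.route(request)  # None means "use the base model"

outputs = llm.generate(request["prompt"], lora_request=lora_request)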
Performance Considerations
def performance_considerations():
    return {
        "adapter_loading_time": {
            "cold_load": "100-500ms per adapter",
            "from_cache": "5-15ms",
            "optimization": "Pre-load expected adapters at startup",
        },
        "memory_overhead": {
            "per_adapter": "~50MB for r=16",
            "10_adapters": "~500MB",
            "100_adapters": "~5GB (need enough GPU memory)",
        },
        "throughput_impact": {
            "single_adapter": "No impact vs merged",
            "mixed_adapters": "~5-10% overhead for switching",
            "batching_efficiency": "Lower if adapters in a batch are too diverse",
        },
        "optimization_tips": [
            "Set max_loras based on distinct adapters in a typical window",
            "Pre-warm cache with high-traffic adapters (sketched below)",
            "Group requests by adapter when possible",
            "Monitor cache hit rate, adjust max_loras",
        ],
    }
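The pre-warming tip might look like this in practice: a minimal sketch built on the AdapterCache above, where the list of high-traffic adapters is illustrative (e.g. derived from access logs).

# Hypothetical list of high-traffic adapters gathered from access logs.
HIGH_TRAFFIC_ADAPTERS = [
    ("cust_1", "/adapters/1"),
    ("cust_7", "/adapters/7"),
]

def prewarm(cache: AdapterCache):
    """Load expected adapters at startup so first requests hit the cache."""
    for adapter_id, adapter_path in HIGH_TRAFFIC_ADAPTERS:
        cache.get_or_load(adapter_id, adapter_path)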
Hot Reloading Adapters
from vllm import LLM


class HotReloadableAdapterServer:
    """
    Update adapters without restarting the server.

    load_adapter, unload_adapter, and subscribe are backend-specific hooks;
    they might delegate to an AdapterCache and a registry client.
    """

    def __init__(self, llm: LLM):
        self.llm = llm
        self.adapter_versions = {}  # adapter_id -> version

    def register_adapter(
        self,
        adapter_id: str,
        adapter_path: str,
        version: str,
    ):
        """Register or update an adapter"""
        current_version = self.adapter_versions.get(adapter_id)
        if current_version == version:
            # Already have this version
            return
        if current_version is not None:
            # Unload old version
            self.unload_adapter(adapter_id)
        # Load new version
        self.load_adapter(adapter_id, adapter_path)
        self.adapter_versions[adapter_id] = version

    def handle_adapter_update(self, event: dict):
        """Handle notification of an adapter update"""
        # Graceful update: requests in flight keep the old adapter;
        # new requests pick up the new version.
        self.register_adapter(event["adapter_id"], event["path"], event["version"])

    async def watch_for_updates(self, adapter_registry_url: str):
        """Watch for adapter updates.

        Must be scheduled on a running event loop by the caller;
        subscribe() yields events from the registry (backend-specific).
        """
        async for event in self.subscribe(adapter_registry_url):
            if event["type"] == "adapter_updated":
                self.handle_adapter_update(event)
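Startup wiring might look like the following sketch; the registry URL is hypothetical, and the watcher runs until cancelled alongside the serving loop:

import asyncio

async def main():
    server = HotReloadableAdapterServer(llm)  # llm from the vLLM setup above
    await server.watch_for_updates("https://adapter-registry.internal/events")

asyncio.run(main())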
Monitoring and Alerts
def adapter_monitoring():
    return {
        "metrics_to_track": {
            "per_adapter": [
                "Request count",
                "Latency (p50, p95, p99)",
                "Error rate",
                "Cache hit rate",
            ],
            "system_wide": [
                "Total adapters loaded",
                "Adapter cache hit rate",
                "Switch latency",
                "Memory usage",
            ],
        },
        "alerts": [
            {
                "condition": "adapter_cache_hit_rate < 0.8",
                "action": "Increase max_loras or optimize routing",
            },
            {
                "condition": "adapter_load_time > 1s",
                "action": "Check storage performance",
            },
            {
                "condition": "adapter_error_rate > 0.01",
                "action": "Investigate adapter-specific issues",
            },
        ],
        "dashboard_panels": [
            "Adapters by request volume",
            "Cache hit rate over time",
            "Latency by adapter",
            "Memory usage breakdown",
        ],
    }
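As a starting point for the per-adapter metrics, here is a minimal in-process tracker (illustrative only; a production deployment would export counters and histograms to a system like Prometheus):

import time
from collections import defaultdict
from statistics import quantiles

class AdapterMetrics:
    def __init__(self):
        self.latencies = defaultdict(list)  # adapter -> latency samples (s)
        self.errors = defaultdict(int)      # adapter -> error count

    def observe(self, adapter: str, fn, *args, **kwargs):
        """Time one request for `adapter`, recording latency and errors."""
        start = time.perf_counter()
        try:
            return fn(*args, **kwargs)
        except Exception:
            self.errors[adapter] += 1
            raise
        finally:
            self.latencies[adapter].append(time.perf_counter() - start)

    def percentile(self, adapter: str, p: int) -> float:
        """Return the p-th latency percentile (e.g. 50, 95, 99)."""
        samples = self.latencies[adapter]
        return quantiles(samples, n=100)[p - 1] if len(samples) > 1 else 0.0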
Scaling Considerations
def scaling_considerations():
    return {
        "vertical_scaling": {
            "more_gpu_memory": "More concurrent adapters",
            "faster_storage": "Faster cold loading",
            "limit": "Single machine GPU capacity",
        },
        "horizontal_scaling": {
            "approach": "Shard adapters across instances",
            "routing": "Route customer to their adapter's instance",
            "challenge": "Load balancing with affinity",
        },
        "hybrid_approach": """
            Tier 1: High-traffic adapters on dedicated instances (merged)
            Tier 2: Medium-traffic adapters on shared LoRA instances
            Tier 3: Low-traffic adapters loaded on-demand

            Routing layer determines tier based on customer/traffic.
        """,
    }
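The tier decision itself can be a small pure function. A sketch with illustrative thresholds, assuming per-adapter requests-per-minute figures are available from monitoring:

def select_tier(requests_per_minute: float) -> str:
    """Map an adapter's traffic level to a serving tier (thresholds illustrative)."""
    if requests_per_minute > 1000:
        return "tier_1_dedicated_merged"  # merge adapter into base weights
    if requests_per_minute > 10:
        return "tier_2_shared_lora"       # resident on shared LoRA instances
    return "tier_3_on_demand"             # cold-loaded when requested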
Runtime adapter switching transforms how you serve customizations. Instead of running N deployments for N customers, you run one deployment that switches adapters per request. The trade-off is added complexity in routing and caching, but the resource efficiency makes it worthwhile for multi-tenant scenarios.