Switching LoRA Adapters at Runtime
Hotel chains use master key systems. Every room has a unique lock, but housekeeping doesn't carry 500 keys: a single master key opens every door. Each lock adds its own room-specific configuration on top of the shared master mechanism, while the base system never changes.
Runtime adapter switching works the same way. The base model is the master infrastructure; adapters are the unique per-customer or per-task configurations. Switching adapters takes milliseconds, not the minutes required to load a new model, so one deployment serves all customizations.
How Runtime Switching Works
def adapter_switching_mechanism():
    return {
        "conceptual_model": """
            Base model weights: W (frozen, loaded once)
            Adapter i weights:  A_i, B_i (small, swappable)

            For request with adapter i:
                W_effective = W + A_i @ B_i
            Switch to adapter j:
                W_effective = W + A_j @ B_j

            Base W never changes, just which adapter is applied.
        """,
        "why_fast": {
            "no_base_reload": "Base model stays in GPU memory",
            "small_adapters": "Adapter weights are ~0.1% of base",
            "memory_operations": "Just pointer swaps + small copies",
        },
        "timing": {
            "load_full_model": "30-120 seconds",
            "load_adapter": "100-500ms (first time)",
            "switch_adapter": "5-15ms (if cached)",
        },
    }
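To make the arithmetic concrete, here is a minimal NumPy sketch of the same mechanism (the shapes and the two hypothetical adapters are illustrative). The "switch" is nothing more than choosing which low-rank pair (A, B) participates in the forward pass; W is never copied or modified.

import numpy as np

d_out, d_in, r = 1024, 1024, 16

# Base weights: loaded once, never modified.
W = np.random.randn(d_out, d_in).astype(np.float32)

# Two illustrative adapters: each low-rank pair is a small fraction of W's size.
adapters = {
    "customer_1": (np.random.randn(d_out, r).astype(np.float32) * 0.01,
                   np.random.randn(r, d_in).astype(np.float32) * 0.01),
    "customer_2": (np.random.randn(d_out, r).astype(np.float32) * 0.01,
                   np.random.randn(r, d_in).astype(np.float32) * 0.01),
}

def forward(x, adapter_id):
    """Apply base weights plus the selected adapter's low-rank delta."""
    A, B = adapters[adapter_id]
    # Equivalent to (W + A @ B) @ x without materializing W + A @ B:
    # switching adapters is just picking a different (A, B) pair.
    return W @ x + A @ (B @ x)

x = np.random.randn(d_in).astype(np.float32)
y1 = forward(x, "customer_1")  # same base W, adapter 1's delta
y2 = forward(x, "customer_2")  # same base W, adapter 2's delta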
Implementation with vLLM
def vllm_lora_serving():
    """
    Production adapter switching with vLLM
    """
    return {
        "setup": """
            from vllm import LLM, SamplingParams
            from vllm.lora.request import LoRARequest

            # Initialize with LoRA support
            llm = LLM(
                model="meta-llama/Llama-2-70b-hf",
                enable_lora=True,
                max_loras=10,      # Max concurrent adapters in memory
                max_lora_rank=64,  # Max rank across all adapters
            )
        """,
        "serving_requests": """
            # Request without adapter (base model)
            base_output = llm.generate("Hello, world")

            # Request with customer-specific adapter
            customer_1_output = llm.generate(
                "Hello, world",
                lora_request=LoRARequest(
                    lora_name="customer_1",
                    lora_int_id=1,  # Unique ID for this adapter
                    lora_path="/adapters/customer_1",
                ),
            )

            # Different customer, different adapter
            customer_2_output = llm.generate(
                "Hello, world",
                lora_request=LoRARequest(
                    lora_name="customer_2",
                    lora_int_id=2,
                    lora_path="/adapters/customer_2",
                ),
            )
        """,
        "batching": """
            # Requests with different adapters can batch together.
            # LoRARequest's positional order is (name, int_id, path).
            prompts = ["Hello", "Hi there", "Greetings"]
            adapters = [
                LoRARequest("cust_1", 1, "/adapters/1"),
                LoRARequest("cust_2", 2, "/adapters/2"),
                LoRARequest("cust_1", 1, "/adapters/1"),  # Same as first
            ]
            outputs = llm.generate(prompts, lora_request=adapters)
            # All processed efficiently in the same batch
        """,
    }
Adapter Caching Strategy
from collections import Counter, OrderedDict


class AdapterCache:
    """
    Manage which adapters are loaded in GPU memory
    """

    def __init__(self, max_adapters: int = 10):
        self.max_adapters = max_adapters
        self.loaded = OrderedDict()    # adapter_id -> loaded adapter
        self.access_count = Counter()  # adapter_id -> access count
        self.misses = 0                # cold loads, for hit-rate accounting

    def get_or_load(self, adapter_id: str, adapter_path: str):
        """Get adapter, loading if necessary"""
        self.access_count[adapter_id] += 1

        # Already loaded: touch for LRU and return
        if adapter_id in self.loaded:
            self.loaded.move_to_end(adapter_id)
            return self.loaded[adapter_id]

        # Need to load; evict first if at capacity
        self.misses += 1
        if len(self.loaded) >= self.max_adapters:
            self.evict_one()
        adapter = self.load_adapter(adapter_path)
        self.loaded[adapter_id] = adapter
        return adapter

    def evict_one(self):
        """Evict the least recently used adapter"""
        oldest_id, _ = self.loaded.popitem(last=False)  # LRU eviction
        self.unload_adapter(oldest_id)
        return oldest_id

    def cache_metrics(self) -> dict:
        """Cache performance metrics"""
        total_requests = sum(self.access_count.values())
        hits = total_requests - self.misses
        return {
            "loaded_adapters": len(self.loaded),
            "max_adapters": self.max_adapters,
            "total_requests": total_requests,
            "hit_rate": hits / total_requests if total_requests > 0 else 0,
        }

    # Backend-specific hooks: subclasses implement actual weight movement.
    def load_adapter(self, adapter_path: str):
        """Load adapter weights from storage (backend-specific)."""
        raise NotImplementedError

    def unload_adapter(self, adapter_id: str):
        """Free the adapter's GPU memory (backend-specific)."""
        raise NotImplementedError
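A usage sketch, assuming a hypothetical subclass that fills in the two backend-specific stubs:

class DiskAdapterCache(AdapterCache):
    def load_adapter(self, adapter_path):
        return f"weights-from:{adapter_path}"  # stand-in for real loading

    def unload_adapter(self, adapter_id):
        pass  # a real backend would free GPU memory here

cache = DiskAdapterCache(max_adapters=2)
cache.get_or_load("cust_1", "/adapters/1")
cache.get_or_load("cust_2", "/adapters/2")
cache.get_or_load("cust_1", "/adapters/1")  # hit: no load
cache.get_or_load("cust_3", "/adapters/3")  # evicts cust_2 (LRU)
print(cache.cache_metrics())                # 4 requests, 3 misses, hit_rate 0.25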
Request Routing
from vllm.lora.request import LoRARequest


class AdapterRouter:
    """
    Route requests to appropriate adapters
    """

    def __init__(self):
        self.customer_adapters = {}  # customer_id -> adapter_config
        self.task_adapters = {}      # task_type -> adapter_config
        self.default_adapter = None

    def register_customer_adapter(
        self,
        customer_id: str,
        adapter_path: str,
        adapter_id: int,
    ):
        """Register a customer-specific adapter"""
        self.customer_adapters[customer_id] = {
            "path": adapter_path,
            "id": adapter_id,
            "name": f"customer_{customer_id}",
        }

    def route(self, request: dict) -> LoRARequest | None:
        """Determine which adapter to use for a request"""
        customer_id = request.get("customer_id")
        task_type = request.get("task_type")

        # Customer-specific adapter takes priority
        if customer_id in self.customer_adapters:
            config = self.customer_adapters[customer_id]
            return LoRARequest(
                lora_name=config["name"],
                lora_int_id=config["id"],
                lora_path=config["path"],
            )

        # Task-specific adapter
        if task_type in self.task_adapters:
            config = self.task_adapters[task_type]
            return LoRARequest(
                lora_name=config["name"],
                lora_int_id=config["id"],
                lora_path=config["path"],
            )

        # Default adapter or None (base model)
        return self.default_adapter
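Putting the pieces together, a hypothetical wiring of the router into the vLLM serving path from earlier (llm is the LLM instance from the setup block; a None route falls through to the base model):

router = AdapterRouter()
router.register_customer_adapter("acme", "/adapters/acme", adapter_id=1)

request = {"customer_id": "acme", "prompt": "Summarize this ticket: ..."}
lora_request = router.route(request)  # None means "use the base model"

outputs = llm.generate(request["prompt"], lora_request=lora_request)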
Performance Considerations
def performance_considerations():
    return {
        "adapter_loading_time": {
            "cold_load": "100-500ms per adapter",
            "from_cache": "5-15ms",
            "optimization": "Pre-load expected adapters at startup",
        },
        "memory_overhead": {
            "per_adapter": "~50MB for r=16",
            "10_adapters": "~500MB",
            "100_adapters": "~5GB (need enough GPU memory)",
        },
        "throughput_impact": {
            "single_adapter": "No impact vs merged",
            "mixed_adapters": "~5-10% overhead for switching",
            "batching_efficiency": "Lower if adapters in a batch are too diverse",
        },
        "optimization_tips": [
            "Set max_loras based on distinct adapters in a typical window",
            "Pre-warm cache with high-traffic adapters (sketched below)",
            "Group requests by adapter when possible",
            "Monitor cache hit rate, adjust max_loras",
        ],
    }
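The pre-warming tip might look like this in practice: a minimal sketch built on the AdapterCache above, where the list of high-traffic adapters is illustrative (e.g. derived from access logs).

# Hypothetical list of high-traffic adapters gathered from access logs.
HIGH_TRAFFIC_ADAPTERS = [
    ("cust_1", "/adapters/1"),
    ("cust_7", "/adapters/7"),
]

def prewarm(cache: AdapterCache):
    """Load expected adapters at startup so first requests hit the cache."""
    for adapter_id, adapter_path in HIGH_TRAFFIC_ADAPTERS:
        cache.get_or_load(adapter_id, adapter_path)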
Hot Reloading Adapters
from vllm import LLM


class HotReloadableAdapterServer:
    """
    Update adapters without restarting the server.

    load_adapter, unload_adapter, and subscribe are backend-specific hooks;
    they might delegate to an AdapterCache and a registry client.
    """

    def __init__(self, llm: LLM):
        self.llm = llm
        self.adapter_versions = {}  # adapter_id -> version

    def register_adapter(
        self,
        adapter_id: str,
        adapter_path: str,
        version: str,
    ):
        """Register or update an adapter"""
        current_version = self.adapter_versions.get(adapter_id)
        if current_version == version:
            # Already have this version
            return
        if current_version is not None:
            # Unload old version
            self.unload_adapter(adapter_id)
        # Load new version
        self.load_adapter(adapter_id, adapter_path)
        self.adapter_versions[adapter_id] = version

    def handle_adapter_update(self, event: dict):
        """Handle notification of an adapter update"""
        # Graceful update: requests in flight keep the old adapter;
        # new requests pick up the new version.
        self.register_adapter(event["adapter_id"], event["path"], event["version"])

    async def watch_for_updates(self, adapter_registry_url: str):
        """Watch for adapter updates.

        Must be scheduled on a running event loop by the caller;
        subscribe() yields events from the registry (backend-specific).
        """
        async for event in self.subscribe(adapter_registry_url):
            if event["type"] == "adapter_updated":
                self.handle_adapter_update(event)
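Startup wiring might look like the following sketch; the registry URL is hypothetical, and the watcher runs until cancelled alongside the serving loop:

import asyncio

async def main():
    server = HotReloadableAdapterServer(llm)  # llm from the vLLM setup above
    await server.watch_for_updates("https://adapter-registry.internal/events")

asyncio.run(main())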
Monitoring and Alerts
def adapter_monitoring():
    return {
        "metrics_to_track": {
            "per_adapter": [
                "Request count",
                "Latency (p50, p95, p99)",
                "Error rate",
                "Cache hit rate",
            ],
            "system_wide": [
                "Total adapters loaded",
                "Adapter cache hit rate",
                "Switch latency",
                "Memory usage",
            ],
        },
        "alerts": [
            {
                "condition": "adapter_cache_hit_rate < 0.8",
                "action": "Increase max_loras or optimize routing",
            },
            {
                "condition": "adapter_load_time > 1s",
                "action": "Check storage performance",
            },
            {
                "condition": "adapter_error_rate > 0.01",
                "action": "Investigate adapter-specific issues",
            },
        ],
        "dashboard_panels": [
            "Adapters by request volume",
            "Cache hit rate over time",
            "Latency by adapter",
            "Memory usage breakdown",
        ],
    }
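As a starting point for the per-adapter metrics, here is a minimal in-process tracker (illustrative only; a production deployment would export counters and histograms to a system like Prometheus):

import time
from collections import defaultdict
from statistics import quantiles

class AdapterMetrics:
    def __init__(self):
        self.latencies = defaultdict(list)  # adapter -> latency samples (s)
        self.errors = defaultdict(int)      # adapter -> error count

    def observe(self, adapter: str, fn, *args, **kwargs):
        """Time one request for `adapter`, recording latency and errors."""
        start = time.perf_counter()
        try:
            return fn(*args, **kwargs)
        except Exception:
            self.errors[adapter] += 1
            raise
        finally:
            self.latencies[adapter].append(time.perf_counter() - start)

    def percentile(self, adapter: str, p: int) -> float:
        """Return the p-th latency percentile (e.g. 50, 95, 99)."""
        samples = self.latencies[adapter]
        return quantiles(samples, n=100)[p - 1] if len(samples) > 1 else 0.0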
Scaling Considerations
def scaling_considerations():
    return {
        "vertical_scaling": {
            "more_gpu_memory": "More concurrent adapters",
            "faster_storage": "Faster cold loading",
            "limit": "Single machine GPU capacity",
        },
        "horizontal_scaling": {
            "approach": "Shard adapters across instances",
            "routing": "Route customer to their adapter's instance",
            "challenge": "Load balancing with affinity",
        },
        "hybrid_approach": """
            Tier 1: High-traffic adapters on dedicated instances (merged)
            Tier 2: Medium-traffic adapters on shared LoRA instances
            Tier 3: Low-traffic adapters loaded on-demand

            Routing layer determines tier based on customer/traffic.
        """,
    }
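The tier decision itself can be a small pure function. A sketch with illustrative thresholds, assuming per-adapter requests-per-minute figures are available from monitoring:

def select_tier(requests_per_minute: float) -> str:
    """Map an adapter's traffic level to a serving tier (thresholds illustrative)."""
    if requests_per_minute > 1000:
        return "tier_1_dedicated_merged"  # merge adapter into base weights
    if requests_per_minute > 10:
        return "tier_2_shared_lora"       # resident on shared LoRA instances
    return "tier_3_on_demand"             # cold-loaded when requested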
Runtime adapter switching transforms how you serve customizations. Instead of running N deployments for N customers, you run one deployment that switches adapters per request. The trade-off is added complexity in routing and caching, but the resource efficiency makes it worthwhile for multi-tenant scenarios.