Deploying and Serving Fine-tuned Models
A restaurant with one menu is simple. Print it, hand it out, everyone orders from the same options. A restaurant with personalized menus for each table needs a different system: track preferences, generate menus on demand, handle updates efficiently.
Serving fine-tuned models has the same fork. One customization? Merge it into the base model and serve like normal. Many customizations? Keep adapters separate and switch dynamically. The architecture follows the use case.
Single-Tenant: Merge and Serve
def single_tenant_deployment():
    """
    One fine-tuned model for your entire application
    """
    return {
        "approach": "Merge adapter weights into base model",
        "merge_process": """
            from peft import PeftModel
            from transformers import AutoModelForCausalLM

            # Load base model
            base_model = AutoModelForCausalLM.from_pretrained("base_model")

            # Load adapter
            peft_model = PeftModel.from_pretrained(base_model, "adapter_path")

            # Merge into base weights
            merged_model = peft_model.merge_and_unload()

            # Save merged model
            merged_model.save_pretrained("merged_model_path")
        """,
        "serving": """
            # Serve like any other model
            from vllm import LLM

            llm = LLM(model="merged_model_path")
            output = llm.generate(prompt)
        """,
        "advantages": [
            "Zero runtime overhead",
            "Same latency as base model",
            "Simple deployment",
            "No adapter management",
        ],
        "disadvantages": [
            "Can't switch adapters",
            "Need separate deployment per customization",
            "Storage cost for full model per variant",
        ],
    }
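Before shipping a merged model, it is worth confirming the merge did not change behavior. A minimal sanity check, assuming placeholder paths "base_model" and "adapter_path" and a representative prompt: run the adapter on the fly, then merge, and compare logits (tolerance depends on dtype).

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base = AutoModelForCausalLM.from_pretrained("base_model", torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained("base_model")
adapted = PeftModel.from_pretrained(base, "adapter_path")

inputs = tokenizer("Summarize our refund policy in one sentence.", return_tensors="pt")

with torch.no_grad():
    before = adapted(**inputs).logits      # adapter applied on the fly

merged = adapted.merge_and_unload()        # folds LoRA into the base weights

with torch.no_grad():
    after = merged(**inputs).logits        # merged weights, no adapter machinery

# The two should agree up to floating-point noise (looser tolerance for fp16)
assert torch.allclose(before, after, atol=1e-2)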
Multi-Tenant: Dynamic Adapter Switching
def multi_tenant_deployment():
    """
    Many customizations, one deployment
    """
    return {
        "approach": "Base model + switchable adapters",
        "architecture": """
            ┌──────────────────────────────────┐
            │       Base Model (frozen)        │
            │          Loaded once             │
            └──────────────────────────────────┘
                             │
              ┌──────────────┼──────────────┐
              │              │              │
              ▼              ▼              ▼
          Adapter A      Adapter B      Adapter C
         (Customer 1)   (Customer 2)   (Customer 3)
        """,
        "implementation": """
            # With vLLM LoRA support
            from vllm import LLM, SamplingParams
            from vllm.lora.request import LoRARequest

            llm = LLM(
                model="base_model",
                enable_lora=True,
                max_loras=10,       # Max concurrent adapters
                max_lora_rank=64,   # Max rank to support
            )

            # Request with specific adapter
            output = llm.generate(
                prompt,
                sampling_params=SamplingParams(max_tokens=100),
                lora_request=LoRARequest(
                    lora_name="customer_1",
                    lora_int_id=1,  # stable integer ID used for adapter caching
                    lora_path="/adapters/customer_1",
                ),
            )
        """,
        "advantages": [
            "Single deployment serves all customers",
            "Low storage per customization (~500MB)",
            "Easy to add new customizations",
            "Base model memory shared",
        ],
        "disadvantages": [
            "Small latency overhead (~10%)",
            "Adapter switching has latency (~10ms)",
            "More complex serving infrastructure",
            "Need adapter management system",
        ],
    }
Choosing the Architecture
def architecture_decision():
    return {
        "merge_when": [
            "One or few (<5) customizations",
            "Latency is critical",
            "Simple deployment preferred",
            "Each variant has dedicated resources",
        ],
        "dynamic_adapters_when": [
            "Many (>5) customizations",
            "Customizations added frequently",
            "Resource efficiency matters",
            "Multi-tenant SaaS product",
        ],
        "decision_matrix": """
            Customizations | Resources | Latency Req | Recommendation
            ---------------|-----------|-------------|-------------------------
            1-2            | Flexible  | Strict      | Merge
            1-2            | Limited   | Flexible    | Merge
            3-10           | Flexible  | Strict      | Merge (separate deploys)
            3-10           | Limited   | Flexible    | Dynamic adapters
            10+            | Any       | Any         | Dynamic adapters
        """,
    }
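If you want the matrix as a routing rule in code, a minimal sketch (the thresholds mirror the matrix above and are rough guidelines, not hard limits):

def choose_serving_architecture(
    num_customizations: int,
    latency_strict: bool,
    resources_limited: bool,
) -> str:
    """Rough encoding of the decision matrix above."""
    if num_customizations > 10:
        return "dynamic_adapters"
    if num_customizations <= 2:
        return "merge"
    # 3-10 customizations: depends on resources and latency requirements
    if resources_limited and not latency_strict:
        return "dynamic_adapters"
    return "merge (separate deploys)"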
Adapter Management System
import os
from collections import OrderedDict
from datetime import datetime
from typing import Optional


class AdapterManager:
    """
    Manage adapters for multi-tenant serving
    """

    def __init__(self, storage_path: str, cache_size: int = 10):
        self.storage_path = storage_path
        self.cache_size = cache_size
        self.adapter_cache = OrderedDict()  # adapter_id -> last use (LRU order)
        self.registry = {}                  # adapter_id -> metadata

    def validate_adapter(self, adapter_path: str) -> None:
        """Check that the adapter directory exists (simplified validation)."""
        if not os.path.isdir(adapter_path):
            raise ValueError(f"Adapter path does not exist: {adapter_path}")

    def register_adapter(self, adapter_id: str, adapter_path: str, metadata: dict):
        """Register a new adapter"""
        # Validate adapter
        self.validate_adapter(adapter_path)

        # Store metadata
        self.registry[adapter_id] = {
            "path": adapter_path,
            "created": datetime.now(),
            "version": metadata.get("version", "1.0"),
            "customer": metadata.get("customer"),
        }

    def get_adapter(self, adapter_id: str) -> str:
        """Get adapter path, tracking usage for cache management"""
        if adapter_id not in self.registry:
            raise ValueError(f"Unknown adapter: {adapter_id}")

        # Track usage: mark most-recently-used, evict least-recently-used
        self.adapter_cache[adapter_id] = datetime.now()
        self.adapter_cache.move_to_end(adapter_id)
        while len(self.adapter_cache) > self.cache_size:
            self.adapter_cache.popitem(last=False)

        return self.registry[adapter_id]["path"]

    def route_request(self, request: dict) -> Optional[str]:
        """Determine which adapter to use for a request"""
        customer_id = request.get("customer_id")
        task_type = request.get("task_type")

        # Customer-specific adapter
        if f"{customer_id}_adapter" in self.registry:
            return f"{customer_id}_adapter"

        # Task-specific default
        if f"default_{task_type}" in self.registry:
            return f"default_{task_type}"

        # No adapter (use base model)
        return None
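Wiring the manager into the vLLM path from earlier might look like the sketch below. The customer name, adapter IDs, and paths are illustrative; in production you would keep a persistent mapping from adapter ID to a stable integer, since vLLM caches adapters by lora_int_id.

from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

manager = AdapterManager(storage_path="/adapters", cache_size=10)
manager.register_adapter(
    "acme_adapter", "/adapters/acme", {"version": "1.2", "customer": "acme"}
)

llm = LLM(model="base_model", enable_lora=True, max_loras=10)

request = {"customer_id": "acme", "task_type": "support"}
adapter_id = manager.route_request(request)

lora_request = None
if adapter_id is not None:
    lora_request = LoRARequest(
        lora_name=adapter_id,
        lora_int_id=abs(hash(adapter_id)) % (2**31),  # stable within the process
        lora_path=manager.get_adapter(adapter_id),
    )

output = llm.generate(
    "How do I reset my password?",
    sampling_params=SamplingParams(max_tokens=100),
    lora_request=lora_request,  # None falls back to the base model
)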
S-LoRA for High-Throughput Multi-Tenant Serving
def slora_serving():
    """
    S-LoRA: Scalable Serving of LoRA Adapters
    """
    return {
        "what_it_is": """
            Research approach for efficient multi-adapter serving.
            Key idea: batch requests across different adapters.
        """,
        "how_it_works": {
            "unified_paging": "Manage adapter weights like KV cache",
            "heterogeneous_batching": "Same batch, different adapters",
            "custom_cuda_kernels": "Efficient adapter application",
        },
        "performance": {
            "throughput": "Near single-adapter performance",
            "adapters": "Thousands simultaneously",
            "switching": "<1ms overhead",
        },
        "practical_status": """
            S-LoRA concepts now in vLLM:
            - enable_lora=True
            - max_loras for concurrent adapters
            - Automatic batching across adapters

            For production:
            - Use vLLM with LoRA support
            - Configure max_loras based on expected concurrency
            - Monitor adapter cache hit rate
        """,
    }
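In practice, most multi-tenant deployments run vLLM's OpenAI-compatible server rather than the offline LLM class. A minimal sketch, assuming placeholder model and adapter paths; clients select an adapter by passing its registered name as the model field.

# Launch (shell):
#   vllm serve base_model --enable-lora \
#       --lora-modules acme=/adapters/acme beta=/adapters/beta \
#       --max-loras 10 --max-lora-rank 64

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.completions.create(
    model="acme",  # adapter name registered via --lora-modules
    prompt="Draft a reply to this support ticket: ...",
    max_tokens=100,
)
print(response.choices[0].text)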
Resource Planning
def resource_planning():
    """
    Plan GPU memory for adapter serving
    """
    return {
        "merged_model": {
            "formula": "base_model_memory + kv_cache",
            "example_70b": "140GB + 20GB = 160GB",
            "per_variant": "Full model memory each",
        },
        "dynamic_adapters": {
            "base": "Base model memory (shared)",
            "per_adapter": "r * hidden_dim * 2 * target_layers * dtype_bytes",
            "example_70b_r16": """
                16 * 8192 * 2 * 80 * 2 bytes = ~42MB per adapter
                (one target module per layer; targeting more modules scales this up)
                100 adapters = ~4.2GB
            """,
            "kv_cache": "Shared across requests",
        },
        "planning_formula": """
            Total GPU memory needed:
              = base_model
              + max_concurrent_adapters * adapter_size
              + kv_cache_for_batch_size

            Example (70B, 10 concurrent adapters, batch 32):
              = 140GB + 0.4GB + 20GB
              = 160.4GB
              = slightly over 2x 80GB GPUs; plan for 4x A100-80GB,
                or 2x H100-80GB with a smaller batch / KV cache budget
        """,
    }
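The same arithmetic as a small helper, with the 70B numbers from above as defaults. All figures are rough fp16 estimates and assume one LoRA target module per layer.

def estimate_gpu_memory_gb(
    params_billion: float = 70,
    kv_cache_gb: float = 20,
    num_adapters: int = 10,
    lora_rank: int = 16,
    hidden_dim: int = 8192,
    num_layers: int = 80,
    bytes_per_param: int = 2,   # fp16/bf16
) -> float:
    """Rough total GPU memory: base model + resident adapters + KV cache."""
    base_gb = params_billion * bytes_per_param           # 70B * 2 bytes = 140GB
    adapter_bytes = lora_rank * hidden_dim * 2 * num_layers * bytes_per_param
    adapters_gb = num_adapters * adapter_bytes / 1e9     # ~0.42GB for 10 adapters
    return base_gb + adapters_gb + kv_cache_gb

print(f"{estimate_gpu_memory_gb():.1f} GB")  # ~160.4 GB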
Monitoring Adapter Performance
import statistics
from datetime import datetime, timedelta


class AdapterMonitor:
    """
    Monitor fine-tuned model performance in production
    """

    def __init__(self):
        self.records = []  # one dict per tracked request

    def track_request(
        self,
        adapter_id: str,
        request: dict,
        response: dict,
        latency_ms: float,
    ):
        """Track individual request metrics"""
        self.records.append({
            "adapter_id": adapter_id,
            "latency_ms": latency_ms,
            "input_tokens": request["input_tokens"],
            "output_tokens": response["output_tokens"],
            "error": response.get("error"),
            "timestamp": datetime.now(),
        })

    def get_requests(self, adapter_id: str, window_hours: int) -> list:
        """Requests for one adapter within the time window"""
        cutoff = datetime.now() - timedelta(hours=window_hours)
        return [
            r for r in self.records
            if r["adapter_id"] == adapter_id and r["timestamp"] >= cutoff
        ]

    def adapter_health(self, adapter_id: str, window_hours: int = 1) -> dict:
        """Get health metrics for an adapter"""
        requests = self.get_requests(adapter_id, window_hours)
        if not requests:
            return {"request_count": 0}

        latencies = sorted(r["latency_ms"] for r in requests)
        return {
            "request_count": len(requests),
            "avg_latency_ms": statistics.mean(latencies),
            "p99_latency_ms": latencies[int(0.99 * (len(latencies) - 1))],
            "error_rate": sum(1 for r in requests if r.get("error")) / len(requests),
        }

    def compare_to_baseline(self, adapter_id: str) -> dict:
        """Compare adapter performance to base model"""
        adapter_metrics = self.adapter_health(adapter_id)
        base_metrics = self.adapter_health("base_model")

        return {
            "latency_overhead": (
                adapter_metrics["avg_latency_ms"] /
                base_metrics["avg_latency_ms"] - 1
            ),
            "error_rate_diff": (
                adapter_metrics["error_rate"] -
                base_metrics["error_rate"]
            ),
        }
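A sketch of how the monitor plugs into the request path. The timing wrapper and adapter ID are illustrative; it assumes the vLLM LLM object and LoRARequest from the multi-tenant example.

import time

monitor = AdapterMonitor()

def serve_with_monitoring(llm, prompt: str, adapter_id: str, lora_request=None):
    start = time.perf_counter()
    outputs = llm.generate(prompt, lora_request=lora_request)
    latency_ms = (time.perf_counter() - start) * 1000

    completion = outputs[0].outputs[0]
    monitor.track_request(
        adapter_id=adapter_id,
        request={"input_tokens": len(outputs[0].prompt_token_ids)},
        response={"output_tokens": len(completion.token_ids)},
        latency_ms=latency_ms,
    )
    return completion.text

# Later, compare an adapter's window against base-model traffic
print(monitor.compare_to_baseline("acme_adapter"))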
Deployment Checklist
def deployment_checklist():
    return {
        "single_tenant_merge": [
            "[ ] Adapter trained and validated",
            "[ ] Merged model created and tested",
            "[ ] Merged model passes base capability tests",
            "[ ] Deployment infrastructure sized for merged model",
            "[ ] Monitoring configured",
            "[ ] Rollback plan documented",
        ],
        "multi_tenant_dynamic": [
            "[ ] Base model deployed with LoRA support",
            "[ ] Adapter registry populated",
            "[ ] Routing logic implemented",
            "[ ] Adapter cache sized appropriately",
            "[ ] Per-adapter monitoring configured",
            "[ ] Adapter hot-reload tested",
            "[ ] Fallback to base model implemented",
            "[ ] Customer isolation verified",
        ],
    }
The serving architecture follows the customization pattern. Single customization means simple merged deployment. Many customizations mean shared infrastructure with dynamic switching. Choose based on your multi-tenancy requirements, then build the monitoring to match.