Running Multiple Customers on One GPU
Apartment buildings house many tenants with shared infrastructure. The plumbing, electrical, and structure are common. The living spaces are separate. Nobody wants to pay for an entire building when an apartment suffices. Good design provides isolation without duplication.
Multi-tenant LLM inference follows the same pattern: multiple customers share GPU resources without sharing data. The base model weights are common; the KV caches, prompts, and responses are isolated per tenant. The economics of shared infrastructure are what make the service viable.
The Multi-Tenant Architecture
def multi_tenant_architecture():
return {
"shared_resources": {
"model_weights": "One copy in GPU memory",
"inference_engine": "Single vLLM/TGI instance",
"compute_capacity": "Dynamically allocated across requests",
},
"isolated_resources": {
"kv_cache": "Per-request, never mixed",
"prompts": "Tenant-specific, not visible to others",
"responses": "Returned only to requesting tenant",
"adapters": "Per-tenant LoRA if customized",
},
"isolation_model": """
Tenant A request → Tenant A KV cache → Tenant A response
Tenant B request → Tenant B KV cache → Tenant B response
Both use same GPU, same model weights.
Zero data leakage between tenants.
""",
}
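To make the shared-versus-isolated split concrete, here is a toy sketch: one weights object serves every request while each request gets its own cache entry. The class and field names are illustrative only, not a vLLM or TGI API.

class ToySharedEngine:
    """Illustration only: one copy of weights, per-request private state."""
    def __init__(self, weights):
        self.weights = weights   # loaded once, shared by every tenant
        self.kv_pool = {}        # request_id -> that request's private KV state
    def start(self, request_id: str, tenant_id: str, prompt: str):
        # A fresh cache entry per request; never reused across requests or tenants
        self.kv_pool[request_id] = {"tenant": tenant_id, "prompt": prompt}
    def finish(self, request_id: str):
        # Freeing the entry returns its memory to the shared pool
        self.kv_pool.pop(request_id)

engine = ToySharedEngine(weights="single copy in GPU memory")
engine.start("req-a1", "tenant_a", "Summarize my contract")
engine.start("req-b1", "tenant_b", "Draft a support reply")
# Same weights object for both, separate per-request state
assert engine.kv_pool["req-a1"]["tenant"] != engine.kv_pool["req-b1"]["tenant"]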
Ensuring Isolation
import hashlib

class TenantIsolationManager:
    """
    Enforce strict isolation between tenants
    """
def __init__(self):
self.request_to_tenant = {}
self.tenant_to_kv_cache = {}
def allocate_request(self, request_id: str, tenant_id: str):
"""Track which tenant owns which request"""
self.request_to_tenant[request_id] = tenant_id
def get_kv_cache(self, request_id: str) -> KVCache:
"""Get KV cache for request, ensuring tenant isolation"""
tenant_id = self.request_to_tenant[request_id]
# Each request gets its own KV cache
# Never shared, even within same tenant
return KVCache(tenant_id=tenant_id, request_id=request_id)
    def return_response(self, request_id: str, tenant_id: str, response: str) -> str:
        """Return response only to the tenant that owns the request"""
        owner = self.request_to_tenant[request_id]
        # Verify the caller's tenant matches the request owner before returning anything
        if owner != tenant_id:
            raise PermissionError(f"request {request_id} does not belong to tenant {tenant_id}")
        # Log a response hash (never the content) for the audit trail
        self.log_access(owner, request_id,
                        response_hash=hashlib.sha256(response.encode()).hexdigest())
        return response
def cleanup_request(self, request_id: str):
"""Clean up resources after request completion"""
tenant_id = self.request_to_tenant.pop(request_id)
# KV cache automatically freed when request done
self.log_cleanup(tenant_id, request_id)
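A request's path through the manager, as a usage sketch (log_access and log_cleanup are assumed to be implemented elsewhere):

manager = TenantIsolationManager()

# Bind the request to its tenant at the entry point
manager.allocate_request("req-123", "tenant_a")

# The KV cache handed out is tagged with both tenant and request
kv = manager.get_kv_cache("req-123")

# The response is released only after the tenant binding is re-checked
manager.return_response("req-123", "tenant_a", "...generated text...")

# Cleanup drops the binding and logs it for the audit trail
manager.cleanup_request("req-123")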
Request Routing
class MultiTenantRouter:
"""
Route requests to appropriate resources
"""
def __init__(self, config: dict):
self.tenant_quotas = config["quotas"]
self.tenant_adapters = config.get("adapters", {})
self.priority_levels = config.get("priorities", {})
def route_request(self, request: dict) -> dict:
"""Route request with tenant-specific configuration"""
tenant_id = self.extract_tenant(request)
# Check quota
if not self.check_quota(tenant_id):
return {"status": "rate_limited", "retry_after": 60}
# Get tenant-specific config
routing_config = {
"tenant_id": tenant_id,
"priority": self.priority_levels.get(tenant_id, "standard"),
"adapter": self.tenant_adapters.get(tenant_id),
"max_tokens": self.tenant_quotas[tenant_id].get("max_tokens", 4096),
}
return routing_config
def check_quota(self, tenant_id: str) -> bool:
"""Check if tenant is within rate limits"""
quota = self.tenant_quotas.get(tenant_id, {})
current_usage = self.get_current_usage(tenant_id)
return (
current_usage["requests"] < quota.get("requests_per_minute", 100) and
current_usage["tokens"] < quota.get("tokens_per_minute", 100000)
)
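check_quota depends on a get_current_usage helper that is not shown. One plausible implementation is a sliding one-minute window over recent requests, sketched below; the in-memory deque is an assumption, and a production deployment would more likely use a shared store such as Redis.

import time
from collections import deque

class UsageTracker:
    """Sliding-window request and token counts per tenant (illustrative sketch)."""
    def __init__(self, window_seconds: int = 60):
        self.window = window_seconds
        self.events = {}  # tenant_id -> deque of (timestamp, tokens)
    def record(self, tenant_id: str, tokens: int):
        self.events.setdefault(tenant_id, deque()).append((time.time(), tokens))
    def get_current_usage(self, tenant_id: str) -> dict:
        events = self.events.get(tenant_id, deque())
        cutoff = time.time() - self.window
        # Drop events that have aged out of the window
        while events and events[0][0] < cutoff:
            events.popleft()
        return {
            "requests": len(events),
            "tokens": sum(tokens for _, tokens in events),
        }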
Fair Scheduling
class FairScheduler:
"""
Ensure fair resource distribution across tenants
"""
def __init__(self, config: dict):
self.tenant_weights = config.get("weights", {})
self.default_weight = 1.0
self.recent_usage = {} # tenant -> recent token count
def schedule_requests(self, pending_requests: list) -> list:
"""Order requests for fair processing"""
# Calculate fairness score for each request
scored = []
for request in pending_requests:
tenant = request["tenant_id"]
weight = self.tenant_weights.get(tenant, self.default_weight)
recent = self.recent_usage.get(tenant, 0)
# Lower score = higher priority
# Penalize tenants with high recent usage
fairness_score = recent / weight
scored.append((fairness_score, request))
# Sort by fairness score
scored.sort(key=lambda x: x[0])
return [request for _, request in scored]
def record_completion(self, tenant_id: str, tokens_used: int):
"""Record usage for fairness calculation"""
current = self.recent_usage.get(tenant_id, 0)
# Decay old usage, add new
self.recent_usage[tenant_id] = current * 0.9 + tokens_used
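For example, a tenant that has just burned through many tokens is scheduled behind a lighter user even when both have the same weight:

scheduler = FairScheduler({"weights": {"tenant_a": 1.0, "tenant_b": 1.0}})
scheduler.record_completion("tenant_a", 50_000)  # heavy recent usage
scheduler.record_completion("tenant_b", 2_000)   # light recent usage

pending = [{"tenant_id": "tenant_a"}, {"tenant_id": "tenant_b"}]
ordered = scheduler.schedule_requests(pending)
print([r["tenant_id"] for r in ordered])  # ['tenant_b', 'tenant_a']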
Resource Limits
def resource_limits_per_tenant():
return {
"configurable_limits": {
"max_concurrent_requests": "Prevent one tenant monopolizing",
"max_tokens_per_request": "Prevent memory exhaustion",
"max_context_length": "Tenant-specific context limits",
"requests_per_minute": "Rate limiting",
"tokens_per_minute": "Token-based rate limiting",
},
"implementation": """
class TenantLimits:
def __init__(self, tenant_id: str, config: dict):
self.tenant_id = tenant_id
self.max_concurrent = config.get("max_concurrent", 10)
self.max_tokens = config.get("max_tokens", 4096)
self.rpm = config.get("rpm", 100)
self.tpm = config.get("tpm", 100000)
self.current_concurrent = 0
def can_accept(self, request: dict) -> tuple[bool, str]:
if self.current_concurrent >= self.max_concurrent:
return False, "concurrent_limit"
if request.get("max_tokens", 0) > self.max_tokens:
return False, "token_limit"
if self.over_rate_limit():
return False, "rate_limit"
return True, "ok"
""",
}
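If the TenantLimits sketch above is lifted out of the string into real code, admission control at the entry point might look like this; over_rate_limit is assumed to be implemented, for instance with the sliding-window tracker shown earlier.

limits = TenantLimits("tenant_a", {"max_concurrent": 2, "max_tokens": 2048})

accepted, reason = limits.can_accept({"max_tokens": 8192})
if not accepted:
    print(f"rejected: {reason}")    # rejected: token_limit
else:
    limits.current_concurrent += 1  # decrement again when the request finishes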
Monitoring Per Tenant
import statistics
from datetime import datetime

class TenantMonitoring:
    """
    Track metrics per tenant
    """
    def __init__(self, metrics_backend):
        # Backend that stores per-request metric records and answers window queries
        self.metrics = metrics_backend
def record_request(self, tenant_id: str, metrics: dict):
"""Record request-level metrics"""
self.metrics.record({
"tenant_id": tenant_id,
"latency_ms": metrics["latency_ms"],
"input_tokens": metrics["input_tokens"],
"output_tokens": metrics["output_tokens"],
"timestamp": datetime.now(),
})
    def tenant_dashboard(self, tenant_id: str, window_hours: int = 1) -> dict:
        """Get dashboard for a specific tenant"""
        requests = self.get_requests(tenant_id, window_hours)
        if not requests:
            # Avoid division by zero when the tenant had no traffic in the window
            return {"total_requests": 0}
        return {
            "total_requests": len(requests),
            "total_tokens": sum(r["input_tokens"] + r["output_tokens"] for r in requests),
            "avg_latency_ms": statistics.mean(r["latency_ms"] for r in requests),
            "p99_latency_ms": self.percentile(requests, 99, "latency_ms"),
            "error_rate": sum(1 for r in requests if r.get("error")) / len(requests),
        }
def cross_tenant_comparison(self) -> dict:
"""Compare metrics across tenants (internal use)"""
return {
"resource_utilization_by_tenant": self.get_utilization_breakdown(),
"latency_by_tenant": self.get_latency_by_tenant(),
"quota_usage_by_tenant": self.get_quota_usage(),
}
Cost Attribution
from datetime import datetime

class TenantCostAttribution:
    """
    Track and attribute costs per tenant
    """
    def __init__(self, pricing: dict, usage_db):
        self.input_price = pricing["input_per_million"]
        self.output_price = pricing["output_per_million"]
        self.compute_price = pricing.get("compute_per_hour", 0)
        self.usage_db = usage_db  # store that persists billable usage records
def record_usage(self, tenant_id: str, request: dict):
"""Record usage for billing"""
cost = self.calculate_cost(request)
self.usage_db.insert({
"tenant_id": tenant_id,
"request_id": request["request_id"],
"input_tokens": request["input_tokens"],
"output_tokens": request["output_tokens"],
"cost": cost,
"timestamp": datetime.now(),
})
def calculate_cost(self, request: dict) -> float:
input_cost = request["input_tokens"] * self.input_price / 1_000_000
output_cost = request["output_tokens"] * self.output_price / 1_000_000
return input_cost + output_cost
    def monthly_invoice(self, tenant_id: str, period: str) -> dict:
        """Generate an invoice for the given billing period (e.g. "2025-12")"""
        usage = self.get_monthly_usage(tenant_id, period)
        return {
            "tenant_id": tenant_id,
            "period": period,
"total_requests": len(usage),
"total_input_tokens": sum(u["input_tokens"] for u in usage),
"total_output_tokens": sum(u["output_tokens"] for u in usage),
"total_cost": sum(u["cost"] for u in usage),
}
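A worked example with hypothetical prices of $0.50 per million input tokens and $1.50 per million output tokens: a request with 1,200 input and 300 output tokens costs $0.0006 + $0.00045 = $0.00105.

attribution = TenantCostAttribution(
    pricing={"input_per_million": 0.50, "output_per_million": 1.50},  # hypothetical prices
    usage_db=None,  # not needed for a one-off calculation
)
cost = attribution.calculate_cost({"input_tokens": 1_200, "output_tokens": 300})
print(f"${cost:.5f}")  # $0.00105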
Security Considerations
def multi_tenant_security():
return {
"data_isolation": {
"requirement": "No data leakage between tenants",
"implementation": [
"Per-request KV cache allocation",
"Request-tenant binding at entry",
"Audit logging of all access",
],
"verification": "Regular security audits",
},
"prompt_injection": {
"risk": "Malicious prompt affects other tenants",
"mitigation": [
"Each request is independent",
"No shared state between requests",
"System prompts are per-tenant",
],
},
"model_extraction": {
"risk": "Tenant extracts model through queries",
"mitigation": [
"Rate limiting",
"Query logging and analysis",
"Anomaly detection on query patterns",
],
},
"audit_trail": {
"what_to_log": [
"Request entry with tenant ID",
"Resources allocated",
"Response hash (not content)",
"Resource cleanup",
],
"retention": "Per compliance requirements",
},
}
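A minimal sketch of the audit trail described above, logging a content hash rather than the response itself; the JSON-lines file and field names are assumptions, not a prescribed format.

import hashlib
import json
from datetime import datetime, timezone

class AuditLog:
    """Append-only audit trail: response hashes, never response content."""
    def __init__(self, path: str):
        self.path = path
    def record(self, event: str, tenant_id: str, request_id: str, response: str | None = None):
        entry = {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "event": event,  # e.g. "request_entry", "response", "cleanup"
            "tenant_id": tenant_id,
            "request_id": request_id,
        }
        if response is not None:
            entry["response_sha256"] = hashlib.sha256(response.encode()).hexdigest()
        with open(self.path, "a") as f:
            f.write(json.dumps(entry) + "\n")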
Multi-tenant inference shares the expensive parts (model, GPUs) while keeping the sensitive parts separate (data, prompts, responses). The economics work because many tenants share fixed costs. The security works because isolation happens at the request level, not the hardware level.