Running Multiple Customers on One GPU

Apartment buildings house many tenants on shared infrastructure. The plumbing, wiring, and structure are common; the living spaces are separate. Nobody wants to pay for an entire building when an apartment suffices. Good design provides isolation without duplication.

Multi-tenant LLM inference follows the same model. Multiple customers share GPU resources without sharing data. The base model is common. The KV caches, prompts, and responses are isolated. The economics of shared infrastructure make the service viable.

The Multi-Tenant Architecture

def multi_tenant_architecture():
    return {
        "shared_resources": {
            "model_weights": "One copy in GPU memory",
            "inference_engine": "Single vLLM/TGI instance",
            "compute_capacity": "Dynamically allocated across requests",
        },

        "isolated_resources": {
            "kv_cache": "Per-request, never mixed",
            "prompts": "Tenant-specific, not visible to others",
            "responses": "Returned only to requesting tenant",
            "adapters": "Per-tenant LoRA if customized",
        },

        "isolation_model": """
            Tenant A request → Tenant A KV cache → Tenant A response
            Tenant B request → Tenant B KV cache → Tenant B response

            Both use same GPU, same model weights.
            Zero data leakage between tenants.
        """,
    }
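
Inference engines already implement this split. As a minimal sketch, assuming vLLM's offline API and a placeholder model name, two tenants' prompts can share one engine and one copy of the weights in a single batch, with responses routed back by position:

# Minimal sketch of weight sharing across tenants, assuming vLLM's
# offline LLM API; the model name and deliver() helper are placeholders.
from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct")  # one copy of the weights

requests = [
    {"tenant_id": "tenant_a", "prompt": "Summarize our Q3 report."},
    {"tenant_id": "tenant_b", "prompt": "Draft a support reply."},
]

# One batch, one engine; the engine allocates a separate KV cache per sequence.
outputs = llm.generate([r["prompt"] for r in requests],
                       SamplingParams(max_tokens=256))

# Outputs come back in request order, so routing by index keeps each
# response bound to the tenant that submitted it.
for request, output in zip(requests, outputs):
    deliver(request["tenant_id"], output.outputs[0].text)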

Ensuring Isolation

import hashlib
from dataclasses import dataclass


@dataclass
class KVCache:
    """Handle for a per-request KV cache allocation in the engine."""
    tenant_id: str
    request_id: str


class TenantIsolationManager:
    """
    Ensure strict isolation between tenants
    """

    def __init__(self):
        self.request_to_tenant = {}
        self.audit_log = []

    def allocate_request(self, request_id: str, tenant_id: str):
        """Track which tenant owns which request"""
        self.request_to_tenant[request_id] = tenant_id

    def get_kv_cache(self, request_id: str) -> KVCache:
        """Get KV cache for request, ensuring tenant isolation"""
        tenant_id = self.request_to_tenant[request_id]

        # Each request gets its own KV cache,
        # never shared, even within the same tenant
        return KVCache(tenant_id=tenant_id, request_id=request_id)

    def return_response(self, request_id: str, tenant_id: str, response: str) -> str:
        """Return response only to the tenant that owns the request"""
        owner = self.request_to_tenant[request_id]

        # Verify ownership before any data leaves the boundary
        if tenant_id != owner:
            raise PermissionError(f"{tenant_id} does not own {request_id}")

        # Log a content hash (not the content) for audit
        digest = hashlib.sha256(response.encode()).hexdigest()
        self.log_access(owner, request_id, response_hash=digest)

        return response

    def cleanup_request(self, request_id: str):
        """Clean up resources after request completion"""
        tenant_id = self.request_to_tenant.pop(request_id)
        # The engine frees the KV cache when the request completes
        self.log_cleanup(tenant_id, request_id)

    def log_access(self, tenant_id: str, request_id: str, response_hash: str):
        self.audit_log.append(("access", tenant_id, request_id, response_hash))

    def log_cleanup(self, tenant_id: str, request_id: str):
        self.audit_log.append(("cleanup", tenant_id, request_id))
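
A hypothetical lifecycle for a single request ties the pieces together: bind the request to its tenant at entry, allocate an isolated cache, verify ownership on the way out, and release on completion.

# Hypothetical end-to-end lifecycle for one request.
manager = TenantIsolationManager()

manager.allocate_request("req-123", "tenant_a")
cache = manager.get_kv_cache("req-123")   # isolated to this single request

# The owner gets the response; anyone else is stopped before data moves.
manager.return_response("req-123", "tenant_a", "Hello!")   # ok
try:
    manager.return_response("req-123", "tenant_b", "Hello!")
except PermissionError:
    pass  # tenant_b does not own req-123

manager.cleanup_request("req-123")        # cache freed, cleanup audited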

Request Routing

class MultiTenantRouter:
    """
    Route requests to appropriate resources
    """

    def __init__(self, config: dict):
        self.tenant_quotas = config["quotas"]
        self.tenant_adapters = config.get("adapters", {})
        self.priority_levels = config.get("priorities", {})
        self.usage = {}  # tenant -> counters for the current window

    def route_request(self, request: dict) -> dict:
        """Route request with tenant-specific configuration"""
        tenant_id = self.extract_tenant(request)

        # Check quota before doing any work
        if not self.check_quota(tenant_id):
            return {"status": "rate_limited", "retry_after": 60}

        # Get tenant-specific config
        return {
            "tenant_id": tenant_id,
            "priority": self.priority_levels.get(tenant_id, "standard"),
            "adapter": self.tenant_adapters.get(tenant_id),
            "max_tokens": self.tenant_quotas.get(tenant_id, {}).get("max_tokens", 4096),
        }

    def extract_tenant(self, request: dict) -> str:
        # In production, resolved from the API key; here it rides on the request
        return request["tenant_id"]

    def check_quota(self, tenant_id: str) -> bool:
        """Check if tenant is within rate limits"""
        quota = self.tenant_quotas.get(tenant_id, {})
        current_usage = self.get_current_usage(tenant_id)

        return (
            current_usage["requests"] < quota.get("requests_per_minute", 100) and
            current_usage["tokens"] < quota.get("tokens_per_minute", 100000)
        )

    def get_current_usage(self, tenant_id: str) -> dict:
        # Backed by a sliding-window counter in production
        return self.usage.get(tenant_id, {"requests": 0, "tokens": 0})
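
Wiring it up with a hypothetical config (the quota numbers and adapter path are placeholders):

router = MultiTenantRouter({
    "quotas": {
        "tenant_a": {"requests_per_minute": 600, "tokens_per_minute": 500_000},
        "tenant_b": {},  # falls back to the defaults
    },
    "adapters": {"tenant_a": "loras/tenant_a_v3"},  # placeholder path
    "priorities": {"tenant_a": "premium"},
})

config = router.route_request({"tenant_id": "tenant_a", "prompt": "..."})
# {'tenant_id': 'tenant_a', 'priority': 'premium',
#  'adapter': 'loras/tenant_a_v3', 'max_tokens': 4096}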

Fair Scheduling

class FairScheduler:
    """
    Ensure fair resource distribution across tenants
    """

    def __init__(self, config: dict):
        self.tenant_weights = config.get("weights", {})
        self.default_weight = 1.0
        self.recent_usage = {}  # tenant -> recent token count

    def schedule_requests(self, pending_requests: list) -> list:
        """Order requests for fair processing"""
        # Calculate fairness score for each request
        scored = []
        for request in pending_requests:
            tenant = request["tenant_id"]
            weight = self.tenant_weights.get(tenant, self.default_weight)
            recent = self.recent_usage.get(tenant, 0)

            # Lower score = higher priority
            # Penalize tenants with high recent usage
            fairness_score = recent / weight

            scored.append((fairness_score, request))

        # Sort by fairness score
        scored.sort(key=lambda x: x[0])

        return [request for _, request in scored]

    def record_completion(self, tenant_id: str, tokens_used: int):
        """Record usage for fairness calculation"""
        current = self.recent_usage.get(tenant_id, 0)
        # Decay old usage, add new
        self.recent_usage[tenant_id] = current * 0.9 + tokens_used
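
record_completion keeps an exponentially decayed running total, so recent heavy users sink in priority. Suppose tenant_a just burned 50,000 tokens and tenant_b has been idle; tenant_b's request goes first:

scheduler = FairScheduler({"weights": {"tenant_a": 1.0, "tenant_b": 1.0}})
scheduler.record_completion("tenant_a", tokens_used=50_000)

pending = [
    {"tenant_id": "tenant_a", "prompt": "..."},
    {"tenant_id": "tenant_b", "prompt": "..."},
]
ordered = scheduler.schedule_requests(pending)
# tenant_b first: its score is 0 / 1.0 = 0, versus 50_000 / 1.0 for tenant_a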

Resource Limits

def resource_limits_per_tenant():
    return {
        "configurable_limits": {
            "max_concurrent_requests": "Prevent one tenant monopolizing",
            "max_tokens_per_request": "Prevent memory exhaustion",
            "max_context_length": "Tenant-specific context limits",
            "requests_per_minute": "Rate limiting",
            "tokens_per_minute": "Token-based rate limiting",
        },

        "implementation": """
            class TenantLimits:
                def __init__(self, tenant_id: str, config: dict):
                    self.tenant_id = tenant_id
                    self.max_concurrent = config.get("max_concurrent", 10)
                    self.max_tokens = config.get("max_tokens", 4096)
                    self.rpm = config.get("rpm", 100)
                    self.tpm = config.get("tpm", 100000)
                    self.current_concurrent = 0

                def can_accept(self, request: dict) -> tuple[bool, str]:
                    if self.current_concurrent >= self.max_concurrent:
                        return False, "concurrent_limit"

                    if request.get("max_tokens", 0) > self.max_tokens:
                        return False, "token_limit"

                    if self.over_rate_limit():  # sliding-window check of rpm/tpm
                        return False, "rate_limit"

                    return True, "ok"
        """,
    }
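
A hypothetical admission gate puts the sketch to work, assuming the TenantLimits class above has been defined: one instance per tenant, consulted before a request joins the batch and released when it finishes.

# Hypothetical admission gate built on the TenantLimits sketch above.
class AdmissionGate:
    def __init__(self, tenant_configs: dict):
        self.limits = {
            tenant_id: TenantLimits(tenant_id, cfg)
            for tenant_id, cfg in tenant_configs.items()
        }

    def admit(self, tenant_id: str, request: dict) -> tuple[bool, str]:
        limits = self.limits.get(tenant_id)
        if limits is None:
            return False, "unknown_tenant"
        ok, reason = limits.can_accept(request)
        if ok:
            limits.current_concurrent += 1  # reserve a slot
        return ok, reason

    def release(self, tenant_id: str):
        """Call when the request completes, success or failure"""
        self.limits[tenant_id].current_concurrent -= 1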

Monitoring Per Tenant

import statistics
from datetime import datetime


class TenantMonitoring:
    """
    Track metrics per tenant
    """

    def __init__(self, metrics_store):
        self.metrics = metrics_store  # e.g. a time-series database client

    def record_request(self, tenant_id: str, metrics: dict):
        """Record request-level metrics"""
        self.metrics.record({
            "tenant_id": tenant_id,
            "latency_ms": metrics["latency_ms"],
            "input_tokens": metrics["input_tokens"],
            "output_tokens": metrics["output_tokens"],
            "timestamp": datetime.now(),
        })

    def tenant_dashboard(self, tenant_id: str, window_hours: int = 1) -> dict:
        """Get dashboard for specific tenant"""
        requests = self.get_requests(tenant_id, window_hours)  # queries the store
        if not requests:
            return {"total_requests": 0}

        return {
            "total_requests": len(requests),
            "total_tokens": sum(r["input_tokens"] + r["output_tokens"] for r in requests),
            "avg_latency_ms": statistics.mean(r["latency_ms"] for r in requests),
            "p99_latency_ms": self.percentile(requests, 99, "latency_ms"),
            "error_rate": sum(1 for r in requests if r.get("error")) / len(requests),
        }

    def percentile(self, requests: list, p: int, field: str) -> float:
        """Nearest-rank percentile over one metric field"""
        values = sorted(r[field] for r in requests)
        index = min(len(values) - 1, int(len(values) * p / 100))
        return values[index]

    def cross_tenant_comparison(self) -> dict:
        """Compare metrics across tenants (internal use only)"""
        return {
            "resource_utilization_by_tenant": self.get_utilization_breakdown(),
            "latency_by_tenant": self.get_latency_by_tenant(),
            "quota_usage_by_tenant": self.get_quota_usage(),
        }
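
For illustration, a hypothetical in-memory store can stand in for the metrics backend:

# Hypothetical in-memory stand-in for the metrics backend.
class ListStore:
    def __init__(self):
        self.rows = []

    def record(self, row: dict):
        self.rows.append(row)

monitoring = TenantMonitoring(ListStore())
monitoring.record_request("tenant_a", {
    "latency_ms": 420, "input_tokens": 1_200, "output_tokens": 350,
})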

Cost Attribution

class TenantCostAttribution:
    """
    Track and attribute costs per tenant
    """

    def __init__(self, pricing: dict, usage_db):
        self.input_price = pricing["input_per_million"]
        self.output_price = pricing["output_per_million"]
        self.compute_price = pricing.get("compute_per_hour", 0)
        self.usage_db = usage_db  # e.g. a billing table client

    def record_usage(self, tenant_id: str, request: dict):
        """Record usage for billing"""
        cost = self.calculate_cost(request)

        self.usage_db.insert({
            "tenant_id": tenant_id,
            "request_id": request["request_id"],
            "input_tokens": request["input_tokens"],
            "output_tokens": request["output_tokens"],
            "cost": cost,
            "timestamp": datetime.now(),
        })

    def calculate_cost(self, request: dict) -> float:
        """Token-based pricing; rates are per million tokens"""
        input_cost = request["input_tokens"] * self.input_price / 1_000_000
        output_cost = request["output_tokens"] * self.output_price / 1_000_000
        return input_cost + output_cost

    def monthly_invoice(self, tenant_id: str, period: str) -> dict:
        """Generate an invoice for a billing period, e.g. "2025-12" """
        usage = self.get_monthly_usage(tenant_id, period)  # queries the usage db

        return {
            "tenant_id": tenant_id,
            "period": period,
            "total_requests": len(usage),
            "total_input_tokens": sum(u["input_tokens"] for u in usage),
            "total_output_tokens": sum(u["output_tokens"] for u in usage),
            "total_cost": sum(u["cost"] for u in usage),
        }
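
With placeholder rates of $0.50 per million input tokens and $1.50 per million output tokens, the arithmetic is direct:

pricing = {"input_per_million": 0.50, "output_per_million": 1.50}  # placeholder rates
billing = TenantCostAttribution(pricing, usage_db=None)  # db unused for this calculation

cost = billing.calculate_cost({"input_tokens": 120_000, "output_tokens": 8_000})
# 120_000 * 0.50 / 1e6 + 8_000 * 1.50 / 1e6 = 0.060 + 0.012 = $0.072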

Security Considerations

def multi_tenant_security():
    return {
        "data_isolation": {
            "requirement": "No data leakage between tenants",
            "implementation": [
                "Per-request KV cache allocation",
                "Request-tenant binding at entry",
                "Audit logging of all access",
            ],
            "verification": "Regular security audits",
        },

        "prompt_injection": {
            "risk": "Malicious prompt affects other tenants",
            "mitigation": [
                "Each request is independent",
                "No shared state between requests",
                "System prompts are per-tenant",
            ],
        },

        "model_extraction": {
            "risk": "Tenant extracts model through queries",
            "mitigation": [
                "Rate limiting",
                "Query logging and analysis",
                "Anomaly detection on query patterns",
            ],
        },

        "audit_trail": {
            "what_to_log": [
                "Request entry with tenant ID",
                "Resources allocated",
                "Response hash (not content)",
                "Resource cleanup",
            ],
            "retention": "Per compliance requirements",
        },
    }
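
Of these, the audit trail is the cheapest to implement well. A minimal sketch, assuming an append-only JSONL log and hashed responses so auditors can correlate events without ever reading content:

# Minimal audit-trail sketch; the event names are illustrative.
import hashlib
import json
import time

def audit_event(log_file, event: str, tenant_id: str, request_id: str,
                response: str | None = None):
    """Append one audit record; response content is hashed, never stored"""
    record = {
        "ts": time.time(),
        "event": event,  # "entry", "allocate", "respond", or "cleanup"
        "tenant_id": tenant_id,
        "request_id": request_id,
    }
    if response is not None:
        record["response_sha256"] = hashlib.sha256(response.encode()).hexdigest()
    log_file.write(json.dumps(record) + "\n")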

Multi-tenant inference shares the expensive parts (model, GPUs) while keeping the sensitive parts separate (data, prompts, responses). The economics work because many tenants share fixed costs. The security works because isolation happens at the request level, not the hardware level.