Back to Blog
Using Multiple Providers to Cut Costs
Savvy travelers check multiple airlines for the same route. Different carriers, same destination, wildly different prices. The smart move is flexibility.
LLM providers work the same way. Llama-70B runs on Together, Anyscale, Replicate, Fireworks, and more. Same model, different pricing, different performance characteristics.
The Provider Landscape
class ProviderComparison:
    """Reference table comparing places where Llama-70B can be served.

    ``providers`` maps a provider key to a small record holding:

    * ``llama_70b_per_million`` - approximate price per million tokens
      (presumably USD; the figures are illustrative and go stale quickly),
    * ``strengths`` - short selling points as a list of strings,
    * ``latency`` - a coarse, qualitative latency label,
    * ``caveat`` - only present for ``self_hosted``.
    """

    # Approximate pricing (changes frequently)
    providers = dict(
        together=dict(
            llama_70b_per_million=0.90,
            strengths=["Competitive pricing", "Good throughput"],
            latency="Medium",
        ),
        fireworks=dict(
            llama_70b_per_million=0.90,
            strengths=["Optimized serving", "Low latency"],
            latency="Low",
        ),
        replicate=dict(
            llama_70b_per_million=1.50,
            strengths=["Easy to use", "Reliable"],
            latency="Medium",
        ),
        aws_bedrock=dict(
            llama_70b_per_million=2.00,
            strengths=["Enterprise integration", "Compliance"],
            latency="Medium",
        ),
        # Cheapest per token, but the price excludes the people running it.
        self_hosted=dict(
            llama_70b_per_million=0.30,
            strengths=["Full control", "Best unit economics"],
            latency="Controllable",
            caveat="Add engineering overhead",
        ),
    )
The Multi-Provider Router
import random
from typing import Optional
class MultiProviderRouter:
    """Pick one of three provider slots for each request.

    The slots (``primary``, ``secondary``, ``fallback``) are stored in
    priority order in ``self.providers``. ``self.health_status`` holds one
    boolean per slot; nothing in this class updates it, so callers (e.g. a
    health checker) are expected to flip the flags.
    """

    def __init__(self):
        self.providers = {
            "primary": {
                "name": "fireworks",
                "cost_per_million": 0.90,
                "latency_p99_ms": 800,
                "reliability": 0.999,
            },
            "secondary": {
                "name": "together",
                "cost_per_million": 0.90,
                "latency_p99_ms": 1000,
                "reliability": 0.998,
            },
            "fallback": {
                "name": "replicate",
                "cost_per_million": 1.50,
                "latency_p99_ms": 1200,
                "reliability": 0.9999,
            },
        }
        # Every slot starts out healthy.
        self.health_status = {p: True for p in self.providers}

    def route(self, request: dict) -> str:
        """Return the slot key (``"primary"``, ...) that should serve *request*.

        Args:
            request: Request metadata. Only the optional truthy flags
                ``latency_critical`` and ``batch_job`` are consulted.

        Returns:
            The key of the chosen slot in ``self.providers``.

        Raises:
            RuntimeError: If the primary is down and no other slot is healthy.
        """
        # Priority 1: Health
        if not self.health_status["primary"]:
            return self._failover_route(request)
        # Priority 2: Latency-critical
        if request.get("latency_critical"):
            return "primary"  # Fastest healthy provider
        # Priority 3: Cost optimization
        if request.get("batch_job"):
            return self._cheapest_healthy()
        return "primary"

    def _failover_route(self, request: dict) -> str:
        """Choose among the remaining healthy slots when the primary is down.

        The same priorities as ``route`` apply: latency-critical requests get
        the healthy slot with the lowest p99 latency, batch jobs get the
        cheapest healthy slot, and everything else gets the first healthy slot
        in priority order.

        Raises:
            RuntimeError: If no slot is currently marked healthy.
        """
        healthy = [p for p, status in self.health_status.items() if status]
        if not healthy:
            raise RuntimeError("No healthy providers available")
        if request.get("latency_critical"):
            return min(healthy, key=lambda p: self.providers[p]["latency_p99_ms"])
        if request.get("batch_job"):
            return self._cheapest_healthy()
        # health_status preserves the priority order of self.providers.
        return healthy[0]

    def _cheapest_healthy(self) -> str:
        """Return the healthy slot with the lowest ``cost_per_million``.

        Ties go to the higher-priority slot because ``min`` keeps the first
        minimum it sees. Raises ``ValueError`` when no slot is healthy.
        """
        healthy = [p for p, status in self.health_status.items() if status]
        return min(healthy, key=lambda p: self.providers[p]["cost_per_million"])
Cost-Based Routing Strategy
class CostOptimizedRouter:
    """Route to the cheapest provider, throttling once the budget runs low."""

    def __init__(self, monthly_budget: float):
        # Running spend starts at zero; this class never updates it itself,
        # so whoever records usage must bump the counter.
        self.spend_this_month = 0
        self.budget = monthly_budget

    def route_for_cost(self, request: dict, available_providers: list) -> str:
        """Return the provider name to use, or ``"queue_for_later"``.

        While spend is below 80% of the monthly budget every request goes to
        the cheapest entry of *available_providers*. Past that point only
        requests whose ``user_tier`` is ``"premium"`` are still served (also
        by the cheapest entry); all others are deferred.

        Args:
            request: Request metadata; only ``user_tier`` is read.
            available_providers: Records with ``name`` and
                ``cost_per_million`` keys. The list is not modified.

        Returns:
            The ``name`` of the cheapest provider, or the sentinel string
            ``"queue_for_later"``.
        """
        # Stable sort on a copy: equal prices keep their incoming order.
        ranked = list(available_providers)
        ranked.sort(key=lambda entry: entry["cost_per_million"])

        has_headroom = self.spend_this_month < self.budget * 0.8
        if has_headroom or request.get("user_tier") == "premium":
            return ranked[0]["name"]

        # Budget nearly exhausted and not a premium caller: defer the work.
        return "queue_for_later"
Latency vs Cost Tradeoff
def provider_selection_matrix():
    """Map each workload type to its priority and suggested provider.

    Every entry records what the workload optimises for (``priority``), the
    provider recommended for it (``provider``), and the trade-off it is
    willing to accept (``accept_cost`` or ``accept_latency``).
    """
    matrix = {}

    # Interactive traffic: pay more to get the fastest responses.
    matrix["real_time_chat"] = {
        "priority": "latency",
        "provider": "fireworks",
        "accept_cost": "Up to 2x for speed",
    }

    # Offline work: a few seconds of latency is an acceptable price for savings.
    matrix["batch_summarization"] = {
        "priority": "cost",
        "provider": "together",
        "accept_latency": "2-3 seconds fine",
    }

    # Contracted customers: reliability outranks price.
    matrix["enterprise_api"] = {
        "priority": "reliability",
        "provider": "aws_bedrock",
        "accept_cost": "Premium for SLA",
    }

    # In-house tooling: the most latency-tolerant workload of the four.
    matrix["internal_tools"] = {
        "priority": "cost",
        "provider": "self_hosted",
        "accept_latency": "5 seconds fine",
    }

    return matrix
Failover Logic
class FailoverRouter:
    """Try providers in priority order, skipping any whose breaker is open.

    Breaker state lives in ``self.circuit_breakers`` as plain dicts with the
    keys ``failures``, ``open`` and ``last_failure``.

    NOTE(review): ``_call_provider``, ``ProviderError`` and
    ``AllProvidersFailedError`` are not defined in this class; they are
    assumed to be supplied elsewhere (a subclass or the enclosing module).
    """

    def __init__(self, failure_threshold: int = 5, reset_seconds: float = 60):
        """Create a router with empty breaker state.

        Args:
            failure_threshold: A breaker opens once its failure count
                *exceeds* this number.
            reset_seconds: How long an opened breaker stays open.
        """
        self.circuit_breakers = {}  # provider -> breaker state dict
        self.failure_threshold = failure_threshold
        self.reset_seconds = reset_seconds
        # Strong references to pending reset tasks. The event loop only keeps
        # weak references, so an unreferenced task may be garbage-collected
        # before it ever runs.
        self._reset_tasks = set()

    async def call_with_failover(self, request: dict) -> dict:
        """Send *request* to the first provider that answers successfully.

        Returns:
            Whatever ``_call_provider`` returns for the first success.

        Raises:
            AllProvidersFailedError: Every provider was skipped or failed.
        """
        providers = ["primary", "secondary", "fallback"]
        for provider in providers:
            if self.circuit_breakers.get(provider, {}).get("open"):
                continue  # Skip unhealthy provider
            try:
                return await self._call_provider(provider, request)
            except ProviderError:
                self._record_failure(provider)
        # All providers failed
        raise AllProvidersFailedError()

    def _record_failure(self, provider: str):
        """Count a failure for *provider* and open its breaker if needed.

        Must be called from within a running event loop once the threshold
        can be crossed, because opening the breaker schedules an asyncio task.
        """
        import asyncio
        import time

        breaker = self.circuit_breakers.setdefault(provider, {
            "failures": 0,
            "open": False,
            "last_failure": None,
        })
        breaker["failures"] += 1
        breaker["last_failure"] = time.time()
        # Only the closed -> open transition schedules a reset; otherwise
        # late failures from in-flight calls would pile up duplicate timers.
        if breaker["failures"] > self.failure_threshold and not breaker["open"]:
            breaker["open"] = True
            task = asyncio.create_task(
                self._reset_after(provider, self.reset_seconds)
            )
            self._reset_tasks.add(task)
            task.add_done_callback(self._reset_tasks.discard)

    async def _reset_after(self, provider: str, delay_seconds: float):
        """Close *provider*'s breaker and zero its count after a delay."""
        import asyncio

        await asyncio.sleep(delay_seconds)
        breaker = self.circuit_breakers.get(provider)
        if breaker is not None:
            breaker["open"] = False
            breaker["failures"] = 0
The Multi-Provider Bill
def monthly_cost_comparison(total_tokens: int = 100_000_000):
    """Compare one expensive provider against a routed mix of providers.

    The single-provider case sends all traffic to a provider priced at 1.50
    per million tokens. The multi-provider case splits traffic 60% batch
    (0.90), 30% realtime (0.90) and 10% premium (1.50).

    Args:
        total_tokens: Monthly token volume. Defaults to the 100M-token
            example, which yields 150 vs 96 and a saving of 54 (36%).

    Returns:
        A dict with ``single``, ``multi`` and ``savings``, each rounded to
        two decimals. All three are derived from the prices above rather
        than hard-coded, so they stay consistent if a price changes.
    """
    single_provider = {
        "provider": "replicate",
        "cost": round(total_tokens * 1.50 / 1_000_000, 2),
    }

    # (label, share of traffic in percent, price per million tokens)
    traffic_mix = (
        ("batch_60pct", 60, 0.90),
        ("realtime_30pct", 30, 0.90),
        ("premium_10pct", 10, 1.50),
    )
    multi_provider = {
        label: total_tokens * share_pct / 100 * price / 1_000_000
        for label, share_pct, price in traffic_mix
    }
    # Summed before "total" is added, so the total is not counted twice.
    multi_provider["total"] = round(sum(multi_provider.values()), 2)

    return {
        "single": single_provider["cost"],
        "multi": multi_provider["total"],
        "savings": round(single_provider["cost"] - multi_provider["total"], 2),
    }
Implementation Checklist
def multi_provider_checklist() -> list:
    """Return the items to build before going multi-provider.

    A new list is created on every call, so callers may modify the result
    freely.
    """
    items = (
        # Foundations: one interface, and knowing who is up.
        "Unified API abstraction layer",
        "Health checks for each provider",
        "Circuit breaker per provider",
        # Observability per provider.
        "Cost tracking per provider",
        "Latency monitoring per provider",
        # Control: automatic and manual.
        "Automatic failover logic",
        "Manual override capability",
        # Ongoing hygiene.
        "Alerts on cost anomalies",
        "Regular price comparison updates",
    )
    return list(items)
The best provider changes over time. Prices shift, performance varies, new options emerge. Building multi-provider support isn't just about today's savings—it's about future flexibility.