Back to Blog
Managing Model Versions Without Downtime
Library APIs evolve but old versions stay available. Breaking changes get new major versions. Clients migrate when ready, not when forced. Software has solved this problem.
Model versioning needs similar discipline. "The model changed" shouldn't surprise production systems. Versions should be explicit, immutable, and controlled.
The Versioning Problem
def versioning_problems():
    """Catalog the failure modes caused by undisciplined model versioning.

    Returns a dict keyed by failure mode, each mapping to its problem
    statement, observable symptom, and root cause.
    """
    silent_updates = {
        "problem": "Model updated, behavior changes, no warning",
        "symptom": "Production behavior differs from testing",
        "cause": "Using 'latest' in production",
    }
    no_rollback = {
        "problem": "Old version overwritten, can't go back",
        "symptom": "Stuck with regression",
        "cause": "Mutable model references",
    }
    drift = {
        "problem": "Different environments have different versions",
        "symptom": "Works in staging, fails in prod",
        "cause": "No version pinning",
    }
    return {
        "silent_updates": silent_updates,
        "no_rollback": no_rollback,
        "drift": drift,
    }
The Version Schema
class ModelVersion:
    """
    Explicit, immutable model version descriptor.

    The docstring previously promised immutability but the attributes were
    plain mutable fields; attribute writes are now rejected so an instance
    can safely serve as a dict/set key or an audit record. Equality and
    hashing are defined over all four fields.
    """

    # __slots__ drops the per-instance __dict__ and fixes the attribute set.
    __slots__ = ("model_id", "version", "created_at", "config_hash")

    def __init__(
        self,
        model_id: str,
        version: str,
        created_at: str,
        config_hash: str,
    ):
        # object.__setattr__ bypasses the immutability guard below for
        # the one-time initial assignment.
        object.__setattr__(self, "model_id", model_id)        # e.g., "llama-70b-chat"
        object.__setattr__(self, "version", version)          # e.g., "v2.1.3" or date-based "20250809"
        object.__setattr__(self, "created_at", created_at)    # creation timestamp string
        object.__setattr__(self, "config_hash", config_hash)  # hash of all config for reproducibility

    def __setattr__(self, name: str, value) -> None:
        """Reject mutation: a version is frozen once created."""
        raise AttributeError(f"{type(self).__name__} is immutable")

    def __repr__(self) -> str:
        return f"{type(self).__name__}({self.full_id!r})"

    def __eq__(self, other) -> bool:
        if not isinstance(other, ModelVersion):
            return NotImplemented
        return (
            (self.model_id, self.version, self.created_at, self.config_hash)
            == (other.model_id, other.version, other.created_at, other.config_hash)
        )

    def __hash__(self) -> int:
        # Paired with __eq__ so equal versions hash equally.
        return hash((self.model_id, self.version, self.created_at, self.config_hash))

    @property
    def full_id(self) -> str:
        """Globally unique, immutable identifier, e.g. 'llama-70b-chat:v2.1.3'."""
        return f"{self.model_id}:{self.version}"

# Example versions:
# llama-70b-chat:v2.1.3
# llama-70b-chat:20250809-quantized
# internal-tuned-v1:exp-42-final
Never Use "Latest" in Production
def latest_considered_harmful():
    """Contrast pinned model versions with floating 'latest' references.

    Returns a dict with an anti-pattern snippet, the corrected snippet,
    and the narrow cases where 'latest' remains acceptable.
    """
    bad = """
    # Production code
    model = load_model("llama-70b-chat:latest")
    # Problems:
    # - Different behavior on different days
    # - Can't reproduce issues
    # - No audit trail
    # - Surprise regressions
    """
    good = """
    # Production code
    MODEL_VERSION = "llama-70b-chat:v2.1.3"
    model = load_model(MODEL_VERSION)
    # Benefits:
    # - Reproducible behavior
    # - Clear rollback target
    # - Audit trail
    # - Controlled updates
    """
    exception = """
    # "latest" is OK for:
    # - Local development (convenience)
    # - Experiments (intentionally want newest)
    # Never in production configs
    """
    return {"bad": bad, "good": good, "exception": exception}
Version Lifecycle
def version_lifecycle():
    """Describe each lifecycle stage a model version moves through.

    Returns a dict of per-stage metadata plus the allowed transitions
    between stages.
    """
    # (stage, status, usage, example) rows; expanded into dicts below.
    rows = [
        ("development", "Active development, unstable",
         "Internal testing only", "llama-70b-chat:dev-abc123"),
        ("staging", "Feature complete, testing",
         "Staging environment, internal users", "llama-70b-chat:rc-v2.2.0"),
        ("production", "Released, stable",
         "Production traffic", "llama-70b-chat:v2.1.3"),
        ("deprecated", "Superseded, migration encouraged",
         "Existing clients, no new adoption", "llama-70b-chat:v1.9.0"),
        ("retired", "End of life",
         "None, will return error", "llama-70b-chat:v1.0.0"),
    ]
    stages = {
        name: {"status": status, "usage": usage, "example": example}
        for name, status, usage, example in rows
    }
    transitions = """
    dev -> staging: Feature complete
    staging -> production: QA passed, canary complete
    production -> deprecated: New version released
    deprecated -> retired: After deprecation period (30-90 days)
    """
    return {"stages": stages, "transitions": transitions}
Zero-Downtime Updates
class ZeroDowntimeUpdate:
    """Strategies for swapping model versions without interrupting service."""

    def rolling_update(self):
        """Incrementally replace replicas; orchestrators automate this."""
        plan = """
        1. Deploy new version to subset of replicas
        2. Health check passes
        3. Shift traffic to new replicas
        4. Repeat until all replicas updated
        5. Remove old replicas
        # Kubernetes handles this automatically:
        # kubectl set image deployment/llm-serving model=model:v2.1.3
        """
        return plan

    def blue_green(self):
        """Run two complete environments and flip traffic between them."""
        plan = """
        1. Deploy entirely new "green" environment
        2. Test green environment
        3. Switch traffic from "blue" to "green"
        4. Keep blue as rollback target
        5. Decommission blue after confidence period
        # Instant rollback: switch traffic back to blue
        """
        return plan

    def model_hot_reload(self):
        """Swap the in-memory model atomically while requests keep flowing."""
        plan = """
        # If supported by serving framework
        async def hot_reload_model(new_version: str):
            # Load new model while old continues serving
            new_model = await load_model(new_version)
            # Atomic swap
            with self.model_lock:
                old_model = self.model
                self.model = new_model
            # Cleanup old model after drain
            await asyncio.sleep(60)  # Wait for in-flight requests
            await old_model.unload()
        """
        return plan
Version Pinning in Configuration
def version_pinning():
    """Show how pinned versions are expressed in config files and code.

    Returns a dict with a YAML config example and a code-constants example.
    """
    environment_config = """
    # config/production.yaml
    models:
      chat:
        model_id: llama-70b-chat
        version: v2.1.3  # PINNED, not "latest"
      embedding:
        model_id: bge-large
        version: v1.5.0
    # Updates happen by:
    # 1. Change version in config
    # 2. Deploy config change
    # 3. Automated rollout with canary
    """
    code_constants = """
    # constants.py
    class ModelVersions:
        CHAT_MODEL = "llama-70b-chat:v2.1.3"
        EMBED_MODEL = "bge-large:v1.5.0"
    # Version changes require code deploy
    # This is intentional: version bumps should be deliberate
    """
    return {
        "environment_config": environment_config,
        "code_constants": code_constants,
    }
Version Tracking
class VersionRegistry:
    """
    Track which model versions exist and what lifecycle status each holds.

    NOTE(review): `self.db` is created elsewhere; it appears to expose
    `insert(dict)` and `query(sql, params)` — confirm against the actual
    database wrapper.
    """

    def register_version(self, version: "ModelVersion", status: str):
        """Persist a new model version together with its lifecycle status."""
        record = {
            "full_id": version.full_id,
            "model_id": version.model_id,
            "version": version.version,
            "status": status,
            "created_at": version.created_at,
            "config_hash": version.config_hash,
        }
        self.db.insert(record)

    def get_production_version(self, model_id: str) -> str:
        """Return the newest version of `model_id` marked 'production'."""
        sql = (
            "SELECT full_id FROM versions "
            "WHERE model_id = ? AND status = 'production' "
            "ORDER BY created_at DESC LIMIT 1"
        )
        return self.db.query(sql, [model_id])

    def list_versions(self, model_id: str) -> list:
        """Return every recorded version of `model_id`, newest first."""
        sql = "SELECT * FROM versions WHERE model_id = ? ORDER BY created_at DESC"
        return self.db.query(sql, [model_id])
Audit Trail
def audit_requirements():
    """Spell out what a version audit trail must capture and why it pays off."""
    what_to_log = [
        "Model version serving each request",
        "Version changes (who, when, why)",
        "Rollbacks (trigger, time, outcome)",
        "Deprecations and retirements",
    ]
    why_it_matters = [
        "Debug issues: 'What version was serving at 2pm?'",
        "Compliance: Prove what model made a decision",
        "Rollback: Know what 'previous' version was",
        "Learning: Track which versions had issues",
    ]
    implementation = """
    # Log version with each request
    logger.info("Request handled",
        request_id=req.id,
        model_version="llama-70b-chat:v2.1.3",
        latency_ms=latency,
    )
    """
    return {
        "what_to_log": what_to_log,
        "why_it_matters": why_it_matters,
        "implementation": implementation,
    }
Versioning feels like overhead until you need to roll back at 2am. Then it's the difference between "deploy previous version" and "figure out what previous means while the site is down."