Back to Blog

Managing Model Versions Without Downtime

Library APIs evolve, but old versions stay available. Breaking changes get new major versions. Clients migrate when ready, not when forced. Software engineering has already solved this problem.

Model versioning needs similar discipline. "The model changed" shouldn't surprise production systems. Versions should be explicit, immutable, and controlled.

The Versioning Problem

def versioning_problems():
    """Catalogue the failure modes of unversioned model deployments.

    Returns:
        A dict keyed by failure-mode name ("silent_updates", "no_rollback",
        "drift"). Each value is a dict with the keys "problem", "symptom"
        and "cause".
    """
    # Behavior shifts underneath production because the reference floats.
    silent_updates = dict(
        problem="Model updated, behavior changes, no warning",
        symptom="Production behavior differs from testing",
        cause="Using 'latest' in production",
    )

    # The previous artifact is gone, so there is nothing to revert to.
    no_rollback = dict(
        problem="Old version overwritten, can't go back",
        symptom="Stuck with regression",
        cause="Mutable model references",
    )

    # Environments disagree about which version they are running.
    drift = dict(
        problem="Different environments have different versions",
        symptom="Works in staging, fails in prod",
        cause="No version pinning",
    )

    return dict(
        silent_updates=silent_updates,
        no_rollback=no_rollback,
        drift=drift,
    )

The Version Schema

class ModelVersion:
    """
    Explicit, immutable model versions

    Instances are write-once: the four fields are set by ``__init__`` and
    any later attempt to rebind or delete an attribute raises
    ``AttributeError``. This keeps ``full_id`` stable for the lifetime of
    the object, which is what makes it usable as a pinned reference.
    """

    def __init__(
        self,
        model_id: str,
        version: str,
        created_at: str,
        config_hash: str,
    ):
        """Create a version record.

        Args:
            model_id: Model name, e.g. "llama-70b-chat".
            version: Version label, e.g. "v2.1.3" or date-based "20250809".
            created_at: Creation timestamp, stored as given.
            config_hash: Hash of all config for reproducibility.
        """
        # __setattr__ is blocked below, so initialise through object's
        # implementation; this is the only place fields are ever written.
        object.__setattr__(self, "model_id", model_id)
        object.__setattr__(self, "version", version)
        object.__setattr__(self, "created_at", created_at)
        object.__setattr__(self, "config_hash", config_hash)

    def __setattr__(self, name, value):
        """Reject attribute assignment after construction."""
        raise AttributeError(
            f"{type(self).__name__} is immutable; cannot set {name!r}"
        )

    def __delattr__(self, name):
        """Reject attribute deletion after construction."""
        raise AttributeError(
            f"{type(self).__name__} is immutable; cannot delete {name!r}"
        )

    @property
    def full_id(self) -> str:
        """Globally unique, immutable identifier"""
        return f"{self.model_id}:{self.version}"

    def __repr__(self) -> str:
        """Return a debugging representation listing all four fields."""
        return (
            f"{type(self).__name__}("
            f"model_id={self.model_id!r}, "
            f"version={self.version!r}, "
            f"created_at={self.created_at!r}, "
            f"config_hash={self.config_hash!r})"
        )

    # Example versions:
    # llama-70b-chat:v2.1.3
    # llama-70b-chat:20250809-quantized
    # internal-tuned-v1:exp-42-final

Never Use "Latest" in Production

def latest_considered_harmful():
    """Contrast a floating "latest" reference with a pinned model version.

    Returns:
        A dict with three text snippets: "bad" (loading the "latest" tag in
        production and the problems listed for it), "good" (loading a
        pinned version and the benefits listed for it), and "exception"
        (where "latest" is acceptable).
    """
    # Anti-pattern: the tag floats, so the served model can change.
    floating_reference = """
        # Production code
        model = load_model("llama-70b-chat:latest")

        # Problems:
        # - Different behavior on different days
        # - Can't reproduce issues
        # - No audit trail
        # - Surprise regressions
        """

    # Preferred: the version is a named constant.
    pinned_reference = """
        # Production code
        MODEL_VERSION = "llama-70b-chat:v2.1.3"
        model = load_model(MODEL_VERSION)

        # Benefits:
        # - Reproducible behavior
        # - Clear rollback target
        # - Audit trail
        # - Controlled updates
        """

    # The narrow set of contexts where "latest" is tolerated.
    acceptable_uses = """
        # "latest" is OK for:
        # - Local development (convenience)
        # - Experiments (intentionally want newest)
        # Never in production configs
        """

    snippets = {}
    snippets["bad"] = floating_reference
    snippets["good"] = pinned_reference
    snippets["exception"] = acceptable_uses
    return snippets

Version Lifecycle

def version_lifecycle():
    """Describe the stages a model version passes through.

    Returns:
        A dict with two entries. "stages" maps each stage name, in lifecycle
        order, to a dict with "status", "usage" and "example" keys.
        "transitions" is a text block listing what moves a version from one
        stage to the next.
    """
    # One row per stage: (name, status, usage, example version id).
    stage_table = (
        (
            "development",
            "Active development, unstable",
            "Internal testing only",
            "llama-70b-chat:dev-abc123",
        ),
        (
            "staging",
            "Feature complete, testing",
            "Staging environment, internal users",
            "llama-70b-chat:rc-v2.2.0",
        ),
        (
            "production",
            "Released, stable",
            "Production traffic",
            "llama-70b-chat:v2.1.3",
        ),
        (
            "deprecated",
            "Superseded, migration encouraged",
            "Existing clients, no new adoption",
            "llama-70b-chat:v1.9.0",
        ),
        (
            "retired",
            "End of life",
            "None, will return error",
            "llama-70b-chat:v1.0.0",
        ),
    )

    stages = {
        stage: {"status": status, "usage": usage, "example": example}
        for stage, status, usage, example in stage_table
    }

    transition_rules = """
        dev -> staging: Feature complete
        staging -> production: QA passed, canary complete
        production -> deprecated: New version released
        deprecated -> retired: After deprecation period (30-90 days)
        """

    return {"stages": stages, "transitions": transition_rules}

Zero-Downtime Updates

class ZeroDowntimeUpdate:
    """
    Three ways to swap model versions without interrupting service.

    Each method returns a text description of one strategy; none of them
    performs a deployment.
    """

    def rolling_update(self):
        """Return the step list for a replica-by-replica rolling update."""
        steps = """
        1. Deploy new version to subset of replicas
        2. Health check passes
        3. Shift traffic to new replicas
        4. Repeat until all replicas updated
        5. Remove old replicas

        # Kubernetes handles this automatically:
        # kubectl set image deployment/llm-serving model=model:v2.1.3
        """
        return steps

    def blue_green(self):
        """Return the step list for a blue/green environment switch."""
        steps = """
        1. Deploy entirely new "green" environment
        2. Test green environment
        3. Switch traffic from "blue" to "green"
        4. Keep blue as rollback target
        5. Decommission blue after confidence period

        # Instant rollback: switch traffic back to blue
        """
        return steps

    def model_hot_reload(self):
        """Return an example snippet for in-process model hot reload."""
        # The snippet is illustrative text only; it is never executed here.
        snippet = """
        # If supported by serving framework
        async def hot_reload_model(new_version: str):
            # Load new model while old continues serving
            new_model = await load_model(new_version)

            # Atomic swap
            with self.model_lock:
                old_model = self.model
                self.model = new_model

            # Cleanup old model after drain
            await asyncio.sleep(60)  # Wait for in-flight requests
            await old_model.unload()
        """
        return snippet

Version Pinning in Configuration

def version_pinning():
    """Show two places a model version can be pinned.

    Returns:
        A dict with two text snippets: "environment_config" (a YAML config
        with pinned versions and the update procedure) and "code_constants"
        (a constants class holding pinned version strings).
    """
    # Option 1: pin in per-environment config; updates are config deploys.
    yaml_example = """
        # config/production.yaml
        models:
          chat:
            model_id: llama-70b-chat
            version: v2.1.3  # PINNED, not "latest"
          embedding:
            model_id: bge-large
            version: v1.5.0

        # Updates happen by:
        # 1. Change version in config
        # 2. Deploy config change
        # 3. Automated rollout with canary
        """

    # Option 2: pin in code; updates are code deploys.
    constants_example = """
        # constants.py
        class ModelVersions:
            CHAT_MODEL = "llama-70b-chat:v2.1.3"
            EMBED_MODEL = "bge-large:v1.5.0"

            # Version changes require code deploy
            # This is intentional: version bumps should be deliberate
        """

    return dict(
        environment_config=yaml_example,
        code_constants=constants_example,
    )

Version Tracking

class VersionRegistry:
    """
    Track what versions exist and their status

    All persistence is delegated to ``self.db``, which must provide
    ``insert(row: dict)`` and ``query(sql: str, params: list)``.
    """

    def __init__(self, db=None):
        """Bind the registry to its storage backend.

        Args:
            db: Object providing ``insert(dict)`` and ``query(sql, params)``.
                Defaults to ``None`` so that code which constructs the
                registry with no arguments and assigns ``.db`` afterwards
                keeps working.
        """
        # Every method below reads self.db, so it must exist on the instance.
        self.db = db

    def register_version(self, version: "ModelVersion", status: str):
        """Register a new model version

        Args:
            version: Version whose ``full_id``, ``model_id``, ``version``,
                ``created_at`` and ``config_hash`` are copied into the row.
            status: Status string stored with the row; it is not validated
                here.
        """
        self.db.insert({
            "full_id": version.full_id,
            "model_id": version.model_id,
            "version": version.version,
            "status": status,
            "created_at": version.created_at,
            "config_hash": version.config_hash,
        })

    def get_production_version(self, model_id: str) -> str:
        """Get current production version for a model

        Queries for the most recently created row with status 'production'
        for ``model_id`` and returns the backend's result unchanged.
        """
        # NOTE(review): this returns whatever self.db.query returns; confirm
        # the backend yields a bare full_id string, as the annotation implies.
        return self.db.query(
            "SELECT full_id FROM versions "
            "WHERE model_id = ? AND status = 'production' "
            "ORDER BY created_at DESC LIMIT 1",
            [model_id]
        )

    def list_versions(self, model_id: str) -> list:
        """List all versions of a model

        Rows are requested newest first (``ORDER BY created_at DESC``); the
        backend's result is returned unchanged.
        """
        return self.db.query(
            "SELECT * FROM versions WHERE model_id = ? ORDER BY created_at DESC",
            [model_id]
        )

Audit Trail

def audit_requirements():
    """Summarise what a model-version audit trail should capture and why.

    Returns:
        A dict with "what_to_log" (list of events to record),
        "why_it_matters" (list of reasons) and "implementation" (an example
        logging call as text).
    """
    events_to_record = [
        "Model version serving each request",
        "Version changes (who, when, why)",
        "Rollbacks (trigger, time, outcome)",
        "Deprecations and retirements",
    ]

    motivations = [
        "Debug issues: 'What version was serving at 2pm?'",
        "Compliance: Prove what model made a decision",
        "Rollback: Know what 'previous' version was",
        "Learning: Track which versions had issues",
    ]

    # Example only: shows the version being attached to a request log entry.
    logging_example = """
        # Log version with each request
        logger.info("Request handled",
            request_id=req.id,
            model_version="llama-70b-chat:v2.1.3",
            latency_ms=latency,
        )
        """

    requirements = {}
    requirements["what_to_log"] = events_to_record
    requirements["why_it_matters"] = motivations
    requirements["implementation"] = logging_example
    return requirements

Versioning feels like overhead until you need to roll back at 2am. Then it's the difference between "deploy previous version" and "figure out what previous means while the site is down."