How to Catch Quality Regressions

Boiling a frog slowly works because the frog doesn't notice gradual temperature changes. Quality regressions work the same way. A 1% drop this week, 2% next week, and suddenly you're 20% worse than six months ago. Nobody noticed because each change was small.

Regression detection is the thermometer in the pot. It measures quality continuously, compares to baselines, and alerts when things drift. The goal is catching the 1% drop, not discovering the 20% decline during a quarterly review.

The Regression Detection Pipeline

class RegressionDetector:
    """
    Catch quality drops before users do
    """

    def __init__(self, config):
        self.baseline_window = config.get("baseline_days", 7)
        self.detection_threshold = config.get("threshold", 0.02)  # 2%
        self.alert_on = config.get("alert_dimensions", ["quality", "safety", "relevance"])

    def detect(self, current_metrics: dict) -> list:
        # get_baseline() is assumed to be wired to your metrics store and
        # to return per-dimension mean scores over the trailing window
        baseline = self.get_baseline()
        regressions = []

        for dimension in self.alert_on:
            current = current_metrics.get(dimension)
            baseline_val = baseline.get(dimension)

            if current is None or baseline_val is None:
                continue

            drop = baseline_val - current
            if drop > self.detection_threshold:
                regressions.append({
                    "dimension": dimension,
                    "current": current,
                    "baseline": baseline_val,
                    "drop": drop,
                    "drop_percent": f"{(drop/baseline_val)*100:.1f}%",
                    "severity": self.classify_severity(drop),
                })

        return regressions

    def classify_severity(self, drop: float) -> str:
        if drop > 0.10:
            return "critical"
        elif drop > 0.05:
            return "high"
        elif drop > 0.02:
            return "medium"
        return "low"

Baseline Management

import statistics
from datetime import datetime


class BaselineManager:
    """
    Manage what 'normal' looks like
    """

    def __init__(self, detector: RegressionDetector):
        # The detector is reused in should_update_baseline() so regressed
        # periods never get folded into the baseline
        self.detector = detector
        self.baselines = {}

    def compute_baseline(self, dimension: str, window_days: int = 7) -> dict:
        """Calculate baseline from recent history"""
        # get_history() is assumed to return the scores recorded for this
        # dimension over the trailing window
        history = self.get_history(dimension, window_days)

        return {
            "mean": statistics.mean(history),
            "std": statistics.stdev(history) if len(history) > 1 else 0,
            "p5": self.percentile(history, 5),
            "p95": self.percentile(history, 95),
            "computed_at": datetime.now(),
            "sample_size": len(history),
        }

    def update_baseline(self, dimension: str, new_value: float):
        """Update baseline with new observation"""
        # Don't update during known incidents
        if self.is_incident_mode():
            return

        # Exponential moving average for smooth updates
        alpha = 0.1  # How fast the baseline adapts to new observations
        current = self.baselines.get(dimension, {}).get("mean", new_value)
        self.baselines.setdefault(dimension, {})["mean"] = alpha * new_value + (1 - alpha) * current

    def should_update_baseline(self, metrics: dict) -> bool:
        """
        Only update baseline when quality is stable
        """
        regressions = self.detector.detect(metrics)

        # Don't update during regression
        if len(regressions) > 0:
            return False

        # Don't update during anomalies
        if self.is_anomalous(metrics):
            return False

        return True
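
The class leans on a percentile() helper that isn't shown above. A minimal standard-library version, assuming a non-empty list of floats and linear interpolation between ranks, could live as a module-level function or be attached to the class:

import math


def percentile(values: list, pct: float) -> float:
    """Linear-interpolated percentile of a non-empty list of floats."""
    ordered = sorted(values)
    k = (len(ordered) - 1) * (pct / 100)
    lower = math.floor(k)
    upper = min(lower + 1, len(ordered) - 1)
    return ordered[lower] + (ordered[upper] - ordered[lower]) * (k - lower)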

Statistical Detection Methods

def detection_methods():
    return {
        "threshold_based": {
            "method": "Alert if score < baseline - threshold",
            "pros": ["Simple", "Interpretable"],
            "cons": ["Requires tuning threshold", "Fixed sensitivity"],
            "example": """
                if current_score < baseline_mean - 0.02:
                    alert("Quality dropped below threshold")
            """,
        },

        "standard_deviation": {
            "method": "Alert if score < baseline - N*std",
            "pros": ["Adapts to natural variance"],
            "cons": ["May miss slow drifts"],
            "example": """
                z_score = (current - baseline_mean) / baseline_std
                if z_score < -2:
                    alert(f"Score {z_score:.1f} std below baseline")
            """,
        },

        "change_point_detection": {
            "method": "Detect when distribution shifts",
            "pros": ["Catches gradual drifts", "Statistical rigor"],
            "cons": ["More complex", "Requires more data"],
            "example": """
                from ruptures import Pelt

                algo = Pelt(model="rbf").fit(scores)
                change_points = algo.predict(pen=10)

                # predict() always includes the end of the signal as the
                # final breakpoint, so a real shift means more than one entry
                if len(change_points) > 1:
                    alert(f"Distribution shift detected at {change_points}")
            """,
        },

        "cusum": {
            "method": "Cumulative sum of deviations from target",
            "pros": ["Detects small persistent shifts"],
            "cons": ["Requires threshold tuning"],
            "use_case": "When small drifts matter",
        },
    }
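
CUSUM gets a use case but no example above, so here is a minimal one-sided (downward) sketch. The slack k and decision threshold h are illustrative and need tuning per metric.

# Minimal one-sided CUSUM for downward drift. The slack (k) and decision
# threshold (h) are illustrative, not recommendations.
def cusum_downward(scores, target: float, k: float = 0.005, h: float = 0.03):
    """Return the index where cumulative downward drift exceeds h, else None."""
    s = 0.0
    for i, score in enumerate(scores):
        # Accumulate only shortfalls below target that exceed the slack k
        s = max(0.0, s + (target - score) - k)
        if s > h:
            return i
    return None


# A slow slide from 0.85 to 0.80 trips the alarm partway through
drift = [0.85, 0.85, 0.84, 0.84, 0.83, 0.82, 0.82, 0.81, 0.80]
print(cusum_downward(drift, target=0.85))  # -> 5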

Pre-Deployment Regression Testing

class PreDeployRegressionTest:
    """
    Catch regressions before they hit production
    """

    def __init__(self, test_suite_path: str):
        self.test_cases = self.load_test_suite(test_suite_path)
        self.baseline_results = self.load_baseline_results()

    def run_regression_suite(self, new_model) -> dict:
        """Run full regression suite against new model"""
        results = {
            "passed": True,
            "tests_run": len(self.test_cases),
            "regressions": [],
            "improvements": [],
        }

        for test in self.test_cases:
            baseline_score = self.baseline_results.get(test.id)
            if baseline_score is None:
                continue  # New test case with no recorded baseline yet
            new_score = self.evaluate(new_model, test)

            if new_score < baseline_score - test.allowed_regression:
                results["regressions"].append({
                    "test_id": test.id,
                    "baseline": baseline_score,
                    "new": new_score,
                    "category": test.category,
                })
                results["passed"] = False
            elif new_score > baseline_score + 0.05:
                results["improvements"].append({
                    "test_id": test.id,
                    "baseline": baseline_score,
                    "new": new_score,
                })

        return results

    def gate_deployment(self, results: dict) -> dict:
        """Make go/no-go decision"""
        if not results["passed"]:
            return {
                "decision": "BLOCKED",
                "reason": f"{len(results['regressions'])} regression(s) detected",
                "regressions": results["regressions"],
                "action": "Fix regressions before deploying",
            }
        return {"decision": "APPROVED"}

Production Monitoring

import asyncio


class ProductionRegressionMonitor:
    """
    Continuous monitoring in production
    """

    def __init__(self, detector: RegressionDetector):
        self.detector = detector
        self.alert_channels = ["pagerduty", "slack"]
        self.check_interval_minutes = 5

    async def monitor_loop(self):
        """Run continuous monitoring"""
        while True:
            metrics = await self.collect_current_metrics()
            regressions = self.detector.detect(metrics)

            if regressions:
                await self.handle_regressions(regressions)

            await asyncio.sleep(self.check_interval_minutes * 60)

    async def handle_regressions(self, regressions: list):
        """Respond to detected regressions"""
        for regression in regressions:
            if regression["severity"] == "critical":
                await self.page_oncall(regression)
                await self.consider_auto_rollback(regression)
            elif regression["severity"] == "high":
                await self.alert_slack(regression)
                await self.increase_monitoring(regression)
            else:
                await self.log_for_review(regression)

    async def consider_auto_rollback(self, regression: dict):
        """Auto-rollback if regression is severe enough"""
        if regression["drop"] > 0.15:  # >15% drop
            if self.recent_deployment():
                await self.trigger_rollback()
                await self.notify("Auto-rollback triggered due to quality regression")
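
A rough entry point for the monitor, assuming the detector's get_baseline() and the monitor's collect/alert helpers are wired to your own metrics and alerting stack:

# Hypothetical entry point: run the monitor as a long-lived asyncio task.
async def main():
    detector = RegressionDetector({"baseline_days": 7, "threshold": 0.02})
    monitor = ProductionRegressionMonitor(detector)
    await monitor.monitor_loop()


if __name__ == "__main__":
    asyncio.run(main())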

Debugging Regressions

def debugging_regressions():
    return {
        "step_1_identify_scope": {
            "questions": [
                "Which dimensions regressed?",
                "When did it start?",
                "What percentage of traffic affected?",
            ],
            "tools": "Dashboard filters, time-series analysis",
        },

        "step_2_isolate_cause": {
            "questions": [
                "What changed? (model, config, traffic)",
                "Is regression uniform or concentrated?",
                "Which user segments affected?",
            ],
            "tools": "Deployment logs, A/B comparison",
        },

        "step_3_examine_examples": {
            "actions": [
                "Pull sample of regressed responses",
                "Compare to baseline responses",
                "Identify patterns in failures",
            ],
            "tools": "Response sampling, diff analysis",
        },

        "step_4_root_cause": {
            "common_causes": [
                "Model update introduced behavior change",
                "Prompt modification had side effects",
                "Traffic mix shifted to harder queries",
                "Dependency (API, model) changed behavior",
            ],
        },

        "step_5_remediate": {
            "options": [
                "Rollback to previous version",
                "Hotfix specific issue",
                "Adjust prompts/configuration",
                "Accept regression with justification",
            ],
        },
    }
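
For step 3, here is a rough sketch of pulling matched samples and diffing them, assuming the logged rows are dicts carrying prompt, response, and score fields:

import difflib


def compare_samples(before: list, after: list, limit: int = 20):
    """Print unified diffs for prompts whose score dropped between windows."""
    after_by_prompt = {row["prompt"]: row for row in after}
    shown = 0
    for row in before:
        match = after_by_prompt.get(row["prompt"])
        if match is None or match["score"] >= row["score"]:
            continue  # Only inspect prompts that actually got worse
        diff = difflib.unified_diff(
            row["response"].splitlines(),
            match["response"].splitlines(),
            lineterm="",
        )
        print(f"--- {row['prompt'][:60]} ({row['score']:.2f} -> {match['score']:.2f})")
        print("\n".join(diff))
        shown += 1
        if shown >= limit:
            break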

Preventing False Alarms

def reduce_false_alarms():
    return {
        "require_persistence": {
            "description": "Alert only if regression persists across multiple checks",
            "implementation": """
                consecutive_failures = 0
                for check in checks:
                    if check.is_regression:
                        consecutive_failures += 1
                    else:
                        consecutive_failures = 0

                    if consecutive_failures >= 3:
                        alert("Persistent regression detected")
            """,
        },

        "segment_analysis": {
            "description": "Check if regression is real or due to traffic mix",
            "implementation": """
                overall_regression = detect(all_traffic)
                segment_regressions = [detect(segment) for segment in segments]

                if overall_regression and not any(segment_regressions):
                    # Traffic mix change, not real regression
                    log("Traffic mix shift, not quality regression")
            """,
        },

        "volume_gating": {
            "description": "Require minimum sample size before alerting",
            "implementation": """
                if sample_size < 100:
                    log("Sample too small, deferring alert")
                    return
            """,
        },
    }
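
These guards compose naturally. Below is a hedged sketch of a single gate that applies volume gating and persistence before anything pages anyone; the thresholds are examples, not recommendations.

from collections import deque


class AlertGate:
    """Combine volume gating and persistence before raising an alert."""

    def __init__(self, min_samples: int = 100, required_consecutive: int = 3):
        self.min_samples = min_samples
        self.required_consecutive = required_consecutive
        self.recent = deque(maxlen=required_consecutive)

    def should_alert(self, is_regression: bool, sample_size: int) -> bool:
        if sample_size < self.min_samples:
            return False  # Volume gating: too little data to trust this check
        self.recent.append(is_regression)
        # Persistence: every one of the last N valid checks must have regressed
        return (len(self.recent) == self.required_consecutive
                and all(self.recent))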

Regression detection is about making quality visible. Without it, quality drifts silently until users complain or quarterly reviews reveal the damage. With it, you catch problems early, investigate quickly, and maintain the quality bar that users expect.