Testing Quality After Quantization

Pharmaceutical companies don't ship drugs based on lab results alone. Clinical trials test on real patients, in real conditions, measuring outcomes that matter. Lab metrics are necessary but not sufficient.

Quantization testing works similarly. Perplexity is your lab result. User-facing quality is your clinical outcome. You need both, and the second one is harder.

The Testing Pyramid

def testing_pyramid():
    return {
        "layer_1_sanity": {
            "tests": ["Model loads", "Inference runs", "Output is valid text"],
            "when": "Every build",
            "time": "Seconds",
            "catches": "Complete breakage",
        },
        "layer_2_metrics": {
            "tests": ["Perplexity", "Standard benchmarks", "Latency"],
            "when": "Every candidate model",
            "time": "Minutes",
            "catches": "Severe quality degradation",
        },
        "layer_3_task_specific": {
            "tests": ["Your actual use cases", "Domain-specific evals"],
            "when": "Before deployment decision",
            "time": "Minutes to hours",
            "catches": "Task-specific regressions",
        },
        "layer_4_human": {
            "tests": ["Side-by-side preference", "Expert review"],
            "when": "Final validation",
            "time": "Days",
            "catches": "Subtle quality issues",
        },
    }
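
A layer-1 sanity check is cheap enough to run on every build. A minimal sketch using Hugging Face transformers; the model path and probe prompt here are placeholders, not a prescription:

from transformers import AutoModelForCausalLM, AutoTokenizer

def sanity_check(model_path: str) -> bool:
    """Layer 1: does the model load, run, and produce non-empty, decodable text?"""
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path)

    inputs = tokenizer("The capital of France is", return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=20)
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Complete breakage shows up here: load errors, crashes, or empty output
    return len(text.strip()) > 0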

Automated Regression Detection

class QuantizationRegressionTest:
    def __init__(self, baseline_model, quantized_model, test_prompts):
        self.baseline = baseline_model
        self.quantized = quantized_model
        self.test_prompts = test_prompts
        self.threshold = 0.02  # 2% perplexity degradation threshold
        # measure_perplexity, generate, and compute_similarity are thin wrappers
        # around your inference stack and similarity metric of choice (see below)

    def run_regression_suite(self) -> dict:
        results = {
            "passed": True,
            "failures": [],
            "warnings": [],
        }

        # Test 1: Perplexity
        ppl_baseline = self.measure_perplexity(self.baseline)
        ppl_quantized = self.measure_perplexity(self.quantized)
        ppl_increase = (ppl_quantized - ppl_baseline) / ppl_baseline

        if ppl_increase > self.threshold:
            results["passed"] = False
            results["failures"].append(f"Perplexity increased {ppl_increase:.1%}")

        # Test 2: Output similarity
        similarities = []
        for prompt in self.test_prompts:
            baseline_out = self.generate(self.baseline, prompt)
            quantized_out = self.generate(self.quantized, prompt)
            sim = self.compute_similarity(baseline_out, quantized_out)
            similarities.append(sim)

            if sim < 0.9:
                results["warnings"].append(f"Low similarity on: {prompt[:50]}...")

        avg_similarity = sum(similarities) / len(similarities)
        if avg_similarity < 0.95:
            results["passed"] = False
            results["failures"].append(f"Average similarity {avg_similarity:.2f} < 0.95")

        return results
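
The helpers (measure_perplexity, generate, compute_similarity) are deliberately left abstract; they wrap whatever inference stack you use. As one example, a perplexity helper sketched with transformers and PyTorch, assuming a causal LM and a small held-out text list:

import math
import torch

def measure_perplexity(model, tokenizer, texts) -> float:
    """Token-weighted average perplexity over a held-out set."""
    model.eval()
    total_nll, total_tokens = 0.0, 0
    with torch.no_grad():
        for text in texts:
            enc = tokenizer(text, return_tensors="pt")
            out = model(**enc, labels=enc["input_ids"])
            n_predicted = enc["input_ids"].size(1) - 1  # labels are shifted inside the model
            total_nll += out.loss.item() * n_predicted
            total_tokens += n_predicted
    return math.exp(total_nll / total_tokens)

Wiring it up then looks something like:

suite = QuantizationRegressionTest(baseline_model, quantized_model, test_prompts)
report = suite.run_regression_suite()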

Test Prompt Categories

def test_prompt_categories():
    """
    Different prompts catch different failures
    """
    return {
        "standard": {
            "purpose": "Baseline quality check",
            "examples": ["Summarize this article...", "Explain X to a child..."],
            "what_it_catches": "General quality issues",
        },
        "edge_length": {
            "purpose": "Context length extremes",
            "examples": ["Very short prompts", "Maximum context prompts"],
            "what_it_catches": "Length-dependent failures",
        },
        "numeric": {
            "purpose": "Math and number handling",
            "examples": ["What is 347 * 89?", "Convert 3.7 miles to km"],
            "what_it_catches": "Precision issues in quantized models",
        },
        "structured_output": {
            "purpose": "JSON, code, specific formats",
            "examples": ["Return a JSON object with...", "Write a function that..."],
            "what_it_catches": "Format compliance regressions",
        },
        "rare_tokens": {
            "purpose": "Unusual vocabulary",
            "examples": ["Prompts with technical jargon", "Non-English words"],
            "what_it_catches": "Embedding quantization issues",
        },
        "adversarial": {
            "purpose": "Stress test edge cases",
            "examples": ["Prompts that previously failed", "Known difficult inputs"],
            "what_it_catches": "Regressions on tricky cases",
        },
    }
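
One way to feed these categories into the regression suite above is to tag each prompt with its category, so a failing similarity check can be traced back to, say, numeric or structured-output prompts. A sketch; in practice you would replace the example descriptions with real prompts from your domain:

def build_test_suite() -> list:
    """Flatten the categories into tagged prompts for the regression suite."""
    suite = []
    for category, spec in test_prompt_categories().items():
        for prompt in spec["examples"]:
            suite.append({"category": category, "prompt": prompt})
    return suite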

CI/CD Integration

def quantization_ci_pipeline():
    """
    Automated testing on every model change
    """
    return {
        "trigger": "New quantized model artifact",
        "stages": [
            {
                "name": "sanity",
                "tests": ["load_model", "generate_single", "format_valid"],
                "timeout": "1 minute",
                "on_fail": "Block",
            },
            {
                "name": "benchmarks",
                "tests": ["perplexity", "mmlu_sample", "latency"],
                "timeout": "10 minutes",
                "on_fail": "Block if > 5% regression",
            },
            {
                "name": "regression",
                "tests": ["output_similarity", "task_evals"],
                "timeout": "30 minutes",
                "on_fail": "Block if > 2% regression",
            },
            {
                "name": "extended",
                "tests": ["full_benchmark_suite", "edge_cases"],
                "timeout": "2 hours",
                "on_fail": "Warn, manual review required",
            },
        ],
        "artifacts": {
            "metrics_json": "All test results",
            "comparison_report": "Baseline vs quantized",
            "failure_samples": "Examples where quality dropped",
        },
    }
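
The driver for this config can stay small. A sketch, assuming a test_registry dict that maps each test name to a callable returning (passed, detail); the registry, and any per-test regression thresholds like "> 5%", live inside those callables:

def run_pipeline(stages: list, test_registry: dict) -> dict:
    """Run stages in order; stop at the first blocking failure, otherwise collect warnings."""
    warnings = []
    for stage in stages:
        failures = []
        for test_name in stage["tests"]:
            passed, detail = test_registry[test_name]()
            if not passed:
                failures.append(f"{stage['name']}/{test_name}: {detail}")
        if failures:
            if stage["on_fail"].startswith("Block"):
                return {"status": "blocked", "stage": stage["name"],
                        "failures": failures, "warnings": warnings}
            warnings.extend(failures)
    return {"status": "passed", "warnings": warnings}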

Similarity Metrics

class SimilarityMetrics:
    """
    Different ways to measure output similarity
    """

    def token_overlap(self, ref: str, hyp: str) -> float:
        """Simple but fast"""
        ref_tokens = set(ref.split())
        hyp_tokens = set(hyp.split())
        intersection = ref_tokens & hyp_tokens
        union = ref_tokens | hyp_tokens
        return len(intersection) / len(union) if union else 0

    def rouge_l(self, ref: str, hyp: str) -> float:
        """ROUGE-L F1: longest common subsequence over whitespace tokens"""
        ref_tokens, hyp_tokens = ref.split(), hyp.split()
        if not ref_tokens or not hyp_tokens:
            return 0.0
        # Dynamic-programming LCS length
        dp = [[0] * (len(hyp_tokens) + 1) for _ in range(len(ref_tokens) + 1)]
        for i, r in enumerate(ref_tokens, 1):
            for j, h in enumerate(hyp_tokens, 1):
                dp[i][j] = dp[i - 1][j - 1] + 1 if r == h else max(dp[i - 1][j], dp[i][j - 1])
        lcs = dp[-1][-1]
        precision, recall = lcs / len(hyp_tokens), lcs / len(ref_tokens)
        return 2 * precision * recall / (precision + recall) if precision + recall else 0.0

    def bertscore(self, ref: str, hyp: str) -> float:
        """Semantic similarity"""
        # Requires an embedding model, more expensive but captures meaning;
        # one option is the bert-score package
        from bert_score import score
        _, _, f1 = score([hyp], [ref], lang="en", verbose=False)
        return f1.item()

    def llm_judge(self, prompt: str, ref: str, hyp: str) -> float:
        """Use another LLM to judge similarity"""
        judge_prompt = f"""
        Rate how similar these two responses are to the prompt.
        Score 1-5 where 5 is identical meaning.

        Prompt: {prompt}
        Response A: {ref}
        Response B: {hyp}
        """
        # Useful but expensive and potentially circular; send judge_prompt to your
        # judge model and parse the 1-5 score
        raise NotImplementedError

    def recommendation(self):
        return """
        Use token_overlap for fast CI checks (< 0.9 is suspicious).
        Use BERTScore for quality gates (< 0.95 needs investigation).
        Use LLM-judge for final validation (if resources allow).
        """

Monitoring After Deployment

def production_quality_monitoring():
    return {
        "metrics_to_track": [
            {
                "metric": "User feedback rate",
                "baseline": "Measure before deployment",
                "alert": "20% increase in negative feedback",
            },
            {
                "metric": "Regeneration rate",
                "baseline": "How often users click 'regenerate'",
                "alert": "Significant increase",
            },
            {
                "metric": "Task completion rate",
                "baseline": "If measurable for your use case",
                "alert": "Drop in successful completions",
            },
            {
                "metric": "Output length distribution",
                "baseline": "Average response length",
                "alert": "Significant shift in distribution",
            },
        ],
        "shadow_testing": {
            "description": "Run both models, compare outputs",
            "implementation": "Log quantized output, compare offline",
            "catches": "Issues that only appear at scale",
        },
    }
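
Shadow testing can start as a logging wrapper: serve the incumbent model, generate the quantized output on the side, and diff offline. A sketch, with primary_generate and shadow_generate standing in for your two inference paths; in a real deployment the shadow call would run asynchronously:

import json
import time

def shadow_log(prompt: str, primary_generate, shadow_generate,
               log_path: str = "shadow_log.jsonl") -> str:
    """Serve the primary model; log both outputs for offline comparison."""
    primary_out = primary_generate(prompt)
    shadow_out = shadow_generate(prompt)  # run this off the request path in production
    with open(log_path, "a") as f:
        f.write(json.dumps({
            "ts": time.time(),
            "prompt": prompt,
            "primary": primary_out,
            "shadow": shadow_out,
        }) + "\n")
    return primary_out  # users only ever see the primary output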

The Testing Checklist

def quantization_test_checklist():
    return [
        "[ ] Perplexity within threshold",
        "[ ] Standard benchmarks within threshold",
        "[ ] Task-specific evals pass",
        "[ ] Output similarity > 95% on test set",
        "[ ] Edge cases handled correctly",
        "[ ] Numeric accuracy acceptable",
        "[ ] Code generation still produces valid code",
        "[ ] No new failure modes on adversarial prompts",
        "[ ] Human review of sample outputs passes",
        "[ ] Rollback plan documented and tested",
    ]

Testing quantization isn't about proving the model still works. It's about proving it works well enough for your users, on your tasks, with your quality bar. Generic benchmarks are a floor, not a ceiling.