Testing Fine-tuned Model Quality
Medical school exams test book knowledge. Board certifications test clinical scenarios. Residencies test real patient care. Each level reveals different competencies. Passing one doesn't guarantee passing another.
Evaluating fine-tuned models requires the same layered approach. Training loss shows the model learned something. Generic benchmarks show it didn't catastrophically forget its general capabilities. Domain-specific evals show it works for your task. A/B tests show it works in production. Each layer matters.
The Evaluation Hierarchy
def evaluation_hierarchy():
    return {
        "level_1_training_metrics": {
            "what": "Training and validation loss",
            "when": "During training",
            "tells_you": "Model is learning from data",
            "doesnt_tell_you": "If it learned the right things",
        },
        "level_2_capability_retention": {
            "what": "Generic benchmarks (MMLU, HumanEval, etc.)",
            "when": "After training",
            "tells_you": "Model didn't forget general capabilities",
            "doesnt_tell_you": "If it's good at your specific task",
        },
        "level_3_task_specific": {
            "what": "Evals designed for your use case",
            "when": "After training, before deploy",
            "tells_you": "Model handles your task well",
            "doesnt_tell_you": "How real users will experience it",
        },
        "level_4_production": {
            "what": "A/B tests, user feedback, quality monitoring",
            "when": "After deployment",
            "tells_you": "Actual production quality",
            "gold_standard": True,
        },
    }
Building Task-Specific Evals
class TaskSpecificEval:
    """
    Build evaluation suite for your fine-tuning task
    """
    def __init__(self, task_config: dict):
        self.task_type = task_config["type"]
        self.success_criteria = task_config["criteria"]
        self.test_cases = self.load_test_cases(task_config["test_path"])

    def evaluate(self, model) -> dict:
        """Run full evaluation"""
        results = {
            "accuracy": self.measure_accuracy(model),
            "format_compliance": self.check_format(model),
            "edge_cases": self.test_edge_cases(model),
            "error_analysis": self.analyze_errors(model),
        }
        results["passed"] = all([
            results["accuracy"] >= self.success_criteria["min_accuracy"],
            results["format_compliance"] >= self.success_criteria["min_format"],
        ])
        return results

    def measure_accuracy(self, model) -> float:
        """Task-specific accuracy measurement"""
        correct = 0
        for test in self.test_cases["accuracy"]:
            response = model.generate(test["input"])
            if self.is_correct(response, test["expected"]):
                correct += 1
        return correct / len(self.test_cases["accuracy"])

    def check_format(self, model) -> float:
        """Check output format compliance"""
        compliant = 0
        for test in self.test_cases["format"]:
            response = model.generate(test["input"])
            if self.matches_format(response, test["expected_format"]):
                compliant += 1
        return compliant / len(self.test_cases["format"])

    def test_edge_cases(self, model) -> dict:
        """Test known edge cases"""
        results = {}
        for edge_case in self.test_cases["edge_cases"]:
            response = model.generate(edge_case["input"])
            results[edge_case["name"]] = {
                "passed": self.is_acceptable(response, edge_case),
                "response": response[:200],
            }
        return results
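Wiring this up looks roughly like the sketch below. It assumes you have filled in the task-specific hooks the class leaves open (load_test_cases, is_correct, matches_format, is_acceptable, analyze_errors); the config keys, the summarization task, the test file path, and the fine_tuned_model object with a .generate(str) -> str method are all illustrative, not a fixed API.

# Hypothetical usage sketch; config shape and model wrapper are assumptions.
task_config = {
    "type": "summarization",
    "criteria": {"min_accuracy": 0.85, "min_format": 0.95},
    "test_path": "evals/summarization_test_cases.json",  # illustrative path
}

evaluator = TaskSpecificEval(task_config)
# fine_tuned_model: any object exposing .generate(prompt: str) -> str
results = evaluator.evaluate(fine_tuned_model)

if not results["passed"]:
    # Inspect the headline numbers before digging into error_analysis
    print("Failed:", results["accuracy"], results["format_compliance"])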
Regression Testing
class RegressionSuite:
    """
    Ensure fine-tuning didn't break existing capabilities
    """
    def __init__(self, base_model, test_sets: dict):
        self.base_model = base_model
        self.test_sets = test_sets
        self.baseline_scores = self.compute_baseline()

    def compute_baseline(self) -> dict:
        """Compute baseline scores from base model"""
        scores = {}
        for name, test_set in self.test_sets.items():
            scores[name] = self.evaluate_capability(self.base_model, test_set)
        return scores

    def check_regression(self, fine_tuned_model, max_regression: float = 0.05) -> dict:
        """Check if fine-tuned model regressed"""
        results = {}
        for name, test_set in self.test_sets.items():
            ft_score = self.evaluate_capability(fine_tuned_model, test_set)
            baseline = self.baseline_scores[name]
            regression = baseline - ft_score
            results[name] = {
                "baseline": baseline,
                "fine_tuned": ft_score,
                "regression": regression,
                "passed": regression <= max_regression,
            }
        results["overall_passed"] = all(r["passed"] for r in results.values())
        return results

    def standard_test_sets(self) -> dict:
        """Standard capabilities to test for regression"""
        return {
            "general_knowledge": "Sample from MMLU or similar",
            "instruction_following": "Basic instruction tests",
            "code_generation": "Simple coding tasks",
            "math": "Basic arithmetic and word problems",
            "safety": "Safety refusals",
        }
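The suite leaves evaluate_capability undefined because scoring depends on the test set. A minimal sketch, written here as a standalone function (drop it into the class and add self to use it as a method), assuming each test set is a list of {"input", "expected"} pairs and exact string match is an acceptable proxy; open-ended capabilities would need a judge model or a proper benchmark harness instead.

# Minimal sketch; the {"input", "expected"} item shape is an assumption.
def evaluate_capability(model, test_set: list) -> float:
    """Fraction of items where the model's output exactly matches the expected answer."""
    if not test_set:
        return 0.0
    correct = 0
    for item in test_set:
        response = model.generate(item["input"]).strip()
        if response == item["expected"].strip():
            correct += 1
    return correct / len(test_set)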
Comparative Evaluation
class ComparativeEval:
    """
    Compare fine-tuned model to baseline
    """
    def __init__(self, judge_model):
        # Model used for LLM-as-judge comparisons
        self.judge_model = judge_model

    def head_to_head(
        self,
        base_model,
        fine_tuned_model,
        test_cases: list,
    ) -> dict:
        """Direct comparison on same inputs"""
        results = {
            "base_wins": 0,
            "fine_tuned_wins": 0,
            "ties": 0,
            "examples": [],
        }
        for test in test_cases:
            base_response = base_model.generate(test["input"])
            ft_response = fine_tuned_model.generate(test["input"])
            # Use LLM judge or human evaluation
            winner = self.judge(test["input"], base_response, ft_response)
            if winner == "base":
                results["base_wins"] += 1
            elif winner == "fine_tuned":
                results["fine_tuned_wins"] += 1
            else:
                results["ties"] += 1
            results["examples"].append({
                "input": test["input"][:100],
                "base": base_response[:200],
                "fine_tuned": ft_response[:200],
                "winner": winner,
            })
        results["win_rate"] = results["fine_tuned_wins"] / len(test_cases)
        return results

    def judge(self, input_text: str, response_a: str, response_b: str) -> str:
        """LLM-as-judge comparison"""
        prompt = f"""Compare these two responses to the input.
Input: {input_text}
Response A: {response_a}
Response B: {response_b}
Which response is better? Answer only 'A', 'B', or 'TIE'.
"""
        # Parse the verdict strictly: a bare substring check for "A" can match
        # letters inside the judge's explanation rather than its verdict.
        verdict = self.judge_model.generate(prompt).strip().upper()
        if verdict.startswith("A"):
            return "base"
        if verdict.startswith("B"):
            return "fine_tuned"
        return "tie"
Error Analysis
def error_analysis():
    """
    Understand where the model fails
    """
    return {
        "categorize_errors": {
            "format_errors": "Output doesn't match expected format",
            "factual_errors": "Wrong information",
            "instruction_errors": "Doesn't follow instructions",
            "edge_case_errors": "Fails on unusual inputs",
            "regression_errors": "Lost capability it had before",
        },
        "analysis_process": """
        1. Collect all failed test cases
        2. Categorize each failure type
        3. Look for patterns:
           - Specific input types that fail?
           - Specific output types that fail?
           - Correlated with training data gaps?
        4. Prioritize fixes:
           - High frequency errors first
           - High impact errors first
        """,
        "actionable_insights": {
            "format_errors_high": "Add more format examples to training",
            "edge_case_errors_high": "Add edge cases to training data",
            "regression_errors": "Mix in general data, reduce LR",
            "clustered_failures": "Missing coverage in training data",
        },
    }
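In code, the pattern-finding step is mostly bookkeeping. A small sketch, assuming each failure has already been labeled upstream with one of the categories above plus a free-form input-type tag (the field names are assumptions):

from collections import Counter

# Assumed failure record shape:
# {"category": "format_errors", "input_type": "long_document", "case_id": "..."}
def summarize_failures(failures: list[dict]) -> dict:
    """Count failures by category and input type to surface clusters."""
    by_category = Counter(f["category"] for f in failures)
    by_input_type = Counter(f.get("input_type", "unknown") for f in failures)
    return {
        "by_category": by_category.most_common(),
        "by_input_type": by_input_type.most_common(),
        # Fix the biggest bucket first: high frequency, then high impact.
        "top_priority": by_category.most_common(1),
    }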
A/B Testing Framework
import hashlib


class ABTestFramework:
    """
    Production A/B testing for fine-tuned models
    """
    def __init__(self, base_model, fine_tuned_model):
        self.models = {
            "control": base_model,
            "treatment": fine_tuned_model,
        }
        self.traffic_split = 0.5

    def route_request(self, request: dict) -> str:
        """Route request to model variant"""
        # Consistent assignment by user for clean comparison.
        # Use a stable hash: Python's built-in hash() is randomized per process,
        # so it would bucket the same user differently across servers and restarts.
        key = str(request.get("user_id") or request.get("session_id"))
        bucket = int(hashlib.sha256(key.encode()).hexdigest(), 16) % 100
        if bucket < self.traffic_split * 100:
            return "treatment"
        return "control"

    def analyze_results(self, metrics: dict) -> dict:
        """Analyze A/B test results"""
        control = metrics["control"]
        treatment = metrics["treatment"]
        return {
            "quality_lift": (treatment["quality"] - control["quality"]) / control["quality"],
            "latency_change": (treatment["latency"] - control["latency"]) / control["latency"],
            "user_preference": treatment["regeneration_rate"] < control["regeneration_rate"],
            "statistical_significance": self.compute_significance(control, treatment),
        }

    def decision(self, results: dict) -> str:
        """Make rollout decision"""
        if not results["statistical_significance"]:
            return "CONTINUE_TEST"
        if results["quality_lift"] > 0.05 and results["latency_change"] < 0.1:
            return "SHIP_TREATMENT"
        if results["quality_lift"] < -0.02:
            return "REVERT_TO_CONTROL"
        return "CONTINUE_TEST"
Evaluation Checklist
def evaluation_checklist():
    return {
        "before_training": [
            "[ ] Define success criteria (metrics and thresholds)",
            "[ ] Create task-specific test set",
            "[ ] Create regression test set",
            "[ ] Establish baseline scores",
        ],
        "during_training": [
            "[ ] Monitor training/validation loss",
            "[ ] Track validation metrics each checkpoint",
            "[ ] Save checkpoint when validation improves",
        ],
        "after_training": [
            "[ ] Run task-specific eval suite",
            "[ ] Run regression suite",
            "[ ] Compare to base model head-to-head",
            "[ ] Analyze errors and failure modes",
        ],
        "before_deployment": [
            "[ ] Passes all task-specific criteria",
            "[ ] No significant regressions",
            "[ ] Error analysis doesn't show blockers",
            "[ ] Approved by stakeholders",
        ],
        "after_deployment": [
            "[ ] A/B test running",
            "[ ] Quality monitoring active",
            "[ ] Rollback plan tested",
            "[ ] Decision criteria for ship/revert defined",
        ],
    }
Training metrics tell you the model learned something. Generic benchmarks tell you it is still a capable LLM. Task-specific evals tell you it does your job well. A/B tests tell you users prefer it. You need every layer to know fine-tuning actually worked.