Building Evals That Catch Real Problems
Smoke detectors save lives because they detect actual fires, not because they're sensitive. A detector that triggers on steam from a shower teaches people to ignore alarms. The best safety device is useless if it cries wolf.
LLM evals face the same trap. An eval that flags good responses as bad trains teams to ignore it. An eval that misses real problems gives false confidence. Good evals catch what matters and stay quiet otherwise.
Principles of Effective Evals
def eval_design_principles():
    return {
        "detect_real_problems": {
            "good": "Catches responses that harm user experience",
            "bad": "Flags stylistic preferences as failures",
            "test": "Would a user complain about this?",
        },
        "low_false_positive_rate": {
            "good": "< 1% false positive rate",
            "bad": "10% of flagged responses are actually fine",
            "consequence": "Teams ignore or disable the eval",
        },
        "deterministic_when_possible": {
            "good": "Same input always produces same result",
            "bad": "Eval results vary on retry",
            "why": "Reproducibility for debugging",
        },
        "fast_feedback": {
            "good": "Results in CI pipeline",
            "bad": "Results days after deployment",
            "why": "Fix problems before they spread",
        },
        "versioned_and_tracked": {
            "good": "Eval v2.3 ran on model v1.5",
            "bad": "Which eval version caught this?",
            "why": "Reproduce and compare over time",
        },
    }
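To make the fast-feedback principle concrete: a deterministic eval can run as an ordinary test on every PR. Here is a minimal pytest sketch, where generate_response is a hypothetical stand-in for your application's model call, not a real API.

import pytest

# Hypothetical stand-in for your application's model call; swap in the real client.
def generate_response(prompt: str) -> str:
    return "1. Contact support\n2. Provide your order ID\n3. Expect a refund within 5 days"

GOLDEN_PROMPTS = [
    "Summarize our refund policy as a numbered list.",
    "List the steps to request a refund.",
]

@pytest.mark.parametrize("prompt", GOLDEN_PROMPTS)
def test_response_is_well_formed(prompt):
    response = generate_response(prompt)
    # Cheap deterministic checks that fail the CI job before a regression ships
    assert 50 <= len(response) <= 2000, "Length outside expected bounds"
    assert any(line.lstrip().startswith(("-", "*", "1.")) for line in response.splitlines())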
Building Deterministic Evals
import json
import re


class DeterministicEvals:
    """
    Evals that always give the same result for the same input
    """

    def format_check(self, response: str, expected_format: str) -> dict:
        """Check response matches expected format"""
        if expected_format == "json":
            try:
                json.loads(response)
                return {"pass": True, "reason": "Valid JSON"}
            except json.JSONDecodeError as e:
                return {"pass": False, "reason": f"Invalid JSON: {e}"}
        if expected_format == "markdown_list":
            lines = response.strip().split("\n")
            has_list = any(line.strip().startswith(("-", "*", "1.")) for line in lines)
            return {
                "pass": has_list,
                "reason": "Contains list" if has_list else "No list found",
            }
        # Fail loudly instead of silently returning None for unknown formats
        return {"pass": False, "reason": f"Unknown format: {expected_format}"}

    def length_bounds(self, response: str, min_chars: int, max_chars: int) -> dict:
        """Check response length is within bounds"""
        length = len(response)
        if length < min_chars:
            return {"pass": False, "reason": f"Too short: {length} < {min_chars}"}
        if length > max_chars:
            return {"pass": False, "reason": f"Too long: {length} > {max_chars}"}
        return {"pass": True, "length": length}

    def contains_required(self, response: str, required: list) -> dict:
        """Check response contains required elements"""
        missing = [r for r in required if r.lower() not in response.lower()]
        return {
            "pass": len(missing) == 0,
            "missing": missing,
        }

    def forbidden_patterns(self, response: str, patterns: list) -> dict:
        """Check response doesn't contain forbidden patterns"""
        found = []
        for pattern in patterns:
            if re.search(pattern, response, re.IGNORECASE):
                found.append(pattern)
        return {
            "pass": len(found) == 0,
            "forbidden_found": found,
        }
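A short usage sketch showing how these checks might be combined on a single response. The method names and arguments come from the class above; the response text and thresholds are purely illustrative.

evals = DeterministicEvals()
response = '{"items": ["refund issued", "email sent"]}'

results = [
    evals.format_check(response, "json"),
    evals.length_bounds(response, min_chars=10, max_chars=2000),
    evals.contains_required(response, required=["refund"]),
    evals.forbidden_patterns(response, patterns=[r"as an ai language model"]),
]

for r in results:
    print(r)
print("overall:", all(r["pass"] for r in results))

Because every check is a pure function of the response, rerunning this on the same input always produces the same verdict.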
Building LLM-Based Evals
class LLMJudge:
    """
    Use LLM to evaluate subjective quality
    """

    def __init__(self, judge_model: str = "claude-3-haiku"):
        # self.model is assumed to be a client for the judge model; the
        # .generate() calls below stand in for whatever API you actually use
        self.model = judge_model
        self.temperature = 0  # Maximize consistency

    def evaluate_relevance(self, question: str, answer: str) -> dict:
        """Does the answer address the question?"""
        prompt = f"""Rate how well this answer addresses the question.
Question: {question}
Answer: {answer}
Score from 1-5:
1 = Completely irrelevant
2 = Mentions topic but doesn't answer
3 = Partially answers
4 = Mostly answers
5 = Fully addresses the question
Output only the number and one sentence explanation.
Example: 4 - Answers the main question but misses the follow-up about timing."""
        result = self.model.generate(prompt, temperature=0)
        score, explanation = self.parse_score(result)  # see the sketch below
        return {
            "score": score,
            "pass": score >= 3,
            "explanation": explanation,
        }

    def evaluate_coherence(self, response: str) -> dict:
        """Is the response internally consistent and logical?"""
        prompt = f"""Evaluate the coherence of this response.
Response: {response}
Check for:
- Internal contradictions
- Logical flow
- Clear structure
Score from 1-5:
1 = Incoherent or contradictory
3 = Understandable but disorganized
5 = Clear, logical, well-structured
Output only the number and one sentence explanation."""
        result = self.model.generate(prompt, temperature=0)
        score, explanation = self.parse_score(result)
        return {
            "score": score,
            "pass": score >= 3,
            "explanation": explanation,
        }
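The class above leaves parse_score undefined. Here is a minimal sketch of a method you could drop into LLMJudge, assuming the judge follows the "number - explanation" format requested in the prompts, with a conservative fallback when it does not:

import re

def parse_score(self, result: str) -> tuple:
    """Parse judge output like '4 - Answers the main question but misses timing.'"""
    match = re.match(r"\s*([1-5])\s*[-:.]?\s*(.*)", result.strip(), re.DOTALL)
    if not match:
        # Treat unparseable judge output as a failing score so it gets reviewed
        return 1, f"Could not parse judge output: {result[:100]}"
    return int(match.group(1)), match.group(2).strip()

Defaulting unparseable output to the lowest score keeps the judge from silently passing responses it never actually rated.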
Task-Specific Evals
import re

import sqlparse  # third-party: pip install sqlparse


class TaskSpecificEvals:
    """
    Evals designed for specific output types
    """

    def code_eval(self, code: str, test_cases: list) -> dict:
        """Execute code and run tests (self.sandbox is your isolated execution environment)"""
        try:
            # Sandbox execution: never run model-generated code in-process
            result = self.sandbox.execute(code, timeout=10)
            passed = 0
            failed = []
            for test in test_cases:
                try:
                    output = self.sandbox.call(test["function"], test["input"])
                    if output == test["expected"]:
                        passed += 1
                    else:
                        failed.append({
                            "test": test["name"],
                            "expected": test["expected"],
                            "got": output,
                        })
                except Exception as e:
                    failed.append({"test": test["name"], "error": str(e)})
            return {
                "pass": len(failed) == 0,
                "passed": passed,
                "total": len(test_cases),
                "failures": failed,
            }
        except Exception as e:
            return {"pass": False, "error": str(e)}

    def sql_eval(self, query: str, expected_schema: dict) -> dict:
        """Validate SQL query structure and safety"""
        parsed = sqlparse.parse(query)[0]
        checks = {
            # sqlparse returns "UNKNOWN" for statements it can't classify
            "valid_syntax": parsed.get_type() != "UNKNOWN",
            "no_destructive": not any(
                kw in query.upper()
                for kw in ["DROP", "DELETE", "TRUNCATE", "UPDATE"]
            ),
            "has_limit": "LIMIT" in query.upper(),
        }
        return {
            "pass": all(checks.values()),
            "checks": checks,
        }

    def math_eval(self, response: str, correct_answer: float, tolerance: float = 0.01) -> dict:
        """Extract numerical answer and compare"""
        # Extract numbers from response
        numbers = re.findall(r"[-+]?\d*\.?\d+", response)
        if not numbers:
            return {"pass": False, "reason": "No number found in response"}
        # Check last number (usually the final answer)
        answer = float(numbers[-1])
        diff = abs(answer - correct_answer)
        return {
            "pass": diff <= tolerance,
            "expected": correct_answer,
            "found": answer,
            "difference": diff,
        }
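As a quick illustration, math_eval only looks at the final number, which is usually what you want for chain-of-thought style answers. The example response and expected value below are made up:

evals = TaskSpecificEvals()

response = "First we take 15% of 80, which is 12, so the discounted price is 68."
print(evals.math_eval(response, correct_answer=68.0))
# -> {'pass': True, 'expected': 68.0, 'found': 68.0, 'difference': 0.0}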
Eval Versioning
from datetime import datetime


class VersionedEval:
    """
    Track eval versions for reproducibility
    """

    def __init__(self, name: str, version: str):
        self.name = name
        self.version = version
        self.created_at = datetime.now()

    def run(self, input_data: dict) -> dict:
        # Subclasses implement _evaluate() with the actual check
        result = self._evaluate(input_data)
        return {
            **result,
            "eval_name": self.name,
            "eval_version": self.version,
            "timestamp": datetime.now().isoformat(),
        }

    def compare_versions(self, v1_results: list, v2_results: list) -> dict:
        """Compare results between eval versions (lists assumed aligned by input)"""
        v1_pass_rate = sum(r["pass"] for r in v1_results) / len(v1_results)
        v2_pass_rate = sum(r["pass"] for r in v2_results) / len(v2_results)
        # Find disagreements
        disagreements = []
        for r1, r2 in zip(v1_results, v2_results):
            if r1["pass"] != r2["pass"]:
                disagreements.append({
                    "input": r1["input"],
                    "v1_pass": r1["pass"],
                    "v2_pass": r2["pass"],
                })
        return {
            "v1_pass_rate": v1_pass_rate,
            "v2_pass_rate": v2_pass_rate,
            "agreement_rate": 1 - len(disagreements) / len(v1_results),
            "disagreements": disagreements[:10],  # Sample
        }
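A brief usage sketch, assuming a hypothetical subclass that wraps a simple length check so _evaluate has something concrete to do:

class LengthEval(VersionedEval):
    """Illustrative subclass: passes when the response is between 50 and 2000 chars."""

    def _evaluate(self, input_data: dict) -> dict:
        length = len(input_data["response"])
        return {"pass": 50 <= length <= 2000, "input": input_data["response"][:80]}

eval_v23 = LengthEval(name="length_bounds", version="2.3")
print(eval_v23.run({"response": "Too short."}))
# -> {'pass': False, 'input': 'Too short.', 'eval_name': 'length_bounds',
#     'eval_version': '2.3', 'timestamp': '...'}

Because every result carries its eval name and version, you can later answer "which eval version caught this?" directly from the logs.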
Calibrating Evals Against Human Judgment
def calibration_process():
    return {
        "steps": [
            {
                "step": 1,
                "action": "Collect 500 responses with human labels",
                "output": "Ground truth dataset",
            },
            {
                "step": 2,
                "action": "Run eval on labeled responses",
                "output": "Eval predictions",
            },
            {
                "step": 3,
                "action": "Calculate agreement metrics",
                "metrics": ["Accuracy", "Precision", "Recall", "Cohen's Kappa"],
            },
            {
                "step": 4,
                "action": "Analyze disagreements",
                "questions": [
                    "Where does eval fail that humans pass?",
                    "Where does eval pass that humans fail?",
                    "Are there patterns in disagreements?",
                ],
            },
            {
                "step": 5,
                "action": "Adjust thresholds or eval logic",
                "goal": "Maximize agreement, minimize false positives",
            },
        ],
        "target_metrics": {
            "precision": "> 95% (when eval fails, it should be right)",
            "recall": "> 80% (catch most real problems)",
            "kappa": "> 0.7 (substantial agreement)",
        },
    }
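Step 3 is straightforward to script once the labels exist. A minimal sketch using scikit-learn, assuming human_labels and eval_predictions are aligned lists of booleans where True means "pass":

from sklearn.metrics import accuracy_score, precision_score, recall_score, cohen_kappa_score

def calibration_report(human_labels: list, eval_predictions: list) -> dict:
    # Treat "fail" as the positive class, since failures are what the eval exists to catch
    human_fail = [not label for label in human_labels]
    eval_fail = [not pred for pred in eval_predictions]
    return {
        "accuracy": accuracy_score(human_fail, eval_fail),
        "precision": precision_score(human_fail, eval_fail, zero_division=0),
        "recall": recall_score(human_fail, eval_fail, zero_division=0),
        "kappa": cohen_kappa_score(human_fail, eval_fail),
    }

Treating "fail" as the positive class keeps the numbers aligned with the targets above: precision asks how often a flagged failure is a real one, recall asks how many real failures get flagged.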
Common Eval Mistakes
def eval_antipatterns():
    return {
        "testing_style_not_substance": {
            "bad": "Fails if response doesn't start with 'I'",
            "good": "Fails if response doesn't address the question",
            "why": "Style preferences aren't quality issues",
        },
        "overfitting_to_examples": {
            "bad": "Eval works on 10 test cases, fails in production",
            "good": "Eval validated on diverse, representative sample",
            "why": "Real queries are more varied than test sets",
        },
        "ignoring_false_positives": {
            "bad": "99% recall, 50% precision",
            "good": "95% recall, 95% precision",
            "why": "Too many false positives train teams to ignore alerts",
        },
        "no_versioning": {
            "bad": "Modified eval, broke comparison to historical data",
            "good": "Eval v2.3, results tagged with version",
            "why": "Need to track changes over time",
        },
        "too_slow_for_ci": {
            "bad": "Eval takes 4 hours, runs weekly",
            "good": "Eval takes 10 minutes, runs on every PR",
            "why": "Fast feedback prevents bad code from merging",
        },
    }
Good evals are investments that pay dividends. They catch problems before users do, give confidence to deploy changes, and create institutional knowledge about what quality means. Build them carefully, calibrate them regularly, and version everything.