When to Use LLM-as-Judge
Art critics and accountants evaluate different things. The critic judges aesthetic quality, emotional impact, creativity. The accountant verifies numbers match receipts. You wouldn't ask an accountant to judge a painting or a critic to audit your books.
LLM-as-judge works the same way. It excels at subjective assessment: Is this response helpful? Is the tone appropriate? Does this explanation make sense? It struggles with objective verification: Is this fact true? Does this code work? Is this number correct?
Where LLM Judges Excel
def llm_judge_strengths():
    return {
        "subjective_quality": {
            "examples": [
                "Is this response helpful?",
                "Is the tone appropriate for customer service?",
                "Is this explanation clear and understandable?",
                "Is this summary accurate to the source?",
            ],
            "why_llm_works": "These require human-like judgment that LLMs capture well",
            "agreement_with_humans": "85-95% on well-designed rubrics",
        },
        "style_and_format": {
            "examples": [
                "Does this match the requested style?",
                "Is this response appropriately formal/casual?",
                "Does this follow the template structure?",
            ],
            "why_llm_works": "Pattern matching on learned examples",
            "agreement_with_humans": "90%+ on clear criteria",
        },
        "coherence_and_logic": {
            "examples": [
                "Does this argument flow logically?",
                "Are there internal contradictions?",
                "Is this response self-consistent?",
            ],
            "why_llm_works": "Reasoning about text structure",
            "agreement_with_humans": "80-90%",
        },
        "comparative_ranking": {
            "examples": [
                "Which of these two responses is better?",
                "Rank these answers from best to worst",
            ],
            "why_llm_works": "Relative judgment often easier than absolute",
            "agreement_with_humans": "Higher than absolute scoring",
        },
    }
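Comparative ranking is the easiest of these strengths to operationalize. Below is a minimal pairwise-comparison sketch; it assumes only a client object exposing a generate(prompt, temperature) method, and both the judge_pair name and the prompt wording are illustrative rather than any particular library's API.

def judge_pair(client, task: str, response_a: str, response_b: str) -> str:
    """Ask the judge which of two responses is better; returns 'A', 'B',
    or 'UNPARSEABLE' if the verdict cannot be read back."""
    prompt = f"""You are comparing two responses to the same task.

Task: {task}

Response A: {response_a}

Response B: {response_b}

Which response is better overall? Answer with exactly one letter: A or B."""
    verdict = client.generate(prompt, temperature=0).strip().upper()
    return verdict if verdict in ("A", "B") else "UNPARSEABLE"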
Where LLM Judges Fail
def llm_judge_weaknesses():
    return {
        "factual_correctness": {
            "examples": [
                "Is this historical date correct?",
                "Does this statistic match the source?",
                "Is this API response accurate?",
            ],
            "why_llm_fails": "LLM may share the same misconceptions as the response",
            "danger": "Confidently validating false information",
            "alternative": "Lookup against authoritative sources",
        },
        "code_correctness": {
            "examples": [
                "Does this code compile?",
                "Does this function return the right output?",
                "Are there bugs in this implementation?",
            ],
            "why_llm_fails": "Can miss subtle bugs, especially edge cases",
            "danger": "Approving code that doesn't work",
            "alternative": "Execute code, run tests",
        },
        "math_verification": {
            "examples": [
                "Is this calculation correct?",
                "Does this proof hold?",
                "Is this equation balanced?",
            ],
            "why_llm_fails": "LLMs make arithmetic errors, miss steps",
            "danger": "Wrong answers marked as correct",
            "alternative": "Symbolic math tools, calculators",
        },
        "external_knowledge": {
            "examples": [
                "Is this person's biography accurate?",
                "Does this match current market prices?",
                "Is this legal interpretation correct?",
            ],
            "why_llm_fails": "Knowledge cutoff, potential hallucination",
            "danger": "Validating outdated or false information",
            "alternative": "RAG with authoritative sources",
        },
    }
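For code and math, the listed alternatives are deterministic by nature. As a concrete example of "Execute code, run tests", the sketch below runs a candidate snippet against a small test and reads the exit code instead of asking a model whether the code looks right. check_python_snippet is a hypothetical helper; a production harness would add sandboxing, dependency management, and resource limits.

import subprocess
import sys
import tempfile

def check_python_snippet(code: str, test_code: str, timeout: int = 10) -> bool:
    """Objective pass/fail: write snippet + test to a temp file and execute it.
    A failing assertion or uncaught exception makes the process exit non-zero."""
    with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f:
        f.write(code + "\n\n" + test_code)
        path = f.name
    try:
        result = subprocess.run(
            [sys.executable, path], capture_output=True, text=True, timeout=timeout
        )
    except subprocess.TimeoutExpired:
        return False  # treat hangs as failures
    return result.returncode == 0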
Implementing LLM-as-Judge
import re


class LLMJudge:
    """
    Effective LLM-based evaluation.

    `model` is any client object that exposes generate(prompt, temperature)
    and returns text -- for example, a thin wrapper around a hosted model
    such as claude-3-sonnet.
    """

    def __init__(self, model, temperature: float = 0.0):
        self.model = model
        self.temperature = temperature  # 0 for consistency

    def evaluate_with_rubric(
        self,
        response: str,
        task: str,
        rubric: dict
    ) -> dict:
        """
        Structured evaluation with explicit criteria
        """
        prompt = f"""Evaluate this response according to the rubric.

Task: {task}

Response: {response}

Rubric:
{self.format_rubric(rubric)}

For each criterion, provide:
1. Score (1-5)
2. One sentence justification

Format your response as:
Criterion 1: Score - Justification
Criterion 2: Score - Justification
..."""
        result = self.model.generate(prompt, temperature=self.temperature)
        return self.parse_structured_result(result)

    def format_rubric(self, rubric: dict) -> str:
        """Convert rubric dict to prompt text"""
        lines = []
        for criterion, description in rubric.items():
            lines.append(f"- {criterion}: {description}")
            lines.append("  1 = Poor, 3 = Acceptable, 5 = Excellent")
        return "\n".join(lines)

    def parse_structured_result(self, result: str) -> dict:
        """Parse judge output shaped like 'criterion: score - justification'."""
        scores = {}
        for line in result.splitlines():
            match = re.match(r"\s*(.+?):\s*([1-5])\s*-\s*(.+)", line)
            if match:
                criterion, score, justification = match.groups()
                scores[criterion.strip().lower()] = {
                    "score": int(score),
                    "justification": justification.strip(),
                }
        return scores
# Example rubric
HELPFULNESS_RUBRIC = {
    "relevance": "Does the response address the user's actual question?",
    "completeness": "Does it cover the key points needed?",
    "clarity": "Is it easy to understand?",
    "actionability": "Can the user act on this information?",
}
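A small usage sketch of the judge above. EchoClient is a toy stand-in used only to show the expected plumbing; in practice, model would be whatever wrapper you use around your LLM API, exposing generate(prompt, temperature).

class EchoClient:
    """Toy client that returns a canned judgment, for wiring/parsing tests."""
    def generate(self, prompt: str, temperature: float = 0.0) -> str:
        return "relevance: 4 - Addresses the question.\nclarity: 5 - Easy to follow."

judge = LLMJudge(model=EchoClient())
scores = judge.evaluate_with_rubric(
    response="Restart the router, then re-run the speed test.",
    task="Help the user fix slow Wi-Fi.",
    rubric=HELPFULNESS_RUBRIC,
)
print(scores)
# {'relevance': {'score': 4, 'justification': 'Addresses the question.'},
#  'clarity': {'score': 5, 'justification': 'Easy to follow.'}}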
Calibration and Validation
def calibrate_llm_judge():
    """
    Ensure your LLM judge agrees with human judgment
    """
    return {
        "calibration_process": """
            1. Collect 200+ responses with human labels
            2. Run LLM judge on same responses
            3. Calculate agreement metrics
            4. Analyze disagreements
            5. Refine prompt/rubric based on patterns
        """,
        "metrics_to_track": {
            "agreement_rate": "% of cases where LLM and human agree",
            "cohens_kappa": "Agreement adjusted for chance (> 0.7 good)",
            "bias": "Does LLM systematically over/under score?",
            "variance": "How consistent is LLM across retries?",
        },
        "when_to_recalibrate": [
            "After changing judge model",
            "After changing prompt/rubric",
            "When agreement drops below threshold",
            "Quarterly regardless",
        ],
        "implementation": """
            class CalibratedJudge:
                def __init__(self, judge, human_labels):
                    self.judge = judge
                    self.calibration_data = human_labels
                    self.calibration_score = self.compute_agreement()

                def is_calibrated(self) -> bool:
                    return self.calibration_score > 0.85

                def recalibrate(self, new_labels):
                    self.calibration_data.extend(new_labels)
                    self.calibration_score = self.compute_agreement()
                    if not self.is_calibrated():
                        logging.warning("Judge calibration below threshold")
        """,
    }
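The compute_agreement step referenced above can be as simple as raw agreement plus Cohen's kappa over aligned label lists. Below is a self-contained sketch that assumes no metrics library; agreement_metrics is an illustrative helper name.

from collections import Counter

def agreement_metrics(human_labels: list, judge_labels: list) -> dict:
    """Raw agreement rate and Cohen's kappa for two aligned lists of labels."""
    assert human_labels and len(human_labels) == len(judge_labels)
    n = len(human_labels)
    observed = sum(h == j for h, j in zip(human_labels, judge_labels)) / n
    # Chance agreement from each rater's marginal label distribution
    h_counts, j_counts = Counter(human_labels), Counter(judge_labels)
    expected = sum(
        (h_counts[label] / n) * (j_counts[label] / n)
        for label in set(human_labels) | set(judge_labels)
    )
    if expected == 1:  # degenerate case: both raters always give the same single label
        return {"agreement_rate": observed, "cohens_kappa": 1.0}
    kappa = (observed - expected) / (1 - expected)
    return {"agreement_rate": observed, "cohens_kappa": kappa}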
Reducing Judge Variability
def reduce_variability():
    return {
        "temperature_zero": {
            "setting": "temperature=0",
            "effect": "Reduces randomness in judge output",
            "note": "Some providers still have variability at 0",
        },
        "structured_output": {
            "approach": "Request specific format",
            "example": "Score: [1-5], Reason: [one sentence]",
            "benefit": "Easier to parse, forces commitment",
        },
        "multiple_evaluations": {
            "approach": "Run judge 3 times, take majority/average",
            "when": "High-stakes decisions",
            "cost": "3x evaluation cost",
        },
        "anchor_examples": {
            "approach": "Include examples of each score level in prompt",
            "example": """
                Here are examples of each score:
                Score 1 (Poor): [example of bad response]
                Score 3 (Acceptable): [example of ok response]
                Score 5 (Excellent): [example of great response]
            """,
            "benefit": "Calibrates judge expectations",
        },
    }
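The multiple_evaluations strategy is a thin wrapper around the LLMJudge class shown earlier. The sketch below assumes the {criterion: {"score": ...}} result shape from that class and reports the median plus the spread per criterion, so high-variance criteria stay visible instead of being silently averaged away.

import statistics

def evaluate_with_retries(judge, response: str, task: str, rubric: dict, n: int = 3) -> dict:
    """Run the judge n times and aggregate per-criterion scores."""
    runs = [judge.evaluate_with_rubric(response, task, rubric) for _ in range(n)]
    aggregated = {}
    for criterion in rubric:
        scores = [run[criterion]["score"] for run in runs if criterion in run]
        if scores:
            aggregated[criterion] = {
                "median_score": statistics.median(scores),
                "spread": max(scores) - min(scores),  # large spread = unstable judgment
            }
    return aggregated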
Hybrid Evaluation Systems
class HybridEvaluator:
    """
    Combine LLM judges with deterministic checks.

    The component checkers (format_checker, length_checker, safety_classifier,
    code_executor, fact_checker, llm_judge) are assumed to be injected at
    construction time.
    """

    def evaluate(self, request: dict, response: str) -> dict:
        results = {}

        # Deterministic checks first (fast, reliable)
        results["format"] = self.format_checker.check(response)
        results["length"] = self.length_checker.check(response)
        results["safety"] = self.safety_classifier.check(response)

        # Code execution for code responses
        if request.get("task_type") == "code":
            results["execution"] = self.code_executor.test(response)

        # Fact verification for factual claims
        if request.get("requires_facts"):
            results["facts"] = self.fact_checker.verify(response)

        # LLM judge for subjective quality
        results["quality"] = self.llm_judge.evaluate(
            response=response,
            task=request["task"],
            rubric=self.get_rubric(request["task_type"]),
        )

        return {
            "scores": results,
            "pass": self.compute_pass(results),
            "method_used": list(results.keys()),
        }
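The compute_pass policy is referenced above but not shown. One reasonable policy, written here as a standalone sketch, treats every deterministic check as a hard gate and thresholds the average subjective score. It assumes the checkers return dicts with a boolean "passed" key and that the LLM judge returns {criterion: {"score": int}}; both are assumptions for illustration, not fixed interfaces.

def compute_pass(results: dict, quality_threshold: float = 3.5) -> bool:
    """Hard-gate deterministic checks, then threshold the LLM quality score."""
    hard_gates = ("format", "length", "safety", "execution", "facts")
    for gate in hard_gates:
        if gate in results and not results[gate].get("passed", False):
            return False  # any failed deterministic check fails the response
    scores = [v["score"] for v in results.get("quality", {}).values()]
    # No quality scores means there is nothing subjective to gate on
    return not scores or sum(scores) / len(scores) >= quality_threshold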
Common Pitfalls
def llm_judge_pitfalls():
    return {
        "trusting_for_facts": {
            "mistake": "Using LLM to verify factual accuracy",
            "why_bad": "LLM may share same misconceptions",
            "fix": "Use external knowledge sources for facts",
        },
        "vague_rubrics": {
            "mistake": "'Rate quality 1-10' without criteria",
            "why_bad": "Inconsistent, unmotivated scores",
            "fix": "Explicit criteria with examples",
        },
        "no_calibration": {
            "mistake": "Deploy LLM judge without validation",
            "why_bad": "No idea if it matches human judgment",
            "fix": "Always calibrate against human labels",
        },
        "ignoring_variance": {
            "mistake": "Single LLM evaluation as ground truth",
            "why_bad": "Random variation affects decisions",
            "fix": "Multiple evaluations for important decisions",
        },
        "position_bias": {
            "mistake": "Not randomizing order in comparisons",
            "why_bad": "LLMs may prefer first or second response",
            "fix": "Randomize order, run both orderings",
        },
    }
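For the position-bias pitfall, the cheapest mitigation is to run the comparison in both orders and only keep verdicts that survive the swap. The sketch below reuses the judge_pair function from the comparative-ranking example earlier; the tie-breaking policy is one choice among several.

def debiased_pair_judgment(client, task: str, response_a: str, response_b: str) -> str:
    """Judge A-vs-B and B-vs-A; a verdict that flips with position becomes a tie."""
    first = judge_pair(client, task, response_a, response_b)   # A shown first
    second = judge_pair(client, task, response_b, response_a)  # B shown first
    if first == "A" and second == "B":
        return "A"
    if first == "B" and second == "A":
        return "B"
    return "TIE"  # position-dependent or unparseable verdicts count as no preference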
LLM judges are tools with specific strengths. They simulate human judgment on subjective questions effectively. They fail at objective verification tasks. Use them where they excel, combine them with deterministic checks where they don't, and always validate against human judgment.