When to Use LLM-as-Judge
Art critics and accountants evaluate different things. The critic judges aesthetic quality, emotional impact, creativity. The accountant verifies numbers match receipts. You wouldn't ask an accountant to judge a painting or a critic to audit your books.
LLM-as-judge works the same way. It excels at subjective assessment: Is this response helpful? Is the tone appropriate? Does this explanation make sense? It struggles with objective verification: Is this fact true? Does this code work? Is this number correct?
Where LLM Judges Excel
def llm_judge_strengths():
    return {
        "subjective_quality": {
            "examples": [
                "Is this response helpful?",
                "Is the tone appropriate for customer service?",
                "Is this explanation clear and understandable?",
                "Is this summary accurate to the source?",
            ],
            "why_llm_works": "These require human-like judgment that LLMs capture well",
            "agreement_with_humans": "85-95% on well-designed rubrics",
        },
        "style_and_format": {
            "examples": [
                "Does this match the requested style?",
                "Is this response appropriately formal/casual?",
                "Does this follow the template structure?",
            ],
            "why_llm_works": "Pattern matching on learned examples",
            "agreement_with_humans": "90%+ on clear criteria",
        },
        "coherence_and_logic": {
            "examples": [
                "Does this argument flow logically?",
                "Are there internal contradictions?",
                "Is this response self-consistent?",
            ],
            "why_llm_works": "Reasoning about text structure",
            "agreement_with_humans": "80-90%",
        },
        "comparative_ranking": {
            "examples": [
                "Which of these two responses is better?",
                "Rank these answers from best to worst",
            ],
            "why_llm_works": "Relative judgment often easier than absolute",
            "agreement_with_humans": "Higher than absolute scoring",
        },
    }
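Comparative ranking is the easiest of these strengths to operationalize. Below is a minimal pairwise-comparison sketch; it assumes only a client object exposing a generate(prompt, temperature) method, and both the judge_pair name and the prompt wording are illustrative rather than any particular library's API.

def judge_pair(client, task: str, response_a: str, response_b: str) -> str:
    """Ask the judge which of two responses is better; returns 'A', 'B',
    or 'UNPARSEABLE' if the verdict cannot be read back."""
    prompt = f"""You are comparing two responses to the same task.

Task: {task}

Response A: {response_a}

Response B: {response_b}

Which response is better overall? Answer with exactly one letter: A or B."""
    verdict = client.generate(prompt, temperature=0).strip().upper()
    return verdict if verdict in ("A", "B") else "UNPARSEABLE"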
Where LLM Judges Fail
def llm_judge_weaknesses():
    return {
        "factual_correctness": {
            "examples": [
                "Is this historical date correct?",
                "Does this statistic match the source?",
                "Is this API response accurate?",
            ],
            "why_llm_fails": "LLM may share the same misconceptions as the response",
            "danger": "Confidently validating false information",
            "alternative": "Lookup against authoritative sources",
        },
        "code_correctness": {
            "examples": [
                "Does this code compile?",
                "Does this function return the right output?",
                "Are there bugs in this implementation?",
            ],
            "why_llm_fails": "Can miss subtle bugs, especially edge cases",
            "danger": "Approving code that doesn't work",
            "alternative": "Execute code, run tests",
        },
        "math_verification": {
            "examples": [
                "Is this calculation correct?",
                "Does this proof hold?",
                "Is this equation balanced?",
            ],
            "why_llm_fails": "LLMs make arithmetic errors, miss steps",
            "danger": "Wrong answers marked as correct",
            "alternative": "Symbolic math tools, calculators",
        },
        "external_knowledge": {
            "examples": [
                "Is this person's biography accurate?",
                "Does this match current market prices?",
                "Is this legal interpretation correct?",
            ],
            "why_llm_fails": "Knowledge cutoff, potential hallucination",
            "danger": "Validating outdated or false information",
            "alternative": "RAG with authoritative sources",
        },
    }
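For code and math, the listed alternatives are deterministic by nature. As a concrete example of "Execute code, run tests", the sketch below runs a candidate snippet against a small test and reads the exit code instead of asking a model whether the code looks right. check_python_snippet is a hypothetical helper; a production harness would add sandboxing, dependency management, and resource limits.

import subprocess
import sys
import tempfile

def check_python_snippet(code: str, test_code: str, timeout: int = 10) -> bool:
    """Objective pass/fail: write snippet + test to a temp file and execute it.
    A failing assertion or uncaught exception makes the process exit non-zero."""
    with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f:
        f.write(code + "\n\n" + test_code)
        path = f.name
    try:
        result = subprocess.run(
            [sys.executable, path], capture_output=True, text=True, timeout=timeout
        )
    except subprocess.TimeoutExpired:
        return False  # treat hangs as failures
    return result.returncode == 0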
Implementing LLM-as-Judge
import re


class LLMJudge:
    """
    Effective LLM-based evaluation.

    `model` is any client object that exposes generate(prompt, temperature)
    and returns text -- for example, a thin wrapper around a hosted model
    such as claude-3-sonnet.
    """

    def __init__(self, model, temperature: float = 0.0):
        self.model = model
        self.temperature = temperature  # 0 for consistency

    def evaluate_with_rubric(
        self,
        response: str,
        task: str,
        rubric: dict
    ) -> dict:
        """
        Structured evaluation with explicit criteria
        """
        prompt = f"""Evaluate this response according to the rubric.

Task: {task}

Response: {response}

Rubric:
{self.format_rubric(rubric)}

For each criterion, provide:
1. Score (1-5)
2. One sentence justification

Format your response as:
Criterion 1: Score - Justification
Criterion 2: Score - Justification
..."""
        result = self.model.generate(prompt, temperature=self.temperature)
        return self.parse_structured_result(result)

    def format_rubric(self, rubric: dict) -> str:
        """Convert rubric dict to prompt text"""
        lines = []
        for criterion, description in rubric.items():
            lines.append(f"- {criterion}: {description}")
            lines.append("  1 = Poor, 3 = Acceptable, 5 = Excellent")
        return "\n".join(lines)

    def parse_structured_result(self, result: str) -> dict:
        """Parse judge output shaped like 'criterion: score - justification'."""
        scores = {}
        for line in result.splitlines():
            match = re.match(r"\s*(.+?):\s*([1-5])\s*-\s*(.+)", line)
            if match:
                criterion, score, justification = match.groups()
                scores[criterion.strip().lower()] = {
                    "score": int(score),
                    "justification": justification.strip(),
                }
        return scores
# Example rubric
HELPFULNESS_RUBRIC = {
    "relevance": "Does the response address the user's actual question?",
    "completeness": "Does it cover the key points needed?",
    "clarity": "Is it easy to understand?",
    "actionability": "Can the user act on this information?",
}
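A small usage sketch of the judge above. EchoClient is a toy stand-in used only to show the expected plumbing; in practice, model would be whatever wrapper you use around your LLM API, exposing generate(prompt, temperature).

class EchoClient:
    """Toy client that returns a canned judgment, for wiring/parsing tests."""
    def generate(self, prompt: str, temperature: float = 0.0) -> str:
        return "relevance: 4 - Addresses the question.\nclarity: 5 - Easy to follow."

judge = LLMJudge(model=EchoClient())
scores = judge.evaluate_with_rubric(
    response="Restart the router, then re-run the speed test.",
    task="Help the user fix slow Wi-Fi.",
    rubric=HELPFULNESS_RUBRIC,
)
print(scores)
# {'relevance': {'score': 4, 'justification': 'Addresses the question.'},
#  'clarity': {'score': 5, 'justification': 'Easy to follow.'}}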
Calibration and Validation
def calibrate_llm_judge():
    """
    Ensure your LLM judge agrees with human judgment
    """
    return {
        "calibration_process": """
            1. Collect 200+ responses with human labels
            2. Run LLM judge on same responses
            3. Calculate agreement metrics
            4. Analyze disagreements
            5. Refine prompt/rubric based on patterns
        """,
        "metrics_to_track": {
            "agreement_rate": "% of cases where LLM and human agree",
            "cohens_kappa": "Agreement adjusted for chance (> 0.7 good)",
            "bias": "Does LLM systematically over/under score?",
            "variance": "How consistent is LLM across retries?",
        },
        "when_to_recalibrate": [
            "After changing judge model",
            "After changing prompt/rubric",
            "When agreement drops below threshold",
            "Quarterly regardless",
        ],
        "implementation": """
            class CalibratedJudge:
                def __init__(self, judge, human_labels):
                    self.judge = judge
                    self.calibration_data = human_labels
                    self.calibration_score = self.compute_agreement()

                def is_calibrated(self) -> bool:
                    return self.calibration_score > 0.85

                def recalibrate(self, new_labels):
                    self.calibration_data.extend(new_labels)
                    self.calibration_score = self.compute_agreement()
                    if not self.is_calibrated():
                        logging.warning("Judge calibration below threshold")
        """,
    }
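The compute_agreement step referenced above can be as simple as raw agreement plus Cohen's kappa over aligned label lists. Below is a self-contained sketch that assumes no metrics library; agreement_metrics is an illustrative helper name.

from collections import Counter

def agreement_metrics(human_labels: list, judge_labels: list) -> dict:
    """Raw agreement rate and Cohen's kappa for two aligned lists of labels."""
    assert human_labels and len(human_labels) == len(judge_labels)
    n = len(human_labels)
    observed = sum(h == j for h, j in zip(human_labels, judge_labels)) / n
    # Chance agreement from each rater's marginal label distribution
    h_counts, j_counts = Counter(human_labels), Counter(judge_labels)
    expected = sum(
        (h_counts[label] / n) * (j_counts[label] / n)
        for label in set(human_labels) | set(judge_labels)
    )
    if expected == 1:  # degenerate case: both raters always give the same single label
        return {"agreement_rate": observed, "cohens_kappa": 1.0}
    kappa = (observed - expected) / (1 - expected)
    return {"agreement_rate": observed, "cohens_kappa": kappa}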
Reducing Judge Variability
def reduce_variability():
    return {
        "temperature_zero": {
            "setting": "temperature=0",
            "effect": "Reduces randomness in judge output",
            "note": "Some providers still have variability at 0",
        },
        "structured_output": {
            "approach": "Request specific format",
            "example": "Score: [1-5], Reason: [one sentence]",
            "benefit": "Easier to parse, forces commitment",
        },
        "multiple_evaluations": {
            "approach": "Run judge 3 times, take majority/average",
            "when": "High-stakes decisions",
            "cost": "3x evaluation cost",
        },
        "anchor_examples": {
            "approach": "Include examples of each score level in prompt",
            "example": """
                Here are examples of each score:
                Score 1 (Poor): [example of bad response]
                Score 3 (Acceptable): [example of ok response]
                Score 5 (Excellent): [example of great response]
            """,
            "benefit": "Calibrates judge expectations",
        },
    }
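The multiple_evaluations strategy is a thin wrapper around the LLMJudge class shown earlier. The sketch below assumes the {criterion: {"score": ...}} result shape from that class and reports the median plus the spread per criterion, so high-variance criteria stay visible instead of being silently averaged away.

import statistics

def evaluate_with_retries(judge, response: str, task: str, rubric: dict, n: int = 3) -> dict:
    """Run the judge n times and aggregate per-criterion scores."""
    runs = [judge.evaluate_with_rubric(response, task, rubric) for _ in range(n)]
    aggregated = {}
    for criterion in rubric:
        scores = [run[criterion]["score"] for run in runs if criterion in run]
        if scores:
            aggregated[criterion] = {
                "median_score": statistics.median(scores),
                "spread": max(scores) - min(scores),  # large spread = unstable judgment
            }
    return aggregated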
Hybrid Evaluation Systems
class HybridEvaluator:
    """
    Combine LLM judges with deterministic checks.

    The component checkers (format_checker, length_checker, safety_classifier,
    code_executor, fact_checker, llm_judge) are assumed to be injected at
    construction time.
    """

    def evaluate(self, request: dict, response: str) -> dict:
        results = {}

        # Deterministic checks first (fast, reliable)
        results["format"] = self.format_checker.check(response)
        results["length"] = self.length_checker.check(response)
        results["safety"] = self.safety_classifier.check(response)

        # Code execution for code responses
        if request.get("task_type") == "code":
            results["execution"] = self.code_executor.test(response)

        # Fact verification for factual claims
        if request.get("requires_facts"):
            results["facts"] = self.fact_checker.verify(response)

        # LLM judge for subjective quality
        results["quality"] = self.llm_judge.evaluate(
            response=response,
            task=request["task"],
            rubric=self.get_rubric(request["task_type"]),
        )

        return {
            "scores": results,
            "pass": self.compute_pass(results),
            "method_used": list(results.keys()),
        }
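The compute_pass policy is referenced above but not shown. One reasonable policy, written here as a standalone sketch, treats every deterministic check as a hard gate and thresholds the average subjective score. It assumes the checkers return dicts with a boolean "passed" key and that the LLM judge returns {criterion: {"score": int}}; both are assumptions for illustration, not fixed interfaces.

def compute_pass(results: dict, quality_threshold: float = 3.5) -> bool:
    """Hard-gate deterministic checks, then threshold the LLM quality score."""
    hard_gates = ("format", "length", "safety", "execution", "facts")
    for gate in hard_gates:
        if gate in results and not results[gate].get("passed", False):
            return False  # any failed deterministic check fails the response
    scores = [v["score"] for v in results.get("quality", {}).values()]
    # No quality scores means there is nothing subjective to gate on
    return not scores or sum(scores) / len(scores) >= quality_threshold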
Common Pitfalls
def llm_judge_pitfalls():
    return {
        "trusting_for_facts": {
            "mistake": "Using LLM to verify factual accuracy",
            "why_bad": "LLM may share same misconceptions",
            "fix": "Use external knowledge sources for facts",
        },
        "vague_rubrics": {
            "mistake": "'Rate quality 1-10' without criteria",
            "why_bad": "Inconsistent, unmotivated scores",
            "fix": "Explicit criteria with examples",
        },
        "no_calibration": {
            "mistake": "Deploy LLM judge without validation",
            "why_bad": "No idea if it matches human judgment",
            "fix": "Always calibrate against human labels",
        },
        "ignoring_variance": {
            "mistake": "Single LLM evaluation as ground truth",
            "why_bad": "Random variation affects decisions",
            "fix": "Multiple evaluations for important decisions",
        },
        "position_bias": {
            "mistake": "Not randomizing order in comparisons",
            "why_bad": "LLMs may prefer first or second response",
            "fix": "Randomize order, run both orderings",
        },
    }
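For the position-bias pitfall, the cheapest mitigation is to run the comparison in both orders and only keep verdicts that survive the swap. The sketch below reuses the judge_pair function from the comparative-ranking example earlier; the tie-breaking policy is one choice among several.

def debiased_pair_judgment(client, task: str, response_a: str, response_b: str) -> str:
    """Judge A-vs-B and B-vs-A; a verdict that flips with position becomes a tie."""
    first = judge_pair(client, task, response_a, response_b)   # A shown first
    second = judge_pair(client, task, response_b, response_a)  # B shown first
    if first == "A" and second == "B":
        return "A"
    if first == "B" and second == "A":
        return "B"
    return "TIE"  # position-dependent or unparseable verdicts count as no preference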
LLM judges are tools with specific strengths. They simulate human judgment on subjective questions effectively. They fail at objective verification tasks. Use them where they excel, combine them with deterministic checks where they don't, and always validate against human judgment.