Choosing the Right Precision for Your Model
Camera sensors capture more dynamic range than displays show. Professional photographers shoot in RAW, then compress for delivery. The final format depends on the destination—print, web, or mobile.
Model precision follows similar logic. Training captures full precision. Serving can use less. The right precision depends on your quality requirements and hardware constraints.
The Precision Ladder
def precision_hierarchy():
    return {
        "FP32": {
            "bits": 32,
            "range": "Very large",
            "precision": "High",
            "when_needed": "Training, reference implementation",
            "inference_use": "Almost never (too slow, twice the memory)",
        },
        "FP16": {
            "bits": 16,
            "range": "Moderate (overflows above ~65,504)",
            "precision": "Good",
            "when_needed": "Standard inference",
            "inference_use": "Default choice",
        },
        "BF16": {
            "bits": 16,
            "range": "Same as FP32",
            "precision": "Reduced (but range preserved)",
            "when_needed": "Training and inference",
            "inference_use": "Good default, especially for newer models",
        },
        "FP8_E4M3": {
            "bits": 8,
            "range": "Limited (max 448)",
            "precision": "Moderate",
            "when_needed": "H100 inference",
            "inference_use": "Best balance on supported hardware",
        },
        "INT8": {
            "bits": 8,
            "range": "Fixed [-128, 127]",
            "precision": "Discrete steps",
            "when_needed": "Memory-constrained",
            "inference_use": "Common optimization",
        },
        "INT4": {
            "bits": 4,
            "range": "Fixed [-8, 7]",
            "precision": "Coarse",
            "when_needed": "Extreme memory constraints",
            "inference_use": "With quality testing",
        },
    }
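The memory arithmetic behind this ladder is simple: parameter count times bytes per parameter. A minimal sketch for weight memory alone (activations, KV cache, and quantization scale overhead are extra):

def weight_memory_gb(num_params: float, bits: int) -> float:
    # Weights only; total runtime memory will be higher
    return num_params * bits / 8 / 1e9

for name, bits in [("FP32", 32), ("FP16", 16), ("FP8/INT8", 8), ("INT4", 4)]:
    print(f"7B model at {name}: {weight_memory_gb(7e9, bits):.1f} GB")
# FP32: 28.0, FP16: 14.0, FP8/INT8: 7.0, INT4: 3.5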
The Decision Framework
def precision_decision(constraints: dict) -> str:
    """
    Choose precision based on constraints
    """
    # No memory pressure: keep full-quality inference
    if not constraints.get("memory_constrained"):
        return "FP16 - no reason to accept quality loss"

    # Hopper-class hardware has native FP8 support
    if constraints.get("has_h100"):
        if constraints.get("quality_critical"):
            return "FP8 - native support, minimal quality loss"
        else:
            return "INT8 - same footprint, broader tooling support"

    # Memory constrained without H100
    if constraints.get("extreme_memory_constraint"):
        if constraints.get("task_tolerant"):
            return "INT4 - but test thoroughly"
        else:
            return "INT8 - safer than INT4"

    return "INT8 - good balance for most cases"
Quality Impact by Precision
def expected_quality_impact():
    """
    Typical quality degradation (varies by model and task)
    """
    return {
        "FP32_to_FP16": {
            "degradation": "0%",
            "confidence": "Very high",
            "notes": "No practical difference",
        },
        "FP16_to_BF16": {
            "degradation": "0-0.5%",
            "confidence": "High",
            "notes": "Negligible for most tasks",
        },
        "FP16_to_FP8": {
            "degradation": "0.5-1%",
            "confidence": "High",
            "notes": "H100 native support helps",
        },
        "FP16_to_INT8": {
            "degradation": "1-3%",
            "confidence": "Medium-High",
            "notes": "Task dependent, test needed",
        },
        "INT8_to_INT4": {
            "degradation": "3-10%",
            "confidence": "Medium",
            "notes": "Significant, test thoroughly",
        },
    }
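These bands are rough priors, not guarantees. One way to use them is as a sanity check: if your measured degradation lands well outside the expected band, suspect a quantization bug or an unusually sensitive task. A small sketch, with band ceilings read off the table above:

def degradation_sanity_check(transition: str, observed_pct: float) -> str:
    # Upper ends of the typical bands above, in percent
    band_ceiling = {
        "FP16_to_BF16": 0.5,
        "FP16_to_FP8": 1.0,
        "FP16_to_INT8": 3.0,
        "INT8_to_INT4": 10.0,
    }
    if observed_pct > band_ceiling[transition]:
        return "Outside typical band - check calibration and outlier layers"
    return "Within typical band"

print(degradation_sanity_check("FP16_to_INT8", 5.2))
# Outside typical band - check calibration and outlier layers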
Task-Specific Guidelines
def precision_by_task():
    return {
        "classification": {
            "minimum_safe": "INT4",
            "recommended": "INT8",
            "reason": "Output is discrete, robust to small errors",
        },
        "summarization": {
            "minimum_safe": "INT8",
            "recommended": "INT8",
            "reason": "Content matters more than exact words",
        },
        "code_generation": {
            "minimum_safe": "INT8",
            "recommended": "FP8 or FP16",
            "reason": "Syntax errors from quantization are costly",
        },
        "math_reasoning": {
            "minimum_safe": "FP8",
            "recommended": "FP16",
            "reason": "Small errors compound through the reasoning chain",
        },
        "creative_writing": {
            "minimum_safe": "INT8",
            "recommended": "INT8",
            "reason": "Variety masks small quality changes",
        },
        "extraction_and_parsing": {
            "minimum_safe": "INT8",
            "recommended": "FP16",
            "reason": "Exact output format often required",
        },
    }
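A deployment script can consult this table directly rather than hard-coding a dtype:

guidelines = precision_by_task()
task = "code_generation"
print(f"{task}: recommend {guidelines[task]['recommended']}, "
      f"floor {guidelines[task]['minimum_safe']}")
# code_generation: recommend FP8 or FP16, floor INT8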
The Testing Protocol
class PrecisionTesting:
    def __init__(self, model, test_dataset):
        self.model = model
        self.test_dataset = test_dataset

    def compare_precisions(self, precisions: list) -> dict:
        """
        Compare model at different precisions.

        load_at_precision and the measure_* helpers are placeholders;
        implement them against your serving stack and eval harness.
        """
        results = {}
        for precision in precisions:
            model_at_precision = self.load_at_precision(precision)
            results[precision] = {
                "perplexity": self.measure_perplexity(model_at_precision),
                "task_accuracy": self.measure_task_accuracy(model_at_precision),
                "latency_ms": self.measure_latency(model_at_precision),
                "memory_gb": self.measure_memory(model_at_precision),
            }
        return results

    def recommendation(self, results: dict) -> str:
        """
        Pick the best precision for your constraints.
        Requires an "FP16" entry in results as the quality baseline.
        """
        # Keep precisions within 5% of the FP16 baseline accuracy
        acceptable = [
            p for p, r in results.items()
            if r["task_accuracy"] > 0.95 * results["FP16"]["task_accuracy"]
        ]
        # Among acceptable, pick the smallest memory footprint
        return min(acceptable, key=lambda p: results[p]["memory_gb"])
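Wiring the protocol up might look like this; my_model and my_eval_set are hypothetical stand-ins for your model handle and a representative slice of your real traffic:

tester = PrecisionTesting(my_model, my_eval_set)  # hypothetical objects
results = tester.compare_precisions(["FP16", "FP8", "INT8", "INT4"])
print(tester.recommendation(results))  # e.g. "INT8"

Note that "FP16" must be in the list, since recommendation() uses it as the quality baseline.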
Common Mistakes
def precision_mistakes():
    return [
        {
            "mistake": "Assuming INT4 always works",
            "reality": "Quality loss can be severe for some models/tasks",
            "fix": "Always test before deploying",
        },
        {
            "mistake": "Not testing on representative data",
            "reality": "Generic benchmarks miss domain-specific failures",
            "fix": "Test on your actual prompts",
        },
        {
            "mistake": "Mixing precisions incorrectly",
            "reality": "Some layers need higher precision",
            "fix": "Use frameworks that handle mixed precision",
        },
        {
            "mistake": "Forgetting KV cache precision",
            "reality": "KV cache can be quantized separately",
            "fix": "Consider KV cache INT8 even with FP16 weights",
        },
    ]
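The KV cache point deserves numbers. Its size is 2 (keys and values) x layers x KV heads x head dim x sequence length x batch x bytes per element, so halving the element width halves a footprint that can rival the weights themselves. A sketch with Llama-2-7B-like shapes:

def kv_cache_gb(layers, kv_heads, head_dim, seq_len, batch, bytes_per_elem):
    # 2x accounts for keys and values
    return 2 * layers * kv_heads * head_dim * seq_len * batch * bytes_per_elem / 1e9

# 32 layers, 32 KV heads, head_dim 128, batch 8 at 4k context
print(kv_cache_gb(32, 32, 128, 4096, 8, 2))  # FP16 cache: ~17.2 GB
print(kv_cache_gb(32, 32, 128, 4096, 8, 1))  # INT8 cache: ~8.6 GB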
The Practical Path
def practical_recommendation():
    return {
        "start_with": "FP16",
        "if_memory_constrained": "Try INT8, measure quality",
        "if_h100_available": "Use FP8 as default",
        "if_still_constrained": "Consider INT4 with extensive testing",
        "always": "Measure task-specific quality before deploying",
        "remember": "Memory savings are certain, quality impact varies",
    }
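In practice the first rungs are a one-line change in most stacks. With Hugging Face Transformers, for example (the model ID is a placeholder; the INT8 path assumes bitsandbytes is installed):

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

model_id = "your-org/your-model"  # placeholder

# FP16: the recommended starting point
model_fp16 = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.float16
)

# INT8 via bitsandbytes, if memory constrained
model_int8 = AutoModelForCausalLM.from_pretrained(
    model_id, quantization_config=BitsAndBytesConfig(load_in_8bit=True)
)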
Precision is a lever with predictable memory effects and variable quality effects. Pull it only as far as your testing confirms is safe for your specific use case.