Calculating If Quantization Pays Off
Compression algorithms have overhead. Compressing a small file can make it larger: the dictionary takes more space than it saves. The break-even point depends on the original size.
Quantization has similar economics. Dequantizing weights on the fly costs compute; whether that trade pays off depends on your bottleneck.
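A back-of-envelope example makes the trade concrete. When generating tokens one at a time, each step has to stream the full weight set from memory, so bandwidth divided by model size caps throughput, and shrinking the weights raises that cap. A minimal sketch; the 7B parameter count and ~2 TB/s bandwidth are illustrative assumptions, not measurements:
def decode_ceiling_tps(param_count: float, bytes_per_param: float, bandwidth_bytes_per_sec: float) -> float:
    """Upper bound on decode tokens/sec imposed by weight streaming alone."""
    model_bytes = param_count * bytes_per_param
    return bandwidth_bytes_per_sec / model_bytes

PARAMS = 7e9       # assumed 7B-parameter model
BANDWIDTH = 2e12   # assumed ~2 TB/s effective memory bandwidth

fp16_ceiling = decode_ceiling_tps(PARAMS, 2.0, BANDWIDTH)  # ~143 tokens/sec
int8_ceiling = decode_ceiling_tps(PARAMS, 1.0, BANDWIDTH)  # ~286 tokens/sec
# Halving the bytes per weight roughly doubles the memory-bound ceiling.
# If the workload is compute-bound instead, this ceiling is not the limit,
# and shrinking the weights buys little.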
The Bottleneck Question
def identify_bottleneck(workload: dict) -> str:
    """
    Is your inference memory-bound or compute-bound?
    """
    if workload.get("phase") == "prefill":
        # Prefill is compute-bound:
        # processing many tokens in parallel
        # Quantization: faster ops, but less accurate
        return "compute_bound"
    if workload.get("phase") == "decode":
        # Decode is memory-bound:
        # reading weights is the bottleneck
        # Quantization: smaller weights, faster reads
        return "memory_bound"
    if workload.get("batch_size", 0) > 16:
        # Large batches = more compute relative to memory traffic
        return "compute_bound"
    if workload.get("batch_size", 0) == 1:
        # A single request = memory-bandwidth limited
        return "memory_bound"
    return "mixed"
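A quick sanity check with hypothetical workload dicts (the keys match what the heuristic reads):
print(identify_bottleneck({"phase": "decode", "batch_size": 1}))    # memory_bound
print(identify_bottleneck({"phase": "prefill", "batch_size": 32}))  # compute_bound
print(identify_bottleneck({"phase": "unknown", "batch_size": 4}))   # mixed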
The ROI Calculation
def quantization_roi(
    current_tps: float,            # Tokens per second
    current_cost_per_hour: float,  # USD per GPU-hour
    memory_bound_fraction: float,  # 0-1, how much of the work is memory-bound
    quantization_speedup: float,   # Expected speedup on memory-bound work
    quality_degradation: float,    # Expected quality loss (0-1)
) -> dict:
    """
    Calculate whether quantization is worth it.
    """
    # Speedup only applies to the memory-bound portion (Amdahl's law)
    effective_speedup = 1 + (quantization_speedup - 1) * memory_bound_fraction
    new_tps = current_tps * effective_speedup

    # Cost per token
    tokens_per_hour = current_tps * 3600
    new_tokens_per_hour = new_tps * 3600
    cost_per_token_before = current_cost_per_hour / tokens_per_hour
    cost_per_token_after = current_cost_per_hour / new_tokens_per_hour

    return {
        "speedup": effective_speedup,
        "tps_before": current_tps,
        "tps_after": new_tps,
        "cost_per_million_before": cost_per_token_before * 1_000_000,
        "cost_per_million_after": cost_per_token_after * 1_000_000,
        "cost_reduction_percent": (1 - cost_per_token_after / cost_per_token_before) * 100,
        "quality_cost": f"{quality_degradation * 100:.1f}% quality degradation",
        "recommendation": (
            "worth_it"
            if effective_speedup > 1.1 and quality_degradation < 0.03
            else "evaluate_carefully"
        ),
    }
# Example: Decode-bound workload
result = quantization_roi(
    current_tps=30,
    current_cost_per_hour=4.50,
    memory_bound_fraction=0.8,   # Mostly decode
    quantization_speedup=1.5,    # INT8 weight loading 50% faster
    quality_degradation=0.02,    # 2% quality loss
)
# Result: 1.4x effective speedup (1 + 0.5 * 0.8) and a ~29% cost reduction,
# since cost per token scales as 1 / throughput: 1 - 1/1.4 ≈ 0.29
When Quantization Helps Most
def high_roi_scenarios():
    return {
        "decode_heavy": {
            "description": "Most time spent generating tokens",
            "memory_bound_fraction": 0.8,
            "expected_benefit": "High",
            "typical_speedup": "1.3-1.5x",
        },
        "low_batch_size": {
            "description": "Single requests or small batches",
            "memory_bound_fraction": 0.9,
            "expected_benefit": "High",
            "typical_speedup": "1.5-2x",
        },
        "long_outputs": {
            "description": "Generating many tokens per request",
            "memory_bound_fraction": 0.85,
            "expected_benefit": "High",
            "typical_speedup": "1.4-1.6x",
        },
    }
def low_roi_scenarios():
    return {
        "prefill_heavy": {
            "description": "Long prompts, short outputs",
            "memory_bound_fraction": 0.3,
            "expected_benefit": "Low",
            "typical_speedup": "1.0-1.15x",
        },
        "large_batch": {
            "description": "Many requests batched together",
            "memory_bound_fraction": 0.4,
            "expected_benefit": "Low-Medium",
            "typical_speedup": "1.1-1.2x",
        },
        "already_memory_efficient": {
            "description": "Using FP8 on H100",
            "memory_bound_fraction": "Varies",
            "expected_benefit": "Diminishing returns",
            "typical_speedup": "1.0-1.1x",
        },
    }
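Running the earlier quantization_roi calculation on a prefill-heavy workload (same illustrative inputs as before, with only the memory-bound fraction changed) shows why the benefit shrinks:
result = quantization_roi(
    current_tps=30,
    current_cost_per_hour=4.50,
    memory_bound_fraction=0.3,   # Mostly prefill
    quantization_speedup=1.5,
    quality_degradation=0.02,
)
# Effective speedup: 1 + 0.5 * 0.3 = 1.15x
# Cost reduction: 1 - 1/1.15 ≈ 13%, against the same 2% quality loss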
The Hidden Costs
def quantization_hidden_costs():
    return {
        "engineering_time": {
            "cost": "Days to weeks",
            "for": "Testing, validation, integration",
        },
        "quality_regression_risk": {
            "cost": "Potential user impact",
            "for": "Subtle quality losses hard to detect",
        },
        "complexity": {
            "cost": "Ongoing maintenance",
            "for": "Different model versions to manage",
        },
        "debugging_difficulty": {
            "cost": "Slower root cause analysis",
            "for": "Issues harder to reproduce at lower precision",
        },
    }
def total_cost_analysis(
    monthly_compute_cost: float,
    engineering_hours: float,
    eng_hourly_cost: float,
    speedup_factor: float,
) -> dict:
    """
    Weigh compute savings against the engineering cost of quantizing.
    """
    # Savings from quantization: cost scales as 1 / throughput
    monthly_savings = monthly_compute_cost * (1 - 1 / speedup_factor)
    # Cost of engineering time
    eng_cost = engineering_hours * eng_hourly_cost
    # Payback period
    payback_months = eng_cost / monthly_savings if monthly_savings > 0 else float("inf")
    return {
        "monthly_savings": monthly_savings,
        "engineering_investment": eng_cost,
        "payback_months": payback_months,
        "year_1_net": monthly_savings * 12 - eng_cost,
    }
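Plugging in illustrative numbers (assumed for the example, not benchmarks): a $10,000/month compute bill, 80 engineering hours at a $150/hour fully-loaded rate, and the 1.4x effective speedup from earlier:
analysis = total_cost_analysis(
    monthly_compute_cost=10_000,  # assumed monthly GPU spend
    engineering_hours=80,         # assumed ~two weeks of work
    eng_hourly_cost=150,          # assumed fully-loaded rate
    speedup_factor=1.4,           # effective speedup from the earlier example
)
# monthly_savings ≈ $2,857, engineering_investment = $12,000,
# payback_months ≈ 4.2, year_1_net ≈ $22,286
A four-month payback is an easy yes. Cut the compute bill to $1,000/month and the same project takes roughly 42 months to pay back; the math flips entirely.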
The Decision Matrix
def should_quantize(context: dict) -> str:
    # Clear yes
    if context.get("memory_constrained") and context.get("decode_heavy"):
        return "Yes - high ROI, likely necessary"
    # Clear no
    if context.get("quality_critical") and not context.get("memory_constrained"):
        return "No - risk without benefit"
    # Depends
    if context.get("moderate_volume"):
        if context.get("engineering_capacity"):
            return "Maybe - run the ROI calculation with your numbers"
        return "No - engineering cost likely exceeds the benefit"
    if context.get("high_volume"):
        return "Probably - small percentage savings × large volume = significant"
    return "Evaluate - run the numbers for your specific case"
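For example, a hypothetical decode-heavy service that is already hitting memory limits:
context = {
    "memory_constrained": True,
    "decode_heavy": True,
    "quality_critical": False,
    "moderate_volume": False,
    "high_volume": True,
    "engineering_capacity": True,
}
print(should_quantize(context))  # Yes - high ROI, likely necessary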
When To Skip Quantization
def skip_quantization_checklist():
    return [
        {
            "condition": "Quality is your primary differentiator",
            "reason": "Any quality loss is unacceptable",
        },
        {
            "condition": "Compute cost is small part of total cost",
            "reason": "Savings not meaningful",
        },
        {
            "condition": "Already hitting latency targets",
            "reason": "No performance problem to solve",
        },
        {
            "condition": "Small team, many priorities",
            "reason": "Engineering time better spent elsewhere",
        },
        {
            "condition": "Frequently changing models",
            "reason": "Re-quantization overhead for each update",
        },
    ]
Quantization is an optimization, not a requirement. Calculate the ROI for your specific situation before investing engineering time. Sometimes the best optimization is no optimization.