Calculating If Quantization Pays Off
Compression algorithms have overhead. Compressing a small file can make it larger: the dictionary takes more space than it saves. The break-even point depends on the original size.
Quantization has similar economics. Dequantizing weights on the fly costs compute; whether that trade pays off depends on your bottleneck.
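A back-of-envelope example makes the trade concrete. When generating tokens one at a time, each step has to stream the full weight set from memory, so bandwidth divided by model size caps throughput, and shrinking the weights raises that cap. A minimal sketch; the 7B parameter count and ~2 TB/s bandwidth are illustrative assumptions, not measurements:
def decode_ceiling_tps(param_count: float, bytes_per_param: float, bandwidth_bytes_per_sec: float) -> float:
    """Upper bound on decode tokens/sec imposed by weight streaming alone."""
    model_bytes = param_count * bytes_per_param
    return bandwidth_bytes_per_sec / model_bytes

PARAMS = 7e9       # assumed 7B-parameter model
BANDWIDTH = 2e12   # assumed ~2 TB/s effective memory bandwidth

fp16_ceiling = decode_ceiling_tps(PARAMS, 2.0, BANDWIDTH)  # ~143 tokens/sec
int8_ceiling = decode_ceiling_tps(PARAMS, 1.0, BANDWIDTH)  # ~286 tokens/sec
# Halving the bytes per weight roughly doubles the memory-bound ceiling.
# If the workload is compute-bound instead, this ceiling is not the limit,
# and shrinking the weights buys little.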
The Bottleneck Question
def identify_bottleneck(workload: dict) -> str:
    """
    Is your inference memory-bound or compute-bound?
    """
    if workload.get("phase") == "prefill":
        # Prefill is compute-bound:
        # processing many tokens in parallel
        # Quantization: faster ops, but less accurate
        return "compute_bound"
    if workload.get("phase") == "decode":
        # Decode is memory-bound:
        # reading weights is the bottleneck
        # Quantization: smaller weights, faster reads
        return "memory_bound"
    if workload.get("batch_size", 0) > 16:
        # Large batches = more compute relative to memory traffic
        return "compute_bound"
    if workload.get("batch_size", 0) == 1:
        # A single request = memory-bandwidth limited
        return "memory_bound"
    return "mixed"
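A quick sanity check with hypothetical workload dicts (the keys match what the heuristic reads):
print(identify_bottleneck({"phase": "decode", "batch_size": 1}))    # memory_bound
print(identify_bottleneck({"phase": "prefill", "batch_size": 32}))  # compute_bound
print(identify_bottleneck({"phase": "unknown", "batch_size": 4}))   # mixed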
The ROI Calculation
def quantization_roi(
    current_tps: float,            # Tokens per second
    current_cost_per_hour: float,  # USD per GPU-hour
    memory_bound_fraction: float,  # 0-1, how much of the work is memory-bound
    quantization_speedup: float,   # Expected speedup on memory-bound work
    quality_degradation: float,    # Expected quality loss (0-1)
) -> dict:
    """
    Calculate whether quantization is worth it.
    """
    # Speedup only applies to the memory-bound portion (Amdahl's law)
    effective_speedup = 1 + (quantization_speedup - 1) * memory_bound_fraction
    new_tps = current_tps * effective_speedup

    # Cost per token
    tokens_per_hour = current_tps * 3600
    new_tokens_per_hour = new_tps * 3600
    cost_per_token_before = current_cost_per_hour / tokens_per_hour
    cost_per_token_after = current_cost_per_hour / new_tokens_per_hour

    return {
        "speedup": effective_speedup,
        "tps_before": current_tps,
        "tps_after": new_tps,
        "cost_per_million_before": cost_per_token_before * 1_000_000,
        "cost_per_million_after": cost_per_token_after * 1_000_000,
        "cost_reduction_percent": (1 - cost_per_token_after / cost_per_token_before) * 100,
        "quality_cost": f"{quality_degradation * 100:.1f}% quality degradation",
        "recommendation": (
            "worth_it"
            if effective_speedup > 1.1 and quality_degradation < 0.03
            else "evaluate_carefully"
        ),
    }
# Example: Decode-bound workload
result = quantization_roi(
    current_tps=30,
    current_cost_per_hour=4.50,
    memory_bound_fraction=0.8,   # Mostly decode
    quantization_speedup=1.5,    # INT8 weight loading 50% faster
    quality_degradation=0.02,    # 2% quality loss
)
# Result: 1.4x effective speedup (1 + 0.5 * 0.8) and a ~29% cost reduction,
# since cost per token scales as 1 / throughput: 1 - 1/1.4 ≈ 0.29
When Quantization Helps Most
def high_roi_scenarios():
    return {
        "decode_heavy": {
            "description": "Most time spent generating tokens",
            "memory_bound_fraction": 0.8,
            "expected_benefit": "High",
            "typical_speedup": "1.3-1.5x",
        },
        "low_batch_size": {
            "description": "Single requests or small batches",
            "memory_bound_fraction": 0.9,
            "expected_benefit": "High",
            "typical_speedup": "1.5-2x",
        },
        "long_outputs": {
            "description": "Generating many tokens per request",
            "memory_bound_fraction": 0.85,
            "expected_benefit": "High",
            "typical_speedup": "1.4-1.6x",
        },
    }
def low_roi_scenarios():
    return {
        "prefill_heavy": {
            "description": "Long prompts, short outputs",
            "memory_bound_fraction": 0.3,
            "expected_benefit": "Low",
            "typical_speedup": "1.0-1.15x",
        },
        "large_batch": {
            "description": "Many requests batched together",
            "memory_bound_fraction": 0.4,
            "expected_benefit": "Low-Medium",
            "typical_speedup": "1.1-1.2x",
        },
        "already_memory_efficient": {
            "description": "Using FP8 on H100",
            "memory_bound_fraction": "Varies",
            "expected_benefit": "Diminishing returns",
            "typical_speedup": "1.0-1.1x",
        },
    }
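Running the earlier quantization_roi calculation on a prefill-heavy workload (same illustrative inputs as before, with only the memory-bound fraction changed) shows why the benefit shrinks:
result = quantization_roi(
    current_tps=30,
    current_cost_per_hour=4.50,
    memory_bound_fraction=0.3,   # Mostly prefill
    quantization_speedup=1.5,
    quality_degradation=0.02,
)
# Effective speedup: 1 + 0.5 * 0.3 = 1.15x
# Cost reduction: 1 - 1/1.15 ≈ 13%, against the same 2% quality loss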
The Hidden Costs
def quantization_hidden_costs():
    return {
        "engineering_time": {
            "cost": "Days to weeks",
            "for": "Testing, validation, integration",
        },
        "quality_regression_risk": {
            "cost": "Potential user impact",
            "for": "Subtle quality losses hard to detect",
        },
        "complexity": {
            "cost": "Ongoing maintenance",
            "for": "Different model versions to manage",
        },
        "debugging_difficulty": {
            "cost": "Slower root cause analysis",
            "for": "Issues harder to reproduce at lower precision",
        },
    }
def total_cost_analysis(
    monthly_compute_cost: float,
    engineering_hours: float,
    eng_hourly_cost: float,
    speedup_factor: float,
) -> dict:
    """
    Weigh compute savings against the engineering cost of quantizing.
    """
    # Savings from quantization: cost scales as 1 / throughput
    monthly_savings = monthly_compute_cost * (1 - 1 / speedup_factor)
    # Cost of engineering time
    eng_cost = engineering_hours * eng_hourly_cost
    # Payback period
    payback_months = eng_cost / monthly_savings if monthly_savings > 0 else float("inf")
    return {
        "monthly_savings": monthly_savings,
        "engineering_investment": eng_cost,
        "payback_months": payback_months,
        "year_1_net": monthly_savings * 12 - eng_cost,
    }
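Plugging in illustrative numbers (assumed for the example, not benchmarks): a $10,000/month compute bill, 80 engineering hours at a $150/hour fully-loaded rate, and the 1.4x effective speedup from earlier:
analysis = total_cost_analysis(
    monthly_compute_cost=10_000,  # assumed monthly GPU spend
    engineering_hours=80,         # assumed ~two weeks of work
    eng_hourly_cost=150,          # assumed fully-loaded rate
    speedup_factor=1.4,           # effective speedup from the earlier example
)
# monthly_savings ≈ $2,857, engineering_investment = $12,000,
# payback_months ≈ 4.2, year_1_net ≈ $22,286
A four-month payback is an easy yes. Cut the compute bill to $1,000/month and the same project takes roughly 42 months to pay back; the math flips entirely.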
The Decision Matrix
def should_quantize(context: dict) -> str:
    # Clear yes
    if context.get("memory_constrained") and context.get("decode_heavy"):
        return "Yes - high ROI, likely necessary"
    # Clear no
    if context.get("quality_critical") and not context.get("memory_constrained"):
        return "No - risk without benefit"
    # Depends
    if context.get("moderate_volume"):
        if context.get("engineering_capacity"):
            return "Maybe - run the ROI calculation with your numbers"
        return "No - engineering cost likely exceeds the benefit"
    if context.get("high_volume"):
        return "Probably - small percentage savings × large volume = significant"
    return "Evaluate - run the numbers for your specific case"
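For example, a hypothetical decode-heavy service that is already hitting memory limits:
context = {
    "memory_constrained": True,
    "decode_heavy": True,
    "quality_critical": False,
    "moderate_volume": False,
    "high_volume": True,
    "engineering_capacity": True,
}
print(should_quantize(context))  # Yes - high ROI, likely necessary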
When To Skip Quantization
def skip_quantization_checklist():
    return [
        {
            "condition": "Quality is your primary differentiator",
            "reason": "Any quality loss is unacceptable",
        },
        {
            "condition": "Compute cost is small part of total cost",
            "reason": "Savings not meaningful",
        },
        {
            "condition": "Already hitting latency targets",
            "reason": "No performance problem to solve",
        },
        {
            "condition": "Small team, many priorities",
            "reason": "Engineering time better spent elsewhere",
        },
        {
            "condition": "Frequently changing models",
            "reason": "Re-quantization overhead for each update",
        },
    ]
Quantization is an optimization, not a requirement. Calculate the ROI for your specific situation before investing engineering time. Sometimes the best optimization is no optimization.