The Real Cost: Fine-tuning vs Prompting

Renting and buying solve the same problem differently. Renting means a high monthly cost but low commitment; buying means a high upfront cost but low ongoing costs. The math depends on how long you stay and what you can afford now.

Prompting and fine-tuning follow the same economics. Prompting carries a higher per-call cost (longer system prompts, few-shot examples sent with every request) but near-zero upfront investment. Fine-tuning lowers the per-call cost but requires upfront data curation, training, and ongoing maintenance. The right choice depends on your volume and your timeline.

Cost Structure Comparison

def cost_structures():
    """Cost anatomy of each approach; per-call figures assume ~$3/M input tokens."""
    return {
        "prompting": {
            "upfront_cost": {
                "prompt_engineering": "Hours of iteration",
                "testing": "Manual validation",
                "total": "~$0 infrastructure, ~40 hours labor",
            },
            "per_call_cost": {
                "system_prompt": "~500 tokens",
                "few_shot_examples": "~1000 tokens",
                "actual_input": "~200 tokens",
                "total_input": "~1700 tokens/call",
                "cost_at_sonnet_pricing": "$0.0051/call",
            },
            "maintenance": {
                "prompt_updates": "Occasional",
                "no_infrastructure": True,
            },
        },

        "fine_tuning": {
            "upfront_cost": {
                "data_curation": "20-40 hours",
                "training": "$50-500 (compute)",
                "evaluation": "10-20 hours",
                "total": "~$200 + 50 hours labor",
            },
            "per_call_cost": {
                "system_prompt": "~100 tokens (shorter)",
                "no_few_shot": "0 tokens",
                "actual_input": "~200 tokens",
                "total_input": "~300 tokens/call",
                "cost_at_same_pricing": "$0.0009/call",
            },
            "maintenance": {
                "retraining": "Periodic",
                "data_updates": "Ongoing",
                "version_management": True,
            },
        },
    }
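
To keep the per-call figures honest, here's a minimal sketch that derives them from the token counts above, assuming roughly $3 per million input tokens (Sonnet-class input pricing at the time of writing; substitute your provider's current rate):

def per_call_cost(input_tokens, price_per_million=3.00):
    """Input-side cost of one call at the assumed per-million-token rate."""
    return input_tokens * price_per_million / 1_000_000

# Reproduces the figures above
print(per_call_cost(1700))  # 0.0051 -> prompting (system + few-shot + input)
print(per_call_cost(300))   # 0.0009 -> fine-tuned (short system + input)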

The Crossover Calculation

def crossover_analysis():
    """
    When does fine-tuning become cheaper?
    """
    # Cost assumptions
    prompt_per_call = 0.0051  # $5.10 per 1000 calls
    finetune_per_call = 0.0009  # $0.90 per 1000 calls
    finetune_upfront = 500  # $500 total (compute + labor value)

    # Per-call savings
    savings_per_call = prompt_per_call - finetune_per_call  # $0.0042

    # Crossover point
    crossover_calls = finetune_upfront / savings_per_call

    return {
        "crossover_calls": f"{crossover_calls:,.0f}",  # ~119,000 calls
        "interpretation": """
            At ~120,000 calls, fine-tuning becomes cheaper.

            Below 120K calls: Prompting is cheaper
            Above 120K calls: Fine-tuning is cheaper

            Monthly volume needed for fine-tuning to win:
            - If product runs 6 months: 20K calls/month
            - If product runs 12 months: 10K calls/month
        """,
    }
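
The same break-even, restated as the monthly volume a product needs over its lifetime (a sketch; the assumed product lifetime is doing all the work here):

def monthly_volume_needed(crossover_calls=119_048, lifetime_months=6):
    """Monthly call volume at which fine-tuning pays off over the product's life."""
    return crossover_calls / lifetime_months

print(f"{monthly_volume_needed(lifetime_months=6):,.0f}")   # ~20K calls/month
print(f"{monthly_volume_needed(lifetime_months=12):,.0f}")  # ~10K calls/month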


def sensitivity_analysis():
    """
    How crossover changes with assumptions
    """
    scenarios = {
        "cheap_fine_tune": {
            "upfront": 200,  # LoRA, small model
            "crossover": "~48K calls",
        },
        "expensive_fine_tune": {
            "upfront": 2000,  # Full fine-tune, large model
            "crossover": "~476K calls",
        },
        "minimal_prompt": {
            "prompt_cost": 0.002,  # Very short prompts
            "crossover": "Higher (prompting more competitive)",
        },
        "self_hosted_fine_tune": {
            "per_call": 0.0003,  # No API markup
            "crossover": "Lower (fine-tuning more attractive)",
        },
    }

    return scenarios
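
The table above hard-codes its crossover points. A small helper makes the same arithmetic reusable with your own numbers (a sketch; the defaults are the baseline assumptions from crossover_analysis):

def crossover_point(upfront, prompt_per_call=0.0051, finetune_per_call=0.0009):
    """Number of calls before fine-tuning's upfront cost pays for itself."""
    savings = prompt_per_call - finetune_per_call
    if savings <= 0:
        return float("inf")  # fine-tuning never wins on per-call cost alone
    return upfront / savings

print(f"{crossover_point(200):,.0f}")   # ~48K (LoRA, small model)
print(f"{crossover_point(2000):,.0f}")  # ~476K (full fine-tune, large model)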

Beyond Cost: Speed of Iteration

def iteration_speed_comparison():
    return {
        "prompting": {
            "change_cycle": {
                "identify_issue": "Minutes",
                "modify_prompt": "Minutes",
                "test_change": "Minutes",
                "deploy": "Instant",
                "total": "Can iterate 10x per day",
            },
            "good_for": [
                "Early product development",
                "Rapidly changing requirements",
                "Unclear task definition",
                "Small team without ML expertise",
            ],
        },

        "fine_tuning": {
            "change_cycle": {
                "identify_issue": "Minutes",
                "curate_new_data": "Hours to days",
                "retrain_model": "Hours",
                "evaluate": "Hours",
                "deploy": "Minutes to hours",
                "total": "Maybe 1 iteration per week",
            },
            "good_for": [
                "Stable, well-defined tasks",
                "High volume production",
                "Performance-critical applications",
                "Teams with ML infrastructure",
            ],
        },

        "practical_guidance": """
            Phase 1 (Discovery): Always start with prompting
            - Find what works
            - Understand the task
            - Iterate quickly

            Phase 2 (Optimization): Consider fine-tuning when
            - Task is stable
            - Volume justifies investment
            - You hit the limits of prompting
        """,
    }

Quality Considerations

def quality_comparison():
    return {
        "prompting_quality": {
            "strengths": [
                "Full model capability available",
                "Can update instantly",
                "No training data required",
                "Works with any API model",
            ],
            "limitations": [
                "Context limits constrain examples",
                "Can't learn from many examples",
                "Inconsistent following of complex instructions",
                "Prompt injection vulnerability",
            ],
        },

        "fine_tuning_quality": {
            "strengths": [
                "Learns from thousands of examples",
                "More consistent behavior",
                "Shorter prompts = more room for input",
                "Can capture subtle patterns",
            ],
            "limitations": [
                "Limited by training data quality",
                "May forget general capabilities",
                "Can't easily add new behaviors",
                "Training distribution matters",
            ],
        },

        "when_quality_differs": """
            Prompting often wins:
            - Novel/rare requests
            - Requires broad knowledge
            - Simple formatting tasks

            Fine-tuning often wins:
            - Consistent format/style needed
            - Task requires learned patterns
            - Input data is predictable
        """,
    }

Decision Framework

def decision_framework():
    return {
        "use_prompting_when": [
            "Monthly volume < 100K calls",
            "Task requirements still evolving",
            "Need to iterate quickly",
            "No ML team or infrastructure",
            "Task requires broad capabilities",
            "Short timeline to production",
        ],

        "use_fine_tuning_when": [
            "Monthly volume > 500K calls",
            "Task is well-defined and stable",
            "Have quality training data",
            "Prompting hits quality limits",
            "Need maximum consistency",
            "Have ML expertise available",
        ],

        "consider_hybrid": {
            "approach": "Fine-tuned model + prompt engineering",
            "example": """
                # Fine-tune for base behavior
                model = fine_tuned_model

                # Prompt for request-specific context
                prompt = f'''
                {short_system_prompt}

                Context: {user_context}

                Request: {user_request}
                '''
            """,
            "benefit": "Low per-call cost + flexibility",
        },
    }
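
For concreteness, here is one runnable shape of that hybrid pattern. Everything here is a placeholder sketch: call_model stands in for whatever client your provider exposes, and the model ID is invented.

FINE_TUNED_MODEL = "ft:base-model:custom-suffix"  # hypothetical model ID
SHORT_SYSTEM_PROMPT = "Classify the support request."  # base behavior lives in the weights

def call_model(model: str, system: str, prompt: str) -> str:
    """Stub for your provider's chat client; wire in the real API call here."""
    raise NotImplementedError

def answer(user_context: str, user_request: str) -> str:
    # The fine-tuned weights carry the base behavior; the prompt carries
    # only request-specific context, keeping per-call tokens low.
    prompt = f"Context: {user_context}\n\nRequest: {user_request}"
    return call_model(FINE_TUNED_MODEL, SHORT_SYSTEM_PROMPT, prompt)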

Hidden Costs to Consider

def hidden_costs():
    return {
        "prompting_hidden_costs": {
            "context_length": "Long prompts fill context, less room for input",
            "latency": "More input tokens = higher TTFT",
            "inconsistency": "Time spent handling edge cases",
            "prompt_maintenance": "Prompts accumulate cruft",
        },

        "fine_tuning_hidden_costs": {
            "data_curation": "Often 80% of the work",
            "version_management": "Multiple models to track",
            "regression_testing": "Each update needs validation",
            "capability_drift": "Model may lose general skills",
            "infrastructure": "Training pipeline to maintain",
        },

        "often_overlooked": """
            1. Cost of debugging inconsistent behavior (favors fine-tuning)
            2. Cost of data labeling for fine-tuning
            3. Cost of longer latency with long prompts
            4. Cost of maintaining multiple fine-tuned models
            5. Cost of expertise required for each approach
        """,
    }

Practical Recommendation

def practical_recommendation():
    return {
        "default_path": """
            1. Start with prompting (always)
            2. Optimize prompt until diminishing returns
            3. Measure production volume for 1-2 months
            4. Calculate crossover point with your actuals
            5. Fine-tune only if:
               - Volume > crossover AND
               - Task is stable AND
               - Quality still insufficient with optimized prompt
        """,

        "volume_rule_of_thumb": """
            Volume/month  | Recommendation
            --------------|---------------
            < 10K         | Prompting
            10K - 100K    | Prompting, consider fine-tune if quality issues
            100K - 1M     | Likely fine-tune for cost
            > 1M          | Definitely fine-tune
        """,

        "quality_rule_of_thumb": """
            If prompting achieves:
            - 95%+ quality: Stay with prompting unless volume is huge
            - 85-95% quality: Fine-tune if volume justifies
            - < 85% quality: Fine-tune regardless of volume
        """,
    }
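
If you want those two tables as a checklist, here's a sketch that encodes them directly (the thresholds are this post's rules of thumb, not universal constants):

def recommend(monthly_calls: int, prompt_quality: float) -> str:
    """Encode the volume and quality rules of thumb above."""
    if prompt_quality < 0.85:
        return "fine-tune (prompting quality insufficient)"
    if monthly_calls < 10_000:
        return "prompt"
    if monthly_calls < 100_000:
        return "prompt; fine-tune only if quality issues persist"
    if monthly_calls < 1_000_000:
        return "likely fine-tune for cost"
    return "fine-tune"

print(recommend(50_000, 0.93))  # prompt; fine-tune only if quality issues persist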

The choice isn't permanent. Start with prompting, measure, and migrate to fine-tuning when the economics and quality justify the investment. Most projects never reach the volume where fine-tuning wins on cost alone.