What Actually Works with LoRA

Recipes in cookbooks assume ideal conditions: perfect ingredients, professional equipment, unlimited time. Real cooking adapts to what's in the fridge, the quirks of your oven, and dinner guests arriving in an hour.

LoRA tutorials assume ideal conditions too: clean data, obvious hyperparameters, generous compute. Real LoRA training requires adapting to your specific model, task, and constraints. Here's what actually matters once you're past the tutorials.

Learning Rate Matters More Than You Think

def learning_rate_guidance():
    return {
        "common_mistake": "Using same LR as full fine-tuning",

        "why_its_different": """
            LoRA updates a tiny fraction of parameters.
            Same gradient signal, fewer parameters to absorb it.
            Result: Updates are proportionally larger.
            Need lower LR to compensate.
        """,

        "rule_of_thumb": {
            "full_fine_tune_lr": 1e-5,
            "lora_lr": 1e-4 to 3e-4,  # 10-30x higher
        },

        "practical_approach": """
            # Start with this for most tasks
            lr = 2e-4

            # If loss oscillates: reduce to 1e-4
            # If training too slow: try 3e-4
            # If quality is poor at convergence: try a lower LR for more steps

            # Scaling with rank
            # Higher rank = more parameters = can use higher LR
            lr_r8 = 1e-4
            lr_r16 = 2e-4
            lr_r32 = 3e-4
        """,

        "scheduler_recommendation": """
            # Cosine with warmup works well
            scheduler = get_cosine_schedule_with_warmup(
                optimizer,
                num_warmup_steps=100,
                num_training_steps=total_steps
            )
        """,
    }
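
Putting the rule of thumb and the scheduler together, here is a minimal runnable sketch of the optimizer setup. It assumes model has already been wrapped with get_peft_model and that total_steps comes from your own training loop; the weight decay value is just a common default, not something the guidance above prescribes.

# Minimal optimizer + scheduler setup following the guidance above.
# Assumes `model` is a PEFT-wrapped model and `total_steps` is known.
import torch
from transformers import get_cosine_schedule_with_warmup

lora_lr = 2e-4  # starting point from the rule of thumb
optimizer = torch.optim.AdamW(
    [p for p in model.parameters() if p.requires_grad],  # only LoRA params train
    lr=lora_lr,
    weight_decay=0.01,  # assumed default, tune as needed
)
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=100,
    num_training_steps=total_steps,
)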

Layer Selection Strategy

def layer_selection():
    return {
        "default_recommendation": {
            "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"],
            "why": "Attention projections capture most task-specific patterns",
            "result": "~0.5% of parameters, ~90% of quality",
        },

        "expanded_for_harder_tasks": {
            "target_modules": [
                "q_proj", "k_proj", "v_proj", "o_proj",
                "gate_proj", "up_proj", "down_proj"  # MLP layers
            ],
            "why": "MLP layers add capacity for knowledge/behavior changes",
            "result": "~1.5% of parameters, ~95% of quality",
        },

        "when_to_expand": [
            "Task performance plateaus with attention-only",
            "Need to learn new behaviors (not just style)",
            "Have sufficient training data",
        ],

        "layer_position_matters": {
            "early_layers": "Capture low-level patterns, often less task-specific",
            "middle_layers": "Often most valuable for task adaptation",
            "late_layers": "Output-focused, good for format/style",
        },

        "selective_layers_example": """
            # Only adapt middle layers (often effective)
            target_modules = ["q_proj", "v_proj"]
            layers_to_adapt = list(range(10, 25))  # Layers 10-24 of 32

            # Implement via custom LoRA config
            lora_config = LoraConfig(
                r=16,
                target_modules=target_modules,
                layers_to_transform=layers_to_adapt,
            )
        """,
    }
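
If you want to verify the parameter fractions quoted above for your own model, PEFT can report the trainable share once the config is applied. A quick sketch; the base model name is only an example, and the exact percentage depends on the architecture.

# Sanity-check the trainable-parameter fraction for a chosen target set.
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")  # example model
lora_config = LoraConfig(
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()  # prints trainable parameter count and percentage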

Rank Selection

def rank_selection():
    return {
        "general_guidance": {
            "r=4": "Minimal adaptation, style-only changes",
            "r=8": "Light adaptation, simple tasks",
            "r=16": "Good default for most tasks",
            "r=32": "Complex tasks, multi-behavior adaptation",
            "r=64": "Near full fine-tune capacity",
        },

        "how_to_choose": """
            Start with r=16. It works for 80% of cases.

            Reduce to r=8 if:
            - Training data is small (< 1000 examples)
            - Task is simple (style, format changes)
            - Need smallest possible adapter

            Increase to r=32/64 if:
            - r=16 quality plateaus below target
            - Task requires learning complex patterns
            - Have abundant training data (> 10K examples)
        """,

        "diminishing_returns": """
            Quality vs Rank (typical pattern):

            Rank | Quality | Parameters
            -----|---------|------------
            8    | 85%     | 0.05%
            16   | 92%     | 0.1%
            32   | 95%     | 0.2%
            64   | 97%     | 0.4%
            128  | 98%     | 0.8%

            Doubling rank doesn't double quality.
        """,

        "alpha_relationship": """
            Alpha controls scaling of LoRA output.

            Typical: alpha = 2 * rank

            Lower alpha (alpha = rank):
            - More conservative updates
            - Better stability
            - May need more training steps

            Higher alpha (alpha = 4 * rank):
            - Stronger adaptation
            - Risk of instability
            - May overfit faster
        """,
    }
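
The alpha heuristics come down to the scaling factor LoRA applies to the adapter output, which is alpha / r in the standard formulation. A small worked sketch of what the three settings above mean in those terms:

# What the alpha rules of thumb translate to, assuming the standard
# LoRA scaling of alpha / r applied to the adapter output.
def lora_scaling(rank: int, alpha: int) -> float:
    return alpha / rank

for rank in (8, 16, 32):
    print(
        rank,
        lora_scaling(rank, alpha=rank),      # conservative: 1.0
        lora_scaling(rank, alpha=2 * rank),  # typical: 2.0
        lora_scaling(rank, alpha=4 * rank),  # stronger: 4.0
    )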

Data Quality Over Quantity

def data_quality_matters():
    return {
        "observation": """
            1,000 high-quality examples often beats
            10,000 noisy examples.

            LoRA has limited capacity.
            High-quality data uses that capacity well.
            Noisy data wastes capacity on noise patterns.
        """,

        "what_makes_quality_data": {
            "consistent": "Same task, same format, same style",
            "correct": "Outputs are actually right/good",
            "diverse": "Coverage of task variations",
            "balanced": "Not dominated by one pattern",
        },

        "practical_curation": """
            # Quality filtering pipeline
            def curate_training_data(raw_data):
                filtered = []
                for example in raw_data:
                    # Check correctness
                    if not verify_output(example):
                        continue

                    # Check format consistency
                    if not matches_format(example):
                        continue

                    # Check diversity (not too similar to existing)
                    if too_similar_to(example, filtered):
                        continue

                    filtered.append(example)

                return filtered
        """,

        "minimum_viable_dataset": """
            Task Type          | Minimum | Recommended
            -------------------|---------|------------
            Style adaptation   | 100     | 500
            Format compliance  | 200     | 1000
            Domain adaptation  | 500     | 2000
            Behavior learning  | 1000    | 5000
        """,
    }
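
The curation pipeline above leaves verify_output, matches_format, and too_similar_to as placeholders, since the right checks are task-specific. As one illustration, here is a hedged way to implement the similarity check with simple token overlap; the assumption that each example is a dict with an "output" field is mine, not part of the pipeline above.

# Illustrative stand-in for the too_similar_to placeholder.
# Assumes each example is a dict with an "output" string field.
def too_similar_to(example, kept, threshold=0.9):
    new_tokens = set(example["output"].lower().split())
    for prior in kept:
        prior_tokens = set(prior["output"].lower().split())
        union = new_tokens | prior_tokens
        if union and len(new_tokens & prior_tokens) / len(union) >= threshold:
            return True  # Jaccard overlap above threshold: treat as a near-duplicate
    return False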

Common Failure Modes

def failure_modes():
    return {
        "underfitting": {
            "symptoms": [
                "Training loss barely decreases",
                "Model outputs similar to base model",
            ],
            "causes": [
                "Learning rate too low",
                "Rank too low for task complexity",
                "Not enough training steps",
            ],
            "fixes": [
                "Increase LR to 3e-4",
                "Increase rank",
                "Train longer",
            ],
        },

        "overfitting": {
            "symptoms": [
                "Training loss goes to near-zero",
                "Validation loss increases",
                "Model outputs are too template-y",
            ],
            "causes": [
                "Too little data",
                "Rank too high",
                "Too many steps",
            ],
            "fixes": [
                "Add more data",
                "Reduce rank",
                "Early stopping",
                "Add dropout (lora_dropout=0.1)",
            ],
        },

        "catastrophic_forgetting": {
            "symptoms": [
                "Good on fine-tune task",
                "Bad on general tasks",
            ],
            "causes": [
                "Data too narrow",
                "Training too aggressive",
            ],
            "fixes": [
                "Mix in general data (10-20%)",
                "Lower learning rate",
                "Lower rank",
            ],
        },

        "format_collapse": {
            "symptoms": [
                "Always outputs same format",
                "Ignores variations in input",
            ],
            "causes": [
                "Training data too uniform",
                "Overfitting to common pattern",
            ],
            "fixes": [
                "Diversify training data",
                "Add format variations",
                "Reduce training steps",
            ],
        },
    }
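
Two of the overfitting fixes above, dropout and early stopping, map directly onto configuration. A minimal sketch using the standard PEFT and transformers options; it assumes the rest of the setup from the config template further down (an eval dataset and load_best_model_at_end=True).

# Dropout and early stopping for the overfitting case above.
from peft import LoraConfig
from transformers import EarlyStoppingCallback

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,  # regularizes the adapter
    task_type="CAUSAL_LM",
)

# Pass to Trainer(callbacks=[early_stop]); needs an eval dataset and
# load_best_model_at_end=True, as in the config template below.
early_stop = EarlyStoppingCallback(early_stopping_patience=3)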

Validation Strategy

def validation_strategy():
    return {
        "during_training": {
            "eval_steps": "Every 100-500 steps",
            "metrics": ["loss", "task_accuracy"],
            "early_stopping": "Patience of 3-5 eval rounds",
        },

        "after_training": {
            "held_out_test": "Never seen during training",
            "baseline_comparison": "Same test set, base model",
            "regression_check": "General capability tests",
        },

        "practical_test_set": """
            Test set composition:
            - 70%: In-domain task examples
            - 20%: Edge cases and variations
            - 10%: Out-of-domain (regression check)

            Minimum sizes:
            - Small tasks: 100 test examples
            - Medium tasks: 500 test examples
            - Critical tasks: 1000+ test examples
        """,

        "red_flags_to_watch": [
            "Test accuracy much higher than real-world feedback",
            "Perfect scores (likely test set leakage)",
            "Large variance across test subsets",
            "Good on easy cases, bad on hard cases",
        ],
    }
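
For the baseline comparison and regression check, PEFT can temporarily switch the adapter off, so base and adapted outputs come from the same process. A rough sketch; the prompt and generation settings are illustrative, and it assumes model is a PeftModel with a matching tokenizer already loaded.

# Compare adapted vs. base behaviour on the same prompt with one model.
# Assumes `model` is a PeftModel and `tokenizer` matches the base model.
prompt = "Summarize this support ticket: ..."  # illustrative prompt
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

adapted_out = model.generate(**inputs, max_new_tokens=128)

with model.disable_adapter():  # fall back to base weights inside this block
    base_out = model.generate(**inputs, max_new_tokens=128)

print(tokenizer.decode(adapted_out[0], skip_special_tokens=True))
print(tokenizer.decode(base_out[0], skip_special_tokens=True))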

Practical Config Template

def working_config():
    return """
    # Proven starting point for most tasks

    from peft import LoraConfig, get_peft_model
    from transformers import TrainingArguments, Trainer

    # LoRA config
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )

    # Training args
    training_args = TrainingArguments(
        output_dir="./output",
        num_train_epochs=3,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        learning_rate=2e-4,
        lr_scheduler_type="cosine",
        warmup_steps=100,
        logging_steps=10,
        evaluation_strategy="steps",
        eval_steps=200,
        save_strategy="steps",
        save_steps=200,
        load_best_model_at_end=True,
        fp16=True,  # or bf16=True for newer GPUs
    )

    # Training
    model = get_peft_model(base_model, lora_config)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=eval_data,
    )
    trainer.train()
    """

LoRA in production requires tuning beyond the defaults. Start with r=16, lr=2e-4, attention projections only. Adjust based on training dynamics and validation results. The tutorials get you started; iteration gets you to production quality.