From 4806daa4ca1574d27b503c4ac458e9e215dfc2b2 Mon Sep 17 00:00:00 2001 From: Ethanfel Date: Sun, 5 Apr 2026 22:51:27 +0200 Subject: [PATCH] chore: lower default warmup_steps from 500 to 100 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 500 warmup steps is 25% of a 2000-step run — too long. 100 steps lets the full lr kick in much earlier without sacrificing stability. Co-Authored-By: Claude Sonnet 4.6 --- LORA_TRAINING.md | 2 +- nodes/selva_lora_trainer.py | 4 ++-- train_lora.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/LORA_TRAINING.md b/LORA_TRAINING.md index 0d87b54..8bd5696 100644 --- a/LORA_TRAINING.md +++ b/LORA_TRAINING.md @@ -106,7 +106,7 @@ The script will: | `--target` | `attn.qkv` | Which layers to adapt. Add `linear1` for post-attention projections | | `--lr` | `1e-4` | Learning rate | | `--steps` | `2000` | Total training steps | -| `--warmup_steps` | `500` | Linear LR warmup steps | +| `--warmup_steps` | `100` | Linear LR warmup steps | | `--grad_accum` | `4` | Gradient accumulation steps (effective batch = grad_accum × 1) | | `--save_every` | `500` | Save a checkpoint every N steps | | `--resume` | `None` | Path to a step checkpoint to resume from (e.g. `lora_output/adapter_step01000.pt`) | diff --git a/nodes/selva_lora_trainer.py b/nodes/selva_lora_trainer.py index 04bb572..4f9b596 100644 --- a/nodes/selva_lora_trainer.py +++ b/nodes/selva_lora_trainer.py @@ -242,7 +242,7 @@ class SelvaLoraTrainer: "default": "attn.qkv", "tooltip": "Space-separated layer name suffixes to wrap. Default targets all QKV projections. Add 'linear1' for post-attention projections.", }), - "warmup_steps": ("INT", {"default": 500, "min": 0, "max": 5000}), + "warmup_steps": ("INT", {"default": 100, "min": 0, "max": 5000}), "grad_accum": ("INT", {"default": 4, "min": 1, "max": 32, "tooltip": "Gradient accumulation steps."}), "save_every": ("INT", {"default": 500, "min": 50, "max": 10000}), @@ -271,7 +271,7 @@ class SelvaLoraTrainer: ) def train(self, model, data_dir, output_dir, steps, rank, lr, - alpha=0.0, target="attn.qkv", warmup_steps=500, + alpha=0.0, target="attn.qkv", warmup_steps=100, grad_accum=4, save_every=500, resume_path="", seed=42): torch.manual_seed(seed) diff --git a/train_lora.py b/train_lora.py index b15a0fb..3ecbe8d 100644 --- a/train_lora.py +++ b/train_lora.py @@ -159,7 +159,7 @@ def main(): help="Module name suffixes to wrap with LoRA. Also try 'linear1'.") parser.add_argument("--lr", type=float, default=1e-4) parser.add_argument("--steps", type=int, default=2000) - parser.add_argument("--warmup_steps",type=int, default=500) + parser.add_argument("--warmup_steps",type=int, default=100) parser.add_argument("--grad_accum", type=int, default=4, help="Gradient accumulation steps") parser.add_argument("--save_every", type=int, default=500) parser.add_argument("--resume", default=None,