chore: lower default warmup_steps from 500 to 100
A 500-step warmup is 25% of the default 2000-step run — too long. A 100-step warmup (5%) lets the full learning rate kick in much earlier without sacrificing stability.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+1
-1
@@ -106,7 +106,7 @@ The script will:
|
|||||||
| `--target` | `attn.qkv` | Which layers to adapt. Add `linear1` for post-attention projections |
|
| `--target` | `attn.qkv` | Which layers to adapt. Add `linear1` for post-attention projections |
|
||||||
| `--lr` | `1e-4` | Learning rate |
|
| `--lr` | `1e-4` | Learning rate |
|
||||||
| `--steps` | `2000` | Total training steps |
|
| `--steps` | `2000` | Total training steps |
|
||||||
| `--warmup_steps` | `500` | Linear LR warmup steps |
|
| `--warmup_steps` | `100` | Linear LR warmup steps |
|
||||||
| `--grad_accum` | `4` | Gradient accumulation steps (effective batch = grad_accum × 1) |
|
| `--grad_accum` | `4` | Gradient accumulation steps (effective batch = grad_accum × 1) |
|
||||||
| `--save_every` | `500` | Save a checkpoint every N steps |
|
| `--save_every` | `500` | Save a checkpoint every N steps |
|
||||||
| `--resume` | `None` | Path to a step checkpoint to resume from (e.g. `lora_output/adapter_step01000.pt`) |
|
| `--resume` | `None` | Path to a step checkpoint to resume from (e.g. `lora_output/adapter_step01000.pt`) |
|
||||||
|
|||||||
@@ -242,7 +242,7 @@ class SelvaLoraTrainer:
|
|||||||
"default": "attn.qkv",
|
"default": "attn.qkv",
|
||||||
"tooltip": "Space-separated layer name suffixes to wrap. Default targets all QKV projections. Add 'linear1' for post-attention projections.",
|
"tooltip": "Space-separated layer name suffixes to wrap. Default targets all QKV projections. Add 'linear1' for post-attention projections.",
|
||||||
}),
|
}),
|
||||||
"warmup_steps": ("INT", {"default": 500, "min": 0, "max": 5000}),
|
"warmup_steps": ("INT", {"default": 100, "min": 0, "max": 5000}),
|
||||||
"grad_accum": ("INT", {"default": 4, "min": 1, "max": 32,
|
"grad_accum": ("INT", {"default": 4, "min": 1, "max": 32,
|
||||||
"tooltip": "Gradient accumulation steps."}),
|
"tooltip": "Gradient accumulation steps."}),
|
||||||
"save_every": ("INT", {"default": 500, "min": 50, "max": 10000}),
|
"save_every": ("INT", {"default": 500, "min": 50, "max": 10000}),
|
||||||
@@ -271,7 +271,7 @@ class SelvaLoraTrainer:
|
|||||||
)
|
)
|
||||||
|
|
||||||
def train(self, model, data_dir, output_dir, steps, rank, lr,
|
def train(self, model, data_dir, output_dir, steps, rank, lr,
|
||||||
alpha=0.0, target="attn.qkv", warmup_steps=500,
|
alpha=0.0, target="attn.qkv", warmup_steps=100,
|
||||||
grad_accum=4, save_every=500, resume_path="", seed=42):
|
grad_accum=4, save_every=500, resume_path="", seed=42):
|
||||||
|
|
||||||
torch.manual_seed(seed)
|
torch.manual_seed(seed)
|
||||||
|
|||||||
+1
-1
@@ -159,7 +159,7 @@ def main():
|
|||||||
help="Module name suffixes to wrap with LoRA. Also try 'linear1'.")
|
help="Module name suffixes to wrap with LoRA. Also try 'linear1'.")
|
||||||
parser.add_argument("--lr", type=float, default=1e-4)
|
parser.add_argument("--lr", type=float, default=1e-4)
|
||||||
parser.add_argument("--steps", type=int, default=2000)
|
parser.add_argument("--steps", type=int, default=2000)
|
||||||
parser.add_argument("--warmup_steps",type=int, default=500)
|
parser.add_argument("--warmup_steps",type=int, default=100)
|
||||||
parser.add_argument("--grad_accum", type=int, default=4, help="Gradient accumulation steps")
|
parser.add_argument("--grad_accum", type=int, default=4, help="Gradient accumulation steps")
|
||||||
parser.add_argument("--save_every", type=int, default=500)
|
parser.add_argument("--save_every", type=int, default=500)
|
||||||
parser.add_argument("--resume", default=None,
|
parser.add_argument("--resume", default=None,
|
||||||
|
|||||||
Reference in New Issue
Block a user