Files
ComfyUI-SelVA/experiments/r128_sweet_spot.json
Ethanfel 1be07a80d2 feat: add cosine LR decay schedule to trainer and scheduler
- Add lr_schedule param (constant|cosine) to SelvaLoraTrainer
- Cosine decays LR from initial value to ~0 after warmup, preventing
  the oscillation observed at steps 6000-8000 with lr=2e-4 flat
- Wire lr_schedule through scheduler _PARAM_DEFAULTS and _train_inner call
- Add g5_r128_lr_2e4_cosine and g5_r128_lr_3e4_cosine to r128_sweet_spot sweep

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-08 13:25:01 +02:00

104 lines
3.2 KiB
JSON
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"name": "r128_sweet_spot",
"description": "Find the noise-free sweet spot on rank 128. LoRA+ ratio=16 caused noise — testing higher base LR without LoRA+ as a cleaner alternative. Target loss range 0.25-0.35. Also probing rank 256 since 102GB VRAM allows it.",
"data_dir": "/media/unraid/davinci/Selva/BJ/features",
"output_root": "/media/unraid/davinci/Selva/BJ/experiment/r128_sweet_spot",
"base": {
"steps": 10000,
"rank": 128,
"alpha": 0.0,
"lr": 1e-4,
"batch_size": 16,
"warmup_steps": 200,
"grad_accum": 1,
"save_every": 2000,
"seed": 42,
"target": "attn.qkv",
"timestep_mode": "uniform",
"logit_normal_sigma": 1.0,
"curriculum_switch": 0.6,
"lora_dropout": 0.0,
"lora_plus_ratio": 1.0
},
"experiments": [
{
"id": "g1_r128_lr_2e4",
"group": "lr",
"description": "LR=2e-4. Conservative 2× step up from baseline — noise-free descent toward sweet spot.",
"lr": 2e-4
},
{
"id": "g1_r128_lr_3e4",
"group": "lr",
"description": "LR=3e-4. 3× baseline — landed at 0.41 on r64, should reach 0.25-0.35 on r128.",
"lr": 3e-4
},
{
"id": "g1_r128_lr_5e4",
"group": "lr",
"description": "LR=5e-4. Aggressive but no LoRA+ B-matrix asymmetry — cleaner noise profile.",
"lr": 5e-4
},
{
"id": "g2_r128_curriculum",
"group": "curriculum",
"description": "Curriculum only at baseline LR. Clean slow descent — reference for what curriculum contributes alone.",
"timestep_mode": "curriculum"
},
{
"id": "g2_r128_lr_3e4_curriculum",
"group": "curriculum",
"description": "LR=3e-4 + curriculum. Speed of higher LR with coverage of curriculum — no LoRA+.",
"lr": 3e-4,
"timestep_mode": "curriculum"
},
{
"id": "g2_r128_lr_3e4_curriculum_dropout",
"group": "curriculum",
"description": "LR=3e-4 + curriculum + dropout=0.05. Full controlled stack without LoRA+.",
"lr": 3e-4,
"timestep_mode": "curriculum",
"lora_dropout": 0.05
},
{
"id": "g3_r128_lora_plus_4",
"group": "lora_plus",
"description": "LoRA+ ratio=4 (lr_B=4e-4). Much more conservative than ratio=16 — tests if noise came from ratio not the technique.",
"lora_plus_ratio": 4.0
},
{
"id": "g4_r256_baseline",
"group": "rank256",
"description": "Rank 256 at baseline LR. 102GB VRAM makes this viable — does more capacity keep helping?",
"rank": 256
},
{
"id": "g4_r256_lr_3e4",
"group": "rank256",
"description": "Rank 256 + LR=3e-4. Best rank + best LR candidate combined.",
"rank": 256,
"lr": 3e-4
},
{
"id": "g5_r128_lr_2e4_cosine",
"group": "cosine",
"description": "LR=2e-4 + cosine decay. Fixes the oscillation observed at steps 6000-8000 by decaying LR to ~0 instead of staying flat.",
"lr": 2e-4,
"lr_schedule": "cosine"
},
{
"id": "g5_r128_lr_3e4_cosine",
"group": "cosine",
"description": "LR=3e-4 + cosine decay. Higher LR with decay — should reach lower loss faster then lock in.",
"lr": 3e-4,
"lr_schedule": "cosine"
}
]
}