94610b8943
9 experiments targeting loss 0.25-0.35 without LoRA+ noise. Tests higher base LR (2e-4/3e-4/5e-4), curriculum combos, conservative LoRA+ ratio=4, and rank 256 baseline + lr=3e-4. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
89 lines
2.7 KiB
JSON
89 lines
2.7 KiB
JSON
{
  "name": "r128_sweet_spot",
  "description": "Find the noise-free sweet spot on rank 128. LoRA+ ratio=16 caused noise — testing higher base LR without LoRA+ as a cleaner alternative. Target loss range 0.25–0.35. Also probing rank 256 since 102GB VRAM allows it.",
  "data_dir": "/media/unraid/davinci/Selva/BJ/features",
  "output_root": "/media/unraid/davinci/Selva/BJ/experiment/r128_sweet_spot",
  "base": {
    "steps": 10000,
    "rank": 128,
    "alpha": 0.0,
    "lr": 1e-4,
    "batch_size": 16,
    "warmup_steps": 200,
    "grad_accum": 1,
    "save_every": 2000,
    "seed": 42,
    "target": "attn.qkv",
    "timestep_mode": "uniform",
    "logit_normal_sigma": 1.0,
    "curriculum_switch": 0.6,
    "lora_dropout": 0.0,
    "lora_plus_ratio": 1.0
  },
  "experiments": [

    {
      "id": "g1_r128_lr_2e4",
      "group": "lr",
      "description": "LR=2e-4. Conservative 2× step up from baseline — noise-free descent toward sweet spot.",
      "lr": 2e-4
    },
    {
      "id": "g1_r128_lr_3e4",
      "group": "lr",
      "description": "LR=3e-4. 3× baseline — landed at 0.41 on r64, should reach 0.25–0.35 on r128.",
      "lr": 3e-4
    },
    {
      "id": "g1_r128_lr_5e4",
      "group": "lr",
      "description": "LR=5e-4. Aggressive but no LoRA+ B-matrix asymmetry — cleaner noise profile.",
      "lr": 5e-4
    },

    {
      "id": "g2_r128_curriculum",
      "group": "curriculum",
      "description": "Curriculum only at baseline LR. Clean slow descent — reference for what curriculum contributes alone.",
      "timestep_mode": "curriculum"
    },
    {
      "id": "g2_r128_lr_3e4_curriculum",
      "group": "curriculum",
      "description": "LR=3e-4 + curriculum. Speed of higher LR with coverage of curriculum — no LoRA+.",
      "lr": 3e-4,
      "timestep_mode": "curriculum"
    },
    {
      "id": "g2_r128_lr_3e4_curriculum_dropout",
      "group": "curriculum",
      "description": "LR=3e-4 + curriculum + dropout=0.05. Full controlled stack without LoRA+.",
      "lr": 3e-4,
      "timestep_mode": "curriculum",
      "lora_dropout": 0.05
    },

    {
      "id": "g3_r128_lora_plus_4",
      "group": "lora_plus",
      "description": "LoRA+ ratio=4 (lr_B=4e-4). Much more conservative than ratio=16 — tests if noise came from ratio not the technique.",
      "lora_plus_ratio": 4.0
    },

    {
      "id": "g4_r256_baseline",
      "group": "rank256",
      "description": "Rank 256 at baseline LR. 102GB VRAM makes this viable — does more capacity keep helping?",
      "rank": 256
    },
    {
      "id": "g4_r256_lr_3e4",
      "group": "rank256",
      "description": "Rank 256 + LR=3e-4. Best rank + best LR candidate combined.",
      "rank": 256,
      "lr": 3e-4
    }

  ]
}