ComfyUI-SelVA/experiments/r128_sweet_spot.json

{
  "name": "r128_sweet_spot",
  "description": "Find the noise-free sweet spot on rank 128. LoRA+ ratio=16 caused noise — testing higher base LR without LoRA+ as a cleaner alternative. Target loss range 0.25–0.35. Also probing rank 256 since 102GB VRAM allows it.",
  "data_dir": "/media/unraid/davinci/Selva/BJ/features",
  "output_root": "/media/unraid/davinci/Selva/BJ/experiment/r128_sweet_spot",
  "base": {
    "steps": 10000,
    "rank": 128,
    "alpha": 0.0,
    "lr": 1e-4,
    "batch_size": 16,
    "warmup_steps": 200,
    "grad_accum": 1,
    "save_every": 2000,
    "seed": 42,
    "target": "attn.qkv",
    "timestep_mode": "uniform",
    "logit_normal_sigma": 1.0,
    "curriculum_switch": 0.6,
    "lora_dropout": 0.0,
    "lora_plus_ratio": 1.0
  },
  "experiments": [

    {
      "id": "g1_r128_lr_2e4",
      "group": "lr",
      "description": "LR=2e-4. Conservative 2× step up from baseline — noise-free descent toward sweet spot.",
      "lr": 2e-4
    },
    {
      "id": "g1_r128_lr_3e4",
      "group": "lr",
      "description": "LR=3e-4. 3× baseline — landed at 0.41 on r64, should reach 0.25–0.35 on r128.",
      "lr": 3e-4
    },
    {
      "id": "g1_r128_lr_5e4",
      "group": "lr",
      "description": "LR=5e-4. Aggressive but no LoRA+ B-matrix asymmetry — cleaner noise profile.",
      "lr": 5e-4
    },

    {
      "id": "g2_r128_curriculum",
      "group": "curriculum",
      "description": "Curriculum only at baseline LR. Clean slow descent — reference for what curriculum contributes alone.",
      "timestep_mode": "curriculum"
    },
    {
      "id": "g2_r128_lr_3e4_curriculum",
      "group": "curriculum",
      "description": "LR=3e-4 + curriculum. Speed of higher LR with coverage of curriculum — no LoRA+.",
      "lr": 3e-4,
      "timestep_mode": "curriculum"
    },
    {
      "id": "g2_r128_lr_3e4_curriculum_dropout",
      "group": "curriculum",
      "description": "LR=3e-4 + curriculum + dropout=0.05. Full controlled stack without LoRA+.",
      "lr": 3e-4,
      "timestep_mode": "curriculum",
      "lora_dropout": 0.05
    },

    {
      "id": "g3_r128_lora_plus_4",
      "group": "lora_plus",
      "description": "LoRA+ ratio=4 (lr_B=4e-4). Much more conservative than ratio=16 — tests if noise came from ratio not the technique.",
      "lora_plus_ratio": 4.0
    },

    {
      "id": "g4_r256_baseline",
      "group": "rank256",
      "description": "Rank 256 at baseline LR. 102GB VRAM makes this viable — does more capacity keep helping?",
      "rank": 256
    },
    {
      "id": "g4_r256_lr_3e4",
      "group": "rank256",
      "description": "Rank 256 + LR=3e-4. Best rank + best LR candidate combined.",
      "rank": 256,
      "lr": 3e-4
    },

    {
      "id": "g5_r128_lr_2e4_cosine",
      "group": "cosine",
      "description": "LR=2e-4 + cosine decay. Fixes the oscillation observed at step 6000–8000 by decaying LR to ~0 instead of staying flat.",
      "lr": 2e-4,
      "lr_schedule": "cosine"
    },
    {
      "id": "g5_r128_lr_3e4_cosine",
      "group": "cosine",
      "description": "LR=3e-4 + cosine decay. Higher LR with decay — should reach lower loss faster then lock in.",
      "lr": 3e-4,
      "lr_schedule": "cosine"
    }

  ]
}