From 94610b89433588e219ac9b43f0137e128c25d86b Mon Sep 17 00:00:00 2001
From: Ethanfel <ethan.fel@ts-pc.fr>
Date: Wed, 8 Apr 2026 10:46:08 +0200
Subject: [PATCH] =?UTF-8?q?feat:=20r128=5Fsweet=5Fspot=20sweep=20=E2=80=94?=
 =?UTF-8?q?=20noise-free=20LR=20search=20+=20rank=20256?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

9 experiments targeting loss 0.25-0.35 without the noise seen with LoRA+
ratio=16. Tests higher base LR (2e-4/3e-4/5e-4), curriculum combos, a
conservative LoRA+ ratio=4, and rank 256 at both baseline LR and lr=3e-4.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 experiments/r128_sweet_spot.json | 88 ++++++++++++++++++++++++++++++++
 1 file changed, 88 insertions(+)
 create mode 100644 experiments/r128_sweet_spot.json

diff --git a/experiments/r128_sweet_spot.json b/experiments/r128_sweet_spot.json
new file mode 100644
index 0000000..ce2c3b4
--- /dev/null
+++ b/experiments/r128_sweet_spot.json
@@ -0,0 +1,88 @@
+{
+  "name": "r128_sweet_spot",
+  "description": "Find the noise-free sweet spot on rank 128. LoRA+ ratio=16 caused noise — testing higher base LR without LoRA+ as a cleaner alternative. Target loss range 0.25–0.35. Also probing rank 256 since 102GB VRAM allows it.",
+  "data_dir": "/media/unraid/davinci/Selva/BJ/features",
+  "output_root": "/media/unraid/davinci/Selva/BJ/experiment/r128_sweet_spot",
+  "base": {
+    "steps": 10000,
+    "rank": 128,
+    "alpha": 0.0,
+    "lr": 1e-4,
+    "batch_size": 16,
+    "warmup_steps": 200,
+    "grad_accum": 1,
+    "save_every": 2000,
+    "seed": 42,
+    "target": "attn.qkv",
+    "timestep_mode": "uniform",
+    "logit_normal_sigma": 1.0,
+    "curriculum_switch": 0.6,
+    "lora_dropout": 0.0,
+    "lora_plus_ratio": 1.0
+  },
+  "experiments": [
+
+    {
+      "id": "g1_r128_lr_2e4",
+      "group": "lr",
+      "description": "LR=2e-4. Conservative 2× step up from baseline — noise-free descent toward sweet spot.",
+      "lr": 2e-4
+    },
+    {
+      "id": "g1_r128_lr_3e4",
+      "group": "lr",
+      "description": "LR=3e-4. 3× baseline — landed at 0.41 on r64, should reach 0.25–0.35 on r128.",
+      "lr": 3e-4
+    },
+    {
+      "id": "g1_r128_lr_5e4",
+      "group": "lr",
+      "description": "LR=5e-4. Aggressive but no LoRA+ B-matrix asymmetry — cleaner noise profile.",
+      "lr": 5e-4
+    },
+
+    {
+      "id": "g2_r128_curriculum",
+      "group": "curriculum",
+      "description": "Curriculum only at baseline LR. Clean slow descent — reference for what curriculum contributes alone.",
+      "timestep_mode": "curriculum"
+    },
+    {
+      "id": "g2_r128_lr_3e4_curriculum",
+      "group": "curriculum",
+      "description": "LR=3e-4 + curriculum. Speed of higher LR with coverage of curriculum — no LoRA+.",
+      "lr": 3e-4,
+      "timestep_mode": "curriculum"
+    },
+    {
+      "id": "g2_r128_lr_3e4_curriculum_dropout",
+      "group": "curriculum",
+      "description": "LR=3e-4 + curriculum + dropout=0.05. Full controlled stack without LoRA+.",
+      "lr": 3e-4,
+      "timestep_mode": "curriculum",
+      "lora_dropout": 0.05
+    },
+
+    {
+      "id": "g3_r128_lora_plus_4",
+      "group": "lora_plus",
+      "description": "LoRA+ ratio=4 (lr_B=4e-4). Much more conservative than ratio=16 — tests if noise came from ratio not the technique.",
+      "lora_plus_ratio": 4.0
+    },
+
+    {
+      "id": "g4_r256_baseline",
+      "group": "rank256",
+      "description": "Rank 256 at baseline LR. 102GB VRAM makes this viable — does more capacity keep helping?",
+      "rank": 256
+    },
+    {
+      "id": "g4_r256_lr_3e4",
+      "group": "rank256",
+      "description": "Rank 256 + LR=3e-4. Best rank + best LR candidate combined.",
+      "rank": 256,
+      "lr": 3e-4
+    }
+
+  ]
+}