feat: add PiSSA/rsLoRA support to scheduler and PiSSA sweep experiment
Thread init_mode and use_rslora through the scheduler's config parsing, experiment record, and _train_inner call. The default alpha (used when the configured alpha is <= 0) changes from rank to 2*rank to match the trainer. Add pissa_sweep.json with 7 experiments at rank 128 ablating PiSSA vs standard init, rsLoRA vs classic alpha/rank scaling, learning-rate variations, and LoRA dropout (standard init only). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,62 @@
|
|||||||
|
{
|
||||||
|
"name": "pissa_sweep",
|
||||||
|
"description": "PiSSA vs standard init ablation at rank 128. Best prior config (lr=3e-4, bs=16, 10k steps) as baseline. PiSSA starts on-manifold via SVD init — should eliminate intruder dimensions. rsLoRA stabilises scaling at high rank.",
|
||||||
|
"data_dir": "/media/unraid/davinci/Selva/BJ/features",
|
||||||
|
"output_root": "/media/unraid/davinci/Selva/BJ/experiment/pissa_sweep",
|
||||||
|
"base": {
|
||||||
|
"steps": 10000,
|
||||||
|
"rank": 128,
|
||||||
|
"alpha": 0.0,
|
||||||
|
"lr": 3e-4,
|
||||||
|
"batch_size": 16,
|
||||||
|
"warmup_steps": 200,
|
||||||
|
"grad_accum": 1,
|
||||||
|
"save_every": 2000,
|
||||||
|
"seed": 42,
|
||||||
|
"target": "attn.qkv",
|
||||||
|
"timestep_mode": "uniform",
|
||||||
|
"lora_dropout": 0.0,
|
||||||
|
"lora_plus_ratio": 1.0,
|
||||||
|
"lr_schedule": "constant",
|
||||||
|
"init_mode": "pissa",
|
||||||
|
"use_rslora": true
|
||||||
|
},
|
||||||
|
"experiments": [
|
||||||
|
{
|
||||||
|
"id": "standard_baseline",
|
||||||
|
"description": "Standard Kaiming init + classic alpha/rank scaling. Replicates best prior config for A/B comparison.",
|
||||||
|
"init_mode": "standard",
|
||||||
|
"use_rslora": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "pissa_rslora",
|
||||||
|
"description": "PiSSA init + rsLoRA scaling. Full Tier-S config. Should start on-manifold and avoid intruder dimensions."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "pissa_classic_scale",
|
||||||
|
"description": "PiSSA init + classic alpha/rank scaling. Isolates PiSSA contribution from rsLoRA.",
|
||||||
|
"use_rslora": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "standard_rslora",
|
||||||
|
"description": "Standard init + rsLoRA only. Isolates rsLoRA contribution from PiSSA.",
|
||||||
|
"init_mode": "standard"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "pissa_rslora_lr1e-4",
|
||||||
|
"description": "PiSSA+rsLoRA at lower lr=1e-4. PiSSA starts closer to optimum — may need less aggressive lr.",
|
||||||
|
"lr": 1e-4
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "pissa_rslora_lr5e-4",
|
||||||
|
"description": "PiSSA+rsLoRA at higher lr=5e-4. Test if on-manifold start tolerates faster learning.",
|
||||||
|
"lr": 5e-4
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "pissa_rslora_dropout",
|
||||||
|
"description": "PiSSA+rsLoRA with dropout 0.05. Note: PiSSA forces dropout=0 (principal components should not be dropped) — this tests standard init with rsLoRA + dropout.",
|
||||||
|
"init_mode": "standard",
|
||||||
|
"lora_dropout": 0.05
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -79,6 +79,8 @@ _PARAM_DEFAULTS = {
|
|||||||
"lora_dropout": 0.0,
|
"lora_dropout": 0.0,
|
||||||
"lora_plus_ratio": 1.0,
|
"lora_plus_ratio": 1.0,
|
||||||
"lr_schedule": "constant",
|
"lr_schedule": "constant",
|
||||||
|
"init_mode": "pissa",
|
||||||
|
"use_rslora": True,
|
||||||
}
|
}
|
||||||
|
|
||||||
# Palette for comparison chart: one color per experiment (cycles if > 8)
|
# Palette for comparison chart: one color per experiment (cycles if > 8)
|
||||||
@@ -388,7 +390,9 @@ class SelvaLoraScheduler:
|
|||||||
dropout = float(cfg.get("lora_dropout", 0.0))
|
dropout = float(cfg.get("lora_dropout", 0.0))
|
||||||
plus_ratio = float(cfg.get("lora_plus_ratio", 1.0))
|
plus_ratio = float(cfg.get("lora_plus_ratio", 1.0))
|
||||||
lr_schedule = str(cfg.get("lr_schedule", "constant"))
|
lr_schedule = str(cfg.get("lr_schedule", "constant"))
|
||||||
alpha_val = alpha if alpha > 0.0 else float(rank)
|
init_mode = str(cfg.get("init_mode", "pissa"))
|
||||||
|
use_rslora = bool(cfg.get("use_rslora", True))
|
||||||
|
alpha_val = alpha if alpha > 0.0 else float(2 * rank)
|
||||||
target_suffixes = tuple(target.strip().split())
|
target_suffixes = tuple(target.strip().split())
|
||||||
|
|
||||||
output_dir = output_root / exp_id
|
output_dir = output_root / exp_id
|
||||||
@@ -410,6 +414,7 @@ class SelvaLoraScheduler:
|
|||||||
"curriculum_switch": curr_switch,
|
"curriculum_switch": curr_switch,
|
||||||
"lora_dropout": dropout, "lora_plus_ratio": plus_ratio,
|
"lora_dropout": dropout, "lora_plus_ratio": plus_ratio,
|
||||||
"lr_schedule": lr_schedule,
|
"lr_schedule": lr_schedule,
|
||||||
|
"init_mode": init_mode, "use_rslora": use_rslora,
|
||||||
},
|
},
|
||||||
"results": {"status": "running"},
|
"results": {"status": "running"},
|
||||||
"adapter_path": None,
|
"adapter_path": None,
|
||||||
@@ -428,7 +433,7 @@ class SelvaLoraScheduler:
|
|||||||
alpha_val, target_suffixes, batch_size, warmup,
|
alpha_val, target_suffixes, batch_size, warmup,
|
||||||
grad_accum, save_every, resume_path, seed,
|
grad_accum, save_every, resume_path, seed,
|
||||||
ts_mode, ln_sigma, curr_switch, dropout, plus_ratio,
|
ts_mode, ln_sigma, curr_switch, dropout, plus_ratio,
|
||||||
lr_schedule,
|
lr_schedule, init_mode, use_rslora,
|
||||||
)
|
)
|
||||||
|
|
||||||
duration = time.monotonic() - t_start
|
duration = time.monotonic() - t_start
|
||||||
|
|||||||
Reference in New Issue
Block a user