From 58e1985af24f423ef097dfa936f9a7a2a0004ed8 Mon Sep 17 00:00:00 2001 From: Ethanfel Date: Wed, 8 Apr 2026 13:10:43 +0200 Subject: [PATCH] feat: SelVA Skip Experiment node + save partial scalars on skip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - New node: SelVA Skip Experiment — writes skip_current.flag from UI, queue in a second workflow tab while scheduler is running - SkipExperiment now attaches partial loss/grad/spectral data to the exception so the scheduler saves all collected scalars in the summary Co-Authored-By: Claude Sonnet 4.6 --- nodes/__init__.py | 1 + nodes/selva_lora_scheduler.py | 13 +++++++-- nodes/selva_lora_trainer.py | 9 +++++- nodes/selva_skip_experiment.py | 50 ++++++++++++++++++++++++++++++++++ 4 files changed, 69 insertions(+), 4 deletions(-) create mode 100644 nodes/selva_skip_experiment.py diff --git a/nodes/__init__.py b/nodes/__init__.py index f965e29..16f8b6f 100644 --- a/nodes/__init__.py +++ b/nodes/__init__.py @@ -9,6 +9,7 @@ _NODES = { "SelvaLoraTrainer": (".selva_lora_trainer", "SelvaLoraTrainer", "SelVA LoRA Trainer"), "SelvaLoraScheduler": (".selva_lora_scheduler", "SelvaLoraScheduler", "SelVA LoRA Scheduler"), "SelvaDatasetBrowser": (".selva_dataset_browser", "SelvaDatasetBrowser", "SelVA Dataset Browser"), + "SelvaSkipExperiment": (".selva_skip_experiment", "SelvaSkipExperiment", "SelVA Skip Experiment"), } for key, (module_path, class_name, display_name) in _NODES.items(): diff --git a/nodes/selva_lora_scheduler.py b/nodes/selva_lora_scheduler.py index 0fe2dcf..2db45c0 100644 --- a/nodes/selva_lora_scheduler.py +++ b/nodes/selva_lora_scheduler.py @@ -478,10 +478,17 @@ class SelvaLoraScheduler: except SkipExperiment as e: duration = time.monotonic() - t_start print(f"[LoRA Scheduler] Experiment '{exp_id}' skipped: {e}", flush=True) + partial = getattr(e, "partial", {}) + lh = partial.get("loss_history", []) + smoothed = _smooth_losses(lh) if lh else [] exp_record["results"] = { - "status": "skipped", - "error": str(e), - "duration_seconds": round(duration, 1), + "status": "skipped", + "stopped_at_step": partial.get("stopped_at_step"), + "final_loss": round(smoothed[-1], 6) if smoothed else None, + "loss_history": [round(v, 6) for v in lh], + "grad_norm_history": partial.get("grad_norm_history", []), + "spectral_metrics": {str(k): v for k, v in partial.get("spectral_metrics", {}).items()}, + "duration_seconds": round(duration, 1), } _write_summary() pbar_outer.update(1) diff --git a/nodes/selva_lora_trainer.py b/nodes/selva_lora_trainer.py index 7cffe47..02b89c3 100644 --- a/nodes/selva_lora_trainer.py +++ b/nodes/selva_lora_trainer.py @@ -758,7 +758,14 @@ class SelvaLoraTrainer: skip_flag = output_dir.parent / "skip_current.flag" if skip_flag.exists(): skip_flag.unlink() - raise SkipExperiment(f"skip_current.flag detected at step {step} — skipping to next experiment") + exc = SkipExperiment(f"skip_current.flag detected at step {step} — skipping to next experiment") + exc.partial = { + "loss_history": list(loss_history), + "grad_norm_history": list(grad_norm_history), + "spectral_metrics": dict(spectral_metrics), + "stopped_at_step": step, + } + raise exc avg = running_loss / log_interval loss_history.append(avg) diff --git a/nodes/selva_skip_experiment.py b/nodes/selva_skip_experiment.py new file mode 100644 index 0000000..c646a22 --- /dev/null +++ b/nodes/selva_skip_experiment.py @@ -0,0 +1,50 @@ +from pathlib import Path + +import folder_paths + +from .utils import SELVA_CATEGORY + + +class SelvaSkipExperiment: + """Writes skip_current.flag into a sweep output_root. + + Queue this node while a SelVA LoRA Scheduler sweep is running to skip + the current experiment and move to the next one. The trainer picks up + the flag within 50 steps (~a few seconds). + """ + + OUTPUT_NODE = True + + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "output_root": ("STRING", { + "default": "", + "tooltip": "output_root of the running sweep — same value as in your experiments JSON.", + }), + }, + } + + RETURN_TYPES = ("STRING",) + RETURN_NAMES = ("flag_path",) + OUTPUT_TOOLTIPS = ("Path where the flag was written.",) + FUNCTION = "skip" + CATEGORY = SELVA_CATEGORY + DESCRIPTION = ( + "Signals the running SelVA LoRA Scheduler to skip the current experiment " + "and move to the next one. Queue this node while the scheduler is running. " + "Partial scalars collected so far are saved in the summary." + ) + + def skip(self, output_root: str): + p = Path(output_root.strip()) + if not p.is_absolute(): + p = Path(folder_paths.get_output_directory()) / p + if not p.exists(): + raise FileNotFoundError(f"[SelVA Skip] output_root not found: {p}") + + flag = p / "skip_current.flag" + flag.touch() + print(f"[SelVA Skip] Flag written: {flag}", flush=True) + return (str(flag),)