From 264dc49d426a6286d95ed87d492c96bf8a06261a Mon Sep 17 00:00:00 2001 From: Ethanfel Date: Wed, 8 Apr 2026 13:09:01 +0200 Subject: [PATCH] feat: skip_current.flag to cancel experiment and move to next Create the flag file in the sweep output_root to skip the running experiment at the next log interval (every 50 steps): touch /path/to/experiment/skip_current.flag Scheduler marks it as 'skipped' in the summary and continues. Skipped experiments are NOT resumed on restart (unlike failed ones). Co-Authored-By: Claude Sonnet 4.6 --- nodes/selva_lora_scheduler.py | 13 +++++++++++++ nodes/selva_lora_trainer.py | 9 +++++++++ 2 files changed, 22 insertions(+) diff --git a/nodes/selva_lora_scheduler.py b/nodes/selva_lora_scheduler.py index 03270e7..0fe2dcf 100644 --- a/nodes/selva_lora_scheduler.py +++ b/nodes/selva_lora_scheduler.py @@ -38,6 +38,7 @@ import folder_paths from .utils import SELVA_CATEGORY, get_device from .selva_lora_trainer import ( SelvaLoraTrainer, + SkipExperiment, _prepare_dataset, _smooth_losses, _pil_to_tensor, @@ -474,6 +475,18 @@ class SelvaLoraScheduler: "start_step": 0, }) + except SkipExperiment as e: + duration = time.monotonic() - t_start + print(f"[LoRA Scheduler] Experiment '{exp_id}' skipped: {e}", flush=True) + exp_record["results"] = { + "status": "skipped", + "error": str(e), + "duration_seconds": round(duration, 1), + } + _write_summary() + pbar_outer.update(1) + continue + except Exception as e: duration = time.monotonic() - t_start print(f"[LoRA Scheduler] Experiment '{exp_id}' failed: {e}", flush=True) diff --git a/nodes/selva_lora_trainer.py b/nodes/selva_lora_trainer.py index 959fbe0..7cffe47 100644 --- a/nodes/selva_lora_trainer.py +++ b/nodes/selva_lora_trainer.py @@ -4,6 +4,10 @@ import random import traceback from pathlib import Path + +class SkipExperiment(Exception): + """Raised when skip_current.flag is found — signals the scheduler to move to the next experiment.""" + import numpy as np import torch import torch.nn.functional as F @@ -751,6 +755,11 @@ class SelvaLoraTrainer: optimizer.zero_grad() if step % log_interval == 0: + skip_flag = output_dir.parent / "skip_current.flag" + if skip_flag.exists(): + skip_flag.unlink() + raise SkipExperiment(f"skip_current.flag detected at step {step} — skipping to next experiment") + avg = running_loss / log_interval loss_history.append(avg) # grad_norm_count can be 0 when grad_accum > log_interval