From 264dc49d426a6286d95ed87d492c96bf8a06261a Mon Sep 17 00:00:00 2001
From: Ethanfel <ethan.fel@ts-pc.fr>
Date: Wed, 8 Apr 2026 13:09:01 +0200
Subject: [PATCH] feat: skip_current.flag to cancel experiment and move to next

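Create the flag file in the sweep output_root (the parent of the
experiment's output directory) to skip the running experiment at the
next log interval (every 50 steps):
  touch /path/to/output_root/skip_current.flag

The trainer deletes the flag as soon as it detects it, so the flag
does not carry over to the next experiment.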

Scheduler marks it as 'skipped' in the summary and continues.
Skipped experiments are NOT resumed on restart (unlike failed ones).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 nodes/selva_lora_scheduler.py | 13 +++++++++++++
 nodes/selva_lora_trainer.py   |  9 +++++++++
 2 files changed, 22 insertions(+)

diff --git a/nodes/selva_lora_scheduler.py b/nodes/selva_lora_scheduler.py
index 03270e7..0fe2dcf 100644
--- a/nodes/selva_lora_scheduler.py
+++ b/nodes/selva_lora_scheduler.py
@@ -38,6 +38,7 @@ import folder_paths
 from .utils import SELVA_CATEGORY, get_device
 from .selva_lora_trainer import (
     SelvaLoraTrainer,
+    SkipExperiment,
     _prepare_dataset,
     _smooth_losses,
     _pil_to_tensor,
@@ -474,6 +475,18 @@ class SelvaLoraScheduler:
                     "start_step":   0,
                 })
 
+            except SkipExperiment as e:
+                duration = time.monotonic() - t_start
+                print(f"[LoRA Scheduler] Experiment '{exp_id}' skipped: {e}", flush=True)
+                exp_record["results"] = {
+                    "status":           "skipped",
+                    "error":            str(e),
+                    "duration_seconds": round(duration, 1),
+                }
+                _write_summary()
+                pbar_outer.update(1)
+                continue
+
             except Exception as e:
                 duration = time.monotonic() - t_start
                 print(f"[LoRA Scheduler] Experiment '{exp_id}' failed: {e}", flush=True)
diff --git a/nodes/selva_lora_trainer.py b/nodes/selva_lora_trainer.py
index 959fbe0..7cffe47 100644
--- a/nodes/selva_lora_trainer.py
+++ b/nodes/selva_lora_trainer.py
@@ -4,6 +4,10 @@ import random
 import traceback
 from pathlib import Path
 
+
+class SkipExperiment(Exception):
+    """Raised when skip_current.flag is found — signals the scheduler to move to the next experiment."""
+
 import numpy as np
 import torch
 import torch.nn.functional as F
@@ -751,6 +755,11 @@ class SelvaLoraTrainer:
                     optimizer.zero_grad()
 
                 if step % log_interval == 0:
+                    skip_flag = output_dir.parent / "skip_current.flag"
+                    if skip_flag.exists():
+                        skip_flag.unlink()
+                        raise SkipExperiment(f"skip_current.flag detected at step {step} — skipping to next experiment")
+
                     avg = running_loss / log_interval
                     loss_history.append(avg)
                     # grad_norm_count can be 0 when grad_accum > log_interval