From 48493a3f0d16006d68fc59f1b82270d840f5986f Mon Sep 17 00:00:00 2001
From: Ethanfel <ethan.fel@ts-pc.fr>
Date: Thu, 9 Apr 2026 15:27:48 +0200
Subject: [PATCH] feat: add SelvaDatasetSaver node with NPZ sidecar copy

Saves all clips in an AUDIO_DATASET to FLAC. When npz_source_dir is
provided, copies the matching .npz for each clip so FLAC/NPZ pairs
stay in sync after the inspector filters out bad clips.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 nodes/__init__.py               |  1 +
 nodes/selva_dataset_pipeline.py | 85 +++++++++++++++++++++++++++++++++
 2 files changed, 86 insertions(+)

diff --git a/nodes/__init__.py b/nodes/__init__.py
index f8dd518..1e96234 100644
--- a/nodes/__init__.py
+++ b/nodes/__init__.py
@@ -27,6 +27,7 @@ _NODES = {
     "SelvaDatasetLUFSNormalizer":  (".selva_dataset_pipeline", "SelvaDatasetLUFSNormalizer",  "SelVA Dataset LUFS Normalizer"),
     "SelvaDatasetInspector":       (".selva_dataset_pipeline", "SelvaDatasetInspector",       "SelVA Dataset Inspector"),
     "SelvaDatasetItemExtractor":   (".selva_dataset_pipeline", "SelvaDatasetItemExtractor",   "SelVA Dataset Item Extractor"),
+    "SelvaDatasetSaver":           (".selva_dataset_pipeline", "SelvaDatasetSaver",           "SelVA Dataset Saver"),
 }
 
 for key, (module_path, class_name, display_name) in _NODES.items():
diff --git a/nodes/selva_dataset_pipeline.py b/nodes/selva_dataset_pipeline.py
index f71434d..66e3e6f 100644
--- a/nodes/selva_dataset_pipeline.py
+++ b/nodes/selva_dataset_pipeline.py
@@ -361,3 +361,88 @@ class SelvaDatasetItemExtractor:
             flush=True,
         )
         return (audio, item["name"], len(dataset))
+
+
+class SelvaDatasetSaver:
+    """Save all clips in an AUDIO_DATASET to disk as FLAC files.
+
+    Optionally copies matching .npz feature files from a source directory,
+    keeping FLAC/NPZ pairs in sync after the inspector has filtered clips.
+    """
+
+    @classmethod
+    def INPUT_TYPES(cls):
+        return {
+            "required": {
+                "dataset": (AUDIO_DATASET,),
+                "output_dir": ("STRING", {
+                    "default": "",
+                    "tooltip": "Absolute path to output folder. Created if it does not exist.",
+                }),
+            },
+            "optional": {
+                "npz_source_dir": ("STRING", {
+                    "default": "",
+                    "tooltip": "If set, copies {name}.npz from this folder alongside each saved FLAC. "
+                               "Missing NPZs are warned but do not abort the save.",
+                }),
+            },
+        }
+
+    RETURN_TYPES  = ("STRING",)
+    RETURN_NAMES  = ("report",)
+    OUTPUT_NODE   = True
+    FUNCTION      = "save"
+    CATEGORY      = SELVA_CATEGORY
+    DESCRIPTION   = (
+        "Save every clip in an AUDIO_DATASET to output_dir as FLAC. "
+        "If npz_source_dir is provided, copies the matching .npz file for each clip — "
+        "so rejected clips never get their NPZ copied."
+    )
+
+    def save(self, dataset, output_dir: str, npz_source_dir: str = ""):
+        import shutil
+        import soundfile as sf
+
+        out = Path(output_dir.strip())
+        out.mkdir(parents=True, exist_ok=True)
+
+        npz_src = Path(npz_source_dir.strip()) if npz_source_dir.strip() else None
+
+        saved = 0
+        npz_copied = 0
+        npz_missing = []
+
+        for item in dataset:
+            name   = item["name"]
+            wav    = item["waveform"][0]          # [C, L]
+            sr     = item["sample_rate"]
+
+            # soundfile wants [L] mono or [L, C] multichannel, float32
+            wav_np = wav.permute(1, 0).float().numpy()   # [L, C]
+            if wav_np.shape[1] == 1:
+                wav_np = wav_np[:, 0]                    # [L] mono
+
+            flac_path = out / f"{name}.flac"
+            sf.write(str(flac_path), wav_np, sr, subtype="PCM_24")
+            saved += 1
+
+            if npz_src is not None:
+                npz_path = npz_src / f"{name}.npz"
+                if npz_path.exists():
+                    shutil.copy2(str(npz_path), str(out / f"{name}.npz"))
+                    npz_copied += 1
+                else:
+                    npz_missing.append(name)
+
+        lines = [
+            f"[DatasetSaver] Saved {saved} clips → {out}",
+        ]
+        if npz_src is not None:
+            lines.append(f"  NPZ copied: {npz_copied}  missing: {len(npz_missing)}")
+            for n in npz_missing:
+                lines.append(f"    MISSING NPZ: {n}")
+
+        report = "\n".join(lines)
+        print(report, flush=True)
+        return (report,)