feat: comprehensive node improvements

Model Loader:
- bf16 support check — auto-falls back to fp16 on unsupported GPUs
- DESCRIPTION and OUTPUT_TOOLTIPS

Feature Extractor:
- Store variant in features dict and .npz cache
- Progress bar (3 steps: CLIP encode, T5 encode, sync encode)
- Expand cache hash to 32 hex chars
- DESCRIPTION and OUTPUT_TOOLTIPS

Sampler:
- Variant mismatch validation against extracted features
- Cancellation support via throw_exception_if_processing_interrupted()
- OOM catch with actionable error message
- normalize toggle (optional BOOLEAN, default true) for peak normalization
- Remove empty optional: {} block
- DESCRIPTION and OUTPUT_TOOLTIPS

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-04 18:16:03 +02:00
parent 429810db5b
commit bd53744e2d
3 changed files with 51 additions and 7 deletions
@@ -1,5 +1,6 @@
 import torch
 import comfy.utils
+import comfy.model_management

 from .utils import SELVA_CATEGORY, get_device, get_offload_device, soft_empty_cache

@@ -29,15 +30,22 @@ class SelvaSampler:
                                           "tooltip": "Classifier-free guidance scale. Higher values follow the prompt more strictly but can introduce artifacts. SelVA default is 4.5; useful range is roughly 3–7."}),
                "seed":     ("INT",   {"default": 0,   "min": 0,   "max": 0xFFFFFFFF}),
            },
-            "optional": {},
+            "optional": {
+                "normalize": ("BOOLEAN", {
+                    "default": True,
+                    "tooltip": "Peak-normalize output to [-1, 1]. Disable to preserve the raw decoder output level.",
+                }),
+            },
        }

    RETURN_TYPES = ("AUDIO",)
    RETURN_NAMES = ("audio",)
+    OUTPUT_TOOLTIPS = ("Generated audio waveform — connect to VHS_VideoCombine or Save Audio.",)
    FUNCTION = "generate"
    CATEGORY = SELVA_CATEGORY
+    DESCRIPTION = "Generates audio from video features using SelVA's flow matching ODE. Supports text prompts and negative prompts via classifier-free guidance."

-    def generate(self, model, features, prompt, negative_prompt, duration, steps, cfg_strength, seed):
+    def generate(self, model, features, prompt, negative_prompt, duration, steps, cfg_strength, seed, normalize=True):
        import dataclasses
        from selva_core.model.flow_matching import FlowMatching

@@ -48,6 +56,14 @@ class SelvaSampler:
        feature_utils = model["feature_utils"]
        mode          = model["mode"]

+        # Validate that features were extracted with the same model variant
+        feat_variant = features.get("variant")
+        if feat_variant is not None and feat_variant != model["variant"]:
+            raise ValueError(
+                f"[SelVA] Variant mismatch: features were extracted with '{feat_variant}' "
+                f"but model is '{model['variant']}'. Re-run the Feature Extractor with the current model."
+            )
+
        # Resolve prompt: use override if given, otherwise fall back to features prompt
        if not prompt or not prompt.strip():
            prompt = features.get("prompt", "")
@@ -112,10 +128,17 @@ class SelvaSampler:
            pbar = comfy.utils.ProgressBar(steps)

            def ode_wrapper_tracked(t, x):
+                comfy.model_management.throw_exception_if_processing_interrupted()
                pbar.update(1)
                return net_generator.ode_wrapper(t, x, conditions, empty_conditions, cfg_strength)

-            x1 = fm.to_data(ode_wrapper_tracked, x0)
+            try:
+                x1 = fm.to_data(ode_wrapper_tracked, x0)
+            except torch.cuda.OutOfMemoryError:
+                raise RuntimeError(
+                    "[SelVA] CUDA out of memory during generation. Try switching offload_strategy "
+                    "to 'offload_to_cpu', using a smaller variant, or reducing duration."
+                )

        print(f"[SelVA] latent stats: mean={x1.float().mean():.4f} std={x1.float().std():.4f}", flush=True)

@@ -137,8 +160,9 @@ class SelvaSampler:
        elif audio.dim() == 3 and audio.shape[1] != 1:
            audio = audio.mean(dim=1, keepdim=True)  # stereo → mono

-        peak = audio.abs().max().clamp(min=1e-8)
-        audio = (audio / peak).clamp(-1, 1)
+        if normalize:
+            peak = audio.abs().max().clamp(min=1e-8)
+            audio = (audio / peak).clamp(-1, 1)
        print(f"[SelVA] audio: shape={tuple(audio.shape)} sr={sample_rate}", flush=True)

        return ({"waveform": audio.cpu(), "sample_rate": sample_rate},)