From 8bb2fb701534c3fc5c1267532be3e3df50f41183 Mon Sep 17 00:00:00 2001
From: Ethanfel <ethan.fel@ts-pc.fr>
Date: Sun, 5 Apr 2026 08:38:59 +0200
Subject: [PATCH] fix: extend OOM catch to decode/vocode, add (masked) to sync
 log line
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- selva_sampler: wrap decode+vocode in their own OOM catch — previously
  an OOM during mel decode or vocoding surfaced as a raw CUDA traceback
  instead of a RuntimeError carrying the actionable hint
- selva_feature_extractor: sync frames log line now shows (masked) when
  a mask is active, matching the CLIP log line

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 nodes/selva_feature_extractor.py |  2 +-
 nodes/selva_sampler.py           | 14 ++++++++++----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/nodes/selva_feature_extractor.py b/nodes/selva_feature_extractor.py
index 80a002e..5d68bdc 100644
--- a/nodes/selva_feature_extractor.py
+++ b/nodes/selva_feature_extractor.py
@@ -184,7 +184,7 @@ class SelvaFeatureExtractor:
             std  = _SYNC_STD.to(sync_frames.device)
             sync_frames = (sync_frames - mean) / std
             sync_input  = sync_frames.unsqueeze(0).to(device, dtype)          # [1, N, C, 224, 224]
-            print(f"[SelVA]   Sync frames: {sync_frames.shape[0]} @ {_SYNC_FPS}fps → 224px", flush=True)
+            print(f"[SelVA]   Sync frames: {sync_frames.shape[0]} @ {_SYNC_FPS}fps → 224px {'(masked)' if mask is not None else ''}", flush=True)
 
             # Encode T5 text + prepend supplementary tokens → text-conditioned sync features
             text_f, text_mask = feature_utils.encode_text_t5([prompt])           # [1, L, D], [1, L]
diff --git a/nodes/selva_sampler.py b/nodes/selva_sampler.py
index aa5fb83..73101a0 100644
--- a/nodes/selva_sampler.py
+++ b/nodes/selva_sampler.py
@@ -143,10 +143,16 @@ class SelvaSampler:
         print(f"[SelVA] latent stats: mean={x1.float().mean():.4f} std={x1.float().std():.4f}", flush=True)
 
         # Decode: latent → mel → audio
-        with torch.no_grad():
-            x1_unnorm = net_generator.unnormalize(x1)
-            spec  = feature_utils.decode(x1_unnorm)    # latent → mel spectrogram
-            audio = feature_utils.vocode(spec)          # mel → waveform
+        try:
+            with torch.no_grad():
+                x1_unnorm = net_generator.unnormalize(x1)
+                spec  = feature_utils.decode(x1_unnorm)    # latent → mel spectrogram
+                audio = feature_utils.vocode(spec)          # mel → waveform
+        except torch.cuda.OutOfMemoryError:
+            raise RuntimeError(
+                "[SelVA] CUDA out of memory during decode/vocode. Try switching offload_strategy "
+                "to 'offload_to_cpu', using a smaller variant, or reducing duration."
+            )
 
         if strategy == "offload_to_cpu":
             net_generator.to(get_offload_device())