From 8bb2fb701534c3fc5c1267532be3e3df50f41183 Mon Sep 17 00:00:00 2001 From: Ethanfel Date: Sun, 5 Apr 2026 08:38:59 +0200 Subject: [PATCH] fix: extend OOM catch to decode/vocode, add (masked) to sync log line MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - selva_sampler: wrap decode+vocode in their own OOM catch — previously OOM during mel decode or vocoding gave a raw CUDA traceback instead of the actionable hint - selva_feature_extractor: sync frames log line now shows (masked) when a mask is active, matching the CLIP log line Co-Authored-By: Claude Sonnet 4.6 --- nodes/selva_feature_extractor.py | 2 +- nodes/selva_sampler.py | 14 ++++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/nodes/selva_feature_extractor.py b/nodes/selva_feature_extractor.py index 80a002e..5d68bdc 100644 --- a/nodes/selva_feature_extractor.py +++ b/nodes/selva_feature_extractor.py @@ -184,7 +184,7 @@ class SelvaFeatureExtractor: std = _SYNC_STD.to(sync_frames.device) sync_frames = (sync_frames - mean) / std sync_input = sync_frames.unsqueeze(0).to(device, dtype) # [1, N, C, 224, 224] - print(f"[SelVA] Sync frames: {sync_frames.shape[0]} @ {_SYNC_FPS}fps → 224px", flush=True) + print(f"[SelVA] Sync frames: {sync_frames.shape[0]} @ {_SYNC_FPS}fps → 224px {'(masked)' if mask is not None else ''}", flush=True) # Encode T5 text + prepend supplementary tokens → text-conditioned sync features text_f, text_mask = feature_utils.encode_text_t5([prompt]) # [1, L, D], [1, L] diff --git a/nodes/selva_sampler.py b/nodes/selva_sampler.py index aa5fb83..73101a0 100644 --- a/nodes/selva_sampler.py +++ b/nodes/selva_sampler.py @@ -143,10 +143,16 @@ class SelvaSampler: print(f"[SelVA] latent stats: mean={x1.float().mean():.4f} std={x1.float().std():.4f}", flush=True) # Decode: latent → mel → audio - with torch.no_grad(): - x1_unnorm = net_generator.unnormalize(x1) - spec = feature_utils.decode(x1_unnorm) # latent → mel spectrogram - audio = feature_utils.vocode(spec) # mel → waveform + try: + with torch.no_grad(): + x1_unnorm = net_generator.unnormalize(x1) + spec = feature_utils.decode(x1_unnorm) # latent → mel spectrogram + audio = feature_utils.vocode(spec) # mel → waveform + except torch.cuda.OutOfMemoryError: + raise RuntimeError( + "[SelVA] CUDA out of memory during decode/vocode. Try switching offload_strategy " + "to 'offload_to_cpu', using a smaller variant, or reducing duration." + ) if strategy == "offload_to_cpu": net_generator.to(get_offload_device())