fix: extend OOM catch to decode/vocode, add (masked) to sync log line
- selva_sampler: wrap decode + vocode in their own OOM catch. Previously, an OOM during mel decode or vocoding gave a raw CUDA traceback instead of the actionable hint.
- selva_feature_extractor: the sync frames log line now shows (masked) when a mask is active, matching the CLIP log line.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -184,7 +184,7 @@ class SelvaFeatureExtractor:
|
|||||||
std = _SYNC_STD.to(sync_frames.device)
|
std = _SYNC_STD.to(sync_frames.device)
|
||||||
sync_frames = (sync_frames - mean) / std
|
sync_frames = (sync_frames - mean) / std
|
||||||
sync_input = sync_frames.unsqueeze(0).to(device, dtype) # [1, N, C, 224, 224]
|
sync_input = sync_frames.unsqueeze(0).to(device, dtype) # [1, N, C, 224, 224]
|
||||||
print(f"[SelVA] Sync frames: {sync_frames.shape[0]} @ {_SYNC_FPS}fps → 224px", flush=True)
|
print(f"[SelVA] Sync frames: {sync_frames.shape[0]} @ {_SYNC_FPS}fps → 224px {'(masked)' if mask is not None else ''}", flush=True)
|
||||||
|
|
||||||
# Encode T5 text + prepend supplementary tokens → text-conditioned sync features
|
# Encode T5 text + prepend supplementary tokens → text-conditioned sync features
|
||||||
text_f, text_mask = feature_utils.encode_text_t5([prompt]) # [1, L, D], [1, L]
|
text_f, text_mask = feature_utils.encode_text_t5([prompt]) # [1, L, D], [1, L]
|
||||||
|
|||||||
+10
-4
@@ -143,10 +143,16 @@ class SelvaSampler:
|
|||||||
print(f"[SelVA] latent stats: mean={x1.float().mean():.4f} std={x1.float().std():.4f}", flush=True)
|
print(f"[SelVA] latent stats: mean={x1.float().mean():.4f} std={x1.float().std():.4f}", flush=True)
|
||||||
|
|
||||||
# Decode: latent → mel → audio
|
# Decode: latent → mel → audio
|
||||||
with torch.no_grad():
|
try:
|
||||||
x1_unnorm = net_generator.unnormalize(x1)
|
with torch.no_grad():
|
||||||
spec = feature_utils.decode(x1_unnorm) # latent → mel spectrogram
|
x1_unnorm = net_generator.unnormalize(x1)
|
||||||
audio = feature_utils.vocode(spec) # mel → waveform
|
spec = feature_utils.decode(x1_unnorm) # latent → mel spectrogram
|
||||||
|
audio = feature_utils.vocode(spec) # mel → waveform
|
||||||
|
except torch.cuda.OutOfMemoryError:
|
||||||
|
raise RuntimeError(
|
||||||
|
"[SelVA] CUDA out of memory during decode/vocode. Try switching offload_strategy "
|
||||||
|
"to 'offload_to_cpu', using a smaller variant, or reducing duration."
|
||||||
|
)
|
||||||
|
|
||||||
if strategy == "offload_to_cpu":
|
if strategy == "offload_to_cpu":
|
||||||
net_generator.to(get_offload_device())
|
net_generator.to(get_offload_device())
|
||||||
|
|||||||
Reference in New Issue
Block a user