From 51ac099073c9de6727e447123f0256fe9f938b2b Mon Sep 17 00:00:00 2001 From: Ethanfel Date: Thu, 9 Apr 2026 02:09:26 +0200 Subject: [PATCH] =?UTF-8?q?fix:=20sanitize=20target=5Fflat=20=E2=80=94=20c?= =?UTF-8?q?lips=20are=20inference=20tensors=20from=20outer=20inference=5Fm?= =?UTF-8?q?ode?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The clips list is built inside ComfyUI's inference_mode context, so every element is an inference tensor. torch.stack().clone() propagates the flag. Use zeros+copy_ (same pattern as params/buffers) to get a normal tensor, so mel_converter(target_flat) inside no_grad produces a saveable input. Co-Authored-By: Claude Sonnet 4.6 --- nodes/selva_bigvgan_trainer.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/nodes/selva_bigvgan_trainer.py b/nodes/selva_bigvgan_trainer.py index 6f0061a..e27bfd6 100644 --- a/nodes/selva_bigvgan_trainer.py +++ b/nodes/selva_bigvgan_trainer.py @@ -268,8 +268,17 @@ class SelvaBigvganTrainer: start = random.randint(0, clip.shape[0] - segment_samples) batch.append(clip[start : start + segment_samples]) - target_flat = torch.stack(batch).to(device, dtype).clone() # [B, T] - target_wav = target_flat.unsqueeze(1) # [B, 1, T] + # clips were loaded in ComfyUI's outer inference_mode, so every + # element is an inference tensor. torch.stack().clone() is still + # an inference tensor (the flag propagates through all ops). + # Use zeros+copy_ to produce a genuine normal tensor. + _stacked = torch.stack(batch).to(device, dtype) + target_flat = torch.zeros( + _stacked.shape, device=device, dtype=dtype + ) + target_flat.copy_(_stacked) + del _stacked + target_wav = target_flat.unsqueeze(1) # [B, 1, T] # Fixed target mel — buffers are now normal tensors (sanitized # above), so torch.no_grad() correctly produces a non-inference,