From d06936802b5a0fcd9bb1ff0c631300ab1743fb68 Mon Sep 17 00:00:00 2001
From: Ethanfel
Date: Fri, 10 Apr 2026 00:10:52 +0200
Subject: [PATCH] fix: cast mel_converter buffers to float32 to match STFT input dtype

mel_basis and hann_window buffers inherit bfloat16 from model loading.
Since all mel_converter inputs are cast to float32 for cuFFT, the
internal buffers must also be float32 to avoid a matmul dtype mismatch.

Co-Authored-By: Claude Opus 4.6
---
 nodes/selva_bigvgan_trainer.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/nodes/selva_bigvgan_trainer.py b/nodes/selva_bigvgan_trainer.py
index b73c2e9..24d6425 100644
--- a/nodes/selva_bigvgan_trainer.py
+++ b/nodes/selva_bigvgan_trainer.py
@@ -808,9 +808,11 @@ def _do_train(vocoder, mel_converter, clips,
     clips = [c.clone() for c in clips]
 
     # 2. mel_converter buffers (mel_basis, hann_window) — same origin.
+    # Also cast to float32: mel_converter receives float32 audio (cuFFT
+    # requirement) so all internal buffers must match.
     for name, buf in list(mel_converter._buffers.items()):
         if buf is not None:
-            mel_converter._buffers[name] = buf.clone()
+            mel_converter._buffers[name] = buf.clone().float()
 
     # 3. Vocoder parameters are handled below with clone().detach().
     # ─────────────────────────────────────────────────────────────────────────