From ea7dfed27a640710e4785ffe7626c9987ec8e8d0 Mon Sep 17 00:00:00 2001 From: Ethanfel Date: Thu, 9 Apr 2026 01:41:59 +0200 Subject: [PATCH] fix(bigvgan-trainer): fallback to soundfile when torchaudio ffmpeg backend fails torchcodec/libavutil soname mismatch causes torchaudio to fail on every file load, silently emptying clips. Add _load_wav() that tries torchaudio first then falls back to soundfile (handles wav/flac without ffmpeg). Co-Authored-By: Claude Sonnet 4.6 --- nodes/selva_bigvgan_trainer.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/nodes/selva_bigvgan_trainer.py b/nodes/selva_bigvgan_trainer.py index e5c238f..ba1146f 100644 --- a/nodes/selva_bigvgan_trainer.py +++ b/nodes/selva_bigvgan_trainer.py @@ -22,6 +22,23 @@ import folder_paths from .utils import SELVA_CATEGORY, get_device, soft_empty_cache +def _load_wav(path): + """Load audio file to [channels, samples] float32 tensor. + + Tries torchaudio first; falls back to soundfile for wav/flac when the + ffmpeg/torchcodec backend is unavailable (e.g. libavutil soname mismatch). + """ + try: + return torchaudio.load(str(path)) + except Exception: + pass + # soundfile fallback — handles wav, flac, ogg natively without ffmpeg + import soundfile as sf + data, sr = sf.read(str(path), dtype="float32", always_2d=True) + wav = torch.from_numpy(data.T) # [channels, samples] + return wav, sr + + # Multi-resolution STFT windows — same three resolutions as BigVGAN discriminator config. _STFT_RESOLUTIONS = [ (1024, 120, 600), @@ -143,7 +160,7 @@ class SelvaBigvganTrainer: clips = [] for af in audio_files: try: - wav, sr = torchaudio.load(str(af)) + wav, sr = _load_wav(af) if wav.shape[0] > 1: wav = wav.mean(0, keepdim=True) if sr != sample_rate: