fix(bigvgan-trainer): fallback to soundfile when torchaudio ffmpeg backend fails

A torchcodec/libavutil soname mismatch causes torchaudio to fail on every
file load, silently leaving the clip list empty. Add _load_wav(), which tries
torchaudio first and then falls back to soundfile (handles wav/flac without ffmpeg).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-09 01:41:59 +02:00
parent 81ff0d46c9
commit ea7dfed27a
+18 -1
View File
@@ -22,6 +22,23 @@ import folder_paths
from .utils import SELVA_CATEGORY, get_device, soft_empty_cache
def _load_wav(path):
    """Load an audio file and return a ``(waveform, sample_rate)`` pair.

    Tries ``torchaudio.load`` first. If that raises for any reason (e.g. the
    ffmpeg/torchcodec backend is unavailable because of a libavutil soname
    mismatch), falls back to ``soundfile``, which decodes wav/flac/ogg
    natively without ffmpeg.

    Args:
        path: Location of the audio file; converted with ``str()`` before
            being handed to either loader.

    Returns:
        On the primary path, whatever ``torchaudio.load`` returns. On the
        fallback path, ``(wav, sr)`` where ``wav`` is a contiguous float32
        tensor laid out as [channels, samples] and ``sr`` is the sample rate
        reported by soundfile.

    Raises:
        Exception: Whatever the soundfile fallback raises (including
            ``ImportError`` when soundfile is not installed). The original
            torchaudio failure is attached as ``__cause__`` so the root
            cause is not lost.
    """
    try:
        return torchaudio.load(str(path))
    except Exception as err:
        # Deliberately broad: a broken backend can surface as many different
        # exception types. Keep the error so it can be chained below instead
        # of being silently discarded.
        primary_err = err

    try:
        # Imported lazily so soundfile is only required when the fallback
        # is actually needed.
        import soundfile as sf

        data, sr = sf.read(str(path), dtype="float32", always_2d=True)
    except Exception as fallback_err:
        # Same exception type as before, now carrying the torchaudio error.
        raise fallback_err from primary_err

    # Transpose to [channels, samples]; .contiguous() materialises the
    # transposed view so downstream ops that need dense memory work.
    wav = torch.from_numpy(data.T).contiguous()
    return wav, sr
# Multi-resolution STFT windows — same three resolutions as BigVGAN discriminator config.
_STFT_RESOLUTIONS = [
(1024, 120, 600),
@@ -143,7 +160,7 @@ class SelvaBigvganTrainer:
clips = []
for af in audio_files:
try:
wav, sr = torchaudio.load(str(af))
wav, sr = _load_wav(af)
if wav.shape[0] > 1:
wav = wav.mean(0, keepdim=True)
if sr != sample_rate: