fix: use soundfile for WAV/FLAC/OGG to bypass torchcodec/FFmpeg dependency
torchaudio was defaulting to the torchcodec backend which requires FFmpeg shared libraries not present in the ComfyUI venv, silently skipping every clip and producing an empty dataset. Also add experiments/vocoder_finetune.json for the BJ vocoder LoRA run (lr=3e-4, rank=128, 10k steps). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,30 @@
|
||||
{
|
||||
"name": "vocoder_finetune",
|
||||
"description": "Single run with fine-tuned BJ BigVGAN vocoder injected. Validates vocoder integration with LoRA training. Best known config: lr=3e-4, rank=128.",
|
||||
"data_dir": "/media/unraid/davinci/Selva/BJ/features",
|
||||
"output_root": "/media/unraid/davinci/Selva/BJ/experiment/vocoder_finetune",
|
||||
"base": {
|
||||
"steps": 10000,
|
||||
"rank": 128,
|
||||
"alpha": 0.0,
|
||||
"lr": 3e-4,
|
||||
"batch_size": 16,
|
||||
"warmup_steps": 200,
|
||||
"grad_accum": 1,
|
||||
"save_every": 2000,
|
||||
"seed": 42,
|
||||
"target": "attn.qkv",
|
||||
"timestep_mode": "uniform",
|
||||
"logit_normal_sigma": 1.0,
|
||||
"curriculum_switch": 0.6,
|
||||
"lora_dropout": 0.0,
|
||||
"lora_plus_ratio": 1.0,
|
||||
"lr_schedule": "constant"
|
||||
},
|
||||
"experiments": [
|
||||
{
|
||||
"id": "r128_lr_3e4_bj_vocoder",
|
||||
"description": "lr=3e-4 rank=128 with fine-tuned BJ BigVGAN vocoder. Direct comparison baseline against previous best g1_r128_lr_3e4."
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -24,6 +24,19 @@ from .utils import SELVA_CATEGORY
|
||||
AUDIO_DATASET = "AUDIO_DATASET"
|
||||
|
||||
_AUDIO_EXTS = {".wav", ".flac", ".mp3", ".ogg", ".aac", ".m4a"}
|
||||
_SOUNDFILE_EXTS = {".wav", ".flac", ".ogg"} # handled natively without FFmpeg
|
||||
|
||||
|
||||
def _load_audio(path: Path):
|
||||
"""Load audio file. Uses soundfile for WAV/FLAC/OGG to avoid torchcodec/FFmpeg issues."""
|
||||
if path.suffix.lower() in _SOUNDFILE_EXTS:
|
||||
import soundfile as sf
|
||||
wav_np, sr = sf.read(str(path), dtype="float32", always_2d=True) # [L, C]
|
||||
wav = torch.from_numpy(wav_np).T.unsqueeze(0) # [1, C, L]
|
||||
else:
|
||||
wav, sr = torchaudio.load(str(path)) # [C, L]
|
||||
wav = wav.unsqueeze(0).float() # [1, C, L]
|
||||
return wav, sr
|
||||
|
||||
|
||||
class SelvaDatasetLoader:
|
||||
@@ -58,8 +71,7 @@ class SelvaDatasetLoader:
|
||||
dataset = []
|
||||
for f in sorted(files):
|
||||
try:
|
||||
wav, sr = torchaudio.load(str(f)) # [C, L]
|
||||
wav = wav.unsqueeze(0).float() # [1, C, L]
|
||||
wav, sr = _load_audio(f)
|
||||
dataset.append({"waveform": wav, "sample_rate": sr, "name": f.stem})
|
||||
except Exception as e:
|
||||
print(f"[DatasetLoader] Skipping {f.name}: {e}", flush=True)
|
||||
|
||||
Reference in New Issue
Block a user