diff --git a/nodes/selva_bigvgan_trainer.py b/nodes/selva_bigvgan_trainer.py index f66d2cf..97eb3ce 100644 --- a/nodes/selva_bigvgan_trainer.py +++ b/nodes/selva_bigvgan_trainer.py @@ -783,12 +783,17 @@ class SelvaBigvganTrainer: # Unload all other ComfyUI models (SelVA generator, etc.) to free VRAM # before starting training. BigVGAN + discriminator need the headroom. comfy.model_management.unload_all_models() + + # Move EVERYTHING to CPU first, then bring back only what we need. + # ComfyUI may have loaded the full model to GPU; unload_all_models + # doesn't always free model dicts passed between nodes. + feature_utils.to("cpu") + if "generator" in model: + model["generator"].to("cpu") soft_empty_cache() # Only move mel_converter to GPU — it's tiny and needed for training. - # The rest of feature_utils (CLIP, synchformer, T5, VAE) stays on CPU; - # _pregenerate_lora_mels handles its own device management for the parts - # it needs temporarily. + # _pregenerate_lora_mels handles its own device management for CLIP/tod. mel_converter.to(device) pbar = comfy.utils.ProgressBar(steps) @@ -828,22 +833,6 @@ class SelvaBigvganTrainer: "files with matching audio files." ) - # Offload heavy SelVA components to CPU — only vocoder + mel_converter - # are needed for training. CLIP, synchformer, T5, generator sit on - # GPU doing nothing and eat tens of GiB otherwise. - for attr in ("clip_model", "synchformer", "text_encoder_t5"): - sub = getattr(feature_utils, attr, None) - if sub is not None: - sub.to("cpu") - if "generator" in model: - model["generator"].to("cpu") - # tod contains VAE + vocoder; VAE not needed but vocoder is a - # submodule we're about to train — move just the VAE part. - tod = feature_utils.tod - if hasattr(tod, "vae"): - tod.vae.to("cpu") - soft_empty_cache() - _result[0] = _do_train( vocoder, mel_converter, clips, device, dtype, strategy, feature_utils, @@ -971,11 +960,15 @@ def _do_train(vocoder, mel_converter, clips, if buf is not None: module._buffers[bname] = buf.clone() - # ── GAFilter injection (after inference-flag stripping) ────────────────── + # ── Move vocoder to training device/dtype ──────────────────────────────── + # After cloning, vocoder may be on CPU (offloaded before training). + vocoder.to(device, dtype) + + # ── GAFilter injection ───────────────────────────────────────────────── # GAFilter params are fresh tensors — no inference flag to strip. if use_gafilter: n_gaf = inject_gafilters(vocoder, gafilter_kernel_size) - vocoder.to(device, dtype) + vocoder.to(device, dtype) # ensure new GAFilter params match print(f"[BigVGAN] GAFilter injected: {n_gaf} filters kernel={gafilter_kernel_size}", flush=True) # ── Training mode: select which parameters to train ──────────────────────