From 10a71b0c4f6cba4d537cbc3ac9410656ac5f7dce Mon Sep 17 00:00:00 2001 From: Ethanfel Date: Fri, 10 Apr 2026 00:56:13 +0200 Subject: [PATCH] fix: offload entire model to CPU in main thread before worker starts The previous offload ran inside the worker thread, but by then ComfyUI had already loaded the full model to GPU. Now feature_utils.to('cpu') and generator.to('cpu') run in the main thread right after unload_all_models(), before the worker starts. vocoder.to(device, dtype) is called explicitly after inference flag stripping in _do_train to bring only the vocoder back to GPU. Co-Authored-By: Claude Opus 4.6 --- nodes/selva_bigvgan_trainer.py | 35 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/nodes/selva_bigvgan_trainer.py b/nodes/selva_bigvgan_trainer.py index f66d2cf..97eb3ce 100644 --- a/nodes/selva_bigvgan_trainer.py +++ b/nodes/selva_bigvgan_trainer.py @@ -783,12 +783,17 @@ class SelvaBigvganTrainer: # Unload all other ComfyUI models (SelVA generator, etc.) to free VRAM # before starting training. BigVGAN + discriminator need the headroom. comfy.model_management.unload_all_models() + + # Move EVERYTHING to CPU first, then bring back only what we need. + # ComfyUI may have loaded the full model to GPU; unload_all_models + # doesn't always free model dicts passed between nodes. + feature_utils.to("cpu") + if "generator" in model: + model["generator"].to("cpu") soft_empty_cache() # Only move mel_converter to GPU — it's tiny and needed for training. - # The rest of feature_utils (CLIP, synchformer, T5, VAE) stays on CPU; - # _pregenerate_lora_mels handles its own device management for the parts - # it needs temporarily. + # _pregenerate_lora_mels handles its own device management for CLIP/tod. mel_converter.to(device) pbar = comfy.utils.ProgressBar(steps) @@ -828,22 +833,6 @@ class SelvaBigvganTrainer: "files with matching audio files." ) - # Offload heavy SelVA components to CPU — only vocoder + mel_converter - # are needed for training. CLIP, synchformer, T5, generator sit on - # GPU doing nothing and eat tens of GiB otherwise. - for attr in ("clip_model", "synchformer", "text_encoder_t5"): - sub = getattr(feature_utils, attr, None) - if sub is not None: - sub.to("cpu") - if "generator" in model: - model["generator"].to("cpu") - # tod contains VAE + vocoder; VAE not needed but vocoder is a - # submodule we're about to train — move just the VAE part. - tod = feature_utils.tod - if hasattr(tod, "vae"): - tod.vae.to("cpu") - soft_empty_cache() - _result[0] = _do_train( vocoder, mel_converter, clips, device, dtype, strategy, feature_utils, @@ -971,11 +960,15 @@ def _do_train(vocoder, mel_converter, clips, if buf is not None: module._buffers[bname] = buf.clone() - # ── GAFilter injection (after inference-flag stripping) ────────────────── + # ── Move vocoder to training device/dtype ──────────────────────────────── + # After cloning, vocoder may be on CPU (offloaded before training). + vocoder.to(device, dtype) + + # ── GAFilter injection ───────────────────────────────────────────────── # GAFilter params are fresh tensors — no inference flag to strip. if use_gafilter: n_gaf = inject_gafilters(vocoder, gafilter_kernel_size) - vocoder.to(device, dtype) + vocoder.to(device, dtype) # ensure new GAFilter params match print(f"[BigVGAN] GAFilter injected: {n_gaf} filters kernel={gafilter_kernel_size}", flush=True) # ── Training mode: select which parameters to train ──────────────────────