fix: offload entire model to CPU in main thread before worker starts

The previous offload ran inside the worker thread, but by then ComfyUI had already loaded the full model to the GPU. Now feature_utils.to('cpu') and generator.to('cpu') run in the main thread right after unload_all_models(), before the worker starts. In _do_train, vocoder.to(device, dtype) is called explicitly after inference-flag stripping to bring only the vocoder back to the GPU.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-10 00:56:13 +02:00
parent 37a27160aa
commit 10a71b0c4f
1 changed file with 14 additions and 21 deletions
@@ -783,12 +783,17 @@ class SelvaBigvganTrainer:
        # Unload all other ComfyUI models (SelVA generator, etc.) to free VRAM
        # before starting training. BigVGAN + discriminator need the headroom.
        comfy.model_management.unload_all_models()
+
+        # Move EVERYTHING to CPU first, then bring back only what we need.
+        # ComfyUI may have loaded the full model to GPU; unload_all_models
+        # doesn't always free model dicts passed between nodes.
+        feature_utils.to("cpu")
+        if "generator" in model:
+            model["generator"].to("cpu")
        soft_empty_cache()

        # Only move mel_converter to GPU — it's tiny and needed for training.
-        # The rest of feature_utils (CLIP, synchformer, T5, VAE) stays on CPU;
-        # _pregenerate_lora_mels handles its own device management for the parts
-        # it needs temporarily.
+        # _pregenerate_lora_mels handles its own device management for CLIP/tod.
        mel_converter.to(device)

        pbar = comfy.utils.ProgressBar(steps)
@@ -828,22 +833,6 @@ class SelvaBigvganTrainer:
                            "files with matching audio files."
                        )

-                # Offload heavy SelVA components to CPU — only vocoder + mel_converter
-                # are needed for training. CLIP, synchformer, T5, generator sit on
-                # GPU doing nothing and eat tens of GiB otherwise.
-                for attr in ("clip_model", "synchformer", "text_encoder_t5"):
-                    sub = getattr(feature_utils, attr, None)
-                    if sub is not None:
-                        sub.to("cpu")
-                if "generator" in model:
-                    model["generator"].to("cpu")
-                # tod contains VAE + vocoder; VAE not needed but vocoder is a
-                # submodule we're about to train — move just the VAE part.
-                tod = feature_utils.tod
-                if hasattr(tod, "vae"):
-                    tod.vae.to("cpu")
-                soft_empty_cache()
-
                _result[0] = _do_train(
                    vocoder, mel_converter, clips,
                    device, dtype, strategy, feature_utils,
@@ -971,11 +960,15 @@ def _do_train(vocoder, mel_converter, clips,
            if buf is not None:
                module._buffers[bname] = buf.clone()

-    # ── GAFilter injection (after inference-flag stripping) ──────────────────
+    # ── Move vocoder to training device/dtype ────────────────────────────────
+    # After cloning, vocoder may be on CPU (offloaded before training).
+    vocoder.to(device, dtype)
+
+    # ── GAFilter injection ─────────────────────────────────────────────────
    # GAFilter params are fresh tensors — no inference flag to strip.
    if use_gafilter:
        n_gaf = inject_gafilters(vocoder, gafilter_kernel_size)
-        vocoder.to(device, dtype)
+        vocoder.to(device, dtype)  # ensure new GAFilter params match
        print(f"[BigVGAN] GAFilter injected: {n_gaf} filters  kernel={gafilter_kernel_size}", flush=True)

    # ── Training mode: select which parameters to train ──────────────────────