From 10a71b0c4f6cba4d537cbc3ac9410656ac5f7dce Mon Sep 17 00:00:00 2001
From: Ethanfel <ethan.fel@ts-pc.fr>
Date: Fri, 10 Apr 2026 00:56:13 +0200
Subject: [PATCH] fix: offload entire model to CPU in main thread before worker
 starts

The previous offload ran inside the worker thread, but by then ComfyUI
had already loaded the full model to GPU. Now feature_utils.to('cpu')
and generator.to('cpu') run in the main thread right after
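unload_all_models(), before the worker starts. mel_converter.to(device)
still runs right after the offload, and vocoder.to(device, dtype) is
called explicitly after inference-flag stripping in _do_train, so the
vocoder and the small mel_converter are the only modules this path puts
on the GPU.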

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 nodes/selva_bigvgan_trainer.py | 35 ++++++++++++++--------------------
 1 file changed, 14 insertions(+), 21 deletions(-)

diff --git a/nodes/selva_bigvgan_trainer.py b/nodes/selva_bigvgan_trainer.py
index f66d2cf..97eb3ce 100644
--- a/nodes/selva_bigvgan_trainer.py
+++ b/nodes/selva_bigvgan_trainer.py
@@ -783,12 +783,17 @@ class SelvaBigvganTrainer:
         # Unload all other ComfyUI models (SelVA generator, etc.) to free VRAM
         # before starting training. BigVGAN + discriminator need the headroom.
         comfy.model_management.unload_all_models()
+
+        # Move EVERYTHING to CPU first, then bring back only what we need.
+        # ComfyUI may have loaded the full model to GPU; unload_all_models
+        # doesn't always free model dicts passed between nodes.
+        feature_utils.to("cpu")
+        if "generator" in model:
+            model["generator"].to("cpu")
         soft_empty_cache()
 
         # Only move mel_converter to GPU — it's tiny and needed for training.
-        # The rest of feature_utils (CLIP, synchformer, T5, VAE) stays on CPU;
-        # _pregenerate_lora_mels handles its own device management for the parts
-        # it needs temporarily.
+        # _pregenerate_lora_mels handles its own device management for CLIP/tod.
         mel_converter.to(device)
 
         pbar = comfy.utils.ProgressBar(steps)
@@ -828,22 +833,6 @@ class SelvaBigvganTrainer:
                             "files with matching audio files."
                         )
 
-                # Offload heavy SelVA components to CPU — only vocoder + mel_converter
-                # are needed for training. CLIP, synchformer, T5, generator sit on
-                # GPU doing nothing and eat tens of GiB otherwise.
-                for attr in ("clip_model", "synchformer", "text_encoder_t5"):
-                    sub = getattr(feature_utils, attr, None)
-                    if sub is not None:
-                        sub.to("cpu")
-                if "generator" in model:
-                    model["generator"].to("cpu")
-                # tod contains VAE + vocoder; VAE not needed but vocoder is a
-                # submodule we're about to train — move just the VAE part.
-                tod = feature_utils.tod
-                if hasattr(tod, "vae"):
-                    tod.vae.to("cpu")
-                soft_empty_cache()
-
                 _result[0] = _do_train(
                     vocoder, mel_converter, clips,
                     device, dtype, strategy, feature_utils,
@@ -971,11 +960,15 @@ def _do_train(vocoder, mel_converter, clips,
             if buf is not None:
                 module._buffers[bname] = buf.clone()
 
-    # ── GAFilter injection (after inference-flag stripping) ──────────────────
+    # ── Move vocoder to training device/dtype ────────────────────────────────
+    # After cloning, vocoder may be on CPU (offloaded before training).
+    vocoder.to(device, dtype)
+
+    # ── GAFilter injection ─────────────────────────────────────────────────
     # GAFilter params are fresh tensors — no inference flag to strip.
     if use_gafilter:
         n_gaf = inject_gafilters(vocoder, gafilter_kernel_size)
-        vocoder.to(device, dtype)
+        vocoder.to(device, dtype)  # ensure new GAFilter params match
         print(f"[BigVGAN] GAFilter injected: {n_gaf} filters  kernel={gafilter_kernel_size}", flush=True)
 
     # ── Training mode: select which parameters to train ──────────────────────