From d70c611bf7b4ff622aae770a1075718ac32476df Mon Sep 17 00:00:00 2001
From: Ethanfel <ethan.fel@ts-pc.fr>
Date: Fri, 10 Apr 2026 00:33:07 +0200
Subject: [PATCH] fix: offload CLIP, synchformer, T5, generator, VAE to CPU
 before training

Only the vocoder and mel_converter are needed during BigVGAN training.
The rest of the SelVA pipeline (CLIP ViT-H, synchformer, T5, generator,
VAE) was staying on GPU and consuming ~90 GiB, leaving no room for
backward pass activations. These components are now offloaded
individually to CPU before the training loop starts.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 nodes/selva_bigvgan_trainer.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/nodes/selva_bigvgan_trainer.py b/nodes/selva_bigvgan_trainer.py
index 8965aad..0ec9ee7 100644
--- a/nodes/selva_bigvgan_trainer.py
+++ b/nodes/selva_bigvgan_trainer.py
@@ -820,6 +820,22 @@ class SelvaBigvganTrainer:
                             "files with matching audio files."
                         )
 
+                # Offload heavy SelVA components to CPU — only vocoder + mel_converter
+                # are needed for training. CLIP, synchformer, T5, generator and the
+                # VAE would otherwise sit idle on the GPU and eat tens of GiB.
+                for attr in ("clip_model", "synchformer", "text_encoder_t5"):
+                    sub = getattr(feature_utils, attr, None)
+                    if sub is not None:
+                        sub.to("cpu")
+                if "generator" in model:
+                    model["generator"].to("cpu")
+                # tod contains VAE + vocoder; VAE not needed but vocoder is a
+                # submodule we're about to train — move just the VAE part.
+                tod = feature_utils.tod
+                if hasattr(tod, "vae"):
+                    tod.vae.to("cpu")
+                soft_empty_cache()
+
                 _result[0] = _do_train(
                     vocoder, mel_converter, clips,
                     device, dtype, strategy, feature_utils,