From d70c611bf7b4ff622aae770a1075718ac32476df Mon Sep 17 00:00:00 2001 From: Ethanfel Date: Fri, 10 Apr 2026 00:33:07 +0200 Subject: [PATCH] fix: offload CLIP, synchformer, T5, generator, VAE to CPU before training Only the vocoder and mel_converter are needed during BigVGAN training. The rest of the SelVA pipeline (CLIP ViT-H, synchformer, T5, generator, VAE) was staying on GPU and consuming ~90 GiB, leaving no room for backward pass activations. Now offloaded individually to CPU before the training loop starts. Co-Authored-By: Claude Opus 4.6 --- nodes/selva_bigvgan_trainer.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/nodes/selva_bigvgan_trainer.py b/nodes/selva_bigvgan_trainer.py index 8965aad..0ec9ee7 100644 --- a/nodes/selva_bigvgan_trainer.py +++ b/nodes/selva_bigvgan_trainer.py @@ -820,6 +820,22 @@ class SelvaBigvganTrainer: "files with matching audio files." ) + # Offload heavy SelVA components to CPU — only vocoder + mel_converter + # are needed for training. CLIP, synchformer, T5, generator sit on + # GPU doing nothing and eat tens of GiB otherwise. + for attr in ("clip_model", "synchformer", "text_encoder_t5"): + sub = getattr(feature_utils, attr, None) + if sub is not None: + sub.to("cpu") + if "generator" in model: + model["generator"].to("cpu") + # tod contains VAE + vocoder; VAE not needed but vocoder is a + # submodule we're about to train — move just the VAE part. + tod = feature_utils.tod + if hasattr(tod, "vae"): + tod.vae.to("cpu") + soft_empty_cache() + _result[0] = _do_train( vocoder, mel_converter, clips, device, dtype, strategy, feature_utils,