From bd84242fa12f0722d1df9bb21543469f0e240851 Mon Sep 17 00:00:00 2001 From: Ethanfel Date: Fri, 10 Apr 2026 01:28:31 +0200 Subject: [PATCH] debug: add VRAM logging at offload and training checkpoints Logs torch.cuda.memory_allocated/reserved at each step: before unload, after unload_all_models, after feature_utils.to(cpu), after generator to(cpu), after cache clear, after mel_converter to(device), and before training loop. This will identify what's holding VRAM. Co-Authored-By: Claude Opus 4.6 --- nodes/selva_bigvgan_trainer.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/nodes/selva_bigvgan_trainer.py b/nodes/selva_bigvgan_trainer.py index d8e6f81..c26455e 100644 --- a/nodes/selva_bigvgan_trainer.py +++ b/nodes/selva_bigvgan_trainer.py @@ -772,19 +772,32 @@ class SelvaBigvganTrainer: # Unload all other ComfyUI models (SelVA generator, etc.) to free VRAM # before starting training. BigVGAN + discriminator need the headroom. + def _vram_log(label): + if device.type == "cuda": + alloc = torch.cuda.memory_allocated(device) / (1024**3) + resrv = torch.cuda.memory_reserved(device) / (1024**3) + print(f"[BigVGAN VRAM] {label}: {alloc:.2f} GiB allocated, " + f"{resrv:.2f} GiB reserved", flush=True) + + _vram_log("before unload") comfy.model_management.unload_all_models() + _vram_log("after unload_all_models") # Move EVERYTHING to CPU first, then bring back only what we need. # ComfyUI may have loaded the full model to GPU; unload_all_models # doesn't always free model dicts passed between nodes. feature_utils.to("cpu") + _vram_log("after feature_utils.to(cpu)") if "generator" in model: model["generator"].to("cpu") + _vram_log("after generator.to(cpu)") soft_empty_cache() + _vram_log("after soft_empty_cache") # Only move mel_converter to GPU — it's tiny and needed for training. # _pregenerate_lora_mels handles its own device management for CLIP/tod. mel_converter.to(device) + _vram_log("after mel_converter.to(device)") # Pre-compute text CLIP embeddings in the main thread. # CLIP weights are inference tensors from ComfyUI loading — they only @@ -1070,6 +1083,13 @@ def _do_train(vocoder, mel_converter, clips, f"falling back to mel+STFT losses", flush=True) mpd = mrd = None + # VRAM snapshot before training loop + if device.type == "cuda": + alloc = torch.cuda.memory_allocated(device) / (1024**3) + resrv = torch.cuda.memory_reserved(device) / (1024**3) + print(f"[BigVGAN VRAM] before training: {alloc:.2f} GiB allocated, " + f"{resrv:.2f} GiB reserved", flush=True) + optimizer = torch.optim.AdamW(trainable_params, lr=lr, betas=(0.8, 0.99)) vocoder.train()