diff --git a/nodes/selva_bigvgan_trainer.py b/nodes/selva_bigvgan_trainer.py
index d8e6f81..c26455e 100644
--- a/nodes/selva_bigvgan_trainer.py
+++ b/nodes/selva_bigvgan_trainer.py
@@ -772,19 +772,32 @@ class SelvaBigvganTrainer:
 
         # Unload all other ComfyUI models (SelVA generator, etc.) to free VRAM
         # before starting training. BigVGAN + discriminator need the headroom.
+        def _vram_log(label):  # debug helper: print CUDA memory usage tagged with `label`; `device` comes from the enclosing scope
+            if device.type == "cuda":  # nothing is printed for non-CUDA devices
+                alloc = torch.cuda.memory_allocated(device) / (1024**3)  # bytes -> GiB
+                resrv = torch.cuda.memory_reserved(device) / (1024**3)  # bytes -> GiB
+                print(f"[BigVGAN VRAM] {label}: {alloc:.2f} GiB allocated, "
+                      f"{resrv:.2f} GiB reserved", flush=True)  # flush so the line is emitted immediately
+
+        _vram_log("before unload")
         comfy.model_management.unload_all_models()
+        _vram_log("after unload_all_models")
 
         # Move EVERYTHING to CPU first, then bring back only what we need.
         # ComfyUI may have loaded the full model to GPU; unload_all_models
         # doesn't always free model dicts passed between nodes.
         feature_utils.to("cpu")
+        _vram_log("after feature_utils.to(cpu)")
         if "generator" in model:
             model["generator"].to("cpu")
+            _vram_log("after generator.to(cpu)")
         soft_empty_cache()
+        _vram_log("after soft_empty_cache")
 
         # Only move mel_converter to GPU — it's tiny and needed for training.
         # _pregenerate_lora_mels handles its own device management for CLIP/tod.
         mel_converter.to(device)
+        _vram_log("after mel_converter.to(device)")
 
         # Pre-compute text CLIP embeddings in the main thread.
         # CLIP weights are inference tensors from ComfyUI loading — they only
@@ -1070,6 +1083,13 @@ def _do_train(vocoder, mel_converter, clips,
                   f"falling back to mel+STFT losses", flush=True)
             mpd = mrd = None
 
+    # VRAM snapshot before training loop
+    if device.type == "cuda":
+        alloc = torch.cuda.memory_allocated(device) / (1024**3)
+        resrv = torch.cuda.memory_reserved(device) / (1024**3)
+        print(f"[BigVGAN VRAM] before training: {alloc:.2f} GiB allocated, "
+              f"{resrv:.2f} GiB reserved", flush=True)
+
     optimizer = torch.optim.AdamW(trainable_params, lr=lr, betas=(0.8, 0.99))
     vocoder.train()