chore: remove debug VRAM logging

Training confirmed working — VRAM usage is normal backward-pass activation memory, not a leak. Removed the debug _vram_log and _vram helpers, all of their call sites, and the inline pre-training VRAM snapshot. Kept the video_enc offload and torch.cuda.empty_cache fixes.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-10 01:50:08 +02:00
parent 4297715a08
commit 4226297735
1 changed files with 5 additions and 37 deletions
@@ -772,37 +772,21 @@ class SelvaBigvganTrainer:
        # Unload all other ComfyUI models (SelVA generator, etc.) to free VRAM
        # before starting training. BigVGAN + discriminator need the headroom.
        def _vram_log(label):
            """Print a one-line VRAM report for ``device``, tagged with ``label``.

            Reports ``torch.cuda.memory_allocated``, ``torch.cuda.memory_reserved``
            and the driver-level usage (total minus free from
            ``torch.cuda.mem_get_info``), all converted to GiB. Does nothing
            when ``device`` is not a CUDA device.
            """
            if device.type != "cuda":
                return
            gib = 1024**3
            allocated_gib = torch.cuda.memory_allocated(device) / gib
            reserved_gib = torch.cuda.memory_reserved(device) / gib
            # mem_get_info returns (free, total) in bytes; the difference is
            # what the driver currently considers in use on this device.
            free_bytes, total_bytes = torch.cuda.mem_get_info(device)
            driver_used_gib = (total_bytes - free_bytes) / gib
            message = (
                f"[BigVGAN VRAM] {label}: alloc={allocated_gib:.2f} reserved={reserved_gib:.2f} "
                f"driver_used={driver_used_gib:.2f} GiB"
            )
            print(message, flush=True)
        _vram_log("before unload")
        comfy.model_management.unload_all_models()
        _vram_log("after unload_all_models")
        # Move EVERYTHING to CPU first, then bring back only what we need.
        # ComfyUI may have loaded the full model to GPU; unload_all_models
        # doesn't always free model dicts passed between nodes.
        feature_utils.to("cpu")
        _vram_log("after feature_utils.to(cpu)")
        if "generator" in model:
            model["generator"].to("cpu")
            _vram_log("after generator.to(cpu)")
        if "video_enc" in model:
            model["video_enc"].to("cpu")
            _vram_log("after video_enc.to(cpu)")
        soft_empty_cache()
        _vram_log("after soft_empty_cache")
        # Only move mel_converter to GPU — it's tiny and needed for training.
        # _pregenerate_lora_mels handles its own device management for CLIP/tod.
        mel_converter.to(device)
        _vram_log("after mel_converter.to(device)")
        # Pre-compute text CLIP embeddings in the main thread.
        # CLIP weights are inference tensors from ComfyUI loading — they only
@@ -1094,17 +1078,6 @@ def _do_train(vocoder, mel_converter, clips,
                  f"falling back to mel+STFT losses", flush=True)
            mpd = mrd = None
    # VRAM snapshot before training loop
    if device.type == "cuda":
        alloc = torch.cuda.memory_allocated(device) / (1024**3)
        resrv = torch.cuda.memory_reserved(device) / (1024**3)
        free_cuda, total_cuda = torch.cuda.mem_get_info(device)
        used_driver = (total_cuda - free_cuda) / (1024**3)
        print(f"[BigVGAN VRAM] before training: "
              f"pytorch_alloc={alloc:.2f} GiB, pytorch_reserved={resrv:.2f} GiB, "
              f"driver_used={used_driver:.2f} GiB, driver_total={total_cuda/(1024**3):.2f} GiB",
              flush=True)
    optimizer = torch.optim.AdamW(trainable_params, lr=lr, betas=(0.8, 0.99))
    vocoder.train()
@@ -1126,11 +1099,6 @@ def _do_train(vocoder, mel_converter, clips,
        print(f"[BigVGAN] LoRA mel cropping: {_mel_segment} mel frames "
              f"per {segment_samples} audio samples", flush=True)
    def _vram(label):
        """Print allocated CUDA memory in GiB, tagged with ``label``.

        Only prints while the enclosing ``step`` is below 1 and ``device`` is
        a CUDA device; otherwise it does nothing.
        """
        if device.type != "cuda" or step >= 1:
            return
        allocated_gib = torch.cuda.memory_allocated(device) / (1024**3)
        print(f"  [VRAM step0] {label}: {allocated_gib:.2f} GiB", flush=True)
    try:
        for step in range(steps):
            if lora_mel_pairs:
@@ -1173,7 +1141,7 @@ def _do_train(vocoder, mel_converter, clips,
            # Clean target mel for mel loss (always from clean audio)
            with torch.no_grad():
                target_mel = mel_converter(target_flat.float())       # [B, n_mels, T_mel]
-            _vram("after target_mel")
+
            # Gradient checkpointing: recompute BigVGAN activations during
            # backward instead of storing them. The 512x upsampling stack
@@ -1183,14 +1151,14 @@ def _do_train(vocoder, mel_converter, clips,
            pred_wav = torch.utils.checkpoint.checkpoint(
                vocoder, input_mel.to(dtype), use_reentrant=False
            )                                                     # [B, 1, T_wav]
-            _vram("after vocoder forward")
+
            T = min(pred_wav.shape[-1], target_wav.shape[-1])
            pred_t   = pred_wav[...,  :T]
            target_t = target_wav[..., :T]
            # ── Compute loss ─────────────────────────────────────────────────
-            _vram("before loss")
+
            if mpd is not None and mrd is not None:
                # Perceptual feature matching via frozen discriminators
                with torch.no_grad():
@@ -1236,10 +1204,10 @@ def _do_train(vocoder, mel_converter, clips,
                l2sp_loss = l2sp_loss * lambda_l2sp
            loss = primary_loss + l2sp_loss
-            _vram("after loss computation")
+
            optimizer.zero_grad()
            loss.backward()
-            _vram("after backward")
+
            torch.nn.utils.clip_grad_norm_(trainable_params, 1.0)
            optimizer.step()