From 32e5344ea242bc956787527ac7892946257ca419 Mon Sep 17 00:00:00 2001
From: Ethanfel <ethan.fel@ts-pc.fr>
Date: Fri, 10 Apr 2026 01:10:58 +0200
Subject: [PATCH] fix: wrap CLIP encoding in inference_mode during
 pre-generation

The CLIP weights are inference tensors, created when ComfyUI loads the
model. The pre-generation worker thread runs outside inference_mode, so
PyTorch rejects those tensors in multi_head_attention_forward (inference
tensors do not track a version counter). Wrap the encode_text_clip call
in torch.inference_mode(); text encoding doesn't need gradients.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 nodes/selva_bigvgan_trainer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/nodes/selva_bigvgan_trainer.py b/nodes/selva_bigvgan_trainer.py
index 97eb3ce..fa096db 100644
--- a/nodes/selva_bigvgan_trainer.py
+++ b/nodes/selva_bigvgan_trainer.py
@@ -529,7 +529,8 @@ def _pregenerate_lora_mels(model, data_dir, lora_adapter_path, device, dtype,
                 prompt = prompt_map.get(npz_path.name, data.get("prompt", default_prompt))
                 if isinstance(prompt, np.ndarray):
                     prompt = str(prompt)
-                text_clip = feature_utils.encode_text_clip([prompt]).to(device, dtype)
+                with torch.inference_mode():
+                    text_clip = feature_utils.encode_text_clip([prompt]).to(device, dtype)
 
                 # Load clean audio
                 try: