From 32e5344ea242bc956787527ac7892946257ca419 Mon Sep 17 00:00:00 2001
From: Ethanfel
Date: Fri, 10 Apr 2026 01:10:58 +0200
Subject: [PATCH] fix: wrap CLIP encoding in inference_mode during pre-generation

CLIP weights are inference tensors from ComfyUI loading. The worker
thread runs without inference_mode, so PyTorch rejects inference
tensors in multi_head_attention_forward (version counter tracking).
Wrap the encode_text_clip call in torch.inference_mode() since text
encoding doesn't need gradients.

Co-Authored-By: Claude Opus 4.6
---
 nodes/selva_bigvgan_trainer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/nodes/selva_bigvgan_trainer.py b/nodes/selva_bigvgan_trainer.py
index 97eb3ce..fa096db 100644
--- a/nodes/selva_bigvgan_trainer.py
+++ b/nodes/selva_bigvgan_trainer.py
@@ -529,7 +529,8 @@ def _pregenerate_lora_mels(model, data_dir, lora_adapter_path, device, dtype,
         prompt = prompt_map.get(npz_path.name, data.get("prompt", default_prompt))
         if isinstance(prompt, np.ndarray):
             prompt = str(prompt)
-        text_clip = feature_utils.encode_text_clip([prompt]).to(device, dtype)
+        with torch.inference_mode():
+            text_clip = feature_utils.encode_text_clip([prompt]).to(device, dtype)
 
         # Load clean audio
         try: