feat: add SelVA Textual Inversion Trainer and Loader nodes

Learns K CLIP token embeddings ([K, 1024]) with all model weights frozen,
keeping generated latents on the decoder's natural manifold — avoids the
quality degradation that affects LoRA on BJ's audio dataset.

- selva_textual_inversion_trainer.py: trains learned_tokens via AdamW,
  injects into last K positions of 77-token CLIP embedding, checkpoints
  with eval audio + spectral metrics
- selva_textual_inversion_loader.py: loads .pt bundle, returns
  TEXTUAL_INVERSION dict for sampler
- selva_sampler.py: optional textual_inversion input; injects into both
  text_clip and neg_text_clip before preprocess_conditions
- __init__.py: registers both new nodes

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-08 23:01:44 +02:00
parent eed7eefeac
commit e56ece9c1c
4 changed files with 452 additions and 1 deletions
+17 -1
View File
@@ -39,6 +39,10 @@ class SelvaSampler:
"default": -27.0, "min": -40.0, "max": -6.0, "step": 1.0,
"tooltip": "Target RMS level in dBFS when normalize=True. -27 matches the measured RMS of LUFS-normalized training clips. Increase toward -20 for louder output.",
}),
"textual_inversion": ("TEXTUAL_INVERSION", {
"tooltip": "Learned token embeddings from SelVA Textual Inversion Loader. "
"Injects style tokens into CLIP conditioning without modifying model weights.",
}),
},
}
@@ -49,7 +53,7 @@ class SelvaSampler:
CATEGORY = SELVA_CATEGORY
DESCRIPTION = "Generates audio from video features using SelVA's flow matching ODE. Supports text prompts and negative prompts via classifier-free guidance."
def generate(self, model, features, prompt, negative_prompt, duration, steps, cfg_strength, seed, normalize=True, target_lufs=-27.0):
def generate(self, model, features, prompt, negative_prompt, duration, steps, cfg_strength, seed, normalize=True, target_lufs=-27.0, textual_inversion=None):
import dataclasses
from selva_core.model.flow_matching import FlowMatching
@@ -114,6 +118,18 @@ class SelvaSampler:
neg_text_clip = feature_utils.encode_text_clip([negative_prompt]) \
if negative_prompt.strip() else None
# Inject textual inversion tokens into last K positions of CLIP embedding
if textual_inversion is not None:
emb = textual_inversion["embeddings"].to(device, dtype) # [K, 1024]
K = emb.shape[0]
text_clip = text_clip.clone()
text_clip[:, -K:, :] = emb.unsqueeze(0)
if neg_text_clip is not None:
neg_text_clip = neg_text_clip.clone()
neg_text_clip[:, -K:, :] = emb.unsqueeze(0)
print(f"[SelVA] Textual inversion: injected {K} tokens into CLIP conditioning",
flush=True)
conditions = net_generator.preprocess_conditions(clip_f, sync_f, text_clip)
empty_conditions = net_generator.get_empty_conditions(
bs=1, negative_text_features=neg_text_clip