From 28ee3db337075a1ba3e6f92270ed0f5180fecae9 Mon Sep 17 00:00:00 2001
From: Ethanfel <ethan.fel@ts-pc.fr>
Date: Thu, 9 Apr 2026 00:07:57 +0200
Subject: [PATCH] feat(sampler): add ti_strength blend for TI injection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Textual inversion (TI) via text conditioning produces buzz because
SelVA's text path is mean-pooled into a global DiT bias — not rich
per-token cross-attention as in SD. The optimizer learns a constant
spectral artifact rather than a semantic style shift.

ti_strength=1.0 (default) = full injection as before.
ti_strength<1.0 = lerp between the original and the injected text_clip
(0.0 = original conditioning only), allowing the effect to be dialled
back without retraining.
Applies to both text_clip and neg_text_clip symmetrically.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 nodes/selva_sampler.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/nodes/selva_sampler.py b/nodes/selva_sampler.py
index 134af12..14eab29 100644
--- a/nodes/selva_sampler.py
+++ b/nodes/selva_sampler.py
@@ -44,6 +44,11 @@ class SelvaSampler:
                     "tooltip": "Learned token embeddings from SelVA Textual Inversion Loader. "
                                "Injects style tokens into CLIP conditioning without modifying model weights.",
                 }),
+                "ti_strength": ("FLOAT", {
+                    "default": 1.0, "min": 0.0, "max": 1.0, "step": 0.05,
+                    "tooltip": "Blends between original CLIP conditioning (0.0) and full TI injection (1.0). "
+                               "Reduce toward 0.3–0.5 if TI produces buzz artifacts.",
+                }),
             },
         }
 
@@ -54,7 +59,7 @@ class SelvaSampler:
     CATEGORY = SELVA_CATEGORY
     DESCRIPTION = "Generates audio from video features using SelVA's flow matching ODE. Supports text prompts and negative prompts via classifier-free guidance."
 
-    def generate(self, model, features, prompt, negative_prompt, duration, steps, cfg_strength, seed, normalize=True, target_lufs=-27.0, textual_inversion=None):
+    def generate(self, model, features, prompt, negative_prompt, duration, steps, cfg_strength, seed, normalize=True, target_lufs=-27.0, textual_inversion=None, ti_strength=1.0):
         import dataclasses
         from selva_core.model.flow_matching import FlowMatching
 
@@ -124,10 +129,12 @@ class SelvaSampler:
                     emb         = textual_inversion["embeddings"].to(device, dtype)  # [K, 1024]
                     K           = emb.shape[0]
                     inject_mode = textual_inversion.get("inject_mode", "suffix")
-                    text_clip   = _inject_tokens(text_clip, emb, K, inject_mode)
+                    ti_text     = _inject_tokens(text_clip, emb, K, inject_mode)
+                    text_clip   = torch.lerp(text_clip, ti_text, ti_strength)
                     if neg_text_clip is not None:
-                        neg_text_clip = _inject_tokens(neg_text_clip, emb, K, inject_mode)
-                    print(f"[SelVA] Textual inversion: {K} tokens  mode={inject_mode}",
+                        ti_neg    = _inject_tokens(neg_text_clip, emb, K, inject_mode)
+                        neg_text_clip = torch.lerp(neg_text_clip, ti_neg, ti_strength)
+                    print(f"[SelVA] Textual inversion: {K} tokens  mode={inject_mode}  strength={ti_strength}",
                           flush=True)
 
                 conditions = net_generator.preprocess_conditions(clip_f, sync_f, text_clip)