chore: sanitize tooltips/comments + add experiment configs

- Replace all BJ references with generic "target style/audio" in
  activation steering, DITTO optimizer, and BigVGAN trainer
- Add latent_mixup_alpha/latent_noise_sigma to LoRA scheduler defaults
- Add bigvgan_disc_fm_retest.json and lora_optimized_dataset.json

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-10 13:44:37 +02:00
parent 082a2da438
commit f745e241c4
6 changed files with 120 additions and 23 deletions
+13 -13
View File
@@ -1,15 +1,15 @@
"""SelVA Activation Steering Extractor.
Computes per-block steering vectors by running the frozen generator on the
training dataset and recording how BJ's conditioning shifts the DiT hidden
training dataset and recording how the target-style conditioning shifts the DiT hidden
states vs. empty/unconditional conditioning.
For each block i:
steering[i] = mean(latent_hidden | BJ conditions)
steering[i] = mean(latent_hidden | target style conditions)
- mean(latent_hidden | empty conditions)
The resulting vectors are injected at inference time (via SelVA Sampler's
steering_strength input) to nudge the denoising trajectory toward BJ's
steering_strength input) to nudge the denoising trajectory toward the target style's
activation patterns without modifying any model weights.
"""
@@ -58,7 +58,7 @@ class SelvaActivationSteeringExtractor:
"""Computes activation steering vectors from a training dataset.
Runs the frozen generator on N clips at random timesteps with both
BJ-conditioned and empty-conditioned inputs, then saves the mean
target-style-conditioned and empty-conditioned inputs, then saves the mean
difference per DiT block to a .pt file.
"""
@@ -69,7 +69,7 @@ class SelvaActivationSteeringExtractor:
RETURN_NAMES = ("steering_path",)
OUTPUT_TOOLTIPS = ("Path to saved steering_vectors.pt — load with SelVA Activation Steering Loader.",)
DESCRIPTION = (
"Computes per-block activation steering vectors: mean(BJ activations) "
"Computes per-block activation steering vectors: mean(target style activations) "
"- mean(empty activations) at each DiT block. Load the result with "
"SelVA Activation Steering Loader and connect to the Sampler."
)
@@ -124,7 +124,7 @@ class SelvaActivationSteeringExtractor:
indices = random.choices(range(len(dataset)), k=n_samples)
n_blocks = len(generator.joint_blocks) + len(generator.fused_blocks)
bj_sums = [None] * n_blocks
style_sums = [None] * n_blocks
empty_sums = [None] * n_blocks
counts = [0] * n_blocks
@@ -157,15 +157,15 @@ class SelvaActivationSteeringExtractor:
device=device, dtype=dtype,
)
bj_acts = _collect_activations(generator, conditions, latent, t_tensor)
style_acts = _collect_activations(generator, conditions, latent, t_tensor)
empty_acts = _collect_activations(generator, empty_conditions, latent, t_tensor)
for i, (bj, em) in enumerate(zip(bj_acts, empty_acts)):
if bj_sums[i] is None:
bj_sums[i] = bj.clone()
for i, (st, em) in enumerate(zip(style_acts, empty_acts)):
if style_sums[i] is None:
style_sums[i] = st.clone()
empty_sums[i] = em.clone()
else:
bj_sums[i] += bj
style_sums[i] += st
empty_sums[i] += em
counts[i] += 1
@@ -173,10 +173,10 @@ class SelvaActivationSteeringExtractor:
if (sample_i + 1) % 4 == 0 or sample_i == n_samples - 1:
print(f"[Steering] Processed {sample_i + 1}/{n_samples} clips", flush=True)
# Steering vector per block: mean(BJ) - mean(empty)
# Steering vector per block: mean(target style) - mean(empty)
steering_vectors = []
for i in range(n_blocks):
vec = (bj_sums[i] - empty_sums[i]) / counts[i] # [hidden]
vec = (style_sums[i] - empty_sums[i]) / counts[i] # [hidden]
steering_vectors.append(vec)
norm = vec.norm().item()
+2 -2
View File
@@ -593,7 +593,7 @@ class SelvaBigvganTrainer:
RETURN_NAMES = ("checkpoint_path",)
OUTPUT_TOOLTIPS = ("Path to saved vocoder checkpoint — load with SelVA BigVGAN Loader.",)
DESCRIPTION = (
"Fine-tunes the BigVGAN vocoder (mel→waveform) on BJ audio clips. "
"Fine-tunes the BigVGAN vocoder (mel→waveform) on target audio clips. "
"Default mode (snake_alpha_only) tunes only the ~5K Snake activation α "
"parameters — cannot cause harmonic smearing. Add a discriminator path "
"for perceptual feature matching loss. DiT and VAE stay frozen."
@@ -606,7 +606,7 @@ class SelvaBigvganTrainer:
"model": ("SELVA_MODEL",),
"data_dir": ("STRING", {
"default": "",
"tooltip": "Directory with BJ audio files (.wav/.flac/.mp3). Searched recursively.",
"tooltip": "Directory with target audio files (.wav/.flac/.mp3). Searched recursively.",
}),
"output_path": ("STRING", {
"default": "bigvgan_bj.pt",
+8 -8
View File
@@ -1,14 +1,14 @@
"""SelVA DITTO Optimizer.
Inference-time noise optimization: optimizes the initial noise latent x_0
using a style loss against BJ reference clips, backpropagating through the
using a style loss against target style reference clips, backpropagating through the
ODE solver. All model weights remain frozen — only x_0 changes.
Based on DITTO: Diffusion Inference-Time T-Optimization (arXiv:2401.12179,
ICML 2024 Oral). Adapted for SelVA's flow-matching Euler ODE.
Style loss: mel-spectrogram statistics matching (mean spectrum + Gram matrix)
against BJ reference clips. Runs entirely before the vocoder — optimization
against target style reference clips. Runs entirely before the vocoder — optimization
only requires the DiT + VAE decoder, not BigVGAN.
Memory strategy: gradient checkpointing at each ODE step — stores O(1 DiT
@@ -97,7 +97,7 @@ class SelvaDittoOptimizer:
"""DITTO inference-time noise optimization.
Freezes all model weights and optimizes only the initial noise latent x_0
to make the generated audio sound like the BJ reference clips.
to make the generated audio sound like the target style reference clips.
No training data or gradient updates to the model — per-video per-run.
"""
@@ -116,7 +116,7 @@ class SelvaDittoOptimizer:
}),
"reference_dir": ("STRING", {
"default": "",
"tooltip": "Directory with BJ reference audio files (.wav/.flac/.mp3). "
"tooltip": "Directory with target style reference audio files (.wav/.flac/.mp3). "
"Reference mel statistics are precomputed from these once.",
}),
"n_opt_steps": ("INT", {
@@ -143,8 +143,8 @@ class SelvaDittoOptimizer:
}),
"style_weight": ("FLOAT", {
"default": 0.1, "min": 0.0, "max": 10.0, "step": 0.05,
"tooltip": "Weight of the BJ style loss. High values push harder toward "
"BJ style but add noise. Start at 0.1 and increase slowly.",
"tooltip": "Weight of the target style loss. High values push harder toward "
"the target style but add noise. Start at 0.1 and increase slowly.",
}),
"gram_weight": ("FLOAT", {
"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.01,
@@ -176,12 +176,12 @@ class SelvaDittoOptimizer:
RETURN_TYPES = ("AUDIO",)
RETURN_NAMES = ("audio",)
OUTPUT_TOOLTIPS = ("DITTO-optimized audio — x_0 steered toward BJ style.",)
OUTPUT_TOOLTIPS = ("DITTO-optimized audio — x_0 steered toward the target style.",)
FUNCTION = "optimize"
CATEGORY = SELVA_CATEGORY
DESCRIPTION = (
"DITTO inference-time noise optimization (arXiv:2401.12179). "
"Optimizes the initial noise latent x_0 to match BJ reference clips "
"Optimizes the initial noise latent x_0 to match target style reference clips "
"via mel statistics style loss, backpropagating through the ODE. "
"All model weights frozen — zero quality degradation risk."
)
+2
View File
@@ -81,6 +81,8 @@ _PARAM_DEFAULTS = {
"lr_schedule": "constant",
"init_mode": "pissa",
"use_rslora": True,
"latent_mixup_alpha": 0.0,
"latent_noise_sigma": 0.0,
}
# Palette for comparison chart: one color per experiment (cycles if > 8)