chore: sanitize tooltips/comments + add experiment configs

- Replace all BJ references with generic "target style/audio" in
  activation steering, DITTO optimizer, and BigVGAN trainer
- Add latent_mixup_alpha/latent_noise_sigma to LoRA scheduler defaults
- Add bigvgan_disc_fm_retest.json and lora_optimized_dataset.json

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-10 13:44:37 +02:00
parent 082a2da438
commit f745e241c4
6 changed files with 120 additions and 23 deletions
+31
View File
@@ -0,0 +1,31 @@
{
"name": "bigvgan_disc_fm_retest",
"description": "Retest discriminator feature matching after bfloat16 dtype fix. Uses optimal config from overnight sweep (snake_alpha, GAFilter, lr=1e-4, phase=1.0, L2-SP=1e-3, 5000 steps).",
"data_dir": "/media/unraid/davinci/Selva/BJ/features",
"output_root": "/media/unraid/davinci/Selva/BJ/experiment/bigvgan_disc_fm_retest",
"base": {
"train_mode": "snake_alpha_only",
"steps": 5000,
"lr": 1e-4,
"batch_size": 8,
"segment_seconds": 0.5,
"lambda_l2sp": 1e-3,
"use_gafilter": true,
"gafilter_kernel_size": 9,
"lambda_phase": 1.0,
"save_every": 1000,
"seed": 42,
"lora_adapter": "/media/unraid/davinci/Selva/BJ/experiment/pissa_sweep/standard_baseline/adapter_final.pt"
},
"experiments": [
{
"id": "snake_5k_control",
"description": "Control: best config from overnight sweep without discriminator. Baseline for A/B comparison."
},
{
"id": "disc_fm_5k",
"description": "Discriminator feature matching at 5k steps. Tests if perceptual FM loss improves over mel+phase alone.",
"discriminator_path": "/media/unraid/davinci/Selva/BJ/experiment/bigvgan_discriminator_optimizer.pt"
}
]
}
+64
View File
@@ -0,0 +1,64 @@
{
"name": "lora_optimized_dataset",
"description": "LoRA training on optimized dataset (134 clips: resampled 44.1kHz, LUFS-normalized, spectral matched, HF smoothed, gain-augmented). Tests latent augmentation and schedule variants on top of known-best config (PiSSA, rank=128, lr=3e-4).",
"data_dir": "/media/unraid/davinci/Selva/BJ/features_v2_improved/",
"output_root": "/media/unraid/davinci/Selva/BJ/experiment/lora_optimized_dataset",
"base": {
"rank": 128,
"lr": 3e-4,
"steps": 5000,
"batch_size": 4,
"warmup_steps": 100,
"save_every": 1000,
"seed": 42,
"init_mode": "pissa",
"use_rslora": true,
"target": "attn.qkv",
"timestep_mode": "uniform",
"lr_schedule": "constant"
},
"experiments": [
{
"id": "baseline",
"description": "Control: known-best config (PiSSA r128 lr=3e-4) on the optimized dataset. No latent augmentation."
},
{
"id": "latent_mixup",
"description": "Latent mixup alpha=0.4 (MusicLDM). Tests if mixing training latents reduces memorization on 134 clips.",
"latent_mixup_alpha": 0.4
},
{
"id": "latent_noise",
"description": "Latent noise sigma=0.02. Mild Gaussian noise on training latents for regularization.",
"latent_noise_sigma": 0.02
},
{
"id": "mixup_and_noise",
"description": "Both latent mixup (0.4) and noise (0.02). Combined regularization.",
"latent_mixup_alpha": 0.4,
"latent_noise_sigma": 0.02
},
{
"id": "cosine_schedule",
"description": "Cosine LR decay. lr=3e-4 was stable with constant, but cosine may extract more from 5k steps.",
"lr_schedule": "cosine"
},
{
"id": "cosine_mixup",
"description": "Cosine LR + latent mixup. Best regularization combo candidate.",
"lr_schedule": "cosine",
"latent_mixup_alpha": 0.4
},
{
"id": "logit_normal",
"description": "Logit-normal timestep sampling (sigma=1.0). Concentrates training near t=0.5 where flow matching is hardest.",
"timestep_mode": "logit_normal"
},
{
"id": "curriculum_mixup",
"description": "Curriculum timesteps (logit_normal first 60%, then uniform) + latent mixup. Full regularization stack.",
"timestep_mode": "curriculum",
"latent_mixup_alpha": 0.4
}
]
}
+13 -13
View File
@@ -1,15 +1,15 @@
"""SelVA Activation Steering Extractor. """SelVA Activation Steering Extractor.
Computes per-block steering vectors by running the frozen generator on the Computes per-block steering vectors by running the frozen generator on the
training dataset and recording how BJ's conditioning shifts the DiT hidden training dataset and recording how the target style's conditioning shifts the DiT hidden
states vs. empty/unconditional conditioning. states vs. empty/unconditional conditioning.
For each block i: For each block i:
steering[i] = mean(latent_hidden | BJ conditions) steering[i] = mean(latent_hidden | target style conditions)
- mean(latent_hidden | empty conditions) - mean(latent_hidden | empty conditions)
The resulting vectors are injected at inference time (via SelVA Sampler's The resulting vectors are injected at inference time (via SelVA Sampler's
steering_strength input) to nudge the denoising trajectory toward BJ's steering_strength input) to nudge the denoising trajectory toward the target style's
activation patterns without modifying any model weights. activation patterns without modifying any model weights.
""" """
@@ -58,7 +58,7 @@ class SelvaActivationSteeringExtractor:
"""Computes activation steering vectors from a training dataset. """Computes activation steering vectors from a training dataset.
Runs the frozen generator on N clips at random timesteps with both Runs the frozen generator on N clips at random timesteps with both
BJ-conditioned and empty-conditioned inputs, then saves the mean target-style-conditioned and empty-conditioned inputs, then saves the mean
difference per DiT block to a .pt file. difference per DiT block to a .pt file.
""" """
@@ -69,7 +69,7 @@ class SelvaActivationSteeringExtractor:
RETURN_NAMES = ("steering_path",) RETURN_NAMES = ("steering_path",)
OUTPUT_TOOLTIPS = ("Path to saved steering_vectors.pt — load with SelVA Activation Steering Loader.",) OUTPUT_TOOLTIPS = ("Path to saved steering_vectors.pt — load with SelVA Activation Steering Loader.",)
DESCRIPTION = ( DESCRIPTION = (
"Computes per-block activation steering vectors: mean(BJ activations) " "Computes per-block activation steering vectors: mean(target style activations) "
"mean(empty activations) at each DiT block. Load the result with " "- mean(empty activations) at each DiT block. Load the result with "
"SelVA Activation Steering Loader and connect to the Sampler." "SelVA Activation Steering Loader and connect to the Sampler."
) )
@@ -124,7 +124,7 @@ class SelvaActivationSteeringExtractor:
indices = random.choices(range(len(dataset)), k=n_samples) indices = random.choices(range(len(dataset)), k=n_samples)
n_blocks = len(generator.joint_blocks) + len(generator.fused_blocks) n_blocks = len(generator.joint_blocks) + len(generator.fused_blocks)
bj_sums = [None] * n_blocks style_sums = [None] * n_blocks
empty_sums = [None] * n_blocks empty_sums = [None] * n_blocks
counts = [0] * n_blocks counts = [0] * n_blocks
@@ -157,15 +157,15 @@ class SelvaActivationSteeringExtractor:
device=device, dtype=dtype, device=device, dtype=dtype,
) )
bj_acts = _collect_activations(generator, conditions, latent, t_tensor) style_acts = _collect_activations(generator, conditions, latent, t_tensor)
empty_acts = _collect_activations(generator, empty_conditions, latent, t_tensor) empty_acts = _collect_activations(generator, empty_conditions, latent, t_tensor)
for i, (bj, em) in enumerate(zip(bj_acts, empty_acts)): for i, (st, em) in enumerate(zip(style_acts, empty_acts)):
if bj_sums[i] is None: if style_sums[i] is None:
bj_sums[i] = bj.clone() style_sums[i] = st.clone()
empty_sums[i] = em.clone() empty_sums[i] = em.clone()
else: else:
bj_sums[i] += bj style_sums[i] += st
empty_sums[i] += em empty_sums[i] += em
counts[i] += 1 counts[i] += 1
@@ -173,10 +173,10 @@ class SelvaActivationSteeringExtractor:
if (sample_i + 1) % 4 == 0 or sample_i == n_samples - 1: if (sample_i + 1) % 4 == 0 or sample_i == n_samples - 1:
print(f"[Steering] Processed {sample_i + 1}/{n_samples} clips", flush=True) print(f"[Steering] Processed {sample_i + 1}/{n_samples} clips", flush=True)
# Steering vector per block: mean(BJ) - mean(empty) # Steering vector per block: mean(target style) - mean(empty)
steering_vectors = [] steering_vectors = []
for i in range(n_blocks): for i in range(n_blocks):
vec = (bj_sums[i] - empty_sums[i]) / counts[i] # [hidden] vec = (style_sums[i] - empty_sums[i]) / counts[i] # [hidden]
steering_vectors.append(vec) steering_vectors.append(vec)
norm = vec.norm().item() norm = vec.norm().item()
+2 -2
View File
@@ -593,7 +593,7 @@ class SelvaBigvganTrainer:
RETURN_NAMES = ("checkpoint_path",) RETURN_NAMES = ("checkpoint_path",)
OUTPUT_TOOLTIPS = ("Path to saved vocoder checkpoint — load with SelVA BigVGAN Loader.",) OUTPUT_TOOLTIPS = ("Path to saved vocoder checkpoint — load with SelVA BigVGAN Loader.",)
DESCRIPTION = ( DESCRIPTION = (
"Fine-tunes the BigVGAN vocoder (mel→waveform) on BJ audio clips. " "Fine-tunes the BigVGAN vocoder (mel→waveform) on target audio clips. "
"Default mode (snake_alpha_only) tunes only the ~5K Snake activation α " "Default mode (snake_alpha_only) tunes only the ~5K Snake activation α "
"parameters — cannot cause harmonic smearing. Add a discriminator path " "parameters — cannot cause harmonic smearing. Add a discriminator path "
"for perceptual feature matching loss. DiT and VAE stay frozen." "for perceptual feature matching loss. DiT and VAE stay frozen."
@@ -606,7 +606,7 @@ class SelvaBigvganTrainer:
"model": ("SELVA_MODEL",), "model": ("SELVA_MODEL",),
"data_dir": ("STRING", { "data_dir": ("STRING", {
"default": "", "default": "",
"tooltip": "Directory with BJ audio files (.wav/.flac/.mp3). Searched recursively.", "tooltip": "Directory with target audio files (.wav/.flac/.mp3). Searched recursively.",
}), }),
"output_path": ("STRING", { "output_path": ("STRING", {
"default": "bigvgan_bj.pt", "default": "bigvgan_bj.pt",
+8 -8
View File
@@ -1,14 +1,14 @@
"""SelVA DITTO Optimizer. """SelVA DITTO Optimizer.
Inference-time noise optimization: optimizes the initial noise latent x_0 Inference-time noise optimization: optimizes the initial noise latent x_0
using a style loss against BJ reference clips, backpropagating through the using a style loss against target style reference clips, backpropagating through the
ODE solver. All model weights remain frozen — only x_0 changes. ODE solver. All model weights remain frozen — only x_0 changes.
Based on DITTO: Diffusion Inference-Time T-Optimization (arXiv:2401.12179, Based on DITTO: Diffusion Inference-Time T-Optimization (arXiv:2401.12179,
ICML 2024 Oral). Adapted for SelVA's flow-matching Euler ODE. ICML 2024 Oral). Adapted for SelVA's flow-matching Euler ODE.
Style loss: mel-spectrogram statistics matching (mean spectrum + Gram matrix) Style loss: mel-spectrogram statistics matching (mean spectrum + Gram matrix)
against BJ reference clips. Runs entirely before the vocoder — optimization against target style reference clips. Runs entirely before the vocoder — optimization
only requires the DiT + VAE decoder, not BigVGAN. only requires the DiT + VAE decoder, not BigVGAN.
Memory strategy: gradient checkpointing at each ODE step — stores O(1 DiT Memory strategy: gradient checkpointing at each ODE step — stores O(1 DiT
@@ -97,7 +97,7 @@ class SelvaDittoOptimizer:
"""DITTO inference-time noise optimization. """DITTO inference-time noise optimization.
Freezes all model weights and optimizes only the initial noise latent x_0 Freezes all model weights and optimizes only the initial noise latent x_0
to make the generated audio sound like the BJ reference clips. to make the generated audio sound like the target style reference clips.
No training data or gradient updates to the model — per-video per-run. No training data or gradient updates to the model — per-video per-run.
""" """
@@ -116,7 +116,7 @@ class SelvaDittoOptimizer:
}), }),
"reference_dir": ("STRING", { "reference_dir": ("STRING", {
"default": "", "default": "",
"tooltip": "Directory with BJ reference audio files (.wav/.flac/.mp3). " "tooltip": "Directory with target style reference audio files (.wav/.flac/.mp3). "
"Reference mel statistics are precomputed from these once.", "Reference mel statistics are precomputed from these once.",
}), }),
"n_opt_steps": ("INT", { "n_opt_steps": ("INT", {
@@ -143,8 +143,8 @@ class SelvaDittoOptimizer:
}), }),
"style_weight": ("FLOAT", { "style_weight": ("FLOAT", {
"default": 0.1, "min": 0.0, "max": 10.0, "step": 0.05, "default": 0.1, "min": 0.0, "max": 10.0, "step": 0.05,
"tooltip": "Weight of the BJ style loss. High values push harder toward " "tooltip": "Weight of the target style loss. High values push harder toward "
"BJ style but add noise. Start at 0.1 and increase slowly.", "the target style but add noise. Start at 0.1 and increase slowly.",
}), }),
"gram_weight": ("FLOAT", { "gram_weight": ("FLOAT", {
"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.01, "default": 0.0, "min": 0.0, "max": 1.0, "step": 0.01,
@@ -176,12 +176,12 @@ class SelvaDittoOptimizer:
RETURN_TYPES = ("AUDIO",) RETURN_TYPES = ("AUDIO",)
RETURN_NAMES = ("audio",) RETURN_NAMES = ("audio",)
OUTPUT_TOOLTIPS = ("DITTO-optimized audio — x_0 steered toward BJ style.",) OUTPUT_TOOLTIPS = ("DITTO-optimized audio — x_0 steered toward the target style.",)
FUNCTION = "optimize" FUNCTION = "optimize"
CATEGORY = SELVA_CATEGORY CATEGORY = SELVA_CATEGORY
DESCRIPTION = ( DESCRIPTION = (
"DITTO inference-time noise optimization (arXiv:2401.12179). " "DITTO inference-time noise optimization (arXiv:2401.12179). "
"Optimizes the initial noise latent x_0 to match BJ reference clips " "Optimizes the initial noise latent x_0 to match target style reference clips "
"via mel statistics style loss, backpropagating through the ODE. " "via mel statistics style loss, backpropagating through the ODE. "
"All model weights frozen — zero quality degradation risk." "All model weights frozen — zero quality degradation risk."
) )
+2
View File
@@ -81,6 +81,8 @@ _PARAM_DEFAULTS = {
"lr_schedule": "constant", "lr_schedule": "constant",
"init_mode": "pissa", "init_mode": "pissa",
"use_rslora": True, "use_rslora": True,
"latent_mixup_alpha": 0.0,
"latent_noise_sigma": 0.0,
} }
# Palette for comparison chart: one color per experiment (cycles if > 8) # Palette for comparison chart: one color per experiment (cycles if > 8)