chore: sanitize tooltips/comments + add experiment configs
- Replace all BJ references with generic "target style/audio" in activation steering, DITTO optimizer, and BigVGAN trainer
- Add latent_mixup_alpha/latent_noise_sigma to LoRA scheduler defaults
- Add bigvgan_disc_fm_retest.json and lora_optimized_dataset.json

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,31 @@
|
|||||||
|
{
|
||||||
|
"name": "bigvgan_disc_fm_retest",
|
||||||
|
"description": "Retest discriminator feature matching after bfloat16 dtype fix. Uses optimal config from overnight sweep (snake_alpha, GAFilter, lr=1e-4, phase=1.0, L2-SP=1e-3, 5000 steps).",
|
||||||
|
"data_dir": "/media/unraid/davinci/Selva/BJ/features",
|
||||||
|
"output_root": "/media/unraid/davinci/Selva/BJ/experiment/bigvgan_disc_fm_retest",
|
||||||
|
"base": {
|
||||||
|
"train_mode": "snake_alpha_only",
|
||||||
|
"steps": 5000,
|
||||||
|
"lr": 1e-4,
|
||||||
|
"batch_size": 8,
|
||||||
|
"segment_seconds": 0.5,
|
||||||
|
"lambda_l2sp": 1e-3,
|
||||||
|
"use_gafilter": true,
|
||||||
|
"gafilter_kernel_size": 9,
|
||||||
|
"lambda_phase": 1.0,
|
||||||
|
"save_every": 1000,
|
||||||
|
"seed": 42,
|
||||||
|
"lora_adapter": "/media/unraid/davinci/Selva/BJ/experiment/pissa_sweep/standard_baseline/adapter_final.pt"
|
||||||
|
},
|
||||||
|
"experiments": [
|
||||||
|
{
|
||||||
|
"id": "snake_5k_control",
|
||||||
|
"description": "Control: best config from overnight sweep without discriminator. Baseline for A/B comparison."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "disc_fm_5k",
|
||||||
|
"description": "Discriminator feature matching at 5k steps. Tests if perceptual FM loss improves over mel+phase alone.",
|
||||||
|
"discriminator_path": "/media/unraid/davinci/Selva/BJ/experiment/bigvgan_discriminator_optimizer.pt"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -0,0 +1,64 @@
|
|||||||
|
{
|
||||||
|
"name": "lora_optimized_dataset",
|
||||||
|
"description": "LoRA training on optimized dataset (134 clips: resampled 44.1kHz, LUFS-normalized, spectral matched, HF smoothed, gain-augmented). Tests latent augmentation and schedule variants on top of known-best config (PiSSA, rank=128, lr=3e-4).",
|
||||||
|
"data_dir": "/media/unraid/davinci/Selva/BJ/features_v2_improved/",
|
||||||
|
"output_root": "/media/unraid/davinci/Selva/BJ/experiment/lora_optimized_dataset",
|
||||||
|
"base": {
|
||||||
|
"rank": 128,
|
||||||
|
"lr": 3e-4,
|
||||||
|
"steps": 5000,
|
||||||
|
"batch_size": 4,
|
||||||
|
"warmup_steps": 100,
|
||||||
|
"save_every": 1000,
|
||||||
|
"seed": 42,
|
||||||
|
"init_mode": "pissa",
|
||||||
|
"use_rslora": true,
|
||||||
|
"target": "attn.qkv",
|
||||||
|
"timestep_mode": "uniform",
|
||||||
|
"lr_schedule": "constant"
|
||||||
|
},
|
||||||
|
"experiments": [
|
||||||
|
{
|
||||||
|
"id": "baseline",
|
||||||
|
"description": "Control: known-best config (PiSSA r128 lr=3e-4) on the optimized dataset. No latent augmentation."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "latent_mixup",
|
||||||
|
"description": "Latent mixup alpha=0.4 (MusicLDM). Tests if mixing training latents reduces memorization on 134 clips.",
|
||||||
|
"latent_mixup_alpha": 0.4
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "latent_noise",
|
||||||
|
"description": "Latent noise sigma=0.02. Mild Gaussian noise on training latents for regularization.",
|
||||||
|
"latent_noise_sigma": 0.02
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "mixup_and_noise",
|
||||||
|
"description": "Both latent mixup (0.4) and noise (0.02). Combined regularization.",
|
||||||
|
"latent_mixup_alpha": 0.4,
|
||||||
|
"latent_noise_sigma": 0.02
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "cosine_schedule",
|
||||||
|
"description": "Cosine LR decay. lr=3e-4 was stable with constant, but cosine may extract more from 5k steps.",
|
||||||
|
"lr_schedule": "cosine"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "cosine_mixup",
|
||||||
|
"description": "Cosine LR + latent mixup. Best regularization combo candidate.",
|
||||||
|
"lr_schedule": "cosine",
|
||||||
|
"latent_mixup_alpha": 0.4
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "logit_normal",
|
||||||
|
"description": "Logit-normal timestep sampling (sigma=1.0). Concentrates training near t=0.5 where flow matching is hardest.",
|
||||||
|
"timestep_mode": "logit_normal"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "curriculum_mixup",
|
||||||
|
"description": "Curriculum timesteps (logit_normal first 60%, then uniform) + latent mixup. Full regularization stack.",
|
||||||
|
"timestep_mode": "curriculum",
|
||||||
|
"latent_mixup_alpha": 0.4
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -1,15 +1,15 @@
|
|||||||
"""SelVA Activation Steering Extractor.
|
"""SelVA Activation Steering Extractor.
|
||||||
|
|
||||||
Computes per-block steering vectors by running the frozen generator on the
|
Computes per-block steering vectors by running the frozen generator on the
|
||||||
training dataset and recording how BJ's conditioning shifts the DiT hidden
|
training dataset and recording how the target style's conditioning shifts the DiT hidden
|
||||||
states vs. empty/unconditional conditioning.
|
states vs. empty/unconditional conditioning.
|
||||||
|
|
||||||
For each block i:
|
For each block i:
|
||||||
steering[i] = mean(latent_hidden | BJ conditions)
|
steering[i] = mean(latent_hidden | target style conditions)
|
||||||
- mean(latent_hidden | empty conditions)
|
- mean(latent_hidden | empty conditions)
|
||||||
|
|
||||||
The resulting vectors are injected at inference time (via SelVA Sampler's
|
The resulting vectors are injected at inference time (via SelVA Sampler's
|
||||||
steering_strength input) to nudge the denoising trajectory toward BJ's
|
steering_strength input) to nudge the denoising trajectory toward the target style's
|
||||||
activation patterns without modifying any model weights.
|
activation patterns without modifying any model weights.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@@ -58,7 +58,7 @@ class SelvaActivationSteeringExtractor:
|
|||||||
"""Computes activation steering vectors from a training dataset.
|
"""Computes activation steering vectors from a training dataset.
|
||||||
|
|
||||||
Runs the frozen generator on N clips at random timesteps with both
|
Runs the frozen generator on N clips at random timesteps with both
|
||||||
BJ-conditioned and empty-conditioned inputs, then saves the mean
|
target-style-conditioned and empty-conditioned inputs, then saves the mean
|
||||||
difference per DiT block to a .pt file.
|
difference per DiT block to a .pt file.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@@ -69,7 +69,7 @@ class SelvaActivationSteeringExtractor:
|
|||||||
RETURN_NAMES = ("steering_path",)
|
RETURN_NAMES = ("steering_path",)
|
||||||
OUTPUT_TOOLTIPS = ("Path to saved steering_vectors.pt — load with SelVA Activation Steering Loader.",)
|
OUTPUT_TOOLTIPS = ("Path to saved steering_vectors.pt — load with SelVA Activation Steering Loader.",)
|
||||||
DESCRIPTION = (
|
DESCRIPTION = (
|
||||||
"Computes per-block activation steering vectors: mean(BJ activations) − "
|
"Computes per-block activation steering vectors: mean(target style activations) − "
|
||||||
"mean(empty activations) at each DiT block. Load the result with "
|
"mean(empty activations) at each DiT block. Load the result with "
|
||||||
"SelVA Activation Steering Loader and connect to the Sampler."
|
"SelVA Activation Steering Loader and connect to the Sampler."
|
||||||
)
|
)
|
||||||
@@ -124,7 +124,7 @@ class SelvaActivationSteeringExtractor:
|
|||||||
indices = random.choices(range(len(dataset)), k=n_samples)
|
indices = random.choices(range(len(dataset)), k=n_samples)
|
||||||
|
|
||||||
n_blocks = len(generator.joint_blocks) + len(generator.fused_blocks)
|
n_blocks = len(generator.joint_blocks) + len(generator.fused_blocks)
|
||||||
bj_sums = [None] * n_blocks
|
style_sums = [None] * n_blocks
|
||||||
empty_sums = [None] * n_blocks
|
empty_sums = [None] * n_blocks
|
||||||
counts = [0] * n_blocks
|
counts = [0] * n_blocks
|
||||||
|
|
||||||
@@ -157,15 +157,15 @@ class SelvaActivationSteeringExtractor:
|
|||||||
device=device, dtype=dtype,
|
device=device, dtype=dtype,
|
||||||
)
|
)
|
||||||
|
|
||||||
bj_acts = _collect_activations(generator, conditions, latent, t_tensor)
|
style_acts = _collect_activations(generator, conditions, latent, t_tensor)
|
||||||
empty_acts = _collect_activations(generator, empty_conditions, latent, t_tensor)
|
empty_acts = _collect_activations(generator, empty_conditions, latent, t_tensor)
|
||||||
|
|
||||||
for i, (bj, em) in enumerate(zip(bj_acts, empty_acts)):
|
for i, (st, em) in enumerate(zip(style_acts, empty_acts)):
|
||||||
if bj_sums[i] is None:
|
if style_sums[i] is None:
|
||||||
bj_sums[i] = bj.clone()
|
style_sums[i] = st.clone()
|
||||||
empty_sums[i] = em.clone()
|
empty_sums[i] = em.clone()
|
||||||
else:
|
else:
|
||||||
bj_sums[i] += bj
|
style_sums[i] += st
|
||||||
empty_sums[i] += em
|
empty_sums[i] += em
|
||||||
counts[i] += 1
|
counts[i] += 1
|
||||||
|
|
||||||
@@ -173,10 +173,10 @@ class SelvaActivationSteeringExtractor:
|
|||||||
if (sample_i + 1) % 4 == 0 or sample_i == n_samples - 1:
|
if (sample_i + 1) % 4 == 0 or sample_i == n_samples - 1:
|
||||||
print(f"[Steering] Processed {sample_i + 1}/{n_samples} clips", flush=True)
|
print(f"[Steering] Processed {sample_i + 1}/{n_samples} clips", flush=True)
|
||||||
|
|
||||||
# Steering vector per block: mean(BJ) - mean(empty)
|
# Steering vector per block: mean(target style) - mean(empty)
|
||||||
steering_vectors = []
|
steering_vectors = []
|
||||||
for i in range(n_blocks):
|
for i in range(n_blocks):
|
||||||
vec = (bj_sums[i] - empty_sums[i]) / counts[i] # [hidden]
|
vec = (style_sums[i] - empty_sums[i]) / counts[i] # [hidden]
|
||||||
steering_vectors.append(vec)
|
steering_vectors.append(vec)
|
||||||
|
|
||||||
norm = vec.norm().item()
|
norm = vec.norm().item()
|
||||||
|
|||||||
@@ -593,7 +593,7 @@ class SelvaBigvganTrainer:
|
|||||||
RETURN_NAMES = ("checkpoint_path",)
|
RETURN_NAMES = ("checkpoint_path",)
|
||||||
OUTPUT_TOOLTIPS = ("Path to saved vocoder checkpoint — load with SelVA BigVGAN Loader.",)
|
OUTPUT_TOOLTIPS = ("Path to saved vocoder checkpoint — load with SelVA BigVGAN Loader.",)
|
||||||
DESCRIPTION = (
|
DESCRIPTION = (
|
||||||
"Fine-tunes the BigVGAN vocoder (mel→waveform) on BJ audio clips. "
|
"Fine-tunes the BigVGAN vocoder (mel→waveform) on target audio clips. "
|
||||||
"Default mode (snake_alpha_only) tunes only the ~5K Snake activation α "
|
"Default mode (snake_alpha_only) tunes only the ~5K Snake activation α "
|
||||||
"parameters — cannot cause harmonic smearing. Add a discriminator path "
|
"parameters — cannot cause harmonic smearing. Add a discriminator path "
|
||||||
"for perceptual feature matching loss. DiT and VAE stay frozen."
|
"for perceptual feature matching loss. DiT and VAE stay frozen."
|
||||||
@@ -606,7 +606,7 @@ class SelvaBigvganTrainer:
|
|||||||
"model": ("SELVA_MODEL",),
|
"model": ("SELVA_MODEL",),
|
||||||
"data_dir": ("STRING", {
|
"data_dir": ("STRING", {
|
||||||
"default": "",
|
"default": "",
|
||||||
"tooltip": "Directory with BJ audio files (.wav/.flac/.mp3). Searched recursively.",
|
"tooltip": "Directory with target audio files (.wav/.flac/.mp3). Searched recursively.",
|
||||||
}),
|
}),
|
||||||
"output_path": ("STRING", {
|
"output_path": ("STRING", {
|
||||||
"default": "bigvgan_bj.pt",
|
"default": "bigvgan_bj.pt",
|
||||||
|
|||||||
@@ -1,14 +1,14 @@
|
|||||||
"""SelVA DITTO Optimizer.
|
"""SelVA DITTO Optimizer.
|
||||||
|
|
||||||
Inference-time noise optimization: optimizes the initial noise latent x_0
|
Inference-time noise optimization: optimizes the initial noise latent x_0
|
||||||
using a style loss against BJ reference clips, backpropagating through the
|
using a style loss against target style reference clips, backpropagating through the
|
||||||
ODE solver. All model weights remain frozen — only x_0 changes.
|
ODE solver. All model weights remain frozen — only x_0 changes.
|
||||||
|
|
||||||
Based on DITTO: Diffusion Inference-Time T-Optimization (arXiv:2401.12179,
|
Based on DITTO: Diffusion Inference-Time T-Optimization (arXiv:2401.12179,
|
||||||
ICML 2024 Oral). Adapted for SelVA's flow-matching Euler ODE.
|
ICML 2024 Oral). Adapted for SelVA's flow-matching Euler ODE.
|
||||||
|
|
||||||
Style loss: mel-spectrogram statistics matching (mean spectrum + Gram matrix)
|
Style loss: mel-spectrogram statistics matching (mean spectrum + Gram matrix)
|
||||||
against BJ reference clips. Runs entirely before the vocoder — optimization
|
against target style reference clips. Runs entirely before the vocoder — optimization
|
||||||
only requires the DiT + VAE decoder, not BigVGAN.
|
only requires the DiT + VAE decoder, not BigVGAN.
|
||||||
|
|
||||||
Memory strategy: gradient checkpointing at each ODE step — stores O(1 DiT
|
Memory strategy: gradient checkpointing at each ODE step — stores O(1 DiT
|
||||||
@@ -97,7 +97,7 @@ class SelvaDittoOptimizer:
|
|||||||
"""DITTO inference-time noise optimization.
|
"""DITTO inference-time noise optimization.
|
||||||
|
|
||||||
Freezes all model weights and optimizes only the initial noise latent x_0
|
Freezes all model weights and optimizes only the initial noise latent x_0
|
||||||
to make the generated audio sound like the BJ reference clips.
|
to make the generated audio sound like the target style reference clips.
|
||||||
No training data or gradient updates to the model — per-video per-run.
|
No training data or gradient updates to the model — per-video per-run.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@@ -116,7 +116,7 @@ class SelvaDittoOptimizer:
|
|||||||
}),
|
}),
|
||||||
"reference_dir": ("STRING", {
|
"reference_dir": ("STRING", {
|
||||||
"default": "",
|
"default": "",
|
||||||
"tooltip": "Directory with BJ reference audio files (.wav/.flac/.mp3). "
|
"tooltip": "Directory with target style reference audio files (.wav/.flac/.mp3). "
|
||||||
"Reference mel statistics are precomputed from these once.",
|
"Reference mel statistics are precomputed from these once.",
|
||||||
}),
|
}),
|
||||||
"n_opt_steps": ("INT", {
|
"n_opt_steps": ("INT", {
|
||||||
@@ -143,8 +143,8 @@ class SelvaDittoOptimizer:
|
|||||||
}),
|
}),
|
||||||
"style_weight": ("FLOAT", {
|
"style_weight": ("FLOAT", {
|
||||||
"default": 0.1, "min": 0.0, "max": 10.0, "step": 0.05,
|
"default": 0.1, "min": 0.0, "max": 10.0, "step": 0.05,
|
||||||
"tooltip": "Weight of the BJ style loss. High values push harder toward "
|
"tooltip": "Weight of the target style style loss. High values push harder toward "
|
||||||
"BJ style but add noise. Start at 0.1 and increase slowly.",
|
"target style style but add noise. Start at 0.1 and increase slowly.",
|
||||||
}),
|
}),
|
||||||
"gram_weight": ("FLOAT", {
|
"gram_weight": ("FLOAT", {
|
||||||
"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.01,
|
"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.01,
|
||||||
@@ -176,12 +176,12 @@ class SelvaDittoOptimizer:
|
|||||||
|
|
||||||
RETURN_TYPES = ("AUDIO",)
|
RETURN_TYPES = ("AUDIO",)
|
||||||
RETURN_NAMES = ("audio",)
|
RETURN_NAMES = ("audio",)
|
||||||
OUTPUT_TOOLTIPS = ("DITTO-optimized audio — x_0 steered toward BJ style.",)
|
OUTPUT_TOOLTIPS = ("DITTO-optimized audio — x_0 steered toward the target style.",)
|
||||||
FUNCTION = "optimize"
|
FUNCTION = "optimize"
|
||||||
CATEGORY = SELVA_CATEGORY
|
CATEGORY = SELVA_CATEGORY
|
||||||
DESCRIPTION = (
|
DESCRIPTION = (
|
||||||
"DITTO inference-time noise optimization (arXiv:2401.12179). "
|
"DITTO inference-time noise optimization (arXiv:2401.12179). "
|
||||||
"Optimizes the initial noise latent x_0 to match BJ reference clips "
|
"Optimizes the initial noise latent x_0 to match target style reference clips "
|
||||||
"via mel statistics style loss, backpropagating through the ODE. "
|
"via mel statistics style loss, backpropagating through the ODE. "
|
||||||
"All model weights frozen — zero quality degradation risk."
|
"All model weights frozen — zero quality degradation risk."
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -81,6 +81,8 @@ _PARAM_DEFAULTS = {
|
|||||||
"lr_schedule": "constant",
|
"lr_schedule": "constant",
|
||||||
"init_mode": "pissa",
|
"init_mode": "pissa",
|
||||||
"use_rslora": True,
|
"use_rslora": True,
|
||||||
|
"latent_mixup_alpha": 0.0,
|
||||||
|
"latent_noise_sigma": 0.0,
|
||||||
}
|
}
|
||||||
|
|
||||||
# Palette for comparison chart: one color per experiment (cycles if > 8)
|
# Palette for comparison chart: one color per experiment (cycles if > 8)
|
||||||
|
|||||||
Reference in New Issue
Block a user