diff --git a/experiments/bigvgan_disc_fm_retest.json b/experiments/bigvgan_disc_fm_retest.json new file mode 100644 index 0000000..fa70c3c --- /dev/null +++ b/experiments/bigvgan_disc_fm_retest.json @@ -0,0 +1,31 @@ +{ + "name": "bigvgan_disc_fm_retest", + "description": "Retest discriminator feature matching after bfloat16 dtype fix. Uses optimal config from overnight sweep (snake_alpha, GAFilter, lr=1e-4, phase=1.0, L2-SP=1e-3, 5000 steps).", + "data_dir": "/media/unraid/davinci/Selva/BJ/features", + "output_root": "/media/unraid/davinci/Selva/BJ/experiment/bigvgan_disc_fm_retest", + "base": { + "train_mode": "snake_alpha_only", + "steps": 5000, + "lr": 1e-4, + "batch_size": 8, + "segment_seconds": 0.5, + "lambda_l2sp": 1e-3, + "use_gafilter": true, + "gafilter_kernel_size": 9, + "lambda_phase": 1.0, + "save_every": 1000, + "seed": 42, + "lora_adapter": "/media/unraid/davinci/Selva/BJ/experiment/pissa_sweep/standard_baseline/adapter_final.pt" + }, + "experiments": [ + { + "id": "snake_5k_control", + "description": "Control: best config from overnight sweep without discriminator. Baseline for A/B comparison." + }, + { + "id": "disc_fm_5k", + "description": "Discriminator feature matching at 5k steps. Tests if perceptual FM loss improves over mel+phase alone.", + "discriminator_path": "/media/unraid/davinci/Selva/BJ/experiment/bigvgan_discriminator_optimizer.pt" + } + ] +} diff --git a/experiments/lora_optimized_dataset.json b/experiments/lora_optimized_dataset.json new file mode 100644 index 0000000..3194961 --- /dev/null +++ b/experiments/lora_optimized_dataset.json @@ -0,0 +1,64 @@ +{ + "name": "lora_optimized_dataset", + "description": "LoRA training on optimized dataset (134 clips: resampled 44.1kHz, LUFS-normalized, spectral matched, HF smoothed, gain-augmented). 
Tests latent augmentation and schedule variants on top of known-best config (PiSSA, rank=128, lr=3e-4).", + "data_dir": "/media/unraid/davinci/Selva/BJ/features_v2_improved/", + "output_root": "/media/unraid/davinci/Selva/BJ/experiment/lora_optimized_dataset", + "base": { + "rank": 128, + "lr": 3e-4, + "steps": 5000, + "batch_size": 4, + "warmup_steps": 100, + "save_every": 1000, + "seed": 42, + "init_mode": "pissa", + "use_rslora": true, + "target": "attn.qkv", + "timestep_mode": "uniform", + "lr_schedule": "constant" + }, + "experiments": [ + { + "id": "baseline", + "description": "Control: known-best config (PiSSA r128 lr=3e-4) on the optimized dataset. No latent augmentation." + }, + { + "id": "latent_mixup", + "description": "Latent mixup alpha=0.4 (MusicLDM). Tests if mixing training latents reduces memorization on 134 clips.", + "latent_mixup_alpha": 0.4 + }, + { + "id": "latent_noise", + "description": "Latent noise sigma=0.02. Mild Gaussian noise on training latents for regularization.", + "latent_noise_sigma": 0.02 + }, + { + "id": "mixup_and_noise", + "description": "Both latent mixup (0.4) and noise (0.02). Combined regularization.", + "latent_mixup_alpha": 0.4, + "latent_noise_sigma": 0.02 + }, + { + "id": "cosine_schedule", + "description": "Cosine LR decay. lr=3e-4 was stable with constant, but cosine may extract more from 5k steps.", + "lr_schedule": "cosine" + }, + { + "id": "cosine_mixup", + "description": "Cosine LR + latent mixup. Best regularization combo candidate.", + "lr_schedule": "cosine", + "latent_mixup_alpha": 0.4 + }, + { + "id": "logit_normal", + "description": "Logit-normal timestep sampling (sigma=1.0). Concentrates training near t=0.5 where flow matching is hardest.", + "timestep_mode": "logit_normal" + }, + { + "id": "curriculum_mixup", + "description": "Curriculum timesteps (logit_normal first 60%, then uniform) + latent mixup. 
Full regularization stack.", + "timestep_mode": "curriculum", + "latent_mixup_alpha": 0.4 + } + ] +} diff --git a/nodes/selva_activation_steering_extractor.py b/nodes/selva_activation_steering_extractor.py index cefb482..a4d1322 100644 --- a/nodes/selva_activation_steering_extractor.py +++ b/nodes/selva_activation_steering_extractor.py @@ -1,15 +1,15 @@ """SelVA Activation Steering Extractor. Computes per-block steering vectors by running the frozen generator on the -training dataset and recording how BJ's conditioning shifts the DiT hidden +training dataset and recording how the target style's conditioning shifts the DiT hidden states vs. empty/unconditional conditioning. For each block i: - steering[i] = mean(latent_hidden | BJ conditions) + steering[i] = mean(latent_hidden | target style conditions) - mean(latent_hidden | empty conditions) The resulting vectors are injected at inference time (via SelVA Sampler's -steering_strength input) to nudge the denoising trajectory toward BJ's +steering_strength input) to nudge the denoising trajectory toward the target style's activation patterns without modifying any model weights. """ @@ -58,7 +58,7 @@ class SelvaActivationSteeringExtractor: """Computes activation steering vectors from a training dataset. Runs the frozen generator on N clips at random timesteps with both - BJ-conditioned and empty-conditioned inputs, then saves the mean + target-style-conditioned and empty-conditioned inputs, then saves the mean difference per DiT block to a .pt file. """ @@ -69,7 +69,7 @@ class SelvaActivationSteeringExtractor: RETURN_NAMES = ("steering_path",) OUTPUT_TOOLTIPS = ("Path to saved steering_vectors.pt — load with SelVA Activation Steering Loader.",) DESCRIPTION = ( - "Computes per-block activation steering vectors: mean(BJ activations) − " + "Computes per-block activation steering vectors: mean(target style activations) − " "mean(empty activations) at each DiT block. 
Load the result with " "SelVA Activation Steering Loader and connect to the Sampler." ) @@ -124,7 +124,7 @@ class SelvaActivationSteeringExtractor: indices = random.choices(range(len(dataset)), k=n_samples) n_blocks = len(generator.joint_blocks) + len(generator.fused_blocks) - bj_sums = [None] * n_blocks + style_sums = [None] * n_blocks empty_sums = [None] * n_blocks counts = [0] * n_blocks @@ -157,15 +157,15 @@ class SelvaActivationSteeringExtractor: device=device, dtype=dtype, ) - bj_acts = _collect_activations(generator, conditions, latent, t_tensor) + style_acts = _collect_activations(generator, conditions, latent, t_tensor) empty_acts = _collect_activations(generator, empty_conditions, latent, t_tensor) - for i, (bj, em) in enumerate(zip(bj_acts, empty_acts)): - if bj_sums[i] is None: - bj_sums[i] = bj.clone() + for i, (st, em) in enumerate(zip(style_acts, empty_acts)): + if style_sums[i] is None: + style_sums[i] = st.clone() empty_sums[i] = em.clone() else: - bj_sums[i] += bj + style_sums[i] += st empty_sums[i] += em counts[i] += 1 @@ -173,10 +173,10 @@ class SelvaActivationSteeringExtractor: if (sample_i + 1) % 4 == 0 or sample_i == n_samples - 1: print(f"[Steering] Processed {sample_i + 1}/{n_samples} clips", flush=True) - # Steering vector per block: mean(BJ) - mean(empty) + # Steering vector per block: mean(target style) - mean(empty) steering_vectors = [] for i in range(n_blocks): - vec = (bj_sums[i] - empty_sums[i]) / counts[i] # [hidden] + vec = (style_sums[i] - empty_sums[i]) / counts[i] # [hidden] steering_vectors.append(vec) norm = vec.norm().item() diff --git a/nodes/selva_bigvgan_trainer.py b/nodes/selva_bigvgan_trainer.py index 804b012..f2fa441 100644 --- a/nodes/selva_bigvgan_trainer.py +++ b/nodes/selva_bigvgan_trainer.py @@ -593,7 +593,7 @@ class SelvaBigvganTrainer: RETURN_NAMES = ("checkpoint_path",) OUTPUT_TOOLTIPS = ("Path to saved vocoder checkpoint — load with SelVA BigVGAN Loader.",) DESCRIPTION = ( - "Fine-tunes the BigVGAN vocoder 
(mel→waveform) on BJ audio clips. " + "Fine-tunes the BigVGAN vocoder (mel→waveform) on target audio clips. " "Default mode (snake_alpha_only) tunes only the ~5K Snake activation α " "parameters — cannot cause harmonic smearing. Add a discriminator path " "for perceptual feature matching loss. DiT and VAE stay frozen." @@ -606,7 +606,7 @@ class SelvaBigvganTrainer: "model": ("SELVA_MODEL",), "data_dir": ("STRING", { "default": "", - "tooltip": "Directory with BJ audio files (.wav/.flac/.mp3). Searched recursively.", + "tooltip": "Directory with target audio files (.wav/.flac/.mp3). Searched recursively.", }), "output_path": ("STRING", { "default": "bigvgan_bj.pt", diff --git a/nodes/selva_ditto_optimizer.py b/nodes/selva_ditto_optimizer.py index e7795de..43741ad 100644 --- a/nodes/selva_ditto_optimizer.py +++ b/nodes/selva_ditto_optimizer.py @@ -1,14 +1,14 @@ """SelVA DITTO Optimizer. Inference-time noise optimization: optimizes the initial noise latent x_0 -using a style loss against BJ reference clips, backpropagating through the +using a style loss against target style reference clips, backpropagating through the ODE solver. All model weights remain frozen — only x_0 changes. Based on DITTO: Diffusion Inference-Time T-Optimization (arXiv:2401.12179, ICML 2024 Oral). Adapted for SelVA's flow-matching Euler ODE. Style loss: mel-spectrogram statistics matching (mean spectrum + Gram matrix) -against BJ reference clips. Runs entirely before the vocoder — optimization +against target style reference clips. Runs entirely before the vocoder — optimization only requires the DiT + VAE decoder, not BigVGAN. Memory strategy: gradient checkpointing at each ODE step — stores O(1 DiT @@ -97,7 +97,7 @@ class SelvaDittoOptimizer: """DITTO inference-time noise optimization. Freezes all model weights and optimizes only the initial noise latent x_0 - to make the generated audio sound like the BJ reference clips. 
+ to make the generated audio sound like the target style reference clips. No training data or gradient updates to the model — per-video per-run. """ @@ -116,7 +116,7 @@ class SelvaDittoOptimizer: }), "reference_dir": ("STRING", { "default": "", - "tooltip": "Directory with BJ reference audio files (.wav/.flac/.mp3). " + "tooltip": "Directory with target style reference audio files (.wav/.flac/.mp3). " "Reference mel statistics are precomputed from these once.", }), "n_opt_steps": ("INT", { @@ -143,8 +143,8 @@ class SelvaDittoOptimizer: }), "style_weight": ("FLOAT", { "default": 0.1, "min": 0.0, "max": 10.0, "step": 0.05, - "tooltip": "Weight of the BJ style loss. High values push harder toward " - "BJ style but add noise. Start at 0.1 and increase slowly.", + "tooltip": "Weight of the target style loss. High values push harder toward " + "target style but add noise. Start at 0.1 and increase slowly.", }), "gram_weight": ("FLOAT", { "default": 0.0, "min": 0.0, "max": 1.0, "step": 0.01, @@ -176,12 +176,12 @@ class SelvaDittoOptimizer: RETURN_TYPES = ("AUDIO",) RETURN_NAMES = ("audio",) - OUTPUT_TOOLTIPS = ("DITTO-optimized audio — x_0 steered toward BJ style.",) + OUTPUT_TOOLTIPS = ("DITTO-optimized audio — x_0 steered toward target style.",) FUNCTION = "optimize" CATEGORY = SELVA_CATEGORY DESCRIPTION = ( "DITTO inference-time noise optimization (arXiv:2401.12179). " - "Optimizes the initial noise latent x_0 to match BJ reference clips " + "Optimizes the initial noise latent x_0 to match target style reference clips " "via mel statistics style loss, backpropagating through the ODE. " "All model weights frozen — zero quality degradation risk." 
) diff --git a/nodes/selva_lora_scheduler.py b/nodes/selva_lora_scheduler.py index d8609a9..a5e50f1 100644 --- a/nodes/selva_lora_scheduler.py +++ b/nodes/selva_lora_scheduler.py @@ -81,6 +81,8 @@ _PARAM_DEFAULTS = { "lr_schedule": "constant", "init_mode": "pissa", "use_rslora": True, + "latent_mixup_alpha": 0.0, + "latent_noise_sigma": 0.0, } # Palette for comparison chart: one color per experiment (cycles if > 8)