From 78e9838a83c2274b2ceb08d0de928dee33e89790 Mon Sep 17 00:00:00 2001 From: Ethanfel Date: Wed, 8 Apr 2026 12:06:48 +0200 Subject: [PATCH] fix: replace peak normalization with RMS normalization at -20 dBFS Peak norm was slamming output to full scale regardless of content level, making generated audio several times louder than training clips. RMS norm to -20 dBFS matches typical processed audio level. The sampler exposes a target_lufs input (-40 to -6, default -20) for user control; despite its name, the value is a plain RMS target in dBFS, not a LUFS loudness measurement. Co-Authored-By: Claude Sonnet 4.6 --- nodes/selva_lora_trainer.py | 5 +++-- nodes/selva_sampler.py | 13 +++++++++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/nodes/selva_lora_trainer.py b/nodes/selva_lora_trainer.py index 607b51c..7e477f7 100644 --- a/nodes/selva_lora_trainer.py +++ b/nodes/selva_lora_trainer.py @@ -134,8 +134,9 @@ def _eval_sample(generator, feature_utils_orig, dataset, seq_cfg, device, dtype, elif audio.dim() == 3 and audio.shape[1] != 1: audio = audio.mean(dim=1, keepdim=True) - peak = audio.abs().max().clamp(min=1e-8) - audio = (audio / peak).clamp(-1, 1) + target_rms = 10 ** (-20.0 / 20.0) # -20 dBFS + rms = audio.pow(2).mean().sqrt().clamp(min=1e-8) + audio = (audio * (target_rms / rms)).clamp(-1, 1) return audio.squeeze(0), seq_cfg.sampling_rate # [1, L] except Exception as e: diff --git a/nodes/selva_sampler.py b/nodes/selva_sampler.py index 249ccc4..45ae038 100644 --- a/nodes/selva_sampler.py +++ b/nodes/selva_sampler.py @@ -33,7 +33,11 @@ class SelvaSampler: "optional": { "normalize": ("BOOLEAN", { "default": True, - "tooltip": "Peak-normalize output to [-1, 1]. Disable to preserve the raw decoder output level.", + "tooltip": "Normalize output level. Uses RMS normalization to target_lufs rather than peak normalization, so level matches typical audio content.", + }), + "target_lufs": ("FLOAT", { + "default": -20.0, "min": -40.0, "max": -6.0, "step": 1.0, + "tooltip": "Target RMS level in dBFS when normalize=True. -20 matches typical processed audio. 
Increase toward -14 for louder output, decrease toward -30 for quieter.", }), }, } @@ -45,7 +49,7 @@ class SelvaSampler: CATEGORY = SELVA_CATEGORY DESCRIPTION = "Generates audio from video features using SelVA's flow matching ODE. Supports text prompts and negative prompts via classifier-free guidance." - def generate(self, model, features, prompt, negative_prompt, duration, steps, cfg_strength, seed, normalize=True): + def generate(self, model, features, prompt, negative_prompt, duration, steps, cfg_strength, seed, normalize=True, target_lufs=-20.0): import dataclasses from selva_core.model.flow_matching import FlowMatching @@ -168,8 +172,9 @@ class SelvaSampler: audio = audio.mean(dim=1, keepdim=True) # stereo → mono if normalize: - peak = audio.abs().max().clamp(min=1e-8) - audio = (audio / peak).clamp(-1, 1) + target_rms = 10 ** (target_lufs / 20.0) + rms = audio.pow(2).mean().sqrt().clamp(min=1e-8) + audio = (audio * (target_rms / rms)).clamp(-1, 1) print(f"[SelVA] audio: shape={tuple(audio.shape)} sr={sample_rate}", flush=True) return ({"waveform": audio.cpu(), "sample_rate": sample_rate},)