From 78e9838a83c2274b2ceb08d0de928dee33e89790 Mon Sep 17 00:00:00 2001
From: Ethanfel <ethan.fel@ts-pc.fr>
Date: Wed, 8 Apr 2026 12:06:48 +0200
Subject: [PATCH] fix: replace peak normalization with RMS normalization at -20
 dBFS

Peak normalization scaled every output to full scale regardless of
content level, making generated audio several times louder than the
training clips. RMS normalization to -20 dBFS matches the level of
typical processed audio; the scaled signal is still clamped to [-1, 1].
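The sampler exposes target_lufs (-40 to -6, default -20) for user
control. Despite its name, the value is applied as a plain RMS level in
dBFS, not as a LUFS loudness measurement. The trainer's _eval_sample
uses a fixed -20 dBFS target.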

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 nodes/selva_lora_trainer.py |  5 +++--
 nodes/selva_sampler.py      | 13 +++++++++----
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/nodes/selva_lora_trainer.py b/nodes/selva_lora_trainer.py
index 607b51c..7e477f7 100644
--- a/nodes/selva_lora_trainer.py
+++ b/nodes/selva_lora_trainer.py
@@ -134,8 +134,9 @@ def _eval_sample(generator, feature_utils_orig, dataset, seq_cfg, device, dtype,
         elif audio.dim() == 3 and audio.shape[1] != 1:
             audio = audio.mean(dim=1, keepdim=True)
 
-        peak = audio.abs().max().clamp(min=1e-8)
-        audio = (audio / peak).clamp(-1, 1)
+        target_rms = 10 ** (-20.0 / 20.0)   # -20 dBFS
+        rms = audio.pow(2).mean().sqrt().clamp(min=1e-8)
+        audio = (audio * (target_rms / rms)).clamp(-1, 1)
         return audio.squeeze(0), seq_cfg.sampling_rate   # [1, L]
 
     except Exception as e:
diff --git a/nodes/selva_sampler.py b/nodes/selva_sampler.py
index 249ccc4..45ae038 100644
--- a/nodes/selva_sampler.py
+++ b/nodes/selva_sampler.py
@@ -33,7 +33,11 @@ class SelvaSampler:
             "optional": {
                 "normalize": ("BOOLEAN", {
                     "default": True,
-                    "tooltip": "Peak-normalize output to [-1, 1]. Disable to preserve the raw decoder output level.",
+                    "tooltip": "Normalize output level. Uses RMS normalization to target_lufs rather than peak normalization, so level matches typical audio content.",
+                }),
+                "target_lufs": ("FLOAT", {
+                    "default": -20.0, "min": -40.0, "max": -6.0, "step": 1.0,
+                    "tooltip": "Target RMS level in dBFS when normalize=True. -20 matches typical processed audio. Increase toward -14 for louder output, decrease toward -30 for quieter.",
                 }),
             },
         }
@@ -45,7 +49,7 @@ class SelvaSampler:
     CATEGORY = SELVA_CATEGORY
     DESCRIPTION = "Generates audio from video features using SelVA's flow matching ODE. Supports text prompts and negative prompts via classifier-free guidance."
 
-    def generate(self, model, features, prompt, negative_prompt, duration, steps, cfg_strength, seed, normalize=True):
+    def generate(self, model, features, prompt, negative_prompt, duration, steps, cfg_strength, seed, normalize=True, target_lufs=-20.0):
         import dataclasses
         from selva_core.model.flow_matching import FlowMatching
 
@@ -168,8 +172,9 @@ class SelvaSampler:
             audio = audio.mean(dim=1, keepdim=True)  # stereo → mono
 
         if normalize:
-            peak = audio.abs().max().clamp(min=1e-8)
-            audio = (audio / peak).clamp(-1, 1)
+            target_rms = 10 ** (target_lufs / 20.0)
+            rms = audio.pow(2).mean().sqrt().clamp(min=1e-8)
+            audio = (audio * (target_rms / rms)).clamp(-1, 1)
         print(f"[SelVA] audio: shape={tuple(audio.shape)} sr={sample_rate}", flush=True)
 
         return ({"waveform": audio.cpu(), "sample_rate": sample_rate},)