fix: prevent saturation from RMS normalization clipping peaks

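RMS-normalize the audio to the target level, then, if the resulting peak exceeds 1.0, scale the whole signal down by that peak so it peaks at exactly 1.0. This preserves dynamics instead of hard-clipping transients, at the cost of the output RMS ending up below the target in that case. The eval sample target is also updated from -20 dBFS to -23 dBFS to match the training data. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>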
2026-04-08 12:29:29 +02:00
parent 78e9838a83
commit 8717af2728
2 changed files with 11 additions and 3 deletions
@@ -134,9 +134,12 @@ def _eval_sample(generator, feature_utils_orig, dataset, seq_cfg, device, dtype,
        elif audio.dim() == 3 and audio.shape[1] != 1:
            audio = audio.mean(dim=1, keepdim=True)

-        target_rms = 10 ** (-20.0 / 20.0)   # -20 dBFS
+        target_rms = 10 ** (-23.0 / 20.0)   # -23 dBFS matches training data
        rms = audio.pow(2).mean().sqrt().clamp(min=1e-8)
-        audio = (audio * (target_rms / rms)).clamp(-1, 1)
+        audio = audio * (target_rms / rms)
+        peak = audio.abs().max().clamp(min=1e-8)
+        if peak > 1.0:
+            audio = audio / peak
        return audio.squeeze(0), seq_cfg.sampling_rate   # [1, L]

    except Exception as e:
@@ -174,7 +174,12 @@ class SelvaSampler:
        if normalize:
            target_rms = 10 ** (target_lufs / 20.0)
            rms = audio.pow(2).mean().sqrt().clamp(min=1e-8)
-            audio = (audio * (target_rms / rms)).clamp(-1, 1)
+            audio = audio * (target_rms / rms)
+            # If RMS normalization pushes peaks into clipping, scale back to
+            # preserve dynamics rather than hard-clipping (no saturation)
+            peak = audio.abs().max().clamp(min=1e-8)
+            if peak > 1.0:
+                audio = audio / peak
        print(f"[SelVA] audio: shape={tuple(audio.shape)} sr={sample_rate}", flush=True)

        return ({"waveform": audio.cpu(), "sample_rate": sample_rate},)