fix: replace peak normalization with RMS normalization at -20 dBFS
Peak normalization scaled every output to full scale regardless of content level, making generated audio several times louder than the training clips. RMS normalization to -20 dBFS matches the level of typical processed audio. The sampler exposes target_lufs (-40 to -6, default -20) for user control; despite its name, the value is applied as an RMS target in dBFS, not as a true LUFS loudness measurement. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -134,8 +134,9 @@ def _eval_sample(generator, feature_utils_orig, dataset, seq_cfg, device, dtype,
|
|||||||
elif audio.dim() == 3 and audio.shape[1] != 1:
|
elif audio.dim() == 3 and audio.shape[1] != 1:
|
||||||
audio = audio.mean(dim=1, keepdim=True)
|
audio = audio.mean(dim=1, keepdim=True)
|
||||||
|
|
||||||
peak = audio.abs().max().clamp(min=1e-8)
|
target_rms = 10 ** (-20.0 / 20.0) # -20 dBFS
|
||||||
audio = (audio / peak).clamp(-1, 1)
|
rms = audio.pow(2).mean().sqrt().clamp(min=1e-8)
|
||||||
|
audio = (audio * (target_rms / rms)).clamp(-1, 1)
|
||||||
return audio.squeeze(0), seq_cfg.sampling_rate # [1, L]
|
return audio.squeeze(0), seq_cfg.sampling_rate # [1, L]
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
@@ -33,7 +33,11 @@ class SelvaSampler:
|
|||||||
"optional": {
|
"optional": {
|
||||||
"normalize": ("BOOLEAN", {
|
"normalize": ("BOOLEAN", {
|
||||||
"default": True,
|
"default": True,
|
||||||
"tooltip": "Peak-normalize output to [-1, 1]. Disable to preserve the raw decoder output level.",
|
"tooltip": "Normalize output level. Uses RMS normalization to target_lufs rather than peak normalization, so level matches typical audio content.",
|
||||||
|
}),
|
||||||
|
"target_lufs": ("FLOAT", {
|
||||||
|
"default": -20.0, "min": -40.0, "max": -6.0, "step": 1.0,
|
||||||
|
"tooltip": "Target RMS level in dBFS when normalize=True. -20 matches typical processed audio. Increase toward -14 for louder output, decrease toward -30 for quieter.",
|
||||||
}),
|
}),
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
@@ -45,7 +49,7 @@ class SelvaSampler:
|
|||||||
CATEGORY = SELVA_CATEGORY
|
CATEGORY = SELVA_CATEGORY
|
||||||
DESCRIPTION = "Generates audio from video features using SelVA's flow matching ODE. Supports text prompts and negative prompts via classifier-free guidance."
|
DESCRIPTION = "Generates audio from video features using SelVA's flow matching ODE. Supports text prompts and negative prompts via classifier-free guidance."
|
||||||
|
|
||||||
def generate(self, model, features, prompt, negative_prompt, duration, steps, cfg_strength, seed, normalize=True):
|
def generate(self, model, features, prompt, negative_prompt, duration, steps, cfg_strength, seed, normalize=True, target_lufs=-20.0):
|
||||||
import dataclasses
|
import dataclasses
|
||||||
from selva_core.model.flow_matching import FlowMatching
|
from selva_core.model.flow_matching import FlowMatching
|
||||||
|
|
||||||
@@ -168,8 +172,9 @@ class SelvaSampler:
|
|||||||
audio = audio.mean(dim=1, keepdim=True) # stereo → mono
|
audio = audio.mean(dim=1, keepdim=True) # stereo → mono
|
||||||
|
|
||||||
if normalize:
|
if normalize:
|
||||||
peak = audio.abs().max().clamp(min=1e-8)
|
target_rms = 10 ** (target_lufs / 20.0)
|
||||||
audio = (audio / peak).clamp(-1, 1)
|
rms = audio.pow(2).mean().sqrt().clamp(min=1e-8)
|
||||||
|
audio = (audio * (target_rms / rms)).clamp(-1, 1)
|
||||||
print(f"[SelVA] audio: shape={tuple(audio.shape)} sr={sample_rate}", flush=True)
|
print(f"[SelVA] audio: shape={tuple(audio.shape)} sr={sample_rate}", flush=True)
|
||||||
|
|
||||||
return ({"waveform": audio.cpu(), "sample_rate": sample_rate},)
|
return ({"waveform": audio.cpu(), "sample_rate": sample_rate},)
|
||||||
|
|||||||
Reference in New Issue
Block a user