ce62bccc1f
Three new nodes for post-generation quality improvement: - SelvaHarmonicExciter: multi-band exciter (HPF → tanh saturation → mix) restores harmonic richness lost in BigVGAN HF reconstruction - SelvaFlashSR: audio super-resolution via FlashSR basic model (haoheliu/versatile_audio_super_resolution, requires pip install audiosr) predicts missing HF content above vocoder reconstruction ceiling - SelvaOutputNormalizer: BS.1770-4 LUFS normalization + true peak limiting for consistent loudness on generated outputs (pyloudnorm) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
248 lines
8.9 KiB
Python
248 lines
8.9 KiB
Python
"""SelVA Audio Post-Processing nodes.
|
|
|
|
Post-generation enhancement applied to standard AUDIO outputs:
|
|
SelvaHarmonicExciter — multi-band harmonic exciter (HPF → tanh → mix)
|
|
SelvaFlashSR — audio super-resolution via FlashSR/AudioSR
|
|
SelvaOutputNormalizer — LUFS normalization + true peak limiting
|
|
"""
|
|
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
import numpy as np
|
|
import torch
|
|
|
|
from .utils import SELVA_CATEGORY
|
|
|
|
|
|
class SelvaHarmonicExciter:
    """Multi-band harmonic exciter for post-generation enhancement.

    Isolates the content above a highpass cutoff, drives it through a tanh
    saturator to synthesize additional (odd-order) harmonics, and adds the
    result to the dry signal. Intended to restore harmonic richness lost
    during BigVGAN vocoder reconstruction.

    Only the first item of the incoming AUDIO batch is processed; the output
    always has a batch dimension of 1.
    """

    @classmethod
    def INPUT_TYPES(cls):
        """Describe the node inputs (audio plus cutoff / drive / mix controls)."""
        return {
            "required": {
                "audio": ("AUDIO",),
                "cutoff_hz": ("FLOAT", {
                    "default": 3000.0, "min": 500.0, "max": 16000.0, "step": 100.0,
                    "tooltip": "Highpass cutoff frequency in Hz. Only content above this is excited. "
                               "3000 Hz targets the upper harmonics BigVGAN tends to smear.",
                }),
                "drive": ("FLOAT", {
                    "default": 2.0, "min": 1.0, "max": 10.0, "step": 0.5,
                    "tooltip": "Saturation drive. Higher = more harmonics generated. "
                               "2-3 is subtle, 5+ is aggressive.",
                }),
                "mix": ("FLOAT", {
                    "default": 0.15, "min": 0.0, "max": 1.0, "step": 0.05,
                    "tooltip": "Wet/dry blend. 0.1-0.2 is subtle enhancement, "
                               "0.5+ is aggressive harmonic addition.",
                }),
            }
        }

    RETURN_TYPES = ("AUDIO",)
    RETURN_NAMES = ("audio",)
    FUNCTION = "excite"
    CATEGORY = SELVA_CATEGORY
    DESCRIPTION = (
        "Multi-band harmonic exciter. Applies tanh saturation to the high-frequency band "
        "to restore harmonics lost during BigVGAN vocoder reconstruction. "
        "Uses pedalboard.HighpassFilter for band isolation."
    )

    def excite(self, audio, cutoff_hz: float, drive: float, mix: float):
        """Add saturated high-band content to the first waveform in ``audio``.

        Args:
            audio: dict with a ``"waveform"`` tensor (indexed as
                ``[batch][channel, time]``) and an integer ``"sample_rate"``.
            cutoff_hz: highpass cutoff; only content above it is saturated.
            drive: pre-gain applied before the tanh saturator.
            mix: amount of saturated signal added to the dry signal.

        Returns:
            A 1-tuple holding a new audio dict with a ``[1, C, T]`` waveform
            at the input sample rate.

        Raises:
            RuntimeError: if the ``pedalboard`` package is not installed.
        """
        # Optional dependency: import lazily and fail with an actionable hint,
        # the same way the FlashSR node reports a missing audiosr.
        try:
            from pedalboard import HighpassFilter, Pedalboard
        except ImportError as err:
            raise RuntimeError(
                "[HarmonicExciter] pedalboard not installed. Run: pip install pedalboard"
            ) from err

        wav = audio["waveform"][0]  # [C, T] — first batch item only
        sr = audio["sample_rate"]

        # .numpy() only works on CPU tensors that do not require grad, so
        # detach and move to host memory before converting.
        wav_np = wav.detach().cpu().float().numpy()  # [C, T]

        # Isolate the band that will be excited.
        board = Pedalboard([HighpassFilter(cutoff_frequency_hz=cutoff_hz)])
        hf = board(wav_np, sr)  # [C, T]

        # tanh already bounds the signal to [-1, 1]; dividing by the drive
        # undoes the pre-gain so quiet content passes at roughly unity level
        # (tanh(x * d) / d ~= x for small x) and only louder content saturates.
        excited = np.tanh(hf * drive) / max(drive, 1.0)

        # Add the generated harmonics on top of the untouched dry signal.
        mixed = wav_np + mix * excited

        # Soft clip the sum so it can never exceed full scale.
        # NOTE(review): this is applied unconditionally, so the dry signal is
        # also gently compressed even at mix=0 — confirm this is intended.
        mixed = np.tanh(mixed)

        wav_out = torch.from_numpy(mixed).unsqueeze(0)  # [1, C, T]
        print(
            f"[HarmonicExciter] cutoff={cutoff_hz}Hz  drive={drive}  mix={mix:.0%}",
            flush=True,
        )
        return ({"waveform": wav_out, "sample_rate": sr},)
|
|
|
|
|
|
class SelvaFlashSR:
    """Audio super-resolution via FlashSR (haoheliu/versatile_audio_super_resolution).

    Runs the AudioSR 'basic' model on the input to predict missing
    high-frequency content. AudioSR produces 48 kHz audio, so the output
    sample rate is 48000 regardless of the input rate.
    Requires: pip install audiosr

    Only the first item of the incoming AUDIO batch is processed.
    """

    @classmethod
    def INPUT_TYPES(cls):
        """Describe the node inputs (audio plus diffusion sampling controls)."""
        return {
            "required": {
                "audio": ("AUDIO",),
                "guidance_scale": ("FLOAT", {
                    "default": 3.5, "min": 1.0, "max": 10.0, "step": 0.5,
                    "tooltip": "Classifier-free guidance scale. Higher = stronger HF prediction, "
                               "lower = closer to input. 3.5 is a good default.",
                }),
                "ddim_steps": ("INT", {
                    "default": 50, "min": 10, "max": 200,
                    "tooltip": "Diffusion steps. 50 is standard quality, 25 for faster preview.",
                }),
            }
        }

    RETURN_TYPES = ("AUDIO",)
    RETURN_NAMES = ("audio",)
    FUNCTION = "upsample"
    CATEGORY = SELVA_CATEGORY
    DESCRIPTION = (
        "Audio super-resolution using FlashSR (basic model). "
        "Predicts missing high-frequency content above the vocoder's reconstruction ceiling. "
        "Requires: pip install audiosr"
    )

    def upsample(self, audio, guidance_scale: float, ddim_steps: int):
        """Run AudioSR on the first waveform in ``audio``.

        Args:
            audio: dict with a ``"waveform"`` tensor (indexed as
                ``[batch][channel, time]``) and an integer ``"sample_rate"``.
            guidance_scale: classifier-free guidance scale passed to AudioSR.
            ddim_steps: number of diffusion sampling steps passed to AudioSR.

        Returns:
            A 1-tuple holding a new audio dict whose waveform has a leading
            batch dimension of 1 and whose sample rate is 48000.

        Raises:
            RuntimeError: if the ``audiosr`` package is not installed.
        """
        try:
            import audiosr
        except ImportError as err:
            # Chain the original error so the real import failure stays visible.
            raise RuntimeError(
                "[FlashSR] audiosr not installed. Run: pip install audiosr"
            ) from err

        import soundfile as sf
        import comfy.model_management

        # AudioSR generates audio at 48 kHz; labelling it 44.1 kHz would make
        # the result play back slow and pitched down.
        output_sample_rate = 48000

        wav = audio["waveform"][0]  # [C, T] — first batch item only
        sr = audio["sample_rate"]

        # AudioSR reads its input from a file path, so stage the waveform in a
        # temporary WAV. delete=False lets the file be reopened by name after
        # this handle is closed; it is removed in the finally block below.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            tmp_in = Path(f.name)

        try:
            # .numpy() requires a CPU tensor that does not track gradients.
            wav_np = wav.detach().cpu().float().numpy()  # [C, T]
            if wav_np.shape[0] == 1:
                wav_np = wav_np[0]  # [T] mono for soundfile
            else:
                wav_np = wav_np.T  # [T, C] — soundfile expects frames first
            sf.write(str(tmp_in), wav_np, sr)

            device = str(comfy.model_management.get_torch_device())
            # NOTE(review): the model is rebuilt on every call; consider
            # caching it if this node is executed repeatedly.
            model = audiosr.build_model(model_name="basic", device=device)
            result = audiosr.super_resolution(
                model,
                str(tmp_in),
                guidance_scale=guidance_scale,
                ddim_steps=ddim_steps,
                latent_t_per_second=12.8,
            )

            # Drop singleton batch/channel axes, then rebuild the layout the
            # AUDIO type uses: [batch, channel, time].
            out_np = np.array(result).squeeze()
            wav_out = torch.from_numpy(out_np).float()
            if wav_out.dim() == 1:
                wav_out = wav_out.unsqueeze(0)  # [1, T]
            wav_out = wav_out.unsqueeze(0)  # [1, C, T]
        finally:
            # Always remove the staged input, even when inference fails.
            tmp_in.unlink(missing_ok=True)

        print(f"[FlashSR] Done  guidance={guidance_scale}  steps={ddim_steps}", flush=True)
        return ({"waveform": wav_out, "sample_rate": output_sample_rate},)
|
|
|
|
|
|
class SelvaOutputNormalizer:
    """Normalize generated audio to a target LUFS level with a peak ceiling.

    Apply as the final node before saving — brings generated audio to a
    consistent loudness target regardless of input video loudness variation.
    Loudness is measured with pyloudnorm (BS.1770-4).

    The peak ceiling is enforced by scaling the whole signal down when its
    largest absolute sample exceeds the ceiling; it is a sample-peak check,
    not an oversampled true-peak measurement. When that scaling kicks in the
    final loudness ends up below ``target_lufs``.

    Only the first item of the incoming AUDIO batch is processed.
    """

    @classmethod
    def INPUT_TYPES(cls):
        """Describe the node inputs (audio plus loudness / peak targets)."""
        return {
            "required": {
                "audio": ("AUDIO",),
                "target_lufs": ("FLOAT", {
                    "default": -14.0, "min": -40.0, "max": -6.0, "step": 0.5,
                    "tooltip": "Target integrated loudness in LUFS. "
                               "-14 LUFS for streaming (Spotify/YouTube), "
                               "-9 to -7 for production masters.",
                }),
                "true_peak_dbtp": ("FLOAT", {
                    "default": -1.0, "min": -6.0, "max": 0.0, "step": 0.5,
                    "tooltip": "True peak ceiling in dBTP applied after LUFS gain.",
                }),
            }
        }

    RETURN_TYPES = ("AUDIO",)
    RETURN_NAMES = ("audio",)
    FUNCTION = "normalize"
    CATEGORY = SELVA_CATEGORY
    DESCRIPTION = (
        "Normalize output audio to a target LUFS level (BS.1770-4) with true peak limiting. "
        "Apply as the last node before saving. Uses pyloudnorm."
    )

    def normalize(self, audio, target_lufs: float, true_peak_dbtp: float):
        """Apply loudness gain and a peak ceiling to the first waveform in ``audio``.

        Args:
            audio: dict with a ``"waveform"`` tensor (indexed as
                ``[batch][channel, time]``) and an integer ``"sample_rate"``.
            target_lufs: desired integrated loudness in LUFS.
            true_peak_dbtp: peak ceiling in dB relative to full scale.

        Returns:
            A 1-tuple holding the normalized audio dict (``[1, C, T]``
            waveform, same sample rate). If loudness cannot be measured, the
            input ``audio`` dict is returned unchanged.

        Raises:
            RuntimeError: if the ``pyloudnorm`` package is not installed.
        """
        # Optional dependency: import lazily and fail with an actionable hint,
        # the same way the FlashSR node reports a missing audiosr.
        try:
            import pyloudnorm as pyln
        except ImportError as err:
            raise RuntimeError(
                "[OutputNormalizer] pyloudnorm not installed. Run: pip install pyloudnorm"
            ) from err

        wav = audio["waveform"][0]  # [C, T] — first batch item only
        sr = audio["sample_rate"]

        tp_linear = 10.0 ** (true_peak_dbtp / 20.0)

        # pyloudnorm wants float samples laid out as [T, C] (or [T] for mono).
        # .numpy() requires a CPU tensor that does not track gradients.
        wav_np = wav.detach().cpu().permute(1, 0).double().numpy()  # [T, C]
        if wav_np.shape[1] == 1:
            wav_np = wav_np[:, 0]  # [T] mono

        meter = pyln.Meter(sr)
        try:
            loudness = meter.integrated_loudness(wav_np)
        except ValueError as err:
            # pyloudnorm rejects input it cannot meter (e.g. a clip shorter
            # than one gating block) by raising instead of returning -inf.
            # Honour the documented pass-through behaviour rather than crash.
            print(
                f"[OutputNormalizer] Could not measure loudness ({err}). Passing through.",
                flush=True,
            )
            return (audio,)

        if not np.isfinite(loudness):
            print("[OutputNormalizer] Could not measure loudness — clip too short or silent. Passing through.", flush=True)
            return (audio,)

        gain_db = target_lufs - loudness
        gain_linear = float(10.0 ** (gain_db / 20.0))

        wav_out = wav * gain_linear

        # Scale the whole signal down if the loudness gain pushed the largest
        # sample above the ceiling (linear scaling, so no added distortion).
        peak = wav_out.abs().max().item()
        if peak > tp_linear:
            wav_out = wav_out * (tp_linear / peak)

        print(
            f"[OutputNormalizer] {loudness:.1f} LUFS → {target_lufs} LUFS  "
            f"gain={gain_db:+.1f}dB  TP={true_peak_dbtp}dBTP",
            flush=True,
        )
        return ({"waveform": wav_out.unsqueeze(0), "sample_rate": sr},)
|