feat: add post-generation audio enhancement nodes

Three new nodes for post-generation quality improvement:
- SelvaHarmonicExciter: multi-band exciter (HPF → tanh saturation → mix) that restores harmonic richness lost in BigVGAN HF reconstruction
- SelvaFlashSR: audio super-resolution via the FlashSR basic model (haoheliu/versatile_audio_super_resolution; requires `pip install audiosr`); predicts missing HF content above the vocoder reconstruction ceiling
- SelvaOutputNormalizer: BS.1770-4 LUFS normalization + true peak limiting (pyloudnorm) for consistent loudness on generated outputs

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-09 16:27:39 +02:00
parent 45fced55bc
commit ce62bccc1f
2 changed files with 250 additions and 0 deletions
@@ -0,0 +1,247 @@
+"""SelVA Audio Post-Processing nodes.
+
+Post-generation enhancement applied to standard AUDIO outputs:
+  SelvaHarmonicExciter    — multi-band harmonic exciter (HPF → tanh → mix)
+  SelvaFlashSR            — audio super-resolution via FlashSR/AudioSR
+  SelvaOutputNormalizer   — LUFS normalization + true peak limiting
+"""
+
+import tempfile
+from pathlib import Path
+
+import numpy as np
+import torch
+
+from .utils import SELVA_CATEGORY
+
+
+class SelvaHarmonicExciter:
+    """Harmonic exciter for post-generation enhancement.
+
+    Isolates the content above a cutoff frequency with a highpass filter,
+    drives that band through tanh saturation to generate extra harmonics,
+    and adds the result on top of the dry signal. Intended to restore
+    harmonic richness lost during BigVGAN vocoder reconstruction.
+
+    The output limiter only acts on samples whose magnitude exceeds
+    ``_CLIP_KNEE``; everything below that level is left untouched, so the
+    dry signal is not compressed or distorted by the limiter itself.
+    """
+
+    # Magnitude above which the output limiter starts bending the signal.
+    _CLIP_KNEE = 0.9
+
+    @classmethod
+    def INPUT_TYPES(cls):
+        return {
+            "required": {
+                "audio": ("AUDIO",),
+                "cutoff_hz": ("FLOAT", {
+                    "default": 3000.0, "min": 500.0, "max": 16000.0, "step": 100.0,
+                    "tooltip": "Highpass cutoff frequency in Hz. Only content above this is excited. "
+                               "3000 Hz targets the upper harmonics BigVGAN tends to smear.",
+                }),
+                "drive": ("FLOAT", {
+                    "default": 2.0, "min": 1.0, "max": 10.0, "step": 0.5,
+                    "tooltip": "Saturation drive. Higher = more harmonics generated. "
+                               "2-3 is subtle, 5+ is aggressive.",
+                }),
+                "mix": ("FLOAT", {
+                    "default": 0.15, "min": 0.0, "max": 1.0, "step": 0.05,
+                    "tooltip": "Wet/dry blend. 0.1-0.2 is subtle enhancement, "
+                               "0.5+ is aggressive harmonic addition.",
+                }),
+            }
+        }
+
+    RETURN_TYPES  = ("AUDIO",)
+    RETURN_NAMES  = ("audio",)
+    FUNCTION      = "excite"
+    CATEGORY      = SELVA_CATEGORY
+    DESCRIPTION   = (
+        "Multi-band harmonic exciter. Applies tanh saturation to the high-frequency band "
+        "to restore harmonics lost during BigVGAN vocoder reconstruction. "
+        "Uses pedalboard.HighpassFilter for band isolation."
+    )
+
+    @staticmethod
+    def _soft_clip(x, knee):
+        """Limit ``x`` to the open interval (-1, 1) with a tanh soft knee.
+
+        Samples with ``|x| <= knee`` are returned unchanged. Above the knee
+        the excess is squashed with tanh into the remaining headroom, which
+        keeps the transfer curve continuous (value and slope) at the knee.
+        """
+        magnitude = np.abs(x)
+        over = magnitude > knee
+        if not over.any():
+            return x
+        headroom = 1.0 - knee
+        limited = x.copy()
+        limited[over] = np.sign(x[over]) * (
+            knee + headroom * np.tanh((magnitude[over] - knee) / headroom)
+        )
+        return limited
+
+    def excite(self, audio, cutoff_hz: float, drive: float, mix: float):
+        """Add saturated high-band harmonics to every item of ``audio``.
+
+        Args:
+            audio: AUDIO dict with ``"waveform"`` (indexed as batch, then
+                [C, T]) and ``"sample_rate"``.
+            cutoff_hz: Highpass cutoff; only content above it is saturated.
+            drive: Gain applied to the high band before tanh.
+            mix: Amount of the saturated band added to the dry signal.
+
+        Returns:
+            One-element tuple holding an AUDIO dict with the processed
+            waveform (float32, same batch/channel/sample layout) and the
+            unchanged sample rate.
+        """
+        from pedalboard import Pedalboard, HighpassFilter
+
+        sr = audio["sample_rate"]
+        # tanh(drive * x) / drive ~= x for small x, so dividing by the drive
+        # keeps the wet band at roughly unity gain for quiet content and
+        # bounds it to +/- 1/drive for loud content.
+        gain_comp = max(drive, 1.0)
+
+        processed = []
+        for wav in audio["waveform"]:                      # each item: [C, T]
+            # detach/cpu so tensors on an accelerator or with grad still convert
+            wav_np = np.ascontiguousarray(wav.detach().cpu().float().numpy())
+            if wav_np.size == 0:
+                processed.append(wav_np)
+                continue
+
+            # Fresh board per item so no filter state leaks between items.
+            board = Pedalboard([HighpassFilter(cutoff_frequency_hz=cutoff_hz)])
+            hf = board(wav_np, sr)                         # [C, T]
+
+            excited = np.tanh(hf * drive) / gain_comp
+            mixed = wav_np + mix * excited
+
+            # Only peaks above the knee are bent; a plain tanh over the whole
+            # signal would compress and distort the dry path even at mix=0.
+            processed.append(self._soft_clip(mixed, self._CLIP_KNEE))
+
+        if not processed:
+            return (audio,)
+
+        wav_out = torch.from_numpy(np.stack(processed, axis=0))  # [B, C, T]
+        print(
+            f"[HarmonicExciter] cutoff={cutoff_hz}Hz  drive={drive}  mix={mix:.0%}",
+            flush=True,
+        )
+        return ({"waveform": wav_out, "sample_rate": sr},)
+
+
+class SelvaFlashSR:
+    """Audio super-resolution via the ``audiosr`` package
+    (haoheliu/versatile_audio_super_resolution).
+
+    Predicts missing high-frequency content for bandwidth-limited audio.
+    AudioSR generates its output at 48 kHz, so the returned AUDIO is labelled
+    with that rate regardless of the input sample rate.
+    Requires: pip install audiosr
+
+    Uses the 'basic' model checkpoint.
+    """
+
+    # Sample rate of the waveform produced by audiosr.super_resolution.
+    _OUTPUT_SAMPLE_RATE = 48000
+
+    @classmethod
+    def INPUT_TYPES(cls):
+        return {
+            "required": {
+                "audio": ("AUDIO",),
+                "guidance_scale": ("FLOAT", {
+                    "default": 3.5, "min": 1.0, "max": 10.0, "step": 0.5,
+                    "tooltip": "Classifier-free guidance scale. Higher = stronger HF prediction, "
+                               "lower = closer to input. 3.5 is a good default.",
+                }),
+                "ddim_steps": ("INT", {
+                    "default": 50, "min": 10, "max": 200,
+                    "tooltip": "Diffusion steps. 50 is standard quality, 25 for faster preview.",
+                }),
+            }
+        }
+
+    RETURN_TYPES  = ("AUDIO",)
+    RETURN_NAMES  = ("audio",)
+    FUNCTION      = "upsample"
+    CATEGORY      = SELVA_CATEGORY
+    DESCRIPTION   = (
+        "Audio super-resolution using FlashSR (basic model). "
+        "Predicts missing high-frequency content above the vocoder's reconstruction ceiling. "
+        "Requires: pip install audiosr"
+    )
+
+    def upsample(self, audio, guidance_scale: float, ddim_steps: int):
+        """Run AudioSR on the first batch item of ``audio``.
+
+        Args:
+            audio: AUDIO dict with ``"waveform"`` (indexed as batch, then
+                [C, T]) and ``"sample_rate"``.
+            guidance_scale: Classifier-free guidance scale passed to AudioSR.
+            ddim_steps: Number of diffusion steps passed to AudioSR.
+
+        Returns:
+            One-element tuple holding an AUDIO dict with the super-resolved
+            waveform and ``sample_rate`` set to ``_OUTPUT_SAMPLE_RATE``.
+
+        Raises:
+            RuntimeError: if the ``audiosr`` package cannot be imported.
+        """
+        try:
+            import audiosr
+        except ImportError as err:
+            raise RuntimeError(
+                "[FlashSR] audiosr not installed. Run: pip install audiosr"
+            ) from err
+
+        import soundfile as sf
+        import comfy.model_management
+
+        waveform = audio["waveform"]
+        if waveform.shape[0] > 1:
+            # Make the truncation visible instead of dropping items silently.
+            print(
+                f"[FlashSR] Batch of {waveform.shape[0]} received - "
+                "only the first item is processed.",
+                flush=True,
+            )
+        wav    = waveform[0]   # [C, T]
+        sr     = audio["sample_rate"]
+        out_sr = self._OUTPUT_SAMPLE_RATE
+
+        # AudioSR takes a file path as input. Reserve a name, close the handle
+        # (so the file can be reopened by path), and write the audio there.
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+            tmp_in = Path(f.name)
+
+        try:
+            # detach/cpu so tensors on an accelerator or with grad still convert
+            wav_np = wav.detach().cpu().float().numpy()   # [C, T]
+            if wav_np.shape[0] == 1:
+                wav_np = wav_np[0]         # [T] mono for soundfile
+            else:
+                wav_np = wav_np.T          # [T, C]
+            sf.write(str(tmp_in), wav_np, sr)
+
+            # NOTE: the model is rebuilt on every invocation of this node.
+            device = str(comfy.model_management.get_torch_device())
+            model  = audiosr.build_model(model_name="basic", device=device)
+            result = audiosr.super_resolution(
+                model,
+                str(tmp_in),
+                guidance_scale=guidance_scale,
+                ddim_steps=ddim_steps,
+                latent_t_per_second=12.8,
+            )
+
+            # Collapse singleton axes, then make sure a channel axis exists.
+            out_np = np.asarray(result, dtype=np.float32).squeeze()
+            if out_np.ndim == 0:
+                out_np = out_np.reshape(1)
+            if out_np.ndim == 1:
+                out_np = out_np[None, :]                     # [1, T]
+            wav_out = torch.from_numpy(np.ascontiguousarray(out_np)).unsqueeze(0)
+
+            # The model may return more samples than the input duration
+            # covers (presumably padding to its segment length - confirm);
+            # trimming is a no-op when it does not.
+            expected = int(round(wav.shape[-1] * out_sr / sr))
+            if 0 < expected < wav_out.shape[-1]:
+                wav_out = wav_out[..., :expected]
+
+        finally:
+            tmp_in.unlink(missing_ok=True)
+
+        print(f"[FlashSR] Done  guidance={guidance_scale}  steps={ddim_steps}", flush=True)
+        return ({"waveform": wav_out, "sample_rate": out_sr},)
+
+
+class SelvaOutputNormalizer:
+    """Normalize generated audio to a target LUFS level under a true peak ceiling.
+
+    Apply as the final node before saving - brings generated audio to a
+    consistent loudness target regardless of input video loudness variation.
+    Loudness is measured with pyloudnorm (BS.1770-4).
+
+    The peak ceiling is enforced by lowering the gain of the whole clip, not
+    by a dynamic limiter, so a clip that hits the ceiling ends up quieter
+    than the requested target. The log line reports the loudness reached.
+    """
+
+    # Oversampling factor used to estimate inter-sample (true) peaks.
+    _TRUE_PEAK_OVERSAMPLE = 4
+
+    @classmethod
+    def INPUT_TYPES(cls):
+        return {
+            "required": {
+                "audio": ("AUDIO",),
+                "target_lufs": ("FLOAT", {
+                    "default": -14.0, "min": -40.0, "max": -6.0, "step": 0.5,
+                    "tooltip": "Target integrated loudness in LUFS. "
+                               "-14 LUFS for streaming (Spotify/YouTube), "
+                               "-9 to -7 for production masters.",
+                }),
+                "true_peak_dbtp": ("FLOAT", {
+                    "default": -1.0, "min": -6.0, "max": 0.0, "step": 0.5,
+                    "tooltip": "True peak ceiling in dBTP applied after LUFS gain.",
+                }),
+            }
+        }
+
+    RETURN_TYPES  = ("AUDIO",)
+    RETURN_NAMES  = ("audio",)
+    FUNCTION      = "normalize"
+    CATEGORY      = SELVA_CATEGORY
+    DESCRIPTION   = (
+        "Normalize output audio to a target LUFS level (BS.1770-4) with true peak limiting. "
+        "Apply as the last node before saving. Uses pyloudnorm."
+    )
+
+    @classmethod
+    def _true_peak(cls, samples):
+        """Estimate the true peak of ``samples`` ([T] or [T, C]) as a linear value.
+
+        Takes the larger of the plain sample peak and the peak of the signal
+        oversampled along the time axis, which approximates peaks that fall
+        between the original samples.
+        """
+        from scipy.signal import resample_poly
+
+        if samples.size == 0:
+            return 0.0
+        sample_peak = float(np.max(np.abs(samples)))
+        oversampled = resample_poly(samples, cls._TRUE_PEAK_OVERSAMPLE, 1, axis=0)
+        return max(sample_peak, float(np.max(np.abs(oversampled))))
+
+    def _normalize_item(self, wav, meter, target_lufs, true_peak_dbtp):
+        """Return the gain-adjusted ``wav`` ([C, T]), or None to pass it through."""
+        # detach/cpu so tensors on an accelerator or with grad still convert
+        samples = wav.detach().cpu().permute(1, 0).double().numpy()   # [T, C]
+        if samples.shape[1] == 1:
+            samples = samples[:, 0]                                   # [T] mono
+
+        try:
+            loudness = meter.integrated_loudness(samples)
+        except ValueError as err:
+            # pyloudnorm rejects input it cannot measure (e.g. a clip shorter
+            # than its gating block) by raising instead of returning -inf.
+            print(
+                f"[OutputNormalizer] Could not measure loudness ({err}). Passing through.",
+                flush=True,
+            )
+            return None
+
+        if not np.isfinite(loudness):
+            print("[OutputNormalizer] Could not measure loudness — clip too short or silent. Passing through.", flush=True)
+            return None
+
+        gain_db     = float(target_lufs - loudness)
+        gain_linear = 10.0 ** (gain_db / 20.0)
+
+        # Gain is linear, so the post-gain true peak is the measured one scaled.
+        ceiling = 10.0 ** (true_peak_dbtp / 20.0)
+        peak    = self._true_peak(samples) * gain_linear
+        limited = peak > ceiling
+        if limited:
+            gain_linear *= ceiling / peak
+            gain_db = 20.0 * float(np.log10(gain_linear))
+
+        achieved = float(loudness) + gain_db
+        note = "  (gain reduced to respect peak ceiling)" if limited else ""
+        print(
+            f"[OutputNormalizer] {loudness:.1f} LUFS → {achieved:.1f} LUFS "
+            f"(target {target_lufs})  "
+            f"gain={gain_db:+.1f}dB  TP={true_peak_dbtp}dBTP{note}",
+            flush=True,
+        )
+        return wav * float(gain_linear)
+
+    def normalize(self, audio, target_lufs: float, true_peak_dbtp: float):
+        """Bring every item of ``audio`` to ``target_lufs`` under the peak ceiling.
+
+        Args:
+            audio: AUDIO dict with ``"waveform"`` (indexed as batch, then
+                [C, T]) and ``"sample_rate"``.
+            target_lufs: Target integrated loudness in LUFS.
+            true_peak_dbtp: Peak ceiling in dBTP applied after the LUFS gain.
+
+        Returns:
+            One-element tuple holding the normalized AUDIO dict. Items whose
+            loudness cannot be measured are passed through unchanged; if no
+            item could be measured the input dict itself is returned.
+        """
+        import pyloudnorm as pyln
+
+        sr    = audio["sample_rate"]
+        meter = pyln.Meter(sr)
+
+        items   = []
+        changed = False
+        for wav in audio["waveform"]:                  # each item: [C, T]
+            adjusted = self._normalize_item(wav, meter, target_lufs, true_peak_dbtp)
+            if adjusted is None:
+                items.append(wav)
+            else:
+                items.append(adjusted)
+                changed = True
+
+        if not changed:
+            return (audio,)
+
+        return ({"waveform": torch.stack(items, dim=0), "sample_rate": sr},)