Files
ComfyUI-SelVA/nodes/selva_audio_postprocess.py
T
Ethanfel 8ccc2438e4 fix: remove FlashSR (audiosr incompatible with Python 3.12), add training loss CSV
- Drop SelvaFlashSR node — audiosr pins numpy<=1.23.5 which cannot build
  on Python 3.12 (pkgutil.ImpImporter removed); use Saganaki22/ComfyUI-AudioSR instead
- BigVGAN trainer now writes <output_stem>_training_log.csv alongside the
  checkpoint: step, total, fm, mel, stft, phase, l2sp columns, line-buffered
  so loss can be tailed live during training

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-09 17:18:34 +02:00

154 lines
5.5 KiB
Python

"""SelVA Audio Post-Processing nodes.
Post-generation enhancement applied to standard AUDIO outputs:
SelvaHarmonicExciter — multi-band harmonic exciter (HPF → tanh → mix)
SelvaOutputNormalizer — LUFS normalization + true peak limiting
"""
import numpy as np
import torch
from .utils import SELVA_CATEGORY
class SelvaHarmonicExciter:
    """Harmonic exciter node for post-generation enhancement.

    Isolates the content above a cutoff with a highpass filter, saturates
    that band with tanh to generate additional harmonics, then adds the
    result back onto the dry signal.  Intended to restore harmonic richness
    lost during BigVGAN vocoder reconstruction.

    Note: the final mix is passed through tanh as a soft clipper, so the dry
    signal is also gently shaped, even when ``mix`` is 0.
    """

    @classmethod
    def INPUT_TYPES(cls):
        """Declare the node inputs (widget ranges, defaults and tooltips)."""
        return {
            "required": {
                "audio": ("AUDIO",),
                "cutoff_hz": ("FLOAT", {
                    "default": 3000.0, "min": 500.0, "max": 16000.0, "step": 100.0,
                    "tooltip": "Highpass cutoff frequency in Hz. Only content above this is excited. "
                               "3000 Hz targets the upper harmonics BigVGAN tends to smear.",
                }),
                "drive": ("FLOAT", {
                    "default": 2.0, "min": 1.0, "max": 10.0, "step": 0.5,
                    "tooltip": "Saturation drive. Higher = more harmonics generated. "
                               "2-3 is subtle, 5+ is aggressive.",
                }),
                "mix": ("FLOAT", {
                    "default": 0.15, "min": 0.0, "max": 1.0, "step": 0.05,
                    "tooltip": "Wet/dry blend. 0.1-0.2 is subtle enhancement, "
                               "0.5+ is aggressive harmonic addition.",
                }),
            }
        }

    RETURN_TYPES = ("AUDIO",)
    RETURN_NAMES = ("audio",)
    FUNCTION = "excite"
    CATEGORY = SELVA_CATEGORY
    DESCRIPTION = (
        "Multi-band harmonic exciter. Applies tanh saturation to the high-frequency band "
        "to restore harmonics lost during BigVGAN vocoder reconstruction. "
        "Uses pedalboard.HighpassFilter for band isolation."
    )

    def excite(self, audio, cutoff_hz: float, drive: float, mix: float):
        """Apply the exciter to every clip in the batch.

        Args:
            audio: dict with ``"waveform"`` (tensor indexed as [B, C, T]) and
                ``"sample_rate"``.
            cutoff_hz: highpass cutoff; content above it feeds the saturator.
            drive: gain applied before tanh; the saturated band is divided by
                ``max(drive, 1.0)`` afterwards.
            mix: amount of the saturated band added onto the dry signal.

        Returns:
            One-element tuple holding a new audio dict whose waveform is a
            float32 CPU tensor with the same shape as the input.
        """
        # Imported lazily so the module still loads when pedalboard is absent.
        from pedalboard import Pedalboard, HighpassFilter

        sr = audio["sample_rate"]
        # detach + cpu: Tensor.numpy() refuses CUDA or grad-tracking tensors.
        waveform = audio["waveform"].detach().cpu().float()  # [B, C, T]

        # A cutoff at or above Nyquist is not a valid filter design; the
        # widget allows up to 16 kHz, which exceeds Nyquist for low-rate
        # audio.  Clamp just below sr / 2 in that case.
        effective_cutoff = min(float(cutoff_hz), 0.99 * 0.5 * sr)
        if effective_cutoff < cutoff_hz:
            print(
                f"[HarmonicExciter] cutoff {cutoff_hz}Hz exceeds Nyquist for "
                f"sr={sr}; clamped to {effective_cutoff:.0f}Hz",
                flush=True,
            )
        board = Pedalboard([HighpassFilter(cutoff_frequency_hz=effective_cutoff)])

        processed = torch.empty_like(waveform)
        # Process every batch item; previously only item 0 was returned and
        # the rest of the batch was silently dropped.
        for index, clip in enumerate(waveform):
            dry = clip.contiguous().numpy()  # [C, T]

            # Isolate HF band
            hf = board(dry, sr)  # [C, T]

            # Tanh saturation — divide by drive so the wet band stays small
            excited = np.tanh(hf * drive) / max(drive, 1.0)

            # Mix back with dry, then soft clip to prevent going over
            mixed = np.tanh(dry + mix * excited)

            processed[index] = torch.from_numpy(mixed)

        print(
            f"[HarmonicExciter] cutoff={cutoff_hz}Hz drive={drive} mix={mix:.0%}",
            flush=True,
        )
        return ({"waveform": processed, "sample_rate": sr},)
class SelvaOutputNormalizer:
    """Normalize generated audio to a target LUFS level with a peak ceiling.

    Apply as the final node before saving — brings generated audio to a
    consistent loudness target regardless of input video loudness variation.
    Loudness is measured with pyloudnorm (BS.1770-4).

    Note: the ceiling is enforced by measuring the maximum absolute sample
    value after the loudness gain and scaling the whole clip down when it
    exceeds the limit; no oversampling or dynamic limiting is performed.
    """

    @classmethod
    def INPUT_TYPES(cls):
        """Declare the node inputs (widget ranges, defaults and tooltips)."""
        return {
            "required": {
                "audio": ("AUDIO",),
                "target_lufs": ("FLOAT", {
                    "default": -14.0, "min": -40.0, "max": -6.0, "step": 0.5,
                    "tooltip": "Target integrated loudness in LUFS. "
                               "-14 LUFS for streaming (Spotify/YouTube), "
                               "-9 to -7 for production masters.",
                }),
                "true_peak_dbtp": ("FLOAT", {
                    "default": -1.0, "min": -6.0, "max": 0.0, "step": 0.5,
                    "tooltip": "True peak ceiling in dBTP applied after LUFS gain.",
                }),
            }
        }

    RETURN_TYPES = ("AUDIO",)
    RETURN_NAMES = ("audio",)
    FUNCTION = "normalize"
    CATEGORY = SELVA_CATEGORY
    DESCRIPTION = (
        "Normalize output audio to a target LUFS level (BS.1770-4) with true peak limiting. "
        "Apply as the last node before saving. Uses pyloudnorm."
    )

    def normalize(self, audio, target_lufs: float, true_peak_dbtp: float):
        """Bring every clip in the batch to ``target_lufs``.

        Args:
            audio: dict with ``"waveform"`` (tensor indexed as [B, C, T]) and
                ``"sample_rate"``.
            target_lufs: desired integrated loudness.
            true_peak_dbtp: peak ceiling in dB, applied after the loudness
                gain.

        Returns:
            One-element tuple holding the audio dict.  Clips whose loudness
            cannot be measured (too short for the meter, or silent) are
            passed through unchanged; if no clip could be normalized the
            input dict itself is returned.
        """
        # Imported lazily so the module still loads when pyloudnorm is absent.
        import pyloudnorm as pyln

        waveform = audio["waveform"]  # [B, C, T]
        sr = audio["sample_rate"]
        tp_linear = 10.0 ** (true_peak_dbtp / 20.0)
        meter = pyln.Meter(sr)

        results = []
        any_normalized = False
        # Each batch item is measured and normalized independently;
        # previously only item 0 was returned and the rest was dropped.
        for clip in waveform:
            # detach + cpu: Tensor.numpy() refuses CUDA or grad-tracking
            # tensors.  pyloudnorm expects [T, C] (or [T] for mono).
            samples = clip.detach().cpu().permute(1, 0).double().numpy()
            if samples.shape[1] == 1:
                samples = samples[:, 0]  # [T] mono

            try:
                loudness = float(meter.integrated_loudness(samples))
            except ValueError:
                # pyloudnorm raises (rather than returning -inf) when the
                # clip is shorter than its gating block; treat that like an
                # unmeasurable clip so the pass-through below is reached.
                loudness = float("-inf")

            if not np.isfinite(loudness):
                print("[OutputNormalizer] Could not measure loudness — clip too short or silent. Passing through.", flush=True)
                results.append(clip)
                continue

            gain_db = target_lufs - loudness
            gain_linear = 10.0 ** (gain_db / 20.0)
            clip_out = clip * gain_linear

            # Static scale-down if the loudness gain pushed peaks over the ceiling.
            peak = clip_out.abs().max().item()
            if peak > tp_linear:
                clip_out = clip_out * (tp_linear / peak)

            print(
                f"[OutputNormalizer] {loudness:.1f} LUFS → {target_lufs} LUFS "
                f"gain={gain_db:+.1f}dB TP={true_peak_dbtp}dBTP",
                flush=True,
            )
            results.append(clip_out)
            any_normalized = True

        if not any_normalized:
            # Nothing was changed (or the batch was empty): hand back the input.
            return (audio,)

        return ({"waveform": torch.stack(results, dim=0), "sample_rate": sr},)