fix: remove FlashSR (audiosr incompatible with Python 3.12), add training loss CSV

- Drop SelvaFlashSR node — audiosr pins numpy<=1.23.5 which cannot build
  on Python 3.12 (pkgutil.ImpImporter removed); use Saganaki22/ComfyUI-AudioSR instead
- BigVGAN trainer now writes <output_stem>_training_log.csv alongside the
  checkpoint: step, total, fm, mel, stft, phase, l2sp columns, line-buffered
  so loss can be tailed live during training

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-09 17:18:34 +02:00
parent 8371466e44
commit 8ccc2438e4
3 changed files with 12 additions and 91 deletions
-90
View File
@@ -2,13 +2,9 @@
Post-generation enhancement applied to standard AUDIO outputs:
SelvaHarmonicExciter — multi-band harmonic exciter (HPF → tanh → mix)
SelvaFlashSR — audio super-resolution via FlashSR/AudioSR
SelvaOutputNormalizer — LUFS normalization + true peak limiting
"""
import tempfile
from pathlib import Path
import numpy as np
import torch
@@ -85,92 +81,6 @@ class SelvaHarmonicExciter:
return ({"waveform": wav_out, "sample_rate": sr},)
class SelvaFlashSR:
    """Audio super-resolution node backed by the ``audiosr`` package.

    Runs the AudioSR "basic" checkpoint
    (haoheliu/versatile_audio_super_resolution) over an AUDIO input to
    predict the high-frequency content that a bandwidth-limited source is
    missing. Requires: pip install audiosr

    Only the first item of a batched AUDIO input is processed, and the
    model is rebuilt on every call.
    """

    # AudioSR renders its output at 48 kHz regardless of the input rate.
    # NOTE(review): confirm against the installed audiosr version.
    _OUTPUT_SAMPLE_RATE = 48000

    @classmethod
    def INPUT_TYPES(cls):
        """Declare the node inputs: the audio plus two sampling controls."""
        return {
            "required": {
                "audio": ("AUDIO",),
                "guidance_scale": ("FLOAT", {
                    "default": 3.5, "min": 1.0, "max": 10.0, "step": 0.5,
                    "tooltip": "Classifier-free guidance scale. Higher = stronger HF prediction, "
                               "lower = closer to input. 3.5 is a good default.",
                }),
                "ddim_steps": ("INT", {
                    "default": 50, "min": 10, "max": 200,
                    "tooltip": "Diffusion steps. 50 is standard quality, 25 for faster preview.",
                }),
            }
        }

    RETURN_TYPES = ("AUDIO",)
    RETURN_NAMES = ("audio",)
    FUNCTION = "upsample"
    CATEGORY = SELVA_CATEGORY
    DESCRIPTION = (
        "Audio super-resolution using FlashSR (basic model). "
        "Predicts missing high-frequency content above the vocoder's reconstruction ceiling. "
        "Requires: pip install audiosr"
    )

    def upsample(self, audio, guidance_scale: float, ddim_steps: int):
        """Super-resolve the first batch item of ``audio`` with AudioSR.

        Args:
            audio: AUDIO dict with a ``"waveform"`` tensor (indexed here as
                ``[batch][C, T]``) and an integer ``"sample_rate"``.
            guidance_scale: Classifier-free guidance scale passed to AudioSR.
            ddim_steps: Number of diffusion sampling steps.

        Returns:
            A 1-tuple holding an AUDIO dict with the super-resolved waveform
            and the AudioSR output sample rate.

        Raises:
            RuntimeError: If the ``audiosr`` package is not installed.
        """
        try:
            import audiosr
        except ImportError as err:
            # Chain the cause so the original import failure stays visible.
            raise RuntimeError(
                "[FlashSR] audiosr not installed. Run: pip install audiosr"
            ) from err
        import soundfile as sf

        wav = audio["waveform"][0]  # [C, T]
        sr = audio["sample_rate"]

        # AudioSR works on files — write to temp, process, read back.
        # delete=False so the handle can be closed before soundfile reopens
        # the path by name; the file is removed in the finally below.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            tmp_in = Path(f.name)

        try:
            # detach + cpu: Tensor.numpy() rejects CUDA tensors and tensors
            # that require grad.
            wav_np = wav.detach().cpu().float().numpy()  # [C, T]
            if wav_np.shape[0] == 1:
                wav_np = wav_np[0]  # [T] mono for soundfile
            else:
                wav_np = wav_np.T  # [T, C]
            sf.write(str(tmp_in), wav_np, sr)

            model = audiosr.build_model(model_name="basic", device="auto")
            result = audiosr.super_resolution(
                model,
                str(tmp_in),
                guidance_scale=guidance_scale,
                ddim_steps=ddim_steps,
                latent_t_per_second=12.8,
            )

            # Drop singleton axes from the model output, then rebuild the
            # [batch, channels, T] layout used for AUDIO waveforms.
            out_np = np.array(result).squeeze()
            out_sr = self._OUTPUT_SAMPLE_RATE
            wav_out = torch.from_numpy(out_np).float()
            if wav_out.dim() == 1:
                wav_out = wav_out.unsqueeze(0)  # [1, T]
            wav_out = wav_out.unsqueeze(0)  # [1, 1, T]
        finally:
            tmp_in.unlink(missing_ok=True)

        print(f"[FlashSR] Done guidance={guidance_scale} steps={ddim_steps}", flush=True)
        return ({"waveform": wav_out, "sample_rate": out_sr},)
class SelvaOutputNormalizer:
"""Normalize generated audio to a target LUFS level with true peak limiting.