fix: remove FlashSR (audiosr incompatible with Python 3.12), add training loss CSV

- Drop SelvaFlashSR node — audiosr pins numpy<=1.23.5, which cannot build on Python 3.12 (pkgutil.ImpImporter was removed); use Saganaki22/ComfyUI-AudioSR instead
- BigVGAN trainer now writes <output_stem>_training_log.csv alongside the checkpoint, with columns step, total, fm, mel, stft, phase, l2sp; the file is line-buffered so the loss can be tailed live during training

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-09 17:18:34 +02:00
parent 8371466e44
commit 8ccc2438e4
3 changed files with 12 additions and 91 deletions
@@ -2,13 +2,9 @@

 Post-generation enhancement applied to standard AUDIO outputs:
  SelvaHarmonicExciter    — multi-band harmonic exciter (HPF → tanh → mix)
-  SelvaFlashSR            — audio super-resolution via FlashSR/AudioSR
  SelvaOutputNormalizer   — LUFS normalization + true peak limiting
 """

-import tempfile
-from pathlib import Path
-
 import numpy as np
 import torch

@@ -85,92 +81,6 @@ class SelvaHarmonicExciter:
        return ({"waveform": wav_out, "sample_rate": sr},)


-class SelvaFlashSR:
-    """Audio super-resolution via FlashSR (haoheliu/versatile_audio_super_resolution).
-
-    Upsamples bandwidth-limited audio to full 44.1 kHz by predicting missing
-    high-frequency content. Requires: pip install audiosr
-
-    FlashSR uses the 'basic' model — 22x faster than full AudioSR with
-    comparable quality for vocoder output enhancement.
-    """
-
-    @classmethod
-    def INPUT_TYPES(cls):
-        return {
-            "required": {
-                "audio": ("AUDIO",),
-                "guidance_scale": ("FLOAT", {
-                    "default": 3.5, "min": 1.0, "max": 10.0, "step": 0.5,
-                    "tooltip": "Classifier-free guidance scale. Higher = stronger HF prediction, "
-                               "lower = closer to input. 3.5 is a good default.",
-                }),
-                "ddim_steps": ("INT", {
-                    "default": 50, "min": 10, "max": 200,
-                    "tooltip": "Diffusion steps. 50 is standard quality, 25 for faster preview.",
-                }),
-            }
-        }
-
-    RETURN_TYPES  = ("AUDIO",)
-    RETURN_NAMES  = ("audio",)
-    FUNCTION      = "upsample"
-    CATEGORY      = SELVA_CATEGORY
-    DESCRIPTION   = (
-        "Audio super-resolution using FlashSR (basic model). "
-        "Predicts missing high-frequency content above the vocoder's reconstruction ceiling. "
-        "Requires: pip install audiosr"
-    )
-
-    def upsample(self, audio, guidance_scale: float, ddim_steps: int):
-        try:
-            import audiosr
-        except ImportError:
-            raise RuntimeError(
-                "[FlashSR] audiosr not installed. Run: pip install audiosr"
-            )
-
-        import soundfile as sf
-        import comfy.model_management
-
-        wav = audio["waveform"][0]   # [C, T]
-        sr  = audio["sample_rate"]
-
-        # AudioSR works on files — write to temp, process, read back
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
-            tmp_in = Path(f.name)
-
-        try:
-            wav_np = wav.float().numpy()   # [C, T]
-            if wav_np.shape[0] == 1:
-                wav_np = wav_np[0]         # [T] mono for soundfile
-            else:
-                wav_np = wav_np.T          # [T, C]
-            sf.write(str(tmp_in), wav_np, sr)
-
-            model  = audiosr.build_model(model_name="basic", device="auto")
-            result = audiosr.super_resolution(
-                model,
-                str(tmp_in),
-                guidance_scale=guidance_scale,
-                ddim_steps=ddim_steps,
-                latent_t_per_second=12.8,
-            )
-
-            # result is numpy [1, T] at 44100 Hz
-            out_np  = np.array(result).squeeze()             # [T]
-            out_sr  = 44100
-            wav_out = torch.from_numpy(out_np).float()
-            if wav_out.dim() == 1:
-                wav_out = wav_out.unsqueeze(0)               # [1, T]
-            wav_out = wav_out.unsqueeze(0)                   # [1, 1, T]
-
-        finally:
-            tmp_in.unlink(missing_ok=True)
-
-        print(f"[FlashSR] Done  guidance={guidance_scale}  steps={ddim_steps}", flush=True)
-        return ({"waveform": wav_out, "sample_rate": out_sr},)
-

 class SelvaOutputNormalizer:
    """Normalize generated audio to a target LUFS level with true peak limiting.