diff --git a/nodes/__init__.py b/nodes/__init__.py index 62514c2..d945d92 100644 --- a/nodes/__init__.py +++ b/nodes/__init__.py @@ -30,7 +30,6 @@ _NODES = { "SelvaDatasetItemExtractor": (".selva_dataset_pipeline", "SelvaDatasetItemExtractor", "SelVA Dataset Item Extractor"), "SelvaDatasetSaver": (".selva_dataset_pipeline", "SelvaDatasetSaver", "SelVA Dataset Saver"), "SelvaHarmonicExciter": (".selva_audio_postprocess", "SelvaHarmonicExciter", "SelVA Harmonic Exciter"), - "SelvaFlashSR": (".selva_audio_postprocess", "SelvaFlashSR", "SelVA FlashSR"), "SelvaOutputNormalizer": (".selva_audio_postprocess", "SelvaOutputNormalizer", "SelVA Output Normalizer"), } diff --git a/nodes/selva_audio_postprocess.py b/nodes/selva_audio_postprocess.py index bddfccd..7a67355 100644 --- a/nodes/selva_audio_postprocess.py +++ b/nodes/selva_audio_postprocess.py @@ -2,13 +2,9 @@ Post-generation enhancement applied to standard AUDIO outputs: SelvaHarmonicExciter — multi-band harmonic exciter (HPF → tanh → mix) - SelvaFlashSR — audio super-resolution via FlashSR/AudioSR SelvaOutputNormalizer — LUFS normalization + true peak limiting """ -import tempfile -from pathlib import Path - import numpy as np import torch @@ -85,92 +81,6 @@ class SelvaHarmonicExciter: return ({"waveform": wav_out, "sample_rate": sr},) -class SelvaFlashSR: - """Audio super-resolution via FlashSR (haoheliu/versatile_audio_super_resolution). - - Upsamples bandwidth-limited audio to full 44.1 kHz by predicting missing - high-frequency content. Requires: pip install audiosr - - FlashSR uses the 'basic' model — 22x faster than full AudioSR with - comparable quality for vocoder output enhancement. - """ - - @classmethod - def INPUT_TYPES(cls): - return { - "required": { - "audio": ("AUDIO",), - "guidance_scale": ("FLOAT", { - "default": 3.5, "min": 1.0, "max": 10.0, "step": 0.5, - "tooltip": "Classifier-free guidance scale. Higher = stronger HF prediction, " - "lower = closer to input. 3.5 is a good default.", - }), - "ddim_steps": ("INT", { - "default": 50, "min": 10, "max": 200, - "tooltip": "Diffusion steps. 50 is standard quality, 25 for faster preview.", - }), - } - } - - RETURN_TYPES = ("AUDIO",) - RETURN_NAMES = ("audio",) - FUNCTION = "upsample" - CATEGORY = SELVA_CATEGORY - DESCRIPTION = ( - "Audio super-resolution using FlashSR (basic model). " - "Predicts missing high-frequency content above the vocoder's reconstruction ceiling. " - "Requires: pip install audiosr" - ) - - def upsample(self, audio, guidance_scale: float, ddim_steps: int): - try: - import audiosr - except ImportError: - raise RuntimeError( - "[FlashSR] audiosr not installed. Run: pip install audiosr" - ) - - import soundfile as sf - import comfy.model_management - - wav = audio["waveform"][0] # [C, T] - sr = audio["sample_rate"] - - # AudioSR works on files — write to temp, process, read back - with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f: - tmp_in = Path(f.name) - - try: - wav_np = wav.float().numpy() # [C, T] - if wav_np.shape[0] == 1: - wav_np = wav_np[0] # [T] mono for soundfile - else: - wav_np = wav_np.T # [T, C] - sf.write(str(tmp_in), wav_np, sr) - - model = audiosr.build_model(model_name="basic", device="auto") - result = audiosr.super_resolution( - model, - str(tmp_in), - guidance_scale=guidance_scale, - ddim_steps=ddim_steps, - latent_t_per_second=12.8, - ) - - # result is numpy [1, T] at 44100 Hz - out_np = np.array(result).squeeze() # [T] - out_sr = 44100 - wav_out = torch.from_numpy(out_np).float() - if wav_out.dim() == 1: - wav_out = wav_out.unsqueeze(0) # [1, T] - wav_out = wav_out.unsqueeze(0) # [1, 1, T] - - finally: - tmp_in.unlink(missing_ok=True) - - print(f"[FlashSR] Done guidance={guidance_scale} steps={ddim_steps}", flush=True) - return ({"waveform": wav_out, "sample_rate": out_sr},) - class SelvaOutputNormalizer: """Normalize generated audio to a target LUFS level with true peak limiting. diff --git a/nodes/selva_bigvgan_trainer.py b/nodes/selva_bigvgan_trainer.py index e7a06dd..a424c98 100644 --- a/nodes/selva_bigvgan_trainer.py +++ b/nodes/selva_bigvgan_trainer.py @@ -757,6 +757,10 @@ def _do_train(vocoder, mel_converter, clips, optimizer = torch.optim.AdamW(trainable_params, lr=lr, betas=(0.8, 0.99)) vocoder.train() + log_path = out_path.parent / f"{out_path.stem}_training_log.csv" + log_file = open(log_path, "w", buffering=1) # line-buffered + log_file.write("step,total_loss,fm_loss,mel_loss,stft_loss,phase_loss,l2sp_loss\n") + try: for step in range(steps): # Sample random batch — clips are CPU floats, move to device @@ -842,6 +846,13 @@ def _do_train(vocoder, mel_converter, clips, l2sp_str = f" l2sp={l2sp_loss.item():.4e}" if lambda_l2sp > 0 else "" print(f"[BigVGAN] {step+1}/{steps} {loss_desc}" f" total={loss.item():.4f}{l2sp_str}", flush=True) + # CSV row + _fm = fm_loss.item() if mpd is not None else "" + _mel = mel_loss.item() + _stft = stft_loss.item() if mpd is None else "" + _phase = phase_loss.item() if lambda_phase > 0.0 else "" + _l2sp = l2sp_loss.item() + log_file.write(f"{step+1},{loss.item():.6f},{_fm},{_mel},{_stft},{_phase},{_l2sp}\n") if (step + 1) % save_every == 0 and (step + 1) < steps: step_path = out_path.parent / f"{out_path.stem}_step{step+1}{out_path.suffix}" @@ -856,6 +867,7 @@ def _do_train(vocoder, mel_converter, clips, vocoder.train() finally: + log_file.close() vocoder.requires_grad_(False) vocoder.eval() if strategy == "offload_to_cpu":