fix: remove FlashSR (audiosr incompatible with Python 3.12), add training loss CSV
- Drop the SelvaFlashSR node: audiosr pins numpy<=1.23.5, which cannot be built on Python 3.12 (pkgutil.ImpImporter was removed); use Saganaki22/ComfyUI-AudioSR instead.
- The BigVGAN trainer now writes <output_stem>_training_log.csv alongside the checkpoint, with step, total, fm, mel, stft, phase, and l2sp columns; the file is line-buffered so the loss can be tailed live during training.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -30,7 +30,6 @@ _NODES = {
|
|||||||
"SelvaDatasetItemExtractor": (".selva_dataset_pipeline", "SelvaDatasetItemExtractor", "SelVA Dataset Item Extractor"),
|
"SelvaDatasetItemExtractor": (".selva_dataset_pipeline", "SelvaDatasetItemExtractor", "SelVA Dataset Item Extractor"),
|
||||||
"SelvaDatasetSaver": (".selva_dataset_pipeline", "SelvaDatasetSaver", "SelVA Dataset Saver"),
|
"SelvaDatasetSaver": (".selva_dataset_pipeline", "SelvaDatasetSaver", "SelVA Dataset Saver"),
|
||||||
"SelvaHarmonicExciter": (".selva_audio_postprocess", "SelvaHarmonicExciter", "SelVA Harmonic Exciter"),
|
"SelvaHarmonicExciter": (".selva_audio_postprocess", "SelvaHarmonicExciter", "SelVA Harmonic Exciter"),
|
||||||
"SelvaFlashSR": (".selva_audio_postprocess", "SelvaFlashSR", "SelVA FlashSR"),
|
|
||||||
"SelvaOutputNormalizer": (".selva_audio_postprocess", "SelvaOutputNormalizer", "SelVA Output Normalizer"),
|
"SelvaOutputNormalizer": (".selva_audio_postprocess", "SelvaOutputNormalizer", "SelVA Output Normalizer"),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -2,13 +2,9 @@
|
|||||||
|
|
||||||
Post-generation enhancement applied to standard AUDIO outputs:
|
Post-generation enhancement applied to standard AUDIO outputs:
|
||||||
SelvaHarmonicExciter — multi-band harmonic exciter (HPF → tanh → mix)
|
SelvaHarmonicExciter — multi-band harmonic exciter (HPF → tanh → mix)
|
||||||
SelvaFlashSR — audio super-resolution via FlashSR/AudioSR
|
|
||||||
SelvaOutputNormalizer — LUFS normalization + true peak limiting
|
SelvaOutputNormalizer — LUFS normalization + true peak limiting
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import tempfile
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
@@ -85,92 +81,6 @@ class SelvaHarmonicExciter:
|
|||||||
return ({"waveform": wav_out, "sample_rate": sr},)
|
return ({"waveform": wav_out, "sample_rate": sr},)
|
||||||
|
|
||||||
|
|
||||||
class SelvaFlashSR:
    """Audio super-resolution via FlashSR (haoheliu/versatile_audio_super_resolution).

    Upsamples bandwidth-limited audio to full 44.1 kHz by predicting missing
    high-frequency content. Requires: pip install audiosr

    FlashSR uses the 'basic' model — 22x faster than full AudioSR with
    comparable quality for vocoder output enhancement.
    """

    @classmethod
    def INPUT_TYPES(cls):
        """Declare the node inputs: an AUDIO dict plus two diffusion settings."""
        return {
            "required": {
                "audio": ("AUDIO",),
                "guidance_scale": ("FLOAT", {
                    "default": 3.5, "min": 1.0, "max": 10.0, "step": 0.5,
                    "tooltip": "Classifier-free guidance scale. Higher = stronger HF prediction, "
                               "lower = closer to input. 3.5 is a good default.",
                }),
                "ddim_steps": ("INT", {
                    "default": 50, "min": 10, "max": 200,
                    "tooltip": "Diffusion steps. 50 is standard quality, 25 for faster preview.",
                }),
            }
        }

    RETURN_TYPES = ("AUDIO",)
    RETURN_NAMES = ("audio",)
    FUNCTION = "upsample"
    CATEGORY = SELVA_CATEGORY
    DESCRIPTION = (
        "Audio super-resolution using FlashSR (basic model). "
        "Predicts missing high-frequency content above the vocoder's reconstruction ceiling. "
        "Requires: pip install audiosr"
    )

    def upsample(self, audio, guidance_scale: float, ddim_steps: int):
        """Run audiosr super-resolution on the first item of an AUDIO batch.

        Args:
            audio: dict with "waveform" (indexed with [0] to obtain a [C, T]
                tensor) and "sample_rate".
            guidance_scale: forwarded to ``audiosr.super_resolution``.
            ddim_steps: forwarded to ``audiosr.super_resolution``.

        Returns:
            A 1-tuple holding an AUDIO dict with the processed waveform and
            the output sample rate.

        Raises:
            RuntimeError: if the ``audiosr`` package cannot be imported.
        """
        try:
            import audiosr
        except ImportError as err:
            # Chain the original error so the real import failure stays visible.
            raise RuntimeError(
                "[FlashSR] audiosr not installed. Run: pip install audiosr"
            ) from err

        # Imported locally so the node does not depend on module-level imports.
        import tempfile
        from pathlib import Path

        import soundfile as sf

        wav = audio["waveform"][0]          # [C, T]
        sr = audio["sample_rate"]

        # AudioSR works on files — write to temp, process, read back.
        # delete=False: only the path is needed here; the file is removed in
        # the ``finally`` below once processing has finished.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            tmp_in = Path(f.name)

        try:
            # .numpy() only works on CPU tensors that do not require grad, so
            # detach and move to host memory before converting.
            wav_np = wav.detach().cpu().float().numpy()   # [C, T]
            if wav_np.shape[0] == 1:
                wav_np = wav_np[0]          # [T] mono for soundfile
            else:
                wav_np = wav_np.T           # [T, C]
            sf.write(str(tmp_in), wav_np, sr)

            model = audiosr.build_model(model_name="basic", device="auto")
            result = audiosr.super_resolution(
                model,
                str(tmp_in),
                guidance_scale=guidance_scale,
                ddim_steps=ddim_steps,
                latent_t_per_second=12.8,
            )

            # Original author's note: result is numpy [1, T] at 44100 Hz.
            # NOTE(review): upstream AudioSR documents 48 kHz output — confirm
            # that 44100 is the correct rate to report here.
            out_np = np.array(result).squeeze()   # [T]
            out_sr = 44100
            wav_out = torch.from_numpy(out_np).float()
            if wav_out.dim() == 1:
                wav_out = wav_out.unsqueeze(0)    # [1, T]
            wav_out = wav_out.unsqueeze(0)        # [1, 1, T]

        finally:
            tmp_in.unlink(missing_ok=True)

        print(f"[FlashSR] Done  guidance={guidance_scale}  steps={ddim_steps}", flush=True)
        return ({"waveform": wav_out, "sample_rate": out_sr},)
|
|
||||||
|
|
||||||
|
|
||||||
class SelvaOutputNormalizer:
|
class SelvaOutputNormalizer:
|
||||||
"""Normalize generated audio to a target LUFS level with true peak limiting.
|
"""Normalize generated audio to a target LUFS level with true peak limiting.
|
||||||
|
|||||||
@@ -757,6 +757,10 @@ def _do_train(vocoder, mel_converter, clips,
|
|||||||
optimizer = torch.optim.AdamW(trainable_params, lr=lr, betas=(0.8, 0.99))
|
optimizer = torch.optim.AdamW(trainable_params, lr=lr, betas=(0.8, 0.99))
|
||||||
vocoder.train()
|
vocoder.train()
|
||||||
|
|
||||||
|
log_path = out_path.parent / f"{out_path.stem}_training_log.csv"
|
||||||
|
log_file = open(log_path, "w", buffering=1) # line-buffered
|
||||||
|
log_file.write("step,total_loss,fm_loss,mel_loss,stft_loss,phase_loss,l2sp_loss\n")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
for step in range(steps):
|
for step in range(steps):
|
||||||
# Sample random batch — clips are CPU floats, move to device
|
# Sample random batch — clips are CPU floats, move to device
|
||||||
@@ -842,6 +846,13 @@ def _do_train(vocoder, mel_converter, clips,
|
|||||||
l2sp_str = f" l2sp={l2sp_loss.item():.4e}" if lambda_l2sp > 0 else ""
|
l2sp_str = f" l2sp={l2sp_loss.item():.4e}" if lambda_l2sp > 0 else ""
|
||||||
print(f"[BigVGAN] {step+1}/{steps} {loss_desc}"
|
print(f"[BigVGAN] {step+1}/{steps} {loss_desc}"
|
||||||
f" total={loss.item():.4f}{l2sp_str}", flush=True)
|
f" total={loss.item():.4f}{l2sp_str}", flush=True)
|
||||||
|
# CSV row
|
||||||
|
_fm = fm_loss.item() if mpd is not None else ""
|
||||||
|
_mel = mel_loss.item()
|
||||||
|
_stft = stft_loss.item() if mpd is None else ""
|
||||||
|
_phase = phase_loss.item() if lambda_phase > 0.0 else ""
|
||||||
|
_l2sp = l2sp_loss.item()
|
||||||
|
log_file.write(f"{step+1},{loss.item():.6f},{_fm},{_mel},{_stft},{_phase},{_l2sp}\n")
|
||||||
|
|
||||||
if (step + 1) % save_every == 0 and (step + 1) < steps:
|
if (step + 1) % save_every == 0 and (step + 1) < steps:
|
||||||
step_path = out_path.parent / f"{out_path.stem}_step{step+1}{out_path.suffix}"
|
step_path = out_path.parent / f"{out_path.stem}_step{step+1}{out_path.suffix}"
|
||||||
@@ -856,6 +867,7 @@ def _do_train(vocoder, mel_converter, clips,
|
|||||||
vocoder.train()
|
vocoder.train()
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
|
log_file.close()
|
||||||
vocoder.requires_grad_(False)
|
vocoder.requires_grad_(False)
|
||||||
vocoder.eval()
|
vocoder.eval()
|
||||||
if strategy == "offload_to_cpu":
|
if strategy == "offload_to_cpu":
|
||||||
|
|||||||
Reference in New Issue
Block a user