e5110b88e1
New "auto" option (now the default) on the Sampler's input_sr. detect_input_sr finds the spectral cutoff cliff (steepest drop) and its dB confidence: effective cutoff = that cliff if confident, else sr/2 — one rule that covers band-limited (→ matched input_sr), full-band (→ 24000), and genuine low-rate files (→ their rate). Rounds DOWN to the nearest supported Nyquist to avoid feeding the model an empty band. Logs its decision. Falls back to 24000 when unsure. Tests cover sharp 4/6/8/12 kHz cutoffs, full-band, genuine-8kHz, silence, stereo. Verified end-to-end on the real model (8 kHz clip -> auto picks 16000). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
224 lines
9.7 KiB
Python
224 lines
9.7 KiB
Python
"""ComfyUI-UniverSR nodes.

Two-node design (mirrors the ComfyUI-Flash-AudioSR pattern):
    UniverSRModelLoader  ->  UNIVERSR_MODEL   (loads + caches weights, auto-downloads)
    UniverSRSampler      ->  AUDIO, IMAGE     (runs the super-resolution)
"""
|
|
|
|
import torch
|
|
|
|
from . import universr_wrapper as usr
|
|
|
|
# ComfyUI's model-management module is optional: when it cannot be imported
# the nodes fall back to plain torch device detection.
try:
    import comfy.model_management as mm
except Exception:  # pragma: no cover
    HAS_COMFY = False
else:
    HAS_COMFY = True
|
|
|
|
|
|
def _default_device() -> str:
    """Return ``"cuda"`` or ``"cpu"`` as the device to use for "auto".

    When ComfyUI's model management was imported, its torch device decides
    the answer. If that query raises, or ComfyUI is not available, the result
    is based on ``torch.cuda.is_available()`` instead.
    """
    if HAS_COMFY:
        try:
            comfy_on_gpu = mm.get_torch_device().type == "cuda"
        except Exception:
            # Best effort only: fall through to the plain torch check below.
            pass
        else:
            return "cuda" if comfy_on_gpu else "cpu"

    if torch.cuda.is_available():
        return "cuda"
    return "cpu"
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# Model loader
|
|
# --------------------------------------------------------------------------- #
|
|
class UniverSRModelLoader:
    """Load a UniverSR checkpoint. Auto-downloads the presets on first use.

    Output: UNIVERSR_MODEL -> connect to UniverSR Super-Resolution.
    """

    DESCRIPTION = ("Load UniverSR (vocoder-free audio super-resolution, ICASSP 2026). "
                   "Presets auto-download to models/universr on first use.")
    CATEGORY = "audio/UniverSR"

    @classmethod
    def INPUT_TYPES(cls):
        """Describe the node's inputs.

        The ``model`` dropdown lists the preset names (keys of
        ``usr.HF_REPOS``) followed by whatever ``usr.list_local_models()``
        reports; the first entry is the default.
        """
        choices = list(usr.HF_REPOS.keys()) + usr.list_local_models()
        return {
            "required": {
                "model": (choices, {
                    "default": choices[0],
                    "tooltip": "universr-audio = general (music/SFX/mixed, recommended); "
                               "universr-speech = voice only. Both download (~230 MB) on first use. "
                               "Local checkpoint folders in models/universr also appear here.",
                }),
                "device": (["auto", "cuda", "cpu"], {
                    "default": "auto",
                    "tooltip": "Device to load the model onto.",
                }),
            },
            "optional": {
                "tf32": ("BOOLEAN", {
                    "default": False,
                    "tooltip": "TF32 matmul + conv on Ampere+ GPUs (~1.15x). Tonally neutral in testing "
                               "but not bit-exact; off by default = reference fp32. A/B with a FIXED seed "
                               "(seed!=0) — comparing two seed=0 runs changes the noise, not just TF32.",
                }),
                "compile": ("BOOLEAN", {
                    "default": False,
                    "tooltip": "torch.compile the network (~2x). First run compiles (~10-35s), then fast "
                               "and cached. Needs CUDA. Chunks are auto-padded to a fixed size, so set the "
                               "sampler's chunk_seconds near your typical clip length to avoid wasted compute.",
                }),
                "local_path": ("STRING", {
                    "default": "",
                    "tooltip": "Override: a folder with config.yaml + pytorch_model.bin, "
                               "or a raw .pth/.ckpt file (uses config_path or the bundled config).",
                }),
                "config_path": ("STRING", {
                    "default": "",
                    "tooltip": "config.yaml for a raw checkpoint given in local_path. "
                               "Leave empty to use the bundled default config.",
                }),
            },
        }

    RETURN_TYPES = ("UNIVERSR_MODEL",)
    RETURN_NAMES = ("model",)
    FUNCTION = "load"

    def load(self, model, device, tf32=False, compile=False, local_path="", config_path=""):
        """Load the selected checkpoint and wrap it for the sampler node.

        Args:
            model: Entry chosen from the ``model`` dropdown.
            device: ``"auto"``, ``"cuda"`` or ``"cpu"``. ``"auto"`` is
                resolved with ``_default_device()``; a ``"cuda"`` request
                without CUDA support falls back to ``"cpu"`` with a log line.
            tf32: Forwarded to ``usr.load_model`` as ``tf32``.
            compile: Forwarded to ``usr.load_model`` as ``compile_model``.
            local_path: Forwarded to ``usr.load_model``.
            config_path: Forwarded to ``usr.load_model``.

        Returns:
            A 1-tuple holding a dict with the keys ``"model"``, ``"device"``
            and ``"cache_key"``.
        """
        dev = _default_device() if device == "auto" else device
        if dev == "cuda" and not torch.cuda.is_available():
            print("[UniverSR] CUDA unavailable, falling back to CPU")
            dev = "cpu"
        model_obj, cache_key = usr.load_model(
            model, dev, local_path=local_path, config_path=config_path,
            tf32=tf32, compile_model=compile,
        )
        return ({"model": model_obj, "device": dev, "cache_key": cache_key},)

    @classmethod
    def IS_CHANGED(cls, model, device, tf32=False, compile=False, local_path="", config_path=""):
        """Return a string key that changes whenever any loader setting changes.

        The defaults mirror ``load()`` and the widget defaults in
        ``INPUT_TYPES`` so that an omitted optional input produces the same
        key as its explicit default value.
        """
        return f"{model}:{device}:tf32={tf32}:compile={compile}:{local_path}:{config_path}"
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# Sampler
|
|
# --------------------------------------------------------------------------- #
|
|
class UniverSRSampler:
    """Super-resolve audio to 48 kHz with UniverSR. Long clips are processed in
    overlapping chunks (click-free overlap-add) to stay within VRAM."""

    DESCRIPTION = ("Upscale low-bandwidth audio to 48 kHz with UniverSR. Pick input_sr to "
                   "match the effective bandwidth of your content (the model regenerates "
                   "everything above input_sr/2).")
    CATEGORY = "audio/UniverSR"

    @classmethod
    def INPUT_TYPES(cls):
        """Describe the node's inputs (audio, model, bandwidth and sampling options)."""
        return {
            "required": {
                "audio": ("AUDIO", {}),
                "model": ("UNIVERSR_MODEL", {}),
                "input_sr": (["auto", "8000", "12000", "16000", "24000"], {
                    "default": "auto",
                    "tooltip": "Effective input bandwidth (Hz). Content is treated as valid up to "
                               "input_sr/2 and regenerated above it. 'auto' detects the audio's cutoff "
                               "and picks for you (falls back to 24000 if no clear cutoff). "
                               "8000 = genuine low-rate audio (strongest). 16000 = brighten muffled audio.",
                }),
            },
            "optional": {
                "ode_method": (["midpoint", "euler", "rk4"], {
                    "default": "midpoint",
                    "tooltip": "ODE solver. euler (fastest) -> midpoint (balanced) -> rk4 (best).",
                }),
                "ode_steps": ("INT", {
                    "default": 4, "min": 1, "max": 64, "step": 1,
                    "tooltip": "Flow-matching integration steps. 4 is fast and validated; 4-10 is a good range.",
                }),
                "guidance_scale": ("FLOAT", {
                    "default": 1.5, "min": 0.0, "max": 6.0, "step": 0.25,
                    "tooltip": "Classifier-free guidance. Speech 1.0-1.5, music 1.5-2.0, SFX ~1.5. "
                               "Higher = denser highs but less faithful. 0 disables CFG.",
                }),
                "seed": ("INT", {
                    "default": 0, "min": 0, "max": 0xFFFFFFFFFFFFFFFF,
                    "tooltip": "Noise seed for the flow-matching source. 0 = random each run.",
                }),
                "chunk_seconds": ("FLOAT", {
                    "default": 10.0, "min": 0.0, "max": 120.0, "step": 0.5,
                    "tooltip": "Process long audio in chunks of this length (seconds) to avoid OOM. "
                               "0 = process the whole clip at once.",
                }),
                "overlap_seconds": ("FLOAT", {
                    "default": 0.5, "min": 0.0, "max": 5.0, "step": 0.1,
                    "tooltip": "Crossfade overlap between chunks (seconds). Prevents seam clicks.",
                }),
                "blend": ("FLOAT", {
                    "default": 1.0, "min": 0.0, "max": 1.0, "step": 0.05,
                    "tooltip": "Wet/dry mix. 1.0 = full super-resolution. Lower to keep more of the "
                               "original (useful when brightening already-48 kHz audio).",
                }),
                "unload_model": ("BOOLEAN", {
                    "default": False,
                    "tooltip": "Free the model from VRAM after this run.",
                }),
                "show_spectrogram": ("BOOLEAN", {
                    "default": True,
                    "tooltip": "Also output a before/after spectrogram comparison image.",
                }),
            },
        }

    RETURN_TYPES = ("AUDIO", "IMAGE")
    RETURN_NAMES = ("audio", "spectrogram")
    FUNCTION = "run"

    @classmethod
    def IS_CHANGED(cls, seed=0, **_inputs):
        """Force a re-run when ``seed`` is 0 ("random each run").

        Without this hook an unchanged graph would be served from ComfyUI's
        output cache and the promised fresh noise would never be drawn.
        NaN never compares equal to itself, so returning it marks the node as
        changed on every execution. A fixed seed returns a stable value;
        changes to the other inputs are expected to be tracked by ComfyUI's
        own input signature.
        """
        if not seed:
            return float("nan")
        return int(seed)

    def run(self, audio, model, input_sr, ode_method="midpoint", ode_steps=4,
            guidance_scale=1.5, seed=0, chunk_seconds=10.0, overlap_seconds=0.5,
            blend=1.0, unload_model=False, show_spectrogram=True):
        """Super-resolve ``audio`` and optionally render a comparison image.

        Args:
            audio: ComfyUI AUDIO input, converted with
                ``usr.comfy_audio_to_tensor``.
            model: Dict produced by the loader node; the ``"model"`` and
                ``"cache_key"`` entries are used here.
            input_sr: ``"auto"`` (resolved via ``usr.detect_input_sr``) or a
                value convertible with ``int()``.
            ode_method, ode_steps, guidance_scale, seed, chunk_seconds,
            overlap_seconds, blend: Forwarded to ``usr.super_resolve``.
            unload_model: When true, evict the model from the wrapper's cache
                after the run.
            show_spectrogram: When true, build the before/after image;
                otherwise a zero tensor of shape (1, 64, 64, 3) is returned
                as a placeholder.

        Returns:
            ``(audio_out, spec)`` matching ``RETURN_TYPES``.
        """
        model_obj = model["model"]
        waveform, sr = usr.comfy_audio_to_tensor(audio)
        # max(sr, 1) guards the log-only duration against a zero sample rate.
        dur = waveform.shape[-1] / max(sr, 1)

        # Resolve auto bandwidth detection to a concrete input_sr.
        if str(input_sr) == "auto":
            isr, info = usr.detect_input_sr(waveform, sr)
            print(f"[UniverSR] auto: {info['reason']} -> input_sr={isr}")
        else:
            isr = int(input_sr)

        print(f"[UniverSR] {tuple(waveform.shape)} @ {sr} Hz ({dur:.2f}s) -> 48 kHz | "
              f"input_sr={isr}, {ode_method}/{ode_steps}, cfg={guidance_scale}, blend={blend}")

        out, dry48 = usr.super_resolve(
            model_obj, waveform, sr, isr,
            ode_method=ode_method, ode_steps=int(ode_steps), guidance_scale=guidance_scale,
            seed=int(seed), chunk_seconds=float(chunk_seconds),
            overlap_seconds=float(overlap_seconds), blend=float(blend),
        )

        audio_out = usr.tensor_to_comfy_audio(out, usr.TARGET_SR)

        spec = torch.zeros(1, 64, 64, 3)
        if show_spectrogram:
            # .numpy() only works on CPU tensors that do not require grad;
            # detach().cpu() is a no-op when that already holds.
            in_mono = dry48[0].mean(0).detach().cpu().numpy()
            out_mono = out[0].mean(0).detach().cpu().numpy()
            spec = usr.make_spectrogram_image(in_mono, out_mono, isr)

        if unload_model:
            usr.evict_model(model["cache_key"])

        print(f"[UniverSR] Done -> {out.shape[-1] / usr.TARGET_SR:.2f}s at 48 kHz")
        return (audio_out, spec)
|
|
|
|
|
|
# Node registry: each entry pairs a node class with its display title.
# Both mappings are keyed by the class name.
_NODES = (
    (UniverSRModelLoader, "UniverSR Model Loader"),
    (UniverSRSampler, "UniverSR Super-Resolution"),
)

NODE_CLASS_MAPPINGS = {node.__name__: node for node, _title in _NODES}
NODE_DISPLAY_NAME_MAPPINGS = {node.__name__: title for node, title in _NODES}
|