Files
Ethanfel 26295e4db7 feat: auto-discover user presets from the presets folder
Drop any audio file (wav/flac/mp3/ogg/m4a) into the presets cache dir and
it will appear as "<name> (local)" in the Voice Preset dropdown on next
ComfyUI restart. Add a same-stem .txt file for the transcript.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-06 09:37:17 +02:00

136 lines
7.6 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import os
import urllib.request
import numpy as np
import torch
import soundfile as sf
# Resolve the directory where preset audio is cached and where user-supplied
# presets are looked up.
try:
    # `folder_paths` is only importable when running inside ComfyUI; in that
    # case the presets live under the configured models directory.
    import folder_paths

    _CACHE_DIR = os.path.join(
        folder_paths.models_dir,
        "omnivoice",
        "presets",
    )
except ImportError:
    # Outside ComfyUI: fall back to a per-user cache directory.
    _CACHE_DIR = os.path.join(
        os.path.expanduser("~"),
        ".cache",
        "omnivoice",
        "presets",
    )
# Built-in voice presets, keyed by the display name shown to the user.
# Each entry: (url, transcript)
#   url        -- remote location of the reference audio clip
#   transcript -- the words spoken in that clip
# The "(Chatterbox)" / "(F5-TTS)" suffix in each name matches the project
# whose sample hosting the URL points at.
PRESETS = {
    # ── Female ──────────────────────────────────────────────────────────────
    "Shadowheart female, expressive (Chatterbox)": (
        "https://storage.googleapis.com/chatterbox-demo-samples/prompts/female_shadowheart4.flac",
        "That place in the distance, it's huge and dedicated to Lady Shar. It can only mean one thing. I have a hidden place close to the cloister where night orchids bloom.",
    ),
    "American actress female, theatrical (Chatterbox)": (
        "https://storage.googleapis.com/chatterbox-demo-samples/prompts/female_american.flac",
        "In this piece, we're actually playing husband and wife.",
    ),
    "Podcast host female, casual (Chatterbox)": (
        "https://storage.googleapis.com/chatterbox-demo-samples/prompts/female_random_podcast.wav",
        "Really is, like, here is the context in which Sinead made those comments. And then also it, like, contextualizes in, like, present day Ireland. And you're like, oh, maybe Sinead was on to something. And we punished her for telling the truth. But also musicians, you know, like I was like, oh, yeah, musicians used to like any time musicians do politics, it's very dicey for them. I was like, the chicks, we were like, don't talk bad about the Iraq war. And then it turns out they were 100 percent on to something.",
    ),
    # ── Male ────────────────────────────────────────────────────────────────
    "Nature male, warm (F5-TTS)": (
        "https://raw.githubusercontent.com/SWivid/F5-TTS/main/src/f5_tts/infer/examples/basic/basic_ref_en.wav",
        "Some call me nature, others call me mother nature.",
    ),
    "Old Hollywood male, classic (Chatterbox)": (
        "https://storage.googleapis.com/chatterbox-demo-samples/prompts/male_old_movie.flac",
        "Yes, and you're right. Of course he will be. You'll see me in my hotel office before you leave this evening, Sid. Yes, sir. Now, Miss Gardner, let's get you the best room we have in this hotel.",
    ),
    "Rick Sanchez male, casual (Chatterbox)": (
        "https://storage.googleapis.com/chatterbox-demo-samples/prompts/male_rickmorty.mp3",
        "You know, Grandpa goes around and he does his business in public because Grandpa isn't shady. Any of your, uh, scientists working on anything new? Why was Knight Rider called Knight Rider? I don't want to overstep my bounds or anything. It's your house. It's your world. You're a real Julius Caesar.",
    ),
    "Stewie Griffin male, precise (Chatterbox)": (
        "https://storage.googleapis.com/chatterbox-demo-samples/prompts/male_stewie.mp3",
        "Hope I'm not bothering you. Just doing some stretching. Maybe a few poses. You'll tell me if I'm bothering you, right? I've even been singled out a few times. Probably because it's mostly pregnant women in the group. Still, Brody must see something. Although I certainly don't. But then again, I'm not the instructor, am I? Oh, yuck!",
    ),
    "Harvey Keitel male, intense (Chatterbox)": (
        "https://storage.googleapis.com/chatterbox-demo-samples/prompts/male_harvey_keitel.mp3",
        "And if self-preservation is an instinct you possess, you better fucking do it and do it quick. So if a cop stops us and starts sticking his big snout in the car, the subterfuge won't last. But at a glance, the car will appear to be normal. We need to camouflage the interior of the car. We're gonna line the front seat and the back seat and the floorboards with quilts and blankets. So pretty please, with sugar on top. Clean the fucking car.",
    ),
    "Conan O'Brien male, comedy (Chatterbox)": (
        "https://storage.googleapis.com/chatterbox-demo-samples/prompts/male_conan.mp3",
        "This review, the hosts quibbled over whether it's Live Free and Die Hard or Live Free or Die Hard, never even mentioning the film itself. Tangled mess. It went on for 10 minutes and then this reviewer's review was read. Good space work. And this is all really for the fake podcaster from the fake slate review while miming swiping up on his screen.",
    ),
}
# Lower-case file extensions (including the dot) that count as preset audio.
# NOTE(review): files are decoded with soundfile further down; confirm the
# installed backend can actually read every format listed here.
_AUDIO_EXTS = {
    ".wav",
    ".flac",
    ".mp3",
    ".ogg",
    ".m4a",
}

# Base names of the built-in preset URLs (query string dropped), used to tell
# cached built-in downloads apart from files the user added.
_BUILTIN_FILES = frozenset(
    os.path.basename(preset_url.partition("?")[0])
    for preset_url, _transcript in PRESETS.values()
)
def _scan_user_presets():
    """Return a dict of user presets found in ``_CACHE_DIR``.

    Every regular file in the directory whose extension (case-insensitive)
    is in ``_AUDIO_EXTS`` and whose name is not in ``_BUILTIN_FILES`` becomes
    a preset keyed ``"<stem> (local)"``. The transcript is read from a
    same-stem ``.txt`` file when one exists; otherwise it is ``""``.

    Returns:
        dict mapping preset name to ``(audio_path, transcript)``, inserted in
        sorted file-name order. Empty when ``_CACHE_DIR`` does not exist.
    """
    user = {}
    if not os.path.isdir(_CACHE_DIR):
        return user
    for fname in sorted(os.listdir(_CACHE_DIR)):
        stem, ext = os.path.splitext(fname)
        if ext.lower() not in _AUDIO_EXTS or fname in _BUILTIN_FILES:
            continue
        audio_path = os.path.join(_CACHE_DIR, fname)
        # A sub-directory that happens to be called e.g. "foo.wav" is not a
        # preset; listing it would only fail later when it is loaded.
        if not os.path.isfile(audio_path):
            continue
        txt_path = os.path.join(_CACHE_DIR, stem + ".txt")
        transcript = ""
        if os.path.isfile(txt_path):
            # "utf-8-sig" reads plain UTF-8 and also drops a leading BOM,
            # which str.strip() would otherwise leave in the transcript.
            with open(txt_path, "r", encoding="utf-8-sig") as f:
                transcript = f.read().strip()
        user[f"{stem} (local)"] = (audio_path, transcript)
    return user
def _load_audio(source):
    """Load audio from a URL (downloading once) or a local file path.

    Args:
        source: An ``http://`` / ``https://`` URL, or a path to a local
            audio file. URLs are cached in ``_CACHE_DIR`` under the URL's
            base name (query string dropped) and reused on later calls.

    Returns:
        ``(waveform, sr)`` where ``waveform`` is a float32 tensor shaped
        ``(1, channels, samples)`` and ``sr`` is the sample rate reported by
        ``soundfile``.

    Raises:
        Whatever ``urllib.request.urlretrieve`` or ``soundfile.read`` raise
        on download / decode failure; a failed download leaves no file in
        the cache.
    """
    os.makedirs(_CACHE_DIR, exist_ok=True)
    if source.startswith(("http://", "https://")):
        filename = os.path.basename(source.split("?")[0])
        cache_path = os.path.join(_CACHE_DIR, filename)
        if not os.path.exists(cache_path):
            # Download next to the final location and move it into place
            # only once complete, so an interrupted transfer can never be
            # mistaken for a valid cached file on the next call.
            tmp_path = cache_path + ".part"
            try:
                urllib.request.urlretrieve(source, tmp_path)
                os.replace(tmp_path, cache_path)
            finally:
                # Nothing to remove after a successful move; otherwise this
                # discards the partial download.
                try:
                    os.remove(tmp_path)
                except FileNotFoundError:
                    pass
        path = cache_path
    else:
        path = source
    audio_np, sr = sf.read(path, dtype="float32")
    if audio_np.ndim == 1:
        audio_np = audio_np[np.newaxis, :]  # (1, samples)
    else:
        audio_np = audio_np.T  # (channels, samples)
    waveform = torch.from_numpy(audio_np).unsqueeze(0)  # (1, channels, samples)
    return waveform, sr
class OmniVoiceVoicePreset:
    """Node that turns a named voice preset into reference audio and text.

    The selectable presets are the built-in ``PRESETS`` plus whatever
    ``_scan_user_presets()`` finds, with user entries taking precedence on a
    name clash.
    """

    # Outputs: the loaded audio and its transcript.
    RETURN_TYPES = ("AUDIO", "STRING")
    RETURN_NAMES = ("ref_audio", "ref_text")
    # Name of the method that produces the outputs.
    FUNCTION = "load_preset"
    CATEGORY = "OmniVoice"

    @classmethod
    def INPUT_TYPES(cls):
        """Describe the single required input: a choice among preset names."""
        catalog = dict(PRESETS)
        catalog.update(_scan_user_presets())
        preset_names = list(catalog.keys())
        tooltip_text = (
            "Pre-fetched reference voice for OmniVoice Generate.\n"
            "Connect ref_audio → ref_audio and ref_text → ref_text.\n"
            "\n"
            "To add your own presets, drop audio files into:\n"
            f" {_CACHE_DIR}\n"
            "Add a same-name .txt file alongside for the transcript.\n"
            "Restart ComfyUI to pick up new files."
        )
        preset_spec = (preset_names, {"tooltip": tooltip_text})
        return {"required": {"preset": preset_spec}}

    def load_preset(self, preset):
        """Load the chosen preset.

        Args:
            preset: A key of the merged built-in + user preset mapping.

        Returns:
            A 2-tuple ``(audio, transcript)`` where ``audio`` is a dict with
            keys ``"waveform"`` and ``"sample_rate"``.

        Raises:
            KeyError: if ``preset`` is not a known preset name.
        """
        # Re-scan on every call so the lookup matches the current folder
        # contents rather than a snapshot.
        catalog = dict(PRESETS)
        catalog.update(_scan_user_presets())
        source, transcript = catalog[preset]
        waveform, sr = _load_audio(source)
        audio = {"waveform": waveform, "sample_rate": sr}
        return (audio, transcript)