From 8de201a4c964017cbba7005732b80b3491a2ee60 Mon Sep 17 00:00:00 2001
From: Ethanfel
Date: Sun, 5 Apr 2026 18:19:29 +0200
Subject: [PATCH] Add OmniVoice Voice Preset node with two female voice samples
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two built-in presets, auto-downloaded and cached to
ComfyUI/models/omnivoice/presets/:
- "Nature – female, warm" (F5-TTS basic_ref_en.wav, transcript included)
- "Shadowheart – female, expressive" (Chatterbox demo, connect Whisper for transcript)

Outputs ref_audio (AUDIO) and ref_text (STRING) — wire directly into
OmniVoice Generate. Updated default workflow to use this node.

Preset downloads are written to a ".part" file and renamed into place, so an
interrupted fetch cannot leave a truncated file in the cache.

Co-Authored-By: Claude Sonnet 4.6
---
Review note: nodes/voice_presets.py was revised by hand after this patch was
generated (atomic download with a timeout, corrected PRESETS comment,
docstrings). The hunk header and diffstat reflect the revised file; the
abbreviated blob id on its "index" line is from the pre-review version.

 __init__.py                  |   4 +-
 nodes/__init__.py            |   3 +-
 nodes/voice_presets.py       | 100 +++++++++++++++++++++++++++++++
 workflows/voice_cloning.json |  77 +++++++++++------------------
 4 files changed, 127 insertions(+), 57 deletions(-)
 create mode 100644 nodes/voice_presets.py

diff --git a/__init__.py b/__init__.py
index 1474cdd..85962c1 100644
--- a/__init__.py
+++ b/__init__.py
@@ -1,15 +1,17 @@
-from .nodes import OmniVoiceModelLoader, OmniVoiceGenerate, OmniVoiceEpubLoader
+from .nodes import OmniVoiceModelLoader, OmniVoiceGenerate, OmniVoiceEpubLoader, OmniVoiceVoicePreset
 
 NODE_CLASS_MAPPINGS = {
     "OmniVoiceModelLoader": OmniVoiceModelLoader,
     "OmniVoiceGenerate": OmniVoiceGenerate,
     "OmniVoiceEpubLoader": OmniVoiceEpubLoader,
+    "OmniVoiceVoicePreset": OmniVoiceVoicePreset,
 }
 
 NODE_DISPLAY_NAME_MAPPINGS = {
     "OmniVoiceModelLoader": "OmniVoice Model Loader",
     "OmniVoiceGenerate": "OmniVoice Generate",
     "OmniVoiceEpubLoader": "OmniVoice EPUB Loader",
+    "OmniVoiceVoicePreset": "OmniVoice Voice Preset",
 }
 
 __all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS"]
diff --git a/nodes/__init__.py b/nodes/__init__.py
index fcc139b..0fb698f 100644
--- a/nodes/__init__.py
+++ b/nodes/__init__.py
@@ -1,5 +1,6 @@
 from .loader import OmniVoiceModelLoader
 from .generator import OmniVoiceGenerate
 from .epub_loader import OmniVoiceEpubLoader
+from .voice_presets import OmniVoiceVoicePreset
 
-__all__ = ["OmniVoiceModelLoader", "OmniVoiceGenerate", "OmniVoiceEpubLoader"]
+__all__ = ["OmniVoiceModelLoader", "OmniVoiceGenerate", "OmniVoiceEpubLoader", "OmniVoiceVoicePreset"]
diff --git a/nodes/voice_presets.py b/nodes/voice_presets.py
new file mode 100644
index 0000000..d97cfb2
--- /dev/null
+++ b/nodes/voice_presets.py
@@ -0,0 +1,100 @@
+import os
+import shutil
+import urllib.request
+
+import numpy as np
+import soundfile as sf
+import torch
+
+try:
+    import folder_paths
+    _CACHE_DIR = os.path.join(folder_paths.models_dir, "omnivoice", "presets")
+except ImportError:
+    _CACHE_DIR = os.path.join(os.path.expanduser("~"), ".cache", "omnivoice", "presets")
+
+# Seconds a stalled connection may block before a preset download is abandoned.
+_DOWNLOAD_TIMEOUT = 30
+
+# Maps display_name -> (url, transcript).
+# transcript="" means unknown — connect a Whisper node to ref_text to fill it.
+PRESETS = {
+    "Nature – female, warm (F5-TTS ref)": (
+        "https://raw.githubusercontent.com/SWivid/F5-TTS/main/src/f5_tts/infer/examples/basic/basic_ref_en.wav",
+        "Some call me nature, others call me mother nature.",
+    ),
+    "Shadowheart – female, expressive (Chatterbox ref)": (
+        "https://storage.googleapis.com/chatterbox-demo-samples/prompts/female_shadowheart4.flac",
+        "",  # transcript unknown — connect Whisper node to ref_text
+    ),
+}
+
+
+def _download(url, dest):
+    """Fetch url into dest without ever exposing a partially written file.
+
+    The payload goes to a ".part" file next to dest and is renamed into
+    place only once the transfer has finished, so an interrupted download
+    cannot leave a truncated file that later runs would treat as cached.
+    """
+    tmp_path = dest + ".part"
+    try:
+        with urllib.request.urlopen(url, timeout=_DOWNLOAD_TIMEOUT) as resp:
+            with open(tmp_path, "wb") as tmp:
+                shutil.copyfileobj(resp, tmp)
+        os.replace(tmp_path, dest)
+    finally:
+        # After a successful rename the temporary path is gone; on any
+        # failure (including Ctrl-C) remove the leftover partial file.
+        if os.path.exists(tmp_path):
+            os.remove(tmp_path)
+
+
+def _load_audio(url):
+    """Download (once) and return (waveform_tensor, sample_rate).
+
+    The waveform is a float32 tensor shaped (1, channels, samples).
+    """
+    os.makedirs(_CACHE_DIR, exist_ok=True)
+    filename = os.path.basename(url.split("?")[0])
+    cache_path = os.path.join(_CACHE_DIR, filename)
+    if not os.path.exists(cache_path):
+        _download(url, cache_path)
+    audio_np, sr = sf.read(cache_path, dtype="float32")
+    if audio_np.ndim == 1:
+        audio_np = audio_np[np.newaxis, :]  # (1, samples)
+    else:
+        audio_np = audio_np.T  # (channels, samples)
+    waveform = torch.from_numpy(audio_np).unsqueeze(0)  # (1, channels, samples)
+    return waveform, sr
+
+
+class OmniVoiceVoicePreset:
+    """ComfyUI node that outputs a built-in reference voice and its transcript."""
+
+    @classmethod
+    def INPUT_TYPES(cls):
+        return {
+            "required": {
+                "preset": (
+                    list(PRESETS),
+                    {
+                        "tooltip": (
+                            "Pre-fetched reference voice for OmniVoice Generate.\n"
+                            "Connect ref_audio → ref_audio and ref_text → ref_text.\n"
+                            "If ref_text is blank, connect a Whisper node to supply the transcript."
+                        ),
+                    },
+                ),
+            },
+        }
+
+    RETURN_TYPES = ("AUDIO", "STRING")
+    RETURN_NAMES = ("ref_audio", "ref_text")
+    FUNCTION = "load_preset"
+    CATEGORY = "OmniVoice"
+
+    def load_preset(self, preset):
+        """Return (AUDIO dict, transcript) for the selected preset."""
+        url, transcript = PRESETS[preset]
+        waveform, sr = _load_audio(url)
+        return ({"waveform": waveform, "sample_rate": sr}, transcript)
diff --git a/workflows/voice_cloning.json b/workflows/voice_cloning.json
index 2b7b6cb..2e8cc7c 100644
--- a/workflows/voice_cloning.json
+++ b/workflows/voice_cloning.json
@@ -1,84 +1,57 @@
 {
   "last_node_id": 4,
-  "last_link_id": 3,
+  "last_link_id": 4,
   "nodes": [
     {
       "id": 1,
       "type": "OmniVoiceModelLoader",
       "pos": [40, 80],
-      "size": {"0": 320, "1": 130},
+      "size": {"0": 300, "1": 100},
       "flags": {},
       "order": 0,
       "mode": 0,
       "outputs": [
-        {
-          "name": "model",
-          "type": "OMNIVOICE_MODEL",
-          "links": [1],
-          "shape": 3,
-          "slot_index": 0
-        }
+        {"name": "model", "type": "OMNIVOICE_MODEL", "links": [1], "shape": 3, "slot_index": 0}
       ],
       "properties": {"Node name for S&R": "OmniVoiceModelLoader"},
       "widgets_values": ["cuda:0", "float16"]
     },
     {
       "id": 2,
-      "type": "LoadAudio",
-      "pos": [40, 280],
-      "size": {"0": 320, "1": 76},
+      "type": "OmniVoiceVoicePreset",
+      "pos": [40, 240],
+      "size": {"0": 300, "1": 80},
       "flags": {},
       "order": 1,
       "mode": 0,
       "outputs": [
-        {
-          "name": "AUDIO",
-          "type": "AUDIO",
-          "links": [2],
-          "shape": 3,
-          "slot_index": 0
-        }
+        {"name": "ref_audio", "type": "AUDIO", "links": [2], "shape": 3, "slot_index": 0},
+        {"name": "ref_text", "type": "STRING", "links": [3], "shape": 3, "slot_index": 1}
       ],
-      "properties": {"Node name for S&R": "LoadAudio"},
-      "widgets_values": ["reference_voice.wav"]
+      "properties": {"Node name for S&R": "OmniVoiceVoicePreset"},
+      "widgets_values": ["Nature – female, warm (F5-TTS ref)"]
     },
     {
       "id": 3,
       "type": "OmniVoiceGenerate",
-      "pos": [440, 80],
+      "pos": [420, 80],
       "size": {"0": 380, "1": 310},
       "flags": {},
       "order": 2,
       "mode": 0,
       "inputs": [
-        {
-          "name": "model",
-          "type": "OMNIVOICE_MODEL",
-          "link": 1,
-          "slot_index": 0
-        },
-        {
-          "name": "ref_audio",
-          "type": "AUDIO",
-          "link": 2,
-          "slot_index": 1
-        }
+        {"name": "model", "type": "OMNIVOICE_MODEL", "link": 1, "slot_index": 0},
+        {"name": "ref_audio", "type": "AUDIO", "link": 2, "slot_index": 1},
+        {"name": "ref_text", "type": "STRING", "link": 3, "slot_index": 2}
       ],
       "outputs": [
-        {
-          "name": "audio",
-          "type": "AUDIO",
-          "links": [3],
-          "shape": 3,
-          "slot_index": 0
-        }
+        {"name": "audio", "type": "AUDIO", "links": [4], "shape": 3, "slot_index": 0}
       ],
       "properties": {"Node name for S&R": "OmniVoiceGenerate"},
       "widgets_values": [
-        "Hello! Connect a Whisper node to ref_text for best results.",
+        "Hello! This is a test of the OmniVoice text-to-speech system.",
         "voice_cloning",
         "",
-        "",
         1.0,
         32
       ]
@@ -86,18 +59,13 @@
     {
       "id": 4,
       "type": "SaveAudio",
-      "pos": [900, 80],
-      "size": {"0": 320, "1": 100},
+      "pos": [880, 80],
+      "size": {"0": 300, "1": 100},
       "flags": {},
       "order": 3,
       "mode": 0,
       "inputs": [
-        {
-          "name": "audio",
-          "type": "AUDIO",
-          "link": 3,
-          "slot_index": 0
-        }
+        {"name": "audio", "type": "AUDIO", "link": 4, "slot_index": 0}
       ],
       "properties": {"Node name for S&R": "SaveAudio"},
       "widgets_values": ["omnivoice"]
@@ -106,12 +74,11 @@
   "links": [
     [1, 1, 0, 3, 0, "OMNIVOICE_MODEL"],
     [2, 2, 0, 3, 1, "AUDIO"],
-    [3, 3, 0, 4, 0, "AUDIO"]
+    [3, 2, 1, 3, 2, "STRING"],
+    [4, 3, 0, 4, 0, "AUDIO"]
   ],
   "groups": [],
   "config": {},
-  "extra": {
-    "ds": {"scale": 0.9, "offset": [0, 0]}
-  },
+  "extra": {"ds": {"scale": 0.9, "offset": [0, 0]}},
   "version": 0.4
 }