From 8de201a4c964017cbba7005732b80b3491a2ee60 Mon Sep 17 00:00:00 2001
From: Ethanfel <ethan.fel@ts-pc.fr>
Date: Sun, 5 Apr 2026 18:19:29 +0200
Subject: [PATCH] Add OmniVoice Voice Preset node with two female voice samples
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two built-in presets, auto-downloaded and cached to ComfyUI/models/omnivoice/presets/:
- "Nature – female, warm" (F5-TTS basic_ref_en.wav, transcript included)
- "Shadowheart – female, expressive" (Chatterbox demo, connect Whisper for transcript)

Outputs ref_audio (AUDIO) and ref_text (STRING) — wire directly into
OmniVoice Generate. Updated default workflow to use this node.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 __init__.py                  |  4 +-
 nodes/__init__.py            |  3 +-
 nodes/voice_presets.py       | 69 ++++++++++++++++++++++++++++++++
 workflows/voice_cloning.json | 77 +++++++++++-------------------------
 4 files changed, 96 insertions(+), 57 deletions(-)
 create mode 100644 nodes/voice_presets.py

diff --git a/__init__.py b/__init__.py
index 1474cdd..85962c1 100644
--- a/__init__.py
+++ b/__init__.py
@@ -1,15 +1,17 @@
-from .nodes import OmniVoiceModelLoader, OmniVoiceGenerate, OmniVoiceEpubLoader
+from .nodes import OmniVoiceModelLoader, OmniVoiceGenerate, OmniVoiceEpubLoader, OmniVoiceVoicePreset
 
 NODE_CLASS_MAPPINGS = {
     "OmniVoiceModelLoader": OmniVoiceModelLoader,
     "OmniVoiceGenerate": OmniVoiceGenerate,
     "OmniVoiceEpubLoader": OmniVoiceEpubLoader,
+    "OmniVoiceVoicePreset": OmniVoiceVoicePreset,
 }
 
 NODE_DISPLAY_NAME_MAPPINGS = {
     "OmniVoiceModelLoader": "OmniVoice Model Loader",
     "OmniVoiceGenerate": "OmniVoice Generate",
     "OmniVoiceEpubLoader": "OmniVoice EPUB Loader",
+    "OmniVoiceVoicePreset": "OmniVoice Voice Preset",
 }
 
 __all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS"]
diff --git a/nodes/__init__.py b/nodes/__init__.py
index fcc139b..0fb698f 100644
--- a/nodes/__init__.py
+++ b/nodes/__init__.py
@@ -1,5 +1,6 @@
 from .loader import OmniVoiceModelLoader
 from .generator import OmniVoiceGenerate
 from .epub_loader import OmniVoiceEpubLoader
+from .voice_presets import OmniVoiceVoicePreset
 
-__all__ = ["OmniVoiceModelLoader", "OmniVoiceGenerate", "OmniVoiceEpubLoader"]
+__all__ = ["OmniVoiceModelLoader", "OmniVoiceGenerate", "OmniVoiceEpubLoader", "OmniVoiceVoicePreset"]
diff --git a/nodes/voice_presets.py b/nodes/voice_presets.py
new file mode 100644
index 0000000..d97cfb2
--- /dev/null
+++ b/nodes/voice_presets.py
@@ -0,0 +1,69 @@
+import os
+import urllib.request
+import numpy as np
+import torch
+import soundfile as sf
+
+try:
+    import folder_paths  # presumably ComfyUI's path helper — TODO confirm
+    _CACHE_DIR = os.path.join(folder_paths.models_dir, "omnivoice", "presets")
+except ImportError:  # folder_paths not importable: use a per-user cache dir
+    _CACHE_DIR = os.path.join(os.path.expanduser("~"), ".cache", "omnivoice", "presets")
+
+# Maps preset display name -> (url, transcript).
+# transcript="" means unknown — connect a Whisper node to ref_text to fill it.
+PRESETS = {
+    "Nature – female, warm (F5-TTS ref)": (
+        "https://raw.githubusercontent.com/SWivid/F5-TTS/main/src/f5_tts/infer/examples/basic/basic_ref_en.wav",
+        "Some call me nature, others call me mother nature.",
+    ),
+    "Shadowheart – female, expressive (Chatterbox ref)": (
+        "https://storage.googleapis.com/chatterbox-demo-samples/prompts/female_shadowheart4.flac",
+        "",  # transcript unknown — connect Whisper node to ref_text
+    ),
+}
+
+
+def _load_audio(url):
+    """Download *url* into _CACHE_DIR (once); return (waveform, sample_rate).
+
+    waveform is a float32 tensor shaped (1, channels, samples).
+    """
+    os.makedirs(_CACHE_DIR, exist_ok=True)
+    cache_path = os.path.join(_CACHE_DIR, os.path.basename(url.split("?")[0]))
+    if not os.path.exists(cache_path):
+        # Fetch to a temp name first so an interrupted download is never cached.
+        urllib.request.urlretrieve(url, cache_path + ".part")
+        os.replace(cache_path + ".part", cache_path)
+    audio_np, sr = sf.read(cache_path, dtype="float32", always_2d=True)
+    waveform = torch.from_numpy(np.ascontiguousarray(audio_np.T)).unsqueeze(0)
+    return waveform, sr
+
+
+class OmniVoiceVoicePreset:
+    """ComfyUI node exposing the built-in reference voices in PRESETS."""
+
+    RETURN_TYPES = ("AUDIO", "STRING")
+    RETURN_NAMES = ("ref_audio", "ref_text")
+    FUNCTION = "load_preset"
+    CATEGORY = "OmniVoice"
+
+    @classmethod
+    def INPUT_TYPES(cls):
+        """Declare one required input, ``preset``, chosen from the PRESETS keys."""
+        tooltip = (
+            "Pre-fetched reference voice for OmniVoice Generate.\n"
+            "Connect ref_audio → ref_audio and ref_text → ref_text.\n"
+            "If ref_text is blank, connect a Whisper node to supply the transcript."
+        )
+        choices = list(PRESETS.keys())
+        return {
+            "required": {"preset": (choices, {"tooltip": tooltip})},
+        }
+
+    def load_preset(self, preset):
+        """Return the AUDIO dict and transcript string for the chosen preset."""
+        source_url, ref_text = PRESETS[preset]
+        samples, rate = _load_audio(source_url)
+        audio = {"waveform": samples, "sample_rate": rate}
+        return (audio, ref_text)
diff --git a/workflows/voice_cloning.json b/workflows/voice_cloning.json
index 2b7b6cb..2e8cc7c 100644
--- a/workflows/voice_cloning.json
+++ b/workflows/voice_cloning.json
@@ -1,84 +1,57 @@
 {
   "last_node_id": 4,
-  "last_link_id": 3,
+  "last_link_id": 4,
   "nodes": [
     {
       "id": 1,
       "type": "OmniVoiceModelLoader",
       "pos": [40, 80],
-      "size": {"0": 320, "1": 130},
+      "size": {"0": 300, "1": 100},
       "flags": {},
       "order": 0,
       "mode": 0,
       "outputs": [
-        {
-          "name": "model",
-          "type": "OMNIVOICE_MODEL",
-          "links": [1],
-          "shape": 3,
-          "slot_index": 0
-        }
+        {"name": "model", "type": "OMNIVOICE_MODEL", "links": [1], "shape": 3, "slot_index": 0}
       ],
       "properties": {"Node name for S&R": "OmniVoiceModelLoader"},
       "widgets_values": ["cuda:0", "float16"]
     },
     {
       "id": 2,
-      "type": "LoadAudio",
-      "pos": [40, 280],
-      "size": {"0": 320, "1": 76},
+      "type": "OmniVoiceVoicePreset",
+      "pos": [40, 240],
+      "size": {"0": 300, "1": 80},
       "flags": {},
       "order": 1,
       "mode": 0,
       "outputs": [
-        {
-          "name": "AUDIO",
-          "type": "AUDIO",
-          "links": [2],
-          "shape": 3,
-          "slot_index": 0
-        }
+        {"name": "ref_audio", "type": "AUDIO",  "links": [2], "shape": 3, "slot_index": 0},
+        {"name": "ref_text",  "type": "STRING", "links": [3], "shape": 3, "slot_index": 1}
       ],
-      "properties": {"Node name for S&R": "LoadAudio"},
-      "widgets_values": ["reference_voice.wav"]
+      "properties": {"Node name for S&R": "OmniVoiceVoicePreset"},
+      "widgets_values": ["Nature – female, warm (F5-TTS ref)"]
     },
     {
       "id": 3,
       "type": "OmniVoiceGenerate",
-      "pos": [440, 80],
+      "pos": [420, 80],
       "size": {"0": 380, "1": 310},
       "flags": {},
       "order": 2,
       "mode": 0,
       "inputs": [
-        {
-          "name": "model",
-          "type": "OMNIVOICE_MODEL",
-          "link": 1,
-          "slot_index": 0
-        },
-        {
-          "name": "ref_audio",
-          "type": "AUDIO",
-          "link": 2,
-          "slot_index": 1
-        }
+        {"name": "model",     "type": "OMNIVOICE_MODEL", "link": 1, "slot_index": 0},
+        {"name": "ref_audio", "type": "AUDIO",           "link": 2, "slot_index": 1},
+        {"name": "ref_text",  "type": "STRING",          "link": 3, "slot_index": 2}
       ],
       "outputs": [
-        {
-          "name": "audio",
-          "type": "AUDIO",
-          "links": [3],
-          "shape": 3,
-          "slot_index": 0
-        }
+        {"name": "audio", "type": "AUDIO", "links": [4], "shape": 3, "slot_index": 0}
       ],
       "properties": {"Node name for S&R": "OmniVoiceGenerate"},
       "widgets_values": [
-        "Hello! Connect a Whisper node to ref_text for best results.",
+        "Hello! This is a test of the OmniVoice text-to-speech system.",
         "voice_cloning",
         "",
-        "",
         1.0,
         32
       ]
@@ -86,18 +59,13 @@
     {
       "id": 4,
       "type": "SaveAudio",
-      "pos": [900, 80],
-      "size": {"0": 320, "1": 100},
+      "pos": [880, 80],
+      "size": {"0": 300, "1": 100},
       "flags": {},
       "order": 3,
       "mode": 0,
       "inputs": [
-        {
-          "name": "audio",
-          "type": "AUDIO",
-          "link": 3,
-          "slot_index": 0
-        }
+        {"name": "audio", "type": "AUDIO", "link": 4, "slot_index": 0}
       ],
       "properties": {"Node name for S&R": "SaveAudio"},
       "widgets_values": ["omnivoice"]
@@ -106,12 +74,11 @@
   "links": [
     [1, 1, 0, 3, 0, "OMNIVOICE_MODEL"],
     [2, 2, 0, 3, 1, "AUDIO"],
-    [3, 3, 0, 4, 0, "AUDIO"]
+    [3, 2, 1, 3, 2, "STRING"],
+    [4, 3, 0, 4, 0, "AUDIO"]
   ],
   "groups": [],
   "config": {},
-  "extra": {
-    "ds": {"scale": 0.9, "offset": [0, 0]}
-  },
+  "extra": {"ds": {"scale": 0.9, "offset": [0, 0]}},
   "version": 0.4
 }