feat: auto-discover user presets from the presets folder

Drop any audio file (wav/flac/mp3/ogg/m4a) into the presets cache directory and it will appear as "<name> (local)" in the Voice Preset dropdown on the next ComfyUI restart. To supply a transcript, add a .txt file with the same stem alongside the audio file.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-06 09:37:17 +02:00
parent d5f2632c48
commit 26295e4db7
1 changed file with 48 additions and 11 deletions
@@ -53,14 +53,45 @@ PRESETS = {
 }


-def _load_audio(url):
-    """Download (once) and return (waveform_tensor, sample_rate)."""
+_AUDIO_EXTS = {".wav", ".flac", ".mp3", ".ogg", ".m4a"}
+_BUILTIN_FILES = frozenset(os.path.basename(url.split("?")[0]) for url, _ in PRESETS.values())
+
+
+def _scan_user_presets():
+    """Return a dict of user presets found in _CACHE_DIR.
+
+    For each audio file that is not a cached built-in, look for a same-stem
+    .txt file for the transcript.  Key format: "<stem> (local)".
+    """
+    user = {}
+    if not os.path.isdir(_CACHE_DIR):
+        return user
+    for fname in sorted(os.listdir(_CACHE_DIR)):
+        stem, ext = os.path.splitext(fname)
+        if ext.lower() not in _AUDIO_EXTS or fname in _BUILTIN_FILES:
+            continue
+        audio_path = os.path.join(_CACHE_DIR, fname)
+        txt_path = os.path.join(_CACHE_DIR, stem + ".txt")
+        transcript = ""
+        if os.path.exists(txt_path):
+            with open(txt_path, "r", encoding="utf-8") as f:
+                transcript = f.read().strip()
+        user[f"{stem} (local)"] = (audio_path, transcript)
+    return user
+
+
+def _load_audio(source):
+    """Load audio from a URL (downloading once) or a local file path."""
    os.makedirs(_CACHE_DIR, exist_ok=True)
-    filename = os.path.basename(url.split("?")[0])
+    if source.startswith("http://") or source.startswith("https://"):
+        filename = os.path.basename(source.split("?")[0])
        cache_path = os.path.join(_CACHE_DIR, filename)
        if not os.path.exists(cache_path):
-        urllib.request.urlretrieve(url, cache_path)
-    audio_np, sr = sf.read(cache_path, dtype="float32")
+            urllib.request.urlretrieve(source, cache_path)
+        path = cache_path
+    else:
+        path = source
+    audio_np, sr = sf.read(path, dtype="float32")
    if audio_np.ndim == 1:
        audio_np = audio_np[np.newaxis, :]        # (1, samples)
    else:
@@ -72,15 +103,20 @@ def _load_audio(url):
 class OmniVoiceVoicePreset:
    @classmethod
    def INPUT_TYPES(cls):
+        all_presets = {**PRESETS, **_scan_user_presets()}
        return {
            "required": {
                "preset": (
-                    list(PRESETS.keys()),
+                    list(all_presets.keys()),
                    {
                        "tooltip": (
                            "Pre-fetched reference voice for OmniVoice Generate.\n"
                            "Connect ref_audio → ref_audio and ref_text → ref_text.\n"
-                            "If ref_text is blank, connect a Whisper node to supply the transcript."
+                            "\n"
+                            "To add your own presets, drop audio files into:\n"
+                            f"  {_CACHE_DIR}\n"
+                            "Add a same-name .txt file alongside for the transcript.\n"
+                            "Restart ComfyUI to pick up new files."
                        ),
                    },
                ),
@@ -93,6 +129,7 @@ class OmniVoiceVoicePreset:
    CATEGORY = "OmniVoice"

    def load_preset(self, preset):
-        url, transcript = PRESETS[preset]
-        waveform, sr = _load_audio(url)
+        all_presets = {**PRESETS, **_scan_user_presets()}
+        source, transcript = all_presets[preset]
+        waveform, sr = _load_audio(source)
        return ({"waveform": waveform, "sample_rate": sr}, transcript)