Compare commits
36 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| b4c1cb2955 | |||
| 2e3c357e5a | |||
| aa986fd534 | |||
| 3fee610050 | |||
| d4638aa785 | |||
| e9c947b613 | |||
| 197bcc554e | |||
| 9f2683cd54 | |||
| f8657aca80 | |||
| 0f2bcc4c0e | |||
| 4eabac4c7e | |||
| dd6d5061e4 | |||
| 147030e2af | |||
| aedbe2e7d9 | |||
| 26295e4db7 | |||
| d5f2632c48 | |||
| 33b3d62d02 | |||
| 95cf706b19 | |||
| 3cbc04d12d | |||
| 340c0aa402 | |||
| 2b13e55dc5 | |||
| 86ec8cf3fb | |||
| ae2255d9e4 | |||
| d5a0ebeb9a | |||
| 0d43e5374f | |||
| 2b4b221e88 | |||
| 772f6654d4 | |||
| e26bac3684 | |||
| 194e0b0e09 | |||
| d4bf7c825e | |||
| d2cb5c4249 | |||
| c1558efad9 | |||
| 97ed0f209f | |||
| f7d624799c | |||
| d5000dee11 | |||
| bb1d83578c |
@@ -1,9 +1,12 @@
|
|||||||
name: Publish to ComfyUI Registry
|
name: Publish to ComfyUI Registry
|
||||||
|
|
||||||
on:
|
on:
|
||||||
|
workflow_dispatch:
|
||||||
push:
|
push:
|
||||||
tags:
|
branches:
|
||||||
- "v*"
|
- master
|
||||||
|
paths:
|
||||||
|
- "pyproject.toml"
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
publish:
|
publish:
|
||||||
|
|||||||
+10
-2
@@ -1,4 +1,4 @@
|
|||||||
from .nodes import OmniVoiceModelLoader, OmniVoiceGenerate, OmniVoiceEpubLoader, OmniVoiceVoicePreset, OmniVoiceMixVoices
|
from .nodes import OmniVoiceModelLoader, OmniVoiceGenerate, OmniVoiceEpubLoader, OmniVoiceVoicePreset, OmniVoiceMixVoices, OmniVoiceVoiceDesign, OmniVoiceSpeaker, OmniVoiceSpeakers
|
||||||
|
|
||||||
NODE_CLASS_MAPPINGS = {
|
NODE_CLASS_MAPPINGS = {
|
||||||
"OmniVoiceModelLoader": OmniVoiceModelLoader,
|
"OmniVoiceModelLoader": OmniVoiceModelLoader,
|
||||||
@@ -6,6 +6,9 @@ NODE_CLASS_MAPPINGS = {
|
|||||||
"OmniVoiceEpubLoader": OmniVoiceEpubLoader,
|
"OmniVoiceEpubLoader": OmniVoiceEpubLoader,
|
||||||
"OmniVoiceVoicePreset": OmniVoiceVoicePreset,
|
"OmniVoiceVoicePreset": OmniVoiceVoicePreset,
|
||||||
"OmniVoiceMixVoices": OmniVoiceMixVoices,
|
"OmniVoiceMixVoices": OmniVoiceMixVoices,
|
||||||
|
"OmniVoiceVoiceDesign": OmniVoiceVoiceDesign,
|
||||||
|
"OmniVoiceSpeaker": OmniVoiceSpeaker,
|
||||||
|
"OmniVoiceSpeakers": OmniVoiceSpeakers,
|
||||||
}
|
}
|
||||||
|
|
||||||
NODE_DISPLAY_NAME_MAPPINGS = {
|
NODE_DISPLAY_NAME_MAPPINGS = {
|
||||||
@@ -14,6 +17,11 @@ NODE_DISPLAY_NAME_MAPPINGS = {
|
|||||||
"OmniVoiceEpubLoader": "OmniVoice EPUB Loader",
|
"OmniVoiceEpubLoader": "OmniVoice EPUB Loader",
|
||||||
"OmniVoiceVoicePreset": "OmniVoice Voice Preset",
|
"OmniVoiceVoicePreset": "OmniVoice Voice Preset",
|
||||||
"OmniVoiceMixVoices": "OmniVoice Mix Voices",
|
"OmniVoiceMixVoices": "OmniVoice Mix Voices",
|
||||||
|
"OmniVoiceVoiceDesign": "OmniVoice Voice Design",
|
||||||
|
"OmniVoiceSpeaker": "OmniVoice Speaker",
|
||||||
|
"OmniVoiceSpeakers": "OmniVoice Speakers",
|
||||||
}
|
}
|
||||||
|
|
||||||
__all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS"]
|
WEB_DIRECTORY = "./web"
|
||||||
|
|
||||||
|
__all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS", "WEB_DIRECTORY"]
|
||||||
|
|||||||
+3
-1
@@ -3,5 +3,7 @@ from .generator import OmniVoiceGenerate
|
|||||||
from .epub_loader import OmniVoiceEpubLoader
|
from .epub_loader import OmniVoiceEpubLoader
|
||||||
from .voice_presets import OmniVoiceVoicePreset
|
from .voice_presets import OmniVoiceVoicePreset
|
||||||
from .mix_voices import OmniVoiceMixVoices
|
from .mix_voices import OmniVoiceMixVoices
|
||||||
|
from .voice_design import OmniVoiceVoiceDesign
|
||||||
|
from .multi_speaker import OmniVoiceSpeaker, OmniVoiceSpeakers
|
||||||
|
|
||||||
__all__ = ["OmniVoiceModelLoader", "OmniVoiceGenerate", "OmniVoiceEpubLoader", "OmniVoiceVoicePreset", "OmniVoiceMixVoices"]
|
__all__ = ["OmniVoiceModelLoader", "OmniVoiceGenerate", "OmniVoiceEpubLoader", "OmniVoiceVoicePreset", "OmniVoiceMixVoices", "OmniVoiceVoiceDesign", "OmniVoiceSpeaker", "OmniVoiceSpeakers"]
|
||||||
|
|||||||
+14
-4
@@ -58,6 +58,12 @@ def _extract_chapters(epub_path):
|
|||||||
if tag:
|
if tag:
|
||||||
title = tag.get_text(strip=True)
|
title = tag.get_text(strip=True)
|
||||||
break
|
break
|
||||||
|
# Remove title/heading elements so they don't appear in the body text
|
||||||
|
if soup.title:
|
||||||
|
soup.title.decompose()
|
||||||
|
for hn in ['h1', 'h2', 'h3']:
|
||||||
|
for tag in soup.find_all(hn):
|
||||||
|
tag.decompose()
|
||||||
for tag in soup.find_all(_BLOCK_TAGS):
|
for tag in soup.find_all(_BLOCK_TAGS):
|
||||||
tag.append(soup.new_string('\n\n'))
|
tag.append(soup.new_string('\n\n'))
|
||||||
text = soup.get_text(separator='')
|
text = soup.get_text(separator='')
|
||||||
@@ -90,8 +96,8 @@ class OmniVoiceEpubLoader:
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
RETURN_TYPES = ("STRING", "STRING")
|
RETURN_TYPES = ("STRING", "STRING", "STRING")
|
||||||
RETURN_NAMES = ("text", "chapter_list")
|
RETURN_NAMES = ("text", "chapter_title", "chapter_list")
|
||||||
FUNCTION = "load_epub"
|
FUNCTION = "load_epub"
|
||||||
CATEGORY = "OmniVoice"
|
CATEGORY = "OmniVoice"
|
||||||
|
|
||||||
@@ -100,7 +106,7 @@ class OmniVoiceEpubLoader:
|
|||||||
n = len(chapters)
|
n = len(chapters)
|
||||||
|
|
||||||
if n == 0:
|
if n == 0:
|
||||||
return ("", "")
|
return ("", "", "")
|
||||||
|
|
||||||
start = max(1, min(chapter_start, n))
|
start = max(1, min(chapter_start, n))
|
||||||
end = max(start, min(chapter_end, n))
|
end = max(start, min(chapter_end, n))
|
||||||
@@ -111,8 +117,12 @@ class OmniVoiceEpubLoader:
|
|||||||
for i, ch in enumerate(chapters, 1)
|
for i, ch in enumerate(chapters, 1)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# chapter_title: title of the first selected chapter (useful for file naming)
|
||||||
|
first = chapters[start - 1]
|
||||||
|
chapter_title = first["title"] if first["title"] else f"Chapter {start}"
|
||||||
|
|
||||||
# text: selected range joined by delimiter
|
# text: selected range joined by delimiter
|
||||||
selected = chapters[start - 1 : end]
|
selected = chapters[start - 1 : end]
|
||||||
text = "\n\n---\n\n".join(ch["text"] for ch in selected)
|
text = "\n\n---\n\n".join(ch["text"] for ch in selected)
|
||||||
|
|
||||||
return (text, chapter_list)
|
return (text, chapter_title, chapter_list)
|
||||||
|
|||||||
+137
-22
@@ -1,8 +1,26 @@
|
|||||||
|
import re
|
||||||
import tempfile
|
import tempfile
|
||||||
import os
|
import os
|
||||||
import torch
|
import torch
|
||||||
import soundfile as sf
|
import soundfile as sf
|
||||||
|
|
||||||
|
_TAG_RE = re.compile(r'^\[([^\]]+)\]:?\s*(.*)', re.DOTALL)
|
||||||
|
|
||||||
|
|
||||||
|
def _write_tmp_wav(ref_audio):
    """Write a ComfyUI AUDIO dict to a temporary WAV file and return its path.

    Args:
        ref_audio: Mapping providing a ``"waveform"`` tensor and a
            ``"sample_rate"`` value (both are read below).
            NOTE(review): assumes the waveform is (1, channels, samples) --
            confirm against the nodes that produce AUDIO.

    Returns:
        Path of the WAV file that was written. The caller owns the file and
        must delete it.

    Raises:
        Any exception raised while converting or writing the audio. The
        temporary file is removed before the exception propagates, because
        the caller never receives the path in that case.
    """
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp_path = tmp.name
    # Only the reserved name is needed; the file is rewritten by path below.
    tmp.close()
    try:
        waveform = ref_audio["waveform"].squeeze(0).cpu()  # (channels, samples)
        audio_np = waveform.numpy()
        sf.write(
            tmp_path,
            # A single channel is written as a 1-D array, otherwise the
            # array is transposed to (samples, channels).
            audio_np[0] if audio_np.shape[0] == 1 else audio_np.T,
            int(ref_audio["sample_rate"]),
        )
    except Exception:
        # Do not leave an orphaned temp file behind on failure.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
        raise
    return tmp_path
|
||||||
|
|
||||||
|
|
||||||
class OmniVoiceGenerate:
|
class OmniVoiceGenerate:
|
||||||
@classmethod
|
@classmethod
|
||||||
@@ -49,12 +67,21 @@ class OmniVoiceGenerate:
|
|||||||
"tooltip": (
|
"tooltip": (
|
||||||
"voice_cloning – clone the voice from ref_audio (requires ref_audio)\n"
|
"voice_cloning – clone the voice from ref_audio (requires ref_audio)\n"
|
||||||
"voice_design – describe a voice with the instruct field (requires instruct)\n"
|
"voice_design – describe a voice with the instruct field (requires instruct)\n"
|
||||||
"auto_voice – model picks a voice automatically"
|
"auto_voice – model picks a voice automatically\n"
|
||||||
|
"\n"
|
||||||
|
"Ignored when a Speakers roster is connected."
|
||||||
),
|
),
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
},
|
},
|
||||||
"optional": {
|
"optional": {
|
||||||
|
"speakers": ("OMNIVOICE_SPEAKERS", {
|
||||||
|
"tooltip": (
|
||||||
|
"Connect an OmniVoice Speakers node to enable multi-speaker generation.\n"
|
||||||
|
"When connected, ref_audio / instruct / mode are ignored and each paragraph\n"
|
||||||
|
"is routed to its assigned speaker automatically."
|
||||||
|
),
|
||||||
|
}),
|
||||||
"ref_audio": ("AUDIO", {
|
"ref_audio": ("AUDIO", {
|
||||||
"tooltip": "Reference audio clip to clone the voice from. Used in voice_cloning mode.",
|
"tooltip": "Reference audio clip to clone the voice from. Used in voice_cloning mode.",
|
||||||
}),
|
}),
|
||||||
@@ -65,23 +92,32 @@ class OmniVoiceGenerate:
|
|||||||
"instruct": ("STRING", {
|
"instruct": ("STRING", {
|
||||||
"default": "",
|
"default": "",
|
||||||
"tooltip": (
|
"tooltip": (
|
||||||
"Voice description for voice_design mode. Combine attributes freely.\n"
|
"Voice style description. Required for voice_design mode; optional in voice_cloning\n"
|
||||||
|
"mode to attempt accent/style transfer on top of the cloned voice.\n"
|
||||||
|
"Connect the OmniVoice Voice Design node for structured input.\n"
|
||||||
"\n"
|
"\n"
|
||||||
"GENDER: male, female\n"
|
"GENDER: male, female\n"
|
||||||
"AGE: child, teenager, young adult, middle-aged, elderly\n"
|
"AGE: child, teenager, young adult, middle-aged, elderly\n"
|
||||||
"PITCH: very low, low, moderate, high, very high\n"
|
"PITCH: very low pitch, low pitch, moderate pitch, high pitch, very high pitch, whisper\n"
|
||||||
"STYLE: whisper\n"
|
|
||||||
"\n"
|
"\n"
|
||||||
"ENGLISH ACCENTS (text must be English):\n"
|
"ACCENTS (only these are supported by the model):\n"
|
||||||
" american, british, australian, canadian,\n"
|
" american accent, australian accent, british accent, canadian accent,\n"
|
||||||
" indian, chinese, korean, japanese, portuguese, russian\n"
|
" chinese accent, indian accent, japanese accent, korean accent,\n"
|
||||||
|
" portuguese accent, russian accent\n"
|
||||||
"\n"
|
"\n"
|
||||||
"EXAMPLE: female, high pitch, british accent"
|
"EXAMPLE: female, high pitch, british accent"
|
||||||
),
|
),
|
||||||
}),
|
}),
|
||||||
|
"guidance_scale": ("FLOAT", {
|
||||||
|
"default": 2.0, "min": 0.0, "max": 20.0, "step": 0.1,
|
||||||
|
"tooltip": (
|
||||||
|
"Classifier-free guidance scale. Higher = more faithful to the reference/instruct, "
|
||||||
|
"but can over-saturate. 2.0 is a good default."
|
||||||
|
),
|
||||||
|
}),
|
||||||
"speed": ("FLOAT", {
|
"speed": ("FLOAT", {
|
||||||
"default": 1.0, "min": 0.1, "max": 3.0, "step": 0.1,
|
"default": 1.0, "min": 0.3, "max": 3.0, "step": 0.1,
|
||||||
"tooltip": "Playback speed multiplier. 1.0 = normal, >1.0 = faster, <1.0 = slower.",
|
"tooltip": "Playback speed multiplier. 1.0 = normal, >1.0 = faster, <1.0 = slower. Below 0.3 produces noise and extreme VRAM usage.",
|
||||||
}),
|
}),
|
||||||
"num_step": ("INT", {
|
"num_step": ("INT", {
|
||||||
"default": 32, "min": 1, "max": 100,
|
"default": 32, "min": 1, "max": 100,
|
||||||
@@ -104,10 +140,17 @@ class OmniVoiceGenerate:
|
|||||||
FUNCTION = "generate"
|
FUNCTION = "generate"
|
||||||
CATEGORY = "OmniVoice"
|
CATEGORY = "OmniVoice"
|
||||||
|
|
||||||
def generate(self, model, text, mode, ref_audio=None, ref_text="", instruct="", speed=1.0, num_step=32, seed=0):
|
def generate(self, model, text, mode, speakers=None, ref_audio=None, ref_text="",
|
||||||
|
instruct="", guidance_scale=2.0, speed=1.0, num_step=32, seed=0):
|
||||||
if seed != 0:
|
if seed != 0:
|
||||||
torch.manual_seed(seed)
|
torch.manual_seed(seed)
|
||||||
kwargs = {"text": text, "speed": speed, "num_step": num_step}
|
|
||||||
|
if speakers is not None:
|
||||||
|
return self._generate_multi_speaker(
|
||||||
|
model, text, speakers, guidance_scale, speed, num_step
|
||||||
|
)
|
||||||
|
|
||||||
|
kwargs = {"text": text, "speed": speed, "num_step": num_step, "guidance_scale": guidance_scale}
|
||||||
|
|
||||||
if mode == "voice_cloning" and ref_audio is None:
|
if mode == "voice_cloning" and ref_audio is None:
|
||||||
raise ValueError("voice_cloning mode requires ref_audio to be connected")
|
raise ValueError("voice_cloning mode requires ref_audio to be connected")
|
||||||
@@ -115,17 +158,13 @@ class OmniVoiceGenerate:
|
|||||||
raise ValueError("voice_design mode requires an instruct string (e.g. 'female, low pitch')")
|
raise ValueError("voice_design mode requires an instruct string (e.g. 'female, low pitch')")
|
||||||
|
|
||||||
if mode == "voice_cloning":
|
if mode == "voice_cloning":
|
||||||
tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
|
tmp_path = _write_tmp_wav(ref_audio)
|
||||||
tmp_path = tmp.name
|
|
||||||
tmp.close()
|
|
||||||
try:
|
try:
|
||||||
ref_waveform = ref_audio["waveform"].squeeze(0).cpu() # (channels, samples)
|
|
||||||
audio_np = ref_waveform.numpy()
|
|
||||||
# soundfile expects (samples,) for mono or (samples, channels) for multi-channel
|
|
||||||
sf.write(tmp_path, audio_np[0] if audio_np.shape[0] == 1 else audio_np.T, int(ref_audio["sample_rate"]))
|
|
||||||
kwargs["ref_audio"] = tmp_path
|
kwargs["ref_audio"] = tmp_path
|
||||||
if ref_text:
|
if ref_text:
|
||||||
kwargs["ref_text"] = ref_text
|
kwargs["ref_text"] = ref_text
|
||||||
|
if instruct:
|
||||||
|
kwargs["instruct"] = instruct
|
||||||
audio_tensors = model.generate(**kwargs)
|
audio_tensors = model.generate(**kwargs)
|
||||||
finally:
|
finally:
|
||||||
try:
|
try:
|
||||||
@@ -140,9 +179,85 @@ class OmniVoiceGenerate:
|
|||||||
else: # auto_voice or fallback
|
else: # auto_voice or fallback
|
||||||
audio_tensors = model.generate(**kwargs)
|
audio_tensors = model.generate(**kwargs)
|
||||||
|
|
||||||
# Concatenate chunks: each tensor is (1, T) → concat along T → (1, T_total)
|
return self._tensors_to_audio(audio_tensors)
|
||||||
combined = torch.cat(audio_tensors, dim=1).cpu() # (1, T_total) on CPU
|
|
||||||
# ComfyUI AUDIO format: (batch, channels, samples)
|
|
||||||
waveform = combined.unsqueeze(0) # (1, 1, T_total)
|
|
||||||
|
|
||||||
|
def _generate_multi_speaker(self, model, text, speakers_data, guidance_scale, speed, num_step):
    """Synthesize ``text`` segment by segment, cloning one speaker per segment.

    Args:
        model: Object exposing ``generate(**kwargs)``; each call must return
            an iterable of audio chunks.
        text: Script to speak. How it is split depends on the roster mode.
        speakers_data: Roster dict with ``"speakers"`` (a list of dicts
            holding ``"label"``, ``"ref_audio"`` and ``"ref_text"``) and
            ``"mode"``.
        guidance_scale: Forwarded unchanged to ``model.generate``.
        speed: Forwarded unchanged to ``model.generate``.
        num_step: Forwarded unchanged to ``model.generate``.

    Returns:
        Whatever ``self._tensors_to_audio`` builds from all generated chunks,
        in segment order.

    Raises:
        ValueError: If the text yields no paragraphs or no segments.
    """
    speaker_list = speakers_data["speakers"]
    spk_mode = speakers_data["mode"]

    if spk_mode == "alternate_paragraphs":
        # Split on blank lines. The pattern also accepts CRLF line endings and
        # "blank" lines that contain only whitespace, which a plain
        # split("\n\n") would treat as part of one paragraph.
        paragraphs = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
        if not paragraphs:
            raise ValueError("OmniVoice Multi-Speaker: no paragraphs found in text.")
        # Round-robin: paragraph i goes to speaker i modulo the roster size.
        segments = [
            (para, speaker_list[i % len(speaker_list)])
            for i, para in enumerate(paragraphs)
        ]
    else:  # tagged_speakers
        # Labels are normalised the same way as tags (stripped, lower-cased)
        # so a label with stray surrounding whitespace still matches.
        # When two labels normalise to the same key, the later speaker wins.
        label_map = {s["label"].strip().lower(): i for i, s in enumerate(speaker_list)}

        # In tagged mode each line that starts with [Tag] begins a new segment.
        # Continuation lines (no tag) are appended to the previous segment so
        # multi-line speeches stay together. Both \n and \n\n separators work.
        raw_segments: list[list[str]] = []
        current: list[str] = []
        for line in text.splitlines():
            line = line.strip()
            if not line:
                continue
            if _TAG_RE.match(line):
                if current:
                    raw_segments.append(current)
                current = [line]
            else:
                current.append(line)
        if current:
            raw_segments.append(current)

        if not raw_segments:
            raise ValueError("OmniVoice Multi-Speaker: no tagged segments found in text.")

        segments = []
        for lines in raw_segments:
            joined = " ".join(lines)
            m = _TAG_RE.match(joined)
            if m:
                tag = m.group(1).strip().lower()
                body = m.group(2).strip()
                # Unrecognised tags fall back to the first speaker.
                spk = speaker_list[label_map.get(tag, 0)]
            else:
                # Text before the first tag is spoken by the first speaker.
                body = joined
                spk = speaker_list[0]
            if body:
                segments.append((body, spk))

        if not segments:
            raise ValueError("OmniVoice Multi-Speaker: no text segments to generate.")

    all_chunks = []
    for para_text, spk in segments:
        # The model is given the reference clip as a file path, so each
        # segment gets a temp WAV that is removed once it has been generated.
        tmp_path = _write_tmp_wav(spk["ref_audio"])
        try:
            kwargs = {
                "text": para_text,
                "ref_audio": tmp_path,
                "speed": speed,
                "num_step": num_step,
                "guidance_scale": guidance_scale,
            }
            if spk["ref_text"]:
                kwargs["ref_text"] = spk["ref_text"]
            chunks = model.generate(**kwargs)
            all_chunks.extend(chunks)
        finally:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass

    return self._tensors_to_audio(all_chunks)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _tensors_to_audio(tensors):
|
||||||
|
combined = torch.cat(tensors, dim=1).cpu() # (1, T_total)
|
||||||
|
waveform = combined.unsqueeze(0) # (1, 1, T_total)
|
||||||
return ({"waveform": waveform, "sample_rate": 24000},)
|
return ({"waveform": waveform, "sample_rate": 24000},)
|
||||||
|
|||||||
+11
-4
@@ -1,10 +1,12 @@
|
|||||||
import os
|
import os
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
_omnivoice_import_error = None
|
||||||
try:
|
try:
|
||||||
from omnivoice import OmniVoice
|
from omnivoice import OmniVoice
|
||||||
except ImportError:
|
except Exception as e:
|
||||||
OmniVoice = None # deferred; will raise at runtime if package is missing
|
OmniVoice = None
|
||||||
|
_omnivoice_import_error = e
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import folder_paths
|
import folder_paths
|
||||||
@@ -55,9 +57,14 @@ class OmniVoiceModelLoader:
|
|||||||
|
|
||||||
def load_model(self, device, dtype, compile=False):
|
def load_model(self, device, dtype, compile=False):
|
||||||
if OmniVoice is None:
|
if OmniVoice is None:
|
||||||
raise ImportError(
|
msg = (
|
||||||
"omnivoice is not installed. Run: pip install omnivoice --no-deps"
|
"omnivoice failed to import. "
|
||||||
|
"Install it with: pip install omnivoice --no-deps\n"
|
||||||
|
"(On Windows embedded Python: .\\python_embeded\\python.exe -m pip install omnivoice --no-deps)\n"
|
||||||
)
|
)
|
||||||
|
if _omnivoice_import_error is not None:
|
||||||
|
msg += f"\nOriginal error: {_omnivoice_import_error}"
|
||||||
|
raise ImportError(msg)
|
||||||
|
|
||||||
model = OmniVoice.from_pretrained(
|
model = OmniVoice.from_pretrained(
|
||||||
"k2-fsa/OmniVoice",
|
"k2-fsa/OmniVoice",
|
||||||
|
|||||||
@@ -0,0 +1,103 @@
|
|||||||
|
class OmniVoiceSpeaker:
    """One speaker slot: a label, a reference clip, and an optional transcript."""

    RETURN_TYPES = ("OMNIVOICE_SPEAKER",)
    RETURN_NAMES = ("speaker",)
    FUNCTION = "build"
    CATEGORY = "OmniVoice"

    @classmethod
    def INPUT_TYPES(cls):
        """Describe the node inputs: required label and ref_audio, optional ref_text."""
        label_tooltip = (
            "Name used to identify this speaker.\n"
            "In tagged_speakers mode, prefix paragraphs with [Label]:\n"
            " [Narrator] Once upon a time...\n"
            "In alternate_paragraphs mode the label is informational only."
        )
        required = {
            "label": ("STRING", {"default": "Narrator", "tooltip": label_tooltip}),
            "ref_audio": ("AUDIO", {
                "tooltip": "Reference audio clip for this speaker's voice.",
            }),
        }
        optional = {
            "ref_text": ("STRING", {
                "default": "",
                "tooltip": "Transcript of ref_audio. Improves cloning quality.",
            }),
        }
        return {"required": required, "optional": optional}

    def build(self, label, ref_audio, ref_text=""):
        """Return the three inputs bundled into a single speaker dict (as a 1-tuple)."""
        speaker = {}
        speaker["label"] = label
        speaker["ref_audio"] = ref_audio
        speaker["ref_text"] = ref_text
        return (speaker,)
|
||||||
|
|
||||||
|
|
||||||
|
class OmniVoiceSpeakers:
    """Gather connected Speaker nodes into a roster for multi-speaker generation.

    Eight ``speaker_N`` inputs are always declared; per the original notes the
    web extension shows only the first ``num_speakers`` of them. Connect one
    OmniVoice Speaker node per visible slot.
    """

    RETURN_TYPES = ("OMNIVOICE_SPEAKERS",)
    RETURN_NAMES = ("speakers",)
    FUNCTION = "build"
    CATEGORY = "OmniVoice"

    @classmethod
    def INPUT_TYPES(cls):
        """Describe the node inputs: slot count, routing mode, and eight optional slots."""
        # speaker_1..speaker_8 are declared up front so ComfyUI validation
        # accepts them; which ones are visible is left to the JS extension
        # (web/multi_speaker.js).
        speaker_slots = {}
        for slot in range(1, 9):
            speaker_slots[f"speaker_{slot}"] = ("OMNIVOICE_SPEAKER", {})

        count_tooltip = (
            "Number of active speaker slots.\n"
            "Changing this value adds or removes speaker_N inputs on the node."
        )
        mode_tooltip = (
            "alternate_paragraphs – paragraphs (separated by blank lines) rotate\n"
            " through speakers in order: 1 → 2 → 3 → 1 → …\n"
            "\n"
            "tagged_speakers – prefix each paragraph with [Label] to assign\n"
            " a specific speaker. Labels must match those on the Speaker nodes.\n"
            " Unrecognised tags fall back to speaker 1.\n"
            "\n"
            " Example:\n"
            " [Narrator] The door creaked open.\n"
            "\n"
            " [Alice] Who is there?"
        )
        required = {
            "num_speakers": ("INT", {
                "default": 2, "min": 2, "max": 8, "step": 1,
                "tooltip": count_tooltip,
            }),
            "mode": (
                ["alternate_paragraphs", "tagged_speakers"],
                {"default": "alternate_paragraphs", "tooltip": mode_tooltip},
            ),
        }
        return {"required": required, "optional": speaker_slots}

    def build(self, num_speakers, mode, **kwargs):
        """Collect the connected speakers from the first ``num_speakers`` slots.

        Slots beyond ``num_speakers`` are ignored, and unconnected slots are
        skipped. Raises ``ValueError`` when fewer than two speakers remain.
        """
        candidates = (kwargs.get(f"speaker_{n}") for n in range(1, num_speakers + 1))
        speakers = [entry for entry in candidates if entry is not None]
        if len(speakers) < 2:
            raise ValueError(
                f"OmniVoice Speakers: at least 2 speakers must be connected "
                f"(got {len(speakers)})."
            )
        return ({"speakers": speakers, "mode": mode},)
|
||||||
@@ -0,0 +1,74 @@
|
|||||||
|
class OmniVoiceVoiceDesign:
    """Compose a voice-design instruct string from structured dropdowns.

    The node has a single output, ``instruct``: the selected attributes joined
    into one string, with every ``"none"`` choice left out.
    """

    # English vocabulary. "none" means "leave this attribute out".
    GENDERS = ["none", "male", "female"]

    AGES = ["none", "child", "teenager", "young adult", "middle-aged", "elderly"]

    PITCHES = [
        "none",
        "very low pitch", "low pitch", "moderate pitch",
        "high pitch", "very high pitch", "whisper",
    ]

    # Per the original author's note, exactly the accents validated by the
    # model's _resolve_instruct() for English.
    ACCENTS = [
        "none",
        "american accent", "australian accent", "british accent",
        "canadian accent", "chinese accent", "indian accent",
        "japanese accent", "korean accent", "portuguese accent",
        "russian accent",
    ]

    # Per the original author's note, the Chinese items validated by the
    # model's _resolve_instruct().
    ZH_GENDERS = ["none", "男", "女"]
    ZH_AGES = ["none", "儿童", "少年", "青年", "中年", "老年"]
    ZH_PITCHES = ["none", "极低音调", "低音调", "中音调", "高音调", "极高音调", "耳语"]
    ZH_DIALECTS = [
        "none",
        "东北话", "云南话", "四川话", "宁夏话", "桂林话",
        "河南话", "济南话", "甘肃话", "石家庄话", "贵州话",
        "陕西话", "青岛话",
    ]

    @classmethod
    def INPUT_TYPES(cls):
        """Describe the node inputs: language plus English (required) and Chinese (optional) dropdowns."""
        return {
            "required": {
                "language": (
                    ["English", "Chinese"],
                    {
                        "default": "English",
                        # The node only outputs `instruct`, so the tooltip
                        # describes what the selector actually changes.
                        "tooltip": (
                            "Selects the instruct vocabulary: English uses gender/age/pitch/accent, "
                            "Chinese uses the zh_* inputs."
                        ),
                    },
                ),
                "gender": (cls.GENDERS, {"default": "female",
                    "tooltip": "Voice gender (English). Ignored when language is Chinese — use zh_gender."}),
                "age": (cls.AGES, {"default": "none",
                    "tooltip": "Age of the speaker (English). Ignored when language is Chinese — use zh_age."}),
                "pitch": (cls.PITCHES, {"default": "none",
                    "tooltip": "Pitch (English). Ignored when language is Chinese — use zh_pitch."}),
                "accent": (cls.ACCENTS, {"default": "none",
                    "tooltip": "Accent (English only, 10 supported values)."}),
            },
            "optional": {
                "zh_gender": (cls.ZH_GENDERS, {"default": "none", "tooltip": "声线性别 (Chinese mode)"}),
                "zh_age": (cls.ZH_AGES, {"default": "none", "tooltip": "年龄段 (Chinese mode)"}),
                "zh_pitch": (cls.ZH_PITCHES, {"default": "none", "tooltip": "音调 (Chinese mode)"}),
                "zh_dialect": (cls.ZH_DIALECTS, {"default": "none", "tooltip": "方言/口音 (Chinese mode)"}),
            },
        }

    RETURN_TYPES = ("STRING",)
    RETURN_NAMES = ("instruct",)
    FUNCTION = "compose"
    CATEGORY = "OmniVoice"

    def compose(self, language, gender, age, pitch, accent,
                zh_gender="none", zh_age="none", zh_pitch="none", zh_dialect="none"):
        """Join the selected attributes into an instruct string.

        Args:
            language: ``"Chinese"`` selects the ``zh_*`` values; any other
                value selects the English ones.
            gender, age, pitch, accent: English choices; ``"none"`` is skipped.
            zh_gender, zh_age, zh_pitch, zh_dialect: Chinese choices;
                ``"none"`` is skipped.

        Returns:
            A 1-tuple holding the joined string. Chinese parts are joined
            with ``","`` and English parts with ``", "``; the string is empty
            when every choice is ``"none"``.
        """
        if language == "Chinese":
            parts = [v for v in [zh_gender, zh_age, zh_pitch, zh_dialect] if v != "none"]
            return (",".join(parts),)
        parts = [v for v in [gender, age, pitch, accent] if v != "none"]
        return (", ".join(parts),)
|
||||||
+46
-9
@@ -53,14 +53,45 @@ PRESETS = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def _load_audio(url):
|
_AUDIO_EXTS = {".wav", ".flac", ".mp3", ".ogg", ".m4a"}
|
||||||
"""Download (once) and return (waveform_tensor, sample_rate)."""
|
_BUILTIN_FILES = frozenset(os.path.basename(url.split("?")[0]) for url, _ in PRESETS.values())
|
||||||
|
|
||||||
|
|
||||||
|
def _scan_user_presets():
    """Return the user presets found in ``_CACHE_DIR``.

    Every regular file in the directory whose extension is in ``_AUDIO_EXTS``
    and whose name is not in ``_BUILTIN_FILES`` becomes one preset. A ``.txt``
    file with the same stem, if present and readable, supplies the transcript.

    Returns:
        Dict mapping ``"<stem> (local)"`` to ``(audio_path, transcript)``.
        Empty when the directory does not exist. Files are visited in sorted
        order, so when two audio files share a stem the later one replaces
        the earlier one.
    """
    user = {}
    if not os.path.isdir(_CACHE_DIR):
        return user
    for fname in sorted(os.listdir(_CACHE_DIR)):
        stem, ext = os.path.splitext(fname)
        if ext.lower() not in _AUDIO_EXTS or fname in _BUILTIN_FILES:
            continue
        audio_path = os.path.join(_CACHE_DIR, fname)
        if not os.path.isfile(audio_path):
            # A directory that merely looks like an audio file is not a preset.
            continue
        txt_path = os.path.join(_CACHE_DIR, stem + ".txt")
        transcript = ""
        if os.path.exists(txt_path):
            try:
                with open(txt_path, "r", encoding="utf-8") as f:
                    transcript = f.read().strip()
            except (OSError, UnicodeDecodeError):
                # One unreadable or mis-encoded transcript must not break the
                # whole preset list; fall back to a blank transcript.
                transcript = ""
        user[f"{stem} (local)"] = (audio_path, transcript)
    return user
|
||||||
|
|
||||||
|
|
||||||
|
def _load_audio(source):
|
||||||
|
"""Load audio from a URL (downloading once) or a local file path."""
|
||||||
os.makedirs(_CACHE_DIR, exist_ok=True)
|
os.makedirs(_CACHE_DIR, exist_ok=True)
|
||||||
filename = os.path.basename(url.split("?")[0])
|
if source.startswith("http://") or source.startswith("https://"):
|
||||||
|
filename = os.path.basename(source.split("?")[0])
|
||||||
cache_path = os.path.join(_CACHE_DIR, filename)
|
cache_path = os.path.join(_CACHE_DIR, filename)
|
||||||
if not os.path.exists(cache_path):
|
if not os.path.exists(cache_path):
|
||||||
urllib.request.urlretrieve(url, cache_path)
|
urllib.request.urlretrieve(source, cache_path)
|
||||||
audio_np, sr = sf.read(cache_path, dtype="float32")
|
path = cache_path
|
||||||
|
else:
|
||||||
|
path = source
|
||||||
|
audio_np, sr = sf.read(path, dtype="float32")
|
||||||
if audio_np.ndim == 1:
|
if audio_np.ndim == 1:
|
||||||
audio_np = audio_np[np.newaxis, :] # (1, samples)
|
audio_np = audio_np[np.newaxis, :] # (1, samples)
|
||||||
else:
|
else:
|
||||||
@@ -72,15 +103,20 @@ def _load_audio(url):
|
|||||||
class OmniVoiceVoicePreset:
|
class OmniVoiceVoicePreset:
|
||||||
@classmethod
|
@classmethod
|
||||||
def INPUT_TYPES(cls):
|
def INPUT_TYPES(cls):
|
||||||
|
all_presets = {**PRESETS, **_scan_user_presets()}
|
||||||
return {
|
return {
|
||||||
"required": {
|
"required": {
|
||||||
"preset": (
|
"preset": (
|
||||||
list(PRESETS.keys()),
|
list(all_presets.keys()),
|
||||||
{
|
{
|
||||||
"tooltip": (
|
"tooltip": (
|
||||||
"Pre-fetched reference voice for OmniVoice Generate.\n"
|
"Pre-fetched reference voice for OmniVoice Generate.\n"
|
||||||
"Connect ref_audio → ref_audio and ref_text → ref_text.\n"
|
"Connect ref_audio → ref_audio and ref_text → ref_text.\n"
|
||||||
"If ref_text is blank, connect a Whisper node to supply the transcript."
|
"\n"
|
||||||
|
"To add your own presets, drop audio files into:\n"
|
||||||
|
f" {_CACHE_DIR}\n"
|
||||||
|
"Add a same-name .txt file alongside for the transcript.\n"
|
||||||
|
"Restart ComfyUI to pick up new files."
|
||||||
),
|
),
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
@@ -93,6 +129,7 @@ class OmniVoiceVoicePreset:
|
|||||||
CATEGORY = "OmniVoice"
|
CATEGORY = "OmniVoice"
|
||||||
|
|
||||||
def load_preset(self, preset):
|
def load_preset(self, preset):
|
||||||
url, transcript = PRESETS[preset]
|
all_presets = {**PRESETS, **_scan_user_presets()}
|
||||||
waveform, sr = _load_audio(url)
|
source, transcript = all_presets[preset]
|
||||||
|
waveform, sr = _load_audio(source)
|
||||||
return ({"waveform": waveform, "sample_rate": sr}, transcript)
|
return ({"waveform": waveform, "sample_rate": sr}, transcript)
|
||||||
|
|||||||
+2
-2
@@ -1,7 +1,7 @@
|
|||||||
[project]
|
[project]
|
||||||
name = "comfyui-omnivoice"
|
name = "comfyui-omnivoice-fel"
|
||||||
description = "ComfyUI nodes for OmniVoice — multilingual zero-shot TTS with voice cloning, voice design, EPUB loading, and voice mixing."
|
description = "ComfyUI nodes for OmniVoice — multilingual zero-shot TTS with voice cloning, voice design, EPUB loading, and voice mixing."
|
||||||
version = "1.0.0"
|
version = "1.0.6"
|
||||||
license = { text = "GPL-3.0-only" }
|
license = { text = "GPL-3.0-only" }
|
||||||
dependencies = []
|
dependencies = []
|
||||||
|
|
||||||
|
|||||||
+5
-2
@@ -1,6 +1,9 @@
|
|||||||
transformers>=4.40.0
|
transformers>=5.3.0
|
||||||
accelerate
|
accelerate
|
||||||
pydub
|
|
||||||
soundfile
|
soundfile
|
||||||
numpy
|
numpy
|
||||||
|
pydub
|
||||||
|
tensorboardx
|
||||||
|
webdataset
|
||||||
beautifulsoup4
|
beautifulsoup4
|
||||||
|
torchcodec
|
||||||
|
|||||||
@@ -58,14 +58,14 @@ def test_input_types_structure():
|
|||||||
|
|
||||||
|
|
||||||
def test_return_types():
|
def test_return_types():
|
||||||
assert OmniVoiceEpubLoader.RETURN_TYPES == ("STRING", "STRING")
|
assert OmniVoiceEpubLoader.RETURN_TYPES == ("STRING", "STRING", "STRING")
|
||||||
assert OmniVoiceEpubLoader.RETURN_NAMES == ("text", "chapter_list")
|
assert OmniVoiceEpubLoader.RETURN_NAMES == ("text", "chapter_title", "chapter_list")
|
||||||
|
|
||||||
|
|
||||||
def test_chapter_extraction_basic():
|
def test_chapter_extraction_basic():
|
||||||
epub = make_fake_epub([("Intro", "<p>Hello world</p>"), ("Chapter One", "<p>Body here</p>")])
|
epub = make_fake_epub([("Intro", "<p>Hello world</p>"), ("Chapter One", "<p>Body here</p>")])
|
||||||
with patch('nodes.epub_loader.zipfile.ZipFile', side_effect=epub_opener(epub)):
|
with patch('nodes.epub_loader.zipfile.ZipFile', side_effect=epub_opener(epub)):
|
||||||
text, chapter_list = OmniVoiceEpubLoader().load_epub('/fake.epub', 1, 2)
|
text, chapter_title, chapter_list = OmniVoiceEpubLoader().load_epub('/fake.epub', 1, 2)
|
||||||
assert "Hello world" in text
|
assert "Hello world" in text
|
||||||
assert "Body here" in text
|
assert "Body here" in text
|
||||||
assert "---" in text
|
assert "---" in text
|
||||||
@@ -75,7 +75,7 @@ def test_chapter_extraction_basic():
|
|||||||
def test_chapter_range_single():
|
def test_chapter_range_single():
|
||||||
epub = make_fake_epub([("One", "<p>First</p>"), ("Two", "<p>Second</p>"), ("Three", "<p>Third</p>")])
|
epub = make_fake_epub([("One", "<p>First</p>"), ("Two", "<p>Second</p>"), ("Three", "<p>Third</p>")])
|
||||||
with patch('nodes.epub_loader.zipfile.ZipFile', side_effect=epub_opener(epub)):
|
with patch('nodes.epub_loader.zipfile.ZipFile', side_effect=epub_opener(epub)):
|
||||||
text, _ = OmniVoiceEpubLoader().load_epub('/fake.epub', 2, 2)
|
text, _, _ = OmniVoiceEpubLoader().load_epub('/fake.epub', 2, 2)
|
||||||
assert "Second" in text
|
assert "Second" in text
|
||||||
assert "First" not in text
|
assert "First" not in text
|
||||||
assert "Third" not in text
|
assert "Third" not in text
|
||||||
@@ -84,7 +84,7 @@ def test_chapter_range_single():
|
|||||||
def test_chapter_list_contains_all():
|
def test_chapter_list_contains_all():
|
||||||
epub = make_fake_epub([("A", ""), ("B", ""), ("C", "")])
|
epub = make_fake_epub([("A", ""), ("B", ""), ("C", "")])
|
||||||
with patch('nodes.epub_loader.zipfile.ZipFile', side_effect=epub_opener(epub)):
|
with patch('nodes.epub_loader.zipfile.ZipFile', side_effect=epub_opener(epub)):
|
||||||
_, chapter_list = OmniVoiceEpubLoader().load_epub('/fake.epub', 2, 2)
|
_, _, chapter_list = OmniVoiceEpubLoader().load_epub('/fake.epub', 2, 2)
|
||||||
lines = chapter_list.strip().splitlines()
|
lines = chapter_list.strip().splitlines()
|
||||||
assert len(lines) == 3
|
assert len(lines) == 3
|
||||||
assert lines[0].startswith("1.")
|
assert lines[0].startswith("1.")
|
||||||
@@ -94,14 +94,14 @@ def test_chapter_list_contains_all():
|
|||||||
def test_range_clamping_high():
|
def test_range_clamping_high():
|
||||||
epub = make_fake_epub([("A", "<p>aaa</p>"), ("B", "<p>bbb</p>")])
|
epub = make_fake_epub([("A", "<p>aaa</p>"), ("B", "<p>bbb</p>")])
|
||||||
with patch('nodes.epub_loader.zipfile.ZipFile', side_effect=epub_opener(epub)):
|
with patch('nodes.epub_loader.zipfile.ZipFile', side_effect=epub_opener(epub)):
|
||||||
text, _ = OmniVoiceEpubLoader().load_epub('/fake.epub', 1, 99)
|
text, _, _ = OmniVoiceEpubLoader().load_epub('/fake.epub', 1, 99)
|
||||||
assert "aaa" in text and "bbb" in text
|
assert "aaa" in text and "bbb" in text
|
||||||
|
|
||||||
|
|
||||||
def test_range_clamping_end_below_start():
|
def test_range_clamping_end_below_start():
|
||||||
epub = make_fake_epub([("A", "<p>aaa</p>"), ("B", "<p>bbb</p>")])
|
epub = make_fake_epub([("A", "<p>aaa</p>"), ("B", "<p>bbb</p>")])
|
||||||
with patch('nodes.epub_loader.zipfile.ZipFile', side_effect=epub_opener(epub)):
|
with patch('nodes.epub_loader.zipfile.ZipFile', side_effect=epub_opener(epub)):
|
||||||
text, _ = OmniVoiceEpubLoader().load_epub('/fake.epub', 2, 1)
|
text, _, _ = OmniVoiceEpubLoader().load_epub('/fake.epub', 2, 1)
|
||||||
assert "bbb" in text
|
assert "bbb" in text
|
||||||
assert "aaa" not in text
|
assert "aaa" not in text
|
||||||
|
|
||||||
@@ -119,14 +119,14 @@ def test_missing_title_fallback():
|
|||||||
z.writestr('OEBPS/ch0.xhtml', '<html><body><p>No title here</p></body></html>')
|
z.writestr('OEBPS/ch0.xhtml', '<html><body><p>No title here</p></body></html>')
|
||||||
buf.seek(0)
|
buf.seek(0)
|
||||||
with patch('nodes.epub_loader.zipfile.ZipFile', side_effect=epub_opener(buf.read())):
|
with patch('nodes.epub_loader.zipfile.ZipFile', side_effect=epub_opener(buf.read())):
|
||||||
_, chapter_list = OmniVoiceEpubLoader().load_epub('/fake.epub', 1, 1)
|
_, _, chapter_list = OmniVoiceEpubLoader().load_epub('/fake.epub', 1, 1)
|
||||||
assert "1. Chapter 1" in chapter_list
|
assert "1. Chapter 1" in chapter_list
|
||||||
|
|
||||||
|
|
||||||
def test_script_style_stripped():
|
def test_script_style_stripped():
|
||||||
epub = make_fake_epub([("Test", '<script>alert("xss")</script><style>color:red</style><p>clean</p>')])
|
epub = make_fake_epub([("Test", '<script>alert("xss")</script><style>color:red</style><p>clean</p>')])
|
||||||
with patch('nodes.epub_loader.zipfile.ZipFile', side_effect=epub_opener(epub)):
|
with patch('nodes.epub_loader.zipfile.ZipFile', side_effect=epub_opener(epub)):
|
||||||
text, _ = OmniVoiceEpubLoader().load_epub('/fake.epub', 1, 1)
|
text, _, _ = OmniVoiceEpubLoader().load_epub('/fake.epub', 1, 1)
|
||||||
assert "alert" not in text
|
assert "alert" not in text
|
||||||
assert "color" not in text
|
assert "color" not in text
|
||||||
assert "clean" in text
|
assert "clean" in text
|
||||||
|
|||||||
@@ -46,7 +46,7 @@ def test_generate_auto_voice():
|
|||||||
assert "sample_rate" in audio
|
assert "sample_rate" in audio
|
||||||
assert audio["sample_rate"] == 24000
|
assert audio["sample_rate"] == 24000
|
||||||
mock_model.generate.assert_called_once_with(
|
mock_model.generate.assert_called_once_with(
|
||||||
text="Hello world", speed=1.0, num_step=32
|
text="Hello world", speed=1.0, num_step=32, guidance_scale=2.0
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -64,7 +64,7 @@ def test_generate_voice_design():
|
|||||||
audio = result[0]
|
audio = result[0]
|
||||||
assert audio["sample_rate"] == 24000
|
assert audio["sample_rate"] == 24000
|
||||||
mock_model.generate.assert_called_once_with(
|
mock_model.generate.assert_called_once_with(
|
||||||
text="Hello world", instruct="female, low pitch", speed=1.0, num_step=32
|
text="Hello world", instruct="female, low pitch", speed=1.0, num_step=32, guidance_scale=2.0
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,70 @@
|
|||||||
|
import { app } from "../../scripts/app.js";
|
||||||
|
|
||||||
|
const MAX_SPEAKERS = 8;
|
||||||
|
|
||||||
|
app.registerExtension({
|
||||||
|
name: "OmniVoice.MultiSpeaker",
|
||||||
|
|
||||||
|
beforeRegisterNodeDef(nodeType, nodeData) {
|
||||||
|
if (nodeData.name !== "OmniVoiceSpeakers") return;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Ensure the node has exactly `count` speaker_N inputs.
|
||||||
|
* Safe to call multiple times with the same count (idempotent).
|
||||||
|
*/
|
||||||
|
function syncSpeakerInputs(node, count) {
|
||||||
|
count = Math.max(2, Math.min(MAX_SPEAKERS, Math.floor(count)));
|
||||||
|
|
||||||
|
// Add any missing slots
|
||||||
|
for (let i = 1; i <= count; i++) {
|
||||||
|
const name = `speaker_${i}`;
|
||||||
|
if (!node.inputs?.find(inp => inp.name === name)) {
|
||||||
|
node.addInput(name, "OMNIVOICE_SPEAKER");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove excess slots (high → low so indices stay valid)
|
||||||
|
for (let i = MAX_SPEAKERS; i > count; i--) {
|
||||||
|
const name = `speaker_${i}`;
|
||||||
|
const idx = node.inputs?.findIndex(inp => inp.name === name) ?? -1;
|
||||||
|
if (idx === -1) continue;
|
||||||
|
// Sever any connected link before removing the slot
|
||||||
|
const linkId = node.inputs[idx].link;
|
||||||
|
if (linkId != null) node.graph?.removeLink(linkId);
|
||||||
|
node.removeInput(idx);
|
||||||
|
}
|
||||||
|
|
||||||
|
node.setDirtyCanvas(true, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Attach the num_speakers widget callback once per node instance.
|
||||||
|
* Guarded by a flag so configure() can call it safely on reload.
|
||||||
|
*/
|
||||||
|
function attachCallback(node) {
|
||||||
|
if (node._omnivoiceCbAttached) return;
|
||||||
|
const w = node.widgets?.find(w => w.name === "num_speakers");
|
||||||
|
if (!w) return;
|
||||||
|
node._omnivoiceCbAttached = true;
|
||||||
|
w.callback = (value) => syncSpeakerInputs(node, value);
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- Fresh node creation ---
|
||||||
|
const onNodeCreated = nodeType.prototype.onNodeCreated;
|
||||||
|
nodeType.prototype.onNodeCreated = function () {
|
||||||
|
onNodeCreated?.apply(this, arguments);
|
||||||
|
attachCallback(this);
|
||||||
|
const w = this.widgets?.find(w => w.name === "num_speakers");
|
||||||
|
if (w) syncSpeakerInputs(this, w.value);
|
||||||
|
};
|
||||||
|
|
||||||
|
// --- Workflow load: called by LiteGraph after widget values are restored ---
|
||||||
|
const onConfigure = nodeType.prototype.onConfigure;
|
||||||
|
nodeType.prototype.onConfigure = function (data) {
|
||||||
|
onConfigure?.apply(this, arguments);
|
||||||
|
attachCallback(this);
|
||||||
|
const w = this.widgets?.find(w => w.name === "num_speakers");
|
||||||
|
if (w) syncSpeakerInputs(this, w.value);
|
||||||
|
};
|
||||||
|
},
|
||||||
|
});
|
||||||
@@ -1,55 +1,115 @@
|
|||||||
{
|
{
|
||||||
"id": "85925298-bd01-4df7-b8e5-ac92f9d6582e",
|
"id": "85925298-bd01-4df7-b8e5-ac92f9d6582e",
|
||||||
"revision": 0,
|
"revision": 0,
|
||||||
"last_node_id": 6,
|
"last_node_id": 14,
|
||||||
"last_link_id": 9,
|
"last_link_id": 30,
|
||||||
"nodes": [
|
"nodes": [
|
||||||
{
|
{
|
||||||
"id": 1,
|
"id": 6,
|
||||||
"type": "OmniVoiceModelLoader",
|
"type": "PreviewAudio",
|
||||||
"pos": [
|
"pos": [
|
||||||
32,
|
928,
|
||||||
96
|
96
|
||||||
],
|
],
|
||||||
"size": [
|
"size": [
|
||||||
320,
|
448,
|
||||||
160
|
96
|
||||||
],
|
],
|
||||||
"flags": {},
|
"flags": {},
|
||||||
"order": 0,
|
"order": 4,
|
||||||
"mode": 0,
|
"mode": 0,
|
||||||
"inputs": [],
|
"inputs": [
|
||||||
"outputs": [
|
{
|
||||||
|
"name": "audio",
|
||||||
|
"type": "AUDIO",
|
||||||
|
"link": 27
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"outputs": [],
|
||||||
|
"properties": {
|
||||||
|
"cnr_id": "comfy-core",
|
||||||
|
"ver": "0.18.1",
|
||||||
|
"Node name for S&R": "PreviewAudio",
|
||||||
|
"ue_properties": {
|
||||||
|
"widget_ue_connectable": {},
|
||||||
|
"input_ue_unconnectable": {},
|
||||||
|
"version": "7.8"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"widgets_values": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 14,
|
||||||
|
"type": "OmniVoiceGenerate",
|
||||||
|
"pos": [
|
||||||
|
512,
|
||||||
|
96
|
||||||
|
],
|
||||||
|
"size": [
|
||||||
|
384,
|
||||||
|
448
|
||||||
|
],
|
||||||
|
"flags": {},
|
||||||
|
"order": 3,
|
||||||
|
"mode": 0,
|
||||||
|
"inputs": [
|
||||||
{
|
{
|
||||||
"name": "model",
|
"name": "model",
|
||||||
"type": "OMNIVOICE_MODEL",
|
"type": "OMNIVOICE_MODEL",
|
||||||
"slot_index": 0,
|
"link": 23
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ref_audio",
|
||||||
|
"shape": 7,
|
||||||
|
"type": "AUDIO",
|
||||||
|
"link": 29
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "ref_text",
|
||||||
|
"shape": 7,
|
||||||
|
"type": "STRING",
|
||||||
|
"widget": {
|
||||||
|
"name": "ref_text"
|
||||||
|
},
|
||||||
|
"link": 30
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "audio",
|
||||||
|
"type": "AUDIO",
|
||||||
"links": [
|
"links": [
|
||||||
5
|
27
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"properties": {
|
"properties": {
|
||||||
"aux_id": "ethanfel/ComfyUI-Omnivoice",
|
"cnr_id": "comfyui-omnivoice-fel",
|
||||||
"ver": "9b62c9bda8469171025709e2fae3ce877ee0f059",
|
"ver": "340c0aa402fdf8b56fad5eb1559ff901c7fc7cfc",
|
||||||
"Node name for S&R": "OmniVoiceModelLoader",
|
"Node name for S&R": "OmniVoiceGenerate",
|
||||||
"ue_properties": {
|
"ue_properties": {
|
||||||
"widget_ue_connectable": {},
|
"widget_ue_connectable": {},
|
||||||
"version": "7.8",
|
"input_ue_unconnectable": {},
|
||||||
"input_ue_unconnectable": {}
|
"version": "7.8"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"widgets_values": [
|
"widgets_values": [
|
||||||
"cuda:0",
|
"Hello! This is a test of the OmniVoice text-to-speech system.",
|
||||||
"float16",
|
"voice_cloning",
|
||||||
false
|
"",
|
||||||
|
"",
|
||||||
|
2,
|
||||||
|
1,
|
||||||
|
32,
|
||||||
|
42,
|
||||||
|
"fixed"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 2,
|
"id": 2,
|
||||||
"type": "OmniVoiceVoicePreset",
|
"type": "OmniVoiceVoicePreset",
|
||||||
"pos": [
|
"pos": [
|
||||||
32,
|
64,
|
||||||
320
|
320
|
||||||
],
|
],
|
||||||
"size": [
|
"size": [
|
||||||
@@ -57,7 +117,7 @@
|
|||||||
128
|
128
|
||||||
],
|
],
|
||||||
"flags": {},
|
"flags": {},
|
||||||
"order": 1,
|
"order": 0,
|
||||||
"mode": 0,
|
"mode": 0,
|
||||||
"inputs": [],
|
"inputs": [],
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@@ -66,7 +126,7 @@
|
|||||||
"type": "AUDIO",
|
"type": "AUDIO",
|
||||||
"slot_index": 0,
|
"slot_index": 0,
|
||||||
"links": [
|
"links": [
|
||||||
6
|
29
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -74,14 +134,15 @@
|
|||||||
"type": "STRING",
|
"type": "STRING",
|
||||||
"slot_index": 1,
|
"slot_index": 1,
|
||||||
"links": [
|
"links": [
|
||||||
7
|
30
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"properties": {
|
"properties": {
|
||||||
"aux_id": "ethanfel/ComfyUI-Omnivoice",
|
"cnr_id": "comfyui-omnivoice-fel",
|
||||||
"ver": "9b62c9bda8469171025709e2fae3ce877ee0f059",
|
"ver": "9b62c9bda8469171025709e2fae3ce877ee0f059",
|
||||||
"Node name for S&R": "OmniVoiceVoicePreset",
|
"Node name for S&R": "OmniVoiceVoicePreset",
|
||||||
|
"aux_id": "ethanfel/ComfyUI-Omnivoice",
|
||||||
"ue_properties": {
|
"ue_properties": {
|
||||||
"widget_ue_connectable": {},
|
"widget_ue_connectable": {},
|
||||||
"version": "7.8",
|
"version": "7.8",
|
||||||
@@ -93,136 +154,124 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 5,
|
"id": 1,
|
||||||
"type": "OmniVoiceGenerate",
|
"type": "OmniVoiceModelLoader",
|
||||||
"pos": [
|
"pos": [
|
||||||
416,
|
64,
|
||||||
96
|
96
|
||||||
],
|
],
|
||||||
"size": [
|
"size": [
|
||||||
384,
|
320,
|
||||||
448
|
160
|
||||||
|
],
|
||||||
|
"flags": {},
|
||||||
|
"order": 1,
|
||||||
|
"mode": 0,
|
||||||
|
"inputs": [],
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "model",
|
||||||
|
"type": "OMNIVOICE_MODEL",
|
||||||
|
"slot_index": 0,
|
||||||
|
"links": [
|
||||||
|
23
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"properties": {
|
||||||
|
"cnr_id": "comfyui-omnivoice-fel",
|
||||||
|
"ver": "9b62c9bda8469171025709e2fae3ce877ee0f059",
|
||||||
|
"Node name for S&R": "OmniVoiceModelLoader",
|
||||||
|
"aux_id": "ethanfel/ComfyUI-Omnivoice",
|
||||||
|
"ue_properties": {
|
||||||
|
"widget_ue_connectable": {},
|
||||||
|
"version": "7.8",
|
||||||
|
"input_ue_unconnectable": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"widgets_values": [
|
||||||
|
"cuda:0",
|
||||||
|
"float32",
|
||||||
|
false
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 13,
|
||||||
|
"type": "OmniVoiceVoiceDesign",
|
||||||
|
"pos": [
|
||||||
|
64,
|
||||||
|
512
|
||||||
|
],
|
||||||
|
"size": [
|
||||||
|
288,
|
||||||
|
288
|
||||||
],
|
],
|
||||||
"flags": {},
|
"flags": {},
|
||||||
"order": 2,
|
"order": 2,
|
||||||
"mode": 0,
|
"mode": 0,
|
||||||
"inputs": [
|
"inputs": [],
|
||||||
{
|
|
||||||
"name": "model",
|
|
||||||
"type": "OMNIVOICE_MODEL",
|
|
||||||
"link": 5
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "ref_audio",
|
|
||||||
"shape": 7,
|
|
||||||
"type": "AUDIO",
|
|
||||||
"link": 6
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "ref_text",
|
|
||||||
"shape": 7,
|
|
||||||
"type": "STRING",
|
|
||||||
"widget": {
|
|
||||||
"name": "ref_text"
|
|
||||||
},
|
|
||||||
"link": 7
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"name": "audio",
|
"name": "instruct",
|
||||||
"type": "AUDIO",
|
"type": "STRING",
|
||||||
"links": [
|
"links": []
|
||||||
9
|
|
||||||
]
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"properties": {
|
"properties": {
|
||||||
|
"cnr_id": "comfyui-omnivoice-fel",
|
||||||
|
"ver": "340c0aa402fdf8b56fad5eb1559ff901c7fc7cfc",
|
||||||
|
"Node name for S&R": "OmniVoiceVoiceDesign",
|
||||||
"ue_properties": {
|
"ue_properties": {
|
||||||
"widget_ue_connectable": {},
|
"widget_ue_connectable": {},
|
||||||
"input_ue_unconnectable": {}
|
"input_ue_unconnectable": {},
|
||||||
},
|
"version": "7.8"
|
||||||
"aux_id": "ethanfel/ComfyUI-Omnivoice",
|
}
|
||||||
"ver": "9b62c9bda8469171025709e2fae3ce877ee0f059",
|
|
||||||
"Node name for S&R": "OmniVoiceGenerate"
|
|
||||||
},
|
},
|
||||||
"widgets_values": [
|
"widgets_values": [
|
||||||
"Hello! This is a test of the OmniVoice text-to-speech system.",
|
"English",
|
||||||
"voice_cloning",
|
"none",
|
||||||
"",
|
"none",
|
||||||
"",
|
"none",
|
||||||
1,
|
"american accent",
|
||||||
32,
|
"none",
|
||||||
42,
|
"none",
|
||||||
"fixed"
|
"none",
|
||||||
|
"none"
|
||||||
]
|
]
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 6,
|
|
||||||
"type": "PreviewAudio",
|
|
||||||
"pos": [
|
|
||||||
832,
|
|
||||||
96
|
|
||||||
],
|
|
||||||
"size": [
|
|
||||||
270,
|
|
||||||
88
|
|
||||||
],
|
|
||||||
"flags": {},
|
|
||||||
"order": 3,
|
|
||||||
"mode": 0,
|
|
||||||
"inputs": [
|
|
||||||
{
|
|
||||||
"name": "audio",
|
|
||||||
"type": "AUDIO",
|
|
||||||
"link": 9
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"outputs": [],
|
|
||||||
"properties": {
|
|
||||||
"ue_properties": {
|
|
||||||
"widget_ue_connectable": {},
|
|
||||||
"input_ue_unconnectable": {}
|
|
||||||
},
|
|
||||||
"cnr_id": "comfy-core",
|
|
||||||
"ver": "0.18.1",
|
|
||||||
"Node name for S&R": "PreviewAudio"
|
|
||||||
},
|
|
||||||
"widgets_values": []
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"links": [
|
"links": [
|
||||||
[
|
[
|
||||||
5,
|
23,
|
||||||
1,
|
1,
|
||||||
0,
|
0,
|
||||||
5,
|
14,
|
||||||
0,
|
0,
|
||||||
"OMNIVOICE_MODEL"
|
"OMNIVOICE_MODEL"
|
||||||
],
|
],
|
||||||
[
|
[
|
||||||
|
27,
|
||||||
|
14,
|
||||||
|
0,
|
||||||
6,
|
6,
|
||||||
|
0,
|
||||||
|
"AUDIO"
|
||||||
|
],
|
||||||
|
[
|
||||||
|
29,
|
||||||
2,
|
2,
|
||||||
0,
|
0,
|
||||||
5,
|
14,
|
||||||
1,
|
1,
|
||||||
"AUDIO"
|
"AUDIO"
|
||||||
],
|
],
|
||||||
[
|
[
|
||||||
7,
|
30,
|
||||||
2,
|
2,
|
||||||
1,
|
1,
|
||||||
5,
|
14,
|
||||||
2,
|
2,
|
||||||
"STRING"
|
"STRING"
|
||||||
],
|
|
||||||
[
|
|
||||||
9,
|
|
||||||
5,
|
|
||||||
0,
|
|
||||||
6,
|
|
||||||
0,
|
|
||||||
"AUDIO"
|
|
||||||
]
|
]
|
||||||
],
|
],
|
||||||
"groups": [],
|
"groups": [],
|
||||||
@@ -231,8 +280,8 @@
|
|||||||
"ds": {
|
"ds": {
|
||||||
"scale": 1.201632379715383,
|
"scale": 1.201632379715383,
|
||||||
"offset": [
|
"offset": [
|
||||||
1340.797529638862,
|
1415.539606763376,
|
||||||
246.44960712937743
|
123.61214765199416
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"ue_links": [],
|
"ue_links": [],
|
||||||
|
|||||||
Reference in New Issue
Block a user