feat: add Voice Design node + language and guidance_scale to Generate

OmniVoiceVoiceDesign: structured dropdowns for gender/age/pitch/accent that are composed into an instruct string; wire its output to Generate's instruct input. OmniVoiceGenerate: new optional parameters language (dropdown: auto + 11 languages) and guidance_scale (CFG, default 2.0). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-05 20:02:06 +02:00
parent 97ed0f209f
commit c1558efad9
4 changed files with 57 additions and 4 deletions
@@ -1,4 +1,4 @@
-from .nodes import OmniVoiceModelLoader, OmniVoiceGenerate, OmniVoiceEpubLoader, OmniVoiceVoicePreset, OmniVoiceMixVoices
+from .nodes import OmniVoiceModelLoader, OmniVoiceGenerate, OmniVoiceEpubLoader, OmniVoiceVoicePreset, OmniVoiceMixVoices, OmniVoiceVoiceDesign
 NODE_CLASS_MAPPINGS = {
    "OmniVoiceModelLoader": OmniVoiceModelLoader,
@@ -6,6 +6,7 @@ NODE_CLASS_MAPPINGS = {
    "OmniVoiceEpubLoader": OmniVoiceEpubLoader,
    "OmniVoiceVoicePreset": OmniVoiceVoicePreset,
    "OmniVoiceMixVoices": OmniVoiceMixVoices,
    "OmniVoiceVoiceDesign": OmniVoiceVoiceDesign,
 }
 NODE_DISPLAY_NAME_MAPPINGS = {
@@ -14,6 +15,7 @@ NODE_DISPLAY_NAME_MAPPINGS = {
    "OmniVoiceEpubLoader": "OmniVoice EPUB Loader",
    "OmniVoiceVoicePreset": "OmniVoice Voice Preset",
    "OmniVoiceMixVoices": "OmniVoice Mix Voices",
    "OmniVoiceVoiceDesign": "OmniVoice Voice Design",
 }
 __all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS"]
@@ -3,5 +3,6 @@ from .generator import OmniVoiceGenerate
 from .epub_loader import OmniVoiceEpubLoader
 from .voice_presets import OmniVoiceVoicePreset
 from .mix_voices import OmniVoiceMixVoices
 from .voice_design import OmniVoiceVoiceDesign
-__all__ = ["OmniVoiceModelLoader", "OmniVoiceGenerate", "OmniVoiceEpubLoader", "OmniVoiceVoicePreset", "OmniVoiceMixVoices"]
+__all__ = ["OmniVoiceModelLoader", "OmniVoiceGenerate", "OmniVoiceEpubLoader", "OmniVoiceVoicePreset", "OmniVoiceMixVoices", "OmniVoiceVoiceDesign"]
@@ -62,6 +62,14 @@ class OmniVoiceGenerate:
                    "default": "",
                    "tooltip": "Transcription of ref_audio. Connect a Whisper (or other STT) node for best results.",
                }),
                "language": (
                    ["auto", "English", "Chinese", "Japanese", "Korean", "French",
                     "Spanish", "German", "Portuguese", "Russian", "Arabic", "Hindi"],
                    {
                        "default": "auto",
                        "tooltip": "Language of the text to synthesize. 'auto' lets the model detect it.",
                    },
                ),
                "instruct": ("STRING", {
                    "default": "",
                    "tooltip": (
@@ -79,6 +87,13 @@ class OmniVoiceGenerate:
                        "EXAMPLE:  female, high pitch, british accent"
                    ),
                }),
                "guidance_scale": ("FLOAT", {
                    "default": 2.0, "min": 0.0, "max": 20.0, "step": 0.1,
                    "tooltip": (
                        "Classifier-free guidance scale. Higher = more faithful to the reference/instruct, "
                        "but can over-saturate. 2.0 is a good default."
                    ),
                }),
                "speed": ("FLOAT", {
                    "default": 1.0, "min": 0.1, "max": 3.0, "step": 0.1,
                    "tooltip": "Playback speed multiplier. 1.0 = normal, >1.0 = faster, <1.0 = slower.",
@@ -104,10 +119,13 @@ class OmniVoiceGenerate:
    FUNCTION = "generate"
    CATEGORY = "OmniVoice"
-    def generate(self, model, text, mode, ref_audio=None, ref_text="", instruct="", speed=1.0, num_step=32, seed=0):
+    def generate(self, model, text, mode, ref_audio=None, ref_text="", language="auto",
                 instruct="", guidance_scale=2.0, speed=1.0, num_step=32, seed=0):
        if seed != 0:
            torch.manual_seed(seed)
-        kwargs = {"text": text, "speed": speed, "num_step": num_step}
+        kwargs = {"text": text, "speed": speed, "num_step": num_step, "guidance_scale": guidance_scale}
        if language != "auto":
            kwargs["language"] = language
        if mode == "voice_cloning" and ref_audio is None:
            raise ValueError("voice_cloning mode requires ref_audio to be connected")
@@ -0,0 +1,32 @@
 class OmniVoiceVoiceDesign:
     """Build an instruct string from four dropdown-selected voice attributes.

     Every input is a combo box. Picking the literal entry "none" leaves that
     attribute out; the remaining picks are joined with ", " in the fixed
     order gender, age, pitch, accent.
     """

     # Choice lists shown in the dropdowns; "none" is the opt-out entry.
     GENDERS = ["none", "male", "female"]
     AGES = ["none", "child", "teenager", "young adult", "middle-aged", "elderly"]
     PITCHES = [
         "none",
         "very low pitch", "low pitch", "moderate pitch", "high pitch", "very high pitch",
         "whisper",
     ]
     ACCENTS = [
         "none",
         "american accent", "british accent", "australian accent", "canadian accent",
         "indian accent", "chinese accent", "japanese accent", "korean accent",
         "portuguese accent", "russian accent",
     ]

     @classmethod
     def INPUT_TYPES(cls):
         """Return the node's input spec: four required dropdowns with defaults."""
         # (input name, choices, preselected entry) -- insertion order is kept
         # in the resulting dict.
         dropdowns = (
             ("gender", cls.GENDERS, "female"),
             ("age", cls.AGES, "none"),
             ("pitch", cls.PITCHES, "none"),
             ("accent", cls.ACCENTS, "none"),
         )
         required = {}
         for field, choices, preselected in dropdowns:
             required[field] = (choices, {"default": preselected})
         return {"required": required}

     RETURN_TYPES = ("STRING",)
     RETURN_NAMES = ("instruct",)
     FUNCTION = "compose"
     CATEGORY = "OmniVoice"

     def compose(self, gender, age, pitch, accent):
         """Join every selection that is not "none" into one comma-separated string.

         Returns a 1-tuple holding the composed string; the string is empty
         when all four selections are "none".
         """
         chosen = []
         for selection in (gender, age, pitch, accent):
             if selection == "none":
                 continue
             chosen.append(selection)
         return (", ".join(chosen),)