feat: add language selector for voice_design + Chinese instruct support

- Generate: language dropdown (auto/English/Chinese), passed only in voice_design and auto_voice modes, where it selects the instruct vocab
- VoiceDesign: Chinese mode with dialect/age/pitch/gender dropdowns using the model's validated Chinese instruct vocabulary (items joined with a full-width comma, 全角逗号)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-05 20:22:25 +02:00
parent e26bac3684
commit 772f6654d4
2 changed files with 52 additions and 9 deletions
@@ -62,6 +62,18 @@ class OmniVoiceGenerate:
                    "default": "",
                    "tooltip": "Transcription of ref_audio. Connect a Whisper (or other STT) node for best results.",
                }),
                "language": (
                    ["auto", "English", "Chinese"],
                    {
                        "default": "auto",
                        "tooltip": (
                            "Used in voice_design mode to select the instruct vocabulary.\n"
                            "'English' uses English instruct items (male, female, british accent …)\n"
                            "'Chinese' uses Chinese dialect items (男, 女, 四川话, 东北话 …)\n"
                            "Has no effect in voice_cloning mode (language is inferred from text)."
                        ),
                    },
                ),
                "instruct": ("STRING", {
                    "default": "",
                    "tooltip": (
@@ -113,11 +125,13 @@ class OmniVoiceGenerate:
    FUNCTION = "generate"
    CATEGORY = "OmniVoice"
-    def generate(self, model, text, mode, ref_audio=None, ref_text="",
+    def generate(self, model, text, mode, ref_audio=None, ref_text="", language="auto",
                 instruct="", guidance_scale=2.0, speed=1.0, num_step=32, seed=0):
        if seed != 0:
            torch.manual_seed(seed)
        kwargs = {"text": text, "speed": speed, "num_step": num_step, "guidance_scale": guidance_scale}
        if mode != "voice_cloning" and language and language != "auto":
            kwargs["language"] = language
        if mode == "voice_cloning" and ref_audio is None:
            raise ValueError("voice_cloning mode requires ref_audio to be connected")
@@ -11,7 +11,7 @@ class OmniVoiceVoiceDesign:
        "high pitch", "very high pitch", "whisper",
    ]
-    # Exactly the accents validated by the model's _resolve_instruct()
+    # Exactly the accents validated by the model's _resolve_instruct() for English
    ACCENTS = [
        "none",
        "american accent", "australian accent", "british accent",
@@ -20,18 +20,42 @@ class OmniVoiceVoiceDesign:
        "russian accent",
    ]
    # Chinese instruct vocabulary (gender, age, pitch, dialect) validated by the model's _resolve_instruct()
    ZH_GENDERS  = ["none", "男", "女"]
    ZH_AGES     = ["none", "儿童", "少年", "青年", "中年", "老年"]
    ZH_PITCHES  = ["none", "极低音调", "低音调", "中音调", "高音调", "极高音调", "耳语"]
    ZH_DIALECTS = [
        "none",
        "东北话", "云南话", "四川话", "宁夏话", "桂林话",
        "河南话", "济南话", "甘肃话", "石家庄话", "贵州话",
        "陕西话", "青岛话",
    ]
    @classmethod
    def INPUT_TYPES(cls):
        """Describe the inputs accepted by this node.

        Returns a dict with two groups, "required" and "optional". Each
        entry maps an input name to a ``(choices, options)`` tuple, where
        ``choices`` is the list of selectable values and ``options`` holds
        the default value and a tooltip string.

        The required group carries the language selector plus the English
        gender/age/pitch/accent dropdowns; the optional group carries the
        Chinese gender/age/pitch/dialect dropdowns.
        """
        return {
            "required": {
                # Chooses which set of dropdowns compose() reads.
                "language": (
                    ["English", "Chinese"],
                    {
                        "default": "English",
                        "tooltip": "Selects the instruct vocabulary. Must match the language set in OmniVoice Generate.",
                    },
                ),
                # English-vocabulary dropdowns; compose() uses them whenever
                # language is not "Chinese".
                "gender": (cls.GENDERS, {"default": "female",
-                    "tooltip": "Voice gender."}),
+                    "tooltip": "Voice gender (English). Ignored when language is Chinese — use zh_gender."}),
                "age":    (cls.AGES,    {"default": "none",
-                    "tooltip": "Approximate age of the speaker."}),
+                    "tooltip": "Age of the speaker (English). Ignored when language is Chinese — use zh_age."}),
                "pitch":  (cls.PITCHES, {"default": "none",
-                    "tooltip": "Pitch / register of the voice."}),
+                    "tooltip": "Pitch (English). Ignored when language is Chinese — use zh_pitch."}),
                "accent": (cls.ACCENTS, {"default": "none",
-                    "tooltip": "Accent validated by the model. Only these 10 are supported."}),
+                    "tooltip": "Accent (English only, 10 supported values)."}),
            },
            # Chinese-vocabulary dropdowns; each defaults to "none", which
            # compose() leaves out of the resulting string.
            "optional": {
                "zh_gender":  (cls.ZH_GENDERS,  {"default": "none", "tooltip": "声线性别 (Chinese mode)"}),
                "zh_age":     (cls.ZH_AGES,      {"default": "none", "tooltip": "年龄段 (Chinese mode)"}),
                "zh_pitch":   (cls.ZH_PITCHES,   {"default": "none", "tooltip": "音调 (Chinese mode)"}),
                "zh_dialect": (cls.ZH_DIALECTS,  {"default": "none", "tooltip": "方言/口音 (Chinese mode)"}),
            },
        }
@@ -40,6 +64,11 @@ class OmniVoiceVoiceDesign:
    FUNCTION      = "compose"
    CATEGORY      = "OmniVoice"
-    def compose(self, gender, age, pitch, accent):
+    def compose(self, language, gender, age, pitch, accent,
                zh_gender="none", zh_age="none", zh_pitch="none", zh_dialect="none"):
        if language == "Chinese":
            parts = [v for v in [zh_gender, zh_age, zh_pitch, zh_dialect] if v != "none"]
            return ("，".join(parts),)
        else:
            parts = [v for v in [gender, age, pitch, accent] if v != "none"]
            return (", ".join(parts),)