feat: add language selector for voice_design + Chinese instruct support

- Generate: language dropdown (auto/English/Chinese), passed to the model only in voice_design and auto_voice modes, where it selects the instruct vocabulary
- VoiceDesign: Chinese mode with dialect/age/pitch/gender dropdowns using the model's validated Chinese instruct vocabulary; items are joined with a full-width comma (全角逗号, "，")

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-05 20:22:25 +02:00
parent e26bac3684
commit 772f6654d4
2 changed files with 52 additions and 9 deletions
@@ -62,6 +62,18 @@ class OmniVoiceGenerate:
                    "default": "",
                    "tooltip": "Transcription of ref_audio. Connect a Whisper (or other STT) node for best results.",
                }),
+                "language": (
+                    ["auto", "English", "Chinese"],
+                    {
+                        "default": "auto",
+                        "tooltip": (
+                            "Used in voice_design mode to select the instruct vocabulary.\n"
+                            "'English' uses English instruct items (male, female, british accent …)\n"
+                            "'Chinese' uses Chinese dialect items (男, 女, 四川话, 东北话 …)\n"
+                            "Has no effect in voice_cloning mode (language is inferred from text)."
+                        ),
+                    },
+                ),
                "instruct": ("STRING", {
                    "default": "",
                    "tooltip": (
@@ -113,11 +125,13 @@ class OmniVoiceGenerate:
    FUNCTION = "generate"
    CATEGORY = "OmniVoice"

-    def generate(self, model, text, mode, ref_audio=None, ref_text="",
+    def generate(self, model, text, mode, ref_audio=None, ref_text="", language="auto",
                 instruct="", guidance_scale=2.0, speed=1.0, num_step=32, seed=0):
        if seed != 0:
            torch.manual_seed(seed)
        kwargs = {"text": text, "speed": speed, "num_step": num_step, "guidance_scale": guidance_scale}
+        if mode != "voice_cloning" and language and language != "auto":
+            kwargs["language"] = language

        if mode == "voice_cloning" and ref_audio is None:
            raise ValueError("voice_cloning mode requires ref_audio to be connected")
@@ -11,7 +11,7 @@ class OmniVoiceVoiceDesign:
        "high pitch", "very high pitch", "whisper",
    ]

-    # Exactly the accents validated by the model's _resolve_instruct()
+    # Exactly the accents validated by the model's _resolve_instruct() for English
    ACCENTS = [
        "none",
        "american accent", "australian accent", "british accent",
@@ -20,18 +20,42 @@ class OmniVoiceVoiceDesign:
        "russian accent",
    ]

+    # Chinese instruct items (gender, age, pitch, dialect) validated by the model's _resolve_instruct()
+    ZH_GENDERS  = ["none", "男", "女"]
+    ZH_AGES     = ["none", "儿童", "少年", "青年", "中年", "老年"]
+    ZH_PITCHES  = ["none", "极低音调", "低音调", "中音调", "高音调", "极高音调", "耳语"]
+    ZH_DIALECTS = [
+        "none",
+        "东北话", "云南话", "四川话", "宁夏话", "桂林话",
+        "河南话", "济南话", "甘肃话", "石家庄话", "贵州话",
+        "陕西话", "青岛话",
+    ]
+
    @classmethod
    def INPUT_TYPES(cls):
        return {
            "required": {
+                "language": (
+                    ["English", "Chinese"],
+                    {
+                        "default": "English",
+                        "tooltip": "Selects the instruct vocabulary. Must match the language set in OmniVoice Generate.",
+                    },
+                ),
                "gender": (cls.GENDERS, {"default": "female",
-                    "tooltip": "Voice gender."}),
+                    "tooltip": "Voice gender (English). Ignored when language is Chinese — use zh_gender."}),
                "age":    (cls.AGES,    {"default": "none",
-                    "tooltip": "Approximate age of the speaker."}),
+                    "tooltip": "Age of the speaker (English). Ignored when language is Chinese — use zh_age."}),
                "pitch":  (cls.PITCHES, {"default": "none",
-                    "tooltip": "Pitch / register of the voice."}),
+                    "tooltip": "Pitch (English). Ignored when language is Chinese — use zh_pitch."}),
                "accent": (cls.ACCENTS, {"default": "none",
-                    "tooltip": "Accent validated by the model. Only these 10 are supported."}),
+                    "tooltip": "Accent (English only, 10 supported values)."}),
+            },
+            "optional": {
+                "zh_gender":  (cls.ZH_GENDERS,  {"default": "none", "tooltip": "声线性别 (Chinese mode)"}),
+                "zh_age":     (cls.ZH_AGES,      {"default": "none", "tooltip": "年龄段 (Chinese mode)"}),
+                "zh_pitch":   (cls.ZH_PITCHES,   {"default": "none", "tooltip": "音调 (Chinese mode)"}),
+                "zh_dialect": (cls.ZH_DIALECTS,  {"default": "none", "tooltip": "方言/口音 (Chinese mode)"}),
            },
        }

@@ -40,6 +64,11 @@ class OmniVoiceVoiceDesign:
    FUNCTION      = "compose"
    CATEGORY      = "OmniVoice"

-    def compose(self, gender, age, pitch, accent):
-        parts = [v for v in [gender, age, pitch, accent] if v != "none"]
-        return (", ".join(parts),)
+    def compose(self, language, gender, age, pitch, accent,
+                zh_gender="none", zh_age="none", zh_pitch="none", zh_dialect="none"):
+        if language == "Chinese":
+            parts = [v for v in [zh_gender, zh_age, zh_pitch, zh_dialect] if v != "none"]
+            return ("，".join(parts),)
+        else:
+            parts = [v for v in [gender, age, pitch, accent] if v != "none"]
+            return (", ".join(parts),)