Initial release: ComfyUI-MisoTTS (modernized CSM 8B)

Modernized MisoTTS integration for ComfyUI with no torchtune/moshi:
- vendored plain-torch Llama backbone (csm_llama), parity-verified Δ=0 vs torchtune
- transformers.MimiModel codec (bit-identical codes to moshi), drops moshi/bnb/sphn
- low-memory loader: streams 32GB fp32 checkpoint to GPU in bf16 (~18GB VRAM)
- nodes: Model Loader, Generate (audiobook chunking + voice anchoring), EPUB Loader
- pin-free requirements; runs on modern torch / Blackwell GPUs

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-06 23:37:54 +02:00
commit f7a6f7790d
13 changed files with 1110 additions and 0 deletions
@@ -0,0 +1,5 @@
+from .loader import MisoTTSModelLoader
+from .generator import MisoTTSGenerate
+from .epub_loader import MisoTTSEpubLoader
+
+__all__ = ["MisoTTSModelLoader", "MisoTTSGenerate", "MisoTTSEpubLoader"]
@@ -0,0 +1,96 @@
+import re
+import zipfile
+import xml.etree.ElementTree as ET
+
+from bs4 import BeautifulSoup
+
+# Tags treated as block-level: a "\n\n" separator is appended to each of them before
+# the document is flattened to text, so paragraph boundaries survive extraction.
+_BLOCK_TAGS = {"p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "div", "br", "tr"}
+
+
+def _local(tag):
+    """Return an XML tag name with any ``{namespace}`` prefix removed."""
+    _, _, name = tag.rpartition("}")
+    return name
+
+
+def _extract_chapters(epub_path):
+    """Return the EPUB's spine documents, in reading order, as plain-text chapters.
+
+    The package (OPF) file is located through ``META-INF/container.xml``. Every spine
+    ``itemref`` that maps to a manifest item with an XHTML media type is parsed and
+    flattened to text. ``<script>``/``<style>`` content, the ``<title>`` element and all
+    ``h1``-``h3`` headings are removed from the body text; the chapter title is taken
+    from ``<title>`` or, failing that, from the first ``h1``, else ``h2``, else ``h3``.
+
+    Args:
+        epub_path: Path to the ``.epub`` archive.
+
+    Returns:
+        A list of ``{"title": str | None, "text": str}`` dicts, one per spine document.
+
+    Raises:
+        ValueError: If ``container.xml`` contains no ``rootfile`` element.
+        KeyError: If a referenced archive member or a required XML attribute is missing.
+    """
+    import posixpath
+    from urllib.parse import unquote, urldefrag
+
+    chapters = []
+    with zipfile.ZipFile(epub_path, "r") as zf:
+        container = ET.fromstring(zf.read("META-INF/container.xml"))
+        rootfile = next((el for el in container.iter() if _local(el.tag) == "rootfile"), None)
+        if rootfile is None:
+            raise ValueError(f"{epub_path!r}: META-INF/container.xml has no <rootfile> element.")
+        opf_path = rootfile.attrib["full-path"]
+        opf_dir = opf_path.rsplit("/", 1)[0] + "/" if "/" in opf_path else ""
+
+        opf = ET.fromstring(zf.read(opf_path))
+        manifest = {
+            el.attrib["id"]: el.attrib["href"]
+            for el in opf.iter()
+            if _local(el.tag) == "item" and "xhtml" in el.attrib.get("media-type", "")
+        }
+        spine = [el.attrib["idref"] for el in opf.iter() if _local(el.tag) == "itemref"]
+        members = set(zf.namelist())
+
+        for idref in spine:
+            href = manifest.get(idref)
+            if href is None:
+                # Spine entry that is not an XHTML manifest item: nothing to read.
+                continue
+            # Manifest hrefs are URL references relative to the OPF, so they may be
+            # percent-encoded, carry a "#fragment", or contain "../" segments, none of
+            # which appear in the zip member name. Fall back to the literal path so
+            # archives that only match verbatim behave exactly as before.
+            resolved = posixpath.normpath(
+                posixpath.join(opf_dir, unquote(urldefrag(href).url))
+            )
+            member = resolved if resolved in members else opf_dir + href
+            xhtml = zf.read(member).decode("utf-8", errors="replace")
+            soup = BeautifulSoup(xhtml, "html.parser")
+            for tag in soup(["script", "style"]):
+                tag.decompose()
+
+            # Title: prefer <title>, otherwise the first h1, else h2, else h3.
+            title = None
+            if soup.title and soup.title.string:
+                title = soup.title.string.strip()
+            if not title:
+                for hn in ["h1", "h2", "h3"]:
+                    tag = soup.find(hn)
+                    if tag:
+                        title = tag.get_text(strip=True)
+                        break
+
+            # The title and headings are reported separately, so drop them from the body.
+            if soup.title:
+                soup.title.decompose()
+            for hn in ["h1", "h2", "h3"]:
+                for tag in soup.find_all(hn):
+                    tag.decompose()
+
+            # Mark block boundaries before flattening, then normalise whitespace:
+            # collapse runs of non-newline whitespace, trim spaces around newlines,
+            # and cap consecutive newlines at one blank line.
+            for tag in soup.find_all(_BLOCK_TAGS):
+                tag.append(soup.new_string("\n\n"))
+            text = soup.get_text(separator="")
+            text = re.sub(r"[^\S\n]+", " ", text)
+            text = re.sub(r" *\n *", "\n", text)
+            text = re.sub(r"\n{3,}", "\n\n", text)
+            chapters.append({"title": title, "text": text.strip()})
+    return chapters
+
+
+class MisoTTSEpubLoader:
+    """ComfyUI node: read an EPUB and output the text of a chapter range.
+
+    The text output is meant to be wired into the MisoTTS Generate node.
+    """
+
+    RETURN_TYPES = ("STRING", "STRING", "STRING")
+    RETURN_NAMES = ("text", "chapter_title", "chapter_list")
+    FUNCTION = "load_epub"
+    CATEGORY = "MisoTTS"
+
+    @classmethod
+    def INPUT_TYPES(cls):
+        """Declare the node inputs: the EPUB path and a 1-indexed, inclusive chapter range."""
+        path_spec = ("STRING", {"default": "", "tooltip": "Absolute path to the .epub file."})
+        start_spec = ("INT", {
+            "default": 1, "min": 1, "max": 9999, "step": 1,
+            "tooltip": "First chapter (1-indexed). Clamped to valid range.",
+        })
+        end_spec = ("INT", {
+            "default": 1, "min": 1, "max": 9999, "step": 1,
+            "tooltip": "Last chapter (1-indexed, inclusive). Clamped automatically.",
+        })
+        return {
+            "required": {
+                "epub_path": path_spec,
+                "chapter_start": start_spec,
+                "chapter_end": end_spec,
+            },
+        }
+
+    def load_epub(self, epub_path, chapter_start, chapter_end):
+        """Return ``(text, chapter_title, chapter_list)`` for the requested chapter range.
+
+        Both bounds are clamped into ``1..number_of_chapters`` and the end is never
+        allowed below the start. Selected chapters are joined with a ``---`` separator
+        line. An EPUB that yields no chapters produces three empty strings.
+        """
+        chapters = _extract_chapters(epub_path)
+        if not chapters:
+            return ("", "", "")
+
+        total = len(chapters)
+        start = max(1, min(chapter_start, total))
+        end = max(start, min(chapter_end, total))
+
+        # Untitled chapters fall back to "Chapter <n>" using their 1-based position.
+        labels = [ch["title"] or f"Chapter {i}" for i, ch in enumerate(chapters, 1)]
+        chapter_list = "\n".join(f"{i}. {label}" for i, label in enumerate(labels, 1))
+        chapter_title = labels[start - 1]
+
+        selected = chapters[start - 1: end]
+        text = "\n\n---\n\n".join(ch["text"] for ch in selected)
+        return (text, chapter_title, chapter_list)
@@ -0,0 +1,175 @@
+import re
+
+import torch
+import torchaudio
+
+from ..misotts import Segment
+
+
+# --------------------------------------------------------------------------- audio helpers
+def _audio_to_mono24k(audio_dict, sr_target=24000):
+    """ComfyUI AUDIO dict -> 1-D mono float32 tensor at ``sr_target`` (24 kHz, Mimi's rate).
+
+    Args:
+        audio_dict: Mapping with a ``"waveform"`` tensor and a ``"sample_rate"``.
+            A 3-D waveform is treated as batched and only its first item is used;
+            a 2-D waveform is treated as ``(channels, samples)``; a 1-D waveform is
+            treated as a single channel.
+        sr_target: Output sample rate in Hz.
+
+    Returns:
+        A contiguous 1-D ``float32`` tensor.
+    """
+    # Convert up front: ``mean`` is not defined for integer tensors.
+    wav = audio_dict["waveform"].float()
+    sr = int(audio_dict["sample_rate"])
+    if wav.dim() == 3:
+        wav = wav[0]                      # (C, T)
+    elif wav.dim() == 1:
+        # Without this, the channel mix below would average over time instead.
+        wav = wav.unsqueeze(0)            # (1, T)
+    if wav.shape[0] > 1:
+        wav = wav.mean(0, keepdim=True)   # mix to mono
+    if sr != sr_target:
+        wav = torchaudio.functional.resample(wav, sr, sr_target)
+    return wav.squeeze(0).contiguous()
+
+
+# --------------------------------------------------------------------------- text chunking
+def _split_sentences(text):
+    """Split text at whitespace that follows ``.``, ``!``, ``?`` or ``…``; drop empty pieces."""
+    sentences = []
+    for piece in re.split(r"(?<=[.!?…])\s+", text.strip()):
+        piece = piece.strip()
+        if piece:
+            sentences.append(piece)
+    return sentences
+
+
+def _hard_split(s, max_chars):
+    """Break an over-long sentence so that no returned chunk exceeds ``max_chars``.
+
+    Splitting is attempted in order of preference: after commas, then between words,
+    and finally inside any single word that is itself longer than the limit (e.g. a
+    URL), which is cut into fixed-size slices.
+
+    Args:
+        s: The sentence to split.
+        max_chars: Maximum chunk length; values below 1 are treated as 1.
+
+    Returns:
+        A list of non-empty chunks, in order. Empty input yields an empty list.
+    """
+    limit = max(1, int(max_chars))
+
+    def _pack(tokens):
+        # Greedily join tokens with single spaces while the result fits the limit.
+        packed, cur = [], ""
+        for tok in tokens:
+            if cur and len(cur) + 1 + len(tok) > limit:
+                packed.append(cur)
+                cur = tok
+            else:
+                cur = f"{cur} {tok}".strip()
+        if cur:
+            packed.append(cur)
+        return packed
+
+    final = []
+    for clause in _pack(re.split(r"(?<=,)\s+", s)):
+        if len(clause) <= limit:
+            final.append(clause)
+            continue
+        # Clause still too long: repack word by word, slicing any word that could
+        # never fit on its own so the length guarantee actually holds.
+        pieces = []
+        for word in clause.split():
+            pieces.extend(word[i:i + limit] for i in range(0, len(word), limit))
+        final.extend(_pack(pieces))
+    return final
+
+
+def _chunk_text(text, max_chars):
+    """Split text into chunks of at most ``max_chars``, keeping sentences whole where possible.
+
+    Text is first cut at blank lines and at runs of three or more dashes (the EPUB
+    loader's chapter separator); chunks never span such a cut. Within each piece,
+    sentences are packed greedily, and a sentence longer than ``max_chars`` is handed
+    to ``_hard_split``.
+    """
+    chunks = []
+    for block in re.split(r"\n\s*\n|\n?-{3,}\n?", text):
+        block = block.strip()
+        if not block:
+            continue
+        pending = ""
+        for sentence in _split_sentences(block):
+            if len(sentence) > max_chars:
+                # Flush what is buffered so the output order matches the text order.
+                if pending:
+                    chunks.append(pending)
+                    pending = ""
+                chunks.extend(_hard_split(sentence, max_chars))
+            elif pending and len(pending) + 1 + len(sentence) > max_chars:
+                chunks.append(pending)
+                pending = sentence
+            else:
+                pending = f"{pending} {sentence}".strip()
+        if pending:
+            chunks.append(pending)
+    return chunks
+
+
+# --------------------------------------------------------------------------- node
+class MisoTTSGenerate:
+    """Generate speech from text. Handles arbitrarily long text (audiobooks/EPUB chapters)
+    by sentence-aware chunking, and keeps a consistent voice across chunks by feeding prior
+    audio (and an optional reference clip) back as context — CSM models otherwise drift."""
+
+    @classmethod
+    def INPUT_TYPES(cls):
+        """Declare the node's required and optional inputs."""
+        return {
+            "required": {
+                "model": ("MISOTTS_MODEL", {"tooltip": "Loaded by the MisoTTS Model Loader node."}),
+                "text": ("STRING", {"multiline": True, "default": "",
+                                     "tooltip": "Text to synthesize. Long text is chunked automatically."}),
+            },
+            "optional": {
+                "ref_audio": ("AUDIO", {
+                    "tooltip": "Optional reference clip to clone the voice from. Anchored across every chunk.",
+                }),
+                "ref_text": ("STRING", {"default": "",
+                                        "tooltip": "Transcript of ref_audio. Improves cloning quality."}),
+                "speaker": ("INT", {"default": 0, "min": 0, "max": 31,
+                                    "tooltip": "Speaker id. Keep fixed for a single narrator."}),
+                "temperature": ("FLOAT", {"default": 0.9, "min": 0.1, "max": 2.0, "step": 0.05,
+                                          "tooltip": "Sampling temperature. Lower = steadier, higher = more varied."}),
+                "topk": ("INT", {"default": 50, "min": 1, "max": 500,
+                                 "tooltip": "Top-k sampling cutoff."}),
+                "max_chunk_seconds": ("FLOAT", {"default": 30.0, "min": 5.0, "max": 90.0, "step": 1.0,
+                                                "tooltip": "Max audio length generated per text chunk."}),
+                "chunk_chars": ("INT", {"default": 300, "min": 50, "max": 2000, "step": 10,
+                                        "tooltip": "Target characters per chunk. Larger = fewer joins, more VRAM/time."}),
+                "context_window": ("INT", {"default": 1, "min": 0, "max": 4,
+                                           "tooltip": (
+                                               "How many previous chunks to feed back as context to keep the voice "
+                                               "consistent. 1 is a good default; 0 makes each chunk independent "
+                                               "(voice may drift). Higher = steadier but slower / more VRAM.")}),
+                "silence_ms": ("INT", {"default": 250, "min": 0, "max": 2000, "step": 10,
+                                       "tooltip": "Silence inserted between chunks."}),
+                "seed": ("INT", {"default": 0, "min": 0, "max": 2**32 - 1,
+                                 "tooltip": "0 = random each run. Set a fixed value for reproducible narration."}),
+            },
+        }
+
+    RETURN_TYPES = ("AUDIO",)
+    RETURN_NAMES = ("audio",)
+    FUNCTION = "generate"
+    CATEGORY = "MisoTTS"
+
+    @classmethod
+    def IS_CHANGED(cls, seed=0, **kwargs):
+        """Tell ComfyUI when the node must be re-executed.
+
+        With ``seed == 0`` the output is meant to differ on every run, so return NaN
+        (which never compares equal to a previous value) to keep ComfyUI from reusing
+        a cached result. A fixed seed returns the seed itself, so caching still
+        applies to reproducible runs.
+        """
+        return float("nan") if seed == 0 else seed
+
+    def generate(self, model, text, ref_audio=None, ref_text="", speaker=0, temperature=0.9,
+                 topk=50, max_chunk_seconds=30.0, chunk_chars=300, context_window=1,
+                 silence_ms=250, seed=0):
+        """Synthesize ``text`` chunk by chunk and return one concatenated AUDIO dict.
+
+        Args:
+            model: Generator object exposing ``sample_rate`` and ``generate(...)``.
+            text: Text to synthesize; split with ``_chunk_text``.
+            ref_audio: Optional AUDIO dict; when given it is prepended to the context
+                of every chunk.
+            ref_text: Transcript of ``ref_audio``.
+            speaker: Speaker id used for the reference, the history and each chunk.
+            temperature: Sampling temperature passed to the model.
+            topk: Top-k cutoff passed to the model.
+            max_chunk_seconds: Per-chunk audio length cap, passed on in milliseconds.
+            chunk_chars: Maximum characters per text chunk.
+            context_window: Number of previously generated chunks fed back as context;
+                0 disables history.
+            silence_ms: Silence inserted between consecutive chunks.
+            seed: Non-zero values seed torch's global RNG; 0 leaves it untouched.
+
+        Returns:
+            A 1-tuple holding ``{"waveform": tensor of shape (1, 1, T), "sample_rate": int}``.
+
+        Raises:
+            ValueError: If ``text`` is empty or produces no chunks.
+        """
+        if seed != 0:
+            torch.manual_seed(seed)
+        text = (text or "").strip()
+        if not text:
+            raise ValueError("MisoTTS Generate: text is empty.")
+
+        chunks = _chunk_text(text, int(chunk_chars))
+        if not chunks:
+            raise ValueError("MisoTTS Generate: no text chunks produced.")
+
+        # Normalise numeric inputs once instead of at every use.
+        sr = int(model.sample_rate)
+        ms = float(max_chunk_seconds) * 1000.0
+        speaker = int(speaker)
+        window = int(context_window)
+
+        ref_seg = None
+        if ref_audio is not None:
+            ref_seg = Segment(speaker=speaker, text=(ref_text or "").strip(),
+                              audio=_audio_to_mono24k(ref_audio, sr))
+
+        gap = torch.zeros(int(sr * silence_ms / 1000.0)) if silence_ms > 0 else None
+        # Always retain at least one segment so the history slice below stays bounded.
+        keep = max(window, 1)
+
+        history, pieces = [], []
+        for i, chunk in enumerate(chunks):
+            # Context = optional reference clip followed by the most recent chunks.
+            ctx = []
+            if ref_seg is not None:
+                ctx.append(ref_seg)
+            if window > 0:
+                ctx.extend(history[-window:])
+
+            audio = model.generate(text=chunk, speaker=speaker, context=ctx,
+                                   max_audio_length_ms=ms, temperature=float(temperature), topk=int(topk))
+            audio = audio.detach().to("cpu", torch.float32)
+
+            if i > 0 and gap is not None:
+                pieces.append(gap)
+            pieces.append(audio)
+
+            history.append(Segment(speaker=speaker, text=chunk, audio=audio))
+            history = history[-keep:]
+
+        waveform = torch.cat(pieces).unsqueeze(0).unsqueeze(0)   # (1, 1, T)
+        return ({"waveform": waveform, "sample_rate": sr},)
@@ -0,0 +1,70 @@
+import os
+
+import torch
+
+# A failed engine import is recorded rather than raised here; the loader node checks
+# for it and raises an ImportError carrying the original error when it is executed.
+_import_error = None
+try:
+    from ..misotts import load_miso_8b
+except Exception as e:  # pragma: no cover - surfaced to the user at node runtime
+    load_miso_8b = None
+    _import_error = e
+
+# Cache directory: "<models_dir>/misotts" when ``folder_paths`` is importable,
+# otherwise "~/.cache/misotts".
+try:
+    import folder_paths
+    CACHE_DIR = os.path.join(folder_paths.models_dir, "misotts")
+except ImportError:
+    CACHE_DIR = os.path.join(os.path.expanduser("~"), ".cache", "misotts")
+
+# Maps the dtype names offered in the node UI to torch dtypes.
+DTYPE_MAP = {"bfloat16": torch.bfloat16, "float16": torch.float16, "float32": torch.float32}
+
+
+class MisoTTSModelLoader:
+    """ComfyUI node that loads the MisoTTS 8B model (modernized: no torchtune/moshi).
+
+    The 32 GB fp32 checkpoint is streamed straight to the GPU in the chosen dtype, so
+    loading needs ~18 GB VRAM (bf16) and almost no system RAM.
+    """
+
+    RETURN_TYPES = ("MISOTTS_MODEL",)
+    RETURN_NAMES = ("model",)
+    FUNCTION = "load_model"
+    CATEGORY = "MisoTTS"
+
+    @classmethod
+    def INPUT_TYPES(cls):
+        """Declare the device/dtype choices and the optional model and tokenizer sources."""
+        required = {
+            "device": (["cuda:0", "cuda:1", "cpu"], {"default": "cuda:0"}),
+            "dtype": (["bfloat16", "float16", "float32"], {"default": "bfloat16"}),
+        }
+        optional = {
+            "model_repo_or_path": ("STRING", {
+                "default": "MisoLabs/MisoTTS",
+                "tooltip": "HF repo id or a local path to a model.safetensors / model dir.",
+            }),
+            "tokenizer": ("STRING", {
+                "default": "unsloth/Llama-3.2-1B",
+                "tooltip": (
+                    "Llama-3.2 tokenizer source. Default is an ungated mirror byte-identical "
+                    "to meta-llama/Llama-3.2-1B. Change only if you know what you're doing."
+                ),
+            }),
+        }
+        return {"required": required, "optional": optional}
+
+    def load_model(self, device, dtype, model_repo_or_path="MisoLabs/MisoTTS", tokenizer="unsloth/Llama-3.2-1B"):
+        """Load the generator via ``load_miso_8b`` and return it as a 1-tuple.
+
+        Blank ``model_repo_or_path`` / ``tokenizer`` strings fall back to their defaults.
+        Raises ``ImportError`` when the MisoTTS engine could not be imported.
+        """
+        if load_miso_8b is None:
+            raise ImportError(
+                "MisoTTS engine failed to import. Ensure transformers, safetensors, tokenizers "
+                f"and torchaudio are installed.\nOriginal error: {_import_error}"
+            )
+
+        os.makedirs(CACHE_DIR, exist_ok=True)
+        # NOTE(review): huggingface_hub may resolve HF_HOME when it is first imported,
+        # in which case setting it here has no effect — confirm.
+        os.environ.setdefault("HF_HOME", CACHE_DIR)
+
+        source = model_repo_or_path.strip()
+        if not source:
+            source = "MisoLabs/MisoTTS"
+        tokenizer_name = tokenizer.strip()
+        if not tokenizer_name:
+            tokenizer_name = "unsloth/Llama-3.2-1B"
+
+        gen = load_miso_8b(
+            device=device,
+            model_path_or_repo_id=source,
+            dtype=DTYPE_MAP[dtype],
+            tokenizer_name=tokenizer_name,
+        )
+        return (gen,)