# ComfyUI-MisoTTS/nodes/epub_loader.py

import posixpath
import re
import xml.etree.ElementTree as ET
import zipfile
from urllib.parse import unquote, urldefrag

from bs4 import BeautifulSoup

# Elements treated as block-level: `_extract_chapters` appends a blank line
# after each of them so paragraph boundaries survive text extraction.
_BLOCK_TAGS = {
    "p", "div", "li", "br", "tr",
    "h1", "h2", "h3", "h4", "h5", "h6",
}


def _local(tag):
    """Return *tag* without a leading ``{namespace}`` qualifier.

    Everything up to and including the last ``}`` is dropped; a tag that
    contains no ``}`` is returned unchanged.
    """
    _, _, name = tag.rpartition("}")
    return name


def _resolve_href(opf_dir, href):
    """Translate a manifest ``href`` into the archive member name it points to."""
    # Manifest hrefs are URL references relative to the OPF file, so drop any
    # "#fragment", undo percent-encoding and collapse "." / ".." segments.
    relative = unquote(urldefrag(href).url)
    return posixpath.normpath(posixpath.join(opf_dir, relative))


def _extract_chapters(epub_path):
    """Read an EPUB and return its spine documents as plain-text chapters.

    The OPF package file is located through ``META-INF/container.xml``; every
    spine ``itemref`` that resolves to an XHTML manifest item becomes one
    chapter, in spine order.

    Args:
        epub_path: Path of the ``.epub`` archive, passed to ``zipfile.ZipFile``.

    Returns:
        A list of ``{"title": str | None, "text": str}`` dicts. ``title`` is
        taken from ``<title>`` or, failing that, the first ``h1``/``h2``/``h3``
        (tried in that order); it is ``None`` when neither exists. ``text`` is
        the document text with the title and all ``h1``-``h3`` headings removed
        and whitespace normalised.

    Raises:
        ValueError: ``container.xml`` contains no ``rootfile`` element.
        KeyError: A required archive member (``container.xml``, the OPF file or
            a spine document) or a required XML attribute is missing.
    """
    chapters = []
    with zipfile.ZipFile(epub_path, "r") as zf:
        members = set(zf.namelist())

        container = ET.fromstring(zf.read("META-INF/container.xml"))
        rootfile = next(
            (el for el in container.iter() if _local(el.tag) == "rootfile"),
            None,
        )
        if rootfile is None:
            raise ValueError(f"{epub_path!r}: META-INF/container.xml declares no rootfile")
        opf_path = rootfile.attrib["full-path"]
        # Directory of the OPF file (with trailing slash), or "" at archive root.
        opf_dir = opf_path.rsplit("/", 1)[0] + "/" if "/" in opf_path else ""

        opf = ET.fromstring(zf.read(opf_path))
        manifest = {
            el.attrib["id"]: el.attrib["href"]
            for el in opf.iter()
            if _local(el.tag) == "item" and "xhtml" in el.attrib.get("media-type", "")
        }
        spine = [el.attrib["idref"] for el in opf.iter() if _local(el.tag) == "itemref"]

        for idref in spine:
            href = manifest.get(idref)
            if href is None:
                # Spine entry that is not an XHTML manifest item.
                continue

            member = _resolve_href(opf_dir, href)
            if member not in members:
                # Fall back to the literal concatenation so archives whose
                # member names match the raw href keep working; a genuinely
                # missing file still raises KeyError below.
                member = opf_dir + href
            # "utf-8-sig" drops a leading byte-order mark, which would
            # otherwise end up as a stray U+FEFF at the start of the text.
            xhtml = zf.read(member).decode("utf-8-sig", errors="replace")

            soup = BeautifulSoup(xhtml, "html.parser")
            for tag in soup(["script", "style"]):
                tag.decompose()

            title = None
            if soup.title and soup.title.string:
                title = soup.title.string.strip()
            if not title:
                for hn in ["h1", "h2", "h3"]:
                    tag = soup.find(hn)
                    if tag:
                        title = tag.get_text(strip=True)
                        break

            # The title is returned separately, so strip it (and every
            # h1-h3 heading) from the body text.
            if soup.title:
                soup.title.decompose()
            for hn in ["h1", "h2", "h3"]:
                for tag in soup.find_all(hn):
                    tag.decompose()

            # Mark the end of each block-level element with a blank line so
            # paragraph boundaries survive get_text().
            for tag in soup.find_all(_BLOCK_TAGS):
                tag.append(soup.new_string("\n\n"))
            text = soup.get_text(separator="")
            text = re.sub(r"[^\S\n]+", " ", text)   # collapse non-newline whitespace
            text = re.sub(r" *\n *", "\n", text)     # trim spaces around newlines
            text = re.sub(r"\n{3,}", "\n\n", text)   # at most one blank line in a row
            chapters.append({"title": title, "text": text.strip()})
    return chapters


class MisoTTSEpubLoader:
    """Node that reads an EPUB file and outputs the text of a range of chapters.

    Outputs, in order: the joined chapter text, the title of the first
    selected chapter, and a numbered listing of every chapter in the book.
    """

    CATEGORY = "MisoTTS"
    FUNCTION = "load_epub"
    RETURN_NAMES = ("text", "chapter_title", "chapter_list")
    RETURN_TYPES = ("STRING", "STRING", "STRING")

    @classmethod
    def INPUT_TYPES(cls):
        """Describe the node inputs: the EPUB path and a 1-indexed chapter range."""
        path_input = (
            "STRING",
            {"default": "", "tooltip": "Absolute path to the .epub file."},
        )
        start_input = (
            "INT",
            {
                "default": 1,
                "min": 1,
                "max": 9999,
                "step": 1,
                "tooltip": "First chapter (1-indexed). Clamped to valid range.",
            },
        )
        end_input = (
            "INT",
            {
                "default": 1,
                "min": 1,
                "max": 9999,
                "step": 1,
                "tooltip": "Last chapter (1-indexed, inclusive). Clamped automatically.",
            },
        )
        required = {
            "epub_path": path_input,
            "chapter_start": start_input,
            "chapter_end": end_input,
        }
        return {"required": required}

    def load_epub(self, epub_path, chapter_start, chapter_end):
        """Return ``(text, chapter_title, chapter_list)`` for the requested range.

        ``chapter_start`` is clamped into ``1..N`` and ``chapter_end`` into
        ``start..N``, where ``N`` is the number of chapters found. A book with
        no chapters yields three empty strings.
        """
        book = _extract_chapters(epub_path)
        if not book:
            return ("", "", "")

        total = len(book)
        first_no = max(1, min(chapter_start, total))
        last_no = max(first_no, min(chapter_end, total))

        # Untitled chapters are listed under a generated "Chapter <n>" label.
        listing = []
        for number, chapter in enumerate(book, 1):
            label = chapter["title"] or f"Chapter {number}"
            listing.append(f"{number}. {label}")
        chapter_list = "\n".join(listing)

        chapter_title = book[first_no - 1]["title"] or f"Chapter {first_no}"

        selected = book[first_no - 1: last_no]
        text = "\n\n---\n\n".join([chapter["text"] for chapter in selected])
        return (text, chapter_title, chapter_list)