# ComfyUI-Omnivoice/nodes/epub_loader.py

import io
import posixpath
import re
import xml.etree.ElementTree as ET
import zipfile
from urllib.parse import unquote

from bs4 import BeautifulSoup

# Tag names treated as block-level when chapter markup is flattened to text:
# _extract_chapters appends a blank line ('\n\n') inside each of these so that
# paragraphs, list items, table rows, etc. do not run together.
_BLOCK_TAGS = {'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'div', 'br', 'tr'}


def _local(tag):
    """Strip XML namespace prefix, return local tag name."""
    return tag.split('}')[-1]


def _resolve_member(opf_dir, href, names):
    """Map a manifest ``href`` to the name of the ZIP member it refers to.

    Args:
        opf_dir: Directory of the OPF file inside the archive, either ``''``
            or a path ending in ``/``.
        href: The ``href`` attribute of a manifest ``<item>``.
        names: Collection of member names present in the archive.

    Returns:
        The member name to read. The literal ``opf_dir + href`` is preferred
        when it exists; otherwise the href is treated as a relative URL
        (fragment removed, percent-escapes decoded, ``.``/``..`` collapsed).
        If neither form is present the literal name is returned, so the
        caller's read fails on the name the manifest actually declared.
    """
    literal = opf_dir + href
    if literal in names:
        return literal
    # Manifest hrefs are URL references while ZIP member names are plain
    # strings, so "my%20chapter.xhtml#start" must become "my chapter.xhtml".
    decoded = unquote(href.split('#', 1)[0])
    candidate = posixpath.normpath(posixpath.join(opf_dir, decoded)).lstrip('/')
    return candidate if candidate in names else literal


def _chapter_from_xhtml(xhtml):
    """Convert one XHTML document (as ``str``) into a chapter dict.

    Returns:
        ``{"title": str | None, "text": str}``. The title is taken from
        ``<title>``, falling back to the first ``<h1>``, ``<h2>`` or ``<h3>``
        (checked in that order). ``<title>`` and every h1-h3 element are
        removed before the body text is produced.
    """
    soup = BeautifulSoup(xhtml, 'html.parser')
    for tag in soup(['script', 'style']):
        tag.decompose()

    # Title: <title> -> <h1/h2/h3> -> None
    title = None
    if soup.title and soup.title.string:
        title = soup.title.string.strip()
    if not title:
        for hn in ('h1', 'h2', 'h3'):
            heading = soup.find(hn)
            if heading:
                title = heading.get_text(strip=True)
                break

    # Remove title/heading elements so they don't appear in the body text.
    if soup.title:
        soup.title.decompose()
    for hn in ('h1', 'h2', 'h3'):
        for heading in soup.find_all(hn):
            heading.decompose()

    # Mark the end of every block-level element with a blank line; the
    # regexes below then normalise the resulting whitespace.
    for tag in soup.find_all(_BLOCK_TAGS):
        tag.append(soup.new_string('\n\n'))

    text = soup.get_text(separator='')
    text = re.sub(r'[^\S\n]+', ' ', text)   # collapse inline whitespace
    text = re.sub(r' *\n *', '\n', text)     # trim spaces around newlines
    text = re.sub(r'\n{3,}', '\n\n', text)  # max one blank line
    return {"title": title, "text": text.strip()}


def _extract_chapters(epub_path):
    """Parse an EPUB and return its chapters in spine (reading) order.

    Args:
        epub_path: Path to the ``.epub`` file (a ZIP archive).

    Returns:
        A list of ``{"title": str | None, "text": str}`` dicts, one per spine
        entry whose manifest item has a media type containing ``'xhtml'``.
        Spine entries without such a manifest item are skipped.

    Raises:
        ValueError: If ``META-INF/container.xml`` has no ``<rootfile>``.
        KeyError: If a required archive member or XML attribute is missing.
    """
    chapters = []
    with zipfile.ZipFile(epub_path, 'r') as zf:
        # 1. Find OPF path from container.xml
        container = ET.fromstring(zf.read('META-INF/container.xml'))
        rootfile = next(
            (el for el in container.iter() if _local(el.tag) == 'rootfile'),
            None,
        )
        if rootfile is None:
            # A bare next() would leak StopIteration, which is easy to
            # misread and is swallowed silently inside generators.
            raise ValueError(
                f"No <rootfile> element found in META-INF/container.xml of {epub_path!r}"
            )
        opf_path = rootfile.attrib['full-path']
        opf_dir = opf_path.rsplit('/', 1)[0] + '/' if '/' in opf_path else ''

        # 2. Parse OPF: build manifest (id -> href) and spine (ordered idrefs)
        opf = ET.fromstring(zf.read(opf_path))
        manifest = {
            el.attrib['id']: el.attrib['href']
            for el in opf.iter()
            if _local(el.tag) == 'item'
            and 'xhtml' in el.attrib.get('media-type', '')
        }
        spine = [
            el.attrib['idref']
            for el in opf.iter()
            if _local(el.tag) == 'itemref'
        ]

        # 3. Extract text from each chapter XHTML
        names = set(zf.namelist())
        for idref in spine:
            href = manifest.get(idref)
            if href is None:
                continue
            member = _resolve_member(opf_dir, href, names)
            xhtml = zf.read(member).decode('utf-8', errors='replace')
            chapters.append(_chapter_from_xhtml(xhtml))

    return chapters


class OmniVoiceEpubLoader:
    """Node that loads an EPUB and outputs the text of a range of chapters.

    The class follows the INPUT_TYPES / RETURN_TYPES / FUNCTION / CATEGORY
    layout (presumably the ComfyUI custom-node convention, going by the
    file's location). ``load_epub`` is the entry point named by ``FUNCTION``.
    """

    RETURN_TYPES = ("STRING", "STRING", "STRING")
    RETURN_NAMES = ("text", "chapter_title", "chapter_list")
    FUNCTION = "load_epub"
    CATEGORY = "OmniVoice"

    @classmethod
    def INPUT_TYPES(cls):
        """Describe the three required inputs: file path and chapter range."""
        path_input = ("STRING", {
            "default": "",
            "tooltip": "Absolute path to the .epub file to load.",
        })
        start_input = ("INT", {
            "default": 1, "min": 1, "max": 9999, "step": 1,
            "tooltip": "First chapter to include (1-indexed). Clamped to valid range automatically.",
        })
        end_input = ("INT", {
            "default": 1, "min": 1, "max": 9999, "step": 1,
            "tooltip": "Last chapter to include (1-indexed, inclusive). Clamped automatically. If less than chapter_start, set to chapter_start.",
        })
        return {
            "required": {
                "epub_path": path_input,
                "chapter_start": start_input,
                "chapter_end": end_input,
            },
        }

    def load_epub(self, epub_path, chapter_start, chapter_end):
        """Return ``(text, chapter_title, chapter_list)`` for the given range.

        Args:
            epub_path: Path of the EPUB file to read.
            chapter_start: 1-indexed first chapter; clamped to ``[1, count]``.
            chapter_end: 1-indexed last chapter (inclusive); clamped to
                ``[start, count]``.

        Returns:
            A 3-tuple of strings: the selected chapters' text joined by
            ``"\\n\\n---\\n\\n"``, the title of the first selected chapter,
            and a numbered listing of every chapter in the book. Untitled
            chapters are shown as ``"Chapter <n>"``. All three strings are
            empty when the book has no chapters.
        """
        book = _extract_chapters(epub_path)
        total = len(book)
        if total == 0:
            return ("", "", "")

        # Clamp the requested range into 1..total, with last >= first.
        first_no = min(chapter_start, total)
        if first_no < 1:
            first_no = 1
        last_no = min(chapter_end, total)
        if last_no < first_no:
            last_no = first_no

        # The listing always covers the whole book, not just the selection.
        listing = []
        for number, chapter in enumerate(book, 1):
            label = chapter["title"] or f"Chapter {number}"
            listing.append(f"{number}. {label}")
        chapter_list = "\n".join(listing)

        # Title of the first selected chapter (useful for file naming).
        chapter_title = book[first_no - 1]["title"] or f"Chapter {first_no}"

        # Body text of the selected chapters, separated by a rule.
        bodies = [chapter["text"] for chapter in book[first_no - 1:last_no]]
        text = "\n\n---\n\n".join(bodies)

        return (text, chapter_title, chapter_list)