e9c947b613
Publish to ComfyUI Registry / publish (push) Has been cancelled
The chapter title was appearing multiple times in the text (from <title>, <h1>, and body). Now <title> and <h1>/<h2>/<h3> are removed from the body text since the title is already available via the chapter_title output. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
129 lines
4.7 KiB
Python
129 lines
4.7 KiB
Python
import re
|
|
import zipfile
|
|
import io
|
|
import xml.etree.ElementTree as ET
|
|
from bs4 import BeautifulSoup
|
|
|
|
_BLOCK_TAGS = {'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'div', 'br', 'tr'}
|
|
|
|
|
|
def _local(tag):
|
|
"""Strip XML namespace prefix, return local tag name."""
|
|
return tag.split('}')[-1]
|
|
|
|
|
|
def _extract_chapters(epub_path):
|
|
"""Parse EPUB and return list of {"title": str|None, "text": str}."""
|
|
chapters = []
|
|
with zipfile.ZipFile(epub_path, 'r') as zf:
|
|
# 1. Find OPF path from container.xml
|
|
container = ET.fromstring(zf.read('META-INF/container.xml'))
|
|
rootfile = next(
|
|
el for el in container.iter()
|
|
if _local(el.tag) == 'rootfile'
|
|
)
|
|
opf_path = rootfile.attrib['full-path']
|
|
opf_dir = opf_path.rsplit('/', 1)[0] + '/' if '/' in opf_path else ''
|
|
|
|
# 2. Parse OPF: build manifest and spine
|
|
opf = ET.fromstring(zf.read(opf_path))
|
|
manifest = {
|
|
el.attrib['id']: el.attrib['href']
|
|
for el in opf.iter()
|
|
if _local(el.tag) == 'item'
|
|
and 'xhtml' in el.attrib.get('media-type', '')
|
|
}
|
|
spine = [
|
|
el.attrib['idref']
|
|
for el in opf.iter()
|
|
if _local(el.tag) == 'itemref'
|
|
]
|
|
|
|
# 3. Extract text from each chapter XHTML
|
|
for idref in spine:
|
|
href = manifest.get(idref)
|
|
if href is None:
|
|
continue
|
|
xhtml = zf.read(opf_dir + href).decode('utf-8', errors='replace')
|
|
soup = BeautifulSoup(xhtml, 'html.parser')
|
|
for tag in soup(['script', 'style']):
|
|
tag.decompose()
|
|
# Title: <title> → <h1/h2/h3> → None
|
|
title = None
|
|
if soup.title and soup.title.string:
|
|
title = soup.title.string.strip()
|
|
if not title:
|
|
for hn in ['h1', 'h2', 'h3']:
|
|
tag = soup.find(hn)
|
|
if tag:
|
|
title = tag.get_text(strip=True)
|
|
break
|
|
# Remove title/heading elements so they don't appear in the body text
|
|
if soup.title:
|
|
soup.title.decompose()
|
|
for hn in ['h1', 'h2', 'h3']:
|
|
for tag in soup.find_all(hn):
|
|
tag.decompose()
|
|
for tag in soup.find_all(_BLOCK_TAGS):
|
|
tag.append(soup.new_string('\n\n'))
|
|
text = soup.get_text(separator='')
|
|
text = re.sub(r'[^\S\n]+', ' ', text) # collapse inline whitespace
|
|
text = re.sub(r' *\n *', '\n', text) # trim spaces around newlines
|
|
text = re.sub(r'\n{3,}', '\n\n', text) # max one blank line
|
|
text = text.strip()
|
|
chapters.append({"title": title, "text": text})
|
|
|
|
return chapters
|
|
|
|
|
|
class OmniVoiceEpubLoader:
|
|
@classmethod
|
|
def INPUT_TYPES(cls):
|
|
return {
|
|
"required": {
|
|
"epub_path": ("STRING", {
|
|
"default": "",
|
|
"tooltip": "Absolute path to the .epub file to load.",
|
|
}),
|
|
"chapter_start": ("INT", {
|
|
"default": 1, "min": 1, "max": 9999, "step": 1,
|
|
"tooltip": "First chapter to include (1-indexed). Clamped to valid range automatically.",
|
|
}),
|
|
"chapter_end": ("INT", {
|
|
"default": 1, "min": 1, "max": 9999, "step": 1,
|
|
"tooltip": "Last chapter to include (1-indexed, inclusive). Clamped automatically. If less than chapter_start, set to chapter_start.",
|
|
}),
|
|
},
|
|
}
|
|
|
|
RETURN_TYPES = ("STRING", "STRING", "STRING")
|
|
RETURN_NAMES = ("text", "chapter_title", "chapter_list")
|
|
FUNCTION = "load_epub"
|
|
CATEGORY = "OmniVoice"
|
|
|
|
def load_epub(self, epub_path, chapter_start, chapter_end):
|
|
chapters = _extract_chapters(epub_path)
|
|
n = len(chapters)
|
|
|
|
if n == 0:
|
|
return ("", "", "")
|
|
|
|
start = max(1, min(chapter_start, n))
|
|
end = max(start, min(chapter_end, n))
|
|
|
|
# chapter_list: all chapters regardless of selection
|
|
chapter_list = "\n".join(
|
|
f"{i}. {ch['title'] if ch['title'] else f'Chapter {i}'}"
|
|
for i, ch in enumerate(chapters, 1)
|
|
)
|
|
|
|
# chapter_title: title of the first selected chapter (useful for file naming)
|
|
first = chapters[start - 1]
|
|
chapter_title = first["title"] if first["title"] else f"Chapter {start}"
|
|
|
|
# text: selected range joined by delimiter
|
|
selected = chapters[start - 1 : end]
|
|
text = "\n\n---\n\n".join(ch["text"] for ch in selected)
|
|
|
|
return (text, chapter_title, chapter_list)
|