fix: strip title and heading tags from EPUB text output
Publish to ComfyUI Registry / publish (push) Has been cancelled

The chapter title was appearing multiple times in the extracted text (from
<title>, <h1>, and body). The <title> element and all <h1>/<h2>/<h3> headings
are now removed from the parsed document before the body text is extracted,
since the title is already available via the chapter_title output.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-10 00:07:00 +02:00
parent 197bcc554e
commit e9c947b613
+6
View File
@@ -58,6 +58,12 @@ def _extract_chapters(epub_path):
if tag: if tag:
title = tag.get_text(strip=True) title = tag.get_text(strip=True)
break break
# Remove title/heading elements so they don't appear in the body text
if soup.title:
soup.title.decompose()
for hn in ['h1', 'h2', 'h3']:
for tag in soup.find_all(hn):
tag.decompose()
for tag in soup.find_all(_BLOCK_TAGS): for tag in soup.find_all(_BLOCK_TAGS):
tag.append(soup.new_string('\n\n')) tag.append(soup.new_string('\n\n'))
text = soup.get_text(separator='') text = soup.get_text(separator='')