From e9c947b6131523a3b291c1cd42998e1ac0263aeb Mon Sep 17 00:00:00 2001 From: Ethanfel Date: Fri, 10 Apr 2026 00:07:00 +0200 Subject: [PATCH] fix: strip title and heading tags from EPUB text output The chapter title was appearing multiple times in the text (from <title>, <h1>, and body). Now <title> and <h1>/<h2>/<h3> are removed from the body text since the title is already available via the chapter_title output. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --- nodes/epub_loader.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/nodes/epub_loader.py b/nodes/epub_loader.py index 29d4571..986e159 100644 --- a/nodes/epub_loader.py +++ b/nodes/epub_loader.py @@ -58,6 +58,12 @@ def _extract_chapters(epub_path): if tag: title = tag.get_text(strip=True) break + # Remove title/heading elements so they don't appear in the body text + if soup.title: + soup.title.decompose() + for hn in ['h1', 'h2', 'h3']: + for tag in soup.find_all(hn): + tag.decompose() for tag in soup.find_all(_BLOCK_TAGS): tag.append(soup.new_string('\n\n')) text = soup.get_text(separator='')