Source code for vivre.parser

"""
EPUB Parser module for the vivre library.

This module provides functionality to load, validate, and parse EPUB files,
extracting chapter content while filtering out non-story elements like
acknowledgements, covers, table of contents, etc.

The VivreParser class implements a robust EPUB parsing system that follows
EPUB standards to extract story content while intelligently filtering out
front matter, back matter, and other non-story elements.

Example:
    >>> from vivre.parser import VivreParser
    >>> parser = VivreParser()
    >>> chapters = parser.parse_epub("book.epub")
    >>> for chapter in chapters:
    ...     print(f"Chapter: {chapter.title}")
    ...     print(f"Content: {chapter.content[:100]}...")
"""

import os
import re
import zipfile
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

from bs4 import BeautifulSoup, Tag
from defusedxml import ElementTree as ET

# XML namespaces for EPUB parsing
NAMESPACES = {
    "dc": "http://purl.org/dc/elements/1.1/",
    "opf": "http://www.idpf.org/2007/opf",
    "container": "urn:oasis:names:tc:opendocument:xmlns:container",
}


@dataclass
class Chapter:
    """
    One story chapter produced by the EPUB parser.

    Attributes:
        title: Chapter title (from the table of contents when available,
            otherwise taken from the chapter document itself).
        content: Cleaned text of the chapter.
        href: Manifest href of the chapter file inside the EPUB.
        order: Zero-based position of the chapter's item in the spine.
        char_count: Number of characters in ``content``.
        word_count: Number of whitespace-separated words in ``content``.
    """

    title: str
    content: str
    href: str
    order: int
    char_count: int
    word_count: int


class VivreParser:
    """
    A robust parser for EPUB files that extracts story content while filtering
    non-story elements.

    This parser follows EPUB standards to extract chapter titles and content
    from EPUB files, filtering out front matter, back matter, and other
    non-story content.

    The parser implements a multi-stage approach:
    1. EPUB validation and structure analysis
    2. Table of contents parsing for chapter titles
    3. Content extraction with filtering
    4. Text cleaning and normalization

    Both table of contents formats are handled: NCX (EPUB2) and HTML
    navigation documents (EPUB3).

    The result of ``parse_epub()`` depends only on the file passed to that
    call, so one instance can be reused for many files. The only state kept
    on the instance is a record of the most recently validated file.

    Attributes:
        file_path: Path of the most recently validated EPUB file, if any.
        _is_loaded: Whether an EPUB file has been successfully validated.

    Example:
        >>> parser = VivreParser()
        >>> chapters1 = parser.parse_epub("book1.epub")
        >>> chapters2 = parser.parse_epub("book2.epub")
        >>> print(f"Found {len(chapters1)} chapters in book1")
        >>> print(f"Found {len(chapters2)} chapters in book2")
    """

    # Multilingual non-story content keywords, keyed by language code.
    # Matched against chapter titles as whole words/phrases.
    NON_STORY_KEYWORDS = {
        "en": [
            "cover",
            "title",
            "titlepage",
            "front cover",
            "back cover",
            "acknowledgement",
            "acknowledgments",
            "acknowledgements",
            "table of contents",
            "contents",
            "toc",
            "copyright",
            "legal",
            "disclaimer",
            "about the author",
            "author bio",
            "biography",
            "translator",
            "translation",
            "translator's note",
            "preface",
            "foreword",
            "introduction",
            "afterword",
            "appendix",
            "index",
            "bibliography",
            "references",
            "citations",
            "notes",
            "glossary",
            "credits",
            "dedication",
            "colophon",
        ],
        "es": [
            "cubierta",
            "título",
            "página de título",
            "cubierta frontal",
            "cubierta trasera",
            "agradecimientos",
            "reconocimientos",
            "tabla de contenidos",
            "contenidos",
            "índice",
            "derechos de autor",
            "copyright",
            "legal",
            "descargo de responsabilidad",
            "sobre el autor",
            "biografía del autor",
            "biografía",
            "traductor",
            "traducción",
            "nota del traductor",
            "prefacio",
            "introducción",
            "apéndice",
            "bibliografía",
            "referencias",
            "citas",
            "notas",
            "glosario",
            "créditos",
            "dedicatoria",
            "colofón",
        ],
        "fr": [
            "couverture",
            "titre",
            "page de titre",
            "couverture avant",
            "couverture arrière",
            "remerciements",
            "table des matières",
            "sommaire",
            "index",
            "copyright",
            "droits d'auteur",
            "légal",
            "avertissement",
            "à propos de l'auteur",
            "biographie de l'auteur",
            "biographie",
            "traducteur",
            "traduction",
            "note du traducteur",
            "préface",
            "avant-propos",
            "introduction",
            "appendice",
            "bibliographie",
            "références",
            "citations",
            "notes",
            "glossaire",
            "crédits",
            "dédicace",
            "colophon",
        ],
        "de": [
            "umschlag",
            "titel",
            "titelseite",
            "vorderer umschlag",
            "hinterer umschlag",
            "danksagung",
            "danksagungen",
            "inhaltsverzeichnis",
            "inhalt",
            "index",
            "urheberrecht",
            "copyright",
            "rechtlich",
            "haftungsausschluss",
            "über den autor",
            "autorenbiografie",
            "biografie",
            "übersetzer",
            "übersetzung",
            "übersetzernotiz",
            "vorwort",
            "einleitung",
            "anhang",
            "bibliografie",
            "referenzen",
            "zitate",
            "notizen",
            "glossar",
            "credits",
            "widmung",
            "kolophon",
        ],
        "it": [
            "copertina",
            "titolo",
            "frontespizio",
            "copertina anteriore",
            "copertina posteriore",
            "ringraziamenti",
            "indice",
            "contenuti",
            "copyright",
            "diritti d'autore",
            "legale",
            "disclaimer",
            "sull'autore",
            "biografia dell'autore",
            "biografia",
            "traduttore",
            "traduzione",
            "nota del traduttore",
            "prefazione",
            "introduzione",
            "appendice",
            "bibliografia",
            "riferimenti",
            "citazioni",
            "note",
            "glossario",
            "crediti",
            "dedica",
            "colophon",
        ],
    }

    def __init__(self) -> None:
        """
        Initialize the VivreParser.

        Creates a new parser instance with no file loaded. The instance can
        be reused for multiple files.
        """
        self.file_path: Optional[Path] = None
        self._is_loaded: bool = False

    def load_epub(self, file_path: Union[str, Path]) -> bool:
        """
        Load and validate an EPUB file from the given path.

        The validation steps are, in order:
        1. The path is a non-empty ``str``/``Path`` without NUL, newline,
           carriage-return or tab characters.
        2. The path exists, is a regular file and is readable.
        3. The file is at least 4 bytes long and starts with the ZIP magic
           number (``PK\\x03\\x04``).
        4. The file opens as a ZIP archive.
        5. The archive contains ``META-INF/container.xml``.

        On success the path is recorded in ``self.file_path`` and
        ``self._is_loaded`` is set to True. On failure the instance
        attributes are left untouched.

        Args:
            file_path: Path to the EPUB file to load. Can be a string or
                Path object.

        Returns:
            True if the file was successfully loaded and validated.

        Raises:
            FileNotFoundError: If the EPUB file doesn't exist.
            ValueError: If the file path is invalid, the file is not
                readable, or the file is not a valid EPUB (empty, corrupted,
                wrong format). When the failure was caused by a lower-level
                error, that error is attached as ``__cause__``.

        Example:
            >>> parser = VivreParser()
            >>> success = parser.load_epub("book.epub")
            >>> if success:
            ...     print("EPUB loaded successfully")
        """
        # Validate input path
        if file_path is None:
            raise ValueError("File path cannot be None")

        if not isinstance(file_path, (str, Path)):
            raise ValueError(
                f"File path must be a string or Path object, "
                f"not {type(file_path).__name__}"
            )

        # Validation is done on the stripped textual form of the path
        path_str = str(file_path).strip()
        if not path_str:
            raise ValueError("File path cannot be empty")

        invalid_chars = ("\x00", "\n", "\r", "\t")
        if any(char in path_str for char in invalid_chars):
            raise ValueError("File path contains invalid characters")

        file_path = Path(file_path)

        if not file_path.exists():
            raise FileNotFoundError(f"EPUB file not found: {file_path}")

        if not file_path.is_file():
            raise ValueError(f"Path is not a file: {file_path}")

        if not os.access(file_path, os.R_OK):
            raise ValueError(f"EPUB file is not readable: {file_path}")

        # Basic EPUB validation - EPUBs are ZIP archives, so inspect the
        # file size and the first four bytes before handing it to zipfile.
        try:
            with open(file_path, "rb") as f:
                f.seek(0, 2)  # Seek to end to measure the file
                file_size = f.tell()
                f.seek(0)
                magic = f.read(4)
        except OSError as e:
            raise ValueError(f"Error reading EPUB file: {e}") from e

        if file_size == 0:
            raise ValueError(f"File is not a valid EPUB (empty file): {file_path}")

        if file_size < 4:
            raise ValueError(
                f"File is not a valid EPUB (file too small): {file_path}"
            )

        if magic != b"PK\x03\x04":
            raise ValueError(
                f"File is not a valid EPUB (not a ZIP archive): {file_path}"
            )

        # Open as ZIP to validate the structure and look for the one file
        # every EPUB must contain.
        try:
            with zipfile.ZipFile(file_path, "r") as test_zip:
                has_container = "META-INF/container.xml" in test_zip.namelist()
        except zipfile.BadZipFile as e:
            raise ValueError(
                f"File is not a valid EPUB "
                f"(corrupted ZIP structure): {file_path}"
            ) from e
        except Exception as e:
            # Deliberately broad: callers rely on every failure of this
            # method (other than a missing file) surfacing as ValueError.
            raise ValueError(f"Error reading EPUB file: {e}") from e

        if not has_container:
            raise ValueError(
                f"File is not a valid EPUB "
                f"(missing container.xml): {file_path}"
            )

        # If we get here, the file is valid
        self.file_path = file_path
        self._is_loaded = True
        return True

    def is_loaded(self) -> bool:
        """
        Check if an EPUB file is currently loaded.

        Returns:
            True if an EPUB file is loaded, False otherwise.
        """
        return self._is_loaded

    def parse_epub(self, file_path: Union[str, Path]) -> List[Chapter]:
        """
        Parse an EPUB file and extract chapter titles and text content.

        Steps performed:
        1. Validate the file with ``load_epub``
        2. Read container.xml to locate content.opf
        3. Parse content.opf to get metadata, manifest and spine
        4. Extract chapter titles from the table of contents
        5. Walk the spine in reading order, skipping non-story items
        6. Extract the text of each remaining chapter

        A chapter file that cannot be read or parsed is skipped and a warning
        is printed; it does not abort the whole parse.

        Args:
            file_path: Path to the EPUB file to parse. Can be a string or
                Path object.

        Returns:
            List of Chapter objects in spine order. Only story chapters are
            included, with non-story content filtered out.

        Raises:
            FileNotFoundError: If the EPUB file doesn't exist.
            ValueError: If the file path is invalid, file is not a valid
                EPUB, or the EPUB structure cannot be parsed.
        """
        # Use load_epub for validation (DRY principle)
        if not self.load_epub(file_path):
            raise ValueError(f"Failed to load EPUB file: {file_path}")

        chapters: List[Chapter] = []

        try:
            with zipfile.ZipFile(file_path, "r") as epub_zip:
                # Step 1: Find the container.xml to locate the content.opf
                container_xml = epub_zip.read("META-INF/container.xml")
                container_root = ET.fromstring(container_xml)

                selector = (
                    './/container:rootfile[@media-type="application/oebps-package+xml"]'
                )
                rootfile_elem = container_root.find(selector, NAMESPACES)
                if rootfile_elem is None:
                    raise ValueError("Could not find content.opf in container.xml")

                content_opf_path = rootfile_elem.get("full-path")
                if not content_opf_path:
                    raise ValueError("No full-path attribute found in rootfile")

                # Step 2: Parse the content.opf to get the spine (reading order)
                content_opf = epub_zip.read(content_opf_path)
                content_root = ET.fromstring(content_opf)

                # Book language selects the keyword list used for filtering
                book_metadata = self._extract_metadata(content_opf)
                book_language = book_metadata.get("language", "en")

                # Base directory that manifest hrefs are relative to
                content_dir = Path(content_opf_path).parent

                # Step 3: Extract chapter titles from table of contents
                chapter_titles = self._extract_chapter_titles(
                    epub_zip, content_dir, content_opf
                )

                spine_elem = content_root.find(".//opf:spine", NAMESPACES)
                if spine_elem is None:
                    raise ValueError("Could not find spine in content.opf")

                itemrefs = spine_elem.findall(".//opf:itemref", NAMESPACES)
                if not itemrefs:
                    raise ValueError("No itemref elements found in spine")

                # Index the manifest once. This avoids re-scanning it for
                # every spine item and avoids interpolating idref values
                # into an XPath expression (a quote in an idref would
                # otherwise abort the whole parse). The first item wins for
                # duplicate ids, matching find() semantics.
                manifest_items = {}
                manifest_elem = content_root.find(".//opf:manifest", NAMESPACES)
                if manifest_elem is not None:
                    for item in manifest_elem.findall(".//opf:item", NAMESPACES):
                        item_id = item.get("id")
                        if item_id:
                            manifest_items.setdefault(item_id, item)

                # Step 4: Extract chapter content for each item in the spine
                for i, itemref in enumerate(itemrefs):
                    idref = itemref.get("idref")
                    if not idref:
                        continue

                    item_elem = manifest_items.get(idref)
                    if item_elem is None:
                        continue

                    href = item_elem.get("href")
                    if not href:
                        continue

                    # Skip non-story content based on href pattern
                    if self._is_non_story_content("", href, book_language):
                        continue

                    # ZIP member names always use forward slashes, so build
                    # the name in POSIX form regardless of the host OS.
                    chapter_path = (content_dir / href).as_posix()

                    try:
                        chapter_content = epub_zip.read(chapter_path)
                        chapter_title, chapter_text = self._extract_chapter_content(
                            chapter_content
                        )

                        # Use title from table of contents if available
                        if href in chapter_titles:
                            chapter_title = chapter_titles[href]

                        # Skip titles that are implausibly short or long
                        if (
                            len(chapter_title.strip()) < 3
                            or len(chapter_title.split()) > 15
                        ):
                            continue

                        # Skip if text is too short (likely just a title page)
                        if len(chapter_text.strip()) < 100:
                            continue

                        # Skip back matter (files with 'bm' in the name)
                        if "bm" in href.lower():
                            continue

                        chapters.append(
                            Chapter(
                                title=chapter_title,
                                content=chapter_text,
                                href=href,
                                order=i,
                                char_count=len(chapter_text),
                                word_count=len(chapter_text.split()),
                            )
                        )
                    except Exception as e:
                        # Best effort: skip chapters that can't be parsed
                        warning_msg = f"Warning: Could not parse chapter {href}: {e}"
                        print(warning_msg)
                        continue

        except zipfile.BadZipFile as e:
            raise ValueError(f"File is not a valid ZIP archive: {file_path}") from e
        except ET.ParseError as e:
            raise ValueError(f"Error parsing EPUB XML: {e}") from e
        except Exception as e:
            raise ValueError(f"Error reading EPUB file: {e}") from e

        return chapters

    def _extract_chapter_content(self, chapter_content: bytes) -> Tuple[str, str]:
        """
        Extract chapter title and text from HTML/XML content using
        BeautifulSoup.

        The bytes are decoded as UTF-8 (undecodable bytes are dropped) and
        parsed with the "lxml-xml" parser. If no title is found in the
        markup, a leading "1. Title"-style prefix of the text is used as the
        title. A found title is then stripped from the start of the text.

        Args:
            chapter_content: Raw bytes of the chapter file.

        Returns:
            Tuple of (chapter_title, chapter_text).
        """
        content_str = chapter_content.decode("utf-8", errors="ignore")

        # Use XML parser for EPUB content to avoid warnings
        soup = BeautifulSoup(content_str, "lxml-xml")

        title = self._extract_title(soup)
        text = self._extract_text(soup)

        # If we got "Untitled Chapter" but the text starts with what looks
        # like a numbered title ("1. Title"), use that; stop at the first
        # sentence boundary or capitalised word that follows.
        if title == "Untitled Chapter" and text.strip():
            pattern = r"^(\d+\.?\s+[^.!?]+?)(?=\s+[A-Z]|$)"
            title_match = re.match(pattern, text.strip())
            if title_match:
                title = title_match.group(1).strip()

        # Explicitly remove the title from the text if it appears at the
        # beginning
        if title and title != "Untitled Chapter":
            text = self._remove_title_from_text(text, title)

        return title, text

    def _extract_title(self, soup: BeautifulSoup) -> str:
        """
        Extract title from BeautifulSoup object using multiple strategies.

        Selectors are tried from most to least specific; the first
        non-empty match that is not judged generic is returned. If none
        qualifies, every h1/h2/h3 is scanned in turn.

        Args:
            soup: The BeautifulSoup object to search for titles.

        Returns:
            The extracted title, or "Untitled Chapter" if none found.
        """
        # Try different possible title locations in order of preference
        title_selectors = [
            "h1.chapter",  # Specific chapter headings
            "h1[id*='chapter']",  # Chapter headings with chapter in ID
            "h1",  # Any h1
            "h2.chapter",  # Chapter h2 headings
            "h2",  # Any h2
            "h3.chapter",  # Chapter h3 headings
            "h3",  # Any h3
            "title",  # Title tag
            "head title",  # Head title
        ]

        for selector in title_selectors:
            title_elem = soup.select_one(selector)
            if title_elem and title_elem.get_text().strip():
                title_text = title_elem.get_text().strip()
                # Skip generic titles that are likely not chapter titles
                if title_text and not self._is_generic_title(title_text, soup):
                    return title_text

        # select_one only sees the first match per selector; fall back to
        # scanning every heading for a usable title.
        for tag in ["h1", "h2", "h3"]:
            for elem in soup.find_all(tag):
                if elem.get_text().strip():
                    title_text = elem.get_text().strip()
                    if not self._is_generic_title(title_text, soup):
                        return title_text

        return "Untitled Chapter"

    def _is_generic_title(self, title: str, soup: BeautifulSoup) -> bool:
        """
        Check if a title is generic and likely not a chapter title.

        A title is considered generic when it is shorter than 3 characters,
        longer than 15 words, equal to the document's <title> while the
        document has more than one h1/h2/h3 heading, or when its first word
        occurs in it more than once.

        Args:
            title: The title to check.
            soup: BeautifulSoup object of the chapter content.

        Returns:
            True if the title is generic, False otherwise.
        """
        title_lower = title.lower()

        # Too short to be a chapter title
        if len(title.strip()) < 3:
            return True

        # Excessively long (likely subtitle or publisher info)
        if len(title.split()) > 15:
            return True

        # Matches the HTML document title (allowed if it's the only title)
        if soup:
            head_title = soup.find("title")
            if head_title and head_title.get_text().strip():
                doc_title = head_title.get_text().strip().lower()
                if title_lower == doc_title:
                    # Only generic if other candidate headings exist
                    other_titles = soup.find_all(["h1", "h2", "h3"])
                    if len(other_titles) > 1:
                        return True

        # First word repeated within the title
        words = title_lower.split()
        if len(words) > 1 and words.count(words[0]) > 1:
            return True

        return False

    def _parse_ncx_titles(self, ncx_content: bytes) -> Dict[str, str]:
        """
        Parse an NCX document into a mapping of href to navigation label.

        Args:
            ncx_content: Raw bytes of the NCX file.

        Returns:
            Dictionary mapping each navPoint's target (fragment removed) to
            its label text. A later navPoint for the same target overrides
            an earlier one.
        """
        titles: Dict[str, str] = {}
        ncx_root = ET.fromstring(ncx_content)

        for nav_point in ncx_root.findall(".//{*}navPoint"):
            title_elem = nav_point.find(".//{*}text")
            if title_elem is None or not title_elem.text:
                continue  # no label, nothing to record for this entry

            content_elem = nav_point.find(".//{*}content")
            if content_elem is None:
                continue

            src = content_elem.get("src")
            if src:
                # Extract the filename from src (remove anchor)
                titles[src.split("#")[0]] = title_elem.text.strip()

        return titles

    def _extract_chapter_titles(
        self, epub_zip: zipfile.ZipFile, content_dir: Path, content_opf: bytes
    ) -> Dict[str, str]:
        """
        Extract chapter titles from the table of contents using EPUB
        standards.

        The navigation document is located via the package document:
        - EPUB3: HTML navigation document with properties="nav"
        - EPUB2: NCX file referenced in spine toc attribute

        If that yields nothing, a few common NCX locations are tried.
        Failures are reported with a printed warning and never raised.

        Args:
            epub_zip: The EPUB zip file.
            content_dir: Base directory for content files.
            content_opf: Raw bytes of the content.opf file.

        Returns:
            Dictionary mapping href to chapter title.
        """
        chapter_titles: Dict[str, str] = {}

        # Find navigation document using EPUB standards
        nav_path = self._find_navigation_document(content_opf, epub_zip, content_dir)

        if nav_path:
            try:
                nav_content = epub_zip.read(nav_path)

                # Determine if it's HTML (EPUB3) or NCX (EPUB2)
                if nav_path.lower().endswith(".ncx"):
                    chapter_titles.update(self._parse_ncx_titles(nav_content))
                else:
                    # Parse HTML navigation document for chapter links
                    soup = BeautifulSoup(nav_content, "lxml-xml")
                    for link in soup.find_all("a"):
                        if not isinstance(link, Tag):
                            continue
                        href_attr = link.get("href")
                        if (
                            href_attr
                            and isinstance(href_attr, str)
                            and link.get_text().strip()
                        ):
                            title = link.get_text().strip()
                            # Clean up href (remove anchor if present)
                            href = href_attr.split("#")[0]
                            chapter_titles[href] = title

            except Exception as e:
                print(
                    f"Warning: Could not extract chapter titles from navigation "
                    f"document: {e}"
                )

        # Fallback to old method if standards-compliant method fails
        if not chapter_titles:
            try:
                # Look for common table of contents files
                toc_files = ["toc.ncx", "OEBPS/toc.ncx", "OEBPS/html/toc.ncx"]
                toc_content = None

                for toc_file in toc_files:
                    try:
                        toc_content = epub_zip.read(toc_file)
                        break
                    except KeyError:
                        continue

                if toc_content:
                    chapter_titles.update(self._parse_ncx_titles(toc_content))

            except Exception as e:
                print(
                    f"Warning: Could not extract chapter titles from fallback TOC: {e}"
                )

        return chapter_titles

    def _extract_text(self, soup: Union[BeautifulSoup, Tag]) -> str:
        """
        Extract all text content from BeautifulSoup object with paragraph
        structure.

        Text is collected per block-level element and the blocks are joined
        with blank lines. If the document has no block-level elements, the
        plain text of the whole document is used instead.

        Args:
            soup: The BeautifulSoup object to extract text from.

        Returns:
            Cleaned text content with paragraph breaks between blocks.
        """
        # Focus on body content if available
        body = soup.find("body")
        if body is not None and isinstance(body, Tag):
            soup = body

        # NOTE(review): find_all also returns block elements nested inside
        # other block elements (e.g. a <p> inside a <div>), so their text
        # can be emitted more than once - confirm whether this is intended
        # before relying on char/word counts.
        block_elements = soup.find_all(
            ["p", "div", "h1", "h2", "h3", "h4", "h5", "h6", "section", "article"]
        )

        if block_elements:
            text_blocks = []
            for element in block_elements:
                block_text = element.get_text(separator=" ", strip=True)
                if block_text:
                    text_blocks.append(block_text)

            # Join blocks with double newlines to preserve paragraph breaks
            text = "\n\n".join(text_blocks)
        else:
            # Fallback to simple text extraction if no block elements found
            text = soup.get_text(separator=" ", strip=True)

        # Normalize multiple paragraph breaks
        text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text)
        # Normalize whitespace within lines
        text = re.sub(r"[ \t]+", " ", text)

        return text.strip()

    def _is_non_story_content(self, title: str, href: str, book_language: str) -> bool:
        """
        Check if content should be filtered out as non-story content.

        The href is checked first: if its lower-cased form contains any of
        the known non-story fragments, the content is filtered. Only when
        the href passes is the title checked, as whole words, against the
        keyword list of the book's language (English if unsupported).

        Args:
            title: The chapter title (may be empty to check the href only).
            href: The chapter file path.
            book_language: The book's language code.

        Returns:
            True if the content should be filtered out, False otherwise.
        """
        href_lower = href.lower()

        # PRIORITY 1: Check href for common non-story file patterns.
        # Plain substring matches; "fm", "ded", "cop", "adc" and "front"
        # also cover names such as fm1, front_split, etc.
        non_story_patterns = (
            "cover",
            "title",
            "titlepage",
            "front",
            "back",
            "toc",
            "contents",
            "copyright",
            "legal",
            "acknowledgement",
            "preface",
            "foreword",
            "afterword",
            "appendix",
            "index",
            "bibliography",
            "references",
            "glossary",
            "fm",
            "ded",
            "cop",
            "adc",
            "author",
        )

        if any(pattern in href_lower for pattern in non_story_patterns):
            return True

        # PRIORITY 2: Only check title if href passed the initial filter.
        # This prevents important content from being discarded based on
        # generic titles.
        if title:
            keywords = self.NON_STORY_KEYWORDS.get(
                book_language, self.NON_STORY_KEYWORDS["en"]
            )

            title_lower = title.lower()
            for keyword in keywords:
                # Use word boundaries to match whole words only
                pattern = r"\b" + re.escape(keyword) + r"\b"
                if re.search(pattern, title_lower):
                    return True

        return False

    def _extract_metadata(self, content_opf: bytes) -> Dict[str, str]:
        """
        Extract book metadata from content.opf file.

        The language is normalised to one of the supported codes ("en",
        "es", "fr", "de", "it"). Region- or script-qualified tags such as
        "fr-FR" or "de_DE" are resolved through their primary subtag;
        anything unrecognised maps to "en". Failures are reported with a
        printed warning and whatever was collected so far is returned.

        Args:
            content_opf: Raw bytes of the content.opf file.

        Returns:
            Dictionary that may contain 'title' and 'language' keys; a key
            is absent when the corresponding element is missing or empty.
        """
        metadata: Dict[str, str] = {}

        try:
            root = ET.fromstring(content_opf)

            # Extract book title from dc:title
            title_elem = root.find(".//dc:title", NAMESPACES)
            if title_elem is not None and title_elem.text:
                metadata["title"] = title_elem.text.strip()

            # Extract language from dc:language
            lang_elem = root.find(".//dc:language", NAMESPACES)
            if lang_elem is not None and lang_elem.text:
                lang_code = lang_elem.text.strip().lower()

                # Map language codes to our supported languages
                lang_mapping = {
                    "en": "en",
                    "eng": "en",
                    "english": "en",
                    "es": "es",
                    "spa": "es",
                    "spanish": "es",
                    "español": "es",
                    "fr": "fr",
                    "fra": "fr",
                    "french": "fr",
                    "français": "fr",
                    "de": "de",
                    "ger": "de",
                    "german": "de",
                    "deutsch": "de",
                    "it": "it",
                    "ita": "it",
                    "italian": "it",
                    "italiano": "it",
                }

                # dc:language usually carries a tag like "fr-FR"; fall back
                # to the primary subtag when the full tag is not listed.
                primary_subtag = re.split(r"[-_]", lang_code, maxsplit=1)[0]
                metadata["language"] = lang_mapping.get(
                    lang_code
                ) or lang_mapping.get(primary_subtag, "en")

        except Exception as e:
            print(f"Warning: Could not extract metadata: {e}")

        return metadata

    def _find_navigation_document(
        self, content_opf: bytes, epub_zip: zipfile.ZipFile, content_dir: Path
    ) -> Optional[str]:
        """
        Find the navigation document using EPUB standards.

        Lookup order:
        - EPUB3: first manifest item whose properties include "nav"
        - EPUB2: manifest item whose id equals the spine's toc attribute

        Args:
            content_opf: Raw bytes of the content.opf file.
            epub_zip: The EPUB zip file (not consulted by this lookup).
            content_dir: Base directory for content files.

        Returns:
            Archive path (forward-slash separated) of the navigation
            document, or None if not found or the package document cannot
            be parsed.
        """
        try:
            root = ET.fromstring(content_opf)

            manifest = root.find(".//opf:manifest", NAMESPACES)
            manifest_items = (
                manifest.findall(".//opf:item", NAMESPACES)
                if manifest is not None
                else []
            )

            # EPUB3: Look for navigation document with properties="nav"
            for item in manifest_items:
                properties = item.get("properties")
                if properties and "nav" in properties.split():
                    href = item.get("href")
                    if href:
                        # ZIP member names use "/" on every platform
                        return (content_dir / href).as_posix()

            # EPUB2: Look for NCX file referenced in spine
            spine = root.find(".//opf:spine", NAMESPACES)
            if spine is not None:
                toc_id = spine.get("toc")
                if toc_id:
                    for item in manifest_items:
                        if item.get("id") == toc_id:
                            href = item.get("href")
                            if href:
                                return (content_dir / href).as_posix()

            return None

        except Exception as e:
            print(f"Warning: Could not find navigation document: {e}")
            return None

    def _remove_title_from_text(self, text: str, title: str) -> str:
        """
        Remove the title from the beginning of the text.

        The text is whitespace-normalised, then the title, some spelling
        variations of it, and finally its first two or three words are each
        stripped from the start (case-insensitively) if present.

        NOTE(review): the whitespace normalisation collapses every run of
        whitespace, including paragraph breaks, into a single space -
        confirm this is intended for titled chapters.

        Args:
            text: The text content.
            title: The title to remove.

        Returns:
            Text with title removed from the beginning. The text is
            returned unchanged when the title is empty or
            "Untitled Chapter".
        """
        if not title or title == "Untitled Chapter":
            return text

        # First, normalize whitespace in the text
        text = re.sub(r"\s+", " ", text.strip())

        # Try exact match first
        title_escaped = re.escape(title)
        text = re.sub(f"^{title_escaped}\\s*", "", text, flags=re.IGNORECASE)

        # Try variations of the title
        title_variations = [
            title.replace(".", ""),  # Remove periods
            title.replace("  ", " "),  # Normalize double spaces
            title.strip(),
            re.sub(r"\s+", " ", title),  # Normalize all whitespace
        ]

        for variation in title_variations:
            if variation and variation != title:
                variation_escaped = re.escape(variation)
                text = re.sub(
                    f"^{variation_escaped}\\s*", "", text, flags=re.IGNORECASE
                )

        # Try removing just the first few words if they match the title
        title_words = title.split()
        if len(title_words) >= 2:
            for i in range(2, min(4, len(title_words) + 1)):
                partial_title = " ".join(title_words[:i])
                partial_escaped = re.escape(partial_title)
                text = re.sub(f"^{partial_escaped}\\s*", "", text, flags=re.IGNORECASE)

        return text.strip()
# Backward compatibility alias: older code imports the parser as ``Parser``.
Parser = VivreParser