# Source code for vivre.integration

"""
Integration module for the complete vivre pipeline.

This module provides high-level interfaces for processing parallel texts through
the complete pipeline: parsing EPUB files, segmenting text into sentences, and
aligning sentences between languages.
"""

from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union

from .align import Aligner
from .parser import VivreParser
from .segmenter import Segmenter


class VivrePipeline:
    """
    High-level interface for the complete vivre text processing pipeline.

    This class wires together the three pipeline stages: parsing EPUB files,
    segmenting text into sentences, and aligning sentences between languages.
    Input can be supplied as EPUB paths, as lists of ``(title, content)``
    chapters, or as raw text; each entry point skips the stages it does not
    need.

    Attributes:
        parser: The EPUB parser instance
        segmenter: The sentence segmenter instance
        aligner: The text aligner instance
        language_pair: The language pair for alignment (e.g., "en-es")

    Example:
        >>> pipeline = VivrePipeline("en-es")
        >>> alignments = pipeline.process_parallel_epubs(
        ...     "english_book.epub", "spanish_book.epub"
        ... )
        >>> for source, target in alignments:
        ...     print(f"EN: {source}")
        ...     print(f"ES: {target}")
    """

    def __init__(
        self,
        language_pair: str = "en-es",
        c: Optional[float] = None,
        s2: Optional[float] = None,
        gap_penalty: Optional[float] = None,
    ) -> None:
        """
        Initialize the vivre pipeline.

        Args:
            language_pair: Language pair for alignment (e.g., "en-es", "en-fr")
            c: Custom mean ratio for alignment (optional)
            s2: Custom variance for alignment (optional)
            gap_penalty: Custom gap penalty for alignment (optional)
        """
        self.parser = VivreParser()
        self.segmenter = Segmenter()
        # All tuning parameters are forwarded untouched; None is handed to
        # the Aligner as-is (presumably it then picks its own defaults).
        self.aligner = Aligner(
            language_pair=language_pair,
            c=c,
            s2=s2,
            gap_penalty=gap_penalty,
        )
        self.language_pair = language_pair

    def process_parallel_epubs(
        self,
        source_epub_path: Union[str, Path],
        target_epub_path: Union[str, Path],
        source_language: Optional[str] = None,
        target_language: Optional[str] = None,
        max_chapters: Optional[int] = None,
    ) -> List[Tuple[str, str]]:
        """
        Process parallel EPUB files through the complete pipeline.

        Both files are parsed into chapters, optionally truncated to the
        first ``max_chapters`` chapters, and then handed to
        :meth:`process_parallel_chapters` for segmentation and alignment.

        Args:
            source_epub_path: Path to source language EPUB file
            target_epub_path: Path to target language EPUB file
            source_language: Source language code, passed through to the
                segmenter (None is expected to trigger auto-detection there)
            target_language: Target language code, passed through to the
                segmenter (None is expected to trigger auto-detection there)
            max_chapters: Maximum number of chapters to process. ``None``
                means no limit; ``0`` processes no chapters and yields an
                empty result.

        Returns:
            List of aligned sentence pairs (source, target)

        Raises:
            ValueError: If either EPUB yields no chapters. Exceptions raised
                by the parser, segmenter or aligner (e.g. for a missing
                file) propagate unchanged.
        """
        source_chapters = self.parser.parse_epub(source_epub_path)
        target_chapters = self.parser.parse_epub(target_epub_path)

        if not source_chapters or not target_chapters:
            raise ValueError("No chapters found in one or both EPUB files")

        # Compare against None explicitly: a plain truthiness test would
        # treat max_chapters=0 as "no limit" and process the whole book.
        if max_chapters is not None:
            source_chapters = source_chapters[:max_chapters]
            target_chapters = target_chapters[:max_chapters]

        return self.process_parallel_chapters(
            source_chapters,
            target_chapters,
            source_language=source_language,
            target_language=target_language,
        )

    def process_parallel_texts(
        self,
        source_text: str,
        target_text: str,
        source_language: Optional[str] = None,
        target_language: Optional[str] = None,
    ) -> List[Tuple[str, str]]:
        """
        Process parallel text content through the pipeline.

        Segments both texts into sentences and aligns them, skipping the
        parsing step.

        Args:
            source_text: Source language text content
            target_text: Target language text content
            source_language: Source language code, passed through to the
                segmenter (None is expected to trigger auto-detection there)
            target_language: Target language code, passed through to the
                segmenter (None is expected to trigger auto-detection there)

        Returns:
            List of aligned sentence pairs (source, target); an empty list
            when either side segments to no sentences.
        """
        source_sentences = self.segmenter.segment(source_text, language=source_language)
        target_sentences = self.segmenter.segment(target_text, language=target_language)

        # The aligner is only invoked when both sides have sentences.
        if not source_sentences or not target_sentences:
            return []

        return self.aligner.align(source_sentences, target_sentences)

    def process_parallel_chapters(
        self,
        source_chapters: List[Tuple[str, str]],
        target_chapters: List[Tuple[str, str]],
        source_language: Optional[str] = None,
        target_language: Optional[str] = None,
    ) -> List[Tuple[str, str]]:
        """
        Process parallel chapter lists through the pipeline.

        Chapters are paired by position. If the two lists differ in length,
        the surplus chapters of the longer list are ignored. Chapter titles
        are not used. A chapter pair in which either side has no sentences
        contributes nothing to the result.

        Args:
            source_chapters: List of (title, content) pairs for source language
            target_chapters: List of (title, content) pairs for target language
            source_language: Source language code, passed through to the
                segmenter (None is expected to trigger auto-detection there)
            target_language: Target language code, passed through to the
                segmenter (None is expected to trigger auto-detection there)

        Returns:
            List of aligned sentence pairs (source, target), concatenated in
            chapter order.
        """
        all_alignments: List[Tuple[str, str]] = []

        for (_, source_content), (_, target_content) in zip(
            source_chapters, target_chapters
        ):
            all_alignments.extend(
                self.process_parallel_texts(
                    source_content,
                    target_content,
                    source_language=source_language,
                    target_language=target_language,
                )
            )

        return all_alignments

    def batch_process_epubs(
        self,
        epub_pairs: List[Tuple[Union[str, Path], Union[str, Path]]],
        source_language: Optional[str] = None,
        target_language: Optional[str] = None,
        max_chapters_per_book: Optional[int] = None,
    ) -> Dict[str, List[Tuple[str, str]]]:
        """
        Process multiple pairs of EPUB files in batch.

        Each pair is run through :meth:`process_parallel_epubs`. A failure
        in one pair is reported on standard output and recorded as an empty
        result, so the remaining pairs are still processed.

        Args:
            epub_pairs: List of (source_path, target_path) tuples
            source_language: Source language code, forwarded for every pair
            target_language: Target language code, forwarded for every pair
            max_chapters_per_book: Maximum chapters per book (optional)

        Returns:
            Dictionary mapping ``str(source_path)`` to alignment results.
            If the same source path occurs more than once, the last
            occurrence wins.
        """
        results: Dict[str, List[Tuple[str, str]]] = {}

        for source_path, target_path in epub_pairs:
            try:
                alignments = self.process_parallel_epubs(
                    source_path,
                    target_path,
                    source_language=source_language,
                    target_language=target_language,
                    max_chapters=max_chapters_per_book,
                )
                results[str(source_path)] = alignments
            except Exception as e:
                # Deliberately broad: batch mode is best-effort, so any
                # failure is reported and the loop moves on.
                print(f"Error processing {source_path}: {e}")
                results[str(source_path)] = []

        return results

    def get_pipeline_info(self) -> Dict[str, Any]:
        """
        Get information about the current pipeline configuration.

        Returns:
            Dictionary with the keys ``language_pair``,
            ``aligner_parameters`` (the aligner's ``c``, ``s2`` and
            ``gap_penalty`` attributes) and ``supported_languages`` (as
            reported by the segmenter).
        """
        return {
            "language_pair": self.language_pair,
            "aligner_parameters": {
                "c": self.aligner.c,
                "s2": self.aligner.s2,
                "gap_penalty": self.aligner.gap_penalty,
            },
            "supported_languages": self.segmenter.get_supported_languages(),
        }
def create_pipeline(
    language_pair: str = "en-es",
    **kwargs: Any,
) -> VivrePipeline:
    """
    Build and return a new :class:`VivrePipeline`.

    Thin factory wrapper: the language pair and any extra keyword arguments
    are forwarded unchanged to the ``VivrePipeline`` constructor.

    Args:
        language_pair: Language pair for alignment
        **kwargs: Additional arguments to pass to VivrePipeline constructor

    Returns:
        Configured VivrePipeline instance

    Example:
        >>> pipeline = create_pipeline("en-fr", gap_penalty=5.0)
        >>> alignments = pipeline.process_parallel_texts(
        ...     "Hello world.", "Bonjour le monde."
        ... )
    """
    # language_pair is a named parameter, so it can never also appear in
    # kwargs; merging into one mapping is therefore collision-free.
    constructor_args = {"language_pair": language_pair, **kwargs}
    return VivrePipeline(**constructor_args)