"""
Integration module for the complete vivre pipeline.
This module provides high-level interfaces for processing parallel texts through
the complete pipeline: parsing EPUB files, segmenting text into sentences, and
aligning sentences between languages.
"""
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
from .align import Aligner
from .parser import VivreParser
from .segmenter import Segmenter
[docs]
class VivrePipeline:
"""
High-level interface for the complete vivre text processing pipeline.
This class provides a convenient interface for processing parallel texts
through the complete workflow: parsing EPUB files, segmenting text into
sentences, and aligning sentences between languages.
The pipeline supports both single-chapter and multi-chapter processing,
with options for automatic language detection and custom alignment parameters.
Attributes:
parser: The EPUB parser instance
segmenter: The sentence segmenter instance
aligner: The text aligner instance
language_pair: The language pair for alignment (e.g., "en-es")
Example:
>>> pipeline = VivrePipeline("en-es")
>>> alignments = pipeline.process_parallel_epubs(
... "english_book.epub", "spanish_book.epub"
... )
>>> for source, target in alignments:
... print(f"EN: {source}")
... print(f"ES: {target}")
"""
def __init__(
self,
language_pair: str = "en-es",
c: Optional[float] = None,
s2: Optional[float] = None,
gap_penalty: Optional[float] = None,
) -> None:
"""
Initialize the vivre pipeline.
Args:
language_pair: Language pair for alignment (e.g., "en-es", "en-fr")
c: Custom mean ratio for alignment (optional)
s2: Custom variance for alignment (optional)
gap_penalty: Custom gap penalty for alignment (optional)
"""
self.parser = VivreParser()
self.segmenter = Segmenter()
self.aligner = Aligner(
language_pair=language_pair,
c=c,
s2=s2,
gap_penalty=gap_penalty,
)
self.language_pair = language_pair
[docs]
def process_parallel_epubs(
self,
source_epub_path: Union[str, Path],
target_epub_path: Union[str, Path],
source_language: Optional[str] = None,
target_language: Optional[str] = None,
max_chapters: Optional[int] = None,
) -> List[Tuple[str, str]]:
"""
Process parallel EPUB files through the complete pipeline.
This method processes two EPUB files (source and target languages)
through the complete pipeline: parsing, segmentation, and alignment.
Args:
source_epub_path: Path to source language EPUB file
target_epub_path: Path to target language EPUB file
source_language: Source language code (optional, auto-detected if None)
target_language: Target language code (optional, auto-detected if None)
max_chapters: Maximum number of chapters to process (optional)
Returns:
List of aligned sentence pairs (source, target)
Raises:
FileNotFoundError: If EPUB files don't exist
ValueError: If parsing or alignment fails
"""
# Parse both EPUB files
source_chapters = self.parser.parse_epub(source_epub_path)
target_chapters = self.parser.parse_epub(target_epub_path)
if not source_chapters or not target_chapters:
raise ValueError("No chapters found in one or both EPUB files")
# Limit chapters if specified
if max_chapters:
source_chapters = source_chapters[:max_chapters]
target_chapters = target_chapters[:max_chapters]
# Process each chapter pair
all_alignments: List[Tuple[str, str]] = []
for i, (
(source_title, source_content),
(target_title, target_content),
) in enumerate(zip(source_chapters, target_chapters)):
# Segment chapters into sentences
source_sentences = self.segmenter.segment(
source_content, language=source_language
)
target_sentences = self.segmenter.segment(
target_content, language=target_language
)
if source_sentences and target_sentences:
# Align sentences
chapter_alignments = self.aligner.align(
source_sentences, target_sentences
)
all_alignments.extend(chapter_alignments)
return all_alignments
[docs]
def process_parallel_texts(
self,
source_text: str,
target_text: str,
source_language: Optional[str] = None,
target_language: Optional[str] = None,
) -> List[Tuple[str, str]]:
"""
Process parallel text content through the pipeline.
This method processes two text strings (source and target languages)
through segmentation and alignment, skipping the parsing step.
Args:
source_text: Source language text content
target_text: Target language text content
source_language: Source language code (optional, auto-detected if None)
target_language: Target language code (optional, auto-detected if None)
Returns:
List of aligned sentence pairs (source, target)
"""
# Segment texts into sentences
source_sentences = self.segmenter.segment(source_text, language=source_language)
target_sentences = self.segmenter.segment(target_text, language=target_language)
if not source_sentences or not target_sentences:
return []
# Align sentences
return self.aligner.align(source_sentences, target_sentences)
[docs]
def process_parallel_chapters(
self,
source_chapters: List[Tuple[str, str]],
target_chapters: List[Tuple[str, str]],
source_language: Optional[str] = None,
target_language: Optional[str] = None,
) -> List[Tuple[str, str]]:
"""
Process parallel chapter lists through the pipeline.
This method processes two lists of chapters (title, content pairs)
through segmentation and alignment, skipping the parsing step.
Args:
source_chapters: List of (title, content) pairs for source language
target_chapters: List of (title, content) pairs for target language
source_language: Source language code (optional, auto-detected if None)
target_language: Target language code (optional, auto-detected if None)
Returns:
List of aligned sentence pairs (source, target)
"""
all_alignments: List[Tuple[str, str]] = []
for (source_title, source_content), (target_title, target_content) in zip(
source_chapters, target_chapters
):
# Segment chapters into sentences
source_sentences = self.segmenter.segment(
source_content, language=source_language
)
target_sentences = self.segmenter.segment(
target_content, language=target_language
)
if source_sentences and target_sentences:
# Align sentences
chapter_alignments = self.aligner.align(
source_sentences, target_sentences
)
all_alignments.extend(chapter_alignments)
return all_alignments
[docs]
def batch_process_epubs(
self,
epub_pairs: List[Tuple[Union[str, Path], Union[str, Path]]],
source_language: Optional[str] = None,
target_language: Optional[str] = None,
max_chapters_per_book: Optional[int] = None,
) -> Dict[str, List[Tuple[str, str]]]:
"""
Process multiple pairs of EPUB files in batch.
This method processes multiple pairs of EPUB files, returning alignments
for each pair in a dictionary keyed by the source file path.
Args:
epub_pairs: List of (source_path, target_path) tuples
source_language: Source language code (optional, auto-detected if None)
target_language: Target language code (optional, auto-detected if None)
max_chapters_per_book: Maximum chapters per book (optional)
Returns:
Dictionary mapping source file paths to alignment results
"""
results: Dict[str, List[Tuple[str, str]]] = {}
for source_path, target_path in epub_pairs:
try:
alignments = self.process_parallel_epubs(
source_path,
target_path,
source_language=source_language,
target_language=target_language,
max_chapters=max_chapters_per_book,
)
results[str(source_path)] = alignments
except Exception as e:
# Log error but continue with other pairs
print(f"Error processing {source_path}: {e}")
results[str(source_path)] = []
return results
[docs]
def get_pipeline_info(self) -> Dict[str, Any]:
"""
Get information about the current pipeline configuration.
Returns:
Dictionary containing pipeline configuration information
"""
return {
"language_pair": self.language_pair,
"aligner_parameters": {
"c": self.aligner.c,
"s2": self.aligner.s2,
"gap_penalty": self.aligner.gap_penalty,
},
"supported_languages": self.segmenter.get_supported_languages(),
}
[docs]
def create_pipeline(
language_pair: str = "en-es",
**kwargs: Any,
) -> VivrePipeline:
"""
Create a new vivre pipeline instance.
This is a convenience function for creating pipeline instances with
default or custom parameters.
Args:
language_pair: Language pair for alignment
**kwargs: Additional arguments to pass to VivrePipeline constructor
Returns:
Configured VivrePipeline instance
Example:
>>> pipeline = create_pipeline("en-fr", gap_penalty=5.0)
>>> alignments = pipeline.process_parallel_texts(
... "Hello world.", "Bonjour le monde."
... )
"""
return VivrePipeline(language_pair=language_pair, **kwargs)