"""
Top-level API functions for the vivre library.
This module provides simple, user-friendly functions for common tasks:
- read(): Parse EPUB files and extract chapters
- align(): Align parallel texts and output in various formats
- quick_align(): Simple one-liner for basic alignment
- get_supported_languages(): Get list of supported languages
"""
import copy
import json
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union

from .integration import VivrePipeline
from .parser import Chapter, VivreParser
from .segmenter import Segmenter
# Module-level cache of VivrePipeline instances, reused by align() whenever the
# caller does not pass its own pipeline. align() keys each entry by a string
# built from the language pair plus the JSON-encoded keyword arguments, so
# consecutive calls with the same configuration in a single script run share
# one pipeline. clear_pipeline_cache() empties it.
_pipeline_cache: Dict[str, VivrePipeline] = {}
class AlignmentResult:
    """
    A container for alignment results with multiple output format options.

    The instance wraps a single aligned-corpus dictionary and can render it
    as a dictionary, JSON, plain text, CSV or XML. The text, CSV and XML
    renderings are delegated to the module-level ``_format_as_*`` helpers.
    """

    def __init__(self, corpus: Dict[str, Any]):
        """
        Initialize with aligned corpus data.

        Args:
            corpus: The aligned corpus dictionary. It is stored by reference,
                not copied.
        """
        self._corpus = corpus

    def to_dict(self) -> Dict[str, Any]:
        """
        Return the corpus as a dictionary.

        Returns:
            A deep copy of the corpus. A shallow copy would still share the
            nested chapter and alignment containers, so a caller editing the
            returned data would silently change what every later to_*() call
            on this object produces.
        """
        return copy.deepcopy(self._corpus)

    def to_json(self, indent: int = 2) -> str:
        """
        Return the corpus as JSON string.

        Args:
            indent: Indentation width handed to ``json.dumps``.

        Returns:
            The JSON text. Non-ASCII characters are written as-is rather
            than escaped (``ensure_ascii=False``).
        """
        return json.dumps(self._corpus, indent=indent, ensure_ascii=False)

    def to_text(self) -> str:
        """Return the corpus as formatted text."""
        return _format_as_text(self._corpus)

    def to_csv(self) -> str:
        """Return the corpus as CSV string."""
        return _format_as_csv(self._corpus)

    def to_xml(self) -> str:
        """Return the corpus as XML string."""
        return _format_as_xml(self._corpus)

    def __repr__(self) -> str:
        """Return a short description with the book title and language pair."""
        return (
            f"AlignmentResult(book_title='{self._corpus.get('book_title', '')}', "
            f"language_pair='{self._corpus.get('language_pair', '')}')"
        )
class Chapters:
    """
    Parsed chapters of a book, with optional sentence segmentation.

    The object behaves like a read-only sequence of Chapter objects
    (len(), indexing, iteration) and can additionally split every
    chapter's text into sentences on request.
    """

    def __init__(self, chapters: List[Chapter], book_title: str = ""):
        """
        Store the parsed chapters and prepare a segmenter.

        Args:
            chapters: List of Chapter objects
            book_title: Title of the book
        """
        self.chapters = chapters
        self.book_title = book_title
        # Stays None until segment() has run; get_segmented() relies on that.
        self._segmented_chapters: Optional[List[Tuple[str, List[str]]]] = None
        self._segmenter = Segmenter()

    def segment(self, language: Optional[str] = None) -> "Chapters":
        """
        Split the content of every chapter into sentences.

        Args:
            language: Language code for segmentation (auto-detected if None)

        Returns:
            This same object, so calls can be chained
        """
        split_sentences = self._segmenter.segment
        self._segmented_chapters = [
            (chapter.title, split_sentences(chapter.content, language))
            for chapter in self.chapters
        ]
        return self

    def get_segmented(self) -> List[Tuple[str, List[str]]]:
        """
        Return the (title, sentences) pairs produced by segment().

        Raises:
            ValueError: If segment() has not been called yet
        """
        segmented = self._segmented_chapters
        if segmented is not None:
            return segmented
        raise ValueError(
            "Chapters must be segmented first. Call .segment() method."
        )

    def __len__(self) -> int:
        """Number of chapters held."""
        return len(self.chapters)

    def __getitem__(self, index: int) -> Chapter:
        """Chapter stored at the given position."""
        return self.chapters[index]

    def __iter__(self):
        """Iterator over the Chapter objects."""
        return iter(self.chapters)

    def __repr__(self) -> str:
        """Short description with the book title and chapter count."""
        chapter_count = len(self.chapters)
        return f"Chapters(book_title='{self.book_title}', chapters={chapter_count})"
def read(epub_path: Union[str, Path]) -> Chapters:
    """
    Parse an EPUB file and extract chapters.

    Args:
        epub_path: Path to the EPUB file

    Returns:
        Chapters object containing parsed chapters

    Raises:
        FileNotFoundError: If the EPUB file doesn't exist
        ValueError: If the file could not be parsed; the underlying
            exception is attached as ``__cause__``

    Example:
        >>> chapters = vivre.read('path/to/epub')
        >>> print(f"Found {len(chapters)} chapters")
        >>> for chapter in chapters:
        ...     print(f"Chapter: {chapter.title}")
    """
    epub_path = Path(epub_path)
    if not epub_path.exists():
        raise FileNotFoundError(f"EPUB file not found: {epub_path}")

    parser = VivreParser()
    try:
        chapters = parser.parse_epub(epub_path)
        # The parser may expose the title it found; fall back to "" if not.
        book_title = getattr(parser, "_book_title", "")
        return Chapters(chapters, book_title)
    except Exception as e:
        # Callers get one exception type for any parsing problem; chaining
        # keeps the original traceback available for debugging.
        raise ValueError(f"Failed to parse EPUB file {epub_path}: {e}") from e
def align(
    source: Union[str, Path, Chapters],
    target: Union[str, Path, Chapters],
    language_pair: str,
    method: str = "gale-church",
    _pipeline: Optional[VivrePipeline] = None,
    **kwargs: Any,
) -> AlignmentResult:
    """
    Align parallel EPUB files or Chapters objects and return an AlignmentResult.

    This function can accept either file paths or Chapters objects, making it
    flexible for different workflows. The language_pair parameter is required
    for accurate alignment.

    Args:
        source: Source language EPUB file path or Chapters object
        target: Target language EPUB file path or Chapters object
        language_pair: Language pair code made of exactly two non-empty codes
            joined by one hyphen (e.g., "en-fr", "es-en") - REQUIRED
        method: Alignment method (currently only "gale-church" supported)
        _pipeline: Optional pre-existing VivrePipeline instance for dependency
            injection; when given, the pipeline cache is bypassed
        **kwargs: Additional arguments passed to the pipeline

    Returns:
        AlignmentResult object with methods for different output formats

    Raises:
        FileNotFoundError: If EPUB files don't exist (when using file paths)
        ValueError: If method is not supported, language_pair is invalid, or
            the alignment itself fails (original error chained as __cause__)

    Example:
        # Using file paths
        >>> result = vivre.align('english.epub', 'french.epub', 'en-fr')
        >>> print(result.to_json())
        >>> print(result.to_csv())

        # Using Chapters objects (seamless workflow)
        >>> source_chapters = vivre.read('english.epub')
        >>> target_chapters = vivre.read('french.epub')
        >>> result = vivre.align(source_chapters, target_chapters, 'en-fr')
        >>> print(result.to_text())

        # Using dependency injection for better performance
        >>> pipeline = VivrePipeline('en-fr')
        >>> result = vivre.align(
        ...     source_chapters, target_chapters, 'en-fr', _pipeline=pipeline
        ... )
        >>> print(result.to_dict())

        # Get as dictionary for programmatic access
        >>> data = result.to_dict()
        >>> print(f"Found {len(data['chapters'])} chapters")
    """
    if method != "gale-church":
        raise ValueError(
            f"Method '{method}' not supported. Only 'gale-church' is available."
        )

    # The corpus builder unpacks the pair into exactly two language codes, so
    # reject anything else here ("en-fr-ca", "en-", "-") before any file is
    # parsed or any pipeline is built.
    pair_parts = language_pair.split("-") if isinstance(language_pair, str) else []
    if len(pair_parts) != 2 or not all(pair_parts):
        raise ValueError(
            f"Invalid language_pair: '{language_pair}'. "
            f"Use format 'en-fr', 'es-en', etc."
        )

    # Parse source and target based on their types
    source_chapters, source_title = _parse_source_or_chapters(source, "source")
    target_chapters, target_title = _parse_source_or_chapters(target, "target")

    # Use the provided pipeline or get one from the cache/create a new one
    pipeline = _pipeline
    if pipeline is None:
        try:
            cache_key = f"{language_pair}-{json.dumps(kwargs, sort_keys=True)}"
        except (TypeError, ValueError):
            # kwargs that cannot be JSON-encoded have no stable cache key;
            # build a one-off pipeline instead of failing the whole call.
            pipeline = VivrePipeline(language_pair, **kwargs)
        else:
            if cache_key not in _pipeline_cache:
                _pipeline_cache[cache_key] = VivrePipeline(language_pair, **kwargs)
            pipeline = _pipeline_cache[cache_key]

    # Get book title (prefer source title, fallback to target)
    book_title = source_title or target_title

    # Process chapters and create aligned corpus
    try:
        aligned_corpus = _create_aligned_corpus(
            source_chapters, target_chapters, pipeline, book_title, language_pair
        )
    except Exception as e:
        raise ValueError(f"Failed to align texts: {e}") from e
    return AlignmentResult(aligned_corpus)
def quick_align(
    source_epub: Union[str, Path],
    target_epub: Union[str, Path],
    language_pair: str,
) -> List[Tuple[str, str]]:
    """
    Quick alignment function that returns simple sentence pairs.

    This is a convenience function for simple use cases where you just
    need sentence pairs without the full corpus structure. It runs align()
    and flattens the alignments of all chapters into one list.

    Args:
        source_epub: Path to source language EPUB
        target_epub: Path to target language EPUB
        language_pair: Language pair code made of exactly two non-empty codes
            joined by one hyphen (e.g., "en-fr", "es-en") - REQUIRED

    Returns:
        List of (source_sentence, target_sentence) tuples

    Raises:
        FileNotFoundError: If either EPUB file doesn't exist
        ValueError: If language_pair is invalid

    Example:
        >>> pairs = vivre.quick_align('english.epub', 'french.epub', 'en-fr')
        >>> for source, target in pairs[:3]:
        ...     print(f"EN: {source}")
        ...     print(f"FR: {target}")
    """
    # Split once up front: the two codes are the dictionary keys of every
    # alignment below, so the pair must consist of exactly two non-empty parts.
    pair_parts = language_pair.split("-") if isinstance(language_pair, str) else []
    if len(pair_parts) != 2 or not all(pair_parts):
        raise ValueError(
            f"Invalid language_pair: '{language_pair}'. "
            f"Use format 'en-fr', 'es-en', etc."
        )
    source_lang, target_lang = pair_parts

    # Use the main align function and extract sentence pairs
    corpus = align(source_epub, target_epub, language_pair).to_dict()

    pairs: List[Tuple[str, str]] = []
    for chapter_data in corpus["chapters"].values():
        pairs.extend(
            (alignment[source_lang], alignment[target_lang])
            for alignment in chapter_data["alignments"]
        )
    return pairs
def get_supported_languages() -> List[str]:
    """
    List the language codes the segmenter knows about.

    A fresh Segmenter is created for each call and the keys of its
    ``_supported_languages`` mapping are returned.

    Returns:
        List of supported language codes.

    Example:
        >>> languages = vivre.get_supported_languages()
        >>> print(f"Supported languages: {languages}")
    """
    language_table = Segmenter()._supported_languages
    return list(language_table.keys())
def clear_pipeline_cache() -> None:
    """
    Remove every pipeline stored in the module-level cache.

    Handy in tests, or when memory held by cached pipelines should be
    released. The cache dictionary itself is kept and emptied in place.

    Example:
        >>> vivre.clear_pipeline_cache()
    """
    _pipeline_cache.clear()
def _create_aligned_corpus(
    source_chapters: List[Chapter],
    target_chapters: List[Chapter],
    pipeline: VivrePipeline,
    book_title: str,
    language_pair: str,
) -> Dict[str, Any]:
    """
    Create the aligned corpus structure.

    Chapters are paired by position. Each pair is segmented with the
    pipeline's segmenter and aligned with the pipeline's aligner.

    Args:
        source_chapters: Chapters of the source-language book
        target_chapters: Chapters of the target-language book
        pipeline: Object providing ``segmenter.segment(text)`` and
            ``aligner.align(source_sentences, target_sentences)``
        book_title: Title stored in the corpus
        language_pair: Pair such as "en-fr"; its two codes become the keys
            of every alignment entry

    Returns:
        Dictionary with "book_title", "language_pair" and "chapters". The
        chapters mapping is keyed by the 1-based chapter number as a string,
        each value holding the source chapter's "title" and its "alignments".

    Raises:
        ValueError: If language_pair does not split into exactly two parts

    Warns:
        UserWarning: If the two books have different chapter counts; the
            surplus chapters are left out of the corpus.
    """
    source_lang, target_lang = language_pair.split("-")

    # zip() stops at the shorter list, which would drop chapters without a
    # trace. Keep that best-effort behaviour but tell the caller about it.
    source_count = len(source_chapters)
    target_count = len(target_chapters)
    if source_count != target_count:
        warnings.warn(
            f"Chapter count mismatch: source has {source_count}, target has "
            f"{target_count}; only the first {min(source_count, target_count)} "
            f"chapter pair(s) are aligned.",
            stacklevel=2,
        )

    chapters: Dict[str, Any] = {}
    for number, (source_chapter, target_chapter) in enumerate(
        zip(source_chapters, target_chapters), 1
    ):
        # Segment both chapters
        source_sentences = pipeline.segmenter.segment(source_chapter.content)
        target_sentences = pipeline.segmenter.segment(target_chapter.content)

        # Align sentences
        alignments = pipeline.aligner.align(source_sentences, target_sentences)

        chapters[str(number)] = {
            "title": source_chapter.title,  # Use source title as primary
            "alignments": [
                {source_lang: source_sent, target_lang: target_sent}
                for source_sent, target_sent in alignments
            ],
        }

    return {
        "book_title": book_title,
        "language_pair": language_pair,
        "chapters": chapters,
    }
def _format_as_text(corpus: Dict[str, Any]) -> str:
    """
    Format corpus as plain text.

    The output starts with a header (book title, language pair, a rule of
    50 "=" characters). Every chapter follows as a heading, a rule of 30
    "-" characters and its numbered alignments, each alignment printed as
    a source line, a target line and an empty line.

    Args:
        corpus: Dictionary with "book_title", "language_pair" and
            "chapters"; each chapter has "title" and "alignments", and each
            alignment is keyed by the two language codes of the pair.

    Returns:
        The formatted text, lines joined with newlines.

    Raises:
        ValueError: If an alignment has to be printed and the language pair
            does not split into exactly two parts
    """
    language_pair = corpus["language_pair"]
    lines = [
        f"Book: {corpus['book_title']}",
        f"Language Pair: {language_pair}",
        "=" * 50,
    ]

    # Split the pair once instead of once per alignment. The unpacking stays
    # inside the loop so a corpus without alignments still renders its header
    # even when the pair is malformed.
    pair_parts = language_pair.split("-")

    for chapter_num, chapter_data in corpus["chapters"].items():
        lines.append(f"\nChapter {chapter_num}: {chapter_data['title']}")
        lines.append("-" * 30)
        for index, alignment in enumerate(chapter_data["alignments"], 1):
            source_lang, target_lang = pair_parts
            lines.append(f"{index}. {source_lang.upper()}: {alignment[source_lang]}")
            lines.append(f" {target_lang.upper()}: {alignment[target_lang]}")
            lines.append("")
    return "\n".join(lines)
def _parse_source_or_chapters(
    source: Union[str, Path, Chapters], name: str
) -> Tuple[List[Chapter], str]:
    """
    Parse source or target, whether it's a file path or Chapters object.

    A Chapters object is unpacked as-is. Anything else is treated as a path
    to an EPUB file, which is parsed with VivreParser; its title is read
    from the EPUB metadata, falling back to the file name without extension.

    Args:
        source: File path or Chapters object
        name: Name for error messages ("source" or "target")

    Returns:
        Tuple of (chapters, book_title)

    Raises:
        FileNotFoundError: If a path is given and the file does not exist
    """
    if isinstance(source, Chapters):
        # Already parsed Chapters object
        return source.chapters, source.book_title

    # File path - parse the EPUB
    source_path = Path(source)
    if not source_path.exists():
        raise FileNotFoundError(
            f"{name.capitalize()} EPUB file not found: {source_path}"
        )

    chapters = VivreParser().parse_epub(source_path)

    metadata_title = _extract_epub_title(source_path)
    # Default to filename when the metadata gives no title
    book_title = source_path.stem if metadata_title is None else metadata_title
    return chapters, book_title


def _extract_epub_title(epub_path: Path) -> Optional[str]:
    """
    Read the dc:title of an EPUB, or return None when it cannot be read.

    The package document is located through META-INF/container.xml and its
    first dc:title element is returned with surrounding whitespace removed.
    Every failure (unreadable archive, missing entry, malformed XML, XML
    library not importable) yields None: the title is optional metadata and
    must never make parsing of the book itself fail.
    """
    try:
        # Imported here, inside the try, so that a missing XML library only
        # costs the title instead of breaking the caller.
        import zipfile

        from defusedxml import ElementTree as ET

        with zipfile.ZipFile(epub_path, "r") as epub_zip:
            # container.xml names the package document (content.opf)
            container_root = ET.fromstring(epub_zip.read("META-INF/container.xml"))
            rootfile_elem = container_root.find(
                './/container:rootfile[@media-type="application/oebps-package+xml"]',
                {"container": "urn:oasis:names:tc:opendocument:xmlns:container"},
            )
            if rootfile_elem is None:
                return None

            content_opf_path = rootfile_elem.get("full-path")
            if not content_opf_path:
                return None

            content_root = ET.fromstring(epub_zip.read(content_opf_path))
            title_elem = content_root.find(
                ".//dc:title",
                {"dc": "http://purl.org/dc/elements/1.1/"},
            )
            if title_elem is not None and title_elem.text:
                return title_elem.text.strip()
    except Exception:
        # Deliberately broad: any problem here means "no title available".
        return None
    return None