Source code for vivre.api

"""
Top-level API functions for the vivre library.

This module provides simple, user-friendly functions for common tasks:
- read(): Parse EPUB files and extract chapters
- align(): Align parallel texts and output in various formats
- quick_align(): Simple one-liner for basic alignment
- get_supported_languages(): Get list of supported languages
"""

import copy
import json
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
from xml.sax.saxutils import escape, quoteattr

from .integration import VivrePipeline
from .parser import Chapter, VivreParser
from .segmenter import Segmenter

# Module-level cache of VivrePipeline instances, reused by align() so that
# consecutive API calls in a single script run do not rebuild the pipeline.
# align() keys each entry by the language pair plus its JSON-serialised keyword
# arguments; clear_pipeline_cache() empties it. Nothing evicts entries, so the
# cache grows with the number of distinct configurations used.
_pipeline_cache: Dict[str, VivrePipeline] = {}



[docs]
class AlignmentResult:
    """
    A container for alignment results with multiple output format options.

    This class holds the aligned corpus data and provides methods to
    output it in various formats (dict, JSON, plain text, CSV, XML).
    """

    def __init__(self, corpus: Dict[str, Any]):
        """
        Initialize with aligned corpus data.

        Args:
            corpus: The aligned corpus dictionary. The formatters read the
                keys ``book_title``, ``language_pair`` and ``chapters``.
        """
        self._corpus = corpus

    def to_dict(self) -> Dict[str, Any]:
        """
        Return the corpus as a dictionary.

        Returns:
            An independent deep copy of the corpus. A shallow copy would
            share the nested ``chapters`` mapping with this object, so a
            caller editing the returned data would silently change what
            the other ``to_*`` methods emit.
        """
        return copy.deepcopy(self._corpus)

    def to_json(self, indent: int = 2) -> str:
        """
        Return the corpus as JSON string.

        Args:
            indent: Number of spaces used for pretty-printing.

        Returns:
            The JSON document; non-ASCII characters are kept verbatim.
        """
        return json.dumps(self._corpus, indent=indent, ensure_ascii=False)

    def to_text(self) -> str:
        """Return the corpus as formatted text."""
        return _format_as_text(self._corpus)

    def to_csv(self) -> str:
        """Return the corpus as CSV string."""
        return _format_as_csv(self._corpus)

    def to_xml(self) -> str:
        """Return the corpus as XML string."""
        return _format_as_xml(self._corpus)

    def __repr__(self) -> str:
        """String representation showing the book title and language pair."""
        return (
            f"AlignmentResult(book_title='{self._corpus.get('book_title', '')}', "
            f"language_pair='{self._corpus.get('language_pair', '')}')"
        )




[docs]
class Chapters:
    """
    Parsed chapters of a single book, with optional sentence segmentation.

    Behaves like a read-only sequence of ``Chapter`` objects (``len()``,
    indexing, iteration) and can additionally split every chapter into
    sentences via :meth:`segment`.
    """

    def __init__(self, chapters: List[Chapter], book_title: str = ""):
        """
        Store the parsed chapters and prepare a segmenter.

        Args:
            chapters: List of Chapter objects
            book_title: Title of the book
        """
        self.chapters = chapters
        self.book_title = book_title
        # Stays None until segment() has been called.
        self._segmented_chapters: Optional[List[Tuple[str, List[str]]]] = None
        self._segmenter = Segmenter()

    def segment(self, language: Optional[str] = None) -> "Chapters":
        """
        Split every chapter into sentences and remember the result.

        Args:
            language: Language code for segmentation (auto-detected if None)

        Returns:
            This same object, so calls can be chained
        """

        def split(chapter: Chapter) -> Tuple[str, List[str]]:
            sentences = self._segmenter.segment(chapter.content, language)
            return chapter.title, sentences

        self._segmented_chapters = [split(chapter) for chapter in self.chapters]
        return self

    def get_segmented(self) -> List[Tuple[str, List[str]]]:
        """
        Return the ``(title, sentences)`` pairs produced by :meth:`segment`.

        Raises:
            ValueError: If :meth:`segment` has not been called yet
        """
        segmented = self._segmented_chapters
        if segmented is not None:
            return segmented
        raise ValueError("Chapters must be segmented first. Call .segment() method.")

    def __len__(self) -> int:
        """Number of chapters held."""
        return len(self.chapters)

    def __getitem__(self, index: int) -> Chapter:
        """Chapter at position ``index``."""
        return self.chapters[index]

    def __iter__(self):
        """Iterator over the held chapters, in order."""
        return iter(self.chapters)

    def __repr__(self) -> str:
        """Short summary: book title and chapter count."""
        count = len(self.chapters)
        return f"Chapters(book_title='{self.book_title}', chapters={count})"




[docs]
def read(epub_path: Union[str, Path]) -> Chapters:
    """
    Parse an EPUB file and extract chapters.

    Args:
        epub_path: Path to the EPUB file

    Returns:
        Chapters object containing parsed chapters. Its ``book_title`` is
        taken from the parser's ``_book_title`` attribute when present,
        otherwise it is an empty string.

    Raises:
        FileNotFoundError: If the EPUB file doesn't exist
        ValueError: If the file cannot be parsed; the underlying error is
            attached as ``__cause__``

    Example:
        >>> chapters = vivre.read('path/to/epub')
        >>> print(f"Found {len(chapters)} chapters")
        >>> for chapter in chapters:
        ...     print(f"Chapter: {chapter.title}")
    """
    epub_path = Path(epub_path)
    if not epub_path.exists():
        raise FileNotFoundError(f"EPUB file not found: {epub_path}")

    parser = VivreParser()
    try:
        chapters = parser.parse_epub(epub_path)
        book_title = getattr(parser, "_book_title", "")
        return Chapters(chapters, book_title)
    except Exception as e:
        # Any failure is reported as ValueError (the documented contract);
        # chain the original so the real traceback is not lost.
        raise ValueError(f"Failed to parse EPUB file {epub_path}: {e}") from e




[docs]
def align(
    source: Union[str, Path, Chapters],
    target: Union[str, Path, Chapters],
    language_pair: str,
    method: str = "gale-church",
    _pipeline: Optional[VivrePipeline] = None,
    **kwargs: Any,
) -> AlignmentResult:
    """
    Align parallel EPUB files or Chapters objects and return an AlignmentResult.

    This function can accept either file paths or Chapters objects, making it
    flexible for different workflows. The language_pair parameter is required
    for accurate alignment.

    Args:
        source: Source language EPUB file path or Chapters object
        target: Target language EPUB file path or Chapters object
        language_pair: Language pair code (e.g., "en-fr", "es-en") - REQUIRED.
            Must be exactly two non-empty codes joined by a single hyphen.
        method: Alignment method (currently only "gale-church" supported)
        _pipeline: Optional pre-existing VivrePipeline instance for dependency
            injection. When omitted, a pipeline is taken from (or added to)
            the module-level cache, keyed by language pair and ``kwargs``.
        **kwargs: Additional arguments passed to the pipeline

    Returns:
        AlignmentResult object with methods for different output formats

    Raises:
        FileNotFoundError: If EPUB files don't exist (when using file paths)
        ValueError: If method is not supported, language_pair is invalid, or
            the alignment itself fails (original error attached as
            ``__cause__``)

    Example:
        # Using file paths
        >>> result = vivre.align('english.epub', 'french.epub', 'en-fr')
        >>> print(result.to_json())
        >>> print(result.to_csv())

        # Using Chapters objects (seamless workflow)
        >>> source_chapters = vivre.read('english.epub')
        >>> target_chapters = vivre.read('french.epub')
        >>> result = vivre.align(source_chapters, target_chapters, 'en-fr')
        >>> print(result.to_text())

        # Using dependency injection for better performance
        >>> pipeline = VivrePipeline('en-fr')
        >>> result = vivre.align(
        ...     source_chapters, target_chapters, 'en-fr', _pipeline=pipeline
        ... )
        >>> print(result.to_dict())

        # Get as dictionary for programmatic access
        >>> data = result.to_dict()
        >>> print(f"Found {len(data['chapters'])} chapters")
    """
    if method != "gale-church":
        raise ValueError(
            f"Method '{method}' not supported. Only 'gale-church' is available."
        )

    # Validate language pair format up front. Downstream code unpacks
    # language_pair.split("-") into exactly two names, so anything else
    # (e.g. "en-fr-de") would only fail later with an opaque unpack error,
    # after both EPUBs had already been parsed.
    pair_parts = language_pair.split("-") if isinstance(language_pair, str) else []
    if len(pair_parts) != 2 or not all(pair_parts):
        raise ValueError(
            f"Invalid language_pair: '{language_pair}'. "
            f"Use format 'en-fr', 'es-en', etc."
        )

    # Parse source and target based on their types
    source_chapters, source_title = _parse_source_or_chapters(source, "source")
    target_chapters, target_title = _parse_source_or_chapters(target, "target")

    # Use the provided pipeline or get one from the cache/create a new one
    pipeline = _pipeline
    if pipeline is None:
        # default=repr keeps key generation from raising TypeError when a
        # keyword argument is not JSON-serialisable (e.g. a Path or object).
        cache_key = (
            f"{language_pair}-{json.dumps(kwargs, sort_keys=True, default=repr)}"
        )
        if cache_key not in _pipeline_cache:
            _pipeline_cache[cache_key] = VivrePipeline(language_pair, **kwargs)
        pipeline = _pipeline_cache[cache_key]

    # Get book title (prefer source title, fallback to target)
    book_title = source_title or target_title

    # Process chapters and create aligned corpus
    try:
        aligned_corpus = _create_aligned_corpus(
            source_chapters, target_chapters, pipeline, book_title, language_pair
        )
        return AlignmentResult(aligned_corpus)
    except Exception as e:
        # Callers only need to handle ValueError; keep the cause for debugging.
        raise ValueError(f"Failed to align texts: {e}") from e




[docs]
def quick_align(
    source_epub: Union[str, Path],
    target_epub: Union[str, Path],
    language_pair: str,
) -> List[Tuple[str, str]]:
    """
    Quick alignment function that returns simple sentence pairs.

    This is a convenience function for simple use cases where you just
    need sentence pairs without the full corpus structure.

    Args:
        source_epub: Path to source language EPUB
        target_epub: Path to target language EPUB
        language_pair: Language pair code (e.g., "en-fr", "es-en") - REQUIRED.
            Must be exactly two non-empty codes joined by a single hyphen.

    Returns:
        List of (source_sentence, target_sentence) tuples, in chapter order

    Raises:
        FileNotFoundError: If either EPUB file doesn't exist
        ValueError: If language_pair is invalid or alignment fails

    Example:
        >>> pairs = vivre.quick_align('english.epub', 'french.epub', 'en-fr')
        >>> for source, target in pairs[:3]:
        ...     print(f"EN: {source}")
        ...     print(f"FR: {target}")
    """
    # Validate language pair format. The two codes are used below as the
    # keys of every alignment dict, so exactly two non-empty codes are needed.
    pair_parts = language_pair.split("-") if isinstance(language_pair, str) else []
    if len(pair_parts) != 2 or not all(pair_parts):
        raise ValueError(
            f"Invalid language_pair: '{language_pair}'. "
            f"Use format 'en-fr', 'es-en', etc."
        )
    # Split once here rather than once per alignment inside the loop.
    source_lang, target_lang = pair_parts

    # Use the main align function and extract sentence pairs
    corpus = align(source_epub, target_epub, language_pair).to_dict()

    return [
        (alignment[source_lang], alignment[target_lang])
        for chapter_data in corpus["chapters"].values()
        for alignment in chapter_data["alignments"]
    ]




[docs]
def get_supported_languages() -> List[str]:
    """
    List the language codes the segmenter knows about.

    Returns:
        The keys of a freshly created Segmenter's ``_supported_languages``
        mapping, as a list.

    Example:
        >>> languages = vivre.get_supported_languages()
        >>> print(f"Supported languages: {languages}")
    """
    language_table = Segmenter()._supported_languages
    return [*language_table.keys()]




[docs]
def clear_pipeline_cache() -> None:
    """
    Drop every pipeline held in the module-level cache.

    Handy in tests, or whenever the memory held by cached pipelines
    should be released.

    Example:
        >>> vivre.clear_pipeline_cache()
    """
    # Empties the shared dict in place; the module keeps the same object.
    _pipeline_cache.clear()




[docs]
def _create_aligned_corpus(
    source_chapters: List[Chapter],
    target_chapters: List[Chapter],
    pipeline: VivrePipeline,
    book_title: str,
    language_pair: str,
) -> Dict[str, Any]:
    """
    Build the aligned corpus dictionary from two chapter lists.

    Chapters are paired by position. Because the pairing uses ``zip``,
    any surplus chapters in the longer list are left out of the result.

    Args:
        source_chapters: Chapters of the source-language book
        target_chapters: Chapters of the target-language book
        pipeline: Provides ``segmenter.segment`` and ``aligner.align``
        book_title: Title stored under ``book_title``
        language_pair: ``"<source>-<target>"``; the two codes become the
            keys of each alignment entry

    Returns:
        ``{"book_title", "language_pair", "chapters"}`` where ``chapters``
        maps ``"1"``, ``"2"``, ... to ``{"title", "alignments"}``
    """
    source_lang, target_lang = language_pair.split("-")

    chapters: Dict[str, Any] = {}
    paired = zip(source_chapters, target_chapters)
    for number, (src, tgt) in enumerate(paired, start=1):
        # Split each side into sentences, then let the aligner pair them up.
        src_sentences = pipeline.segmenter.segment(src.content)
        tgt_sentences = pipeline.segmenter.segment(tgt.content)
        sentence_pairs = pipeline.aligner.align(src_sentences, tgt_sentences)

        chapters[str(number)] = {
            "title": src.title,  # the source chapter's title is the primary one
            "alignments": [
                {source_lang: src_sent, target_lang: tgt_sent}
                for src_sent, tgt_sent in sentence_pairs
            ],
        }

    return {
        "book_title": book_title,
        "language_pair": language_pair,
        "chapters": chapters,
    }




[docs]
def _format_as_text(corpus: Dict[str, Any]) -> str:
    """
    Format corpus as plain text.

    Args:
        corpus: Aligned corpus with ``book_title``, ``language_pair`` and
            ``chapters`` keys

    Returns:
        A header (title, language pair, rule) followed by one section per
        chapter. Each alignment is a numbered source line, an indented
        target line and a blank line.

    Raises:
        ValueError: If ``language_pair`` is not of the form ``"xx-yy"``
    """
    lines = [
        f"Book: {corpus['book_title']}",
        f"Language Pair: {corpus['language_pair']}",
        "=" * 50,
    ]

    # Split once up front, as the CSV and XML formatters do, instead of
    # re-splitting the same string for every alignment.
    source_lang, target_lang = corpus["language_pair"].split("-")
    source_label = source_lang.upper()
    target_label = target_lang.upper()

    for chapter_num, chapter_data in corpus["chapters"].items():
        lines.append(f"\nChapter {chapter_num}: {chapter_data['title']}")
        lines.append("-" * 30)

        for i, alignment in enumerate(chapter_data["alignments"], 1):
            lines.append(f"{i}. {source_label}: {alignment[source_lang]}")
            lines.append(f"   {target_label}: {alignment[target_lang]}")
            lines.append("")

    return "\n".join(lines)




[docs]
def _format_as_csv(corpus: Dict[str, Any]) -> str:
    """
    Render the corpus as CSV text.

    The header row is ``chapter,title,<source>,<target>`` (unquoted).
    Every data row has all four fields wrapped in double quotes, with
    embedded double quotes doubled in the title and sentence fields.
    Rows are joined with ``\\n`` and there is no trailing newline.
    """

    def quoted(value: str) -> str:
        # CSV escaping: double any embedded quote, then wrap in quotes.
        return '"' + value.replace('"', '""') + '"'

    source_lang, target_lang = corpus["language_pair"].split("-")
    rows = [f"chapter,title,{source_lang},{target_lang}"]

    for chapter_num, chapter_data in corpus["chapters"].items():
        title_cell = quoted(chapter_data["title"])
        chapter_cell = f'"{chapter_num}"'

        for alignment in chapter_data["alignments"]:
            cells = (
                chapter_cell,
                title_cell,
                quoted(alignment[source_lang]),
                quoted(alignment[target_lang]),
            )
            rows.append(",".join(cells))

    return "\n".join(rows)




[docs]
def _format_as_xml(corpus: Dict[str, Any]) -> str:
    """
    Format corpus as XML.

    All text content is XML-escaped, so titles or sentences containing
    ``&``, ``<`` or ``>`` still yield a well-formed document.

    Args:
        corpus: Aligned corpus with ``book_title``, ``language_pair`` and
            ``chapters`` keys

    Returns:
        An XML document whose root ``<alignments>`` element holds the book
        metadata, a ``<total_alignments>`` count of sentence alignments
        across all chapters, and one ``<chapter>`` element per chapter.

    Raises:
        ValueError: If ``language_pair`` is not of the form ``"xx-yy"``
    """
    # NOTE(review): the language codes are used verbatim as element names;
    # this assumes they are valid XML names (e.g. "en", "fr").
    source_lang, target_lang = corpus["language_pair"].split("-")
    chapters = corpus["chapters"]

    def text(value: Any) -> str:
        # Escape &, < and > so arbitrary book text cannot break the markup.
        return escape(str(value))

    # Count sentence alignments, not chapters, to match the element name.
    total_alignments = sum(
        len(chapter_data["alignments"]) for chapter_data in chapters.values()
    )

    xml_lines = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        "<alignments>",
        f"  <book_title>{text(corpus['book_title'])}</book_title>",
        f"  <language_pair>{text(corpus['language_pair'])}</language_pair>",
        f"  <total_alignments>{total_alignments}</total_alignments>",
    ]

    for chapter_num, chapter_data in chapters.items():
        xml_lines.extend(
            [
                # quoteattr adds the surrounding quotes and escapes the value.
                f"  <chapter number={quoteattr(str(chapter_num))}>",
                f"    <title>{text(chapter_data['title'])}</title>",
                "    <alignments>",
            ]
        )

        for alignment in chapter_data["alignments"]:
            source_text = text(alignment[source_lang])
            target_text = text(alignment[target_lang])
            xml_lines.extend(
                [
                    "      <alignment>",
                    f"        <{source_lang}>{source_text}</{source_lang}>",
                    f"        <{target_lang}>{target_text}</{target_lang}>",
                    "      </alignment>",
                ]
            )

        xml_lines.extend(["    </alignments>", "  </chapter>"])

    xml_lines.append("</alignments>")
    return "\n".join(xml_lines)




[docs]
def _parse_source_or_chapters(
    source: Union[str, Path, Chapters], name: str
) -> Tuple[List[Chapter], str]:
    """
    Parse source or target, whether it's a file path or Chapters object.

    Args:
        source: File path or Chapters object
        name: Name for error messages ("source" or "target")

    Returns:
        Tuple of (chapters, book_title). For a Chapters object these are its
        own attributes. For a file path the title comes from the EPUB's
        ``dc:title`` metadata, falling back to the file name without
        extension when the metadata cannot be read.

    Raises:
        FileNotFoundError: If ``source`` is a path that does not exist
    """
    if isinstance(source, Chapters):
        # Already parsed Chapters object
        return source.chapters, source.book_title

    # File path - parse the EPUB
    source_path = Path(source)
    if not source_path.exists():
        raise FileNotFoundError(
            f"{name.capitalize()} EPUB file not found: {source_path}"
        )

    chapters = VivreParser().parse_epub(source_path)
    book_title = _read_epub_title(source_path, default=source_path.stem)
    return chapters, book_title


def _read_epub_title(epub_path: Path, default: str) -> str:
    """Best-effort read of ``dc:title`` from an EPUB; ``default`` on any failure."""
    try:
        import zipfile

        # Imported inside the try on purpose: if defusedxml is unavailable
        # the title silently falls back to ``default``, as before.
        from defusedxml import ElementTree as ET

        with zipfile.ZipFile(epub_path, "r") as epub_zip:
            # container.xml names the package document (content.opf).
            container_root = ET.fromstring(epub_zip.read("META-INF/container.xml"))
            rootfile_elem = container_root.find(
                './/container:rootfile[@media-type="application/oebps-package+xml"]',
                {"container": "urn:oasis:names:tc:opendocument:xmlns:container"},
            )
            if rootfile_elem is None:
                return default

            content_opf_path = rootfile_elem.get("full-path")
            if not content_opf_path:
                return default

            content_root = ET.fromstring(epub_zip.read(content_opf_path))

        # Extract book title from dc:title
        title_elem = content_root.find(
            ".//dc:title",
            {"dc": "http://purl.org/dc/elements/1.1/"},
        )
        if title_elem is not None and title_elem.text:
            return title_elem.text.strip()
    except Exception:
        # Deliberately broad: title extraction is optional, so a malformed
        # archive or XML must never make chapter parsing fail.
        return default
    return default
Source code for vivre.api

vivre

Navigation

Related Topics