Source code for vivre.cli

"""
Command-line interface for the vivre library.

This module provides a CLI for common tasks like reading EPUB files
and aligning parallel texts.
"""

import json
from pathlib import Path
from typing import Optional
from xml.sax.saxutils import escape as xml_escape

import typer
from rich.console import Console
from rich.json import JSON
from rich.panel import Panel
from rich.table import Table

from .api import align as align_api
from .api import read

# Create Typer app and console
# `app` is the root Typer application: the command functions and the callback
# defined further down in this module register themselves on it through the
# @app.command() / @app.callback() decorators.
# `console` is the single Rich console that every command in this module uses
# for its terminal output.
app = typer.Typer(
    name="vivre",
    help="A library for processing parallel texts",
    add_completion=False,
    # Docstrings used as help text contain Rich markup such as [bold]...[/bold].
    rich_markup_mode="rich",
)
console = Console()



[docs]
@app.command()
def align(
    source_epub: Path = typer.Argument(
        ...,
        help="Path to source language EPUB file",
        exists=True,
        file_okay=True,
        dir_okay=False,
    ),
    target_epub: Path = typer.Argument(
        ...,
        help="Path to target language EPUB file",
        exists=True,
        file_okay=True,
        dir_okay=False,
    ),
    language_pair: str = typer.Argument(
        ...,
        help="Language pair code (e.g., 'en-es', 'fr-en') - REQUIRED",
    ),
    method: str = typer.Option(
        "gale-church",
        "--method",
        "-m",
        help="Alignment method to use",
        case_sensitive=False,
    ),
    format: str = typer.Option(
        "json",
        "--format",
        "-f",
        help="Output format (json, text, csv, xml, dict)",
        case_sensitive=False,
    ),
    output: Optional[Path] = typer.Option(
        None,
        "--output",
        "-o",
        help="Output file path (default: stdout)",
        file_okay=True,
        dir_okay=False,
    ),
    c: Optional[float] = typer.Option(
        None,
        "--c",
        help="Gale-Church alignment parameter c",
    ),
    s2: Optional[float] = typer.Option(
        None,
        "--s2",
        help="Gale-Church alignment parameter s2",
    ),
    gap_penalty: Optional[float] = typer.Option(
        None,
        "--gap-penalty",
        help="Gale-Church gap penalty parameter",
    ),
    verbose: bool = typer.Option(
        False,
        "--verbose",
        "-v",
        help="Show detailed progress and statistics",
    ),
) -> None:
    """
    Align two EPUB files using the complete pipeline.

    This command parses both EPUB files, segments the text into sentences,
    and aligns them using the specified method. The language_pair parameter
    is required for accurate alignment.

    Examples:
        $ vivre align english.epub french.epub en-fr
        $ vivre align english.epub spanish.epub en-es --format csv
        $ vivre align english.epub french.epub en-fr --output result.json
    """
    # NOTE: the docstring above doubles as the CLI help text, so developer
    # notes live in comments rather than in the docstring.
    #
    # Exit behaviour: the command exits with status 1 (typer.Exit) when the
    # format or language pair is invalid, or when anything in the alignment /
    # rendering / writing steps raises.

    # Normalise once so that validation, rendering and printing all agree.
    output_format = format.lower()

    if verbose:
        console.print(
            Panel(
                f"[bold blue]Aligning EPUB files[/bold blue]\n"
                f"Source: [green]{source_epub}[/green]\n"
                f"Target: [green]{target_epub}[/green]\n"
                f"Language pair: [yellow]{language_pair}[/yellow]\n"
                f"Method: [cyan]{method}[/cyan]\n"
                f"Format: [magenta]{format}[/magenta]",
                title="[bold]Alignment Configuration[/bold]",
            )
        )

    # Validate format
    if output_format not in ("json", "dict", "text", "csv", "xml"):
        console.print(
            f"[red]Error:[/red] Invalid format '{format}'. "
            f"Use 'json', 'dict', 'text', 'csv', or 'xml'."
        )
        raise typer.Exit(1)

    # Validate language pair format (only the presence of a hyphen is checked;
    # anything stricter is left to the alignment API).
    if "-" not in language_pair:
        console.print(
            f"[red]Error:[/red] Invalid language pair '{language_pair}'. "
            f"Use format 'en-fr', 'es-en', etc."
        )
        raise typer.Exit(1)

    try:
        # Forward only the tuning parameters the user actually supplied so the
        # API keeps its own defaults for the rest.
        align_kwargs = {
            name: value
            for name, value in (("c", c), ("s2", s2), ("gap_penalty", gap_penalty))
            if value is not None
        }

        # Perform alignment
        if verbose:
            console.print("[yellow]Processing alignment...[/yellow]")

        result = align_api(
            source_epub,
            target_epub,
            language_pair,
            method,
            **align_kwargs,  # type: ignore[arg-type]
        )

        # Render the result. `output_format` was validated above, so the final
        # branch can only be "xml".
        if output_format == "json":
            output_text = result.to_json()
        elif output_format == "dict":
            output_text = json.dumps(result.to_dict(), indent=2, ensure_ascii=False)
        elif output_format == "text":
            output_text = result.to_text()
        elif output_format == "csv":
            output_text = result.to_csv()
        else:
            output_text = result.to_xml()

        # Output result
        if output:
            output.write_text(output_text, encoding="utf-8")
            console.print(f"[green]Results saved to:[/green] {output}")
        elif output_format == "json":
            console.print(JSON(output_text))
        else:
            # Book text is data, not Rich markup: print it verbatim. Without
            # these flags Rich would interpret "[...]" as style tags, replace
            # ":name:" emoji codes, colourise numbers and hard-wrap long lines,
            # all of which corrupt text/CSV/XML output.
            console.print(
                output_text,
                markup=False,
                highlight=False,
                emoji=False,
                soft_wrap=True,
            )

        if verbose:
            corpus_data = result.to_dict()
            chapters_by_key = corpus_data.get("chapters", {})
            total_alignments = sum(
                len(ch.get("alignments", [])) for ch in chapters_by_key.values()
            )
            book_title = corpus_data.get("book_title", "Unknown")
            lang_pair = corpus_data.get("language_pair", "Unknown")
            chapter_count = len(chapters_by_key)

            console.print(
                Panel(
                    f"[bold green]Alignment Complete![/bold green]\n"
                    f"Book: [cyan]{book_title}[/cyan]\n"
                    f"Language pair: [yellow]{lang_pair}[/yellow]\n"
                    f"Chapters: [magenta]{chapter_count}[/magenta]\n"
                    f"Total alignments: [blue]{total_alignments}[/blue]",
                    title="[bold]Summary[/bold]",
                )
            )

    except Exception as e:
        # Top-level CLI boundary: report the problem and exit non-zero, keeping
        # the original exception as the cause for debugging.
        console.print(f"[red]Error during alignment:[/red] {e}")
        raise typer.Exit(1) from e




[docs]
def _format_alignments_as_text(output_data: dict) -> str:
    """Render an alignment result dictionary as a plain-text report.

    The report starts with a header (book title, language pair, method and
    total number of alignments) followed by a 50-character ``=`` rule. Each
    alignment then contributes a blank line, a numbered source line and an
    indented target line, both labelled with the upper-cased language code.

    ``output_data["language_pair"]`` must split on ``-`` into exactly two
    parts; otherwise the tuple unpacking raises ``ValueError``.
    """
    header = [
        f"Book: {output_data['book_title']}",
        f"Language Pair: {output_data['language_pair']}",
        f"Method: {output_data['method']}",
        f"Total Alignments: {output_data['total_alignments']}",
        "=" * 50,
    ]

    src_code, tgt_code = output_data["language_pair"].split("-")
    src_label = src_code.upper()
    tgt_label = tgt_code.upper()

    body = [
        rendered
        for entry in output_data["alignments"]
        for rendered in (
            f"\n{entry['id']}. {src_label}: {entry['source']}",
            f"   {tgt_label}: {entry['target']}",
        )
    ]

    return "\n".join(header + body)




[docs]
def _format_alignments_as_csv(output_data: dict) -> str:
    """Format alignments as CSV with enhanced metadata.

    The output consists of two CSV tables separated by one empty line:

    1. a metadata table (header row plus one row with book title, language
       pair, method, source/target EPUB and total alignment count);
    2. an alignment table whose header is
       ``id,<source_lang>,<target_lang>,source_length,target_length`` followed
       by one row per alignment.

    Every data field is wrapped in double quotes and embedded double quotes
    are doubled. This applies to the metadata row as well, so a book title or
    file path containing ``"`` still yields well-formed CSV.

    Raises ``ValueError`` if ``output_data["language_pair"]`` does not split on
    ``-`` into exactly two parts, and ``KeyError`` on missing keys.
    """

    def quote_row(values) -> str:
        # Quote every field and double embedded quotes (standard CSV escaping).
        return ",".join('"' + str(value).replace('"', '""') + '"' for value in values)

    source_lang, target_lang = output_data["language_pair"].split("-")

    metadata_keys = (
        "book_title",
        "language_pair",
        "method",
        "source_epub",
        "target_epub",
        "total_alignments",
    )
    lines = [
        "book_title,language_pair,method,source_epub,target_epub,total_alignments",
        quote_row(output_data[key] for key in metadata_keys),
        "",  # Empty line to separate metadata from alignments
        f"id,{source_lang},{target_lang},source_length,target_length",
    ]

    alignment_keys = ("id", "source", "target", "source_length", "target_length")
    lines.extend(
        quote_row(alignment[key] for key in alignment_keys)
        for alignment in output_data["alignments"]
    )

    return "\n".join(lines)




[docs]
def _format_alignments_as_xml(output_data: dict) -> str:
    """Format alignments as XML.

    Produces an ``<alignments>`` document with the metadata elements
    (``book_title``, ``language_pair``, ``method``, ``source_epub``,
    ``target_epub``, ``total_alignments``) followed by one
    ``<alignment id="...">`` element per entry of ``output_data["alignments"]``
    holding ``source``, ``target``, ``source_length`` and ``target_length``.

    All element text and the ``id`` attribute are XML-escaped, so text
    containing ``&``, ``<``, ``>`` or ``"`` still gives a well-formed document.

    Raises ``KeyError`` if a required key is missing.
    """

    def text(value) -> str:
        # Escape &, < and > for use as element content.
        return xml_escape(str(value))

    def attr(value) -> str:
        # Attribute values are double-quoted, so quotes must be escaped too.
        return xml_escape(str(value), {'"': "&quot;"})

    xml_lines = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        "<alignments>",
        f"  <book_title>{text(output_data['book_title'])}</book_title>",
        f"  <language_pair>{text(output_data['language_pair'])}</language_pair>",
        f"  <method>{text(output_data['method'])}</method>",
        f"  <source_epub>{text(output_data['source_epub'])}</source_epub>",
        f"  <target_epub>{text(output_data['target_epub'])}</target_epub>",
        f"  <total_alignments>{text(output_data['total_alignments'])}</total_alignments>",
    ]

    for alignment in output_data["alignments"]:
        xml_lines.extend(
            [
                f'  <alignment id="{attr(alignment["id"])}">',
                f"    <source>{text(alignment['source'])}</source>",
                f"    <target>{text(alignment['target'])}</target>",
                f"    <source_length>{text(alignment['source_length'])}</source_length>",
                f"    <target_length>{text(alignment['target_length'])}</target_length>",
                # Closing tag is indented to match its opening tag.
                "  </alignment>",
            ]
        )

    xml_lines.append("</alignments>")

    return "\n".join(xml_lines)




[docs]
@app.command()
def parse(
    epub_path: Path = typer.Argument(
        ...,
        help="Path to the EPUB file to parse",
        exists=True,
        file_okay=True,
        dir_okay=False,
    ),
    show_content: bool = typer.Option(
        False,
        "--show-content",
        "-c",
        help="Show chapter content (can be very long)",
    ),
    max_chapters: Optional[int] = typer.Option(
        None,
        "--max-chapters",
        "-m",
        help="Maximum number of chapters to display",
    ),
    format: str = typer.Option(
        "json",
        "--format",
        "-f",
        help="Output format (json, dict, text, csv, xml)",
        case_sensitive=False,
    ),
    output: Optional[Path] = typer.Option(
        None,
        "--output",
        "-o",
        help="Output file path (default: stdout)",
        file_okay=True,
        dir_okay=False,
    ),
    segment: bool = typer.Option(
        False,
        "--segment",
        "-s",
        help="Segment chapters into sentences",
    ),
    language: Optional[str] = typer.Option(
        None,
        "--language",
        "-l",
        help="Language code for segmentation (auto-detected if not specified)",
    ),
    verbose: bool = typer.Option(
        False,
        "--verbose",
        "-v",
        help="Show detailed output",
    ),
) -> None:
    """
    Parse and analyze an EPUB file with comprehensive details.

    This command provides detailed analysis of EPUB files including metadata,
    chapter structure, content statistics, and optional sentence segmentation.
    It's the one-stop-shop for analyzing a single EPUB file.

    Examples:
        $ vivre parse book.epub
        $ vivre parse book.epub --show-content --max-chapters 3
        $ vivre parse book.epub --segment --language en --format csv
        $ vivre parse book.epub --verbose --output analysis.json
    """
    # NOTE: the docstring above doubles as the CLI help text, so developer
    # notes live in comments rather than in the docstring.
    #
    # Exit behaviour: the command exits with status 1 (typer.Exit) when the
    # format is invalid or when reading, segmenting, rendering or writing
    # raises.

    # Normalise once so that validation, rendering and printing all agree.
    output_format = format.lower()

    if verbose:
        console.print(
            Panel(
                f"[bold blue]Parsing EPUB file[/bold blue]\n"
                f"File: [green]{epub_path}[/green]\n"
                f"Format: [magenta]{format}[/magenta]\n"
                f"Show content: [yellow]{show_content}[/yellow]\n"
                f"Segment: [cyan]{segment}[/cyan]",
                title="[bold]Parse Configuration[/bold]",
            )
        )

    # Validate format
    if output_format not in ("json", "dict", "text", "csv", "xml"):
        console.print(
            f"[red]Error:[/red] Invalid format '{format}'. "
            f"Use 'json', 'dict', 'text', 'csv', or 'xml'."
        )
        raise typer.Exit(1)

    try:
        # Parse the EPUB
        if verbose:
            console.print("[yellow]Parsing EPUB file...[/yellow]")

        chapters = read(epub_path)

        # Segment if requested
        if segment:
            if verbose:
                console.print("[yellow]Segmenting chapters...[/yellow]")
            chapters.segment(language)

        # Prepare output data
        output_data: dict = {
            "file_path": str(epub_path),
            "book_title": chapters.book_title,
            "book_author": "Unknown",  # Could be enhanced to extract from metadata
            "book_language": language or "auto-detected",
            # Total number of chapters in the book, even when --max-chapters
            # limits how many are listed below.
            "chapter_count": len(chapters),
            "chapters": [],
        }

        # Process chapters
        for i, chapter in enumerate(chapters.chapters, 1):
            # A falsy limit (None or 0) means "no limit".
            if max_chapters and i > max_chapters:
                break

            chapter_data: dict = {
                "number": i,
                "title": chapter.title,
                "word_count": chapter.word_count,
                "character_count": chapter.char_count,
            }

            if show_content:
                chapter_data["content"] = chapter.content
            else:
                # Without --show-content only the first 200 characters are
                # kept, with "..." appended when the text was truncated.
                content = chapter.content
                chapter_data["content_preview"] = (
                    content[:200] + "..." if len(content) > 200 else content
                )

            # Add segmented sentences if available. The code reads index 1 of
            # the entry at the chapter's position; presumably entries are
            # (title, sentences) pairs in chapter order - TODO confirm.
            if segment and chapters._segmented_chapters:
                segmented_chapter = chapters._segmented_chapters[i - 1]
                chapter_data["sentences"] = segmented_chapter[1]

            output_data["chapters"].append(chapter_data)

        # Render. `output_format` was validated above; "json" and "dict" share
        # the same JSON serialisation.
        if output_format == "text":
            output_text = _format_parse_as_text(output_data)
        elif output_format == "csv":
            output_text = _format_parse_as_csv(output_data)
        elif output_format == "xml":
            output_text = _format_parse_as_xml(output_data)
        else:
            output_text = json.dumps(output_data, indent=2, ensure_ascii=False)

        # Output result
        if output:
            output.write_text(output_text, encoding="utf-8")
            console.print(f"[green]Results saved to:[/green] {output}")
        elif verbose:
            # In verbose mode without --output, a rich summary replaces the
            # raw formatted output.
            book_title = output_data["book_title"]
            file_path = output_data["file_path"]
            chapter_count = output_data["chapter_count"]
            book_language = output_data["book_language"]

            console.print(
                Panel(
                    f"[bold blue]Book Title:[/bold blue] {book_title}\n"
                    f"[bold blue]File Path:[/bold blue] {file_path}\n"
                    f"[bold blue]Chapters:[/bold blue] {chapter_count}\n"
                    f"[bold blue]Language:[/bold blue] {book_language}",
                    title="[bold green]Parse Summary[/bold green]",
                )
            )

            # Show chapter statistics
            if output_data["chapters"]:
                table = Table(title="Chapter Statistics")
                table.add_column("#", style="cyan", justify="right")
                table.add_column("Title", style="magenta")
                table.add_column("Words", style="yellow", justify="right")
                table.add_column("Chars", style="green", justify="right")

                for chapter in output_data["chapters"]:
                    title = chapter["title"]
                    if len(title) > 50:
                        title = title[:50] + "..."
                    table.add_row(
                        str(chapter["number"]),
                        title,
                        str(chapter["word_count"]),
                        str(chapter["character_count"]),
                    )

                console.print(table)

                if len(output_data["chapters"]) < output_data["chapter_count"]:
                    console.print(
                        f"[yellow]Note:[/yellow] Showing first "
                        f"{len(output_data['chapters'])} "
                        f"of {output_data['chapter_count']} chapters"
                    )
        elif output_format == "json":
            console.print(JSON(output_text))
        else:
            # Book text is data, not Rich markup: print it verbatim. Without
            # these flags Rich would interpret "[...]" as style tags, replace
            # ":name:" emoji codes, colourise numbers and hard-wrap long lines,
            # all of which corrupt dict/text/CSV/XML output.
            console.print(
                output_text,
                markup=False,
                highlight=False,
                emoji=False,
                soft_wrap=True,
            )

    except Exception as e:
        # Top-level CLI boundary: report the problem and exit non-zero, keeping
        # the original exception as the cause for debugging.
        console.print(f"[red]Error parsing EPUB:[/red] {e}")
        raise typer.Exit(1) from e




[docs]
def _format_parse_as_text(output_data: dict) -> str:
    """Render a parse result dictionary as a plain-text report.

    The report opens with file path, book title, author, language and chapter
    count, followed by a 50-character ``=`` rule. Every chapter then adds a
    blank line, its number and title, its word and character counts, and
    either a ``Preview:`` line (when ``content_preview`` is present) or a
    ``Content:`` line (when only ``content`` is present).
    """
    report = [
        f"File: {output_data['file_path']}",
        f"Book Title: {output_data['book_title']}",
        f"Author: {output_data['book_author']}",
        f"Language: {output_data['book_language']}",
        f"Chapters: {output_data['chapter_count']}",
        "=" * 50,
    ]

    for entry in output_data["chapters"]:
        report += [
            f"\nChapter {entry['number']}: {entry['title']}",
            f"Words: {entry['word_count']}",
            f"Characters: {entry['character_count']}",
        ]
        # The preview takes precedence over the full content.
        if "content_preview" in entry:
            report.append(f"Preview: {entry['content_preview']}")
            continue
        if "content" in entry:
            report.append(f"Content: {entry['content']}")

    return "\n".join(report)




[docs]
def _format_parse_as_csv(output_data: dict) -> str:
    """Format parse results as CSV.

    The output consists of two CSV tables separated by one empty line:

    1. a metadata table (header row plus one row with file path, book title,
       author, language and chapter count);
    2. a chapter table with the columns
       ``chapter_number,title,word_count,character_count,content_preview``.
       The preview column is empty for chapters that carry no
       ``content_preview`` key.

    Every data field is wrapped in double quotes and embedded double quotes
    are doubled. This applies to the metadata row as well, so a file path or
    title containing ``"`` still yields well-formed CSV.

    Raises ``KeyError`` if a required key is missing.
    """

    def quote_row(values) -> str:
        # Quote every field and double embedded quotes (standard CSV escaping).
        return ",".join('"' + str(value).replace('"', '""') + '"' for value in values)

    metadata_keys = (
        "file_path",
        "book_title",
        "book_author",
        "book_language",
        "chapter_count",
    )
    lines = [
        "file_path,book_title,book_author,book_language,chapter_count",
        quote_row(output_data[key] for key in metadata_keys),
        "",  # Empty line to separate metadata from chapters
        "chapter_number,title,word_count,character_count,content_preview",
    ]

    lines.extend(
        quote_row(
            (
                chapter["number"],
                chapter["title"],
                chapter["word_count"],
                chapter["character_count"],
                chapter.get("content_preview", ""),
            )
        )
        for chapter in output_data["chapters"]
    )

    return "\n".join(lines)




[docs]
def _format_parse_as_xml(output_data: dict) -> str:
    """Format parse results as XML.

    Produces an ``<epub_parse>`` document with the metadata elements
    (``file_path``, ``book_title``, ``book_author``, ``book_language``,
    ``chapter_count``) and a ``<chapters>`` element holding one
    ``<chapter number="...">`` per entry of ``output_data["chapters"]``. Each
    chapter contains ``title``, ``word_count``, ``character_count`` and either
    ``content_preview`` (preferred when present) or ``content``.

    All element text and the ``number`` attribute are XML-escaped, so chapter
    text containing ``&``, ``<``, ``>`` or ``"`` still gives a well-formed
    document.

    Raises ``KeyError`` if a required key is missing.
    """

    def text(value) -> str:
        # Escape &, < and > for use as element content.
        return xml_escape(str(value))

    def attr(value) -> str:
        # Attribute values are double-quoted, so quotes must be escaped too.
        return xml_escape(str(value), {'"': "&quot;"})

    xml_lines = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        "<epub_parse>",
        f"  <file_path>{text(output_data['file_path'])}</file_path>",
        f"  <book_title>{text(output_data['book_title'])}</book_title>",
        f"  <book_author>{text(output_data['book_author'])}</book_author>",
        f"  <book_language>{text(output_data['book_language'])}</book_language>",
        f"  <chapter_count>{text(output_data['chapter_count'])}</chapter_count>",
        "  <chapters>",
    ]

    for chapter in output_data["chapters"]:
        xml_lines.extend(
            [
                f'    <chapter number="{attr(chapter["number"])}">',
                f"      <title>{text(chapter['title'])}</title>",
                f"      <word_count>{text(chapter['word_count'])}</word_count>",
                f"      <character_count>{text(chapter['character_count'])}</character_count>",
            ]
        )

        # The preview takes precedence over the full content.
        if "content_preview" in chapter:
            xml_lines.append(
                f"      <content_preview>{text(chapter['content_preview'])}</content_preview>"
            )
        elif "content" in chapter:
            xml_lines.append(f"      <content>{text(chapter['content'])}</content>")

        xml_lines.append("    </chapter>")

    xml_lines.extend(["  </chapters>", "</epub_parse>"])

    return "\n".join(xml_lines)




[docs]
def _version_callback(value: Optional[bool]) -> None:
    """Print the version string and stop the CLI when ``--version`` was passed."""
    if value:
        typer.echo("vivre 0.1.0")
        # Exit cleanly so that no sub-command is required after --version.
        raise typer.Exit()


@app.callback()
def main(
    version: Optional[bool] = typer.Option(
        None,
        "--version",
        "-V",
        help="Show version and exit",
        callback=_version_callback,
        # Eager so the option is handled before the "missing command" check.
        is_eager=True,
    ),
) -> None:
    """
    Vivre - A library for processing parallel texts.

    This CLI provides two powerful commands for EPUB processing and text alignment:

    • [bold]parse[/bold] - Comprehensive EPUB analysis with metadata, structure,
      and optional segmentation
    • [bold]align[/bold] - Parallel text alignment using machine learning techniques

    Examples:
        $ vivre parse book.epub --verbose
        $ vivre align english.epub french.epub en-fr --format csv
        $ vivre --help
    """
    # The callback body is intentionally empty: it exists to carry the
    # top-level help text and the --version option.



# Script entry point: when this module is executed directly (rather than
# imported), hand control to the Typer application defined above.
if __name__ == "__main__":
    app()
Source code for vivre.cli

vivre

Navigation

Related Topics