"""
Text segmentation module for the vivre library.
This module provides functionality to segment text into sentences or other units.
"""
from typing import List, Optional
import langdetect
import spacy
from spacy.language import Language
[docs]
class Segmenter:
    """
    A text segmenter that splits text into sentences using spaCy models.

    The language is either supplied by the caller or detected with
    ``langdetect``; the matching spaCy pipeline is loaded on first use and
    cached for the lifetime of the instance.

    Batch Processing:
        - segment_batch(): For single-language batches (requires explicit language)
        - segment_mixed_batch(): For mixed-language batches (auto-detects languages)

    Note: Some languages (Arabic, Hindi, Thai) use a general-purpose multilingual
    model (xx_ent_wiki_sm) which may provide lower segmentation accuracy compared
    to dedicated language models. For higher accuracy with these languages, consider
    using larger (_lg) or transformer (_trf) spaCy models if available.
    """

    # Region/script-qualified codes folded onto the plain codes used as keys
    # of ``_supported_languages``.
    _LANGUAGE_ALIASES = {
        "zh-cn": "zh",  # Chinese (Simplified)
        "zh-tw": "zh",  # Chinese (Traditional)
        "zh-hans": "zh",  # Chinese (Simplified)
        "zh-hant": "zh",  # Chinese (Traditional)
        "ja-jp": "ja",  # Japanese
        "ko-kr": "ko",  # Korean
        "ar-sa": "ar",  # Arabic (Saudi Arabia)
        "hi-in": "hi",  # Hindi (India)
        "th-th": "th",  # Thai (Thailand)
    }

    # Pipeline components switched off when a model is loaded; sentence
    # boundaries are supplied by the "sentencizer" component instead.
    _DISABLED_COMPONENTS = (
        "tagger",
        "parser",
        "ner",
        "lemmatizer",
        "attribute_ruler",
    )

    def __init__(self) -> None:
        """Initialize the Segmenter instance with an empty model cache."""
        # Loaded pipelines keyed by spaCy model name (not language code), so
        # languages that share a model also share one loaded pipeline.
        self._models: dict[str, "Language"] = {}
        # Language code -> spaCy model name.
        self._supported_languages = {
            "en": "en_core_web_sm",
            "es": "es_core_news_sm",
            "fr": "fr_core_news_sm",
            "de": "de_core_news_sm",
            "it": "it_core_news_sm",
            "pt": "pt_core_news_sm",
            "nl": "nl_core_news_sm",
            "pl": "pl_core_news_sm",
            "ru": "ru_core_news_sm",
            "ja": "ja_core_news_sm",
            "zh": "zh_core_web_sm",
            "ko": "ko_core_news_sm",
            "ar": "xx_ent_wiki_sm",  # Arabic uses multilingual model
            "hi": "xx_ent_wiki_sm",  # Hindi uses multilingual model
            "th": "xx_ent_wiki_sm",  # Thai uses multilingual model
        }

    @staticmethod
    def _collect_sentences(doc) -> List[str]:
        """Return the stripped, non-empty sentence texts of a processed doc."""
        stripped = (sent.text.strip() for sent in doc.sents)
        return [sentence for sentence in stripped if sentence]

    def _detect_language(self, text: str) -> str:
        """
        Detect the language of the given text using langdetect.

        Args:
            text: The text to detect language for.

        Returns:
            A supported language code (e.g., 'en', 'es', 'fr'). Falls back to
            'en' when detection fails or the detected language is neither
            supported nor a known alias of a supported one.
        """
        try:
            detected = langdetect.detect(text)
        except Exception:
            # Deliberate best-effort: any detection failure (this includes
            # langdetect.LangDetectException) falls back to English rather
            # than aborting segmentation.
            return "en"
        if detected in self._supported_languages:
            return detected
        # Fold code variations onto a supported code; default to English.
        return self._LANGUAGE_ALIASES.get(detected, "en")

    def _load_model(self, lang_code: str) -> "Language":
        """
        Load or get cached spaCy model for the given language.

        Args:
            lang_code: Language code (e.g., 'en', 'es', 'fr').

        Returns:
            Loaded spaCy language model with a "sentencizer" in its pipeline.

        Raises:
            ValueError: If the language code is not supported.
            OSError: If the model is not installed.
        """
        if lang_code not in self._supported_languages:
            raise ValueError(f"Unsupported language: {lang_code}")
        model_name = self._supported_languages[lang_code]

        # Reuse the pipeline if this model was already loaded, possibly for
        # another language code that maps to the same model.
        if model_name in self._models:
            return self._models[model_name]

        try:
            model = spacy.load(model_name, disable=list(self._DISABLED_COMPONENTS))
        except OSError as err:
            # Keep the original error as the cause while adding install help.
            raise OSError(
                f"spaCy model '{model_name}' not found. "
                f"Install it with: python -m spacy download {model_name}"
            ) from err

        # The parser is disabled above, so sentence boundaries must come from
        # the sentencizer component.
        if "sentencizer" not in model.pipe_names:
            model.add_pipe("sentencizer")
        self._models[model_name] = model
        return model

    def segment(self, text: str, language: Optional[str] = None) -> List[str]:
        """
        Segment text into sentences using spaCy models.

        Args:
            text: The text to segment.
            language: Optional language code (e.g., 'en', 'es', 'fr').
                If provided, this language is used as-is (after validation).
                If None, the language is auto-detected using langdetect.

        Returns:
            List of sentence segments; empty if ``text`` is None, empty or
            whitespace-only.

        Raises:
            OSError: If the required spaCy model is not installed.
            ValueError: If the language is not supported.
        """
        if not text or not text.strip():
            return []

        if language is None:
            lang_code = self._detect_language(text)
        elif self.is_language_supported(language):
            lang_code = language
        else:
            raise ValueError(f"Unsupported language: {language}")

        nlp = self._load_model(lang_code)
        return self._collect_sentences(nlp(text.strip()))

    def segment_batch(self, texts: List[str], language: str) -> List[List[str]]:
        """
        Segment multiple same-language texts using spaCy's pipe() method.

        IMPORTANT: All texts in the batch must be of the same language.
        Mixed-language batches are not supported and will result in incorrect
        segmentation. Use separate batch calls for different languages, or
        segment_mixed_batch().

        Args:
            texts: List of texts to segment.
            language: Language code (e.g., 'en', 'es', 'fr').
                All texts in the batch must be of this language.

        Returns:
            One list of sentence segments per input text, in input order.
            Entries that are None, empty or whitespace-only yield an empty
            list so that the output stays aligned with ``texts``. An empty
            ``texts`` list returns an empty list.

        Raises:
            OSError: If the required spaCy model is not installed.
            ValueError: If the language is not supported.
        """
        if not texts:
            return []
        if not self.is_language_supported(language):
            raise ValueError(f"Unsupported language: {language}")

        # One slot per input keeps results aligned even when blanks are skipped.
        results: List[List[str]] = [[] for _ in texts]
        pending = [
            (index, text.strip())
            for index, text in enumerate(texts)
            if text and text.strip()
        ]
        if not pending:
            # Nothing to process, so avoid loading a model at all.
            return results

        nlp = self._load_model(language)
        positions = [index for index, _ in pending]
        docs = nlp.pipe([cleaned for _, cleaned in pending])
        for index, doc in zip(positions, docs):
            results[index] = self._collect_sentences(doc)
        return results

    def segment_mixed_batch(self, texts: List[str]) -> List[List[str]]:
        """
        Segment multiple texts that may be in different languages.

        The language of each non-blank text is detected, texts are grouped by
        detected language, and each group is processed with segment_batch().

        Args:
            texts: List of texts to segment (can be in different languages).

        Returns:
            One list of sentence segments per input text, in the same order.
            Blank entries yield an empty list. An empty ``texts`` list returns
            an empty list.

        Raises:
            OSError: If required spaCy models are not installed.
        """
        if not texts:
            return []

        # Detected language code -> [(original index, text), ...]
        groups: dict[str, list[tuple[int, str]]] = {}
        for index, text in enumerate(texts):
            if not text or not text.strip():
                continue  # keeps its pre-filled empty result below
            # _detect_language only returns supported codes, so every group
            # key is valid for segment_batch().
            groups.setdefault(self._detect_language(text), []).append((index, text))

        results: List[List[str]] = [[] for _ in texts]
        for lang_code, items in groups.items():
            positions = [index for index, _ in items]
            group_texts = [text for _, text in items]
            group_results = self.segment_batch(group_texts, lang_code)
            # Place results back in their original positions.
            for index, sentences in zip(positions, group_results):
                results[index] = sentences
        return results

    def get_supported_languages(self) -> List[str]:
        """
        Get list of supported language codes.

        Note: Some languages (Arabic, Hindi, Thai) use a general-purpose
        multilingual model (xx_ent_wiki_sm) which may provide lower
        segmentation accuracy compared to dedicated language models.

        Returns:
            List of supported language codes.
        """
        return list(self._supported_languages)

    def is_language_supported(self, language: str) -> bool:
        """
        Check if a language is supported.

        Args:
            language: Language code to check.

        Returns:
            True if language is supported, False otherwise.
        """
        return language in self._supported_languages