|
|
""" |
|
|
Translation Service using NLLB-200 (Local Model) |
|
|
|
|
|
This service provides LOCAL translation between English and Indian languages. |
|
|
NO API CALLS - everything runs on your machine for FREE! |
|
|
|
|
|
Supported Languages: |
|
|
- English (eng) |
|
|
- Hindi (hin) |
|
|
- Tamil (tam) |
|
|
- Telugu (tel) |
|
|
- Kannada (kan) |
|
|
- Malayalam (mal) |
|
|
- Gujarati (guj) |
|
|
- Bengali (ben) |
|
|
- Marathi (mar) |
|
|
- Punjabi (pan) |
|
|
- Urdu (urd) |
|
|
|
|
|
Model Used: facebook/nllb-200-distilled-600M (~2.4GB) |
|
|
This is the smallest NLLB model, optimized for lower RAM usage. |
|
|
""" |
|
|
|
|
|
import logging |
|
|
from typing import Optional |
|
|
|
|
|
import torch |
|
|
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer |
|
|
from langdetect import detect, LangDetectException |
|
|
|
|
|
from config import ( |
|
|
NLLB_MODEL, |
|
|
LANGUAGE_MAP, |
|
|
SUPPORTED_LANGUAGES, |
|
|
MAX_TRANSLATION_LENGTH, |
|
|
get_nllb_code, |
|
|
get_language_name, |
|
|
is_english, |
|
|
) |
|
|
|
|
|
|
|
|
# Configure root logging so model-download/translation progress is visible.
# NOTE(review): calling basicConfig() in a library module overrides the host
# application's logging configuration if this module is imported first —
# consider moving this call to the application entry point.
logging.basicConfig(level=logging.INFO)

# Module-level logger named after this module, per stdlib convention.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
class TranslationService:
    """
    Service for translating text between languages using NLLB-200.

    The model (~2.4GB) is lazily loaded on the first translation request so
    application startup stays fast. All processing happens locally - no API
    costs!
    """

    # langdetect reports two-letter ISO 639-1 codes; map the supported ones
    # to the project's three-letter codes (see config.LANGUAGE_MAP).
    # Hoisted to a class constant so the dict is built once, not per call.
    _ISO_TO_PROJECT_CODE = {
        "en": "eng",
        "hi": "hin",
        "ta": "tam",
        "te": "tel",
        "kn": "kan",
        "ml": "mal",
        "gu": "guj",
        "bn": "ben",
        "mr": "mar",
        "pa": "pan",
        "ur": "urd",
    }

    def __init__(self, model_name: str = NLLB_MODEL):
        """
        Initialize the translation service.

        Args:
            model_name: Hugging Face model identifier for NLLB-200
        """
        self.model_name = model_name
        self._model = None        # loaded lazily by _load_model()
        self._tokenizer = None    # loaded lazily by _load_model()
        self._device = "cuda" if torch.cuda.is_available() else "cpu"

        logger.info("TranslationService initialized (device: %s)", self._device)

    def _load_model(self):
        """
        Load the NLLB-200 model and tokenizer.

        Called lazily on the first translation request; a no-op when the
        model is already loaded.

        Raises:
            RuntimeError: If the model or tokenizer cannot be loaded.
        """
        if self._model is not None:
            return

        logger.info("Loading NLLB-200 model: %s", self.model_name)
        logger.info("This may take a few minutes on first run (downloading ~2.4GB model)...")

        try:
            self._tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self._model = AutoModelForSeq2SeqLM.from_pretrained(
                self.model_name,
                torch_dtype=torch.float32,
                low_cpu_mem_usage=True,
            )
            self._model.to(self._device)
            logger.info("NLLB-200 model loaded successfully!")
        except Exception as e:
            logger.error("Failed to load NLLB-200 model: %s", e)
            # RuntimeError is a subclass of Exception, so callers catching
            # Exception keep working; chain the cause for easier debugging.
            raise RuntimeError(f"Could not load translation model: {str(e)}") from e

    def detect_language(self, text: str) -> dict:
        """
        Detect the language of the given text.

        Args:
            text: Text to detect language for

        Returns:
            Dictionary with:
            - code: Normalized language code (e.g., "hin")
            - name: Language name (e.g., "Hindi")
            - raw_code: The code reported by langdetect (e.g., "hi")

            Falls back to English when detection fails (e.g., empty or
            ambiguous input) rather than raising.
        """
        try:
            detected = detect(text)
        except LangDetectException as e:
            logger.warning("Language detection failed: %s", e)
            # Best-effort default: treat undetectable text as English.
            return {
                "code": "eng",
                "name": "English",
                "raw_code": "en",
            }

        # Unknown codes pass through unchanged so callers can inspect them.
        code = self._ISO_TO_PROJECT_CODE.get(detected, detected)
        name = get_language_name(code)

        logger.info("Detected language: %s (%s)", name, code)

        return {
            "code": code,
            "name": name,
            "raw_code": detected,
        }

    def translate(
        self,
        text: str,
        source_lang: str,
        target_lang: str,
        max_length: int = 1024
    ) -> str:
        """
        Translate text from source language to target language.

        Args:
            text: Text to translate
            source_lang: Source language code (e.g., "hin", "eng")
            target_lang: Target language code (e.g., "eng", "tam")
            max_length: Maximum output length (in tokens)

        Returns:
            Translated text

        Raises:
            ValueError: If language codes are invalid
            Exception: If translation fails
        """
        # Validate language codes before the (expensive) model load so bad
        # input fails fast. get_nllb_code raises ValueError for unknown
        # codes; it propagates unchanged (no pointless rewrap).
        source_nllb = get_nllb_code(source_lang)
        target_nllb = get_nllb_code(target_lang)

        self._load_model()

        logger.info("Translating from %s to %s", source_lang, target_lang)

        # Long inputs are split on sentence boundaries and translated
        # chunk by chunk.
        if len(text) > MAX_TRANSLATION_LENGTH:
            logger.info("Text too long (%d chars), chunking...", len(text))
            return self._translate_long_text(text, source_lang, target_lang, max_length)

        return self._translate_chunk(text, source_nllb, target_nllb, max_length)

    def _translate_chunk(
        self,
        text: str,
        source_nllb: str,
        target_nllb: str,
        max_length: int
    ) -> str:
        """
        Translate a single chunk that already fits the length limit.

        Args:
            text: Chunk of text to translate
            source_nllb: NLLB code of the source language
            target_nllb: NLLB code of the target language
            max_length: Maximum output length (in tokens)

        Returns:
            Translated text

        Raises:
            RuntimeError: If model inference fails.
        """
        try:
            # NLLB encodes the source language on the tokenizer itself.
            self._tokenizer.src_lang = source_nllb

            inputs = self._tokenizer(
                text,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_length
            )
            inputs = {k: v.to(self._device) for k, v in inputs.items()}

            # Force generation to begin with the target-language token.
            forced_bos_token_id = self._tokenizer.convert_tokens_to_ids(target_nllb)

            with torch.no_grad():
                outputs = self._model.generate(
                    **inputs,
                    forced_bos_token_id=forced_bos_token_id,
                    max_length=max_length,
                    num_beams=5,
                    length_penalty=1.0,
                    early_stopping=True
                )

            translated = self._tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

            logger.info("Translation complete (%d chars)", len(translated))

            return translated.strip()

        except Exception as e:
            logger.error("Translation failed: %s", e)
            raise RuntimeError(f"Could not translate text: {str(e)}") from e

    def _translate_long_text(
        self,
        text: str,
        source_lang: str,
        target_lang: str,
        max_length: int = 1024
    ) -> str:
        """
        Translate long text by splitting into chunks.

        BUG FIX: chunks were previously fed back through translate(), so a
        single sentence longer than MAX_TRANSLATION_LENGTH re-entered this
        method with the same text and recursed forever. Chunks are now
        hard-capped at the limit and translated directly.

        Args:
            text: Long text to translate
            source_lang: Source language code
            target_lang: Target language code
            max_length: Maximum output length per chunk

        Returns:
            Concatenated translated text
        """
        source_nllb = get_nllb_code(source_lang)
        target_nllb = get_nllb_code(target_lang)

        chunks = self._split_into_chunks(text)

        translated_chunks = []
        for i, chunk in enumerate(chunks):
            logger.info("Translating chunk %d/%d", i + 1, len(chunks))
            translated_chunks.append(
                self._translate_chunk(chunk, source_nllb, target_nllb, max_length)
            )

        return " ".join(translated_chunks)

    @staticmethod
    def _split_into_chunks(text: str) -> list:
        """
        Split text into chunks of at most MAX_TRANSLATION_LENGTH characters.

        Prefers sentence boundaries (normalizing Devanagari danda "।"/"॥"
        to "."); a single sentence longer than the limit is hard-split so no
        chunk can ever exceed MAX_TRANSLATION_LENGTH.

        Args:
            text: Text to split

        Returns:
            List of chunk strings
        """
        sentences = text.replace("।", ".").replace("॥", ".").split(".")

        chunks = []
        current_chunk = ""

        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue

            # Hard-split an oversize sentence into limit-sized slices.
            while len(sentence) > MAX_TRANSLATION_LENGTH:
                if current_chunk:
                    chunks.append(current_chunk)
                    current_chunk = ""
                chunks.append(sentence[:MAX_TRANSLATION_LENGTH])
                sentence = sentence[MAX_TRANSLATION_LENGTH:].strip()
            if not sentence:
                continue

            # +2 accounts for the ". " separator inserted below.
            if len(current_chunk) + len(sentence) + 2 > MAX_TRANSLATION_LENGTH:
                if current_chunk:
                    chunks.append(current_chunk)
                current_chunk = sentence
            else:
                current_chunk = current_chunk + ". " + sentence if current_chunk else sentence

        if current_chunk:
            chunks.append(current_chunk)

        return chunks

    def translate_to_english(self, text: str, source_lang: str) -> str:
        """
        Convenience method to translate text to English.

        Args:
            text: Text to translate
            source_lang: Source language code

        Returns:
            English translation (the input unchanged if already English)
        """
        if is_english(source_lang):
            return text

        return self.translate(text, source_lang, "eng")

    def translate_from_english(self, text: str, target_lang: str) -> str:
        """
        Convenience method to translate English text to another language.

        Args:
            text: English text to translate
            target_lang: Target language code

        Returns:
            Translated text in target language (the input unchanged if the
            target is English)
        """
        if is_english(target_lang):
            return text

        return self.translate(text, "eng", target_lang)

    def get_supported_languages(self) -> list:
        """
        Get list of supported languages.

        Returns:
            List of language dictionaries with code, name, and nllb_code
            (a copy, so callers cannot mutate the config).
        """
        return SUPPORTED_LANGUAGES.copy()

    def is_model_loaded(self) -> bool:
        """Check if the NLLB model is currently loaded."""
        return self._model is not None

    def warmup(self):
        """
        Pre-load the model to avoid delay on first request.
        Call this during application startup if desired.
        """
        logger.info("Warming up TranslationService...")
        self._load_model()
        logger.info("TranslationService warmup complete!")
|
|
|