# mon_tokenizer / upload_to_hub.py
# Commit 1f1b899 (janakhpon): feat: simplified mon tokenizer in hf format,
# updated tags, resolve the legacy issue
#!/usr/bin/env python3
"""
Upload Mon tokenizer to Hugging Face Hub.
This script provides functionality to validate and upload the Mon language tokenizer
to Hugging Face Hub with comprehensive validation and modern best practices.
Required files:
- `tokenizer_config.json` - Main tokenizer configuration
- `special_tokens_map.json` - Special token mappings
- `README.md` - Model documentation and usage instructions
- `.gitattributes` - Git LFS configuration for large files
Required tokenizer model files (at least one):
- `tokenizer.json` - Fast tokenizer (recommended for reliability)
- `tokenizer.model` - SentencePiece model file (slow tokenizer backup)
- `mon_tokenizer.model` - Legacy named SentencePiece model (deprecated)
Optional files:
- `generation_config.json` - Text generation configuration
- `vocab.txt` - Vocabulary file for certain tokenizer types
- `merges.txt` - BPE merge rules for certain tokenizer types
The script validates all files, tests functionality, and uploads only essential files
while excluding development artifacts (.env, .py scripts, caches, etc.).
"""
import logging
import os
from pathlib import Path
from typing import List, Optional
from huggingface_hub import HfApi, login, whoami
from transformers import AutoTokenizer
# Module-wide logging: timestamped INFO-level records emitted to the console.
_LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(
    format=_LOG_FORMAT,
    level=logging.INFO,
    handlers=[logging.StreamHandler()],
)
logger = logging.getLogger(__name__)
class TokenizerUploader:
    """Handles validation and upload of tokenizers to Hugging Face Hub."""

    def __init__(self, directory: str = "."):
        """
        Initialize the uploader.

        Args:
            directory: Directory containing the tokenizer files
        """
        self.directory = Path(directory)
        # Files that must always be present before an upload is attempted.
        self.required_files = [
            "tokenizer_config.json",
            "special_tokens_map.json",
            "README.md",
            ".gitattributes",
        ]
        # Optional but recommended files
        self.optional_files = [
            "generation_config.json",
            "vocab.txt",
            "merges.txt",
            "tokenizer.json",  # Fast tokenizer (becomes required if no .model file)
            "added_tokens.json",  # Additional tokens
            "preprocessor_config.json",  # Preprocessing configuration
        ]
        # Tokenizer model files - check for either fast or slow tokenizer
        self.model_files = ["tokenizer.json", "tokenizer.model", "mon_tokenizer.model"]

    def validate_files(self) -> bool:
        """
        Validate that all required files are present.

        Checks every entry in ``self.required_files``, logs any optional
        files that happen to exist, and requires at least ONE of the
        tokenizer model files in ``self.model_files``.

        Returns:
            bool: True if all files are present, False otherwise
        """
        logger.info(f"Validating tokenizer files in: {self.directory.absolute()}")
        missing_files = []
        # NOTE(review): present_files is appended to throughout but never read
        # afterwards — kept only for potential debugging / future reporting.
        present_files = []
        # Check regular required files
        for file_name in self.required_files:
            file_path = self.directory / file_name
            if file_path.exists():
                size = file_path.stat().st_size
                present_files.append((file_name, size))
                logger.info(f"✓ {file_name} ({size:,} bytes)")
            else:
                missing_files.append(file_name)
                logger.error(f"✗ {file_name} (missing)")
        # Check optional files (informational only — never added to missing_files)
        for file_name in self.optional_files:
            file_path = self.directory / file_name
            if file_path.exists():
                size = file_path.stat().st_size
                present_files.append((file_name, size))
                logger.info(f"✓ {file_name} ({size:,} bytes) [optional]")
        # Check for tokenizer model files - at least one must exist
        model_found = False
        found_models = []
        for model_name in self.model_files:
            model_path = self.directory / model_name
            if model_path.exists():
                size = model_path.stat().st_size
                present_files.append((model_name, size))
                found_models.append(model_name)
                logger.info(f"✓ {model_name} ({size:,} bytes)")
                model_found = True
        if not model_found:
            missing_files.append("tokenizer model file (tokenizer.json, tokenizer.model, or mon_tokenizer.model)")
            logger.error(f"✗ No tokenizer model file found (looked for: {', '.join(self.model_files)})")
        else:
            logger.info(f"✓ Found tokenizer model(s): {', '.join(found_models)}")
        if missing_files:
            logger.error(f"Missing required files: {', '.join(missing_files)}")
            return False
        # The whole group of model files counts as a single required item.
        total_required = len(self.required_files) + 1  # +1 for model file
        logger.info(f"✓ All {total_required} essential files present")
        return True

    def validate_tokenizer_functionality(self) -> bool:
        """
        Validate tokenizer functionality with comprehensive tests.

        Loads the tokenizer from ``self.directory`` (local files only, no
        remote code) and runs round-trip encode/decode checks on Mon-language
        samples.  Supports both fast (tokenizer.json) and slow
        (tokenizer.model) tokenizers.

        Returns:
            bool: True if all tests pass, False otherwise
        """
        logger.info("Validating tokenizer functionality")
        try:
            # Load tokenizer with explicit local files only
            abs_directory = str(self.directory.absolute())
            # Determine tokenizer type for better error handling
            has_fast = (self.directory / "tokenizer.json").exists()
            has_slow = any((self.directory / model).exists() for model in ["tokenizer.model", "mon_tokenizer.model"])
            if has_fast:
                logger.info("Detected fast tokenizer (tokenizer.json)")
            if has_slow:
                logger.info("Detected slow tokenizer (*.model)")
            tokenizer = AutoTokenizer.from_pretrained(
                abs_directory,
                local_files_only=True,
                trust_remote_code=False  # Security best practice
            )
            tokenizer_type = "fast" if tokenizer.is_fast else "slow"
            logger.info(f"✓ {tokenizer_type.capitalize()} tokenizer loaded (vocab: {tokenizer.vocab_size:,})")
            # Comprehensive test cases for Mon language
            test_cases = [
                {
                    "text": "ဘာသာမန်",
                    "description": "Simple Mon word"
                },
                {
                    "text": "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။",
                    "description": "Complex Mon sentence with punctuation"
                },
                {
                    "text": "မန်တံဂှ် မံင်ပ္ဍဲ ရးမန် ကဵု ရးသေံ။",
                    "description": "Mon text with geographical references"
                },
                {
                    "text": "၁၂၃၄၅ ဂတာပ်ခ္ဍာ် ၂၀၂၄ သၞာံ",
                    "description": "Mon numerals and dates"
                },
                {
                    "text": "",
                    "description": "Empty string test"
                }
            ]
            passed_tests = 0
            for i, test_case in enumerate(test_cases, 1):
                text = test_case["text"]
                description = test_case["description"]
                try:
                    # Test tokenization.
                    # NOTE(review): return_tensors="pt" means torch must be
                    # installed; .numel() below is a torch tensor method.
                    tokens = tokenizer(text, return_tensors="pt")
                    # Empty output for non-empty input: warn and skip this case.
                    if tokens["input_ids"].numel() == 0 and text:
                        logger.warning(f"⚠ Test {i}: Empty tokenization for non-empty text")
                        continue
                    # Test decoding
                    decoded = tokenizer.decode(
                        tokens["input_ids"][0],
                        skip_special_tokens=True
                    )
                    # Check round-trip accuracy
                    if text.strip() == decoded.strip():
                        logger.info(f"✓ Test {i}: {description} - PASSED")
                        passed_tests += 1
                    else:
                        logger.warning(f"⚠ Test {i}: {description} - Round-trip mismatch")
                        logger.warning(f"  Input:  '{text}'")
                        logger.warning(f"  Output: '{decoded}'")
                        # For some cases, minor differences might be acceptable
                        # A mismatch on non-empty text is skipped (not counted);
                        # an empty-string mismatch still counts as a pass.
                        if len(text.strip()) > 0:  # Don't fail on empty strings
                            continue
                        passed_tests += 1
                except Exception as e:
                    # Any exception in a single test case aborts validation.
                    logger.error(f"✗ Test {i}: {description} - FAILED: {e}")
                    return False
            # Check test results
            total_tests = len([tc for tc in test_cases if tc["text"]])  # Exclude empty string
            # NOTE(review): the threshold compares against total_tests (4) but
            # the log denominators use len(test_cases) (5) — confirm intended.
            if passed_tests >= total_tests - 1:  # Allow one test to fail
                logger.info(f"✓ Functionality validation passed ({passed_tests}/{len(test_cases)} tests)")
                return True
            else:
                logger.error(f"✗ Functionality validation failed ({passed_tests}/{len(test_cases)} tests passed)")
                return False
        except Exception as e:
            logger.error(f"✗ Tokenizer validation failed: {e}")
            return False

    def validate_tokenizer(self) -> bool:
        """
        Run complete tokenizer validation.

        Runs the file-presence check first, then the functional round-trip
        check; stops at the first failing stage.

        Returns:
            bool: True if validation passes, False otherwise
        """
        logger.info("=== Starting Tokenizer Validation ===")
        # Validate files
        if not self.validate_files():
            return False
        # Validate functionality
        if not self.validate_tokenizer_functionality():
            return False
        logger.info("✅ Tokenizer validation completed successfully")
        return True

    def check_authentication(self) -> Optional[str]:
        """
        Check Hugging Face authentication status.

        Uses ``huggingface_hub.whoami()`` — presumably resolves the locally
        cached HF token; verify against huggingface_hub docs.

        Returns:
            Optional[str]: Username if authenticated, None otherwise
        """
        try:
            user_info = whoami()
            username = user_info.get("name", "unknown")
            logger.info(f"✓ Authenticated as: {username}")
            return username
        except Exception:
            # whoami() raises when no (valid) token is available.
            logger.warning("Not authenticated with Hugging Face")
            return None

    def upload_to_hub(
        self,
        repo_id: str,
        private: bool = False,
        commit_message: str = "Upload Mon language tokenizer",
        create_pr: bool = False,
    ) -> bool:
        """
        Upload tokenizer to Hugging Face Hub.

        Validates the tokenizer, ensures authentication (prompting login if
        needed), creates the repository if missing, then uploads the whole
        directory minus ``ignore_patterns``.

        Args:
            repo_id: Repository ID (e.g., "username/model-name")
            private: Whether to create a private repository
            commit_message: Commit message for the upload
            create_pr: Whether to create a pull request instead of direct push

        Returns:
            bool: True if upload successful, False otherwise
        """
        logger.info(f"=== Starting Upload to {repo_id} ===")
        try:
            # Validate tokenizer first
            if not self.validate_tokenizer():
                logger.error("❌ Upload cancelled - validation failed")
                return False
            # Check authentication
            if not self.check_authentication():
                logger.info("Attempting to log in...")
                try:
                    # Interactive login prompt from huggingface_hub.
                    login()
                    if not self.check_authentication():
                        logger.error("❌ Authentication failed")
                        return False
                except Exception as e:
                    logger.error(f"❌ Login failed: {e}")
                    return False
            # Create API client
            api = HfApi()
            # Create/update repository (exist_ok=True makes this idempotent)
            logger.info(f"Creating/updating repository: {repo_id}")
            api.create_repo(
                repo_id=repo_id,
                private=private,
                exist_ok=True,
                repo_type="model"
            )
            logger.info("✓ Repository ready")
            # List files to upload
            upload_files = []
            # Patterns excluded from the upload — development artifacts only;
            # the tokenizer/config/doc files are intentionally not matched.
            ignore_patterns = [
                # Python compilation artifacts
                "*.pyc",
                "__pycache__/",
                "*.pyo",
                # Version control and development
                ".git/",
                ".gitignore",
                ".venv/",
                "venv/",
                ".env",
                ".env.*",
                ".python-version",
                # Build and dependency files
                "*.lock",
                "uv.lock",
                "Pipfile.lock",
                "poetry.lock",
                "pyproject.toml",
                "setup.py",
                "setup.cfg",
                "requirements.txt",
                "requirements-dev.txt",
                # Development and testing files
                "test_*",
                "tests/",
                "*_test.py",
                "sample_*",
                "example_*",
                "demo_*",
                "*_demo.py",
                # Build and conversion scripts
                "convert_*",
                "upload_*",
                "build_*",
                "text_processing_*",
                "*.py",  # Don't upload Python scripts
                # Dataset and training artifacts
                "datasets/",
                "data/",
                "checkpoints/",
                "logs/",
                "wandb/",
                # Temporary and cache files
                "*.tmp",
                "*.temp",
                ".cache/",
                "*.meta.json",
                "*.backup",
                # OS specific files
                ".DS_Store",
                "Thumbs.db",
                "desktop.ini"
            ]
            # Preview of what should be uploaded (logging only).
            # NOTE(review): this uses Path.match, whose semantics differ from
            # upload_folder's fnmatch-style ignore_patterns (e.g. patterns
            # ending in "/"), so the preview may not exactly match the actual
            # upload — confirm against huggingface_hub docs.
            logger.info("Files to be uploaded:")
            for file_path in self.directory.iterdir():
                if file_path.is_file() and not any(
                    file_path.match(pattern) for pattern in ignore_patterns
                ):
                    size = file_path.stat().st_size
                    upload_files.append(file_path.name)
                    logger.info(f"  ✓ {file_path.name} ({size:,} bytes)")
            # Upload files (the Hub applies ignore_patterns server-side filter)
            logger.info("Uploading to Hugging Face Hub...")
            api.upload_folder(
                folder_path=str(self.directory),
                repo_id=repo_id,
                commit_message=commit_message,
                ignore_patterns=ignore_patterns,
                create_pr=create_pr,
            )
            hub_url = f"https://huggingface.co/{repo_id}"
            logger.info(f"🎉 Upload successful!")
            logger.info(f"📍 Repository URL: {hub_url}")
            if create_pr:
                logger.info("📝 Pull request created for review")
            return True
        except Exception as e:
            logger.error(f"❌ Upload failed: {e}")
            return False
def main():
    """Command-line entry point for the upload script.

    Parses arguments, optionally prompts interactively when the default
    repository ID is in use, then either validates the tokenizer in place
    (``--validate-only``) or validates and uploads it.  Exits with status 0
    on success and 1 on failure.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Upload Mon tokenizer to Hugging Face Hub"
    )
    parser.add_argument(
        "--repo-id",
        default="janakhpon/mon_tokenizer",
        help="Repository ID (default: janakhpon/mon_tokenizer)",
    )
    parser.add_argument(
        "--directory",
        default=".",
        help="Directory containing tokenizer files (default: current directory)",
    )
    parser.add_argument(
        "--private",
        action="store_true",
        help="Create private repository",
    )
    parser.add_argument(
        "--message",
        default="Upload Mon language tokenizer",
        help="Commit message",
    )
    parser.add_argument(
        "--create-pr",
        action="store_true",
        help="Create pull request instead of direct push",
    )
    parser.add_argument(
        "--validate-only",
        action="store_true",
        help="Only validate tokenizer, don't upload",
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Enable verbose logging",
    )
    args = parser.parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Create uploader
    uploader = TokenizerUploader(directory=args.directory)

    if args.validate_only:
        # Only validate
        success = uploader.validate_tokenizer()
        # Bug fix: the previous message said "Validation completed." even
        # when validation failed; report the actual outcome instead.
        logger.info("Validation %s.", "succeeded" if success else "failed")
    else:
        # Interactive mode when the repo ID was left at its default value
        if args.repo_id == "janakhpon/mon_tokenizer":
            print("\n🤗 Mon Tokenizer Hub Uploader")
            print("=" * 40)
            repo_input = input(f"Repository ID [{args.repo_id}]: ").strip()
            if repo_input:
                args.repo_id = repo_input
            private_input = input("Private repository? (y/N): ").strip().lower()
            args.private = private_input == 'y'
            print(f"\nUploading to: {args.repo_id}")
            print(f"Private: {args.private}")
            print("-" * 40)
        # Upload tokenizer
        success = uploader.upload_to_hub(
            repo_id=args.repo_id,
            private=args.private,
            commit_message=args.message,
            create_pr=args.create_pr,
        )

    # Bug fix: raise SystemExit instead of the site-injected `exit()` helper,
    # which is absent under `python -S` and discouraged for scripts.
    raise SystemExit(0 if success else 1)


if __name__ == "__main__":
    main()