#!/usr/bin/env python3
"""
Upload Mon tokenizer to Hugging Face Hub.

This script validates the Mon language tokenizer stored in a local directory
and uploads it to the Hugging Face Hub.

Required files:
- `tokenizer_config.json` - Main tokenizer configuration
- `special_tokens_map.json` - Special token mappings
- `README.md` - Model documentation and usage instructions
- `.gitattributes` - Git LFS configuration for large files

Required tokenizer model files (at least one):
- `tokenizer.json` - Fast tokenizer (recommended for reliability)
- `tokenizer.model` - SentencePiece model file (slow tokenizer backup)
- `mon_tokenizer.model` - Legacy named SentencePiece model (deprecated)

Optional files:
- `generation_config.json` - Text generation configuration
- `vocab.txt` - Vocabulary file for certain tokenizer types
- `merges.txt` - BPE merge rules for certain tokenizer types

The script checks that the files exist, runs round-trip encode/decode tests,
and uploads the directory while excluding development artifacts (.env, .py
scripts, caches, etc.) through a list of ignore patterns.
"""

import logging
import os
from pathlib import Path
from typing import List, Optional

from huggingface_hub import HfApi, login, whoami
from transformers import AutoTokenizer

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler()],
)
logger = logging.getLogger(__name__)

# Repository used when --repo-id is not given on the command line.
DEFAULT_REPO_ID = "janakhpon/mon_tokenizer"

# (text, description) pairs used for the encode/decode round-trip check.
_ROUND_TRIP_CASES = [
    ("ဘာသာမန်", "Simple Mon word"),
    (
        "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။",
        "Complex Mon sentence with punctuation",
    ),
    (
        "မန်တံဂှ် မံင်ပ္ဍဲ ရးမန် ကဵု ရးသေံ။",
        "Mon text with geographical references",
    ),
    ("၁၂၃၄၅ ဂတာပ်ခ္ဍာ် ၂၀၂၄ သၞာံ", "Mon numerals and dates"),
    ("", "Empty string test"),
]


class TokenizerUploader:
    """Handles validation and upload of tokenizers to Hugging Face Hub."""

    # Glob patterns handed to ``HfApi.upload_folder(ignore_patterns=...)`` and
    # also used for the local "files to be uploaded" preview.
    IGNORE_PATTERNS = [
        # Python compilation artifacts
        "*.pyc",
        "__pycache__/",
        "*.pyo",
        # Version control and development
        ".git/",
        ".gitignore",
        ".venv/",
        "venv/",
        ".env",
        ".env.*",
        ".python-version",
        # Build and dependency files
        "*.lock",
        "uv.lock",
        "Pipfile.lock",
        "poetry.lock",
        "pyproject.toml",
        "setup.py",
        "setup.cfg",
        "requirements.txt",
        "requirements-dev.txt",
        # Development and testing files
        "test_*",
        "tests/",
        "*_test.py",
        "sample_*",
        "example_*",
        "demo_*",
        "*_demo.py",
        # Build and conversion scripts
        "convert_*",
        "upload_*",
        "build_*",
        "text_processing_*",
        "*.py",  # Don't upload Python scripts
        # Dataset and training artifacts
        "datasets/",
        "data/",
        "checkpoints/",
        "logs/",
        "wandb/",
        # Temporary and cache files
        "*.tmp",
        "*.temp",
        ".cache/",
        "*.meta.json",
        "*.backup",
        # OS specific files
        ".DS_Store",
        "Thumbs.db",
        "desktop.ini",
    ]

    def __init__(self, directory: str = "."):
        """
        Initialize the uploader.

        Args:
            directory: Directory containing the tokenizer files
        """
        self.directory = Path(directory)

        self.required_files: List[str] = [
            "tokenizer_config.json",
            "special_tokens_map.json",
            "README.md",
            ".gitattributes",
        ]

        # Optional but recommended files
        self.optional_files: List[str] = [
            "generation_config.json",
            "vocab.txt",
            "merges.txt",
            "tokenizer.json",  # Fast tokenizer (becomes required if no .model file)
            "added_tokens.json",  # Additional tokens
            "preprocessor_config.json",  # Preprocessing configuration
        ]

        # Tokenizer model files - check for either fast or slow tokenizer
        self.model_files: List[str] = [
            "tokenizer.json",
            "tokenizer.model",
            "mon_tokenizer.model",
        ]

    def _file_size(self, file_name: str) -> Optional[int]:
        """Return the size in bytes of ``file_name`` in the directory, or None if absent."""
        try:
            return (self.directory / file_name).stat().st_size
        except (FileNotFoundError, NotADirectoryError):
            return None

    def validate_files(self) -> bool:
        """
        Validate that all required files are present.

        Every name in ``required_files`` must exist, and at least one name
        from ``model_files`` must exist. Optional files are only reported.

        Returns:
            bool: True if all files are present, False otherwise
        """
        logger.info(f"Validating tokenizer files in: {self.directory.absolute()}")

        missing_files: List[str] = []

        # Check regular required files
        for file_name in self.required_files:
            size = self._file_size(file_name)
            if size is None:
                missing_files.append(file_name)
                logger.error(f"✗ {file_name} (missing)")
            else:
                logger.info(f"✓ {file_name} ({size:,} bytes)")

        # Check optional files
        for file_name in self.optional_files:
            if file_name in self.model_files:
                # Listed in both groups (tokenizer.json); it is reported once
                # by the model-file check below instead of twice.
                continue
            size = self._file_size(file_name)
            if size is not None:
                logger.info(f"✓ {file_name} ({size:,} bytes) [optional]")

        # Check for tokenizer model files - at least one must exist
        found_models: List[str] = []
        for model_name in self.model_files:
            size = self._file_size(model_name)
            if size is not None:
                found_models.append(model_name)
                logger.info(f"✓ {model_name} ({size:,} bytes)")

        if found_models:
            logger.info(f"✓ Found tokenizer model(s): {', '.join(found_models)}")
        else:
            missing_files.append(
                "tokenizer model file (tokenizer.json, tokenizer.model, or mon_tokenizer.model)"
            )
            logger.error(
                f"✗ No tokenizer model file found (looked for: {', '.join(self.model_files)})"
            )

        if missing_files:
            logger.error(f"Missing required files: {', '.join(missing_files)}")
            return False

        total_required = len(self.required_files) + 1  # +1 for model file
        logger.info(f"✓ All {total_required} essential files present")
        return True

    @staticmethod
    def _round_trip_passes(tokenizer, index: int, text: str, description: str) -> bool:
        """Encode and decode ``text`` once; return True if the case counts as passed."""
        # Plain Python lists are enough for a round trip; asking for "pt"
        # tensors would make validation depend on PyTorch being installed.
        input_ids = tokenizer(text)["input_ids"]

        if text and not input_ids:
            logger.warning(f"⚠ Test {index}: Empty tokenization for non-empty text")
            return False

        decoded = tokenizer.decode(input_ids, skip_special_tokens=True)

        if text.strip() == decoded.strip():
            logger.info(f"✓ Test {index}: {description} - PASSED")
            return True

        logger.warning(f"⚠ Test {index}: {description} - Round-trip mismatch")
        logger.warning(f" Input: '{text}'")
        logger.warning(f" Output: '{decoded}'")
        # An empty input has nothing to round-trip, so a mismatch there is
        # not counted as a failure.
        return not text.strip()

    def validate_tokenizer_functionality(self) -> bool:
        """
        Validate tokenizer functionality with round-trip tests.

        Loads the tokenizer from the local directory only (no network, no
        remote code) and checks that each test text survives encode/decode.
        Supports both fast (tokenizer.json) and slow (tokenizer.model)
        tokenizers. At most one test case may fail; an exception raised while
        processing a case fails the validation immediately.

        Returns:
            bool: True if validation passes, False otherwise
        """
        logger.info("Validating tokenizer functionality")

        try:
            # Load tokenizer with explicit local files only
            abs_directory = str(self.directory.absolute())

            # Report which tokenizer flavours are on disk for easier debugging
            if (self.directory / "tokenizer.json").exists():
                logger.info("Detected fast tokenizer (tokenizer.json)")
            if any(
                (self.directory / model).exists()
                for model in ("tokenizer.model", "mon_tokenizer.model")
            ):
                logger.info("Detected slow tokenizer (*.model)")

            tokenizer = AutoTokenizer.from_pretrained(
                abs_directory,
                local_files_only=True,
                trust_remote_code=False,  # Security best practice
            )

            tokenizer_type = "fast" if tokenizer.is_fast else "slow"
            logger.info(
                f"✓ {tokenizer_type.capitalize()} tokenizer loaded (vocab: {tokenizer.vocab_size:,})"
            )

            test_cases = _ROUND_TRIP_CASES
            passed_tests = 0

            for i, (text, description) in enumerate(test_cases, 1):
                try:
                    if self._round_trip_passes(tokenizer, i, text, description):
                        passed_tests += 1
                except Exception as e:
                    logger.error(f"✗ Test {i}: {description} - FAILED: {e}")
                    return False

            # Allow one test to fail. The threshold is computed over the same
            # set of cases that passed_tests counts (including the empty
            # string), so it matches the x/y figure in the log message.
            if passed_tests >= len(test_cases) - 1:
                logger.info(
                    f"✓ Functionality validation passed ({passed_tests}/{len(test_cases)} tests)"
                )
                return True

            logger.error(
                f"✗ Functionality validation failed ({passed_tests}/{len(test_cases)} tests passed)"
            )
            return False

        except Exception as e:
            logger.error(f"✗ Tokenizer validation failed: {e}")
            return False

    def validate_tokenizer(self) -> bool:
        """
        Run complete tokenizer validation.

        Runs the file check first and the functionality check only if the
        file check succeeds.

        Returns:
            bool: True if validation passes, False otherwise
        """
        logger.info("=== Starting Tokenizer Validation ===")

        # Validate files
        if not self.validate_files():
            return False

        # Validate functionality
        if not self.validate_tokenizer_functionality():
            return False

        logger.info("✅ Tokenizer validation completed successfully")
        return True

    def check_authentication(self) -> Optional[str]:
        """
        Check Hugging Face authentication status.

        Returns:
            Optional[str]: Username if authenticated, None otherwise
        """
        try:
            user_info = whoami()
        except Exception:
            # Best effort: any failure of whoami() is treated as "not logged in".
            logger.warning("Not authenticated with Hugging Face")
            return None

        username = user_info.get("name", "unknown")
        logger.info(f"✓ Authenticated as: {username}")
        return username

    def _ensure_authenticated(self) -> bool:
        """Return True if already authenticated or if an interactive login succeeds."""
        if self.check_authentication():
            return True

        logger.info("Attempting to log in...")
        try:
            login()
            if not self.check_authentication():
                logger.error("❌ Authentication failed")
                return False
        except Exception as e:
            logger.error(f"❌ Login failed: {e}")
            return False
        return True

    def _preview_upload_files(self) -> List[str]:
        """Log and return the top-level file names not matched by IGNORE_PATTERNS."""
        # NOTE(review): this preview looks at top-level files only and matches
        # with Path.match, while upload_folder does its own filtering on the
        # whole folder - the two lists may differ for nested directories;
        # confirm against the huggingface_hub documentation.
        names: List[str] = []
        for file_path in sorted(self.directory.iterdir()):
            if not file_path.is_file():
                continue
            if any(file_path.match(pattern) for pattern in self.IGNORE_PATTERNS):
                continue
            size = file_path.stat().st_size
            names.append(file_path.name)
            logger.info(f" ✓ {file_path.name} ({size:,} bytes)")
        return names

    def upload_to_hub(
        self,
        repo_id: str,
        private: bool = False,
        commit_message: str = "Upload Mon language tokenizer",
        create_pr: bool = False,
    ) -> bool:
        """
        Upload tokenizer to Hugging Face Hub.

        Validates the tokenizer, makes sure the user is authenticated,
        creates the repository if needed and uploads the directory. Any
        exception is logged and reported as a False return value.

        Args:
            repo_id: Repository ID (e.g., "username/model-name")
            private: Whether to create a private repository
            commit_message: Commit message for the upload
            create_pr: Whether to create a pull request instead of direct push

        Returns:
            bool: True if upload successful, False otherwise
        """
        logger.info(f"=== Starting Upload to {repo_id} ===")

        try:
            # Validate tokenizer first
            if not self.validate_tokenizer():
                logger.error("❌ Upload cancelled - validation failed")
                return False

            # Check authentication
            if not self._ensure_authenticated():
                return False

            # Create API client
            api = HfApi()

            # Create/update repository
            logger.info(f"Creating/updating repository: {repo_id}")
            api.create_repo(
                repo_id=repo_id,
                private=private,
                exist_ok=True,
                repo_type="model",
            )
            logger.info("✓ Repository ready")

            # List files to upload
            logger.info("Files to be uploaded:")
            self._preview_upload_files()

            # Upload files
            logger.info("Uploading to Hugging Face Hub...")
            api.upload_folder(
                folder_path=str(self.directory),
                repo_id=repo_id,
                commit_message=commit_message,
                ignore_patterns=list(self.IGNORE_PATTERNS),
                create_pr=create_pr,
            )

            hub_url = f"https://huggingface.co/{repo_id}"
            logger.info("🎉 Upload successful!")
            logger.info(f"📍 Repository URL: {hub_url}")

            if create_pr:
                logger.info("📝 Pull request created for review")

            return True

        except Exception as e:
            logger.error(f"❌ Upload failed: {e}")
            return False


def _prompt(message: str) -> str:
    """Return the stripped answer to ``message``, or "" when stdin is closed."""
    try:
        return input(message).strip()
    except EOFError:
        return ""


def main():
    """
    Main entry point for the upload script.

    Parses the command line, then either validates only (--validate-only) or
    validates and uploads. When --repo-id is omitted the repository and its
    visibility are asked for interactively, with the default repository
    pre-filled. The process exits with status 0 on success and 1 on failure.
    """
    import argparse
    import sys

    parser = argparse.ArgumentParser(
        description="Upload Mon tokenizer to Hugging Face Hub"
    )
    parser.add_argument(
        "--repo-id",
        # None (rather than the default string) marks "not provided", so an
        # explicit --repo-id equal to the default does not trigger prompts.
        default=None,
        help="Repository ID (default: janakhpon/mon_tokenizer)",
    )
    parser.add_argument(
        "--directory",
        default=".",
        help="Directory containing tokenizer files (default: current directory)",
    )
    parser.add_argument(
        "--private",
        action="store_true",
        help="Create private repository",
    )
    parser.add_argument(
        "--message",
        default="Upload Mon language tokenizer",
        help="Commit message",
    )
    parser.add_argument(
        "--create-pr",
        action="store_true",
        help="Create pull request instead of direct push",
    )
    parser.add_argument(
        "--validate-only",
        action="store_true",
        help="Only validate tokenizer, don't upload",
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Enable verbose logging",
    )

    args = parser.parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Create uploader
    uploader = TokenizerUploader(directory=args.directory)

    if args.validate_only:
        # Only validate
        success = uploader.validate_tokenizer()
        logger.info("Validation completed.")
    else:
        # Interactive mode if no repo ID provided
        if args.repo_id is None:
            args.repo_id = DEFAULT_REPO_ID

            print("\n🤗 Mon Tokenizer Hub Uploader")
            print("=" * 40)

            repo_input = _prompt(f"Repository ID [{args.repo_id}]: ")
            if repo_input:
                args.repo_id = repo_input

            # Do not let the prompt override an explicit --private flag.
            if not args.private:
                args.private = _prompt("Private repository? (y/N): ").lower() == "y"

            print(f"\nUploading to: {args.repo_id}")
            print(f"Private: {args.private}")
            print("-" * 40)

        # Upload tokenizer
        success = uploader.upload_to_hub(
            repo_id=args.repo_id,
            private=args.private,
            commit_message=args.message,
            create_pr=args.create_pr,
        )

    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()