"""
Upload Mon tokenizer to Hugging Face Hub.

This script provides functionality to validate and upload the Mon language tokenizer
to Hugging Face Hub with comprehensive validation and modern best practices.

Required files:
- `tokenizer_config.json` - Main tokenizer configuration
- `special_tokens_map.json` - Special token mappings
- `README.md` - Model documentation and usage instructions
- `.gitattributes` - Git LFS configuration for large files

Required tokenizer model files (at least one):
- `tokenizer.json` - Fast tokenizer (recommended for reliability)
- `tokenizer.model` - SentencePiece model file (slow tokenizer backup)
- `mon_tokenizer.model` - Legacy named SentencePiece model (deprecated)

Optional files:
- `generation_config.json` - Text generation configuration
- `vocab.txt` - Vocabulary file for certain tokenizer types
- `merges.txt` - BPE merge rules for certain tokenizer types

The script validates all files, tests functionality, and uploads only essential files
while excluding development artifacts (.env, .py scripts, caches, etc.).
"""

| | import logging |
| | import os |
| | from pathlib import Path |
| | from typing import List, Optional |
| |
|
| | from huggingface_hub import HfApi, login, whoami |
| | from transformers import AutoTokenizer |
| |
|
| | |
| | logging.basicConfig( |
| | level=logging.INFO, |
| | format="%(asctime)s - %(levelname)s - %(message)s", |
| | handlers=[logging.StreamHandler()], |
| | ) |
| | logger = logging.getLogger(__name__) |
| |
|
| |
|
class TokenizerUploader:
    """Handles validation and upload of tokenizers to Hugging Face Hub."""

    def __init__(self, directory: str = "."):
        """
        Initialize the uploader.

        Args:
            directory: Directory containing the tokenizer files
        """
        self.directory = Path(directory)
        # Mandatory metadata files; upload is refused if any is missing.
        self.required_files = [
            "tokenizer_config.json",
            "special_tokens_map.json",
            "README.md",
            ".gitattributes",
        ]
        # Uploaded when present, but their absence is not an error.
        self.optional_files = [
            "generation_config.json",
            "vocab.txt",
            "merges.txt",
            "tokenizer.json",
            "added_tokens.json",
            "preprocessor_config.json",
        ]
        # At least one tokenizer model file must exist (fast JSON or SentencePiece).
        self.model_files = ["tokenizer.json", "tokenizer.model", "mon_tokenizer.model"]

    def validate_files(self) -> bool:
        """
        Validate that all required files are present.

        Returns:
            bool: True if all files are present, False otherwise
        """
        logger.info(f"Validating tokenizer files in: {self.directory.absolute()}")

        missing_files = []
        present_files = []

        # Required metadata files: any miss is fatal.
        for file_name in self.required_files:
            file_path = self.directory / file_name
            if file_path.exists():
                size = file_path.stat().st_size
                present_files.append((file_name, size))
                logger.info(f"✓ {file_name} ({size:,} bytes)")
            else:
                missing_files.append(file_name)
                logger.error(f"✗ {file_name} (missing)")

        # Optional files: logged for visibility only.
        for file_name in self.optional_files:
            file_path = self.directory / file_name
            if file_path.exists():
                size = file_path.stat().st_size
                present_files.append((file_name, size))
                logger.info(f"✓ {file_name} ({size:,} bytes) [optional]")

        # Tokenizer model files: at least one of the known variants must exist.
        model_found = False
        found_models = []
        for model_name in self.model_files:
            model_path = self.directory / model_name
            if model_path.exists():
                size = model_path.stat().st_size
                present_files.append((model_name, size))
                found_models.append(model_name)
                logger.info(f"✓ {model_name} ({size:,} bytes)")
                model_found = True

        if not model_found:
            missing_files.append("tokenizer model file (tokenizer.json, tokenizer.model, or mon_tokenizer.model)")
            logger.error(f"✗ No tokenizer model file found (looked for: {', '.join(self.model_files)})")
        else:
            logger.info(f"✓ Found tokenizer model(s): {', '.join(found_models)}")

        if missing_files:
            logger.error(f"Missing required files: {', '.join(missing_files)}")
            return False

        # +1 accounts for the single required tokenizer model file.
        total_required = len(self.required_files) + 1
        logger.info(f"✓ All {total_required} essential files present")
        return True

    def validate_tokenizer_functionality(self) -> bool:
        """
        Validate tokenizer functionality with comprehensive tests.
        Supports both fast (tokenizer.json) and slow (tokenizer.model) tokenizers.

        Returns:
            bool: True if all tests pass, False otherwise
        """
        logger.info("Validating tokenizer functionality")

        try:
            abs_directory = str(self.directory.absolute())

            # Report which tokenizer backends are available on disk.
            has_fast = (self.directory / "tokenizer.json").exists()
            has_slow = any((self.directory / model).exists() for model in ["tokenizer.model", "mon_tokenizer.model"])

            if has_fast:
                logger.info("Detected fast tokenizer (tokenizer.json)")
            if has_slow:
                logger.info("Detected slow tokenizer (*.model)")

            tokenizer = AutoTokenizer.from_pretrained(
                abs_directory,
                local_files_only=True,
                trust_remote_code=False,
            )

            tokenizer_type = "fast" if tokenizer.is_fast else "slow"
            logger.info(f"✓ {tokenizer_type.capitalize()} tokenizer loaded (vocab: {tokenizer.vocab_size:,})")

            # Round-trip encode/decode cases covering typical Mon text.
            test_cases = [
                {
                    "text": "ဘာသာမန်",
                    "description": "Simple Mon word",
                },
                {
                    "text": "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။",
                    "description": "Complex Mon sentence with punctuation",
                },
                {
                    "text": "မန်တံဂှ် မံင်ပ္ဍဲ ရးမန် ကဵု ရးသေံ။",
                    "description": "Mon text with geographical references",
                },
                {
                    "text": "၁၂၃၄၅ ဂတာပ်ခ္ဍာ် ၂၀၂၄ သၞာံ",
                    "description": "Mon numerals and dates",
                },
                {
                    "text": "",
                    "description": "Empty string test",
                },
            ]

            passed_tests = 0

            for i, test_case in enumerate(test_cases, 1):
                text = test_case["text"]
                description = test_case["description"]

                try:
                    # Encode to plain Python lists: avoids a hard torch dependency
                    # and the empty-tensor failure return_tensors="pt" can hit on "".
                    input_ids = tokenizer(text)["input_ids"]

                    if not input_ids and text:
                        logger.warning(f"⚠ Test {i}: Empty tokenization for non-empty text")
                        continue

                    decoded = tokenizer.decode(input_ids, skip_special_tokens=True)

                    # Round-trip check: decoded text should match the original input.
                    if text.strip() == decoded.strip():
                        logger.info(f"✓ Test {i}: {description} - PASSED")
                        passed_tests += 1
                    else:
                        logger.warning(f"⚠ Test {i}: {description} - Round-trip mismatch")
                        logger.warning(f"  Input: '{text}'")
                        logger.warning(f"  Output: '{decoded}'")

                        # A mismatch fails only non-empty inputs; the empty-string
                        # case is tolerated (special tokens may perturb it).
                        if len(text.strip()) > 0:
                            continue
                        passed_tests += 1

                except Exception as e:
                    logger.error(f"✗ Test {i}: {description} - FAILED: {e}")
                    return False

            # Allow at most one failure among the non-empty test cases.
            total_tests = len([tc for tc in test_cases if tc["text"]])
            if passed_tests >= total_tests - 1:
                logger.info(f"✓ Functionality validation passed ({passed_tests}/{len(test_cases)} tests)")
                return True
            else:
                logger.error(f"✗ Functionality validation failed ({passed_tests}/{len(test_cases)} tests passed)")
                return False

        except Exception as e:
            logger.error(f"✗ Tokenizer validation failed: {e}")
            return False

    def validate_tokenizer(self) -> bool:
        """
        Run complete tokenizer validation.

        Returns:
            bool: True if validation passes, False otherwise
        """
        logger.info("=== Starting Tokenizer Validation ===")

        # File presence first: a functionality check is pointless without files.
        if not self.validate_files():
            return False

        # Then load the tokenizer and run round-trip tests.
        if not self.validate_tokenizer_functionality():
            return False

        logger.info("✅ Tokenizer validation completed successfully")
        return True

    def check_authentication(self) -> Optional[str]:
        """
        Check Hugging Face authentication status.

        Returns:
            Optional[str]: Username if authenticated, None otherwise
        """
        try:
            user_info = whoami()
            username = user_info.get("name", "unknown")
            logger.info(f"✓ Authenticated as: {username}")
            return username
        except Exception:
            # whoami raises when no valid token is configured.
            logger.warning("Not authenticated with Hugging Face")
            return None

    def upload_to_hub(
        self,
        repo_id: str,
        private: bool = False,
        commit_message: str = "Upload Mon language tokenizer",
        create_pr: bool = False,
    ) -> bool:
        """
        Upload tokenizer to Hugging Face Hub.

        Args:
            repo_id: Repository ID (e.g., "username/model-name")
            private: Whether to create a private repository
            commit_message: Commit message for the upload
            create_pr: Whether to create a pull request instead of direct push

        Returns:
            bool: True if upload successful, False otherwise
        """
        logger.info(f"=== Starting Upload to {repo_id} ===")

        try:
            # Never upload an invalid tokenizer.
            if not self.validate_tokenizer():
                logger.error("❌ Upload cancelled - validation failed")
                return False

            # Ensure we have a valid Hub token, prompting for login if needed.
            if not self.check_authentication():
                logger.info("Attempting to log in...")
                try:
                    login()
                    if not self.check_authentication():
                        logger.error("❌ Authentication failed")
                        return False
                except Exception as e:
                    logger.error(f"❌ Login failed: {e}")
                    return False

            api = HfApi()

            # Idempotent: exist_ok=True makes this safe for re-uploads.
            logger.info(f"Creating/updating repository: {repo_id}")
            api.create_repo(
                repo_id=repo_id,
                private=private,
                exist_ok=True,
                repo_type="model",
            )
            logger.info("✓ Repository ready")

            # Development artifacts that must never reach the Hub.
            upload_files = []
            ignore_patterns = [
                # Python bytecode
                "*.pyc",
                "__pycache__/",
                "*.pyo",

                # VCS / virtualenv / secrets
                ".git/",
                ".gitignore",
                ".venv/",
                "venv/",
                ".env",
                ".env.*",
                ".python-version",

                # Packaging / dependency metadata
                "*.lock",
                "uv.lock",
                "Pipfile.lock",
                "poetry.lock",
                "pyproject.toml",
                "setup.py",
                "setup.cfg",
                "requirements.txt",
                "requirements-dev.txt",

                # Tests and examples
                "test_*",
                "tests/",
                "*_test.py",
                "sample_*",
                "example_*",
                "demo_*",
                "*_demo.py",

                # Build/helper scripts (includes this script via *.py)
                "convert_*",
                "upload_*",
                "build_*",
                "text_processing_*",
                "*.py",

                # Training artifacts
                "datasets/",
                "data/",
                "checkpoints/",
                "logs/",
                "wandb/",

                # Temp files and caches
                "*.tmp",
                "*.temp",
                ".cache/",
                "*.meta.json",
                "*.backup",

                # OS junk
                ".DS_Store",
                "Thumbs.db",
                "desktop.ini",
            ]

            # Preview what the ignore patterns will let through.
            logger.info("Files to be uploaded:")
            for file_path in self.directory.iterdir():
                if file_path.is_file() and not any(
                    file_path.match(pattern) for pattern in ignore_patterns
                ):
                    size = file_path.stat().st_size
                    upload_files.append(file_path.name)
                    logger.info(f"  ✓ {file_path.name} ({size:,} bytes)")

            # Single atomic commit of the whole folder, minus ignored patterns.
            logger.info("Uploading to Hugging Face Hub...")
            api.upload_folder(
                folder_path=str(self.directory),
                repo_id=repo_id,
                repo_type="model",
                commit_message=commit_message,
                ignore_patterns=ignore_patterns,
                create_pr=create_pr,
            )

            hub_url = f"https://huggingface.co/{repo_id}"
            logger.info("🎉 Upload successful!")
            logger.info(f"📍 Repository URL: {hub_url}")

            if create_pr:
                logger.info("📝 Pull request created for review")

            return True

        except Exception as e:
            logger.error(f"❌ Upload failed: {e}")
            return False

def main():
    """Main entry point for the upload script."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Upload Mon tokenizer to Hugging Face Hub"
    )
    parser.add_argument(
        "--repo-id",
        default="janakhpon/mon_tokenizer",
        help="Repository ID (default: janakhpon/mon_tokenizer)",
    )
    parser.add_argument(
        "--directory",
        default=".",
        help="Directory containing tokenizer files (default: current directory)",
    )
    parser.add_argument(
        "--private",
        action="store_true",
        help="Create private repository",
    )
    parser.add_argument(
        "--message",
        default="Upload Mon language tokenizer",
        help="Commit message",
    )
    parser.add_argument(
        "--create-pr",
        action="store_true",
        help="Create pull request instead of direct push",
    )
    parser.add_argument(
        "--validate-only",
        action="store_true",
        help="Only validate tokenizer, don't upload",
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Enable verbose logging",
    )

    args = parser.parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    uploader = TokenizerUploader(directory=args.directory)

    if args.validate_only:
        # Validation-only mode: no Hub access, just local checks.
        success = uploader.validate_tokenizer()
        # Report the actual outcome, not an unconditional "completed".
        logger.info("Validation completed: %s", "PASSED" if success else "FAILED")
    else:
        # Prompt interactively only when the default repo id was not overridden.
        if args.repo_id == "janakhpon/mon_tokenizer":
            print("\n🤗 Mon Tokenizer Hub Uploader")
            print("=" * 40)

            repo_input = input(f"Repository ID [{args.repo_id}]: ").strip()
            if repo_input:
                args.repo_id = repo_input

            private_input = input("Private repository? (y/N): ").strip().lower()
            args.private = private_input == 'y'

            print(f"\nUploading to: {args.repo_id}")
            print(f"Private: {args.private}")
            print("-" * 40)

        success = uploader.upload_to_hub(
            repo_id=args.repo_id,
            private=args.private,
            commit_message=args.message,
            create_pr=args.create_pr,
        )

    # SystemExit instead of the site-provided exit() builtin, which is not
    # guaranteed to exist when Python runs without the site module (-S).
    raise SystemExit(0 if success else 1)

# Script entry point: main() raises SystemExit with the appropriate status code.
if __name__ == "__main__":
    main()