# mon_tokenizer / upload_to_hub.py
# Commit 1f1b899 (janakhpon): feat: simplified mon tokenizer in hf format,
# updated tags, resolve the legacy issue
#!/usr/bin/env python3
"""
Upload Mon tokenizer to Hugging Face Hub.
This script provides functionality to validate and upload the Mon language tokenizer
to Hugging Face Hub with comprehensive validation and modern best practices.
Required files:
- `tokenizer_config.json` - Main tokenizer configuration
- `special_tokens_map.json` - Special token mappings
- `README.md` - Model documentation and usage instructions
- `.gitattributes` - Git LFS configuration for large files
Required tokenizer model files (at least one):
- `tokenizer.json` - Fast tokenizer (recommended for reliability)
- `tokenizer.model` - SentencePiece model file (slow tokenizer backup)
- `mon_tokenizer.model` - Legacy named SentencePiece model (deprecated)
Optional files:
- `generation_config.json` - Text generation configuration
- `vocab.txt` - Vocabulary file for certain tokenizer types
- `merges.txt` - BPE merge rules for certain tokenizer types
The script validates all files, tests functionality, and uploads only essential files
while excluding development artifacts (.env, .py scripts, caches, etc.).
"""
import logging
import os
from pathlib import Path
from typing import List, Optional
from huggingface_hub import HfApi, login, whoami
from transformers import AutoTokenizer
# Module-wide logging: timestamped INFO-level records emitted to the console.
_LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(
    format=_LOG_FORMAT,
    level=logging.INFO,
    handlers=[logging.StreamHandler()],
)
logger = logging.getLogger(__name__)
class TokenizerUploader:
    """Handles validation and upload of tokenizers to Hugging Face Hub."""

    def __init__(self, directory: str = "."):
        """
        Initialize the uploader.

        Args:
            directory: Directory containing the tokenizer files
        """
        self.directory = Path(directory)
        # Files that must always be present before an upload is attempted.
        self.required_files = [
            "tokenizer_config.json",
            "special_tokens_map.json",
            "README.md",
            ".gitattributes",
        ]
        # Optional but recommended files
        self.optional_files = [
            "generation_config.json",
            "vocab.txt",
            "merges.txt",
            "tokenizer.json",  # Fast tokenizer (becomes required if no .model file)
            "added_tokens.json",  # Additional tokens
            "preprocessor_config.json",  # Preprocessing configuration
        ]
        # Tokenizer model files - check for either fast or slow tokenizer
        self.model_files = ["tokenizer.json", "tokenizer.model", "mon_tokenizer.model"]

    def validate_files(self) -> bool:
        """
        Validate that all required files are present.

        Checks every entry in ``self.required_files``, logs any optional
        files that happen to exist, and requires at least ONE of the
        tokenizer model files in ``self.model_files``.

        Returns:
            bool: True if all files are present, False otherwise
        """
        logger.info(f"Validating tokenizer files in: {self.directory.absolute()}")
        missing_files = []
        # NOTE(review): present_files is appended to throughout but never read
        # afterwards — kept only for potential debugging / future reporting.
        present_files = []
        # Check regular required files
        for file_name in self.required_files:
            file_path = self.directory / file_name
            if file_path.exists():
                size = file_path.stat().st_size
                present_files.append((file_name, size))
                logger.info(f"✓ {file_name} ({size:,} bytes)")
            else:
                missing_files.append(file_name)
                logger.error(f"✗ {file_name} (missing)")
        # Check optional files (informational only — never added to missing_files)
        for file_name in self.optional_files:
            file_path = self.directory / file_name
            if file_path.exists():
                size = file_path.stat().st_size
                present_files.append((file_name, size))
                logger.info(f"✓ {file_name} ({size:,} bytes) [optional]")
        # Check for tokenizer model files - at least one must exist
        model_found = False
        found_models = []
        for model_name in self.model_files:
            model_path = self.directory / model_name
            if model_path.exists():
                size = model_path.stat().st_size
                present_files.append((model_name, size))
                found_models.append(model_name)
                logger.info(f"✓ {model_name} ({size:,} bytes)")
                model_found = True
        if not model_found:
            missing_files.append("tokenizer model file (tokenizer.json, tokenizer.model, or mon_tokenizer.model)")
            logger.error(f"✗ No tokenizer model file found (looked for: {', '.join(self.model_files)})")
        else:
            logger.info(f"✓ Found tokenizer model(s): {', '.join(found_models)}")
        if missing_files:
            logger.error(f"Missing required files: {', '.join(missing_files)}")
            return False
        # The whole group of model files counts as a single required item.
        total_required = len(self.required_files) + 1  # +1 for model file
        logger.info(f"✓ All {total_required} essential files present")
        return True

    def validate_tokenizer_functionality(self) -> bool:
        """
        Validate tokenizer functionality with comprehensive tests.

        Loads the tokenizer from ``self.directory`` (local files only, no
        remote code) and runs round-trip encode/decode checks on Mon-language
        samples.  Supports both fast (tokenizer.json) and slow
        (tokenizer.model) tokenizers.

        Returns:
            bool: True if all tests pass, False otherwise
        """
        logger.info("Validating tokenizer functionality")
        try:
            # Load tokenizer with explicit local files only
            abs_directory = str(self.directory.absolute())
            # Determine tokenizer type for better error handling
            has_fast = (self.directory / "tokenizer.json").exists()
            has_slow = any((self.directory / model).exists() for model in ["tokenizer.model", "mon_tokenizer.model"])
            if has_fast:
                logger.info("Detected fast tokenizer (tokenizer.json)")
            if has_slow:
                logger.info("Detected slow tokenizer (*.model)")
            tokenizer = AutoTokenizer.from_pretrained(
                abs_directory,
                local_files_only=True,
                trust_remote_code=False  # Security best practice
            )
            tokenizer_type = "fast" if tokenizer.is_fast else "slow"
            logger.info(f"✓ {tokenizer_type.capitalize()} tokenizer loaded (vocab: {tokenizer.vocab_size:,})")
            # Comprehensive test cases for Mon language
            test_cases = [
                {
                    "text": "ဘာသာမန်",
                    "description": "Simple Mon word"
                },
                {
                    "text": "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။",
                    "description": "Complex Mon sentence with punctuation"
                },
                {
                    "text": "မန်တံဂှ် မံင်ပ္ဍဲ ရးမန် ကဵု ရးသေံ။",
                    "description": "Mon text with geographical references"
                },
                {
                    "text": "၁၂၃၄၅ ဂတာပ်ခ္ဍာ် ၂၀၂၄ သၞာံ",
                    "description": "Mon numerals and dates"
                },
                {
                    "text": "",
                    "description": "Empty string test"
                }
            ]
            passed_tests = 0
            for i, test_case in enumerate(test_cases, 1):
                text = test_case["text"]
                description = test_case["description"]
                try:
                    # Test tokenization.
                    # NOTE(review): return_tensors="pt" means torch must be
                    # installed; .numel() below is a torch tensor method.
                    tokens = tokenizer(text, return_tensors="pt")
                    # Empty output for non-empty input: warn and skip this case.
                    if tokens["input_ids"].numel() == 0 and text:
                        logger.warning(f"⚠ Test {i}: Empty tokenization for non-empty text")
                        continue
                    # Test decoding
                    decoded = tokenizer.decode(
                        tokens["input_ids"][0],
                        skip_special_tokens=True
                    )
                    # Check round-trip accuracy
                    if text.strip() == decoded.strip():
                        logger.info(f"✓ Test {i}: {description} - PASSED")
                        passed_tests += 1
                    else:
                        logger.warning(f"⚠ Test {i}: {description} - Round-trip mismatch")
                        logger.warning(f"  Input:  '{text}'")
                        logger.warning(f"  Output: '{decoded}'")
                        # For some cases, minor differences might be acceptable
                        # A mismatch on non-empty text is skipped (not counted);
                        # an empty-string mismatch still counts as a pass.
                        if len(text.strip()) > 0:  # Don't fail on empty strings
                            continue
                        passed_tests += 1
                except Exception as e:
                    # Any exception in a single test case aborts validation.
                    logger.error(f"✗ Test {i}: {description} - FAILED: {e}")
                    return False
            # Check test results
            total_tests = len([tc for tc in test_cases if tc["text"]])  # Exclude empty string
            # NOTE(review): the threshold compares against total_tests (4) but
            # the log denominators use len(test_cases) (5) — confirm intended.
            if passed_tests >= total_tests - 1:  # Allow one test to fail
                logger.info(f"✓ Functionality validation passed ({passed_tests}/{len(test_cases)} tests)")
                return True
            else:
                logger.error(f"✗ Functionality validation failed ({passed_tests}/{len(test_cases)} tests passed)")
                return False
        except Exception as e:
            logger.error(f"✗ Tokenizer validation failed: {e}")
            return False

    def validate_tokenizer(self) -> bool:
        """
        Run complete tokenizer validation.

        Runs the file-presence check first, then the functional round-trip
        check; stops at the first failing stage.

        Returns:
            bool: True if validation passes, False otherwise
        """
        logger.info("=== Starting Tokenizer Validation ===")
        # Validate files
        if not self.validate_files():
            return False
        # Validate functionality
        if not self.validate_tokenizer_functionality():
            return False
        logger.info("✅ Tokenizer validation completed successfully")
        return True

    def check_authentication(self) -> Optional[str]:
        """
        Check Hugging Face authentication status.

        Uses ``huggingface_hub.whoami()`` — presumably resolves the locally
        cached HF token; verify against huggingface_hub docs.

        Returns:
            Optional[str]: Username if authenticated, None otherwise
        """
        try:
            user_info = whoami()
            username = user_info.get("name", "unknown")
            logger.info(f"✓ Authenticated as: {username}")
            return username
        except Exception:
            # whoami() raises when no (valid) token is available.
            logger.warning("Not authenticated with Hugging Face")
            return None

    def upload_to_hub(
        self,
        repo_id: str,
        private: bool = False,
        commit_message: str = "Upload Mon language tokenizer",
        create_pr: bool = False,
    ) -> bool:
        """
        Upload tokenizer to Hugging Face Hub.

        Validates the tokenizer, ensures authentication (prompting login if
        needed), creates the repository if missing, then uploads the whole
        directory minus ``ignore_patterns``.

        Args:
            repo_id: Repository ID (e.g., "username/model-name")
            private: Whether to create a private repository
            commit_message: Commit message for the upload
            create_pr: Whether to create a pull request instead of direct push

        Returns:
            bool: True if upload successful, False otherwise
        """
        logger.info(f"=== Starting Upload to {repo_id} ===")
        try:
            # Validate tokenizer first
            if not self.validate_tokenizer():
                logger.error("❌ Upload cancelled - validation failed")
                return False
            # Check authentication
            if not self.check_authentication():
                logger.info("Attempting to log in...")
                try:
                    # Interactive login prompt from huggingface_hub.
                    login()
                    if not self.check_authentication():
                        logger.error("❌ Authentication failed")
                        return False
                except Exception as e:
                    logger.error(f"❌ Login failed: {e}")
                    return False
            # Create API client
            api = HfApi()
            # Create/update repository (exist_ok=True makes this idempotent)
            logger.info(f"Creating/updating repository: {repo_id}")
            api.create_repo(
                repo_id=repo_id,
                private=private,
                exist_ok=True,
                repo_type="model"
            )
            logger.info("✓ Repository ready")
            # List files to upload
            upload_files = []
            # Patterns excluded from the upload — development artifacts only;
            # the tokenizer/config/doc files are intentionally not matched.
            ignore_patterns = [
                # Python compilation artifacts
                "*.pyc",
                "__pycache__/",
                "*.pyo",
                # Version control and development
                ".git/",
                ".gitignore",
                ".venv/",
                "venv/",
                ".env",
                ".env.*",
                ".python-version",
                # Build and dependency files
                "*.lock",
                "uv.lock",
                "Pipfile.lock",
                "poetry.lock",
                "pyproject.toml",
                "setup.py",
                "setup.cfg",
                "requirements.txt",
                "requirements-dev.txt",
                # Development and testing files
                "test_*",
                "tests/",
                "*_test.py",
                "sample_*",
                "example_*",
                "demo_*",
                "*_demo.py",
                # Build and conversion scripts
                "convert_*",
                "upload_*",
                "build_*",
                "text_processing_*",
                "*.py",  # Don't upload Python scripts
                # Dataset and training artifacts
                "datasets/",
                "data/",
                "checkpoints/",
                "logs/",
                "wandb/",
                # Temporary and cache files
                "*.tmp",
                "*.temp",
                ".cache/",
                "*.meta.json",
                "*.backup",
                # OS specific files
                ".DS_Store",
                "Thumbs.db",
                "desktop.ini"
            ]
            # Preview of what should be uploaded (logging only).
            # NOTE(review): this uses Path.match, whose semantics differ from
            # upload_folder's fnmatch-style ignore_patterns (e.g. patterns
            # ending in "/"), so the preview may not exactly match the actual
            # upload — confirm against huggingface_hub docs.
            logger.info("Files to be uploaded:")
            for file_path in self.directory.iterdir():
                if file_path.is_file() and not any(
                    file_path.match(pattern) for pattern in ignore_patterns
                ):
                    size = file_path.stat().st_size
                    upload_files.append(file_path.name)
                    logger.info(f"  ✓ {file_path.name} ({size:,} bytes)")
            # Upload files (the Hub applies ignore_patterns server-side filter)
            logger.info("Uploading to Hugging Face Hub...")
            api.upload_folder(
                folder_path=str(self.directory),
                repo_id=repo_id,
                commit_message=commit_message,
                ignore_patterns=ignore_patterns,
                create_pr=create_pr,
            )
            hub_url = f"https://huggingface.co/{repo_id}"
            logger.info(f"🎉 Upload successful!")
            logger.info(f"📍 Repository URL: {hub_url}")
            if create_pr:
                logger.info("📝 Pull request created for review")
            return True
        except Exception as e:
            logger.error(f"❌ Upload failed: {e}")
            return False
def main():
    """Command-line entry point for the upload script.

    Parses arguments, optionally prompts interactively when the default
    repository ID is in use, then either validates the tokenizer in place
    (``--validate-only``) or validates and uploads it.  Exits with status 0
    on success and 1 on failure.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Upload Mon tokenizer to Hugging Face Hub"
    )
    parser.add_argument(
        "--repo-id",
        default="janakhpon/mon_tokenizer",
        help="Repository ID (default: janakhpon/mon_tokenizer)",
    )
    parser.add_argument(
        "--directory",
        default=".",
        help="Directory containing tokenizer files (default: current directory)",
    )
    parser.add_argument(
        "--private",
        action="store_true",
        help="Create private repository",
    )
    parser.add_argument(
        "--message",
        default="Upload Mon language tokenizer",
        help="Commit message",
    )
    parser.add_argument(
        "--create-pr",
        action="store_true",
        help="Create pull request instead of direct push",
    )
    parser.add_argument(
        "--validate-only",
        action="store_true",
        help="Only validate tokenizer, don't upload",
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Enable verbose logging",
    )
    args = parser.parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Create uploader
    uploader = TokenizerUploader(directory=args.directory)

    if args.validate_only:
        # Only validate
        success = uploader.validate_tokenizer()
        # Bug fix: the previous message said "Validation completed." even
        # when validation failed; report the actual outcome instead.
        logger.info("Validation %s.", "succeeded" if success else "failed")
    else:
        # Interactive mode when the repo ID was left at its default value
        if args.repo_id == "janakhpon/mon_tokenizer":
            print("\n🤗 Mon Tokenizer Hub Uploader")
            print("=" * 40)
            repo_input = input(f"Repository ID [{args.repo_id}]: ").strip()
            if repo_input:
                args.repo_id = repo_input
            private_input = input("Private repository? (y/N): ").strip().lower()
            args.private = private_input == 'y'
            print(f"\nUploading to: {args.repo_id}")
            print(f"Private: {args.private}")
            print("-" * 40)
        # Upload tokenizer
        success = uploader.upload_to_hub(
            repo_id=args.repo_id,
            private=args.private,
            commit_message=args.message,
            create_pr=args.create_pr,
        )

    # Bug fix: raise SystemExit instead of the site-injected `exit()` helper,
    # which is absent under `python -S` and discouraged for scripts.
    raise SystemExit(0 if success else 1)


if __name__ == "__main__":
    main()