#!/usr/bin/env python3
"""
Upload Mon tokenizer to Hugging Face Hub.

This script provides functionality to validate and upload the Mon language tokenizer
to Hugging Face Hub with comprehensive validation and modern best practices.

Required files:
- `tokenizer_config.json` - Main tokenizer configuration
- `special_tokens_map.json` - Special token mappings
- `README.md` - Model documentation and usage instructions
- `.gitattributes` - Git LFS configuration for large files

Required tokenizer model files (at least one):
- `tokenizer.json` - Fast tokenizer (recommended for reliability)
- `tokenizer.model` - SentencePiece model file (slow tokenizer backup)
- `mon_tokenizer.model` - Legacy named SentencePiece model (deprecated)

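Optional files (reported when present, never required):
- `generation_config.json` - Text generation configuration
- `vocab.txt` - Vocabulary file for certain tokenizer types
- `merges.txt` - BPE merge rules for certain tokenizer types
- `added_tokens.json` - Additional tokens
- `preprocessor_config.json` - Preprocessing configuration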

The script checks that the required files are present, runs round-trip
tokenization tests, and then uploads the contents of the directory, skipping
development artifacts (.env, .py scripts, caches, etc.) via ignore patterns.

"""

import fnmatch
import logging
import os
from pathlib import Path
from typing import List, Optional

from huggingface_hub import HfApi, login, whoami
from transformers import AutoTokenizer

# Root logging setup: INFO and above go to a stream handler, each record
# rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-level logger shared by everything in this script.
logger = logging.getLogger(__name__)


class TokenizerUploader:
    """Handles validation and upload of tokenizers to Hugging Face Hub.

    Typical use is ``TokenizerUploader(directory).upload_to_hub(repo_id)``,
    which validates the directory first and only uploads when validation
    passes. All public methods report problems through the module logger and
    signal failure through their return value instead of raising.
    """

    # How many non-empty round-trip cases may mismatch before functional
    # validation is considered failed.
    MAX_ROUNDTRIP_FAILURES = 1

    # (text, description) pairs used by validate_tokenizer_functionality().
    # The empty string is a smoke test only: it must not raise, but it is not
    # counted as a round-trip case.
    _ROUNDTRIP_CASES = (
        ("ဘာသာမန်", "Simple Mon word"),
        ("ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။", "Complex Mon sentence with punctuation"),
        ("မန်တံဂှ် မံင်ပ္ဍဲ ရးမန် ကဵု ရးသေံ။", "Mon text with geographical references"),
        ("၁၂၃၄၅ ဂတာပ်ခ္ဍာ် ၂၀၂၄ သၞာံ", "Mon numerals and dates"),
        ("", "Empty string test"),
    )

    # Glob patterns for files that must never be uploaded. Entries ending in
    # "/" denote directories.
    IGNORE_PATTERNS = (
        # Python compilation artifacts
        "*.pyc",
        "__pycache__/",
        "*.pyo",

        # Version control and development
        ".git/",
        ".gitignore",
        ".venv/",
        "venv/",
        ".env",
        ".env.*",
        ".python-version",

        # Build and dependency files
        "*.lock",
        "uv.lock",
        "Pipfile.lock",
        "poetry.lock",
        "pyproject.toml",
        "setup.py",
        "setup.cfg",
        "requirements.txt",
        "requirements-dev.txt",

        # Development and testing files
        "test_*",
        "tests/",
        "*_test.py",
        "sample_*",
        "example_*",
        "demo_*",
        "*_demo.py",

        # Build and conversion scripts
        "convert_*",
        "upload_*",
        "build_*",
        "text_processing_*",
        "*.py",  # Don't upload Python scripts

        # Dataset and training artifacts
        "datasets/",
        "data/",
        "checkpoints/",
        "logs/",
        "wandb/",

        # Temporary and cache files
        "*.tmp",
        "*.temp",
        ".cache/",
        "*.meta.json",
        "*.backup",

        # OS specific files
        ".DS_Store",
        "Thumbs.db",
        "desktop.ini",
    )

    def __init__(self, directory: str = "."):
        """
        Initialize the uploader.

        Args:
            directory: Directory containing the tokenizer files
        """
        self.directory = Path(directory)
        # Files that must exist for validation to pass.
        self.required_files = [
            "tokenizer_config.json",
            "special_tokens_map.json",
            "README.md",
            ".gitattributes",
        ]
        # Optional but recommended files
        self.optional_files = [
            "generation_config.json",
            "vocab.txt",
            "merges.txt",
            "tokenizer.json",  # Fast tokenizer (becomes required if no .model file)
            "added_tokens.json",  # Additional tokens
            "preprocessor_config.json",  # Preprocessing configuration
        ]
        # Tokenizer model files - check for either fast or slow tokenizer
        self.model_files = ["tokenizer.json", "tokenizer.model", "mon_tokenizer.model"]

    def _file_size(self, file_name: str) -> Optional[int]:
        """Return the size in bytes of ``file_name`` inside the directory.

        Returns None when the entry is absent or is not a regular file, so a
        sub-directory that happens to share a required file's name does not
        count as present.
        """
        file_path = self.directory / file_name
        if not file_path.is_file():
            return None
        return file_path.stat().st_size

    @classmethod
    def _hub_ignore_patterns(cls) -> List[str]:
        """Return IGNORE_PATTERNS with directory entries spelled as ``dir/*``.

        The explicit wildcard makes a directory pattern match every path below
        it under plain ``fnmatch`` rules, so the local preview and the
        patterns handed to ``upload_folder`` agree.
        """
        return [
            pattern + "*" if pattern.endswith("/") else pattern
            for pattern in cls.IGNORE_PATTERNS
        ]

    def _collect_upload_files(self) -> List[Path]:
        """Return every file under the directory that survives the ignore list.

        The directory is walked recursively and each file's POSIX-style path
        relative to the directory is matched with ``fnmatch`` against the
        ignore patterns. NOTE(review): this is meant to mirror how
        ``HfApi.upload_folder`` filters files -- confirm against the installed
        huggingface_hub version.

        Returns:
            List[Path]: Sorted paths of the files selected for upload
        """
        patterns = self._hub_ignore_patterns()
        selected = []
        for candidate in sorted(self.directory.rglob("*")):
            if not candidate.is_file():
                continue
            relative = candidate.relative_to(self.directory).as_posix()
            if any(fnmatch.fnmatch(relative, pattern) for pattern in patterns):
                continue
            selected.append(candidate)
        return selected

    def validate_files(self) -> bool:
        """
        Validate that all required files are present.

        Every entry of ``required_files`` must exist as a regular file and at
        least one entry of ``model_files`` must exist. Optional files are only
        reported.

        Returns:
            bool: True if all files are present, False otherwise
        """
        logger.info(f"Validating tokenizer files in: {self.directory.absolute()}")

        if not self.directory.is_dir():
            logger.error(f"✗ Not a directory: {self.directory.absolute()}")
            return False

        missing_files = []

        # Check regular required files
        for file_name in self.required_files:
            size = self._file_size(file_name)
            if size is None:
                missing_files.append(file_name)
                logger.error(f"✗ {file_name} (missing)")
            else:
                logger.info(f"✓ {file_name} ({size:,} bytes)")

        # Check optional files
        for file_name in self.optional_files:
            if file_name in self.model_files:
                # Reported once by the model-file check below.
                continue
            size = self._file_size(file_name)
            if size is not None:
                logger.info(f"✓ {file_name} ({size:,} bytes) [optional]")

        # Check for tokenizer model files - at least one must exist
        found_models = []
        for model_name in self.model_files:
            size = self._file_size(model_name)
            if size is not None:
                found_models.append(model_name)
                logger.info(f"✓ {model_name} ({size:,} bytes)")

        if found_models:
            logger.info(f"✓ Found tokenizer model(s): {', '.join(found_models)}")
        else:
            missing_files.append("tokenizer model file (tokenizer.json, tokenizer.model, or mon_tokenizer.model)")
            logger.error(f"✗ No tokenizer model file found (looked for: {', '.join(self.model_files)})")

        if missing_files:
            logger.error(f"Missing required files: {', '.join(missing_files)}")
            return False

        total_required = len(self.required_files) + 1  # +1 for model file
        logger.info(f"✓ All {total_required} essential files present")
        return True

    def validate_tokenizer_functionality(self) -> bool:
        """
        Validate tokenizer functionality with comprehensive tests.
        Supports both fast (tokenizer.json) and slow (tokenizer.model) tokenizers.

        The tokenizer is loaded from local files only, then each non-empty
        sample text is encoded and decoded; the decoded text must equal the
        input after stripping surrounding whitespace. At most
        ``MAX_ROUNDTRIP_FAILURES`` non-empty samples may mismatch. The empty
        string is encoded as a smoke test and is not counted. Any exception
        while loading or tokenizing fails the validation.

        Returns:
            bool: True if all tests pass, False otherwise
        """
        logger.info("Validating tokenizer functionality")

        try:
            # Load tokenizer with explicit local files only
            abs_directory = str(self.directory.absolute())

            # Report which tokenizer flavours are on disk for easier debugging
            if (self.directory / "tokenizer.json").exists():
                logger.info("Detected fast tokenizer (tokenizer.json)")
            if any((self.directory / model).exists() for model in ["tokenizer.model", "mon_tokenizer.model"]):
                logger.info("Detected slow tokenizer (*.model)")

            tokenizer = AutoTokenizer.from_pretrained(
                abs_directory,
                local_files_only=True,
                trust_remote_code=False,  # Security best practice
            )

            tokenizer_type = "fast" if tokenizer.is_fast else "slow"
            logger.info(f"✓ {tokenizer_type.capitalize()} tokenizer loaded (vocab: {tokenizer.vocab_size:,})")

            roundtrip_total = 0
            roundtrip_passed = 0

            for i, (text, description) in enumerate(self._ROUNDTRIP_CASES, 1):
                try:
                    # Plain Python lists are enough here; asking for framework
                    # tensors would make validation depend on PyTorch being
                    # installed.
                    input_ids = tokenizer(text)["input_ids"]

                    if not text:
                        # Smoke test: encoding/decoding "" must simply not raise.
                        tokenizer.decode(input_ids, skip_special_tokens=True)
                        logger.info(f"✓ Test {i}: {description} - PASSED")
                        continue

                    roundtrip_total += 1

                    if len(input_ids) == 0:
                        logger.warning(f"⚠ Test {i}: Empty tokenization for non-empty text")
                        continue

                    decoded = tokenizer.decode(input_ids, skip_special_tokens=True)

                    # Check round-trip accuracy
                    if text.strip() == decoded.strip():
                        logger.info(f"✓ Test {i}: {description} - PASSED")
                        roundtrip_passed += 1
                    else:
                        logger.warning(f"⚠ Test {i}: {description} - Round-trip mismatch")
                        logger.warning(f"  Input:  '{text}'")
                        logger.warning(f"  Output: '{decoded}'")

                except Exception as e:
                    logger.error(f"✗ Test {i}: {description} - FAILED: {e}")
                    return False

            # Passed and total are both counted over the non-empty cases only,
            # so the tolerance is exactly MAX_ROUNDTRIP_FAILURES.
            required = max(roundtrip_total - self.MAX_ROUNDTRIP_FAILURES, 0)
            if roundtrip_passed >= required:
                logger.info(f"✓ Functionality validation passed ({roundtrip_passed}/{roundtrip_total} tests)")
                return True

            logger.error(f"✗ Functionality validation failed ({roundtrip_passed}/{roundtrip_total} tests passed)")
            return False

        except Exception as e:
            logger.error(f"✗ Tokenizer validation failed: {e}")
            return False

    def validate_tokenizer(self) -> bool:
        """
        Run complete tokenizer validation.

        File presence is checked first; the functional tests only run when
        the files are in place.

        Returns:
            bool: True if validation passes, False otherwise
        """
        logger.info("=== Starting Tokenizer Validation ===")

        # Validate files
        if not self.validate_files():
            return False

        # Validate functionality
        if not self.validate_tokenizer_functionality():
            return False

        logger.info("✅ Tokenizer validation completed successfully")
        return True

    def check_authentication(self) -> Optional[str]:
        """
        Check Hugging Face authentication status.

        Any failure of ``whoami()`` is reported as "not authenticated"; the
        underlying error is logged at DEBUG level.

        Returns:
            Optional[str]: Username if authenticated, None otherwise
        """
        try:
            user_info = whoami()
            username = user_info.get("name", "unknown")
            logger.info(f"✓ Authenticated as: {username}")
            return username
        except Exception as e:
            logger.warning("Not authenticated with Hugging Face")
            logger.debug(f"whoami() failed: {e}")
            return None

    def upload_to_hub(
        self,
        repo_id: str,
        private: bool = False,
        commit_message: str = "Upload Mon language tokenizer",
        create_pr: bool = False,
    ) -> bool:
        """
        Upload tokenizer to Hugging Face Hub.

        Steps: validate the tokenizer, make sure the user is authenticated
        (calling ``login()`` if not), create the repository if needed, log
        the files that will be sent, then upload the folder with the ignore
        patterns applied.

        Args:
            repo_id: Repository ID (e.g., "username/model-name")
            private: Whether to create a private repository
            commit_message: Commit message for the upload
            create_pr: Whether to create a pull request instead of direct push

        Returns:
            bool: True if upload successful, False otherwise
        """
        logger.info(f"=== Starting Upload to {repo_id} ===")

        try:
            # Validate tokenizer first
            if not self.validate_tokenizer():
                logger.error("❌ Upload cancelled - validation failed")
                return False

            # Check authentication
            if not self.check_authentication():
                logger.info("Attempting to log in...")
                try:
                    login()
                    if not self.check_authentication():
                        logger.error("❌ Authentication failed")
                        return False
                except Exception as e:
                    logger.error(f"❌ Login failed: {e}")
                    return False

            # Create API client
            api = HfApi()

            # Create/update repository
            logger.info(f"Creating/updating repository: {repo_id}")
            api.create_repo(
                repo_id=repo_id,
                private=private,
                exist_ok=True,
                repo_type="model",
            )
            logger.info("✓ Repository ready")

            # Preview the upload. The walk is recursive because upload_folder
            # sends sub-directories too, not just top-level files.
            ignore_patterns = self._hub_ignore_patterns()
            upload_files = self._collect_upload_files()

            logger.info("Files to be uploaded:")
            for file_path in upload_files:
                relative = file_path.relative_to(self.directory).as_posix()
                size = file_path.stat().st_size
                logger.info(f"  ✓ {relative} ({size:,} bytes)")
            logger.info(f"Total: {len(upload_files)} file(s)")

            # Upload files
            logger.info("Uploading to Hugging Face Hub...")
            api.upload_folder(
                folder_path=str(self.directory),
                repo_id=repo_id,
                commit_message=commit_message,
                ignore_patterns=ignore_patterns,
                create_pr=create_pr,
            )

            hub_url = f"https://huggingface.co/{repo_id}"
            logger.info("🎉 Upload successful!")
            logger.info(f"📍 Repository URL: {hub_url}")

            if create_pr:
                logger.info("📝 Pull request created for review")

            return True

        except Exception as e:
            logger.error(f"❌ Upload failed: {e}")
            return False


def main():
    """Main entry point for the upload script.

    Parses the command line, then either validates the tokenizer
    (``--validate-only``) or uploads it. When ``--repo-id`` is not given the
    user is prompted for the repository ID and visibility; an empty answer
    keeps the command-line/default value, and a closed stdin is treated as
    empty answers. The process exits with status 0 on success and 1 on
    failure.
    """
    import argparse

    default_repo_id = "janakhpon/mon_tokenizer"

    def ask(prompt: str) -> str:
        """Prompt on stdin; return "" when no input is available (EOF)."""
        try:
            return input(prompt).strip()
        except EOFError:
            print()
            return ""

    parser = argparse.ArgumentParser(
        description="Upload Mon tokenizer to Hugging Face Hub"
    )
    parser.add_argument(
        "--repo-id",
        # None means "not given on the command line" and triggers the
        # interactive prompt; the effective default is applied below.
        default=None,
        help="Repository ID (default: janakhpon/mon_tokenizer)",
    )
    parser.add_argument(
        "--directory",
        default=".",
        help="Directory containing tokenizer files (default: current directory)",
    )
    parser.add_argument(
        "--private",
        action="store_true",
        help="Create private repository",
    )
    parser.add_argument(
        "--message",
        default="Upload Mon language tokenizer",
        help="Commit message",
    )
    parser.add_argument(
        "--create-pr",
        action="store_true",
        help="Create pull request instead of direct push",
    )
    parser.add_argument(
        "--validate-only",
        action="store_true",
        help="Only validate tokenizer, don't upload",
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Enable verbose logging",
    )

    args = parser.parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Create uploader
    uploader = TokenizerUploader(directory=args.directory)

    if args.validate_only:
        # Only validate
        success = uploader.validate_tokenizer()
        logger.info("Validation completed.")
    else:
        repo_id = args.repo_id or default_repo_id
        private = args.private

        # Interactive mode if no repo ID provided
        if args.repo_id is None:
            print("\n🤗 Mon Tokenizer Hub Uploader")
            print("=" * 40)

            repo_input = ask(f"Repository ID [{repo_id}]: ")
            if repo_input:
                repo_id = repo_input

            # The prompt's default follows --private, so pressing Enter never
            # silently turns a requested private repository into a public one.
            private_hint = "Y/n" if private else "y/N"
            private_input = ask(f"Private repository? ({private_hint}): ").lower()
            if private_input:
                private = private_input in ("y", "yes")

            print(f"\nUploading to: {repo_id}")
            print(f"Private: {private}")
            print("-" * 40)

        # Upload tokenizer
        success = uploader.upload_to_hub(
            repo_id=repo_id,
            private=private,
            commit_message=args.message,
            create_pr=args.create_pr,
        )

    # The bare exit() helper is injected by the site module and is not
    # guaranteed to exist; raising SystemExit always works.
    raise SystemExit(0 if success else 1)


# Run the CLI only when this file is executed as a script; importing it as a
# module must not start validation or an upload.
if __name__ == "__main__":
    main()