import gradio as gr
import spaces
import torch
from diffusers import DiffusionPipeline
from diffusers.models import AutoencoderKL
from diffusers.schedulers import EulerDiscreteScheduler
from diffusers.utils import export_to_video
import time
from typing import Generator, Optional, Tuple
import gc

# Model configuration
MODEL_ID = "cerspense/zeroscope_v2_576w"  # ~2.5GB text-to-video model with good quality
VAE_ID = "madebyollin/sdxl-vae-fp16-fix"  # compact fp16-safe VAE
SCHEDULER = "EulerDiscreteScheduler"


@spaces.GPU(duration=1500)  # long window: AoT compilation of a multi-GB model
def compile_model():
    """Load and ahead-of-time compile the text-to-video pipeline."""
    print("🚀 Compiling model for ahead-of-time optimization...")

    # Load components
    vae = AutoencoderKL.from_pretrained(VAE_ID, torch_dtype=torch.float16)
    scheduler = EulerDiscreteScheduler.from_pretrained(MODEL_ID, subfolder="scheduler")

    # Create pipeline with optimization
    pipe = DiffusionPipeline.from_pretrained(
        MODEL_ID,
        vae=vae,
        scheduler=scheduler,
        torch_dtype=torch.float16,
        variant="fp16",
        use_safetensors=True,
    )

    # Memory-saving options. Note: enable_model_cpu_offload() conflicts with
    # an explicit pipe.to('cuda'), so slicing is used instead of offloading.
    pipe.enable_vae_slicing()
    pipe.enable_attention_slicing()
    pipe.to("cuda")

    # AoT compilation for a 1.3x-1.8x speedup. Zeroscope's denoiser is a
    # UNet3DConditionModel exposed as pipe.unet (not pipe.transformer), so
    # that is the module to capture, export, and patch.
    with spaces.aoti_capture(pipe.unet) as call:
        pipe("test prompt for compilation", num_frames=6)

    exported = torch.export.export(
        pipe.unet,
        args=call.args,
        kwargs=call.kwargs,
    )
    compiled_model = spaces.aoti_compile(exported)
    spaces.aoti_apply(compiled_model, pipe.unet)

    return pipe


# Initialize the model once at startup
print("🔄 Loading text-to-video model...")
pipe = compile_model()


@spaces.GPU
def generate_video(
    prompt: str,
    num_frames: int = 8,
    width: int = 576,
    height: int = 320,
    num_inference_steps: int = 25,
    guidance_scale: float = 17.5,
    progress: gr.Progress = gr.Progress(),
) -> Generator[Tuple[str, Optional[str]], None, None]:
    """
    Generate a video from a text prompt using the compiled model.

    Args:
        prompt: Text description for video generation
        num_frames: Number of frames in the video (6-16)
        width: Video width (576 recommended for quality)
        height: Video height (320 recommended for quality)
        num_inference_steps: Diffusion steps (20-30 recommended)
        guidance_scale: CFG scale (15-20 recommended)

    Yields:
        Tuple of (status_message, video_path)
    """
    try:
        # Clear GPU cache for optimal performance
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()

        # Validate parameters
        prompt = prompt.strip()
        if not prompt:
            yield "❌ Please enter a text prompt", None
            return
        if not 6 <= num_frames <= 16:
            yield "❌ Number of frames must be between 6-16", None
            return
        if not 200 <= width <= 1024:
            yield "❌ Width must be between 200-1024", None
            return
        if not 200 <= height <= 1024:
            yield "❌ Height must be between 200-1024", None
            return

        yield "🎬 Initializing video generation...", None

        # Report per-step progress through Gradio's progress tracker. A
        # nested function cannot yield on behalf of the outer generator, so
        # the diffusers callback updates gr.Progress directly instead.
        total_steps = num_inference_steps

        def progress_callback(step, timestep, latents):
            progress(
                (step + 1) / total_steps,
                desc=f"🎨 Generating video... ({step + 1}/{total_steps} steps)",
            )

        # Generate video frames
        yield "🔥 Generating video frames...", None
        start_time = time.time()

        # Run inference with optimized settings
        with torch.inference_mode():
            result = pipe(
                prompt=prompt,
                num_frames=num_frames,
                width=width,
                height=height,
                num_inference_steps=num_inference_steps,
                guidance_scale=guidance_scale,
                callback=progress_callback,
                callback_steps=1,
            )

        # Extract the first (and only) video in the batch and write it to an
        # mp4 so Gradio's video component can play it.
        frames = result.frames[0]
        video_path = export_to_video(frames)
        generation_time = time.time() - start_time

        yield f"✅ Video generated in {generation_time:.1f}s!", video_path

    except Exception as e:
        print(f"Error: {e}")
        yield f"❌ Generation failed: {e}", None
    finally:
        # Clean up
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()


def get_recommended_settings() -> dict:
    """Get recommended generation settings."""
    return {
        "num_frames": 8,
        "width": 576,
        "height": 320,
        "num_inference_steps": 25,
        "guidance_scale": 17.5,
    }
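
# The recommended-settings dict maps one-to-one onto the pipeline's keyword
# arguments, so it can drive a direct generation outside the UI. This helper
# is a sketch, not part of the original app: the function name and default
# prompt are assumptions for illustration.
@spaces.GPU
def smoke_test(prompt: str = "a red panda eating bamboo") -> str:
    """Run one generation with the recommended settings; return an mp4 path."""
    settings = get_recommended_settings()
    with torch.inference_mode():
        frames = pipe(prompt=prompt, **settings).frames[0]
    return export_to_video(frames)
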
# Create the Gradio interface
def create_demo():
    """Create the main Gradio demo."""
    # gr.Blocks takes no `description` argument; the tagline goes in the
    # header HTML instead.
    with gr.Blocks(
        title="🚀 Lightning Text-to-Video Generator",
        theme=gr.themes.Soft(),
    ) as demo:
        # Header with anycoder attribution
        gr.HTML("""
            <div style="text-align: center;">
                <h1>🚀 Lightning Text-to-Video Generator</h1>
                <p>Transform your ideas into stunning videos instantly</p>
                <p>⭐ Built with anycoder</p>
            </div>
        """)

        with gr.Column(scale=1):
            status_output = gr.HTML("Ready to generate your video!")

    return demo