import gradio as gr
import spaces
import torch
from diffusers import DiffusionPipeline
from diffusers.models import AutoencoderKL
from diffusers.schedulers import EulerDiscreteScheduler
from diffusers.utils import export_to_video
import time
from typing import Generator, Optional, Tuple
import gc

# Model configuration
MODEL_ID = "cerspense/zeroscope_v2_576w"  # ~2.5GB text-to-video model
VAE_ID = "madebyollin/sdxl-vae-fp16-fix"  # fp16-safe VAE swap (assumes latent compatibility with the base model)
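# Note: per the model card, zeroscope_v2_576w is trained at 576x320 (the
# defaults used below); resolutions far from that tend to degrade quality.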

# Load components at module level; on ZeroGPU, `.to('cuda')` is intercepted
# and the actual device move is deferred until a GPU is attached.
print("πŸ”„ Loading text-to-video model...")
vae = AutoencoderKL.from_pretrained(VAE_ID, torch_dtype=torch.float16)
scheduler = EulerDiscreteScheduler.from_pretrained(MODEL_ID, subfolder="scheduler")

pipe = DiffusionPipeline.from_pretrained(
    MODEL_ID,
    vae=vae,
    scheduler=scheduler,
    torch_dtype=torch.float16,
    variant="fp16",
    use_safetensors=True
)

# Memory-efficient attention and VAE decoding. CPU offload is omitted:
# it conflicts with the explicit `.to('cuda')` below.
pipe.enable_vae_slicing()
pipe.enable_attention_slicing()
pipe.to('cuda')

@spaces.GPU(duration=1500)  # extended duration to cover AoT compilation at startup
def compile_model():
    """Ahead-of-time compile the denoising UNet (zeroscope is UNet-based,
    so the module to export is `pipe.unet`, not `pipe.transformer`)."""
    print("πŸš€ Compiling model for ahead-of-time optimization...")

    # Capture one real forward pass to record example inputs for export
    with spaces.aoti_capture(pipe.unet) as call:
        pipe("test prompt for compilation", num_frames=6)

    exported = torch.export.export(
        pipe.unet,
        args=call.args,
        kwargs=call.kwargs,
    )

    # AoT compilation gives roughly a 1.3x-1.8x speedup on ZeroGPU
    return spaces.aoti_compile(exported)

# Compile once at startup and swap the compiled module into the pipeline
spaces.aoti_apply(compile_model(), pipe.unet)

@spaces.GPU
def generate_video(
    prompt: str,
    num_frames: int = 8,
    width: int = 576,
    height: int = 320,
    num_inference_steps: int = 25,
    guidance_scale: float = 17.5,
    progress: gr.Progress = gr.Progress()
) -> Generator[Tuple[str, Optional[str]], None, None]:
    """
    Generate a video from text prompt using the compiled model.
    
    Args:
        prompt: Text description for video generation
        num_frames: Number of frames in the video (6-16)
        width: Video width (576 recommended for quality)
        height: Video height (320 recommended for quality)
        num_inference_steps: Diffusion steps (20-30 recommended)
        guidance_scale: CFG scale (15-20 recommended)
        
    Yields:
        Tuple of (status_message, video_file_path or None)
    """
    try:
        # Clear GPU cache for optimal performance
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            gc.collect()
        
        # Validate parameters
        prompt = prompt.strip()
        if not prompt:
            yield "❌ Please enter a text prompt", None
            return
            
        if not 6 <= num_frames <= 16:
            yield "❌ Number of frames must be between 6-16", None
            return
            
        if not 200 <= width <= 1024:
            yield "❌ Width must be between 200-1024", None
            return
            
        if not 200 <= height <= 1024:
            yield "❌ Height must be between 200-1024", None
            return
        
        yield "🎬 Initializing video generation...", None
        
        # Set up progress tracking via Gradio's progress bar. A diffusers step
        # callback runs inside pipe(), so it cannot `yield` UI updates from
        # this generator; gr.Progress is the supported channel instead.
        total_steps = num_inference_steps
        current_step = 0
        
        def progress_callback(step, timestep, latents):
            nonlocal current_step
            current_step += 1
            progress(current_step / total_steps,
                     desc=f"🎨 Generating video... ({current_step}/{total_steps} steps)")
        
        # Generate video frames
        yield "πŸ”₯ Generating video frames...", None
        start_time = time.time()
        
        # Run inference with optimized settings
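        # Note: `callback`/`callback_steps` is the legacy diffusers step-callback
        # API; newer diffusers releases replace it with `callback_on_step_end`.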
        with torch.inference_mode():
            result = pipe(
                prompt=prompt,
                num_frames=num_frames,
                width=width,
                height=height,
                num_inference_steps=num_inference_steps,
                guidance_scale=guidance_scale,
                callback=progress_callback,
                callback_steps=1
            )
        
        # Extract frames and write them to an mp4 file: gr.Video expects a
        # file path, not a raw frame array
        frames = result.frames[0]  # first (and only) video in the batch
        video_path = export_to_video(frames)
        generation_time = time.time() - start_time
        
        yield f"βœ… Video generated in {generation_time:.1f}s!", video_path
        
    except Exception as e:
        error_msg = f"❌ Generation failed: {str(e)}"
        yield error_msg, None
        print(f"Error: {e}")
        
    finally:
        # Clean up
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            gc.collect()
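
# Example usage (hypothetical prompt; generate_video streams status strings
# and finally yields the path of the generated mp4):
#   for status, video_path in generate_video("a cat surfing a wave at sunset"):
#       print(status)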

def get_recommended_settings() -> dict:
    """Get recommended generation settings"""
    return {
        "num_frames": 8,
        "width": 576,
        "height": 320,
        "num_inference_steps": 25,
        "guidance_scale": 17.5
    }
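
# Note: clear_all() in the UI unpacks these values positionally, so the key
# order here must stay aligned with its [num_frames, width, height, steps,
# guidance] outputs.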

# Create the Gradio interface
def create_demo():
    """Create the main Gradio demo"""
    
    with gr.Blocks(
        title="πŸš€ Lightning Text-to-Video Generator",
        theme=gr.themes.Soft()
    ) as demo:
        
        # Header with anycoder attribution
        gr.HTML("""
        <div style="text-align: center; padding: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 10px; margin-bottom: 20px;">
            <h1 style="color: white; margin: 0; font-size: 2.5em;">🎬 Lightning Text-to-Video Generator</h1>
            <p style="color: white; margin: 10px 0; font-size: 1.2em;">Transform your ideas into stunning videos instantly</p>
            <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color: #FFD700; text-decoration: none; font-size: 1.1em; font-weight: bold;">
                ⭐ Built with anycoder
            </a>
        </div>
        """)
        
        with gr.Row():
            with gr.Column(scale=1):
                gr.HTML("<h3>πŸ“ Text Prompt</h3>")
                prompt_input = gr.Textbox(
                    label="Describe your video",
                    placeholder="A majestic dragon flying over a mystical forest at sunset, with glowing particles falling from the sky",
                    lines=4,
                    max_length=500
                )
                
                # Quick presets
                gr.HTML("<h3>🎯 Quick Presets</h3>")
                with gr.Row():
                    preset_btn1 = gr.Button("🌊 Nature Scene", variant="secondary", size="sm")
                    preset_btn2 = gr.Button("πŸ™οΈ Urban Scene", variant="secondary", size="sm")
                    preset_btn3 = gr.Button("πŸš€ Sci-Fi", variant="secondary", size="sm")
                    preset_btn4 = gr.Button("🎭 Fantasy", variant="secondary", size="sm")
                
                # Advanced settings
                with gr.Accordion("βš™οΈ Advanced Settings", open=False):
                    num_frames = gr.Slider(
                        minimum=6, maximum=16, value=8, step=1,
                        label="Number of Frames",
                        info="More frames = longer video but slower generation"
                    )
                    
                    with gr.Row():
                        width = gr.Slider(
                            minimum=200, maximum=1024, value=576, step=64,
                            label="Width",
                            info="Video width (576px recommended)"
                        )
                        height = gr.Slider(
                            minimum=200, maximum=1024, value=320, step=64,
                            label="Height", 
                            info="Video height (320px recommended)"
                        )
                    
                    num_inference_steps = gr.Slider(
                        minimum=15, maximum=50, value=25, step=5,
                        label="Generation Steps",
                        info="More steps = better quality but slower"
                    )
                    
                    guidance_scale = gr.Slider(
                        minimum=5, maximum=25, value=17.5, step=0.5,
                        label="Guidance Scale",
                        info="How closely to follow the prompt (15-20 recommended)"
                    )
                
                # Action buttons
                with gr.Row():
                    generate_btn = gr.Button("πŸš€ Generate Video", variant="primary", size="lg")
                    clear_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary")
                
                # Quick settings
                with gr.Row():
                    quality_btn = gr.Button("⚑ Fast", variant="secondary", size="sm")
                    quality_btn2 = gr.Button("🎨 High Quality", variant="secondary", size="sm")
                
                # Status display
                status = gr.HTML("<p style='color: #666;'>Ready to generate your video!</p>")
                
            with gr.Column(scale=1):
                gr.HTML("<h3>πŸŽ₯ Generated Video</h3>")
                video_output = gr.Video(
                    label="Your Generated Video",
                    format="mp4",
                    loop=True,
                    autoplay=True,
                    height=400
                )
                
                # Info panel
                info_panel = gr.HTML("""
                <div style="padding: 15px; background: #f8f9fa; border-radius: 8px; margin-top: 10px;">
                    <h4>πŸ’‘ Tips for Better Results:</h4>
                    <ul style="color: #555; font-size: 0.9em;">
                        <li>Be specific and descriptive in your prompts</li>
                        <li>Use adjectives to describe style, lighting, mood</li>
                        <li>Include camera movements (pan, zoom, rotate)</li>
                        <li>Fast mode: 6-8 frames, 15-20 steps</li>
                        <li>High quality: 10-12 frames, 25-30 steps</li>
                    </ul>
                </div>
                """)
        
        # Preset prompt handlers
        preset_prompts = {
            preset_btn1: "A serene mountain landscape with flowing river, golden hour lighting, birds flying in the sky",
            preset_btn2: "A bustling city street at night with neon lights, cars driving by, people walking",
            preset_btn3: "A futuristic spaceship flying through a galaxy with colorful nebulas and distant stars",
            preset_btn4: "A magical forest with glowing mushrooms, fairy lights dancing, mystical creatures moving"
        }
        
        for btn, preset_text in preset_prompts.items():
            btn.click(
                lambda text=preset_text: gr.update(value=text),
                outputs=prompt_input
            )
        
        # Quality settings
        def apply_fast_settings():
            return 6, 512, 288, 15, 15.0
        
        def apply_quality_settings():
            return 12, 576, 320, 30, 18.0
        
        quality_btn.click(apply_fast_settings, outputs=[num_frames, width, height, num_inference_steps, guidance_scale])
        quality_btn2.click(apply_quality_settings, outputs=[num_frames, width, height, num_inference_steps, guidance_scale])
        
        # Connect events. generate_video is itself a generator, so Gradio
        # streams each (status, video) update to the UI; using it directly as
        # the handler also lets Gradio inject the gr.Progress tracker.
        generate_btn.click(
            generate_video,
            inputs=[prompt_input, num_frames, width, height, num_inference_steps, guidance_scale],
            outputs=[status, video_output]
        )
        
        def clear_all():
            return "", None, *get_recommended_settings().values(), "πŸ—‘οΈ Cleared! Ready for new generation."
        
        clear_btn.click(
            clear_all,
            outputs=[prompt_input, video_output, num_frames, width, height, num_inference_steps, guidance_scale, status]
        )
    
    return demo

# Create and launch the demo
if __name__ == "__main__":
    demo = create_demo()
    
    # Queue requests so GPU work is serialized across users;
    # concurrency_limit is a queue setting, not a launch() argument
    demo.queue(default_concurrency_limit=10)
    
    # Launch with optimized settings
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
        show_error=True,
        quiet=False,
        max_threads=40
    )