import gradio as gr
import spaces
import torch
from diffusers import DiffusionPipeline
from diffusers.models import AutoencoderKL
from diffusers.schedulers import EulerDiscreteScheduler
from diffusers.utils import export_to_video
import time
from typing import Generator, Optional, Tuple
import gc

# Model configuration
MODEL_ID = "cerspense/zeroscope_v2_576w"  # ~2.5GB text-to-video model with good quality
VAE_ID = "madebyollin/sdxl-vae-fp16-fix"  # compact fp16-safe VAE
SCHEDULER = "EulerDiscreteScheduler"


@spaces.GPU(duration=1500)  # long window: AoT compilation of a multi-GB model
def compile_model():
    """Load and ahead-of-time compile the text-to-video pipeline."""
    print("🚀 Compiling model for ahead-of-time optimization...")

    # Load components
    vae = AutoencoderKL.from_pretrained(VAE_ID, torch_dtype=torch.float16)
    scheduler = EulerDiscreteScheduler.from_pretrained(MODEL_ID, subfolder="scheduler")

    # Create pipeline with optimization
    pipe = DiffusionPipeline.from_pretrained(
        MODEL_ID,
        vae=vae,
        scheduler=scheduler,
        torch_dtype=torch.float16,
        variant="fp16",
        use_safetensors=True,
    )

    # Memory-saving options. Note: enable_model_cpu_offload() conflicts with
    # an explicit pipe.to('cuda'), so slicing is used instead of offloading.
    pipe.enable_vae_slicing()
    pipe.enable_attention_slicing()
    pipe.to("cuda")

    # AoT compilation for a 1.3x-1.8x speedup. Zeroscope's denoiser is a
    # UNet3DConditionModel exposed as pipe.unet (not pipe.transformer), so
    # that is the module to capture, export, and patch.
    with spaces.aoti_capture(pipe.unet) as call:
        pipe("test prompt for compilation", num_frames=6)

    exported = torch.export.export(
        pipe.unet,
        args=call.args,
        kwargs=call.kwargs,
    )
    compiled_model = spaces.aoti_compile(exported)
    spaces.aoti_apply(compiled_model, pipe.unet)

    return pipe


# Initialize the model once at startup
print("🔄 Loading text-to-video model...")
pipe = compile_model()


@spaces.GPU
def generate_video(
    prompt: str,
    num_frames: int = 8,
    width: int = 576,
    height: int = 320,
    num_inference_steps: int = 25,
    guidance_scale: float = 17.5,
    progress: gr.Progress = gr.Progress(),
) -> Generator[Tuple[str, Optional[str]], None, None]:
    """
    Generate a video from a text prompt using the compiled model.

    Args:
        prompt: Text description for video generation
        num_frames: Number of frames in the video (6-16)
        width: Video width (576 recommended for quality)
        height: Video height (320 recommended for quality)
        num_inference_steps: Diffusion steps (20-30 recommended)
        guidance_scale: CFG scale (15-20 recommended)

    Yields:
        Tuple of (status_message, video_path)
    """
    try:
        # Clear GPU cache for optimal performance
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()

        # Validate parameters
        prompt = prompt.strip()
        if not prompt:
            yield "❌ Please enter a text prompt", None
            return
        if not 6 <= num_frames <= 16:
            yield "❌ Number of frames must be between 6-16", None
            return
        if not 200 <= width <= 1024:
            yield "❌ Width must be between 200-1024", None
            return
        if not 200 <= height <= 1024:
            yield "❌ Height must be between 200-1024", None
            return

        yield "🎬 Initializing video generation...", None

        # Report per-step progress through Gradio's progress tracker. A
        # nested function cannot yield on behalf of the outer generator, so
        # the diffusers callback updates gr.Progress directly instead.
        total_steps = num_inference_steps

        def progress_callback(step, timestep, latents):
            progress(
                (step + 1) / total_steps,
                desc=f"🎨 Generating video... ({step + 1}/{total_steps} steps)",
            )

        # Generate video frames
        yield "🔥 Generating video frames...", None
        start_time = time.time()

        # Run inference with optimized settings
        with torch.inference_mode():
            result = pipe(
                prompt=prompt,
                num_frames=num_frames,
                width=width,
                height=height,
                num_inference_steps=num_inference_steps,
                guidance_scale=guidance_scale,
                callback=progress_callback,
                callback_steps=1,
            )

        # Extract the first (and only) video in the batch and write it to an
        # mp4 so Gradio's video component can play it.
        frames = result.frames[0]
        video_path = export_to_video(frames)
        generation_time = time.time() - start_time

        yield f"✅ Video generated in {generation_time:.1f}s!", video_path

    except Exception as e:
        print(f"Error: {e}")
        yield f"❌ Generation failed: {e}", None
    finally:
        # Clean up
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()


def get_recommended_settings() -> dict:
    """Get recommended generation settings."""
    return {
        "num_frames": 8,
        "width": 576,
        "height": 320,
        "num_inference_steps": 25,
        "guidance_scale": 17.5,
    }
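
# The recommended-settings dict maps one-to-one onto the pipeline's keyword
# arguments, so it can drive a direct generation outside the UI. This helper
# is a sketch, not part of the original app: the function name and default
# prompt are assumptions for illustration.
@spaces.GPU
def smoke_test(prompt: str = "a red panda eating bamboo") -> str:
    """Run one generation with the recommended settings; return an mp4 path."""
    settings = get_recommended_settings()
    with torch.inference_mode():
        frames = pipe(prompt=prompt, **settings).frames[0]
    return export_to_video(frames)
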
# Create the Gradio interface
def create_demo():
    """Create the main Gradio demo."""
    # gr.Blocks takes no `description` argument; the tagline goes in the
    # header HTML instead.
    with gr.Blocks(
        title="🚀 Lightning Text-to-Video Generator",
        theme=gr.themes.Soft(),
    ) as demo:
        # Header with anycoder attribution
        gr.HTML("""
            <div style="text-align: center;">
                <h1>🚀 Lightning Text-to-Video Generator</h1>
                <p>Transform your ideas into stunning videos instantly</p>
                <p>⭐ Built with anycoder</p>
            </div>
        """)

        with gr.Column(scale=1):
            status_output = gr.HTML("Ready to generate your video!")

    return demo