import gradio as gr
import spaces
import torch
import numpy as np
from diffusers import DiffusionPipeline
from diffusers.models import AutoencoderKL
from diffusers.schedulers import EulerDiscreteScheduler
from diffusers.utils import export_to_video
import os
import time
from PIL import Image
from typing import Generator, Optional, Tuple
import gc
# Model configuration
MODEL_ID = "cerspense/zeroscope_v2_576w" # 2.5GB model with good quality
VAE_ID = "madebyollin/sdxl-vae-fp16-fix" # Compact VAE
SCHEDULER = "EulerDiscreteScheduler"
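# Note: zeroscope_v2_576w is a UNet-based text-to-video diffusion model tuned for 576x320
# output, which is why those dimensions are the recommended defaults below. Pairing it with
# the SDXL fp16-fix VAE is a choice made by this app, not something the base model requires.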
@spaces.GPU(duration=1500)  # extended duration so ahead-of-time compilation can finish at startup
def compile_model():
"""Compile the text-to-video model for optimal performance"""
print("πŸš€ Compiling model for ahead-of-time optimization...")
# Load components
vae = AutoencoderKL.from_pretrained(VAE_ID, torch_dtype=torch.float16)
scheduler = EulerDiscreteScheduler.from_pretrained(MODEL_ID, subfolder="scheduler")
# Create pipeline with optimization
pipe = DiffusionPipeline.from_pretrained(
MODEL_ID,
vae=vae,
scheduler=scheduler,
torch_dtype=torch.float16,
variant="fp16",
use_safetensors=True
)
    # Memory optimizations: CPU offload plus VAE/attention slicing reduce peak VRAM
pipe.enable_model_cpu_offload()
pipe.enable_vae_slicing()
pipe.enable_attention_slicing()
    # AoT compilation for a 1.3x-1.8x speedup: capture one example call, export the module,
    # compile it, then patch the compiled artifact back into the pipeline.
    # zeroscope_v2_576w is UNet-based, so the module to compile is pipe.unet (there is no transformer).
    with spaces.aoti_capture(pipe.unet) as call:
        pipe("test prompt for compilation", num_frames=6)

    exported = torch.export.export(
        pipe.unet,
        args=call.args,
        kwargs=call.kwargs,
    )
    compiled_model = spaces.aoti_compile(exported)
    spaces.aoti_apply(compiled_model, pipe.unet)

    return pipe
# Initialize and compile the model once at startup; enable_model_cpu_offload() inside
# compile_model() already manages device placement, so no explicit pipe.to('cuda') is needed here.
print("πŸ”„ Loading text-to-video model...")
pipe = compile_model()
@spaces.GPU
def generate_video(
prompt: str,
num_frames: int = 8,
width: int = 576,
height: int = 320,
num_inference_steps: int = 25,
guidance_scale: float = 17.5,
progress: gr.Progress = gr.Progress()
) -> Generator[Tuple[str, Optional[str]], None, None]:
"""
Generate a video from text prompt using the compiled model.
Args:
prompt: Text description for video generation
num_frames: Number of frames in the video (6-16)
width: Video width (576 recommended for quality)
height: Video height (320 recommended for quality)
num_inference_steps: Diffusion steps (20-30 recommended)
guidance_scale: CFG scale (15-20 recommended)
Yields:
        Tuple of (status_message, path to the generated mp4 file or None)
"""
try:
# Clear GPU cache for optimal performance
if torch.cuda.is_available():
torch.cuda.empty_cache()
gc.collect()
# Validate parameters
prompt = prompt.strip()
if not prompt:
yield "❌ Please enter a text prompt", None
return
if not 6 <= num_frames <= 16:
yield "❌ Number of frames must be between 6-16", None
return
if not 200 <= width <= 1024:
yield "❌ Width must be between 200-1024", None
return
if not 200 <= height <= 1024:
yield "❌ Height must be between 200-1024", None
return
yield "🎬 Initializing video generation...", None
        # Report denoising progress through Gradio's progress bar; a pipeline callback runs
        # inside pipe() and cannot yield to this outer generator directly.
        def progress_callback(step, timestep, latents):
            progress((step + 1) / num_inference_steps, desc=f"Denoising step {step + 1}/{num_inference_steps}")
# Generate video frames
yield "πŸ”₯ Generating video frames...", None
start_time = time.time()
# Run inference with optimized settings
with torch.inference_mode():
result = pipe(
prompt=prompt,
num_frames=num_frames,
width=width,
height=height,
num_inference_steps=num_inference_steps,
guidance_scale=guidance_scale,
callback=progress_callback,
callback_steps=1
)
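            # Note: callback/callback_steps is the legacy diffusers callback API; newer releases
            # deprecate it in favor of callback_on_step_end but generally still accept it.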
        # Convert the generated frames to an mp4 file so gr.Video can display it
        frames = result.frames[0]  # first (and only) video in the batch
        video_path = export_to_video(frames, fps=8)
        generation_time = time.time() - start_time
        yield f"βœ… Video generated in {generation_time:.1f}s!", video_path
except Exception as e:
error_msg = f"❌ Generation failed: {str(e)}"
yield error_msg, None
print(f"Error: {e}")
finally:
# Clean up
if torch.cuda.is_available():
torch.cuda.empty_cache()
gc.collect()
def get_recommended_settings() -> dict:
"""Get recommended generation settings"""
return {
"num_frames": 8,
"width": 576,
"height": 320,
"num_inference_steps": 25,
"guidance_scale": 17.5
}
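# Illustrative sketch (not executed): the recommended settings map directly onto pipeline
# keyword arguments, so an equivalent direct call would look roughly like this
# (the prompt and output filename are made up for the example):
#
#     settings = get_recommended_settings()
#     result = pipe("a red fox running through fresh snow", **settings)
#     export_to_video(result.frames[0], "fox.mp4", fps=8)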
# Create the Gradio interface
def create_demo():
"""Create the main Gradio demo"""
    with gr.Blocks(
        title="πŸš€ Lightning Text-to-Video Generator",
        theme=gr.themes.Soft()
    ) as demo:
# Header with anycoder attribution
gr.HTML("""
<div style="text-align: center; padding: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 10px; margin-bottom: 20px;">
<h1 style="color: white; margin: 0; font-size: 2.5em;">🎬 Lightning Text-to-Video Generator</h1>
<p style="color: white; margin: 10px 0; font-size: 1.2em;">Transform your ideas into stunning videos instantly</p>
<a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color: #FFD700; text-decoration: none; font-size: 1.1em; font-weight: bold;">
⭐ Built with anycoder
</a>
</div>
""")
with gr.Row():
with gr.Column(scale=1):
gr.HTML("<h3>πŸ“ Text Prompt</h3>")
prompt_input = gr.Textbox(
label="Describe your video",
placeholder="A majestic dragon flying over a mystical forest at sunset, with glowing particles falling from the sky",
lines=4,
max_length=500
)
# Quick presets
gr.HTML("<h3>🎯 Quick Presets</h3>")
with gr.Row():
preset_btn1 = gr.Button("🌊 Nature Scene", variant="secondary", size="sm")
preset_btn2 = gr.Button("πŸ™οΈ Urban Scene", variant="secondary", size="sm")
preset_btn3 = gr.Button("πŸš€ Sci-Fi", variant="secondary", size="sm")
preset_btn4 = gr.Button("🎭 Fantasy", variant="secondary", size="sm")
# Advanced settings
with gr.Accordion("βš™οΈ Advanced Settings", open=False):
num_frames = gr.Slider(
minimum=6, maximum=16, value=8, step=1,
label="Number of Frames",
info="More frames = longer video but slower generation"
)
with gr.Row():
width = gr.Slider(
minimum=200, maximum=1024, value=576, step=64,
label="Width",
info="Video width (576px recommended)"
)
height = gr.Slider(
minimum=200, maximum=1024, value=320, step=64,
label="Height",
info="Video height (320px recommended)"
)
num_inference_steps = gr.Slider(
minimum=15, maximum=50, value=25, step=5,
label="Generation Steps",
info="More steps = better quality but slower"
)
guidance_scale = gr.Slider(
minimum=5, maximum=25, value=17.5, step=0.5,
label="Guidance Scale",
info="How closely to follow the prompt (15-20 recommended)"
)
# Action buttons
with gr.Row():
generate_btn = gr.Button("πŸš€ Generate Video", variant="primary", size="lg")
clear_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary")
# Quick settings
with gr.Row():
quality_btn = gr.Button("⚑ Fast", variant="secondary", size="sm")
quality_btn2 = gr.Button("🎨 High Quality", variant="secondary", size="sm")
# Status display
status = gr.HTML("<p style='color: #666;'>Ready to generate your video!</p>")
with gr.Column(scale=1):
gr.HTML("<h3>πŸŽ₯ Generated Video</h3>")
video_output = gr.Video(
label="Your Generated Video",
format="mp4",
loop=True,
autoplay=True,
height=400
)
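                # gr.Video displays a file path; generate_video yields the mp4 path produced
                # by export_to_video, which is what ends up here.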
# Info panel
info_panel = gr.HTML("""
<div style="padding: 15px; background: #f8f9fa; border-radius: 8px; margin-top: 10px;">
<h4>πŸ’‘ Tips for Better Results:</h4>
<ul style="color: #555; font-size: 0.9em;">
<li>Be specific and descriptive in your prompts</li>
<li>Use adjectives to describe style, lighting, mood</li>
<li>Include camera movements (pan, zoom, rotate)</li>
<li>Fast mode: 6-8 frames, 15-20 steps</li>
<li>High quality: 10-12 frames, 25-30 steps</li>
</ul>
</div>
""")
# Preset prompt handlers
preset_prompts = {
preset_btn1: "A serene mountain landscape with flowing river, golden hour lighting, birds flying in the sky",
preset_btn2: "A bustling city street at night with neon lights, cars driving by, people walking",
preset_btn3: "A futuristic spaceship flying through a galaxy with colorful nebulas and distant stars",
preset_btn4: "A magical forest with glowing mushrooms, fairy lights dancing, mystical creatures moving"
}
for btn, preset_text in preset_prompts.items():
btn.click(
lambda text=preset_text: gr.update(value=text),
outputs=prompt_input
)
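        # The default argument (text=preset_text) captures each preset at definition time;
        # a plain closure over preset_text would late-bind and make every button insert the last preset.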
# Quality settings
def apply_fast_settings():
return 6, 512, 288, 15, 15.0
def apply_quality_settings():
return 12, 576, 320, 30, 18.0
quality_btn.click(apply_fast_settings, outputs=[num_frames, width, height, num_inference_steps, guidance_scale])
quality_btn2.click(apply_quality_settings, outputs=[num_frames, width, height, num_inference_steps, guidance_scale])
        # Main generation handler: stream (status, video) updates straight from generate_video.
        # gr.Progress is injected into this registered event handler and passed through.
        def handle_generate(prompt, num_frames, width, height, steps, guidance, progress=gr.Progress()):
            yield from generate_video(prompt, num_frames, width, height, steps, guidance, progress=progress)
# Connect events
generate_btn.click(
handle_generate,
inputs=[prompt_input, num_frames, width, height, num_inference_steps, guidance_scale],
outputs=[status, video_output]
)
def clear_all():
return "", None, *get_recommended_settings().values(), "πŸ—‘οΈ Cleared! Ready for new generation."
clear_btn.click(
clear_all,
outputs=[prompt_input, video_output, num_frames, width, height, num_inference_steps, guidance_scale, status]
)
return demo
# Create and launch the demo
if __name__ == "__main__":
demo = create_demo()
    # Queue requests so concurrency is limited (concurrency_limit is a queue setting in
    # Gradio 4, not a launch() argument), then launch with optimized settings
    demo.queue(default_concurrency_limit=10)
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
        quiet=False,
        max_threads=40
    )