import gradio as gr
import spaces
import torch
from diffusers import DiffusionPipeline
from diffusers.models import AutoencoderKL
from diffusers.schedulers import EulerDiscreteScheduler
from diffusers.utils import export_to_video
import time
from typing import Generator, Optional, Tuple
import gc

# Model configuration
MODEL_ID = "cerspense/zeroscope_v2_576w"  # ~2.5GB text-to-video model
VAE_ID = "madebyollin/sdxl-vae-fp16-fix"  # fp16-safe VAE swap (assumes latent compatibility with the base model)
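# Note: per the model card, zeroscope_v2_576w is trained at 576x320 (the
# defaults used below); resolutions far from that tend to degrade quality.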

# Load components at module level; on ZeroGPU, `.to('cuda')` is intercepted
# and the actual device move is deferred until a GPU is attached.
print("πŸ”„ Loading text-to-video model...")
vae = AutoencoderKL.from_pretrained(VAE_ID, torch_dtype=torch.float16)
scheduler = EulerDiscreteScheduler.from_pretrained(MODEL_ID, subfolder="scheduler")

pipe = DiffusionPipeline.from_pretrained(
    MODEL_ID,
    vae=vae,
    scheduler=scheduler,
    torch_dtype=torch.float16,
    variant="fp16",
    use_safetensors=True
)

# Memory-efficient attention and VAE decoding. CPU offload is omitted:
# it conflicts with the explicit `.to('cuda')` below.
pipe.enable_vae_slicing()
pipe.enable_attention_slicing()
pipe.to('cuda')

@spaces.GPU(duration=1500)  # extended duration to cover AoT compilation at startup
def compile_model():
    """Ahead-of-time compile the denoising UNet (zeroscope is UNet-based,
    so the module to export is `pipe.unet`, not `pipe.transformer`)."""
    print("πŸš€ Compiling model for ahead-of-time optimization...")

    # Capture one real forward pass to record example inputs for export
    with spaces.aoti_capture(pipe.unet) as call:
        pipe("test prompt for compilation", num_frames=6)

    exported = torch.export.export(
        pipe.unet,
        args=call.args,
        kwargs=call.kwargs,
    )

    # AoT compilation gives roughly a 1.3x-1.8x speedup on ZeroGPU
    return spaces.aoti_compile(exported)

# Compile once at startup and swap the compiled module into the pipeline
spaces.aoti_apply(compile_model(), pipe.unet)

@spaces.GPU
def generate_video(
    prompt: str,
    num_frames: int = 8,
    width: int = 576,
    height: int = 320,
    num_inference_steps: int = 25,
    guidance_scale: float = 17.5,
    progress: gr.Progress = gr.Progress()
) -> Generator[Tuple[str, Optional[str]], None, None]:
    """
    Generate a video from text prompt using the compiled model.
    
    Args:
        prompt: Text description for video generation
        num_frames: Number of frames in the video (6-16)
        width: Video width (576 recommended for quality)
        height: Video height (320 recommended for quality)
        num_inference_steps: Diffusion steps (20-30 recommended)
        guidance_scale: CFG scale (15-20 recommended)
        
    Yields:
        Tuple of (status_message, video_file_path or None)
    """
    try:
        # Clear GPU cache for optimal performance
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            gc.collect()
        
        # Validate parameters
        prompt = prompt.strip()
        if not prompt:
            yield "❌ Please enter a text prompt", None
            return
            
        if not 6 <= num_frames <= 16:
            yield "❌ Number of frames must be between 6-16", None
            return
            
        if not 200 <= width <= 1024:
            yield "❌ Width must be between 200-1024", None
            return
            
        if not 200 <= height <= 1024:
            yield "❌ Height must be between 200-1024", None
            return
        
        yield "🎬 Initializing video generation...", None
        
        # Set up progress tracking via Gradio's progress bar. A diffusers step
        # callback runs inside pipe(), so it cannot `yield` UI updates from
        # this generator; gr.Progress is the supported channel instead.
        total_steps = num_inference_steps
        current_step = 0
        
        def progress_callback(step, timestep, latents):
            nonlocal current_step
            current_step += 1
            progress(current_step / total_steps,
                     desc=f"🎨 Generating video... ({current_step}/{total_steps} steps)")
        
        # Generate video frames
        yield "πŸ”₯ Generating video frames...", None
        start_time = time.time()
        
        # Run inference with optimized settings
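        # Note: `callback`/`callback_steps` is the legacy diffusers step-callback
        # API; newer diffusers releases replace it with `callback_on_step_end`.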
        with torch.inference_mode():
            result = pipe(
                prompt=prompt,
                num_frames=num_frames,
                width=width,
                height=height,
                num_inference_steps=num_inference_steps,
                guidance_scale=guidance_scale,
                callback=progress_callback,
                callback_steps=1
            )
        
        # Extract frames and write them to an mp4 file: gr.Video expects a
        # file path, not a raw frame array
        frames = result.frames[0]  # first (and only) video in the batch
        video_path = export_to_video(frames)
        generation_time = time.time() - start_time
        
        yield f"βœ… Video generated in {generation_time:.1f}s!", video_path
        
    except Exception as e:
        error_msg = f"❌ Generation failed: {str(e)}"
        yield error_msg, None
        print(f"Error: {e}")
        
    finally:
        # Clean up
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            gc.collect()
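
# Example usage (hypothetical prompt; generate_video streams status strings
# and finally yields the path of the generated mp4):
#   for status, video_path in generate_video("a cat surfing a wave at sunset"):
#       print(status)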

def get_recommended_settings() -> dict:
    """Get recommended generation settings"""
    return {
        "num_frames": 8,
        "width": 576,
        "height": 320,
        "num_inference_steps": 25,
        "guidance_scale": 17.5
    }
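
# Note: clear_all() in the UI unpacks these values positionally, so the key
# order here must stay aligned with its [num_frames, width, height, steps,
# guidance] outputs.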

# Create the Gradio interface
def create_demo():
    """Create the main Gradio demo"""
    
    with gr.Blocks(
        title="πŸš€ Lightning Text-to-Video Generator",
        theme=gr.themes.Soft()
    ) as demo:
        
        # Header with anycoder attribution
        gr.HTML("""
        <div style="text-align: center; padding: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 10px; margin-bottom: 20px;">
            <h1 style="color: white; margin: 0; font-size: 2.5em;">🎬 Lightning Text-to-Video Generator</h1>
            <p style="color: white; margin: 10px 0; font-size: 1.2em;">Transform your ideas into stunning videos instantly</p>
            <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color: #FFD700; text-decoration: none; font-size: 1.1em; font-weight: bold;">
                ⭐ Built with anycoder
            </a>
        </div>
        """)
        
        with gr.Row():
            with gr.Column(scale=1):
                gr.HTML("<h3>πŸ“ Text Prompt</h3>")
                prompt_input = gr.Textbox(
                    label="Describe your video",
                    placeholder="A majestic dragon flying over a mystical forest at sunset, with glowing particles falling from the sky",
                    lines=4,
                    max_length=500
                )
                
                # Quick presets
                gr.HTML("<h3>🎯 Quick Presets</h3>")
                with gr.Row():
                    preset_btn1 = gr.Button("🌊 Nature Scene", variant="secondary", size="sm")
                    preset_btn2 = gr.Button("πŸ™οΈ Urban Scene", variant="secondary", size="sm")
                    preset_btn3 = gr.Button("πŸš€ Sci-Fi", variant="secondary", size="sm")
                    preset_btn4 = gr.Button("🎭 Fantasy", variant="secondary", size="sm")
                
                # Advanced settings
                with gr.Accordion("βš™οΈ Advanced Settings", open=False):
                    num_frames = gr.Slider(
                        minimum=6, maximum=16, value=8, step=1,
                        label="Number of Frames",
                        info="More frames = longer video but slower generation"
                    )
                    
                    with gr.Row():
                        width = gr.Slider(
                            minimum=200, maximum=1024, value=576, step=64,
                            label="Width",
                            info="Video width (576px recommended)"
                        )
                        height = gr.Slider(
                            minimum=200, maximum=1024, value=320, step=64,
                            label="Height", 
                            info="Video height (320px recommended)"
                        )
                    
                    num_inference_steps = gr.Slider(
                        minimum=15, maximum=50, value=25, step=5,
                        label="Generation Steps",
                        info="More steps = better quality but slower"
                    )
                    
                    guidance_scale = gr.Slider(
                        minimum=5, maximum=25, value=17.5, step=0.5,
                        label="Guidance Scale",
                        info="How closely to follow the prompt (15-20 recommended)"
                    )
                
                # Action buttons
                with gr.Row():
                    generate_btn = gr.Button("πŸš€ Generate Video", variant="primary", size="lg")
                    clear_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary")
                
                # Quick settings
                with gr.Row():
                    quality_btn = gr.Button("⚑ Fast", variant="secondary", size="sm")
                    quality_btn2 = gr.Button("🎨 High Quality", variant="secondary", size="sm")
                
                # Status display
                status = gr.HTML("<p style='color: #666;'>Ready to generate your video!</p>")
                
            with gr.Column(scale=1):
                gr.HTML("<h3>πŸŽ₯ Generated Video</h3>")
                video_output = gr.Video(
                    label="Your Generated Video",
                    format="mp4",
                    loop=True,
                    autoplay=True,
                    height=400
                )
                
                # Info panel
                info_panel = gr.HTML("""
                <div style="padding: 15px; background: #f8f9fa; border-radius: 8px; margin-top: 10px;">
                    <h4>πŸ’‘ Tips for Better Results:</h4>
                    <ul style="color: #555; font-size: 0.9em;">
                        <li>Be specific and descriptive in your prompts</li>
                        <li>Use adjectives to describe style, lighting, mood</li>
                        <li>Include camera movements (pan, zoom, rotate)</li>
                        <li>Fast mode: 6-8 frames, 15-20 steps</li>
                        <li>High quality: 10-12 frames, 25-30 steps</li>
                    </ul>
                </div>
                """)
        
        # Preset prompt handlers
        preset_prompts = {
            preset_btn1: "A serene mountain landscape with flowing river, golden hour lighting, birds flying in the sky",
            preset_btn2: "A bustling city street at night with neon lights, cars driving by, people walking",
            preset_btn3: "A futuristic spaceship flying through a galaxy with colorful nebulas and distant stars",
            preset_btn4: "A magical forest with glowing mushrooms, fairy lights dancing, mystical creatures moving"
        }
        
        for btn, preset_text in preset_prompts.items():
            btn.click(
                lambda text=preset_text: gr.update(value=text),
                outputs=prompt_input
            )
        
        # Quality settings
        def apply_fast_settings():
            return 6, 512, 288, 15, 15.0
        
        def apply_quality_settings():
            return 12, 576, 320, 30, 18.0
        
        quality_btn.click(apply_fast_settings, outputs=[num_frames, width, height, num_inference_steps, guidance_scale])
        quality_btn2.click(apply_quality_settings, outputs=[num_frames, width, height, num_inference_steps, guidance_scale])
        
        # Connect events. generate_video is itself a generator, so Gradio
        # streams each (status, video) update to the UI; using it directly as
        # the handler also lets Gradio inject the gr.Progress tracker.
        generate_btn.click(
            generate_video,
            inputs=[prompt_input, num_frames, width, height, num_inference_steps, guidance_scale],
            outputs=[status, video_output]
        )
        
        def clear_all():
            return "", None, *get_recommended_settings().values(), "πŸ—‘οΈ Cleared! Ready for new generation."
        
        clear_btn.click(
            clear_all,
            outputs=[prompt_input, video_output, num_frames, width, height, num_inference_steps, guidance_scale, status]
        )
    
    return demo

# Create and launch the demo
if __name__ == "__main__":
    demo = create_demo()
    
    # Queue requests so GPU work is serialized across users;
    # concurrency_limit is a queue setting, not a launch() argument
    demo.queue(default_concurrency_limit=10)
    
    # Launch with optimized settings
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
        show_error=True,
        quiet=False,
        max_threads=40
    )