Spaces:

Bils
/

ShortiFoley

Running on Zero

App Files Files Community

Bils committed on Aug 31

Commit

22d96d3

verified ·

1 Parent(s): 4588e7b

Update app.py

Browse files

Files changed (1) hide show

app.py +436 -425

app.py CHANGED Viewed

@@ -1,463 +1,474 @@
-import os, sys, json, tempfile, subprocess, shutil, uuid, glob, traceback, datetime
-from pathlib import Path
-from typing import Tuple, List
-# ========= Crash trap & env =========
-import faulthandler
-faulthandler.enable()
-os.environ.setdefault("GRADIO_ANALYTICS_ENABLED", "false")
-os.environ.setdefault("GRADIO_NUM_PORTS", "1")
-os.environ.setdefault("HF_HUB_VERBOSE", "1")
-os.environ.setdefault("TRANSFORMERS_VERBOSITY", "info")
-os.environ.setdefault("PYTHONUNBUFFERED", "1")
-def _crash_trap(exctype, value, tb):
-    ts = datetime.datetime.utcnow().isoformat()
-    print(f"\n===== FATAL ({ts}Z) =====================================")
-    traceback.print_exception(exctype, value, tb)
-    print("=========================================================\n", flush=True)
-sys.excepthook = _crash_trap
-# ========= Minimal imports for startup =========
 import gradio as gr
-from spaces import GPU  # ensure checker can see decorator
 from loguru import logger
-# ---- ZeroGPU marker FIRST (so startup detector finds it) ----
-@GPU(duration=5)
-def _zgpu_marker(_: int = 0) -> int:
-    """No-op; only to advertise a GPU-decorated function at import-time."""
-    return _
-# ========= Paths & Configs =========
-ROOT = Path(__file__).parent.resolve()
-REPO_DIR = ROOT / "HunyuanVideo-Foley"
-WEIGHTS_DIR = ROOT / "weights"
-CACHE_DIR = ROOT / "cache"
-OUT_DIR = ROOT / "outputs"
-ASSETS = ROOT / "assets"
-for p in (ASSETS, WEIGHTS_DIR, CACHE_DIR, OUT_DIR):
-    p.mkdir(parents=True, exist_ok=True)
-APP_TITLE   = os.environ.get("APP_TITLE", "Foley Studio · ZeroGPU")
-APP_TAGLINE = os.environ.get("APP_TAGLINE", "Generate scene-true foley for short clips (ZeroGPU-ready).")
-PRIMARY_COLOR = os.environ.get("PRIMARY_COLOR", "#6B5BFF")
-# ZeroGPU-friendly defaults
-MAX_SECS = int(os.environ.get("MAX_SECS", "15"))
-TARGET_H = int(os.environ.get("TARGET_H", "480"))
-SR       = int(os.environ.get("TARGET_SR", "48000"))
-ZEROGPU_DURATION = int(os.environ.get("ZEROGPU_DURATION", "110"))
-# ========= Light utils (safe at import) =========
-def sh(cmd: str):
-    print(">>", cmd)
-    subprocess.run(cmd, shell=True, check=True)
-def ffprobe_duration(path: str) -> float:
-    try:
-        out = subprocess.check_output([
-            "ffprobe", "-v", "error", "-show_entries", "format=duration",
-            "-of", "default=noprint_wrappers=1:nokey=1", path
-        ]).decode().strip()
-        return float(out)
-    except Exception:
-        return 0.0
-def _clone_without_lfs():
-    if REPO_DIR.exists():
-        return
-    try:
-        sh(
-            "GIT_LFS_SKIP_SMUDGE=1 "
-            "git -c filter.lfs.smudge= -c filter.lfs.required=false "
-            f"clone --depth 1 https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley.git {REPO_DIR}"
-        )
-        assets = REPO_DIR / "assets"
-        if assets.exists():
-            shutil.rmtree(assets, ignore_errors=True)
         return
-    except subprocess.CalledProcessError as e:
-        print("Shallow clone with LFS skipped failed, trying sparse checkout…", e)
-    REPO_DIR.mkdir(parents=True, exist_ok=True)
-    sh(f"git -C {REPO_DIR} init")
-    sh(
-        f"git -C {REPO_DIR} -c filter.lfs.smudge= -c filter.lfs.required=false "
-        "remote add origin https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley.git"
     )
-    sh(f"git -C {REPO_DIR} config core.sparseCheckout true")
-    sparse_file = REPO_DIR / ".git" / "info" / "sparse-checkout"
-    sparse_file.parent.mkdir(parents=True, exist_ok=True)
-    sparse_file.write_text("\n".join([
-        "hunyuanvideo_foley/",
-        "configs/",
-        "gradio_app.py",
-        "requirements.txt",
-        "LICENSE",
-        "README.md",
-    ]) + "\n")
-    try:
-        sh(f"git -C {REPO_DIR} fetch --depth 1 origin main")
-        sh(f"git -C {REPO_DIR} checkout main")
-    except subprocess.CalledProcessError:
-        sh(f"git -C {REPO_DIR} fetch --depth 1 origin master")
-        sh(f"git -C {REPO_DIR} checkout master")
-def prepare_code_and_weights():
-    from huggingface_hub import snapshot_download
-    _clone_without_lfs()
-    if str(REPO_DIR) not in sys.path:
-        sys.path.insert(0, str(REPO_DIR))
     snapshot_download(
         repo_id="tencent/HunyuanVideo-Foley",
-        local_dir=str(WEIGHTS_DIR),
-        local_dir_use_symlinks=False,
-        repo_type="model",
         resume_download=True,
     )
-    os.environ["HIFI_FOLEY_MODEL_PATH"] = str(WEIGHTS_DIR)
-# Do lightweight prep (no model init) at import-time
-prepare_code_and_weights()
-# Prefer safetensors & fast transfer for later downloads
-os.environ["TRANSFORMERS_PREFER_SAFETENSORS"] = "1"
-os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
-# ========= Heavy deps & model utilities (deferred import) =========
-_model_dict = None
-_cfg = None
-_device = None
-def _lazy_heavy_imports():
-    global torch, torchaudio
-    import torch, torchaudio  # noqa
-    try:
-        import audiotools  # provided by 'descript-audiotools'
-    except Exception as e:
-        raise RuntimeError(
-            "Missing 'audiotools'. Add 'descript-audiotools>=0.7.2' to requirements.txt."
-        ) from e
-    try:
-        import omegaconf  # noqa
-        import yaml       # noqa
-        import easydict   # noqa
-    except Exception as e:
-        raise RuntimeError(
-            "Missing config deps. Add: omegaconf>=2.3.0, pyyaml, easydict."
-        ) from e
-    # Tencent internals
-    from hunyuanvideo_foley.utils.model_utils import load_model, denoise_process  # noqa
-    from hunyuanvideo_foley.utils.feature_utils import feature_process           # noqa
-    from hunyuanvideo_foley.utils.media_utils import merge_audio_video           # noqa
-    return torch, torchaudio
-def _ensure_clap_safetensors_only():
-    from huggingface_hub import snapshot_download
-    # Pre-cache only safetensors; block .bin selection
-    snapshot_download(
-        repo_id="laion/larger_clap_general",
-        allow_patterns=[
-            "*.safetensors", "config.json", "*.json", "*.txt",
-            "tokenizer*", "*merges*", "*vocab*"
-        ],
-        ignore_patterns=["*.bin"],
-        resume_download=True,
-        local_dir=None,
-        local_dir_use_symlinks=False,
-    )
-    # Purge any cached .bin for the model
-    cache_root = Path.home() / ".cache" / "huggingface" / "hub"
-    for pat in [cache_root / "models--laion--larger_clap_general" / "snapshots" / "*" / "*.bin"]:
-        for f in glob.glob(str(pat)):
-            try:
-                Path(f).unlink()
-                print(f">> Purged cached bin: {f}")
-            except Exception:
-                pass
-def _setup_device(device_str: str = "auto", gpu_id: int = 0):
-    import torch
-    if device_str == "auto":
-        if torch.cuda.is_available():
-            d = torch.device(f"cuda:{gpu_id}")
-            logger.info(f"Using CUDA {d}")
-        elif torch.backends.mps.is_available():
-            d = torch.device("mps")
-            logger.info("Using MPS")
-        else:
-            d = torch.device("cpu")
-            logger.info("Using CPU")
-    else:
-        d = torch.device(device_str if device_str != "cuda" else f"cuda:{gpu_id}")
-        logger.info(f"Using specified device: {d}")
-    return d
 def auto_load_models() -> str:
-    """Load the full Tencent pipeline (lazy; call when needed)."""
     global _model_dict, _cfg, _device
-    if _model_dict is not None:
-        return "✅ Model already loaded"
-    # Imports & guards
-    torch, _ = _lazy_heavy_imports()
-    MODEL_PATH = os.environ.get("HIFI_FOLEY_MODEL_PATH", str(WEIGHTS_DIR))
-    CONFIG_PATH = str(REPO_DIR / "configs" / "hunyuanvideo-foley-xxl.yaml")
-    if not os.path.exists(CONFIG_PATH):
-        return f"❌ Config file not found: {CONFIG_PATH}"
     _device = _setup_device("auto", 0)
     logger.info("Loading HunyuanVideo-Foley model...")
-    logger.info(f"MODEL_PATH:  {MODEL_PATH}")
     logger.info(f"CONFIG_PATH: {CONFIG_PATH}")
-    # Force CLAP to safetensors path
-    _ensure_clap_safetensors_only()
-    os.environ["HF_HUB_OFFLINE"] = "1"
-    os.environ["TRANSFORMERS_OFFLINE"] = "1"
-    from hunyuanvideo_foley.utils.model_utils import load_model
-    _model_dict, _cfg = load_model(MODEL_PATH, CONFIG_PATH, _device)
-    logger.info("✅ Model loaded")
-    return "✅ Model loaded"
-# ========= Pre/Post-processing =========
-def preprocess_video(in_path: str) -> Tuple[str, float]:
-    dur = ffprobe_duration(in_path)
-    if dur == 0:
-        raise RuntimeError("Unable to read the video duration.")
-    temp_dir = Path(tempfile.mkdtemp(prefix="pre_"))
-    trimmed = temp_dir / "trim.mp4"
-    processed = temp_dir / "proc.mp4"
-    trim_args = ["-t", str(MAX_SECS)] if dur > MAX_SECS else []
-    sh(" ".join([
-        "ffmpeg", "-y", "-i", f"\"{in_path}\"", *trim_args,
-        "-an", "-vcodec", "libx264", "-preset", "veryfast", "-crf", "23",
-        "-movflags", "+faststart", f"\"{trimmed}\""
-    ]))
-    vf = f"scale=-2:{TARGET_H}:flags=bicubic"
-    sh(" ".join([
-        "ffmpeg", "-y", "-i", f"\"{trimmed}\"",
-        "-vf", f"\"{vf}\"", "-an",
-        "-vcodec", "libx264", "-profile:v", "baseline", "-level", "3.1",
-        "-pix_fmt", "yuv420p", "-preset", "veryfast", "-crf", "24",
-        "-movflags", "+faststart", f"\"{processed}\""
-    ]))
-    return str(processed), min(dur, float(MAX_SECS))
-def mux_audio_with_video(video_path: str, audio_path: str) -> str:
-    out_path = Path(tempfile.mkdtemp(prefix="mux_")) / "with_foley.mp4"
-    sh(" ".join([
-        "ffmpeg", "-y", "-i", f"\"{video_path}\"", "-i", f"\"{audio_path}\"",
-        "-map", "0:v:0", "-map", "1:a:0", "-c:v", "copy", "-c:a", "aac", "-b:a", "192k",
-        "-shortest", f"\"{out_path}\""
-    ]))
-    return str(out_path)
-# ========= Inference (GPU-decorated) =========
-@GPU(duration=ZEROGPU_DURATION)
-def run_model(video_path: str, prompt_text: str,
-              guidance_scale: float = 4.5,
-              num_inference_steps: int = 50,
-              sample_nums: int = 1):
     """
-    ZeroGPU-safe native pipeline. Returns ([wav_paths], sample_rate).
     """
-    # Lazy load model the first time this runs
-    if _model_dict is None:
-        msg = auto_load_models()
-        logger.info(msg)
-    # heavy imports (after model load prepared)
-    import torchaudio
     from hunyuanvideo_foley.utils.feature_utils import feature_process
     from hunyuanvideo_foley.utils.model_utils import denoise_process
-    text_prompt = (prompt_text or "").strip()
     visual_feats, text_feats, audio_len_s = feature_process(
-        video_path, text_prompt, _model_dict, _cfg
     )
-    logger.info(f"Generating {sample_nums} sample(s)...")
-    audio_batch, sr = denoise_process(
-        visual_feats, text_feats, audio_len_s, _model_dict, _cfg,
-        guidance_scale=guidance_scale, num_inference_steps=num_inference_steps,
-        batch_size=sample_nums
     )
-    out_dir = OUT_DIR / f"job_{uuid.uuid4().hex[:8]}"
-    out_dir.mkdir(parents=True, exist_ok=True)
-    wav_paths = []
     for i in range(sample_nums):
-        wav_p = out_dir / f"generated_audio_{i+1}.wav"
-        torchaudio.save(str(wav_p), audio_batch[i], sr)
-        wav_paths.append(str(wav_p))
-    return wav_paths, sr
-# ========= UI Handlers =========
-def single_generate(video: str, prompt: str, want_mux: bool, project_name: str):
-    history = []
     try:
-        if not video:
-            return None, None, "⚠️ Please upload a video.", history
-        history.append(["Preprocess", "Downscaling & trimming"])
-        pre_path, final_dur = preprocess_video(video)
-        history.append(["Inference", "ZeroGPU native pipeline"])
-        wav_list, sr = run_model(pre_path, prompt or "", guidance_scale=4.5, num_inference_steps=50, sample_nums=1)
-        if not wav_list:
-            raise RuntimeError("No audio produced.")
-        wav = wav_list[0]
-        muxed = mux_audio_with_video(pre_path, wav) if want_mux else None
-        history.append(["Done", f"OK · ~{final_dur:.1f}s"])
-        return wav, muxed, f"✅ Completed (~{final_dur:.1f}s)", history
-    except Exception as e:
-        history.append(["Error", str(e)])
-        return None, None, f"❌ {type(e).__name__}: {e}", history
-def batch_lite_generate(files: List[str], prompt: str, want_mux: bool):
-    log = []
-    if not files:
-        return "⚠️ Please upload 1–3 videos.", log
-    if len(files) > 3:
-        files = files[:3]
-        log.append(["Info", "Limiting to first 3 videos."])
-    outputs = []
-    for i, f in enumerate(files, 1):
-        try:
-            log.append([f"Preprocess {i}", Path(f).name])
-            pre, final_dur = preprocess_video(f)
-            log.append([f"Run {i}", f"ZeroGPU ~{final_dur:.1f}s"])
-            wav_list, sr = run_model(pre, prompt or "", sample_nums=1)
-            if not wav_list:
-                raise RuntimeError("No audio produced.")
-            wav = wav_list[0]
-            muxed = mux_audio_with_video(pre, wav) if want_mux else None
-            outputs.append((wav, muxed))
-            log.append([f"Done {i}", "OK"])
-        except Exception as e:
-            log.append([f"Error {i}", str(e)])
-    manifest = OUT_DIR / f"batchlite_{uuid.uuid4().hex[:6]}.json"
-    manifest.write_text(json.dumps(
-        [{"wav": w, "video": v} for (w, v) in outputs], ensure_ascii=False, indent=2
-    ))
-    return f"✅ Batch-lite finished · items: {len(outputs)}", log
-# ========= UI (refreshed design) =========
-THEME_CSS = f"""
-:root {{
-  --brand: {PRIMARY_COLOR};
-  --bg: #0f1120;
-  --panel: #181a2e;
-  --text: #edf0ff;
-  --muted: #b7bce3;
-  --card: #15172a;
-}}
-.gradio-container {{
-  font-family: Inter, ui-sans-serif, -apple-system, Segoe UI, Roboto, Cairo, Noto Sans, Arial;
-  background: var(--bg);
-  color: var(--text);
-}}
-#hero {{
-  background: linear-gradient(135deg, var(--brand) 0%, #2f2e8b 40%, #1b1a3a 100%);
-  border-radius: 18px;
-  padding: 18px 20px;
-  color: white;
-  box-shadow: 0 10px 30px rgba(0,0,0,.35);
-}}
-#hero h1 {{ margin: 0 0 6px 0; font-size: 20px; font-weight: 700; letter-spacing: .2px; }}
-#hero p  {{ margin: 0; opacity: .95; }}
-.gr-tabitem, .gr-block.gr-group, .gr-panel {{
-  background: var(--panel);
-  border-radius: 16px !important;
-  box-shadow: 0 6px 18px rgba(0,0,0,.28);
-  border: 1px solid rgba(255,255,255,.04);
-}}
-.gr-button {{ border-radius: 12px !important; border: 1px solid rgba(255,255,255,.08) !important; }}
-.gradio-container .tabs .tab-nav button.selected {{
-  background: rgba(255,255,255,.06); border-radius: 12px; border: 1px solid rgba(255,255,255,.08);
-}}
-.badge {{ display:inline-block; padding:2px 8px; border-radius:999px; background: rgba(255,255,255,.12); color:#fff; font-size:12px }}
-"""
-with gr.Blocks(css=THEME_CSS, title=APP_TITLE, analytics_enabled=False) as demo:
-    with gr.Row():
-        gr.HTML(f"""
-        <div id="hero">
-          <h1>{APP_TITLE}</h1>
-          <p>{APP_TAGLINE}</p>
-          <div style="margin-top:8px"><span class="badge">ZeroGPU</span> <span class="badge">Auto-trim ≤ {MAX_SECS}s</span> <span class="badge">Downscale {TARGET_H}p</span></div>
-        </div>
-        """)
-    with gr.Tabs():
-        with gr.Tab("🎬 Single Clip"):
-            with gr.Group():
-                project_name = gr.Textbox(label="Project name (optional)", placeholder="Enter a short label for this clip")
-                with gr.Row():
-                    v_single = gr.Video(label=f"Video (≤ ~{MAX_SECS}s recommended)")
-                    p_single = gr.Textbox(label="Sound prompt (optional)", placeholder="e.g., soft footsteps on wood, light rain, indoor reverb")
-                with gr.Row():
-                    want_mux_single = gr.Checkbox(value=True, label="Mux foley into MP4 output")
-                run_btn = gr.Button("Generate", variant="primary")
                 with gr.Row():
-                    out_audio = gr.Audio(label=f"Generated Foley ({SR//1000} kHz WAV)", type="filepath")
-                    out_mux = gr.Video(label="Video + Foley (MP4)", visible=True)
-                status_md = gr.Markdown()
-                history_table = gr.Dataframe(headers=["Step", "Note"], datatype=["str","str"], interactive=False, wrap=True, label="Activity")
-            run_btn.click(
-                single_generate,
-                inputs=[v_single, p_single, want_mux_single, project_name],
-                outputs=[out_audio, out_mux, status_md, history_table]
             )
-        with gr.Tab("📦 Batch-Lite (1–3 clips)"):
-            files = gr.Files(label="Upload 1–3 short videos", file_types=[".mp4",".mov"], file_count="multiple")
-            prompt_b = gr.Textbox(label="Global prompt (optional)")
-            want_mux_b = gr.Checkbox(value=True, label="Mux each output")
-            go_b = gr.Button("Run batch-lite")
-            batch_status = gr.Markdown()
-            batch_log = gr.Dataframe(headers=["Step","Note"], datatype=["str","str"], interactive=False, wrap=True, label="Batch Log")
-            go_b.click(batch_lite_generate, inputs=[files, prompt_b, want_mux_b], outputs=[batch_status, batch_log])
-        with gr.Tab("ℹ️ Tips"):
-            gr.Markdown(f"""
-**Usage guidelines**
-- Keep clips short (the tool trims to **≤ {MAX_SECS}s** automatically).
-- The video is downscaled to **{TARGET_H}p** to fit the ZeroGPU time window.
-- If you see a quota message, try again later (ZeroGPU limits GPU minutes per visitor).
-**Outputs**
-- WAV is **{SR//1000} kHz** stereo.
-- Enable **Mux** to get a ready MP4 with the generated foley track.
-""")
-# Health endpoint
-try:
-    from fastapi import FastAPI
-    fastapi_app = demo.app
-    @fastapi_app.get("/health")
-    def _health():
-        return {"ok": True, "model_loaded": _model_dict is not None, "device": str(_device) if _device else None}
-except Exception:
-    pass
-# Launch
-logger.remove()
-logger.add(lambda msg: print(msg, end=''), level="INFO")
-try:
-    demo.queue(max_size=24).launch(server_name="0.0.0.0")
-except Exception:
-    print("\n[BOOT][ERROR] Gradio launch failed:")
-    traceback.print_exc()
-    raise

+import os
+import io
+import sys
+import json
+import shutil
+import random
+import tempfile
+import base64
+from datetime import datetime
+from typing import List, Optional, Tuple, Dict
 import gradio as gr
+import numpy as np
+import torch
+import torchaudio
 from loguru import logger
+from huggingface_hub import snapshot_download
+# --- Tencent repo imports (pulled at startup) ---
+# These are available after we git clone the repo in prepare_once()
+# Do not move these imports above the clone step in __main__.
+# from hunyuanvideo_foley.utils.model_utils import load_model, denoise_process
+# from hunyuanvideo_foley.utils.feature_utils import feature_process
+# from hunyuanvideo_foley.utils.media_utils import merge_audio_video
+# HF Spaces GPU decorator
+import spaces
+# -------------------------
+# Constants & configuration
+# -------------------------
+# Text shown in the page header built by create_ui().
+SPACE_TITLE = "🎵 ShortiFoley — HunyuanVideo-Foley"
+SPACE_TAGLINE = "Text/Video → Audio Foley. Created by bilsimaging.com"
+# Directory that receives the merged MP4 results (override with OUTPUTS_DIR).
+GALLERY_DIR = os.environ.get("OUTPUTS_DIR", "outputs")
+# Directory the model snapshot is downloaded into and loaded from
+# (override with HIFI_FOLEY_MODEL_PATH).
+WEIGHTS_DIR = os.environ.get("HIFI_FOLEY_MODEL_PATH", "/home/user/app/weights")
+# Where the upstream code checkout lives; this path is added to sys.path
+# before importing the `hunyuanvideo_foley` package.
+REPO_DIR = "/home/user/app/HunyuanVideo-Foley"
+# Model config passed to load_model(); defaults to the file inside the checkout.
+CONFIG_PATH = os.environ.get(
+    "HIFI_FOLEY_CONFIG",
+    f"{REPO_DIR}/configs/hunyuanvideo-foley-xxl.yaml"
+)
+# keep <=120s for ZeroGPU
+# Passed as `duration` to the spaces.GPU decorator on infer_single_video.
+GPU_DURATION = int(os.environ.get("GPU_DURATION_SECS", "110"))
+# Import-time side effect: make sure both directories exist.
+os.makedirs(GALLERY_DIR, exist_ok=True)
+os.makedirs(WEIGHTS_DIR, exist_ok=True)
+# Globals populated after model load
+# (_model_dict and _cfg are the two values returned by load_model();
+# _device is the torch.device chosen by _setup_device()).
+_model_dict = None
+_cfg = None
+_device: Optional[torch.device] = None
+# ------------
+# Small helpers
+# ------------
+def _setup_device(pref: str = "auto", gpu_id: int = 0) -> torch.device:
+    """Pick CUDA if available, else MPS, else CPU."""
+    if pref == "auto":
+        if torch.cuda.is_available():
+            d = torch.device(f"cuda:{gpu_id}")
+        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+            d = torch.device("mps")
+        else:
+            d = torch.device("cpu")
+    else:
+        d = torch.device(pref)
+    logger.info(f"Using CUDA {d}" if d.type == "cuda" else f"Using {d}")
+    return d
+def _save_video_result(video_file: str, audio_tensor: torch.Tensor, sr: int, idx: int) -> str:
+    """Save audio to wav, merge with original video, and save mp4 into gallery."""
+    from hunyuanvideo_foley.utils.media_utils import merge_audio_video
+    temp_dir = tempfile.mkdtemp()
+    audio_path = os.path.join(temp_dir, f"gen_{idx}.wav")
+    # torchaudio expects shape [channels, samples]
+    if audio_tensor.ndim == 1:
+        audio_tensor = audio_tensor.unsqueeze(0)
+    torchaudio.save(audio_path, audio_tensor.cpu(), sr)
+    timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S_%f")
+    out_name = f"shortifoley_{timestamp}_{idx}.mp4"
+    out_path = os.path.join(GALLERY_DIR, out_name)
+    merge_audio_video(audio_path, video_file, out_path)
+    return out_path
+def _list_gallery(limit: int = 100) -> List[str]:
+    files = []
+    for fn in sorted(os.listdir(GALLERY_DIR), reverse=True):
+        if fn.lower().endswith((".mp4", ".webm", ".mov", ".mkv")):
+            files.append(os.path.join(GALLERY_DIR, fn))
+        if len(files) >= limit:
+            break
+    return files
+def _ensure_repo() -> None:
+    """Shallow clone the Tencent repo with LFS smudge disabled to avoid quota issues."""
+    if os.path.exists(REPO_DIR) and os.path.isdir(REPO_DIR):
         return
+    cmd = (
+        f"GIT_LFS_SKIP_SMUDGE=1 git -c filter.lfs.smudge= "
+        f"-c filter.lfs.required=false clone --depth 1 "
+        f"https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley.git {REPO_DIR}"
     )
+    logger.info(f">> {cmd}")
+    os.system(cmd)
+def _download_weights_if_needed() -> None:
+    """Pull big .pth files (and small assets) from HF model repo snapshot."""
+    # The official weights are hosted on the HF model page, so we snapshot into WEIGHTS_DIR
     snapshot_download(
         repo_id="tencent/HunyuanVideo-Foley",
+        local_dir=WEIGHTS_DIR,
         resume_download=True,
+        allow_patterns=[
+            "hunyuanvideo_foley.pth",
+            "synchformer_state_dict.pth",
+            "vae_128d_48k.pth",
+            "assets/*",
+            "config.yaml",  # not used directly here, but harmless
+        ],
     )
+def prepare_once() -> None:
+    _ensure_repo()
+    _download_weights_if_needed()
+# -----------------------
+# Model load & inference
+# -----------------------
 def auto_load_models() -> str:
+    """
+    Load HunyuanVideo-Foley + encoders on the chosen device.
+    Uses safetensors where possible; falls back to HF/torch internal loaders.
+    """
     global _model_dict, _cfg, _device
+    if _model_dict is not None and _cfg is not None:
+        return "Model already loaded."
+    # Late imports (repo becomes available after clone).
+    sys.path.append(REPO_DIR)
+    from hunyuanvideo_foley.utils.model_utils import load_model
     _device = _setup_device("auto", 0)
     logger.info("Loading HunyuanVideo-Foley model...")
+    logger.info(f"MODEL_PATH:  {WEIGHTS_DIR}")
     logger.info(f"CONFIG_PATH: {CONFIG_PATH}")
+    try:
+        _model_dict, _cfg = load_model(WEIGHTS_DIR, CONFIG_PATH, _device)
+        return "✅ Model loaded."
+    except Exception as e:
+        logger.error(e)
+        return f"❌ Failed to load model: {e}"
+@spaces.GPU(duration=GPU_DURATION)
+@torch.inference_mode()
+def infer_single_video(
+    video_file: str,
+    text_prompt: str,
+    guidance_scale: float = 4.5,
+    num_inference_steps: int = 50,
+    sample_nums: int = 1,
+) -> Tuple[List[str], str]:
     """
+    Generate Foley audio for an uploaded video (1–6 variants).
+    Args:
+        video_file: Path to a local video file on the Space.
+        text_prompt: Optional text prompt to steer the audio.
+        guidance_scale: CFG scale.
+        num_inference_steps: Denoising steps.
+        sample_nums: Number of audio variants to produce (1–6).
+    Returns:
+        (video_paths, status_message)
     """
+    if _model_dict is None or _cfg is None:
+        return [], "❌ Load the model first."
+    if not video_file:
+        return [], "❌ Please provide a video."
+    sys.path.append(REPO_DIR)
     from hunyuanvideo_foley.utils.feature_utils import feature_process
     from hunyuanvideo_foley.utils.model_utils import denoise_process
+    # preprocess
     visual_feats, text_feats, audio_len_s = feature_process(
+        video_file, (text_prompt or "").strip(), _model_dict, _cfg
     )
+    # generate batch
+    sample_nums = int(max(1, min(6, sample_nums)))
+    audio, sr = denoise_process(
+        visual_feats,
+        text_feats,
+        audio_len_s,
+        _model_dict,
+        _cfg,
+        guidance_scale=guidance_scale,
+        num_inference_steps=int(num_inference_steps),
+        batch_size=sample_nums,
     )
+    # save results
+    out_videos = []
     for i in range(sample_nums):
+        out_videos.append(_save_video_result(video_file, audio[i], sr, i + 1))
+    return out_videos, f"✅ Generated {len(out_videos)} result(s). Saved to {GALLERY_DIR}/"
+# ---------------
+# MCP-only API(s)
+# ---------------
+def _download_to_tmp(url: str) -> str:
+    """Download a remote file to a temp path. Lightweight helper for MCP."""
     try:
+        import requests  # optional dependency
+    except Exception:
+        raise RuntimeError("The server is missing 'requests'. Add it to requirements.txt to use URL inputs.")
+    r = requests.get(url, timeout=30)
+    r.raise_for_status()
+    suffix = ".mp4"
+    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
+    tmp.write(r.content)
+    tmp.flush()
+    tmp.close()
+    return tmp.name
+def _maybe_from_base64(data_url_or_b64: str) -> str:
+    """Accept data: URLs or raw base64 for MCP convenience; returns temp file path."""
+    b64 = data_url_or_b64
+    if data_url_or_b64.startswith("data:"):
+        # data:video/mp4;base64,XXXX
+        b64 = data_url_or_b64.split(",", 1)[-1]
+    raw = base64.b64decode(b64)
+    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
+    tmp.write(raw)
+    tmp.flush()
+    tmp.close()
+    return tmp.name
+def _normalize_video_input(video_url_or_b64: str) -> str:
+    """Return a local filename from url or base64. Raises on error."""
+    v = (video_url_or_b64 or "").strip()
+    if v.startswith("http://") or v.startswith("https://"):
+        return _download_to_tmp(v)
+    # assume base64
+    return _maybe_from_base64(v)
+def _api_generate_from_local(
+    local_video_path: str,
+    text_prompt: str = "",
+    guidance_scale: float = 4.5,
+    num_inference_steps: int = 50,
+    sample_nums: int = 1,
+) -> Dict[str, List[str]]:
+    outs, msg = infer_single_video(
+        video_file=local_video_path,
+        text_prompt=text_prompt or "",
+        guidance_scale=float(guidance_scale),
+        num_inference_steps=int(num_inference_steps),
+        sample_nums=int(sample_nums),
+    )
+    return {"videos": outs, "message": msg}
+# Expose a **pure API** endpoint that becomes an MCP tool but does not show a UI.
+with gr.Blocks() as mcp_only_endpoints:
+    gr.Markdown("These endpoints are MCP/API only and have no visible UI.", show_label=False)
+    @gr.api  # becomes an MCP tool and a REST API endpoint automatically
+    def api_generate_from_url(
+        video_url_or_b64: str,
+        text_prompt: str = "",
+        guidance_scale: float = 4.5,
+        num_inference_steps: int = 50,
+        sample_nums: int = 1,
+    ) -> Dict[str, List[str]]:
+        """
+        Generate Foley from a remote video URL or base64-encoded video.
+        Args:
+            video_url_or_b64: http(s) URL or data/base64 string of a short video (mp4).
+            text_prompt: Optional audio description (English).
+            guidance_scale: CFG scale (1.0–10.0).
+            num_inference_steps: Denoising steps (10–100).
+            sample_nums: Number of variants to return (1–6).
+        Returns:
+            dict with { "videos": [paths], "message": str }
+        """
+        if _model_dict is None or _cfg is None:
+            raise RuntimeError("Model not loaded. Call /load_model tool or use the UI once.")
+        local_path = _normalize_video_input(video_url_or_b64)
+        return _api_generate_from_local(local_path, text_prompt, guidance_scale, num_inference_steps, sample_nums)
+    # Tiny status resource & prompt to help MCP clients
+    @gr.mcp.resource("shortifoley://status")
+    def shortifoley_status() -> str:
+        """Return a simple readiness string for MCP clients."""
+        ready = _model_dict is not None and _cfg is not None
+        dev = "cuda" if (_device and _device.type == "cuda") else ("mps" if (_device and _device.type == "mps") else "cpu")
+        return f"ShortiFoley status: {'ready' if ready else 'loading'} | device={dev} | outputs={GALLERY_DIR}"
+    @gr.mcp.prompt()
+    def foley_prompt(name: str = "default") -> str:
+        """A reusable prompt template for generating Foley."""
+        return (
+            "Describe the expected environmental sound precisely. Mention material, rhythm, intensity, and ambience.\n"
+            "Example: 'Soft leather footfalls on wet pavement with distant traffic hiss; occasional splashes.'"
+        )
+# -----------------
+# Gradio UI (Blocks)
+# -----------------
+def create_ui() -> gr.Blocks:
+    with gr.Blocks(
+        title="ShortiFoley — HunyuanVideo-Foley",
+        css="""
+        .main-header{ text-align:center; padding:1.5rem; border-radius:16px; background:linear-gradient(135deg,#667eea,#764ba2); color:white; }
+        .card{ background:white; border:1px solid #e1e5e9; border-radius:16px; padding:1rem; box-shadow:0 8px 32px rgba(0,0,0,.06); }
+        .generate-btn button{ font-weight:700; }
+        """
+    ) as demo:
+        gr.HTML(f"<div class='main-header'><h1>{SPACE_TITLE}</h1><p>{SPACE_TAGLINE}</p></div>")
+        with gr.Row():
+            with gr.Column(scale=1, elem_classes=["card"]):
+                gr.Markdown("### 📹 Input")
+                video_input = gr.Video(label="Upload Video", height=300)
+                text_input = gr.Textbox(
+                    label="🎯 Audio Description (optional, English)",
+                    placeholder="e.g., Quick rubber-soled footsteps on tile; echoey hallway."
+                )
                 with gr.Row():
+                    guidance_scale = gr.Slider(1.0, 10.0, value=4.5, step=0.1, label="CFG Scale")
+                    steps = gr.Slider(10, 100, value=50, step=5, label="Steps")
+                    samples = gr.Slider(1, 6, value=1, step=1, label="Variants")
+                generate = gr.Button("🎵 Generate Audio", variant="primary", elem_classes=["generate-btn"])
+            with gr.Column(scale=1, elem_classes=["card"]):
+                gr.Markdown("### 🎥 Result(s)")
+                v1 = gr.Video(label="Sample 1", height=260, visible=True)
+                v2 = gr.Video(label="Sample 2", height=160, visible=False)
+                v3 = gr.Video(label="Sample 3", height=160, visible=False)
+                v4 = gr.Video(label="Sample 4", height=160, visible=False)
+                v5 = gr.Video(label="Sample 5", height=160, visible=False)
+                v6 = gr.Video(label="Sample 6", height=160, visible=False)
+                status = gr.Textbox(label="Status", interactive=False)
+        with gr.Tab("📁 Gallery"):
+            gr.Markdown("Latest generated videos (autosaved to `outputs/`).")
+            gallery = gr.Gallery(
+                value=_list_gallery(),
+                columns=3,
+                preview=True,
+                label="Saved Results"
+            )
+            refresh = gr.Button("🔄 Refresh Gallery")
+        # Event handlers
+        def _process(
+            video_file, text_prompt, cfg, nsteps, nsamples
+        ) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str], Optional[str], Optional[str], str]:
+            outs, msg = infer_single_video(video_file, text_prompt, cfg, nsteps, nsamples)
+            # set visibilities based on how many were generated
+            vis = [gr.update(visible=i < len(outs), value=(outs[i] if i < len(outs) else None)) for i in range(6)]
+            # update gallery (prepend newest)
+            return (
+                *[v.value if isinstance(v, gr.Video) else None for v in []],  # filler not used; kept for clarity
             )
+        def _process_and_update(video_file, text_prompt, cfg, nsteps, nsamples):
+            outs, msg = infer_single_video(video_file, text_prompt, cfg, nsteps, nsamples)
+            updates = []
+            # six video slots
+            for i in range(6):
+                if i < len(outs):
+                    updates.append(gr.update(visible=True, value=outs[i]))
+                else:
+                    updates.append(gr.update(visible=False, value=None))
+            # status
+            updates.append(msg)
+            # refresh gallery implicitly
+            gallery_items = _list_gallery()
+            return (*updates, gr.update(value=gallery_items))
+        generate.click(
+            fn=_process_and_update,
+            inputs=[video_input, text_input, guidance_scale, steps, samples],
+            outputs=[v1, v2, v3, v4, v5, v6, status, gallery],
+            api_name="/infer",
+            api_description="Generate Foley audio for an uploaded video. Returns up to 6 video+audio files."
+        )
+        # Visibility toggling from samples slider
+        def _toggle_vis(n):
+            n = int(n)
+            return [
+                gr.update(visible=True),
+                gr.update(visible=n >= 2),
+                gr.update(visible=n >= 3),
+                gr.update(visible=n >= 4),
+                gr.update(visible=n >= 5),
+                gr.update(visible=n >= 6),
+            ]
+        samples.change(_toggle_vis, inputs=[samples], outputs=[v1, v2, v3, v4, v5, v6])
+        refresh.click(lambda: gr.update(value=_list_gallery()), outputs=[gallery])
+    return demo
+def set_seeds(s: int = 1):
+    random.seed(s)
+    np.random.seed(s)
+    torch.manual_seed(s)
+# -------------
+# App bootstrap
+# -------------
+# Script entry point: configure logging, fetch code and weights, load the
+# model, build the UI and launch it with the MCP server enabled.
+if __name__ == "__main__":
+    # clean logger -> print to stdout
+    logger.remove()
+    logger.add(lambda m: print(m, end=""), level="INFO")
+    set_seeds(1)
+    logger.info("===== Application Startup =====\n")
+    # Clone the upstream repo and download the weights before anything imports from it.
+    prepare_once()
+    # Late import after repo present
+    # (imported here only to fail fast if the checkout is unusable; the
+    # functions that need these names import them again locally).
+    sys.path.append(REPO_DIR)
+    from hunyuanvideo_foley.utils.model_utils import load_model, denoise_process  # noqa: F401
+    from hunyuanvideo_foley.utils.feature_utils import feature_process  # noqa: F401
+    from hunyuanvideo_foley.utils.media_utils import merge_audio_video  # noqa: F401
+    msg = auto_load_models()
+    # A load failure is logged but does not stop startup; inference requests
+    # then answer with "Load the model first." until the globals are set.
+    if not msg.startswith("✅"):
+        logger.error(f"[BOOT][ERROR] auto_load_models() failed:\n{msg}")
+    else:
+        logger.info(msg)
+    ui = create_ui()
+    # Mount MCP-only endpoints alongside the UI (optional but handy)
+    # NOTE(review): in current Gradio releases `Blocks.blocks` appears to be a
+    # dict keyed by block id, in which case `.append` would raise
+    # AttributeError here — confirm against the pinned Gradio version and
+    # mount the extra Blocks through a supported API if so.
+    ui.blocks.append(mcp_only_endpoints)
+    # IMPORTANT: enable MCP server (tools/resources/prompts). This is all you need.
+    # See: https://www.gradio.app/guides/building-mcp-server-with-gradio
+    ui.launch(
+        server_name="0.0.0.0",
+        share=False,
+        show_error=True,
+        mcp_server=True,        # <— MCP enabled
+        # ssr_mode=True (default in 5.x)
+    )