alibabasglab committed on
Commit
d0c2212
·
verified ·
1 Parent(s): 3c07701

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -39
app.py CHANGED
@@ -2,11 +2,8 @@ import torch
2
  import soundfile as sf
3
  import gradio as gr
4
  import spaces
5
- import os
6
-
7
-
8
  from extract_everything import extract_everything
9
-
10
 
11
  @spaces.GPU(duration=30)
12
  def fn_extract_everything(input_type, input_audio, input_video, input_text_prompt):
@@ -19,54 +16,70 @@ def fn_extract_everything(input_type, input_audio, input_video, input_text_promp
19
 
20
  extract_everything_model = extract_everything()
21
  orig_wav, output_wav, residual_wav = extract_everything_model(input_wav, input_text_prompt)
 
 
22
  sf.write('original_audio.wav', orig_wav, 16000)
23
  sf.write('extracted_audio.wav', output_wav, 16000)
24
- sf.write('residual_audio.wav', residual_wav, 16000)
25
- return 'original_audio.wav', 'extracted_audio.wav', 'residual_audio.wav'
26
-
27
 
28
- # Update visibility of the file input based on the selected input type
29
  def update_input_visibility(input_type):
30
  if input_type == "Audio":
31
  return gr.update(visible=True), gr.update(visible=False)
32
  elif input_type == "Video":
33
  return gr.update(visible=False), gr.update(visible=True)
34
-
35
- demo = gr.Blocks()
36
-
37
-
38
- with demo:
39
- input_type = gr.Dropdown(["Audio", "Video"], value="Audio", multiselect=False, label="Select Input Type")
40
- audio_input = gr.Audio(label="Input Audio here", type="filepath", visible=True)
41
- video_input = gr.Video(label="Input Video here", visible=False)
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  input_type.change(
44
  fn=update_input_visibility,
45
- inputs=[input_type],
46
- outputs=[audio_input, video_input],
47
  )
48
-
49
- gr.Interface(
50
- fn=fn_extract_everything,
51
- inputs = [
52
- input_type,
53
- audio_input,
54
- video_input,
55
- gr.Textbox(label="Enter the description of the sound, in keywords or in sentence."),
56
- ],
57
- outputs = [
58
- gr.Audio(label="Unprocessed Audio", type="filepath"),
59
- gr.Audio(label="Extracted Audio", type="filepath"),
60
- gr.Audio(label="Residual Audio", type="filepath"),
61
- ],
62
- title = "OmniSoniX: Text-Driven Universal Target Audio Extraction for any Speech, Music, and Sound Events",
63
- description = ("OmniSoniX is an AI-powered tool to extract any sound you described from an audio/video."
64
- "To try it, simply click one of the examples, or upload your own audio/video (Preferably less than 20 seconds or less due to the GPU usage limits here). "),
65
- examples = [
66
- ["Audio", "examples/noisy_speech.wav", None, "noise"],
67
- ["Audio", "examples/song_chinese.wav", None, "vocal"],
68
  ],
69
- cache_examples = False,
 
 
 
 
 
 
 
 
 
70
  )
71
 
72
  demo.launch()
 
2
  import soundfile as sf
3
  import gradio as gr
4
  import spaces
 
 
 
5
  from extract_everything import extract_everything
6
+ import os
7
 
8
  @spaces.GPU(duration=30)
9
  def fn_extract_everything(input_type, input_audio, input_video, input_text_prompt):
 
16
 
17
  extract_everything_model = extract_everything()
18
  orig_wav, output_wav, residual_wav = extract_everything_model(input_wav, input_text_prompt)
19
+
20
+ # Save only the two outputs we display
21
  sf.write('original_audio.wav', orig_wav, 16000)
22
  sf.write('extracted_audio.wav', output_wav, 16000)
23
+ # (residual is computed but not saved or returned)
24
+
25
+ return 'original_audio.wav', 'extracted_audio.wav'
26
 
 
27
def update_input_visibility(input_type):
    """Toggle which input widget is shown for the selected input type.

    Returns a pair of gr.update objects: (audio_input visibility,
    video_input visibility). "Audio" shows the audio widget and hides the
    video widget; "Video" does the opposite. Any other value falls through
    and returns None, exactly like the original if/elif chain.
    """
    # Dispatch table instead of an if/elif ladder: (audio_visible, video_visible).
    visibility = {"Audio": (True, False), "Video": (False, True)}
    if input_type in visibility:
        show_audio, show_video = visibility[input_type]
        return gr.update(visible=show_audio), gr.update(visible=show_video)
    # Unrecognized selection: implicit None, matching the original behavior.
 
 
 
 
 
 
 
 
32
 
33
# Assemble the Gradio UI: input selector, prompt box, run button, and the
# two output players, then launch the app.
with gr.Blocks(title="OmniSoniX") as demo:
    gr.Markdown("# OmniSoniX: Text-Driven Universal Target Audio Extraction")
    gr.Markdown("Extract any sound (speech, music, sound events) using free-form text prompts.")

    # Source-type selector: drives which of the two input widgets is visible.
    with gr.Row():
        input_type = gr.Dropdown(
            choices=["Audio", "Video"],
            value="Audio",
            label="Select Input Type",
        )

    # Only one of these is shown at a time (see the .change hook below).
    with gr.Row():
        audio_input = gr.Audio(label="Input Audio", type="filepath", visible=True)
        video_input = gr.Video(label="Input Video", visible=False)

    # Swap widget visibility whenever the dropdown changes.
    input_type.change(
        fn=update_input_visibility,
        inputs=input_type,
        outputs=[audio_input, video_input],
    )

    text_prompt = gr.Textbox(
        label="Enter the description of the sound (e.g., 'vocal', 'dog barking', 'female speaker')",
        placeholder="Type your prompt here...",
    )

    with gr.Row():
        btn = gr.Button("Extract")

    # Outputs: the unprocessed input and the extracted target sound.
    with gr.Row():
        orig_out = gr.Audio(label="Original Audio", type="filepath")
        extracted_out = gr.Audio(label="Extracted Audio", type="filepath")

    # Clickable example rows; residual audio is intentionally not an output.
    example_rows = [
        ["Audio", "examples/noisy_speech.wav", None, "noise"],
        ["Audio", "examples/song_chinese.wav", None, "vocal"],
    ]
    gr.Examples(
        examples=example_rows,
        inputs=[input_type, audio_input, video_input, text_prompt],
        outputs=[orig_out, extracted_out],
        fn=fn_extract_everything,
        cache_examples=False,
    )

    # Main action: run the extraction model on the current inputs.
    btn.click(
        fn=fn_extract_everything,
        inputs=[input_type, audio_input, video_input, text_prompt],
        outputs=[orig_out, extracted_out],
    )

demo.launch()