Spaces:
Running
on
Zero
Running
on
Zero
File size: 2,506 Bytes
fca537b 8e86802 e4ada08 663300b e4ada08 fca537b ecf6f46 4c2fa0e ecf6f46 e4ada08 ecf6f46 d0c2212 e4ada08 2077bf5 e54c733 e4ada08 d0c2212 e54c733 2077bf5 d0c2212 ecf6f46 d0c2212 ecf6f46 e54c733 e4ada08 d0c2212 2077bf5 351386e 2077bf5 d6b381d 2077bf5 d6b381d 4c2fa0e e4ada08 ecf6f46 d0c2212 ecf6f46 d0c2212 e4ada08 ecf6f46 e54c733 e4ada08 4c2fa0e fca537b e54c733 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
import torch
import soundfile as sf
import gradio as gr
import spaces
from extract_everything import extract_everything
@spaces.GPU(duration=30)
def fn_extract_everything(input_audio, input_text_prompt):
    """Extract the sound described by a text prompt from an audio file.

    Args:
        input_audio: Path of the audio file to process (the input widget is
            a ``gr.Audio`` configured with ``type="filepath"``).
        input_text_prompt: Free-form description of the sound to extract.

    Returns:
        Path of a newly written WAV file holding the extracted audio.

    Raises:
        gr.Error: If no input audio was provided.
    """
    # Local import keeps this block self-contained; tempfile is stdlib.
    import tempfile

    # Without this guard a missing upload would be handed to the model as
    # None; surface a readable message in the UI instead.
    if not input_audio:
        raise gr.Error("Please upload or select an audio file first.")

    # The model is built inside the GPU-decorated call, as in the original.
    # NOTE(review): hoisting this to module level would avoid rebuilding it on
    # every request, but may initialise the GPU outside the @spaces.GPU
    # context -- confirm before changing.
    extract_everything_model = extract_everything()
    _orig_wav, output_wav, _residual_wav = extract_everything_model(
        input_audio, input_text_prompt
    )

    # A fixed output filename is shared by every request, so concurrent users
    # could overwrite (or be served) each other's result. Reserve a unique
    # path per call instead. delete=False because the file must outlive this
    # function so Gradio can serve it.
    with tempfile.NamedTemporaryFile(
        prefix="extracted_audio_", suffix=".wav", delete=False
    ) as tmp_file:
        output_path = tmp_file.name

    # 16000 is the sample rate written to the file header.
    # NOTE(review): assumes the model emits 16 kHz audio -- confirm.
    sf.write(output_path, output_wav, 16000)
    return output_path
def clear_all():
    """Build the updates that reset the input widgets.

    Returns:
        A 2-tuple of Gradio updates: the first sets its target's value to
        ``None``, the second sets its target's value to an empty string.
    """
    reset_audio = gr.update(value=None)
    reset_prompt = gr.update(value="")
    return reset_audio, reset_prompt
# Introductory text rendered under the page heading.
_DESCRIPTION_MD = (
    "Extract any sound using free-form text prompts.To try it, simply click "
    "one of the examples, or upload your own audio/video (Preferably less "
    "than 20 seconds or less due to the GPU usage limits here, try again "
    "after a few seconds if encounters huggingface error.)"
)

# Each row is [audio file path, text prompt], matching the order of the
# input components passed to gr.Examples below.
_EXAMPLE_ROWS = [
    ["examples/noisy_speech.wav", "noise"],
    ["examples/song_chinese.wav", "vocal"],
    ["examples/song_english.wav", "drum"],
    ["examples/bird_speech.wav", "bird chirp"],
    ["examples/keyboard_water.wav", "A person types on a keyboard"],
    ["examples/siren_speech.wav", "Ambulance siren"],
    ["examples/low_volumn_speech.wav", "Low volume speech"],
    ["examples/male_speech.wav", "Male speech"],
    ["examples/czech_speech.wav", "Czech speech"],
    ["examples/slower_speech.wav", "slower voice"],
    ["examples/happy_speech.wav", "happy speech"],
]

with gr.Blocks(title="OmniSoniX") as demo:
    # Page heading and usage notes.
    gr.Markdown("# OmniSoniX: Text-Driven Universal Target Audio Extraction")
    gr.Markdown(_DESCRIPTION_MD)

    # Inputs: the audio is passed to the callback as a file path.
    audio_input = gr.Audio(type="filepath", label="Input Audio")
    text_prompt = gr.Textbox(
        placeholder="e.g., 'vocal', 'dog barking', 'female speech'",
        label="Describe the sound to extract",
    )

    # Action buttons share one row.
    with gr.Row():
        clear_btn = gr.Button("Clear")
        extract_btn = gr.Button("Extract")

    # Output player for the file path returned by the extraction callback.
    extracted_out = gr.Audio(type="filepath", label="Extracted Audio")

    # Clickable example inputs; results are not precomputed
    # (cache_examples=False).
    gr.Examples(
        examples=_EXAMPLE_ROWS,
        fn=fn_extract_everything,
        inputs=[audio_input, text_prompt],
        outputs=[extracted_out],
        cache_examples=False,
    )

    # "Extract" runs the model on the current inputs.
    extract_btn.click(
        fn_extract_everything,
        inputs=[audio_input, text_prompt],
        outputs=[extracted_out],
    )

    # "Clear" resets both input widgets; the output player is left as is.
    clear_btn.click(
        clear_all,
        inputs=[],
        outputs=[audio_input, text_prompt],
    )

demo.launch()
|