import torch
import soundfile as sf
import gradio as gr
import spaces

from extract_everything import extract_everything


# Request a ZeroGPU slot for up to 30 seconds per call.
@spaces.GPU(duration=30)
def fn_extract_everything(input_audio, input_text_prompt):
    # The model is constructed inside the GPU-decorated function, so it is
    # instantiated per request on the allocated device.
    extract_everything_model = extract_everything()
    orig_wav, output_wav, residual_wav = extract_everything_model(
        input_audio, input_text_prompt
    )
    # Only the extracted target is returned to the UI; the original and
    # residual waveforms are unused here. Output is written at 16 kHz.
    sf.write("extracted_audio.wav", output_wav, 16000)
    return "extracted_audio.wav"


def clear_all():
    # Reset both the audio input and the text prompt.
    return gr.update(value=None), gr.update(value="")


with gr.Blocks(title="OmniSoniX") as demo:
    gr.Markdown("# OmniSoniX: Text-Driven Universal Target Audio Extraction")
    gr.Markdown(
        "Extract any sound using free-form text prompts. To try it, click one of "
        "the examples, or upload your own audio/video (preferably 20 seconds or "
        "less due to the GPU usage limits here; if you hit a Hugging Face error, "
        "try again after a few seconds)."
    )

    audio_input = gr.Audio(label="Input Audio", type="filepath")
    text_prompt = gr.Textbox(
        label="Describe the sound to extract",
        placeholder="e.g., 'vocal', 'dog barking', 'female speech'",
    )

    with gr.Row():
        clear_btn = gr.Button("Clear")
        extract_btn = gr.Button("Extract")

    extracted_out = gr.Audio(label="Extracted Audio", type="filepath")

    # Example (audio, prompt) pairs covering environmental sounds, music stems,
    # and speech attributes.
    gr.Examples(
        examples=[
            ["examples/noisy_speech.wav", "noise"],
            ["examples/song_chinese.wav", "vocal"],
            ["examples/song_english.wav", "drum"],
            ["examples/bird_speech.wav", "bird chirp"],
            ["examples/keyboard_water.wav", "A person types on a keyboard"],
            ["examples/siren_speech.wav", "Ambulance siren"],
            ["examples/low_volumn_speech.wav", "Low volume speech"],
            ["examples/male_speech.wav", "Male speech"],
            ["examples/czech_speech.wav", "Czech speech"],
            ["examples/slower_speech.wav", "slower voice"],
            ["examples/happy_speech.wav", "happy speech"],
        ],
        inputs=[audio_input, text_prompt],
        outputs=[extracted_out],
        fn=fn_extract_everything,
        cache_examples=False,
    )

    extract_btn.click(
        fn=fn_extract_everything,
        inputs=[audio_input, text_prompt],
        outputs=[extracted_out],
    )
    clear_btn.click(
        fn=clear_all,
        inputs=[],
        outputs=[audio_input, text_prompt],
    )

demo.launch()
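
# Interface notes (illustrative, inferred only from how this script calls the
# model; the actual extract_everything API may differ):
#     model = extract_everything()
#     orig_wav, output_wav, residual_wav = model(audio_path, text_prompt)
# where audio_path is a filepath string (matching Gradio's type="filepath"),
# text_prompt is a free-form string, and each *_wav is a waveform array at
# 16 kHz suitable for soundfile.write.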