File size: 2,506 Bytes
fca537b
 
 
 
 
 
8e86802
e4ada08
663300b
e4ada08
fca537b
ecf6f46
4c2fa0e
ecf6f46
e4ada08
ecf6f46
d0c2212
 
e4ada08
2077bf5
e54c733
 
e4ada08
d0c2212
e54c733
2077bf5
d0c2212
 
 
ecf6f46
 
d0c2212
ecf6f46
e54c733
e4ada08
d0c2212
 
2077bf5
 
 
351386e
2077bf5
 
d6b381d
2077bf5
 
 
d6b381d
4c2fa0e
e4ada08
ecf6f46
d0c2212
 
 
 
ecf6f46
d0c2212
e4ada08
ecf6f46
 
 
 
 
e54c733
e4ada08
4c2fa0e
fca537b
e54c733
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import torch
import soundfile as sf
import gradio as gr
import spaces
from extract_everything import extract_everything

@spaces.GPU(duration=30)
def fn_extract_everything(input_audio, input_text_prompt):
    """Extract the sound described by a text prompt and save it as a WAV file.

    Args:
        input_audio: Input audio handed over by the UI. The input component
            in this file is declared with ``type="filepath"``, so this is
            expected to be a path, or ``None`` when nothing was provided.
        input_text_prompt: Free-form description of the sound to extract.

    Returns:
        Path of a newly created WAV file containing the extracted audio.

    Raises:
        gr.Error: If no input audio was provided.
    """
    import tempfile

    if not input_audio:
        # Fail early with a readable message instead of an opaque error
        # from inside the model call.
        raise gr.Error("Please upload or select an input audio file first.")

    # NOTE(review): the model object is rebuilt on every request; if that is
    # expensive, consider constructing it once -- confirm this is compatible
    # with how the GPU decorator schedules the call.
    extract_everything_model = extract_everything()

    # The model call yields three values; only the second one (the extracted
    # signal) is written out here.
    _, output_wav, _ = extract_everything_model(input_audio, input_text_prompt)

    # NOTE(review): presumably the model produces 16 kHz audio -- confirm.
    sample_rate = 16000

    # A unique file per request: a single fixed file name would let
    # overlapping requests overwrite each other's result. The file is not
    # deleted here because the caller still has to read it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = tmp.name
    sf.write(output_path, output_wav, sample_rate)
    return output_path

def clear_all():
    """Build the updates that reset the two input widgets.

    Returns:
        A pair of Gradio updates: the first sets its target's value to
        ``None``, the second sets its target's value to an empty string.
    """
    audio_reset = gr.update(value=None)
    prompt_reset = gr.update(value="")
    return audio_reset, prompt_reset

# Build the demo page: an audio input, a text prompt, Clear/Extract buttons,
# an output player and a list of clickable examples.
with gr.Blocks(title="OmniSoniX") as demo:
    gr.Markdown("# OmniSoniX: Text-Driven Universal Target Audio Extraction")
    gr.Markdown(
        "Extract any sound using free-form text prompts. To try it, simply "
        "click one of the examples, or upload your own audio/video "
        "(preferably under 20 seconds because of the GPU usage limits here; "
        "if you encounter a Hugging Face error, try again after a few seconds)."
    )

    # Both audio components exchange file paths with the callbacks.
    audio_input = gr.Audio(label="Input Audio", type="filepath")
    text_prompt = gr.Textbox(
        label="Describe the sound to extract",
        placeholder="e.g., 'vocal', 'dog barking', 'female speech'",
    )

    with gr.Row():
        clear_btn = gr.Button("Clear")
        extract_btn = gr.Button("Extract")

    extracted_out = gr.Audio(label="Extracted Audio", type="filepath")

    # Each example is an (audio file, text prompt) pair matching `inputs`.
    # Results are not pre-computed (cache_examples=False).
    gr.Examples(
        examples=[
            ["examples/noisy_speech.wav", "noise"],
            ["examples/song_chinese.wav", "vocal"],
            ["examples/song_english.wav", "drum"],
            ["examples/bird_speech.wav", "bird chirp"],
            ["examples/keyboard_water.wav", "A person types on a keyboard"],
            ["examples/siren_speech.wav", "Ambulance siren"],
            ["examples/low_volumn_speech.wav", "Low volume speech"],
            ["examples/male_speech.wav", "Male speech"],
            ["examples/czech_speech.wav", "Czech speech"],
            ["examples/slower_speech.wav", "slower voice"],
            ["examples/happy_speech.wav", "happy speech"],
        ],
        inputs=[audio_input, text_prompt],
        outputs=[extracted_out],
        fn=fn_extract_everything,
        cache_examples=False,
    )

    # Run the extraction on the current inputs and show the resulting file.
    extract_btn.click(
        fn=fn_extract_everything,
        inputs=[audio_input, text_prompt],
        outputs=[extracted_out],
    )

    # Reset the two input widgets.
    clear_btn.click(
        fn=clear_all,
        inputs=[],
        outputs=[audio_input, text_prompt],
    )

    # Also reset the output player so a stale result from an earlier request
    # is not left on screen after "Clear".
    clear_btn.click(
        fn=lambda: None,
        inputs=[],
        outputs=[extracted_out],
    )

demo.launch()