alibabasglab committed on
Commit
d0c2212
·
verified ·
1 Parent(s): 3c07701

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -39
app.py CHANGED
@@ -2,11 +2,8 @@ import torch
2
  import soundfile as sf
3
  import gradio as gr
4
  import spaces
5
- import os
6
-
7
-
8
  from extract_everything import extract_everything
9
-
10
 
11
  @spaces.GPU(duration=30)
12
  def fn_extract_everything(input_type, input_audio, input_video, input_text_prompt):
@@ -19,54 +16,70 @@ def fn_extract_everything(input_type, input_audio, input_video, input_text_promp
19
 
20
  extract_everything_model = extract_everything()
21
  orig_wav, output_wav, residual_wav = extract_everything_model(input_wav, input_text_prompt)
 
 
22
  sf.write('original_audio.wav', orig_wav, 16000)
23
  sf.write('extracted_audio.wav', output_wav, 16000)
24
- sf.write('residual_audio.wav', residual_wav, 16000)
25
- return 'original_audio.wav', 'extracted_audio.wav', 'residual_audio.wav'
26
-
27
 
28
- # Update visibility of the file input based on the selected input type
29
  def update_input_visibility(input_type):
30
  if input_type == "Audio":
31
  return gr.update(visible=True), gr.update(visible=False)
32
  elif input_type == "Video":
33
  return gr.update(visible=False), gr.update(visible=True)
34
-
35
- demo = gr.Blocks()
36
-
37
-
38
- with demo:
39
- input_type = gr.Dropdown(["Audio", "Video"], value="Audio", multiselect=False, label="Select Input Type")
40
- audio_input = gr.Audio(label="Input Audio here", type="filepath", visible=True)
41
- video_input = gr.Video(label="Input Video here", visible=False)
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  input_type.change(
44
  fn=update_input_visibility,
45
- inputs=[input_type],
46
- outputs=[audio_input, video_input],
47
  )
48
-
49
- gr.Interface(
50
- fn=fn_extract_everything,
51
- inputs = [
52
- input_type,
53
- audio_input,
54
- video_input,
55
- gr.Textbox(label="Enter the description of the sound, in keywords or in sentence."),
56
- ],
57
- outputs = [
58
- gr.Audio(label="Unprocessed Audio", type="filepath"),
59
- gr.Audio(label="Extracted Audio", type="filepath"),
60
- gr.Audio(label="Residual Audio", type="filepath"),
61
- ],
62
- title = "OmniSoniX: Text-Driven Universal Target Audio Extraction for any Speech, Music, and Sound Events",
63
- description = ("OmniSoniX is an AI-powered tool to extract any sound you described from an audio/video."
64
- "To try it, simply click one of the examples, or upload your own audio/video (Preferably less than 20 seconds or less due to the GPU usage limits here). "),
65
- examples = [
66
- ["Audio", "examples/noisy_speech.wav", None, "noise"],
67
- ["Audio", "examples/song_chinese.wav", None, "vocal"],
68
  ],
69
- cache_examples = False,
 
 
 
 
 
 
 
 
 
70
  )
71
 
72
  demo.launch()
 
2
  import soundfile as sf
3
  import gradio as gr
4
  import spaces
 
 
 
5
  from extract_everything import extract_everything
6
+ import os
7
 
8
  @spaces.GPU(duration=30)
9
  def fn_extract_everything(input_type, input_audio, input_video, input_text_prompt):
 
16
 
17
  extract_everything_model = extract_everything()
18
  orig_wav, output_wav, residual_wav = extract_everything_model(input_wav, input_text_prompt)
19
+
20
+ # Save only the two outputs we display
21
  sf.write('original_audio.wav', orig_wav, 16000)
22
  sf.write('extracted_audio.wav', output_wav, 16000)
23
+ # (residual is computed but not saved or returned)
24
+
25
+ return 'original_audio.wav', 'extracted_audio.wav'
26
 
 
27
def update_input_visibility(input_type):
    """Toggle which input widget is shown for the selected input type.

    Returns a pair of gr.update objects: (audio_input visibility,
    video_input visibility). "Audio" shows the audio widget and hides the
    video widget; "Video" does the opposite. Any other value falls through
    and returns None, exactly like the original if/elif chain.
    """
    # Dispatch table instead of an if/elif ladder: (audio_visible, video_visible).
    visibility = {"Audio": (True, False), "Video": (False, True)}
    if input_type in visibility:
        show_audio, show_video = visibility[input_type]
        return gr.update(visible=show_audio), gr.update(visible=show_video)
    # Unrecognized selection: implicit None, matching the original behavior.
 
 
 
 
 
 
 
 
32
 
33
# Assemble the Gradio UI: input selector, prompt box, run button, and the
# two output players, then launch the app.
with gr.Blocks(title="OmniSoniX") as demo:
    gr.Markdown("# OmniSoniX: Text-Driven Universal Target Audio Extraction")
    gr.Markdown("Extract any sound (speech, music, sound events) using free-form text prompts.")

    # Source-type selector: drives which of the two input widgets is visible.
    with gr.Row():
        input_type = gr.Dropdown(
            choices=["Audio", "Video"],
            value="Audio",
            label="Select Input Type",
        )

    # Only one of these is shown at a time (see the .change hook below).
    with gr.Row():
        audio_input = gr.Audio(label="Input Audio", type="filepath", visible=True)
        video_input = gr.Video(label="Input Video", visible=False)

    # Swap widget visibility whenever the dropdown changes.
    input_type.change(
        fn=update_input_visibility,
        inputs=input_type,
        outputs=[audio_input, video_input],
    )

    text_prompt = gr.Textbox(
        label="Enter the description of the sound (e.g., 'vocal', 'dog barking', 'female speaker')",
        placeholder="Type your prompt here...",
    )

    with gr.Row():
        btn = gr.Button("Extract")

    # Outputs: the unprocessed input and the extracted target sound.
    with gr.Row():
        orig_out = gr.Audio(label="Original Audio", type="filepath")
        extracted_out = gr.Audio(label="Extracted Audio", type="filepath")

    # Clickable example rows; residual audio is intentionally not an output.
    example_rows = [
        ["Audio", "examples/noisy_speech.wav", None, "noise"],
        ["Audio", "examples/song_chinese.wav", None, "vocal"],
    ]
    gr.Examples(
        examples=example_rows,
        inputs=[input_type, audio_input, video_input, text_prompt],
        outputs=[orig_out, extracted_out],
        fn=fn_extract_everything,
        cache_examples=False,
    )

    # Main action: run the extraction model on the current inputs.
    btn.click(
        fn=fn_extract_everything,
        inputs=[input_type, audio_input, video_input, text_prompt],
        outputs=[orig_out, extracted_out],
    )

demo.launch()