File size: 2,633 Bytes
fb63790
38967a3
fb63790
 
38967a3
d4c09e5
960e153
fb63790
38967a3
fb63790
 
38967a3
fb63790
 
050eb17
 
 
 
 
 
 
 
fb63790
4a7abf4
fb63790
 
d4c09e5
fb63790
 
 
38967a3
960e153
ad61fc3
fb63790
 
 
 
38967a3
fb63790
 
 
 
 
38967a3
 
 
880a1d9
 
 
a0627ea
 
 
 
880a1d9
 
ad61fc3
880a1d9
f81bd55
 
a0627ea
 
 
 
880a1d9
 
 
a0627ea
880a1d9
 
ad61fc3
880a1d9
 
 
a0627ea
880a1d9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import os
import torch
import soundfile as sf
from huggingface_hub import login
from diffusers import StableAudioPipeline
import gradio as gr
import spaces

# Authenticate with the Hugging Face Hub using the token stored in Spaces Secrets.
HUGGINGFACE_TOKEN = os.getenv("HF_TOKEN")
if HUGGINGFACE_TOKEN is not None:
    login(HUGGINGFACE_TOKEN)
else:
    raise ValueError("Missing Hugging Face token. Please set it in Spaces Secrets.")

# Choose the compute device: prefer CUDA with float16 (halves memory use),
# otherwise fall back to CPU with float32.
if torch.cuda.is_available():
    device, torch_dtype = "cuda", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

# Instantiate the Stable Audio pipeline and move it onto the chosen device.
pipe = StableAudioPipeline.from_pretrained(
    "stabilityai/stable-audio-open-1.0",
    torch_dtype=torch_dtype,
).to(device)

# Function to generate audio
@spaces.GPU
def generate_audio(prompt, negative_prompt, duration, diffusion_steps, seed):
    """Generate an audio clip from a text prompt with Stable Audio Open.

    Args:
        prompt: Text description of the desired sound.
        negative_prompt: Text describing qualities to avoid.
        duration: Clip length in seconds (float from the UI slider).
        diffusion_steps: Number of denoising steps; cast to int because the
            slider may deliver a float.
        seed: Random seed for reproducible generation.

    Returns:
        Path to the generated WAV file ("output.wav").
    """
    # gr.Number passes a float, but torch.Generator.manual_seed requires an
    # int — without the cast this raised TypeError at generation time.
    generator = torch.Generator(device).manual_seed(int(seed))
    audio_output = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        num_inference_steps=int(diffusion_steps),  # Number of diffusion steps
        audio_end_in_s=duration,
        num_waveforms_per_prompt=1,
        generator=generator,
    ).audios
    # Pipeline output is (channels, samples); soundfile expects
    # (samples, channels), hence the transpose. Move to CPU float32 for writing.
    output_audio = audio_output[0].T.float().cpu().numpy()
    output_file = "output.wav"
    sf.write(output_file, output_audio, pipe.vae.sampling_rate)
    return output_file

# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("## 🎧 Stable Audio Open - Audio Generation 🎼")
    gr.Markdown("### Adjust prompts, duration, and diffusion steps to control the generation!")

    # Text prompts steering what the model should (and should not) produce.
    with gr.Row():
        prompt_box = gr.Textbox(label="Prompt", value="The sound of a hammer hitting a wooden surface.")
        negative_box = gr.Textbox(label="Negative Prompt", value="Low quality.")
    # Numeric generation controls.
    with gr.Row():
        duration_slider = gr.Slider(minimum=1, maximum=10, step=0.5, value=1, label="Duration (seconds)")
        steps_slider = gr.Slider(minimum=1, maximum=500, step=10, value=10, label="Diffusion Steps")
    with gr.Row():
        seed_box = gr.Number(label="Random Seed", value=42)

    # Trigger button and the audio player that receives the result file path.
    run_button = gr.Button("Generate Audio")
    audio_out = gr.Audio(label="Generated Audio", type="filepath")

    # Wire the button click to the generation function.
    run_button.click(
        generate_audio,
        inputs=[prompt_box, negative_box, duration_slider, steps_slider, seed_box],
        outputs=audio_out,
    )

# Launch the app
demo.launch()