Spaces: Running on Zero
Refactor audio loading to use librosa for consistency and improved handling of stereo audio
Browse files
- utils/extract_conditions.py +19 -12
- utils/stable_audio_dataset_utils.py +10 -5
utils/extract_conditions.py
CHANGED
|
@@ -1,15 +1,10 @@
|
|
| 1 |
-
import torchaudio
|
| 2 |
import numpy as np
|
| 3 |
-
from scipy.signal import savgol_filter
|
| 4 |
-
import librosa
|
| 5 |
import torch
|
| 6 |
import torchaudio
|
|
|
|
| 7 |
import scipy.signal as signal
|
|
|
|
| 8 |
from torchaudio import transforms as T
|
| 9 |
-
import torch
|
| 10 |
-
import torchaudio
|
| 11 |
-
import librosa
|
| 12 |
-
import numpy as np
|
| 13 |
|
| 14 |
|
| 15 |
def compute_melody_v2(stereo_audio: torch.Tensor) -> np.ndarray:
|
|
@@ -23,7 +18,10 @@ def compute_melody_v2(stereo_audio: torch.Tensor) -> np.ndarray:
|
|
| 23 |
每一列代表: [L1, R1, L2, R2, L3, R3, L4, R4](按 frame 交錯),
|
| 24 |
且每個值都 ∈ {1, 2, …, 128},對應 CQT 的頻率 bin。
|
| 25 |
"""
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
| 27 |
# 1. 先針對左、右聲道分別計算 CQT (128 bins),回傳 cqt_db 形狀都是 (128, T_frames)
|
| 28 |
cqt_left = compute_music_represent(audio[0], sr) # shape: (128, T_frames)
|
| 29 |
cqt_right = compute_music_represent(audio[1], sr) # shape: (128, T_frames)
|
|
@@ -95,7 +93,10 @@ def compute_melody(input_audio):
|
|
| 95 |
sample_rate = 44100
|
| 96 |
|
| 97 |
# Load audio file
|
| 98 |
-
|
|
|
|
|
|
|
|
|
|
| 99 |
if sr != sample_rate:
|
| 100 |
resample = torchaudio.transforms.Resample(orig_freq=sr, new_freq=sample_rate)
|
| 101 |
wav = resample(wav)
|
|
@@ -127,7 +128,10 @@ def compute_dynamics(audio_file, hop_length=160, target_sample_rate=44100, cut=T
|
|
| 127 |
dynamics_curve (numpy.ndarray): The computed dynamic values in dB.
|
| 128 |
"""
|
| 129 |
# Load audio file
|
| 130 |
-
|
|
|
|
|
|
|
|
|
|
| 131 |
if original_sample_rate != target_sample_rate:
|
| 132 |
resampler = torchaudio.transforms.Resample(orig_freq=original_sample_rate, new_freq=target_sample_rate)
|
| 133 |
waveform = resampler(waveform)
|
|
@@ -173,9 +177,12 @@ def extract_melody_one_hot(audio_path,
|
|
| 173 |
One-hot chromagram of the most prominent pitch class per frame.
|
| 174 |
"""
|
| 175 |
# ---------------------------------------------------------
|
| 176 |
-
# 1. Load audio (
|
| 177 |
# ---------------------------------------------------------
|
| 178 |
-
|
|
|
|
|
|
|
|
|
|
| 179 |
|
| 180 |
# Convert to mono by averaging channels: shape => (samples,)
|
| 181 |
audio_mono = audio.mean(dim=0)
|
|
|
|
|
|
|
| 1 |
import numpy as np
|
|
|
|
|
|
|
| 2 |
import torch
|
| 3 |
import torchaudio
|
| 4 |
+
import librosa
|
| 5 |
import scipy.signal as signal
|
| 6 |
+
from scipy.signal import savgol_filter
|
| 7 |
from torchaudio import transforms as T
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
def compute_melody_v2(stereo_audio: torch.Tensor) -> np.ndarray:
|
|
|
|
| 18 |
每一列代表: [L1, R1, L2, R2, L3, R3, L4, R4](按 frame 交錯),
|
| 19 |
且每個值都 ∈ {1, 2, …, 128},對應 CQT 的頻率 bin。
|
| 20 |
"""
|
| 21 |
+
audio_np, sr = librosa.load(stereo_audio, sr=None, mono=False)
|
| 22 |
+
if audio_np.ndim == 1:
|
| 23 |
+
audio_np = np.expand_dims(audio_np, 0)
|
| 24 |
+
audio = torch.as_tensor(audio_np, dtype=torch.float32)
|
| 25 |
# 1. 先針對左、右聲道分別計算 CQT (128 bins),回傳 cqt_db 形狀都是 (128, T_frames)
|
| 26 |
cqt_left = compute_music_represent(audio[0], sr) # shape: (128, T_frames)
|
| 27 |
cqt_right = compute_music_represent(audio[1], sr) # shape: (128, T_frames)
|
|
|
|
| 93 |
sample_rate = 44100
|
| 94 |
|
| 95 |
# Load audio file
|
| 96 |
+
wav_np, sr = librosa.load(input_audio, sr=None, mono=False)
|
| 97 |
+
if wav_np.ndim == 1:
|
| 98 |
+
wav_np = np.expand_dims(wav_np, 0)
|
| 99 |
+
wav = torch.as_tensor(wav_np, dtype=torch.float32)
|
| 100 |
if sr != sample_rate:
|
| 101 |
resample = torchaudio.transforms.Resample(orig_freq=sr, new_freq=sample_rate)
|
| 102 |
wav = resample(wav)
|
|
|
|
| 128 |
dynamics_curve (numpy.ndarray): The computed dynamic values in dB.
|
| 129 |
"""
|
| 130 |
# Load audio file
|
| 131 |
+
waveform_np, original_sample_rate = librosa.load(audio_file, sr=None, mono=False)
|
| 132 |
+
if waveform_np.ndim == 1:
|
| 133 |
+
waveform_np = np.expand_dims(waveform_np, 0)
|
| 134 |
+
waveform = torch.as_tensor(waveform_np, dtype=torch.float32)
|
| 135 |
if original_sample_rate != target_sample_rate:
|
| 136 |
resampler = torchaudio.transforms.Resample(orig_freq=original_sample_rate, new_freq=target_sample_rate)
|
| 137 |
waveform = resampler(waveform)
|
|
|
|
| 177 |
One-hot chromagram of the most prominent pitch class per frame.
|
| 178 |
"""
|
| 179 |
# ---------------------------------------------------------
|
| 180 |
+
# 1. Load audio (librosa => shape: (channels, samples))
|
| 181 |
# ---------------------------------------------------------
|
| 182 |
+
audio_np, in_sr = librosa.load(audio_path, sr=None, mono=False)
|
| 183 |
+
if audio_np.ndim == 1:
|
| 184 |
+
audio_np = np.expand_dims(audio_np, 0)
|
| 185 |
+
audio = torch.as_tensor(audio_np, dtype=torch.float32)
|
| 186 |
|
| 187 |
# Convert to mono by averaging channels: shape => (samples,)
|
| 188 |
audio_mono = audio.mean(dim=0)
|
utils/stable_audio_dataset_utils.py
CHANGED
|
@@ -1,15 +1,20 @@
|
|
| 1 |
import math
|
| 2 |
import random
|
| 3 |
-
import torch
|
| 4 |
-
from torch import nn
|
| 5 |
from typing import Tuple
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
| 7 |
import torch.nn.functional as F
|
|
|
|
| 8 |
from torchaudio import transforms as T
|
| 9 |
|
| 10 |
def load_audio_file(filename, target_sr=44100, target_samples=2097152):
|
| 11 |
try:
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
| 13 |
# Resample if necessary
|
| 14 |
if in_sr != target_sr:
|
| 15 |
resampler = T.Resample(in_sr, target_sr)
|
|
@@ -126,4 +131,4 @@ class Stereo(nn.Module):
|
|
| 126 |
elif signal_shape[0] > 2: #?, s -> 2,s
|
| 127 |
signal = signal[:2, :]
|
| 128 |
|
| 129 |
-
return signal
|
|
|
|
| 1 |
import math
|
| 2 |
import random
|
|
|
|
|
|
|
| 3 |
from typing import Tuple
|
| 4 |
+
|
| 5 |
+
import librosa
|
| 6 |
+
import numpy as np
|
| 7 |
+
import torch
|
| 8 |
import torch.nn.functional as F
|
| 9 |
+
from torch import nn
|
| 10 |
from torchaudio import transforms as T
|
| 11 |
|
| 12 |
def load_audio_file(filename, target_sr=44100, target_samples=2097152):
|
| 13 |
try:
|
| 14 |
+
audio_np, in_sr = librosa.load(filename, sr=None, mono=False)
|
| 15 |
+
if audio_np.ndim == 1:
|
| 16 |
+
audio_np = np.expand_dims(audio_np, 0)
|
| 17 |
+
audio = torch.as_tensor(audio_np, dtype=torch.float32)
|
| 18 |
# Resample if necessary
|
| 19 |
if in_sr != target_sr:
|
| 20 |
resampler = T.Resample(in_sr, target_sr)
|
|
|
|
| 131 |
elif signal_shape[0] > 2: #?, s -> 2,s
|
| 132 |
signal = signal[:2, :]
|
| 133 |
|
| 134 |
+
return signal
|