manoskary committed on
Commit
3f9798c
·
1 Parent(s): ef862da

Refactor audio loading to use librosa for consistency and improved handling of stereo audio

Browse files
utils/extract_conditions.py CHANGED
@@ -1,15 +1,10 @@
1
- import torchaudio
2
  import numpy as np
3
- from scipy.signal import savgol_filter
4
- import librosa
5
  import torch
6
  import torchaudio
 
7
  import scipy.signal as signal
 
8
  from torchaudio import transforms as T
9
- import torch
10
- import torchaudio
11
- import librosa
12
- import numpy as np
13
 
14
 
15
  def compute_melody_v2(stereo_audio: torch.Tensor) -> np.ndarray:
@@ -23,7 +18,10 @@ def compute_melody_v2(stereo_audio: torch.Tensor) -> np.ndarray:
23
  每一列代表: [L1, R1, L2, R2, L3, R3, L4, R4](按 frame 交錯),
24
  且每個值都 ∈ {1, 2, …, 128},對應 CQT 的頻率 bin。
25
  """
26
- audio, sr = torchaudio.load(stereo_audio)
 
 
 
27
  # 1. 先針對左、右聲道分別計算 CQT (128 bins),回傳 cqt_db 形狀都是 (128, T_frames)
28
  cqt_left = compute_music_represent(audio[0], sr) # shape: (128, T_frames)
29
  cqt_right = compute_music_represent(audio[1], sr) # shape: (128, T_frames)
@@ -95,7 +93,10 @@ def compute_melody(input_audio):
95
  sample_rate = 44100
96
 
97
  # Load audio file
98
- wav, sr = torchaudio.load(input_audio)
 
 
 
99
  if sr != sample_rate:
100
  resample = torchaudio.transforms.Resample(orig_freq=sr, new_freq=sample_rate)
101
  wav = resample(wav)
@@ -127,7 +128,10 @@ def compute_dynamics(audio_file, hop_length=160, target_sample_rate=44100, cut=T
127
  dynamics_curve (numpy.ndarray): The computed dynamic values in dB.
128
  """
129
  # Load audio file
130
- waveform, original_sample_rate = torchaudio.load(audio_file)
 
 
 
131
  if original_sample_rate != target_sample_rate:
132
  resampler = torchaudio.transforms.Resample(orig_freq=original_sample_rate, new_freq=target_sample_rate)
133
  waveform = resampler(waveform)
@@ -173,9 +177,12 @@ def extract_melody_one_hot(audio_path,
173
  One-hot chromagram of the most prominent pitch class per frame.
174
  """
175
  # ---------------------------------------------------------
176
- # 1. Load audio (Torchaudio => shape: (channels, samples))
177
  # ---------------------------------------------------------
178
- audio, in_sr = torchaudio.load(audio_path)
 
 
 
179
 
180
  # Convert to mono by averaging channels: shape => (samples,)
181
  audio_mono = audio.mean(dim=0)
 
 
1
  import numpy as np
 
 
2
  import torch
3
  import torchaudio
4
+ import librosa
5
  import scipy.signal as signal
6
+ from scipy.signal import savgol_filter
7
  from torchaudio import transforms as T
 
 
 
 
8
 
9
 
10
  def compute_melody_v2(stereo_audio: torch.Tensor) -> np.ndarray:
 
18
  每一列代表: [L1, R1, L2, R2, L3, R3, L4, R4](按 frame 交錯),
19
  且每個值都 ∈ {1, 2, …, 128},對應 CQT 的頻率 bin。
20
  """
21
+ audio_np, sr = librosa.load(stereo_audio, sr=None, mono=False)
22
+ if audio_np.ndim == 1:
23
+ audio_np = np.expand_dims(audio_np, 0)
24
+ audio = torch.as_tensor(audio_np, dtype=torch.float32)
25
  # 1. 先針對左、右聲道分別計算 CQT (128 bins),回傳 cqt_db 形狀都是 (128, T_frames)
26
  cqt_left = compute_music_represent(audio[0], sr) # shape: (128, T_frames)
27
  cqt_right = compute_music_represent(audio[1], sr) # shape: (128, T_frames)
 
93
  sample_rate = 44100
94
 
95
  # Load audio file
96
+ wav_np, sr = librosa.load(input_audio, sr=None, mono=False)
97
+ if wav_np.ndim == 1:
98
+ wav_np = np.expand_dims(wav_np, 0)
99
+ wav = torch.as_tensor(wav_np, dtype=torch.float32)
100
  if sr != sample_rate:
101
  resample = torchaudio.transforms.Resample(orig_freq=sr, new_freq=sample_rate)
102
  wav = resample(wav)
 
128
  dynamics_curve (numpy.ndarray): The computed dynamic values in dB.
129
  """
130
  # Load audio file
131
+ waveform_np, original_sample_rate = librosa.load(audio_file, sr=None, mono=False)
132
+ if waveform_np.ndim == 1:
133
+ waveform_np = np.expand_dims(waveform_np, 0)
134
+ waveform = torch.as_tensor(waveform_np, dtype=torch.float32)
135
  if original_sample_rate != target_sample_rate:
136
  resampler = torchaudio.transforms.Resample(orig_freq=original_sample_rate, new_freq=target_sample_rate)
137
  waveform = resampler(waveform)
 
177
  One-hot chromagram of the most prominent pitch class per frame.
178
  """
179
  # ---------------------------------------------------------
180
+ # 1. Load audio (librosa => shape: (channels, samples))
181
  # ---------------------------------------------------------
182
+ audio_np, in_sr = librosa.load(audio_path, sr=None, mono=False)
183
+ if audio_np.ndim == 1:
184
+ audio_np = np.expand_dims(audio_np, 0)
185
+ audio = torch.as_tensor(audio_np, dtype=torch.float32)
186
 
187
  # Convert to mono by averaging channels: shape => (samples,)
188
  audio_mono = audio.mean(dim=0)
utils/stable_audio_dataset_utils.py CHANGED
@@ -1,15 +1,20 @@
1
  import math
2
  import random
3
- import torch
4
- from torch import nn
5
  from typing import Tuple
6
- import torchaudio
 
 
 
7
  import torch.nn.functional as F
 
8
  from torchaudio import transforms as T
9
 
10
  def load_audio_file(filename, target_sr=44100, target_samples=2097152):
11
  try:
12
- audio, in_sr = torchaudio.load(filename)
 
 
 
13
  # Resample if necessary
14
  if in_sr != target_sr:
15
  resampler = T.Resample(in_sr, target_sr)
@@ -126,4 +131,4 @@ class Stereo(nn.Module):
126
  elif signal_shape[0] > 2: #?, s -> 2,s
127
  signal = signal[:2, :]
128
 
129
- return signal
 
1
  import math
2
  import random
 
 
3
  from typing import Tuple
4
+
5
+ import librosa
6
+ import numpy as np
7
+ import torch
8
  import torch.nn.functional as F
9
+ from torch import nn
10
  from torchaudio import transforms as T
11
 
12
  def load_audio_file(filename, target_sr=44100, target_samples=2097152):
13
  try:
14
+ audio_np, in_sr = librosa.load(filename, sr=None, mono=False)
15
+ if audio_np.ndim == 1:
16
+ audio_np = np.expand_dims(audio_np, 0)
17
+ audio = torch.as_tensor(audio_np, dtype=torch.float32)
18
  # Resample if necessary
19
  if in_sr != target_sr:
20
  resampler = T.Resample(in_sr, target_sr)
 
131
  elif signal_shape[0] > 2: #?, s -> 2,s
132
  signal = signal[:2, :]
133
 
134
+ return signal