Commit cb6da82 (Build error)
Parent(s): 579d79b

update

Files changed:
- app.py +16 -11
- inference_speech_editing_scale.py +9 -4
- inference_tts_scale.py +11 -5
- requirements.txt +3 -2
app.py CHANGED

@@ -11,31 +11,36 @@ import io
 import numpy as np
 import random
 import uuid
-import spaces
+#import spaces
+import devicetorch
 
 
 DEMO_PATH = os.getenv("DEMO_PATH", "./demo")
 TMP_PATH = os.getenv("TMP_PATH", "./demo/temp")
 MODELS_PATH = os.getenv("MODELS_PATH", "./pretrained_models")
-device = "cuda" if torch.cuda.is_available() else "cpu"
+#device = "cuda" if torch.cuda.is_available() else "cpu"
+device = devicetorch.get(torch)
 whisper_model, align_model, voicecraft_model = None, None, None
 
 
 def get_random_string():
     return "".join(str(uuid.uuid4()).split("-"))
 
-@spaces.GPU(duration=30)
+#@spaces.GPU(duration=30)
 def seed_everything(seed):
     if seed != -1:
         os.environ['PYTHONHASHSEED'] = str(seed)
         random.seed(seed)
         np.random.seed(seed)
         torch.manual_seed(seed)
-        torch.cuda.manual_seed(seed)
+        if device == "cuda":
+            torch.cuda.manual_seed(seed)
+        elif device == "mps":
+            torch.mps.manual_seed(seed)
         torch.backends.cudnn.benchmark = False
         torch.backends.cudnn.deterministic = True
 
-@spaces.GPU(duration=120)
+#@spaces.GPU(duration=120)
 class WhisperxAlignModel:
     def __init__(self):
         from whisperx import load_align_model

@@ -46,7 +51,7 @@ class WhisperxAlignModel:
         audio = load_audio(audio_path)
         return align(segments, self.model, self.metadata, audio, device, return_char_alignments=False)["segments"]
 
-@spaces.GPU(duration=120)
+#@spaces.GPU(duration=120)
 class WhisperModel:
     def __init__(self, model_name):
         from whisper import load_model

@@ -63,7 +68,7 @@ class WhisperModel:
     def transcribe(self, audio_path):
         return self.model.transcribe(audio_path, suppress_tokens=self.supress_tokens, word_timestamps=True)["segments"]
 
-@spaces.GPU(duration=120)
+#@spaces.GPU(duration=120)
 class WhisperxModel:
     def __init__(self, model_name, align_model: WhisperxAlignModel):
         from whisperx import load_model

@@ -74,7 +79,7 @@ class WhisperxModel:
         segments = self.model.transcribe(audio_path, batch_size=8)["segments"]
         return self.align_model.align(segments, audio_path)
 
-@spaces.GPU(duration=120)
+#@spaces.GPU(duration=120)
 def load_models(whisper_backend_name, whisper_model_name, alignment_model_name, voicecraft_model_name):
     global transcribe_model, align_model, voicecraft_model
 
@@ -123,7 +128,7 @@ def get_transcribe_state(segments):
         "word_bounds": [f"{word['start']} {word['word']} {word['end']}" for word in words_info]
     }
 
-@spaces.GPU(duration=60)
+#@spaces.GPU(duration=60)
 def transcribe(seed, audio_path):
     if transcribe_model is None:
         raise gr.Error("Transcription model not loaded")

@@ -162,7 +167,7 @@ def align_segments(transcript, audio_path):
     with open(tmp_sync_map_path, "r") as f:
         return json.load(f)
 
-@spaces.GPU(duration=90)
+#@spaces.GPU(duration=90)
 def align(seed, transcript, audio_path):
     if align_model is None:
         raise gr.Error("Align model not loaded")

@@ -193,7 +198,7 @@ def get_output_audio(audio_tensors, codec_audio_sr):
     buffer.seek(0)
     return buffer.read()
 
-@spaces.GPU(duration=90)
+#@spaces.GPU(duration=90)
 def run(seed, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p, temperature,
         stop_repetition, sample_batch_size, kvcache, silence_tokens,
         audio_path, transcribe_state, transcript, smart_transcript,
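The key change above swaps the hardcoded CUDA probe for devicetorch.get(torch), the lookup form this commit also uses in inference_tts_scale.py, so the same app can run on CUDA, Apple MPS, or CPU. Below is a minimal sketch of the selection that lookup is assumed to perform; pick_device is a hypothetical stand-in for devicetorch.get, shown only to make the expected return values concrete:

import torch

def pick_device(torch_module) -> str:
    # Hypothetical stand-in for devicetorch.get(torch): report the best
    # available backend as a plain string, which is what the seeding
    # branches in app.py compare against.
    if torch_module.cuda.is_available():
        return "cuda"
    mps_backend = getattr(torch_module.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return "mps"
    return "cpu"

device = pick_device(torch)  # "cuda", "mps", or "cpu"

Returning plain strings rather than torch.device objects is what lets seed_everything compare with device == "cuda" and device == "mps" directly.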
inference_speech_editing_scale.py CHANGED

@@ -4,6 +4,7 @@ import os, random
 import numpy as np
 import torch
 import torchaudio
+import devicetorch
 
 from data.tokenizer import (
     AudioTokenizer,

@@ -96,9 +97,10 @@ def get_model(exp_dir, device=None):
     del ckpt
     logging.info("done loading weights...")
     if device == None:
-        device = torch.device("cpu")
-        if torch.cuda.is_available():
-            device = torch.device("cuda:0")
+        device = devicetorch.get(torch)
+        # device = torch.device("cpu")
+        # if torch.cuda.is_available():
+        #     device = torch.device("cuda:0")
     model.to(device)
     model.eval()
     return model, model_args, phn2num

@@ -132,7 +134,10 @@ if __name__ == "__main__":
     random.seed(seed)
     np.random.seed(seed)
     torch.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
+    if device == "cuda":
+        torch.cuda.manual_seed(seed)
+    elif device == "mps":
+        torch.mps.manual_seed(seed)
     torch.backends.cudnn.benchmark = False
     torch.backends.cudnn.deterministic = True
     formatter = (
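The seeding change mirrors app.py: unconditional CUDA seeding becomes a branch on the detected backend. Gathered into one self-contained helper for reference, assuming device is the backend string from devicetorch and a PyTorch recent enough to expose torch.mps.manual_seed:

import os
import random

import numpy as np
import torch

def seed_everything(seed, device):
    # Mirror of the patched seeding logic; seed == -1 means "leave RNGs alone".
    if seed == -1:
        return
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if device == "cuda":
        torch.cuda.manual_seed(seed)
    elif device == "mps":
        torch.mps.manual_seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

One difference from inference_tts_scale.py: that file assigns device = devicetorch.get(torch) before these branches, while the hunk above branches on device without assigning it. Unless device is set earlier in this script's __main__ block, the tts version of the hunk is presumably what was intended here as well.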
inference_tts_scale.py CHANGED

@@ -4,6 +4,7 @@ import os, random
 import numpy as np
 import torch
 import torchaudio
+import devicetorch
 
 from data.tokenizer import (
     AudioTokenizer,

@@ -115,9 +116,10 @@ def get_model(exp_dir, device=None):
     del ckpt
     logging.info("done loading weights...")
     if device == None:
-        device = torch.device("cpu")
-        if torch.cuda.is_available():
-            device = torch.device("cuda:0")
+        device = devicetorch.get(torch)
+        # device = torch.device("cpu")
+        # if torch.cuda.is_available():
+        #     device = torch.device("cuda:0")
     model.to(device)
     model.eval()
     return model, model_args, phn2num

@@ -128,7 +130,11 @@ if __name__ == "__main__":
     random.seed(seed)
     np.random.seed(seed)
     torch.manual_seed(seed)
+    device = devicetorch.get(torch)
+    if device == "cuda":
+        torch.cuda.manual_seed(seed)
+    elif device == "mps":
+        torch.mps.manual_seed(seed)
     torch.backends.cudnn.benchmark = False
     torch.backends.cudnn.deterministic = True
     formatter = (

@@ -187,4 +193,4 @@ if __name__ == "__main__":
         seg_save_fn_concat = f"{args.output_dir}/concat_{new_audio_fn[:-4]}_{i}_seed{args.seed}.wav"
 
         torchaudio.save(seg_save_fn_gen, gen_audio, args.codec_audio_sr)
-        torchaudio.save(seg_save_fn_concat, concated_audio, args.codec_audio_sr)
+        torchaudio.save(seg_save_fn_concat, concated_audio, args.codec_audio_sr)
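With this default in place, get_model resolves a device on its own whenever the caller passes none. A hypothetical call site follows; the import path and checkpoint directory are illustrative, not taken from the repo:

import torch
from inference_tts_scale import get_model  # assumed import path

# No device given: get_model falls back to devicetorch.get(torch).
model, model_args, phn2num = get_model("./pretrained_models/exp_dir")

# Explicit device: the devicetorch lookup is skipped entirely.
model_cpu, _, _ = get_model("./pretrained_models/exp_dir", device=torch.device("cpu"))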
requirements.txt CHANGED

@@ -3,7 +3,8 @@ phonemizer==3.2.1
 gradio
 nltk>=3.8.1
 openai-whisper>=20231117
-spaces
+#spaces
 aeneas==1.7.3.0
 whisperx==3.1.1
-huggingface-hub==0.22.2
+huggingface-hub==0.22.2
+devicetorch
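A quick sanity check that the dependency swap resolves in a fresh environment; this is a sketch, and devicetorch.get is the only call the patched files rely on:

import devicetorch
import torch

# Should print "cuda", "mps", or "cpu" depending on the host.
print(devicetorch.get(torch))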