Commit afeb57e
Parent(s): a18eac3

make voicecraft changes
app.py CHANGED
@@ -248,14 +248,12 @@ def load_hubert():
     global hubert_model
     # Load the model
 
-    configH
+    configH = HubertConfig()
     configH.output_hidden_states = True
     hubert_model = HubertModel(configH)
     hubert_model.load_state_dict(torch.load('hubert_base_hf_statedict.pt'))
     # Prepare the model
     hubert_model = hubert_model.to(config.device)
-    config.device = "cuda"
-    config.is_half=True
     if config.is_half:
         hubert_model = hubert_model.half()
     else:
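Note: the patched load_hubert() builds the HuBERT encoder from a freshly constructed config (the old code referenced configH before it was ever defined) and stops hard-coding config.device / config.is_half. For context, a minimal sketch of the same loading pattern, assuming the transformers and torch packages and the locally converted hubert_base_hf_statedict.pt checkpoint named in the diff:

    import torch
    from transformers import HubertConfig, HubertModel

    def load_hubert_sketch(device: str = "cpu", half: bool = False) -> HubertModel:
        # Build the architecture from a default config; expose hidden states
        # because downstream feature extraction reads them.
        cfg = HubertConfig()
        cfg.output_hidden_states = True
        model = HubertModel(cfg)
        # Load the locally converted weights (path taken from the diff).
        model.load_state_dict(torch.load("hubert_base_hf_statedict.pt", map_location="cpu"))
        model = model.to(device)
        model = model.half() if half else model.float()
        return model.eval()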
@@ -1400,6 +1398,7 @@ def download_from_url(url, model, associated_user=None):
     os.makedirs("unzips", exist_ok=True)
     zipfile = model + '.zip'
     zipfile_path = './zips/' + zipfile
+    return
     try:
         if "drive.google.com" in url or "drive.usercontent.google.com":
             subprocess.run(["gdown", url, "--fuzzy", "-O", zipfile_path])
@@ -1474,6 +1473,18 @@ def stoptraining(mim):
 
 
 def transcribe_btn_click(audio_choice):
+    global transcript_fn
+    global audio_fn
+
+    temp_folder = "./demo/temp"
+    orig_audio = audio_choice
+    filename = os.path.splitext(orig_audio.split("/")[-1])[0]
+    audio_fn = f"{temp_folder}/{filename}.wav"
+    transcript_fn = f"{temp_folder}/{filename}.txt"
+    if os.path.exists(audio_fn) and os.path.exists(transcript_fn):
+        print("Audio and transcript already exist, skipping transcript")
+        return
+
     batch_size = 1 # Adjust based on your GPU memory availability
     compute_type = "float16"
     device = "cuda" if torch.cuda.is_available() else "cpu"
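Note: transcribe_btn_click() now resolves the temp-folder paths up front and returns early when both the cached .wav and .txt already exist, so repeated runs skip re-transcription. A standalone sketch of that caching check (helper name and folder layout are illustrative):

    import os

    def cached_transcript_paths(audio_path: str, temp_folder: str = "./demo/temp"):
        # Derive the cached file names from the input audio's base name.
        stem = os.path.splitext(os.path.basename(audio_path))[0]
        audio_fn = f"{temp_folder}/{stem}.wav"
        transcript_fn = f"{temp_folder}/{stem}.txt"
        # Only reuse the cache when both the audio copy and the transcript exist.
        if os.path.exists(audio_fn) and os.path.exists(transcript_fn):
            return audio_fn, transcript_fn
        return None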
@@ -1497,7 +1508,6 @@ def transcribe_btn_click(audio_choice):
     orig_audio = audio_choice
     orig_transcript = result
     # move the audio and transcript to temp folder
-    temp_folder = "./demo/temp"
     os.makedirs(temp_folder, exist_ok=True)
     os.system(f"cp \"{orig_audio}\" \"{temp_folder}\"")
     filename = os.path.splitext(orig_audio.split("/")[-1])[0]
@@ -1507,12 +1517,9 @@ def transcribe_btn_click(audio_choice):
     align_temp = f"{temp_folder}/mfa_alignments"
     os.makedirs(align_temp, exist_ok=True)
 
-    global audio_fn
     audio_fn = f"{temp_folder}/{filename}.wav"
-    global transcript_fn
     transcript_fn = f"{temp_folder}/{filename}.txt"
 
-
     return result
 
 
@@ -1530,6 +1537,7 @@ def run(seed, stop_repetition, sample_batch_size, left_margin, right_margin, cod
     info = torchaudio.info(audio_fn)
     audio_dur = info.num_frames / info.sample_rate
 
+    print("audio dur s is", audio_dur, "cutoff_sec is", cut_off_sec)
     assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}"
     prompt_end_frame = int(cut_off_sec * info.sample_rate)
 
@@ -1570,7 +1578,7 @@ def run_joint(seed, stop_repetition, sample_batch_size, left_margin, right_margin, cod
 
     return [seg_save_fn_concat, seg_save_fn_gen]
 
-def run_joint(seed, stop_repetition, sample_batch_size, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
+def run_joint(input_audio_fn, seed, stop_repetition, sample_batch_size, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
               temperature, kvcache, cutoff_value, target_transcript, silence_tokens, transcribed_text,
               sid,
               f0_up_key,
@@ -1585,19 +1593,25 @@ def run_joint(seed, stop_repetition, sample_batch_size, left_margin, right_margi
               rms_mix_rate,
               protect,
               crepe_hop_length):
-
     global voicecraft_model, voicecraft_config, phn2num
+
+    print("Transcribing the input audio")
+    transcribe_btn_click(input_audio_fn)
+    print("Transcription complete")
 
     os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
     os.environ["CUDA_VISIBLE_DEVICES"] = "0"
     os.environ["USER"] = "USER"
     # take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
-    cut_off_sec = cutoff_value # NOTE: according to forced-alignment file, the word "common" stop as 3.01 sec, this should be different for different audio
+    # cut_off_sec = cutoff_value # NOTE: according to forced-alignment file, the word "common" stop as 3.01 sec, this should be different for different audio
+
     target_transcript = transcribed_text + ' ' + target_transcript
     print(target_transcript)
     info = torchaudio.info(audio_fn)
     audio_dur = info.num_frames / info.sample_rate
 
+    cut_off_sec = audio_dur - 0.1
+
     assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}"
     prompt_end_frame = int(cut_off_sec * info.sample_rate)
 
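Note: run_joint() now transcribes the input audio itself and derives cut_off_sec from the clip length (audio_dur - 0.1 s) rather than from the user-supplied cutoff_value, so the prompt always ends just inside the recording. A small sketch of that duration-based cutoff, assuming torchaudio and the 0.1 s margin shown in the diff:

    import torchaudio

    def prompt_end_frame_from_duration(audio_fn: str, margin_sec: float = 0.1) -> int:
        # Use (almost) the whole clip as the voice prompt: stop margin_sec
        # before the end so the cutoff stays strictly inside the audio.
        info = torchaudio.info(audio_fn)
        audio_dur = info.num_frames / info.sample_rate
        cut_off_sec = audio_dur - margin_sec
        assert cut_off_sec < audio_dur, "cutoff must fall inside the audio"
        return int(cut_off_sec * info.sample_rate)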
@@ -1617,6 +1631,7 @@ def run_joint(seed, stop_repetition, sample_batch_size, left_margin, right_margi
     concated_audio, gen_audio = inference_one_sample(voicecraft_model, voicecraft_config, phn2num, text_tokenizer, audio_tokenizer,
                                                      audio_fn, target_transcript, config.device, decode_config,
                                                      prompt_end_frame)
+    print("prompt_end_frame: ", prompt_end_frame, "voicecraft_config: ", voicecraft_config, "audio_fn: ", audio_fn, "target_transcript: ", target_transcript, "config.device: ", config.device, "decode_config: ", decode_config)
 
     # save segments for comparison
     concated_audio, gen_audio = concated_audio[0].cpu(), gen_audio[0].cpu()
@@ -1636,7 +1651,8 @@ def run_joint(seed, stop_repetition, sample_batch_size, left_margin, right_margi
 
     f0_up_key = int(f0_up_key)
     try:
-        audio = gen_audio
+        # audio = gen_audio.squeeze()
+        audio = load_audio(seg_save_fn_gen, 16000, DoFormant, Quefrency, Timbre).squeeze()
         audio_max = np.abs(audio).max() / 0.95
         if audio_max > 1:
             audio /= audio_max
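Note: before RVC conversion, the generated segment is now re-read from disk at 16 kHz through the repo's load_audio helper (with the formant-shift settings) instead of reusing the in-memory VoiceCraft tensor. A rough equivalent of that load-and-normalize step, sketched with librosa in place of the repo-specific helper:

    import librosa
    import numpy as np

    def load_for_rvc(path: str, sr: int = 16000) -> np.ndarray:
        # Load the generated segment as mono float32 at the rate the pipeline expects.
        audio, _ = librosa.load(path, sr=sr, mono=True)
        # Peak-normalize the same way the surrounding code does (0.95 headroom).
        audio_max = np.abs(audio).max() / 0.95
        if audio_max > 1:
            audio = audio / audio_max
        return audio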
@@ -1657,6 +1673,7 @@ def run_joint(seed, stop_repetition, sample_batch_size, left_margin, right_margi
         # file_big_npy = (
         #     file_big_npy.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
         # )
+        print(f"Making VC Pipeline, device: {config.device}, audio shape: {audio.shape}")
         audio_opt = vc.pipeline(
             hubert_model,
             net_g,
@@ -2029,6 +2046,7 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
     run_btn_joint.click(
         fn=run_joint,
         inputs=[
+            input_audio0,
             seed,
             stop_repitition,
             sample_batch_size,
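Note: since run_joint() gained input_audio_fn as its first parameter, the Gradio wiring prepends input_audio0 to inputs so the positional order keeps matching the signature. A toy sketch of that pattern (component names and values here are illustrative, not the app's actual layout):

    import gradio as gr

    def run_joint_stub(input_audio_fn, seed):
        # Stand-in for the real run_joint: the first positional arg is the audio path.
        return f"would synthesize from {input_audio_fn} with seed {seed}"

    with gr.Blocks() as demo:
        input_audio0 = gr.Audio(type="filepath")   # maps to input_audio_fn
        seed = gr.Number(value=42)                 # maps to seed
        out = gr.Textbox()
        run_btn_joint = gr.Button("Run")
        # Gradio passes `inputs` positionally, so the list order must mirror
        # the function's parameter order.
        run_btn_joint.click(fn=run_joint_stub, inputs=[input_audio0, seed], outputs=[out])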
@@ -2429,4 +2447,4 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
 )
 
 app.queue(concurrency_count=511, max_size=1022).launch(share=False, quiet=False, auth=[('jvke', 'thisfeelslikeai'), ('cmss60', 'yourseedislate')])
-#endregion
+#endregion