Commit afeb57e
Parent(s): a18eac3

make voicecraft changes
app.py CHANGED
@@ -248,14 +248,12 @@ def load_hubert():
     global hubert_model
     # Load the model
 
-    configH
+    configH = HubertConfig()
     configH.output_hidden_states = True
     hubert_model = HubertModel(configH)
     hubert_model.load_state_dict(torch.load('hubert_base_hf_statedict.pt'))
     # Prepare the model
     hubert_model = hubert_model.to(config.device)
-    config.device = "cuda"
-    config.is_half=True
     if config.is_half:
         hubert_model = hubert_model.half()
     else:
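Note: the patched load_hubert() builds the HuBERT encoder from a freshly constructed config (the old code referenced configH before it was ever defined) and stops hard-coding config.device / config.is_half. For context, a minimal sketch of the same loading pattern, assuming the transformers and torch packages and the locally converted hubert_base_hf_statedict.pt checkpoint named in the diff:

    import torch
    from transformers import HubertConfig, HubertModel

    def load_hubert_sketch(device: str = "cpu", half: bool = False) -> HubertModel:
        # Build the architecture from a default config; expose hidden states
        # because downstream feature extraction reads them.
        cfg = HubertConfig()
        cfg.output_hidden_states = True
        model = HubertModel(cfg)
        # Load the locally converted weights (path taken from the diff).
        model.load_state_dict(torch.load("hubert_base_hf_statedict.pt", map_location="cpu"))
        model = model.to(device)
        model = model.half() if half else model.float()
        return model.eval()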
@@ -1400,6 +1398,7 @@ def download_from_url(url, model, associated_user=None):
     os.makedirs("unzips", exist_ok=True)
     zipfile = model + '.zip'
     zipfile_path = './zips/' + zipfile
+    return
     try:
         if "drive.google.com" in url or "drive.usercontent.google.com":
             subprocess.run(["gdown", url, "--fuzzy", "-O", zipfile_path])
@@ -1474,6 +1473,18 @@ def stoptraining(mim):
 
 
 def transcribe_btn_click(audio_choice):
+    global transcript_fn
+    global audio_fn
+
+    temp_folder = "./demo/temp"
+    orig_audio = audio_choice
+    filename = os.path.splitext(orig_audio.split("/")[-1])[0]
+    audio_fn = f"{temp_folder}/{filename}.wav"
+    transcript_fn = f"{temp_folder}/{filename}.txt"
+    if os.path.exists(audio_fn) and os.path.exists(transcript_fn):
+        print("Audio and transcript already exist, skipping transcript")
+        return
+
     batch_size = 1 # Adjust based on your GPU memory availability
     compute_type = "float16"
     device = "cuda" if torch.cuda.is_available() else "cpu"
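Note: transcribe_btn_click() now resolves the temp-folder paths up front and returns early when both the cached .wav and .txt already exist, so repeated runs skip re-transcription. A standalone sketch of that caching check (helper name and folder layout are illustrative):

    import os

    def cached_transcript_paths(audio_path: str, temp_folder: str = "./demo/temp"):
        # Derive the cached file names from the input audio's base name.
        stem = os.path.splitext(os.path.basename(audio_path))[0]
        audio_fn = f"{temp_folder}/{stem}.wav"
        transcript_fn = f"{temp_folder}/{stem}.txt"
        # Only reuse the cache when both the audio copy and the transcript exist.
        if os.path.exists(audio_fn) and os.path.exists(transcript_fn):
            return audio_fn, transcript_fn
        return None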
@@ -1497,7 +1508,6 @@ def transcribe_btn_click(audio_choice):
     orig_audio = audio_choice
     orig_transcript = result
     # move the audio and transcript to temp folder
-    temp_folder = "./demo/temp"
     os.makedirs(temp_folder, exist_ok=True)
     os.system(f"cp \"{orig_audio}\" \"{temp_folder}\"")
     filename = os.path.splitext(orig_audio.split("/")[-1])[0]
@@ -1507,12 +1517,9 @@ def transcribe_btn_click(audio_choice):
     align_temp = f"{temp_folder}/mfa_alignments"
     os.makedirs(align_temp, exist_ok=True)
 
-    global audio_fn
     audio_fn = f"{temp_folder}/{filename}.wav"
-    global transcript_fn
     transcript_fn = f"{temp_folder}/{filename}.txt"
 
-
     return result
 
 
@@ -1530,6 +1537,7 @@ def run(seed, stop_repetition, sample_batch_size, left_margin, right_margin, cod
     info = torchaudio.info(audio_fn)
     audio_dur = info.num_frames / info.sample_rate
 
+    print("audio dur s is", audio_dur, "cutoff_sec is", cut_off_sec)
     assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}"
     prompt_end_frame = int(cut_off_sec * info.sample_rate)
 
@@ -1570,7 +1578,7 @@ def run_joint(seed, stop_repetition, sample_batch_size, left_margin, right_margin, cod
 
     return [seg_save_fn_concat, seg_save_fn_gen]
 
-def run_joint(seed, stop_repetition, sample_batch_size, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
+def run_joint(input_audio_fn, seed, stop_repetition, sample_batch_size, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
               temperature, kvcache, cutoff_value, target_transcript, silence_tokens, transcribed_text,
               sid,
               f0_up_key,
@@ -1585,19 +1593,25 @@ def run_joint(seed, stop_repetition, sample_batch_size, left_margin, right_margi
               rms_mix_rate,
               protect,
               crepe_hop_length):
-
     global voicecraft_model, voicecraft_config, phn2num
+
+    print("Transcribing the input audio")
+    transcribe_btn_click(input_audio_fn)
+    print("Transcription complete")
 
     os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
     os.environ["CUDA_VISIBLE_DEVICES"] = "0"
     os.environ["USER"] = "USER"
     # take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
-    cut_off_sec = cutoff_value # NOTE: according to forced-alignment file, the word "common" stop as 3.01 sec, this should be different for different audio
+    # cut_off_sec = cutoff_value # NOTE: according to forced-alignment file, the word "common" stop as 3.01 sec, this should be different for different audio
+
     target_transcript = transcribed_text + ' ' + target_transcript
     print(target_transcript)
     info = torchaudio.info(audio_fn)
     audio_dur = info.num_frames / info.sample_rate
 
+    cut_off_sec = audio_dur - 0.1
+
     assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}"
     prompt_end_frame = int(cut_off_sec * info.sample_rate)
 
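Note: run_joint() now transcribes the input audio itself and derives cut_off_sec from the clip length (audio_dur - 0.1 s) rather than from the user-supplied cutoff_value, so the prompt always ends just inside the recording. A small sketch of that duration-based cutoff, assuming torchaudio and the 0.1 s margin shown in the diff:

    import torchaudio

    def prompt_end_frame_from_duration(audio_fn: str, margin_sec: float = 0.1) -> int:
        # Use (almost) the whole clip as the voice prompt: stop margin_sec
        # before the end so the cutoff stays strictly inside the audio.
        info = torchaudio.info(audio_fn)
        audio_dur = info.num_frames / info.sample_rate
        cut_off_sec = audio_dur - margin_sec
        assert cut_off_sec < audio_dur, "cutoff must fall inside the audio"
        return int(cut_off_sec * info.sample_rate)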
@@ -1617,6 +1631,7 @@ def run_joint(seed, stop_repetition, sample_batch_size, left_margin, right_margi
     concated_audio, gen_audio = inference_one_sample(voicecraft_model, voicecraft_config, phn2num, text_tokenizer, audio_tokenizer,
                                                      audio_fn, target_transcript, config.device, decode_config,
                                                      prompt_end_frame)
+    print("prompt_end_frame: ", prompt_end_frame, "voicecraft_config: ", voicecraft_config, "audio_fn: ", audio_fn, "target_transcript: ", target_transcript, "config.device: ", config.device, "decode_config: ", decode_config)
 
     # save segments for comparison
     concated_audio, gen_audio = concated_audio[0].cpu(), gen_audio[0].cpu()
@@ -1636,7 +1651,8 @@ def run_joint(seed, stop_repetition, sample_batch_size, left_margin, right_margi
 
     f0_up_key = int(f0_up_key)
     try:
-        audio = gen_audio
+        # audio = gen_audio.squeeze()
+        audio = load_audio(seg_save_fn_gen, 16000, DoFormant, Quefrency, Timbre).squeeze()
         audio_max = np.abs(audio).max() / 0.95
         if audio_max > 1:
             audio /= audio_max
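Note: before RVC conversion, the generated segment is now re-read from disk at 16 kHz through the repo's load_audio helper (with the formant-shift settings) instead of reusing the in-memory VoiceCraft tensor. A rough equivalent of that load-and-normalize step, sketched with librosa in place of the repo-specific helper:

    import librosa
    import numpy as np

    def load_for_rvc(path: str, sr: int = 16000) -> np.ndarray:
        # Load the generated segment as mono float32 at the rate the pipeline expects.
        audio, _ = librosa.load(path, sr=sr, mono=True)
        # Peak-normalize the same way the surrounding code does (0.95 headroom).
        audio_max = np.abs(audio).max() / 0.95
        if audio_max > 1:
            audio = audio / audio_max
        return audio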
@@ -1657,6 +1673,7 @@ def run_joint(seed, stop_repetition, sample_batch_size, left_margin, right_margi
         # file_big_npy = (
         #     file_big_npy.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
         # )
+        print(f"Making VC Pipeline, device: {config.device}, audio shape: {audio.shape}")
         audio_opt = vc.pipeline(
             hubert_model,
             net_g,
@@ -2029,6 +2046,7 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
     run_btn_joint.click(
         fn=run_joint,
         inputs=[
+            input_audio0,
             seed,
             stop_repitition,
             sample_batch_size,
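Note: since run_joint() gained input_audio_fn as its first parameter, the Gradio wiring prepends input_audio0 to inputs so the positional order keeps matching the signature. A toy sketch of that pattern (component names and values here are illustrative, not the app's actual layout):

    import gradio as gr

    def run_joint_stub(input_audio_fn, seed):
        # Stand-in for the real run_joint: the first positional arg is the audio path.
        return f"would synthesize from {input_audio_fn} with seed {seed}"

    with gr.Blocks() as demo:
        input_audio0 = gr.Audio(type="filepath")   # maps to input_audio_fn
        seed = gr.Number(value=42)                 # maps to seed
        out = gr.Textbox()
        run_btn_joint = gr.Button("Run")
        # Gradio passes `inputs` positionally, so the list order must mirror
        # the function's parameter order.
        run_btn_joint.click(fn=run_joint_stub, inputs=[input_audio0, seed], outputs=[out])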
@@ -2429,4 +2447,4 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
 )
 
 app.queue(concurrency_count=511, max_size=1022).launch(share=False, quiet=False, auth=[('jvke', 'thisfeelslikeai'), ('cmss60', 'yourseedislate')])
-#endregion
+#endregion