Spaces:

ResembleAI
/

chatterbox-turbo-demo

Running on Zero

App Files Files Community

ollieollie committed 11 days ago

Commit

3ce1983

verified ·

1 Parent(s): 831dd4e

Update chatterbox/tts_turbo.py

Browse files

Files changed (1) hide show

chatterbox/tts_turbo.py +153 -296

chatterbox/tts_turbo.py CHANGED Viewed

@@ -1,305 +1,162 @@
 import os
-import math
-from dataclasses import dataclass
-from pathlib import Path
-import librosa
 import torch
-import perth
-import pyloudnorm as ln
-from safetensors.torch import load_file
-from huggingface_hub import snapshot_download
-from transformers import AutoTokenizer
-from .models.t3 import T3
-from .models.s3tokenizer import S3_SR
-from .models.s3gen import S3GEN_SR, S3Gen
-from .models.tokenizers import EnTokenizer
-from .models.voice_encoder import VoiceEncoder
-from .models.t3.modules.cond_enc import T3Cond
-from .models.t3.modules.t3_config import T3Config
-from .models.s3gen.const import S3GEN_SIL
-import logging
-logger = logging.getLogger(__name__)
-REPO_ID = "ResembleAI/chatterbox-turbo"
-def punc_norm(text: str) -> str:
-    """
-    Clean up punctuation and whitespace before the text is tokenized.
-
-    Runs of whitespace are collapsed to single spaces, the first letter is
-    capitalised, uncommon / LLM-style punctuation is mapped to plain
-    equivalents, and a full stop is appended when the text does not already
-    end in ".", "!", "?", "-" or ",".
-
-    Args:
-        text: Raw text to normalise.
-
-    Returns:
-        The normalised text, or a fixed placeholder sentence when ``text`` is
-        empty or contains only whitespace.
-    """
-    # Collapse whitespace first: whitespace-only input must count as empty,
-    # and leading blanks must not hide the first letter from capitalisation.
-    text = " ".join(text.split())
-    if not text:
-        return "You need to add some text for me to talk."
-    # Capitalise first letter
-    if text[0].islower():
-        text = text[0].upper() + text[1:]
-    # Replace uncommon/llm punc
-    punc_to_replace = [
-        ("…", ", "),
-        (":", ","),
-        ("—", "-"),
-        ("–", "-"),
-        (" ,", ","),
-        ("“", "\""),
-        ("”", "\""),
-        ("‘", "'"),
-        ("’", "'"),
-    ]
-    for old_char_sequence, new_char in punc_to_replace:
-        text = text.replace(old_char_sequence, new_char)
-    # Add full stop if no ending punc
-    text = text.rstrip(" ")
-    sentence_enders = (".", "!", "?", "-", ",")
-    if not text.endswith(sentence_enders):
-        text += "."
-    return text
-@dataclass
-class Conditionals:
-    """
-    Bundle of conditioning inputs for the two model stages.
-
-    T3 conditionals (held in ``t3``):
-        - speaker_emb
-        - clap_emb
-        - cond_prompt_speech_tokens
-        - cond_prompt_speech_emb
-        - emotion_adv
-    S3Gen conditionals (held in the ``gen`` dict):
-        - prompt_token
-        - prompt_token_len
-        - prompt_feat
-        - prompt_feat_len
-        - embedding
-    """
-    t3: T3Cond
-    gen: dict
-    def to(self, device):
-        """Move ``t3`` and every tensor value in ``gen`` to ``device``; returns ``self``."""
-        self.t3 = self.t3.to(device=device)
-        # Non-tensor entries of ``gen`` are left untouched; the dict is updated in place.
-        tensor_keys = [key for key, value in self.gen.items() if torch.is_tensor(value)]
-        for key in tensor_keys:
-            self.gen[key] = self.gen[key].to(device=device)
-        return self
-    def save(self, fpath: Path):
-        """Write both conditioning groups to ``fpath`` with ``torch.save``."""
-        torch.save({"t3": self.t3.__dict__, "gen": self.gen}, fpath)
-    @classmethod
-    def load(cls, fpath, map_location="cpu"):
-        """Rebuild a ``Conditionals`` from a file written by ``save``."""
-        # A string location is converted to a torch.device; anything else is passed through.
-        location = torch.device(map_location) if isinstance(map_location, str) else map_location
-        state = torch.load(fpath, map_location=location, weights_only=True)
-        return cls(T3Cond(**state['t3']), state['gen'])
-class ChatterboxTurboTTS:
-    """
-    Text-to-speech pipeline built from four parts: a text tokenizer, the T3
-    model (text tokens -> speech tokens), S3Gen (speech tokens -> waveform)
-    and a voice encoder used to embed the reference speaker.
-
-    Build an instance with ``from_pretrained`` (downloads the checkpoint) or
-    ``from_local`` (reads a checkpoint directory), then call ``generate``.
-    """
-    # Maximum amount of reference audio, in samples, fed to each stage:
-    # 15 s at S3_SR for the T3 prompt tokens, 10 s at S3GEN_SR for S3Gen.
-    ENC_COND_LEN = 15 * S3_SR
-    DEC_COND_LEN = 10 * S3GEN_SR
-    def __init__(
-        self,
-        t3: T3,
-        s3gen: S3Gen,
-        ve: VoiceEncoder,
-        tokenizer: EnTokenizer,
-        device: str,
-        conds: Conditionals = None,
-    ):
-        """
-        Store the already-loaded sub-models and create the watermarker.
-
-        Args:
-            t3: Model that produces speech tokens from text tokens.
-            s3gen: Model that turns speech tokens into a waveform.
-            ve: Voice encoder used for the speaker embedding.
-            tokenizer: Text tokenizer.
-            device: Device the sub-models live on.
-            conds: Optional pre-computed voice conditionals; when ``None``,
-                ``generate`` needs an ``audio_prompt_path``.
-        """
-        self.sr = S3GEN_SR  # sample rate of synthesized audio
-        self.t3 = t3
-        self.s3gen = s3gen
-        self.ve = ve
-        self.tokenizer = tokenizer
-        self.device = device
-        self.conds = conds
-        self.watermarker = perth.PerthImplicitWatermarker()
-    def to(self, device):
-        """Move all sub-models and any cached conditionals to ``device``; returns ``self``."""
-        self.device = device
-        self.t3 = self.t3.to(device)
-        self.s3gen = self.s3gen.to(device)
-        self.ve = self.ve.to(device)
-        if self.conds is not None:
-            self.conds = self.conds.to(device)
-        return self
-    @classmethod
-    def from_local(cls, ckpt_dir, device) -> 'ChatterboxTurboTTS':
-        """
-        Load every sub-model from a checkpoint directory.
-
-        Reads ``ve.safetensors``, ``t3_turbo_v1.safetensors``,
-        ``s3gen_meanflow.safetensors`` and the tokenizer files from
-        ``ckpt_dir``; ``conds.pt`` is loaded as the built-in voice when present.
-
-        Args:
-            ckpt_dir: Directory containing the checkpoint files.
-            device: Device to place the models on.
-        """
-        ckpt_dir = Path(ckpt_dir)
-        # Always load to CPU first for non-CUDA devices to handle CUDA-saved models
-        if device in ["cpu", "mps"]:
-            map_location = torch.device('cpu')
-        else:
-            map_location = None
-        ve = VoiceEncoder()
-        ve.load_state_dict(
-            load_file(ckpt_dir / "ve.safetensors")
-        )
-        ve.to(device).eval()
-        # Turbo specific hp
-        hp = T3Config(text_tokens_dict_size=50276)
-        hp.llama_config_name = "GPT2_medium"
-        hp.speech_tokens_dict_size = 6563
-        hp.input_pos_emb = None
-        hp.speech_cond_prompt_len = 375
-        hp.use_perceiver_resampler = False
-        hp.emotion_adv = False
-        t3 = T3(hp)
-        t3_state = load_file(ckpt_dir / "t3_turbo_v1.safetensors")
-        # Some checkpoints nest the weights under a "model" key.
-        if "model" in t3_state.keys():
-            t3_state = t3_state["model"][0]
-        t3.load_state_dict(t3_state)
-        # NOTE(review): presumably the backbone's own token embedding is unused
-        # after loading and is dropped to save memory - confirm.
-        del t3.tfmr.wte
-        t3.to(device).eval()
-        s3gen = S3Gen(meanflow=True)
-        weights = load_file(ckpt_dir / "s3gen_meanflow.safetensors")
-        s3gen.load_state_dict(
-            weights, strict=True
-        )
-        s3gen.to(device).eval()
-        tokenizer = AutoTokenizer.from_pretrained(ckpt_dir)
-        if tokenizer.pad_token is None:
-            tokenizer.pad_token = tokenizer.eos_token
-        # The tokenizer size must match text_tokens_dict_size configured above.
-        if len(tokenizer) != 50276:
-            print(f"WARNING: Tokenizer len {len(tokenizer)} != 50276")
-        conds = None
-        builtin_voice = ckpt_dir / "conds.pt"
-        if builtin_voice.exists():
-            conds = Conditionals.load(builtin_voice, map_location=map_location).to(device)
-        return cls(t3, s3gen, ve, tokenizer, device, conds=conds)
-    @classmethod
-    def from_pretrained(cls, device) -> 'ChatterboxTurboTTS':
-        """
-        Download the checkpoint snapshot for ``REPO_ID`` and load it.
-
-        Falls back to ``"cpu"`` (with a printed explanation) when ``"mps"`` is
-        requested but unavailable.
-        """
-        # Check if MPS is available on macOS
-        if device == "mps" and not torch.backends.mps.is_available():
-            if not torch.backends.mps.is_built():
-                print("MPS not available because the current PyTorch install was not built with MPS enabled.")
-            else:
-                print("MPS not available because the current MacOS version is not 12.3+ and/or you do not have an MPS-enabled device on this machine.")
-            device = "cpu"
-        local_path = snapshot_download(
-            repo_id=REPO_ID,
-            token=os.getenv("HF_TOKEN") or True,
-            # Optional: Filter to download only what you need
-            allow_patterns=["*.safetensors", "*.json", "*.txt", "*.pt", "*.model"]
-        )
-        return cls.from_local(local_path, device)
-    def norm_loudness(self, wav, sr, target_lufs=-27):
-        """
-        Scale ``wav`` so its integrated loudness matches ``target_lufs``.
-
-        Best effort: if measuring fails, or the computed gain is not a finite
-        positive number, ``wav`` is returned unchanged.
-        """
-        try:
-            meter = ln.Meter(sr)
-            loudness = meter.integrated_loudness(wav)
-            gain_db = target_lufs - loudness
-            gain_linear = 10.0 ** (gain_db / 20.0)
-            if math.isfinite(gain_linear) and gain_linear > 0.0:
-                wav = wav * gain_linear
-        except Exception as e:
-            print(f"Warning: Error in norm_loudness, skipping: {e}")
-        return wav
-    def prepare_conditionals(self, wav_fpath, exaggeration=0.5, norm_loudness=True):
-        """
-        Compute voice conditionals from a reference recording and cache them
-        on ``self.conds``.
-
-        Args:
-            wav_fpath: Path of the reference audio; must be longer than 5 seconds.
-            exaggeration: Value stored as the ``emotion_adv`` conditioning.
-            norm_loudness: Whether to loudness-normalise the reference first.
-        """
-        ## Load and norm reference wav
-        s3gen_ref_wav, _sr = librosa.load(wav_fpath, sr=S3GEN_SR)
-        assert len(s3gen_ref_wav) / _sr > 5.0, "Audio prompt must be longer than 5 seconds!"
-        if norm_loudness:
-            s3gen_ref_wav = self.norm_loudness(s3gen_ref_wav, _sr)
-        # Resample the full reference before truncating the S3Gen copy.
-        ref_16k_wav = librosa.resample(s3gen_ref_wav, orig_sr=S3GEN_SR, target_sr=S3_SR)
-        s3gen_ref_wav = s3gen_ref_wav[:self.DEC_COND_LEN]
-        s3gen_ref_dict = self.s3gen.embed_ref(s3gen_ref_wav, S3GEN_SR, device=self.device)
-        # Speech cond prompt tokens
-        # Bound up front so a config with no speech prompt (falsy
-        # speech_cond_prompt_len) does not hit a NameError below.
-        # NOTE(review): assumes T3Cond accepts None for this field - confirm.
-        t3_cond_prompt_tokens = None
-        if plen := self.t3.hp.speech_cond_prompt_len:
-            s3_tokzr = self.s3gen.tokenizer
-            t3_cond_prompt_tokens, _ = s3_tokzr.forward([ref_16k_wav[:self.ENC_COND_LEN]], max_len=plen)
-            t3_cond_prompt_tokens = torch.atleast_2d(t3_cond_prompt_tokens).to(self.device)
-        # Voice-encoder speaker embedding
-        ve_embed = torch.from_numpy(self.ve.embeds_from_wavs([ref_16k_wav], sample_rate=S3_SR))
-        ve_embed = ve_embed.mean(axis=0, keepdim=True).to(self.device)
-        t3_cond = T3Cond(
-            speaker_emb=ve_embed,
-            cond_prompt_speech_tokens=t3_cond_prompt_tokens,
-            emotion_adv=exaggeration * torch.ones(1, 1, 1),
-        ).to(device=self.device)
-        self.conds = Conditionals(t3_cond, s3gen_ref_dict)
-    def generate(
-        self,
-        text,
-        repetition_penalty=1.2,
-        min_p=0.00,
-        top_p=0.95,
-        audio_prompt_path=None,
-        exaggeration=0.0,
-        cfg_weight=0.0,
-        temperature=0.8,
-        top_k=1000,
-        norm_loudness=True,
-    ):
-        """
-        Synthesize speech for ``text`` in the conditioned voice.
-
-        Args:
-            text: Text to speak; it is passed through ``punc_norm`` first.
-            repetition_penalty, top_p, temperature, top_k: Sampling settings
-                forwarded to ``T3.inference_turbo``.
-            min_p, exaggeration, cfg_weight: Accepted for interface
-                compatibility; a warning is logged when any is above zero
-                because Turbo does not support them.
-            audio_prompt_path: Optional reference audio; when given, the
-                conditionals are recomputed from it before generating.
-            norm_loudness: Forwarded to ``prepare_conditionals``.
-
-        Returns:
-            The watermarked waveform as a torch tensor with a leading
-            dimension added.
-
-        Raises:
-            AssertionError: If no ``audio_prompt_path`` is given and no
-                conditionals have been prepared.
-        """
-        if audio_prompt_path:
-            self.prepare_conditionals(audio_prompt_path, exaggeration=exaggeration, norm_loudness=norm_loudness)
-        else:
-            assert self.conds is not None, "Please `prepare_conditionals` first or specify `audio_prompt_path`"
-        if cfg_weight > 0.0 or exaggeration > 0.0 or min_p > 0.0:
-            logger.warning("CFG, min_p and exaggeration are not supported by Turbo version and will be ignored.")
-        # Norm and tokenize text
-        text = punc_norm(text)
-        text_tokens = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)
-        text_tokens = text_tokens.input_ids.to(self.device)
-        speech_tokens = self.t3.inference_turbo(
-            t3_cond=self.conds.t3,
-            text_tokens=text_tokens,
-            temperature=temperature,
-            top_k=top_k,
-            top_p=top_p,
-            repetition_penalty=repetition_penalty,
-        )
-        # Remove OOV tokens and add silence to end
-        speech_tokens = speech_tokens[speech_tokens < 6561]
-        speech_tokens = speech_tokens.to(self.device)
-        silence = torch.tensor([S3GEN_SIL, S3GEN_SIL, S3GEN_SIL]).long().to(self.device)
-        speech_tokens = torch.cat([speech_tokens, silence])
-        wav, _ = self.s3gen.inference(
-            speech_tokens=speech_tokens,
-            ref_dict=self.conds.gen,
-            n_cfm_timesteps=2,
-        )
-        # The watermarker works on a NumPy array, so leave torch for that step.
-        wav = wav.squeeze(0).detach().cpu().numpy()
-        watermarked_wav = self.watermarker.apply_watermark(wav, sample_rate=self.sr)
-        return torch.from_numpy(watermarked_wav).unsqueeze(0)

+import random
 import os
+import numpy as np
 import torch
+import gradio as gr
+import spaces
+from chatterbox.tts_turbo import ChatterboxTurboTTS
+# --- 1. FORCE CPU FOR GLOBAL LOADING ---
+# ZeroGPU forbids CUDA during startup. We only move to CUDA inside the decorated function.
+# DEVICE is the device string load_model() passes to from_pretrained().
+DEVICE = "cpu"
+# Process-wide model instance; set by load_model() and lazily by generate().
+MODEL = None
+# Paralinguistic tags; one insert button is created per entry in the UI.
+EVENT_TAGS = [
+    "[clear throat]", "[sigh]", "[shush]", "[cough]", "[groan]",
+    "[sniff]", "[gasp]", "[chuckle]", "[laugh]"
+]
+# Styles for the "tag-container" row and the "tag-btn" buttons.
+# NOTE(review): presumably handed to Gradio as custom CSS - the call site is not in this view.
+CUSTOM_CSS = """
+.tag-container {
+    display: flex !important;
+    flex-wrap: wrap !important;
+    gap: 8px !important;
+    margin-top: 5px !important;
+    margin-bottom: 10px !important;
+    border: none !important;
+    background: transparent !important;
+}
+.tag-btn {
+    min-width: fit-content !important;
+    width: auto !important;
+    height: 32px !important;
+    font-size: 13px !important;
+    background: #eef2ff !important;
+    border: 1px solid #c7d2fe !important;
+    color: #3730a3 !important;
+    border-radius: 6px !important;
+    padding: 0 10px !important;
+    margin: 0 !important;
+    box-shadow: none !important;
+}
+.tag-btn:hover {
+    background: #c7d2fe !important;
+    transform: translateY(-1px);
+}
+"""
+# Client-side click handler for the tag buttons. It receives the tag text and
+# the current textbox value and returns the new value with the tag replacing
+# the current selection (or inserted at the cursor) in '#main_textbox'. A space
+# is added on a side only when one is not already there; if the textarea
+# cannot be found the tag is appended to the end instead.
+INSERT_TAG_JS = """
+(tag_val, current_text) => {
+    const textarea = document.querySelector('#main_textbox textarea');
+    if (!textarea) return current_text + " " + tag_val;
+    const start = textarea.selectionStart;
+    const end = textarea.selectionEnd;
+    let prefix = " ";
+    let suffix = " ";
+    if (start === 0) prefix = "";
+    else if (current_text[start - 1] === ' ') prefix = "";
+    if (end < current_text.length && current_text[end] === ' ') suffix = "";
+    return current_text.slice(0, start) + prefix + tag_val + suffix + current_text.slice(end);
+}
+"""
+def set_seed(seed: int):
+    """Seed the torch (CPU and CUDA), ``random`` and NumPy generators with ``seed``."""
+    seeders = (
+        torch.manual_seed,
+        torch.cuda.manual_seed,
+        torch.cuda.manual_seed_all,
+        random.seed,
+        np.random.seed,
+    )
+    for apply_seed in seeders:
+        apply_seed(seed)
+def load_model():
+    """Load the Turbo model onto ``DEVICE``, cache it in the global ``MODEL`` and return it."""
+    global MODEL
+    print(f"Loading Chatterbox-Turbo on {DEVICE}...")
+    loaded = ChatterboxTurboTTS.from_pretrained(DEVICE)
+    MODEL = loaded
+    return loaded
+@spaces.GPU
+def generate(
+        text,
+        audio_prompt_path,
+        temperature,
+        seed_num,
+        min_p,
+        top_p,
+        top_k,
+        repetition_penalty,
+        norm_loudness
+):
+    """
+    Gradio handler: synthesize ``text`` and return ``(sample_rate, waveform)``.
+
+    Args:
+        text: Text to synthesize.
+        audio_prompt_path: File path of the reference voice recording.
+        temperature, min_p, top_p, top_k, repetition_penalty: Sampling
+            settings forwarded to ``MODEL.generate`` (``top_k`` is cast to int).
+        seed_num: Random seed; ``0`` or an empty field means "do not seed".
+        norm_loudness: Forwarded to ``MODEL.generate``.
+
+    Returns:
+        A ``(MODEL.sr, numpy_waveform)`` tuple for the output audio component.
+    """
+    global MODEL
+    # Reload if the worker lost the global state
+    if MODEL is None:
+        MODEL = ChatterboxTurboTTS.from_pretrained(DEVICE)
+    # --- MOVE TO GPU HERE ---
+    MODEL.to("cuda")
+    # A cleared number field arrives as None; treat it like 0 (no seeding)
+    # instead of crashing on int(None).
+    if seed_num:
+        set_seed(int(seed_num))
+    wav = MODEL.generate(
+        text,
+        audio_prompt_path=audio_prompt_path,
+        temperature=temperature,
+        min_p=min_p,
+        top_p=top_p,
+        top_k=int(top_k),
+        repetition_penalty=repetition_penalty,
+        norm_loudness=norm_loudness,
+    )
+    return (MODEL.sr, wav.squeeze(0).cpu().numpy())
+with gr.Blocks(title="Chatterbox Turbo") as demo:
+    gr.Markdown("# ⚡ Chatterbox Turbo")
+    with gr.Row():
+        with gr.Column():
+            text = gr.Textbox(
+                value="Congratulations Miss Connor! [chuckle] Um anyway, we do have a new model in store. It's the SkyNet T-800 series and it's got basically everything. Including AI integration with ChatGPT and all that jazz. Would you like me to get some prices for you?",
+                label="Text to synthesize (max chars 300)",
+                max_lines=5,
+                elem_id="main_textbox"
+            )
+            with gr.Row(elem_classes=["tag-container"]):
+                for tag in EVENT_TAGS:
+                    btn = gr.Button(tag, elem_classes=["tag-btn"])
+                    btn.click(
+                        fn=None,
+                        inputs=[btn, text],
+                        outputs=text,
+                        js=INSERT_TAG_JS
+                    )
+            ref_wav = gr.Audio(
+                sources=["upload", "microphone"],
+                type="filepath",
+                label="Reference Audio File",
+                value="https://storage.googleapis.com/chatterbox-demo-samples/prompts/female_random_podcast.wav"
+            )
+            run_btn = gr.Button("Generate ⚡", variant="primary")
+        with gr.Column():
+            audio_output = gr.Audio(label="Output Audio")
+            with gr.Accordion("Advanced Options", open=False):
+                seed_num = gr.Number(value=0, label="Random seed (0 for random)")
+                temp = gr.Slider(0.05, 2.0, step=.05, label="Temperature", value=0.8)
+                top_p = gr.Slider(0.00, 1.00, step=0.01, label="Top P", value=0.95)
+                top_k = gr.Slider(0, 1000, step=10, label="Top K", value=1000)
+                repetition_penalty = gr.Slider(1.00, 2.00, step=0.05, label="Repetition Penalty", value=1.2)
+                min_p = gr.Slider(0.00, 1.00, step=0