Text-to-Speech
KimiAudio
Safetensors
English
Chinese
audio
audio-language-model
speech-recognition
audio-understanding
audio-generation
chat
custom_code
Instructions to use rsxdalv/Kimi-Audio-7B-Instruct with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- KimiAudio
How to use rsxdalv/Kimi-Audio-7B-Instruct with KimiAudio:
```python
# Example usage for KimiAudio
# pip install git+https://github.com/MoonshotAI/Kimi-Audio.git
# pip install soundfile
import soundfile as sf  # needed for sf.write() below, which saves the generated audio

from kimia_infer.api.kimia import KimiAudio

model = KimiAudio(model_path="rsxdalv/Kimi-Audio-7B-Instruct", load_detokenizer=True)

sampling_params = {
    "audio_temperature": 0.8,
    "audio_top_k": 10,
    "text_temperature": 0.0,
    "text_top_k": 5,
}

# For ASR
asr_audio = "asr_example.wav"
messages_asr = [
    {"role": "user", "message_type": "text", "content": "Please transcribe the following audio:"},
    {"role": "user", "message_type": "audio", "content": asr_audio}
]
_, text = model.generate(messages_asr, **sampling_params, output_type="text")
print(text)

# For Q&A
qa_audio = "qa_example.wav"
messages_conv = [{"role": "user", "message_type": "audio", "content": qa_audio}]
wav, text = model.generate(messages_conv, **sampling_params, output_type="both")
sf.write("output_audio.wav", wav.cpu().view(-1).numpy(), 24000)
print(text)
```
- Notebooks
- Google Colab
- Kaggle
```json
{
  "architectures": [
    "MoonshotKimiaForCausalLM"
  ],
  "auto_map": {
    "AutoConfig": "configuration_moonshot_kimia.KimiAudioConfig",
    "AutoModel": "modeling_moonshot_kimia.MoonshotKimiaModel",
    "AutoModelForCausalLM": "modeling_moonshot_kimia.MoonshotKimiaForCausalLM"
  },
  "bos_token_id": 151643,
  "eos_token_ids": [
    151644,
    151645
  ],
  "hidden_act": "silu",
  "hidden_size": 3584,
  "initializer_range": 0.02,
  "intermediate_size": 18944,
  "kimia_adaptor_input_dim": 5120,
  "kimia_audio_output_vocab": 16896,
  "kimia_media_begin": 151661,
  "kimia_media_end": 151663,
  "kimia_mimo_audiodelaytokens": 5,
  "kimia_mimo_layers": 6,
  "kimia_mimo_transformer_from_layer_index": 21,
  "kimia_text_output_vocab": 152064,
  "kimia_token_offset": 152064,
  "num_attention_heads": 28,
  "num_audio_special_tokens": 512,
  "num_base_tokens": 151643,
  "num_hidden_layers": 28,
  "num_key_value_heads": 4,
  "pad_token_id": 152063,
  "max_position_embeddings": 8192,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.44.1",
  "use_cache": true,
  "use_whisper_feature": true,
  "vocab_size": 168448
}
```