import base64
import io
from typing import Any, Dict, List

import soundfile as sf
from transformers import Qwen3OmniMoeForConditionalGeneration, Qwen3OmniMoeProcessor
from qwen_omni_utils import process_mm_info


class EndpointHandler:
    def __init__(self, path="./"):
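        # Load the checkpoint bundled with the endpoint repository.
        # dtype="auto" keeps the dtype stored in the model config;
        # device_map="auto" shards the weights across available devices.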
        self.model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
            path,
            dtype="auto",
            device_map="auto",
        )
        self.processor = Qwen3OmniMoeProcessor.from_pretrained(path)

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
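        # Expected payload: {"messages": [...]} in the Qwen chat format, plus
        # optional "use_audio_in_video" and "speaker" fields with the
        # defaults below.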
        messages = data.get("messages", [])
        use_audio_in_video = data.get("use_audio_in_video", True)
        speaker = data.get("speaker", "Ethan")
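
        # Render the chat template to a prompt string, then collect the
        # multimodal payloads (audio, images, video) referenced by the messages.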
        text = self.processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        audios, images, videos = process_mm_info(messages, use_audio_in_video=use_audio_in_video)
        inputs = self.processor(
            text=text,
            audio=audios,
            images=images,
            videos=videos,
            return_tensors="pt",
            padding=True,
            use_audio_in_video=use_audio_in_video,
        )
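        # BatchFeature.to(dtype) casts only floating-point tensors, so integer
        # inputs such as input_ids keep their dtype.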
        inputs = inputs.to(self.model.device).to(self.model.dtype)
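
        # generate() returns the thinker's token ids plus the talker's output
        # waveform; audio is None when no speech is generated.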
        text_ids, audio = self.model.generate(
            **inputs,
            speaker=speaker,
            thinker_return_dict_in_generate=True,
            use_audio_in_video=use_audio_in_video,
        )
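        # Decode only the newly generated tokens by slicing off the prompt.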
        text_output = self.processor.batch_decode(
            text_ids.sequences[:, inputs["input_ids"].shape[1]:],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
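        # batch_decode returns a list with one string per batch element.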
        result = {"generated_text": text_output}
        if audio is not None:
            # Serialize the 24 kHz waveform to an in-memory WAV and return it
            # base64-encoded; a file written on the server would not be
            # reachable by the remote caller.
            buffer = io.BytesIO()
            sf.write(
                buffer,
                audio.reshape(-1).detach().cpu().numpy(),
                samplerate=24000,
                format="WAV",
            )
            result["audio"] = base64.b64encode(buffer.getvalue()).decode("utf-8")
        return [result]