Spaces:

Steveeeeeeen
/

asr-19m-v2-en-32b

Sleeping

App Files Files Community

Steveeeeeeen HF Staff committed on about 1 month ago

Commit

40ffafe

verified ·

1 Parent(s): 33023c3

Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

app.py +104 -0
model_handler.py +56 -0
requirements.txt +13 -0

app.py ADDED Viewed

	@@ -0,0 +1,104 @@

+import gradio as gr
+from model_handler import ASRHandler
+# Configuration
+MODEL_ID = "abr-ai/asr-19m-v2-en-32b"
+# Initialize the model handler
+# We initialize this at the global scope so it loads when the app starts
+try:
+    asr_handler = ASRHandler(MODEL_ID)
+except Exception as e:
+    print(f"Failed to load model: {e}")
+    asr_handler = None
+def process_audio(audio_filepath):
+    """
+    Wrapper function to connect Gradio input to the model handler.
+    """
+    if asr_handler is None:
+        return "Error: Model failed to load. Please check logs."
+    if audio_filepath is None:
+        return ""
+    return asr_handler.transcribe(audio_filepath)
+# Define the CSS for the interface
+# Passed to gr.Blocks below: caps `.container` at 800px and centers it, and styles
+# `.header-link` (grey, small text; orange and underlined on hover).
+custom_css = """
+.container { max-width: 800px; margin: auto; }
+.header-link { font-size: 0.9rem; color: #666; text-decoration: none; }
+.header-link:hover { color: #ff7e5f; text-decoration: underline; }
+"""
+# Build the Gradio Interface
+# Components are created inside nested `with` contexts; the nesting defines the layout.
+with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:
+    # Header Section: title/description plus an attribution link styled via `header-link`
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown(
+                """
+                # 🎙️ ASR 19M v2 (English) Demo
+                This space demonstrates the **abr-ai/asr-19m-v2-en-32b** model.
+                It is a highly efficient, compact (19M parameters) Automatic Speech Recognition model designed for English transcription.
+                """
+            )
+            gr.Markdown(
+                "[Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)",
+                elem_classes=["header-link"]
+            )
+    # Main Interface: two equally weighted columns (input on the left, output on the right)
+    with gr.Row():
+        with gr.Column(scale=1):
+            # Input: Audio Recorder and Uploader
+            # type="filepath" matches process_audio, which expects a file path argument.
+            audio_input = gr.Audio(
+                sources=["microphone", "upload"],
+                type="filepath",
+                label="Input Audio",
+                show_download_button=True
+            )
+            submit_btn = gr.Button("Transcribe", variant="primary")
+        with gr.Column(scale=1):
+            # Output: Text
+            text_output = gr.Textbox(
+                label="Transcription",
+                placeholder="Transcription will appear here...",
+                lines=5,
+                show_copy_button=True
+            )
+    # Instructions / Footer: collapsed by default (open=False)
+    with gr.Accordion("About the Model", open=False):
+        gr.Markdown(
+            f"""
+            **Model ID:** `{MODEL_ID}`
+            **Description:**
+            This is a lightweight ASR model. Due to its small size (19 Million parameters),
+            it is designed for speed and efficiency while maintaining reasonable accuracy for English speech.
+            **Note:** The first inference might take a few seconds to warm up.
+            """
+        )
+    # Event Listeners
+    # Clicking the button sends the current audio value to process_audio and
+    # writes the returned string into the transcription textbox.
+    submit_btn.click(
+        fn=process_audio,
+        inputs=[audio_input],
+        outputs=[text_output]
+    )
+    # Auto-submit when a file is uploaded (optional, usually better to wait for button on mics)
+    # Only the `upload` event is wired here; no recording event is registered,
+    # so microphone input is transcribed via the button.
+    audio_input.upload(
+        fn=process_audio,
+        inputs=[audio_input],
+        outputs=[text_output]
+    )
+# Start the app only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    demo.launch()

model_handler.py ADDED Viewed

	@@ -0,0 +1,56 @@

+import torch
+from transformers import pipeline
+import sys
+class ASRHandler:
+    """
+    Handles the loading of the Automatic Speech Recognition model
+    and the inference logic.
+    """
+    def __init__(self, model_id: str):
+        self.model_id = model_id
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.pipe = None
+        self._load_model()
+    def _load_model(self):
+        """
+        Initializes the Hugging Face pipeline.
+        """
+        print(f"Loading model {self.model_id} on {self.device}...")
+        try:
+            # We use trust_remote_code=True because some specific architectures
+            # require custom modeling code from the hub.
+            self.pipe = pipeline(
+                "automatic-speech-recognition",
+                model=self.model_id,
+                device=self.device,
+                trust_remote_code=True
+            )
+            print("Model loaded successfully.")
+        except Exception as e:
+            print(f"Error loading model: {e}", file=sys.stderr)
+            raise e
+    def transcribe(self, audio_path: str) -> str:
+        """
+        Runs inference on the provided audio file.
+        Args:
+            audio_path (str): Path to the temporary audio file generated by Gradio.
+        Returns:
+            str: The transcribed text.
+        """
+        if not audio_path:
+            return "Please provide an audio input."
+        if self.pipe is None:
+            return "Model not initialized."
+        try:
+            # The pipeline handles loading audio and resampling automatically
+            output = self.pipe(audio_path)
+            return output.get("text", "No text detected.")
+        except Exception as e:
+            return f"Error during transcription: {str(e)}"

requirements.txt ADDED Viewed

	@@ -0,0 +1,13 @@

+gradio
+requests
+Pillow
+numpy
+torch
+torchaudio
+git+https://github.com/huggingface/transformers
+accelerate
+tokenizers
+datasets
+librosa
+soundfile
+sentencepiece