Steveeeeeeen HF Staff committed on
Commit
40ffafe
·
verified ·
1 Parent(s): 33023c3

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. app.py +104 -0
  2. model_handler.py +56 -0
  3. requirements.txt +13 -0
app.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from model_handler import ASRHandler
3
+
4
+ # Configuration
5
+ MODEL_ID = "abr-ai/asr-19m-v2-en-32b"
6
+
7
+ # Initialize the model handler
8
+ # We initialize this at the global scope so it loads when the app starts
9
+ try:
10
+ asr_handler = ASRHandler(MODEL_ID)
11
+ except Exception as e:
12
+ print(f"Failed to load model: {e}")
13
+ asr_handler = None
14
+
15
+ def process_audio(audio_filepath):
16
+ """
17
+ Wrapper function to connect Gradio input to the model handler.
18
+ """
19
+ if asr_handler is None:
20
+ return "Error: Model failed to load. Please check logs."
21
+
22
+ if audio_filepath is None:
23
+ return ""
24
+
25
+ return asr_handler.transcribe(audio_filepath)
26
+
27
+ # Define the CSS for the interface
28
+ custom_css = """
29
+ .container { max-width: 800px; margin: auto; }
30
+ .header-link { font-size: 0.9rem; color: #666; text-decoration: none; }
31
+ .header-link:hover { color: #ff7e5f; text-decoration: underline; }
32
+ """
33
+
34
+ # Build the Gradio Interface
35
+ with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:
36
+
37
+ # Header Section
38
+ with gr.Row():
39
+ with gr.Column():
40
+ gr.Markdown(
41
+ """
42
+ # 🎙️ ASR 19M v2 (English) Demo
43
+
44
+ This space demonstrates the **abr-ai/asr-19m-v2-en-32b** model.
45
+ It is a highly efficient, compact (19M parameters) Automatic Speech Recognition model designed for English transcription.
46
+ """
47
+ )
48
+ gr.Markdown(
49
+ "[Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)",
50
+ elem_classes=["header-link"]
51
+ )
52
+
53
+ # Main Interface
54
+ with gr.Row():
55
+ with gr.Column(scale=1):
56
+ # Input: Audio Recorder and Uploader
57
+ audio_input = gr.Audio(
58
+ sources=["microphone", "upload"],
59
+ type="filepath",
60
+ label="Input Audio",
61
+ show_download_button=True
62
+ )
63
+
64
+ submit_btn = gr.Button("Transcribe", variant="primary")
65
+
66
+ with gr.Column(scale=1):
67
+ # Output: Text
68
+ text_output = gr.Textbox(
69
+ label="Transcription",
70
+ placeholder="Transcription will appear here...",
71
+ lines=5,
72
+ show_copy_button=True
73
+ )
74
+
75
+ # Instructions / Footer
76
+ with gr.Accordion("About the Model", open=False):
77
+ gr.Markdown(
78
+ f"""
79
+ **Model ID:** `{MODEL_ID}`
80
+
81
+ **Description:**
82
+ This is a lightweight ASR model. Due to its small size (19 Million parameters),
83
+ it is designed for speed and efficiency while maintaining reasonable accuracy for English speech.
84
+
85
+ **Note:** The first inference might take a few seconds to warm up.
86
+ """
87
+ )
88
+
89
+ # Event Listeners
90
+ submit_btn.click(
91
+ fn=process_audio,
92
+ inputs=[audio_input],
93
+ outputs=[text_output]
94
+ )
95
+
96
+ # Auto-submit when a file is uploaded (optional, usually better to wait for button on mics)
97
+ audio_input.upload(
98
+ fn=process_audio,
99
+ inputs=[audio_input],
100
+ outputs=[text_output]
101
+ )
102
+
103
+ if __name__ == "__main__":
104
+ demo.launch()
model_handler.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import pipeline
3
+ import sys
4
+
5
+ class ASRHandler:
6
+ """
7
+ Handles the loading of the Automatic Speech Recognition model
8
+ and the inference logic.
9
+ """
10
+ def __init__(self, model_id: str):
11
+ self.model_id = model_id
12
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
13
+ self.pipe = None
14
+ self._load_model()
15
+
16
+ def _load_model(self):
17
+ """
18
+ Initializes the Hugging Face pipeline.
19
+ """
20
+ print(f"Loading model {self.model_id} on {self.device}...")
21
+ try:
22
+ # We use trust_remote_code=True because some specific architectures
23
+ # require custom modeling code from the hub.
24
+ self.pipe = pipeline(
25
+ "automatic-speech-recognition",
26
+ model=self.model_id,
27
+ device=self.device,
28
+ trust_remote_code=True
29
+ )
30
+ print("Model loaded successfully.")
31
+ except Exception as e:
32
+ print(f"Error loading model: {e}", file=sys.stderr)
33
+ raise e
34
+
35
+ def transcribe(self, audio_path: str) -> str:
36
+ """
37
+ Runs inference on the provided audio file.
38
+
39
+ Args:
40
+ audio_path (str): Path to the temporary audio file generated by Gradio.
41
+
42
+ Returns:
43
+ str: The transcribed text.
44
+ """
45
+ if not audio_path:
46
+ return "Please provide an audio input."
47
+
48
+ if self.pipe is None:
49
+ return "Model not initialized."
50
+
51
+ try:
52
+ # The pipeline handles loading audio and resampling automatically
53
+ output = self.pipe(audio_path)
54
+ return output.get("text", "No text detected.")
55
+ except Exception as e:
56
+ return f"Error during transcription: {str(e)}"
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ requests
3
+ Pillow
4
+ numpy
5
+ torch
6
+ torchaudio
7
+ git+https://github.com/huggingface/transformers
8
+ accelerate
9
+ tokenizers
10
+ datasets
11
+ librosa
12
+ soundfile
13
+ sentencepiece