bskrishna2006 committed on
Commit
0254d02
·
1 Parent(s): b4562f5

Add audio transcription endpoints for Railway integration

Browse files
Files changed (1) hide show
  1. app.py +174 -0
app.py CHANGED
@@ -135,6 +135,180 @@ def warmup_models():
135
  }), 500
136
 
137
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  # =============================================================================
139
  # TRANSCRIPT ENDPOINTS
140
  # =============================================================================
 
135
  }), 500
136
 
137
 
138
+ # =============================================================================
139
+ # AUDIO TRANSCRIPTION ENDPOINTS (for Railway integration)
140
+ # =============================================================================
141
+
142
@app.route('/api/transcribe-audio', methods=['POST'])
def transcribe_audio():
    """
    Transcribe audio using Whisper.

    Expects a JSON body with 'audio_base64' (base64-encoded audio data,
    assumed to be WAV — TODO confirm with the Railway sender), sent by
    the Railway backend.

    Returns:
        200 with {'success', 'transcript', 'language', 'word_count'}.
        400 if the payload is missing or the base64 data is malformed.
        500 if transcription itself fails.
    """
    # Consolidate stdlib imports at the top of the handler instead of
    # scattering them through the body (the original imported os inside
    # the finally block).
    import base64
    import binascii
    import os
    import tempfile

    try:
        data = request.get_json()

        if not data or 'audio_base64' not in data:
            return jsonify({
                'error': 'Missing audio',
                'message': 'Please provide audio_base64'
            }), 400

        # Malformed base64 is a client error: return 400 rather than
        # letting binascii.Error fall through to the generic 500 handler.
        try:
            audio_data = base64.b64decode(data['audio_base64'])
        except (binascii.Error, ValueError) as decode_err:
            return jsonify({
                'error': 'Invalid audio',
                'message': f'audio_base64 is not valid base64: {decode_err}'
            }), 400

        # Whisper needs a real file path, so persist the bytes to a
        # temp file; delete=False because we reopen it by path below.
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
            f.write(audio_data)
            audio_path = f.name

        try:
            # Transcribe with Whisper
            from services.speech_to_text import SpeechToTextService
            stt = SpeechToTextService()
            result = stt.transcribe_audio(audio_path)

            return jsonify({
                'success': True,
                'transcript': result['text'],
                'language': result['language'],
                'word_count': len(result['text'].split())
            }), 200

        finally:
            # Always remove the temp file, even if transcription raised.
            if os.path.exists(audio_path):
                os.remove(audio_path)

    except Exception as e:
        # logger.exception records the full traceback, not just the message.
        logger.exception(f"Audio transcription failed: {e}")
        return jsonify({
            'error': 'Transcription failed',
            'message': str(e)
        }), 500
193
+
194
+
195
@app.route('/api/process-audio', methods=['POST'])
def process_audio():
    """
    Full audio pipeline: Whisper transcription → translation → summary.

    Expects a JSON body from the Railway backend with:
        audio_base64 (required): base64-encoded audio data (assumed WAV —
            TODO confirm with the Railway sender).
        video_id (optional, default 'unknown'): identifier echoed back.
        summary_type (optional, default 'general'): passed to the summarizer.
        target_language (optional, default 'eng'): language for the summary.

    Returns:
        200 with transcript, summary and statistics on success.
        400 if the payload is missing or the base64 data is malformed.
        500 if any pipeline stage fails.
    """
    # Consolidate stdlib imports at the top of the handler instead of
    # scattering them through the body (the original imported os inside
    # the finally block).
    import base64
    import binascii
    import os
    import tempfile

    try:
        data = request.get_json()

        if not data or 'audio_base64' not in data:
            return jsonify({
                'error': 'Missing audio',
                'message': 'Please provide audio_base64'
            }), 400

        video_id = data.get('video_id', 'unknown')
        summary_type = data.get('summary_type', 'general')
        target_language = data.get('target_language', 'eng')

        # Malformed base64 is a client error: return 400 rather than
        # letting binascii.Error fall through to the generic 500 handler.
        try:
            audio_data = base64.b64decode(data['audio_base64'])
        except (binascii.Error, ValueError) as decode_err:
            return jsonify({
                'error': 'Invalid audio',
                'message': f'audio_base64 is not valid base64: {decode_err}'
            }), 400

        # Whisper needs a real file path, so persist the bytes to a
        # temp file; delete=False because we reopen it by path below.
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
            f.write(audio_data)
            audio_path = f.name

        try:
            # Step 1: Transcribe with Whisper
            logger.info("Transcribing audio with Whisper...")
            from services.speech_to_text import SpeechToTextService
            stt = SpeechToTextService()
            whisper_result = stt.transcribe_audio(audio_path)

            transcript = whisper_result['text']
            original_language = whisper_result['language']
            original_word_count = len(transcript.split())

            logger.info(f"Transcription complete. Language: {original_language}")

            # Step 2: Translate to English if needed
            english_transcript = transcript

            if not is_english(original_language):
                logger.info("Translating to English...")
                translation_service = get_translation_service()
                english_transcript = translation_service.translate_to_english(
                    transcript,
                    original_language
                )

            # Step 3: Summarize (always over the English text)
            logger.info("Generating summary...")
            summary = summarizer_service.summarize(
                text=english_transcript,
                summary_type=summary_type,
                chunk_size=2500,
                max_tokens=500
            )

            # Step 4: Translate summary to target language if needed
            final_summary = summary
            summary_language = "eng"

            if not is_english(target_language):
                logger.info(f"Translating summary to {target_language}...")
                translation_service = get_translation_service()
                final_summary = translation_service.translate_from_english(summary, target_language)
                summary_language = target_language

            # Statistics; guard the ratio against an empty transcript.
            summary_word_count = len(final_summary.split())
            compression_ratio = (summary_word_count / original_word_count) * 100 if original_word_count > 0 else 0

            response = {
                'success': True,
                'video_id': video_id,
                'original_language': original_language,
                'original_language_name': get_language_name(original_language),
                'transcript': transcript,
                'transcript_source': 'whisper',
                'summary': final_summary,
                'summary_language': summary_language,
                'summary_language_name': get_language_name(summary_language),
                'statistics': {
                    'original_word_count': original_word_count,
                    'summary_word_count': summary_word_count,
                    'compression_ratio': round(compression_ratio, 1),
                    # ~200 words/minute reading speed, minimum 1 minute.
                    'reading_time_minutes': max(1, summary_word_count // 200)
                }
            }

            # Expose intermediate English artifacts only when a
            # translation actually happened.
            if not is_english(original_language):
                response['english_transcript'] = english_transcript
            if not is_english(target_language):
                response['english_summary'] = summary

            logger.info("Audio processing complete!")
            return jsonify(response), 200

        finally:
            # Always remove the temp file, even if a pipeline stage raised.
            if os.path.exists(audio_path):
                os.remove(audio_path)

    except Exception as e:
        # logger.exception records the full traceback, not just the message.
        logger.exception(f"Audio processing failed: {e}")
        return jsonify({
            'error': 'Processing failed',
            'message': str(e)
        }), 500
310
+
311
+
312
  # =============================================================================
313
  # TRANSCRIPT ENDPOINTS
314
  # =============================================================================