"""
Voice Agent Gradio Application
Web interface for the Voice Agent with microphone support
"""
import gradio as gr
import asyncio
import logging
import os
from .voice_agent import VoiceAgent
class VoiceApp:
    """Gradio web application for the Voice Agent.

    Owns a ``VoiceAgent`` instance and a Gradio ``Blocks`` interface whose
    chat and audio events are forwarded to ``VoiceAgent.handle_user_input``.
    """

    def __init__(self) -> None:
        """Create the agent, configure logging and build the interface."""
        self.agent = VoiceAgent()
        self.conversation_history = []
        # Set up logging
        logging.basicConfig(level=logging.INFO)
        # Create the interface
        self.interface = self._create_interface()

    def _create_interface(self):
        """Build the Gradio ``Blocks`` layout and wire its event handlers.

        Returns:
            The constructed ``gr.Blocks`` application (not yet launched).
        """
        with gr.Blocks(
            title="š¤ Voice Agent - Secure AI Suite",
            theme=gr.themes.Soft(
                primary_hue="orange",
                secondary_hue="gray",
                neutral_hue="slate",
            ),
            css="""
            .container { max-width: 1200px; margin: auto; }
            .chatbot { height: 500px; }
            .status-card { background: linear-gradient(90deg, #fa709a 0%, #fee140 100%); color: white; }
            .tool-card { border: 2px solid #e2e8f0; border-radius: 8px; padding: 12px; margin: 8px 0; }
            .audio-controls { text-align: center; padding: 20px; background: #f8fafc; border-radius: 8px; }
            """,
        ) as app:
            # Header
            gr.HTML("""
            š¤ Voice Agent
            Speech-to-AI & Text-to-Speech with Multi-modal Processing
            š Secure AI Agents Suite
            """)

            with gr.Row():
                # Left column - Voice interface
                with gr.Column(scale=2):
                    gr.HTML("šļø Voice Interaction")

                    # Audio input/output section
                    with gr.Column():
                        gr.HTML("")
                        gr.HTML("šļø Record Your Voice")
                        audio_input = gr.Audio(
                            label="Click to record or upload audio file",
                            type="filepath",
                            format="mp3",
                            elem_classes=["audio-input"],
                        )
                        gr.HTML("š£ļø AI Response (Audio)")
                        audio_output = gr.Audio(
                            label="AI response will appear here",
                            type="numpy",
                            elem_classes=["audio-output"],
                        )
                        gr.HTML("")

                    gr.HTML("š¬ Text Chat with Voice Features")
                    # NOTE: ``avatar_images`` expects image file paths or URLs.
                    # The emoji string previously passed as the bot avatar is
                    # neither, so the avatar is left unset.
                    chatbot = gr.Chatbot(
                        label="Voice Assistant Chat",
                        height=300,
                        elem_classes=["chatbot"],
                    )
                    with gr.Row():
                        msg_input = gr.Textbox(
                            placeholder="Type or use voice input. Try: 'Transcribe this audio' or 'Say hello in a female voice'...",
                            lines=2,
                            max_lines=4,
                            label="Your Message",
                        )
                        with gr.Column(scale=0):
                            send_btn = gr.Button("Send", variant="primary")
                            clear_btn = gr.Button("Clear", variant="secondary")

                # Right column - Voice Tools and Settings
                with gr.Column(scale=1):
                    gr.HTML("š ļø Voice Services")
                    tools_info = gr.HTML("""
                    """)
                    gr.HTML("šļø Voice Settings")
                    with gr.Row():
                        voice_select = gr.Dropdown(
                            choices=["Adam (Male)", "Rachel (Female)", "Cloyd (Deep)", "Custom"],
                            value="Adam (Male)",
                            label="Voice Selection",
                        )
                        speed_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Speech Speed")
                    gr.HTML("š System Status")
                    status_display = gr.HTML()

            # Event handlers
            def user(user_message, history):
                """Append the typed message to the chat and clear the textbox."""
                if not user_message.strip():
                    return history, ""
                # The reply slot stays None until bot_response fills it in.
                history.append((user_message, None))
                return history, ""

            async def bot_response(history):
                """Answer the last chat entry that is still waiting for a reply.

                The message is read from the history rather than from the
                textbox, because ``user`` has already cleared the textbox by
                the time this chained step runs.
                """
                if not history:
                    return history
                pending_message, pending_reply = history[-1]
                if pending_reply is not None or not pending_message:
                    # Nothing is waiting for an answer (e.g. blank submit).
                    return history
                # Get response from agent
                response = await self.agent.handle_user_input(pending_message)
                history[-1] = (pending_message, response)
                return history

            async def process_audio(audio_file, history):
                """Handle recorded/uploaded audio and report the result in chat.

                Returns the value for the audio output component and the
                updated chat history (the chatbot needs a history list, not a
                bare string).
                """
                updated_history = list(history or [])
                if not audio_file:
                    updated_history.append((None, "No audio file provided"))
                    return None, updated_history
                try:
                    # NOTE(review): the audio path itself is not handed to the
                    # agent here, only a fixed prompt - confirm this is the
                    # intended VoiceAgent contract.
                    response = await self.agent.handle_user_input("process this audio file")
                except Exception as e:
                    response = f"Error processing audio: {e}"
                # A None user slot displays the agent message on its own.
                updated_history.append((None, response))
                return audio_file, updated_history

            async def text_to_speech(text, voice_style, speed):
                """Ask the agent to speak ``text``; returns (audio_path, response)."""
                if not text.strip():
                    return None, "No text provided"
                try:
                    # Process with voice synthesis
                    voice_prompt = f"speak: {text} with {voice_style} voice at {speed}x speed"
                    response = await self.agent.handle_user_input(voice_prompt)
                    # Generate mock audio file path (no file is written here)
                    audio_path = f"temp_audio_{hash(text)}.mp3"
                    return audio_path, response
                except Exception as e:
                    return None, f"Error generating speech: {e}"

            def clear_conversation():
                """Clear conversation history."""
                return []

            def update_status():
                """Render the status panel from the agent's status and config."""
                status = self.agent.get_status()
                voice_settings = self.agent.config.get("voice_settings", {})
                return f"""
                ā
                Voice System Status
                Agent: {status['name']}
                Status: {status['status']}
                Whisper: {voice_settings.get('whisper_model', 'whisper-1')}
                ElevenLabs: Active
                Languages: 5+ supported
                Security: {'š”ļø Enabled' if status['security_enabled'] else 'ā Disabled'}
                """

            # Connect events: first echo the user message, then fetch the reply.
            send_btn.click(
                user,
                inputs=[msg_input, chatbot],
                outputs=[chatbot, msg_input],
            ).then(
                bot_response,
                inputs=[chatbot],
                outputs=[chatbot],
            )
            msg_input.submit(
                user,
                inputs=[msg_input, chatbot],
                outputs=[chatbot, msg_input],
            ).then(
                bot_response,
                inputs=[chatbot],
                outputs=[chatbot],
            )

            # Audio processing
            audio_input.change(
                process_audio,
                inputs=[audio_input, chatbot],
                outputs=[audio_output, chatbot],
            )

            # Text-to-speech generation
            # NOTE(review): generate_speech, voice_select and speed_slider are
            # not connected to any event yet; no component triggers them.
            async def generate_speech(text, voice, speed):
                """Awaitable wrapper around ``text_to_speech``."""
                return await text_to_speech(text, voice, speed)

            clear_btn.click(clear_conversation, outputs=chatbot)

            # Initial status update
            app.load(update_status, outputs=status_display)

        return app

    def launch(self, **kwargs) -> None:
        """Launch the Gradio application.

        Args:
            **kwargs: Extra options forwarded to ``Blocks.launch``. They take
                precedence over the defaults below, so callers may override
                e.g. ``server_port`` without a duplicate-keyword ``TypeError``.
        """
        launch_options = {
            # "0.0.0.0" listens on every network interface.
            "server_name": "0.0.0.0",
            "server_port": 7863,
            "share": False,
            "show_error": True,
            "quiet": False,
        }
        launch_options.update(kwargs)
        self.interface.launch(**launch_options)
# Sample prompts shown to the user at startup as suggestions to try.
EXAMPLE_QUERIES = [
    "Transcribe this audio file",
    "Say 'Hello, welcome to our voice AI' in a female voice",
    "Start a voice conversation",
    "Analyze the sentiment of this audio",
    "Search for meeting recordings about project updates",
    "Enable multilingual voice mode",
]
def main():
    """Print the startup banner, build the Voice Agent app and launch it."""
    startup_steps = (
        "š¤ Starting Voice Agent...",
        "šļø Initializing Whisper (Speech-to-Text)...",
        "š£ļø Loading ElevenLabs (Text-to-Speech)...",
        "š§ Connecting AI models (GPT-4o, Gemini)...",
        "š Setting up multilingual support...",
    )
    for step in startup_steps:
        print(step)

    app = VoiceApp()

    rule = "=" * 60
    print("\n" + rule)
    print("š¤ VOICE AGENT - SPEECH PROCESSING SUITE")
    print(rule)

    # Numbered list of suggested requests, starting at 1.
    print("\nš” Example voice requests you can try:")
    for number, example in enumerate(EXAMPLE_QUERIES, start=1):
        print(f" {number}. {example}")

    print("\nšļø Features:")
    feature_lines = (
        " ⢠Record your voice or upload audio files",
        " ⢠Convert text to natural-sounding speech",
        " ⢠Full voice conversations with AI",
        " ⢠Multi-language support (English, Spanish, Nepali, etc.)",
    )
    for feature in feature_lines:
        print(feature)

    print("\nš Starting Gradio server...")
    print("š Open your browser to: http://localhost:7863")
    print("\n" + rule)

    app.launch()
# Start the app only when this module is executed as a script.
if __name__ == "__main__":
    main()