GitHub Actions
Deploy backend from GitHub Actions
34283f4
raw
history blame
28.8 kB
"""
Chat functionality for RAG system.
Handles conversation context, retrieval, generation, and streaming responses.
"""
import json
import asyncio
import logging
from typing import List, Dict, Any, Optional, AsyncGenerator
import uuid
from datetime import datetime
import openai
from openai import AsyncOpenAI
import tiktoken
from .qdrant_client import QdrantManager
from .embeddings import EmbeddingGenerator
from .retrieval import RetrievalEngine
from .models import (
Message, MessageRole, ConversationContext, Citation,
ChatRequest, ChatResponse
)
logger = logging.getLogger(__name__)
class ChatHandler:
"""Handles chat functionality with RAG retrieval and streaming responses."""
def __init__(
    self,
    qdrant_manager: QdrantManager,
    openai_api_key: str,
    model: str = "gpt-4.1-nano",
    embedding_model: str = "text-embedding-3-small",
    max_context_messages: int = 3,
    context_window_size: int = 4000,
    max_retries: int = 3
):
    """
    Build the OpenAI client, embedder, tokenizer and retrieval engine.

    Args:
        qdrant_manager: Vector store manager handed to the retrieval engine.
        openai_api_key: API key used for both chat completions and embeddings.
        model: Chat completion model name.
        embedding_model: Embedding model name given to the embedder.
        max_context_messages: Message cap given to each new conversation
            context.
        context_window_size: Default token budget used when a request does
            not supply its own ``context_window``.
        max_retries: Retry count applied to the OpenAI client.
    """
    self.qdrant_manager = qdrant_manager
    self.model = model
    self.embedding_model = embedding_model
    self.max_context_messages = max_context_messages
    self.context_window_size = context_window_size
    self.max_retries = max_retries

    # Initialize clients. max_retries was previously stored but never
    # handed to the client, so the configured value had no effect.
    self.openai_client = AsyncOpenAI(
        api_key=openai_api_key,
        max_retries=max_retries
    )
    self.embedder = EmbeddingGenerator(
        api_key=openai_api_key,
        model=embedding_model
    )
    # Tokenizer used by count_tokens() for budget accounting.
    self.encoding = tiktoken.get_encoding("cl100k_base")

    # Initialize retrieval engine with improved settings
    self.retrieval_engine = RetrievalEngine(
        qdrant_manager=qdrant_manager,
        embedder=self.embedder,
        score_threshold=0.5,  # Lowered to 0.5 to better match document scores
        enable_mmr=True,
        mmr_lambda=0.5
    )

    # In-memory conversation context (for production, use Redis or database)
    self.conversations: Dict[str, ConversationContext] = {}
def get_adaptive_threshold(
    self,
    query_length: int,
    result_count: int,
    *,
    base_threshold: float = 0.5,
    min_threshold: float = 0.5,
    max_threshold: float = 0.9
) -> float:
    """
    Get adaptive similarity threshold based on query characteristics.

    The rules are checked in order and the first match wins:
    long queries (> 100 chars) lower the threshold by 0.2, a large result
    set (> 20) raises it by 0.2, a tiny result set (< 3) lowers it by 0.1.
    The outcome is clamped to ``[min_threshold, max_threshold]``.

    NOTE: with the default values the floor equals the base (0.5), so the
    two "lower" rules return 0.5 unchanged. Pass a smaller
    ``min_threshold`` to let them take effect.

    Args:
        query_length: Length of the query in characters
        result_count: Number of results found in initial search
        base_threshold: Threshold returned when no rule applies
        min_threshold: Lowest value a lowering rule may return
        max_threshold: Highest value the raising rule may return

    Returns:
        Adaptive threshold value
    """
    # Lower threshold for very specific queries (longer)
    if query_length > 100:
        return max(min_threshold, base_threshold - 0.2)

    # Raise threshold if too many results found
    if result_count > 20:
        return min(max_threshold, base_threshold + 0.2)

    # Lower threshold if very few results found
    if result_count < 3:
        return max(min_threshold, base_threshold - 0.1)

    return base_threshold
async def stream_chat(
    self,
    query: str,
    session_id: Optional[str] = None,
    k: int = 5,
    context_window: Optional[int] = None,
    filters: Optional[Dict[str, Any]] = None
) -> AsyncGenerator[str, None]:
    """
    Stream chat response with Server-Sent Events.

    Every yielded string is one frame built by ``_format_sse_message``.
    A successful exchange is a ``start`` frame (session id, sources, number
    of retrieved documents), then ``chunk`` frames carrying text, then a
    ``done`` frame (elapsed seconds, token count). Greetings and queries
    with no retrieval hits follow the same start/chunk/done sequence with a
    canned reply. Any exception is logged and reported to the consumer as a
    single ``error`` frame instead of being raised.

    Args:
        query: The user's message.
        session_id: Conversation to continue; a new UUID is generated when
            omitted.
        k: Maximum number of retrieved documents to use.
        context_window: Token budget for the prompt; falls back to
            ``self.context_window_size`` when omitted.
        filters: Passed through unchanged to the retrieval engine.

    Yields:
        JSON-formatted SSE messages.
    """
    import random
    import time

    # Monotonic clock: elapsed time must not be affected by wall-clock jumps.
    started = time.perf_counter()

    try:
        # Generate or retrieve session ID
        if not session_id:
            session_id = str(uuid.uuid4())

        # Handle greetings and very short queries
        stripped_query = query.strip()
        greetings = (
            'hi', 'hello', 'hey', 'yo', 'sup', 'greetings', 'good morning',
            'good afternoon', 'good evening', 'assalamualikum', 'salam',
            'assalam o alaikum',
        )
        if stripped_query.lower() in greetings or len(stripped_query) <= 2:
            # For greetings, provide a friendly response without searching
            greeting_responses = [
                "Hello! I'm here to help you learn about Physical AI and Humanoid Robotics. What would you like to know?",
                "Hi there! I can help you with questions about humanoid robots and physical AI. What topic interests you?",
                "Hey! I'm your AI assistant for the Physical AI & Humanoid Robotics book. How can I assist you today?",
                "Greetings! Feel free to ask me anything about humanoid robotics, AI, or the content of this book.",
                "Wa Alaikum Assalam! I'm happy to help you with Physical AI and Humanoid Robotics topics. What would you like to explore?"
            ]
            async for frame in self._stream_canned_reply(
                random.choice(greeting_responses), session_id, started
            ):
                yield frame
            return

        # Get or create conversation context
        context = self._get_or_create_context(session_id)

        # Add user message to context
        user_message = Message(
            id=str(uuid.uuid4()),
            role=MessageRole.USER,
            content=query,
            token_count=self.count_tokens(query)
        )
        context.add_message(user_message)

        # Retrieve relevant documents using RetrievalEngine
        logger.info("Retrieving %s relevant documents...", k)
        retrieved_docs = await self.retrieval_engine.retrieve(
            query=query,
            k=k * 3,  # Get more to account for filtering
            filters=filters,
            exclude_templates=True,
            use_mmr=True
        )
        # Limit to k results after filtering and deduplication
        retrieved_docs = retrieved_docs[:k]

        # Short queries get one more attempt with MMR disabled. The score
        # threshold itself is not changed by this call.
        if not retrieved_docs and len(stripped_query) < 20:
            logger.info("Short query with no results, retrying without MMR...")
            retrieved_docs = await self.retrieval_engine.retrieve(
                query=query,
                k=k,
                filters=filters,
                exclude_templates=True,
                use_mmr=False
            )
            retrieved_docs = retrieved_docs[:k]

        # If still no results, handle gracefully
        if not retrieved_docs:
            logger.info("No content found for query: %s...", query[:100])
            no_content_response = (
                "I couldn't find specific information about that topic in the book. "
                "This book covers Physical AI & Humanoid Robotics. Try asking about:\n"
                "• Introduction to physical AI\n"
                "• Types of humanoid robots\n"
                "• AI control systems\n"
                "• Robot locomotion\n"
                "• Specific chapters or sections"
            )
            async for frame in self._stream_canned_reply(
                no_content_response,
                session_id,
                started,
                done_extra={"no_results": True}
            ):
                yield frame
            return

        # Log monitoring metrics
        logger.info(
            "Retrieval metrics - query_length=%d, retrieved_count=%d, threshold=%.2f, session_id=%s",
            len(query),
            len(retrieved_docs),
            self.retrieval_engine.score_threshold,
            session_id
        )

        # Log similarity scores for monitoring
        scores = [result["similarity_score"] for result in retrieved_docs]
        if scores:
            logger.info(
                "Similarity scores - min=%.3f, max=%.3f, avg=%.3f, count=%d",
                min(scores),
                max(scores),
                sum(scores) / len(scores),
                len(scores)
            )

        # Create citations and the numbered source texts fed to the model
        citations = []
        source_context = []
        for number, result in enumerate(retrieved_docs, start=1):
            doc_chunk = result["chunk"]
            metadata = doc_chunk.metadata
            source_text = doc_chunk.content or ""

            # Only mark the snippet as truncated when it really is.
            snippet = source_text[:200]
            if len(source_text) > 200:
                snippet += "..."

            citations.append(Citation(
                id=str(uuid.uuid4()),
                chunk_id=doc_chunk.id,
                document_id=metadata.get("document_id", ""),
                text_snippet=snippet,
                relevance_score=result["similarity_score"],
                chapter=metadata.get("chapter"),
                section=metadata.get("section_header") or metadata.get("section"),
                url=metadata.get("url"),
                confidence=result["similarity_score"]
            ))

            # Add to context with citation marker
            if source_text:
                source_url = metadata.get("url", "")
                url_info = f" (URL: {source_url})" if source_url else ""
                source_context.append(f"[Source {number}]{url_info}: {source_text}")

        # Build context with conversation history and retrieved documents
        context_messages = self._build_context_messages(
            context,
            source_context,
            context_window or self.context_window_size
        )

        # Send initial metadata
        yield self._format_sse_message({
            "type": "start",
            "session_id": session_id,
            "sources": [citation.to_markdown() for citation in citations],
            "retrieved_docs": len(retrieved_docs)
        })

        # Generate streaming response
        logger.info("Generating streaming response...")
        stream = await self.openai_client.chat.completions.create(
            model=self.model,
            messages=context_messages,
            stream=True,
            max_completion_tokens=1000
        )

        response_parts = []
        async for event in stream:
            if event.choices and event.choices[0].delta:
                content = event.choices[0].delta.content
                if content:
                    response_parts.append(content)
                    yield self._format_sse_message({
                        "type": "chunk",
                        "content": content
                    })
        full_response = "".join(response_parts)

        # Create assistant message and add to context
        assistant_message = Message(
            id=str(uuid.uuid4()),
            role=MessageRole.ASSISTANT,
            content=full_response,
            token_count=self.count_tokens(full_response),
            citations=[citation.id for citation in citations]
        )
        context.add_message(assistant_message)

        # Send completion message
        yield self._format_sse_message({
            "type": "done",
            "session_id": session_id,
            "response_time": time.perf_counter() - started,
            "tokens_used": user_message.token_count + assistant_message.token_count
        })

    except Exception as e:
        # The stream is already open, so report the failure in-band.
        logger.error("Chat streaming failed: %s", e, exc_info=True)
        yield self._format_sse_message({
            "type": "error",
            "error": str(e)
        })

async def _stream_canned_reply(
    self,
    text: str,
    session_id: str,
    started: float,
    done_extra: Optional[Dict[str, Any]] = None
) -> AsyncGenerator[str, None]:
    """
    Stream a fixed reply as start/chunk/done frames without calling the model.

    Args:
        text: Reply to send; it is split after each run of whitespace so
            that newlines survive when the client joins the chunks.
        session_id: Session id echoed in the start and done frames.
        started: ``time.perf_counter()`` reading taken when the request began.
        done_extra: Extra keys merged into the final ``done`` frame.
    """
    import re
    import time

    yield self._format_sse_message({
        "type": "start",
        "session_id": session_id,
        "sources": [],
        "retrieved_docs": 0
    })

    # Each piece is a word plus its trailing whitespace, so concatenating
    # the chunks reproduces the text exactly (including line breaks).
    for piece in re.findall(r"\S+\s*", text):
        yield self._format_sse_message({
            "type": "chunk",
            "content": piece
        })
        await asyncio.sleep(0.05)  # Small delay for natural effect

    done_frame = {
        "type": "done",
        "session_id": session_id,
        "response_time": time.perf_counter() - started,
        "tokens_used": self.count_tokens(text)
    }
    if done_extra:
        done_frame.update(done_extra)
    yield self._format_sse_message(done_frame)
async def chat(
    self,
    query: str,
    session_id: Optional[str] = None,
    k: int = 5,
    context_window: Optional[int] = None,
    filters: Optional[Dict[str, Any]] = None
) -> AsyncGenerator[str, None]:
    """
    Single-shot chat response delivered as two SSE frames.

    This is an async generator (it must be iterated, not awaited). On every
    path it yields one ``data:`` frame whose JSON has ``"type": "final"``
    and the complete answer, followed by a ``data: [DONE]`` frame.
    Exceptions are logged and re-raised to the caller.

    Args:
        query: The user's message.
        session_id: Conversation to continue; a new UUID is generated when
            omitted.
        k: Maximum number of retrieved documents to use.
        context_window: Token budget for the prompt; falls back to
            ``self.context_window_size`` when omitted.
        filters: Passed through unchanged to the retrieval engine.

    Yields:
        The ``final`` frame, then the ``[DONE]`` sentinel frame.
    """
    import random
    import time

    # Monotonic clock: elapsed time must not be affected by wall-clock jumps.
    started = time.perf_counter()

    try:
        # Generate or retrieve session ID
        if not session_id:
            session_id = str(uuid.uuid4())

        # Handle greetings and very short queries
        stripped_query = query.strip()
        greetings = (
            'hi', 'hello', 'hey', 'yo', 'sup', 'greetings', 'good morning',
            'good afternoon', 'good evening', 'assalamualikum', 'salam',
            'assalam o alaikum',
        )
        if stripped_query.lower() in greetings or len(stripped_query) <= 2:
            # For greetings, provide a friendly response without searching
            greeting_responses = [
                "Hello! I'm here to help you learn about Physical AI and Humanoid Robotics. What would you like to know?",
                "Hi there! I can help you with questions about humanoid robots and physical AI. What topic interests you?",
                "Hey! I'm your AI assistant for the Physical AI & Humanoid Robotics book. How can I assist you today?",
                "Greetings! Feel free to ask me anything about humanoid robotics, AI, or the content of this book.",
                "Wa Alaikum Assalam! I'm happy to help you with Physical AI and Humanoid Robotics topics. What would you like to explore?"
            ]
            answer = random.choice(greeting_responses)

            # Return greeting as JSON response
            greeting_response = {
                "type": "final",
                "answer": answer,
                "sources": [],
                "session_id": session_id,
                "query": query,
                "response_time": time.perf_counter() - started,
                "tokens_used": self.count_tokens(answer),
                "context_used": False,
                "model": self.model,
                "has_context": False
            }
            yield f"data: {json.dumps(greeting_response)}\n\n"
            yield "data: [DONE]\n\n"
            return

        # Get or create conversation context
        context = self._get_or_create_context(session_id)

        # Add user message to context
        user_message = Message(
            id=str(uuid.uuid4()),
            role=MessageRole.USER,
            content=query,
            token_count=self.count_tokens(query)
        )
        context.add_message(user_message)

        # Retrieve relevant documents using RetrievalEngine
        logger.info("Retrieving %s relevant documents...", k)
        retrieved_docs = await self.retrieval_engine.retrieve(
            query=query,
            k=k * 3,  # Get more to account for filtering
            filters=filters,
            exclude_templates=True,
            use_mmr=True
        )
        # Limit to k results after filtering and deduplication
        retrieved_docs = retrieved_docs[:k]

        # Short queries get one more attempt with MMR disabled. The score
        # threshold itself is not changed by this call.
        if not retrieved_docs and len(stripped_query) < 20:
            logger.info("Short query with no results, retrying without MMR...")
            retrieved_docs = await self.retrieval_engine.retrieve(
                query=query,
                k=k,
                filters=filters,
                exclude_templates=True,
                use_mmr=False
            )
            retrieved_docs = retrieved_docs[:k]

        # If still no results, answer with the same final/[DONE] protocol
        # as every other path so the consumer sees a terminating frame.
        if not retrieved_docs:
            logger.info("No content found for query: %s...", query[:100])
            no_content_response = (
                "I couldn't find specific information about that topic in the book. "
                "This book covers Physical AI & Humanoid Robotics. Try asking about:\n"
                "• Introduction to physical AI\n"
                "• Types of humanoid robots\n"
                "• AI control systems\n"
                "• Robot locomotion\n"
                "• Specific chapters or sections"
            )
            no_results_payload = {
                "type": "final",
                "answer": no_content_response,
                "sources": [],
                "session_id": session_id,
                "query": query,
                "response_time": time.perf_counter() - started,
                "tokens_used": self.count_tokens(no_content_response),
                "context_used": False,
                "model": self.model,
                "has_context": False,
                "no_results": True
            }
            yield f"data: {json.dumps(no_results_payload)}\n\n"
            yield "data: [DONE]\n\n"
            return

        # Log monitoring metrics
        logger.info(
            "Retrieval metrics - query_length=%d, retrieved_count=%d, threshold=%.2f, session_id=%s",
            len(query),
            len(retrieved_docs),
            self.retrieval_engine.score_threshold,
            session_id
        )

        # Log similarity scores for monitoring
        scores = [result["similarity_score"] for result in retrieved_docs]
        if scores:
            logger.info(
                "Similarity scores - min=%.3f, max=%.3f, avg=%.3f, count=%d",
                min(scores),
                max(scores),
                sum(scores) / len(scores),
                len(scores)
            )

        # Create citations and the numbered source texts fed to the model.
        # The system prompt asks for URL citations, so the URL is carried
        # both in the Citation and in the source marker.
        citations = []
        source_context = []
        for number, result in enumerate(retrieved_docs, start=1):
            doc_chunk = result["chunk"]
            metadata = doc_chunk.metadata
            source_text = doc_chunk.content or ""

            # Only mark the snippet as truncated when it really is.
            snippet = source_text[:200]
            if len(source_text) > 200:
                snippet += "..."

            citations.append(Citation(
                id=str(uuid.uuid4()),
                chunk_id=doc_chunk.id,
                document_id=metadata.get("document_id", ""),
                text_snippet=snippet,
                relevance_score=result["similarity_score"],
                chapter=metadata.get("chapter"),
                section=metadata.get("section_header") or metadata.get("section"),
                url=metadata.get("url"),
                confidence=result["similarity_score"]
            ))

            if source_text:
                source_url = metadata.get("url", "")
                url_info = f" (URL: {source_url})" if source_url else ""
                source_context.append(f"[Source {number}]{url_info}: {source_text}")

        # Build context with conversation history and retrieved documents
        context_messages = self._build_context_messages(
            context,
            source_context,
            context_window or self.context_window_size
        )

        # Generate response
        logger.info("Generating response...")
        response = await self.openai_client.chat.completions.create(
            model=self.model,
            messages=context_messages,
            max_completion_tokens=1000
        )

        # content can be None (e.g. an empty completion); normalise it so
        # token counting and JSON output stay well-defined.
        answer = response.choices[0].message.content or ""
        tokens_used = response.usage.total_tokens if response.usage else 0

        # Create assistant message and add to context
        assistant_message = Message(
            id=str(uuid.uuid4()),
            role=MessageRole.ASSISTANT,
            content=answer,
            token_count=self.count_tokens(answer),
            citations=[citation.id for citation in citations]
        )
        context.add_message(assistant_message)

        def serialize_citation(citation):
            """Convert Citation object to JSON-serializable dict."""
            return {
                "id": getattr(citation, 'id', ''),
                "chunk_id": getattr(citation, 'chunk_id', ''),
                "document_id": getattr(citation, 'document_id', ''),
                "text_snippet": getattr(citation, 'text_snippet', ''),
                "relevance_score": getattr(citation, 'relevance_score', 0),
                "chapter": getattr(citation, 'chapter', ''),
                "section": getattr(citation, 'section', ''),
                "url": getattr(citation, 'url', None),
                "confidence": getattr(citation, 'confidence', 0)
            }

        # Final response
        final_response = {
            "type": "final",
            "answer": answer,
            "sources": [serialize_citation(citation) for citation in citations],
            "session_id": session_id,
            "query": query,
            "response_time": time.perf_counter() - started,
            "tokens_used": tokens_used,
            "model": self.model
        }
        yield f"data: {json.dumps(final_response)}\n\n"
        yield "data: [DONE]\n\n"

    except Exception as e:
        logger.error("Chat failed: %s", e, exc_info=True)
        raise
def _get_or_create_context(self, session_id: str) -> ConversationContext:
    """Return the stored context for *session_id*, creating and storing it first if absent.

    A new context is seeded with a single system message holding the
    assistant instructions, together with that message's token count.
    """
    try:
        return self.conversations[session_id]
    except KeyError:
        pass

    # The prompt is bound once so the stored content and its token count
    # are guaranteed to refer to the same text.
    system_prompt = (
        "You are an AI assistant for the book 'Physical AI and Humanoid Robotics'. "
        "This book covers topics including physical AI systems, humanoid robots, "
        "robot sensing, actuation mechanisms, and the convergence of AI with robotics. "
        "Provide accurate, detailed answers based on the provided book content. "
        "Always cite your sources. Use the format [Chapter - Section](URL) if a URL is provided in the context; otherwise, use [Chapter - Section]. "
        "If users ask about topics outside this book (other books, movies, general knowledge), "
        "politely explain: 'I can only provide information about Physical AI, humanoid robots, "
        "and the specific topics covered in this book.' "
        "If the book context doesn't contain relevant information, say so clearly."
    )

    fresh_context = ConversationContext(
        session_id=session_id,
        max_messages=self.max_context_messages,
        messages=[
            Message(
                id=str(uuid.uuid4()),
                role=MessageRole.SYSTEM,
                content=system_prompt,
                token_count=self.count_tokens(system_prompt)
            )
        ]
    )
    self.conversations[session_id] = fresh_context
    return fresh_context
def _build_context_messages(
    self,
    context: ConversationContext,
    source_texts: List[str],
    max_tokens: int
) -> List[Dict[str, str]]:
    """
    Build the message list sent to the OpenAI chat completions API.

    The result is, in order: the conversation's first message (treated as
    the system prompt), one system message holding the retrieved sources
    that fit, then the non-system conversation messages in chronological
    order.

    Budgeting:
        * Sources may bring the running total up to 60% of ``max_tokens``.
          Sources are considered in the given order and any single source
          that would overflow is skipped, instead of discarding all
          retrieved context at once.
        * History may bring the running total up to 90% of ``max_tokens``.
          It is filled from the newest message backwards so that older
          turns are dropped first. The newest message is always kept, even
          if it alone exceeds the budget, because a prompt without the
          current query is useless.

    Args:
        context: Conversation whose messages are used.
        source_texts: Retrieved source strings, ready to embed in the prompt.
        max_tokens: Soft token budget for the whole prompt.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts.
    """
    messages: List[Dict[str, str]] = []
    used_tokens = 0

    # Add system message
    system_msg = context.messages[0] if context.messages else None
    if system_msg:
        messages.append({
            "role": system_msg.role.value,
            "content": system_msg.content
        })
        used_tokens += system_msg.token_count

    # Add context from retrieved documents, keeping whatever fits
    source_budget = max_tokens * 0.6  # Reserve 40% for conversation
    kept_sources = []
    for source_text in source_texts:
        source_tokens = self.count_tokens(source_text)
        if used_tokens + source_tokens < source_budget:
            kept_sources.append(source_text)
            used_tokens += source_tokens

    if len(kept_sources) < len(source_texts):
        logger.warning(
            "Dropped %d of %d retrieved sources to stay within the context budget",
            len(source_texts) - len(kept_sources),
            len(source_texts)
        )

    if kept_sources:
        context_content = "\n\n".join(kept_sources)
        messages.append({
            "role": "system",
            "content": f"Context from the book:\n\n{context_content}"
        })

    # Add conversation history, newest first so old turns are dropped first
    history_budget = max_tokens * 0.9  # Leave some buffer
    history = [
        msg for msg in context.get_context_messages()
        if msg.role != MessageRole.SYSTEM  # System message already added
    ]
    selected = []
    for position, msg in enumerate(reversed(history)):
        fits = used_tokens + msg.token_count < history_budget
        if position > 0 and not fits:
            break
        selected.append(msg)
        used_tokens += msg.token_count

    # Restore chronological order for the API
    messages.extend(
        {"role": msg.role.value, "content": msg.content}
        for msg in reversed(selected)
    )

    return messages
def _format_sse_message(self, data: Dict[str, Any]) -> str:
"""Format message for Server-Sent Events."""
return f"data: {json.dumps(data)}\n\n"
def count_tokens(self, text: str) -> int:
    """Return the number of tokens ``self.encoding`` produces for *text*."""
    token_ids = self.encoding.encode(text)
    return len(token_ids)
async def clear_context(self, session_id: str):
    """Forget the stored conversation for *session_id*; unknown ids are ignored."""
    if session_id not in self.conversations:
        return
    del self.conversations[session_id]
    logger.info(f"Cleared context for session: {session_id}")
async def get_context(self, session_id: str) -> Optional[ConversationContext]:
    """Return the stored conversation for *session_id*, or ``None`` if there is none."""
    if session_id in self.conversations:
        return self.conversations[session_id]
    return None
async def close(self):
    """Close the OpenAI client, then the embedder, skipping either one if it is falsy."""
    client = self.openai_client
    if client:
        await client.close()

    embedder = self.embedder
    if embedder:
        await embedder.close()