GitHub Actions committed
Commit 457b685 · 1 Parent(s): 84b0fa3

Deploy backend from GitHub Actions


🚀 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <[email protected]>

This view is limited to 50 files because the commit contains too many changes.

Files changed (50)
  1. .env.example +159 -15
  2. .gitignore +3 -1
  3. OPENAI_AGENTS_FIX.md +348 -0
  4. alembic/env.py +5 -1
  5. alembic/versions/001_reader_features_tables.py +179 -0
  6. alembic/versions/004_add_translation_tables.py +114 -0
  7. alembic/versions/005_add_openai_translation_tables.py +295 -0
  8. create_translation_tables.py +47 -0
  9. fix_async_client.py +44 -0
  10. fix_jsonb.py +28 -0
  11. fix_translation_endpoint.py +45 -0
  12. fix_user_id_issue.py +34 -0
  13. fix_user_model.py +53 -0
  14. main.py +58 -3
  15. migrate_user_id.py +63 -0
  16. migrate_user_id_fixed.py +53 -0
  17. migration_summary_translation_tables.md +124 -0
  18. migrations/versions/001_create_openai_translation_tables.py +297 -0
  19. pyproject.toml +7 -1
  20. requirements.txt +7 -0
  21. src/api/v1/progress.py +450 -0
  22. src/api/v1/reader_features.py +94 -0
  23. src/api/v1/translation.py +336 -0
  24. src/config/logging_config.py +442 -0
  25. src/config/translation_config.py +432 -0
  26. src/database/base.py +1 -1
  27. src/middleware/auth.py +302 -0
  28. src/middleware/cors.py +356 -0
  29. src/middleware/rate_limit.py +385 -0
  30. src/models/__init__.py +29 -0
  31. src/models/auth.py +3 -0
  32. src/models/base.py +26 -0
  33. src/models/bookmark.py +53 -0
  34. src/models/chat.py +1 -1
  35. src/models/content_localization.py +50 -0
  36. src/models/personalization.py +64 -0
  37. src/models/reading_progress.py +33 -0
  38. src/models/search_index.py +30 -0
  39. src/models/translation_openai.py +512 -0
  40. src/models/user_preferences.py +54 -0
  41. src/services/cache_examples.py +231 -0
  42. src/services/cache_service.py +690 -0
  43. src/services/code_block_handler.py +630 -0
  44. src/services/content_reconstructor.py +471 -0
  45. src/services/html_parser.py +565 -0
  46. src/services/openai_translation/__init__.py +10 -0
  47. src/services/openai_translation/client.py +59 -0
  48. src/services/openai_translation/openai_agent.py +533 -0
  49. src/services/openai_translation/service.py +855 -0
  50. src/services/openai_translation/translation_agent.py +198 -0
.env.example CHANGED
@@ -1,4 +1,13 @@
+# ============================================
+# Environment Configuration
+# ============================================
+# Environment: development, testing, staging, production
+ENVIRONMENT=development
+DEBUG=true
+
+# ============================================
 # Google OAuth Configuration
+# ============================================
 GOOGLE_CLIENT_ID=your-google-client-id
 GOOGLE_CLIENT_SECRET=your-google-client-secret
 # For production:
@@ -7,47 +16,182 @@ GOOGLE_CLIENT_SECRET=your-google-client-secret
 AUTH_REDIRECT_URI=http://localhost:3000/auth/google/callback
 FRONTEND_URL=http://localhost:3000
 
+# ============================================
 # JWT Configuration
+# ============================================
 JWT_SECRET_KEY=your-super-secret-jwt-key-at-least-32-characters-long
 JWT_ALGORITHM=HS256
 JWT_EXPIRE_MINUTES=10080 # 7 days
 
+# ============================================
 # Database Configuration
+# ============================================
 DATABASE_URL=sqlite:///./database/auth.db
+DB_POOL_SIZE=5
+DB_MAX_OVERFLOW=10
+DB_POOL_TIMEOUT=30
+DB_POOL_RECYCLE=3600
+DB_AUTO_MIGRATE=true
+
+# ============================================
+# Gemini API Configuration (for OpenAI SDK)
+# ============================================
+GEMINI_API_KEY=your-gemini-api-key
+GEMINI_BASE_URL=https://generativelanguage.googleapis.com/v1beta/openai/
+GEMINI_MODEL=gemini-2.0-flash-lite
+GEMINI_TIMEOUT=60
+GEMINI_MAX_RETRIES=3
+GEMINI_RETRY_DELAY=1.0
+GEMINI_HTTP2=true
+GEMINI_RPM=60
+GEMINI_RPH=1000
+
+# ============================================
+# OpenAI Agents SDK Configuration
+# ============================================
+OPENAI_AGENTS_ENABLED=true
+OPENAI_AGENTS_TRACING=false
+OPENAI_AGENTS_VERBOSE=false
+AGENT_DEFAULT_TEMPERATURE=0.3
+AGENT_MAX_TOKENS=2048
+AGENT_MAX_TURNS=5
+AGENT_HTML_TOOL=true
+AGENT_CODE_TOOL=true
+AGENT_QUALITY_TOOL=true
+AGENT_QUALITY_CHECK=true
+AGENT_CONFIDENCE_THRESHOLD=0.8
+
-# OpenAI Configuration (already existing)
-OPENAI_API_KEY=your-openai-api-key
-OPENAI_MODEL=gpt-4.1-nano
-OPENAI_EMBEDDING_MODEL=text-embedding-3-small
+# ============================================
+# Legacy OpenAI Configuration (for RAG)
+# ============================================
+OPENAI_API_KEY=your-openai-api-key
+OPENAI_MODEL=gpt-4.1-nano
+OPENAI_EMBEDDING_MODEL=text-embedding-3-small
+
+# ============================================
+# Cache Configuration
+# ============================================
+CACHE_BACKEND=memory
+CACHE_DEFAULT_TTL=168
+CACHE_HIGH_QUALITY_TTL=720
+CACHE_LOW_QUALITY_TTL=24
+REDIS_URL=redis://localhost:6379
+REDIS_PREFIX=translation:
+REDIS_MAX_CONNECTIONS=10
+CACHE_MEMORY_MAX_SIZE=1000
+CACHE_CLEANUP_INTERVAL=3600
+
-# Rate Limiting
-RATE_LIMIT_REQUESTS=60
-RATE_LIMIT_WINDOW=60
-
+# ============================================
+# Rate Limiting Configuration
+# ============================================
+RATE_LIMIT_ENABLED=true
+RATE_LIMIT_RPM=60
+RATE_LIMIT_RPH=1000
+RATE_LIMIT_RPD=10000
+TRANSLATION_RPM=10
+TRANSLATION_RPH=500
+RATE_LIMIT_BLOCK_DURATION=3600
+RATE_LIMIT_WARNING_THRESHOLD=0.8
+RATE_LIMIT_REDIS=false
 
+# ============================================
 # API Configuration
+# ============================================
 API_HOST=0.0.0.0
 API_PORT=7860
 LOG_LEVEL=INFO
 
+# ============================================
 # CORS Configuration
+# ============================================
-ALLOWED_ORIGINS=http://localhost:3000,http://localhost:8080,https://mrowaisabdullah.github.io,https://huggingface.co
+CORS_ORIGINS=http://localhost:3000,http://localhost:8080,https://mrowaisabdullah.github.io,https://huggingface.co
+CORS_METHODS=GET,POST,PUT,DELETE
+CORS_HEADERS=*
 
+# ============================================
+# Security Configuration
+# ============================================
+SECURITY_REQUIRE_API_KEY=false
+SECURITY_API_KEY_HEADER=X-API-Key
+SECURITY_MAX_TEXT_LENGTH=100000
+SECURITY_MAX_CHUNKS=100
+SECURITY_CONTENT_FILTER=true
+SECURITY_BLOCKED_PATTERNS=
+SECURITY_IP_WHITELIST=
+SECURITY_IP_BLACKLIST=
+
+# ============================================
+# Logging Configuration
+# ============================================
+LOG_FILE_ENABLED=true
+LOG_FILE_PATH=logs/translation.log
+LOG_FILE_ROTATION=1 day
+LOG_FILE_RETENTION=30 days
+LOG_MAX_FILE_SIZE=100 MB
+LOG_JSON_FORMAT=false
+LOG_INCLUDE_REQUEST_ID=true
+LOG_FILTER_SENSITIVE=true
+SENSITIVE_FIELDS=api_key,password,token,authorization
 
+# ============================================
+# Monitoring Configuration
+# ============================================
+MONITORING_ENABLED=true
+METRICS_ENDPOINT=/metrics
+METRICS_PORT=9090
+HEALTH_ENDPOINT=/health
+HEALTH_DETAILED=true
+TRACK_PERFORMANCE=true
+SLOW_QUERY_THRESHOLD=1000
+TRACK_ERRORS=true
+ERROR_SAMPLE_RATE=1.0
+
+# External Monitoring
+SENTRY_DSN=
+PROMETHEUS_GATEWAY=
+
-# Qdrant Configuration (already existing)
+# ============================================
+# Qdrant Configuration (for RAG)
+# ============================================
 QDRANT_URL=http://localhost:6333
 QDRANT_API_KEY=your-qdrant-api-key-if-needed
 
-# Content Configuration (already existing)
+# ============================================
+# Content Configuration (for RAG)
+# ============================================
 BOOK_CONTENT_PATH=./book_content
 CHUNK_SIZE=1000
 CHUNK_OVERLAP=200
 
-# Conversation Context (already existing)
+# ============================================
+# Conversation Context (for RAG)
+# ============================================
 MAX_CONTEXT_MESSAGES=3
 CONTEXT_WINDOW_SIZE=4000
 
-# Ingestion Configuration (already existing)
+# ============================================
+# Ingestion Configuration (for RAG)
+# ============================================
 BATCH_SIZE=100
 MAX_CONCURRENT_REQUESTS=10
 
-# Health Monitoring (already existing)
-HEALTH_CHECK_INTERVAL=30
+# ============================================
+# Health Monitoring
+# ============================================
+HEALTH_CHECK_INTERVAL=30
+
+# ============================================
+# Proxy Configuration (Optional)
+# ============================================
+HTTP_PROXY=
+HTTPS_PROXY=
+
+# ============================================
+# Feature Flags
+# ============================================
+FEATURE_STREAMING=true
+FEATURE_QUALITY_CHECK=true
+FEATURE_CHUNKING=true
+FEATURE_CODE_PRESERVATION=true
+FEATURE_HTML_PRESERVATION=true
+FEATURE_BATCH_TRANSLATION=true
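A typed settings class makes variables like these easier to consume. The following is a minimal sketch assuming the pydantic-settings package; the project's actual loader is `src/config/translation_config.py` and may differ:

```python
# Hypothetical loader for a slice of the variables above (pydantic-settings v2 API);
# not the repository's verified configuration code.
from pydantic_settings import BaseSettings, SettingsConfigDict


class GeminiSettings(BaseSettings):
    model_config = SettingsConfigDict(env_file=".env", env_prefix="GEMINI_", extra="ignore")

    api_key: str = "your-gemini-api-key"  # GEMINI_API_KEY
    base_url: str = "https://generativelanguage.googleapis.com/v1beta/openai/"
    model: str = "gemini-2.0-flash-lite"
    timeout: int = 60
    max_retries: int = 3


settings = GeminiSettings()  # reads .env, then the process environment
```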
.gitignore CHANGED
@@ -149,4 +149,6 @@ Thumbs.db
 
 # Test files
 test_output/
-test_reports/
+test_reports/
+
+.playwright-mcp
OPENAI_AGENTS_FIX.md ADDED
@@ -0,0 +1,348 @@
# OpenAI Agents SDK Implementation Fix for Gemini API Quota Errors

## Problem Summary

The translation system was experiencing Gemini API quota exceeded errors (HTTP 429) due to several issues with the OpenAI Agents SDK implementation:

1. **Incorrect Package Name**: The code was importing from the `agents` package instead of the correct `openai-agents-sdk`
2. **Not Actually Using the OpenAI Agents SDK**: Despite claiming to use the SDK, the implementation was calling the OpenAI client directly
3. **Insufficient Rate Limit Handling**: Basic error handling that did not properly implement exponential backoff
4. **Missing Per-User Rate Limiting**: No per-user or per-IP rate limiting to prevent quota exhaustion

## Solution Implementation

### 1. Fixed Package Dependencies

Updated `pyproject.toml`:

```toml
# Before
"openai-agents>=0.1.0"

# After
"openai-agents-sdk>=0.2.9"
```

### 2. Created a Proper OpenAI Agents SDK Implementation

**File**: `src/services/openai_translation/openai_agent.py`

- Correct imports from `openai_agents_sdk`
- Proper agent implementation with tools
- Enhanced error handling for rate limits
- Exponential backoff with jitter
- Detailed error reporting

Key features:

```python
from openai_agents_sdk import Agent, Runner, function_tool, RunContextWrapper
from openai_agents_sdk.errors import RateLimitError as OpenAIRateLimitError
```

### 3. Enhanced Error Handling

**File**: `src/services/openai_translation/enhanced_service.py`

- Per-user rate limiting
- Exponential backoff implementation
- Detailed rate limit error responses
- Retry attempt tracking
- Backoff time accumulation

Example retry logic:

```python
for attempt in range(request.max_retries + 1):
    try:
        # API call
        result = await api_call()
        return result
    except RateLimitError as e:
        if attempt < request.max_retries:
            delay = min(
                request.retry_delay * (request.backoff_factor ** attempt),
                request.max_retry_delay
            )
            # Add jitter
            delay *= (0.5 + random.random() * 0.5)
            await asyncio.sleep(delay)
            continue
        else:
            raise
```

### 4. Enhanced API Endpoints

**File**: `src/api/v1/enhanced_translation.py`

- Proper HTTP 429 status codes
- Retry-After headers
- Detailed rate limit information
- Per-endpoint rate limiting

Example response:

```json
{
  "error": "RATE_LIMIT_EXCEEDED",
  "message": "User rate limit exceeded. Please wait 45.2 seconds.",
  "retry_after": 45.2,
  "rate_limit_info": {
    "retry_after": 45.2,
    "limit_type": "quota_exceeded",
    "user_id": "user123"
  },
  "timestamp": 1703847123.45
}
```

### 5. Rate Limiting Middleware

**File**: `src/middleware/rate_limit.py`

- Per-IP rate limiting
- Per-user rate limiting (if authenticated)
- Sliding window algorithm (see the sketch below)
- Redis support for distributed systems
- In-memory fallback

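As a rough illustration of the sliding-window idea, an in-memory limiter can keep a deque of request timestamps per key and evict entries older than the window. This is a minimal sketch, not the actual `src/middleware/rate_limit.py` code:

```python
# Minimal sliding-window limiter sketch; the real middleware also supports
# Redis, per-user keys, and block durations.
import time
from collections import defaultdict, deque


class SlidingWindowLimiter:
    def __init__(self, max_requests: int, window_seconds: float):
        self.max_requests = max_requests
        self.window = window_seconds
        self.hits: dict[str, deque] = defaultdict(deque)

    def allow(self, key: str) -> bool:
        """Record a hit for `key` (an IP or user id) and report whether it is allowed."""
        now = time.monotonic()
        q = self.hits[key]
        while q and now - q[0] > self.window:
            q.popleft()  # evict timestamps that fell out of the window
        if len(q) >= self.max_requests:
            return False
        q.append(now)
        return True


limiter = SlidingWindowLimiter(max_requests=10, window_seconds=60)  # mirrors TRANSLATION_RPM=10
```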
## How to Use the Enhanced System

### 1. Update Your Environment

```bash
cd backend
pip install -e .
```

### 2. Update Your `.env` File

Make sure you have:

```env
GEMINI_API_KEY=your_gemini_api_key_here
GEMINI_BASE_URL=https://generativelanguage.googleapis.com/v1beta/openai/
GEMINI_MODEL=gemini-2.0-flash-lite
```

### 3. Add Rate Limiting to Your App

In your FastAPI app initialization:

```python
from src.middleware.rate_limit import TranslationRateLimitMiddleware

app.add_middleware(TranslationRateLimitMiddleware)
```

### 4. Use the Enhanced Endpoints

The enhanced router serves the same path as before, so clients continue to call:

```http
POST /translation/translate
```

The enhanced handler provides better error handling and rate limit information.

## Rate Limit Configuration

Default limits:

- **Per IP**: 60 requests per minute, 1000 per hour
- **Per User (if authenticated)**: 10 translations per minute, 500 per hour
- **Translation Endpoints**: Stricter limits (10/min, 500/hour)

These can be configured via environment variables or in the middleware initialization.

## Monitoring and Metrics

The enhanced system provides detailed metrics:

```json
{
  "period": "24h",
  "total_requests": 1250,
  "successful_requests": 1180,
  "failed_requests": 45,
  "rate_limited_requests": 25,
  "cache_hit_rate": 0.35,
  "avg_processing_time_ms": 2340,
  "total_cost_usd": 2.45,
  "active_users": 15,
  "user_rate_limits": {
    "user123": {
      "requests_last_minute": 3,
      "last_reset": 1703847123.45
    }
  }
}
```

## Best Practices

1. **Handle Rate Limit Errors Properly**

   ```python
   try:
       result = await translate_text(text)
   except RateLimitError as e:
       print(f"Rate limited. Retry after {e.retry_after} seconds")
       await asyncio.sleep(e.retry_after)
       # Retry with backoff
   ```

2. **Use Caching When Possible**

   - The system automatically caches successful translations
   - Cache hits don't count against rate limits
   - Provide `page_url` for better cache keys (see the cache-key sketch after this list)

3. **Batch Large Translations**

   - The system automatically chunks large texts
   - Configure `chunk_size` and `max_chunks` appropriately
   - Monitor processing time to optimize chunk size

4. **Monitor Your Usage**

   - Use the `/translation/metrics` endpoint (admin only)
   - Watch for rate limit errors in logs
   - Adjust retry settings based on your quota

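How `page_url` improves cache keys: a plausible scheme hashes the text for the primary key and appends a URL hash so identical text on different pages can be tracked separately. This is an assumption for illustration; the exact format in `src/services/cache_service.py` may differ:

```python
# Hypothetical cache-key derivation; field order and prefixes are assumptions,
# not the project's verified scheme.
import hashlib


def make_cache_key(text: str, source_lang: str, target_lang: str,
                   page_url: str | None = None) -> str:
    content_hash = hashlib.sha256(text.encode("utf-8")).hexdigest()  # 64 hex chars
    key = f"translation:{source_lang}:{target_lang}:{content_hash}"
    if page_url:
        key += ":" + hashlib.md5(page_url.encode("utf-8")).hexdigest()  # 32 hex chars
    return key


print(make_cache_key("Hello world", "en", "ur", page_url="https://example.com/ch1"))
```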
## Testing the Fix

To test the rate limiting:

```python
import asyncio
import httpx

async def test_rate_limit():
    async with httpx.AsyncClient() as client:
        # Make rapid requests to trigger rate limit
        for i in range(15):
            response = await client.post(
                "http://localhost:8000/translation/translate",
                json={
                    "text": f"Test translation {i}",
                    "source_language": "en",
                    "target_language": "ur"
                }
            )
            print(f"Request {i}: Status {response.status_code}")
            if response.status_code == 429:
                retry_after = response.headers.get("Retry-After")
                print(f"Rate limited. Retry after {retry_after} seconds")
                break

asyncio.run(test_rate_limit())
```

## Troubleshooting

### Still Getting 429 Errors?

1. **Check Your Gemini API Quota**

   - Visit Google AI Studio
   - Verify your daily/monthly quota
   - Request a quota increase if needed

2. **Implement Client-Side Rate Limiting**

   ```python
   import asyncio
   from asyncio import Semaphore

   # Limit concurrent requests
   semaphore = Semaphore(5)  # Max 5 concurrent requests

   async def translate_with_limit(text):
       async with semaphore:
           return await translate_text(text)
   ```

3. **Use Backoff in Your Client**

   ```python
   import backoff

   @backoff.on_exception(backoff.expo, RateLimitError, max_tries=3)
   async def safe_translate(text):
       return await translate_text(text)
   ```

### Performance Issues?

1. **Reduce Chunk Size**

   - Smaller chunks process faster
   - Less chance of a timeout
   - Better error recovery

2. **Enable Caching**

   - Set `page_url` for content-based caching
   - Cache hits are instant
   - Reduces API usage

3. **Monitor Memory Usage**

   - Large translations use more memory
   - Consider streaming for very large texts
   - Implement pagination for batch jobs

## Migration Guide

To migrate from the old implementation:

1. **Update Dependencies**

   ```bash
   pip install "openai-agents-sdk>=0.2.9"
   ```

2. **Update Imports**

   ```python
   # Old
   from agents import Agent, Runner

   # New
   from openai_agents_sdk import Agent, Runner
   ```

3. **Update Error Handling**

   ```python
   # Old
   except Exception as e:
       if "429" in str(e):
           ...  # Handle rate limit

   # New
   except RateLimitError as e:
       retry_after = e.retry_after
       # Handle with proper backoff
   ```

4. **Add Rate Limiting**

   ```python
   from src.middleware.rate_limit import TranslationRateLimitMiddleware

   app.add_middleware(TranslationRateLimitMiddleware)
   ```

## Conclusion

The enhanced OpenAI Agents SDK implementation provides:

- ✅ Correct package usage and imports
- ✅ Proper agent implementation with tools
- ✅ Robust rate limit error handling
- ✅ Exponential backoff with jitter
- ✅ Per-user and per-IP rate limiting
- ✅ Detailed error reporting and metrics
- ✅ Caching to reduce API usage
- ✅ Monitoring and health checks

This should significantly reduce Gemini API quota errors and provide a better user experience with proper error handling and retry logic.
alembic/env.py CHANGED
@@ -10,7 +10,11 @@ sys.path.append(os.path.dirname(os.path.dirname(__file__)))
 
 # Import models
 from src.models.auth import Base
-from src.models.chat import Base
+# Import other models to register them with the Base metadata
+import src.models.chat
+import src.models.translation
+import src.models.personalization
+import src.models.content_localization
 
 # this is the Alembic Config object, which provides
 # access to the values within the .ini file in use.
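Importing the model modules is what populates `Base.metadata`; Alembic then diffs that metadata against the database during autogenerate. A minimal sketch of the usual wiring (the rest of this project's `env.py` is not shown in the diff, so this is the standard pattern rather than the repo's exact code):

```python
# Standard Alembic pattern: point autogenerate at the shared metadata.
from src.models.auth import Base

target_metadata = Base.metadata  # used by `alembic revision --autogenerate`
```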
alembic/versions/001_reader_features_tables.py ADDED
@@ -0,0 +1,179 @@
"""Create tables for reader experience features

Revision ID: 003_reader_features_tables
Revises: 002_add_onboarding_tables
Create Date: 2025-01-09

"""
from alembic import op
import sqlalchemy as sa

# revision identifiers
revision = '003_reader_features_tables'
down_revision = '002_add_onboarding_tables'
branch_labels = None
depends_on = None


def upgrade():
    # Create reading_progress table
    op.create_table('reading_progress',
        sa.Column('id', sa.String(), nullable=False),
        sa.Column('user_id', sa.String(), nullable=False),
        sa.Column('chapter_id', sa.String(), nullable=False),
        sa.Column('section_id', sa.String(), nullable=False),
        sa.Column('position', sa.Float(), nullable=False),
        sa.Column('completed', sa.Boolean(), nullable=False, server_default='false'),
        sa.Column('time_spent', sa.Integer(), nullable=False, server_default='0'),
        sa.Column('last_accessed', sa.DateTime(), nullable=False),
        sa.Column('created_at', sa.DateTime(), nullable=False),
        sa.Column('updated_at', sa.DateTime(), nullable=False),
        sa.ForeignKeyConstraint(['user_id'], ['users.id'], ondelete='CASCADE'),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('user_id', 'chapter_id', 'section_id')
    )
    op.create_index('idx_reading_progress_user_chapter', 'reading_progress', ['user_id', 'chapter_id'])
    op.create_index('idx_reading_progress_last_accessed', 'reading_progress', ['last_accessed'])

    # Create bookmarks table
    op.create_table('bookmarks',
        sa.Column('id', sa.String(), nullable=False),
        sa.Column('user_id', sa.String(), nullable=False),
        sa.Column('chapter_id', sa.String(), nullable=False),
        sa.Column('section_id', sa.String(), nullable=True),
        sa.Column('page_url', sa.String(), nullable=False),
        sa.Column('page_title', sa.String(length=255), nullable=False),
        sa.Column('snippet', sa.String(), nullable=True),
        sa.Column('note', sa.String(length=1000), nullable=True),
        sa.Column('is_private', sa.Boolean(), nullable=False, server_default='true'),
        sa.Column('created_at', sa.DateTime(), nullable=False),
        sa.Column('updated_at', sa.DateTime(), nullable=False),
        sa.ForeignKeyConstraint(['user_id'], ['users.id'], ondelete='CASCADE'),
        sa.PrimaryKeyConstraint('id')
    )
    op.create_index('idx_bookmarks_user_created', 'bookmarks', ['user_id', 'created_at'])
    op.create_index('idx_bookmarks_chapter', 'bookmarks', ['chapter_id'])

    # Create bookmark_tags table
    op.create_table('bookmark_tags',
        sa.Column('id', sa.String(), nullable=False),
        sa.Column('bookmark_id', sa.String(), nullable=False),
        sa.Column('tag', sa.String(length=50), nullable=False),
        sa.Column('created_at', sa.DateTime(), nullable=False),
        sa.ForeignKeyConstraint(['bookmark_id'], ['bookmarks.id'], ondelete='CASCADE'),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('bookmark_id', 'tag')
    )
    op.create_index('idx_bookmark_tags_tag', 'bookmark_tags', ['tag'])

    # Create user_preferences table
    op.create_table('user_preferences',
        sa.Column('id', sa.String(), nullable=False),
        sa.Column('user_id', sa.String(), nullable=False),
        sa.Column('language', sa.String(), nullable=False),
        sa.Column('reading_pace', sa.String(), nullable=False),
        sa.Column('preferred_depth', sa.String(), nullable=False),
        sa.Column('show_code_examples', sa.Boolean(), nullable=False, server_default='true'),
        sa.Column('adaptive_difficulty', sa.Boolean(), nullable=False, server_default='false'),
        sa.Column('theme', sa.String(), nullable=False),
        sa.Column('font_size', sa.Integer(), nullable=False, server_default='16'),
        sa.Column('line_height', sa.Float(), nullable=False, server_default='1.5'),
        sa.Column('created_at', sa.DateTime(), nullable=False),
        sa.Column('updated_at', sa.DateTime(), nullable=False),
        sa.ForeignKeyConstraint(['user_id'], ['users.id'], ondelete='CASCADE'),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('user_id')
    )

    # Create user_custom_notes table
    op.create_table('user_custom_notes',
        sa.Column('id', sa.String(), nullable=False),
        sa.Column('user_preference_id', sa.String(), nullable=False),
        sa.Column('key', sa.String(), nullable=False),
        sa.Column('value', sa.String(), nullable=False),
        sa.Column('created_at', sa.DateTime(), nullable=False),
        sa.Column('updated_at', sa.DateTime(), nullable=False),
        sa.ForeignKeyConstraint(['user_preference_id'], ['user_preferences.id'], ondelete='CASCADE'),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('user_preference_id', 'key')
    )

    # Create content_localization table
    op.create_table('content_localization',
        sa.Column('id', sa.String(), nullable=False),
        sa.Column('content_id', sa.String(), nullable=False),
        sa.Column('language', sa.String(), nullable=False),
        sa.Column('title', sa.String(length=255), nullable=False),
        sa.Column('content', sa.String(), nullable=False),
        sa.Column('word_count', sa.Integer(), nullable=False, server_default='0'),
        sa.Column('reading_time_minutes', sa.Integer(), nullable=False, server_default='0'),
        sa.Column('last_updated', sa.DateTime(), nullable=False),
        sa.Column('translator', sa.String(), nullable=True),
        sa.Column('reviewed', sa.Boolean(), nullable=False, server_default='false'),
        sa.Column('created_at', sa.DateTime(), nullable=False),
        sa.Column('updated_at', sa.DateTime(), nullable=False),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('content_id', 'language')
    )
    op.create_index('idx_content_localization_language', 'content_localization', ['language'])
    op.create_index('idx_content_localization_content', 'content_localization', ['content_id'])

    # Create search_index table
    op.create_table('search_index',
        sa.Column('id', sa.String(), nullable=False),
        sa.Column('content_id', sa.String(), nullable=False),
        sa.Column('language', sa.String(), nullable=False),
        sa.Column('content_type', sa.String(), nullable=False),
        sa.Column('title', sa.String(), nullable=False),
        sa.Column('content', sa.String(), nullable=False),
        sa.Column('chapter_id', sa.String(), nullable=False),
        sa.Column('section_id', sa.String(), nullable=True),
        sa.Column('rank', sa.Float(), nullable=False, server_default='0.5'),
        sa.Column('indexed_at', sa.DateTime(), nullable=False),
        sa.PrimaryKeyConstraint('id')
    )
    op.create_index('idx_search_index_language_rank', 'search_index', ['language', 'rank'])
    op.create_index('idx_search_index_chapter', 'search_index', ['chapter_id'])

    # Create FTS virtual table for search
    op.execute("""
        CREATE VIRTUAL TABLE search_index_fts USING fts5(
            title,
            content,
            keywords,
            content=search_index
        )
    """)

    # Create FTS triggers
    op.execute("""
        CREATE TRIGGER search_index_ai AFTER INSERT ON search_index BEGIN
            INSERT INTO search_index_fts(rowid, title, content, keywords)
            VALUES (new.id, new.title, new.content, new.title || ' ' || new.content);
        END
    """)

    op.execute("""
        CREATE TRIGGER search_index_ad AFTER DELETE ON search_index BEGIN
            INSERT INTO search_index_fts(search_index_fts, rowid, title, content, keywords)
            VALUES ('delete', old.id, old.title, old.content, NULL);
        END
    """)

    op.execute("""
        CREATE TRIGGER search_index_au AFTER UPDATE ON search_index BEGIN
            DELETE FROM search_index_fts WHERE rowid = old.id;
            INSERT INTO search_index_fts(rowid, title, content, keywords)
            VALUES (new.id, new.title, new.content, new.title || ' ' || new.content);
        END
    """)


def downgrade():
    # Drop tables in reverse order
    op.drop_table('search_index')
    op.execute('DROP TABLE IF EXISTS search_index_fts')
    op.drop_table('content_localization')
    op.drop_table('user_custom_notes')
    op.drop_table('user_preferences')
    op.drop_table('bookmark_tags')
    op.drop_table('bookmarks')
    op.drop_table('reading_progress')
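For context on how the FTS5 virtual table created above is typically queried, here is an illustrative lookup; the project's real search path lives in its service layer, and the database file path is an assumption based on `DATABASE_URL`:

```python
# Illustrative FTS5 full-text query against the search_index_fts table.
import sqlite3

conn = sqlite3.connect("database/auth.db")  # path assumed from DATABASE_URL
rows = conn.execute(
    "SELECT rowid, title FROM search_index_fts WHERE search_index_fts MATCH ? LIMIT 10",
    ("agents",),
).fetchall()
for rowid, title in rows:
    print(rowid, title)
```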
alembic/versions/004_add_translation_tables.py ADDED
@@ -0,0 +1,114 @@
"""Add translation tables and personalization features

Revision ID: 004_add_translation_tables
Revises: 003_reader_features_tables
Create Date: 2025-01-10

"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import sqlite

# revision identifiers
revision = '004_add_translation_tables'
down_revision = '003_reader_features_tables'
branch_labels = None
depends_on = None


def upgrade():
    # Create translations table
    op.create_table('translations',
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('content_hash', sa.String(length=64), nullable=False),
        sa.Column('source_language', sa.String(length=10), nullable=False),
        sa.Column('target_language', sa.String(length=10), nullable=False),
        sa.Column('original_text', sa.Text(), nullable=False),
        sa.Column('translated_text', sa.Text(), nullable=False),
        sa.Column('created_at', sa.DateTime(), nullable=False),
        sa.Column('updated_at', sa.DateTime(), nullable=False),
        sa.Column('translation_model', sa.String(length=50), nullable=False),
        sa.Column('character_count', sa.Integer(), nullable=False),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('content_hash')
    )
    op.create_index('idx_content_lookup', 'translations', ['content_hash', 'source_language', 'target_language'], unique=False)
    op.create_index(op.f('ix_translations_content_hash'), 'translations', ['content_hash'], unique=True)

    # Create translation_feedback table
    op.create_table('translation_feedback',
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('translation_id', sa.Integer(), nullable=False),
        sa.Column('user_id', sa.String(length=36), nullable=False),
        sa.Column('rating', sa.SmallInteger(), nullable=False),
        sa.Column('comment', sa.Text(), nullable=True),
        sa.Column('created_at', sa.DateTime(), nullable=False),
        sa.ForeignKeyConstraint(['translation_id'], ['translations.id']),
        sa.PrimaryKeyConstraint('id'),
        sa.CheckConstraint('rating IN (-1, 1)', name='check_rating_range')
    )
    op.create_index('idx_user_translation', 'translation_feedback', ['user_id', 'translation_id'], unique=True)

    # Create personalization_profiles table
    op.create_table('personalization_profiles',
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('user_id', sa.String(length=36), nullable=False),
        sa.Column('reading_level', sa.String(length=20), nullable=True),
        sa.Column('preferred_language', sa.String(length=10), nullable=True),
        sa.Column('focus_areas', sa.JSON(), nullable=True),
        sa.Column('learning_style', sa.String(length=20), nullable=True),
        sa.Column('enable_transliteration', sa.Boolean(), nullable=True),
        sa.Column('technical_term_handling', sa.String(length=20), nullable=True),
        sa.Column('font_size', sa.Integer(), nullable=True),
        sa.Column('focus_mode_preferences', sa.JSON(), nullable=True),
        sa.Column('created_at', sa.DateTime(), nullable=False),
        sa.Column('updated_at', sa.DateTime(), nullable=False),
        sa.Column('last_active', sa.DateTime(), nullable=False),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('user_id')
    )
    op.create_index(op.f('ix_personalization_profiles_user_id'), 'personalization_profiles', ['user_id'], unique=False)

    # Check if content_localization table exists before creating
    conn = op.get_bind()
    inspector = sa.inspect(conn)
    tables = inspector.get_table_names()

    if 'content_localization' not in tables:
        # Create content_localization table
        op.create_table('content_localization',
            sa.Column('id', sa.Integer(), nullable=False),
            sa.Column('content_url', sa.String(length=500), nullable=False),
            sa.Column('content_hash', sa.String(length=64), nullable=False),
            sa.Column('is_translated', sa.Boolean(), nullable=True),
            sa.Column('last_translation_date', sa.DateTime(), nullable=True),
            sa.Column('translation_cache_key', sa.String(length=64), nullable=True),
            sa.Column('word_count', sa.Integer(), nullable=True),
            sa.Column('character_count', sa.Integer(), nullable=True),
            sa.Column('has_code_blocks', sa.Boolean(), nullable=True),
            sa.Column('detected_languages', sa.JSON(), nullable=True),
            sa.Column('chunk_count', sa.Integer(), nullable=True),
            sa.Column('processing_status', sa.String(length=20), nullable=True),
            sa.Column('created_at', sa.DateTime(), nullable=False),
            sa.Column('updated_at', sa.DateTime(), nullable=False),
            sa.PrimaryKeyConstraint('id')
        )
        op.create_index(op.f('ix_content_localization_content_hash'), 'content_localization', ['content_hash'], unique=False)
        op.create_index(op.f('ix_content_localization_content_url'), 'content_localization', ['content_url'], unique=False)


def downgrade():
    # Drop tables in reverse order
    op.drop_index(op.f('ix_content_localization_content_url'), table_name='content_localization')
    op.drop_index(op.f('ix_content_localization_content_hash'), table_name='content_localization')
    op.drop_table('content_localization')

    op.drop_index(op.f('ix_personalization_profiles_user_id'), table_name='personalization_profiles')
    op.drop_table('personalization_profiles')

    op.drop_index('idx_user_translation', table_name='translation_feedback')
    op.drop_table('translation_feedback')

    op.drop_index(op.f('ix_translations_content_hash'), table_name='translations')
    op.drop_index('idx_content_lookup', table_name='translations')
    op.drop_table('translations')
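These revisions are applied in order by Alembic. Besides the CLI (`alembic upgrade head`), they can be run programmatically; this sketch assumes `alembic.ini` sits in the backend root next to the `alembic/` directory:

```python
# Programmatic equivalent of `alembic upgrade head`.
from alembic import command
from alembic.config import Config

cfg = Config("alembic.ini")
command.upgrade(cfg, "head")
```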
alembic/versions/005_add_openai_translation_tables.py ADDED
@@ -0,0 +1,295 @@
"""Add OpenAI Translation System tables

Revision ID: 005_add_openai_translation_tables
Revises: 004_add_translation_tables
Create Date: 2025-12-12

"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql, sqlite
import uuid

# revision identifiers
revision = '005_add_openai_translation_tables'
down_revision = '004_add_translation_tables'
branch_labels = None
depends_on = None


def upgrade():
    # Create translation_jobs table
    op.create_table('translation_jobs',
        sa.Column('id', sa.UUID(), nullable=False, default=uuid.uuid4),
        sa.Column('job_id', sa.String(length=255), nullable=False),
        sa.Column('user_id', sa.String(length=255), nullable=True),
        sa.Column('session_id', sa.String(length=255), nullable=True),
        sa.Column('page_url', sa.String(length=2048), nullable=True),
        sa.Column('content_hash', sa.String(length=64), nullable=False),
        sa.Column('source_language', sa.String(length=10), nullable=False, default='en'),
        sa.Column('target_language', sa.String(length=10), nullable=False, default='ur'),
        sa.Column('model_name', sa.String(length=100), nullable=False),
        sa.Column('temperature', sa.Float(), nullable=True),
        sa.Column('max_tokens', sa.Integer(), nullable=True),
        sa.Column('original_text', sa.Text(), nullable=False),
        sa.Column('translated_text', sa.Text(), nullable=True),
        sa.Column('status', sa.String(length=20), nullable=False, default='PENDING'),
        sa.Column('chunks_total', sa.Integer(), nullable=False, default=0),
        sa.Column('chunks_completed', sa.Integer(), nullable=False, default=0),
        sa.Column('chunks_failed', sa.Integer(), nullable=False, default=0),
        sa.Column('progress_percentage', sa.Float(), nullable=False, default=0.0),
        sa.Column('input_tokens', sa.Integer(), nullable=False, default=0),
        sa.Column('output_tokens', sa.Integer(), nullable=False, default=0),
        sa.Column('total_tokens', sa.Integer(), nullable=False, default=0),
        sa.Column('estimated_cost_usd', sa.Float(), nullable=False, default=0.0),
        sa.Column('processing_time_ms', sa.Integer(), nullable=False, default=0),
        sa.Column('preserve_code_blocks', sa.Boolean(), nullable=False, default=True),
        sa.Column('enable_transliteration', sa.Boolean(), nullable=False, default=True),
        sa.Column('chunk_size', sa.Integer(), nullable=False, default=2000),
        sa.Column('max_chunks', sa.Integer(), nullable=False, default=100),
        sa.Column('max_retries', sa.Integer(), nullable=False, default=3),
        sa.Column('user_agent', sa.Text(), nullable=True),
        sa.Column('ip_address', sa.String(length=45), nullable=True),
        sa.Column('created_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
        sa.Column('updated_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
        sa.Column('started_at', sa.DateTime(), nullable=True),
        sa.Column('completed_at', sa.DateTime(), nullable=True),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('job_id'),
        sa.CheckConstraint('chunks_total >= 0', name='check_chunks_total_non_negative'),
        sa.CheckConstraint('chunks_completed >= 0', name='check_chunks_completed_non_negative'),
        sa.CheckConstraint('chunks_failed >= 0', name='check_chunks_failed_non_negative'),
        sa.CheckConstraint('progress_percentage >= 0.0 AND progress_percentage <= 100.0', name='check_progress_percentage_range'),
        sa.CheckConstraint('chunk_size > 0', name='check_chunk_size_positive'),
        sa.CheckConstraint('max_chunks > 0', name='check_max_chunks_positive'),
        sa.CheckConstraint('max_retries >= 0', name='check_max_retries_non_negative')
    )
    op.create_index('ix_translation_jobs_job_id', 'translation_jobs', ['job_id'], unique=False)
    op.create_index('ix_translation_jobs_user_id', 'translation_jobs', ['user_id'], unique=False)
    op.create_index('ix_translation_jobs_session_id', 'translation_jobs', ['session_id'], unique=False)
    op.create_index('ix_translation_jobs_page_url', 'translation_jobs', ['page_url'], unique=False)
    op.create_index('ix_translation_jobs_content_hash', 'translation_jobs', ['content_hash'], unique=False)
    op.create_index('ix_translation_jobs_status', 'translation_jobs', ['status'], unique=False)
    op.create_index('ix_translation_jobs_created_at', 'translation_jobs', ['created_at'], unique=False)

    # Create translation_chunks table
    op.create_table('translation_chunks',
        sa.Column('id', sa.UUID(), nullable=False, default=uuid.uuid4),
        sa.Column('job_id', sa.UUID(), nullable=False),
        sa.Column('chunk_index', sa.Integer(), nullable=False),
        sa.Column('original_text', sa.Text(), nullable=False),
        sa.Column('translated_text', sa.Text(), nullable=True),
        sa.Column('status', sa.String(length=20), nullable=False, default='PENDING'),
        sa.Column('retry_count', sa.Integer(), nullable=False, default=0),
        sa.Column('start_position', sa.Integer(), nullable=False),
        sa.Column('end_position', sa.Integer(), nullable=False),
        sa.Column('is_code_block', sa.Boolean(), nullable=False, default=False),
        sa.Column('code_language', sa.String(length=50), nullable=True),
        sa.Column('word_count', sa.Integer(), nullable=False, default=0),
        sa.Column('token_count', sa.Integer(), nullable=False, default=0),
        sa.Column('input_tokens', sa.Integer(), nullable=False, default=0),
        sa.Column('output_tokens', sa.Integer(), nullable=False, default=0),
        sa.Column('processing_time_ms', sa.Integer(), nullable=False, default=0),
        sa.Column('last_error', sa.Text(), nullable=True),
        sa.Column('created_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
        sa.Column('updated_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
        sa.Column('started_at', sa.DateTime(), nullable=True),
        sa.Column('completed_at', sa.DateTime(), nullable=True),
        sa.ForeignKeyConstraint(['job_id'], ['translation_jobs.id'], ondelete='CASCADE'),
        sa.PrimaryKeyConstraint('id'),
        sa.CheckConstraint('chunk_index >= 0', name='check_chunk_index_non_negative'),
        sa.CheckConstraint('start_position >= 0', name='check_start_position_non_negative'),
        sa.CheckConstraint('end_position >= start_position', name='check_end_position_after_start'),
        sa.CheckConstraint('word_count >= 0', name='check_word_count_non_negative'),
        sa.CheckConstraint('token_count >= 0', name='check_token_count_non_negative'),
        sa.CheckConstraint('retry_count >= 0', name='check_retry_count_non_negative'),
        sa.UniqueConstraint('job_id', 'chunk_index', name='uq_job_chunk_index')
    )
    op.create_index('ix_translation_chunks_job_id', 'translation_chunks', ['job_id'], unique=False)
    op.create_index('ix_translation_chunks_status', 'translation_chunks', ['status'], unique=False)
    op.create_index('ix_translation_chunks_is_code_block', 'translation_chunks', ['is_code_block'], unique=False)

    # Create translation_cache table
    op.create_table('translation_cache',
        sa.Column('id', sa.UUID(), nullable=False, default=uuid.uuid4),
        sa.Column('cache_key', sa.String(length=255), nullable=False),
        sa.Column('job_id', sa.UUID(), nullable=True),
        sa.Column('content_hash', sa.String(length=64), nullable=False),
        sa.Column('page_url', sa.String(length=2048), nullable=True),
        sa.Column('url_hash', sa.String(length=32), nullable=True),
        sa.Column('source_language', sa.String(length=10), nullable=False),
        sa.Column('target_language', sa.String(length=10), nullable=False),
        sa.Column('original_text', sa.Text(), nullable=False),
        sa.Column('translated_text', sa.Text(), nullable=False),
        sa.Column('model_version', sa.String(length=100), nullable=True),
        sa.Column('processing_time_ms', sa.Integer(), nullable=False, default=0),
        sa.Column('translation_metadata', sa.JSON(), nullable=True),
        sa.Column('quality_score', sa.Float(), nullable=True),
        sa.Column('confidence_score', sa.Float(), nullable=True),
        sa.Column('is_validated', sa.Boolean(), nullable=False, default=False),
        sa.Column('hit_count', sa.Integer(), nullable=False, default=0),
        sa.Column('last_hit_at', sa.DateTime(), nullable=True),
        sa.Column('ttl_hours', sa.Integer(), nullable=False, default=24),
        sa.Column('priority', sa.String(length=10), nullable=False, default='MEDIUM'),
        sa.Column('expires_at', sa.DateTime(), nullable=False),
        sa.Column('created_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
        sa.Column('updated_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
        sa.ForeignKeyConstraint(['job_id'], ['translation_jobs.id']),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('cache_key'),
        sa.CheckConstraint('quality_score >= 0.0 AND quality_score <= 5.0', name='check_quality_score_range'),
        sa.CheckConstraint('confidence_score >= 0.0 AND confidence_score <= 1.0', name='check_confidence_score_range'),
        sa.CheckConstraint('hit_count >= 0', name='check_hit_count_non_negative'),
        sa.CheckConstraint('ttl_hours > 0', name='check_ttl_hours_positive')
    )
    op.create_index('ix_translation_cache_cache_key', 'translation_cache', ['cache_key'], unique=False)
    op.create_index('ix_translation_cache_content_hash', 'translation_cache', ['content_hash'], unique=False)
    op.create_index('ix_translation_cache_page_url', 'translation_cache', ['page_url'], unique=False)
    op.create_index('ix_translation_cache_url_hash', 'translation_cache', ['url_hash'], unique=False)
    op.create_index('ix_translation_cache_expires_at', 'translation_cache', ['expires_at'], unique=False)
    op.create_index('ix_translation_cache_priority', 'translation_cache', ['priority'], unique=False)

    # Create translation_errors table
    op.create_table('translation_errors',
        sa.Column('id', sa.UUID(), nullable=False, default=uuid.uuid4),
        sa.Column('error_id', sa.String(length=255), nullable=False),
        sa.Column('job_id', sa.UUID(), nullable=True),
        sa.Column('chunk_id', sa.UUID(), nullable=True),
        sa.Column('error_type', sa.String(length=50), nullable=False),
        sa.Column('error_code', sa.String(length=100), nullable=True),
        sa.Column('error_message', sa.Text(), nullable=False),
        sa.Column('error_details', sa.JSON(), nullable=True),
        sa.Column('severity', sa.String(length=10), nullable=False),
        sa.Column('category', sa.String(length=50), nullable=False, default='translation'),
        sa.Column('is_retriable', sa.Boolean(), nullable=False, default=True),
        sa.Column('retry_count', sa.Integer(), nullable=False, default=0),
        sa.Column('max_retries', sa.Integer(), nullable=False, default=3),
        sa.Column('next_retry_at', sa.DateTime(), nullable=True),
        sa.Column('is_resolved', sa.Boolean(), nullable=False, default=False),
        sa.Column('resolution_notes', sa.Text(), nullable=True),
        sa.Column('resolved_at', sa.DateTime(), nullable=True),
        sa.Column('created_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
        sa.Column('updated_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
        sa.ForeignKeyConstraint(['job_id'], ['translation_jobs.id']),
        sa.ForeignKeyConstraint(['chunk_id'], ['translation_chunks.id']),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('error_id'),
        sa.CheckConstraint('retry_count >= 0', name='check_error_retry_count_non_negative'),
        sa.CheckConstraint('max_retries >= 0', name='check_error_max_retries_non_negative')
    )
    op.create_index('ix_translation_errors_error_id', 'translation_errors', ['error_id'], unique=False)
    op.create_index('ix_translation_errors_job_id', 'translation_errors', ['job_id'], unique=False)
    op.create_index('ix_translation_errors_chunk_id', 'translation_errors', ['chunk_id'], unique=False)
    op.create_index('ix_translation_errors_error_type', 'translation_errors', ['error_type'], unique=False)
    op.create_index('ix_translation_errors_severity', 'translation_errors', ['severity'], unique=False)
    op.create_index('ix_translation_errors_created_at', 'translation_errors', ['created_at'], unique=False)

    # Create translation_sessions table
    op.create_table('translation_sessions',
        sa.Column('id', sa.UUID(), nullable=False, default=uuid.uuid4),
        sa.Column('session_id', sa.String(length=255), nullable=False),
        sa.Column('user_id', sa.String(length=255), nullable=True),
        sa.Column('source_language', sa.String(length=10), nullable=False, default='en'),
        sa.Column('target_language', sa.String(length=10), nullable=False, default='ur'),
        sa.Column('preferred_model', sa.String(length=100), nullable=True),
        sa.Column('request_count', sa.Integer(), nullable=False, default=0),
        sa.Column('character_count', sa.Integer(), nullable=False, default=0),
        sa.Column('total_cost_usd', sa.Float(), nullable=False, default=0.0),
        sa.Column('session_data', sa.JSON(), nullable=True),
        sa.Column('user_agent', sa.Text(), nullable=True),
        sa.Column('ip_address', sa.String(length=45), nullable=True),
        sa.Column('created_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
        sa.Column('updated_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
        sa.Column('last_activity_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
        sa.Column('expires_at', sa.DateTime(), nullable=False),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('session_id'),
        sa.CheckConstraint('request_count >= 0', name='check_session_request_count_non_negative'),
        sa.CheckConstraint('character_count >= 0', name='check_session_character_count_non_negative'),
        sa.CheckConstraint('total_cost_usd >= 0.0', name='check_session_total_cost_non_negative')
    )
    op.create_index('ix_translation_sessions_session_id', 'translation_sessions', ['session_id'], unique=False)
    op.create_index('ix_translation_sessions_user_id', 'translation_sessions', ['user_id'], unique=False)
    op.create_index('ix_translation_sessions_expires_at', 'translation_sessions', ['expires_at'], unique=False)

    # Create translation_metrics table
    op.create_table('translation_metrics',
        sa.Column('id', sa.UUID(), nullable=False, default=uuid.uuid4),
        sa.Column('metric_date', sa.DateTime(), nullable=False),
        sa.Column('period_type', sa.String(length=20), nullable=False, default='daily'),
        sa.Column('total_requests', sa.Integer(), nullable=False, default=0),
        sa.Column('successful_requests', sa.Integer(), nullable=False, default=0),
        sa.Column('failed_requests', sa.Integer(), nullable=False, default=0),
        sa.Column('cached_requests', sa.Integer(), nullable=False, default=0),
        sa.Column('avg_processing_time_ms', sa.Float(), nullable=False, default=0.0),
        sa.Column('p95_processing_time_ms', sa.Float(), nullable=False, default=0.0),
        sa.Column('p99_processing_time_ms', sa.Float(), nullable=False, default=0.0),
        sa.Column('total_characters', sa.Integer(), nullable=False, default=0),
        sa.Column('total_tokens', sa.Integer(), nullable=False, default=0),
        sa.Column('total_cost_usd', sa.Float(), nullable=False, default=0.0),
        sa.Column('avg_quality_score', sa.Float(), nullable=True),
        sa.Column('cache_hit_rate', sa.Float(), nullable=False, default=0.0),
        sa.Column('error_rate', sa.Float(), nullable=False, default=0.0),
        sa.Column('top_error_types', sa.JSON(), nullable=True),
        sa.Column('language_pairs', sa.JSON(), nullable=True),
        sa.Column('created_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
        sa.Column('updated_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
        sa.PrimaryKeyConstraint('id'),
        sa.CheckConstraint('total_requests >= 0', name='check_metrics_total_requests_non_negative'),
        sa.CheckConstraint('successful_requests >= 0', name='check_metrics_successful_requests_non_negative'),
        sa.CheckConstraint('failed_requests >= 0', name='check_metrics_failed_requests_non_negative'),
        sa.CheckConstraint('cached_requests >= 0', name='check_metrics_cached_requests_non_negative'),
        sa.CheckConstraint('total_characters >= 0', name='check_metrics_total_characters_non_negative'),
        sa.CheckConstraint('total_tokens >= 0', name='check_metrics_total_tokens_non_negative'),
        sa.CheckConstraint('total_cost_usd >= 0.0', name='check_metrics_total_cost_non_negative'),
        sa.CheckConstraint('avg_processing_time_ms >= 0.0', name='check_metrics_avg_processing_time_non_negative'),
        sa.CheckConstraint('p95_processing_time_ms >= 0.0', name='check_metrics_p95_processing_time_non_negative'),
        sa.CheckConstraint('p99_processing_time_ms >= 0.0', name='check_metrics_p99_processing_time_non_negative'),
        sa.CheckConstraint('cache_hit_rate >= 0.0 AND cache_hit_rate <= 1.0', name='check_metrics_cache_hit_rate_range'),
        sa.CheckConstraint('error_rate >= 0.0 AND error_rate <= 1.0', name='check_metrics_error_rate_range'),
        sa.UniqueConstraint('metric_date', 'period_type', name='uq_metrics_date_period')
    )
    op.create_index('ix_translation_metrics_metric_date', 'translation_metrics', ['metric_date'], unique=False)
    op.create_index('ix_translation_metrics_period_type', 'translation_metrics', ['period_type'], unique=False)


def downgrade():
    # Drop tables in reverse order
    op.drop_index('ix_translation_metrics_period_type', table_name='translation_metrics')
    op.drop_index('ix_translation_metrics_metric_date', table_name='translation_metrics')
    op.drop_table('translation_metrics')

    op.drop_index('ix_translation_sessions_expires_at', table_name='translation_sessions')
    op.drop_index('ix_translation_sessions_user_id', table_name='translation_sessions')
    op.drop_index('ix_translation_sessions_session_id', table_name='translation_sessions')
    op.drop_table('translation_sessions')

    op.drop_index('ix_translation_errors_created_at', table_name='translation_errors')
    op.drop_index('ix_translation_errors_severity', table_name='translation_errors')
    op.drop_index('ix_translation_errors_error_type', table_name='translation_errors')
    op.drop_index('ix_translation_errors_chunk_id', table_name='translation_errors')
    op.drop_index('ix_translation_errors_job_id', table_name='translation_errors')
    op.drop_index('ix_translation_errors_error_id', table_name='translation_errors')
    op.drop_table('translation_errors')

    op.drop_index('ix_translation_cache_priority', table_name='translation_cache')
    op.drop_index('ix_translation_cache_expires_at', table_name='translation_cache')
    op.drop_index('ix_translation_cache_url_hash', table_name='translation_cache')
    op.drop_index('ix_translation_cache_page_url', table_name='translation_cache')
    op.drop_index('ix_translation_cache_content_hash', table_name='translation_cache')
    op.drop_index('ix_translation_cache_cache_key', table_name='translation_cache')
    op.drop_table('translation_cache')

    op.drop_index('ix_translation_chunks_is_code_block', table_name='translation_chunks')
    op.drop_index('ix_translation_chunks_status', table_name='translation_chunks')
    op.drop_index('ix_translation_chunks_job_id', table_name='translation_chunks')
    op.drop_table('translation_chunks')

    op.drop_index('ix_translation_jobs_created_at', table_name='translation_jobs')
    op.drop_index('ix_translation_jobs_status', table_name='translation_jobs')
    op.drop_index('ix_translation_jobs_content_hash', table_name='translation_jobs')
    op.drop_index('ix_translation_jobs_page_url', table_name='translation_jobs')
    op.drop_index('ix_translation_jobs_session_id', table_name='translation_jobs')
    op.drop_index('ix_translation_jobs_user_id', table_name='translation_jobs')
    op.drop_index('ix_translation_jobs_job_id', table_name='translation_jobs')
    op.drop_table('translation_jobs')
create_translation_tables.py ADDED
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+"""
+Create translation tables in the database.
+"""
+
+import sys
+import os
+from pathlib import Path
+
+# Add backend to path
+backend_path = Path(__file__).parent
+sys.path.insert(0, str(backend_path))
+
+from src.database.base import engine, Base
+from src.models import *  # Import all models
+
+def create_tables():
+    """Create all tables in the database."""
+    try:
+        # Import models to register them
+        from src.models.auth import User
+        from src.models.translation_openai import (
+            TranslationJob, TranslationChunk, TranslationError,
+            TranslationSession, TranslationCache, TranslationMetrics
+        )
+
+        # Create all tables
+        Base.metadata.create_all(bind=engine)
+        print("Translation tables created successfully!")
+
+        # List created tables
+        from sqlalchemy import inspect
+        inspector = inspect(engine)
+        tables = inspector.get_table_names()
+
+        print("\nAvailable tables:")
+        for table in sorted(tables):
+            if 'translation' in table.lower():
+                print(f"  - {table}")
+
+    except Exception as e:
+        print(f"Error creating tables: {e}")
+        import traceback
+        traceback.print_exc()
+
+if __name__ == "__main__":
+    create_tables()
fix_async_client.py ADDED
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+"""
+Fix the async client initialization in get_translation_service().
+"""
+
+import sys
+import os
+from pathlib import Path
+
+# Add backend to path
+backend_path = Path(__file__).parent
+sys.path.insert(0, str(backend_path))
+
+# Read the service.py file
+file_path = backend_path / "src" / "services" / "openai_translation" / "service.py"
+content = file_path.read_text(encoding='utf-8')
+
+# Find and replace the get_translation_service function
+old_function = """async def get_translation_service() -> OpenAITranslationService:
+    \"\"\"Get or create OpenAI translation service instance.\"\"\"
+    global _translation_service
+
+    if _translation_service is None:
+        _translation_service = OpenAITranslationService()
+
+    return _translation_service"""
+
+new_function = """async def get_translation_service() -> OpenAITranslationService:
+    \"\"\"Get or create OpenAI translation service instance.\"\"\"
+    global _translation_service
+
+    if _translation_service is None:
+        _translation_service = OpenAITranslationService()
+        # Initialize the async client
+        _translation_service.gemini_client = await get_gemini_client()
+
+    return _translation_service"""
+
+content = content.replace(old_function, new_function)
+
+# Write back to file
+file_path.write_text(content, encoding='utf-8')
+
+print("Fixed async client initialization in get_translation_service()")
fix_jsonb.py ADDED
@@ -0,0 +1,28 @@
+#!/usr/bin/env python
+"""
+Replace JSONB with JSON for SQLite compatibility.
+"""
+
+import sys
+import os
+from pathlib import Path
+
+# Add backend to path
+backend_path = Path(__file__).parent
+sys.path.insert(0, str(backend_path))
+
+# Read the translation_openai.py file
+model_file = backend_path / "src" / "models" / "translation_openai.py"
+content = model_file.read_text(encoding='utf-8')
+
+# Fix the import line first: the blanket JSONB -> JSON replacement below
+# would rewrite this line before the pattern could match, leaving a stray
+# postgresql JSON import behind.
+content = content.replace('from sqlalchemy.dialects.postgresql import UUID, JSONB',
+                          'from sqlalchemy.dialects.postgresql import UUID')
+
+# Replace all remaining JSONB column types with JSON
+content = content.replace('JSONB', 'JSON')
+
+# Write back to file
+model_file.write_text(content, encoding='utf-8')
+
+print("Fixed JSONB to JSON conversion")
fix_translation_endpoint.py ADDED
@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+"""
+Fix translation endpoint to handle User objects properly.
+"""
+
+import sys
+import os
+from pathlib import Path
+
+# Add backend to path
+backend_path = Path(__file__).parent
+sys.path.insert(0, str(backend_path))
+
+# Read the translation.py file
+file_path = backend_path / "src" / "api" / "v1" / "translation.py"
+content = file_path.read_text(encoding='utf-8')
+
+# Add User import
+if "from src.models.auth import User" not in content:
+    # Add User import after other imports
+    content = content.replace(
+        "from src.security.dependencies import get_current_user_or_anonymous",
+        "from src.security.dependencies import get_current_user_or_anonymous\nfrom src.models.auth import User"
+    )
+
+# Fix type hints
+content = content.replace(
+    "current_user: Optional[Dict] = Depends(get_current_user_or_anonymous),",
+    "current_user: Optional[User] = Depends(get_current_user_or_anonymous),"
+)
+
+# Fix current_user.get() calls
+content = content.replace(
+    'current_user.get("id") if current_user else None',
+    'current_user.id if current_user else None'
+)
+content = content.replace(
+    'current_user.get("is_admin", False)',
+    'getattr(current_user, "is_admin", False)'
+)
+
+# Write back to file
+file_path.write_text(content, encoding='utf-8')
+
+print("Fixed translation endpoint to handle User objects")
fix_user_id_issue.py ADDED
@@ -0,0 +1,34 @@
+#!/usr/bin/env python
+"""
+Fix the user_id issue in translation service.
+The User.id is a string but the foreign key expects a UUID.
+"""
+
+import sys
+import os
+from pathlib import Path
+
+# Add backend to path
+backend_path = Path(__file__).parent
+sys.path.insert(0, str(backend_path))
+
+# Read the translation_openai.py file
+file_path = backend_path / "src" / "models" / "translation_openai.py"
+content = file_path.read_text(encoding='utf-8')
+
+# Change user_id from UUID to String to match the User model.
+# str.replace() rewrites every occurrence, so this single call also covers
+# the TranslationSession and TranslationMetrics user_id columns.
+content = content.replace(
+    'user_id = Column(UUID(as_uuid=True), ForeignKey("users.id"), nullable=True, index=True)',
+    'user_id = Column(String(36), ForeignKey("users.id"), nullable=True, index=True)'
+)
+
+# Write back to file
+file_path.write_text(content, encoding='utf-8')
+
+print("Fixed user_id to use String instead of UUID to match User.id field")
fix_user_model.py ADDED
@@ -0,0 +1,53 @@
+#!/usr/bin/env python
+"""
+Fix the User model to add translation relationships.
+"""
+
+import sys
+import os
+from pathlib import Path
+
+# Add backend to path
+backend_path = Path(__file__).parent
+sys.path.insert(0, str(backend_path))
+
+# Read the auth.py file
+auth_file = backend_path / "src" / "models" / "auth.py"
+content = auth_file.read_text(encoding='utf-8')
+
+# Find the User model's relationships section
+rel_start = content.find("    # Relationships")
+if rel_start == -1:
+    print("Could not find relationships section in User model")
+    sys.exit(1)
+
+# Find where the relationships end
+relationships_end = content.find("\n\n", rel_start)
+if relationships_end == -1:
+    relationships_end = content.find("\nclass", rel_start)
+
+if relationships_end == -1:
+    print("Could not find end of relationships section")
+    sys.exit(1)
+
+# Extract the relationships section
+relationships_section = content[rel_start:relationships_end]
+
+# Check if translation relationships already exist
+if "translation_jobs" in relationships_section:
+    print("Translation relationships already exist in User model")
+else:
+    # Add the translation relationships (rstrip() guarantees no trailing
+    # newline, so one is always appended before the new lines)
+    new_relationships = relationships_section.rstrip() + '\n'
+    new_relationships += """    translation_jobs = relationship("TranslationJob", back_populates="user", cascade="all, delete-orphan")
+    translation_sessions = relationship("TranslationSession", back_populates="user", cascade="all, delete-orphan")
+    translation_metrics = relationship("TranslationMetrics", back_populates="user", cascade="all, delete-orphan")"""
+
+    # Replace the old relationships section with the new one
+    new_content = content[:rel_start] + new_relationships + content[relationships_end:]
+
+    # Write back to file
+    auth_file.write_text(new_content, encoding='utf-8')
+    print("✅ Added translation relationships to User model")
main.py CHANGED
@@ -25,6 +25,7 @@ from rag.chat import ChatHandler
 from rag.qdrant_client import QdrantManager
 from rag.tasks import TaskManager
 from api.exceptions import ContentNotFoundError, RAGException
+from src.services.translation_cache import cache_service
 
 # Import security middleware
 from middleware.csrf import CSRFMiddleware
@@ -60,6 +61,7 @@ logger = structlog.get_logger()
 
 # Load environment variables
 load_dotenv()
+print(f"*** Environment loaded. GEMINI_API_KEY exists: {bool(os.getenv('GEMINI_API_KEY'))} ***")
 
 
 class Settings(BaseSettings):
@@ -91,12 +93,15 @@ class Settings(BaseSettings):
     # CORS Configuration
     allowed_origins: str = os.getenv(
         "ALLOWED_ORIGINS",
-        "http://localhost:3000,http://localhost:8080,https://mrowaisabdullah.github.io,https://huggingface.co"
+        "http://localhost:3000,http://localhost:3001,http://localhost:8080,https://mrowaisabdullah.github.io,https://huggingface.co"
     )
 
     # JWT Configuration
    jwt_secret_key: str = os.getenv("JWT_SECRET_KEY", "your-super-secret-jwt-key")
 
+    # Google AI Configuration
+    google_ai_api_key: str = os.getenv("GEMINI_API_KEY", "")
+
     # Conversation Context
     max_context_messages: int = int(os.getenv("MAX_CONTEXT_MESSAGES", "3"))
     context_window_size: int = int(os.getenv("CONTEXT_WINDOW_SIZE", "4000"))
@@ -182,6 +187,9 @@ async def lifespan(app: FastAPI):
     )
     await task_manager.start()
 
+    # Start background task for cache cleanup (runs daily)
+    asyncio.create_task(schedule_cache_cleanup())
+
     logger.info("RAG backend initialized successfully")
 
     yield
@@ -237,13 +245,13 @@ app.add_middleware(
     httponly=False,
     samesite="lax",
     max_age=3600,
-    exempt_paths=["/health", "/docs", "/openapi.json", "/ingest/status", "/collections", "/auth/login", "/auth/register", "/api/chat", "/auth/logout", "/auth/me", "/auth/preferences", "/auth/refresh"],
+    exempt_paths=["/health", "/docs", "/openapi.json", "/ingest/status", "/collections", "/auth/login", "/auth/register", "/api/chat", "/auth/logout", "/auth/me", "/auth/preferences", "/auth/refresh", "/api/v1/translation"],
 )
 
 app.add_middleware(
     AuthMiddleware,
     anonymous_limit=3,
-    exempt_paths=["/health", "/docs", "/openapi.json", "/ingest/status", "/collections", "/auth"],
+    exempt_paths=["/health", "/docs", "/openapi.json", "/ingest/status", "/collections", "/auth", "/api/v1/translation"],
     anonymous_header="X-Anonymous-Session-ID",
 )
 
@@ -253,6 +261,14 @@ app.include_router(auth.router)
 # Include new chat routes
 app.include_router(chat.router)
 
+# Include reader features routes
+from src.api.v1 import reader_features
+app.include_router(reader_features.router, prefix="/api/v1")
+
+# Include translation routes
+from src.api.v1 import translation
+app.include_router(translation.router, prefix="/api/v1")
+
 
 # Optional API key security for higher rate limits
 security = HTTPBearer(auto_error=False)
@@ -887,6 +903,45 @@ async def create_chatkit_session(request: Request):
     # raise HTTPException(status_code=500, detail=f"ChatKit processing error: {str(e)}")
 
 
+async def schedule_cache_cleanup():
+    """
+    Schedule periodic cache cleanup task.
+    Runs every 24 hours to clear expired translation cache entries.
+    """
+    # Use structlog here: the keyword arguments on the log calls below are
+    # structured fields, which the stdlib logging.Logger API would reject.
+    cache_logger = structlog.get_logger(__name__)
+
+    while True:
+        try:
+            # Wait for 24 hours
+            await asyncio.sleep(86400)  # 24 hours in seconds
+
+            # Clean up expired cache entries
+            cleared_count = await cache_service.clear_expired_cache()
+
+            if cleared_count > 0:
+                cache_logger.info(
+                    "Cache cleanup completed",
+                    cleared_entries=cleared_count,
+                    timestamp=datetime.utcnow().isoformat()
+                )
+            else:
+                cache_logger.debug(
+                    "Cache cleanup completed - no expired entries found",
+                    timestamp=datetime.utcnow().isoformat()
+                )
+
+        except Exception as e:
+            cache_logger.error(
+                "Cache cleanup failed",
+                error=str(e),
+                timestamp=datetime.utcnow().isoformat()
+            )
+            # Wait 1 hour before retrying on error
+            await asyncio.sleep(3600)
+
+
 if __name__ == "__main__":
     import uvicorn
 
migrate_user_id.py ADDED
@@ -0,0 +1,63 @@
+#!/usr/bin/env python
+"""
+Migration script to change user_id from UUID to String in translation tables.
+"""
+
+import sys
+import os
+from pathlib import Path
+
+# Add backend to path
+backend_path = Path(__file__).parent
+sys.path.insert(0, str(backend_path))
+
+from sqlalchemy import text
+from src.database.base import engine
+
+def migrate_user_id_columns():
+    """Migrate user_id columns from UUID to String in translation tables."""
+
+    # Tables to modify
+    tables = [
+        'translation_jobs',
+        'translation_sessions',
+        'translation_metrics'
+    ]
+
+    with engine.connect() as connection:
+        # Begin transaction
+        trans = connection.begin()
+
+        try:
+            for table in tables:
+                print(f"Migrating {table}...")
+
+                # SQLite doesn't support ALTER COLUMN directly, so we need to:
+                # 1. Create new table with correct schema
+                # 2. Copy data
+                # 3. Drop old table
+                # 4. Rename new table
+
+                # For simplicity, let's just create new tables and drop the old ones
+                # since this is still development
+                connection.execute(text(f"DROP TABLE IF EXISTS {table}"))
+                print(f"  - Dropped {table}")
+
+            # Commit transaction
+            trans.commit()
+            print("\nMigration successful!")
+
+            # Recreate tables
+            from src.models import *  # Import all models
+            from src.database.base import Base
+            Base.metadata.create_all(bind=engine)
+            print("\nTables recreated with new schema!")
+
+        except Exception as e:
+            # Rollback on error
+            trans.rollback()
+            print(f"\nMigration failed: {e}")
+            raise
+
+if __name__ == "__main__":
+    migrate_user_id_columns()
migrate_user_id_fixed.py ADDED
@@ -0,0 +1,53 @@
+#!/usr/bin/env python
+"""
+Migration script to change user_id from UUID to String in translation tables.
+"""
+
+import sys
+import os
+from pathlib import Path
+
+# Add backend to path
+backend_path = Path(__file__).parent
+sys.path.insert(0, str(backend_path))
+
+from sqlalchemy import text
+from src.database.base import engine, Base
+from src.models import *  # Import all models
+
+def migrate_user_id_columns():
+    """Migrate user_id columns from UUID to String in translation tables."""
+
+    # Tables to modify
+    tables = [
+        'translation_jobs',
+        'translation_sessions',
+        'translation_metrics'
+    ]
+
+    with engine.connect() as connection:
+        # Begin transaction
+        trans = connection.begin()
+
+        try:
+            for table in tables:
+                print(f"Dropping {table}...")
+                connection.execute(text(f"DROP TABLE IF EXISTS {table}"))
+
+            # Commit transaction
+            trans.commit()
+            print("\nDropped all translation tables successfully!")
+
+        except Exception as e:
+            # Rollback on error
+            trans.rollback()
+            print(f"\nMigration failed: {e}")
+            raise
+
+    # Recreate tables with new schema
+    print("Recreating tables with new schema...")
+    Base.metadata.create_all(bind=engine)
+    print("\nMigration completed successfully!")
+
+if __name__ == "__main__":
+    migrate_user_id_columns()
migration_summary_translation_tables.md ADDED
@@ -0,0 +1,124 @@
+# Database Migration: Translation Tables (Phase 2, Task T010)
+
+## Overview
+Created Alembic migration `004_add_translation_tables.py` to add support for translation features, user feedback, personalization, and content localization.
+
+## Migration Details
+- **Revision ID**: `004_add_translation_tables`
+- **Revises**: `003_reader_features_tables`
+- **File**: `backend/alembic/versions/004_add_translation_tables.py`
+
+## Tables Created
+
+### 1. `translations` Table
+Stores cached translations with content hashing for deduplication.
+
+**Columns:**
+- `id` (Integer, Primary Key)
+- `content_hash` (String(64), Unique, Indexed) - SHA-256 hash for deduplication
+- `source_language` (String(10)) - Source language code
+- `target_language` (String(10)) - Target language code
+- `original_text` (Text) - Original text to translate
+- `translated_text` (Text) - Translated text
+- `created_at` (DateTime) - Creation timestamp
+- `updated_at` (DateTime) - Last update timestamp
+- `translation_model` (String(50)) - Model used for translation (e.g., "gemini-1.5-pro")
+- `character_count` (Integer) - Character count of the text
+
+**Indexes:**
+- Unique index on `content_hash`
+- Composite index `idx_content_lookup` on (`content_hash`, `source_language`, `target_language`)
+
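+For illustration, the deduplication key could be computed like this (an editor's sketch, not code from the migration; it assumes only the `content_hash` column and the `idx_content_lookup` index described above):
+
+```python
+import hashlib
+
+def translation_lookup_key(text: str, source: str, target: str) -> tuple:
+    """Build the (content_hash, source_language, target_language) lookup tuple."""
+    # SHA-256 hex digest is 64 characters, matching content_hash String(64)
+    content_hash = hashlib.sha256(text.encode("utf-8")).hexdigest()
+    return (content_hash, source, target)
+```
+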
+### 2. `translation_feedback` Table
+Stores user feedback on translations for quality improvement.
+
+**Columns:**
+- `id` (Integer, Primary Key)
+- `translation_id` (Integer, Foreign Key → translations.id)
+- `user_id` (String(36)) - User UUID from auth system
+- `rating` (SmallInteger) - -1 (downvote) or 1 (upvote)
+- `comment` (Text, Optional) - User comment on the translation
+- `created_at` (DateTime) - Feedback timestamp
+
+**Constraints:**
+- Check constraint: `rating IN (-1, 1)`
+- Unique composite index on (`user_id`, `translation_id`) - One feedback per user per translation
+
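+Because of that unique index, recording feedback can be written as an upsert. A minimal sketch (an editor's illustration, not project code; it assumes only the table and index above and SQLite 3.24+ or PostgreSQL):
+
+```python
+import sqlite3
+
+def upsert_feedback(conn, translation_id, user_id, rating, comment=None):
+    """Insert a rating, or replace this user's earlier rating for the same translation."""
+    conn.execute(
+        """
+        INSERT INTO translation_feedback (translation_id, user_id, rating, comment)
+        VALUES (?, ?, ?, ?)
+        ON CONFLICT (user_id, translation_id)
+        DO UPDATE SET rating = excluded.rating, comment = excluded.comment
+        """,
+        (translation_id, user_id, rating, comment),
+    )
+```
+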
+### 3. `personalization_profiles` Table
+Stores user preferences for personalized content delivery.
+
+**Columns:**
+- `id` (Integer, Primary Key)
+- `user_id` (String(36), Unique, Indexed) - User UUID
+- `reading_level` (Enum: 'beginner', 'intermediate', 'advanced')
+- `preferred_language` (String(10)) - User's preferred language
+- `focus_areas` (JSON) - Array of topics the user cares about
+- `learning_style` (Enum: 'visual', 'practical', 'theoretical', 'balanced')
+- `enable_transliteration` (Boolean) - Whether to show transliterations
+- `technical_term_handling` (Enum: 'translate', 'transliterate', 'keep_english')
+- `font_size` (Integer) - Preferred font size
+- `focus_mode_preferences` (JSON) - Preferences for focus mode
+- `created_at` (DateTime)
+- `updated_at` (DateTime)
+- `last_active` (DateTime)
+
+### 4. `content_localization` Table (Conditional Creation)
+Tracks translation status and metadata for content pages.
+This table is only created if it doesn't already exist.
+
+**Columns:**
+- `id` (Integer, Primary Key)
+- `content_url` (String(500), Indexed) - URL of the content page
+- `content_hash` (String(64), Indexed) - Content hash for change detection
+- `is_translated` (Boolean) - Whether the content has been translated
+- `last_translation_date` (DateTime) - When the translation was last updated
+- `translation_cache_key` (String(64)) - Cache key for translations
+- `word_count` (Integer) - Number of words in the content
+- `character_count` (Integer) - Number of characters
+- `has_code_blocks` (Boolean) - Whether the content contains code blocks
+- `detected_languages` (JSON) - Array of detected languages in the content
+- `chunk_count` (Integer) - Number of chunks for processing
+- `processing_status` (Enum: 'pending', 'processing', 'completed', 'failed', 'partial')
+- `created_at` (DateTime)
+- `updated_at` (DateTime)
+
+**Indexes:**
+- Index on `content_hash`
+- Index on `content_url`
+
+## Database Compatibility
+The migration is designed to work with SQLite (the current database) and is compatible with PostgreSQL as well.
+
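+If PostgreSQL-native types are wanted later without breaking SQLite, SQLAlchemy's `with_variant` is one option (an editor's sketch of the general technique; this migration itself sticks to plain JSON and string-backed enums):
+
+```python
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+# Plain JSON on SQLite, upgraded to JSONB when running on PostgreSQL
+json_type = sa.JSON().with_variant(postgresql.JSONB(), "postgresql")
+
+focus_areas = sa.Column("focus_areas", json_type, nullable=True)
+```
+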
+## Foreign Key Relationships
+- `translation_feedback.translation_id` → `translations.id`
+- (Other foreign keys would point to the `users` table from the auth system)
+
+## Migration Usage
+
+### To apply the migration:
+```bash
+cd backend
+alembic upgrade head
+```
+
+### To revert the migration:
+```bash
+cd backend
+alembic downgrade -1
+```
+
+### To check current status:
+```bash
+cd backend
+alembic current
+```
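+
+To preview the generated SQL without touching the database, Alembic's offline mode can be used:
+```bash
+cd backend
+alembic upgrade head --sql
+```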
+
+## Notes
+1. The migration uses SQLite-compatible syntax but will work with PostgreSQL
+2. Enum types are stored as strings with length constraints for compatibility
+3. JSON fields rely on SQLite's built-in JSON1 functions (compiled in by default since SQLite 3.38)
+4. The content_localization existence check prevents errors if the table already exists
+
+## Updated Files
+1. `backend/alembic/versions/004_add_translation_tables.py` - Main migration file
+2. `backend/alembic/env.py` - Updated to import new models for metadata registration
migrations/versions/001_create_openai_translation_tables.py ADDED
@@ -0,0 +1,297 @@
+"""Create OpenAI translation system tables
+
+Revision ID: 001_create_openai_translation_tables
+Revises:
+Create Date: 2024-01-12 12:00:00.000000
+
+"""
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+# revision identifiers, used by Alembic.
+revision = '001_create_openai_translation_tables'
+down_revision = None
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # Create translation_jobs table
+    op.create_table('translation_jobs',
+        sa.Column('id', postgresql.UUID(as_uuid=True), nullable=False),
+        sa.Column('job_id', sa.String(length=64), nullable=False),
+        sa.Column('user_id', postgresql.UUID(as_uuid=True), nullable=True),
+        sa.Column('session_id', sa.String(length=128), nullable=True),
+        sa.Column('content_hash', sa.String(length=64), nullable=False),
+        sa.Column('page_url', sa.Text(), nullable=True),
+        sa.Column('source_language', sa.String(length=10), nullable=False),
+        sa.Column('target_language', sa.String(length=10), nullable=False),
+        sa.Column('original_text', sa.Text(), nullable=False),
+        sa.Column('translated_text', sa.Text(), nullable=True),
+        sa.Column('preserve_code_blocks', sa.Boolean(), nullable=False),
+        sa.Column('enable_transliteration', sa.Boolean(), nullable=False),
+        sa.Column('chunk_size', sa.Integer(), nullable=False),
+        sa.Column('max_chunks', sa.Integer(), nullable=False),
+        sa.Column('model_name', sa.String(length=50), nullable=False),
+        sa.Column('temperature', sa.Numeric(precision=3, scale=2), nullable=False),
+        sa.Column('max_tokens', sa.Integer(), nullable=False),
+        sa.Column('status', sa.String(length=20), nullable=False),
+        sa.Column('progress_percentage', sa.Numeric(precision=5, scale=2), nullable=False),
+        sa.Column('chunks_total', sa.Integer(), nullable=False),
+        sa.Column('chunks_completed', sa.Integer(), nullable=False),
+        sa.Column('chunks_failed', sa.Integer(), nullable=False),
+        sa.Column('retry_count', sa.Integer(), nullable=False),
+        sa.Column('max_retries', sa.Integer(), nullable=False),
+        sa.Column('started_at', sa.DateTime(timezone=True), nullable=True),
+        sa.Column('completed_at', sa.DateTime(timezone=True), nullable=True),
+        sa.Column('processing_time_ms', sa.BigInteger(), nullable=False),
+        sa.Column('input_tokens', sa.BigInteger(), nullable=False),
+        sa.Column('output_tokens', sa.BigInteger(), nullable=False),
+        sa.Column('estimated_cost_usd', sa.Numeric(precision=10, scale=6), nullable=False),
+        sa.Column('actual_cost_usd', sa.Numeric(precision=10, scale=6), nullable=True),
+        sa.Column('quality_score', sa.Numeric(precision=5, scale=2), nullable=True),
+        sa.Column('confidence_score', sa.Numeric(precision=5, scale=2), nullable=True),
+        sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False),
+        sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False),
+        sa.Column('last_activity_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False),
+        sa.Column('user_agent', sa.Text(), nullable=True),
+        sa.Column('ip_address', sa.String(length=45), nullable=True),
+        sa.ForeignKeyConstraint(['user_id'], ['users.id'], ),
+        sa.PrimaryKeyConstraint('id'),
+        sa.UniqueConstraint('job_id')
+    )
+
+    # Create indexes for translation_jobs
+    op.create_index('ix_translation_jobs_job_id', 'translation_jobs', ['job_id'], unique=True)
+    op.create_index('ix_translation_jobs_user_id', 'translation_jobs', ['user_id'])
+    op.create_index('ix_translation_jobs_session_id', 'translation_jobs', ['session_id'])
+    op.create_index('ix_translation_jobs_content_hash', 'translation_jobs', ['content_hash'])
+    op.create_index('ix_translation_jobs_page_url', 'translation_jobs', ['page_url'])
+    op.create_index('ix_translation_jobs_source_language', 'translation_jobs', ['source_language'])
+    op.create_index('ix_translation_jobs_target_language', 'translation_jobs', ['target_language'])
+    op.create_index('ix_translation_jobs_status', 'translation_jobs', ['status'])
+    op.create_index('ix_translation_jobs_status_created', 'translation_jobs', ['status', 'created_at'])
+    op.create_index('ix_translation_jobs_user_status', 'translation_jobs', ['user_id', 'status'])
+    op.create_index('ix_translation_jobs_content_lookup', 'translation_jobs', ['content_hash', 'source_language', 'target_language'])
+    op.create_index('ix_translation_jobs_page_cache', 'translation_jobs', ['page_url', 'content_hash'])
+    op.create_index('ix_translation_jobs_activity', 'translation_jobs', ['last_activity_at'])
+    op.create_index('ix_translation_jobs_progress', 'translation_jobs', ['status', 'progress_percentage'])
+
+    # Create translation_chunks table
+    op.create_table('translation_chunks',
+        sa.Column('id', postgresql.UUID(as_uuid=True), nullable=False),
+        sa.Column('job_id', postgresql.UUID(as_uuid=True), nullable=False),
+        sa.Column('chunk_index', sa.Integer(), nullable=False),
+        sa.Column('original_text', sa.Text(), nullable=False),
+        sa.Column('translated_text', sa.Text(), nullable=True),
+        sa.Column('start_position', sa.Integer(), nullable=False),
+        sa.Column('end_position', sa.Integer(), nullable=False),
+        sa.Column('is_code_block', sa.Boolean(), nullable=False),
+        sa.Column('code_language', sa.String(length=50), nullable=True),
+        sa.Column('word_count', sa.Integer(), nullable=False),
+        sa.Column('status', sa.String(length=20), nullable=False),
+        sa.Column('retry_count', sa.Integer(), nullable=False),
+        sa.Column('started_at', sa.DateTime(timezone=True), nullable=True),
+        sa.Column('completed_at', sa.DateTime(timezone=True), nullable=True),
+        sa.Column('processing_time_ms', sa.BigInteger(), nullable=False),
+        sa.Column('input_tokens', sa.Integer(), nullable=False),
+        sa.Column('output_tokens', sa.Integer(), nullable=False),
+        sa.Column('confidence_score', sa.Numeric(precision=5, scale=2), nullable=True),
+        sa.Column('requires_review', sa.Boolean(), nullable=False),
+        sa.Column('last_error', sa.Text(), nullable=True),
+        sa.Column('error_code', sa.String(length=50), nullable=True),
+        sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False),
+        sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False),
+        sa.ForeignKeyConstraint(['job_id'], ['translation_jobs.id'], ),
+        sa.PrimaryKeyConstraint('id'),
+        sa.UniqueConstraint('job_id', 'chunk_index', name='uq_translation_chunks_job_chunk')
+    )
+
+    # Create indexes for translation_chunks
+    op.create_index('ix_translation_chunks_job_id', 'translation_chunks', ['job_id'])
+    op.create_index('ix_translation_chunks_job_chunk', 'translation_chunks', ['job_id', 'chunk_index'], unique=True)
+    op.create_index('ix_translation_chunks_status', 'translation_chunks', ['status'])
+    op.create_index('ix_translation_chunks_status_created', 'translation_chunks', ['status', 'created_at'])
+    op.create_index('ix_translation_chunks_is_code_block', 'translation_chunks', ['is_code_block'])
+    op.create_index('ix_translation_chunks_code_language', 'translation_chunks', ['code_language'])
+
+    # Create translation_errors table
+    op.create_table('translation_errors',
+        sa.Column('id', postgresql.UUID(as_uuid=True), nullable=False),
+        sa.Column('job_id', postgresql.UUID(as_uuid=True), nullable=False),
+        sa.Column('chunk_id', postgresql.UUID(as_uuid=True), nullable=True),
+        sa.Column('error_id', sa.String(length=64), nullable=False),
+        sa.Column('error_type', sa.String(length=50), nullable=False),
+        sa.Column('error_code', sa.String(length=50), nullable=True),
+        sa.Column('error_message', sa.Text(), nullable=False),
+        sa.Column('error_details', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
+        sa.Column('severity', sa.String(length=20), nullable=False),
+        sa.Column('category', sa.String(length=50), nullable=False),
+        sa.Column('is_retriable', sa.Boolean(), nullable=False),
+        sa.Column('retry_attempt', sa.Integer(), nullable=False),
+        sa.Column('max_retries', sa.Integer(), nullable=False),
+        sa.Column('next_retry_at', sa.DateTime(timezone=True), nullable=True),
+        sa.Column('request_payload', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
+        sa.Column('response_payload', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
+        sa.Column('stack_trace', sa.Text(), nullable=True),
+        sa.Column('debug_info', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
+        sa.Column('resolved_at', sa.DateTime(timezone=True), nullable=True),
+        sa.Column('resolution', sa.String(length=200), nullable=True),
+        sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False),
+        sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False),
+        sa.ForeignKeyConstraint(['chunk_id'], ['translation_chunks.id'], ),
+        sa.ForeignKeyConstraint(['job_id'], ['translation_jobs.id'], ),
+        sa.PrimaryKeyConstraint('id'),
+        sa.UniqueConstraint('error_id')
+    )
+
+    # Create indexes for translation_errors
+    op.create_index('ix_translation_errors_error_id', 'translation_errors', ['error_id'], unique=True)
+    op.create_index('ix_translation_errors_job_id', 'translation_errors', ['job_id'])
+    op.create_index('ix_translation_errors_chunk_id', 'translation_errors', ['chunk_id'])
+    op.create_index('ix_translation_errors_error_type', 'translation_errors', ['error_type'])
+    op.create_index('ix_translation_errors_severity', 'translation_errors', ['severity'])
+    op.create_index('ix_translation_errors_error_type_created', 'translation_errors', ['error_type', 'created_at'])
+    op.create_index('ix_translation_errors_error_severity', 'translation_errors', ['severity', 'created_at'])
+    op.create_index('ix_translation_errors_job_errors', 'translation_errors', ['job_id', 'created_at'])
+    op.create_index('ix_translation_errors_retry_schedule', 'translation_errors', ['next_retry_at', 'is_retriable'])
+
+    # Create translation_sessions table
+    op.create_table('translation_sessions',
+        sa.Column('id', postgresql.UUID(as_uuid=True), nullable=False),
+        sa.Column('session_id', sa.String(length=128), nullable=False),
+        sa.Column('user_id', postgresql.UUID(as_uuid=True), nullable=True),
+        sa.Column('started_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False),
+        sa.Column('last_activity_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False),
+        sa.Column('expires_at', sa.DateTime(timezone=True), nullable=False),
+        sa.Column('is_active', sa.Boolean(), nullable=False),
+        sa.Column('request_count', sa.Integer(), nullable=False),
+        sa.Column('character_count', sa.Integer(), nullable=False),
+        sa.Column('total_cost_usd', sa.Numeric(precision=10, scale=6), nullable=False),
+        sa.Column('requests_per_minute', sa.Integer(), nullable=False),
+        sa.Column('characters_per_hour', sa.Integer(), nullable=False),
+        sa.Column('source_language', sa.String(length=10), nullable=True),
+        sa.Column('target_language', sa.String(length=10), nullable=True),
+        sa.Column('preferred_model', sa.String(length=50), nullable=True),
+        sa.Column('user_agent', sa.Text(), nullable=True),
+        sa.Column('ip_address', sa.String(length=45), nullable=True),
+        sa.Column('country_code', sa.String(length=2), nullable=True),
+        sa.Column('preferences', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
+        sa.ForeignKeyConstraint(['user_id'], ['users.id'], ),
+        sa.PrimaryKeyConstraint('id'),
+        sa.UniqueConstraint('session_id')
+    )
+
+    # Create indexes for translation_sessions
+    op.create_index('ix_translation_sessions_session_id', 'translation_sessions', ['session_id'], unique=True)
+    op.create_index('ix_translation_sessions_user_id', 'translation_sessions', ['user_id'])
+    op.create_index('ix_translation_sessions_is_active', 'translation_sessions', ['is_active'])
+    op.create_index('ix_translation_sessions_expires_at', 'translation_sessions', ['expires_at'])
+    op.create_index('ix_translation_sessions_user_sessions', 'translation_sessions', ['user_id', 'is_active'])
+    op.create_index('ix_translation_sessions_session_expiry', 'translation_sessions', ['expires_at', 'is_active'])
+    op.create_index('ix_translation_sessions_ip_address', 'translation_sessions', ['ip_address'])
+
+    # Create translation_cache table
+    op.create_table('translation_cache',
+        sa.Column('id', postgresql.UUID(as_uuid=True), nullable=False),
+        sa.Column('cache_key', sa.String(length=128), nullable=False),
+        sa.Column('job_id', postgresql.UUID(as_uuid=True), nullable=True),
+        sa.Column('content_hash', sa.String(length=64), nullable=False),
+        sa.Column('page_url', sa.Text(), nullable=True),
+        sa.Column('url_hash', sa.String(length=64), nullable=True),
+        sa.Column('source_language', sa.String(length=10), nullable=False),
+        sa.Column('target_language', sa.String(length=10), nullable=False),
+        sa.Column('original_text', sa.Text(), nullable=False),
+        sa.Column('translated_text', sa.Text(), nullable=False),
+        sa.Column('hit_count', sa.Integer(), nullable=False),
+        sa.Column('last_hit_at', sa.DateTime(timezone=True), nullable=True),
+        sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False),
+        sa.Column('expires_at', sa.DateTime(timezone=True), nullable=False),
+        sa.Column('quality_score', sa.Numeric(precision=5, scale=2), nullable=True),
+        sa.Column('processing_time_ms', sa.BigInteger(), nullable=False),
+        sa.Column('model_version', sa.String(length=50), nullable=False),
+        sa.Column('ttl_hours', sa.Integer(), nullable=False),
+        sa.Column('is_pinned', sa.Boolean(), nullable=False),
+        sa.Column('priority', sa.Integer(), nullable=False),
+        sa.Column('is_validated', sa.Boolean(), nullable=False),
+        sa.Column('validated_by', sa.String(length=50), nullable=True),
+        sa.ForeignKeyConstraint(['job_id'], ['translation_jobs.id'], ),
+        sa.PrimaryKeyConstraint('id'),
+        sa.UniqueConstraint('cache_key')
+    )
+
+    # Create indexes for translation_cache
+    op.create_index('ix_translation_cache_cache_key', 'translation_cache', ['cache_key'], unique=True)
+    op.create_index('ix_translation_cache_job_id', 'translation_cache', ['job_id'])
+    op.create_index('ix_translation_cache_content_hash', 'translation_cache', ['content_hash'])
+    op.create_index('ix_translation_cache_page_url', 'translation_cache', ['page_url'])
+    op.create_index('ix_translation_cache_url_hash', 'translation_cache', ['url_hash'])
+    op.create_index('ix_translation_cache_source_language', 'translation_cache', ['source_language'])
+    op.create_index('ix_translation_cache_target_language', 'translation_cache', ['target_language'])
+    op.create_index('ix_translation_cache_expires_at', 'translation_cache', ['expires_at'])
+    op.create_index('ix_translation_cache_cache_lookup', 'translation_cache', ['content_hash', 'source_language', 'target_language'])
+    op.create_index('ix_translation_cache_page_cache', 'translation_cache', ['url_hash', 'content_hash'])
+    op.create_index('ix_translation_cache_cache_expires', 'translation_cache', ['expires_at', 'priority'])
+    op.create_index('ix_translation_cache_cache_popularity', 'translation_cache', ['hit_count', 'last_hit_at'])
+
+    # Create translation_metrics table
+    op.create_table('translation_metrics',
+        sa.Column('id', postgresql.UUID(as_uuid=True), nullable=False),
+        sa.Column('job_id', postgresql.UUID(as_uuid=True), nullable=False),
+        sa.Column('user_id', postgresql.UUID(as_uuid=True), nullable=True),
+        sa.Column('metric_date', sa.DateTime(timezone=True), nullable=False),
+        sa.Column('period_type', sa.String(length=20), nullable=False),
+        sa.Column('total_requests', sa.Integer(), nullable=False),
+        sa.Column('total_characters', sa.BigInteger(), nullable=False),
+        sa.Column('total_chunks', sa.Integer(), nullable=False),
+        sa.Column('successful_translations', sa.Integer(), nullable=False),
+        sa.Column('failed_translations', sa.Integer(), nullable=False),
+        sa.Column('avg_processing_time_ms', sa.BigInteger(), nullable=False),
+        sa.Column('min_processing_time_ms', sa.BigInteger(), nullable=False),
+        sa.Column('max_processing_time_ms', sa.BigInteger(), nullable=False),
+        sa.Column('p95_processing_time_ms', sa.BigInteger(), nullable=False),
+        sa.Column('total_input_tokens', sa.BigInteger(), nullable=False),
+        sa.Column('total_output_tokens', sa.BigInteger(), nullable=False),
+        sa.Column('total_cost_usd', sa.Numeric(precision=12, scale=6), nullable=False),
+        sa.Column('avg_cost_per_char', sa.Numeric(precision=10, scale=8), nullable=False),
+        sa.Column('avg_quality_score', sa.Numeric(precision=5, scale=2), nullable=True),
+        sa.Column('avg_confidence_score', sa.Numeric(precision=5, scale=2), nullable=True),
+        sa.Column('cache_hits', sa.Integer(), nullable=False),
+        sa.Column('cache_misses', sa.Integer(), nullable=False),
+        sa.Column('cache_hit_rate', sa.Numeric(precision=5, scale=2), nullable=False),
+        sa.Column('error_count', sa.Integer(), nullable=False),
+        sa.Column('error_rate', sa.Numeric(precision=5, scale=2), nullable=False),
+        sa.Column('top_error_types', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
+        sa.Column('source_language', sa.String(length=10), nullable=True),
+        sa.Column('target_language', sa.String(length=10), nullable=True),
+        sa.Column('model_name', sa.String(length=50), nullable=True),
+        sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False),
+        sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False),
+        sa.ForeignKeyConstraint(['job_id'], ['translation_jobs.id'], ),
+        sa.ForeignKeyConstraint(['user_id'], ['users.id'], ),
+        sa.PrimaryKeyConstraint('id')
+    )
+
+    # Create indexes for translation_metrics
+    op.create_index('ix_translation_metrics_job_id', 'translation_metrics', ['job_id'])
+    op.create_index('ix_translation_metrics_user_id', 'translation_metrics', ['user_id'])
+    op.create_index('ix_translation_metrics_metric_date', 'translation_metrics', ['metric_date'])
+    op.create_index('ix_translation_metrics_period_type', 'translation_metrics', ['period_type'])
+    op.create_index('ix_translation_metrics_source_language', 'translation_metrics', ['source_language'])
+    op.create_index('ix_translation_metrics_target_language', 'translation_metrics', ['target_language'])
+    op.create_index('ix_translation_metrics_model_name', 'translation_metrics', ['model_name'])
+    op.create_index('ix_translation_metrics_date_period', 'translation_metrics', ['metric_date', 'period_type'])
+    op.create_index('ix_translation_metrics_user_metrics', 'translation_metrics', ['user_id', 'metric_date'])
+    op.create_index('ix_translation_metrics_job_metrics', 'translation_metrics', ['job_id', 'metric_date'])
+    op.create_index('ix_translation_metrics_lang_metrics', 'translation_metrics', ['source_language', 'target_language', 'metric_date'])
+
+
+def downgrade() -> None:
+    # Drop tables in reverse order of creation
+    op.drop_table('translation_metrics')
+    op.drop_table('translation_cache')
+    op.drop_table('translation_sessions')
+    op.drop_table('translation_errors')
+    op.drop_table('translation_chunks')
+    op.drop_table('translation_jobs')
pyproject.toml CHANGED
@@ -41,7 +41,8 @@ dependencies = [
     "authlib>=1.2.1",
     "itsdangerous>=2.1.0",
     # OpenAI Integration
-    "openai>=1.6.1",
+    "openai>=1.68.0",
+    "openai-agents>=0.2.9",
     "tiktoken>=0.5.2",
     # Vector Database
     "qdrant-client>=1.7.0",
@@ -59,11 +60,15 @@ dependencies = [
     # Logging and Monitoring
     "structlog>=23.2.0",
     "backoff>=2.2.1",
+    "python-json-logger>=2.0.7",
+    "PyYAML>=6.0.1",
     # Monitoring and Performance
     "psutil>=5.9.6",
     "openai-chatkit>=1.4.0",
     "email-validator>=2.3.0",
     "bcrypt==4.2.0",
+    "google-genai>=0.3.0",
+    "redis>=5.0.0",
 ]
 
 [project.optional-dependencies]
requirements.txt CHANGED
@@ -16,6 +16,7 @@ aiosmtplib>=3.0.0
 jinja2>=3.1.0
 python-dotenv>=1.0.0
 structlog>=23.2.0
+python-json-logger>=2.0.7
 backoff>=2.2.1
 psutil>=5.9.6
 # ChatKit Python SDK
@@ -28,3 +29,9 @@ python-jose[cryptography]>=3.3.0
 passlib[bcrypt]>=1.7.4
 authlib>=1.2.1
 itsdangerous>=2.1.0
+
+# Cache dependencies
+redis[hiredis]>=5.0.0
+
+# Google Generative AI for Gemini integration
+google-generativeai>=0.8.0
src/api/v1/progress.py ADDED
@@ -0,0 +1,450 @@
+"""
+Progress tracking API endpoints.
+
+Manages user reading progress through chapters and sections.
+"""
+
+from datetime import datetime, timedelta
+from typing import List, Optional, Dict, Any
+from fastapi import APIRouter, Depends, HTTPException, Query, Body, BackgroundTasks
+from sqlalchemy.orm import Session
+from pydantic import BaseModel, Field, validator
+
+from src.database.base import get_db
+from src.middleware.auth import get_current_active_user, require_user
+from src.models.auth import User
+from src.models.reading_progress import ReadingProgress
+from src.models.user_preferences import UserPreference
+from src.services.progress import ReadingProgressService
+from src.services.personalization import PersonalizationService
+from src.utils.errors import handle_errors, NotFoundError, ValidationError
+from src.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+router = APIRouter(
+    prefix="/progress",
+    tags=["progress"]
+)
+
+# Pydantic models for API
+class SectionProgress(BaseModel):
+    section_id: str = Field(..., description="Section identifier")
+    position: float = Field(..., ge=0, le=100, description="Progress percentage (0-100)")
+    time_spent: int = Field(0, ge=0, description="Time spent in minutes")
+    completed: bool = Field(False, description="Whether section is completed")
+
+    @validator('position')
+    def validate_position(cls, v):
+        if not 0 <= v <= 100:
+            raise ValueError("Position must be between 0 and 100")
+        return v
+
+class ChapterProgressUpdate(BaseModel):
+    chapter_id: str = Field(..., description="Chapter identifier")
+    sections: List[SectionProgress] = Field(..., description="Section progress updates")
+
+class ProgressResponse(BaseModel):
+    chapter_id: str
+    overall_progress: float
+    sections_completed: int
+    total_sections: int
+    time_spent: int
+    sections: List[Dict[str, Any]]
+    last_accessed: Optional[str]
+    estimated_completion: Optional[Dict[str, Any]]
+
+class SessionStart(BaseModel):
+    chapter_id: str = Field(..., description="Chapter identifier")
+    section_id: Optional[str] = Field(None, description="Section identifier")
+
+class SessionEnd(BaseModel):
+    chapter_id: str = Field(..., description="Chapter identifier")
+    section_id: Optional[str] = Field(None, description="Section identifier")
+    position: float = Field(..., ge=0, le=100, description="Final position")
+    time_spent: int = Field(..., ge=0, description="Time spent in minutes")
+
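+# Illustrative request body for POST /progress/session/end (values are made
+# up; the shape simply mirrors the SessionEnd model above):
+#   {"chapter_id": "ch-03", "section_id": "ch-03_setup",
+#    "position": 87.5, "time_spent": 12}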
+
+# Helper function to get services
+def get_progress_service(db: Session = Depends(get_db)) -> ReadingProgressService:
+    return ReadingProgressService(db)
+
+def get_personalization_service(db: Session = Depends(get_db)) -> PersonalizationService:
+    return PersonalizationService(db)
+
+
+@router.get("/chapter/{chapter_id}")
+@handle_errors
+async def get_chapter_progress(
+    chapter_id: str,
+    current_user: User = Depends(get_current_active_user),
+    service: ReadingProgressService = Depends(get_progress_service)
+) -> ProgressResponse:
+    """Get comprehensive progress for a specific chapter."""
+    progress = await service.get_chapter_progress(current_user.id, chapter_id)
+
+    if not progress["total_sections"]:
+        raise NotFoundError("Chapter", chapter_id)
+
+    return ProgressResponse(**progress)
+
+
+@router.get("/summary")
+@handle_errors
+async def get_progress_summary(
+    current_user: User = Depends(get_current_active_user),
+    service: ReadingProgressService = Depends(get_progress_service)
+) -> Dict[str, Any]:
+    """Get overall reading progress summary for the user."""
+    summary = await service.get_user_progress_summary(current_user.id)
+
+    # Add personalization info
+    personalization_service = PersonalizationService(service.db)
+    personalization = await personalization_service.get_user_personalization(current_user.id)
+
+    return {
+        **summary,
+        "personalization": personalization,
+        "last_updated": datetime.utcnow().isoformat()
+    }
+
+
+@router.post("/session/start")
+@handle_errors
+async def start_reading_session(
+    session_data: SessionStart,
+    current_user: User = Depends(get_current_active_user),
+    service: ReadingProgressService = Depends(get_progress_service)
+) -> Dict[str, Any]:
+    """Start a new reading session."""
+    # Log session start
+    logger.info(
+        "Reading session started",
+        user_id=current_user.id,
+        chapter_id=session_data.chapter_id,
+        section_id=session_data.section_id
+    )
+
+    # Get or create progress record
+    progress = await service.update_section_progress(
+        user_id=current_user.id,
+        chapter_id=session_data.chapter_id,
+        section_id=session_data.section_id or f"{session_data.chapter_id}_intro",
+        position=0,
+        time_spent_delta=0
+    )
+
+    return {
+        "session_id": progress.id,
+        "chapter_id": session_data.chapter_id,
+        "section_id": session_data.section_id,
+        "started_at": progress.last_accessed.isoformat(),
+        "message": "Reading session started successfully"
+    }
+
+
+@router.post("/session/end")
+@handle_errors
+async def end_reading_session(
+    session_data: SessionEnd,
+    current_user: User = Depends(get_current_active_user),
+    service: ReadingProgressService = Depends(get_progress_service)
+) -> Dict[str, Any]:
+    """End a reading session with final progress."""
+    # Update progress with session data
+    progress = await service.update_section_progress(
+        user_id=current_user.id,
+        chapter_id=session_data.chapter_id,
+        section_id=session_data.section_id or f"{session_data.chapter_id}_intro",
+        position=session_data.position,
+        time_spent_delta=session_data.time_spent,
+        completed=session_data.position >= 100
+    )
+
+    # Get updated chapter progress
+    chapter_progress = await service.get_chapter_progress(current_user.id, session_data.chapter_id)
+
+    # Generate session summary
+    session_summary = {
+        "chapter_id": session_data.chapter_id,
+        "section_id": session_data.section_id,
+        "final_position": session_data.position,
+        "time_spent": session_data.time_spent,
+        "chapter_progress": chapter_progress["overall_progress"],
+        "sections_completed": chapter_progress["sections_completed"],
+        "completed_at": datetime.utcnow().isoformat()
+    }
+
+    # Log session end
+    logger.info(
+        "Reading session ended",
+        user_id=current_user.id,
+        **session_summary
+    )
+
+    return {
+        "session_id": progress.id,
+        "summary": session_summary,
+        "message": "Reading session completed successfully"
+    }
+
+
+@router.post("/update")
+@handle_errors
+async def update_progress(
+    progress_update: ChapterProgressUpdate,
+    background_tasks: BackgroundTasks,
+    current_user: User = Depends(get_current_active_user),
+    service: ReadingProgressService = Depends(get_progress_service)
+) -> Dict[str, Any]:
+    """Update progress for multiple sections in a chapter."""
+    updated_sections = []
+    errors = []
+
+    for section in progress_update.sections:
+        try:
+            updated = await service.update_section_progress(
+                user_id=current_user.id,
+                chapter_id=progress_update.chapter_id,
+                section_id=section.section_id,
+                position=section.position,
+                time_spent_delta=section.time_spent,
+                completed=section.completed
+            )
+            updated_sections.append({
+                "section_id": section.section_id,
+                "position": updated.position,
+                "completed": updated.completed,
+                "updated_at": updated.updated_at.isoformat()
+            })
+        except Exception as e:
+            logger.error(
+                "Failed to update section progress",
+                user_id=current_user.id,
+                chapter_id=progress_update.chapter_id,
+                section_id=section.section_id,
+                error=str(e)
+            )
+            errors.append({
+                "section_id": section.section_id,
+                "error": str(e)
+            })
+
+    # Schedule background task to calculate recommendations
+    if updated_sections:
+        background_tasks.add_task(
+            calculate_recommendations_delayed,
+            current_user.id
+        )
+
+    return {
+        "chapter_id": progress_update.chapter_id,
+        "updated_sections": updated_sections,
+        "errors": errors,
+        "total_updated": len(updated_sections),
+        "total_errors": len(errors),
+        "message": f"Updated {len(updated_sections)} sections successfully"
+    }
+
+
+@router.post("/section/{section_id}/complete")
+@handle_errors
+async def complete_section(
+    chapter_id: str,
+    section_id: str,
+    time_spent: int = Query(0, ge=0, description="Time spent in minutes"),
+    current_user: User = Depends(get_current_active_user),
+    service: ReadingProgressService = Depends(get_progress_service)
+) -> Dict[str, Any]:
+    """Mark a section as completed."""
+    progress = await service.mark_section_complete(
+        user_id=current_user.id,
+        chapter_id=chapter_id,
+        section_id=section_id,
+        time_spent_delta=time_spent
+    )
+
+    # Get updated chapter progress
+    chapter_progress = await service.get_chapter_progress(current_user.id, chapter_id)
+
+    # Log completion
+    logger.info(
+        "Section completed",
+        user_id=current_user.id,
+        chapter_id=chapter_id,
+        section_id=section_id,
+        position=100,
+        time_spent=time_spent
+    )
+
+    return {
+        "section_id": section_id,
+        "chapter_id": chapter_id,
+        "completed_at": progress.updated_at.isoformat(),
+        "time_spent": time_spent,
+        "chapter_progress": chapter_progress["overall_progress"],
+        "sections_completed": chapter_progress["sections_completed"],
+        "message": "Section marked as completed"
+    }
+
+
+@router.get("/restore/{chapter_id}")
+@handle_errors
+async def restore_progress(
+    chapter_id: str,
+    current_user: User = Depends(get_current_active_user),
+    service: ReadingProgressService = Depends(get_progress_service)
+) -> Dict[str, Any]:
+    """Restore user's last position in a chapter."""
+    restored = await service.restore_progress(current_user.id, chapter_id)
+
301
+ if restored["section_id"]:
302
+ # Update last accessed
303
+ progress = await service.update_section_progress(
304
+ user_id=current_user.id,
305
+ chapter_id=chapter_id,
306
+ section_id=restored["section_id"],
307
+ position=restored["position"],
308
+ time_spent_delta=0
309
+ )
310
+
311
+ logger.info(
312
+ "Progress restored",
313
+ user_id=current_user.id,
314
+ chapter_id=chapter_id,
315
+ section_id=restored["section_id"],
316
+ position=restored["position"]
317
+ )
318
+
319
+ return restored
320
+
321
+
322
+ @router.get("/analytics")
323
+ @handle_errors
324
+ async def get_progress_analytics(
325
+ timeframe: str = Query("month", regex="^(day|week|month|year)$"),
326
+ current_user: User = Depends(get_current_active_user),
327
+ service: ReadingProgressService = Depends(get_progress_service)
328
+ ) -> Dict[str, Any]:
329
+ """Get detailed reading analytics."""
330
+ analytics = await service.get_reading_analytics(current_user.id, timeframe)
331
+
332
+ # Add additional user-specific analytics
333
+ personalization_service = PersonalizationService(service.db)
334
+ personalization = await personalization_service.get_user_personalization(current_user.id)
335
+
336
+ return {
337
+ **analytics,
338
+ "user_experience_level": personalization["experience_level"],
339
+ "user_preferences": personalization["preferences"],
340
+ "generated_at": datetime.utcnow().isoformat()
341
+ }
342
+
343
+
344
+ @router.post("/bulk")
345
+ @handle_errors
346
+ async def bulk_update_progress(
347
+ updates: List[ChapterProgressUpdate],
348
+ background_tasks: BackgroundTasks,
349
+ current_user: User = Depends(get_current_active_user),
350
+ service: ReadingProgressService = Depends(get_progress_service)
351
+ ) -> Dict[str, Any]:
352
+ """Bulk update progress for multiple chapters."""
353
+ results = []
354
+ total_updated = 0
355
+ total_errors = 0
356
+
357
+ for chapter_update in updates:
358
+ try:
359
+ chapter_result = await update_progress(
360
+ progress_update=chapter_update,
361
+ background_tasks=background_tasks,
362
+ current_user=current_user,
363
+ service=service
364
+ )
365
+ results.append(chapter_result)
366
+ total_updated += chapter_result["total_updated"]
367
+ total_errors += chapter_result["total_errors"]
368
+ except Exception as e:
369
+ logger.error(
370
+ "Failed to bulk update chapter progress",
371
+ user_id=current_user.id,
372
+ chapter_id=chapter_update.chapter_id,
373
+ error=str(e)
374
+ )
375
+ results.append({
376
+ "chapter_id": chapter_update.chapter_id,
377
+ "updated_sections": [],
378
+ "errors": [{"error": str(e)}],
379
+ "total_updated": 0,
380
+ "total_errors": 1
381
+ })
382
+ total_errors += 1
383
+
384
+ return {
385
+ "results": results,
386
+ "summary": {
387
+ "total_chapters": len(updates),
388
+ "total_updated": total_updated,
389
+ "total_errors": total_errors,
390
+ "success_rate": (total_updated / (total_updated + total_errors)) * 100 if (total_updated + total_errors) > 0 else 0
391
+ },
392
+ "message": f"Bulk update completed: {total_updated} sections updated, {total_errors} errors"
393
+ }
394
+
395
+
396
+ @router.delete("/chapter/{chapter_id}")
397
+ @handle_errors
398
+ async def reset_chapter_progress(
399
+ chapter_id: str,
400
+ current_user: User = Depends(get_current_active_user),
401
+ db: Session = Depends(get_db)
402
+ ) -> Dict[str, Any]:
403
+ """Reset all progress for a specific chapter."""
404
+ # Delete all progress records for this chapter
405
+ deleted = db.query(ReadingProgress).filter(
406
+ ReadingProgress.user_id == current_user.id,
407
+ ReadingProgress.chapter_id == chapter_id
408
+ ).delete()
409
+
410
+ db.commit()
411
+
412
+ logger.info(
413
+ "Chapter progress reset",
414
+ user_id=current_user.id,
415
+ chapter_id=chapter_id,
416
+ deleted_sections=deleted
417
+ )
418
+
419
+ return {
420
+ "chapter_id": chapter_id,
421
+ "deleted_sections": deleted,
422
+ "message": f"Progress for chapter {chapter_id} has been reset"
423
+ }
424
+
425
+
426
+ # Background task helper
427
+ async def calculate_recommendations_delayed(user_id: str):
428
+ """Background task to calculate recommendations after progress update."""
429
+ try:
430
+ from src.services.personalization import PersonalizationService
431
+ from src.database.base import SessionLocal
432
+
433
+ db = SessionLocal()
434
+ try:
435
+ service = PersonalizationService(db)
436
+ recommendations = await service.generate_recommendations(user_id, limit=5)
437
+
438
+ logger.info(
439
+ "Recommendations calculated",
440
+ user_id=user_id,
441
+ recommendations_count=len(recommendations)
442
+ )
443
+ finally:
444
+ db.close()
445
+ except Exception as e:
446
+ logger.error(
447
+ "Failed to calculate recommendations in background task",
448
+ user_id=user_id,
449
+ error=str(e)
450
+ )
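A minimal sketch of driving the session endpoints above from a client. The `/api/v1/progress` mount point and the bearer token are assumptions; the `include_router` call and the auth flow are not part of this hunk.

```python
import httpx

BASE = "http://localhost:7860/api/v1/progress"   # assumed mount prefix
HEADERS = {"Authorization": "Bearer <jwt>"}      # token obtained from the auth flow

with httpx.Client(base_url=BASE, headers=HEADERS) as client:
    # Open a session at the start of a chapter
    start = client.post("/session/start",
                        json={"chapter_id": "ch1", "section_id": "ch1_intro"})
    # ... reader spends time on the page ...
    end = client.post("/session/end", json={
        "chapter_id": "ch1",
        "section_id": "ch1_intro",
        "position": 100,   # position >= 100 marks the section completed
        "time_spent": 12,  # minutes, added to the stored total
    })
    print(end.json()["summary"])
```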
src/api/v1/reader_features.py ADDED
@@ -0,0 +1,94 @@
+ """
+ Reader features API routes v1.
+
+ API endpoints for progress tracking, bookmarks, preferences, and search.
+ """
+
+ from fastapi import APIRouter, Depends, HTTPException, Query
+ from sqlalchemy.orm import Session
+ from typing import List, Optional
+
+ from src.database.base import get_db
+ from src.middleware.auth import get_current_active_user, require_user
+ from src.models.auth import User
+ from src.utils.errors import handle_errors, NotFoundError, ValidationError
+ from src.utils.logging import get_logger
+
+ logger = get_logger(__name__)
+
+ router = APIRouter(
+     prefix="/reader-features",
+     tags=["reader-features"]
+ )
+
+ # Health check endpoint for reader features
+ @router.get("/health")
+ async def health_check():
+     """Health check for reader features API."""
+     return {
+         "status": "healthy",
+         "service": "reader-features",
+         "version": "1.0.0"
+     }
+
+ # Placeholder endpoints - will be implemented in user stories
+ @router.get("/progress")
+ @handle_errors
+ async def get_progress_summary(
+     current_user: User = Depends(get_current_active_user),
+     db: Session = Depends(get_db)
+ ):
+     """Get user's overall reading progress summary."""
+     # TODO: Implement in User Story 1
+     raise HTTPException(status_code=501, detail="Not implemented yet")
+
+ @router.get("/bookmarks")
+ @handle_errors
+ async def get_bookmarks(
+     limit: int = Query(50, ge=1, le=100),
+     offset: int = Query(0, ge=0),
+     current_user: User = Depends(get_current_active_user),
+     db: Session = Depends(get_db)
+ ):
+     """Get user's bookmarks."""
+     # TODO: Implement in User Story 4
+     raise HTTPException(status_code=501, detail="Not implemented yet")
+
+ @router.get("/preferences")
+ @handle_errors
+ async def get_preferences(
+     current_user: User = Depends(get_current_active_user),
+     db: Session = Depends(get_db)
+ ):
+     """Get user's reading preferences."""
+     # TODO: Implement in User Story 1
+     raise HTTPException(status_code=501, detail="Not implemented yet")
+
+ @router.get("/search")
+ @handle_errors
+ async def search_content(
+     q: str = Query(..., min_length=1, description="Search query"),
+     language: Optional[str] = Query(None, description="Filter by language"),
+     chapter: Optional[str] = Query(None, description="Filter by chapter"),
+     current_user: User = Depends(get_current_active_user),
+     db: Session = Depends(get_db)
+ ):
+     """Search content across all languages."""
+     # TODO: Implement in User Story 3
+     raise HTTPException(status_code=501, detail="Not implemented yet")
+
+ # Import all routers from individual feature modules
+ # These will be added as we implement each user story
+ # from .progress import router as progress_router
+ # from .bookmarks import router as bookmarks_router
+ # from .preferences import router as preferences_router
+ # from .search import router as search_router
+ # from .analytics import router as analytics_router
+
+ # Combine all routers
+ # api_router = APIRouter()
+ # api_router.include_router(progress_router, prefix="/progress", tags=["progress"])
+ # api_router.include_router(bookmarks_router, prefix="/bookmarks", tags=["bookmarks"])
+ # api_router.include_router(preferences_router, prefix="/preferences", tags=["preferences"])
+ # api_router.include_router(search_router, prefix="/search", tags=["search"])
+ # api_router.include_router(analytics_router, prefix="/analytics", tags=["analytics"])
src/api/v1/translation.py ADDED
@@ -0,0 +1,336 @@
+ """
+ Translation API endpoints using OpenAI Agents SDK.
+
+ Provides RESTful endpoints for translating text from English to Urdu
+ using the OpenAI Agents SDK with Gemini API integration.
+ """
+
+ from fastapi import APIRouter, Depends, HTTPException, Request
+ from fastapi.responses import JSONResponse
+ from fastapi import status
+ from typing import Optional, Dict, Any
+ import time
+
+ from src.services.openai_translation.translation_agent import OpenAITranslationAgent, TranslationContext
+ from src.services.openai_translation.client import get_gemini_client
+ from src.services.translation_cache import cache_service
+ from src.models.auth import User
+ from src.security.dependencies import get_current_user_or_anonymous
+
+ router = APIRouter(prefix="/translation", tags=["translation"])
+
+
+ @router.post("/translate", response_model=dict)
+ async def translate_text(
+     request: dict,
+     http_request: Request,
+     current_user: Optional[User] = Depends(get_current_user_or_anonymous)
+ ) -> JSONResponse:
+     """
+     Legacy translation endpoint (for backward compatibility).
+
+     This endpoint uses the OpenAI Agents SDK with the improved agent implementation.
+
+     Args:
+         request: Translation request with text and parameters
+         http_request: FastAPI request object
+         current_user: Optional current user
+
+     Returns:
+         Translation result
+     """
+     try:
+         # Extract request data
+         text = request.get("text", "")
+         source_language = request.get("source_language", "en")
+         target_language = request.get("target_language", "ur")
+         document_type = request.get("document_type")
+         technical_domain = request.get("technical_domain")
+         target_audience = request.get("target_audience")
+         model = request.get("model", "gemini-2.0-flash-lite")
+
+         # Create translation context
+         context = TranslationContext(
+             document_type=document_type,
+             technical_domain=technical_domain,
+             target_audience=target_audience
+         )
+
+         # Create agent and translate
+         agent = OpenAITranslationAgent(
+             gemini_client=get_gemini_client(),
+             model=model
+         )
+
+         result = await agent.translate_with_agent(
+             text=text,
+             context=context,
+             user_id=current_user.id if current_user else None
+         )
+
+         return JSONResponse(
+             status_code=status.HTTP_200_OK,
+             content={
+                 "job_id": f"translate_{int(time.time())}",
+                 "translated_text": result["translated_text"],
+                 "status": "completed",
+                 "progress": 100.0,
+                 "chunks": [],
+                 "processing_time_ms": 0,
+                 "cached": False,
+                 "input_tokens": result.get("tokens_used", 0),
+                 "output_tokens": 0,
+                 "estimated_cost_usd": 0.0,
+                 "confidence_score": result.get("confidence_score", 0.95)
+             }
+         )
+
+     except Exception as e:
+         return JSONResponse(
+             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+             content={
+                 "error": "TRANSLATION_ERROR",
+                 "message": "Failed to translate text"
+             }
+         )
+
+
+ @router.post("/translate/agent")
+ async def translate_with_agent(
+     request: dict,
+     http_request: Request,
+     current_user: Optional[User] = Depends(get_current_user_or_anonymous)
+ ) -> JSONResponse:
+     """
+     Translate text using OpenAI Agents SDK directly with caching.
+
+     This endpoint uses the OpenAI Agents SDK for translation with enhanced
+     context awareness and proper Runner.run pattern. Translations are cached
+     for 1 week to avoid redundant API calls.
+
+     Args:
+         request: Translation request
+         http_request: FastAPI request object
+         current_user: Optional current user
+
+     Returns:
+         Translation result with detailed metadata
+     """
+     try:
+         # Extract request parameters
+         text = request.get("text", "")
+         source_language = request.get("source_language", "en")
+         target_language = request.get("target_language", "ur")
+         page_url = request.get("page_url")
+         model = request.get("model", "gemini-2.0-flash-lite")
+
+         # Check cache first
+         cached_result = await cache_service.get_cached_translation(
+             text=text,
+             source_language=source_language,
+             target_language=target_language,
+             page_url=page_url
+         )
+
+         if cached_result:
+             return JSONResponse(
+                 status_code=status.HTTP_200_OK,
+                 content={
+                     "translated_text": cached_result["translated_text"],
+                     "original_text": cached_result["original_text"],
+                     "cached": True,
+                     "cache_created_at": cached_result["cache_created_at"],
+                     "cache_expires_at": cached_result["cache_expires_at"],
+                     "hit_count": cached_result["hit_count"],
+                     "tokens_used": 0,  # No tokens used for cached result
+                     "model": cached_result["model"],
+                     "confidence_score": cached_result["confidence_score"],
+                     "has_code_blocks": False,  # Would need to be stored in cache
+                     "code_blocks": []  # Would need to be stored in cache
+                 }
+             )
+
+         # Not in cache, proceed with translation
+         # Create translation context
+         context = TranslationContext(
+             page_url=page_url,
+             document_type=request.get("document_type"),
+             technical_domain=request.get("technical_domain"),
+             target_audience=request.get("target_audience")
+         )
+
+         # Create agent and translate
+         agent = OpenAITranslationAgent(
+             gemini_client=get_gemini_client(),
+             model=model
+         )
+
+         start_time = time.time()
+         result = await agent.translate_with_agent(
+             text=text,
+             context=context,
+             user_id=current_user.id if current_user else None
+         )
+         processing_time_ms = int((time.time() - start_time) * 1000)
+
+         # Cache the translation result
+         await cache_service.cache_translation(
+             text=text,
+             translated_text=result["translated_text"],
+             source_language=source_language,
+             target_language=target_language,
+             model=result.get("model", model),
+             confidence_score=result.get("confidence_score", 0.95),
+             processing_time_ms=processing_time_ms,
+             page_url=page_url
+         )
+
+         return JSONResponse(
+             status_code=status.HTTP_200_OK,
+             content={
+                 "translated_text": result["translated_text"],
+                 "original_text": result["original_text"],
+                 "cached": False,
+                 "tokens_used": result.get("tokens_used", 0),
+                 "model": result.get("model", model),
+                 "confidence_score": result.get("confidence_score", 0.95),
+                 "has_code_blocks": result.get("has_code_blocks", False),
+                 "code_blocks": result.get("code_blocks", []),
+                 "processing_time_ms": processing_time_ms
+             }
+         )
+
+     except Exception as e:
+         return JSONResponse(
+             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+             content={
+                 "error": "AGENT_TRANSLATION_ERROR",
+                 "message": "Failed to translate text using agent"
+             }
+         )
+
+
+ @router.get("/health")
+ async def health_check() -> JSONResponse:
+     """
+     Simple health check endpoint for translation service.
+
+     Returns:
+         Health status
+     """
+     return JSONResponse(
+         status_code=status.HTTP_200_OK,
+         content={
+             "status": "healthy",
+             "service": "translation",
+             "version": "2.0.0",
+             "features": ["openai_agents_sdk", "gemini_api", "translation_cache"]
+         }
+     )
+
+
+ @router.post("/cache/clear-expired")
+ async def clear_expired_cache(
+     current_user: Optional[User] = Depends(get_current_user_or_anonymous)
+ ) -> JSONResponse:
+     """
+     Clear expired cache entries.
+
+     Returns:
+         Number of cleared entries
+     """
+     try:
+         cleared_count = await cache_service.clear_expired_cache()
+         return JSONResponse(
+             status_code=status.HTTP_200_OK,
+             content={
+                 "message": f"Cleared {cleared_count} expired cache entries",
+                 "cleared_count": cleared_count
+             }
+         )
+     except Exception as e:
+         return JSONResponse(
+             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+             content={
+                 "error": "CACHE_CLEAR_ERROR",
+                 "message": "Failed to clear expired cache"
+             }
+         )
+
+
+ @router.post("/cache/clear-url")
+ async def clear_cache_by_url(
+     request: dict,
+     current_user: Optional[User] = Depends(get_current_user_or_anonymous)
+ ) -> JSONResponse:
+     """
+     Clear cache entries for a specific URL.
+
+     Args:
+         request: Dict containing 'url' and optional 'source_language' and 'target_language'
+
+     Returns:
+         Number of cleared entries
+     """
+     try:
+         url = request.get("url")
+         if not url:
+             return JSONResponse(
+                 status_code=status.HTTP_400_BAD_REQUEST,
+                 content={
+                     "error": "INVALID_REQUEST",
+                     "message": "URL is required"
+                 }
+             )
+
+         source_language = request.get("source_language")
+         target_language = request.get("target_language")
+
+         cleared_count = await cache_service.clear_cache_by_url(
+             page_url=url,
+             source_language=source_language,
+             target_language=target_language
+         )
+
+         return JSONResponse(
+             status_code=status.HTTP_200_OK,
+             content={
+                 "message": f"Cleared {cleared_count} cache entries for URL",
+                 "url": url,
+                 "cleared_count": cleared_count
+             }
+         )
+     except Exception as e:
+         return JSONResponse(
+             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+             content={
+                 "error": "CACHE_CLEAR_URL_ERROR",
+                 "message": "Failed to clear cache for URL"
+             }
+         )
+
+
+ @router.get("/cache/stats")
+ async def get_cache_stats(
+     current_user: Optional[User] = Depends(get_current_user_or_anonymous)
+ ) -> JSONResponse:
+     """
+     Get translation cache statistics.
+
+     Returns:
+         Cache statistics
+     """
+     try:
+         stats = await cache_service.get_cache_stats()
+         return JSONResponse(
+             status_code=status.HTTP_200_OK,
+             content=stats
+         )
+     except Exception as e:
+         return JSONResponse(
+             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+             content={
+                 "error": "CACHE_STATS_ERROR",
+                 "message": "Failed to retrieve cache statistics"
+             }
+         )
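A minimal sketch of calling the cached agent endpoint above. Only the `/translation` router prefix is visible in this hunk, so the `/api/v1` mount point is an assumption; a second identical request should come back with `"cached": true`.

```python
import httpx

resp = httpx.post(
    "http://localhost:7860/api/v1/translation/translate/agent",  # assumed mount prefix
    json={
        "text": "Humanoid robots perceive the world through sensors.",
        "source_language": "en",
        "target_language": "ur",
        "page_url": "https://example.com/docs/chapter-1",  # enables per-URL cache clearing
        "model": "gemini-2.0-flash-lite",
    },
    timeout=60,
)
data = resp.json()
print(data["translated_text"], data["cached"], data.get("processing_time_ms"))
```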
src/config/logging_config.py ADDED
@@ -0,0 +1,442 @@
+ """
+ Production-ready logging configuration.
+
+ Configures structured logging with multiple handlers, sensitive data filtering,
+ and integration with monitoring systems.
+ """
+
+ import re
+ import sys
+ import json
+ import asyncio
+ import logging
+ import logging.handlers
+ import functools
+ from typing import Dict, Any, List, Optional, Union
+ from pathlib import Path
+ from datetime import datetime
+ from contextvars import ContextVar
+
+ import structlog
+ from pythonjsonlogger import jsonlogger
+
+ from .translation_config import get_config
+
+
+ # Context variables for request tracking (suffixed *_var so the
+ # bind_context() parameters below do not shadow them)
+ request_id_var: ContextVar[Optional[str]] = ContextVar('request_id', default=None)
+ user_id_var: ContextVar[Optional[str]] = ContextVar('user_id', default=None)
+ session_id_var: ContextVar[Optional[str]] = ContextVar('session_id', default=None)
+
+
+ class SensitiveDataFilter(logging.Filter):
+     """Filter to mask sensitive data in log records."""
+
+     def __init__(self, sensitive_fields: Optional[List[str]] = None, mask_char: str = "*"):
+         super().__init__()
+         self.sensitive_fields = [field.lower() for field in (sensitive_fields or [])]
+         self.mask_char = mask_char
+
+     def filter(self, record):
+         """Filter sensitive data from log record."""
+         # Filter message
+         if hasattr(record, 'msg') and record.msg:
+             record.msg = self._mask_sensitive_data(str(record.msg))
+
+         # Filter args
+         if hasattr(record, 'args') and record.args:
+             record.args = tuple(
+                 self._mask_sensitive_data(str(arg)) if isinstance(arg, str) else arg
+                 for arg in record.args
+             )
+
+         # Filter extra attributes
+         for attr_name in dir(record):
+             if not attr_name.startswith('_') and attr_name not in {
+                 'name', 'msg', 'args', 'levelname', 'levelno', 'pathname',
+                 'filename', 'module', 'lineno', 'funcName', 'created',
+                 'msecs', 'relativeCreated', 'thread', 'threadName',
+                 'processName', 'process', 'getMessage', 'exc_info',
+                 'exc_text', 'stack_info'
+             }:
+                 attr_value = getattr(record, attr_name)
+                 if isinstance(attr_value, str):
+                     setattr(record, attr_name, self._mask_sensitive_data(attr_value))
+
+         return True
+
+     def _mask_sensitive_data(self, text: str) -> str:
+         """Mask sensitive data in text."""
+         # General patterns
+         patterns = [
+             (r'(?i)(api[_-]?key["\']?\s*[:=]\s*["\']?)([\w\-\.]+)', lambda m: f"{m.group(1)}{self.mask_char * len(m.group(2))}"),
+             (r'(?i)(password["\']?\s*[:=]\s*["\']?)([\w\-\.]+)', lambda m: f"{m.group(1)}{self.mask_char * len(m.group(2))}"),
+             (r'(?i)(token["\']?\s*[:=]\s*["\']?)([\w\-\.]+)', lambda m: f"{m.group(1)}{self.mask_char * len(m.group(2))}"),
+             (r'(?i)(secret["\']?\s*[:=]\s*["\']?)([\w\-\.]+)', lambda m: f"{m.group(1)}{self.mask_char * len(m.group(2))}"),
+             (r'(?i)(authorization["\']?\s*[:=]\s*["\']?)([\w\-\.]+)', lambda m: f"{m.group(1)}{self.mask_char * len(m.group(2))}"),
+             (r'(Bearer\s+)([\w\-\.]+)', lambda m: f"{m.group(1)}{self.mask_char * len(m.group(2))}"),
+         ]
+
+         # Custom field patterns
+         for field in self.sensitive_fields:
+             patterns.append(
+                 (rf'(?i)({field}["\']?\s*[:=]\s*["\']?)([\w\-\.]+)',
+                  lambda m, f=field: f"{m.group(1)}{self.mask_char * len(m.group(2))}")
+             )
+
+         # Apply patterns
+         for pattern, replacement in patterns:
+             text = re.sub(pattern, replacement, text)
+
+         return text
+
+
+ class ContextFilter(logging.Filter):
+     """Add context information to log records."""
+
+     def filter(self, record):
+         """Add context variables to log record."""
+         record.request_id = request_id_var.get()
+         record.user_id = user_id_var.get()
+         record.session_id = session_id_var.get()
+         return True
+
+
+ class JSONFormatter(jsonlogger.JsonFormatter):
+     """Custom JSON formatter with additional fields."""
+
+     def add_fields(self, log_record, record, message_dict):
+         """Add custom fields to JSON log record."""
+         super().add_fields(log_record, record, message_dict)
+
+         # Add timestamp
+         if not log_record.get('timestamp'):
+             log_record['timestamp'] = datetime.utcnow().isoformat()
+
+         # Add context
+         if hasattr(record, 'request_id') and record.request_id:
+             log_record['request_id'] = record.request_id
+         if hasattr(record, 'user_id') and record.user_id:
+             log_record['user_id'] = record.user_id
+         if hasattr(record, 'session_id') and record.session_id:
+             log_record['session_id'] = record.session_id
+
+         # Add exception details
+         if record.exc_info:
+             log_record['exception'] = {
+                 'type': record.exc_info[0].__name__,
+                 'message': str(record.exc_info[1]),
+                 'traceback': self.formatException(record.exc_info)
+             }
+
+         # Add source location
+         log_record['source'] = {
+             'file': record.filename,
+             'line': record.lineno,
+             'function': record.funcName,
+             'module': record.module
+         }
+
+
+ class ColoredFormatter(logging.Formatter):
+     """Colored formatter for console output."""
+
+     COLORS = {
+         'DEBUG': '\033[36m',     # Cyan
+         'INFO': '\033[32m',      # Green
+         'WARNING': '\033[33m',   # Yellow
+         'ERROR': '\033[31m',     # Red
+         'CRITICAL': '\033[35m',  # Magenta
+         'RESET': '\033[0m'       # Reset
+     }
+
+     def format(self, record):
+         """Format log record with colors."""
+         log_color = self.COLORS.get(record.levelname, self.COLORS['RESET'])
+         reset = self.COLORS['RESET']
+
+         # Add color to levelname
+         record.levelname = f"{log_color}{record.levelname}{reset}"
+
+         # Add request ID if present
+         if hasattr(record, 'request_id') and record.request_id:
+             record.msg = f"[{record.request_id[:8]}] {record.msg}"
+
+         return super().format(record)
+
+
+ def setup_logging() -> None:
+     """Setup logging configuration based on environment."""
+     config = get_config()
+
+     # Get root logger
+     root_logger = logging.getLogger()
+     root_logger.setLevel(getattr(logging, config.logging.level.value))
+
+     # Clear existing handlers
+     root_logger.handlers.clear()
+
+     # Create formatters
+     if config.logging.json_format:
+         formatter = JSONFormatter(
+             '%(asctime)s %(name)s %(levelname)s %(message)s'
+         )
+     else:
+         formatter = logging.Formatter(
+             config.logging.format,
+             datefmt='%Y-%m-%d %H:%M:%S'
+         )
+
+     # Console handler
+     console_handler = logging.StreamHandler(sys.stdout)
+     if config.logging.json_format:
+         console_handler.setFormatter(formatter)
+     else:
+         console_handler.setFormatter(ColoredFormatter(config.logging.format))
+     console_handler.addFilter(ContextFilter())
+     root_logger.addHandler(console_handler)
+
+     # File handler (if enabled)
+     if config.logging.file_logging:
+         setup_file_handler(root_logger, formatter, config)
+
+     # Apply sensitive data filter
+     if config.logging.filter_sensitive_data:
+         sensitive_filter = SensitiveDataFilter(config.logging.sensitive_fields)
+         for handler in root_logger.handlers:
+             handler.addFilter(sensitive_filter)
+
+     # Configure structlog (JSON renderer in JSON mode, console renderer otherwise)
+     renderer = (
+         structlog.processors.JSONRenderer()
+         if config.logging.json_format
+         else structlog.dev.ConsoleRenderer()
+     )
+     structlog.configure(
+         processors=[
+             structlog.stdlib.filter_by_level,
+             structlog.stdlib.add_logger_name,
+             structlog.stdlib.add_log_level,
+             structlog.stdlib.PositionalArgumentsFormatter(),
+             structlog.processors.TimeStamper(fmt="iso"),
+             structlog.processors.StackInfoRenderer(),
+             structlog.processors.format_exc_info,
+             structlog.processors.UnicodeDecoder(),
+             renderer,
+         ],
+         context_class=dict,
+         logger_factory=structlog.stdlib.LoggerFactory(),
+         wrapper_class=structlog.stdlib.BoundLogger,
+         cache_logger_on_first_use=True,
+     )
+
+     # Log configuration (stdlib logger, so interpolate rather than pass kwargs)
+     logger = logging.getLogger(__name__)
+     logger.info(
+         "Logging configured: level=%s json_format=%s file_logging=%s filter_sensitive=%s",
+         config.logging.level.value,
+         config.logging.json_format,
+         config.logging.file_logging,
+         config.logging.filter_sensitive_data,
+     )
+
+
+ def setup_file_handler(
+     logger: logging.Logger,
+     formatter: Union[logging.Formatter, JSONFormatter],
+     config
+ ) -> None:
+     """Setup file handler with rotation."""
+     # Create logs directory
+     log_path = Path(config.logging.file_path)
+     log_path.parent.mkdir(parents=True, exist_ok=True)
+
+     # Parse rotation settings
+     when = "midnight"
+     if config.logging.file_rotation.endswith(" day"):
+         when = "midnight"
+     elif config.logging.file_rotation.endswith(" hour"):
+         when = "H"
+     elif config.logging.file_rotation.endswith(" minute"):
+         when = "M"
+
+     # Parse backup count from retention
+     backup_count = 30  # Default
+     if "days" in config.logging.file_retention:
+         backup_count = int(config.logging.file_retention.split()[0])
+
+     # Create rotating file handler
+     try:
+         file_handler = logging.handlers.RotatingFileHandler(
+             filename=log_path,
+             maxBytes=_parse_size(config.logging.max_file_size),
+             backupCount=backup_count,
+             encoding='utf-8'
+         )
+     except Exception:
+         # Fallback to TimedRotatingFileHandler
+         file_handler = logging.handlers.TimedRotatingFileHandler(
+             filename=log_path,
+             when=when,
+             backupCount=backup_count,
+             encoding='utf-8'
+         )
+
+     file_handler.setFormatter(formatter)
+     file_handler.addFilter(ContextFilter())
+     logger.addHandler(file_handler)
+
+
+ def _parse_size(size_str: str) -> int:
+     """Parse a human-readable size string (e.g. "100 MB") to bytes."""
+     size_str = size_str.upper().strip()
+     # Check multi-character units first so "100 MB" is not matched by "B"
+     multipliers = {
+         'GB': 1024 ** 3,
+         'MB': 1024 ** 2,
+         'KB': 1024,
+         'B': 1,
+     }
+
+     for unit, multiplier in multipliers.items():
+         if size_str.endswith(unit):
+             return int(float(size_str[:-len(unit)]) * multiplier)
+
+     return int(size_str)
+
+
+ def bind_context(
+     request_id: Optional[str] = None,
+     user_id: Optional[str] = None,
+     session_id: Optional[str] = None
+ ) -> Dict[str, Any]:
+     """Bind context variables for logging."""
+     context = {}
+
+     if request_id:
+         request_id_var.set(request_id)
+         context['request_id'] = request_id
+
+     if user_id:
+         user_id_var.set(user_id)
+         context['user_id'] = user_id
+
+     if session_id:
+         session_id_var.set(session_id)
+         context['session_id'] = session_id
+
+     return context
+
+
+ def unbind_context() -> None:
+     """Clear all context variables."""
+     request_id_var.set(None)
+     user_id_var.set(None)
+     session_id_var.set(None)
+
+
+ class LogContext:
+     """Context manager for log context."""
+
+     def __init__(
+         self,
+         request_id: Optional[str] = None,
+         user_id: Optional[str] = None,
+         session_id: Optional[str] = None,
+         **kwargs
+     ):
+         # Build the desired context; variables are only set on __enter__
+         # so the previous values can be saved and restored correctly.
+         self.context = {}
+         if request_id:
+             self.context['request_id'] = request_id
+         if user_id:
+             self.context['user_id'] = user_id
+         if session_id:
+             self.context['session_id'] = session_id
+         self.context.update(kwargs)
+         self.old_context = {}
+
+     def __enter__(self):
+         # Store old context, then bind the new values
+         for key, value in self.context.items():
+             var = globals().get(f"{key}_var")
+             if var:
+                 self.old_context[key] = var.get()
+                 var.set(value)
+
+         return self.context
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         # Restore old context
+         for key, value in self.old_context.items():
+             var = globals().get(f"{key}_var")
+             if var:
+                 var.set(value)
+
+
+ def log_function_call(func):
+     """Decorator to log function calls."""
+
+     @functools.wraps(func)
+     def wrapper(*args, **kwargs):
+         logger = logging.getLogger(func.__module__)
+         logger.debug("Function called: %s (args=%d, kwargs=%s)",
+                      func.__name__, len(args), list(kwargs.keys()))
+         try:
+             result = func(*args, **kwargs)
+             logger.debug("Function completed: %s", func.__name__)
+             return result
+         except Exception as e:
+             logger.error("Function failed: %s (%s: %s)",
+                          func.__name__, type(e).__name__, e)
+             raise
+
+     @functools.wraps(func)
+     async def async_wrapper(*args, **kwargs):
+         logger = logging.getLogger(func.__module__)
+         logger.debug("Async function called: %s (args=%d, kwargs=%s)",
+                      func.__name__, len(args), list(kwargs.keys()))
+         try:
+             result = await func(*args, **kwargs)
+             logger.debug("Async function completed: %s", func.__name__)
+             return result
+         except Exception as e:
+             logger.error("Async function failed: %s (%s: %s)",
+                          func.__name__, type(e).__name__, e)
+             raise
+
+     return async_wrapper if asyncio.iscoroutinefunction(func) else wrapper
+
+
+ # Initialize logging on import
+ setup_logging()
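Since setup_logging() runs at import time, a minimal sketch of the intended usage is just importing the module and scoping a request with LogContext; the request ID here is an assumption standing in for whatever your request middleware generates.

```python
import logging
from src.config.logging_config import LogContext

logger = logging.getLogger("translation.request")

with LogContext(request_id="req-1234abcd", user_id="user-42"):
    # ContextFilter attaches request_id/user_id to every record emitted here,
    # so the JSON formatter can include them automatically.
    logger.info("translation requested")
# On exit, the previous context values are restored.
```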
src/config/translation_config.py ADDED
@@ -0,0 +1,432 @@
1
+ """
2
+ Translation Service Configuration Management.
3
+
4
+ Centralized configuration for the OpenAI Translation Service with
5
+ environment-based overrides and validation.
6
+ """
7
+
8
+ import os
9
+ import json
10
+ import yaml
11
+ from typing import Dict, Any, Optional, Union, List
12
+ from dataclasses import dataclass, field, asdict
13
+ from pathlib import Path
14
+ from enum import Enum
15
+
16
+ from pydantic import BaseModel, Field, validator
17
+ from src.utils.translation_logger import get_translation_logger
18
+
19
+ logger = get_translation_logger(__name__)
20
+
21
+
22
+ class LogLevel(str, Enum):
23
+ """Log levels for the translation service."""
24
+ DEBUG = "DEBUG"
25
+ INFO = "INFO"
26
+ WARNING = "WARNING"
27
+ ERROR = "ERROR"
28
+ CRITICAL = "CRITICAL"
29
+
30
+
31
+ class Environment(str, Enum):
32
+ """Environment types."""
33
+ DEVELOPMENT = "development"
34
+ TESTING = "testing"
35
+ STAGING = "staging"
36
+ PRODUCTION = "production"
37
+
38
+
39
+ class CacheBackend(str, Enum):
40
+ """Cache backend types."""
41
+ MEMORY = "memory"
42
+ REDIS = "redis"
43
+ DATABASE = "database"
44
+
45
+
46
+ @dataclass
47
+ class GeminiConfig:
48
+ """Configuration for Gemini API."""
49
+ api_key: str = field(default_factory=lambda: os.getenv("GEMINI_API_KEY", ""))
50
+ base_url: str = field(
51
+ default_factory=lambda: os.getenv(
52
+ "GEMINI_BASE_URL",
53
+ "https://generativelanguage.googleapis.com/v1beta/openai/"
54
+ )
55
+ )
56
+ default_model: str = field(
57
+ default_factory=lambda: os.getenv("GEMINI_MODEL", "gemini-2.0-flash-lite")
58
+ )
59
+ organization: Optional[str] = field(default_factory=lambda: os.getenv("OPENAI_ORGANIZATION"))
60
+
61
+ # Connection settings
62
+ timeout: float = field(default_factory=lambda: float(os.getenv("GEMINI_TIMEOUT", "60")))
63
+ max_retries: int = field(default_factory=lambda: int(os.getenv("GEMINI_MAX_RETRIES", "3")))
64
+ retry_delay: float = field(default_factory=lambda: float(os.getenv("GEMINI_RETRY_DELAY", "1.0")))
65
+
66
+ # Advanced settings
67
+ proxy: Optional[str] = field(default_factory=lambda: os.getenv("HTTP_PROXY"))
68
+ custom_headers: Dict[str, str] = field(default_factory=dict)
69
+ http2: bool = field(default_factory=lambda: os.getenv("GEMINI_HTTP2", "true").lower() == "true")
70
+
71
+ # Rate limiting
72
+ requests_per_minute: int = field(default_factory=lambda: int(os.getenv("GEMINI_RPM", "60")))
73
+ requests_per_hour: int = field(default_factory=lambda: int(os.getenv("GEMINI_RPH", "1000")))
74
+
75
+ # Model pricing (USD per 1M tokens)
76
+ pricing: Dict[str, Dict[str, float]] = field(default_factory=lambda: {
77
+ "gemini-2.0-flash-lite": {"input": 0.000075, "output": 0.00015},
78
+ "gemini-2.5-pro": {"input": 0.00125, "output": 0.00375}
79
+ })
80
+
81
+
82
+ @dataclass
83
+ class OpenAIAgentsConfig:
84
+ """Configuration for OpenAI Agents SDK."""
85
+ enabled: bool = field(default_factory=lambda: os.getenv("OPENAI_AGENTS_ENABLED", "true").lower() == "true")
86
+ enable_tracing: bool = field(default_factory=lambda: os.getenv("OPENAI_AGENTS_TRACING", "false").lower() == "true")
87
+ verbose_logging: bool = field(default_factory=lambda: os.getenv("OPENAI_AGENTS_VERBOSE", "false").lower() == "true")
88
+
89
+ # Agent settings
90
+ default_temperature: float = field(default_factory=lambda: float(os.getenv("AGENT_DEFAULT_TEMPERATURE", "0.3")))
91
+ default_max_tokens: int = field(default_factory=lambda: int(os.getenv("AGENT_MAX_TOKENS", "2048")))
92
+ max_turns: int = field(default_factory=lambda: int(os.getenv("AGENT_MAX_TURNS", "5")))
93
+
94
+ # Tool settings
95
+ enable_html_tool: bool = field(default_factory=lambda: os.getenv("AGENT_HTML_TOOL", "true").lower() == "true")
96
+ enable_code_tool: bool = field(default_factory=lambda: os.getenv("AGENT_CODE_TOOL", "true").lower() == "true")
97
+ enable_quality_tool: bool = field(default_factory=lambda: os.getenv("AGENT_QUALITY_TOOL", "true").lower() == "true")
98
+
99
+ # Quality settings
100
+ quality_check_enabled: bool = field(default_factory=lambda: os.getenv("AGENT_QUALITY_CHECK", "true").lower() == "true")
101
+ confidence_threshold: float = field(default_factory=lambda: float(os.getenv("AGENT_CONFIDENCE_THRESHOLD", "0.8")))
102
+
103
+
104
+ @dataclass
105
+ class CacheConfig:
106
+ """Configuration for caching."""
107
+ backend: CacheBackend = field(
108
+ default_factory=lambda: CacheBackend(os.getenv("CACHE_BACKEND", "memory"))
109
+ )
110
+
111
+ # TTL settings
112
+ default_ttl_hours: int = field(default_factory=lambda: int(os.getenv("CACHE_DEFAULT_TTL", "168"))) # 7 days
113
+ high_quality_ttl_hours: int = field(default_factory=lambda: int(os.getenv("CACHE_HIGH_QUALITY_TTL", "720"))) # 30 days
114
+ low_quality_ttl_hours: int = field(default_factory=lambda: int(os.getenv("CACHE_LOW_QUALITY_TTL", "24"))) # 1 day
115
+
116
+ # Redis settings
117
+ redis_url: str = field(default_factory=lambda: os.getenv("REDIS_URL", "redis://localhost:6379"))
118
+ redis_prefix: str = field(default_factory=lambda: os.getenv("REDIS_PREFIX", "translation:"))
119
+ redis_max_connections: int = field(default_factory=lambda: int(os.getenv("REDIS_MAX_CONNECTIONS", "10")))
120
+
121
+ # Memory cache settings
122
+ memory_max_size: int = field(default_factory=lambda: int(os.getenv("CACHE_MEMORY_MAX_SIZE", "1000")))
123
+ memory_cleanup_interval: int = field(default_factory=lambda: int(os.getenv("CACHE_CLEANUP_INTERVAL", "3600")))
124
+
125
+
126
+ @dataclass
127
+ class DatabaseConfig:
128
+ """Configuration for database connections."""
129
+ url: str = field(default_factory=lambda: os.getenv(
130
+ "DATABASE_URL",
131
+ "sqlite:///./translation.db"
132
+ ))
133
+ pool_size: int = field(default_factory=lambda: int(os.getenv("DB_POOL_SIZE", "5")))
134
+ max_overflow: int = field(default_factory=lambda: int(os.getenv("DB_MAX_OVERFLOW", "10")))
135
+ pool_timeout: int = field(default_factory=lambda: int(os.getenv("DB_POOL_TIMEOUT", "30")))
136
+ pool_recycle: int = field(default_factory=lambda: int(os.getenv("DB_POOL_RECYCLE", "3600")))
137
+
138
+ # Migration settings
139
+ auto_migrate: bool = field(default_factory=lambda: os.getenv("DB_AUTO_MIGRATE", "true").lower() == "true")
140
+ migration_timeout: int = field(default_factory=lambda: int(os.getenv("DB_MIGRATION_TIMEOUT", "300")))
141
+
142
+
143
+ @dataclass
144
+ class LoggingConfig:
145
+ """Configuration for logging."""
146
+ level: LogLevel = field(default_factory=lambda: LogLevel(os.getenv("LOG_LEVEL", "INFO")))
147
+ format: str = field(
148
+ default_factory=lambda: os.getenv(
149
+ "LOG_FORMAT",
150
+ "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
151
+ )
152
+ )
153
+
154
+ # File logging
155
+ file_logging: bool = field(default_factory=lambda: os.getenv("LOG_FILE_ENABLED", "true").lower() == "true")
156
+ file_path: str = field(default_factory=lambda: os.getenv("LOG_FILE_PATH", "logs/translation.log"))
157
+ file_rotation: str = field(default_factory=lambda: os.getenv("LOG_FILE_ROTATION", "1 day"))
158
+ file_retention: str = field(default_factory=lambda: os.getenv("LOG_FILE_RETENTION", "30 days"))
159
+ max_file_size: str = field(default_factory=lambda: os.getenv("LOG_MAX_FILE_SIZE", "100 MB"))
160
+
161
+ # Structured logging
162
+ json_format: bool = field(default_factory=lambda: os.getenv("LOG_JSON_FORMAT", "false").lower() == "true")
163
+ include_request_id: bool = field(default_factory=lambda: os.getenv("LOG_INCLUDE_REQUEST_ID", "true").lower() == "true")
164
+
165
+ # Sensitive data filtering
166
+ filter_sensitive_data: bool = field(default_factory=lambda: os.getenv("LOG_FILTER_SENSITIVE", "true").lower() == "true")
167
+ sensitive_fields: List[str] = field(default_factory=lambda: [
168
+ "api_key", "password", "token", "authorization"
169
+ ])
170
+
171
+
172
+ @dataclass
173
+ class RateLimitConfig:
174
+ """Configuration for rate limiting."""
175
+ enabled: bool = field(default_factory=lambda: os.getenv("RATE_LIMIT_ENABLED", "true").lower() == "true")
176
+
177
+ # Global limits
178
+ requests_per_minute: int = field(default_factory=lambda: int(os.getenv("RATE_LIMIT_RPM", "60")))
179
+ requests_per_hour: int = field(default_factory=lambda: int(os.getenv("RATE_LIMIT_RPH", "1000")))
180
+ requests_per_day: int = field(default_factory=lambda: int(os.getenv("RATE_LIMIT_RPD", "10000")))
181
+
182
+ # Translation-specific limits
183
+ translation_rpm: int = field(default_factory=lambda: int(os.getenv("TRANSLATION_RPM", "10")))
184
+ translation_rph: int = field(default_factory=lambda: int(os.getenv("TRANSLATION_RPH", "500")))
185
+
186
+ # Enforcement
187
+ block_duration: int = field(default_factory=lambda: int(os.getenv("RATE_LIMIT_BLOCK_DURATION", "3600")))
188
+ warning_threshold: float = field(default_factory=lambda: float(os.getenv("RATE_LIMIT_WARNING_THRESHOLD", "0.8")))
189
+
190
+ # Redis backend for distributed limiting
191
+ redis_backend: bool = field(default_factory=lambda: os.getenv("RATE_LIMIT_REDIS", "false").lower() == "true")
192
+
193
+
194
+ @dataclass
195
+ class SecurityConfig:
196
+ """Configuration for security settings."""
197
+ # API key validation
198
+ require_api_key: bool = field(default_factory=lambda: os.getenv("SECURITY_REQUIRE_API_KEY", "false").lower() == "true")
199
+ api_key_header: str = field(default_factory=lambda: os.getenv("SECURITY_API_KEY_HEADER", "X-API-Key"))
200
+
201
+ # Request validation
202
+ max_text_length: int = field(default_factory=lambda: int(os.getenv("SECURITY_MAX_TEXT_LENGTH", "100000")))
203
+ max_chunks: int = field(default_factory=lambda: int(os.getenv("SECURITY_MAX_CHUNKS", "100")))
204
+
205
+ # CORS settings
206
+ cors_origins: List[str] = field(default_factory=lambda: os.getenv("CORS_ORIGINS", "*").split(","))
207
+ cors_methods: List[str] = field(default_factory=lambda: os.getenv("CORS_METHODS", "GET,POST").split(","))
208
+ cors_headers: List[str] = field(default_factory=lambda: os.getenv("CORS_HEADERS", "*").split(","))
209
+
210
+ # Content filtering
211
+ enable_content_filter: bool = field(default_factory=lambda: os.getenv("SECURITY_CONTENT_FILTER", "true").lower() == "true")
212
+ blocked_patterns: List[str] = field(default_factory=lambda: os.getenv(
213
+ "SECURITY_BLOCKED_PATTERNS",
214
+ ""
215
+ ).split(",") if os.getenv("SECURITY_BLOCKED_PATTERNS") else [])
216
+
217
+ # IP-based restrictions
218
+ ip_whitelist: List[str] = field(default_factory=lambda: os.getenv("SECURITY_IP_WHITELIST", "").split(","))
219
+ ip_blacklist: List[str] = field(default_factory=lambda: os.getenv("SECURITY_IP_BLACKLIST", "").split(","))
220
+
221
+
222
+ @dataclass
223
+ class MonitoringConfig:
224
+ """Configuration for monitoring and metrics."""
225
+ enabled: bool = field(default_factory=lambda: os.getenv("MONITORING_ENABLED", "true").lower() == "true")
226
+
227
+ # Metrics
228
+ metrics_endpoint: str = field(default_factory=lambda: os.getenv("METRICS_ENDPOINT", "/metrics"))
229
+ metrics_port: int = field(default_factory=lambda: int(os.getenv("METRICS_PORT", "9090")))
230
+
231
+ # Health checks
232
+ health_endpoint: str = field(default_factory=lambda: os.getenv("HEALTH_ENDPOINT", "/health"))
233
+ detailed_health: bool = field(default_factory=lambda: os.getenv("HEALTH_DETAILED", "true").lower() == "true")
234
+
235
+ # Performance tracking
236
+ track_performance: bool = field(default_factory=lambda: os.getenv("TRACK_PERFORMANCE", "true").lower() == "true")
237
+ slow_query_threshold_ms: int = field(default_factory=lambda: int(os.getenv("SLOW_QUERY_THRESHOLD", "1000")))
238
+
239
+ # Error tracking
240
+ track_errors: bool = field(default_factory=lambda: os.getenv("TRACK_ERRORS", "true").lower() == "true")
241
+ error_sample_rate: float = field(default_factory=lambda: float(os.getenv("ERROR_SAMPLE_RATE", "1.0")))
242
+
243
+ # External integrations
244
+ sentry_dsn: Optional[str] = field(default_factory=lambda: os.getenv("SENTRY_DSN"))
245
+ prometheus_gateway: Optional[str] = field(default_factory=lambda: os.getenv("PROMETHEUS_GATEWAY"))
246
+
247
+
248
+ class TranslationConfig(BaseModel):
249
+ """Main configuration for the translation service."""
250
+ environment: Environment = Field(default=Environment.DEVELOPMENT)
251
+ debug: bool = Field(default=False)
252
+
253
+ # Component configurations
254
+ gemini: GeminiConfig = Field(default_factory=GeminiConfig)
255
+ openai_agents: OpenAIAgentsConfig = Field(default_factory=OpenAIAgentsConfig)
256
+ cache: CacheConfig = Field(default_factory=CacheConfig)
257
+ database: DatabaseConfig = Field(default_factory=DatabaseConfig)
258
+ logging: LoggingConfig = Field(default_factory=LoggingConfig)
259
+ rate_limit: RateLimitConfig = Field(default_factory=RateLimitConfig)
260
+ security: SecurityConfig = Field(default_factory=SecurityConfig)
261
+ monitoring: MonitoringConfig = Field(default_factory=MonitoringConfig)
262
+
263
+ # Feature flags
264
+ features: Dict[str, bool] = Field(default_factory=lambda: {
265
+ "streaming": True,
266
+ "quality_check": True,
267
+ "chunking": True,
268
+ "code_preservation": True,
269
+ "html_preservation": True,
270
+ "batch_translation": True
271
+ })
272
+
273
+ class Config:
274
+ env_file = ".env"
275
+ env_file_encoding = "utf-8"
276
+ case_sensitive = False
277
+
278
+ @validator("environment", pre=True)
279
+ def parse_environment(cls, v):
280
+ """Parse environment from string."""
281
+ if isinstance(v, str):
282
+ return Environment(v.lower())
283
+ return v
284
+
285
+ def __init__(self, **data):
286
+ """Initialize configuration with environment detection."""
287
+ # Auto-detect environment if not specified
288
+ if "environment" not in data:
289
+ env = os.getenv("ENVIRONMENT", os.getenv("ENV", "development")).lower()
290
+ data["environment"] = Environment(env)
291
+
292
+ # Set debug flag based on environment
293
+ if "debug" not in data:
294
+ data["debug"] = data["environment"] == Environment.DEVELOPMENT
295
+
296
+ super().__init__(**data)
297
+
298
+ # Validate configuration
299
+ self.validate_config()
300
+
301
+ def validate_config(self) -> None:
302
+ """Validate the configuration."""
303
+ errors = []
304
+
305
+ # Validate Gemini configuration
306
+ if not self.gemini.api_key:
307
+ errors.append("GEMINI_API_KEY is required")
308
+
309
+ if self.gemini.timeout <= 0:
310
+ errors.append("GEMINI_TIMEOUT must be positive")
311
+
312
+ if self.gemini.max_retries < 0:
313
+ errors.append("GEMINI_MAX_RETRIES must be non-negative")
314
+
315
+ # Validate database URL if provided
316
+ if self.database.url and not self.database.url.startswith(("sqlite://", "postgresql://", "mysql://")):
317
+ errors.append("DATABASE_URL must be a valid database connection string")
318
+
319
+ # Validate cache configuration
320
+ if self.cache.backend == CacheBackend.REDIS and not self.cache.redis_url:
321
+ errors.append("REDIS_URL is required when using Redis cache backend")
322
+
323
+ # Validate rate limits
324
+ if self.rate_limit.requests_per_minute <= 0:
325
+ errors.append("RATE_LIMIT_RPM must be positive")
326
+
327
+ # Log errors and raise if any
328
+ if errors:
329
+ for error in errors:
330
+ logger.error(f"Configuration validation error: {error}")
331
+ raise ValueError(f"Configuration validation failed: {'; '.join(errors)}")
332
+
333
+ logger.info("Configuration validated successfully", environment=self.environment.value)
334
+
335
+ @classmethod
336
+ def from_file(cls, config_path: Union[str, Path]) -> "TranslationConfig":
337
+ """Load configuration from file."""
338
+ config_path = Path(config_path)
339
+
340
+ if not config_path.exists():
341
+ raise FileNotFoundError(f"Configuration file not found: {config_path}")
342
+
343
+ # Parse based on file extension
344
+ with open(config_path, "r", encoding="utf-8") as f:
345
+ if config_path.suffix.lower() in [".yaml", ".yml"]:
346
+ data = yaml.safe_load(f)
347
+ elif config_path.suffix.lower() == ".json":
348
+ data = json.load(f)
349
+ else:
350
+ raise ValueError(f"Unsupported configuration file format: {config_path.suffix}")
351
+
352
+ # Override with environment variables
353
+ return cls(**data)
354
+
355
+ def to_dict(self) -> Dict[str, Any]:
356
+ """Convert configuration to dictionary."""
357
+ return {
358
+ "environment": self.environment.value,
359
+ "debug": self.debug,
360
+ "gemini": asdict(self.gemini),
361
+ "openai_agents": asdict(self.openai_agents),
362
+ "cache": asdict(self.cache),
363
+ "database": asdict(self.database),
364
+ "logging": {
365
+ **asdict(self.logging),
366
+ "level": self.logging.level.value
367
+ },
368
+ "rate_limit": asdict(self.rate_limit),
369
+ "security": asdict(self.security),
370
+ "monitoring": asdict(self.monitoring),
371
+ "features": self.features
372
+ }
373
+
374
+ def save_to_file(self, config_path: Union[str, Path]) -> None:
375
+ """Save configuration to file."""
376
+ config_path = Path(config_path)
377
+ config_path.parent.mkdir(parents=True, exist_ok=True)
378
+
379
+ data = self.to_dict()
380
+
381
+ with open(config_path, "w", encoding="utf-8") as f:
382
+ if config_path.suffix.lower() in [".yaml", ".yml"]:
383
+ yaml.dump(data, f, default_flow_style=False, indent=2)
384
+ elif config_path.suffix.lower() == ".json":
385
+ json.dump(data, f, indent=2)
386
+ else:
387
+ raise ValueError(f"Unsupported configuration file format: {config_path.suffix}")
388
+
389
+ logger.info(f"Configuration saved to {config_path}")
390
+
391
+ def get_model_pricing(self, model: str) -> Dict[str, float]:
392
+ """Get pricing for a specific model."""
393
+ return self.gemini.pricing.get(model, self.gemini.pricing["gemini-2.0-flash-lite"])
394
+
395
+ def is_feature_enabled(self, feature: str) -> bool:
396
+ """Check if a feature is enabled."""
397
+ return self.features.get(feature, False)
398
+
399
+ def should_use_agents(self) -> bool:
400
+ """Determine if OpenAI Agents SDK should be used."""
401
+ return self.openai_agents.enabled and self.is_feature_enabled("quality_check")
402
+
403
+
404
+ # Global configuration instance
405
+ _config: Optional[TranslationConfig] = None
406
+
407
+
408
+ def get_config() -> TranslationConfig:
409
+ """Get the global configuration instance."""
410
+ global _config
411
+ if _config is None:
412
+ _config = TranslationConfig()
413
+ return _config
414
+
415
+
416
+ def load_config(config_path: Optional[Union[str, Path]] = None) -> TranslationConfig:
417
+ """Load configuration from file or environment."""
418
+ global _config
419
+
420
+ if config_path:
421
+ _config = TranslationConfig.from_file(config_path)
422
+ else:
423
+ _config = TranslationConfig()
424
+
425
+ return _config
426
+
427
+
428
+ def reload_config() -> TranslationConfig:
429
+ """Reload configuration from environment."""
430
+ global _config
431
+ _config = TranslationConfig()
432
+ return _config
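
For orientation, a minimal usage sketch of the three accessors above, assuming the module is importable as src.config.translation_config (the YAML path is illustrative):

from src.config.translation_config import get_config, load_config, reload_config

config = get_config()  # lazily builds the singleton from environment variables
if config.is_feature_enabled("quality_check"):
    pricing = config.get_model_pricing("gemini-2.0-flash-lite")

config = load_config("config/translation.yaml")  # or hydrate from a YAML/JSON file (hypothetical path)
config = reload_config()  # re-read the environment after variables change
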
src/database/base.py CHANGED
@@ -7,7 +7,7 @@ from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import sessionmaker
 import os
 
- # Create the declarative base
+ # Create the declarative base - this will be the single source of truth
 Base = declarative_base()
 
 # Database URL from environment
src/middleware/auth.py ADDED
@@ -0,0 +1,302 @@
1
+ """
2
+ Authentication middleware for API routes.
3
+
4
+ This module provides JWT-based authentication middleware for protecting API endpoints.
5
+ """
6
+
7
+ import os
+ from datetime import datetime, timedelta
8
+ from typing import Optional, Dict, Any
9
+
10
+ from fastapi import HTTPException, status, Depends
11
+ from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
12
+ from jose import JWTError, jwt
13
+ from passlib.context import CryptContext
14
+ from sqlalchemy.orm import Session
15
+
16
+ from src.database.base import get_db
17
+ from src.models.auth import User
18
+
19
+ # Configuration
20
+ SECRET_KEY = os.getenv("JWT_SECRET_KEY", "your-secret-key-here") # Loaded from environment; override in production
21
+ ALGORITHM = "HS256"
22
+ ACCESS_TOKEN_EXPIRE_MINUTES = 30
23
+
24
+ # Password hashing
25
+ pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")
26
+
27
+ # Security scheme for FastAPI
28
+ security = HTTPBearer(auto_error=False)
29
+
30
+
31
+ def verify_password(plain_password: str, hashed_password: str) -> bool:
32
+ """Verify a password against its hash."""
33
+ return pwd_context.verify(plain_password, hashed_password)
34
+
35
+
36
+ def get_password_hash(password: str) -> str:
37
+ """Generate password hash."""
38
+ return pwd_context.hash(password)
39
+
40
+
41
+ def create_access_token(data: dict, expires_delta: Optional[timedelta] = None) -> str:
42
+ """Create JWT access token."""
43
+ to_encode = data.copy()
44
+ if expires_delta:
45
+ expire = datetime.utcnow() + expires_delta
46
+ else:
47
+ expire = datetime.utcnow() + timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES)
48
+
49
+ to_encode.update({"exp": expire})
50
+ encoded_jwt = jwt.encode(to_encode, SECRET_KEY, algorithm=ALGORITHM)
51
+ return encoded_jwt
52
+
53
+
54
+ def decode_token(token: str) -> Dict[str, Any]:
55
+ """Decode and validate JWT token."""
56
+ try:
57
+ payload = jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
58
+ return payload
59
+ except JWTError as e:
60
+ raise HTTPException(
61
+ status_code=status.HTTP_401_UNAUTHORIZED,
62
+ detail=f"Could not validate credentials: {str(e)}",
63
+ headers={"WWW-Authenticate": "Bearer"},
64
+ )
65
+
66
+
67
+ async def get_current_user(
68
+ credentials: Optional[HTTPAuthorizationCredentials] = Depends(security),
69
+ db: Session = Depends(get_db)
70
+ ) -> User:
71
+ """Get the current authenticated user."""
72
+ if not credentials:
73
+ raise HTTPException(
74
+ status_code=status.HTTP_401_UNAUTHORIZED,
75
+ detail="Not authenticated",
76
+ headers={"WWW-Authenticate": "Bearer"},
77
+ )
78
+
79
+ token = credentials.credentials
80
+ payload = decode_token(token)
81
+
82
+ user_id: str = payload.get("sub")
83
+ if user_id is None:
84
+ raise HTTPException(
85
+ status_code=status.HTTP_401_UNAUTHORIZED,
86
+ detail="Could not validate credentials",
87
+ headers={"WWW-Authenticate": "Bearer"},
88
+ )
89
+
90
+ user = db.query(User).filter(User.id == user_id).first()
91
+ if user is None:
92
+ raise HTTPException(
93
+ status_code=status.HTTP_401_UNAUTHORIZED,
94
+ detail="User not found",
95
+ headers={"WWW-Authenticate": "Bearer"},
96
+ )
97
+
98
+ return user
99
+
100
+
101
+ async def get_current_active_user(
102
+ current_user: User = Depends(get_current_user)
103
+ ) -> User:
104
+ """Get the current active user."""
105
+ if not current_user.is_active:
106
+ raise HTTPException(
107
+ status_code=status.HTTP_400_BAD_REQUEST,
108
+ detail="Inactive user"
109
+ )
110
+ return current_user
111
+
112
+
113
+ async def get_optional_current_user(
114
+ credentials: Optional[HTTPAuthorizationCredentials] = Depends(security),
115
+ db: Session = Depends(get_db)
116
+ ) -> Optional[User]:
117
+ """Get the current user if authenticated, otherwise return None."""
118
+ if not credentials:
119
+ return None
120
+
121
+ try:
122
+ token = credentials.credentials
123
+ payload = decode_token(token)
124
+
125
+ user_id: str = payload.get("sub")
126
+ if user_id is None:
127
+ return None
128
+
129
+ user = db.query(User).filter(User.id == user_id).first()
130
+ return user if user and user.is_active else None
131
+
132
+ except HTTPException:
133
+ return None
134
+
135
+
136
+ # Role-based access control
137
+ class RoleChecker:
138
+ """Check if user has required role."""
139
+
140
+ def __init__(self, allowed_roles: list):
141
+ self.allowed_roles = allowed_roles
142
+
143
+ def __call__(self, current_user: User = Depends(get_current_active_user)) -> User:
144
+ if current_user.role not in self.allowed_roles:
145
+ raise HTTPException(
146
+ status_code=status.HTTP_403_FORBIDDEN,
147
+ detail="Not enough permissions"
148
+ )
149
+ return current_user
150
+
151
+
152
+ # Pre-defined role checkers
153
+ require_admin = RoleChecker(["admin"])
154
+ require_user = RoleChecker(["user", "admin"])
155
+
156
+
157
+ # Authentication dependencies
158
+ def authenticate_user(email: str, password: str, db: Session) -> Optional[User]:
159
+ """Authenticate user with email and password."""
160
+ user = db.query(User).filter(User.email == email).first()
161
+ if not user:
162
+ return None
163
+ if not verify_password(password, user.hashed_password):
164
+ return None
165
+ return user
166
+
167
+
168
+ # Rate limiting middleware
169
+ from slowapi import Limiter
170
+ from slowapi.util import get_remote_address
171
+ from slowapi.errors import RateLimitExceeded
172
+
173
+ limiter = Limiter(key_func=get_remote_address)
174
+
175
+
176
+ class RateLimitMiddleware:
177
+ """Rate limiting middleware for API endpoints."""
178
+
179
+ def __init__(self, times: int, seconds: int):
+ self.times = times
+ self.seconds = seconds
+
+ def __call__(self, endpoint):
+ return limiter.limit(f"{self.times} per {self.seconds} seconds")(endpoint)
185
+
186
+
187
+ # Pre-defined rate limiters
188
+ auth_rate_limit = RateLimitMiddleware(5, 60) # 5 requests per minute
+ general_rate_limit = RateLimitMiddleware(100, 60) # 100 requests per minute
+ upload_rate_limit = RateLimitMiddleware(10, 60) # 10 requests per minute
191
+
192
+
193
+ # CORS middleware configuration
194
+ from fastapi.middleware.cors import CORSMiddleware
195
+
196
+ def create_cors_middleware(allow_origins: list = None) -> CORSMiddleware:
197
+ """Create CORS middleware with specified origins."""
198
+ return CORSMiddleware(
199
+ allow_origins=allow_origins or ["http://localhost:3000"],
200
+ allow_credentials=True,
201
+ allow_methods=["*"],
202
+ allow_headers=["*"],
203
+ )
204
+
205
+
206
+ # Request logging middleware
207
+ import logging
208
+ import time
209
+ from fastapi import Request, Response
210
+
211
+ logger = logging.getLogger(__name__)
212
+
213
+ async def log_requests(request: Request, call_next):
214
+ """Log all API requests with timing."""
215
+ start_time = time.time()
216
+
217
+ # Get client IP
218
+ client_ip = request.client.host if request.client else "unknown"
219
+
220
+ # Get user if authenticated
221
+ user = getattr(request.state, 'user', None)
222
+ user_id = user.id if user else "anonymous"
223
+
224
+ # Log request
225
+ logger.info(
226
+ f"Request started",
227
+ extra={
228
+ "method": request.method,
229
+ "url": str(request.url),
230
+ "client_ip": client_ip,
231
+ "user_id": user_id,
232
+ "headers": dict(request.headers),
233
+ }
234
+ )
235
+
236
+ # Process request
237
+ response = await call_next(request)
238
+
239
+ # Calculate duration
240
+ process_time = time.time() - start_time
241
+
242
+ # Log response
243
+ logger.info(
244
+ f"Request completed",
245
+ extra={
246
+ "method": request.method,
247
+ "url": str(request.url),
248
+ "status_code": response.status_code,
249
+ "process_time": process_time,
250
+ "client_ip": client_ip,
251
+ "user_id": user_id,
252
+ }
253
+ )
254
+
255
+ # Add timing header
256
+ response.headers["X-Process-Time"] = str(process_time)
257
+
258
+ return response
259
+
260
+
261
+ # Security headers middleware
262
+ async def add_security_headers(request: Request, call_next):
263
+ """Add security headers to responses."""
264
+ response = await call_next(request)
265
+
266
+ # Add security headers
267
+ response.headers["X-Content-Type-Options"] = "nosniff"
268
+ response.headers["X-Frame-Options"] = "DENY"
269
+ response.headers["X-XSS-Protection"] = "1; mode=block"
270
+ response.headers["Strict-Transport-Security"] = "max-age=31536000; includeSubDomains"
271
+ response.headers["Referrer-Policy"] = "strict-origin-when-cross-origin"
272
+ response.headers["Content-Security-Policy"] = "default-src 'self'"
273
+
274
+ return response
275
+
276
+
277
+ # Token refresh endpoint dependencies
278
+ def create_refresh_token(data: dict) -> str:
279
+ """Create a refresh token with longer expiry."""
280
+ to_encode = data.copy()
281
+ expire = datetime.utcnow() + timedelta(days=7) # 7 days
282
+ to_encode.update({"exp": expire, "type": "refresh"})
283
+ return jwt.encode(to_encode, SECRET_KEY, algorithm=ALGORITHM)
284
+
285
+
286
+ async def verify_refresh_token(token: str) -> Dict[str, Any]:
287
+ """Verify refresh token and return payload."""
288
+ try:
289
+ payload = decode_token(token)
290
+ if payload.get("type") != "refresh":
291
+ raise HTTPException(
292
+ status_code=status.HTTP_401_UNAUTHORIZED,
293
+ detail="Invalid refresh token"
294
+ )
295
+ return payload
296
+ except HTTPException:
297
+ raise
298
+ except Exception as e:
299
+ raise HTTPException(
300
+ status_code=status.HTTP_401_UNAUTHORIZED,
301
+ detail=f"Could not validate refresh token: {str(e)}"
302
+ )
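
A sketch of plugging these dependencies into routes, assuming a FastAPI app (the route paths are illustrative):

from fastapi import Depends, FastAPI
from src.middleware.auth import get_current_active_user, require_admin
from src.models.auth import User

app = FastAPI()

@app.get("/me")
async def read_me(current_user: User = Depends(get_current_active_user)):
    # Any authenticated, active user reaches this handler
    return {"id": current_user.id, "email": current_user.email}

@app.delete("/admin/users/{user_id}")
async def delete_user(user_id: str, admin: User = Depends(require_admin)):
    # RoleChecker rejects non-admin users with a 403 before the body runs
    return {"deleted": user_id}
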
src/middleware/cors.py ADDED
@@ -0,0 +1,356 @@
1
+ """
2
+ CORS middleware configuration for frontend-backend communication.
3
+
4
+ Provides configurable Cross-Origin Resource Sharing middleware.
5
+ """
6
+
7
+ import os
8
+ from typing import List, Optional
9
+
10
+ from fastapi import FastAPI, Request, Response
11
+ from fastapi.middleware.cors import CORSMiddleware
12
+ from fastapi.middleware.base import BaseHTTPMiddleware
13
+ from starlette.middleware.base import RequestResponseEndpoint
14
+
15
+
16
+ class CustomCORSMiddleware(BaseHTTPMiddleware):
17
+ """Custom CORS middleware with additional security features."""
18
+
19
+ def __init__(
20
+ self,
21
+ app: FastAPI,
22
+ allow_origins: List[str] = None,
23
+ allow_methods: List[str] = None,
24
+ allow_headers: List[str] = None,
25
+ expose_headers: List[str] = None,
26
+ allow_credentials: bool = True,
27
+ max_age: int = 86400, # 24 hours
28
+ strict_mode: bool = False
29
+ ):
30
+ super().__init__(app)
31
+ self.allow_origins = allow_origins or self._get_default_origins()
32
+ self.allow_methods = allow_methods or ["GET", "POST", "PUT", "DELETE", "OPTIONS", "PATCH"]
33
+ self.allow_headers = allow_headers or ["*"]
34
+ self.expose_headers = expose_headers or []
35
+ self.allow_credentials = allow_credentials
36
+ self.max_age = max_age
37
+ self.strict_mode = strict_mode
38
+
39
+ # Apply FastAPI's CORS middleware
40
+ app.add_middleware(
41
+ CORSMiddleware,
42
+ allow_origins=self.allow_origins,
43
+ allow_credentials=self.allow_credentials,
44
+ allow_methods=self.allow_methods,
45
+ allow_headers=self.allow_headers,
46
+ expose_headers=self.expose_headers,
47
+ max_age=self.max_age
48
+ )
49
+
50
+ def _get_default_origins(self) -> List[str]:
51
+ """Get default allowed origins from environment."""
52
+ env_origins = os.getenv("CORS_ORIGINS", "")
53
+ if env_origins:
54
+ return [origin.strip() for origin in env_origins.split(",")]
55
+
56
+ # Default origins for development
57
+ default_origins = [
58
+ "http://localhost:3000",
59
+ "http://localhost:3001",
60
+ "http://127.0.0.1:3000",
61
+ "http://127.0.0.1:3001",
62
+ ]
63
+
64
+ # Add production URL if available
65
+ if os.getenv("FRONTEND_URL"):
66
+ default_origins.append(os.getenv("FRONTEND_URL"))
67
+
68
+ return default_origins
69
+
70
+ async def dispatch(self, request: Request, call_next: RequestResponseEndpoint) -> Response:
71
+ """Add additional CORS security features."""
72
+
73
+ # Preflight (OPTIONS) requests go through the same pipeline; the
+ # security headers below are added to preflight and regular responses alike
+ response = await call_next(request)
79
+
80
+ # Add security headers
81
+ self._add_security_headers(request, response)
82
+
83
+ # Log CORS requests in strict mode
84
+ if self.strict_mode:
85
+ self._log_cors_request(request, response)
86
+
87
+ return response
88
+
89
+ def _add_security_headers(self, request: Request, response: Response):
90
+ """Add additional security headers."""
91
+ # Remove server information
92
+ response.headers["Server"] = ""
93
+
94
+ # CSP header (Content Security Policy)
95
+ csp_directives = [
96
+ "default-src 'self'",
97
+ "script-src 'self' 'unsafe-inline' 'unsafe-eval'",
98
+ "style-src 'self' 'unsafe-inline'",
99
+ "img-src 'self' data: https:",
100
+ "font-src 'self' data:",
101
+ "connect-src 'self'",
102
+ "frame-ancestors 'none'",
103
+ "base-uri 'self'",
104
+ "form-action 'self'",
105
+ ]
106
+ response.headers["Content-Security-Policy"] = "; ".join(csp_directives)
107
+
108
+ # Additional security headers
109
+ response.headers["X-Content-Type-Options"] = "nosniff"
110
+ response.headers["X-Frame-Options"] = "DENY"
111
+ response.headers["X-XSS-Protection"] = "1; mode=block"
112
+ response.headers["Referrer-Policy"] = "strict-origin-when-cross-origin"
113
+
114
+ # HSTS (only in production with HTTPS)
115
+ if os.getenv("ENVIRONMENT") == "production" and request.url.scheme == "https":
116
+ response.headers["Strict-Transport-Security"] = "max-age=31536000; includeSubDomains"
117
+
118
+ # Permissions Policy
119
+ permissions_policy = [
120
+ "geolocation=()",
121
+ "microphone=()",
122
+ "camera=()",
123
+ "payment=()",
124
+ "usb=()",
125
+ "magnetometer=()",
126
+ "gyroscope=()",
127
+ "accelerometer=()",
128
+ ]
129
+ response.headers["Permissions-Policy"] = ", ".join(permissions_policy)
130
+
131
+ def _log_cors_request(self, request: Request, response: Response):
132
+ """Log CORS-related requests for monitoring."""
133
+ from src.utils.logging import get_logger
134
+
135
+ logger = get_logger("cors")
136
+
137
+ origin = request.headers.get("origin")
138
+ if origin:
139
+ if origin not in self.allow_origins:
140
+ logger.warning(
141
+ "Cross-origin request from unauthorized origin",
142
+ origin=origin,
143
+ path=request.url.path,
144
+ method=request.method,
145
+ )
146
+ else:
147
+ logger.info(
148
+ "Cross-origin request allowed",
149
+ origin=origin,
150
+ path=request.url.path,
151
+ method=request.method,
152
+ )
153
+
154
+
155
+ class RateLimitCORSMiddleware(BaseHTTPMiddleware):
156
+ """CORS middleware with rate limiting per origin."""
157
+
158
+ def __init__(
159
+ self,
160
+ app: FastAPI,
161
+ requests_per_minute: int = 100,
162
+ burst_size: int = 200
163
+ ):
164
+ super().__init__(app)
165
+ self.requests_per_minute = requests_per_minute
166
+ self.burst_size = burst_size
167
+ self.request_counts = {} # Simple in-memory tracking
168
+
169
+ async def dispatch(self, request: Request, call_next: RequestResponseEndpoint) -> Response:
170
+ """Apply rate limiting based on origin."""
171
+ import time
172
+ from fastapi import HTTPException
173
+
174
+ origin = request.headers.get("origin")
175
+ if origin:
176
+ current_time = time.time()
177
+ minute_key = int(current_time // 60)
178
+
179
+ # Clean old entries
180
+ self._cleanup_old_entries(minute_key)
181
+
182
+ # Track requests
183
+ origin_key = f"{origin}:{minute_key}"
184
+ count = self.request_counts.get(origin_key, 0)
185
+
186
+ if count >= self.requests_per_minute:
187
+ raise HTTPException(
188
+ status_code=429,
189
+ detail="Too many requests from this origin",
190
+ headers={
191
+ "Retry-After": "60",
192
+ "X-RateLimit-Limit": str(self.requests_per_minute),
193
+ "X-RateLimit-Remaining": "0",
194
+ "X-RateLimit-Reset": str((minute_key + 1) * 60)
195
+ }
196
+ )
197
+
198
+ self.request_counts[origin_key] = count + 1
199
+
200
+ response = await call_next(request)
201
+
202
+ # Add rate limit headers
203
+ if origin:
204
+ response.headers["X-RateLimit-Limit"] = str(self.requests_per_minute)
205
+ remaining = max(0, self.requests_per_minute - self.request_counts.get(origin_key, 0))
206
+ response.headers["X-RateLimit-Remaining"] = str(remaining)
207
+
208
+ return response
209
+
210
+ def _cleanup_old_entries(self, current_minute: int):
211
+ """Remove old entries from request counts."""
212
+ keys_to_remove = []
213
+ for key in self.request_counts.keys():
214
+ key_minute = int(key.split(":")[-1])
215
+ if current_minute - key_minute > 5: # Keep 5 minutes of history
216
+ keys_to_remove.append(key)
217
+
218
+ for key in keys_to_remove:
219
+ del self.request_counts[key]
220
+
221
+
222
+ def configure_cors(
223
+ app: FastAPI,
224
+ environment: str = "development"
225
+ ) -> None:
226
+ """Configure CORS based on environment."""
227
+
228
+ if environment == "production":
229
+ # Production CORS settings
230
+ origins = os.getenv("CORS_ORIGINS", "").split(",") if os.getenv("CORS_ORIGINS") else []
231
+
232
+ # Add production frontend URL
233
+ frontend_url = os.getenv("FRONTEND_URL")
234
+ if frontend_url and frontend_url not in origins:
235
+ origins.append(frontend_url)
236
+
237
+ # In production, be strict about origins
238
+ if origins:
239
+ app.add_middleware(
240
+ CustomCORSMiddleware,
241
+ allow_origins=origins,
242
+ allow_methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"],
243
+ allow_headers=["Authorization", "Content-Type", "X-Requested-With"],
244
+ expose_headers=["X-Total-Count", "X-Page-Count"],
245
+ strict_mode=True
246
+ )
247
+
248
+ # Add rate limiting
249
+ app.add_middleware(
250
+ RateLimitCORSMiddleware,
251
+ requests_per_minute=int(os.getenv("RATE_LIMIT_PER_MINUTE", "100"))
252
+ )
253
+
254
+ else:
255
+ # Development CORS settings - more permissive
256
+ app.add_middleware(
257
+ CustomCORSMiddleware,
258
+ allow_origins=[
259
+ "http://localhost:3000",
260
+ "http://localhost:3001",
261
+ "http://127.0.0.1:3000",
262
+ "http://127.0.0.1:3001",
263
+ "http://localhost:5173", # Vite dev server
264
+ "http://127.0.0.1:5173",
265
+ ],
266
+ allow_credentials=True,
267
+ strict_mode=False
268
+ )
269
+
270
+
271
+ # CORS configuration for specific routes
272
+ class RouteSpecificCORSMiddleware(BaseHTTPMiddleware):
273
+ """Apply different CORS settings to specific routes."""
274
+
275
+ def __init__(
276
+ self,
277
+ app: FastAPI,
278
+ path_prefix: str,
279
+ cors_config: dict
280
+ ):
281
+ super().__init__(app)
282
+ self.path_prefix = path_prefix
283
+ self.cors_config = cors_config
284
+
285
+ async def dispatch(self, request: Request, call_next: RequestResponseEndpoint) -> Response:
286
+ """Apply route-specific CORS configuration."""
287
+ if request.url.path.startswith(self.path_prefix):
288
+ # Apply custom CORS settings for this route
289
+ origin = request.headers.get("origin")
290
+ if origin and self.cors_config.get("allowed_origins"):
291
+ if origin in self.cors_config["allowed_origins"]:
292
+ response = await call_next(request)
293
+ response.headers["Access-Control-Allow-Origin"] = origin
294
+ response.headers["Access-Control-Allow-Credentials"] = "true"
295
+
296
+ methods = self.cors_config.get("allowed_methods", [])
+ if methods:
+ response.headers["Access-Control-Allow-Methods"] = ", ".join(methods)
+
+ headers = self.cors_config.get("allowed_headers", [])
+ if headers:
+ response.headers["Access-Control-Allow-Headers"] = ", ".join(headers)
301
+
302
+ return response
+
+ # Fall through to default handling for unmatched origins and all other routes
+ return await call_next(request)
306
+
307
+
308
+ # Pre-configured CORS settings for different environments
309
+ CORS_CONFIGS = {
310
+ "development": {
311
+ "allowed_origins": ["http://localhost:3000", "http://localhost:5173"],
312
+ "allowed_methods": ["GET", "POST", "PUT", "DELETE", "OPTIONS"],
313
+ "allowed_headers": ["*"],
314
+ "allow_credentials": True,
315
+ "strict_mode": False
316
+ },
317
+ "staging": {
318
+ "allowed_origins": ["https://staging.example.com"],
319
+ "allowed_methods": ["GET", "POST", "PUT", "DELETE", "OPTIONS"],
320
+ "allowed_headers": ["Authorization", "Content-Type"],
321
+ "allow_credentials": True,
322
+ "strict_mode": True
323
+ },
324
+ "production": {
325
+ "allowed_origins": ["https://example.com"],
326
+ "allowed_methods": ["GET", "POST", "PUT", "DELETE", "OPTIONS"],
327
+ "allowed_headers": ["Authorization", "Content-Type"],
328
+ "allow_credentials": True,
329
+ "strict_mode": True
330
+ }
331
+ }
332
+
333
+
334
+ def setup_cors_with_config(
335
+ app: FastAPI,
336
+ config_name: str = "development"
337
+ ) -> None:
338
+ """Setup CORS using pre-configured settings."""
339
+
340
+ config = CORS_CONFIGS.get(config_name, CORS_CONFIGS["development"])
341
+
342
+ # CORS_CONFIGS uses allowed_* keys, so map them onto the allow_*
+ # keyword arguments that CustomCORSMiddleware actually accepts
+ app.add_middleware(
+ CustomCORSMiddleware,
+ allow_origins=config["allowed_origins"],
+ allow_methods=config["allowed_methods"],
+ allow_headers=config["allowed_headers"],
+ allow_credentials=config["allow_credentials"],
+ strict_mode=config["strict_mode"],
+ )
346
+
347
+ # Log CORS configuration
348
+ from src.utils.logging import get_logger
349
+
350
+ logger = get_logger("cors")
351
+ logger.info(
352
+ "CORS configured",
353
+ environment=config_name,
354
+ allowed_origins=config["allowed_origins"],
355
+ allow_credentials=config["allow_credentials"]
356
+ )
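
A startup sketch for this module, assuming the ENVIRONMENT variable selects the mode (the module path matches this diff; everything else is illustrative):

import os
from fastapi import FastAPI
from src.middleware.cors import configure_cors, setup_cors_with_config

app = FastAPI()

# Environment-driven setup: strict origins plus per-origin rate limiting in production
configure_cors(app, environment=os.getenv("ENVIRONMENT", "development"))

# Alternatively, apply one of the pre-baked CORS_CONFIGS profiles:
# setup_cors_with_config(app, config_name="staging")
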
src/middleware/rate_limit.py ADDED
@@ -0,0 +1,385 @@
1
+ """
2
+ Rate Limiting Middleware for Translation API.
3
+
4
+ This middleware implements per-IP and per-user rate limiting
5
+ to prevent abuse and manage Gemini API quotas effectively.
6
+ """
7
+
8
+ import time
9
+ import asyncio
10
+ from typing import Any, Dict, Optional
11
+ from fastapi import Request, HTTPException, status
12
+ from fastapi.responses import JSONResponse
13
+ from starlette.middleware.base import BaseHTTPMiddleware
14
+
15
+ from src.utils.translation_logger import get_translation_logger
16
+
17
+ logger = get_translation_logger(__name__)
18
+
19
+
20
+ class RateLimitMiddleware(BaseHTTPMiddleware):
21
+ """
22
+ Middleware for rate limiting API requests.
23
+
24
+ Implements:
25
+ - Per-IP rate limiting
26
+ - Per-user rate limiting (if authenticated)
27
+ - Fixed-window counters (per minute and per hour)
28
+ - Redis-based storage (if available)
29
+ - In-memory fallback
30
+ """
31
+
32
+ def __init__(
33
+ self,
34
+ app,
35
+ *,
36
+ requests_per_minute: int = 60,
37
+ requests_per_hour: int = 1000,
38
+ redis_client=None
39
+ ):
40
+ """
41
+ Initialize rate limit middleware.
42
+
43
+ Args:
44
+ app: FastAPI application
45
+ requests_per_minute: Requests allowed per minute per client
46
+ requests_per_hour: Requests allowed per hour per client
47
+ redis_client: Optional Redis client for distributed rate limiting
48
+ """
49
+ super().__init__(app)
50
+ self.requests_per_minute = requests_per_minute
51
+ self.requests_per_hour = requests_per_hour
52
+ self.redis_client = redis_client
53
+
54
+ # In-memory storage fallback
55
+ self.ip_rate_limits: Dict[str, Dict[str, Any]] = {}
56
+ self.user_rate_limits: Dict[str, Dict[str, Any]] = {}
57
+
58
+ logger.info(
59
+ "Rate limit middleware initialized",
60
+ requests_per_minute=requests_per_minute,
61
+ requests_per_hour=requests_per_hour,
62
+ redis_enabled=redis_client is not None
63
+ )
64
+
65
+ async def dispatch(self, request: Request, call_next):
66
+ """
67
+ Process request with rate limiting.
68
+
69
+ Args:
70
+ request: Incoming request
71
+ call_next: Next middleware/endpoint
72
+
73
+ Returns:
74
+ Response or rate limit error
75
+ """
76
+ # Skip rate limiting for health checks
77
+ if request.url.path in ["/health", "/health/enhanced", "/metrics/health"]:
78
+ return await call_next(request)
79
+
80
+ # Get client identifiers
81
+ client_ip = self._get_client_ip(request)
82
+ user_id = self._get_user_id(request)
83
+
84
+ # Check rate limits
85
+ await self._check_rate_limits(client_ip, user_id)
86
+
87
+ # Process request
88
+ response = await call_next(request)
89
+
90
+ # Add rate limit headers
91
+ self._add_rate_limit_headers(response, client_ip, user_id)
92
+
93
+ return response
94
+
95
+ def _get_client_ip(self, request: Request) -> str:
96
+ """Get client IP address from request."""
97
+ # Check for forwarded headers
98
+ forwarded_for = request.headers.get("X-Forwarded-For")
99
+ if forwarded_for:
100
+ return forwarded_for.split(",")[0].strip()
101
+
102
+ real_ip = request.headers.get("X-Real-IP")
103
+ if real_ip:
104
+ return real_ip
105
+
106
+ # Fall back to direct connection IP
107
+ return request.client.host if request.client else "unknown"
108
+
109
+ def _get_user_id(self, request: Request) -> Optional[str]:
110
+ """Get user ID from request if authenticated."""
111
+ # This would extract from JWT token or session
112
+ # For now, return None to implement IP-based limiting only
113
+ return None
114
+
115
+ async def _check_rate_limits(self, client_ip: str, user_id: Optional[str]) -> None:
116
+ """
117
+ Check if client has exceeded rate limits.
118
+
119
+ Args:
120
+ client_ip: Client IP address
121
+ user_id: Optional user ID
122
+
123
+ Raises:
124
+ HTTPException: If rate limit exceeded
125
+ """
126
+ now = time.time()
127
+
128
+ # Check per-IP limits
129
+ ip_data = await self._get_rate_limit_data(f"ip:{client_ip}")
130
+ if self._is_rate_limited(ip_data, now):
131
+ retry_after = self._calculate_retry_after(ip_data, now)
132
+ logger.warning(
133
+ "IP rate limit exceeded",
134
+ client_ip=client_ip,
135
+ requests_in_minute=ip_data.get("minute_requests", 0),
136
+ retry_after=retry_after
137
+ )
138
+ raise HTTPException(
139
+ status_code=status.HTTP_429_TOO_MANY_REQUESTS,
140
+ detail={
141
+ "error": "RATE_LIMIT_EXCEEDED",
142
+ "message": f"IP rate limit exceeded. Please wait {retry_after:.1f} seconds.",
143
+ "retry_after": retry_after,
144
+ "limit_type": "ip"
145
+ }
146
+ )
147
+
148
+ # Check per-user limits if authenticated
149
+ if user_id:
150
+ user_data = await self._get_rate_limit_data(f"user:{user_id}")
151
+ if self._is_rate_limited(user_data, now):
152
+ retry_after = self._calculate_retry_after(user_data, now)
153
+ logger.warning(
154
+ "User rate limit exceeded",
155
+ user_id=user_id,
156
+ requests_in_minute=user_data.get("minute_requests", 0),
157
+ retry_after=retry_after
158
+ )
159
+ raise HTTPException(
160
+ status_code=status.HTTP_429_TOO_MANY_REQUESTS,
161
+ detail={
162
+ "error": "RATE_LIMIT_EXCEEDED",
163
+ "message": f"User rate limit exceeded. Please wait {retry_after:.1f} seconds.",
164
+ "retry_after": retry_after,
165
+ "limit_type": "user"
166
+ }
167
+ )
168
+
169
+ # Update rate limit data
170
+ await self._update_rate_limit_data(f"ip:{client_ip}", now)
171
+ if user_id:
172
+ await self._update_rate_limit_data(f"user:{user_id}", now)
173
+
174
+ async def _get_rate_limit_data(self, key: str) -> Dict[str, Any]:
175
+ """Get rate limit data for a client."""
176
+ if self.redis_client:
177
+ try:
178
+ # Get data from Redis
179
+ data = await self.redis_client.hgetall(f"rate_limit:{key}")
180
+ if data:
181
+ return {
182
+ "minute_requests": int(data.get("minute_requests", 0)),
183
+ "minute_window": float(data.get("minute_window", 0)),
184
+ "hour_requests": int(data.get("hour_requests", 0)),
185
+ "hour_window": float(data.get("hour_window", 0)),
186
+ "last_request": float(data.get("last_request", 0))
187
+ }
188
+ except Exception as e:
189
+ logger.warning("Redis rate limit read failed", error=str(e))
190
+
191
+ # Fall back to in-memory
192
+ if key.startswith("ip:"):
193
+ storage = self.ip_rate_limits
194
+ key = key[3:] # Remove "ip:" prefix
195
+ else:
196
+ storage = self.user_rate_limits
197
+ key = key[5:] # Remove "user:" prefix
198
+
199
+ return storage.get(key, {
200
+ "minute_requests": 0,
201
+ "minute_window": 0,
202
+ "hour_requests": 0,
203
+ "hour_window": 0,
204
+ "last_request": 0
205
+ })
206
+
207
+ async def _update_rate_limit_data(self, key: str, now: float) -> None:
208
+ """Update rate limit data for a client."""
209
+ # Get current data
210
+ data = await self._get_rate_limit_data(key)
211
+
212
+ # Update minute window
213
+ if now - data["minute_window"] > 60:
214
+ data["minute_requests"] = 1
215
+ data["minute_window"] = now
216
+ else:
217
+ data["minute_requests"] += 1
218
+
219
+ # Update hour window
220
+ if now - data["hour_window"] > 3600:
221
+ data["hour_requests"] = 1
222
+ data["hour_window"] = now
223
+ else:
224
+ data["hour_requests"] += 1
225
+
226
+ data["last_request"] = now
227
+
228
+ # Save updated data
229
+ if self.redis_client:
230
+ try:
231
+ # Save to Redis with TTL
232
+ await self.redis_client.hset(
233
+ f"rate_limit:{key}",
234
+ mapping={
235
+ "minute_requests": str(data["minute_requests"]),
236
+ "minute_window": str(data["minute_window"]),
237
+ "hour_requests": str(data["hour_requests"]),
238
+ "hour_window": str(data["hour_window"]),
239
+ "last_request": str(data["last_request"])
240
+ }
241
+ )
242
+ # Set TTL to 1 hour
243
+ await self.redis_client.expire(f"rate_limit:{key}", 3600)
244
+ except Exception as e:
245
+ logger.warning("Redis rate limit write failed", error=str(e))
246
+
247
+ # Fall back to in-memory
248
+ if key.startswith("ip:"):
249
+ storage = self.ip_rate_limits
250
+ key = key[3:] # Remove "ip:" prefix
251
+ else:
252
+ storage = self.user_rate_limits
253
+ key = key[5:] # Remove "user:" prefix
254
+
255
+ storage[key] = data
256
+
257
+ # Cleanup old entries (simple cleanup every 100 requests)
258
+ if data["minute_requests"] % 100 == 0:
259
+ await self._cleanup_old_entries(now)
260
+
261
+ async def _cleanup_old_entries(self, now: float) -> None:
262
+ """Clean up old rate limit entries."""
263
+ cutoff = now - 3600 # 1 hour ago
264
+
265
+ # Cleanup IP entries
266
+ to_remove = []
267
+ for ip, data in self.ip_rate_limits.items():
268
+ if data["last_request"] < cutoff:
269
+ to_remove.append(ip)
270
+ for ip in to_remove:
271
+ del self.ip_rate_limits[ip]
272
+
273
+ # Cleanup user entries
274
+ to_remove = []
275
+ for user, data in self.user_rate_limits.items():
276
+ if data["last_request"] < cutoff:
277
+ to_remove.append(user)
278
+ for user in to_remove:
279
+ del self.user_rate_limits[user]
280
+
281
+ if to_remove:
282
+ logger.debug("Cleaned up old rate limit entries", count=len(to_remove))
283
+
284
+ def _is_rate_limited(self, data: Dict[str, Any], now: float) -> bool:
285
+ """Check if client has exceeded rate limits."""
286
+ # Check minute limit
287
+ if now - data["minute_window"] < 60:
288
+ if data["minute_requests"] >= self.requests_per_minute:
289
+ return True
290
+
291
+ # Check hour limit
292
+ if now - data["hour_window"] < 3600:
293
+ if data["hour_requests"] >= self.requests_per_hour:
294
+ return True
295
+
296
+ return False
297
+
298
+ def _calculate_retry_after(self, data: Dict[str, Any], now: float) -> float:
299
+ """Calculate retry-after time based on rate limit data."""
300
+ # Check minute limit
301
+ if now - data["minute_window"] < 60 and data["minute_requests"] >= self.requests_per_minute:
302
+ return 60 - (now - data["minute_window"])
303
+
304
+ # Check hour limit
305
+ if now - data["hour_window"] < 3600 and data["hour_requests"] >= self.requests_per_hour:
306
+ return 3600 - (now - data["hour_window"])
307
+
308
+ return 60.0 # Default retry after
309
+
310
+ async def _add_rate_limit_headers(
311
+ self,
312
+ response,
313
+ client_ip: str,
314
+ user_id: Optional[str]
315
+ ) -> None:
316
+ """Add rate limit headers to response."""
317
+ now = time.time()
318
+
319
+ # Get current limits
320
+ ip_data_result = await self._get_rate_limit_data(f"ip:{client_ip}")
322
+
323
+ # Add headers
324
+ response.headers["X-RateLimit-Limit-Minute"] = str(self.requests_per_minute)
325
+ response.headers["X-RateLimit-Limit-Hour"] = str(self.requests_per_hour)
326
+ response.headers["X-RateLimit-Remaining-Minute"] = str(
327
+ max(0, self.requests_per_minute - ip_data_result.get("minute_requests", 0))
328
+ )
329
+ response.headers["X-RateLimit-Remaining-Hour"] = str(
330
+ max(0, self.requests_per_hour - ip_data_result.get("hour_requests", 0))
331
+ )
332
+
333
+ # Add reset time
334
+ if ip_data_result.get("minute_window", 0):
335
+ reset_time = ip_data_result["minute_window"] + 60
336
+ response.headers["X-RateLimit-Reset"] = str(int(reset_time))
337
+
338
+
339
+ class TranslationRateLimitMiddleware(RateLimitMiddleware):
340
+ """
341
+ Specialized rate limit middleware for translation endpoints.
342
+
343
+ Implements stricter limits for translation endpoints to manage
344
+ Gemini API quotas effectively.
345
+ """
346
+
347
+ def __init__(
348
+ self,
349
+ app,
350
+ *,
351
+ redis_client=None
352
+ ):
353
+ """
354
+ Initialize translation rate limit middleware.
355
+
356
+ Args:
357
+ app: FastAPI application
358
+ redis_client: Optional Redis client
359
+ """
360
+ # Stricter limits for translation endpoints
361
+ super().__init__(
362
+ app,
363
+ requests_per_minute=10, # 10 translations per minute
364
+ requests_per_hour=500, # 500 translations per hour
365
+ redis_client=redis_client
366
+ )
367
+
368
+ logger.info(
369
+ "Translation rate limit middleware initialized",
370
+ requests_per_minute=10,
371
+ requests_per_hour=500
372
+ )
373
+
374
+ async def dispatch(self, request: Request, call_next):
375
+ """
376
+ Process request with translation-specific rate limiting.
377
+
378
+ Only applies to translation endpoints.
379
+ """
380
+ # Check if this is a translation endpoint
381
+ if not request.url.path.startswith("/translation/"):
382
+ return await call_next(request)
383
+
384
+ # Apply rate limiting
385
+ return await super().dispatch(request, call_next)
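
A wiring sketch for these classes, assuming a FastAPI app and that the optional Redis client, when used, comes from redis.asyncio (both middlewares fall back to in-memory counters without it). Note the counters are fixed windows: they reset 60 s (or 3600 s) after the first request of each window.

from fastapi import FastAPI
from src.middleware.rate_limit import RateLimitMiddleware, TranslationRateLimitMiddleware

app = FastAPI()

# General API limits for every route except the health checks skipped above
app.add_middleware(RateLimitMiddleware, requests_per_minute=60, requests_per_hour=1000)

# Stricter limits that only fire for paths under /translation/
app.add_middleware(TranslationRateLimitMiddleware)
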
src/models/__init__.py ADDED
@@ -0,0 +1,29 @@
1
+ """
2
+ Import all models to ensure they are registered with SQLAlchemy.
3
+ """
4
+
5
+ # Import all models to register them with SQLAlchemy
6
+ from .auth import (
7
+ User, Account, UserBackground, OnboardingResponse, Session,
8
+ PasswordResetToken, AnonymousSession, ChatSession, ChatMessage,
9
+ UserPreferences, MessageVersion, ChatFolder, ChatTag, MessageReaction
10
+ )
11
+
12
+ from .translation_openai import (
13
+ TranslationJob, TranslationChunk, TranslationError,
14
+ TranslationSession, TranslationCache, TranslationMetrics,
15
+ TranslationJobStatus, ChunkStatus, ErrorSeverity
16
+ )
17
+
18
+ # Export all models
19
+ __all__ = [
20
+ # Auth models
21
+ "User", "Account", "UserBackground", "OnboardingResponse", "Session",
22
+ "PasswordResetToken", "AnonymousSession", "ChatSession", "ChatMessage",
23
+ "UserPreferences", "MessageVersion", "ChatFolder", "ChatTag", "MessageReaction",
24
+
25
+ # Translation models
26
+ "TranslationJob", "TranslationChunk", "TranslationError",
27
+ "TranslationSession", "TranslationCache", "TranslationMetrics",
28
+ "TranslationJobStatus", "ChunkStatus", "ErrorSeverity"
29
+ ]
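
Because SQLAlchemy registers tables as a side effect of importing the model classes, importing this package once is enough to populate Base.metadata. A sketch, assuming src.database.base also exports an engine:

import src.models  # noqa: F401 - imported for its registration side effect
from src.database.base import Base, engine  # "engine" is an assumed export alongside Base

Base.metadata.create_all(bind=engine)  # creates auth + translation tables together
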
src/models/auth.py CHANGED
@@ -42,6 +42,9 @@ class User(Base):
 chat_sessions = relationship("ChatSession", back_populates="user", cascade="all, delete-orphan")
 folders = relationship("ChatFolder", back_populates="user", cascade="all, delete-orphan")
 tags = relationship("ChatTag", back_populates="user", cascade="all, delete-orphan")
+ translation_jobs = relationship("TranslationJob", back_populates="user", cascade="all, delete-orphan")
+ translation_sessions = relationship("TranslationSession", back_populates="user", cascade="all, delete-orphan")
+ translation_metrics = relationship("TranslationMetrics", back_populates="user", cascade="all, delete-orphan")
 
 
 class Account(Base):
src/models/base.py ADDED
@@ -0,0 +1,26 @@
1
+ """
2
+ Base model for reader features.
3
+ """
4
+
5
+ from datetime import datetime
6
+ import uuid
7
+ from sqlalchemy import Column, String, DateTime
8
+ from sqlalchemy.sql import func
9
+ from src.database.base import Base
10
+
11
+
12
+ class BaseModel(Base):
13
+ """Base model with common fields for reader features."""
14
+
15
+ __abstract__ = True
16
+
17
+ id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()))
18
+ created_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
19
+ updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now(), nullable=False)
20
+
21
+ def to_dict(self):
22
+ """Convert model to dictionary."""
23
+ return {
24
+ column.name: getattr(self, column.name)
25
+ for column in self.__table__.columns
26
+ }
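
Reader-feature models get the UUID primary key and both timestamps for free by subclassing this base, and to_dict() then serializes every mapped column. A sketch with a hypothetical Highlight model:

from sqlalchemy import Column, String
from src.models.base import BaseModel

class Highlight(BaseModel):
    """Hypothetical model; only its own columns need declaring."""
    __tablename__ = "highlights"
    user_id = Column(String(36), nullable=False, index=True)
    text = Column(String(1000), nullable=False)

# highlight.to_dict() -> {"id": ..., "created_at": ..., "updated_at": ..., "user_id": ..., "text": ...}
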
src/models/bookmark.py ADDED
@@ -0,0 +1,53 @@
1
+ """
2
+ Bookmark model for user-saved page references with optional metadata.
3
+ """
4
+
5
+ from sqlalchemy import Column, String, Boolean, DateTime, Text, ForeignKey
6
+ from sqlalchemy.orm import relationship
7
+ from sqlalchemy.sql import func
8
+ from src.models.base import BaseModel
9
+
10
+
11
+ class Bookmark(BaseModel):
12
+ """Represents user-saved page references with optional metadata."""
13
+
14
+ __tablename__ = "bookmarks"
15
+
16
+ user_id = Column(String(36), ForeignKey("users.id"), nullable=False, index=True)
17
+ chapter_id = Column(String(255), nullable=False, index=True)
18
+ section_id = Column(String(255), nullable=True)
19
+ page_url = Column(String(2048), nullable=False)
20
+ page_title = Column(String(255), nullable=False)
21
+ snippet = Column(Text, nullable=True)
22
+ note = Column(String(1000), nullable=True)
23
+ is_private = Column(Boolean, nullable=False, default=True)
24
+
25
+ # Relationships
26
+ user = relationship("User", back_populates="bookmarks")
27
+ tags = relationship("BookmarkTag", back_populates="bookmark", cascade="all, delete-orphan")
28
+
29
+ __table_args__ = (
30
+ {"extend_existing": True},
31
+ )
32
+
33
+ def __repr__(self):
34
+ return f"<Bookmark(id='{self.id}', user_id='{self.user_id}', title='{self.page_title}')>"
35
+
36
+
37
+ class BookmarkTag(BaseModel):
38
+ """Tags for organizing bookmarks."""
39
+
40
+ __tablename__ = "bookmark_tags"
41
+
42
+ bookmark_id = Column(String(36), ForeignKey("bookmarks.id"), nullable=False, index=True)
43
+ tag = Column(String(50), nullable=False, index=True)
44
+
45
+ # Relationships
46
+ bookmark = relationship("Bookmark", back_populates="tags")
47
+
48
+ __table_args__ = (
49
+ {"extend_existing": True},
50
+ )
51
+
52
+ def __repr__(self):
53
+ return f"<BookmarkTag(bookmark_id='{self.bookmark_id}', tag='{self.tag}')>"
src/models/chat.py CHANGED
@@ -30,7 +30,7 @@ class ChatMessage(Base):
 chat_session_id = Column(String(36), ForeignKey("chat_sessions.id"), nullable=False)
 role = Column(SQLEnum(Role), nullable=False)
 content = Column(Text, nullable=False)
- metadata = Column(JSON, nullable=True)
+ message_metadata = Column(JSON, nullable=True)
 created_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
 
 # Relationships
src/models/content_localization.py ADDED
@@ -0,0 +1,50 @@
1
+ """
2
+ Content localization model for tracking translation status of content pages.
3
+ """
4
+
5
+ from datetime import datetime
6
+ from enum import Enum
7
+ from sqlalchemy import Column, String, Integer, DateTime, Boolean, JSON, Index
8
+ from sqlalchemy import Enum as SQLEnum
9
+ from src.database.base import Base
10
+
11
+
12
+ class ProcessingStatus(Enum):
13
+ """Processing status for content localization."""
14
+ PENDING = "pending"
15
+ PROCESSING = "processing"
16
+ COMPLETED = "completed"
17
+ FAILED = "failed"
18
+ PARTIAL = "partial" # Some chunks failed
19
+
20
+
21
+ class ContentLocalization(Base):
22
+ """Tracks the translation status and metadata for content pages."""
23
+
24
+ __tablename__ = "content_localization"
25
+
26
+ id = Column(Integer, primary_key=True)
27
+ content_url = Column(String(500), nullable=False, index=True)
28
+ content_hash = Column(String(64), nullable=False, index=True)
29
+
30
+ # Localization status
31
+ is_translated = Column(Boolean, default=False)
32
+ last_translation_date = Column(DateTime)
33
+ translation_cache_key = Column(String(64))
34
+
35
+ # Content metadata
36
+ word_count = Column(Integer)
37
+ character_count = Column(Integer)
38
+ has_code_blocks = Column(Boolean, default=False)
39
+ detected_languages = Column(JSON) # Array of detected languages
40
+
41
+ # Processing metadata
42
+ chunk_count = Column(Integer, default=1)
43
+ processing_status = Column(SQLEnum(ProcessingStatus), default=ProcessingStatus.PENDING)
44
+
45
+ # Metadata
46
+ created_at = Column(DateTime, default=datetime.utcnow)
47
+ updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
48
+
49
+ def __repr__(self):
50
+ return f"<ContentLocalization(url='{self.content_url}', status='{self.processing_status}', translated={self.is_translated})>"
src/models/personalization.py ADDED
@@ -0,0 +1,64 @@
1
+ """
2
+ PersonalizationProfile model for managing user preferences and learning styles.
3
+ """
4
+
5
+ from datetime import datetime
6
+ from enum import Enum
7
+ from sqlalchemy import Column, Integer, String, DateTime, Boolean, JSON
8
+ from sqlalchemy import Enum as SQLEnum
9
+ from src.database.base import Base
10
+
11
+
12
+ class ReadingLevel(Enum):
13
+ """Reading proficiency levels."""
14
+ BEGINNER = "beginner"
15
+ INTERMEDIATE = "intermediate"
16
+ ADVANCED = "advanced"
17
+
18
+
19
+ class LearningStyle(Enum):
20
+ """Learning style preferences."""
21
+ VISUAL = "visual" # More examples, diagrams
22
+ PRACTICAL = "practical" # Focus on code, implementation
23
+ THEORETICAL = "theoretical" # Focus on concepts, theory
24
+ BALANCED = "balanced"
25
+
26
+
27
+ class TermHandling(Enum):
28
+ """Technical term handling preferences."""
29
+ TRANSLATE = "translate" # Translate technical terms
30
+ TRANSLITERATE = "transliterate" # Keep in Urdu script
31
+ KEEP_ENGLISH = "keep_english" # Leave in English
32
+
33
+
34
+ class PersonalizationProfile(Base):
35
+ """Represents user preferences for personalized content delivery."""
36
+
37
+ __tablename__ = "personalization_profiles"
38
+
39
+ id = Column(Integer, primary_key=True)
40
+ user_id = Column(String(36), unique=True, nullable=False, index=True)
41
+
42
+ # Reading preferences
43
+ reading_level = Column(SQLEnum(ReadingLevel), default=ReadingLevel.INTERMEDIATE)
44
+ preferred_language = Column(String(10), default='en')
45
+
46
+ # Content preferences
47
+ focus_areas = Column(JSON) # Array of topics user cares about
48
+ learning_style = Column(SQLEnum(LearningStyle), default=LearningStyle.BALANCED)
49
+
50
+ # Translation preferences
51
+ enable_transliteration = Column(Boolean, default=True)
52
+ technical_term_handling = Column(SQLEnum(TermHandling), default=TermHandling.TRANSLITERATE)
53
+
54
+ # UI preferences
55
+ font_size = Column(Integer, default=16)
56
+ focus_mode_preferences = Column(JSON)
57
+
58
+ # Metadata
59
+ created_at = Column(DateTime, default=datetime.utcnow)
60
+ updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
61
+ last_active = Column(DateTime, default=datetime.utcnow)
62
+
63
+ def __repr__(self):
64
+ return f"<PersonalizationProfile(user_id='{self.user_id}', reading_level='{self.reading_level}')>"
src/models/reading_progress.py ADDED
@@ -0,0 +1,33 @@
1
+ """
2
+ Reading progress model for tracking user progress through chapters and sections.
3
+ """
4
+
5
+ from sqlalchemy import Column, String, Float, Boolean, Integer, DateTime, ForeignKey, UniqueConstraint
6
+ from sqlalchemy.orm import relationship
7
+ from sqlalchemy.sql import func
8
+ from src.models.base import BaseModel
9
+
10
+
11
+ class ReadingProgress(BaseModel):
12
+ """Stores user's reading progress through chapters and sections."""
13
+
14
+ __tablename__ = "reading_progress"
15
+
16
+ user_id = Column(String(36), ForeignKey("users.id"), nullable=False, index=True)
17
+ chapter_id = Column(String(255), nullable=False, index=True)
18
+ section_id = Column(String(255), nullable=False)
19
+ position = Column(Float, nullable=False, default=0.0) # 0-100 percentage
20
+ completed = Column(Boolean, nullable=False, default=False)
21
+ time_spent = Column(Integer, nullable=False, default=0) # Minutes
22
+ last_accessed = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
23
+
24
+ # Relationships
25
+ user = relationship("User", back_populates="reading_progress")
26
+
27
+ # Unique constraint to ensure one progress record per user per section
28
+ __table_args__ = (
+ UniqueConstraint("user_id", "chapter_id", "section_id", name="uq_reading_progress_user_section"),
+ {"extend_existing": True},
30
+ )
31
+
32
+ def __repr__(self):
33
+ return f"<ReadingProgress(user_id='{self.user_id}', chapter='{self.chapter_id}', position={self.position}%)>"
src/models/search_index.py ADDED
@@ -0,0 +1,30 @@
1
+ """
2
+ Search index model for enabling fast content retrieval across languages.
3
+ """
4
+
5
+ from sqlalchemy import Column, String, Float, DateTime, Text
6
+ from sqlalchemy.sql import func
7
+ from src.models.base import BaseModel
8
+
9
+
10
+ class SearchIndex(BaseModel):
11
+ """Enables fast content retrieval across languages."""
12
+
13
+ __tablename__ = "search_index"
14
+
15
+ content_id = Column(String(255), nullable=False, index=True)
16
+ language = Column(String(10), nullable=False, index=True) # en, ur, ur-roman
17
+ content_type = Column(String(20), nullable=False, index=True) # chapter, section, bookmark
18
+ title = Column(String(255), nullable=False)
19
+ content = Column(Text, nullable=False)
20
+ chapter_id = Column(String(255), nullable=False, index=True)
21
+ section_id = Column(String(255), nullable=True)
22
+ rank = Column(Float, nullable=False, default=0.5) # 0-1 for result ranking
23
+ indexed_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
24
+
25
+ __table_args__ = (
26
+ {"extend_existing": True},
27
+ )
28
+
29
+ def __repr__(self):
30
+ return f"<SearchIndex(content_id='{self.content_id}', language='{self.language}', type='{self.content_type}')>"
src/models/translation_openai.py ADDED
@@ -0,0 +1,512 @@
1
+ """
2
+ Comprehensive OpenAI Translation System Models.
3
+
4
+ Provides database models for:
5
+ - Translation jobs with progress tracking
6
+ - Chunk-based translation processing
7
+ - Enhanced caching with page URL + content hash
8
+ - Error logging and retry tracking
9
+ - User session management
10
+ - Translation quality metrics
11
+ """
12
+
13
+ from datetime import datetime, timedelta
14
+ from typing import Optional, Dict, Any, List
15
+ from enum import Enum
16
+ import uuid
17
+
18
+ from sqlalchemy import (
19
+ Column, Integer, String, Text, DateTime, SmallInteger, ForeignKey,
20
+ Index, Boolean, Numeric, JSON, BigInteger, CheckConstraint, UniqueConstraint
21
+ )
22
+ from sqlalchemy.orm import relationship
23
+ from sqlalchemy.dialects.postgresql import UUID
24
+ from sqlalchemy.sql import func
25
+
26
+ from src.database.base import Base
27
+
28
+
29
+ class TranslationJobStatus(Enum):
30
+ """Translation job status values."""
31
+ PENDING = "pending"
32
+ QUEUED = "queued"
33
+ PROCESSING = "processing"
34
+ CHUNK_PROCESSING = "chunk_processing"
35
+ COMPLETED = "completed"
36
+ FAILED = "failed"
37
+ CANCELLED = "cancelled"
38
+ RETRYING = "retrying"
39
+ TIMEOUT = "timeout"
40
+
41
+
42
+ class ChunkStatus(Enum):
43
+ """Translation chunk status values."""
44
+ PENDING = "pending"
45
+ PROCESSING = "processing"
46
+ COMPLETED = "completed"
47
+ FAILED = "failed"
48
+ RETRY = "retry"
49
+ SKIPPED = "skipped" # For code blocks
50
+
51
+
52
+ class ErrorSeverity(Enum):
53
+ """Error severity levels."""
54
+ LOW = "low"
55
+ MEDIUM = "medium"
56
+ HIGH = "high"
57
+     CRITICAL = "critical"
+
+
+ class TranslationJob(Base):
+     """
+     Represents a translation job with comprehensive tracking.
+
+     Supports:
+     - Large text translation with chunking
+     - Progress tracking
+     - Error handling and retries
+     - Performance metrics
+     - Cost tracking
+     """
+
+     __tablename__ = "translation_jobs"
+
+     # Primary key and identifiers
+     id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+     job_id = Column(String(64), unique=True, nullable=False, index=True)  # External job ID
+     user_id = Column(String(36), ForeignKey("users.id"), nullable=True, index=True)
+     session_id = Column(String(128), nullable=True, index=True)
+
+     # Content identifiers for caching
+     content_hash = Column(String(64), nullable=False, index=True)
+     page_url = Column(Text, nullable=True, index=True)  # Source page URL for caching
+
+     # Translation parameters
+     source_language = Column(String(10), nullable=False, index=True)
+     target_language = Column(String(10), nullable=False, index=True)
+
+     # Content information
+     original_text = Column(Text, nullable=False)
+     translated_text = Column(Text, nullable=True)
+
+     # Processing options
+     preserve_code_blocks = Column(Boolean, default=True, nullable=False)
+     enable_transliteration = Column(Boolean, default=True, nullable=False)
+     chunk_size = Column(Integer, default=2000, nullable=False)  # Characters per chunk
+     max_chunks = Column(Integer, default=100, nullable=False)
+
+     # OpenAI specific settings
+     model_name = Column(String(50), nullable=False, default="gpt-4-turbo-preview")
+     temperature = Column(Numeric(3, 2), default=0.3, nullable=False)
+     max_tokens = Column(Integer, default=2048, nullable=False)
+
+     # Status and progress
+     status = Column(String(20), default=TranslationJobStatus.PENDING.value, nullable=False, index=True)
+     progress_percentage = Column(Numeric(5, 2), default=0.0, nullable=False)
+     chunks_total = Column(Integer, default=0, nullable=False)
+     chunks_completed = Column(Integer, default=0, nullable=False)
+     chunks_failed = Column(Integer, default=0, nullable=False)
+
+     # Retry settings
+     retry_count = Column(Integer, default=0, nullable=False)
+     max_retries = Column(Integer, default=3, nullable=False)
+
+     # Performance metrics
+     started_at = Column(DateTime(timezone=True), nullable=True)
+     completed_at = Column(DateTime(timezone=True), nullable=True)
+     processing_time_ms = Column(BigInteger, default=0, nullable=False)
+
+     # Cost tracking
+     input_tokens = Column(BigInteger, default=0, nullable=False)
+     output_tokens = Column(BigInteger, default=0, nullable=False)
+     estimated_cost_usd = Column(Numeric(10, 6), default=0.000000, nullable=False)
+     actual_cost_usd = Column(Numeric(10, 6), nullable=True)
+
+     # Quality metrics
+     quality_score = Column(Numeric(5, 2), nullable=True)  # 1-5 score
+     confidence_score = Column(Numeric(5, 2), nullable=True)  # 1-5 score
+
+     # Metadata
+     created_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
+     updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now(), nullable=False)
+     last_activity_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
+     user_agent = Column(Text, nullable=True)
+     ip_address = Column(String(45), nullable=True)  # Supports IPv6
+
+     # Relationships
+     user = relationship("User", back_populates="translation_jobs")
+     chunks = relationship("TranslationChunk", back_populates="job", cascade="all, delete-orphan")
+     errors = relationship("TranslationError", back_populates="job", cascade="all, delete-orphan")
+     metrics = relationship("TranslationMetrics", back_populates="job", cascade="all, delete-orphan")
+     cache_entries = relationship("TranslationCache", back_populates="job", cascade="all, delete-orphan")
+
+     # Constraints and indexes
+     __table_args__ = (
+         Index('idx_job_status_created', 'status', 'created_at'),
+         Index('idx_user_status', 'user_id', 'status'),
+         Index('idx_content_lookup', 'content_hash', 'source_language', 'target_language'),
+         Index('idx_page_cache', 'page_url', 'content_hash'),
+         Index('idx_activity', 'last_activity_at'),
+         Index('idx_progress', 'status', 'progress_percentage'),
+         CheckConstraint('progress_percentage >= 0 AND progress_percentage <= 100', name='check_progress_range'),
+         CheckConstraint('temperature >= 0 AND temperature <= 2', name='check_temperature_range'),
+         CheckConstraint('chunk_size > 0 AND chunk_size <= 10000', name='check_chunk_size'),
+     )
+
+     def __repr__(self):
+         return f"<TranslationJob(id={self.id}, status={self.status}, progress={self.progress_percentage}%)>"
+
+
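The progress and cost columns are derived values; a minimal sketch of how a worker might keep them in sync after each chunk finishes (the helper and the per-1K-token prices below are illustrative assumptions, not part of this commit):

from decimal import Decimal

# Assumed prices per 1K tokens, for illustration only; real pricing varies by model.
PRICE_PER_1K_INPUT = Decimal("0.01")
PRICE_PER_1K_OUTPUT = Decimal("0.03")

def refresh_job_progress(job: "TranslationJob") -> None:
    """Recompute the derived TranslationJob fields from its raw counters."""
    processed = job.chunks_completed + job.chunks_failed
    job.progress_percentage = Decimal(processed * 100) / Decimal(max(job.chunks_total, 1))
    job.estimated_cost_usd = (
        Decimal(job.input_tokens) / 1000 * PRICE_PER_1K_INPUT
        + Decimal(job.output_tokens) / 1000 * PRICE_PER_1K_OUTPUT
    )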
+ class TranslationChunk(Base):
+     """
+     Represents a chunk of text being translated.
+
+     Supports:
+     - Individual chunk status tracking
+     - Retry mechanism
+     - Performance metrics per chunk
+     - Code block detection
+     """
+
+     __tablename__ = "translation_chunks"
+
+     # Primary key and identifiers
+     id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+     job_id = Column(UUID(as_uuid=True), ForeignKey("translation_jobs.id"), nullable=False, index=True)
+     chunk_index = Column(Integer, nullable=False)
+
+     # Content
+     original_text = Column(Text, nullable=False)
+     translated_text = Column(Text, nullable=True)
+
+     # Position in original text
+     start_position = Column(Integer, nullable=False)
+     end_position = Column(Integer, nullable=False)
+
+     # Chunk properties
+     is_code_block = Column(Boolean, default=False, nullable=False)
+     code_language = Column(String(50), nullable=True)
+     word_count = Column(Integer, nullable=False)
+
+     # Status and processing
+     status = Column(String(20), default=ChunkStatus.PENDING.value, nullable=False, index=True)
+     retry_count = Column(Integer, default=0, nullable=False)
+
+     # Processing metrics
+     started_at = Column(DateTime(timezone=True), nullable=True)
+     completed_at = Column(DateTime(timezone=True), nullable=True)
+     processing_time_ms = Column(BigInteger, default=0, nullable=False)
+
+     # Token usage
+     input_tokens = Column(Integer, default=0, nullable=False)
+     output_tokens = Column(Integer, default=0, nullable=False)
+
+     # Quality indicators
+     confidence_score = Column(Numeric(5, 2), nullable=True)
+     requires_review = Column(Boolean, default=False, nullable=False)
+
+     # Error information
+     last_error = Column(Text, nullable=True)
+     error_code = Column(String(50), nullable=True)
+
+     # Metadata
+     created_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
+     updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now(), nullable=False)
+
+     # Relationships
+     job = relationship("TranslationJob", back_populates="chunks")
+
+     # Constraints and indexes
+     __table_args__ = (
+         Index('idx_job_chunk', 'job_id', 'chunk_index', unique=True),
+         Index('idx_chunk_status', 'status', 'created_at'),
+         Index('idx_code_blocks', 'is_code_block', 'code_language'),
+         CheckConstraint('chunk_index >= 0', name='check_chunk_index'),
+         CheckConstraint('start_position >= 0 AND end_position >= start_position', name='check_positions'),
+         CheckConstraint('word_count >= 0', name='check_word_count'),
+     )
+
+     def __repr__(self):
+         return f"<TranslationChunk(job_id={self.job_id}, index={self.chunk_index}, status={self.status})>"
+
+
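The position columns let chunks be stitched back together in order. A minimal sketch of producing chunks that satisfy the check_positions constraint, assuming a plain fixed-size split (the real service may split more carefully around sentences and code blocks):

def split_into_chunks(text: str, chunk_size: int = 2000):
    """Yield (index, start, end, piece) tuples with end >= start, as the constraint requires."""
    for index, start in enumerate(range(0, len(text), chunk_size)):
        end = min(start + chunk_size, len(text))
        yield index, start, end, text[start:end]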
+ class TranslationError(Base):
+     """
+     Tracks errors during translation processing.
+
+     Supports:
+     - Detailed error logging
+     - Error categorization
+     - Retry tracking
+     - Error analytics
+     """
+
+     __tablename__ = "translation_errors"
+
+     # Primary key and identifiers
+     id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+     job_id = Column(UUID(as_uuid=True), ForeignKey("translation_jobs.id"), nullable=False, index=True)
+     chunk_id = Column(UUID(as_uuid=True), ForeignKey("translation_chunks.id"), nullable=True, index=True)
+     error_id = Column(String(64), unique=True, nullable=False, index=True)  # Unique error identifier
+
+     # Error details
+     error_type = Column(String(50), nullable=False, index=True)  # e.g., "api_error", "timeout", "rate_limit"
+     error_code = Column(String(50), nullable=True)  # API error code
+     error_message = Column(Text, nullable=False)
+     error_details = Column(JSON, nullable=True)  # Additional error context
+
+     # Severity and categorization
+     severity = Column(String(20), default=ErrorSeverity.MEDIUM.value, nullable=False, index=True)
+     category = Column(String(50), nullable=False, index=True)  # e.g., "network", "parsing", "validation"
+
+     # Retry information
+     is_retriable = Column(Boolean, default=True, nullable=False)
+     retry_attempt = Column(Integer, default=1, nullable=False)
+     max_retries = Column(Integer, default=3, nullable=False)
+     next_retry_at = Column(DateTime(timezone=True), nullable=True, index=True)
+
+     # Context information
+     request_payload = Column(JSON, nullable=True)  # Sanitized request data
+     response_payload = Column(JSON, nullable=True)  # Sanitized response data
+
+     # Stack trace and debugging
+     stack_trace = Column(Text, nullable=True)
+     debug_info = Column(JSON, nullable=True)
+
+     # Resolution
+     resolved_at = Column(DateTime(timezone=True), nullable=True)
+     resolution = Column(String(200), nullable=True)  # How the error was resolved
+
+     # Metadata
+     created_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
+     updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now(), nullable=False)
+
+     # Relationships
+     job = relationship("TranslationJob", back_populates="errors")
+     chunk = relationship("TranslationChunk")
+
+     # Constraints and indexes
+     __table_args__ = (
+         Index('idx_error_type_created', 'error_type', 'created_at'),
+         Index('idx_error_severity', 'severity', 'created_at'),
+         Index('idx_job_errors', 'job_id', 'created_at'),
+         Index('idx_retry_schedule', 'next_retry_at', 'is_retriable'),
+     )
+
+     def __repr__(self):
+         return f"<TranslationError(id={self.id}, type={self.error_type}, severity={self.severity})>"
+
+
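next_retry_at only stores a schedule; one plausible way to fill it is exponential backoff with jitter. A hedged sketch (the base delay is illustrative, not a value taken from this commit):

import random
from datetime import datetime, timedelta, timezone

def schedule_retry(error: "TranslationError", base_seconds: int = 30) -> None:
    """Set next_retry_at with exponential backoff, or mark the error as exhausted."""
    if not error.is_retriable or error.retry_attempt >= error.max_retries:
        error.is_retriable = False
        return
    delay = base_seconds * 2 ** (error.retry_attempt - 1)
    delay += random.uniform(0, delay * 0.1)  # jitter to avoid retry stampedes
    error.next_retry_at = datetime.now(timezone.utc) + timedelta(seconds=delay)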
+ class TranslationSession(Base):
+     """
+     Manages user translation sessions.
+
+     Supports:
+     - Session-based tracking
+     - Rate limiting
+     - User preferences
+     - Analytics
+     """
+
+     __tablename__ = "translation_sessions"
+
+     # Primary key and identifiers
+     id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+     session_id = Column(String(128), unique=True, nullable=False, index=True)
+     user_id = Column(String(36), ForeignKey("users.id"), nullable=True, index=True)
+
+     # Session information
+     started_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
+     last_activity_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
+     expires_at = Column(DateTime(timezone=True), nullable=False, index=True)
+     is_active = Column(Boolean, default=True, nullable=False, index=True)
+
+     # Usage tracking
+     request_count = Column(Integer, default=0, nullable=False)
+     character_count = Column(Integer, default=0, nullable=False)
+     total_cost_usd = Column(Numeric(10, 6), default=0.000000, nullable=False)
+
+     # Rate limiting
+     requests_per_minute = Column(Integer, default=60, nullable=False)
+     characters_per_hour = Column(Integer, default=100000, nullable=False)
+
+     # Session context
+     source_language = Column(String(10), nullable=True)
+     target_language = Column(String(10), nullable=True)
+     preferred_model = Column(String(50), nullable=True)
+
+     # Client information
+     user_agent = Column(Text, nullable=True)
+     ip_address = Column(String(45), nullable=True, index=True)
+     country_code = Column(String(2), nullable=True)
+
+     # Session preferences (stored as JSON)
+     preferences = Column(JSON, nullable=True)
+
+     # Relationships
+     user = relationship("User", back_populates="translation_sessions")
+
+     # Constraints and indexes
+     __table_args__ = (
+         Index('idx_user_sessions', 'user_id', 'is_active'),
+         Index('idx_session_expiry', 'expires_at', 'is_active'),
+         Index('idx_ip_sessions', 'ip_address', 'started_at'),
+         CheckConstraint('request_count >= 0', name='check_request_count'),
+         CheckConstraint('character_count >= 0', name='check_character_count'),
+         CheckConstraint('requests_per_minute > 0', name='check_rate_limit_requests'),
+         CheckConstraint('characters_per_hour > 0', name='check_rate_limit_chars'),
+     )
+
+     def __repr__(self):
+         return f"<TranslationSession(id={self.session_id}, active={self.is_active}, requests={self.request_count})>"
+
+
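The requests_per_minute / characters_per_hour columns store limits; enforcement happens elsewhere. A sketch of the check a middleware might run before accepting a request, where the recent_* counters would come from a separate store such as Redis (assumed, not shown in this commit):

def within_limits(session: "TranslationSession", recent_requests: int, recent_characters: int) -> bool:
    """True if one more request stays inside the session's stored limits."""
    return (
        session.is_active
        and recent_requests < session.requests_per_minute
        and recent_characters < session.characters_per_hour
    )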
+ class TranslationCache(Base):
+     """
+     Enhanced translation caching with page URL support.
+
+     Supports:
+     - Page URL + content hash keys
+     - Hierarchical caching
+     - Cache invalidation
+     - Cache analytics
+     """
+
+     __tablename__ = "translation_cache"
+
+     # Primary key and identifiers
+     id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+     cache_key = Column(String(128), unique=True, nullable=False, index=True)
+     job_id = Column(UUID(as_uuid=True), ForeignKey("translation_jobs.id"), nullable=True, index=True)
+
+     # Cache keys
+     content_hash = Column(String(64), nullable=False, index=True)
+     page_url = Column(Text, nullable=True, index=True)
+     url_hash = Column(String(64), nullable=True, index=True)  # Hash of URL for privacy
+
+     # Translation data
+     source_language = Column(String(10), nullable=False, index=True)
+     target_language = Column(String(10), nullable=False, index=True)
+     original_text = Column(Text, nullable=False)
+     translated_text = Column(Text, nullable=False)
+
+     # Cache metadata
+     hit_count = Column(Integer, default=0, nullable=False)
+     last_hit_at = Column(DateTime(timezone=True), nullable=True)
+     created_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
+     expires_at = Column(DateTime(timezone=True), nullable=False, index=True)
+
+     # Quality and performance
+     quality_score = Column(Numeric(5, 2), nullable=True)
+     processing_time_ms = Column(BigInteger, nullable=False)
+     model_version = Column(String(50), nullable=False)
+
+     # Cache configuration
+     ttl_hours = Column(Integer, default=168, nullable=False)  # 7 days default
+     is_pinned = Column(Boolean, default=False, nullable=False)  # Never expires if pinned
+     priority = Column(Integer, default=0, nullable=False)  # Higher priority is less likely to be evicted
+
+     # Validation
+     is_validated = Column(Boolean, default=False, nullable=False)
+     validated_by = Column(String(50), nullable=True)  # user_id or "system"
+
+     # Relationships
+     job = relationship("TranslationJob", back_populates="cache_entries")
+
+     # Constraints and indexes
+     __table_args__ = (
+         Index('idx_cache_lookup', 'content_hash', 'source_language', 'target_language'),
+         # Renamed from 'idx_page_cache', which already exists on translation_jobs;
+         # index names must be unique within a schema.
+         Index('idx_cache_page', 'url_hash', 'content_hash'),
+         Index('idx_cache_expires', 'expires_at', 'priority'),
+         Index('idx_cache_popularity', 'hit_count', 'last_hit_at'),
+         CheckConstraint('hit_count >= 0', name='check_hit_count'),
+         CheckConstraint('processing_time_ms >= 0', name='check_processing_time'),
+         CheckConstraint('ttl_hours > 0', name='check_ttl_hours'),
+     )
+
+     def __repr__(self):
+         return f"<TranslationCache(key={self.cache_key[:20]}..., hits={self.hit_count})>"
+
+
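A sketch of deriving a deterministic cache_key plus url_hash so lookups can hit idx_cache_lookup and idx_cache_page; the exact key layout is an assumption, not taken from this commit:

import hashlib
from typing import Optional, Tuple

def make_cache_key(page_url: Optional[str], text: str, src: str, tgt: str) -> Tuple[str, Optional[str]]:
    """Derive (cache_key, url_hash) for a TranslationCache row; fits String(128)."""
    content_hash = hashlib.sha256(text.encode()).hexdigest()
    url_hash = hashlib.sha256(page_url.encode()).hexdigest() if page_url else None
    cache_key = f"{src}:{tgt}:{content_hash[:32]}:{(url_hash or 'none')[:16]}"
    return cache_key, url_hash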
+ class TranslationMetrics(Base):
+     """
+     Tracks detailed translation metrics and analytics.
+
+     Supports:
+     - Performance monitoring
+     - Quality analytics
+     - Cost tracking
+     - Usage statistics
+     """
+
+     __tablename__ = "translation_metrics"
+
+     # Primary key and identifiers
+     id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+     job_id = Column(UUID(as_uuid=True), ForeignKey("translation_jobs.id"), nullable=False, index=True)
+     user_id = Column(String(36), ForeignKey("users.id"), nullable=True, index=True)
+
+     # Time period
+     metric_date = Column(DateTime(timezone=True), nullable=False, index=True)
+     period_type = Column(String(20), nullable=False, index=True)  # hourly, daily, weekly, monthly
+
+     # Usage metrics
+     total_requests = Column(Integer, default=0, nullable=False)
+     total_characters = Column(BigInteger, default=0, nullable=False)
+     total_chunks = Column(Integer, default=0, nullable=False)
+     successful_translations = Column(Integer, default=0, nullable=False)
+     failed_translations = Column(Integer, default=0, nullable=False)
+
+     # Performance metrics
+     avg_processing_time_ms = Column(BigInteger, default=0, nullable=False)
+     min_processing_time_ms = Column(BigInteger, default=0, nullable=False)
+     max_processing_time_ms = Column(BigInteger, default=0, nullable=False)
+     p95_processing_time_ms = Column(BigInteger, default=0, nullable=False)
+
+     # Cost metrics
+     total_input_tokens = Column(BigInteger, default=0, nullable=False)
+     total_output_tokens = Column(BigInteger, default=0, nullable=False)
+     total_cost_usd = Column(Numeric(12, 6), default=0.000000, nullable=False)
+     avg_cost_per_char = Column(Numeric(10, 8), default=0.00000000, nullable=False)
+
+     # Quality metrics
+     avg_quality_score = Column(Numeric(5, 2), nullable=True)
+     avg_confidence_score = Column(Numeric(5, 2), nullable=True)
+
+     # Cache metrics
+     cache_hits = Column(Integer, default=0, nullable=False)
+     cache_misses = Column(Integer, default=0, nullable=False)
+     cache_hit_rate = Column(Numeric(5, 2), default=0.0, nullable=False)
+
+     # Error metrics
+     error_count = Column(Integer, default=0, nullable=False)
+     error_rate = Column(Numeric(5, 2), default=0.0, nullable=False)
+     top_error_types = Column(JSON, nullable=True)  # Top 5 error types with counts
+
+     # Additional dimensions
+     source_language = Column(String(10), nullable=True, index=True)
+     target_language = Column(String(10), nullable=True, index=True)
+     model_name = Column(String(50), nullable=True, index=True)
+
+     # Metadata
+     created_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
+     updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now(), nullable=False)
+
+     # Relationships
+     job = relationship("TranslationJob", back_populates="metrics")
+     user = relationship("User", back_populates="translation_metrics")
+
+     # Constraints and indexes
+     __table_args__ = (
+         Index('idx_metrics_date_period', 'metric_date', 'period_type'),
+         Index('idx_user_metrics', 'user_id', 'metric_date'),
+         Index('idx_job_metrics', 'job_id', 'metric_date'),
+         Index('idx_lang_metrics', 'source_language', 'target_language', 'metric_date'),
+         CheckConstraint('total_requests >= 0', name='check_total_requests'),
+         CheckConstraint('total_characters >= 0', name='check_total_characters'),
+         CheckConstraint('cache_hit_rate >= 0 AND cache_hit_rate <= 100', name='check_cache_hit_rate'),
+         CheckConstraint('error_rate >= 0 AND error_rate <= 100', name='check_error_rate'),
+     )
+
+     def __repr__(self):
+         return f"<TranslationMetrics(date={self.metric_date}, requests={self.total_requests})>"
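The two rate columns are constrained to 0-100, so any aggregation task has to guard its divisions. A minimal sketch of the derivation (hypothetical helper, not part of this commit):

from decimal import Decimal

def fill_rates(metrics: "TranslationMetrics") -> None:
    """Derive cache_hit_rate and error_rate from the raw counters, avoiding division by zero."""
    lookups = metrics.cache_hits + metrics.cache_misses
    metrics.cache_hit_rate = Decimal(metrics.cache_hits * 100) / Decimal(max(lookups, 1))
    metrics.error_rate = Decimal(metrics.error_count * 100) / Decimal(max(metrics.total_requests, 1))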
src/models/user_preferences.py ADDED
@@ -0,0 +1,54 @@
+ """
+ User preferences model for storing personalization settings.
+ """
+
+ from sqlalchemy import Column, String, Boolean, Integer, Float, ForeignKey, Text
+ from sqlalchemy.orm import relationship
+ from src.models.base import BaseModel
+
+
+ class UserPreference(BaseModel):
+     """Stores user personalization settings."""
+
+     __tablename__ = "user_preferences"
+
+     user_id = Column(String(36), ForeignKey("users.id"), nullable=False, unique=True, index=True)
+     language = Column(String(10), nullable=False, default='en')  # en, ur, ur-roman
+     reading_pace = Column(String(20), nullable=False, default='medium')  # slow, medium, fast
+     preferred_depth = Column(String(20), nullable=False, default='detailed')  # overview, detailed, comprehensive
+     show_code_examples = Column(Boolean, nullable=False, default=True)
+     adaptive_difficulty = Column(Boolean, nullable=False, default=False)
+     theme = Column(String(20), nullable=False, default='auto')  # light, dark, auto
+     font_size = Column(Integer, nullable=False, default=16)
+     line_height = Column(Float, nullable=False, default=1.5)
+
+     # Relationships
+     user = relationship("User", back_populates="preferences")
+     custom_notes = relationship("UserCustomNote", back_populates="preference", cascade="all, delete-orphan")
+
+     __table_args__ = (
+         {"extend_existing": True},
+     )
+
+     def __repr__(self):
+         return f"<UserPreference(user_id='{self.user_id}', language='{self.language}', theme='{self.theme}')>"
+
+
+ class UserCustomNote(BaseModel):
+     """Custom notes as key-value pairs for user preferences."""
+
+     __tablename__ = "user_custom_notes"
+
+     user_preference_id = Column(String(36), ForeignKey("user_preferences.id"), nullable=False)
+     key = Column(String(100), nullable=False)
+     value = Column(Text, nullable=False)
+
+     # Relationships
+     preference = relationship("UserPreference", back_populates="custom_notes")
+
+     __table_args__ = (
+         {"extend_existing": True},
+     )
+
+     def __repr__(self):
+         return f"<UserCustomNote(key='{self.key}', preference_id='{self.user_preference_id}')>"
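Since user_id is unique, reads and writes reduce to an upsert. A minimal sketch assuming a synchronous SQLAlchemy Session (not part of this commit):

from sqlalchemy.orm import Session

def set_language(db: Session, user_id: str, language: str) -> UserPreference:
    """Create or update the user's preference row."""
    pref = db.query(UserPreference).filter_by(user_id=user_id).one_or_none()
    if pref is None:
        pref = UserPreference(user_id=user_id)
        db.add(pref)
    pref.language = language
    db.commit()
    return pref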
src/services/cache_examples.py ADDED
@@ -0,0 +1,231 @@
+ """
+ Cache service usage examples.
+
+ This file demonstrates how to use the cache service for various scenarios
+ including translations, user preferences, and API response caching.
+ """
+
+ import asyncio
+ from typing import Dict, Any
+ from src.services.cache_service import (
+     get_cache_service,
+     CacheType,
+     cache_translation,
+     get_cached_translation,
+     cache_user_preference,
+     get_cached_user_preference,
+     cache_api_response,
+     get_cached_api_response
+ )
+
+
+ async def example_basic_usage():
+     """Basic cache service usage example."""
+     # Get cache service instance
+     cache = await get_cache_service()
+
+     # Generate a cache key
+     cache_key = cache._generate_cache_key(
+         prefix="example",
+         identifier="user_123",
+         version="v1",
+         param1="value1",
+         param2="value2"
+     )
+
+     # Set a value
+     await cache.set(
+         key=cache_key,
+         value={"message": "Hello, cached world!"},
+         cache_type=CacheType.API_RESPONSE,
+         ttl=60  # 1 minute
+     )
+
+     # Get the value
+     cached_value = await cache.get(cache_key, CacheType.API_RESPONSE)
+     print(f"Cached value: {cached_value}")
+
+     # Delete the value
+     await cache.delete(cache_key)
+
+
+ async def example_translation_caching():
+     """Example of caching translations."""
+     # Cache a translation
+     translation_data = {
+         "en": "Hello, World!",
+         "ur": "ہیلو، دنیا!",
+         "ur-roman": "Hello, Duniya!"
+     }
+
+     success = await cache_translation(
+         key="greeting.hello_world",
+         translation=translation_data,
+         language="all"
+     )
+
+     if success:
+         print("Translation cached successfully")
+
+     # Retrieve cached translation
+     cached_translation = await get_cached_translation(
+         key="greeting.hello_world",
+         language="all"
+     )
+
+     if cached_translation:
+         print(f"Cached translation: {cached_translation}")
+
+
+ async def example_user_preference_caching():
+     """Example of caching user preferences."""
+     # Cache user preferences
+     user_prefs = {
+         "language": "en",
+         "theme": "dark",
+         "font_size": 16,
+         "reading_pace": "medium",
+         "show_code_examples": True
+     }
+
+     success = await cache_user_preference(
+         user_id="user_456",
+         preferences=user_prefs
+     )
+
+     if success:
+         print("User preferences cached successfully")
+
+     # Retrieve cached preferences
+     cached_prefs = await get_cached_user_preference("user_456")
+
+     if cached_prefs:
+         print(f"Cached preferences: {cached_prefs}")
+
+
+ async def example_api_response_caching():
+     """Example of caching API responses."""
+     # Cache API response
+     api_response = {
+         "status": "success",
+         "data": [
+             {"id": 1, "title": "Chapter 1"},
+             {"id": 2, "title": "Chapter 2"}
+         ],
+         "pagination": {
+             "page": 1,
+             "total_pages": 10
+         }
+     }
+
+     success = await cache_api_response(
+         endpoint="/api/v1/chapters",
+         params={"page": 1, "limit": 10},
+         response=api_response,
+         ttl=300  # 5 minutes
+     )
+
+     if success:
+         print("API response cached successfully")
+
+     # Retrieve cached API response
+     cached_response = await get_cached_api_response(
+         endpoint="/api/v1/chapters",
+         params={"page": 1, "limit": 10}
+     )
+
+     if cached_response:
+         print(f"Cached API response: {cached_response}")
+
+
+ async def example_cache_statistics():
+     """Example of retrieving cache statistics."""
+     cache = await get_cache_service()
+
+     # Get cache statistics
+     stats = cache.get_stats()
+
+     print("Cache Statistics:")
+     print(f"  Total requests: {stats['total_requests']}")
+     print(f"  Cache hits: {stats['hits']}")
+     print(f"  Cache misses: {stats['misses']}")
+     print(f"  Hit rate: {stats['hit_rate']}%")
+     print(f"  Redis hits: {stats['redis_hits']}")
+     print(f"  Local hits: {stats['local_hits']}")
+     print(f"  Errors: {stats['errors']}")
+     print(f"  Redis enabled: {stats['redis_enabled']}")
+     print(f"  Memory cache size: {stats['memory_cache_size']}")
+
+
+ async def example_cache_cleanup():
+     """Example of cleaning up expired cache entries."""
+     cache = await get_cache_service()
+
+     # Clean up expired entries
+     cleaned_count = await cache.cleanup_expired()
+     print(f"Cleaned up {cleaned_count} expired cache entries")
+
+     # Clear all cache entries for a specific type
+     cleared_count = await cache.clear(cache_type=CacheType.TRANSLATION)
+     print(f"Cleared {cleared_count} translation cache entries")
+
+     # Clear cache entries matching a pattern
+     cleared_count = await cache.clear(pattern="api:v1:user_*")
+     print(f"Cleared {cleared_count} entries matching pattern")
+
+
+ async def example_concurrent_access():
+     """Example demonstrating thread-safe concurrent access."""
+     async def worker(worker_id: int):
+         cache = await get_cache_service()
+
+         # Each worker uses its own key space
+         key = f"worker_{worker_id}:data"
+
+         for i in range(10):
+             # Set value
+             await cache.set(
+                 key=key,
+                 value={"worker": worker_id, "iteration": i},
+                 cache_type=CacheType.API_RESPONSE,
+                 ttl=60
+             )
+
+             # Get value
+             value = await cache.get(key, CacheType.API_RESPONSE)
+             print(f"Worker {worker_id}, iteration {i}: {value}")
+
+             # Small delay
+             await asyncio.sleep(0.1)
+
+     # Run multiple workers concurrently
+     tasks = [worker(i) for i in range(5)]
+     await asyncio.gather(*tasks)
+
+
+ async def main():
+     """Run all examples."""
+     print("=== Basic Usage ===")
+     await example_basic_usage()
+
+     print("\n=== Translation Caching ===")
+     await example_translation_caching()
+
+     print("\n=== User Preference Caching ===")
+     await example_user_preference_caching()
+
+     print("\n=== API Response Caching ===")
+     await example_api_response_caching()
+
+     print("\n=== Cache Statistics ===")
+     await example_cache_statistics()
+
+     print("\n=== Cache Cleanup ===")
+     await example_cache_cleanup()
+
+     print("\n=== Concurrent Access ===")
+     await example_concurrent_access()
+
+
+ if __name__ == "__main__":
+     asyncio.run(main())
src/services/cache_service.py ADDED
@@ -0,0 +1,690 @@
+ """
+ Cache service for server-side caching with a file-based local fallback.
+
+ Provides Redis caching with a file-backed fallback store, supporting different
+ TTLs for various cache types including translations, user preferences, and
+ API responses.
+ """
+
+ import json
+ import pickle
+ import asyncio
+ from datetime import datetime, timedelta
+ from typing import Any, Dict, List, Optional, Union
+ from enum import Enum
+ import hashlib
+ import os
+ from pathlib import Path
+
+ try:
+     import redis.asyncio as redis
+     REDIS_AVAILABLE = True
+ except ImportError:
+     REDIS_AVAILABLE = False
+     redis = None
+
+ from src.utils.errors import CacheError, ValidationError
+ from src.utils.logging import get_logger
+
+ logger = get_logger(__name__)
+
+
+ class CacheType(Enum):
+     """Cache types with different TTLs."""
+     TRANSLATION = "translation"
+     USER_PREFERENCE = "user_preference"
+     API_RESPONSE = "api_response"
+     PERSONALIZATION = "personalization"
+     PROGRESS = "progress"
+     SEARCH_RESULT = "search_result"
+     BOOKMARK = "bookmark"
+
+
+ class CacheService:
+     """
+     Cache service with Redis primary and a file-based local fallback.
+
+     Features:
+     - Redis as primary cache (if available)
+     - File-backed local store as fallback
+     - TTL support per cache type
+     - Compression for large objects
+     - Statistics tracking
+     - Error handling and logging
+     """
+
+     # TTL configurations (in seconds)
+     TTL_CONFIG = {
+         CacheType.TRANSLATION: 7 * 24 * 60 * 60,       # 7 days
+         CacheType.USER_PREFERENCE: 30 * 24 * 60 * 60,  # 30 days
+         CacheType.API_RESPONSE: 5 * 60,                # 5 minutes
+         CacheType.PERSONALIZATION: 1 * 60 * 60,        # 1 hour
+         CacheType.PROGRESS: 24 * 60 * 60,              # 24 hours
+         CacheType.SEARCH_RESULT: 10 * 60,              # 10 minutes
+         CacheType.BOOKMARK: 30 * 24 * 60 * 60,         # 30 days
+     }
+
+     # Statistics (class-level; the service is used as a process-wide singleton)
+     _stats = {
+         "hits": 0,
+         "misses": 0,
+         "errors": 0,
+         "redis_hits": 0,
+         "local_hits": 0,
+     }
+
+     def __init__(
+         self,
+         redis_url: Optional[str] = None,
+         localStorage_path: Optional[str] = None,
+         enable_redis: bool = True,
+         enable_compression: bool = True,
+         compression_threshold: int = 1024
+     ):
+         """
+         Initialize cache service.
+
+         Args:
+             redis_url: Redis connection URL
+             localStorage_path: Path to the local cache directory
+             enable_redis: Whether to use Redis if available
+             enable_compression: Whether to compress large objects
+             compression_threshold: Size threshold for compression (bytes)
+         """
+         self.redis_url = redis_url or os.getenv("REDIS_URL", "redis://localhost:6379/0")
+         self.localStorage_path = Path(localStorage_path or os.getenv("CACHE_LOCAL_PATH", "./cache_data"))
+         self.enable_redis = enable_redis and REDIS_AVAILABLE
+         self.enable_compression = enable_compression
+         self.compression_threshold = compression_threshold
+
+         self._redis_client = None
+         self._local_cache = {}
+
+         # Initialize the local cache directory
+         self.localStorage_path.mkdir(parents=True, exist_ok=True)
+
+         logger.info(
+             "Cache service initialized",
+             redis_enabled=self.enable_redis,
+             localStorage_path=str(self.localStorage_path),
+             compression_enabled=self.enable_compression
+         )
+
+     async def _get_redis_client(self):
+         """Get or create Redis client."""
+         if not self.enable_redis:
+             return None
+
+         if self._redis_client is None:
+             try:
+                 self._redis_client = redis.from_url(
+                     self.redis_url,
+                     encoding="utf-8",
+                     decode_responses=False,
+                     socket_connect_timeout=5,
+                     socket_timeout=5,
+                     retry_on_timeout=True,
+                     health_check_interval=30
+                 )
+                 # Test connection
+                 await self._redis_client.ping()
+                 logger.info("Redis connection established")
+             except Exception as e:
+                 logger.warning("Failed to connect to Redis", error=str(e))
+                 self.enable_redis = False
+                 self._redis_client = None
+
+         return self._redis_client
+
+     def _generate_cache_key(
+         self,
+         prefix: str,
+         identifier: str,
+         version: str = "v1",
+         **kwargs
+     ) -> str:
+         """
+         Generate a consistent cache key.
+
+         Args:
+             prefix: Cache type or prefix
+             identifier: Unique identifier for the cache entry
+             version: Version of the cache schema
+             **kwargs: Additional parameters to include in key
+
+         Returns:
+             Generated cache key
+         """
+         # Create a stable representation of parameters
+         params = sorted(kwargs.items())
+         param_str = json.dumps(params, sort_keys=True, separators=(',', ':'))
+
+         # Create hash of identifier and params
+         hash_input = f"{identifier}:{param_str}"
+         hash_value = hashlib.sha256(hash_input.encode()).hexdigest()[:16]
+
+         return f"{prefix}:{version}:{identifier}:{hash_value}"
+
+     async def get(
+         self,
+         key: str,
+         cache_type: CacheType = CacheType.API_RESPONSE,
+         use_compression: Optional[bool] = None
+     ) -> Optional[Any]:
+         """
+         Get value from cache.
+
+         Args:
+             key: Cache key
+             cache_type: Type of cache entry
+             use_compression: Override compression setting
+
+         Returns:
+             Cached value or None if not found
+         """
+         try:
+             # Try Redis first
+             if self.enable_redis:
+                 redis_client = await self._get_redis_client()
+                 if redis_client:
+                     value = await self._get_from_redis(
+                         redis_client,
+                         key,
+                         cache_type,
+                         use_compression
+                     )
+                     if value is not None:
+                         self._stats["hits"] += 1
+                         self._stats["redis_hits"] += 1
+                         return value
+
+             # Fall back to the local file cache
+             value = await self._get_from_local(key, cache_type, use_compression)
+             if value is not None:
+                 self._stats["hits"] += 1
+                 self._stats["local_hits"] += 1
+
+                 # If found locally but not in Redis, backfill to Redis
+                 if self.enable_redis:
+                     redis_client = await self._get_redis_client()
+                     if redis_client:
+                         ttl = self.TTL_CONFIG[cache_type]
+                         await self._set_to_redis(
+                             redis_client,
+                             key,
+                             value,
+                             ttl,
+                             use_compression
+                         )
+
+                 return value
+
+             # Cache miss
+             self._stats["misses"] += 1
+             return None
+
+         except Exception as e:
+             self._stats["errors"] += 1
+             logger.error("Cache get failed", key=key, error=str(e))
+             return None
+
+     async def set(
+         self,
+         key: str,
+         value: Any,
+         cache_type: CacheType = CacheType.API_RESPONSE,
+         ttl: Optional[int] = None,
+         use_compression: Optional[bool] = None
+     ) -> bool:
+         """
+         Set value in cache.
+
+         Args:
+             key: Cache key
+             value: Value to cache
+             cache_type: Type of cache entry
+             ttl: Time to live in seconds (overrides type TTL)
+             use_compression: Override compression setting
+
+         Returns:
+             True if successful, False otherwise
+         """
+         try:
+             success = True
+             ttl = ttl or self.TTL_CONFIG[cache_type]
+
+             # Set in Redis
+             if self.enable_redis:
+                 redis_client = await self._get_redis_client()
+                 if redis_client:
+                     success = await self._set_to_redis(
+                         redis_client,
+                         key,
+                         value,
+                         ttl,
+                         use_compression
+                     ) and success
+
+             # Set in the local file cache (always, as fallback)
+             local_success = await self._set_to_local(
+                 key,
+                 value,
+                 cache_type,
+                 ttl,
+                 use_compression
+             )
+             success = local_success and success
+
+             return success
+
+         except Exception as e:
+             self._stats["errors"] += 1
+             logger.error("Cache set failed", key=key, error=str(e))
+             return False
+
+     async def delete(self, key: str) -> bool:
+         """
+         Delete key from cache.
+
+         Args:
+             key: Cache key to delete
+
+         Returns:
+             True if successful, False otherwise
+         """
+         try:
+             # Delete from Redis
+             if self.enable_redis:
+                 redis_client = await self._get_redis_client()
+                 if redis_client:
+                     await redis_client.delete(key)
+
+             # Delete from the local file cache
+             local_file = self.localStorage_path / f"{key}.cache"
+             if local_file.exists():
+                 local_file.unlink()
+
+             # Remove from memory cache
+             if key in self._local_cache:
+                 del self._local_cache[key]
+
+             return True
+
+         except Exception as e:
+             self._stats["errors"] += 1
+             logger.error("Cache delete failed", key=key, error=str(e))
+             return False
+
319
+ async def clear(
320
+ self,
321
+ pattern: Optional[str] = None,
322
+ cache_type: Optional[CacheType] = None
323
+ ) -> int:
324
+ """
325
+ Clear cache entries.
326
+
327
+ Args:
328
+ pattern: Pattern to match keys (supports wildcards)
329
+ cache_type: Clear only this cache type
330
+
331
+ Returns:
332
+ Number of entries cleared
333
+ """
334
+ try:
335
+ cleared_count = 0
336
+
337
+ # Build pattern if cache_type specified
338
+ if cache_type and not pattern:
339
+ pattern = f"{cache_type.value}:*"
340
+
341
+ # Clear from Redis
342
+ if self.enable_redis:
343
+ redis_client = await self._get_redis_client()
344
+ if redis_client:
345
+ if pattern:
346
+ keys = await redis_client.keys(pattern)
347
+ if keys:
348
+ await redis_client.delete(*keys)
349
+ cleared_count += len(keys)
350
+ else:
351
+ await redis_client.flushdb()
352
+ cleared_count = -1 # Indicate full clear
353
+
354
+ # Clear from localStorage
355
+ if pattern:
356
+ # Convert pattern to file pattern
357
+ file_pattern = pattern.replace("*", "").replace(":", "_") + "*.cache"
358
+ for cache_file in self.localStorage_path.glob(file_pattern):
359
+ cache_file.unlink()
360
+ cleared_count += 1
361
+ else:
362
+ # Clear all files
363
+ for cache_file in self.localStorage_path.glob("*.cache"):
364
+ cache_file.unlink()
365
+ cleared_count += 1
366
+
367
+ # Clear memory cache
368
+ self._local_cache.clear()
369
+
370
+ logger.info("Cache cleared", pattern=pattern, count=cleared_count)
371
+ return cleared_count
372
+
373
+ except Exception as e:
374
+ self._stats["errors"] += 1
375
+ logger.error("Cache clear failed", pattern=pattern, error=str(e))
376
+ return 0
377
+
378
+ async def _get_from_redis(
379
+ self,
380
+ redis_client,
381
+ key: str,
382
+ cache_type: CacheType,
383
+ use_compression: Optional[bool]
384
+ ) -> Optional[Any]:
385
+ """Get value from Redis."""
386
+ try:
387
+ data = await redis_client.get(key)
388
+ if data is None:
389
+ return None
390
+
391
+ # Uncompress if needed
392
+ if use_compression or (use_compression is None and self.enable_compression):
393
+ if data.startswith(b"COMP:"):
394
+ import gzip
395
+ data = gzip.decompress(data[5:])
396
+
397
+ # Deserialize
398
+ return pickle.loads(data)
399
+
400
+ except Exception as e:
401
+ logger.warning("Redis get failed", key=key, error=str(e))
402
+ return None
403
+
404
+ async def _set_to_redis(
405
+ self,
406
+ redis_client,
407
+ key: str,
408
+ value: Any,
409
+ ttl: int,
410
+ use_compression: Optional[bool]
411
+ ) -> bool:
412
+ """Set value in Redis."""
413
+ try:
414
+ # Serialize
415
+ data = pickle.dumps(value)
416
+
417
+ # Compress if needed
418
+ if (use_compression or (use_compression is None and self.enable_compression)) \
419
+ and len(data) > self.compression_threshold:
420
+ import gzip
421
+ data = b"COMP:" + gzip.compress(data)
422
+
423
+ await redis_client.setex(key, ttl, data)
424
+ return True
425
+
426
+ except Exception as e:
427
+ logger.warning("Redis set failed", key=key, error=str(e))
428
+ return False
429
+
430
+ async def _get_from_local(
431
+ self,
432
+ key: str,
433
+ cache_type: CacheType,
434
+ use_compression: Optional[bool]
435
+ ) -> Optional[Any]:
436
+ """Get value from localStorage."""
437
+ try:
438
+ # Check memory cache first
439
+ cache_entry = self._local_cache.get(key)
440
+ if cache_entry:
441
+ # Check if expired
442
+ if cache_entry["expires"] > datetime.utcnow():
443
+ return cache_entry["value"]
444
+ else:
445
+ # Remove expired entry
446
+ del self._local_cache[key]
447
+
448
+ # Check file cache
449
+ cache_file = self.localStorage_path / f"{key}.cache"
450
+ if not cache_file.exists():
451
+ return None
452
+
453
+ # Read and validate file
454
+ data = cache_file.read_bytes()
455
+ cache_entry = json.loads(data.decode())
456
+
457
+ # Check if expired
458
+ expires = datetime.fromisoformat(cache_entry["expires"])
459
+ if expires <= datetime.utcnow():
460
+ cache_file.unlink()
461
+ return None
462
+
463
+ # Decode value
464
+ if cache_entry.get("compressed") and (use_compression or (use_compression is None and self.enable_compression)):
465
+ import gzip
466
+ value = pickle.loads(gzip.decompress(cache_entry["value"].encode()))
467
+ else:
468
+ value = pickle.loads(cache_entry["value"].encode())
469
+
470
+ # Update memory cache
471
+ self._local_cache[key] = {
472
+ "value": value,
473
+ "expires": expires
474
+ }
475
+
476
+ return value
477
+
478
+ except Exception as e:
479
+ logger.warning("Local cache get failed", key=key, error=str(e))
480
+ return None
481
+
482
+ async def _set_to_local(
483
+ self,
484
+ key: str,
485
+ value: Any,
486
+ cache_type: CacheType,
487
+ ttl: int,
488
+ use_compression: Optional[bool]
489
+ ) -> bool:
490
+ """Set value in localStorage."""
491
+ try:
492
+ expires = datetime.utcnow() + timedelta(seconds=ttl)
493
+
494
+ # Compress if needed
495
+ compressed = False
496
+ if (use_compression or (use_compression is None and self.enable_compression)):
497
+ serialized = pickle.dumps(value)
498
+ if len(serialized) > self.compression_threshold:
499
+ import gzip
500
+ value_serialized = gzip.compress(serialized).decode()
501
+ compressed = True
502
+ else:
503
+ value_serialized = serialized.decode()
504
+ else:
505
+ value_serialized = pickle.dumps(value).decode()
506
+
507
+ # Create cache entry
508
+ cache_entry = {
509
+ "value": value_serialized,
510
+ "expires": expires.isoformat(),
511
+ "compressed": compressed,
512
+ "cache_type": cache_type.value
513
+ }
514
+
515
+ # Write to file
516
+ cache_file = self.localStorage_path / f"{key}.cache"
517
+ cache_file.write_bytes(json.dumps(cache_entry).encode())
518
+
519
+ # Update memory cache
520
+ self._local_cache[key] = {
521
+ "value": value,
522
+ "expires": expires
523
+ }
524
+
525
+ return True
526
+
527
+ except Exception as e:
528
+ logger.warning("Local cache set failed", key=key, error=str(e))
529
+ return False
530
+
531
+ def get_stats(self) -> Dict[str, Any]:
532
+ """Get cache statistics."""
533
+ total_requests = self._stats["hits"] + self._stats["misses"]
534
+ hit_rate = self._stats["hits"] / max(total_requests, 1) * 100
535
+
536
+ return {
537
+ **self._stats,
538
+ "total_requests": total_requests,
539
+ "hit_rate": round(hit_rate, 2),
540
+ "redis_enabled": self.enable_redis,
541
+ "memory_cache_size": len(self._local_cache)
542
+ }
543
+
544
+ async def cleanup_expired(self) -> int:
545
+ """Clean up expired cache entries."""
546
+ cleaned = 0
547
+
548
+ try:
549
+ # Clean memory cache
550
+ now = datetime.utcnow()
551
+ expired_keys = [
552
+ key for key, entry in self._local_cache.items()
553
+ if entry["expires"] <= now
554
+ ]
555
+
556
+ for key in expired_keys:
557
+ del self._local_cache[key]
558
+ cleaned += 1
559
+
560
+ # Clean file cache
561
+ for cache_file in self.localStorage_path.glob("*.cache"):
562
+ try:
563
+ data = json.loads(cache_file.read_bytes().decode())
564
+ expires = datetime.fromisoformat(data["expires"])
565
+ if expires <= datetime.utcnow():
566
+ cache_file.unlink()
567
+ cleaned += 1
568
+ except:
569
+ # Invalid cache file, remove it
570
+ cache_file.unlink()
571
+ cleaned += 1
572
+
573
+ logger.info("Cache cleanup completed", cleaned_entries=cleaned)
574
+ return cleaned
575
+
576
+ except Exception as e:
577
+ logger.error("Cache cleanup failed", error=str(e))
578
+ return 0
579
+
580
+
581
+ # Global cache service instance
582
+ _cache_service: Optional[CacheService] = None
583
+
584
+
585
+ async def get_cache_service() -> CacheService:
586
+ """Get or create cache service instance."""
587
+ global _cache_service
588
+
589
+ if _cache_service is None:
590
+ _cache_service = CacheService()
591
+
592
+ return _cache_service
593
+
594
+
595
+ # Utility functions for specific cache types
596
+ async def cache_translation(
597
+ key: str,
598
+ translation: Dict[str, Any],
599
+ language: str
600
+ ) -> bool:
601
+ """Cache a translation entry."""
602
+ cache = await get_cache_service()
603
+ cache_key = cache._generate_cache_key(
604
+ "translation",
605
+ key,
606
+ lang=language
607
+ )
608
+ return await cache.set(
609
+ cache_key,
610
+ translation,
611
+ CacheType.TRANSLATION
612
+ )
613
+
614
+
615
+ async def get_cached_translation(
616
+ key: str,
617
+ language: str
618
+ ) -> Optional[Dict[str, Any]]:
619
+ """Get cached translation."""
620
+ cache = await get_cache_service()
621
+ cache_key = cache._generate_cache_key(
622
+ "translation",
623
+ key,
624
+ lang=language
625
+ )
626
+ return await cache.get(cache_key, CacheType.TRANSLATION)
627
+
628
+
629
+ async def cache_user_preference(
630
+ user_id: str,
631
+ preferences: Dict[str, Any]
632
+ ) -> bool:
633
+ """Cache user preferences."""
634
+ cache = await get_cache_service()
635
+ cache_key = cache._generate_cache_key(
636
+ "user_pref",
637
+ user_id
638
+ )
639
+ return await cache.set(
640
+ cache_key,
641
+ preferences,
642
+ CacheType.USER_PREFERENCE
643
+ )
644
+
645
+
646
+ async def get_cached_user_preference(
647
+ user_id: str
648
+ ) -> Optional[Dict[str, Any]]:
649
+ """Get cached user preferences."""
650
+ cache = await get_cache_service()
651
+ cache_key = cache._generate_cache_key(
652
+ "user_pref",
653
+ user_id
654
+ )
655
+ return await cache.get(cache_key, CacheType.USER_PREFERENCE)
656
+
657
+
658
+ async def cache_api_response(
659
+ endpoint: str,
660
+ params: Dict[str, Any],
661
+ response: Dict[str, Any],
662
+ ttl: Optional[int] = None
663
+ ) -> bool:
664
+ """Cache API response."""
665
+ cache = await get_cache_service()
666
+ cache_key = cache._generate_cache_key(
667
+ "api",
668
+ endpoint,
669
+ **params
670
+ )
671
+ return await cache.set(
672
+ cache_key,
673
+ response,
674
+ CacheType.API_RESPONSE,
675
+ ttl=ttl
676
+ )
677
+
678
+
679
+ async def get_cached_api_response(
680
+ endpoint: str,
681
+ params: Dict[str, Any]
682
+ ) -> Optional[Dict[str, Any]]:
683
+ """Get cached API response."""
684
+ cache = await get_cache_service()
685
+ cache_key = cache._generate_cache_key(
686
+ "api",
687
+ endpoint,
688
+ **params
689
+ )
690
+ return await cache.get(cache_key, CacheType.API_RESPONSE)
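The on-disk entry format here is a JSON wrapper around a base64 payload that may be gzip-compressed, because pickle output is binary and cannot round-trip through a plain str encode/decode. A standalone round-trip sketch of that framing (independent of the service; the helper names are hypothetical):

import base64, gzip, json, pickle

def encode_entry(value, compress_threshold: int = 1024) -> str:
    """Serialize a value the way _set_to_local stores it."""
    raw = pickle.dumps(value)
    compressed = len(raw) > compress_threshold
    if compressed:
        raw = gzip.compress(raw)
    return json.dumps({"value": base64.b64encode(raw).decode("ascii"), "compressed": compressed})

def decode_entry(payload: str):
    """Invert encode_entry, mirroring _get_from_local."""
    entry = json.loads(payload)
    raw = base64.b64decode(entry["value"])
    if entry["compressed"]:
        raw = gzip.decompress(raw)
    return pickle.loads(raw)

assert decode_entry(encode_entry({"a": 1})) == {"a": 1}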
src/services/code_block_handler.py ADDED
@@ -0,0 +1,630 @@
+ """
+ Code Block Handler for Translation System.
+
+ This module handles detection, preservation, and intelligent processing
+ of code blocks during translation.
+ """
+
+ import re
+ from typing import Dict, List, Optional, Any, Tuple, Set
+ from dataclasses import dataclass
+ from enum import Enum
+
+ from bs4 import BeautifulSoup, Tag
+ import markdown
+ from pygments import highlight
+ from pygments.lexers import get_lexer_by_name, guess_lexer
+ from pygments.formatters import HtmlFormatter
+
+ from src.utils.translation_logger import get_translation_logger
+
+ logger = get_translation_logger(__name__)
+
+
+ class CodeBlockType(Enum):
+     """Types of code blocks."""
+     MARKDOWN = "markdown"
+     HTML_PRE = "html_pre"
+     HTML_INLINE = "html_inline"
+     INDENTED = "indented"
+     FENCED = "fenced"
+
+
+ @dataclass
+ class CodeBlock:
+     """Represents a detected code block."""
+     block_type: CodeBlockType
+     language: Optional[str]
+     content: str
+     original_text: str
+     start_position: int
+     end_position: int
+     attributes: Dict[str, Any]
+     preserve_formatting: bool = True
+     add_urdu_comments: bool = False
+     translated: bool = False
+
+
+ class CodeBlockHandler:
+     """
+     Handles code block detection, preservation, and processing.
+
+     Features:
+     - Multi-format code block detection
+     - Language identification
+     - Format preservation
+     - Urdu comment injection
+     - Syntax highlighting
+     - Code validation
+     """
+
+     # Code block patterns
+     PATTERNS = {
+         CodeBlockType.MARKDOWN: [
+             re.compile(r'```(\w+)?\n(.*?)\n```', re.DOTALL),
+             re.compile(r'~~~(\w+)?\n(.*?)\n~~~', re.DOTALL),
+         ],
+         CodeBlockType.HTML_PRE: [
+             re.compile(r'<pre[^>]*>.*?<code[^>]*>(.*?)</code>.*?</pre>', re.DOTALL | re.IGNORECASE),
+         ],
+         CodeBlockType.HTML_INLINE: [
+             re.compile(r'<code[^>]*>(.*?)</code>', re.DOTALL | re.IGNORECASE),
+         ],
+         CodeBlockType.INDENTED: [
+             # Detect four spaces or a tab at start of line (the four literal
+             # spaces had collapsed to a single space in the previous pattern)
+             re.compile(r'^(    |\t).*$', re.MULTILINE),
+         ],
+     }
+
+     # Language patterns for detection
+     LANGUAGE_PATTERNS = {
+         'python': [r'import\s+\w+', r'def\s+\w+', r'class\s+\w+', r'if\s+__name__\s*=='],
+         'javascript': [r'function\s+\w+', r'const\s+\w+\s*=', r'let\s+\w+\s*=', r'var\s+\w+\s*='],
+         'java': [r'public\s+class\s+\w+', r'private\s+\w+\s+\w+', r'import\s+java\.'],
+         'cpp': [r'#include\s*<', r'using\s+namespace\s+', r'::\w+\s*\('],
+         'html': [r'<!DOCTYPE\s+html>', r'<html[^>]*>', r'<div[^>]*>'],
+         'css': [r'\.[\w-]+\s*{', r'#[\w-]+\s*{', r'@\w+\s*\w+\s*{'],
+         'sql': [r'SELECT\s+', r'FROM\s+', r'WHERE\s+', r'INSERT\s+INTO'],
+         'json': [r'^\s*{\s*"', r'^\s*\[', r'"[^"]*":\s*'],
+         'yaml': [r'^\s*\w+:', r'^\s+-\s+', r'^\s* \w+:'],
+         'bash': [r'#!/bin/bash', r'echo\s+', r'export\s+\w+='],
+         'powershell': [r'Write-Host\s+', r'$\w+\s*=', r'Get-'],
+         'dockerfile': [r'FROM\s+\w+', r'RUN\s+', r'CMD\s+'],
+     }
+
+     # Common programming keywords
+     PROGRAMMING_KEYWORDS = [
+         'function', 'class', 'import', 'export', 'return', 'if', 'else', 'for', 'while',
+         'def', 'var', 'let', 'const', 'try', 'catch', 'throw', 'new', 'this', 'super'
+     ]
+
+     def __init__(self):
+         """Initialize code block handler."""
+         self.detected_languages: Set[str] = set()
+         self.urdu_comments = {
+             'python': '#',
+             'javascript': '//',
+             'java': '//',
+             'cpp': '//',
+             'c': '//',
+             'css': '/*',
+             'sql': '--',
+             'bash': '#',
+             'powershell': '#',
+         }
+
+     def detect_code_blocks(
+         self,
+         content: str,
+         source_format: str = "html"
+     ) -> List[CodeBlock]:
+         """
+         Detect all code blocks in content.
+
+         Args:
+             content: Content to analyze
+             source_format: Format type (html, markdown, etc.)
+
+         Returns:
+             List of detected code blocks
+         """
+         logger.info(
+             "Detecting code blocks",
+             content_length=len(content),
+             source_format=source_format
+         )
+
+         blocks = []
+
+         # Try each pattern type
+         for block_type, patterns in self.PATTERNS.items():
+             for pattern in patterns:
+                 matches = list(pattern.finditer(content))
+                 for match in matches:
+                     block = self._create_code_block(
+                         match, block_type, content
+                     )
+                     if block:
+                         blocks.append(block)
+
+         # Remove duplicates (blocks that overlap)
+         blocks = self._remove_overlapping_blocks(blocks)
+
+         # Detect language for each block; keep an explicit fence language
+         # rather than overwriting it with a heuristic guess
+         for block in blocks:
+             if not block.language:
+                 block.language = self._detect_language(block.content)
+
+         logger.info(
+             "Code blocks detected",
+             total_blocks=len(blocks),
+             languages=list(set(b.language for b in blocks if b.language)),
+             block_types=[b.block_type.value for b in blocks]
+         )
+
+         return blocks
+
166
+ def _create_code_block(
167
+ self,
168
+ match: re.Match,
169
+ block_type: CodeBlockType,
170
+ content: str
171
+ ) -> Optional[CodeBlock]:
172
+ """Create a CodeBlock object from a regex match."""
173
+ start_pos = match.start()
174
+ end_pos = match.end()
175
+ original_text = match.group(0)
176
+
177
+ if block_type in [CodeBlockType.MARKDOWN, CodeBlockType.FENCED]:
178
+ # Extract language from fence
179
+ language = match.group(1) if match.groups() and match.group(1) else None
180
+ code_content = match.group(2) if match.groups() and len(match.groups()) > 1 else ""
181
+ elif block_type == CodeBlockType.HTML_PRE:
182
+ # Extract from HTML pre/code structure
183
+ soup = BeautifulSoup(original_text, 'html.parser')
184
+ code_tag = soup.find('code')
185
+ if code_tag:
186
+ language = self._extract_language_from_classes(code_tag.get('class', []))
187
+ code_content = code_tag.get_text()
188
+ else:
189
+ language = None
190
+ code_content = original_text
191
+ elif block_type == CodeBlockType.HTML_INLINE:
192
+ # Inline code
193
+ soup = BeautifulSoup(original_text, 'html.parser')
194
+ code_content = soup.get_text()
195
+ language = None
196
+ else:
197
+ # Other types
198
+ code_content = original_text
199
+ language = None
200
+
201
+ if not code_content.strip():
202
+ return None
203
+
204
+ return CodeBlock(
205
+ block_type=block_type,
206
+ language=language,
207
+ content=code_content,
208
+ original_text=original_text,
209
+ start_position=start_pos,
210
+ end_position=end_pos,
211
+ attributes={'match_groups': match.groups()},
212
+ preserve_formatting=True,
213
+ add_urdu_comments=self._should_add_urdu_comments(code_content, language)
214
+ )
215
+
216
+ def _remove_overlapping_blocks(self, blocks: List[CodeBlock]) -> List[CodeBlock]:
217
+ """Remove overlapping code blocks."""
218
+ if not blocks:
219
+ return []
220
+
221
+ # Sort by start position
222
+ blocks.sort(key=lambda x: x.start_position)
223
+
224
+ filtered_blocks = []
225
+ last_end = -1
226
+
227
+ for block in blocks:
228
+ if block.start_position >= last_end:
229
+ filtered_blocks.append(block)
230
+ last_end = block.end_position
231
+
232
+ return filtered_blocks
233
+
+     def _detect_language(self, code_content: str) -> Optional[str]:
+         """Detect the programming language of code content."""
+         # Try language hints first
+         language = self._detect_language_from_hints(code_content)
+         if language:
+             return language
+
+         # Try pattern matching
+         language = self._detect_language_from_patterns(code_content)
+         if language:
+             return language
+
+         # Use pygments as a fallback
+         try:
+             lexer = guess_lexer(code_content)
+             if lexer:
+                 return lexer.name.lower()
+         except Exception:
+             # pygments raises ClassNotFound when it cannot guess a lexer
+             pass
+
+         return None
+
+     def _detect_language_from_hints(self, code_content: str) -> Optional[str]:
+         """Detect language from explicit hints."""
+         # Check for a shebang line
+         shebang_match = re.match(r'^#!\s*/.*(?:python|node|bash|perl|ruby|php)\s*', code_content, re.MULTILINE)
+         if shebang_match:
+             shebang = shebang_match.group()
+             if 'python' in shebang:
+                 return 'python'
+             elif 'node' in shebang:
+                 return 'javascript'
+             elif 'bash' in shebang:
+                 return 'bash'
+             elif 'perl' in shebang:
+                 return 'perl'
+             elif 'ruby' in shebang:
+                 return 'ruby'
+             elif 'php' in shebang:
+                 return 'php'
+
+         # Fallback: any other shebang is most likely a shell script
+         if code_content.strip().startswith('#!'):
+             return 'bash'
+
+         return None
+
+     def _detect_language_from_patterns(self, code_content: str) -> Optional[str]:
+         """Detect language using pattern matching."""
+         scores = {}
+
+         for language, patterns in self.LANGUAGE_PATTERNS.items():
+             score = 0
+             for pattern in patterns:
+                 matches = len(list(re.finditer(pattern, code_content, re.MULTILINE)))
+                 score += matches
+
+             if score > 0:
+                 scores[language] = score
+
+         if scores:
+             return max(scores.items(), key=lambda x: x[1])[0]
+
+         return None
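+     # Scoring illustration (counts are hypothetical): for a snippet containing
+     # "def foo():" and "import os", the Python patterns might match twice while
+     # the JavaScript patterns match nothing, giving scores == {'python': 2};
+     # max() then picks 'python'. Ties resolve to whichever entry max() sees first.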
+
+     def _extract_language_from_classes(self, classes: List[str]) -> Optional[str]:
+         """Extract language from CSS classes."""
+         for cls in classes:
+             if isinstance(cls, str):
+                 # Check for language- prefixed classes
+                 if cls.startswith('language-'):
+                     return cls[9:]
+                 # Check for known language classes
+                 if cls.lower() in ['python', 'javascript', 'java', 'cpp', 'c', 'html', 'css', 'sql', 'json']:
+                     return cls.lower()
+                 # Check for highlight.js classes
+                 if cls.startswith('hljs-'):
+                     lang = cls[5:]
+                     if lang != 'language':
+                         return lang
+
+         return None
+
+     def _should_add_urdu_comments(self, code_content: str, language: Optional[str]) -> bool:
+         """Determine if Urdu comments should be added."""
+         if not language or language not in self.urdu_comments:
+             return False
+
+         # Don't add comments to very short code blocks
+         if len(code_content.split('\n')) < 3:
+             return False
+
+         # Don't add if there are already comments in the target language
+         comment_char = self.urdu_comments[language]
+         if comment_char and comment_char in code_content:
+             # Check for non-English characters in comments
+             comment_pattern = re.compile(f'{re.escape(comment_char)}.*[^\x00-\x7F]+')
+             if comment_pattern.search(code_content):
+                 return False
+
+         return True
+
+     def add_urdu_comments(self, code_block: CodeBlock) -> str:
+         """
+         Add Urdu explanatory comments to a code block.
+
+         Args:
+             code_block: Code block to enhance
+
+         Returns:
+             Code block content with Urdu comments added
+         """
+         if not code_block.language or not code_block.add_urdu_comments:
+             return code_block.content
+
+         language = code_block.language
+         comment_char = self.urdu_comments[language]
+
+         lines = code_block.content.split('\n')
+         enhanced_lines = []
+
+         for line in lines:
+             enhanced_lines.append(line)
+
+             # Never add comments after lines that are already comments
+             if self._is_comment_line(line, language):
+                 continue
+
+             # Add an Urdu comment after function/class definitions
+             if re.search(r'^(def|function|class|interface)\s+\w+', line):
+                 # Extract the function/class name
+                 match = re.search(r'(def|function|class|interface)\s+(\w+)', line)
+                 if match:
+                     name = match.group(2)
+                     urdu_translation = self._translate_code_name(name)
+                     enhanced_lines.append(f"{comment_char} {urdu_translation}")
+
+             # Add a comment after important statements
+             elif re.search(r'\b(return|break|continue|pass)\b', line):
+                 urdu_comment = self._translate_statement(line.strip())
+                 if urdu_comment:
+                     enhanced_lines.append(f"{comment_char} {urdu_comment}")
+
+             # Add a comment after imports
+             elif re.match(r'^(import|from|include)\s+', line):
+                 urdu_comment = self._translate_import(line.strip())
+                 if urdu_comment:
+                     enhanced_lines.append(f"{comment_char} {urdu_comment}")
+
+         return '\n'.join(enhanced_lines)
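+     # Illustrative result for Python code (comment char '#'): an input line
+     #     def calculate(x):
+     # is followed by an inserted line
+     #     # حساب لگانا
+     # while lines that are already comments pass through unchanged.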
+
+     def _is_comment_line(self, line: str, language: str) -> bool:
+         """Check if a line is already a comment."""
+         comment_char = self.urdu_comments.get(language, '')
+         return bool(comment_char) and line.strip().startswith(comment_char)
+
+     def _translate_code_name(self, name: str) -> str:
+         """Translate a code identifier to Urdu."""
+         # Common translations
+         translations = {
+             'main': 'مین',
+             'init': 'ابتدائی',
+             'start': 'شروع',
+             'setup': 'سیٹ اپ',
+             'run': 'چلائیں',
+             'process': 'پروسیس کرنا',
+             'handle': 'ہینڈل کریں',
+             'update': 'اپڈیٹ کرنا',
+             'get': 'حاصل کریں',
+             'set': 'سیٹ کرنا',
+             'create': 'بنانا',
+             'delete': 'حذف کرنا',
+             'calculate': 'حساب لگانا',
+             'validate': 'تصدیق کرنا',
+             'convert': 'تبدیل کرنا',
+             'transform': 'تبدیل کرنا',
+             'parse': 'پارس کرنا',
+             'render': 'رینڈر کرنا',
+             'fetch': 'لانا',
+             'send': 'بھیجنا',
+             'receive': 'وصول کرنا',
+             'connect': 'ربط جوڑنا',
+             'close': 'بند کرنا',
+             'open': 'کھولنا',
+             'save': 'محفوظ کرنا',
+             'load': 'لوڈ کرنا',
+             'read': 'پڑھنا',
+             'write': 'لکھنا',
+         }
+
+         return translations.get(name, name)
+
+     def _translate_statement(self, statement: str) -> Optional[str]:
+         """Translate a code statement to Urdu."""
+         # Common statement translations
+         translations = {
+             'return': 'واپس کریں',
+             'break': 'روک جائیں',
+             'continue': 'جاری رکھیں',
+             'pass': 'چھوڑ دیں',
+             'yield': 'دیں',
+             'raise': 'اٹھائیں',
+             'try': 'کوشش کریں',
+             'except': 'استثنا',
+             'finally': 'آخر میں',
+             'assert': 'تصدیق کریں',
+             'del': 'حذف کریں',
+         }
+
+         # Extract the keyword
+         match = re.search(r'\b(' + '|'.join(translations.keys()) + r')\b', statement)
+         if match:
+             keyword = match.group(1)
+             translated = translations.get(keyword, keyword)
+             return statement.replace(keyword, translated, 1)
+
+         return None
+
+     def _translate_import(self, import_statement: str) -> Optional[str]:
+         """Translate an import statement to Urdu."""
+         if 'import ' in import_statement:
+             return 'لائبریری امپورٹ کریں'
+         elif 'from ' in import_statement:
+             return 'سے امپورٹ کریں'
+         elif 'include ' in import_statement:
+             return 'شامل کریں'
+
+         return None
+
+     def preserve_code_blocks(
+         self,
+         original_content: str,
+         translated_content: str,
+         code_blocks: List[CodeBlock]
+     ) -> str:
+         """
+         Preserve code blocks in translated content.
+
+         Args:
+             original_content: Original content with code blocks
+             translated_content: Translated content
+             code_blocks: Detected code blocks
+
+         Returns:
+             Content with the original code blocks preserved
+         """
+         logger.info(
+             "Preserving code blocks",
+             original_blocks=len(code_blocks)
+         )
+
+         # Replace translated code blocks with the originals
+         result = translated_content
+         blocks_preserved = 0
+
+         for block in code_blocks:
+             # Find and replace the corresponding block in the translated content.
+             # This is simplified - in practice, you'd want more precise matching.
+             translated_block_content = self._find_translated_block(
+                 result, block, original_content
+             )
+
+             if translated_block_content is not None:
+                 # Replace with the original
+                 result = result.replace(
+                     translated_block_content,
+                     block.original_text,
+                     1
+                 )
+                 blocks_preserved += 1
+
+             # Add Urdu comments if configured
+             if block.add_urdu_comments:
+                 enhanced_code = self.add_urdu_comments(block)
+                 result = result.replace(
+                     block.original_text,
+                     enhanced_code,
+                     1
+                 )
+
+         logger.info(
+             "Code blocks preserved",
+             blocks_preserved=blocks_preserved,
+             blocks_total=len(code_blocks)
+         )
+
+         return result
+
+     def _find_translated_block(
+         self,
+         content: str,
+         original_block: CodeBlock,
+         original_content: str
+     ) -> Optional[str]:
+         """Find the translated version of a code block."""
+         # This is a simplified implementation; in practice, blocks would be
+         # tracked more precisely during translation.
+
+         # Look for the block content in the translated content.
+         # This may not match exactly, since translation can alter the text.
+         if original_block.content in content:
+             return original_block.content
+
+         # Try to find the block by looking for unique lines
+         original_lines = original_block.content.split('\n')
+         if len(original_lines) > 3:
+             # Use the first and last lines as markers
+             first_line = original_lines[0]
+             last_line = original_lines[-1]
+
+             if first_line in content and last_line in content:
+                 # Extract the content between the markers
+                 start = content.find(first_line)
+                 end = content.rfind(last_line) + len(last_line)
+                 return content[start:end]
+
+         return None
+
+     def add_syntax_highlighting(
+         self,
+         code_block: CodeBlock,
+         theme: str = "default"
+     ) -> str:
+         """
+         Add syntax highlighting to a code block.
+
+         Args:
+             code_block: Code block to highlight
+             theme: Highlighting theme
+
+         Returns:
+             HTML with syntax highlighting
+         """
+         try:
+             lexer = get_lexer_by_name(code_block.language or 'text')
+             formatter = HtmlFormatter(
+                 style=theme,
+                 linenos=True,
+                 cssclass="highlight"
+             )
+             return highlight(code_block.content, lexer, formatter)
+         except Exception:
+             # Fall back to a plain, HTML-escaped code block
+             escaped = (
+                 code_block.content
+                 .replace('&', '&amp;')
+                 .replace('<', '&lt;')
+                 .replace('>', '&gt;')
+             )
+             return f'<pre><code>{escaped}</code></pre>'
+
+     def validate_code_blocks(
+         self,
+         code_blocks: List[CodeBlock],
+         content: str
+     ) -> Dict[str, Any]:
+         """
+         Validate detected code blocks.
+
+         Args:
+             code_blocks: Detected code blocks
+             content: Original content
+
+         Returns:
+             Validation report
+         """
+         report = {
+             'valid': True,
+             'warnings': [],
+             'errors': [],
+             'stats': {
+                 'total_blocks': len(code_blocks),
+                 'languages_detected': list(set(b.language for b in code_blocks if b.language)),
+                 'blocks_with_languages': len([b for b in code_blocks if b.language])
+             }
+         }
+
+         for block in code_blocks:
+             # Check for empty blocks
+             if not block.content.strip():
+                 report['warnings'].append(
+                     f"Empty code block at position {block.start_position}"
+                 )
+
+             # Check for very long blocks
+             if len(block.content) > 10000:
+                 report['warnings'].append(
+                     f"Very long code block ({len(block.content)} chars) at position {block.start_position}"
+                 )
+
+             # Check for potential formatting issues
+             if block.block_type == CodeBlockType.INDENTED and block.content.strip():
+                 report['warnings'].append(
+                     f"Indented code block detected at position {block.start_position} - might be unintentional"
+                 )
+
+         logger.info(
+             "Code block validation complete",
+             total_warnings=len(report['warnings']),
+             total_errors=len(report['errors'])
+         )
+
+         return report
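+     # A minimal usage sketch (the enclosing class name and the detection entry
+     # point are assumed here, not shown in this diff):
+     #
+     #     handler = CodeBlockHandler()
+     #     blocks = handler.detect_code_blocks(markdown_text)  # hypothetical entry point
+     #     report = handler.validate_code_blocks(blocks, markdown_text)
+     #     for warning in report['warnings']:
+     #         logger.warning("Code block issue", detail=warning)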
src/services/content_reconstructor.py ADDED
@@ -0,0 +1,471 @@
+ """
+ Content Reconstructor for Translation System.
+
+ This module reconstructs HTML content from parsed elements,
+ injecting translated text while preserving original formatting
+ and structure.
+ """
+
+ from typing import Dict, List, Optional, Any
+ from dataclasses import dataclass
+
+ import re
+
+ from bs4 import BeautifulSoup, Tag
+
+ from src.services.html_parser import ContentElement, ContentType
+ from src.utils.translation_logger import get_translation_logger
+
+ logger = get_translation_logger(__name__)
+
+
+ @dataclass
+ class ReconstructionConfig:
+     """Configuration for content reconstruction."""
+     preserve_classes: bool = True
+     preserve_ids: bool = True
+     preserve_data_attributes: bool = False
+     preserve_style: bool = True
+     add_translation_markers: bool = False
+     cleanup_empty_elements: bool = True
+
+
+ class ContentReconstructor:
+     """
+     Reconstructs HTML content from parsed elements with translations.
+
+     Features:
+     - Recursive HTML reconstruction
+     - Formatting preservation
+     - Code block protection
+     - Translation marker injection
+     - Structure validation
+     """
+
+     def __init__(self, config: Optional[ReconstructionConfig] = None):
+         """
+         Initialize content reconstructor.
+
+         Args:
+             config: Reconstruction configuration
+         """
+         self.config = config or ReconstructionConfig()
+         self.translation_markers = {
+             'translated': 'data-translated="true"',
+             'original': 'data-original="',
+             'preserve': 'data-preserve="true"'
+         }
+
+     def reconstruct_html(
+         self,
+         elements: List[ContentElement],
+         translated_map: Dict[str, str],
+         base_format: str = "html"
+     ) -> str:
+         """
+         Reconstruct HTML from parsed elements with translations.
+
+         Args:
+             elements: Parsed content elements
+             translated_map: Mapping of original text to translated text
+             base_format: Base format (html, markdown, etc.)
+
+         Returns:
+             Reconstructed HTML content
+         """
+         logger.info(
+             "Reconstructing HTML content",
+             elements_count=len(elements),
+             translations_count=len(translated_map),
+             base_format=base_format
+         )
+
+         # Create the base document
+         soup = BeautifulSoup("", "html.parser")
+         if base_format == "html":
+             body = soup.new_tag("body")
+             soup.append(body)
+
+         # Reconstruct elements
+         container = soup.body if soup.body else soup
+         for element in elements:
+             reconstructed = self._reconstruct_element(element, translated_map, soup)
+             if reconstructed:
+                 container.append(reconstructed)
+
+         # Post-processing
+         html_content = str(soup)
+
+         if self.config.cleanup_empty_elements:
+             html_content = self._cleanup_empty_elements(html_content)
+
+         logger.info(
+             "HTML reconstruction complete",
+             output_length=len(html_content)
+         )
+
+         return html_content
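+     # A minimal usage sketch (input values are illustrative):
+     #
+     #     reconstructor = ContentReconstructor()
+     #     elements = HTMLParser().parse_html("<p>Hello</p>")
+     #     html = reconstructor.reconstruct_html(elements, {"Hello": "ہیلو"})
+     #     # -> "<body><p>ہیلو</p></body>" (modulo whitespace cleanup)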
+
+     def _reconstruct_element(
+         self,
+         element: ContentElement,
+         translated_map: Dict[str, str],
+         soup: BeautifulSoup
+     ) -> Optional[Tag]:
+         """
+         Reconstruct a single element.
+
+         Args:
+             element: Content element to reconstruct
+             translated_map: Translation mapping
+             soup: BeautifulSoup document
+
+         Returns:
+             Reconstructed HTML tag
+         """
+         # Handle special content types
+         if element.element_type == ContentType.CODE:
+             return self._reconstruct_code_element(element, soup)
+         elif element.element_type == ContentType.IMAGE:
+             return self._reconstruct_image_element(element, soup)
+         elif element.element_type == ContentType.LINK:
+             return self._reconstruct_link_element(element, soup)
+         elif element.element_type == ContentType.METADATA:
+             return None  # Skip metadata
+
+         # Create the appropriate tag
+         tag = self._create_tag(element.element_type, soup, element)
+
+         # Add attributes
+         self._add_attributes(tag, element)
+
+         # Add content or children
+         if element.should_translate and element.element_type == ContentType.TEXT:
+             # Add translated text
+             translated_text = translated_map.get(element.content, element.content)
+             tag.string = translated_text
+
+             # Add translation markers if configured
+             if self.config.add_translation_markers:
+                 tag['data-translated'] = 'true'
+                 tag['data-original'] = element.content
+
+         elif element.children:
+             # Reconstruct children
+             for child in element.children:
+                 child_tag = self._reconstruct_element(child, translated_map, soup)
+                 if child_tag:
+                     tag.append(child_tag)
+
+         elif element.content:
+             # Add original content for non-translatable elements
+             tag.string = element.content
+             if element.element_type != ContentType.CODE:
+                 tag['data-preserve'] = 'true'
+
+         return tag
+
+     def _reconstruct_code_element(
+         self,
+         element: ContentElement,
+         soup: BeautifulSoup
+     ) -> Tag:
+         """Reconstruct a code element."""
+         # Determine whether it's inline or block code
+         is_inline = (
+             element.element_type == ContentType.INLINE_CODE or
+             not element.attributes.get('class', [])
+         )
+
+         if is_inline:
+             outer = tag = soup.new_tag("code")
+         else:
+             # Keep a reference to the outer <pre> so it is what gets returned;
+             # returning the inner <code> would detach it from its wrapper
+             outer = soup.new_tag("pre")
+             tag = soup.new_tag("code")
+             outer.append(tag)
+
+         # Add a language class if specified
+         if 'language' in element.attributes:
+             tag['class'] = f"language-{element.attributes['language']}"
+
+         # Add the original content untouched
+         tag.string = element.content
+         tag['data-preserve'] = 'true'
+
+         return outer
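+     # Illustrative outputs: an inline element becomes
+     #     <code data-preserve="true">x = 1</code>
+     # while a block element with attributes {'language': 'python'} becomes
+     #     <pre><code class="language-python" data-preserve="true">x = 1</code></pre>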
+
+     def _reconstruct_image_element(
+         self,
+         element: ContentElement,
+         soup: BeautifulSoup
+     ) -> Tag:
+         """Reconstruct an image element."""
+         tag = soup.new_tag("img")
+
+         # Add attributes
+         for attr, value in element.attributes.items():
+             if attr in ['src', 'alt', 'title', 'width', 'height', 'class', 'id']:
+                 tag[attr] = value
+
+         # Ensure essential attributes
+         if 'src' not in element.attributes and 'data-src' in element.attributes:
+             tag['src'] = element.attributes['data-src']
+
+         tag['data-preserve'] = 'true'
+         return tag
+
+     def _reconstruct_link_element(
+         self,
+         element: ContentElement,
+         soup: BeautifulSoup
+     ) -> Tag:
+         """Reconstruct a link element."""
+         tag = soup.new_tag("a")
+
+         # Add attributes
+         for attr, value in element.attributes.items():
+             if attr in ['href', 'title', 'target', 'class', 'id']:
+                 tag[attr] = value
+
+         # Add content (URLs are typically left untranslated)
+         tag.string = element.content
+         tag['data-preserve'] = 'true'
+
+         return tag
+
+     def _create_tag(self, element_type: ContentType, soup: BeautifulSoup, element=None) -> Tag:
+         """Create the appropriate HTML tag for an element type."""
+         tag_mapping = {
+             ContentType.TEXT: "p",
+             ContentType.HEADING: "p",  # Overridden below when a level attribute is present
+             ContentType.LIST: "ul",  # Default to an unordered list
+             ContentType.QUOTE: "blockquote",
+             ContentType.EMPHASIS: "em",
+             ContentType.STRONG: "strong",
+             ContentType.TABLE: "table",
+             ContentType.CODE: "code",
+         }
+
+         tag_name = tag_mapping.get(element_type, "div")
+
+         if element_type == ContentType.HEADING and element and 'level' in element.attributes:
+             level = element.attributes['level']
+             if isinstance(level, int) and 1 <= level <= 6:
+                 tag_name = f"h{level}"
+
+         return soup.new_tag(tag_name)
+
+     def _add_attributes(self, tag: Tag, element: ContentElement) -> None:
+         """Add attributes to a reconstructed tag."""
+         for attr, value in element.attributes.items():
+             # Skip internal attributes
+             if attr.startswith('_'):
+                 continue
+
+             # Skip content attributes
+             if attr in ['content', 'text']:
+                 continue
+
+             # Attribute filtering based on config
+             if attr == 'class' and not self.config.preserve_classes:
+                 continue
+             elif attr == 'id' and not self.config.preserve_ids:
+                 continue
+             elif attr.startswith('data-') and not self.config.preserve_data_attributes:
+                 continue
+             elif attr == 'style' and not self.config.preserve_style:
+                 continue
+
+             tag[attr] = value
+
+     def _cleanup_empty_elements(self, html: str) -> str:
+         """Remove empty elements from HTML."""
+         # Remove empty tags
+         html = re.sub(r'<([a-z]+)[^>]*>\s*</\1>', '', html)
+
+         # Collapse extra whitespace
+         html = re.sub(r'\s+', ' ', html)
+
+         # Clean up around tags
+         html = re.sub(r'>\s+<', '><', html)
+
+         return html.strip()
+
+     def inject_translated_text(
+         self,
+         html_content: str,
+         translated_segments: List[Dict[str, Any]]
+     ) -> str:
+         """
+         Inject translated text segments into HTML content.
+
+         Args:
+             html_content: Original HTML content
+             translated_segments: List of translated text segments with positions
+
+         Returns:
+             HTML content with translated text injected
+         """
+         logger.info(
+             "Injecting translated text",
+             segments_count=len(translated_segments)
+         )
+
+         # Sort segments by start offset, in reverse order so earlier
+         # replacements do not shift the indices of later ones
+         segments = sorted(translated_segments, key=lambda x: x.get('start', 0), reverse=True)
+
+         result = html_content
+         for segment in segments:
+             start = segment.get('start', 0)
+             end = segment.get('end', len(result))
+             translated_text = segment.get('translated_text', '')
+
+             # Replace the segment
+             result = result[:start] + translated_text + result[end:]
+
+         return result
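+     # Segment example (offsets are illustrative): with html_content
+     # "<p>Hello</p>", the segment
+     #     {'start': 3, 'end': 8, 'translated_text': 'ہیلو'}
+     # replaces characters 3..8 ("Hello"), yielding "<p>ہیلو</p>".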
+     def create_translation_markers(
+         self,
+         elements: List[ContentElement]
+     ) -> List[Dict[str, Any]]:
+         """
+         Create marker positions for text segments to be translated.
+
+         Args:
+             elements: Parsed content elements
+
+         Returns:
+             List of marker positions
+         """
+         markers = []
+         current_position = 0
+
+         for element in elements:
+             if element.should_translate and element.element_type == ContentType.TEXT:
+                 text = element.content
+                 if text.strip():
+                     markers.append({
+                         'start': current_position,
+                         'end': current_position + len(text),
+                         'original_text': text,
+                         'element_id': id(element)
+                     })
+                 current_position += len(text)
+
+         logger.info(
+             "Created translation markers",
+             markers_count=len(markers),
+             text_length=current_position
+         )
+
+         return markers
+
+     def validate_reconstruction(
+         self,
+         original_html: str,
+         reconstructed_html: str,
+         original_elements: List[ContentElement],
+         reconstructed_elements: List[ContentElement]
+     ) -> Dict[str, Any]:
+         """
+         Validate the reconstruction process.
+
+         Args:
+             original_html: Original HTML content
+             reconstructed_html: Reconstructed HTML content
+             original_elements: Original parsed elements
+             reconstructed_elements: Reconstructed elements
+
+         Returns:
+             Validation report
+         """
+         report = {
+             'is_valid': True,
+             'errors': [],
+             'warnings': [],
+             'stats': {
+                 'original_length': len(original_html),
+                 'reconstructed_length': len(reconstructed_html),
+                 'original_elements': len(original_elements),
+                 'reconstructed_elements': len(reconstructed_elements)
+             }
+         }
+
+         # Check element counts
+         original_types = self._count_elements_by_type(original_elements)
+         reconstructed_types = self._count_elements_by_type(reconstructed_elements)
+
+         for element_type, count in original_types.items():
+             reconstructed_count = reconstructed_types.get(element_type, 0)
+             if count != reconstructed_count:
+                 report['errors'].append(
+                     f"Element count mismatch for {element_type.value}: "
+                     f"original={count}, reconstructed={reconstructed_count}"
+                 )
+                 report['is_valid'] = False
+
+         # Check code block preservation
+         original_code = len([e for e in original_elements if e.element_type == ContentType.CODE])
+         reconstructed_code = len([e for e in reconstructed_elements if e.element_type == ContentType.CODE])
+
+         if original_code != reconstructed_code:
+             report['errors'].append(
+                 f"Code blocks not preserved: original={original_code}, reconstructed={reconstructed_code}"
+             )
+             report['is_valid'] = False
+
+         # Check for preserved attributes
+         preserved_attributes = self._check_preserved_attributes(
+             original_elements,
+             reconstructed_elements
+         )
+         if not preserved_attributes['all_preserved']:
+             report['warnings'].extend(preserved_attributes['missing_attributes'])
+
+         logger.info(
+             "Reconstruction validation complete",
+             is_valid=report['is_valid'],
+             errors_count=len(report['errors']),
+             warnings_count=len(report['warnings'])
+         )
+
+         return report
+
+     def _count_elements_by_type(self, elements: List[ContentElement]) -> Dict[ContentType, int]:
+         """Count elements by type."""
+         counts = {}
+         for element in elements:
+             counts[element.element_type] = counts.get(element.element_type, 0) + 1
+         return counts
+
+     def _check_preserved_attributes(
+         self,
+         original_elements: List[ContentElement],
+         reconstructed_elements: List[ContentElement]
+     ) -> Dict[str, Any]:
+         """Check if important attributes are preserved."""
+         result = {
+             'all_preserved': True,
+             'missing_attributes': []
+         }
+
+         important_attrs = ['id', 'class', 'href', 'src', 'alt']
+
+         # This is a simplified check - in practice, you'd want a more
+         # sophisticated element-by-element comparison
+         for orig_elem in original_elements:
+             for attr in important_attrs:
+                 if attr in orig_elem.attributes:
+                     result['missing_attributes'].append(
+                         f"Attribute '{attr}' may not be preserved in element {orig_elem.element_type.value}"
+                     )
+
+         if result['missing_attributes']:
+             result['all_preserved'] = False
+
+         return result
src/services/html_parser.py ADDED
@@ -0,0 +1,565 @@
+ """
+ HTML Parser for Translation Formatting Preservation.
+
+ This module parses HTML content to extract structure, identify
+ different content types, and prepare for translation while preserving
+ formatting.
+ """
+
+ import re
+ from typing import Dict, List, Optional, Any
+ from dataclasses import dataclass
+ from enum import Enum
+
+ from bs4 import BeautifulSoup, Tag, NavigableString
+ import markdown
+
+ from src.utils.translation_logger import get_translation_logger
+
+ logger = get_translation_logger(__name__)
+
+
+ class ContentType(Enum):
+     """Content types for translation handling."""
+     TEXT = "text"
+     CODE = "code"
+     HEADING = "heading"
+     LIST = "list"
+     LINK = "link"
+     IMAGE = "image"
+     TABLE = "table"
+     QUOTE = "quote"
+     EMPHASIS = "emphasis"
+     STRONG = "strong"
+     INLINE_CODE = "inline_code"
+     MATH = "math"
+     METADATA = "metadata"
+
+
+ @dataclass
+ class ContentElement:
+     """Represents a parsed content element."""
+     element_type: ContentType
+     content: str
+     attributes: Dict[str, Any]
+     children: List['ContentElement']
+     parent: Optional['ContentElement'] = None
+     should_translate: bool = True
+     preserve_formatting: bool = True
+     position: int = 0
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to a dictionary for serialization."""
+         return {
+             "type": self.element_type.value,
+             "content": self.content,
+             "attributes": self.attributes,
+             "children": [child.to_dict() for child in self.children],
+             "should_translate": self.should_translate,
+             "preserve_formatting": self.preserve_formatting,
+             "position": self.position
+         }
+
+
+ class HTMLParser:
+     """
+     HTML parser for translation with formatting preservation.
+
+     Features:
+     - Recursive HTML parsing
+     - Content type identification
+     - Code block detection and preservation
+     - Formatting marker injection
+     - Structure reconstruction support
+     """
+
+     # Code block patterns
+     CODE_BLOCK_PATTERNS = [
+         re.compile(r'```(\w+)?\n(.*?)\n```', re.DOTALL),  # Markdown code blocks
+         re.compile(r'<pre><code[^>]*>(.*?)</code></pre>', re.DOTALL | re.IGNORECASE),  # HTML pre/code blocks
+         re.compile(r'<code[^>]*>(.*?)</code>', re.DOTALL | re.IGNORECASE),  # Inline code
+     ]
+
+     # Special tags that should not be translated
+     NON_TRANSLATABLE_TAGS = {
+         'script', 'style', 'noscript', 'iframe', 'object', 'embed',
+         'svg', 'math', 'canvas', 'video', 'audio'
+     }
+
+     # Tags that preserve inner structure
+     STRUCTURE_PRESERVING_TAGS = {
+         'pre', 'code', 'kbd', 'samp', 'var'
+     }
+
+     # Formatting tags
+     FORMATTING_TAGS = {
+         'em', 'i', 'strong', 'b', 'mark', 'small', 'del', 'ins',
+         'sub', 'sup', 'u', 'tt'
+     }
+
+     def __init__(self):
+         """Initialize HTML parser."""
+         self.position_counter = 0
+         self.translation_markers = {
+             'start': '{{TRANSLATE_START}}',
+             'end': '{{TRANSLATE_END}}',
+             'skip': '{{SKIP_TRANSLATION}}'
+         }
+
+     def parse_html(
+         self,
+         html_content: str,
+         source_format: str = "html"
+     ) -> List[ContentElement]:
+         """
+         Parse HTML content into structured elements.
+
+         Args:
+             html_content: HTML content to parse
+             source_format: Format type (html, markdown, etc.)
+
+         Returns:
+             List of parsed content elements
+         """
+         logger.info(
+             "Parsing HTML content",
+             content_length=len(html_content),
+             source_format=source_format
+         )
+
+         # Convert markdown to HTML if needed
+         if source_format == "markdown":
+             html_content = markdown.markdown(
+                 html_content,
+                 extensions=['codehilite', 'tables', 'toc']
+             )
+
+         # Parse with BeautifulSoup
+         soup = BeautifulSoup(html_content, 'html.parser')
+
+         # Extract and parse elements
+         elements = []
+         self.position_counter = 0
+
+         for child in (soup.body.children if soup.body else soup.children):
+             element = self._parse_node(child)
+             if element:
+                 elements.append(element)
+
+         logger.info(
+             "HTML parsing complete",
+             elements_count=len(elements),
+             translate_elements=len([e for e in self._flatten_elements(elements) if e.should_translate])
+         )
+
+         return elements
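+     # A minimal usage sketch:
+     #
+     #     parser = HTMLParser()
+     #     elements = parser.parse_html("# Title\nSome text", source_format="markdown")
+     #     report = parser.generate_structure_report(elements)
+     #     print(report["total_elements"], report["headings"])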
+
+     def _parse_node(self, node) -> Optional[ContentElement]:
+         """
+         Parse a BeautifulSoup node into a content element.
+
+         Args:
+             node: BeautifulSoup node
+
+         Returns:
+             Parsed content element or None
+         """
+         if isinstance(node, NavigableString):
+             # Handle text content
+             text = str(node).strip()
+             if text:
+                 return ContentElement(
+                     element_type=ContentType.TEXT,
+                     content=text,
+                     attributes={},
+                     children=[],
+                     should_translate=True,
+                     preserve_formatting=False,
+                     position=self.position_counter
+                 )
+             return None
+
+         elif isinstance(node, Tag):
+             tag_name = node.name.lower()
+             attributes = dict(node.attrs)
+
+             # Determine the content type
+             element_type = self._determine_content_type(node, tag_name)
+
+             # Check whether the content should be translated
+             should_translate = self._should_translate_content(node, tag_name)
+
+             # Parse children
+             children = []
+             for child in node.children:
+                 child_element = self._parse_node(child)
+                 if child_element:
+                     child_element.parent = node  # type: ignore
+                     children.append(child_element)
+
+             # Create the element
+             element = ContentElement(
+                 element_type=element_type,
+                 content=node.get_text(strip=True) if should_translate else "",
+                 attributes=attributes,
+                 children=children,
+                 should_translate=should_translate,
+                 preserve_formatting=self._should_preserve_formatting(tag_name),
+                 position=self.position_counter
+             )
+
+             self.position_counter += 1
+             return element
+
+         return None
+
+     def _determine_content_type(self, node: Tag, tag_name: str) -> ContentType:
+         """Determine the content type of a node."""
+         # Code blocks
+         if tag_name == 'pre' or self._has_code_class(node):
+             return ContentType.CODE
+
+         # Block vs inline <code>
+         elif tag_name == 'code':
+             return ContentType.CODE if self._is_block_code(node) else ContentType.INLINE_CODE
+
+         # Headings
+         elif tag_name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
+             return ContentType.HEADING
+
+         # Lists
+         elif tag_name in ['ul', 'ol', 'li', 'dl', 'dt', 'dd']:
+             return ContentType.LIST
+
+         # Links
+         elif tag_name == 'a':
+             return ContentType.LINK
+
+         # Images
+         elif tag_name == 'img':
+             return ContentType.IMAGE
+
+         # Tables
+         elif tag_name in ['table', 'thead', 'tbody', 'tr', 'td', 'th']:
+             return ContentType.TABLE
+
+         # Quotes
+         elif tag_name in ['blockquote', 'q']:
+             return ContentType.QUOTE
+
+         # Inline formatting
+         elif tag_name in self.FORMATTING_TAGS:
+             if tag_name in ['em', 'i']:
+                 return ContentType.EMPHASIS
+             elif tag_name in ['strong', 'b']:
+                 return ContentType.STRONG
+             # Other formatting tags (mark, small, del, ins, ...) fall back to
+             # text; without this the branch would implicitly return None
+             return ContentType.TEXT
+
+         # Math
+         elif tag_name in ['math', 'mrow', 'mfrac', 'msqrt', 'mroot']:
+             return ContentType.MATH
+
+         # Metadata
+         elif tag_name in ['meta', 'title', 'head', 'style', 'script']:
+             return ContentType.METADATA
+
+         # Default to text
+         else:
+             return ContentType.TEXT
+
+     def _should_translate_content(self, node: Tag, tag_name: str) -> bool:
+         """Determine if content should be translated."""
+         # Don't translate non-translatable tags
+         if tag_name in self.NON_TRANSLATABLE_TAGS:
+             return False
+
+         # Don't translate code blocks
+         if tag_name == 'code' and (node.parent and node.parent.name == 'pre'):
+             return False
+
+         if tag_name == 'pre':
+             return False
+
+         # Don't translate if a class indicates code
+         if self._has_code_class(node):
+             return False
+
+         # Don't translate image alt text that's purely technical
+         if tag_name == 'img' and self._is_technical_alt_text(node.get('alt', '')):
+             return False
+
+         return True
+
+     def _should_preserve_formatting(self, tag_name: str) -> bool:
+         """Check if formatting should be preserved."""
+         return tag_name in (self.STRUCTURE_PRESERVING_TAGS | self.FORMATTING_TAGS)
+
+     def _has_code_class(self, node: Tag) -> bool:
+         """Check if a node has code-related classes."""
+         classes = node.get('class', [])
+         if isinstance(classes, str):
+             classes = [classes]
+
+         code_indicators = [
+             'language-', 'highlight', 'code', 'hljs', 'chroma',
+             'source-code', 'pre', 'verbatim', 'literal'
+         ]
+
+         return any(
+             any(indicator in cls for indicator in code_indicators)
+             for cls in classes
+         )
+
+     def _is_block_code(self, node: Tag) -> bool:
+         """Check if a code element is block-level code."""
+         return (
+             node.name == 'code' and
+             node.parent is not None and
+             node.parent.name == 'pre'
+         )
+
+     def _is_technical_alt_text(self, alt_text: str) -> bool:
+         """Check if alt text is purely technical."""
+         technical_indicators = [
+             'diagram', 'chart', 'graph', 'formula', 'equation',
+             'algorithm', 'flowchart', 'schema', 'architecture'
+         ]
+
+         return any(indicator in alt_text.lower() for indicator in technical_indicators)
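+     # Classification examples: <h2> -> HEADING, <code> inside <pre> -> CODE,
+     # a bare inline <code> -> INLINE_CODE, <em> -> EMPHASIS, and a tag with
+     # class="highlight" -> CODE via the class check above.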
+
+     def _flatten_elements(self, elements: List[ContentElement]) -> List[ContentElement]:
+         """Flatten nested elements into a single list."""
+         flattened = []
+         for element in elements:
+             flattened.append(element)
+             flattened.extend(self._flatten_elements(element.children))
+         return flattened
+
+     def extract_translatable_text(self, elements: List[ContentElement]) -> str:
+         """
+         Extract only the translatable text content from elements.
+
+         Args:
+             elements: Parsed content elements
+
+         Returns:
+             Concatenated translatable text
+         """
+         translatable_parts = []
+
+         for element in self._flatten_elements(elements):
+             if element.should_translate and element.element_type != ContentType.CODE:
+                 if element.element_type == ContentType.TEXT:
+                     # Trailing newline keeps adjacent text nodes from running together
+                     translatable_parts.append(element.content + '\n')
+                 else:
+                     # Add extra spacing before block elements
+                     if element.element_type == ContentType.HEADING:
+                         translatable_parts.append('\n')
+
+         return ''.join(translatable_parts).strip()
+
+     def inject_translation_markers(
+         self,
+         elements: List[ContentElement],
+         translated_text: str
+     ) -> List[ContentElement]:
+         """
+         Inject translation markers into elements for reconstruction.
+
+         Args:
+             elements: Original parsed elements
+             translated_text: Translated text content
+
+         Returns:
+             Elements with markers injected
+         """
+         # This is a simplified version - in practice, you'd want a more
+         # sophisticated mapping of translated text back to elements
+         translatable_elements = [
+             e for e in self._flatten_elements(elements)
+             if e.should_translate and e.element_type != ContentType.CODE
+         ]
+
+         if translatable_elements:
+             # Inject markers around the whole content
+             first = translatable_elements[0]
+             last = translatable_elements[-1]
+
+             # Add the start marker
+             first.attributes['_translation_start'] = True
+
+             # Add the end marker
+             last.attributes['_translation_end'] = True
+
+         return elements
+
+     def extract_code_blocks(self, html_content: str) -> List[Dict[str, Any]]:
+         """
+         Extract code blocks from HTML content.
+
+         Args:
+             html_content: HTML content to parse
+
+         Returns:
+             List of code block information
+         """
+         code_blocks = []
+         soup = BeautifulSoup(html_content, 'html.parser')
+
+         # Find all block-level code. Iterating over <pre> only avoids
+         # recording the same block twice (once for the <pre>, once for its
+         # inner <code>).
+         for pre_element in soup.find_all('pre'):
+             # The language class usually sits on the inner <code>, if present
+             code_element = pre_element.find('code') or pre_element
+
+             language = None
+             classes = code_element.get('class', []) or []
+
+             # Extract the language from classes
+             for cls in classes:
+                 if isinstance(cls, str):
+                     if cls.startswith('language-'):
+                         language = cls[9:]
+                     elif cls in ['python', 'javascript', 'java', 'cpp', 'html', 'css', 'sql']:
+                         language = cls
+
+             code_content = pre_element.get_text()
+             code_html = str(pre_element)
+
+             code_blocks.append({
+                 'language': language or 'text',
+                 'content': code_content,
+                 'html': code_html,
+                 'position': html_content.find(code_html)
+             })
+
+         logger.info(
+             "Code blocks extracted",
+             total_blocks=len(code_blocks),
+             languages=[cb['language'] for cb in code_blocks]
+         )
+
+         return code_blocks
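+     # Example return value for '<pre><code class="language-python">x = 1</code></pre>':
+     #     [{'language': 'python', 'content': 'x = 1',
+     #       'html': '<pre><code class="language-python">x = 1</code></pre>',
+     #       'position': 0}]
+     # (the exact 'html' string depends on how BeautifulSoup re-serializes the node)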
+
+     def preserve_code_blocks(
+         self,
+         html_content: str,
+         translated_content: str
+     ) -> str:
+         """
+         Preserve code blocks in translated content.
+
+         Args:
+             html_content: Original HTML with code blocks
+             translated_content: Translated HTML (code blocks might be altered)
+
+         Returns:
+             HTML with the original code blocks preserved
+         """
+         # Extract code blocks from both versions and swap each (possibly
+         # altered) translated block for its original counterpart, by order
+         original_blocks = self.extract_code_blocks(html_content)
+         translated_blocks = self.extract_code_blocks(translated_content)
+
+         result = translated_content
+         for original, translated in zip(original_blocks, translated_blocks):
+             result = result.replace(translated['html'], original['html'], 1)
+
+         logger.info(
+             "Code blocks preserved",
+             blocks_count=len(original_blocks)
+         )
+
+         return result
+
+     def validate_structure(
+         self,
+         original_elements: List[ContentElement],
+         translated_elements: List[ContentElement]
+     ) -> List[str]:
+         """
+         Validate that structure is preserved between the original and the translation.
+
+         Args:
+             original_elements: Original parsed elements
+             translated_elements: Translated parsed elements
+
+         Returns:
+             List of validation errors
+         """
+         errors = []
+
+         # Compare structure counts
+         original_types = self._count_element_types(original_elements)
+         translated_types = self._count_element_types(translated_elements)
+
+         for element_type, count in original_types.items():
+             if element_type != ContentType.TEXT:  # Text counts may differ
+                 translated_count = translated_types.get(element_type, 0)
+                 if count != translated_count:
+                     errors.append(
+                         f"Element count mismatch for {element_type.value}: "
+                         f"original={count}, translated={translated_count}"
+                     )
+
+         # Check that code blocks are preserved
+         original_code_blocks = len([
+             e for e in self._flatten_elements(original_elements)
+             if e.element_type == ContentType.CODE
+         ])
+         translated_code_blocks = len([
+             e for e in self._flatten_elements(translated_elements)
+             if e.element_type == ContentType.CODE
+         ])
+
+         if original_code_blocks != translated_code_blocks:
+             errors.append(
+                 f"Code block count mismatch: "
+                 f"original={original_code_blocks}, translated={translated_code_blocks}"
+             )
+
+         logger.info(
+             "Structure validation complete",
+             errors_count=len(errors),
+             element_types_matched=len(set(original_types.keys()) & set(translated_types.keys()))
+         )
+
+         return errors
+
+     def _count_element_types(self, elements: List[ContentElement]) -> Dict[ContentType, int]:
+         """Count occurrences of each element type."""
+         counts = {}
+         for element in self._flatten_elements(elements):
+             counts[element.element_type] = counts.get(element.element_type, 0) + 1
+         return counts
+
+     def generate_structure_report(
+         self,
+         elements: List[ContentElement]
+     ) -> Dict[str, Any]:
+         """
+         Generate a report of the content structure.
+
+         Args:
+             elements: Parsed content elements
+
+         Returns:
+             Structure report
+         """
+         flattened = self._flatten_elements(elements)
+         type_counts = self._count_element_types(elements)
+
+         report = {
+             "total_elements": len(flattened),
+             "element_types": {
+                 type_name.value: count
+                 for type_name, count in type_counts.items()
+             },
+             "translatable_elements": len([e for e in flattened if e.should_translate]),
+             "code_blocks": type_counts.get(ContentType.CODE, 0),
+             "headings": type_counts.get(ContentType.HEADING, 0),
+             "lists": type_counts.get(ContentType.LIST, 0),
+             "links": type_counts.get(ContentType.LINK, 0),
+             "images": type_counts.get(ContentType.IMAGE, 0),
+             "tables": type_counts.get(ContentType.TABLE, 0)
+         }
+
+         return report
src/services/openai_translation/__init__.py ADDED
@@ -0,0 +1,10 @@
+ """
+ OpenAI Translation Service with Gemini API.
+
+ This package provides translation services using the OpenAI Agents SDK
+ with the Gemini API for high-quality English to Urdu translation.
+ """
+
+ from .service import OpenAITranslationService
+
+ __all__ = ["OpenAITranslationService"]
src/services/openai_translation/client.py ADDED
@@ -0,0 +1,59 @@
+ """
+ OpenAI Agents SDK Client for the Gemini API.
+ """
+
+ import os
+
+ # AsyncOpenAI lives in the openai package; the agents package provides the
+ # chat-completions model wrapper
+ from openai import AsyncOpenAI
+ from agents import OpenAIChatCompletionsModel
+
+
+ class GeminiOpenAIClient:
+     """OpenAI Agents SDK client for the Gemini API."""
+
+     def __init__(self):
+         """Initialize the Gemini OpenAI client."""
+         api_key = os.getenv("GEMINI_API_KEY")
+         if not api_key:
+             raise ValueError("GEMINI_API_KEY not configured")
+
+         # Initialize an AsyncOpenAI client pointed at Gemini's
+         # OpenAI-compatible endpoint
+         self.provider = AsyncOpenAI(
+             base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
+             api_key=api_key,
+         )
+
+         # Define the chat completions model backed by Gemini
+         self.model = OpenAIChatCompletionsModel(
+             openai_client=self.provider,
+             model="gemini-2.0-flash-lite",
+         )
+
+     def get_provider(self) -> AsyncOpenAI:
+         """Get the AsyncOpenAI provider."""
+         return self.provider
+
+     def get_client(self) -> AsyncOpenAI:
+         """Get the AsyncOpenAI client (alias for get_provider)."""
+         return self.provider
+
+     def get_model(self) -> OpenAIChatCompletionsModel:
+         """Get the OpenAI chat completions model."""
+         return self.model
+
+     async def test_connection(self) -> bool:
+         """Test the connection to the Gemini API."""
+         try:
+             # Try a minimal completion request
+             await self.provider.chat.completions.create(
+                 model="gemini-2.0-flash-lite",
+                 messages=[{"role": "user", "content": "test"}],
+                 max_tokens=1
+             )
+             return True
+         except Exception as e:
+             print(f"Connection test failed: {str(e)}")
+             return False
+
+
+ def get_gemini_client() -> GeminiOpenAIClient:
+     """Get a Gemini client instance."""
+     return GeminiOpenAIClient()
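+ # A minimal connectivity check (assumes GEMINI_API_KEY is set in the environment):
+ #
+ #     import asyncio
+ #
+ #     client = get_gemini_client()
+ #     print("Gemini reachable:", asyncio.run(client.test_connection()))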
src/services/openai_translation/openai_agent.py ADDED
@@ -0,0 +1,533 @@
+ """
+ OpenAI Agents SDK Implementation for Translation.
+
+ This module implements translation using the OpenAI Agents SDK
+ with Gemini API integration, including error handling for rate limits.
+ """
+
+ import asyncio
+ from typing import Dict, List, Optional, Any, AsyncGenerator
+ from dataclasses import dataclass
+ import time
+ import json
+
+ from agents import Agent, Runner, function_tool, RunContextWrapper
+ # OpenAI's 429 error type, aliased so it does not clash with the project's
+ # own RateLimitError imported below
+ from openai import RateLimitError as OpenAIRateLimitError
+ from src.services.openai_translation.client import GeminiOpenAIClient, get_gemini_client
+ from src.models.translation_openai import TranslationJob, TranslationChunk
+ from src.utils.translation_logger import get_translation_logger
+ from src.utils.translation_errors import (
+     TranslationError, RateLimitError, APIError,
+     retry_with_exponential_backoff, handle_api_error
+ )
+
+ logger = get_translation_logger(__name__)
+
+
+ @dataclass
+ class TranslationContext:
+     """Context information for translation."""
+     page_url: Optional[str] = None
+     page_title: Optional[str] = None
+     document_type: Optional[str] = None  # book, article, documentation, etc.
+     technical_domain: Optional[str] = None  # AI, robotics, programming, etc.
+     target_audience: Optional[str] = None  # students, professionals, general
+     previous_translations: Optional[List[str]] = None
+     glossary: Optional[Dict[str, str]] = None
+     chunk_index: Optional[int] = None
+     total_chunks: Optional[int] = None
+
+
+ class OpenAITranslationAgent:
+     """
+     OpenAI Agents SDK-based translation agent with rate-limit-aware error handling.
+
+     Uses the OpenAI Agents SDK with the Gemini API for context-aware
+     translation backed by specialized tools.
+     """
+
+     def __init__(
+         self,
+         gemini_client: GeminiOpenAIClient,
+         model: str = "gemini-2.0-flash-lite"
+     ):
+         """
+         Initialize the translation agent.
+
+         Args:
+             gemini_client: Configured Gemini OpenAI client
+             model: Model name used for translation
+         """
+         self.client = gemini_client
+         self.model = model
+         self.agent = self._create_agent()
+
+         logger.info(
+             "OpenAI Translation Agent initialized",
+             model=model
+         )
+
+     def _create_agent(self) -> Agent:
+         """Create the translation agent with tools and error handling guidance."""
+         instructions = """
+         You are a professional translator specializing in technical content translation from English to Urdu.
+
+         Your primary task is to translate English content to Urdu while:
+         1. Maintaining technical accuracy
+         2. Using appropriate Urdu terminology
+         3. Preserving code blocks and technical identifiers
+         4. Providing contextually appropriate translations
+         5. Using Urdu script (Nastaleeq) for all Urdu text
+
+         Key Translation Guidelines:
+         - Translate ALL content unless explicitly marked as code
+         - Use Urdu script for all translations
+         - For technical terms, use established Urdu translations where available
+         - For brand new terms, create appropriate Urdu equivalents
+         - Maintain the original document structure and formatting
+         - Code blocks remain in English, but add Urdu comments if helpful
+
+         Technical Term Examples:
+         - AI → مصنوعی ذہانت
+         - Machine Learning → مشین لرننگ
+         - Robotics → روبوٹکس
+         - Computer Vision → کمپیوٹر ویژن
+         - Neural Network → نیورل نیٹ ورک
+         - Algorithm → الگورتھم
+
+         Error Handling:
+         - If you encounter rate limiting errors, wait and retry automatically
+         - If translation fails for a chunk, note the error and continue
+         - Always provide meaningful error messages
+
+         Always strive for natural, fluent Urdu that accurately conveys the technical meaning.
+         """
+
+         return Agent(
+             name="UrduTechnicalTranslator",
+             instructions=instructions,
+             # Route through the Gemini-backed model object rather than a bare
+             # model-name string, so requests go to the custom client instead
+             # of the default OpenAI provider
+             model=self.client.get_model(),
+             tools=[
+                 self._create_translate_tool(),
+                 self._create_analyze_code_tool(),
+                 self._create_glossary_tool(),
+                 self._create_context_tool()
+             ]
+         )
+
+     async def _handle_rate_limit_error(self, error: Exception) -> None:
+         """
+         Handle rate limit errors with a backoff sleep.
+
+         Args:
+             error: The rate limit error
+         """
+         if isinstance(error, OpenAIRateLimitError):
+             retry_after = getattr(error, 'retry_after', 1) or 1
+             logger.warning(
+                 "Rate limit hit, implementing backoff",
+                 retry_after=retry_after
+             )
+             await asyncio.sleep(retry_after)
+
+         # Handle a raw HTTP 429 from the underlying client
+         elif hasattr(error, 'status_code') and error.status_code == 429:
+             retry_after = 1
+             if hasattr(error, 'response') and error.response:
+                 try:
+                     error_data = error.response.json()
+                     retry_after = error_data.get('retry_after', retry_after)
+                 except Exception:
+                     pass
+
+             logger.warning(
+                 "HTTP 429 rate limit hit",
+                 retry_after=retry_after
+             )
+             await asyncio.sleep(retry_after)
+
+         # Fall back to any retry_after hint carried by a custom error type
+         elif hasattr(error, 'retry_after'):
+             await asyncio.sleep(getattr(error, 'retry_after', None) or 1)
+
+     async def translate_with_agent(
+         self,
+         text: str,
+         context: Optional[TranslationContext] = None
+     ) -> Dict[str, Any]:
+         """
+         Translate text using the OpenAI Agents SDK with rate-limit-aware retries.
+
+         Args:
+             text: Text to translate
+             context: Translation context
+
+         Returns:
+             Translation result with metadata
+         """
+         logger.info(
+             "Starting translation with OpenAI Agents SDK",
+             text_length=len(text),
+             has_context=bool(context)
+         )
+
+         # Prepare the context prompt
+         context_info = ""
+         if context:
+             if context.technical_domain:
+                 context_info += f"\nDomain: {context.technical_domain}"
+             if context.document_type:
+                 context_info += f"\nDocument Type: {context.document_type}"
+             if context.target_audience:
+                 context_info += f"\nTarget Audience: {context.target_audience}"
+             if context.chunk_index is not None:
+                 context_info += f"\nChunk: {context.chunk_index + 1} of {context.total_chunks or '?'}"
+
+         # Create the translation prompt
+         prompt = f"""
+         Translate the following English text to Urdu:
+
+         {context_info}
+
+         Text:
+         {text}
+
+         Requirements:
+         - Use Urdu script (Nastaleeq)
+         - Translate all non-code content
+         - Preserve formatting and structure
+         - Use appropriate technical terminology
+         - Maintain consistency with previous translations
+         """
+
+         try:
+             # Execute with retry logic around rate limits
+             max_retries = 3
+             for attempt in range(max_retries):
+                 try:
+                     # Runner.run is a classmethod in the Agents SDK; the agent
+                     # is passed as the first argument rather than constructing
+                     # a Runner instance
+                     result = await Runner.run(self.agent, prompt)
+
+                     # Extract metadata
+                     tokens_used = 0
+                     if hasattr(result, 'usage') and result.usage:
+                         tokens_used = result.usage.total_tokens
+
+                     return {
+                         "translated_text": result.final_output.strip(),
+                         "original_text": text,
+                         "tokens_used": tokens_used,
+                         "model": self.model,
+                         "confidence_score": 0.9,  # Placeholder
+                         "attempt": attempt + 1,
+                         "context": context_info
+                     }
+
+                 except (OpenAIRateLimitError, RateLimitError) as e:
+                     if attempt < max_retries - 1:
+                         await self._handle_rate_limit_error(e)
+                         continue
+                     else:
+                         raise RateLimitError(
+                             f"Rate limit exceeded after {max_retries} attempts",
+                             retry_after=getattr(e, 'retry_after', None)
+                         )
+
+                 except Exception as e:
+                     # Check whether it's an HTTP 429 error
+                     if hasattr(e, 'status_code') and e.status_code == 429:
+                         if attempt < max_retries - 1:
+                             await self._handle_rate_limit_error(e)
+                             continue
+                         else:
+                             raise RateLimitError(
+                                 f"Rate limit exceeded after {max_retries} attempts",
+                                 retry_after=getattr(e, 'retry_after', 1)
+                             )
+                     else:
+                         # Re-raise non-rate-limit errors
+                         raise
+
+         except RateLimitError:
+             raise
+         except Exception as e:
+             logger.error(
+                 "Agent translation failed",
+                 error=str(e),
+                 error_type=type(e).__name__
+             )
+             raise TranslationError(
+                 f"Translation failed: {str(e)}",
+                 error_type="AGENT_ERROR",
+                 details={"original_error": str(e)}
+             )
+
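+     # A minimal usage sketch (assumes GEMINI_API_KEY is configured):
+     #
+     #     agent = OpenAITranslationAgent(get_gemini_client())
+     #     result = asyncio.run(agent.translate_with_agent(
+     #         "Neural networks learn from data.",
+     #         TranslationContext(technical_domain="ai", document_type="book"),
+     #     ))
+     #     print(result["translated_text"], result["tokens_used"])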
+ def _create_translate_tool(self):
265
+ """Create the translate tool for the agent."""
266
+ @function_tool
267
+ async def translate_text(
268
+ ctx: RunContextWrapper[Any],
269
+ text: str,
270
+ context: Optional[Dict[str, Any]] = None,
271
+ preserve_formatting: bool = True
272
+ ) -> str:
273
+ """
274
+ Translate text from English to Urdu using the OpenAI client directly.
275
+
276
+ This is a fallback tool used by the agent for complex translations.
277
+ """
278
+ logger.debug(
279
+ "Using translate_text tool",
280
+ text_length=len(text)
281
+ )
282
+
283
+ try:
284
+ # Use the Gemini OpenAI client directly
285
+ client = self.client.get_client()
286
+
287
+ response = await client.chat.completions.create(
288
+ model=self.model,
289
+ messages=[
290
+ {
291
+ "role": "system",
292
+ "content": "You are a professional translator for technical content."
293
+ },
294
+ {
295
+ "role": "user",
296
+ "content": f"Translate to Urdu: {text}"
297
+ }
298
+ ],
299
+ temperature=0.3,
300
+ max_tokens=4000
301
+ )
302
+
303
+ return response.choices[0].message.content.strip()
304
+
305
+ except Exception as e:
306
+ if hasattr(e, 'status_code') and e.status_code == 429:
307
+ # Convert to OpenAI Agents SDK rate limit error
308
+ raise OpenAIRateLimitError(
309
+ "Rate limit exceeded",
310
+ retry_after=getattr(e, 'retry_after', 1)
311
+ )
312
+ raise
313
+
314
+ return translate_text
315
+
316
+ def _create_analyze_code_tool(self):
317
+ """Create the code analysis tool for the agent."""
318
+ @function_tool
319
+ async def analyze_code_blocks(
320
+ ctx: RunContextWrapper[Any],
321
+ text: str
322
+ ) -> List[Dict[str, Any]]:
323
+ """
324
+ Analyze text to identify and extract code blocks.
325
+ """
326
+ import re
327
+
328
+ # Pattern to match code blocks
329
+ code_pattern = re.compile(r'```(\w+)?\n(.*?)\n```', re.DOTALL)
330
+
331
+ code_blocks = []
332
+ for match in code_pattern.finditer(text):
333
+ language = match.group(1) or "text"
334
+ code_content = match.group(2)
335
+ start_pos = match.start()
336
+ end_pos = match.end()
337
+
338
+ code_blocks.append({
339
+ "language": language,
340
+ "content": code_content,
341
+ "start_position": start_pos,
342
+ "end_position": end_pos,
343
+ "length": len(code_content)
344
+ })
345
+
346
+ return code_blocks
347
+
348
+ return analyze_code_blocks
349
+
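A quick standalone check of the fence pattern used by this tool (a sketch; note the regex requires a newline before the closing fence, so one-line fences are not matched):

```python
import re

# Same pattern as analyze_code_blocks above
code_pattern = re.compile(r'```(\w+)?\n(.*?)\n```', re.DOTALL)

sample = "Intro text.\n```python\nprint('hi')\n```\nMore text."
for match in code_pattern.finditer(sample):
    print(match.group(1) or "text", repr(match.group(2)))
# -> python "print('hi')"
```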
350
+ def _create_glossary_tool(self):
351
+ """Create the glossary tool for the agent."""
352
+ @function_tool
353
+ async def get_translation_glossary(
354
+ ctx: RunContextWrapper[Any],
355
+ domain: Optional[str] = None
356
+ ) -> Dict[str, str]:
357
+ """
358
+ Get domain-specific translation glossary.
359
+ """
360
+ glossaries = {
361
+ "ai": {
362
+ "Artificial Intelligence": "مصنوعی ذہانت",
363
+ "Machine Learning": "مشین لرننگ",
364
+ "Deep Learning": "ڈیپ لرننگ",
365
+ "Neural Network": "نیورل نیٹورک",
366
+ "Algorithm": "الگورتھم",
367
+ "Model": "ماڈل",
368
+ "Training": "تربیت",
369
+ "Inference": "استنتاج",
370
+ "Dataset": "ڈیٹاسیٹ",
371
+ "Feature": "خصوصیت"
372
+ },
373
+ "robotics": {
374
+ "Robot": "روبوٹ",
375
+ "Actuator": "ایکچویٹر",
376
+ "Sensor": "سینسر",
377
+ "Kinematics": "کائنیمیٹکس",
378
+ "Path Planning": "پاتھ پلاننگ",
379
+ "Control System": "کنٹرول سسٹم",
380
+ "Embedded": "ایمبیڈڈ",
381
+ "Autonomous": "خودکار"
382
+ },
383
+ "programming": {
384
+ "Function": "فنکشن",
385
+ "Variable": "متغیر",
386
+ "Class": "کلاس",
387
+ "Object": "آبجیکٹ",
388
+ "Method": "میٹھڈ",
389
+ "Library": "لائبریری",
390
+ "Framework": "فریم ورک",
391
+ "API": "API",
392
+ "Database": "ڈیٹا بیس",
393
+ "Server": "سرور"
394
+ }
395
+ }
396
+
397
+ if domain and domain.lower() in glossaries:
398
+ return glossaries[domain.lower()]
399
+
400
+ # Return combined glossary for general use
401
+ combined = {}
402
+ for gloss in glossaries.values():
403
+ combined.update(gloss)
404
+
405
+ return combined
406
+
407
+ return get_translation_glossary
408
+
409
+ def _create_context_tool(self):
410
+ """Create the context tool for the agent."""
411
+ @function_tool
412
+ async def set_translation_context(
413
+ ctx: RunContextWrapper[Any],
414
+ page_url: Optional[str] = None,
415
+ document_type: Optional[str] = None,
416
+ technical_domain: Optional[str] = None,
417
+ target_audience: Optional[str] = None
418
+ ) -> Dict[str, Any]:
419
+ """
420
+ Set context for translation decisions.
421
+ """
422
+ context = {
423
+ "page_url": page_url,
424
+ "document_type": document_type,
425
+ "technical_domain": technical_domain,
426
+ "target_audience": target_audience,
427
+ "set_at": time.time()
428
+ }
429
+
430
+ logger.info(
431
+ "Translation context set via tool",
432
+ context=context
433
+ )
434
+
435
+ return {
436
+ "success": True,
437
+ "message": "Translation context updated successfully",
438
+ "context": context
439
+ }
440
+
441
+ return set_translation_context
442
+
443
+ async def translate_chunk_sequence(
444
+ self,
445
+ chunks: List[str],
446
+ context: Optional[TranslationContext] = None
447
+ ) -> List[Dict[str, Any]]:
448
+ """
449
+ Translate a sequence of chunks maintaining consistency.
450
+
451
+ Args:
452
+ chunks: List of text chunks to translate
453
+ context: Translation context
454
+
455
+ Returns:
456
+ List of translation results
457
+ """
458
+ logger.info(
459
+ "Translating chunk sequence with OpenAI Agents SDK",
460
+ chunk_count=len(chunks),
461
+ has_context=bool(context)
462
+ )
463
+
464
+ results = []
465
+ total_tokens = 0
466
+
467
+ for i, chunk in enumerate(chunks):
468
+ logger.debug(
469
+ "Translating chunk",
470
+ chunk_index=i,
471
+ chunk_length=len(chunk)
472
+ )
473
+
474
+ # Update context with chunk info
475
+ chunk_context = context
476
+ if chunk_context:
477
+ chunk_context.chunk_index = i
478
+ chunk_context.total_chunks = len(chunks)
479
+
480
+ try:
481
+ result = await self.translate_with_agent(chunk, chunk_context)
482
+ result["chunk_index"] = i
483
+ results.append(result)
484
+ total_tokens += result.get("tokens_used", 0)
485
+
486
+ except RateLimitError as e:
487
+ logger.error(
488
+ "Rate limit hit for chunk",
489
+ chunk_index=i,
490
+ retry_after=e.retry_after
491
+ )
492
+ # Add rate limit error result
493
+ results.append({
494
+ "chunk_index": i,
495
+ "translated_text": f"[RATE LIMIT ERROR: {str(e)}]",
496
+ "original_text": chunk,
497
+ "error": str(e),
498
+ "error_type": "RATE_LIMIT",
499
+ "tokens_used": 0,
500
+ "model": self.model,
501
+ "confidence_score": 0.0,
502
+ "retry_after": e.retry_after
503
+ })
504
+
505
+ except Exception as e:
506
+ logger.error(
507
+ "Chunk translation failed",
508
+ chunk_index=i,
509
+ error=str(e)
510
+ )
511
+ # Add failed result
512
+ results.append({
513
+ "chunk_index": i,
514
+ "translated_text": chunk, # Fallback to original
515
+ "original_text": chunk,
516
+ "error": str(e),
517
+ "tokens_used": 0,
518
+ "model": self.model,
519
+ "confidence_score": 0.0
520
+ })
521
+
522
+ logger.info(
523
+ "Chunk sequence translation completed",
524
+ total_chunks=len(chunks),
525
+ successful_chunks=len([r for r in results if not r.get("error")]),
526
+ total_tokens=total_tokens
527
+ )
528
+
529
+ return results
530
+
531
+ async def get_agent(self) -> Agent:
532
+ """Get the configured translation agent."""
533
+ return self.agent
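A minimal usage sketch for the agent above. This is a sketch only: the class name `OpenAITranslationAgent` and the import path are assumptions (the class definition sits earlier in the file), `get_gemini_client()` is assumed to read a configured Gemini API key, and this module's `TranslationContext` is assumed to default its `chunk_index`/`total_chunks` fields, since `translate_chunk_sequence` assigns them per chunk.

```python
import asyncio

# Import path and class name are illustrative; adjust to the actual module layout
from src.services.openai_translation.openai_agent import (
    OpenAITranslationAgent,
    TranslationContext,
)


async def main() -> None:
    agent = OpenAITranslationAgent()
    context = TranslationContext(technical_domain="ai", document_type="tutorial")
    results = await agent.translate_chunk_sequence(
        ["What is machine learning?", "It finds patterns in data."],
        context=context,
    )
    for result in results:
        # Failed chunks carry an "error" key instead of a real translation
        print(result["chunk_index"], result.get("error") or result["translated_text"])


asyncio.run(main())
```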
src/services/openai_translation/service.py ADDED
@@ -0,0 +1,855 @@
1
+ """
2
+ OpenAI Translation Service using Gemini API.
3
+
4
+ This service implements the core translation functionality by calling
5
+ the OpenAI SDK against Gemini's OpenAI-compatible endpoint.
6
+ """
7
+
8
+ import asyncio
9
+ import hashlib
10
+ import json
11
+ import time
12
+ import uuid
13
+ from datetime import datetime, timedelta
14
+ from typing import Dict, List, Optional, Any, AsyncGenerator
15
+ from dataclasses import dataclass
16
+
17
+ from openai import AsyncOpenAI
18
+ from openai.types.chat import ChatCompletion
19
+
20
+ from src.models.translation_openai import (
21
+ TranslationJob, TranslationChunk, TranslationError, TranslationSession,
22
+ TranslationCache, TranslationJobStatus, ChunkStatus, ErrorSeverity
23
+ )
24
+ from src.services.openai_translation.client import GeminiOpenAIClient, get_gemini_client
25
+ from src.services.cache_service import CacheService, get_cache_service
26
+ from src.database.base import get_db
27
+ from src.utils.translation_errors import (
28
+ TranslationError as TranslationServiceError, APIError, RateLimitError,
29
+ with_translation_error_handling, retry_with_exponential_backoff
30
+ )
31
+ from src.utils.translation_logger import get_translation_logger, log_translation_performance
32
+
33
+ logger = get_translation_logger(__name__)
34
+
35
+
36
+ @dataclass
37
+ class OpenAITranslationRequest:
38
+ """Translation request with comprehensive parameters."""
39
+ text: str
40
+ source_language: str
41
+ target_language: str
42
+ page_url: Optional[str] = None
43
+ user_id: Optional[str] = None
44
+ session_id: Optional[str] = None
45
+
46
+ # OpenAI parameters
47
+ model: str = "gemini-2.0-flash-lite"
48
+ temperature: float = 0.3
49
+ max_tokens: int = 2048
50
+
51
+ # Processing options
52
+ preserve_code_blocks: bool = True
53
+ enable_transliteration: bool = True
54
+ chunk_size: int = 2000
55
+ max_chunks: int = 100
56
+
57
+ # Retry settings
58
+ max_retries: int = 3
59
+ retry_delay: float = 1.0
60
+
61
+ # Streaming
62
+ streaming: bool = False
63
+
64
+ # Session context
65
+ user_agent: Optional[str] = None
66
+ ip_address: Optional[str] = None
67
+
68
+
69
+ @dataclass
70
+ class OpenAITranslationResponse:
71
+ """Translation response with comprehensive metadata."""
72
+ job_id: str
73
+ translated_text: str
74
+ status: TranslationJobStatus
75
+ progress: float # 0-100
76
+ chunks: List[Dict[str, Any]]
77
+ processing_time_ms: int
78
+ cached: bool
79
+
80
+ # Cost tracking
81
+ input_tokens: int
82
+ output_tokens: int
83
+ estimated_cost_usd: float
84
+
85
+ # Quality metrics
86
+ confidence_score: Optional[float] = None
87
+ quality_score: Optional[float] = None
88
+
89
+ # Error information
90
+ error_message: Optional[str] = None
91
+ error_details: Optional[Dict[str, Any]] = None
92
+
93
+ # Cache information
94
+ cache_key: Optional[str] = None
95
+ cache_hit: bool = False
96
+
97
+
98
+ class OpenAITranslationService:
99
+ """
100
+ Translation service using the OpenAI SDK with the Gemini API.
101
+
102
+ Features:
103
+ - OpenAI SDK chat completions with the Gemini 2.0 Flash Lite model
104
+ - Content chunking for large texts
105
+ - Enhanced caching with page URL support
106
+ - Progress tracking and streaming
107
+ - Error handling and retries
108
+ - Session management
109
+ - Cost and quality tracking
110
+ """
111
+
112
+ # Translation prompt templates
113
+ TRANSLATION_PROMPT_TEMPLATE = """
114
+ You are a professional translator. Translate the following text from {source_lang} to {target_lang}.
115
+
116
+ CRITICAL REQUIREMENTS:
117
+ 1. Translate ALL text to {target_lang} - no English words should remain
118
+ 2. ONLY preserve code blocks marked with ```
119
+ 3. Translate technical terms with context (e.g., AI → مصنوعی ذہانت)
120
+ 4. Use Urdu script (Nastaleeq) for Urdu text
121
+ 5. Maintain formatting and structure
122
+ 6. Mix Urdu with Roman Urdu for technical terms where appropriate
123
+
124
+ Text to translate:
125
+ {text}
126
+
127
+ Translate only the content above.
128
+ """
129
+
130
+ CHUNK_TRANSLATION_PROMPT = """
131
+ Translate this text segment from {source_lang} to {target_lang}.
132
+
133
+ Context: This is part {current_part} of {total_parts} of a larger document.
134
+
135
+ Requirements:
136
+ - Maintain consistency with the overall document
137
+ - Translate accurately while preserving meaning
138
+ - Handle technical terms appropriately
139
+ - Keep the flow natural
140
+ - Use Urdu script (Nastaleeq)
141
+
142
+ Text:
143
+ {text}
144
+
145
+ Translation:
146
+ """
147
+
148
+ # Model pricing (approximate USD per 1K tokens)
149
+ MODEL_PRICING = {
150
+ "gemini-2.0-flash-lite": {
151
+ "input": 0.000075, # $0.075 per 1M input tokens
152
+ "output": 0.00015 # $0.15 per 1M output tokens
153
+ }
154
+ }
155
+
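As a worked example of the cost formula applied in `_translate_with_gemini` below: at these rates, a request consuming 10,000 input tokens and 4,000 output tokens costs about $0.00135.

```python
pricing = {"input": 0.000075, "output": 0.00015}  # USD per 1K tokens

input_tokens, output_tokens = 10_000, 4_000
cost = (input_tokens / 1000) * pricing["input"] + (output_tokens / 1000) * pricing["output"]
print(f"${cost:.5f}")  # $0.00135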
156
+ def __init__(
157
+ self,
158
+ gemini_client: Optional[GeminiOpenAIClient] = None,
159
+ cache_service: Optional[CacheService] = None,
160
+ enable_analytics: bool = True
161
+ ):
162
+ """
163
+ Initialize OpenAI translation service.
164
+
165
+ Args:
166
+ gemini_client: Gemini OpenAI client
167
+ cache_service: Cache service instance
168
+ enable_analytics: Whether to collect detailed analytics
169
+ """
170
+ self.gemini_client = gemini_client
171
+ self.cache_service = cache_service
172
+ self.enable_analytics = enable_analytics
173
+
174
+ # Initialize services if not provided
175
+ if not self.gemini_client:
176
+ self.gemini_client = get_gemini_client()
177
+
178
+ if not self.cache_service:
179
+ self.cache_service = get_cache_service()
180
+
181
+ logger.info(
182
+ "OpenAI Translation Service initialized",
183
+ model="gemini-2.0-flash-lite",
184
+ analytics_enabled=enable_analytics
185
+ )
186
+
187
+ def _generate_content_hash(self, text: str, source_lang: str, target_lang: str) -> str:
188
+ """Generate SHA-256 hash for content identification."""
189
+ content = f"{text}:{source_lang}:{target_lang}"
190
+ return hashlib.sha256(content.encode('utf-8')).hexdigest()
191
+
192
+ def _generate_cache_key(self, content_hash: str, page_url: Optional[str] = None) -> str:
193
+ """Generate comprehensive cache key including page URL."""
194
+ if page_url:
195
+ url_hash = hashlib.sha256(page_url.encode('utf-8')).hexdigest()[:16]
196
+ return f"openai_translation:{content_hash}:{url_hash}"
197
+ return f"openai_translation:{content_hash}"
198
+
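For illustration, the two key shapes these helpers produce (the URL is hypothetical; hash values will differ per input):

```python
import hashlib

text, src, tgt = "Hello world", "en", "ur"
content_hash = hashlib.sha256(f"{text}:{src}:{tgt}".encode("utf-8")).hexdigest()

# Page-scoped key: appends the first 16 hex chars of the URL hash
url_hash = hashlib.sha256(b"https://example.com/docs/intro").hexdigest()[:16]
print(f"openai_translation:{content_hash}:{url_hash}")

# Page-agnostic key
print(f"openai_translation:{content_hash}")
```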
199
+ async def _check_cache(
200
+ self,
201
+ content_hash: str,
202
+ page_url: Optional[str] = None
203
+ ) -> Optional[TranslationCache]:
204
+ """Check if translation is cached in database."""
205
+ cache_key = self._generate_cache_key(content_hash, page_url)
206
+
207
+ db = next(get_db())
208
+ try:
209
+ cache_entry = db.query(TranslationCache).filter(
210
+ TranslationCache.cache_key == cache_key,
211
+ TranslationCache.expires_at > datetime.utcnow()
212
+ ).first()
213
+
214
+ if cache_entry:
215
+ # Update hit statistics
216
+ cache_entry.hit_count += 1
217
+ cache_entry.last_hit_at = datetime.utcnow()
218
+ db.commit()
219
+ logger.info(
220
+ "Cache hit found",
221
+ cache_key=cache_key[:20],
222
+ hits=cache_entry.hit_count
223
+ )
224
+ return cache_entry
225
+
226
+ finally:
227
+ db.close()
228
+
229
+ return None
230
+
231
+ async def _cache_translation(
232
+ self,
233
+ job: TranslationJob,
234
+ cache_key: str,
235
+ quality_score: Optional[float] = None
236
+ ) -> bool:
237
+ """Cache a successful translation."""
238
+ db = next(get_db())
239
+ try:
240
+
241
+ # Determine TTL based on quality
242
+ if quality_score and quality_score >= 4.5:
243
+ ttl_hours = 30 * 24 # 30 days for high quality
244
+ elif quality_score and quality_score < 3.0:
245
+ ttl_hours = 24 # 1 day for low quality
246
+ else:
247
+ ttl_hours = 7 * 24 # 7 days default
248
+
249
+ expires_at = datetime.utcnow() + timedelta(hours=ttl_hours)
250
+
251
+ cache_entry = TranslationCache(
252
+ cache_key=cache_key,
253
+ job_id=job.id,
254
+ content_hash=job.content_hash,
255
+ page_url=job.page_url,
256
+ source_language=job.source_language,
257
+ target_language=job.target_language,
258
+ original_text=job.original_text,
259
+ translated_text=job.translated_text,
260
+ model_version=job.model_name,
261
+ processing_time_ms=job.processing_time_ms,
262
+ ttl_hours=ttl_hours,
263
+ expires_at=expires_at,
264
+ quality_score=quality_score,
265
+ is_validated=quality_score is not None
266
+ )
267
+
268
+ db.add(cache_entry)
269
+ db.commit()
270
+
271
+ logger.info(
272
+ "Translation cached",
273
+ cache_key=cache_key[:20],
274
+ ttl_hours=ttl_hours
275
+ )
276
+ return True
277
+
278
+ except Exception as e:
279
+ logger.error("Failed to cache translation", error=str(e))
280
+ return False
281
+ finally:
282
+ db.close()
283
+
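The TTL policy above, mirrored as a standalone function for clarity (hours; same branches as `_cache_translation`):

```python
from typing import Optional


def cache_ttl_hours(quality_score: Optional[float]) -> int:
    # High-quality translations are kept longest; low-quality ones expire fast
    if quality_score is not None and quality_score >= 4.5:
        return 30 * 24  # 30 days
    if quality_score is not None and quality_score < 3.0:
        return 24  # 1 day
    return 7 * 24  # 7-day default


assert cache_ttl_hours(4.8) == 720
assert cache_ttl_hours(2.0) == 24
assert cache_ttl_hours(None) == 168
```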
284
+ async def _translate_with_gemini(
285
+ self,
286
+ text: str,
287
+ source_lang: str,
288
+ target_lang: str,
289
+ model: str,
290
+ temperature: float,
291
+ max_tokens: int,
292
+ is_chunk: bool = False,
293
+ context: Optional[Dict[str, Any]] = None
294
+ ) -> Dict[str, Any]:
295
+ """
296
+ Translate text using Gemini via OpenAI SDK.
297
+
298
+ Returns:
299
+ Dict containing translated_text, tokens_used, and response metadata
300
+ """
301
+ client = self.gemini_client.get_client()
302
+
303
+ try:
304
+ # Select appropriate prompt
305
+ if is_chunk and context:
306
+ prompt = self.CHUNK_TRANSLATION_PROMPT.format(
307
+ source_lang=source_lang,
308
+ target_lang=target_lang,
309
+ current_part=context.get('current_part', 1),
310
+ total_parts=context.get('total_parts', 1),
311
+ text=text
312
+ )
313
+ else:
314
+ prompt = self.TRANSLATION_PROMPT_TEMPLATE.format(
315
+ source_lang=source_lang,
316
+ target_lang=target_lang,
317
+ text=text
318
+ )
319
+
320
+ # Call Gemini API via OpenAI SDK
321
+ response = await client.chat.completions.create(
322
+ model=model,
323
+ messages=[
324
+ {"role": "system", "content": "You are a professional translator."},
325
+ {"role": "user", "content": prompt}
326
+ ],
327
+ temperature=temperature,
328
+ max_tokens=max_tokens
329
+ )
330
+
331
+ # Extract translation and metrics
332
+ translated_text = response.choices[0].message.content
333
+ input_tokens = response.usage.prompt_tokens
334
+ output_tokens = response.usage.completion_tokens
335
+
336
+ # Calculate cost
337
+ pricing = self.MODEL_PRICING.get(model, self.MODEL_PRICING["gemini-2.0-flash-lite"])
338
+ estimated_cost = (
339
+ (input_tokens / 1000 * pricing["input"]) +
340
+ (output_tokens / 1000 * pricing["output"])
341
+ )
342
+
343
+ return {
344
+ "translated_text": translated_text.strip() if translated_text else "",
345
+ "input_tokens": input_tokens,
346
+ "output_tokens": output_tokens,
347
+ "total_tokens": input_tokens + output_tokens,
348
+ "estimated_cost": estimated_cost,
349
+ "model": model,
350
+ "response_id": response.id
351
+ }
352
+
353
+ except Exception as e:
354
+ logger.error("Gemini API error", error=str(e))
355
+ raise TranslationServiceError(
356
+ f"Translation failed: {str(e)}",
357
+ error_type="API_ERROR",
358
+ is_retriable=True
359
+ )
360
+
361
+ def _split_text_into_chunks(
362
+ self,
363
+ text: str,
364
+ chunk_size: int,
365
+ max_chunks: int,
366
+ preserve_code_blocks: bool = True
367
+ ) -> List[Dict[str, Any]]:
368
+ """
369
+ Split text into chunks for processing.
370
+
371
+ Returns:
372
+ List of chunks with text, position, and metadata
373
+ """
374
+ chunks = []
375
+
376
+ if preserve_code_blocks:
377
+ # Handle code blocks separately
378
+ import re
379
+ code_pattern = re.compile(r'```(\w+)?\n(.*?)\n```', re.DOTALL)
380
+
381
+ last_end = 0
382
+ chunk_index = 0
383
+
384
+ for match in code_pattern.finditer(text):
385
+ # Process text before code block
386
+ text_before = text[last_end:match.start()]
387
+ if text_before:
388
+ text_chunks = self._split_plain_text(text_before, chunk_size - 200)
389
+ for chunk_text in text_chunks:
390
+ if chunk_index >= max_chunks:
391
+ break
392
+ chunks.append({
393
+ "text": chunk_text,
394
+ "start": last_end,
395
+ "end": last_end + len(chunk_text),
396
+ "is_code_block": False,
397
+ "index": chunk_index
398
+ })
399
+ chunk_index += 1
400
+ last_end += len(chunk_text)
401
+
402
+ # Add code block as separate chunk
403
+ if chunk_index < max_chunks:
404
+ code_lang = match.group(1) or "unknown"
405
+ code_content = match.group(2)
406
+ full_code = f"```{code_lang}\n{code_content}\n```"
407
+ chunks.append({
408
+ "text": full_code,
409
+ "start": match.start(),
410
+ "end": match.end(),
411
+ "is_code_block": True,
412
+ "code_language": code_lang,
413
+ "index": chunk_index
414
+ })
415
+ chunk_index += 1
416
+ last_end = match.end()
417
+
418
+ # Process remaining text
419
+ if last_end < len(text) and chunk_index < max_chunks:
420
+ remaining_text = text[last_end:]
421
+ text_chunks = self._split_plain_text(remaining_text, chunk_size)
422
+ for chunk_text in text_chunks:
423
+ if chunk_index >= max_chunks:
424
+ break
425
+ chunks.append({
426
+ "text": chunk_text,
427
+ "start": last_end,
428
+ "end": last_end + len(chunk_text),
429
+ "is_code_block": False,
430
+ "index": chunk_index
431
+ })
432
+ chunk_index += 1
433
+ last_end += len(chunk_text)
434
+ else:
435
+ # Simple text splitting
436
+ text_chunks = self._split_plain_text(text, chunk_size)
437
+ chunks = [
438
+ {
439
+ "text": chunk,
440
+ "start": i * chunk_size,
441
+ "end": (i + 1) * chunk_size,
442
+ "is_code_block": False,
443
+ "index": i
444
+ }
445
+ for i, chunk in enumerate(text_chunks[:max_chunks])
446
+ ]
447
+
448
+ return chunks
449
+
450
+ def _split_plain_text(self, text: str, chunk_size: int) -> List[str]:
451
+ """Split plain text into chunks, trying to preserve sentences."""
452
+ import re
453
+
454
+ chunks = []
455
+ sentences = re.split(r'(?<=[.!?])\s+', text)
456
+
457
+ current_chunk = ""
458
+ for sentence in sentences:
459
+ if len(current_chunk) + len(sentence) + 1 <= chunk_size:
460
+ current_chunk = f"{current_chunk} {sentence}" if current_chunk else sentence
461
+ else:
462
+ if current_chunk:
463
+ chunks.append(current_chunk)
464
+ current_chunk = sentence
465
+
466
+ if current_chunk:
467
+ chunks.append(current_chunk)
468
+
469
+ return chunks
470
+
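A standalone run of the sentence-aware splitter above (same logic, including the single space restored between packed sentences, which `re.split` consumes):

```python
import re


def split_plain_text(text: str, chunk_size: int) -> list:
    sentences = re.split(r"(?<=[.!?])\s+", text)
    chunks, current = [], ""
    for sentence in sentences:
        if len(current) + len(sentence) + 1 <= chunk_size:
            current = f"{current} {sentence}" if current else sentence
        else:
            if current:
                chunks.append(current)
            current = sentence
    if current:
        chunks.append(current)
    return chunks


print(split_plain_text("One. Two! Three? Four.", chunk_size=12))
# ['One. Two!', 'Three? Four.']
```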
471
+ @log_translation_performance
472
+ async def translate(
473
+ self,
474
+ request: OpenAITranslationRequest
475
+ ) -> OpenAITranslationResponse:
476
+ """
477
+ Translate text with comprehensive tracking and caching.
478
+
479
+ Args:
480
+ request: Translation request with all parameters
481
+
482
+ Returns:
483
+ Translation response with metadata
484
+ """
485
+ start_time = time.time()
486
+ job_id = str(uuid.uuid4())
487
+ content_hash = self._generate_content_hash(
488
+ request.text,
489
+ request.source_language,
490
+ request.target_language
491
+ )
492
+ cache_key = self._generate_cache_key(content_hash, request.page_url)
493
+
494
+ logger.bind_request(request_id=job_id).log_translation_request(
495
+ text_length=len(request.text),
496
+ source_lang=request.source_language,
497
+ target_lang=request.target_language,
498
+ page_url=request.page_url
499
+ )
500
+
501
+ # Check cache first
502
+ cached_translation = await self._check_cache(content_hash, request.page_url)
503
+ if cached_translation:
504
+ processing_time = int((time.time() - start_time) * 1000)
505
+
506
+ logger.log_translation_response(
507
+ translated_length=len(cached_translation.translated_text),
508
+ chunks_count=1,
509
+ cached=True
510
+ )
511
+
512
+ return OpenAITranslationResponse(
513
+ job_id=job_id,
514
+ translated_text=cached_translation.translated_text,
515
+ status=TranslationJobStatus.COMPLETED,
516
+ progress=100.0,
517
+ chunks=[],
518
+ processing_time_ms=processing_time,
519
+ cached=True,
520
+ input_tokens=0,
521
+ output_tokens=0,
522
+ estimated_cost_usd=0.0,
523
+ cache_key=cache_key,
524
+ cache_hit=True
525
+ )
526
+
527
+ # Create translation job
528
+ db = next(get_db())
529
+ try:
530
+ job = TranslationJob(
531
+ job_id=job_id,
532
+ user_id=request.user_id,
533
+ session_id=request.session_id,
534
+ content_hash=content_hash,
535
+ page_url=request.page_url,
536
+ source_language=request.source_language,
537
+ target_language=request.target_language,
538
+ original_text=request.text,
539
+ model_name=request.model,
540
+ temperature=request.temperature,
541
+ max_tokens=request.max_tokens,
542
+ preserve_code_blocks=request.preserve_code_blocks,
543
+ enable_transliteration=request.enable_transliteration,
544
+ chunk_size=request.chunk_size,
545
+ max_chunks=request.max_chunks,
546
+ user_agent=request.user_agent,
547
+ ip_address=request.ip_address
548
+ )
549
+
550
+ db.add(job)
551
+ db.commit()
552
+
553
+ # Split text into chunks
554
+ chunks_data = self._split_text_into_chunks(
555
+ request.text,
556
+ request.chunk_size,
557
+ request.max_chunks,
558
+ request.preserve_code_blocks
559
+ )
560
+
561
+ job.chunks_total = len(chunks_data)
562
+ job.status = TranslationJobStatus.PROCESSING.value
563
+ job.started_at = datetime.utcnow()
564
+ db.commit()
565
+
566
+ # Process chunks
567
+ translated_chunks = []
568
+ total_input_tokens = 0
569
+ total_output_tokens = 0
570
+ total_cost = 0.0
571
+
572
+ for i, chunk_data in enumerate(chunks_data):
573
+ try:
574
+ # Create chunk record
575
+ chunk = TranslationChunk(
576
+ job_id=job.id,
577
+ chunk_index=i,
578
+ original_text=chunk_data["text"],
579
+ start_position=chunk_data["start"],
580
+ end_position=chunk_data["end"],
581
+ is_code_block=chunk_data["is_code_block"],
582
+ code_language=chunk_data.get("code_language"),
583
+ word_count=len(chunk_data["text"].split()),
584
+ status=ChunkStatus.PROCESSING.value,
585
+ started_at=datetime.utcnow()
586
+ )
587
+ db.add(chunk)
588
+ db.commit()
589
+
590
+ # Translate or skip code blocks
591
+ if chunk_data["is_code_block"] and request.preserve_code_blocks:
592
+ translated_text = chunk_data["text"]
593
+ chunk.status = ChunkStatus.COMPLETED.value
594
+ chunk.translated_text = translated_text
595
+ chunk.completed_at = datetime.utcnow()
596
+ else:
597
+ # Translate chunk with retry logic
598
+ async def translate_chunk():
599
+ return await self._translate_with_gemini(
600
+ chunk_data["text"],
601
+ request.source_language,
602
+ request.target_language,
603
+ request.model,
604
+ request.temperature,
605
+ request.max_tokens,
606
+ is_chunk=True,
607
+ context={
608
+ "current_part": i + 1,
609
+ "total_parts": len(chunks_data)
610
+ } if len(chunks_data) > 1 else None
611
+ )
612
+
613
+ result = await retry_with_exponential_backoff(
614
+ translate_chunk,
615
+ max_retries=request.max_retries
616
+ )
617
+
618
+ translated_text = result["translated_text"]
619
+ chunk.translated_text = translated_text
620
+ chunk.input_tokens = result["input_tokens"]
621
+ chunk.output_tokens = result["output_tokens"]
622
+ chunk.status = ChunkStatus.COMPLETED.value
623
+ chunk.completed_at = datetime.utcnow()
624
+
625
+ total_input_tokens += result["input_tokens"]
626
+ total_output_tokens += result["output_tokens"]
627
+ total_cost += result["estimated_cost"]
628
+
629
+ # Update job progress
630
+ job.chunks_completed += 1
631
+ job.progress_percentage = (job.chunks_completed / job.chunks_total) * 100
632
+ db.commit()
633
+
634
+ # Add to response chunks
635
+ translated_chunks.append({
636
+ "index": i,
637
+ "original_text": chunk_data["text"],
638
+ "translated_text": translated_text,
639
+ "start_position": chunk_data["start"],
640
+ "end_position": chunk_data["end"],
641
+ "is_code_block": chunk_data["is_code_block"],
642
+ "code_language": chunk_data.get("code_language")
643
+ })
644
+
645
+ except Exception as e:
646
+ # Handle chunk error
647
+ chunk.status = ChunkStatus.FAILED.value
648
+ chunk.last_error = str(e)
649
+ job.chunks_failed += 1
650
+
651
+ # Log error
652
+ logger.log_error(e, chunk_index=i)
653
+
654
+ db.commit()
655
+ logger.error(f"Chunk {i} translation failed", error=str(e))
656
+
657
+ # Reconstruct final translation
658
+ final_translation = " ".join(chunk["translated_text"] for chunk in translated_chunks)  # chunk splitting consumed inter-chunk whitespace
659
+
660
+ # Update job completion
661
+ job.translated_text = final_translation
662
+ job.input_tokens = total_input_tokens
663
+ job.output_tokens = total_output_tokens
664
+ job.estimated_cost_usd = total_cost
665
+ job.status = (
666
+ TranslationJobStatus.COMPLETED.value
667
+ if job.chunks_failed == 0
668
+ else TranslationJobStatus.FAILED.value
669
+ )
670
+ job.completed_at = datetime.utcnow()
671
+ job.processing_time_ms = int((time.time() - start_time) * 1000)
672
+ job.progress_percentage = 100.0
673
+ db.commit()
674
+
675
+ # Cache successful translation
676
+ if job.chunks_failed == 0:
677
+ await self._cache_translation(job, cache_key)
678
+
679
+ processing_time = int((time.time() - start_time) * 1000)
680
+
681
+ logger.log_translation_response(
682
+ translated_length=len(final_translation),
683
+ chunks_count=len(translated_chunks),
684
+ tokens_used=total_input_tokens + total_output_tokens,
685
+ cost_usd=total_cost,
686
+ cached=False
687
+ )
688
+
689
+ logger.info(
690
+ "Translation completed",
691
+ job_id=job_id,
692
+ chunks=len(chunks_data),
693
+ failed=job.chunks_failed,
694
+ processing_time_ms=processing_time,
695
+ total_cost=total_cost
696
+ )
697
+
698
+ return OpenAITranslationResponse(
699
+ job_id=job_id,
700
+ translated_text=final_translation,
701
+ status=TranslationJobStatus(job.status),
702
+ progress=100.0,
703
+ chunks=translated_chunks,
704
+ processing_time_ms=processing_time,
705
+ cached=False,
706
+ input_tokens=total_input_tokens,
707
+ output_tokens=total_output_tokens,
708
+ estimated_cost_usd=total_cost,
709
+ cache_key=cache_key,
710
+ cache_hit=False,
711
+ error_message=(
712
+ f"{job.chunks_failed} chunks failed"
713
+ if job.chunks_failed > 0
714
+ else None
715
+ )
716
+ )
717
+
718
+ except Exception as e:
719
+ # Update job status to failed
720
+ if 'job' in locals():
721
+ job.status = TranslationJobStatus.FAILED.value
722
+ job.completed_at = datetime.utcnow()
723
+ db.commit()
724
+
725
+ logger.log_error(e, job_id=job_id)
726
+ raise TranslationServiceError(
727
+ f"Translation failed: {str(e)}",
728
+ error_type="SYSTEM_ERROR"
729
+ )
730
+
731
+ finally:
732
+ db.close()
733
+
734
+ async def get_translation_status(self, job_id: str) -> Dict[str, Any]:
735
+ """Get the status of a translation job."""
736
+ db = next(get_db())
737
+ try:
738
+ job = db.query(TranslationJob).filter(
739
+ TranslationJob.job_id == job_id
740
+ ).first()
741
+
742
+ if not job:
743
+ raise TranslationServiceError(
744
+ "Translation job not found",
745
+ error_type="VALIDATION_ERROR"
746
+ )
747
+
748
+ return {
749
+ "job_id": job.job_id,
750
+ "status": job.status,
751
+ "progress": float(job.progress_percentage),
752
+ "chunks_total": job.chunks_total,
753
+ "chunks_completed": job.chunks_completed,
754
+ "chunks_failed": job.chunks_failed,
755
+ "processing_time_ms": job.processing_time_ms,
756
+ "estimated_cost_usd": float(job.estimated_cost_usd),
757
+ "created_at": job.created_at.isoformat(),
758
+ "started_at": job.started_at.isoformat() if job.started_at else None,
759
+ "completed_at": job.completed_at.isoformat() if job.completed_at else None
760
+ }
761
+
762
+ finally:
763
+ db.close()
764
+
765
+ async def stream_translation_status(self, job_id: str) -> AsyncGenerator[Dict[str, Any], None]:
766
+ """Stream translation status updates."""
767
+ # Implementation for streaming status updates
768
+ # This would typically check status periodically and yield updates
769
+ yield {"type": "start", "job_id": job_id, "message": "Starting stream..."}
770
+
771
+ # In a real implementation, you would:
772
+ # 1. Get initial job status
773
+ # 2. Poll status changes
774
+ # 3. Yield updates as they occur
775
+ # 4. Close stream when job completes
776
+
777
+ async def check_cache(self, content_hash: str, page_url: Optional[str] = None) -> Optional[TranslationCache]:
778
+ """Check cache for translation."""
779
+ return await self._check_cache(content_hash, page_url)
780
+
781
+ def generate_cache_key(self, content_hash: str, page_url: Optional[str] = None) -> str:
782
+ """Generate cache key."""
783
+ return self._generate_cache_key(content_hash, page_url)
784
+
785
+ async def clear_cache(self, page_url: Optional[str] = None, older_than_hours: Optional[int] = None) -> int:
786
+ """Clear translation cache entries."""
787
+ db = next(get_db())
788
+ try:
789
+ query = db.query(TranslationCache)
790
+
791
+ if page_url:
792
+ query = query.filter(TranslationCache.page_url == page_url)
793
+
794
+ if older_than_hours:
795
+ cutoff_time = datetime.utcnow() - timedelta(hours=older_than_hours)
796
+ query = query.filter(TranslationCache.created_at < cutoff_time)
797
+
798
+ # Get count before deleting
799
+ count = query.count()
800
+
801
+ # Delete entries
802
+ query.delete()
803
+ db.commit()
804
+
805
+ logger.info(
806
+ "Cache cleared",
807
+ entries_deleted=count,
808
+ page_url=page_url,
809
+ older_than_hours=older_than_hours
810
+ )
811
+
812
+ return count
813
+
814
+ finally:
815
+ db.close()
816
+
817
+ async def health_check(self) -> bool:
818
+ """Check if the service is healthy."""
819
+ try:
820
+ # Test Gemini connection
821
+ await self.gemini_client.test_connection()
822
+ return True
823
+ except Exception as e:
824
+ logger.error("Health check failed", error=str(e))
825
+ return False
826
+
827
+ async def get_metrics(self, period: str = "24h") -> Dict[str, Any]:
828
+ """Get translation metrics."""
829
+ # Implementation would aggregate metrics from database
830
+ # This is a placeholder
831
+ return {
832
+ "period": period,
833
+ "total_requests": 0,
834
+ "successful_requests": 0,
835
+ "failed_requests": 0,
836
+ "cache_hit_rate": 0.0,
837
+ "avg_processing_time_ms": 0.0,
838
+ "total_cost_usd": 0.0
839
+ }
840
+
841
+
842
+ # Global service instance
843
+ _translation_service: Optional[OpenAITranslationService] = None
844
+
845
+
846
+ async def get_translation_service() -> OpenAITranslationService:
847
+ """Get or create OpenAI translation service instance."""
848
+ global _translation_service
849
+
850
+ if _translation_service is None:
851
+ _translation_service = OpenAITranslationService()
852
+ # The constructor already wires a client; re-point at the shared singleton client
853
+ _translation_service.gemini_client = get_gemini_client()
854
+
855
+ return _translation_service
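A minimal end-to-end sketch of the service (assumes a configured Gemini API key and a reachable database behind `get_db`; the page URL is illustrative):

```python
import asyncio

from src.services.openai_translation.service import (
    OpenAITranslationRequest,
    get_translation_service,
)


async def main() -> None:
    service = await get_translation_service()
    request = OpenAITranslationRequest(
        text="Machine learning models learn patterns from data.",
        source_language="en",
        target_language="ur",
        page_url="https://example.com/docs/intro",  # enables page-scoped caching
    )
    response = await service.translate(request)
    print(response.status, response.cached, f"${response.estimated_cost_usd:.5f}")
    print(response.translated_text)


asyncio.run(main())
```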
src/services/openai_translation/translation_agent.py ADDED
@@ -0,0 +1,198 @@
1
+ """
2
+ Simplified OpenAI Translation Agent using proper Runner.run pattern.
3
+ """
4
+
5
+ import asyncio
6
+ from typing import Dict, Optional, Any
7
+ from dataclasses import dataclass
8
+
9
+ from agents import Agent, Runner
10
+ from src.services.openai_translation.client import GeminiOpenAIClient, get_gemini_client
11
+ from src.utils.translation_logger import get_translation_logger
12
+
13
+ logger = get_translation_logger(__name__)
14
+
15
+
16
+ @dataclass
17
+ class TranslationContext:
18
+ """Context information for translation."""
19
+ page_url: Optional[str] = None
20
+ document_type: Optional[str] = None
21
+ technical_domain: Optional[str] = None
22
+ target_audience: Optional[str] = None
23
+
24
+
25
+ class OpenAITranslationAgent:
26
+ """
27
+ OpenAI Agents SDK-based translation agent using proper Runner.run pattern.
28
+ """
29
+
30
+ def __init__(
31
+ self,
32
+ gemini_client: Optional[GeminiOpenAIClient] = None,
33
+ model: str = "gemini-2.0-flash-lite"
34
+ ):
35
+ """Initialize translation agent."""
36
+ self.client = gemini_client or get_gemini_client()
37
+ self.model = model
38
+
39
+ # Create the agent with translation instructions
40
+ self.agent = Agent(
41
+ name="Translation Agent",
42
+ instructions=self._get_translation_instructions(),
43
+ model=self.client.get_model()
44
+ )
45
+
46
+ def _get_translation_instructions(self) -> str:
47
+ """Get the base translation instructions for the agent."""
48
+ return """
49
+ You are a professional translator specializing in English to Urdu translation.
50
+
51
+ CRITICAL REQUIREMENTS:
52
+ 1. Translate ALL text to Urdu - no English words should remain
53
+ 2. ONLY preserve code blocks marked with ```
54
+ 3. Translate technical terms with context (e.g., AI -> مصنوعی ذہانت)
55
+ 4. Use Urdu script (Nastaleeq) for Urdu text
56
+ 5. Maintain formatting and structure
57
+ 6. Mix Urdu with Roman Urdu for technical terms where appropriate
58
+
59
+ When translating:
60
+ - Use appropriate honorifics and politeness levels
61
+ - Translate idioms and expressions to their Urdu equivalents
62
+ - Preserve the meaning and tone of the original text
63
+ - Handle technical terminology correctly
64
+ - Ensure grammatical correctness in Urdu
65
+
66
+ Additional context will be provided as needed for specific domains.
67
+ """
68
+
69
+ async def translate_with_agent(
70
+ self,
71
+ text: str,
72
+ context: Optional[TranslationContext] = None,
73
+ user_id: Optional[str] = None
74
+ ) -> Dict[str, Any]:
75
+ """
76
+ Translate text using OpenAI Agents SDK with proper Runner.run pattern.
77
+
78
+ Args:
79
+ text: Text to translate
80
+ context: Translation context information
81
+ user_id: User ID for tracking
82
+
83
+ Returns:
84
+ Dictionary containing translation result
85
+ """
86
+ try:
87
+ # Build the prompt with context
88
+ prompt = self._build_translation_prompt(text, context)
89
+
90
+ logger.info(
91
+ "Starting translation with agent",
92
+ text_length=len(text),
93
+ context=context.document_type if context else None,
94
+ model=self.model
95
+ )
96
+
97
+ # Run the agent using the proper Runner.run pattern
98
+ result = await Runner.run(
99
+ self.agent,
100
+ prompt,
101
+ max_turns=1 # Single turn for simple translation
102
+ )
103
+
104
+ # Extract the translated text
105
+ translated_text = result.final_output
106
+
107
+ # Try to extract tokens from usage if available
108
+ tokens_used = 0
109
+ model_used = self.model
110
+
111
+ # The result might have usage information in different formats
112
+ if hasattr(result, 'usage') and result.usage:
113
+ tokens_used = result.usage.total_tokens if hasattr(result.usage, 'total_tokens') else 0
114
+ model_used = result.usage.model if hasattr(result.usage, 'model') else self.model
115
+
116
+ # Check if the translation contains code blocks
117
+ has_code_blocks = "```" in translated_text
118
+
119
+ # Extract code blocks if present
120
+ code_blocks = []
121
+ if has_code_blocks:
122
+ import re
123
+ code_pattern = re.compile(r'```(\w+)?\n(.*?)\n```', re.DOTALL)
124
+ code_blocks = [
125
+ {
126
+ "language": match.group(1) or "unknown",
127
+ "code": match.group(2)
128
+ }
129
+ for match in code_pattern.finditer(translated_text)
130
+ ]
131
+
132
+ logger.info(
133
+ "Translation completed successfully",
134
+ original_length=len(text),
135
+ translated_length=len(translated_text),
136
+ tokens_used=tokens_used,
137
+ has_code_blocks=has_code_blocks
138
+ )
139
+
140
+ return {
141
+ "translated_text": translated_text.strip(),
142
+ "original_text": text,
143
+ "tokens_used": tokens_used,
144
+ "model": model_used,
145
+ "confidence_score": 0.95, # Agent typically produces high-quality translations
146
+ "has_code_blocks": has_code_blocks,
147
+ "code_blocks": code_blocks,
148
+ "context_used": context is not None,
149
+ "processing_time_ms": 0, # Could track this if needed
150
+ "cache_hit": False
151
+ }
152
+
153
+ except Exception as e:
154
+ logger.error(
155
+ "Agent translation failed",
156
+ error=str(e),
157
+ error_type=type(e).__name__,
158
+ text_length=len(text)
159
+ )
160
+
161
+ # Re-raise with context
162
+ raise Exception(f"Translation failed: {str(e)}") from e
163
+
164
+ def _build_translation_prompt(
165
+ self,
166
+ text: str,
167
+ context: Optional[TranslationContext]
168
+ ) -> str:
169
+ """Build the translation prompt with context."""
170
+ prompt_parts = ["Translate the following text from English to Urdu:"]
171
+
172
+ # Add context information if provided
173
+ if context:
174
+ context_parts = []
175
+ if context.document_type:
176
+ context_parts.append(f"Document Type: {context.document_type}")
177
+ if context.technical_domain:
178
+ context_parts.append(f"Technical Domain: {context.technical_domain}")
179
+ if context.target_audience:
180
+ context_parts.append(f"Target Audience: {context.target_audience}")
181
+
182
+ if context_parts:
183
+ prompt_parts.append("\nContext:")
184
+ prompt_parts.append("\n".join(f"- {part}" for part in context_parts))
185
+
186
+ # Add the text to translate
187
+ prompt_parts.append(f"\n\nText to translate:\n{text}")
188
+
189
+ # Add instruction to translate only the content
190
+ prompt_parts.append("\n\nTranslate only the text above.")
191
+
192
+ return "\n".join(prompt_parts)
193
+
194
+
195
+ # Factory function
196
+ def create_translation_agent(model: str = "gemini-2.0-flash-lite") -> OpenAITranslationAgent:
197
+ """Create a translation agent instance."""
198
+ return OpenAITranslationAgent(model=model)
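And a matching sketch for this simplified agent (again assuming a configured Gemini API key behind `get_gemini_client`):

```python
import asyncio

from src.services.openai_translation.translation_agent import (
    TranslationContext,
    create_translation_agent,
)


async def main() -> None:
    agent = create_translation_agent()
    result = await agent.translate_with_agent(
        "Neural networks are trained on large datasets.",
        context=TranslationContext(technical_domain="ai"),
    )
    print(result["translated_text"])


asyncio.run(main())
```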