GitHub Actions committed
Commit 457b685 · 1 Parent(s): 84b0fa3

Deploy backend from GitHub Actions


🚀 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <[email protected]>

This view is limited to 50 files because the commit contains too many changes.

Files changed (50)
  1. .env.example +159 -15
  2. .gitignore +3 -1
  3. OPENAI_AGENTS_FIX.md +348 -0
  4. alembic/env.py +5 -1
  5. alembic/versions/001_reader_features_tables.py +179 -0
  6. alembic/versions/004_add_translation_tables.py +114 -0
  7. alembic/versions/005_add_openai_translation_tables.py +295 -0
  8. create_translation_tables.py +47 -0
  9. fix_async_client.py +44 -0
  10. fix_jsonb.py +28 -0
  11. fix_translation_endpoint.py +45 -0
  12. fix_user_id_issue.py +34 -0
  13. fix_user_model.py +53 -0
  14. main.py +58 -3
  15. migrate_user_id.py +63 -0
  16. migrate_user_id_fixed.py +53 -0
  17. migration_summary_translation_tables.md +124 -0
  18. migrations/versions/001_create_openai_translation_tables.py +297 -0
  19. pyproject.toml +7 -1
  20. requirements.txt +7 -0
  21. src/api/v1/progress.py +450 -0
  22. src/api/v1/reader_features.py +94 -0
  23. src/api/v1/translation.py +336 -0
  24. src/config/logging_config.py +442 -0
  25. src/config/translation_config.py +432 -0
  26. src/database/base.py +1 -1
  27. src/middleware/auth.py +302 -0
  28. src/middleware/cors.py +356 -0
  29. src/middleware/rate_limit.py +385 -0
  30. src/models/__init__.py +29 -0
  31. src/models/auth.py +3 -0
  32. src/models/base.py +26 -0
  33. src/models/bookmark.py +53 -0
  34. src/models/chat.py +1 -1
  35. src/models/content_localization.py +50 -0
  36. src/models/personalization.py +64 -0
  37. src/models/reading_progress.py +33 -0
  38. src/models/search_index.py +30 -0
  39. src/models/translation_openai.py +512 -0
  40. src/models/user_preferences.py +54 -0
  41. src/services/cache_examples.py +231 -0
  42. src/services/cache_service.py +690 -0
  43. src/services/code_block_handler.py +630 -0
  44. src/services/content_reconstructor.py +471 -0
  45. src/services/html_parser.py +565 -0
  46. src/services/openai_translation/__init__.py +10 -0
  47. src/services/openai_translation/client.py +59 -0
  48. src/services/openai_translation/openai_agent.py +533 -0
  49. src/services/openai_translation/service.py +855 -0
  50. src/services/openai_translation/translation_agent.py +198 -0
.env.example CHANGED
@@ -1,4 +1,13 @@
+# ============================================
+# Environment Configuration
+# ============================================
+# Environment: development, testing, staging, production
+ENVIRONMENT=development
+DEBUG=true
+
+# ============================================
 # Google OAuth Configuration
+# ============================================
 GOOGLE_CLIENT_ID=your-google-client-id
 GOOGLE_CLIENT_SECRET=your-google-client-secret
 # For production:
@@ -7,47 +16,182 @@ GOOGLE_CLIENT_SECRET=your-google-client-secret
 AUTH_REDIRECT_URI=http://localhost:3000/auth/google/callback
 FRONTEND_URL=http://localhost:3000
 
+# ============================================
 # JWT Configuration
+# ============================================
 JWT_SECRET_KEY=your-super-secret-jwt-key-at-least-32-characters-long
 JWT_ALGORITHM=HS256
 JWT_EXPIRE_MINUTES=10080 # 7 days
 
+# ============================================
 # Database Configuration
+# ============================================
 DATABASE_URL=sqlite:///./database/auth.db
+DB_POOL_SIZE=5
+DB_MAX_OVERFLOW=10
+DB_POOL_TIMEOUT=30
+DB_POOL_RECYCLE=3600
+DB_AUTO_MIGRATE=true
+
+# ============================================
+# Gemini API Configuration (for OpenAI SDK)
+# ============================================
+GEMINI_API_KEY=your-gemini-api-key
+GEMINI_BASE_URL=https://generativelanguage.googleapis.com/v1beta/openai/
+GEMINI_MODEL=gemini-2.0-flash-lite
+GEMINI_TIMEOUT=60
+GEMINI_MAX_RETRIES=3
+GEMINI_RETRY_DELAY=1.0
+GEMINI_HTTP2=true
+GEMINI_RPM=60
+GEMINI_RPH=1000
+
+# ============================================
+# OpenAI Agents SDK Configuration
+# ============================================
+OPENAI_AGENTS_ENABLED=true
+OPENAI_AGENTS_TRACING=false
+OPENAI_AGENTS_VERBOSE=false
+AGENT_DEFAULT_TEMPERATURE=0.3
+AGENT_MAX_TOKENS=2048
+AGENT_MAX_TURNS=5
+AGENT_HTML_TOOL=true
+AGENT_CODE_TOOL=true
+AGENT_QUALITY_TOOL=true
+AGENT_QUALITY_CHECK=true
+AGENT_CONFIDENCE_THRESHOLD=0.8
+
-# OpenAI Configuration (already existing)
-OPENAI_API_KEY=your-openai-api-key
-OPENAI_MODEL=gpt-4.1-nano
-OPENAI_EMBEDDING_MODEL=text-embedding-3-small
+# ============================================
+# Legacy OpenAI Configuration (for RAG)
+# ============================================
+OPENAI_API_KEY=your-openai-api-key
+OPENAI_MODEL=gpt-4.1-nano
+OPENAI_EMBEDDING_MODEL=text-embedding-3-small
+
+# ============================================
+# Cache Configuration
+# ============================================
+CACHE_BACKEND=memory
+CACHE_DEFAULT_TTL=168
+CACHE_HIGH_QUALITY_TTL=720
+CACHE_LOW_QUALITY_TTL=24
+REDIS_URL=redis://localhost:6379
+REDIS_PREFIX=translation:
+REDIS_MAX_CONNECTIONS=10
+CACHE_MEMORY_MAX_SIZE=1000
+CACHE_CLEANUP_INTERVAL=3600
+
-# Rate Limiting
-RATE_LIMIT_REQUESTS=60
-RATE_LIMIT_WINDOW=60
-
+# ============================================
+# Rate Limiting Configuration
+# ============================================
+RATE_LIMIT_ENABLED=true
+RATE_LIMIT_RPM=60
+RATE_LIMIT_RPH=1000
+RATE_LIMIT_RPD=10000
+TRANSLATION_RPM=10
+TRANSLATION_RPH=500
+RATE_LIMIT_BLOCK_DURATION=3600
+RATE_LIMIT_WARNING_THRESHOLD=0.8
+RATE_LIMIT_REDIS=false
 
+# ============================================
 # API Configuration
+# ============================================
 API_HOST=0.0.0.0
 API_PORT=7860
 LOG_LEVEL=INFO
 
+# ============================================
 # CORS Configuration
+# ============================================
-ALLOWED_ORIGINS=http://localhost:3000,http://localhost:8080,https://mrowaisabdullah.github.io,https://huggingface.co
+CORS_ORIGINS=http://localhost:3000,http://localhost:8080,https://mrowaisabdullah.github.io,https://huggingface.co
+CORS_METHODS=GET,POST,PUT,DELETE
+CORS_HEADERS=*
 
+# ============================================
+# Security Configuration
+# ============================================
+SECURITY_REQUIRE_API_KEY=false
+SECURITY_API_KEY_HEADER=X-API-Key
+SECURITY_MAX_TEXT_LENGTH=100000
+SECURITY_MAX_CHUNKS=100
+SECURITY_CONTENT_FILTER=true
+SECURITY_BLOCKED_PATTERNS=
+SECURITY_IP_WHITELIST=
+SECURITY_IP_BLACKLIST=
+
+# ============================================
+# Logging Configuration
+# ============================================
+LOG_FILE_ENABLED=true
+LOG_FILE_PATH=logs/translation.log
+LOG_FILE_ROTATION=1 day
+LOG_FILE_RETENTION=30 days
+LOG_MAX_FILE_SIZE=100 MB
+LOG_JSON_FORMAT=false
+LOG_INCLUDE_REQUEST_ID=true
+LOG_FILTER_SENSITIVE=true
+SENSITIVE_FIELDS=api_key,password,token,authorization
 
+# ============================================
+# Monitoring Configuration
+# ============================================
+MONITORING_ENABLED=true
+METRICS_ENDPOINT=/metrics
+METRICS_PORT=9090
+HEALTH_ENDPOINT=/health
+HEALTH_DETAILED=true
+TRACK_PERFORMANCE=true
+SLOW_QUERY_THRESHOLD=1000
+TRACK_ERRORS=true
+ERROR_SAMPLE_RATE=1.0
+
+# External Monitoring
+SENTRY_DSN=
+PROMETHEUS_GATEWAY=
+
-# Qdrant Configuration (already existing)
+# ============================================
+# Qdrant Configuration (for RAG)
+# ============================================
 QDRANT_URL=http://localhost:6333
 QDRANT_API_KEY=your-qdrant-api-key-if-needed
 
-# Content Configuration (already existing)
+# ============================================
+# Content Configuration (for RAG)
+# ============================================
 BOOK_CONTENT_PATH=./book_content
 CHUNK_SIZE=1000
 CHUNK_OVERLAP=200
 
-# Conversation Context (already existing)
+# ============================================
+# Conversation Context (for RAG)
+# ============================================
 MAX_CONTEXT_MESSAGES=3
 CONTEXT_WINDOW_SIZE=4000
 
-# Ingestion Configuration (already existing)
+# ============================================
+# Ingestion Configuration (for RAG)
+# ============================================
 BATCH_SIZE=100
 MAX_CONCURRENT_REQUESTS=10
 
-# Health Monitoring (already existing)
-HEALTH_CHECK_INTERVAL=30
+# ============================================
+# Health Monitoring
+# ============================================
+HEALTH_CHECK_INTERVAL=30
+
+# ============================================
+# Proxy Configuration (Optional)
+# ============================================
+HTTP_PROXY=
+HTTPS_PROXY=
+
+# ============================================
+# Feature Flags
+# ============================================
+FEATURE_STREAMING=true
+FEATURE_QUALITY_CHECK=true
+FEATURE_CHUNKING=true
+FEATURE_CODE_PRESERVATION=true
+FEATURE_HTML_PRESERVATION=true
+FEATURE_BATCH_TRANSLATION=true
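A typed settings class makes variables like these easier to consume. The following is a minimal sketch assuming the pydantic-settings package; the project's actual loader is `src/config/translation_config.py` and may differ:

```python
# Hypothetical loader for a slice of the variables above (pydantic-settings v2 API);
# not the repository's verified configuration code.
from pydantic_settings import BaseSettings, SettingsConfigDict


class GeminiSettings(BaseSettings):
    model_config = SettingsConfigDict(env_file=".env", env_prefix="GEMINI_", extra="ignore")

    api_key: str = "your-gemini-api-key"  # GEMINI_API_KEY
    base_url: str = "https://generativelanguage.googleapis.com/v1beta/openai/"
    model: str = "gemini-2.0-flash-lite"
    timeout: int = 60
    max_retries: int = 3


settings = GeminiSettings()  # reads .env, then the process environment
```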
.gitignore CHANGED
@@ -149,4 +149,6 @@ Thumbs.db
 
 # Test files
 test_output/
-test_reports/
+test_reports/
+
+.playwright-mcp
OPENAI_AGENTS_FIX.md ADDED
@@ -0,0 +1,348 @@
# OpenAI Agents SDK Implementation Fix for Gemini API Quota Errors

## Problem Summary

The translation system was experiencing Gemini API quota exceeded errors (HTTP 429) due to several issues with the OpenAI Agents SDK implementation:

1. **Incorrect Package Name**: The code was importing from the `agents` package instead of the correct `openai-agents-sdk`
2. **Not Actually Using the OpenAI Agents SDK**: Despite claiming to use the SDK, the implementation was calling the OpenAI client directly
3. **Insufficient Rate Limit Handling**: Basic error handling that did not properly implement exponential backoff
4. **Missing Per-User Rate Limiting**: No per-user or per-IP rate limiting to prevent quota exhaustion

## Solution Implementation

### 1. Fixed Package Dependencies

Updated `pyproject.toml`:

```toml
# Before
"openai-agents>=0.1.0"

# After
"openai-agents-sdk>=0.2.9"
```

### 2. Created a Proper OpenAI Agents SDK Implementation

**File**: `src/services/openai_translation/openai_agent.py`

- Correct imports from `openai_agents_sdk`
- Proper agent implementation with tools
- Enhanced error handling for rate limits
- Exponential backoff with jitter
- Detailed error reporting

Key features:

```python
from openai_agents_sdk import Agent, Runner, function_tool, RunContextWrapper
from openai_agents_sdk.errors import RateLimitError as OpenAIRateLimitError
```

### 3. Enhanced Error Handling

**File**: `src/services/openai_translation/enhanced_service.py`

- Per-user rate limiting
- Exponential backoff implementation
- Detailed rate limit error responses
- Retry attempt tracking
- Backoff time accumulation

Example retry logic:

```python
for attempt in range(request.max_retries + 1):
    try:
        # API call
        result = await api_call()
        return result
    except RateLimitError as e:
        if attempt < request.max_retries:
            delay = min(
                request.retry_delay * (request.backoff_factor ** attempt),
                request.max_retry_delay
            )
            # Add jitter
            delay *= (0.5 + random.random() * 0.5)
            await asyncio.sleep(delay)
            continue
        else:
            raise
```

### 4. Enhanced API Endpoints

**File**: `src/api/v1/enhanced_translation.py`

- Proper HTTP 429 status codes
- Retry-After headers
- Detailed rate limit information
- Per-endpoint rate limiting

Example response:

```json
{
  "error": "RATE_LIMIT_EXCEEDED",
  "message": "User rate limit exceeded. Please wait 45.2 seconds.",
  "retry_after": 45.2,
  "rate_limit_info": {
    "retry_after": 45.2,
    "limit_type": "quota_exceeded",
    "user_id": "user123"
  },
  "timestamp": 1703847123.45
}
```

### 5. Rate Limiting Middleware

**File**: `src/middleware/rate_limit.py`

- Per-IP rate limiting
- Per-user rate limiting (if authenticated)
- Sliding window algorithm (see the sketch below)
- Redis support for distributed systems
- In-memory fallback

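As a rough illustration of the sliding-window idea, an in-memory limiter can keep a deque of request timestamps per key and evict entries older than the window. This is a minimal sketch, not the actual `src/middleware/rate_limit.py` code:

```python
# Minimal sliding-window limiter sketch; the real middleware also supports
# Redis, per-user keys, and block durations.
import time
from collections import defaultdict, deque


class SlidingWindowLimiter:
    def __init__(self, max_requests: int, window_seconds: float):
        self.max_requests = max_requests
        self.window = window_seconds
        self.hits: dict[str, deque] = defaultdict(deque)

    def allow(self, key: str) -> bool:
        """Record a hit for `key` (an IP or user id) and report whether it is allowed."""
        now = time.monotonic()
        q = self.hits[key]
        while q and now - q[0] > self.window:
            q.popleft()  # evict timestamps that fell out of the window
        if len(q) >= self.max_requests:
            return False
        q.append(now)
        return True


limiter = SlidingWindowLimiter(max_requests=10, window_seconds=60)  # mirrors TRANSLATION_RPM=10
```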
## How to Use the Enhanced System

### 1. Update Your Environment

```bash
cd backend
pip install -e .
```

### 2. Update Your `.env` File

Make sure you have:

```env
GEMINI_API_KEY=your_gemini_api_key_here
GEMINI_BASE_URL=https://generativelanguage.googleapis.com/v1beta/openai/
GEMINI_MODEL=gemini-2.0-flash-lite
```

### 3. Add Rate Limiting to Your App

In your FastAPI app initialization:

```python
from src.middleware.rate_limit import TranslationRateLimitMiddleware

app.add_middleware(TranslationRateLimitMiddleware)
```

### 4. Use the Enhanced Endpoints

The enhanced router serves the same path as before, so clients continue to call:

```http
POST /translation/translate
```

The enhanced handler provides better error handling and rate limit information.

## Rate Limit Configuration

Default limits:

- **Per IP**: 60 requests per minute, 1000 per hour
- **Per User (if authenticated)**: 10 translations per minute, 500 per hour
- **Translation Endpoints**: Stricter limits (10/min, 500/hour)

These can be configured via environment variables or in the middleware initialization.

## Monitoring and Metrics

The enhanced system provides detailed metrics:

```json
{
  "period": "24h",
  "total_requests": 1250,
  "successful_requests": 1180,
  "failed_requests": 45,
  "rate_limited_requests": 25,
  "cache_hit_rate": 0.35,
  "avg_processing_time_ms": 2340,
  "total_cost_usd": 2.45,
  "active_users": 15,
  "user_rate_limits": {
    "user123": {
      "requests_last_minute": 3,
      "last_reset": 1703847123.45
    }
  }
}
```

## Best Practices

1. **Handle Rate Limit Errors Properly**

   ```python
   try:
       result = await translate_text(text)
   except RateLimitError as e:
       print(f"Rate limited. Retry after {e.retry_after} seconds")
       await asyncio.sleep(e.retry_after)
       # Retry with backoff
   ```

2. **Use Caching When Possible**

   - The system automatically caches successful translations
   - Cache hits don't count against rate limits
   - Provide `page_url` for better cache keys (see the cache-key sketch after this list)

3. **Batch Large Translations**

   - The system automatically chunks large texts
   - Configure `chunk_size` and `max_chunks` appropriately
   - Monitor processing time to optimize chunk size

4. **Monitor Your Usage**

   - Use the `/translation/metrics` endpoint (admin only)
   - Watch for rate limit errors in logs
   - Adjust retry settings based on your quota

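How `page_url` improves cache keys: a plausible scheme hashes the text for the primary key and appends a URL hash so identical text on different pages can be tracked separately. This is an assumption for illustration; the exact format in `src/services/cache_service.py` may differ:

```python
# Hypothetical cache-key derivation; field order and prefixes are assumptions,
# not the project's verified scheme.
import hashlib


def make_cache_key(text: str, source_lang: str, target_lang: str,
                   page_url: str | None = None) -> str:
    content_hash = hashlib.sha256(text.encode("utf-8")).hexdigest()  # 64 hex chars
    key = f"translation:{source_lang}:{target_lang}:{content_hash}"
    if page_url:
        key += ":" + hashlib.md5(page_url.encode("utf-8")).hexdigest()  # 32 hex chars
    return key


print(make_cache_key("Hello world", "en", "ur", page_url="https://example.com/ch1"))
```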
## Testing the Fix

To test the rate limiting:

```python
import asyncio
import httpx

async def test_rate_limit():
    async with httpx.AsyncClient() as client:
        # Make rapid requests to trigger rate limit
        for i in range(15):
            response = await client.post(
                "http://localhost:8000/translation/translate",
                json={
                    "text": f"Test translation {i}",
                    "source_language": "en",
                    "target_language": "ur"
                }
            )
            print(f"Request {i}: Status {response.status_code}")
            if response.status_code == 429:
                retry_after = response.headers.get("Retry-After")
                print(f"Rate limited. Retry after {retry_after} seconds")
                break

asyncio.run(test_rate_limit())
```

## Troubleshooting

### Still Getting 429 Errors?

1. **Check Your Gemini API Quota**

   - Visit Google AI Studio
   - Verify your daily/monthly quota
   - Request a quota increase if needed

2. **Implement Client-Side Rate Limiting**

   ```python
   import asyncio
   from asyncio import Semaphore

   # Limit concurrent requests
   semaphore = Semaphore(5)  # Max 5 concurrent requests

   async def translate_with_limit(text):
       async with semaphore:
           return await translate_text(text)
   ```

3. **Use Backoff in Your Client**

   ```python
   import backoff

   @backoff.on_exception(backoff.expo, RateLimitError, max_tries=3)
   async def safe_translate(text):
       return await translate_text(text)
   ```

### Performance Issues?

1. **Reduce Chunk Size**

   - Smaller chunks process faster
   - Less chance of a timeout
   - Better error recovery

2. **Enable Caching**

   - Set `page_url` for content-based caching
   - Cache hits are instant
   - Reduces API usage

3. **Monitor Memory Usage**

   - Large translations use more memory
   - Consider streaming for very large texts
   - Implement pagination for batch jobs

## Migration Guide

To migrate from the old implementation:

1. **Update Dependencies**

   ```bash
   pip install "openai-agents-sdk>=0.2.9"
   ```

2. **Update Imports**

   ```python
   # Old
   from agents import Agent, Runner

   # New
   from openai_agents_sdk import Agent, Runner
   ```

3. **Update Error Handling**

   ```python
   # Old
   except Exception as e:
       if "429" in str(e):
           ...  # Handle rate limit

   # New
   except RateLimitError as e:
       retry_after = e.retry_after
       # Handle with proper backoff
   ```

4. **Add Rate Limiting**

   ```python
   from src.middleware.rate_limit import TranslationRateLimitMiddleware

   app.add_middleware(TranslationRateLimitMiddleware)
   ```

## Conclusion

The enhanced OpenAI Agents SDK implementation provides:

- ✅ Correct package usage and imports
- ✅ Proper agent implementation with tools
- ✅ Robust rate limit error handling
- ✅ Exponential backoff with jitter
- ✅ Per-user and per-IP rate limiting
- ✅ Detailed error reporting and metrics
- ✅ Caching to reduce API usage
- ✅ Monitoring and health checks

This should significantly reduce Gemini API quota errors and provide a better user experience with proper error handling and retry logic.
alembic/env.py CHANGED
@@ -10,7 +10,11 @@ sys.path.append(os.path.dirname(os.path.dirname(__file__)))
 
 # Import models
 from src.models.auth import Base
-from src.models.chat import Base
+# Import other models to register them with the Base metadata
+import src.models.chat
+import src.models.translation
+import src.models.personalization
+import src.models.content_localization
 
 # this is the Alembic Config object, which provides
 # access to the values within the .ini file in use.
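Importing the model modules is what populates `Base.metadata`; Alembic then diffs that metadata against the database during autogenerate. A minimal sketch of the usual wiring (the rest of this project's `env.py` is not shown in the diff, so this is the standard pattern rather than the repo's exact code):

```python
# Standard Alembic pattern: point autogenerate at the shared metadata.
from src.models.auth import Base

target_metadata = Base.metadata  # used by `alembic revision --autogenerate`
```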
alembic/versions/001_reader_features_tables.py ADDED
@@ -0,0 +1,179 @@
"""Create tables for reader experience features

Revision ID: 003_reader_features_tables
Revises: 002_add_onboarding_tables
Create Date: 2025-01-09

"""
from alembic import op
import sqlalchemy as sa

# revision identifiers
revision = '003_reader_features_tables'
down_revision = '002_add_onboarding_tables'
branch_labels = None
depends_on = None


def upgrade():
    # Create reading_progress table
    op.create_table('reading_progress',
        sa.Column('id', sa.String(), nullable=False),
        sa.Column('user_id', sa.String(), nullable=False),
        sa.Column('chapter_id', sa.String(), nullable=False),
        sa.Column('section_id', sa.String(), nullable=False),
        sa.Column('position', sa.Float(), nullable=False),
        sa.Column('completed', sa.Boolean(), nullable=False, server_default='false'),
        sa.Column('time_spent', sa.Integer(), nullable=False, server_default='0'),
        sa.Column('last_accessed', sa.DateTime(), nullable=False),
        sa.Column('created_at', sa.DateTime(), nullable=False),
        sa.Column('updated_at', sa.DateTime(), nullable=False),
        sa.ForeignKeyConstraint(['user_id'], ['users.id'], ondelete='CASCADE'),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('user_id', 'chapter_id', 'section_id')
    )
    op.create_index('idx_reading_progress_user_chapter', 'reading_progress', ['user_id', 'chapter_id'])
    op.create_index('idx_reading_progress_last_accessed', 'reading_progress', ['last_accessed'])

    # Create bookmarks table
    op.create_table('bookmarks',
        sa.Column('id', sa.String(), nullable=False),
        sa.Column('user_id', sa.String(), nullable=False),
        sa.Column('chapter_id', sa.String(), nullable=False),
        sa.Column('section_id', sa.String(), nullable=True),
        sa.Column('page_url', sa.String(), nullable=False),
        sa.Column('page_title', sa.String(length=255), nullable=False),
        sa.Column('snippet', sa.String(), nullable=True),
        sa.Column('note', sa.String(length=1000), nullable=True),
        sa.Column('is_private', sa.Boolean(), nullable=False, server_default='true'),
        sa.Column('created_at', sa.DateTime(), nullable=False),
        sa.Column('updated_at', sa.DateTime(), nullable=False),
        sa.ForeignKeyConstraint(['user_id'], ['users.id'], ondelete='CASCADE'),
        sa.PrimaryKeyConstraint('id')
    )
    op.create_index('idx_bookmarks_user_created', 'bookmarks', ['user_id', 'created_at'])
    op.create_index('idx_bookmarks_chapter', 'bookmarks', ['chapter_id'])

    # Create bookmark_tags table
    op.create_table('bookmark_tags',
        sa.Column('id', sa.String(), nullable=False),
        sa.Column('bookmark_id', sa.String(), nullable=False),
        sa.Column('tag', sa.String(length=50), nullable=False),
        sa.Column('created_at', sa.DateTime(), nullable=False),
        sa.ForeignKeyConstraint(['bookmark_id'], ['bookmarks.id'], ondelete='CASCADE'),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('bookmark_id', 'tag')
    )
    op.create_index('idx_bookmark_tags_tag', 'bookmark_tags', ['tag'])

    # Create user_preferences table
    op.create_table('user_preferences',
        sa.Column('id', sa.String(), nullable=False),
        sa.Column('user_id', sa.String(), nullable=False),
        sa.Column('language', sa.String(), nullable=False),
        sa.Column('reading_pace', sa.String(), nullable=False),
        sa.Column('preferred_depth', sa.String(), nullable=False),
        sa.Column('show_code_examples', sa.Boolean(), nullable=False, server_default='true'),
        sa.Column('adaptive_difficulty', sa.Boolean(), nullable=False, server_default='false'),
        sa.Column('theme', sa.String(), nullable=False),
        sa.Column('font_size', sa.Integer(), nullable=False, server_default='16'),
        sa.Column('line_height', sa.Float(), nullable=False, server_default='1.5'),
        sa.Column('created_at', sa.DateTime(), nullable=False),
        sa.Column('updated_at', sa.DateTime(), nullable=False),
        sa.ForeignKeyConstraint(['user_id'], ['users.id'], ondelete='CASCADE'),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('user_id')
    )

    # Create user_custom_notes table
    op.create_table('user_custom_notes',
        sa.Column('id', sa.String(), nullable=False),
        sa.Column('user_preference_id', sa.String(), nullable=False),
        sa.Column('key', sa.String(), nullable=False),
        sa.Column('value', sa.String(), nullable=False),
        sa.Column('created_at', sa.DateTime(), nullable=False),
        sa.Column('updated_at', sa.DateTime(), nullable=False),
        sa.ForeignKeyConstraint(['user_preference_id'], ['user_preferences.id'], ondelete='CASCADE'),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('user_preference_id', 'key')
    )

    # Create content_localization table
    op.create_table('content_localization',
        sa.Column('id', sa.String(), nullable=False),
        sa.Column('content_id', sa.String(), nullable=False),
        sa.Column('language', sa.String(), nullable=False),
        sa.Column('title', sa.String(length=255), nullable=False),
        sa.Column('content', sa.String(), nullable=False),
        sa.Column('word_count', sa.Integer(), nullable=False, server_default='0'),
        sa.Column('reading_time_minutes', sa.Integer(), nullable=False, server_default='0'),
        sa.Column('last_updated', sa.DateTime(), nullable=False),
        sa.Column('translator', sa.String(), nullable=True),
        sa.Column('reviewed', sa.Boolean(), nullable=False, server_default='false'),
        sa.Column('created_at', sa.DateTime(), nullable=False),
        sa.Column('updated_at', sa.DateTime(), nullable=False),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('content_id', 'language')
    )
    op.create_index('idx_content_localization_language', 'content_localization', ['language'])
    op.create_index('idx_content_localization_content', 'content_localization', ['content_id'])

    # Create search_index table
    op.create_table('search_index',
        sa.Column('id', sa.String(), nullable=False),
        sa.Column('content_id', sa.String(), nullable=False),
        sa.Column('language', sa.String(), nullable=False),
        sa.Column('content_type', sa.String(), nullable=False),
        sa.Column('title', sa.String(), nullable=False),
        sa.Column('content', sa.String(), nullable=False),
        sa.Column('chapter_id', sa.String(), nullable=False),
        sa.Column('section_id', sa.String(), nullable=True),
        sa.Column('rank', sa.Float(), nullable=False, server_default='0.5'),
        sa.Column('indexed_at', sa.DateTime(), nullable=False),
        sa.PrimaryKeyConstraint('id')
    )
    op.create_index('idx_search_index_language_rank', 'search_index', ['language', 'rank'])
    op.create_index('idx_search_index_chapter', 'search_index', ['chapter_id'])

    # Create FTS virtual table for search
    op.execute("""
        CREATE VIRTUAL TABLE search_index_fts USING fts5(
            title,
            content,
            keywords,
            content=search_index
        )
    """)

    # Create FTS triggers
    op.execute("""
        CREATE TRIGGER search_index_ai AFTER INSERT ON search_index BEGIN
            INSERT INTO search_index_fts(rowid, title, content, keywords)
            VALUES (new.id, new.title, new.content, new.title || ' ' || new.content);
        END
    """)

    op.execute("""
        CREATE TRIGGER search_index_ad AFTER DELETE ON search_index BEGIN
            INSERT INTO search_index_fts(search_index_fts, rowid, title, content, keywords)
            VALUES ('delete', old.id, old.title, old.content, NULL);
        END
    """)

    op.execute("""
        CREATE TRIGGER search_index_au AFTER UPDATE ON search_index BEGIN
            DELETE FROM search_index_fts WHERE rowid = old.id;
            INSERT INTO search_index_fts(rowid, title, content, keywords)
            VALUES (new.id, new.title, new.content, new.title || ' ' || new.content);
        END
    """)


def downgrade():
    # Drop tables in reverse order
    op.drop_table('search_index')
    op.execute('DROP TABLE IF EXISTS search_index_fts')
    op.drop_table('content_localization')
    op.drop_table('user_custom_notes')
    op.drop_table('user_preferences')
    op.drop_table('bookmark_tags')
    op.drop_table('bookmarks')
    op.drop_table('reading_progress')
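For context on how the FTS5 virtual table created above is typically queried, here is an illustrative lookup; the project's real search path lives in its service layer, and the database file path is an assumption based on `DATABASE_URL`:

```python
# Illustrative FTS5 full-text query against the search_index_fts table.
import sqlite3

conn = sqlite3.connect("database/auth.db")  # path assumed from DATABASE_URL
rows = conn.execute(
    "SELECT rowid, title FROM search_index_fts WHERE search_index_fts MATCH ? LIMIT 10",
    ("agents",),
).fetchall()
for rowid, title in rows:
    print(rowid, title)
```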
alembic/versions/004_add_translation_tables.py ADDED
@@ -0,0 +1,114 @@
"""Add translation tables and personalization features

Revision ID: 004_add_translation_tables
Revises: 003_reader_features_tables
Create Date: 2025-01-10

"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import sqlite

# revision identifiers
revision = '004_add_translation_tables'
down_revision = '003_reader_features_tables'
branch_labels = None
depends_on = None


def upgrade():
    # Create translations table
    op.create_table('translations',
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('content_hash', sa.String(length=64), nullable=False),
        sa.Column('source_language', sa.String(length=10), nullable=False),
        sa.Column('target_language', sa.String(length=10), nullable=False),
        sa.Column('original_text', sa.Text(), nullable=False),
        sa.Column('translated_text', sa.Text(), nullable=False),
        sa.Column('created_at', sa.DateTime(), nullable=False),
        sa.Column('updated_at', sa.DateTime(), nullable=False),
        sa.Column('translation_model', sa.String(length=50), nullable=False),
        sa.Column('character_count', sa.Integer(), nullable=False),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('content_hash')
    )
    op.create_index('idx_content_lookup', 'translations', ['content_hash', 'source_language', 'target_language'], unique=False)
    op.create_index(op.f('ix_translations_content_hash'), 'translations', ['content_hash'], unique=True)

    # Create translation_feedback table
    op.create_table('translation_feedback',
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('translation_id', sa.Integer(), nullable=False),
        sa.Column('user_id', sa.String(length=36), nullable=False),
        sa.Column('rating', sa.SmallInteger(), nullable=False),
        sa.Column('comment', sa.Text(), nullable=True),
        sa.Column('created_at', sa.DateTime(), nullable=False),
        sa.ForeignKeyConstraint(['translation_id'], ['translations.id']),
        sa.PrimaryKeyConstraint('id'),
        sa.CheckConstraint('rating IN (-1, 1)', name='check_rating_range')
    )
    op.create_index('idx_user_translation', 'translation_feedback', ['user_id', 'translation_id'], unique=True)

    # Create personalization_profiles table
    op.create_table('personalization_profiles',
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('user_id', sa.String(length=36), nullable=False),
        sa.Column('reading_level', sa.String(length=20), nullable=True),
        sa.Column('preferred_language', sa.String(length=10), nullable=True),
        sa.Column('focus_areas', sa.JSON(), nullable=True),
        sa.Column('learning_style', sa.String(length=20), nullable=True),
        sa.Column('enable_transliteration', sa.Boolean(), nullable=True),
        sa.Column('technical_term_handling', sa.String(length=20), nullable=True),
        sa.Column('font_size', sa.Integer(), nullable=True),
        sa.Column('focus_mode_preferences', sa.JSON(), nullable=True),
        sa.Column('created_at', sa.DateTime(), nullable=False),
        sa.Column('updated_at', sa.DateTime(), nullable=False),
        sa.Column('last_active', sa.DateTime(), nullable=False),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('user_id')
    )
    op.create_index(op.f('ix_personalization_profiles_user_id'), 'personalization_profiles', ['user_id'], unique=False)

    # Check if content_localization table exists before creating
    conn = op.get_bind()
    inspector = sa.inspect(conn)
    tables = inspector.get_table_names()

    if 'content_localization' not in tables:
        # Create content_localization table
        op.create_table('content_localization',
            sa.Column('id', sa.Integer(), nullable=False),
            sa.Column('content_url', sa.String(length=500), nullable=False),
            sa.Column('content_hash', sa.String(length=64), nullable=False),
            sa.Column('is_translated', sa.Boolean(), nullable=True),
            sa.Column('last_translation_date', sa.DateTime(), nullable=True),
            sa.Column('translation_cache_key', sa.String(length=64), nullable=True),
            sa.Column('word_count', sa.Integer(), nullable=True),
            sa.Column('character_count', sa.Integer(), nullable=True),
            sa.Column('has_code_blocks', sa.Boolean(), nullable=True),
            sa.Column('detected_languages', sa.JSON(), nullable=True),
            sa.Column('chunk_count', sa.Integer(), nullable=True),
            sa.Column('processing_status', sa.String(length=20), nullable=True),
            sa.Column('created_at', sa.DateTime(), nullable=False),
            sa.Column('updated_at', sa.DateTime(), nullable=False),
            sa.PrimaryKeyConstraint('id')
        )
        op.create_index(op.f('ix_content_localization_content_hash'), 'content_localization', ['content_hash'], unique=False)
        op.create_index(op.f('ix_content_localization_content_url'), 'content_localization', ['content_url'], unique=False)


def downgrade():
    # Drop tables in reverse order
    op.drop_index(op.f('ix_content_localization_content_url'), table_name='content_localization')
    op.drop_index(op.f('ix_content_localization_content_hash'), table_name='content_localization')
    op.drop_table('content_localization')

    op.drop_index(op.f('ix_personalization_profiles_user_id'), table_name='personalization_profiles')
    op.drop_table('personalization_profiles')

    op.drop_index('idx_user_translation', table_name='translation_feedback')
    op.drop_table('translation_feedback')

    op.drop_index(op.f('ix_translations_content_hash'), table_name='translations')
    op.drop_index('idx_content_lookup', table_name='translations')
    op.drop_table('translations')
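These revisions are applied in order by Alembic. Besides the CLI (`alembic upgrade head`), they can be run programmatically; this sketch assumes `alembic.ini` sits in the backend root next to the `alembic/` directory:

```python
# Programmatic equivalent of `alembic upgrade head`.
from alembic import command
from alembic.config import Config

cfg = Config("alembic.ini")
command.upgrade(cfg, "head")
```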
alembic/versions/005_add_openai_translation_tables.py ADDED
@@ -0,0 +1,295 @@
"""Add OpenAI Translation System tables

Revision ID: 005_add_openai_translation_tables
Revises: 004_add_translation_tables
Create Date: 2025-12-12

"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql, sqlite
import uuid

# revision identifiers
revision = '005_add_openai_translation_tables'
down_revision = '004_add_translation_tables'
branch_labels = None
depends_on = None


def upgrade():
    # Create translation_jobs table
    op.create_table('translation_jobs',
        sa.Column('id', sa.UUID(), nullable=False, default=uuid.uuid4),
        sa.Column('job_id', sa.String(length=255), nullable=False),
        sa.Column('user_id', sa.String(length=255), nullable=True),
        sa.Column('session_id', sa.String(length=255), nullable=True),
        sa.Column('page_url', sa.String(length=2048), nullable=True),
        sa.Column('content_hash', sa.String(length=64), nullable=False),
        sa.Column('source_language', sa.String(length=10), nullable=False, default='en'),
        sa.Column('target_language', sa.String(length=10), nullable=False, default='ur'),
        sa.Column('model_name', sa.String(length=100), nullable=False),
        sa.Column('temperature', sa.Float(), nullable=True),
        sa.Column('max_tokens', sa.Integer(), nullable=True),
        sa.Column('original_text', sa.Text(), nullable=False),
        sa.Column('translated_text', sa.Text(), nullable=True),
        sa.Column('status', sa.String(length=20), nullable=False, default='PENDING'),
        sa.Column('chunks_total', sa.Integer(), nullable=False, default=0),
        sa.Column('chunks_completed', sa.Integer(), nullable=False, default=0),
        sa.Column('chunks_failed', sa.Integer(), nullable=False, default=0),
        sa.Column('progress_percentage', sa.Float(), nullable=False, default=0.0),
        sa.Column('input_tokens', sa.Integer(), nullable=False, default=0),
        sa.Column('output_tokens', sa.Integer(), nullable=False, default=0),
        sa.Column('total_tokens', sa.Integer(), nullable=False, default=0),
        sa.Column('estimated_cost_usd', sa.Float(), nullable=False, default=0.0),
        sa.Column('processing_time_ms', sa.Integer(), nullable=False, default=0),
        sa.Column('preserve_code_blocks', sa.Boolean(), nullable=False, default=True),
        sa.Column('enable_transliteration', sa.Boolean(), nullable=False, default=True),
        sa.Column('chunk_size', sa.Integer(), nullable=False, default=2000),
        sa.Column('max_chunks', sa.Integer(), nullable=False, default=100),
        sa.Column('max_retries', sa.Integer(), nullable=False, default=3),
        sa.Column('user_agent', sa.Text(), nullable=True),
        sa.Column('ip_address', sa.String(length=45), nullable=True),
        sa.Column('created_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
        sa.Column('updated_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
        sa.Column('started_at', sa.DateTime(), nullable=True),
        sa.Column('completed_at', sa.DateTime(), nullable=True),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('job_id'),
        sa.CheckConstraint('chunks_total >= 0', name='check_chunks_total_non_negative'),
        sa.CheckConstraint('chunks_completed >= 0', name='check_chunks_completed_non_negative'),
        sa.CheckConstraint('chunks_failed >= 0', name='check_chunks_failed_non_negative'),
        sa.CheckConstraint('progress_percentage >= 0.0 AND progress_percentage <= 100.0', name='check_progress_percentage_range'),
        sa.CheckConstraint('chunk_size > 0', name='check_chunk_size_positive'),
        sa.CheckConstraint('max_chunks > 0', name='check_max_chunks_positive'),
        sa.CheckConstraint('max_retries >= 0', name='check_max_retries_non_negative')
    )
    op.create_index('ix_translation_jobs_job_id', 'translation_jobs', ['job_id'], unique=False)
    op.create_index('ix_translation_jobs_user_id', 'translation_jobs', ['user_id'], unique=False)
    op.create_index('ix_translation_jobs_session_id', 'translation_jobs', ['session_id'], unique=False)
    op.create_index('ix_translation_jobs_page_url', 'translation_jobs', ['page_url'], unique=False)
    op.create_index('ix_translation_jobs_content_hash', 'translation_jobs', ['content_hash'], unique=False)
    op.create_index('ix_translation_jobs_status', 'translation_jobs', ['status'], unique=False)
    op.create_index('ix_translation_jobs_created_at', 'translation_jobs', ['created_at'], unique=False)

    # Create translation_chunks table
    op.create_table('translation_chunks',
        sa.Column('id', sa.UUID(), nullable=False, default=uuid.uuid4),
        sa.Column('job_id', sa.UUID(), nullable=False),
        sa.Column('chunk_index', sa.Integer(), nullable=False),
        sa.Column('original_text', sa.Text(), nullable=False),
        sa.Column('translated_text', sa.Text(), nullable=True),
        sa.Column('status', sa.String(length=20), nullable=False, default='PENDING'),
        sa.Column('retry_count', sa.Integer(), nullable=False, default=0),
        sa.Column('start_position', sa.Integer(), nullable=False),
        sa.Column('end_position', sa.Integer(), nullable=False),
        sa.Column('is_code_block', sa.Boolean(), nullable=False, default=False),
        sa.Column('code_language', sa.String(length=50), nullable=True),
        sa.Column('word_count', sa.Integer(), nullable=False, default=0),
        sa.Column('token_count', sa.Integer(), nullable=False, default=0),
        sa.Column('input_tokens', sa.Integer(), nullable=False, default=0),
        sa.Column('output_tokens', sa.Integer(), nullable=False, default=0),
        sa.Column('processing_time_ms', sa.Integer(), nullable=False, default=0),
        sa.Column('last_error', sa.Text(), nullable=True),
        sa.Column('created_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
        sa.Column('updated_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
        sa.Column('started_at', sa.DateTime(), nullable=True),
        sa.Column('completed_at', sa.DateTime(), nullable=True),
        sa.ForeignKeyConstraint(['job_id'], ['translation_jobs.id'], ondelete='CASCADE'),
        sa.PrimaryKeyConstraint('id'),
        sa.CheckConstraint('chunk_index >= 0', name='check_chunk_index_non_negative'),
        sa.CheckConstraint('start_position >= 0', name='check_start_position_non_negative'),
        sa.CheckConstraint('end_position >= start_position', name='check_end_position_after_start'),
        sa.CheckConstraint('word_count >= 0', name='check_word_count_non_negative'),
        sa.CheckConstraint('token_count >= 0', name='check_token_count_non_negative'),
        sa.CheckConstraint('retry_count >= 0', name='check_retry_count_non_negative'),
        sa.UniqueConstraint('job_id', 'chunk_index', name='uq_job_chunk_index')
    )
    op.create_index('ix_translation_chunks_job_id', 'translation_chunks', ['job_id'], unique=False)
    op.create_index('ix_translation_chunks_status', 'translation_chunks', ['status'], unique=False)
    op.create_index('ix_translation_chunks_is_code_block', 'translation_chunks', ['is_code_block'], unique=False)

    # Create translation_cache table
    op.create_table('translation_cache',
        sa.Column('id', sa.UUID(), nullable=False, default=uuid.uuid4),
        sa.Column('cache_key', sa.String(length=255), nullable=False),
        sa.Column('job_id', sa.UUID(), nullable=True),
        sa.Column('content_hash', sa.String(length=64), nullable=False),
        sa.Column('page_url', sa.String(length=2048), nullable=True),
        sa.Column('url_hash', sa.String(length=32), nullable=True),
        sa.Column('source_language', sa.String(length=10), nullable=False),
        sa.Column('target_language', sa.String(length=10), nullable=False),
        sa.Column('original_text', sa.Text(), nullable=False),
        sa.Column('translated_text', sa.Text(), nullable=False),
        sa.Column('model_version', sa.String(length=100), nullable=True),
        sa.Column('processing_time_ms', sa.Integer(), nullable=False, default=0),
        sa.Column('translation_metadata', sa.JSON(), nullable=True),
        sa.Column('quality_score', sa.Float(), nullable=True),
        sa.Column('confidence_score', sa.Float(), nullable=True),
        sa.Column('is_validated', sa.Boolean(), nullable=False, default=False),
        sa.Column('hit_count', sa.Integer(), nullable=False, default=0),
        sa.Column('last_hit_at', sa.DateTime(), nullable=True),
        sa.Column('ttl_hours', sa.Integer(), nullable=False, default=24),
        sa.Column('priority', sa.String(length=10), nullable=False, default='MEDIUM'),
        sa.Column('expires_at', sa.DateTime(), nullable=False),
        sa.Column('created_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
        sa.Column('updated_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
        sa.ForeignKeyConstraint(['job_id'], ['translation_jobs.id']),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('cache_key'),
        sa.CheckConstraint('quality_score >= 0.0 AND quality_score <= 5.0', name='check_quality_score_range'),
        sa.CheckConstraint('confidence_score >= 0.0 AND confidence_score <= 1.0', name='check_confidence_score_range'),
        sa.CheckConstraint('hit_count >= 0', name='check_hit_count_non_negative'),
        sa.CheckConstraint('ttl_hours > 0', name='check_ttl_hours_positive')
    )
    op.create_index('ix_translation_cache_cache_key', 'translation_cache', ['cache_key'], unique=False)
    op.create_index('ix_translation_cache_content_hash', 'translation_cache', ['content_hash'], unique=False)
    op.create_index('ix_translation_cache_page_url', 'translation_cache', ['page_url'], unique=False)
    op.create_index('ix_translation_cache_url_hash', 'translation_cache', ['url_hash'], unique=False)
    op.create_index('ix_translation_cache_expires_at', 'translation_cache', ['expires_at'], unique=False)
    op.create_index('ix_translation_cache_priority', 'translation_cache', ['priority'], unique=False)

    # Create translation_errors table
    op.create_table('translation_errors',
        sa.Column('id', sa.UUID(), nullable=False, default=uuid.uuid4),
        sa.Column('error_id', sa.String(length=255), nullable=False),
        sa.Column('job_id', sa.UUID(), nullable=True),
        sa.Column('chunk_id', sa.UUID(), nullable=True),
        sa.Column('error_type', sa.String(length=50), nullable=False),
        sa.Column('error_code', sa.String(length=100), nullable=True),
        sa.Column('error_message', sa.Text(), nullable=False),
        sa.Column('error_details', sa.JSON(), nullable=True),
        sa.Column('severity', sa.String(length=10), nullable=False),
        sa.Column('category', sa.String(length=50), nullable=False, default='translation'),
        sa.Column('is_retriable', sa.Boolean(), nullable=False, default=True),
        sa.Column('retry_count', sa.Integer(), nullable=False, default=0),
        sa.Column('max_retries', sa.Integer(), nullable=False, default=3),
        sa.Column('next_retry_at', sa.DateTime(), nullable=True),
        sa.Column('is_resolved', sa.Boolean(), nullable=False, default=False),
        sa.Column('resolution_notes', sa.Text(), nullable=True),
        sa.Column('resolved_at', sa.DateTime(), nullable=True),
        sa.Column('created_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
        sa.Column('updated_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
        sa.ForeignKeyConstraint(['job_id'], ['translation_jobs.id']),
        sa.ForeignKeyConstraint(['chunk_id'], ['translation_chunks.id']),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('error_id'),
        sa.CheckConstraint('retry_count >= 0', name='check_error_retry_count_non_negative'),
        sa.CheckConstraint('max_retries >= 0', name='check_error_max_retries_non_negative')
    )
    op.create_index('ix_translation_errors_error_id', 'translation_errors', ['error_id'], unique=False)
    op.create_index('ix_translation_errors_job_id', 'translation_errors', ['job_id'], unique=False)
    op.create_index('ix_translation_errors_chunk_id', 'translation_errors', ['chunk_id'], unique=False)
    op.create_index('ix_translation_errors_error_type', 'translation_errors', ['error_type'], unique=False)
    op.create_index('ix_translation_errors_severity', 'translation_errors', ['severity'], unique=False)
    op.create_index('ix_translation_errors_created_at', 'translation_errors', ['created_at'], unique=False)

    # Create translation_sessions table
    op.create_table('translation_sessions',
        sa.Column('id', sa.UUID(), nullable=False, default=uuid.uuid4),
        sa.Column('session_id', sa.String(length=255), nullable=False),
        sa.Column('user_id', sa.String(length=255), nullable=True),
        sa.Column('source_language', sa.String(length=10), nullable=False, default='en'),
        sa.Column('target_language', sa.String(length=10), nullable=False, default='ur'),
        sa.Column('preferred_model', sa.String(length=100), nullable=True),
        sa.Column('request_count', sa.Integer(), nullable=False, default=0),
        sa.Column('character_count', sa.Integer(), nullable=False, default=0),
        sa.Column('total_cost_usd', sa.Float(), nullable=False, default=0.0),
        sa.Column('session_data', sa.JSON(), nullable=True),
        sa.Column('user_agent', sa.Text(), nullable=True),
        sa.Column('ip_address', sa.String(length=45), nullable=True),
        sa.Column('created_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
        sa.Column('updated_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
        sa.Column('last_activity_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
        sa.Column('expires_at', sa.DateTime(), nullable=False),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('session_id'),
        sa.CheckConstraint('request_count >= 0', name='check_session_request_count_non_negative'),
        sa.CheckConstraint('character_count >= 0', name='check_session_character_count_non_negative'),
        sa.CheckConstraint('total_cost_usd >= 0.0', name='check_session_total_cost_non_negative')
    )
    op.create_index('ix_translation_sessions_session_id', 'translation_sessions', ['session_id'], unique=False)
    op.create_index('ix_translation_sessions_user_id', 'translation_sessions', ['user_id'], unique=False)
    op.create_index('ix_translation_sessions_expires_at', 'translation_sessions', ['expires_at'], unique=False)

    # Create translation_metrics table
    op.create_table('translation_metrics',
        sa.Column('id', sa.UUID(), nullable=False, default=uuid.uuid4),
        sa.Column('metric_date', sa.DateTime(), nullable=False),
        sa.Column('period_type', sa.String(length=20), nullable=False, default='daily'),
        sa.Column('total_requests', sa.Integer(), nullable=False, default=0),
        sa.Column('successful_requests', sa.Integer(), nullable=False, default=0),
        sa.Column('failed_requests', sa.Integer(), nullable=False, default=0),
        sa.Column('cached_requests', sa.Integer(), nullable=False, default=0),
        sa.Column('avg_processing_time_ms', sa.Float(), nullable=False, default=0.0),
        sa.Column('p95_processing_time_ms', sa.Float(), nullable=False, default=0.0),
        sa.Column('p99_processing_time_ms', sa.Float(), nullable=False, default=0.0),
        sa.Column('total_characters', sa.Integer(), nullable=False, default=0),
        sa.Column('total_tokens', sa.Integer(), nullable=False, default=0),
        sa.Column('total_cost_usd', sa.Float(), nullable=False, default=0.0),
        sa.Column('avg_quality_score', sa.Float(), nullable=True),
        sa.Column('cache_hit_rate', sa.Float(), nullable=False, default=0.0),
        sa.Column('error_rate', sa.Float(), nullable=False, default=0.0),
        sa.Column('top_error_types', sa.JSON(), nullable=True),
        sa.Column('language_pairs', sa.JSON(), nullable=True),
        sa.Column('created_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
        sa.Column('updated_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
        sa.PrimaryKeyConstraint('id'),
        sa.CheckConstraint('total_requests >= 0', name='check_metrics_total_requests_non_negative'),
        sa.CheckConstraint('successful_requests >= 0', name='check_metrics_successful_requests_non_negative'),
        sa.CheckConstraint('failed_requests >= 0', name='check_metrics_failed_requests_non_negative'),
        sa.CheckConstraint('cached_requests >= 0', name='check_metrics_cached_requests_non_negative'),
        sa.CheckConstraint('total_characters >= 0', name='check_metrics_total_characters_non_negative'),
        sa.CheckConstraint('total_tokens >= 0', name='check_metrics_total_tokens_non_negative'),
        sa.CheckConstraint('total_cost_usd >= 0.0', name='check_metrics_total_cost_non_negative'),
        sa.CheckConstraint('avg_processing_time_ms >= 0.0', name='check_metrics_avg_processing_time_non_negative'),
        sa.CheckConstraint('p95_processing_time_ms >= 0.0', name='check_metrics_p95_processing_time_non_negative'),
        sa.CheckConstraint('p99_processing_time_ms >= 0.0', name='check_metrics_p99_processing_time_non_negative'),
        sa.CheckConstraint('cache_hit_rate >= 0.0 AND cache_hit_rate <= 1.0', name='check_metrics_cache_hit_rate_range'),
        sa.CheckConstraint('error_rate >= 0.0 AND error_rate <= 1.0', name='check_metrics_error_rate_range'),
        sa.UniqueConstraint('metric_date', 'period_type', name='uq_metrics_date_period')
    )
    op.create_index('ix_translation_metrics_metric_date', 'translation_metrics', ['metric_date'], unique=False)
    op.create_index('ix_translation_metrics_period_type', 'translation_metrics', ['period_type'], unique=False)


def downgrade():
    # Drop tables in reverse order
    op.drop_index('ix_translation_metrics_period_type', table_name='translation_metrics')
    op.drop_index('ix_translation_metrics_metric_date', table_name='translation_metrics')
    op.drop_table('translation_metrics')

    op.drop_index('ix_translation_sessions_expires_at', table_name='translation_sessions')
    op.drop_index('ix_translation_sessions_user_id', table_name='translation_sessions')
    op.drop_index('ix_translation_sessions_session_id', table_name='translation_sessions')
    op.drop_table('translation_sessions')

    op.drop_index('ix_translation_errors_created_at', table_name='translation_errors')
    op.drop_index('ix_translation_errors_severity', table_name='translation_errors')
    op.drop_index('ix_translation_errors_error_type', table_name='translation_errors')
    op.drop_index('ix_translation_errors_chunk_id', table_name='translation_errors')
    op.drop_index('ix_translation_errors_job_id', table_name='translation_errors')
    op.drop_index('ix_translation_errors_error_id', table_name='translation_errors')
    op.drop_table('translation_errors')

    op.drop_index('ix_translation_cache_priority', table_name='translation_cache')
    op.drop_index('ix_translation_cache_expires_at', table_name='translation_cache')
    op.drop_index('ix_translation_cache_url_hash', table_name='translation_cache')
    op.drop_index('ix_translation_cache_page_url', table_name='translation_cache')
    op.drop_index('ix_translation_cache_content_hash', table_name='translation_cache')
    op.drop_index('ix_translation_cache_cache_key', table_name='translation_cache')
    op.drop_table('translation_cache')

    op.drop_index('ix_translation_chunks_is_code_block', table_name='translation_chunks')
    op.drop_index('ix_translation_chunks_status', table_name='translation_chunks')
    op.drop_index('ix_translation_chunks_job_id', table_name='translation_chunks')
    op.drop_table('translation_chunks')

    op.drop_index('ix_translation_jobs_created_at', table_name='translation_jobs')
    op.drop_index('ix_translation_jobs_status', table_name='translation_jobs')
    op.drop_index('ix_translation_jobs_content_hash', table_name='translation_jobs')
    op.drop_index('ix_translation_jobs_page_url', table_name='translation_jobs')
    op.drop_index('ix_translation_jobs_session_id', table_name='translation_jobs')
    op.drop_index('ix_translation_jobs_user_id', table_name='translation_jobs')
    op.drop_index('ix_translation_jobs_job_id', table_name='translation_jobs')
    op.drop_table('translation_jobs')
create_translation_tables.py ADDED
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+"""
+Create translation tables in the database.
+"""
+
+import sys
+import os
+from pathlib import Path
+
+# Add backend to path
+backend_path = Path(__file__).parent
+sys.path.insert(0, str(backend_path))
+
+from src.database.base import engine, Base
+from src.models import *  # Import all models
+
+def create_tables():
+    """Create all tables in the database."""
+    try:
+        # Import models to register them
+        from src.models.auth import User
+        from src.models.translation_openai import (
+            TranslationJob, TranslationChunk, TranslationError,
+            TranslationSession, TranslationCache, TranslationMetrics
+        )
+
+        # Create all tables
+        Base.metadata.create_all(bind=engine)
+        print("Translation tables created successfully!")
+
+        # List created tables
+        from sqlalchemy import inspect
+        inspector = inspect(engine)
+        tables = inspector.get_table_names()
+
+        print("\nAvailable tables:")
+        for table in sorted(tables):
+            if 'translation' in table.lower():
+                print(f"  - {table}")
+
+    except Exception as e:
+        print(f"Error creating tables: {e}")
+        import traceback
+        traceback.print_exc()
+
+if __name__ == "__main__":
+    create_tables()
fix_async_client.py ADDED
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+"""
+Fix the async client initialization in get_translation_service().
+"""
+
+import sys
+import os
+from pathlib import Path
+
+# Add backend to path
+backend_path = Path(__file__).parent
+sys.path.insert(0, str(backend_path))
+
+# Read the service.py file
+file_path = backend_path / "src" / "services" / "openai_translation" / "service.py"
+content = file_path.read_text(encoding='utf-8')
+
+# Find and replace the get_translation_service function
+old_function = """async def get_translation_service() -> OpenAITranslationService:
+    \"\"\"Get or create OpenAI translation service instance.\"\"\"
+    global _translation_service
+
+    if _translation_service is None:
+        _translation_service = OpenAITranslationService()
+
+    return _translation_service"""
+
+new_function = """async def get_translation_service() -> OpenAITranslationService:
+    \"\"\"Get or create OpenAI translation service instance.\"\"\"
+    global _translation_service
+
+    if _translation_service is None:
+        _translation_service = OpenAITranslationService()
+        # Initialize the async client
+        _translation_service.gemini_client = await get_gemini_client()
+
+    return _translation_service"""
+
+content = content.replace(old_function, new_function)
+
+# Write back to file
+file_path.write_text(content, encoding='utf-8')
+
+print("Fixed async client initialization in get_translation_service()")
fix_jsonb.py ADDED
@@ -0,0 +1,28 @@
+#!/usr/bin/env python
+"""
+Replace JSONB with JSON for SQLite compatibility.
+"""
+
+import sys
+import os
+from pathlib import Path
+
+# Add backend to path
+backend_path = Path(__file__).parent
+sys.path.insert(0, str(backend_path))
+
+# Read the translation_openai.py file
+model_file = backend_path / "src" / "models" / "translation_openai.py"
+content = model_file.read_text(encoding='utf-8')
+
+# Fix the import line first: the blanket JSONB -> JSON replacement below
+# would rewrite this line before the pattern could match, leaving a stray
+# postgresql JSON import behind.
+content = content.replace('from sqlalchemy.dialects.postgresql import UUID, JSONB',
+                          'from sqlalchemy.dialects.postgresql import UUID')
+
+# Replace all remaining JSONB column types with JSON
+content = content.replace('JSONB', 'JSON')
+
+# Write back to file
+model_file.write_text(content, encoding='utf-8')
+
+print("Fixed JSONB to JSON conversion")
fix_translation_endpoint.py ADDED
@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+"""
+Fix translation endpoint to handle User objects properly.
+"""
+
+import sys
+import os
+from pathlib import Path
+
+# Add backend to path
+backend_path = Path(__file__).parent
+sys.path.insert(0, str(backend_path))
+
+# Read the translation.py file
+file_path = backend_path / "src" / "api" / "v1" / "translation.py"
+content = file_path.read_text(encoding='utf-8')
+
+# Add User import
+if "from src.models.auth import User" not in content:
+    # Add User import after other imports
+    content = content.replace(
+        "from src.security.dependencies import get_current_user_or_anonymous",
+        "from src.security.dependencies import get_current_user_or_anonymous\nfrom src.models.auth import User"
+    )
+
+# Fix type hints
+content = content.replace(
+    "current_user: Optional[Dict] = Depends(get_current_user_or_anonymous),",
+    "current_user: Optional[User] = Depends(get_current_user_or_anonymous),"
+)
+
+# Fix current_user.get() calls
+content = content.replace(
+    'current_user.get("id") if current_user else None',
+    'current_user.id if current_user else None'
+)
+content = content.replace(
+    'current_user.get("is_admin", False)',
+    'getattr(current_user, "is_admin", False)'
+)
+
+# Write back to file
+file_path.write_text(content, encoding='utf-8')
+
+print("Fixed translation endpoint to handle User objects")
fix_user_id_issue.py ADDED
@@ -0,0 +1,34 @@
+#!/usr/bin/env python
+"""
+Fix the user_id issue in translation service.
+The User.id is a string but the foreign key expects a UUID.
+"""
+
+import sys
+import os
+from pathlib import Path
+
+# Add backend to path
+backend_path = Path(__file__).parent
+sys.path.insert(0, str(backend_path))
+
+# Read the translation_openai.py file
+file_path = backend_path / "src" / "models" / "translation_openai.py"
+content = file_path.read_text(encoding='utf-8')
+
+# Change user_id from UUID to String to match the User model.
+# str.replace() rewrites every occurrence, so this single call also covers
+# the TranslationSession and TranslationMetrics user_id columns.
+content = content.replace(
+    'user_id = Column(UUID(as_uuid=True), ForeignKey("users.id"), nullable=True, index=True)',
+    'user_id = Column(String(36), ForeignKey("users.id"), nullable=True, index=True)'
+)
+
+# Write back to file
+file_path.write_text(content, encoding='utf-8')
+
+print("Fixed user_id to use String instead of UUID to match User.id field")
fix_user_model.py ADDED
@@ -0,0 +1,53 @@
+#!/usr/bin/env python
+"""
+Fix the User model to add translation relationships.
+"""
+
+import sys
+import os
+from pathlib import Path
+
+# Add backend to path
+backend_path = Path(__file__).parent
+sys.path.insert(0, str(backend_path))
+
+# Read the auth.py file
+auth_file = backend_path / "src" / "models" / "auth.py"
+content = auth_file.read_text(encoding='utf-8')
+
+# Find the User model's relationships section
+rel_start = content.find("    # Relationships")
+if rel_start == -1:
+    print("Could not find relationships section in User model")
+    sys.exit(1)
+
+# Find where the relationships end
+relationships_end = content.find("\n\n", rel_start)
+if relationships_end == -1:
+    relationships_end = content.find("\nclass", rel_start)
+
+if relationships_end == -1:
+    print("Could not find end of relationships section")
+    sys.exit(1)
+
+# Extract the relationships section
+relationships_section = content[rel_start:relationships_end]
+
+# Check if translation relationships already exist
+if "translation_jobs" in relationships_section:
+    print("Translation relationships already exist in User model")
+else:
+    # Add the translation relationships (rstrip() guarantees no trailing
+    # newline, so one is always appended before the new lines)
+    new_relationships = relationships_section.rstrip() + '\n'
+    new_relationships += """    translation_jobs = relationship("TranslationJob", back_populates="user", cascade="all, delete-orphan")
+    translation_sessions = relationship("TranslationSession", back_populates="user", cascade="all, delete-orphan")
+    translation_metrics = relationship("TranslationMetrics", back_populates="user", cascade="all, delete-orphan")"""
+
+    # Replace the old relationships section with the new one
+    new_content = content[:rel_start] + new_relationships + content[relationships_end:]
+
+    # Write back to file
+    auth_file.write_text(new_content, encoding='utf-8')
+    print("✅ Added translation relationships to User model")
main.py CHANGED
@@ -25,6 +25,7 @@ from rag.chat import ChatHandler
 from rag.qdrant_client import QdrantManager
 from rag.tasks import TaskManager
 from api.exceptions import ContentNotFoundError, RAGException
+from src.services.translation_cache import cache_service
 
 # Import security middleware
 from middleware.csrf import CSRFMiddleware
@@ -60,6 +61,7 @@ logger = structlog.get_logger()
 
 # Load environment variables
 load_dotenv()
+print(f"*** Environment loaded. GEMINI_API_KEY exists: {bool(os.getenv('GEMINI_API_KEY'))} ***")
 
 
 class Settings(BaseSettings):
@@ -91,12 +93,15 @@ class Settings(BaseSettings):
     # CORS Configuration
     allowed_origins: str = os.getenv(
         "ALLOWED_ORIGINS",
-        "http://localhost:3000,http://localhost:8080,https://mrowaisabdullah.github.io,https://huggingface.co"
+        "http://localhost:3000,http://localhost:3001,http://localhost:8080,https://mrowaisabdullah.github.io,https://huggingface.co"
     )
 
     # JWT Configuration
    jwt_secret_key: str = os.getenv("JWT_SECRET_KEY", "your-super-secret-jwt-key")
 
+    # Google AI Configuration
+    google_ai_api_key: str = os.getenv("GEMINI_API_KEY", "")
+
     # Conversation Context
     max_context_messages: int = int(os.getenv("MAX_CONTEXT_MESSAGES", "3"))
     context_window_size: int = int(os.getenv("CONTEXT_WINDOW_SIZE", "4000"))
@@ -182,6 +187,9 @@ async def lifespan(app: FastAPI):
     )
     await task_manager.start()
 
+    # Start background task for cache cleanup (runs daily)
+    asyncio.create_task(schedule_cache_cleanup())
+
     logger.info("RAG backend initialized successfully")
 
     yield
@@ -237,13 +245,13 @@ app.add_middleware(
     httponly=False,
     samesite="lax",
     max_age=3600,
-    exempt_paths=["/health", "/docs", "/openapi.json", "/ingest/status", "/collections", "/auth/login", "/auth/register", "/api/chat", "/auth/logout", "/auth/me", "/auth/preferences", "/auth/refresh"],
+    exempt_paths=["/health", "/docs", "/openapi.json", "/ingest/status", "/collections", "/auth/login", "/auth/register", "/api/chat", "/auth/logout", "/auth/me", "/auth/preferences", "/auth/refresh", "/api/v1/translation"],
 )
 
 app.add_middleware(
     AuthMiddleware,
     anonymous_limit=3,
-    exempt_paths=["/health", "/docs", "/openapi.json", "/ingest/status", "/collections", "/auth"],
+    exempt_paths=["/health", "/docs", "/openapi.json", "/ingest/status", "/collections", "/auth", "/api/v1/translation"],
     anonymous_header="X-Anonymous-Session-ID",
 )
 
@@ -253,6 +261,14 @@ app.include_router(auth.router)
 # Include new chat routes
 app.include_router(chat.router)
 
+# Include reader features routes
+from src.api.v1 import reader_features
+app.include_router(reader_features.router, prefix="/api/v1")
+
+# Include translation routes
+from src.api.v1 import translation
+app.include_router(translation.router, prefix="/api/v1")
+
 
 # Optional API key security for higher rate limits
 security = HTTPBearer(auto_error=False)
@@ -887,6 +903,45 @@ async def create_chatkit_session(request: Request):
     # raise HTTPException(status_code=500, detail=f"ChatKit processing error: {str(e)}")
 
 
+async def schedule_cache_cleanup():
+    """
+    Schedule periodic cache cleanup task.
+    Runs every 24 hours to clear expired translation cache entries.
+    """
+    # Use structlog here: the keyword arguments on the log calls below are
+    # structured fields, which the stdlib logging.Logger API would reject.
+    cache_logger = structlog.get_logger(__name__)
+
+    while True:
+        try:
+            # Wait for 24 hours
+            await asyncio.sleep(86400)  # 24 hours in seconds
+
+            # Clean up expired cache entries
+            cleared_count = await cache_service.clear_expired_cache()
+
+            if cleared_count > 0:
+                cache_logger.info(
+                    "Cache cleanup completed",
+                    cleared_entries=cleared_count,
+                    timestamp=datetime.utcnow().isoformat()
+                )
+            else:
+                cache_logger.debug(
+                    "Cache cleanup completed - no expired entries found",
+                    timestamp=datetime.utcnow().isoformat()
+                )
+
+        except Exception as e:
+            cache_logger.error(
+                "Cache cleanup failed",
+                error=str(e),
+                timestamp=datetime.utcnow().isoformat()
+            )
+            # Wait 1 hour before retrying on error
+            await asyncio.sleep(3600)
+
+
 if __name__ == "__main__":
     import uvicorn
 
migrate_user_id.py ADDED
@@ -0,0 +1,63 @@
+#!/usr/bin/env python
+"""
+Migration script to change user_id from UUID to String in translation tables.
+"""
+
+import sys
+import os
+from pathlib import Path
+
+# Add backend to path
+backend_path = Path(__file__).parent
+sys.path.insert(0, str(backend_path))
+
+from sqlalchemy import text
+from src.database.base import engine
+
+def migrate_user_id_columns():
+    """Migrate user_id columns from UUID to String in translation tables."""
+
+    # Tables to modify
+    tables = [
+        'translation_jobs',
+        'translation_sessions',
+        'translation_metrics'
+    ]
+
+    with engine.connect() as connection:
+        # Begin transaction
+        trans = connection.begin()
+
+        try:
+            for table in tables:
+                print(f"Migrating {table}...")
+
+                # SQLite doesn't support ALTER COLUMN directly, so we need to:
+                # 1. Create new table with correct schema
+                # 2. Copy data
+                # 3. Drop old table
+                # 4. Rename new table
+
+                # For simplicity, let's just create new tables and drop the old ones
+                # since this is still development
+                connection.execute(text(f"DROP TABLE IF EXISTS {table}"))
+                print(f"  - Dropped {table}")
+
+            # Commit transaction
+            trans.commit()
+            print("\nMigration successful!")
+
+            # Recreate tables
+            from src.models import *  # Import all models
+            from src.database.base import Base
+            Base.metadata.create_all(bind=engine)
+            print("\nTables recreated with new schema!")
+
+        except Exception as e:
+            # Rollback on error
+            trans.rollback()
+            print(f"\nMigration failed: {e}")
+            raise
+
+if __name__ == "__main__":
+    migrate_user_id_columns()
migrate_user_id_fixed.py ADDED
@@ -0,0 +1,53 @@
+#!/usr/bin/env python
+"""
+Migration script to change user_id from UUID to String in translation tables.
+"""
+
+import sys
+import os
+from pathlib import Path
+
+# Add backend to path
+backend_path = Path(__file__).parent
+sys.path.insert(0, str(backend_path))
+
+from sqlalchemy import text
+from src.database.base import engine, Base
+from src.models import *  # Import all models
+
+def migrate_user_id_columns():
+    """Migrate user_id columns from UUID to String in translation tables."""
+
+    # Tables to modify
+    tables = [
+        'translation_jobs',
+        'translation_sessions',
+        'translation_metrics'
+    ]
+
+    with engine.connect() as connection:
+        # Begin transaction
+        trans = connection.begin()
+
+        try:
+            for table in tables:
+                print(f"Dropping {table}...")
+                connection.execute(text(f"DROP TABLE IF EXISTS {table}"))
+
+            # Commit transaction
+            trans.commit()
+            print("\nDropped all translation tables successfully!")
+
+        except Exception as e:
+            # Rollback on error
+            trans.rollback()
+            print(f"\nMigration failed: {e}")
+            raise
+
+    # Recreate tables with new schema
+    print("Recreating tables with new schema...")
+    Base.metadata.create_all(bind=engine)
+    print("\nMigration completed successfully!")
+
+if __name__ == "__main__":
+    migrate_user_id_columns()
migration_summary_translation_tables.md ADDED
@@ -0,0 +1,124 @@
+# Database Migration: Translation Tables (Phase 2, Task T010)
+
+## Overview
+Created Alembic migration `004_add_translation_tables.py` to add support for translation features, user feedback, personalization, and content localization.
+
+## Migration Details
+- **Revision ID**: `004_add_translation_tables`
+- **Revises**: `003_reader_features_tables`
+- **File**: `backend/alembic/versions/004_add_translation_tables.py`
+
+## Tables Created
+
+### 1. `translations` Table
+Stores cached translations with content hashing for deduplication.
+
+**Columns:**
+- `id` (Integer, Primary Key)
+- `content_hash` (String(64), Unique, Indexed) - SHA-256 hash for deduplication
+- `source_language` (String(10)) - Source language code
+- `target_language` (String(10)) - Target language code
+- `original_text` (Text) - Original text to translate
+- `translated_text` (Text) - Translated text
+- `created_at` (DateTime) - Creation timestamp
+- `updated_at` (DateTime) - Last update timestamp
+- `translation_model` (String(50)) - Model used for translation (e.g., "gemini-1.5-pro")
+- `character_count` (Integer) - Character count of the text
+
+**Indexes:**
+- Unique index on `content_hash`
+- Composite index `idx_content_lookup` on (`content_hash`, `source_language`, `target_language`)
+
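+For illustration, the deduplication key could be computed like this (an editor's sketch, not code from the migration; it assumes only the `content_hash` column and the `idx_content_lookup` index described above):
+
+```python
+import hashlib
+
+def translation_lookup_key(text: str, source: str, target: str) -> tuple:
+    """Build the (content_hash, source_language, target_language) lookup tuple."""
+    # SHA-256 hex digest is 64 characters, matching content_hash String(64)
+    content_hash = hashlib.sha256(text.encode("utf-8")).hexdigest()
+    return (content_hash, source, target)
+```
+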
+### 2. `translation_feedback` Table
+Stores user feedback on translations for quality improvement.
+
+**Columns:**
+- `id` (Integer, Primary Key)
+- `translation_id` (Integer, Foreign Key → translations.id)
+- `user_id` (String(36)) - User UUID from auth system
+- `rating` (SmallInteger) - -1 (downvote) or 1 (upvote)
+- `comment` (Text, Optional) - User comment on the translation
+- `created_at` (DateTime) - Feedback timestamp
+
+**Constraints:**
+- Check constraint: `rating IN (-1, 1)`
+- Unique composite index on (`user_id`, `translation_id`) - One feedback per user per translation
+
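+Because of that unique index, recording feedback can be written as an upsert. A minimal sketch (an editor's illustration, not project code; it assumes only the table and index above and SQLite 3.24+ or PostgreSQL):
+
+```python
+import sqlite3
+
+def upsert_feedback(conn, translation_id, user_id, rating, comment=None):
+    """Insert a rating, or replace this user's earlier rating for the same translation."""
+    conn.execute(
+        """
+        INSERT INTO translation_feedback (translation_id, user_id, rating, comment)
+        VALUES (?, ?, ?, ?)
+        ON CONFLICT (user_id, translation_id)
+        DO UPDATE SET rating = excluded.rating, comment = excluded.comment
+        """,
+        (translation_id, user_id, rating, comment),
+    )
+```
+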
+### 3. `personalization_profiles` Table
+Stores user preferences for personalized content delivery.
+
+**Columns:**
+- `id` (Integer, Primary Key)
+- `user_id` (String(36), Unique, Indexed) - User UUID
+- `reading_level` (Enum: 'beginner', 'intermediate', 'advanced')
+- `preferred_language` (String(10)) - User's preferred language
+- `focus_areas` (JSON) - Array of topics the user cares about
+- `learning_style` (Enum: 'visual', 'practical', 'theoretical', 'balanced')
+- `enable_transliteration` (Boolean) - Whether to show transliterations
+- `technical_term_handling` (Enum: 'translate', 'transliterate', 'keep_english')
+- `font_size` (Integer) - Preferred font size
+- `focus_mode_preferences` (JSON) - Preferences for focus mode
+- `created_at` (DateTime)
+- `updated_at` (DateTime)
+- `last_active` (DateTime)
+
+### 4. `content_localization` Table (Conditional Creation)
+Tracks translation status and metadata for content pages.
+This table is only created if it doesn't already exist.
+
+**Columns:**
+- `id` (Integer, Primary Key)
+- `content_url` (String(500), Indexed) - URL of the content page
+- `content_hash` (String(64), Indexed) - Content hash for change detection
+- `is_translated` (Boolean) - Whether the content has been translated
+- `last_translation_date` (DateTime) - When the translation was last updated
+- `translation_cache_key` (String(64)) - Cache key for translations
+- `word_count` (Integer) - Number of words in the content
+- `character_count` (Integer) - Number of characters
+- `has_code_blocks` (Boolean) - Whether the content contains code blocks
+- `detected_languages` (JSON) - Array of detected languages in the content
+- `chunk_count` (Integer) - Number of chunks for processing
+- `processing_status` (Enum: 'pending', 'processing', 'completed', 'failed', 'partial')
+- `created_at` (DateTime)
+- `updated_at` (DateTime)
+
+**Indexes:**
+- Index on `content_hash`
+- Index on `content_url`
+
+## Database Compatibility
+The migration is designed to work with SQLite (the current database) and is compatible with PostgreSQL as well.
+
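+If PostgreSQL-native types are wanted later without breaking SQLite, SQLAlchemy's `with_variant` is one option (an editor's sketch of the general technique; this migration itself sticks to plain JSON and string-backed enums):
+
+```python
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+# Plain JSON on SQLite, upgraded to JSONB when running on PostgreSQL
+json_type = sa.JSON().with_variant(postgresql.JSONB(), "postgresql")
+
+focus_areas = sa.Column("focus_areas", json_type, nullable=True)
+```
+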
+## Foreign Key Relationships
+- `translation_feedback.translation_id` → `translations.id`
+- (Other foreign keys would point to the `users` table from the auth system)
+
+## Migration Usage
+
+### To apply the migration:
+```bash
+cd backend
+alembic upgrade head
+```
+
+### To revert the migration:
+```bash
+cd backend
+alembic downgrade -1
+```
+
+### To check current status:
+```bash
+cd backend
+alembic current
+```
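+
+To preview the generated SQL without touching the database, Alembic's offline mode can be used:
+```bash
+cd backend
+alembic upgrade head --sql
+```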
+
+## Notes
+1. The migration uses SQLite-compatible syntax but will work with PostgreSQL
+2. Enum types are stored as strings with length constraints for compatibility
+3. JSON fields rely on SQLite's built-in JSON1 functions (compiled in by default since SQLite 3.38)
+4. The content_localization existence check prevents errors if the table already exists
+
+## Updated Files
+1. `backend/alembic/versions/004_add_translation_tables.py` - Main migration file
+2. `backend/alembic/env.py` - Updated to import new models for metadata registration
migrations/versions/001_create_openai_translation_tables.py ADDED
@@ -0,0 +1,297 @@
+"""Create OpenAI translation system tables
+
+Revision ID: 001_create_openai_translation_tables
+Revises:
+Create Date: 2024-01-12 12:00:00.000000
+
+"""
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+# revision identifiers, used by Alembic.
+revision = '001_create_openai_translation_tables'
+down_revision = None
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # Create translation_jobs table
+    op.create_table('translation_jobs',
+        sa.Column('id', postgresql.UUID(as_uuid=True), nullable=False),
+        sa.Column('job_id', sa.String(length=64), nullable=False),
+        sa.Column('user_id', postgresql.UUID(as_uuid=True), nullable=True),
+        sa.Column('session_id', sa.String(length=128), nullable=True),
+        sa.Column('content_hash', sa.String(length=64), nullable=False),
+        sa.Column('page_url', sa.Text(), nullable=True),
+        sa.Column('source_language', sa.String(length=10), nullable=False),
+        sa.Column('target_language', sa.String(length=10), nullable=False),
+        sa.Column('original_text', sa.Text(), nullable=False),
+        sa.Column('translated_text', sa.Text(), nullable=True),
+        sa.Column('preserve_code_blocks', sa.Boolean(), nullable=False),
+        sa.Column('enable_transliteration', sa.Boolean(), nullable=False),
+        sa.Column('chunk_size', sa.Integer(), nullable=False),
+        sa.Column('max_chunks', sa.Integer(), nullable=False),
+        sa.Column('model_name', sa.String(length=50), nullable=False),
+        sa.Column('temperature', sa.Numeric(precision=3, scale=2), nullable=False),
+        sa.Column('max_tokens', sa.Integer(), nullable=False),
+        sa.Column('status', sa.String(length=20), nullable=False),
+        sa.Column('progress_percentage', sa.Numeric(precision=5, scale=2), nullable=False),
+        sa.Column('chunks_total', sa.Integer(), nullable=False),
+        sa.Column('chunks_completed', sa.Integer(), nullable=False),
+        sa.Column('chunks_failed', sa.Integer(), nullable=False),
+        sa.Column('retry_count', sa.Integer(), nullable=False),
+        sa.Column('max_retries', sa.Integer(), nullable=False),
+        sa.Column('started_at', sa.DateTime(timezone=True), nullable=True),
+        sa.Column('completed_at', sa.DateTime(timezone=True), nullable=True),
+        sa.Column('processing_time_ms', sa.BigInteger(), nullable=False),
+        sa.Column('input_tokens', sa.BigInteger(), nullable=False),
+        sa.Column('output_tokens', sa.BigInteger(), nullable=False),
+        sa.Column('estimated_cost_usd', sa.Numeric(precision=10, scale=6), nullable=False),
+        sa.Column('actual_cost_usd', sa.Numeric(precision=10, scale=6), nullable=True),
+        sa.Column('quality_score', sa.Numeric(precision=5, scale=2), nullable=True),
+        sa.Column('confidence_score', sa.Numeric(precision=5, scale=2), nullable=True),
+        sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False),
+        sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False),
+        sa.Column('last_activity_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False),
+        sa.Column('user_agent', sa.Text(), nullable=True),
+        sa.Column('ip_address', sa.String(length=45), nullable=True),
+        sa.ForeignKeyConstraint(['user_id'], ['users.id'], ),
+        sa.PrimaryKeyConstraint('id'),
+        sa.UniqueConstraint('job_id')
+    )
+
+    # Create indexes for translation_jobs
+    op.create_index('ix_translation_jobs_job_id', 'translation_jobs', ['job_id'], unique=True)
+    op.create_index('ix_translation_jobs_user_id', 'translation_jobs', ['user_id'])
+    op.create_index('ix_translation_jobs_session_id', 'translation_jobs', ['session_id'])
+    op.create_index('ix_translation_jobs_content_hash', 'translation_jobs', ['content_hash'])
+    op.create_index('ix_translation_jobs_page_url', 'translation_jobs', ['page_url'])
+    op.create_index('ix_translation_jobs_source_language', 'translation_jobs', ['source_language'])
+    op.create_index('ix_translation_jobs_target_language', 'translation_jobs', ['target_language'])
+    op.create_index('ix_translation_jobs_status', 'translation_jobs', ['status'])
+    op.create_index('ix_translation_jobs_status_created', 'translation_jobs', ['status', 'created_at'])
+    op.create_index('ix_translation_jobs_user_status', 'translation_jobs', ['user_id', 'status'])
+    op.create_index('ix_translation_jobs_content_lookup', 'translation_jobs', ['content_hash', 'source_language', 'target_language'])
+    op.create_index('ix_translation_jobs_page_cache', 'translation_jobs', ['page_url', 'content_hash'])
+    op.create_index('ix_translation_jobs_activity', 'translation_jobs', ['last_activity_at'])
+    op.create_index('ix_translation_jobs_progress', 'translation_jobs', ['status', 'progress_percentage'])
+
+    # Create translation_chunks table
+    op.create_table('translation_chunks',
+        sa.Column('id', postgresql.UUID(as_uuid=True), nullable=False),
+        sa.Column('job_id', postgresql.UUID(as_uuid=True), nullable=False),
+        sa.Column('chunk_index', sa.Integer(), nullable=False),
+        sa.Column('original_text', sa.Text(), nullable=False),
+        sa.Column('translated_text', sa.Text(), nullable=True),
+        sa.Column('start_position', sa.Integer(), nullable=False),
+        sa.Column('end_position', sa.Integer(), nullable=False),
+        sa.Column('is_code_block', sa.Boolean(), nullable=False),
+        sa.Column('code_language', sa.String(length=50), nullable=True),
+        sa.Column('word_count', sa.Integer(), nullable=False),
+        sa.Column('status', sa.String(length=20), nullable=False),
+        sa.Column('retry_count', sa.Integer(), nullable=False),
+        sa.Column('started_at', sa.DateTime(timezone=True), nullable=True),
+        sa.Column('completed_at', sa.DateTime(timezone=True), nullable=True),
+        sa.Column('processing_time_ms', sa.BigInteger(), nullable=False),
+        sa.Column('input_tokens', sa.Integer(), nullable=False),
+        sa.Column('output_tokens', sa.Integer(), nullable=False),
+        sa.Column('confidence_score', sa.Numeric(precision=5, scale=2), nullable=True),
+        sa.Column('requires_review', sa.Boolean(), nullable=False),
+        sa.Column('last_error', sa.Text(), nullable=True),
+        sa.Column('error_code', sa.String(length=50), nullable=True),
+        sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False),
+        sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False),
+        sa.ForeignKeyConstraint(['job_id'], ['translation_jobs.id'], ),
+        sa.PrimaryKeyConstraint('id'),
+        sa.UniqueConstraint('job_id', 'chunk_index', name='uq_translation_chunks_job_chunk')
+    )
+
+    # Create indexes for translation_chunks
+    op.create_index('ix_translation_chunks_job_id', 'translation_chunks', ['job_id'])
+    op.create_index('ix_translation_chunks_job_chunk', 'translation_chunks', ['job_id', 'chunk_index'], unique=True)
+    op.create_index('ix_translation_chunks_status', 'translation_chunks', ['status'])
+    op.create_index('ix_translation_chunks_status_created', 'translation_chunks', ['status', 'created_at'])
+    op.create_index('ix_translation_chunks_is_code_block', 'translation_chunks', ['is_code_block'])
+    op.create_index('ix_translation_chunks_code_language', 'translation_chunks', ['code_language'])
+
+    # Create translation_errors table
+    op.create_table('translation_errors',
+        sa.Column('id', postgresql.UUID(as_uuid=True), nullable=False),
+        sa.Column('job_id', postgresql.UUID(as_uuid=True), nullable=False),
+        sa.Column('chunk_id', postgresql.UUID(as_uuid=True), nullable=True),
+        sa.Column('error_id', sa.String(length=64), nullable=False),
+        sa.Column('error_type', sa.String(length=50), nullable=False),
+        sa.Column('error_code', sa.String(length=50), nullable=True),
+        sa.Column('error_message', sa.Text(), nullable=False),
+        sa.Column('error_details', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
+        sa.Column('severity', sa.String(length=20), nullable=False),
+        sa.Column('category', sa.String(length=50), nullable=False),
+        sa.Column('is_retriable', sa.Boolean(), nullable=False),
+        sa.Column('retry_attempt', sa.Integer(), nullable=False),
+        sa.Column('max_retries', sa.Integer(), nullable=False),
+        sa.Column('next_retry_at', sa.DateTime(timezone=True), nullable=True),
+        sa.Column('request_payload', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
+        sa.Column('response_payload', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
+        sa.Column('stack_trace', sa.Text(), nullable=True),
+        sa.Column('debug_info', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
+        sa.Column('resolved_at', sa.DateTime(timezone=True), nullable=True),
+        sa.Column('resolution', sa.String(length=200), nullable=True),
+        sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False),
+        sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False),
+        sa.ForeignKeyConstraint(['chunk_id'], ['translation_chunks.id'], ),
+        sa.ForeignKeyConstraint(['job_id'], ['translation_jobs.id'], ),
+        sa.PrimaryKeyConstraint('id'),
+        sa.UniqueConstraint('error_id')
+    )
+
+    # Create indexes for translation_errors
+    op.create_index('ix_translation_errors_error_id', 'translation_errors', ['error_id'], unique=True)
+    op.create_index('ix_translation_errors_job_id', 'translation_errors', ['job_id'])
+    op.create_index('ix_translation_errors_chunk_id', 'translation_errors', ['chunk_id'])
+    op.create_index('ix_translation_errors_error_type', 'translation_errors', ['error_type'])
+    op.create_index('ix_translation_errors_severity', 'translation_errors', ['severity'])
+    op.create_index('ix_translation_errors_error_type_created', 'translation_errors', ['error_type', 'created_at'])
+    op.create_index('ix_translation_errors_error_severity', 'translation_errors', ['severity', 'created_at'])
+    op.create_index('ix_translation_errors_job_errors', 'translation_errors', ['job_id', 'created_at'])
+    op.create_index('ix_translation_errors_retry_schedule', 'translation_errors', ['next_retry_at', 'is_retriable'])
+
+    # Create translation_sessions table
+    op.create_table('translation_sessions',
+        sa.Column('id', postgresql.UUID(as_uuid=True), nullable=False),
+        sa.Column('session_id', sa.String(length=128), nullable=False),
+        sa.Column('user_id', postgresql.UUID(as_uuid=True), nullable=True),
+        sa.Column('started_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False),
+        sa.Column('last_activity_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False),
+        sa.Column('expires_at', sa.DateTime(timezone=True), nullable=False),
+        sa.Column('is_active', sa.Boolean(), nullable=False),
+        sa.Column('request_count', sa.Integer(), nullable=False),
+        sa.Column('character_count', sa.Integer(), nullable=False),
+        sa.Column('total_cost_usd', sa.Numeric(precision=10, scale=6), nullable=False),
+        sa.Column('requests_per_minute', sa.Integer(), nullable=False),
+        sa.Column('characters_per_hour', sa.Integer(), nullable=False),
+        sa.Column('source_language', sa.String(length=10), nullable=True),
+        sa.Column('target_language', sa.String(length=10), nullable=True),
+        sa.Column('preferred_model', sa.String(length=50), nullable=True),
+        sa.Column('user_agent', sa.Text(), nullable=True),
+        sa.Column('ip_address', sa.String(length=45), nullable=True),
+        sa.Column('country_code', sa.String(length=2), nullable=True),
+        sa.Column('preferences', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
+        sa.ForeignKeyConstraint(['user_id'], ['users.id'], ),
+        sa.PrimaryKeyConstraint('id'),
+        sa.UniqueConstraint('session_id')
+    )
+
+    # Create indexes for translation_sessions
+    op.create_index('ix_translation_sessions_session_id', 'translation_sessions', ['session_id'], unique=True)
+    op.create_index('ix_translation_sessions_user_id', 'translation_sessions', ['user_id'])
+    op.create_index('ix_translation_sessions_is_active', 'translation_sessions', ['is_active'])
+    op.create_index('ix_translation_sessions_expires_at', 'translation_sessions', ['expires_at'])
+    op.create_index('ix_translation_sessions_user_sessions', 'translation_sessions', ['user_id', 'is_active'])
+    op.create_index('ix_translation_sessions_session_expiry', 'translation_sessions', ['expires_at', 'is_active'])
+    op.create_index('ix_translation_sessions_ip_address', 'translation_sessions', ['ip_address'])
+
+    # Create translation_cache table
+    op.create_table('translation_cache',
+        sa.Column('id', postgresql.UUID(as_uuid=True), nullable=False),
+        sa.Column('cache_key', sa.String(length=128), nullable=False),
+        sa.Column('job_id', postgresql.UUID(as_uuid=True), nullable=True),
+        sa.Column('content_hash', sa.String(length=64), nullable=False),
+        sa.Column('page_url', sa.Text(), nullable=True),
+        sa.Column('url_hash', sa.String(length=64), nullable=True),
+        sa.Column('source_language', sa.String(length=10), nullable=False),
+        sa.Column('target_language', sa.String(length=10), nullable=False),
+        sa.Column('original_text', sa.Text(), nullable=False),
+        sa.Column('translated_text', sa.Text(), nullable=False),
+        sa.Column('hit_count', sa.Integer(), nullable=False),
+        sa.Column('last_hit_at', sa.DateTime(timezone=True), nullable=True),
+        sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False),
+        sa.Column('expires_at', sa.DateTime(timezone=True), nullable=False),
+        sa.Column('quality_score', sa.Numeric(precision=5, scale=2), nullable=True),
+        sa.Column('processing_time_ms', sa.BigInteger(), nullable=False),
+        sa.Column('model_version', sa.String(length=50), nullable=False),
+        sa.Column('ttl_hours', sa.Integer(), nullable=False),
+        sa.Column('is_pinned', sa.Boolean(), nullable=False),
+        sa.Column('priority', sa.Integer(), nullable=False),
+        sa.Column('is_validated', sa.Boolean(), nullable=False),
+        sa.Column('validated_by', sa.String(length=50), nullable=True),
+        sa.ForeignKeyConstraint(['job_id'], ['translation_jobs.id'], ),
+        sa.PrimaryKeyConstraint('id'),
+        sa.UniqueConstraint('cache_key')
+    )
+
+    # Create indexes for translation_cache
+    op.create_index('ix_translation_cache_cache_key', 'translation_cache', ['cache_key'], unique=True)
+    op.create_index('ix_translation_cache_job_id', 'translation_cache', ['job_id'])
+    op.create_index('ix_translation_cache_content_hash', 'translation_cache', ['content_hash'])
+    op.create_index('ix_translation_cache_page_url', 'translation_cache', ['page_url'])
+    op.create_index('ix_translation_cache_url_hash', 'translation_cache', ['url_hash'])
+    op.create_index('ix_translation_cache_source_language', 'translation_cache', ['source_language'])
+    op.create_index('ix_translation_cache_target_language', 'translation_cache', ['target_language'])
+    op.create_index('ix_translation_cache_expires_at', 'translation_cache', ['expires_at'])
+    op.create_index('ix_translation_cache_cache_lookup', 'translation_cache', ['content_hash', 'source_language', 'target_language'])
+    op.create_index('ix_translation_cache_page_cache', 'translation_cache', ['url_hash', 'content_hash'])
+    op.create_index('ix_translation_cache_cache_expires', 'translation_cache', ['expires_at', 'priority'])
+    op.create_index('ix_translation_cache_cache_popularity', 'translation_cache', ['hit_count', 'last_hit_at'])
+
+    # Create translation_metrics table
+    op.create_table('translation_metrics',
+        sa.Column('id', postgresql.UUID(as_uuid=True), nullable=False),
+        sa.Column('job_id', postgresql.UUID(as_uuid=True), nullable=False),
+        sa.Column('user_id', postgresql.UUID(as_uuid=True), nullable=True),
+        sa.Column('metric_date', sa.DateTime(timezone=True), nullable=False),
+        sa.Column('period_type', sa.String(length=20), nullable=False),
+        sa.Column('total_requests', sa.Integer(), nullable=False),
+        sa.Column('total_characters', sa.BigInteger(), nullable=False),
+        sa.Column('total_chunks', sa.Integer(), nullable=False),
+        sa.Column('successful_translations', sa.Integer(), nullable=False),
+        sa.Column('failed_translations', sa.Integer(), nullable=False),
+        sa.Column('avg_processing_time_ms', sa.BigInteger(), nullable=False),
+        sa.Column('min_processing_time_ms', sa.BigInteger(), nullable=False),
+        sa.Column('max_processing_time_ms', sa.BigInteger(), nullable=False),
+        sa.Column('p95_processing_time_ms', sa.BigInteger(), nullable=False),
+        sa.Column('total_input_tokens', sa.BigInteger(), nullable=False),
+        sa.Column('total_output_tokens', sa.BigInteger(), nullable=False),
+        sa.Column('total_cost_usd', sa.Numeric(precision=12, scale=6), nullable=False),
+        sa.Column('avg_cost_per_char', sa.Numeric(precision=10, scale=8), nullable=False),
+        sa.Column('avg_quality_score', sa.Numeric(precision=5, scale=2), nullable=True),
+        sa.Column('avg_confidence_score', sa.Numeric(precision=5, scale=2), nullable=True),
+        sa.Column('cache_hits', sa.Integer(), nullable=False),
+        sa.Column('cache_misses', sa.Integer(), nullable=False),
+        sa.Column('cache_hit_rate', sa.Numeric(precision=5, scale=2), nullable=False),
+        sa.Column('error_count', sa.Integer(), nullable=False),
+        sa.Column('error_rate', sa.Numeric(precision=5, scale=2), nullable=False),
+        sa.Column('top_error_types', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
+        sa.Column('source_language', sa.String(length=10), nullable=True),
+        sa.Column('target_language', sa.String(length=10), nullable=True),
+        sa.Column('model_name', sa.String(length=50), nullable=True),
+        sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False),
+        sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False),
+        sa.ForeignKeyConstraint(['job_id'], ['translation_jobs.id'], ),
+        sa.ForeignKeyConstraint(['user_id'], ['users.id'], ),
+        sa.PrimaryKeyConstraint('id')
+    )
+
+    # Create indexes for translation_metrics
+    op.create_index('ix_translation_metrics_job_id', 'translation_metrics', ['job_id'])
+    op.create_index('ix_translation_metrics_user_id', 'translation_metrics', ['user_id'])
+    op.create_index('ix_translation_metrics_metric_date', 'translation_metrics', ['metric_date'])
+    op.create_index('ix_translation_metrics_period_type', 'translation_metrics', ['period_type'])
+    op.create_index('ix_translation_metrics_source_language', 'translation_metrics', ['source_language'])
+    op.create_index('ix_translation_metrics_target_language', 'translation_metrics', ['target_language'])
+    op.create_index('ix_translation_metrics_model_name', 'translation_metrics', ['model_name'])
+    op.create_index('ix_translation_metrics_date_period', 'translation_metrics', ['metric_date', 'period_type'])
+    op.create_index('ix_translation_metrics_user_metrics', 'translation_metrics', ['user_id', 'metric_date'])
+    op.create_index('ix_translation_metrics_job_metrics', 'translation_metrics', ['job_id', 'metric_date'])
+    op.create_index('ix_translation_metrics_lang_metrics', 'translation_metrics', ['source_language', 'target_language', 'metric_date'])
+
+
+def downgrade() -> None:
+    # Drop tables in reverse order of creation
+    op.drop_table('translation_metrics')
+    op.drop_table('translation_cache')
+    op.drop_table('translation_sessions')
+    op.drop_table('translation_errors')
+    op.drop_table('translation_chunks')
+    op.drop_table('translation_jobs')
pyproject.toml CHANGED
@@ -41,7 +41,8 @@ dependencies = [
     "authlib>=1.2.1",
     "itsdangerous>=2.1.0",
     # OpenAI Integration
-    "openai>=1.6.1",
+    "openai>=1.68.0",
+    "openai-agents>=0.2.9",
     "tiktoken>=0.5.2",
     # Vector Database
     "qdrant-client>=1.7.0",
@@ -59,11 +60,15 @@ dependencies = [
     # Logging and Monitoring
     "structlog>=23.2.0",
     "backoff>=2.2.1",
+    "python-json-logger>=2.0.7",
+    "PyYAML>=6.0.1",
     # Monitoring and Performance
     "psutil>=5.9.6",
     "openai-chatkit>=1.4.0",
     "email-validator>=2.3.0",
     "bcrypt==4.2.0",
+    "google-genai>=0.3.0",
+    "redis>=5.0.0",
 ]
 
 [project.optional-dependencies]
requirements.txt CHANGED
@@ -16,6 +16,7 @@ aiosmtplib>=3.0.0
 jinja2>=3.1.0
 python-dotenv>=1.0.0
 structlog>=23.2.0
+python-json-logger>=2.0.7
 backoff>=2.2.1
 psutil>=5.9.6
 # ChatKit Python SDK
@@ -28,3 +29,9 @@ python-jose[cryptography]>=3.3.0
 passlib[bcrypt]>=1.7.4
 authlib>=1.2.1
 itsdangerous>=2.1.0
+
+# Cache dependencies
+redis[hiredis]>=5.0.0
+
+# Google Generative AI for Gemini integration
+google-generativeai>=0.8.0
src/api/v1/progress.py ADDED
@@ -0,0 +1,450 @@
+"""
+Progress tracking API endpoints.
+
+Manages user reading progress through chapters and sections.
+"""
+
+from datetime import datetime, timedelta
+from typing import List, Optional, Dict, Any
+from fastapi import APIRouter, Depends, HTTPException, Query, Body, BackgroundTasks
+from sqlalchemy.orm import Session
+from pydantic import BaseModel, Field, validator
+
+from src.database.base import get_db
+from src.middleware.auth import get_current_active_user, require_user
+from src.models.auth import User
+from src.models.reading_progress import ReadingProgress
+from src.models.user_preferences import UserPreference
+from src.services.progress import ReadingProgressService
+from src.services.personalization import PersonalizationService
+from src.utils.errors import handle_errors, NotFoundError, ValidationError
+from src.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+router = APIRouter(
+    prefix="/progress",
+    tags=["progress"]
+)
+
+# Pydantic models for API
+class SectionProgress(BaseModel):
+    section_id: str = Field(..., description="Section identifier")
+    position: float = Field(..., ge=0, le=100, description="Progress percentage (0-100)")
+    time_spent: int = Field(0, ge=0, description="Time spent in minutes")
+    completed: bool = Field(False, description="Whether section is completed")
+
+    @validator('position')
+    def validate_position(cls, v):
+        if not 0 <= v <= 100:
+            raise ValueError("Position must be between 0 and 100")
+        return v
+
+class ChapterProgressUpdate(BaseModel):
+    chapter_id: str = Field(..., description="Chapter identifier")
+    sections: List[SectionProgress] = Field(..., description="Section progress updates")
+
+class ProgressResponse(BaseModel):
+    chapter_id: str
+    overall_progress: float
+    sections_completed: int
+    total_sections: int
+    time_spent: int
+    sections: List[Dict[str, Any]]
+    last_accessed: Optional[str]
+    estimated_completion: Optional[Dict[str, Any]]
+
+class SessionStart(BaseModel):
+    chapter_id: str = Field(..., description="Chapter identifier")
+    section_id: Optional[str] = Field(None, description="Section identifier")
+
+class SessionEnd(BaseModel):
+    chapter_id: str = Field(..., description="Chapter identifier")
+    section_id: Optional[str] = Field(None, description="Section identifier")
+    position: float = Field(..., ge=0, le=100, description="Final position")
+    time_spent: int = Field(..., ge=0, description="Time spent in minutes")
+
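+# Illustrative request body for POST /progress/session/end (values are made
+# up; the shape simply mirrors the SessionEnd model above):
+#   {"chapter_id": "ch-03", "section_id": "ch-03_setup",
+#    "position": 87.5, "time_spent": 12}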
+
+# Helper function to get services
+def get_progress_service(db: Session = Depends(get_db)) -> ReadingProgressService:
+    return ReadingProgressService(db)
+
+def get_personalization_service(db: Session = Depends(get_db)) -> PersonalizationService:
+    return PersonalizationService(db)
+
+
+@router.get("/chapter/{chapter_id}")
+@handle_errors
+async def get_chapter_progress(
+    chapter_id: str,
+    current_user: User = Depends(get_current_active_user),
+    service: ReadingProgressService = Depends(get_progress_service)
+) -> ProgressResponse:
+    """Get comprehensive progress for a specific chapter."""
+    progress = await service.get_chapter_progress(current_user.id, chapter_id)
+
+    if not progress["total_sections"]:
+        raise NotFoundError("Chapter", chapter_id)
+
+    return ProgressResponse(**progress)
+
+
+@router.get("/summary")
+@handle_errors
+async def get_progress_summary(
+    current_user: User = Depends(get_current_active_user),
+    service: ReadingProgressService = Depends(get_progress_service)
+) -> Dict[str, Any]:
+    """Get overall reading progress summary for the user."""
+    summary = await service.get_user_progress_summary(current_user.id)
+
+    # Add personalization info
+    personalization_service = PersonalizationService(service.db)
+    personalization = await personalization_service.get_user_personalization(current_user.id)
+
+    return {
+        **summary,
+        "personalization": personalization,
+        "last_updated": datetime.utcnow().isoformat()
+    }
+
+
+@router.post("/session/start")
+@handle_errors
+async def start_reading_session(
+    session_data: SessionStart,
+    current_user: User = Depends(get_current_active_user),
+    service: ReadingProgressService = Depends(get_progress_service)
+) -> Dict[str, Any]:
+    """Start a new reading session."""
+    # Log session start
+    logger.info(
+        "Reading session started",
+        user_id=current_user.id,
+        chapter_id=session_data.chapter_id,
+        section_id=session_data.section_id
+    )
+
+    # Get or create progress record
+    progress = await service.update_section_progress(
+        user_id=current_user.id,
+        chapter_id=session_data.chapter_id,
+        section_id=session_data.section_id or f"{session_data.chapter_id}_intro",
+        position=0,
+        time_spent_delta=0
+    )
+
+    return {
+        "session_id": progress.id,
+        "chapter_id": session_data.chapter_id,
+        "section_id": session_data.section_id,
+        "started_at": progress.last_accessed.isoformat(),
+        "message": "Reading session started successfully"
+    }
+
+
+@router.post("/session/end")
+@handle_errors
+async def end_reading_session(
+    session_data: SessionEnd,
+    current_user: User = Depends(get_current_active_user),
+    service: ReadingProgressService = Depends(get_progress_service)
+) -> Dict[str, Any]:
+    """End a reading session with final progress."""
+    # Update progress with session data
+    progress = await service.update_section_progress(
+        user_id=current_user.id,
+        chapter_id=session_data.chapter_id,
+        section_id=session_data.section_id or f"{session_data.chapter_id}_intro",
+        position=session_data.position,
+        time_spent_delta=session_data.time_spent,
+        completed=session_data.position >= 100
+    )
+
+    # Get updated chapter progress
+    chapter_progress = await service.get_chapter_progress(current_user.id, session_data.chapter_id)
+
+    # Generate session summary
+    session_summary = {
+        "chapter_id": session_data.chapter_id,
+        "section_id": session_data.section_id,
+        "final_position": session_data.position,
+        "time_spent": session_data.time_spent,
+        "chapter_progress": chapter_progress["overall_progress"],
+        "sections_completed": chapter_progress["sections_completed"],
+        "completed_at": datetime.utcnow().isoformat()
+    }
+
+    # Log session end
+    logger.info(
+        "Reading session ended",
+        user_id=current_user.id,
+        **session_summary
+    )
+
+    return {
+        "session_id": progress.id,
+        "summary": session_summary,
+        "message": "Reading session completed successfully"
+    }
+
+
+@router.post("/update")
+@handle_errors
+async def update_progress(
+    progress_update: ChapterProgressUpdate,
+    background_tasks: BackgroundTasks,
+    current_user: User = Depends(get_current_active_user),
+    service: ReadingProgressService = Depends(get_progress_service)
+) -> Dict[str, Any]:
+    """Update progress for multiple sections in a chapter."""
+    updated_sections = []
+    errors = []
+
+    for section in progress_update.sections:
+        try:
+            updated = await service.update_section_progress(
+                user_id=current_user.id,
+                chapter_id=progress_update.chapter_id,
+                section_id=section.section_id,
+                position=section.position,
+                time_spent_delta=section.time_spent,
+                completed=section.completed
+            )
+            updated_sections.append({
+                "section_id": section.section_id,
+                "position": updated.position,
+                "completed": updated.completed,
+                "updated_at": updated.updated_at.isoformat()
+            })
+        except Exception as e:
+            logger.error(
+                "Failed to update section progress",
+                user_id=current_user.id,
+                chapter_id=progress_update.chapter_id,
+                section_id=section.section_id,
+                error=str(e)
+            )
+            errors.append({
+                "section_id": section.section_id,
+                "error": str(e)
+            })
+
+    # Schedule background task to calculate recommendations
+    if updated_sections:
+        background_tasks.add_task(
+            calculate_recommendations_delayed,
+            current_user.id
+        )
+
+    return {
+        "chapter_id": progress_update.chapter_id,
+        "updated_sections": updated_sections,
+        "errors": errors,
+        "total_updated": len(updated_sections),
+        "total_errors": len(errors),
+        "message": f"Updated {len(updated_sections)} sections successfully"
+    }
+
+
+@router.post("/section/{section_id}/complete")
+@handle_errors
+async def complete_section(
+    chapter_id: str,
+    section_id: str,
+    time_spent: int = Query(0, ge=0, description="Time spent in minutes"),
+    current_user: User = Depends(get_current_active_user),
+    service: ReadingProgressService = Depends(get_progress_service)
+) -> Dict[str, Any]:
+    """Mark a section as completed."""
+    progress = await service.mark_section_complete(
+        user_id=current_user.id,
+        chapter_id=chapter_id,
+        section_id=section_id,
+        time_spent_delta=time_spent
+    )
+
+    # Get updated chapter progress
+    chapter_progress = await service.get_chapter_progress(current_user.id, chapter_id)
+
+    # Log completion
+    logger.info(
+        "Section completed",
+        user_id=current_user.id,
+        chapter_id=chapter_id,
+        section_id=section_id,
+        position=100,
+        time_spent=time_spent
+    )
+
+    return {
+        "section_id": section_id,
+        "chapter_id": chapter_id,
+        "completed_at": progress.updated_at.isoformat(),
+        "time_spent": time_spent,
+        "chapter_progress": chapter_progress["overall_progress"],
+        "sections_completed": chapter_progress["sections_completed"],
+        "message": "Section marked as completed"
+    }
+
+
+@router.get("/restore/{chapter_id}")
+@handle_errors
+async def restore_progress(
+    chapter_id: str,
+    current_user: User = Depends(get_current_active_user),
+    service: ReadingProgressService = Depends(get_progress_service)
+) -> Dict[str, Any]:
+    """Restore user's last position in a chapter."""
+    restored = await service.restore_progress(current_user.id, chapter_id)
+
301
+ if restored["section_id"]:
302
+ # Update last accessed
303
+ progress = await service.update_section_progress(
304
+ user_id=current_user.id,
305
+ chapter_id=chapter_id,
306
+ section_id=restored["section_id"],
307
+ position=restored["position"],
308
+ time_spent_delta=0
309
+ )
310
+
311
+ logger.info(
312
+ "Progress restored",
313
+ user_id=current_user.id,
314
+ chapter_id=chapter_id,
315
+ section_id=restored["section_id"],
316
+ position=restored["position"]
317
+ )
318
+
319
+ return restored
320
+
321
+
322
+ @router.get("/analytics")
323
+ @handle_errors
324
+ async def get_progress_analytics(
325
+ timeframe: str = Query("month", regex="^(day|week|month|year)$"),
326
+ current_user: User = Depends(get_current_active_user),
327
+ service: ReadingProgressService = Depends(get_progress_service)
328
+ ) -> Dict[str, Any]:
329
+ """Get detailed reading analytics."""
330
+ analytics = await service.get_reading_analytics(current_user.id, timeframe)
331
+
332
+ # Add additional user-specific analytics
333
+ personalization_service = PersonalizationService(service.db)
334
+ personalization = await personalization_service.get_user_personalization(current_user.id)
335
+
336
+ return {
337
+ **analytics,
338
+ "user_experience_level": personalization["experience_level"],
339
+ "user_preferences": personalization["preferences"],
340
+ "generated_at": datetime.utcnow().isoformat()
341
+ }
342
+
343
+
344
+ @router.post("/bulk")
345
+ @handle_errors
346
+ async def bulk_update_progress(
347
+ updates: List[ChapterProgressUpdate],
348
+ background_tasks: BackgroundTasks,
349
+ current_user: User = Depends(get_current_active_user),
350
+ service: ReadingProgressService = Depends(get_progress_service)
351
+ ) -> Dict[str, Any]:
352
+ """Bulk update progress for multiple chapters."""
353
+ results = []
354
+ total_updated = 0
355
+ total_errors = 0
356
+
357
+ for chapter_update in updates:
358
+ try:
359
+ chapter_result = await update_progress(
360
+ progress_update=chapter_update,
361
+ background_tasks=background_tasks,
362
+ current_user=current_user,
363
+ service=service
364
+ )
365
+ results.append(chapter_result)
366
+ total_updated += chapter_result["total_updated"]
367
+ total_errors += chapter_result["total_errors"]
368
+ except Exception as e:
369
+ logger.error(
370
+ "Failed to bulk update chapter progress",
371
+ user_id=current_user.id,
372
+ chapter_id=chapter_update.chapter_id,
373
+ error=str(e)
374
+ )
375
+ results.append({
376
+ "chapter_id": chapter_update.chapter_id,
377
+ "updated_sections": [],
378
+ "errors": [{"error": str(e)}],
379
+ "total_updated": 0,
380
+ "total_errors": 1
381
+ })
382
+ total_errors += 1
383
+
384
+ return {
385
+ "results": results,
386
+ "summary": {
387
+ "total_chapters": len(updates),
388
+ "total_updated": total_updated,
389
+ "total_errors": total_errors,
390
+ "success_rate": (total_updated / (total_updated + total_errors)) * 100 if (total_updated + total_errors) > 0 else 0
391
+ },
392
+ "message": f"Bulk update completed: {total_updated} sections updated, {total_errors} errors"
393
+ }
394
+
395
+
396
+ @router.delete("/chapter/{chapter_id}")
397
+ @handle_errors
398
+ async def reset_chapter_progress(
399
+ chapter_id: str,
400
+ current_user: User = Depends(get_current_active_user),
401
+ db: Session = Depends(get_db)
402
+ ) -> Dict[str, Any]:
403
+ """Reset all progress for a specific chapter."""
404
+ # Delete all progress records for this chapter
405
+ deleted = db.query(ReadingProgress).filter(
406
+ ReadingProgress.user_id == current_user.id,
407
+ ReadingProgress.chapter_id == chapter_id
408
+ ).delete()
409
+
410
+ db.commit()
411
+
412
+ logger.info(
413
+ "Chapter progress reset",
414
+ user_id=current_user.id,
415
+ chapter_id=chapter_id,
416
+ deleted_sections=deleted
417
+ )
418
+
419
+ return {
420
+ "chapter_id": chapter_id,
421
+ "deleted_sections": deleted,
422
+ "message": f"Progress for chapter {chapter_id} has been reset"
423
+ }
424
+
425
+
426
+ # Background task helper
427
+ async def calculate_recommendations_delayed(user_id: str):
428
+ """Background task to calculate recommendations after progress update."""
429
+ try:
430
+ from src.services.personalization import PersonalizationService
431
+ from src.database.base import SessionLocal
432
+
433
+ db = SessionLocal()
434
+ try:
435
+ service = PersonalizationService(db)
436
+ recommendations = await service.generate_recommendations(user_id, limit=5)
437
+
438
+ logger.info(
439
+ "Recommendations calculated",
440
+ user_id=user_id,
441
+ recommendations_count=len(recommendations)
442
+ )
443
+ finally:
444
+ db.close()
445
+ except Exception as e:
446
+ logger.error(
447
+ "Failed to calculate recommendations in background task",
448
+ user_id=user_id,
449
+ error=str(e)
450
+ )
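A minimal sketch of driving the session endpoints above from a client. The `/api/v1/progress` mount point and the bearer token are assumptions; the `include_router` call and the auth flow are not part of this hunk.

```python
import httpx

BASE = "http://localhost:7860/api/v1/progress"   # assumed mount prefix
HEADERS = {"Authorization": "Bearer <jwt>"}      # token obtained from the auth flow

with httpx.Client(base_url=BASE, headers=HEADERS) as client:
    # Open a session at the start of a chapter
    start = client.post("/session/start",
                        json={"chapter_id": "ch1", "section_id": "ch1_intro"})
    # ... reader spends time on the page ...
    end = client.post("/session/end", json={
        "chapter_id": "ch1",
        "section_id": "ch1_intro",
        "position": 100,   # position >= 100 marks the section completed
        "time_spent": 12,  # minutes, added to the stored total
    })
    print(end.json()["summary"])
```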
src/api/v1/reader_features.py ADDED
@@ -0,0 +1,94 @@
+ """
+ Reader features API routes v1.
+
+ API endpoints for progress tracking, bookmarks, preferences, and search.
+ """
+
+ from fastapi import APIRouter, Depends, HTTPException, Query
+ from sqlalchemy.orm import Session
+ from typing import List, Optional
+
+ from src.database.base import get_db
+ from src.middleware.auth import get_current_active_user, require_user
+ from src.models.auth import User
+ from src.utils.errors import handle_errors, NotFoundError, ValidationError
+ from src.utils.logging import get_logger
+
+ logger = get_logger(__name__)
+
+ router = APIRouter(
+     prefix="/reader-features",
+     tags=["reader-features"]
+ )
+
+ # Health check endpoint for reader features
+ @router.get("/health")
+ async def health_check():
+     """Health check for reader features API."""
+     return {
+         "status": "healthy",
+         "service": "reader-features",
+         "version": "1.0.0"
+     }
+
+ # Placeholder endpoints - will be implemented in user stories
+ @router.get("/progress")
+ @handle_errors
+ async def get_progress_summary(
+     current_user: User = Depends(get_current_active_user),
+     db: Session = Depends(get_db)
+ ):
+     """Get user's overall reading progress summary."""
+     # TODO: Implement in User Story 1
+     raise HTTPException(status_code=501, detail="Not implemented yet")
+
+ @router.get("/bookmarks")
+ @handle_errors
+ async def get_bookmarks(
+     limit: int = Query(50, ge=1, le=100),
+     offset: int = Query(0, ge=0),
+     current_user: User = Depends(get_current_active_user),
+     db: Session = Depends(get_db)
+ ):
+     """Get user's bookmarks."""
+     # TODO: Implement in User Story 4
+     raise HTTPException(status_code=501, detail="Not implemented yet")
+
+ @router.get("/preferences")
+ @handle_errors
+ async def get_preferences(
+     current_user: User = Depends(get_current_active_user),
+     db: Session = Depends(get_db)
+ ):
+     """Get user's reading preferences."""
+     # TODO: Implement in User Story 1
+     raise HTTPException(status_code=501, detail="Not implemented yet")
+
+ @router.get("/search")
+ @handle_errors
+ async def search_content(
+     q: str = Query(..., min_length=1, description="Search query"),
+     language: Optional[str] = Query(None, description="Filter by language"),
+     chapter: Optional[str] = Query(None, description="Filter by chapter"),
+     current_user: User = Depends(get_current_active_user),
+     db: Session = Depends(get_db)
+ ):
+     """Search content across all languages."""
+     # TODO: Implement in User Story 3
+     raise HTTPException(status_code=501, detail="Not implemented yet")
+
+ # Import all routers from individual feature modules
+ # These will be added as we implement each user story
+ # from .progress import router as progress_router
+ # from .bookmarks import router as bookmarks_router
+ # from .preferences import router as preferences_router
+ # from .search import router as search_router
+ # from .analytics import router as analytics_router
+
+ # Combine all routers
+ # api_router = APIRouter()
+ # api_router.include_router(progress_router, prefix="/progress", tags=["progress"])
+ # api_router.include_router(bookmarks_router, prefix="/bookmarks", tags=["bookmarks"])
+ # api_router.include_router(preferences_router, prefix="/preferences", tags=["preferences"])
+ # api_router.include_router(search_router, prefix="/search", tags=["search"])
+ # api_router.include_router(analytics_router, prefix="/analytics", tags=["analytics"])
src/api/v1/translation.py ADDED
@@ -0,0 +1,336 @@
+ """
+ Translation API endpoints using OpenAI Agents SDK.
+
+ Provides RESTful endpoints for translating text from English to Urdu
+ using the OpenAI Agents SDK with Gemini API integration.
+ """
+
+ from fastapi import APIRouter, Depends, HTTPException, Request
+ from fastapi.responses import JSONResponse
+ from fastapi import status
+ from typing import Optional, Dict, Any
+ import time
+
+ from src.services.openai_translation.translation_agent import OpenAITranslationAgent, TranslationContext
+ from src.services.openai_translation.client import get_gemini_client
+ from src.services.translation_cache import cache_service
+ from src.models.auth import User
+ from src.security.dependencies import get_current_user_or_anonymous
+
+ router = APIRouter(prefix="/translation", tags=["translation"])
+
+
+ @router.post("/translate", response_model=dict)
+ async def translate_text(
+     request: dict,
+     http_request: Request,
+     current_user: Optional[User] = Depends(get_current_user_or_anonymous)
+ ) -> JSONResponse:
+     """
+     Legacy translation endpoint (for backward compatibility).
+
+     This endpoint uses the OpenAI Agents SDK with the improved agent implementation.
+
+     Args:
+         request: Translation request with text and parameters
+         http_request: FastAPI request object
+         current_user: Optional current user
+
+     Returns:
+         Translation result
+     """
+     try:
+         # Extract request data
+         text = request.get("text", "")
+         source_language = request.get("source_language", "en")
+         target_language = request.get("target_language", "ur")
+         document_type = request.get("document_type")
+         technical_domain = request.get("technical_domain")
+         target_audience = request.get("target_audience")
+         model = request.get("model", "gemini-2.0-flash-lite")
+
+         # Create translation context
+         context = TranslationContext(
+             document_type=document_type,
+             technical_domain=technical_domain,
+             target_audience=target_audience
+         )
+
+         # Create agent and translate
+         agent = OpenAITranslationAgent(
+             gemini_client=get_gemini_client(),
+             model=model
+         )
+
+         result = await agent.translate_with_agent(
+             text=text,
+             context=context,
+             user_id=current_user.id if current_user else None
+         )
+
+         return JSONResponse(
+             status_code=status.HTTP_200_OK,
+             content={
+                 "job_id": f"translate_{int(time.time())}",
+                 "translated_text": result["translated_text"],
+                 "status": "completed",
+                 "progress": 100.0,
+                 "chunks": [],
+                 "processing_time_ms": 0,
+                 "cached": False,
+                 "input_tokens": result.get("tokens_used", 0),
+                 "output_tokens": 0,
+                 "estimated_cost_usd": 0.0,
+                 "confidence_score": result.get("confidence_score", 0.95)
+             }
+         )
+
+     except Exception as e:
+         return JSONResponse(
+             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+             content={
+                 "error": "TRANSLATION_ERROR",
+                 "message": "Failed to translate text"
+             }
+         )
+
+
+ @router.post("/translate/agent")
+ async def translate_with_agent(
+     request: dict,
+     http_request: Request,
+     current_user: Optional[User] = Depends(get_current_user_or_anonymous)
+ ) -> JSONResponse:
+     """
+     Translate text using OpenAI Agents SDK directly with caching.
+
+     This endpoint uses the OpenAI Agents SDK for translation with enhanced
+     context awareness and proper Runner.run pattern. Translations are cached
+     for 1 week to avoid redundant API calls.
+
+     Args:
+         request: Translation request
+         http_request: FastAPI request object
+         current_user: Optional current user
+
+     Returns:
+         Translation result with detailed metadata
+     """
+     try:
+         # Extract request parameters
+         text = request.get("text", "")
+         source_language = request.get("source_language", "en")
+         target_language = request.get("target_language", "ur")
+         page_url = request.get("page_url")
+         model = request.get("model", "gemini-2.0-flash-lite")
+
+         # Check cache first
+         cached_result = await cache_service.get_cached_translation(
+             text=text,
+             source_language=source_language,
+             target_language=target_language,
+             page_url=page_url
+         )
+
+         if cached_result:
+             return JSONResponse(
+                 status_code=status.HTTP_200_OK,
+                 content={
+                     "translated_text": cached_result["translated_text"],
+                     "original_text": cached_result["original_text"],
+                     "cached": True,
+                     "cache_created_at": cached_result["cache_created_at"],
+                     "cache_expires_at": cached_result["cache_expires_at"],
+                     "hit_count": cached_result["hit_count"],
+                     "tokens_used": 0,  # No tokens used for cached result
+                     "model": cached_result["model"],
+                     "confidence_score": cached_result["confidence_score"],
+                     "has_code_blocks": False,  # Would need to be stored in cache
+                     "code_blocks": []  # Would need to be stored in cache
+                 }
+             )
+
+         # Not in cache, proceed with translation
+         # Create translation context
+         context = TranslationContext(
+             page_url=page_url,
+             document_type=request.get("document_type"),
+             technical_domain=request.get("technical_domain"),
+             target_audience=request.get("target_audience")
+         )
+
+         # Create agent and translate
+         agent = OpenAITranslationAgent(
+             gemini_client=get_gemini_client(),
+             model=model
+         )
+
+         start_time = time.time()
+         result = await agent.translate_with_agent(
+             text=text,
+             context=context,
+             user_id=current_user.id if current_user else None
+         )
+         processing_time_ms = int((time.time() - start_time) * 1000)
+
+         # Cache the translation result
+         await cache_service.cache_translation(
+             text=text,
+             translated_text=result["translated_text"],
+             source_language=source_language,
+             target_language=target_language,
+             model=result.get("model", model),
+             confidence_score=result.get("confidence_score", 0.95),
+             processing_time_ms=processing_time_ms,
+             page_url=page_url
+         )
+
+         return JSONResponse(
+             status_code=status.HTTP_200_OK,
+             content={
+                 "translated_text": result["translated_text"],
+                 "original_text": result["original_text"],
+                 "cached": False,
+                 "tokens_used": result.get("tokens_used", 0),
+                 "model": result.get("model", model),
+                 "confidence_score": result.get("confidence_score", 0.95),
+                 "has_code_blocks": result.get("has_code_blocks", False),
+                 "code_blocks": result.get("code_blocks", []),
+                 "processing_time_ms": processing_time_ms
+             }
+         )
+
+     except Exception as e:
+         return JSONResponse(
+             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+             content={
+                 "error": "AGENT_TRANSLATION_ERROR",
+                 "message": "Failed to translate text using agent"
+             }
+         )
+
+
+ @router.get("/health")
+ async def health_check() -> JSONResponse:
+     """
+     Simple health check endpoint for translation service.
+
+     Returns:
+         Health status
+     """
+     return JSONResponse(
+         status_code=status.HTTP_200_OK,
+         content={
+             "status": "healthy",
+             "service": "translation",
+             "version": "2.0.0",
+             "features": ["openai_agents_sdk", "gemini_api", "translation_cache"]
+         }
+     )
+
+
+ @router.post("/cache/clear-expired")
+ async def clear_expired_cache(
+     current_user: Optional[User] = Depends(get_current_user_or_anonymous)
+ ) -> JSONResponse:
+     """
+     Clear expired cache entries.
+
+     Returns:
+         Number of cleared entries
+     """
+     try:
+         cleared_count = await cache_service.clear_expired_cache()
+         return JSONResponse(
+             status_code=status.HTTP_200_OK,
+             content={
+                 "message": f"Cleared {cleared_count} expired cache entries",
+                 "cleared_count": cleared_count
+             }
+         )
+     except Exception as e:
+         return JSONResponse(
+             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+             content={
+                 "error": "CACHE_CLEAR_ERROR",
+                 "message": "Failed to clear expired cache"
+             }
+         )
+
+
+ @router.post("/cache/clear-url")
+ async def clear_cache_by_url(
+     request: dict,
+     current_user: Optional[User] = Depends(get_current_user_or_anonymous)
+ ) -> JSONResponse:
+     """
+     Clear cache entries for a specific URL.
+
+     Args:
+         request: Dict containing 'url' and optional 'source_language' and 'target_language'
+
+     Returns:
+         Number of cleared entries
+     """
+     try:
+         url = request.get("url")
+         if not url:
+             return JSONResponse(
+                 status_code=status.HTTP_400_BAD_REQUEST,
+                 content={
+                     "error": "INVALID_REQUEST",
+                     "message": "URL is required"
+                 }
+             )
+
+         source_language = request.get("source_language")
+         target_language = request.get("target_language")
+
+         cleared_count = await cache_service.clear_cache_by_url(
+             page_url=url,
+             source_language=source_language,
+             target_language=target_language
+         )
+
+         return JSONResponse(
+             status_code=status.HTTP_200_OK,
+             content={
+                 "message": f"Cleared {cleared_count} cache entries for URL",
+                 "url": url,
+                 "cleared_count": cleared_count
+             }
+         )
+     except Exception as e:
+         return JSONResponse(
+             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+             content={
+                 "error": "CACHE_CLEAR_URL_ERROR",
+                 "message": "Failed to clear cache for URL"
+             }
+         )
+
+
+ @router.get("/cache/stats")
+ async def get_cache_stats(
+     current_user: Optional[User] = Depends(get_current_user_or_anonymous)
+ ) -> JSONResponse:
+     """
+     Get translation cache statistics.
+
+     Returns:
+         Cache statistics
+     """
+     try:
+         stats = await cache_service.get_cache_stats()
+         return JSONResponse(
+             status_code=status.HTTP_200_OK,
+             content=stats
+         )
+     except Exception as e:
+         return JSONResponse(
+             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+             content={
+                 "error": "CACHE_STATS_ERROR",
+                 "message": "Failed to retrieve cache statistics"
+             }
+         )
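A minimal sketch of calling the cached agent endpoint above. Only the `/translation` router prefix is visible in this hunk, so the `/api/v1` mount point is an assumption; a second identical request should come back with `"cached": true`.

```python
import httpx

resp = httpx.post(
    "http://localhost:7860/api/v1/translation/translate/agent",  # assumed mount prefix
    json={
        "text": "Humanoid robots perceive the world through sensors.",
        "source_language": "en",
        "target_language": "ur",
        "page_url": "https://example.com/docs/chapter-1",  # enables per-URL cache clearing
        "model": "gemini-2.0-flash-lite",
    },
    timeout=60,
)
data = resp.json()
print(data["translated_text"], data["cached"], data.get("processing_time_ms"))
```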
src/config/logging_config.py ADDED
@@ -0,0 +1,442 @@
+ """
+ Production-ready logging configuration.
+
+ Configures structured logging with multiple handlers, sensitive data filtering,
+ and integration with monitoring systems.
+ """
+
+ import re
+ import sys
+ import json
+ import asyncio
+ import logging
+ import logging.handlers
+ import functools
+ from typing import Dict, Any, List, Optional, Union
+ from pathlib import Path
+ from datetime import datetime
+ from contextvars import ContextVar
+
+ import structlog
+ from pythonjsonlogger import jsonlogger
+
+ from .translation_config import get_config
+
+
+ # Context variables for request tracking (suffixed *_var so the
+ # bind_context() parameters below do not shadow them)
+ request_id_var: ContextVar[Optional[str]] = ContextVar('request_id', default=None)
+ user_id_var: ContextVar[Optional[str]] = ContextVar('user_id', default=None)
+ session_id_var: ContextVar[Optional[str]] = ContextVar('session_id', default=None)
+
+
+ class SensitiveDataFilter(logging.Filter):
+     """Filter to mask sensitive data in log records."""
+
+     def __init__(self, sensitive_fields: Optional[List[str]] = None, mask_char: str = "*"):
+         super().__init__()
+         self.sensitive_fields = [field.lower() for field in (sensitive_fields or [])]
+         self.mask_char = mask_char
+
+     def filter(self, record):
+         """Filter sensitive data from log record."""
+         # Filter message
+         if hasattr(record, 'msg') and record.msg:
+             record.msg = self._mask_sensitive_data(str(record.msg))
+
+         # Filter args
+         if hasattr(record, 'args') and record.args:
+             record.args = tuple(
+                 self._mask_sensitive_data(str(arg)) if isinstance(arg, str) else arg
+                 for arg in record.args
+             )
+
+         # Filter extra attributes
+         for attr_name in dir(record):
+             if not attr_name.startswith('_') and attr_name not in {
+                 'name', 'msg', 'args', 'levelname', 'levelno', 'pathname',
+                 'filename', 'module', 'lineno', 'funcName', 'created',
+                 'msecs', 'relativeCreated', 'thread', 'threadName',
+                 'processName', 'process', 'getMessage', 'exc_info',
+                 'exc_text', 'stack_info'
+             }:
+                 attr_value = getattr(record, attr_name)
+                 if isinstance(attr_value, str):
+                     setattr(record, attr_name, self._mask_sensitive_data(attr_value))
+
+         return True
+
+     def _mask_sensitive_data(self, text: str) -> str:
+         """Mask sensitive data in text."""
+         # General patterns
+         patterns = [
+             (r'(?i)(api[_-]?key["\']?\s*[:=]\s*["\']?)([\w\-\.]+)', lambda m: f"{m.group(1)}{self.mask_char * len(m.group(2))}"),
+             (r'(?i)(password["\']?\s*[:=]\s*["\']?)([\w\-\.]+)', lambda m: f"{m.group(1)}{self.mask_char * len(m.group(2))}"),
+             (r'(?i)(token["\']?\s*[:=]\s*["\']?)([\w\-\.]+)', lambda m: f"{m.group(1)}{self.mask_char * len(m.group(2))}"),
+             (r'(?i)(secret["\']?\s*[:=]\s*["\']?)([\w\-\.]+)', lambda m: f"{m.group(1)}{self.mask_char * len(m.group(2))}"),
+             (r'(?i)(authorization["\']?\s*[:=]\s*["\']?)([\w\-\.]+)', lambda m: f"{m.group(1)}{self.mask_char * len(m.group(2))}"),
+             (r'(Bearer\s+)([\w\-\.]+)', lambda m: f"{m.group(1)}{self.mask_char * len(m.group(2))}"),
+         ]
+
+         # Custom field patterns
+         for field in self.sensitive_fields:
+             patterns.append(
+                 (rf'(?i)({field}["\']?\s*[:=]\s*["\']?)([\w\-\.]+)',
+                  lambda m, f=field: f"{m.group(1)}{self.mask_char * len(m.group(2))}")
+             )
+
+         # Apply patterns
+         for pattern, replacement in patterns:
+             text = re.sub(pattern, replacement, text)
+
+         return text
+
+
+ class ContextFilter(logging.Filter):
+     """Add context information to log records."""
+
+     def filter(self, record):
+         """Add context variables to log record."""
+         record.request_id = request_id_var.get()
+         record.user_id = user_id_var.get()
+         record.session_id = session_id_var.get()
+         return True
+
+
+ class JSONFormatter(jsonlogger.JsonFormatter):
+     """Custom JSON formatter with additional fields."""
+
+     def add_fields(self, log_record, record, message_dict):
+         """Add custom fields to JSON log record."""
+         super().add_fields(log_record, record, message_dict)
+
+         # Add timestamp
+         if not log_record.get('timestamp'):
+             log_record['timestamp'] = datetime.utcnow().isoformat()
+
+         # Add context
+         if hasattr(record, 'request_id') and record.request_id:
+             log_record['request_id'] = record.request_id
+         if hasattr(record, 'user_id') and record.user_id:
+             log_record['user_id'] = record.user_id
+         if hasattr(record, 'session_id') and record.session_id:
+             log_record['session_id'] = record.session_id
+
+         # Add exception details
+         if record.exc_info:
+             log_record['exception'] = {
+                 'type': record.exc_info[0].__name__,
+                 'message': str(record.exc_info[1]),
+                 'traceback': self.formatException(record.exc_info)
+             }
+
+         # Add source location
+         log_record['source'] = {
+             'file': record.filename,
+             'line': record.lineno,
+             'function': record.funcName,
+             'module': record.module
+         }
+
+
+ class ColoredFormatter(logging.Formatter):
+     """Colored formatter for console output."""
+
+     COLORS = {
+         'DEBUG': '\033[36m',     # Cyan
+         'INFO': '\033[32m',      # Green
+         'WARNING': '\033[33m',   # Yellow
+         'ERROR': '\033[31m',     # Red
+         'CRITICAL': '\033[35m',  # Magenta
+         'RESET': '\033[0m'       # Reset
+     }
+
+     def format(self, record):
+         """Format log record with colors."""
+         log_color = self.COLORS.get(record.levelname, self.COLORS['RESET'])
+         reset = self.COLORS['RESET']
+
+         # Add color to levelname
+         record.levelname = f"{log_color}{record.levelname}{reset}"
+
+         # Add request ID if present
+         if hasattr(record, 'request_id') and record.request_id:
+             record.msg = f"[{record.request_id[:8]}] {record.msg}"
+
+         return super().format(record)
+
+
+ def setup_logging() -> None:
+     """Setup logging configuration based on environment."""
+     config = get_config()
+
+     # Get root logger
+     root_logger = logging.getLogger()
+     root_logger.setLevel(getattr(logging, config.logging.level.value))
+
+     # Clear existing handlers
+     root_logger.handlers.clear()
+
+     # Create formatters
+     if config.logging.json_format:
+         formatter = JSONFormatter(
+             '%(asctime)s %(name)s %(levelname)s %(message)s'
+         )
+     else:
+         formatter = logging.Formatter(
+             config.logging.format,
+             datefmt='%Y-%m-%d %H:%M:%S'
+         )
+
+     # Console handler
+     console_handler = logging.StreamHandler(sys.stdout)
+     if config.logging.json_format:
+         console_handler.setFormatter(formatter)
+     else:
+         console_handler.setFormatter(ColoredFormatter(config.logging.format))
+     console_handler.addFilter(ContextFilter())
+     root_logger.addHandler(console_handler)
+
+     # File handler (if enabled)
+     if config.logging.file_logging:
+         setup_file_handler(root_logger, formatter, config)
+
+     # Apply sensitive data filter
+     if config.logging.filter_sensitive_data:
+         sensitive_filter = SensitiveDataFilter(config.logging.sensitive_fields)
+         for handler in root_logger.handlers:
+             handler.addFilter(sensitive_filter)
+
+     # Configure structlog (JSON renderer in JSON mode, console renderer otherwise)
+     renderer = (
+         structlog.processors.JSONRenderer()
+         if config.logging.json_format
+         else structlog.dev.ConsoleRenderer()
+     )
+     structlog.configure(
+         processors=[
+             structlog.stdlib.filter_by_level,
+             structlog.stdlib.add_logger_name,
+             structlog.stdlib.add_log_level,
+             structlog.stdlib.PositionalArgumentsFormatter(),
+             structlog.processors.TimeStamper(fmt="iso"),
+             structlog.processors.StackInfoRenderer(),
+             structlog.processors.format_exc_info,
+             structlog.processors.UnicodeDecoder(),
+             renderer,
+         ],
+         context_class=dict,
+         logger_factory=structlog.stdlib.LoggerFactory(),
+         wrapper_class=structlog.stdlib.BoundLogger,
+         cache_logger_on_first_use=True,
+     )
+
+     # Log configuration (stdlib logger, so interpolate rather than pass kwargs)
+     logger = logging.getLogger(__name__)
+     logger.info(
+         "Logging configured: level=%s json_format=%s file_logging=%s filter_sensitive=%s",
+         config.logging.level.value,
+         config.logging.json_format,
+         config.logging.file_logging,
+         config.logging.filter_sensitive_data,
+     )
+
+
+ def setup_file_handler(
+     logger: logging.Logger,
+     formatter: Union[logging.Formatter, JSONFormatter],
+     config
+ ) -> None:
+     """Setup file handler with rotation."""
+     # Create logs directory
+     log_path = Path(config.logging.file_path)
+     log_path.parent.mkdir(parents=True, exist_ok=True)
+
+     # Parse rotation settings
+     when = "midnight"
+     if config.logging.file_rotation.endswith(" day"):
+         when = "midnight"
+     elif config.logging.file_rotation.endswith(" hour"):
+         when = "H"
+     elif config.logging.file_rotation.endswith(" minute"):
+         when = "M"
+
+     # Parse backup count from retention
+     backup_count = 30  # Default
+     if "days" in config.logging.file_retention:
+         backup_count = int(config.logging.file_retention.split()[0])
+
+     # Create rotating file handler
+     try:
+         file_handler = logging.handlers.RotatingFileHandler(
+             filename=log_path,
+             maxBytes=_parse_size(config.logging.max_file_size),
+             backupCount=backup_count,
+             encoding='utf-8'
+         )
+     except Exception:
+         # Fallback to TimedRotatingFileHandler
+         file_handler = logging.handlers.TimedRotatingFileHandler(
+             filename=log_path,
+             when=when,
+             backupCount=backup_count,
+             encoding='utf-8'
+         )
+
+     file_handler.setFormatter(formatter)
+     file_handler.addFilter(ContextFilter())
+     logger.addHandler(file_handler)
+
+
+ def _parse_size(size_str: str) -> int:
+     """Parse a human-readable size string (e.g. "100 MB") to bytes."""
+     size_str = size_str.upper().strip()
+     # Check multi-character units first so "100 MB" is not matched by "B"
+     multipliers = {
+         'GB': 1024 ** 3,
+         'MB': 1024 ** 2,
+         'KB': 1024,
+         'B': 1,
+     }
+
+     for unit, multiplier in multipliers.items():
+         if size_str.endswith(unit):
+             return int(float(size_str[:-len(unit)]) * multiplier)
+
+     return int(size_str)
+
+
+ def bind_context(
+     request_id: Optional[str] = None,
+     user_id: Optional[str] = None,
+     session_id: Optional[str] = None
+ ) -> Dict[str, Any]:
+     """Bind context variables for logging."""
+     context = {}
+
+     if request_id:
+         request_id_var.set(request_id)
+         context['request_id'] = request_id
+
+     if user_id:
+         user_id_var.set(user_id)
+         context['user_id'] = user_id
+
+     if session_id:
+         session_id_var.set(session_id)
+         context['session_id'] = session_id
+
+     return context
+
+
+ def unbind_context() -> None:
+     """Clear all context variables."""
+     request_id_var.set(None)
+     user_id_var.set(None)
+     session_id_var.set(None)
+
+
+ class LogContext:
+     """Context manager for log context."""
+
+     def __init__(
+         self,
+         request_id: Optional[str] = None,
+         user_id: Optional[str] = None,
+         session_id: Optional[str] = None,
+         **kwargs
+     ):
+         # Build the desired context; variables are only set on __enter__
+         # so the previous values can be saved and restored correctly.
+         self.context = {}
+         if request_id:
+             self.context['request_id'] = request_id
+         if user_id:
+             self.context['user_id'] = user_id
+         if session_id:
+             self.context['session_id'] = session_id
+         self.context.update(kwargs)
+         self.old_context = {}
+
+     def __enter__(self):
+         # Store old context, then bind the new values
+         for key, value in self.context.items():
+             var = globals().get(f"{key}_var")
+             if var:
+                 self.old_context[key] = var.get()
+                 var.set(value)
+
+         return self.context
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         # Restore old context
+         for key, value in self.old_context.items():
+             var = globals().get(f"{key}_var")
+             if var:
+                 var.set(value)
+
+
+ def log_function_call(func):
+     """Decorator to log function calls."""
+
+     @functools.wraps(func)
+     def wrapper(*args, **kwargs):
+         logger = logging.getLogger(func.__module__)
+         logger.debug("Function called: %s (args=%d, kwargs=%s)",
+                      func.__name__, len(args), list(kwargs.keys()))
+         try:
+             result = func(*args, **kwargs)
+             logger.debug("Function completed: %s", func.__name__)
+             return result
+         except Exception as e:
+             logger.error("Function failed: %s (%s: %s)",
+                          func.__name__, type(e).__name__, e)
+             raise
+
+     @functools.wraps(func)
+     async def async_wrapper(*args, **kwargs):
+         logger = logging.getLogger(func.__module__)
+         logger.debug("Async function called: %s (args=%d, kwargs=%s)",
+                      func.__name__, len(args), list(kwargs.keys()))
+         try:
+             result = await func(*args, **kwargs)
+             logger.debug("Async function completed: %s", func.__name__)
+             return result
+         except Exception as e:
+             logger.error("Async function failed: %s (%s: %s)",
+                          func.__name__, type(e).__name__, e)
+             raise
+
+     return async_wrapper if asyncio.iscoroutinefunction(func) else wrapper
+
+
+ # Initialize logging on import
+ setup_logging()
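Since setup_logging() runs at import time, a minimal sketch of the intended usage is just importing the module and scoping a request with LogContext; the request ID here is an assumption standing in for whatever your request middleware generates.

```python
import logging
from src.config.logging_config import LogContext

logger = logging.getLogger("translation.request")

with LogContext(request_id="req-1234abcd", user_id="user-42"):
    # ContextFilter attaches request_id/user_id to every record emitted here,
    # so the JSON formatter can include them automatically.
    logger.info("translation requested")
# On exit, the previous context values are restored.
```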
src/config/translation_config.py ADDED
@@ -0,0 +1,432 @@
1
+ """
2
+ Translation Service Configuration Management.
3
+
4
+ Centralized configuration for the OpenAI Translation Service with
5
+ environment-based overrides and validation.
6
+ """
7
+
8
+ import os
9
+ import json
10
+ import yaml
11
+ from typing import Dict, Any, Optional, Union, List
12
+ from dataclasses import dataclass, field, asdict
13
+ from pathlib import Path
14
+ from enum import Enum
15
+
16
+ from pydantic import BaseModel, Field, validator
17
+ from src.utils.translation_logger import get_translation_logger
18
+
19
+ logger = get_translation_logger(__name__)
20
+
21
+
22
+ class LogLevel(str, Enum):
23
+ """Log levels for the translation service."""
24
+ DEBUG = "DEBUG"
25
+ INFO = "INFO"
26
+ WARNING = "WARNING"
27
+ ERROR = "ERROR"
28
+ CRITICAL = "CRITICAL"
29
+
30
+
31
+ class Environment(str, Enum):
32
+ """Environment types."""
33
+ DEVELOPMENT = "development"
34
+ TESTING = "testing"
35
+ STAGING = "staging"
36
+ PRODUCTION = "production"
37
+
38
+
39
+ class CacheBackend(str, Enum):
40
+ """Cache backend types."""
41
+ MEMORY = "memory"
42
+ REDIS = "redis"
43
+ DATABASE = "database"
44
+
45
+
46
+ @dataclass
47
+ class GeminiConfig:
48
+ """Configuration for Gemini API."""
49
+ api_key: str = field(default_factory=lambda: os.getenv("GEMINI_API_KEY", ""))
50
+ base_url: str = field(
51
+ default_factory=lambda: os.getenv(
52
+ "GEMINI_BASE_URL",
53
+ "https://generativelanguage.googleapis.com/v1beta/openai/"
54
+ )
55
+ )
56
+ default_model: str = field(
57
+ default_factory=lambda: os.getenv("GEMINI_MODEL", "gemini-2.0-flash-lite")
58
+ )
59
+ organization: Optional[str] = field(default_factory=lambda: os.getenv("OPENAI_ORGANIZATION"))
60
+
61
+ # Connection settings
62
+ timeout: float = field(default_factory=lambda: float(os.getenv("GEMINI_TIMEOUT", "60")))
63
+ max_retries: int = field(default_factory=lambda: int(os.getenv("GEMINI_MAX_RETRIES", "3")))
64
+ retry_delay: float = field(default_factory=lambda: float(os.getenv("GEMINI_RETRY_DELAY", "1.0")))
65
+
66
+ # Advanced settings
67
+ proxy: Optional[str] = field(default_factory=lambda: os.getenv("HTTP_PROXY"))
68
+ custom_headers: Dict[str, str] = field(default_factory=dict)
69
+ http2: bool = field(default_factory=lambda: os.getenv("GEMINI_HTTP2", "true").lower() == "true")
70
+
71
+ # Rate limiting
72
+ requests_per_minute: int = field(default_factory=lambda: int(os.getenv("GEMINI_RPM", "60")))
73
+ requests_per_hour: int = field(default_factory=lambda: int(os.getenv("GEMINI_RPH", "1000")))
74
+
75
+ # Model pricing (USD per 1M tokens)
76
+ pricing: Dict[str, Dict[str, float]] = field(default_factory=lambda: {
77
+ "gemini-2.0-flash-lite": {"input": 0.000075, "output": 0.00015},
78
+ "gemini-2.5-pro": {"input": 0.00125, "output": 0.00375}
79
+ })
80
+
81
+
82
+ @dataclass
83
+ class OpenAIAgentsConfig:
84
+ """Configuration for OpenAI Agents SDK."""
85
+ enabled: bool = field(default_factory=lambda: os.getenv("OPENAI_AGENTS_ENABLED", "true").lower() == "true")
86
+ enable_tracing: bool = field(default_factory=lambda: os.getenv("OPENAI_AGENTS_TRACING", "false").lower() == "true")
87
+ verbose_logging: bool = field(default_factory=lambda: os.getenv("OPENAI_AGENTS_VERBOSE", "false").lower() == "true")
88
+
89
+ # Agent settings
90
+ default_temperature: float = field(default_factory=lambda: float(os.getenv("AGENT_DEFAULT_TEMPERATURE", "0.3")))
91
+ default_max_tokens: int = field(default_factory=lambda: int(os.getenv("AGENT_MAX_TOKENS", "2048")))
92
+ max_turns: int = field(default_factory=lambda: int(os.getenv("AGENT_MAX_TURNS", "5")))
93
+
94
+ # Tool settings
95
+ enable_html_tool: bool = field(default_factory=lambda: os.getenv("AGENT_HTML_TOOL", "true").lower() == "true")
96
+ enable_code_tool: bool = field(default_factory=lambda: os.getenv("AGENT_CODE_TOOL", "true").lower() == "true")
97
+ enable_quality_tool: bool = field(default_factory=lambda: os.getenv("AGENT_QUALITY_TOOL", "true").lower() == "true")
98
+
99
+ # Quality settings
100
+ quality_check_enabled: bool = field(default_factory=lambda: os.getenv("AGENT_QUALITY_CHECK", "true").lower() == "true")
101
+ confidence_threshold: float = field(default_factory=lambda: float(os.getenv("AGENT_CONFIDENCE_THRESHOLD", "0.8")))
102
+
103
+
104
+ @dataclass
105
+ class CacheConfig:
106
+ """Configuration for caching."""
107
+ backend: CacheBackend = field(
108
+ default_factory=lambda: CacheBackend(os.getenv("CACHE_BACKEND", "memory"))
109
+ )
110
+
111
+ # TTL settings
112
+ default_ttl_hours: int = field(default_factory=lambda: int(os.getenv("CACHE_DEFAULT_TTL", "168"))) # 7 days
113
+ high_quality_ttl_hours: int = field(default_factory=lambda: int(os.getenv("CACHE_HIGH_QUALITY_TTL", "720"))) # 30 days
114
+ low_quality_ttl_hours: int = field(default_factory=lambda: int(os.getenv("CACHE_LOW_QUALITY_TTL", "24"))) # 1 day
115
+
116
+ # Redis settings
117
+ redis_url: str = field(default_factory=lambda: os.getenv("REDIS_URL", "redis://localhost:6379"))
118
+ redis_prefix: str = field(default_factory=lambda: os.getenv("REDIS_PREFIX", "translation:"))
119
+ redis_max_connections: int = field(default_factory=lambda: int(os.getenv("REDIS_MAX_CONNECTIONS", "10")))
120
+
121
+ # Memory cache settings
122
+ memory_max_size: int = field(default_factory=lambda: int(os.getenv("CACHE_MEMORY_MAX_SIZE", "1000")))
123
+ memory_cleanup_interval: int = field(default_factory=lambda: int(os.getenv("CACHE_CLEANUP_INTERVAL", "3600")))
124
+
125
+
126
+ @dataclass
127
+ class DatabaseConfig:
128
+ """Configuration for database connections."""
129
+ url: str = field(default_factory=lambda: os.getenv(
130
+ "DATABASE_URL",
131
+ "sqlite:///./translation.db"
132
+ ))
133
+ pool_size: int = field(default_factory=lambda: int(os.getenv("DB_POOL_SIZE", "5")))
134
+ max_overflow: int = field(default_factory=lambda: int(os.getenv("DB_MAX_OVERFLOW", "10")))
135
+ pool_timeout: int = field(default_factory=lambda: int(os.getenv("DB_POOL_TIMEOUT", "30")))
136
+ pool_recycle: int = field(default_factory=lambda: int(os.getenv("DB_POOL_RECYCLE", "3600")))
137
+
138
+ # Migration settings
139
+ auto_migrate: bool = field(default_factory=lambda: os.getenv("DB_AUTO_MIGRATE", "true").lower() == "true")
140
+ migration_timeout: int = field(default_factory=lambda: int(os.getenv("DB_MIGRATION_TIMEOUT", "300")))
141
+
142
+
143
+ @dataclass
144
+ class LoggingConfig:
145
+ """Configuration for logging."""
146
+ level: LogLevel = field(default_factory=lambda: LogLevel(os.getenv("LOG_LEVEL", "INFO")))
147
+ format: str = field(
148
+ default_factory=lambda: os.getenv(
149
+ "LOG_FORMAT",
150
+ "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
151
+ )
152
+ )
153
+
154
+ # File logging
155
+ file_logging: bool = field(default_factory=lambda: os.getenv("LOG_FILE_ENABLED", "true").lower() == "true")
156
+ file_path: str = field(default_factory=lambda: os.getenv("LOG_FILE_PATH", "logs/translation.log"))
157
+ file_rotation: str = field(default_factory=lambda: os.getenv("LOG_FILE_ROTATION", "1 day"))
158
+ file_retention: str = field(default_factory=lambda: os.getenv("LOG_FILE_RETENTION", "30 days"))
159
+ max_file_size: str = field(default_factory=lambda: os.getenv("LOG_MAX_FILE_SIZE", "100 MB"))
160
+
161
+ # Structured logging
162
+ json_format: bool = field(default_factory=lambda: os.getenv("LOG_JSON_FORMAT", "false").lower() == "true")
163
+ include_request_id: bool = field(default_factory=lambda: os.getenv("LOG_INCLUDE_REQUEST_ID", "true").lower() == "true")
164
+
165
+ # Sensitive data filtering
166
+ filter_sensitive_data: bool = field(default_factory=lambda: os.getenv("LOG_FILTER_SENSITIVE", "true").lower() == "true")
167
+ sensitive_fields: List[str] = field(default_factory=lambda: [
168
+ "api_key", "password", "token", "authorization"
169
+ ])
170
+
171
+
172
+ @dataclass
173
+ class RateLimitConfig:
174
+ """Configuration for rate limiting."""
175
+ enabled: bool = field(default_factory=lambda: os.getenv("RATE_LIMIT_ENABLED", "true").lower() == "true")
176
+
177
+ # Global limits
178
+ requests_per_minute: int = field(default_factory=lambda: int(os.getenv("RATE_LIMIT_RPM", "60")))
179
+ requests_per_hour: int = field(default_factory=lambda: int(os.getenv("RATE_LIMIT_RPH", "1000")))
180
+ requests_per_day: int = field(default_factory=lambda: int(os.getenv("RATE_LIMIT_RPD", "10000")))
181
+
182
+ # Translation-specific limits
183
+ translation_rpm: int = field(default_factory=lambda: int(os.getenv("TRANSLATION_RPM", "10")))
184
+ translation_rph: int = field(default_factory=lambda: int(os.getenv("TRANSLATION_RPH", "500")))
185
+
186
+ # Enforcement
187
+ block_duration: int = field(default_factory=lambda: int(os.getenv("RATE_LIMIT_BLOCK_DURATION", "3600")))
188
+ warning_threshold: float = field(default_factory=lambda: float(os.getenv("RATE_LIMIT_WARNING_THRESHOLD", "0.8")))
189
+
190
+ # Redis backend for distributed limiting
191
+ redis_backend: bool = field(default_factory=lambda: os.getenv("RATE_LIMIT_REDIS", "false").lower() == "true")
192
+
193
+
194
+ @dataclass
195
+ class SecurityConfig:
196
+ """Configuration for security settings."""
197
+ # API key validation
198
+ require_api_key: bool = field(default_factory=lambda: os.getenv("SECURITY_REQUIRE_API_KEY", "false").lower() == "true")
199
+ api_key_header: str = field(default_factory=lambda: os.getenv("SECURITY_API_KEY_HEADER", "X-API-Key"))
200
+
201
+ # Request validation
202
+ max_text_length: int = field(default_factory=lambda: int(os.getenv("SECURITY_MAX_TEXT_LENGTH", "100000")))
203
+ max_chunks: int = field(default_factory=lambda: int(os.getenv("SECURITY_MAX_CHUNKS", "100")))
204
+
205
+ # CORS settings
206
+ cors_origins: List[str] = field(default_factory=lambda: os.getenv("CORS_ORIGINS", "*").split(","))
207
+ cors_methods: List[str] = field(default_factory=lambda: os.getenv("CORS_METHODS", "GET,POST").split(","))
208
+ cors_headers: List[str] = field(default_factory=lambda: os.getenv("CORS_HEADERS", "*").split(","))
209
+
210
+ # Content filtering
211
+ enable_content_filter: bool = field(default_factory=lambda: os.getenv("SECURITY_CONTENT_FILTER", "true").lower() == "true")
212
+ blocked_patterns: List[str] = field(default_factory=lambda: os.getenv(
213
+ "SECURITY_BLOCKED_PATTERNS",
214
+ ""
215
+ ).split(",") if os.getenv("SECURITY_BLOCKED_PATTERNS") else [])
216
+
217
+ # IP-based restrictions
218
+ ip_whitelist: List[str] = field(default_factory=lambda: os.getenv("SECURITY_IP_WHITELIST", "").split(","))
219
+ ip_blacklist: List[str] = field(default_factory=lambda: os.getenv("SECURITY_IP_BLACKLIST", "").split(","))
220
+
221
+
222
+ @dataclass
223
+ class MonitoringConfig:
224
+ """Configuration for monitoring and metrics."""
225
+ enabled: bool = field(default_factory=lambda: os.getenv("MONITORING_ENABLED", "true").lower() == "true")
226
+
227
+ # Metrics
228
+ metrics_endpoint: str = field(default_factory=lambda: os.getenv("METRICS_ENDPOINT", "/metrics"))
229
+ metrics_port: int = field(default_factory=lambda: int(os.getenv("METRICS_PORT", "9090")))
230
+
231
+ # Health checks
232
+ health_endpoint: str = field(default_factory=lambda: os.getenv("HEALTH_ENDPOINT", "/health"))
233
+ detailed_health: bool = field(default_factory=lambda: os.getenv("HEALTH_DETAILED", "true").lower() == "true")
234
+
235
+ # Performance tracking
236
+ track_performance: bool = field(default_factory=lambda: os.getenv("TRACK_PERFORMANCE", "true").lower() == "true")
237
+ slow_query_threshold_ms: int = field(default_factory=lambda: int(os.getenv("SLOW_QUERY_THRESHOLD", "1000")))
238
+
239
+ # Error tracking
240
+ track_errors: bool = field(default_factory=lambda: os.getenv("TRACK_ERRORS", "true").lower() == "true")
241
+ error_sample_rate: float = field(default_factory=lambda: float(os.getenv("ERROR_SAMPLE_RATE", "1.0")))
242
+
243
+ # External integrations
244
+ sentry_dsn: Optional[str] = field(default_factory=lambda: os.getenv("SENTRY_DSN"))
245
+ prometheus_gateway: Optional[str] = field(default_factory=lambda: os.getenv("PROMETHEUS_GATEWAY"))
246
+
247
+
248
+ class TranslationConfig(BaseModel):
249
+ """Main configuration for the translation service."""
250
+ environment: Environment = Field(default=Environment.DEVELOPMENT)
251
+ debug: bool = Field(default=False)
252
+
253
+ # Component configurations
254
+ gemini: GeminiConfig = Field(default_factory=GeminiConfig)
255
+ openai_agents: OpenAIAgentsConfig = Field(default_factory=OpenAIAgentsConfig)
256
+ cache: CacheConfig = Field(default_factory=CacheConfig)
257
+ database: DatabaseConfig = Field(default_factory=DatabaseConfig)
258
+ logging: LoggingConfig = Field(default_factory=LoggingConfig)
259
+ rate_limit: RateLimitConfig = Field(default_factory=RateLimitConfig)
260
+ security: SecurityConfig = Field(default_factory=SecurityConfig)
261
+ monitoring: MonitoringConfig = Field(default_factory=MonitoringConfig)
262
+
263
+ # Feature flags
264
+ features: Dict[str, bool] = Field(default_factory=lambda: {
265
+ "streaming": True,
266
+ "quality_check": True,
267
+ "chunking": True,
268
+ "code_preservation": True,
269
+ "html_preservation": True,
270
+ "batch_translation": True
271
+ })
272
+
273
+ class Config:
274
+ env_file = ".env"
275
+ env_file_encoding = "utf-8"
276
+ case_sensitive = False
277
+
278
+ @validator("environment", pre=True)
279
+ def parse_environment(cls, v):
280
+ """Parse environment from string."""
281
+ if isinstance(v, str):
282
+ return Environment(v.lower())
283
+ return v
284
+
285
+ def __init__(self, **data):
286
+ """Initialize configuration with environment detection."""
287
+ # Auto-detect environment if not specified
288
+ if "environment" not in data:
289
+ env = os.getenv("ENVIRONMENT", os.getenv("ENV", "development")).lower()
290
+ data["environment"] = Environment(env)
291
+
292
+ # Set debug flag based on environment
293
+ if "debug" not in data:
294
+ data["debug"] = data["environment"] == Environment.DEVELOPMENT
295
+
296
+ super().__init__(**data)
297
+
298
+ # Validate configuration
299
+ self.validate_config()
300
+
301
+ def validate_config(self) -> None:
302
+ """Validate the configuration."""
303
+ errors = []
304
+
305
+ # Validate Gemini configuration
306
+ if not self.gemini.api_key:
307
+ errors.append("GEMINI_API_KEY is required")
308
+
309
+ if self.gemini.timeout <= 0:
310
+ errors.append("GEMINI_TIMEOUT must be positive")
311
+
312
+ if self.gemini.max_retries < 0:
313
+ errors.append("GEMINI_MAX_RETRIES must be non-negative")
314
+
315
+ # Validate database URL if provided
316
+ if self.database.url and not self.database.url.startswith(("sqlite://", "postgresql://", "mysql://")):
317
+ errors.append("DATABASE_URL must be a valid database connection string")
318
+
319
+ # Validate cache configuration
320
+ if self.cache.backend == CacheBackend.REDIS and not self.cache.redis_url:
321
+ errors.append("REDIS_URL is required when using Redis cache backend")
322
+
323
+ # Validate rate limits
324
+ if self.rate_limit.requests_per_minute <= 0:
325
+ errors.append("RATE_LIMIT_RPM must be positive")
326
+
327
+ # Log errors and raise if any
328
+ if errors:
329
+ for error in errors:
330
+ logger.error(f"Configuration validation error: {error}")
331
+ raise ValueError(f"Configuration validation failed: {'; '.join(errors)}")
332
+
333
+ logger.info("Configuration validated successfully", environment=self.environment.value)
334
+
335
+ @classmethod
336
+ def from_file(cls, config_path: Union[str, Path]) -> "TranslationConfig":
337
+ """Load configuration from file."""
338
+ config_path = Path(config_path)
339
+
340
+ if not config_path.exists():
341
+ raise FileNotFoundError(f"Configuration file not found: {config_path}")
342
+
343
+ # Parse based on file extension
344
+ with open(config_path, "r", encoding="utf-8") as f:
345
+ if config_path.suffix.lower() in [".yaml", ".yml"]:
346
+ data = yaml.safe_load(f)
347
+ elif config_path.suffix.lower() == ".json":
348
+ data = json.load(f)
349
+ else:
350
+ raise ValueError(f"Unsupported configuration file format: {config_path.suffix}")
351
+
352
+ # Override with environment variables
353
+ return cls(**data)
354
+
355
+ def to_dict(self) -> Dict[str, Any]:
356
+ """Convert configuration to dictionary."""
357
+ return {
358
+ "environment": self.environment.value,
359
+ "debug": self.debug,
360
+ "gemini": asdict(self.gemini),
361
+ "openai_agents": asdict(self.openai_agents),
362
+ "cache": asdict(self.cache),
363
+ "database": asdict(self.database),
364
+ "logging": {
365
+ **asdict(self.logging),
366
+ "level": self.logging.level.value
367
+ },
368
+ "rate_limit": asdict(self.rate_limit),
369
+ "security": asdict(self.security),
370
+ "monitoring": asdict(self.monitoring),
371
+ "features": self.features
372
+ }
373
+
374
+ def save_to_file(self, config_path: Union[str, Path]) -> None:
375
+ """Save configuration to file."""
376
+ config_path = Path(config_path)
377
+ config_path.parent.mkdir(parents=True, exist_ok=True)
378
+
379
+ data = self.to_dict()
380
+
381
+ with open(config_path, "w", encoding="utf-8") as f:
382
+ if config_path.suffix.lower() in [".yaml", ".yml"]:
383
+ yaml.dump(data, f, default_flow_style=False, indent=2)
384
+ elif config_path.suffix.lower() == ".json":
385
+ json.dump(data, f, indent=2)
386
+ else:
387
+ raise ValueError(f"Unsupported configuration file format: {config_path.suffix}")
388
+
389
+ logger.info(f"Configuration saved to {config_path}")
390
+
391
+ def get_model_pricing(self, model: str) -> Dict[str, float]:
392
+ """Get pricing for a specific model."""
393
+ return self.gemini.pricing.get(model, self.gemini.pricing["gemini-2.0-flash-lite"])
394
+
395
+ def is_feature_enabled(self, feature: str) -> bool:
396
+ """Check if a feature is enabled."""
397
+ return self.features.get(feature, False)
398
+
399
+ def should_use_agents(self) -> bool:
400
+ """Determine if OpenAI Agents SDK should be used."""
401
+ return self.openai_agents.enabled and self.is_feature_enabled("quality_check")
402
+
403
+
404
+ # Global configuration instance
405
+ _config: Optional[TranslationConfig] = None
406
+
407
+
408
+ def get_config() -> TranslationConfig:
409
+ """Get the global configuration instance."""
410
+ global _config
411
+ if _config is None:
412
+ _config = TranslationConfig()
413
+ return _config
414
+
415
+
416
+ def load_config(config_path: Optional[Union[str, Path]] = None) -> TranslationConfig:
417
+ """Load configuration from file or environment."""
418
+ global _config
419
+
420
+ if config_path:
421
+ _config = TranslationConfig.from_file(config_path)
422
+ else:
423
+ _config = TranslationConfig()
424
+
425
+ return _config
426
+
427
+
428
+ def reload_config() -> TranslationConfig:
429
+ """Reload configuration from environment."""
430
+ global _config
431
+ _config = TranslationConfig()
432
+ return _config
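
For orientation, a minimal usage sketch of the three accessors above, assuming the module is importable as src.config.translation_config (the YAML path is illustrative):

from src.config.translation_config import get_config, load_config, reload_config

config = get_config()  # lazily builds the singleton from environment variables
if config.is_feature_enabled("quality_check"):
    pricing = config.get_model_pricing("gemini-2.0-flash-lite")

config = load_config("config/translation.yaml")  # or hydrate from a YAML/JSON file (hypothetical path)
config = reload_config()  # re-read the environment after variables change
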
src/database/base.py CHANGED
@@ -7,7 +7,7 @@ from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import sessionmaker
 import os
 
- # Create the declarative base
+ # Create the declarative base - this will be the single source of truth
 Base = declarative_base()
 
 # Database URL from environment
src/middleware/auth.py ADDED
@@ -0,0 +1,302 @@
1
+ """
2
+ Authentication middleware for API routes.
3
+
4
+ This module provides JWT-based authentication middleware for protecting API endpoints.
5
+ """
6
+
7
+ import os
+ from datetime import datetime, timedelta
8
+ from typing import Optional, Dict, Any
9
+
10
+ from fastapi import HTTPException, status, Depends
11
+ from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
12
+ from jose import JWTError, jwt
13
+ from passlib.context import CryptContext
14
+ from sqlalchemy.orm import Session
15
+
16
+ from src.database.base import get_db
17
+ from src.models.auth import User
18
+
19
+ # Configuration
20
+ SECRET_KEY = os.getenv("JWT_SECRET_KEY", "your-secret-key-here") # Loaded from environment; override in production
21
+ ALGORITHM = "HS256"
22
+ ACCESS_TOKEN_EXPIRE_MINUTES = 30
23
+
24
+ # Password hashing
25
+ pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")
26
+
27
+ # Security scheme for FastAPI
28
+ security = HTTPBearer(auto_error=False)
29
+
30
+
31
+ def verify_password(plain_password: str, hashed_password: str) -> bool:
32
+ """Verify a password against its hash."""
33
+ return pwd_context.verify(plain_password, hashed_password)
34
+
35
+
36
+ def get_password_hash(password: str) -> str:
37
+ """Generate password hash."""
38
+ return pwd_context.hash(password)
39
+
40
+
41
+ def create_access_token(data: dict, expires_delta: Optional[timedelta] = None) -> str:
42
+ """Create JWT access token."""
43
+ to_encode = data.copy()
44
+ if expires_delta:
45
+ expire = datetime.utcnow() + expires_delta
46
+ else:
47
+ expire = datetime.utcnow() + timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES)
48
+
49
+ to_encode.update({"exp": expire})
50
+ encoded_jwt = jwt.encode(to_encode, SECRET_KEY, algorithm=ALGORITHM)
51
+ return encoded_jwt
52
+
53
+
54
+ def decode_token(token: str) -> Dict[str, Any]:
55
+ """Decode and validate JWT token."""
56
+ try:
57
+ payload = jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
58
+ return payload
59
+ except JWTError as e:
60
+ raise HTTPException(
61
+ status_code=status.HTTP_401_UNAUTHORIZED,
62
+ detail=f"Could not validate credentials: {str(e)}",
63
+ headers={"WWW-Authenticate": "Bearer"},
64
+ )
65
+
66
+
67
+ async def get_current_user(
68
+ credentials: Optional[HTTPAuthorizationCredentials] = Depends(security),
69
+ db: Session = Depends(get_db)
70
+ ) -> User:
71
+ """Get the current authenticated user."""
72
+ if not credentials:
73
+ raise HTTPException(
74
+ status_code=status.HTTP_401_UNAUTHORIZED,
75
+ detail="Not authenticated",
76
+ headers={"WWW-Authenticate": "Bearer"},
77
+ )
78
+
79
+ token = credentials.credentials
80
+ payload = decode_token(token)
81
+
82
+ user_id: str = payload.get("sub")
83
+ if user_id is None:
84
+ raise HTTPException(
85
+ status_code=status.HTTP_401_UNAUTHORIZED,
86
+ detail="Could not validate credentials",
87
+ headers={"WWW-Authenticate": "Bearer"},
88
+ )
89
+
90
+ user = db.query(User).filter(User.id == user_id).first()
91
+ if user is None:
92
+ raise HTTPException(
93
+ status_code=status.HTTP_401_UNAUTHORIZED,
94
+ detail="User not found",
95
+ headers={"WWW-Authenticate": "Bearer"},
96
+ )
97
+
98
+ return user
99
+
100
+
101
+ async def get_current_active_user(
102
+ current_user: User = Depends(get_current_user)
103
+ ) -> User:
104
+ """Get the current active user."""
105
+ if not current_user.is_active:
106
+ raise HTTPException(
107
+ status_code=status.HTTP_400_BAD_REQUEST,
108
+ detail="Inactive user"
109
+ )
110
+ return current_user
111
+
112
+
113
+ async def get_optional_current_user(
114
+ credentials: Optional[HTTPAuthorizationCredentials] = Depends(security),
115
+ db: Session = Depends(get_db)
116
+ ) -> Optional[User]:
117
+ """Get the current user if authenticated, otherwise return None."""
118
+ if not credentials:
119
+ return None
120
+
121
+ try:
122
+ token = credentials.credentials
123
+ payload = decode_token(token)
124
+
125
+ user_id: str = payload.get("sub")
126
+ if user_id is None:
127
+ return None
128
+
129
+ user = db.query(User).filter(User.id == user_id).first()
130
+ return user if user and user.is_active else None
131
+
132
+ except HTTPException:
133
+ return None
134
+
135
+
136
+ # Role-based access control
137
+ class RoleChecker:
138
+ """Check if user has required role."""
139
+
140
+ def __init__(self, allowed_roles: list):
141
+ self.allowed_roles = allowed_roles
142
+
143
+ def __call__(self, current_user: User = Depends(get_current_active_user)) -> User:
144
+ if current_user.role not in self.allowed_roles:
145
+ raise HTTPException(
146
+ status_code=status.HTTP_403_FORBIDDEN,
147
+ detail="Not enough permissions"
148
+ )
149
+ return current_user
150
+
151
+
152
+ # Pre-defined role checkers
153
+ require_admin = RoleChecker(["admin"])
154
+ require_user = RoleChecker(["user", "admin"])
155
+
156
+
157
+ # Authentication dependencies
158
+ def authenticate_user(email: str, password: str, db: Session) -> Optional[User]:
159
+ """Authenticate user with email and password."""
160
+ user = db.query(User).filter(User.email == email).first()
161
+ if not user:
162
+ return None
163
+ if not verify_password(password, user.hashed_password):
164
+ return None
165
+ return user
166
+
167
+
168
+ # Rate limiting middleware
169
+ from slowapi import Limiter
170
+ from slowapi.util import get_remote_address
171
+ from slowapi.errors import RateLimitExceeded
172
+
173
+ limiter = Limiter(key_func=get_remote_address)
174
+
175
+
176
+ class RateLimitMiddleware:
177
+ """Rate limiting middleware for API endpoints."""
178
+
179
+ def __init__(self, times: int, seconds: int):
+ self.times = times
+ self.seconds = seconds
+
+ def __call__(self, endpoint):
+ return limiter.limit(f"{self.times} per {self.seconds} seconds")(endpoint)
185
+
186
+
187
+ # Pre-defined rate limiters
188
+ auth_rate_limit = RateLimitMiddleware(5, 60) # 5 requests per minute
+ general_rate_limit = RateLimitMiddleware(100, 60) # 100 requests per minute
+ upload_rate_limit = RateLimitMiddleware(10, 60) # 10 requests per minute
191
+
192
+
193
+ # CORS middleware configuration
194
+ from fastapi.middleware.cors import CORSMiddleware
195
+
196
+ def create_cors_middleware(allow_origins: list = None) -> CORSMiddleware:
197
+ """Create CORS middleware with specified origins."""
198
+ return CORSMiddleware(
199
+ allow_origins=allow_origins or ["http://localhost:3000"],
200
+ allow_credentials=True,
201
+ allow_methods=["*"],
202
+ allow_headers=["*"],
203
+ )
204
+
205
+
206
+ # Request logging middleware
207
+ import logging
208
+ import time
209
+ from fastapi import Request, Response
210
+
211
+ logger = logging.getLogger(__name__)
212
+
213
+ async def log_requests(request: Request, call_next):
214
+ """Log all API requests with timing."""
215
+ start_time = time.time()
216
+
217
+ # Get client IP
218
+ client_ip = request.client.host if request.client else "unknown"
219
+
220
+ # Get user if authenticated
221
+ user = getattr(request.state, 'user', None)
222
+ user_id = user.id if user else "anonymous"
223
+
224
+ # Log request
225
+ logger.info(
226
+ f"Request started",
227
+ extra={
228
+ "method": request.method,
229
+ "url": str(request.url),
230
+ "client_ip": client_ip,
231
+ "user_id": user_id,
232
+ "headers": dict(request.headers),
233
+ }
234
+ )
235
+
236
+ # Process request
237
+ response = await call_next(request)
238
+
239
+ # Calculate duration
240
+ process_time = time.time() - start_time
241
+
242
+ # Log response
243
+ logger.info(
244
+ f"Request completed",
245
+ extra={
246
+ "method": request.method,
247
+ "url": str(request.url),
248
+ "status_code": response.status_code,
249
+ "process_time": process_time,
250
+ "client_ip": client_ip,
251
+ "user_id": user_id,
252
+ }
253
+ )
254
+
255
+ # Add timing header
256
+ response.headers["X-Process-Time"] = str(process_time)
257
+
258
+ return response
259
+
260
+
261
+ # Security headers middleware
262
+ async def add_security_headers(request: Request, call_next):
263
+ """Add security headers to responses."""
264
+ response = await call_next(request)
265
+
266
+ # Add security headers
267
+ response.headers["X-Content-Type-Options"] = "nosniff"
268
+ response.headers["X-Frame-Options"] = "DENY"
269
+ response.headers["X-XSS-Protection"] = "1; mode=block"
270
+ response.headers["Strict-Transport-Security"] = "max-age=31536000; includeSubDomains"
271
+ response.headers["Referrer-Policy"] = "strict-origin-when-cross-origin"
272
+ response.headers["Content-Security-Policy"] = "default-src 'self'"
273
+
274
+ return response
275
+
276
+
277
+ # Token refresh endpoint dependencies
278
+ def create_refresh_token(data: dict) -> str:
279
+ """Create a refresh token with longer expiry."""
280
+ to_encode = data.copy()
281
+ expire = datetime.utcnow() + timedelta(days=7) # 7 days
282
+ to_encode.update({"exp": expire, "type": "refresh"})
283
+ return jwt.encode(to_encode, SECRET_KEY, algorithm=ALGORITHM)
284
+
285
+
286
+ async def verify_refresh_token(token: str) -> Dict[str, Any]:
287
+ """Verify refresh token and return payload."""
288
+ try:
289
+ payload = decode_token(token)
290
+ if payload.get("type") != "refresh":
291
+ raise HTTPException(
292
+ status_code=status.HTTP_401_UNAUTHORIZED,
293
+ detail="Invalid refresh token"
294
+ )
295
+ return payload
296
+ except HTTPException:
297
+ raise
298
+ except Exception as e:
299
+ raise HTTPException(
300
+ status_code=status.HTTP_401_UNAUTHORIZED,
301
+ detail=f"Could not validate refresh token: {str(e)}"
302
+ )
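
A sketch of plugging these dependencies into routes, assuming a FastAPI app (the route paths are illustrative):

from fastapi import Depends, FastAPI
from src.middleware.auth import get_current_active_user, require_admin
from src.models.auth import User

app = FastAPI()

@app.get("/me")
async def read_me(current_user: User = Depends(get_current_active_user)):
    # Any authenticated, active user reaches this handler
    return {"id": current_user.id, "email": current_user.email}

@app.delete("/admin/users/{user_id}")
async def delete_user(user_id: str, admin: User = Depends(require_admin)):
    # RoleChecker rejects non-admin users with a 403 before the body runs
    return {"deleted": user_id}
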
src/middleware/cors.py ADDED
@@ -0,0 +1,356 @@
1
+ """
2
+ CORS middleware configuration for frontend-backend communication.
3
+
4
+ Provides configurable Cross-Origin Resource Sharing middleware.
5
+ """
6
+
7
+ import os
8
+ from typing import List, Optional
9
+
10
+ from fastapi import FastAPI, Request, Response
11
+ from fastapi.middleware.cors import CORSMiddleware
12
+ from fastapi.middleware.base import BaseHTTPMiddleware
13
+ from starlette.middleware.base import RequestResponseEndpoint
14
+
15
+
16
+ class CustomCORSMiddleware(BaseHTTPMiddleware):
17
+ """Custom CORS middleware with additional security features."""
18
+
19
+ def __init__(
20
+ self,
21
+ app: FastAPI,
22
+ allow_origins: List[str] = None,
23
+ allow_methods: List[str] = None,
24
+ allow_headers: List[str] = None,
25
+ expose_headers: List[str] = None,
26
+ allow_credentials: bool = True,
27
+ max_age: int = 86400, # 24 hours
28
+ strict_mode: bool = False
29
+ ):
30
+ super().__init__(app)
31
+ self.allow_origins = allow_origins or self._get_default_origins()
32
+ self.allow_methods = allow_methods or ["GET", "POST", "PUT", "DELETE", "OPTIONS", "PATCH"]
33
+ self.allow_headers = allow_headers or ["*"]
34
+ self.expose_headers = expose_headers or []
35
+ self.allow_credentials = allow_credentials
36
+ self.max_age = max_age
37
+ self.strict_mode = strict_mode
38
+
39
+ # Apply FastAPI's CORS middleware
40
+ app.add_middleware(
41
+ CORSMiddleware,
42
+ allow_origins=self.allow_origins,
43
+ allow_credentials=self.allow_credentials,
44
+ allow_methods=self.allow_methods,
45
+ allow_headers=self.allow_headers,
46
+ expose_headers=self.expose_headers,
47
+ max_age=self.max_age
48
+ )
49
+
50
+ def _get_default_origins(self) -> List[str]:
51
+ """Get default allowed origins from environment."""
52
+ env_origins = os.getenv("CORS_ORIGINS", "")
53
+ if env_origins:
54
+ return [origin.strip() for origin in env_origins.split(",")]
55
+
56
+ # Default origins for development
57
+ default_origins = [
58
+ "http://localhost:3000",
59
+ "http://localhost:3001",
60
+ "http://127.0.0.1:3000",
61
+ "http://127.0.0.1:3001",
62
+ ]
63
+
64
+ # Add production URL if available
65
+ if os.getenv("FRONTEND_URL"):
66
+ default_origins.append(os.getenv("FRONTEND_URL"))
67
+
68
+ return default_origins
69
+
70
+ async def dispatch(self, request: Request, call_next: RequestResponseEndpoint) -> Response:
71
+ """Add additional CORS security features."""
72
+
73
+ # Preflight (OPTIONS) requests go through the same pipeline; the
+ # security headers below are added to preflight and regular responses alike
+ response = await call_next(request)
79
+
80
+ # Add security headers
81
+ self._add_security_headers(request, response)
82
+
83
+ # Log CORS requests in strict mode
84
+ if self.strict_mode:
85
+ self._log_cors_request(request, response)
86
+
87
+ return response
88
+
89
+ def _add_security_headers(self, request: Request, response: Response):
90
+ """Add additional security headers."""
91
+ # Remove server information
92
+ response.headers["Server"] = ""
93
+
94
+ # CSP header (Content Security Policy)
95
+ csp_directives = [
96
+ "default-src 'self'",
97
+ "script-src 'self' 'unsafe-inline' 'unsafe-eval'",
98
+ "style-src 'self' 'unsafe-inline'",
99
+ "img-src 'self' data: https:",
100
+ "font-src 'self' data:",
101
+ "connect-src 'self'",
102
+ "frame-ancestors 'none'",
103
+ "base-uri 'self'",
104
+ "form-action 'self'",
105
+ ]
106
+ response.headers["Content-Security-Policy"] = "; ".join(csp_directives)
107
+
108
+ # Additional security headers
109
+ response.headers["X-Content-Type-Options"] = "nosniff"
110
+ response.headers["X-Frame-Options"] = "DENY"
111
+ response.headers["X-XSS-Protection"] = "1; mode=block"
112
+ response.headers["Referrer-Policy"] = "strict-origin-when-cross-origin"
113
+
114
+ # HSTS (only in production with HTTPS)
115
+ if os.getenv("ENVIRONMENT") == "production" and request.url.scheme == "https":
116
+ response.headers["Strict-Transport-Security"] = "max-age=31536000; includeSubDomains"
117
+
118
+ # Permissions Policy
119
+ permissions_policy = [
120
+ "geolocation=()",
121
+ "microphone=()",
122
+ "camera=()",
123
+ "payment=()",
124
+ "usb=()",
125
+ "magnetometer=()",
126
+ "gyroscope=()",
127
+ "accelerometer=()",
128
+ ]
129
+ response.headers["Permissions-Policy"] = ", ".join(permissions_policy)
130
+
131
+ def _log_cors_request(self, request: Request, response: Response):
132
+ """Log CORS-related requests for monitoring."""
133
+ from src.utils.logging import get_logger
134
+
135
+ logger = get_logger("cors")
136
+
137
+ origin = request.headers.get("origin")
138
+ if origin:
139
+ if origin not in self.allow_origins:
140
+ logger.warning(
141
+ "Cross-origin request from unauthorized origin",
142
+ origin=origin,
143
+ path=request.url.path,
144
+ method=request.method,
145
+ )
146
+ else:
147
+ logger.info(
148
+ "Cross-origin request allowed",
149
+ origin=origin,
150
+ path=request.url.path,
151
+ method=request.method,
152
+ )
153
+
154
+
155
+ class RateLimitCORSMiddleware(BaseHTTPMiddleware):
156
+ """CORS middleware with rate limiting per origin."""
157
+
158
+ def __init__(
159
+ self,
160
+ app: FastAPI,
161
+ requests_per_minute: int = 100,
162
+ burst_size: int = 200
163
+ ):
164
+ super().__init__(app)
165
+ self.requests_per_minute = requests_per_minute
166
+ self.burst_size = burst_size
167
+ self.request_counts = {} # Simple in-memory tracking
168
+
169
+ async def dispatch(self, request: Request, call_next: RequestResponseEndpoint) -> Response:
170
+ """Apply rate limiting based on origin."""
171
+ import time
172
+ from fastapi import HTTPException
173
+
174
+ origin = request.headers.get("origin")
175
+ if origin:
176
+ current_time = time.time()
177
+ minute_key = int(current_time // 60)
178
+
179
+ # Clean old entries
180
+ self._cleanup_old_entries(minute_key)
181
+
182
+ # Track requests
183
+ origin_key = f"{origin}:{minute_key}"
184
+ count = self.request_counts.get(origin_key, 0)
185
+
186
+ if count >= self.requests_per_minute:
187
+ raise HTTPException(
188
+ status_code=429,
189
+ detail="Too many requests from this origin",
190
+ headers={
191
+ "Retry-After": "60",
192
+ "X-RateLimit-Limit": str(self.requests_per_minute),
193
+ "X-RateLimit-Remaining": "0",
194
+ "X-RateLimit-Reset": str((minute_key + 1) * 60)
195
+ }
196
+ )
197
+
198
+ self.request_counts[origin_key] = count + 1
199
+
200
+ response = await call_next(request)
201
+
202
+ # Add rate limit headers
203
+ if origin:
204
+ response.headers["X-RateLimit-Limit"] = str(self.requests_per_minute)
205
+ remaining = max(0, self.requests_per_minute - self.request_counts.get(origin_key, 0))
206
+ response.headers["X-RateLimit-Remaining"] = str(remaining)
207
+
208
+ return response
209
+
210
+ def _cleanup_old_entries(self, current_minute: int):
211
+ """Remove old entries from request counts."""
212
+ keys_to_remove = []
213
+ for key in self.request_counts.keys():
214
+ key_minute = int(key.split(":")[-1])
215
+ if current_minute - key_minute > 5: # Keep 5 minutes of history
216
+ keys_to_remove.append(key)
217
+
218
+ for key in keys_to_remove:
219
+ del self.request_counts[key]
220
+
221
+
222
+ def configure_cors(
223
+ app: FastAPI,
224
+ environment: str = "development"
225
+ ) -> None:
226
+ """Configure CORS based on environment."""
227
+
228
+ if environment == "production":
229
+ # Production CORS settings
230
+ origins = os.getenv("CORS_ORIGINS", "").split(",") if os.getenv("CORS_ORIGINS") else []
231
+
232
+ # Add production frontend URL
233
+ frontend_url = os.getenv("FRONTEND_URL")
234
+ if frontend_url and frontend_url not in origins:
235
+ origins.append(frontend_url)
236
+
237
+ # In production, be strict about origins
238
+ if origins:
239
+ app.add_middleware(
240
+ CustomCORSMiddleware,
241
+ allow_origins=origins,
242
+ allow_methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"],
243
+ allow_headers=["Authorization", "Content-Type", "X-Requested-With"],
244
+ expose_headers=["X-Total-Count", "X-Page-Count"],
245
+ strict_mode=True
246
+ )
247
+
248
+ # Add rate limiting
249
+ app.add_middleware(
250
+ RateLimitCORSMiddleware,
251
+ requests_per_minute=int(os.getenv("RATE_LIMIT_PER_MINUTE", "100"))
252
+ )
253
+
254
+ else:
255
+ # Development CORS settings - more permissive
256
+ app.add_middleware(
257
+ CustomCORSMiddleware,
258
+ allow_origins=[
259
+ "http://localhost:3000",
260
+ "http://localhost:3001",
261
+ "http://127.0.0.1:3000",
262
+ "http://127.0.0.1:3001",
263
+ "http://localhost:5173", # Vite dev server
264
+ "http://127.0.0.1:5173",
265
+ ],
266
+ allow_credentials=True,
267
+ strict_mode=False
268
+ )
269
+
270
+
271
+ # CORS configuration for specific routes
272
+ class RouteSpecificCORSMiddleware(BaseHTTPMiddleware):
273
+ """Apply different CORS settings to specific routes."""
274
+
275
+ def __init__(
276
+ self,
277
+ app: FastAPI,
278
+ path_prefix: str,
279
+ cors_config: dict
280
+ ):
281
+ super().__init__(app)
282
+ self.path_prefix = path_prefix
283
+ self.cors_config = cors_config
284
+
285
+ async def dispatch(self, request: Request, call_next: RequestResponseEndpoint) -> Response:
286
+ """Apply route-specific CORS configuration."""
287
+ if request.url.path.startswith(self.path_prefix):
288
+ # Apply custom CORS settings for this route
289
+ origin = request.headers.get("origin")
290
+ if origin and self.cors_config.get("allowed_origins"):
291
+ if origin in self.cors_config["allowed_origins"]:
292
+ response = await call_next(request)
293
+ response.headers["Access-Control-Allow-Origin"] = origin
294
+ response.headers["Access-Control-Allow-Credentials"] = "true"
295
+
296
+ methods = self.cors_config.get("allowed_methods", [])
+ if methods:
+ response.headers["Access-Control-Allow-Methods"] = ", ".join(methods)
+
+ headers = self.cors_config.get("allowed_headers", [])
+ if headers:
+ response.headers["Access-Control-Allow-Headers"] = ", ".join(headers)
301
+
302
+ return response
+
+ # Fall through to default handling for unmatched origins and all other routes
+ return await call_next(request)
306
+
307
+
308
+ # Pre-configured CORS settings for different environments
309
+ CORS_CONFIGS = {
310
+ "development": {
311
+ "allowed_origins": ["http://localhost:3000", "http://localhost:5173"],
312
+ "allowed_methods": ["GET", "POST", "PUT", "DELETE", "OPTIONS"],
313
+ "allowed_headers": ["*"],
314
+ "allow_credentials": True,
315
+ "strict_mode": False
316
+ },
317
+ "staging": {
318
+ "allowed_origins": ["https://staging.example.com"],
319
+ "allowed_methods": ["GET", "POST", "PUT", "DELETE", "OPTIONS"],
320
+ "allowed_headers": ["Authorization", "Content-Type"],
321
+ "allow_credentials": True,
322
+ "strict_mode": True
323
+ },
324
+ "production": {
325
+ "allowed_origins": ["https://example.com"],
326
+ "allowed_methods": ["GET", "POST", "PUT", "DELETE", "OPTIONS"],
327
+ "allowed_headers": ["Authorization", "Content-Type"],
328
+ "allow_credentials": True,
329
+ "strict_mode": True
330
+ }
331
+ }
332
+
333
+
334
+ def setup_cors_with_config(
335
+ app: FastAPI,
336
+ config_name: str = "development"
337
+ ) -> None:
338
+ """Setup CORS using pre-configured settings."""
339
+
340
+ config = CORS_CONFIGS.get(config_name, CORS_CONFIGS["development"])
341
+
342
+ # CORS_CONFIGS uses allowed_* keys, so map them onto the allow_*
+ # keyword arguments that CustomCORSMiddleware actually accepts
+ app.add_middleware(
+ CustomCORSMiddleware,
+ allow_origins=config["allowed_origins"],
+ allow_methods=config["allowed_methods"],
+ allow_headers=config["allowed_headers"],
+ allow_credentials=config["allow_credentials"],
+ strict_mode=config["strict_mode"],
+ )
346
+
347
+ # Log CORS configuration
348
+ from src.utils.logging import get_logger
349
+
350
+ logger = get_logger("cors")
351
+ logger.info(
352
+ "CORS configured",
353
+ environment=config_name,
354
+ allowed_origins=config["allowed_origins"],
355
+ allow_credentials=config["allow_credentials"]
356
+ )
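
A startup sketch for this module, assuming the ENVIRONMENT variable selects the mode (the module path matches this diff; everything else is illustrative):

import os
from fastapi import FastAPI
from src.middleware.cors import configure_cors, setup_cors_with_config

app = FastAPI()

# Environment-driven setup: strict origins plus per-origin rate limiting in production
configure_cors(app, environment=os.getenv("ENVIRONMENT", "development"))

# Alternatively, apply one of the pre-baked CORS_CONFIGS profiles:
# setup_cors_with_config(app, config_name="staging")
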
src/middleware/rate_limit.py ADDED
@@ -0,0 +1,385 @@
1
+ """
2
+ Rate Limiting Middleware for Translation API.
3
+
4
+ This middleware implements per-IP and per-user rate limiting
5
+ to prevent abuse and manage Gemini API quotas effectively.
6
+ """
7
+
8
+ import time
9
+ import asyncio
10
+ from typing import Any, Dict, Optional
11
+ from fastapi import Request, HTTPException, status
12
+ from fastapi.responses import JSONResponse
13
+ from starlette.middleware.base import BaseHTTPMiddleware
14
+
15
+ from src.utils.translation_logger import get_translation_logger
16
+
17
+ logger = get_translation_logger(__name__)
18
+
19
+
20
+ class RateLimitMiddleware(BaseHTTPMiddleware):
21
+ """
22
+ Middleware for rate limiting API requests.
23
+
24
+ Implements:
25
+ - Per-IP rate limiting
26
+ - Per-user rate limiting (if authenticated)
27
+ - Fixed-window counters (per minute and per hour)
28
+ - Redis-based storage (if available)
29
+ - In-memory fallback
30
+ """
31
+
32
+ def __init__(
33
+ self,
34
+ app,
35
+ *,
36
+ requests_per_minute: int = 60,
37
+ requests_per_hour: int = 1000,
38
+ redis_client=None
39
+ ):
40
+ """
41
+ Initialize rate limit middleware.
42
+
43
+ Args:
44
+ app: FastAPI application
45
+ requests_per_minute: Requests allowed per minute per client
46
+ requests_per_hour: Requests allowed per hour per client
47
+ redis_client: Optional Redis client for distributed rate limiting
48
+ """
49
+ super().__init__(app)
50
+ self.requests_per_minute = requests_per_minute
51
+ self.requests_per_hour = requests_per_hour
52
+ self.redis_client = redis_client
53
+
54
+ # In-memory storage fallback
55
+ self.ip_rate_limits: Dict[str, Dict[str, Any]] = {}
56
+ self.user_rate_limits: Dict[str, Dict[str, Any]] = {}
57
+
58
+ logger.info(
59
+ "Rate limit middleware initialized",
60
+ requests_per_minute=requests_per_minute,
61
+ requests_per_hour=requests_per_hour,
62
+ redis_enabled=redis_client is not None
63
+ )
64
+
65
+ async def dispatch(self, request: Request, call_next):
66
+ """
67
+ Process request with rate limiting.
68
+
69
+ Args:
70
+ request: Incoming request
71
+ call_next: Next middleware/endpoint
72
+
73
+ Returns:
74
+ Response or rate limit error
75
+ """
76
+ # Skip rate limiting for health checks
77
+ if request.url.path in ["/health", "/health/enhanced", "/metrics/health"]:
78
+ return await call_next(request)
79
+
80
+ # Get client identifiers
81
+ client_ip = self._get_client_ip(request)
82
+ user_id = self._get_user_id(request)
83
+
84
+ # Check rate limits
85
+ await self._check_rate_limits(client_ip, user_id)
86
+
87
+ # Process request
88
+ response = await call_next(request)
89
+
90
+ # Add rate limit headers
91
+ self._add_rate_limit_headers(response, client_ip, user_id)
92
+
93
+ return response
94
+
95
+ def _get_client_ip(self, request: Request) -> str:
96
+ """Get client IP address from request."""
97
+ # Check for forwarded headers
98
+ forwarded_for = request.headers.get("X-Forwarded-For")
99
+ if forwarded_for:
100
+ return forwarded_for.split(",")[0].strip()
101
+
102
+ real_ip = request.headers.get("X-Real-IP")
103
+ if real_ip:
104
+ return real_ip
105
+
106
+ # Fall back to direct connection IP
107
+ return request.client.host if request.client else "unknown"
108
+
109
+ def _get_user_id(self, request: Request) -> Optional[str]:
110
+ """Get user ID from request if authenticated."""
111
+ # This would extract from JWT token or session
112
+ # For now, return None to implement IP-based limiting only
113
+ return None
114
+
115
+ async def _check_rate_limits(self, client_ip: str, user_id: Optional[str]) -> None:
116
+ """
117
+ Check if client has exceeded rate limits.
118
+
119
+ Args:
120
+ client_ip: Client IP address
121
+ user_id: Optional user ID
122
+
123
+ Raises:
124
+ HTTPException: If rate limit exceeded
125
+ """
126
+ now = time.time()
127
+
128
+ # Check per-IP limits
129
+ ip_data = await self._get_rate_limit_data(f"ip:{client_ip}")
130
+ if self._is_rate_limited(ip_data, now):
131
+ retry_after = self._calculate_retry_after(ip_data, now)
132
+ logger.warning(
133
+ "IP rate limit exceeded",
134
+ client_ip=client_ip,
135
+ requests_in_minute=ip_data.get("minute_requests", 0),
136
+ retry_after=retry_after
137
+ )
138
+ raise HTTPException(
139
+ status_code=status.HTTP_429_TOO_MANY_REQUESTS,
140
+ detail={
141
+ "error": "RATE_LIMIT_EXCEEDED",
142
+ "message": f"IP rate limit exceeded. Please wait {retry_after:.1f} seconds.",
143
+ "retry_after": retry_after,
144
+ "limit_type": "ip"
145
+ }
146
+ )
147
+
148
+ # Check per-user limits if authenticated
149
+ if user_id:
150
+ user_data = await self._get_rate_limit_data(f"user:{user_id}")
151
+ if self._is_rate_limited(user_data, now):
152
+ retry_after = self._calculate_retry_after(user_data, now)
153
+ logger.warning(
154
+ "User rate limit exceeded",
155
+ user_id=user_id,
156
+ requests_in_minute=user_data.get("minute_requests", 0),
157
+ retry_after=retry_after
158
+ )
159
+ raise HTTPException(
160
+ status_code=status.HTTP_429_TOO_MANY_REQUESTS,
161
+ detail={
162
+ "error": "RATE_LIMIT_EXCEEDED",
163
+ "message": f"User rate limit exceeded. Please wait {retry_after:.1f} seconds.",
164
+ "retry_after": retry_after,
165
+ "limit_type": "user"
166
+ }
167
+ )
168
+
169
+ # Update rate limit data
170
+ await self._update_rate_limit_data(f"ip:{client_ip}", now)
171
+ if user_id:
172
+ await self._update_rate_limit_data(f"user:{user_id}", now)
173
+
174
+ async def _get_rate_limit_data(self, key: str) -> Dict[str, Any]:
175
+ """Get rate limit data for a client."""
176
+ if self.redis_client:
177
+ try:
178
+ # Get data from Redis
179
+ data = await self.redis_client.hgetall(f"rate_limit:{key}")
180
+ if data:
181
+ return {
182
+ "minute_requests": int(data.get("minute_requests", 0)),
183
+ "minute_window": float(data.get("minute_window", 0)),
184
+ "hour_requests": int(data.get("hour_requests", 0)),
185
+ "hour_window": float(data.get("hour_window", 0)),
186
+ "last_request": float(data.get("last_request", 0))
187
+ }
188
+ except Exception as e:
189
+ logger.warning("Redis rate limit read failed", error=str(e))
190
+
191
+ # Fall back to in-memory
192
+ if key.startswith("ip:"):
193
+ storage = self.ip_rate_limits
194
+ key = key[3:] # Remove "ip:" prefix
195
+ else:
196
+ storage = self.user_rate_limits
197
+ key = key[5:] # Remove "user:" prefix
198
+
199
+ return storage.get(key, {
200
+ "minute_requests": 0,
201
+ "minute_window": 0,
202
+ "hour_requests": 0,
203
+ "hour_window": 0,
204
+ "last_request": 0
205
+ })
206
+
207
+ async def _update_rate_limit_data(self, key: str, now: float) -> None:
208
+ """Update rate limit data for a client."""
209
+ # Get current data
210
+ data = await self._get_rate_limit_data(key)
211
+
212
+ # Update minute window
213
+ if now - data["minute_window"] > 60:
214
+ data["minute_requests"] = 1
215
+ data["minute_window"] = now
216
+ else:
217
+ data["minute_requests"] += 1
218
+
219
+ # Update hour window
220
+ if now - data["hour_window"] > 3600:
221
+ data["hour_requests"] = 1
222
+ data["hour_window"] = now
223
+ else:
224
+ data["hour_requests"] += 1
225
+
226
+ data["last_request"] = now
227
+
228
+ # Save updated data
229
+ if self.redis_client:
230
+ try:
231
+ # Save to Redis with TTL
232
+ await self.redis_client.hset(
233
+ f"rate_limit:{key}",
234
+ mapping={
235
+ "minute_requests": str(data["minute_requests"]),
236
+ "minute_window": str(data["minute_window"]),
237
+ "hour_requests": str(data["hour_requests"]),
238
+ "hour_window": str(data["hour_window"]),
239
+ "last_request": str(data["last_request"])
240
+ }
241
+ )
242
+ # Set TTL to 1 hour
243
+ await self.redis_client.expire(f"rate_limit:{key}", 3600)
244
+ except Exception as e:
245
+ logger.warning("Redis rate limit write failed", error=str(e))
246
+
247
+ # Fall back to in-memory
248
+ if key.startswith("ip:"):
249
+ storage = self.ip_rate_limits
250
+ key = key[3:] # Remove "ip:" prefix
251
+ else:
252
+ storage = self.user_rate_limits
253
+ key = key[5:] # Remove "user:" prefix
254
+
255
+ storage[key] = data
256
+
257
+ # Cleanup old entries (simple cleanup every 100 requests)
258
+ if data["minute_requests"] % 100 == 0:
259
+ await self._cleanup_old_entries(now)
260
+
261
+ async def _cleanup_old_entries(self, now: float) -> None:
262
+ """Clean up old rate limit entries."""
263
+ cutoff = now - 3600 # 1 hour ago
264
+
265
+ # Cleanup IP entries
266
+ to_remove = []
267
+ for ip, data in self.ip_rate_limits.items():
268
+ if data["last_request"] < cutoff:
269
+ to_remove.append(ip)
270
+ for ip in to_remove:
271
+ del self.ip_rate_limits[ip]
272
+
273
+ # Cleanup user entries
274
+ to_remove = []
275
+ for user, data in self.user_rate_limits.items():
276
+ if data["last_request"] < cutoff:
277
+ to_remove.append(user)
278
+ for user in to_remove:
279
+ del self.user_rate_limits[user]
280
+
281
+ if to_remove:
282
+ logger.debug("Cleaned up old rate limit entries", count=len(to_remove))
283
+
284
+ def _is_rate_limited(self, data: Dict[str, Any], now: float) -> bool:
285
+ """Check if client has exceeded rate limits."""
286
+ # Check minute limit
287
+ if now - data["minute_window"] < 60:
288
+ if data["minute_requests"] >= self.requests_per_minute:
289
+ return True
290
+
291
+ # Check hour limit
292
+ if now - data["hour_window"] < 3600:
293
+ if data["hour_requests"] >= self.requests_per_hour:
294
+ return True
295
+
296
+ return False
297
+
298
+ def _calculate_retry_after(self, data: Dict[str, Any], now: float) -> float:
299
+ """Calculate retry-after time based on rate limit data."""
300
+ # Check minute limit
301
+ if now - data["minute_window"] < 60 and data["minute_requests"] >= self.requests_per_minute:
302
+ return 60 - (now - data["minute_window"])
303
+
304
+ # Check hour limit
305
+ if now - data["hour_window"] < 3600 and data["hour_requests"] >= self.requests_per_hour:
306
+ return 3600 - (now - data["hour_window"])
307
+
308
+ return 60.0 # Default retry after
309
+
310
+ async def _add_rate_limit_headers(
311
+ self,
312
+ response,
313
+ client_ip: str,
314
+ user_id: Optional[str]
315
+ ) -> None:
316
+ """Add rate limit headers to response."""
317
+ now = time.time()
318
+
319
+ # Get current limits
320
+ ip_data_result = await self._get_rate_limit_data(f"ip:{client_ip}")
322
+
323
+ # Add headers
324
+ response.headers["X-RateLimit-Limit-Minute"] = str(self.requests_per_minute)
325
+ response.headers["X-RateLimit-Limit-Hour"] = str(self.requests_per_hour)
326
+ response.headers["X-RateLimit-Remaining-Minute"] = str(
327
+ max(0, self.requests_per_minute - ip_data_result.get("minute_requests", 0))
328
+ )
329
+ response.headers["X-RateLimit-Remaining-Hour"] = str(
330
+ max(0, self.requests_per_hour - ip_data_result.get("hour_requests", 0))
331
+ )
332
+
333
+ # Add reset time
334
+ if ip_data_result.get("minute_window", 0):
335
+ reset_time = ip_data_result["minute_window"] + 60
336
+ response.headers["X-RateLimit-Reset"] = str(int(reset_time))
337
+
338
+
339
+ class TranslationRateLimitMiddleware(RateLimitMiddleware):
340
+ """
341
+ Specialized rate limit middleware for translation endpoints.
342
+
343
+ Implements stricter limits for translation endpoints to manage
344
+ Gemini API quotas effectively.
345
+ """
346
+
347
+ def __init__(
348
+ self,
349
+ app,
350
+ *,
351
+ redis_client=None
352
+ ):
353
+ """
354
+ Initialize translation rate limit middleware.
355
+
356
+ Args:
357
+ app: FastAPI application
358
+ redis_client: Optional Redis client
359
+ """
360
+ # Stricter limits for translation endpoints
361
+ super().__init__(
362
+ app,
363
+ requests_per_minute=10, # 10 translations per minute
364
+ requests_per_hour=500, # 500 translations per hour
365
+ redis_client=redis_client
366
+ )
367
+
368
+ logger.info(
369
+ "Translation rate limit middleware initialized",
370
+ requests_per_minute=10,
371
+ requests_per_hour=500
372
+ )
373
+
374
+ async def dispatch(self, request: Request, call_next):
375
+ """
376
+ Process request with translation-specific rate limiting.
377
+
378
+ Only applies to translation endpoints.
379
+ """
380
+ # Check if this is a translation endpoint
381
+ if not request.url.path.startswith("/translation/"):
382
+ return await call_next(request)
383
+
384
+ # Apply rate limiting
385
+ return await super().dispatch(request, call_next)
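
A wiring sketch for these classes, assuming a FastAPI app and that the optional Redis client, when used, comes from redis.asyncio (both middlewares fall back to in-memory counters without it). Note the counters are fixed windows: they reset 60 s (or 3600 s) after the first request of each window.

from fastapi import FastAPI
from src.middleware.rate_limit import RateLimitMiddleware, TranslationRateLimitMiddleware

app = FastAPI()

# General API limits for every route except the health checks skipped above
app.add_middleware(RateLimitMiddleware, requests_per_minute=60, requests_per_hour=1000)

# Stricter limits that only fire for paths under /translation/
app.add_middleware(TranslationRateLimitMiddleware)
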
src/models/__init__.py ADDED
@@ -0,0 +1,29 @@
1
+ """
2
+ Import all models to ensure they are registered with SQLAlchemy.
3
+ """
4
+
5
+ # Import all models to register them with SQLAlchemy
6
+ from .auth import (
7
+ User, Account, UserBackground, OnboardingResponse, Session,
8
+ PasswordResetToken, AnonymousSession, ChatSession, ChatMessage,
9
+ UserPreferences, MessageVersion, ChatFolder, ChatTag, MessageReaction
10
+ )
11
+
12
+ from .translation_openai import (
13
+ TranslationJob, TranslationChunk, TranslationError,
14
+ TranslationSession, TranslationCache, TranslationMetrics,
15
+ TranslationJobStatus, ChunkStatus, ErrorSeverity
16
+ )
17
+
18
+ # Export all models
19
+ __all__ = [
20
+ # Auth models
21
+ "User", "Account", "UserBackground", "OnboardingResponse", "Session",
22
+ "PasswordResetToken", "AnonymousSession", "ChatSession", "ChatMessage",
23
+ "UserPreferences", "MessageVersion", "ChatFolder", "ChatTag", "MessageReaction",
24
+
25
+ # Translation models
26
+ "TranslationJob", "TranslationChunk", "TranslationError",
27
+ "TranslationSession", "TranslationCache", "TranslationMetrics",
28
+ "TranslationJobStatus", "ChunkStatus", "ErrorSeverity"
29
+ ]
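
Because SQLAlchemy registers tables as a side effect of importing the model classes, importing this package once is enough to populate Base.metadata. A sketch, assuming src.database.base also exports an engine:

import src.models  # noqa: F401 - imported for its registration side effect
from src.database.base import Base, engine  # "engine" is an assumed export alongside Base

Base.metadata.create_all(bind=engine)  # creates auth + translation tables together
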
src/models/auth.py CHANGED
@@ -42,6 +42,9 @@ class User(Base):
 chat_sessions = relationship("ChatSession", back_populates="user", cascade="all, delete-orphan")
 folders = relationship("ChatFolder", back_populates="user", cascade="all, delete-orphan")
 tags = relationship("ChatTag", back_populates="user", cascade="all, delete-orphan")
+ translation_jobs = relationship("TranslationJob", back_populates="user", cascade="all, delete-orphan")
+ translation_sessions = relationship("TranslationSession", back_populates="user", cascade="all, delete-orphan")
+ translation_metrics = relationship("TranslationMetrics", back_populates="user", cascade="all, delete-orphan")
 
 
 class Account(Base):
src/models/base.py ADDED
@@ -0,0 +1,26 @@
1
+ """
2
+ Base model for reader features.
3
+ """
4
+
5
+ from datetime import datetime
6
+ import uuid
7
+ from sqlalchemy import Column, String, DateTime
8
+ from sqlalchemy.sql import func
9
+ from src.database.base import Base
10
+
11
+
12
+ class BaseModel(Base):
13
+ """Base model with common fields for reader features."""
14
+
15
+ __abstract__ = True
16
+
17
+ id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()))
18
+ created_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
19
+ updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now(), nullable=False)
20
+
21
+ def to_dict(self):
22
+ """Convert model to dictionary."""
23
+ return {
24
+ column.name: getattr(self, column.name)
25
+ for column in self.__table__.columns
26
+ }
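
Reader-feature models get the UUID primary key and both timestamps for free by subclassing this base, and to_dict() then serializes every mapped column. A sketch with a hypothetical Highlight model:

from sqlalchemy import Column, String
from src.models.base import BaseModel

class Highlight(BaseModel):
    """Hypothetical model; only its own columns need declaring."""
    __tablename__ = "highlights"
    user_id = Column(String(36), nullable=False, index=True)
    text = Column(String(1000), nullable=False)

# highlight.to_dict() -> {"id": ..., "created_at": ..., "updated_at": ..., "user_id": ..., "text": ...}
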
src/models/bookmark.py ADDED
@@ -0,0 +1,53 @@
1
+ """
2
+ Bookmark model for user-saved page references with optional metadata.
3
+ """
4
+
5
+ from sqlalchemy import Column, String, Boolean, DateTime, Text, ForeignKey
6
+ from sqlalchemy.orm import relationship
7
+ from sqlalchemy.sql import func
8
+ from src.models.base import BaseModel
9
+
10
+
11
+ class Bookmark(BaseModel):
12
+ """Represents user-saved page references with optional metadata."""
13
+
14
+ __tablename__ = "bookmarks"
15
+
16
+ user_id = Column(String(36), ForeignKey("users.id"), nullable=False, index=True)
17
+ chapter_id = Column(String(255), nullable=False, index=True)
18
+ section_id = Column(String(255), nullable=True)
19
+ page_url = Column(String(2048), nullable=False)
20
+ page_title = Column(String(255), nullable=False)
21
+ snippet = Column(Text, nullable=True)
22
+ note = Column(String(1000), nullable=True)
23
+ is_private = Column(Boolean, nullable=False, default=True)
24
+
25
+ # Relationships
26
+ user = relationship("User", back_populates="bookmarks")
27
+ tags = relationship("BookmarkTag", back_populates="bookmark", cascade="all, delete-orphan")
28
+
29
+ __table_args__ = (
30
+ {"extend_existing": True},
31
+ )
32
+
33
+ def __repr__(self):
34
+ return f"<Bookmark(id='{self.id}', user_id='{self.user_id}', title='{self.page_title}')>"
35
+
36
+
37
+ class BookmarkTag(BaseModel):
38
+ """Tags for organizing bookmarks."""
39
+
40
+ __tablename__ = "bookmark_tags"
41
+
42
+ bookmark_id = Column(String(36), ForeignKey("bookmarks.id"), nullable=False, index=True)
43
+ tag = Column(String(50), nullable=False, index=True)
44
+
45
+ # Relationships
46
+ bookmark = relationship("Bookmark", back_populates="tags")
47
+
48
+ __table_args__ = (
49
+ {"extend_existing": True},
50
+ )
51
+
52
+ def __repr__(self):
53
+ return f"<BookmarkTag(bookmark_id='{self.bookmark_id}', tag='{self.tag}')>"
src/models/chat.py CHANGED
@@ -30,7 +30,7 @@ class ChatMessage(Base):
 chat_session_id = Column(String(36), ForeignKey("chat_sessions.id"), nullable=False)
 role = Column(SQLEnum(Role), nullable=False)
 content = Column(Text, nullable=False)
- metadata = Column(JSON, nullable=True)
+ message_metadata = Column(JSON, nullable=True)
 created_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
 
 # Relationships
src/models/content_localization.py ADDED
@@ -0,0 +1,50 @@
1
+ """
2
+ Content localization model for tracking translation status of content pages.
3
+ """
4
+
5
+ from datetime import datetime
6
+ from enum import Enum
7
+ from sqlalchemy import Column, String, Integer, DateTime, Boolean, JSON, Index
8
+ from sqlalchemy import Enum as SQLEnum
9
+ from src.database.base import Base
10
+
11
+
12
+ class ProcessingStatus(Enum):
13
+ """Processing status for content localization."""
14
+ PENDING = "pending"
15
+ PROCESSING = "processing"
16
+ COMPLETED = "completed"
17
+ FAILED = "failed"
18
+ PARTIAL = "partial" # Some chunks failed
19
+
20
+
21
+ class ContentLocalization(Base):
22
+ """Tracks the translation status and metadata for content pages."""
23
+
24
+ __tablename__ = "content_localization"
25
+
26
+ id = Column(Integer, primary_key=True)
27
+ content_url = Column(String(500), nullable=False, index=True)
28
+ content_hash = Column(String(64), nullable=False, index=True)
29
+
30
+ # Localization status
31
+ is_translated = Column(Boolean, default=False)
32
+ last_translation_date = Column(DateTime)
33
+ translation_cache_key = Column(String(64))
34
+
35
+ # Content metadata
36
+ word_count = Column(Integer)
37
+ character_count = Column(Integer)
38
+ has_code_blocks = Column(Boolean, default=False)
39
+ detected_languages = Column(JSON) # Array of detected languages
40
+
41
+ # Processing metadata
42
+ chunk_count = Column(Integer, default=1)
43
+ processing_status = Column(SQLEnum(ProcessingStatus), default=ProcessingStatus.PENDING)
44
+
45
+ # Metadata
46
+ created_at = Column(DateTime, default=datetime.utcnow)
47
+ updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
48
+
49
+ def __repr__(self):
50
+ return f"<ContentLocalization(url='{self.content_url}', status='{self.processing_status}', translated={self.is_translated})>"
src/models/personalization.py ADDED
@@ -0,0 +1,64 @@
1
+ """
2
+ PersonalizationProfile model for managing user preferences and learning styles.
3
+ """
4
+
5
+ from datetime import datetime
6
+ from enum import Enum
7
+ from sqlalchemy import Column, Integer, String, DateTime, Boolean, JSON
8
+ from sqlalchemy import Enum as SQLEnum
9
+ from src.database.base import Base
10
+
11
+
12
+ class ReadingLevel(Enum):
13
+ """Reading proficiency levels."""
14
+ BEGINNER = "beginner"
15
+ INTERMEDIATE = "intermediate"
16
+ ADVANCED = "advanced"
17
+
18
+
19
+ class LearningStyle(Enum):
20
+ """Learning style preferences."""
21
+ VISUAL = "visual" # More examples, diagrams
22
+ PRACTICAL = "practical" # Focus on code, implementation
23
+ THEORETICAL = "theoretical" # Focus on concepts, theory
24
+ BALANCED = "balanced"
25
+
26
+
27
+ class TermHandling(Enum):
28
+ """Technical term handling preferences."""
29
+ TRANSLATE = "translate" # Translate technical terms
30
+ TRANSLITERATE = "transliterate" # Keep in Urdu script
31
+ KEEP_ENGLISH = "keep_english" # Leave in English
32
+
33
+
34
+ class PersonalizationProfile(Base):
35
+ """Represents user preferences for personalized content delivery."""
36
+
37
+ __tablename__ = "personalization_profiles"
38
+
39
+ id = Column(Integer, primary_key=True)
40
+ user_id = Column(String(36), unique=True, nullable=False, index=True)
41
+
42
+ # Reading preferences
43
+ reading_level = Column(SQLEnum(ReadingLevel), default=ReadingLevel.INTERMEDIATE)
44
+ preferred_language = Column(String(10), default='en')
45
+
46
+ # Content preferences
47
+ focus_areas = Column(JSON) # Array of topics user cares about
48
+ learning_style = Column(SQLEnum(LearningStyle), default=LearningStyle.BALANCED)
49
+
50
+ # Translation preferences
51
+ enable_transliteration = Column(Boolean, default=True)
52
+ technical_term_handling = Column(SQLEnum(TermHandling), default=TermHandling.TRANSLITERATE)
53
+
54
+ # UI preferences
55
+ font_size = Column(Integer, default=16)
56
+ focus_mode_preferences = Column(JSON)
57
+
58
+ # Metadata
59
+ created_at = Column(DateTime, default=datetime.utcnow)
60
+ updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
61
+ last_active = Column(DateTime, default=datetime.utcnow)
62
+
63
+ def __repr__(self):
64
+ return f"<PersonalizationProfile(user_id='{self.user_id}', reading_level='{self.reading_level}')>"
src/models/reading_progress.py ADDED
@@ -0,0 +1,33 @@
1
+ """
2
+ Reading progress model for tracking user progress through chapters and sections.
3
+ """
4
+
5
+ from sqlalchemy import Column, String, Float, Boolean, Integer, DateTime, ForeignKey, UniqueConstraint
6
+ from sqlalchemy.orm import relationship
7
+ from sqlalchemy.sql import func
8
+ from src.models.base import BaseModel
9
+
10
+
11
+ class ReadingProgress(BaseModel):
12
+ """Stores user's reading progress through chapters and sections."""
13
+
14
+ __tablename__ = "reading_progress"
15
+
16
+ user_id = Column(String(36), ForeignKey("users.id"), nullable=False, index=True)
17
+ chapter_id = Column(String(255), nullable=False, index=True)
18
+ section_id = Column(String(255), nullable=False)
19
+ position = Column(Float, nullable=False, default=0.0) # 0-100 percentage
20
+ completed = Column(Boolean, nullable=False, default=False)
21
+ time_spent = Column(Integer, nullable=False, default=0) # Minutes
22
+ last_accessed = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
23
+
24
+ # Relationships
25
+ user = relationship("User", back_populates="reading_progress")
26
+
27
+ # Unique constraint to ensure one progress record per user per section
28
+ __table_args__ = (
+ UniqueConstraint("user_id", "chapter_id", "section_id", name="uq_reading_progress_user_section"),
+ {"extend_existing": True},
30
+ )
31
+
32
+ def __repr__(self):
33
+ return f"<ReadingProgress(user_id='{self.user_id}', chapter='{self.chapter_id}', position={self.position}%)>"
src/models/search_index.py ADDED
@@ -0,0 +1,30 @@
1
+ """
2
+ Search index model for enabling fast content retrieval across languages.
3
+ """
4
+
5
+ from sqlalchemy import Column, String, Float, DateTime, Text
6
+ from sqlalchemy.sql import func
7
+ from src.models.base import BaseModel
8
+
9
+
10
+ class SearchIndex(BaseModel):
11
+ """Enables fast content retrieval across languages."""
12
+
13
+ __tablename__ = "search_index"
14
+
15
+ content_id = Column(String(255), nullable=False, index=True)
16
+ language = Column(String(10), nullable=False, index=True) # en, ur, ur-roman
17
+ content_type = Column(String(20), nullable=False, index=True) # chapter, section, bookmark
18
+ title = Column(String(255), nullable=False)
19
+ content = Column(Text, nullable=False)
20
+ chapter_id = Column(String(255), nullable=False, index=True)
21
+ section_id = Column(String(255), nullable=True)
22
+ rank = Column(Float, nullable=False, default=0.5) # 0-1 for result ranking
23
+ indexed_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
24
+
25
+ __table_args__ = (
26
+ {"extend_existing": True},
27
+ )
28
+
29
+ def __repr__(self):
30
+ return f"<SearchIndex(content_id='{self.content_id}', language='{self.language}', type='{self.content_type}')>"
src/models/translation_openai.py ADDED
@@ -0,0 +1,512 @@
1
+ """
2
+ Comprehensive OpenAI Translation System Models.
3
+
4
+ Provides database models for:
5
+ - Translation jobs with progress tracking
6
+ - Chunk-based translation processing
7
+ - Enhanced caching with page URL + content hash
8
+ - Error logging and retry tracking
9
+ - User session management
10
+ - Translation quality metrics
11
+ """
12
+
13
+ from datetime import datetime, timedelta
14
+ from typing import Optional, Dict, Any, List
15
+ from enum import Enum
16
+ import uuid
17
+
18
+ from sqlalchemy import (
19
+ Column, Integer, String, Text, DateTime, SmallInteger, ForeignKey,
20
+ Index, Boolean, Numeric, JSON, BigInteger, CheckConstraint, UniqueConstraint
21
+ )
22
+ from sqlalchemy.orm import relationship
23
+ from sqlalchemy.dialects.postgresql import UUID
24
+ from sqlalchemy.sql import func
25
+
26
+ from src.database.base import Base
27
+
28
+
29
+ class TranslationJobStatus(Enum):
30
+ """Translation job status values."""
31
+ PENDING = "pending"
32
+ QUEUED = "queued"
33
+ PROCESSING = "processing"
34
+ CHUNK_PROCESSING = "chunk_processing"
35
+ COMPLETED = "completed"
36
+ FAILED = "failed"
37
+ CANCELLED = "cancelled"
38
+ RETRYING = "retrying"
39
+ TIMEOUT = "timeout"
40
+
41
+
42
+ class ChunkStatus(Enum):
43
+ """Translation chunk status values."""
44
+ PENDING = "pending"
45
+ PROCESSING = "processing"
46
+ COMPLETED = "completed"
47
+ FAILED = "failed"
48
+ RETRY = "retry"
49
+ SKIPPED = "skipped" # For code blocks
50
+
51
+
52
+ class ErrorSeverity(Enum):
53
+ """Error severity levels."""
54
+ LOW = "low"
55
+ MEDIUM = "medium"
56
+ HIGH = "high"
57
+     CRITICAL = "critical"
+
+
+ class TranslationJob(Base):
+     """
+     Represents a translation job with comprehensive tracking.
+
+     Supports:
+     - Large text translation with chunking
+     - Progress tracking
+     - Error handling and retries
+     - Performance metrics
+     - Cost tracking
+     """
+
+     __tablename__ = "translation_jobs"
+
+     # Primary key and identifiers
+     id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+     job_id = Column(String(64), unique=True, nullable=False, index=True)  # External job ID
+     user_id = Column(String(36), ForeignKey("users.id"), nullable=True, index=True)
+     session_id = Column(String(128), nullable=True, index=True)
+
+     # Content identifiers for caching
+     content_hash = Column(String(64), nullable=False, index=True)
+     page_url = Column(Text, nullable=True, index=True)  # Source page URL for caching
+
+     # Translation parameters
+     source_language = Column(String(10), nullable=False, index=True)
+     target_language = Column(String(10), nullable=False, index=True)
+
+     # Content information
+     original_text = Column(Text, nullable=False)
+     translated_text = Column(Text, nullable=True)
+
+     # Processing options
+     preserve_code_blocks = Column(Boolean, default=True, nullable=False)
+     enable_transliteration = Column(Boolean, default=True, nullable=False)
+     chunk_size = Column(Integer, default=2000, nullable=False)  # Characters per chunk
+     max_chunks = Column(Integer, default=100, nullable=False)
+
+     # OpenAI specific settings
+     model_name = Column(String(50), nullable=False, default="gpt-4-turbo-preview")
+     temperature = Column(Numeric(3, 2), default=0.3, nullable=False)
+     max_tokens = Column(Integer, default=2048, nullable=False)
+
+     # Status and progress
+     status = Column(String(20), default=TranslationJobStatus.PENDING.value, nullable=False, index=True)
+     progress_percentage = Column(Numeric(5, 2), default=0.0, nullable=False)
+     chunks_total = Column(Integer, default=0, nullable=False)
+     chunks_completed = Column(Integer, default=0, nullable=False)
+     chunks_failed = Column(Integer, default=0, nullable=False)
+
+     # Retry settings
+     retry_count = Column(Integer, default=0, nullable=False)
+     max_retries = Column(Integer, default=3, nullable=False)
+
+     # Performance metrics
+     started_at = Column(DateTime(timezone=True), nullable=True)
+     completed_at = Column(DateTime(timezone=True), nullable=True)
+     processing_time_ms = Column(BigInteger, default=0, nullable=False)
+
+     # Cost tracking
+     input_tokens = Column(BigInteger, default=0, nullable=False)
+     output_tokens = Column(BigInteger, default=0, nullable=False)
+     estimated_cost_usd = Column(Numeric(10, 6), default=0.000000, nullable=False)
+     actual_cost_usd = Column(Numeric(10, 6), nullable=True)
+
+     # Quality metrics
+     quality_score = Column(Numeric(5, 2), nullable=True)  # 1-5 score
+     confidence_score = Column(Numeric(5, 2), nullable=True)  # 1-5 score
+
+     # Metadata
+     created_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
+     updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now(), nullable=False)
+     last_activity_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
+     user_agent = Column(Text, nullable=True)
+     ip_address = Column(String(45), nullable=True)  # Supports IPv6
+
+     # Relationships
+     user = relationship("User", back_populates="translation_jobs")
+     chunks = relationship("TranslationChunk", back_populates="job", cascade="all, delete-orphan")
+     errors = relationship("TranslationError", back_populates="job", cascade="all, delete-orphan")
+     metrics = relationship("TranslationMetrics", back_populates="job", cascade="all, delete-orphan")
+     cache_entries = relationship("TranslationCache", back_populates="job", cascade="all, delete-orphan")
+
+     # Constraints and indexes
+     __table_args__ = (
+         Index('idx_job_status_created', 'status', 'created_at'),
+         Index('idx_user_status', 'user_id', 'status'),
+         Index('idx_content_lookup', 'content_hash', 'source_language', 'target_language'),
+         Index('idx_page_cache', 'page_url', 'content_hash'),
+         Index('idx_activity', 'last_activity_at'),
+         Index('idx_progress', 'status', 'progress_percentage'),
+         CheckConstraint('progress_percentage >= 0 AND progress_percentage <= 100', name='check_progress_range'),
+         CheckConstraint('temperature >= 0 AND temperature <= 2', name='check_temperature_range'),
+         CheckConstraint('chunk_size > 0 AND chunk_size <= 10000', name='check_chunk_size'),
+     )
+
+     def __repr__(self):
+         return f"<TranslationJob(id={self.id}, status={self.status}, progress={self.progress_percentage}%)>"
+
+
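The progress and cost columns are derived values; a minimal sketch of how a worker might keep them in sync after each chunk finishes (the helper and the per-1K-token prices below are illustrative assumptions, not part of this commit):

from decimal import Decimal

# Assumed prices per 1K tokens, for illustration only; real pricing varies by model.
PRICE_PER_1K_INPUT = Decimal("0.01")
PRICE_PER_1K_OUTPUT = Decimal("0.03")

def refresh_job_progress(job: "TranslationJob") -> None:
    """Recompute the derived TranslationJob fields from its raw counters."""
    processed = job.chunks_completed + job.chunks_failed
    job.progress_percentage = Decimal(processed * 100) / Decimal(max(job.chunks_total, 1))
    job.estimated_cost_usd = (
        Decimal(job.input_tokens) / 1000 * PRICE_PER_1K_INPUT
        + Decimal(job.output_tokens) / 1000 * PRICE_PER_1K_OUTPUT
    )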
+ class TranslationChunk(Base):
+     """
+     Represents a chunk of text being translated.
+
+     Supports:
+     - Individual chunk status tracking
+     - Retry mechanism
+     - Performance metrics per chunk
+     - Code block detection
+     """
+
+     __tablename__ = "translation_chunks"
+
+     # Primary key and identifiers
+     id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+     job_id = Column(UUID(as_uuid=True), ForeignKey("translation_jobs.id"), nullable=False, index=True)
+     chunk_index = Column(Integer, nullable=False)
+
+     # Content
+     original_text = Column(Text, nullable=False)
+     translated_text = Column(Text, nullable=True)
+
+     # Position in original text
+     start_position = Column(Integer, nullable=False)
+     end_position = Column(Integer, nullable=False)
+
+     # Chunk properties
+     is_code_block = Column(Boolean, default=False, nullable=False)
+     code_language = Column(String(50), nullable=True)
+     word_count = Column(Integer, nullable=False)
+
+     # Status and processing
+     status = Column(String(20), default=ChunkStatus.PENDING.value, nullable=False, index=True)
+     retry_count = Column(Integer, default=0, nullable=False)
+
+     # Processing metrics
+     started_at = Column(DateTime(timezone=True), nullable=True)
+     completed_at = Column(DateTime(timezone=True), nullable=True)
+     processing_time_ms = Column(BigInteger, default=0, nullable=False)
+
+     # Token usage
+     input_tokens = Column(Integer, default=0, nullable=False)
+     output_tokens = Column(Integer, default=0, nullable=False)
+
+     # Quality indicators
+     confidence_score = Column(Numeric(5, 2), nullable=True)
+     requires_review = Column(Boolean, default=False, nullable=False)
+
+     # Error information
+     last_error = Column(Text, nullable=True)
+     error_code = Column(String(50), nullable=True)
+
+     # Metadata
+     created_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
+     updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now(), nullable=False)
+
+     # Relationships
+     job = relationship("TranslationJob", back_populates="chunks")
+
+     # Constraints and indexes
+     __table_args__ = (
+         Index('idx_job_chunk', 'job_id', 'chunk_index', unique=True),
+         Index('idx_chunk_status', 'status', 'created_at'),
+         Index('idx_code_blocks', 'is_code_block', 'code_language'),
+         CheckConstraint('chunk_index >= 0', name='check_chunk_index'),
+         CheckConstraint('start_position >= 0 AND end_position >= start_position', name='check_positions'),
+         CheckConstraint('word_count >= 0', name='check_word_count'),
+     )
+
+     def __repr__(self):
+         return f"<TranslationChunk(job_id={self.job_id}, index={self.chunk_index}, status={self.status})>"
+
+
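The position columns let chunks be stitched back together in order. A minimal sketch of producing chunks that satisfy the check_positions constraint, assuming a plain fixed-size split (the real service may split more carefully around sentences and code blocks):

def split_into_chunks(text: str, chunk_size: int = 2000):
    """Yield (index, start, end, piece) tuples with end >= start, as the constraint requires."""
    for index, start in enumerate(range(0, len(text), chunk_size)):
        end = min(start + chunk_size, len(text))
        yield index, start, end, text[start:end]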
+ class TranslationError(Base):
+     """
+     Tracks errors during translation processing.
+
+     Supports:
+     - Detailed error logging
+     - Error categorization
+     - Retry tracking
+     - Error analytics
+     """
+
+     __tablename__ = "translation_errors"
+
+     # Primary key and identifiers
+     id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+     job_id = Column(UUID(as_uuid=True), ForeignKey("translation_jobs.id"), nullable=False, index=True)
+     chunk_id = Column(UUID(as_uuid=True), ForeignKey("translation_chunks.id"), nullable=True, index=True)
+     error_id = Column(String(64), unique=True, nullable=False, index=True)  # Unique error identifier
+
+     # Error details
+     error_type = Column(String(50), nullable=False, index=True)  # e.g., "api_error", "timeout", "rate_limit"
+     error_code = Column(String(50), nullable=True)  # API error code
+     error_message = Column(Text, nullable=False)
+     error_details = Column(JSON, nullable=True)  # Additional error context
+
+     # Severity and categorization
+     severity = Column(String(20), default=ErrorSeverity.MEDIUM.value, nullable=False, index=True)
+     category = Column(String(50), nullable=False, index=True)  # e.g., "network", "parsing", "validation"
+
+     # Retry information
+     is_retriable = Column(Boolean, default=True, nullable=False)
+     retry_attempt = Column(Integer, default=1, nullable=False)
+     max_retries = Column(Integer, default=3, nullable=False)
+     next_retry_at = Column(DateTime(timezone=True), nullable=True, index=True)
+
+     # Context information
+     request_payload = Column(JSON, nullable=True)  # Sanitized request data
+     response_payload = Column(JSON, nullable=True)  # Sanitized response data
+
+     # Stack trace and debugging
+     stack_trace = Column(Text, nullable=True)
+     debug_info = Column(JSON, nullable=True)
+
+     # Resolution
+     resolved_at = Column(DateTime(timezone=True), nullable=True)
+     resolution = Column(String(200), nullable=True)  # How the error was resolved
+
+     # Metadata
+     created_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
+     updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now(), nullable=False)
+
+     # Relationships
+     job = relationship("TranslationJob", back_populates="errors")
+     chunk = relationship("TranslationChunk")
+
+     # Constraints and indexes
+     __table_args__ = (
+         Index('idx_error_type_created', 'error_type', 'created_at'),
+         Index('idx_error_severity', 'severity', 'created_at'),
+         Index('idx_job_errors', 'job_id', 'created_at'),
+         Index('idx_retry_schedule', 'next_retry_at', 'is_retriable'),
+     )
+
+     def __repr__(self):
+         return f"<TranslationError(id={self.id}, type={self.error_type}, severity={self.severity})>"
+
+
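next_retry_at only stores a schedule; one plausible way to fill it is exponential backoff with jitter. A hedged sketch (the base delay is illustrative, not a value taken from this commit):

import random
from datetime import datetime, timedelta, timezone

def schedule_retry(error: "TranslationError", base_seconds: int = 30) -> None:
    """Set next_retry_at with exponential backoff, or mark the error as exhausted."""
    if not error.is_retriable or error.retry_attempt >= error.max_retries:
        error.is_retriable = False
        return
    delay = base_seconds * 2 ** (error.retry_attempt - 1)
    delay += random.uniform(0, delay * 0.1)  # jitter to avoid retry stampedes
    error.next_retry_at = datetime.now(timezone.utc) + timedelta(seconds=delay)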
+ class TranslationSession(Base):
+     """
+     Manages user translation sessions.
+
+     Supports:
+     - Session-based tracking
+     - Rate limiting
+     - User preferences
+     - Analytics
+     """
+
+     __tablename__ = "translation_sessions"
+
+     # Primary key and identifiers
+     id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+     session_id = Column(String(128), unique=True, nullable=False, index=True)
+     user_id = Column(String(36), ForeignKey("users.id"), nullable=True, index=True)
+
+     # Session information
+     started_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
+     last_activity_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
+     expires_at = Column(DateTime(timezone=True), nullable=False, index=True)
+     is_active = Column(Boolean, default=True, nullable=False, index=True)
+
+     # Usage tracking
+     request_count = Column(Integer, default=0, nullable=False)
+     character_count = Column(Integer, default=0, nullable=False)
+     total_cost_usd = Column(Numeric(10, 6), default=0.000000, nullable=False)
+
+     # Rate limiting
+     requests_per_minute = Column(Integer, default=60, nullable=False)
+     characters_per_hour = Column(Integer, default=100000, nullable=False)
+
+     # Session context
+     source_language = Column(String(10), nullable=True)
+     target_language = Column(String(10), nullable=True)
+     preferred_model = Column(String(50), nullable=True)
+
+     # Client information
+     user_agent = Column(Text, nullable=True)
+     ip_address = Column(String(45), nullable=True, index=True)
+     country_code = Column(String(2), nullable=True)
+
+     # Session preferences (stored as JSON)
+     preferences = Column(JSON, nullable=True)
+
+     # Relationships
+     user = relationship("User", back_populates="translation_sessions")
+
+     # Constraints and indexes
+     __table_args__ = (
+         Index('idx_user_sessions', 'user_id', 'is_active'),
+         Index('idx_session_expiry', 'expires_at', 'is_active'),
+         Index('idx_ip_sessions', 'ip_address', 'started_at'),
+         CheckConstraint('request_count >= 0', name='check_request_count'),
+         CheckConstraint('character_count >= 0', name='check_character_count'),
+         CheckConstraint('requests_per_minute > 0', name='check_rate_limit_requests'),
+         CheckConstraint('characters_per_hour > 0', name='check_rate_limit_chars'),
+     )
+
+     def __repr__(self):
+         return f"<TranslationSession(id={self.session_id}, active={self.is_active}, requests={self.request_count})>"
+
+
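The requests_per_minute / characters_per_hour columns store limits; enforcement happens elsewhere. A sketch of the check a middleware might run before accepting a request, where the recent_* counters would come from a separate store such as Redis (assumed, not shown in this commit):

def within_limits(session: "TranslationSession", recent_requests: int, recent_characters: int) -> bool:
    """True if one more request stays inside the session's stored limits."""
    return (
        session.is_active
        and recent_requests < session.requests_per_minute
        and recent_characters < session.characters_per_hour
    )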
+ class TranslationCache(Base):
+     """
+     Enhanced translation caching with page URL support.
+
+     Supports:
+     - Page URL + content hash keys
+     - Hierarchical caching
+     - Cache invalidation
+     - Cache analytics
+     """
+
+     __tablename__ = "translation_cache"
+
+     # Primary key and identifiers
+     id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+     cache_key = Column(String(128), unique=True, nullable=False, index=True)
+     job_id = Column(UUID(as_uuid=True), ForeignKey("translation_jobs.id"), nullable=True, index=True)
+
+     # Cache keys
+     content_hash = Column(String(64), nullable=False, index=True)
+     page_url = Column(Text, nullable=True, index=True)
+     url_hash = Column(String(64), nullable=True, index=True)  # Hash of URL for privacy
+
+     # Translation data
+     source_language = Column(String(10), nullable=False, index=True)
+     target_language = Column(String(10), nullable=False, index=True)
+     original_text = Column(Text, nullable=False)
+     translated_text = Column(Text, nullable=False)
+
+     # Cache metadata
+     hit_count = Column(Integer, default=0, nullable=False)
+     last_hit_at = Column(DateTime(timezone=True), nullable=True)
+     created_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
+     expires_at = Column(DateTime(timezone=True), nullable=False, index=True)
+
+     # Quality and performance
+     quality_score = Column(Numeric(5, 2), nullable=True)
+     processing_time_ms = Column(BigInteger, nullable=False)
+     model_version = Column(String(50), nullable=False)
+
+     # Cache configuration
+     ttl_hours = Column(Integer, default=168, nullable=False)  # 7 days default
+     is_pinned = Column(Boolean, default=False, nullable=False)  # Never expires if pinned
+     priority = Column(Integer, default=0, nullable=False)  # Higher priority is less likely to be evicted
+
+     # Validation
+     is_validated = Column(Boolean, default=False, nullable=False)
+     validated_by = Column(String(50), nullable=True)  # user_id or "system"
+
+     # Relationships
+     job = relationship("TranslationJob", back_populates="cache_entries")
+
+     # Constraints and indexes
+     __table_args__ = (
+         Index('idx_cache_lookup', 'content_hash', 'source_language', 'target_language'),
+         # Renamed from 'idx_page_cache', which already exists on translation_jobs;
+         # index names must be unique within a schema.
+         Index('idx_cache_page', 'url_hash', 'content_hash'),
+         Index('idx_cache_expires', 'expires_at', 'priority'),
+         Index('idx_cache_popularity', 'hit_count', 'last_hit_at'),
+         CheckConstraint('hit_count >= 0', name='check_hit_count'),
+         CheckConstraint('processing_time_ms >= 0', name='check_processing_time'),
+         CheckConstraint('ttl_hours > 0', name='check_ttl_hours'),
+     )
+
+     def __repr__(self):
+         return f"<TranslationCache(key={self.cache_key[:20]}..., hits={self.hit_count})>"
+
+
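A sketch of deriving a deterministic cache_key plus url_hash so lookups can hit idx_cache_lookup and idx_cache_page; the exact key layout is an assumption, not taken from this commit:

import hashlib
from typing import Optional, Tuple

def make_cache_key(page_url: Optional[str], text: str, src: str, tgt: str) -> Tuple[str, Optional[str]]:
    """Derive (cache_key, url_hash) for a TranslationCache row; fits String(128)."""
    content_hash = hashlib.sha256(text.encode()).hexdigest()
    url_hash = hashlib.sha256(page_url.encode()).hexdigest() if page_url else None
    cache_key = f"{src}:{tgt}:{content_hash[:32]}:{(url_hash or 'none')[:16]}"
    return cache_key, url_hash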
+ class TranslationMetrics(Base):
+     """
+     Tracks detailed translation metrics and analytics.
+
+     Supports:
+     - Performance monitoring
+     - Quality analytics
+     - Cost tracking
+     - Usage statistics
+     """
+
+     __tablename__ = "translation_metrics"
+
+     # Primary key and identifiers
+     id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+     job_id = Column(UUID(as_uuid=True), ForeignKey("translation_jobs.id"), nullable=False, index=True)
+     user_id = Column(String(36), ForeignKey("users.id"), nullable=True, index=True)
+
+     # Time period
+     metric_date = Column(DateTime(timezone=True), nullable=False, index=True)
+     period_type = Column(String(20), nullable=False, index=True)  # hourly, daily, weekly, monthly
+
+     # Usage metrics
+     total_requests = Column(Integer, default=0, nullable=False)
+     total_characters = Column(BigInteger, default=0, nullable=False)
+     total_chunks = Column(Integer, default=0, nullable=False)
+     successful_translations = Column(Integer, default=0, nullable=False)
+     failed_translations = Column(Integer, default=0, nullable=False)
+
+     # Performance metrics
+     avg_processing_time_ms = Column(BigInteger, default=0, nullable=False)
+     min_processing_time_ms = Column(BigInteger, default=0, nullable=False)
+     max_processing_time_ms = Column(BigInteger, default=0, nullable=False)
+     p95_processing_time_ms = Column(BigInteger, default=0, nullable=False)
+
+     # Cost metrics
+     total_input_tokens = Column(BigInteger, default=0, nullable=False)
+     total_output_tokens = Column(BigInteger, default=0, nullable=False)
+     total_cost_usd = Column(Numeric(12, 6), default=0.000000, nullable=False)
+     avg_cost_per_char = Column(Numeric(10, 8), default=0.00000000, nullable=False)
+
+     # Quality metrics
+     avg_quality_score = Column(Numeric(5, 2), nullable=True)
+     avg_confidence_score = Column(Numeric(5, 2), nullable=True)
+
+     # Cache metrics
+     cache_hits = Column(Integer, default=0, nullable=False)
+     cache_misses = Column(Integer, default=0, nullable=False)
+     cache_hit_rate = Column(Numeric(5, 2), default=0.0, nullable=False)
+
+     # Error metrics
+     error_count = Column(Integer, default=0, nullable=False)
+     error_rate = Column(Numeric(5, 2), default=0.0, nullable=False)
+     top_error_types = Column(JSON, nullable=True)  # Top 5 error types with counts
+
+     # Additional dimensions
+     source_language = Column(String(10), nullable=True, index=True)
+     target_language = Column(String(10), nullable=True, index=True)
+     model_name = Column(String(50), nullable=True, index=True)
+
+     # Metadata
+     created_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
+     updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now(), nullable=False)
+
+     # Relationships
+     job = relationship("TranslationJob", back_populates="metrics")
+     user = relationship("User", back_populates="translation_metrics")
+
+     # Constraints and indexes
+     __table_args__ = (
+         Index('idx_metrics_date_period', 'metric_date', 'period_type'),
+         Index('idx_user_metrics', 'user_id', 'metric_date'),
+         Index('idx_job_metrics', 'job_id', 'metric_date'),
+         Index('idx_lang_metrics', 'source_language', 'target_language', 'metric_date'),
+         CheckConstraint('total_requests >= 0', name='check_total_requests'),
+         CheckConstraint('total_characters >= 0', name='check_total_characters'),
+         CheckConstraint('cache_hit_rate >= 0 AND cache_hit_rate <= 100', name='check_cache_hit_rate'),
+         CheckConstraint('error_rate >= 0 AND error_rate <= 100', name='check_error_rate'),
+     )
+
+     def __repr__(self):
+         return f"<TranslationMetrics(date={self.metric_date}, requests={self.total_requests})>"
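The two rate columns are constrained to 0-100, so any aggregation task has to guard its divisions. A minimal sketch of the derivation (hypothetical helper, not part of this commit):

from decimal import Decimal

def fill_rates(metrics: "TranslationMetrics") -> None:
    """Derive cache_hit_rate and error_rate from the raw counters, avoiding division by zero."""
    lookups = metrics.cache_hits + metrics.cache_misses
    metrics.cache_hit_rate = Decimal(metrics.cache_hits * 100) / Decimal(max(lookups, 1))
    metrics.error_rate = Decimal(metrics.error_count * 100) / Decimal(max(metrics.total_requests, 1))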
src/models/user_preferences.py ADDED
@@ -0,0 +1,54 @@
+ """
+ User preferences model for storing personalization settings.
+ """
+
+ from sqlalchemy import Column, String, Boolean, Integer, Float, ForeignKey, Text
+ from sqlalchemy.orm import relationship
+ from src.models.base import BaseModel
+
+
+ class UserPreference(BaseModel):
+     """Stores user personalization settings."""
+
+     __tablename__ = "user_preferences"
+
+     user_id = Column(String(36), ForeignKey("users.id"), nullable=False, unique=True, index=True)
+     language = Column(String(10), nullable=False, default='en')  # en, ur, ur-roman
+     reading_pace = Column(String(20), nullable=False, default='medium')  # slow, medium, fast
+     preferred_depth = Column(String(20), nullable=False, default='detailed')  # overview, detailed, comprehensive
+     show_code_examples = Column(Boolean, nullable=False, default=True)
+     adaptive_difficulty = Column(Boolean, nullable=False, default=False)
+     theme = Column(String(20), nullable=False, default='auto')  # light, dark, auto
+     font_size = Column(Integer, nullable=False, default=16)
+     line_height = Column(Float, nullable=False, default=1.5)
+
+     # Relationships
+     user = relationship("User", back_populates="preferences")
+     custom_notes = relationship("UserCustomNote", back_populates="preference", cascade="all, delete-orphan")
+
+     __table_args__ = (
+         {"extend_existing": True},
+     )
+
+     def __repr__(self):
+         return f"<UserPreference(user_id='{self.user_id}', language='{self.language}', theme='{self.theme}')>"
+
+
+ class UserCustomNote(BaseModel):
+     """Custom notes as key-value pairs for user preferences."""
+
+     __tablename__ = "user_custom_notes"
+
+     user_preference_id = Column(String(36), ForeignKey("user_preferences.id"), nullable=False)
+     key = Column(String(100), nullable=False)
+     value = Column(Text, nullable=False)
+
+     # Relationships
+     preference = relationship("UserPreference", back_populates="custom_notes")
+
+     __table_args__ = (
+         {"extend_existing": True},
+     )
+
+     def __repr__(self):
+         return f"<UserCustomNote(key='{self.key}', preference_id='{self.user_preference_id}')>"
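Since user_id is unique, reads and writes reduce to an upsert. A minimal sketch assuming a synchronous SQLAlchemy Session (not part of this commit):

from sqlalchemy.orm import Session

def set_language(db: Session, user_id: str, language: str) -> UserPreference:
    """Create or update the user's preference row."""
    pref = db.query(UserPreference).filter_by(user_id=user_id).one_or_none()
    if pref is None:
        pref = UserPreference(user_id=user_id)
        db.add(pref)
    pref.language = language
    db.commit()
    return pref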
src/services/cache_examples.py ADDED
@@ -0,0 +1,231 @@
+ """
+ Cache service usage examples.
+
+ This file demonstrates how to use the cache service for various scenarios
+ including translations, user preferences, and API response caching.
+ """
+
+ import asyncio
+ from typing import Dict, Any
+ from src.services.cache_service import (
+     get_cache_service,
+     CacheType,
+     cache_translation,
+     get_cached_translation,
+     cache_user_preference,
+     get_cached_user_preference,
+     cache_api_response,
+     get_cached_api_response
+ )
+
+
+ async def example_basic_usage():
+     """Basic cache service usage example."""
+     # Get cache service instance
+     cache = await get_cache_service()
+
+     # Generate a cache key
+     cache_key = cache._generate_cache_key(
+         prefix="example",
+         identifier="user_123",
+         version="v1",
+         param1="value1",
+         param2="value2"
+     )
+
+     # Set a value
+     await cache.set(
+         key=cache_key,
+         value={"message": "Hello, cached world!"},
+         cache_type=CacheType.API_RESPONSE,
+         ttl=60  # 1 minute
+     )
+
+     # Get the value
+     cached_value = await cache.get(cache_key, CacheType.API_RESPONSE)
+     print(f"Cached value: {cached_value}")
+
+     # Delete the value
+     await cache.delete(cache_key)
+
+
+ async def example_translation_caching():
+     """Example of caching translations."""
+     # Cache a translation
+     translation_data = {
+         "en": "Hello, World!",
+         "ur": "ہیلو، دنیا!",
+         "ur-roman": "Hello, Duniya!"
+     }
+
+     success = await cache_translation(
+         key="greeting.hello_world",
+         translation=translation_data,
+         language="all"
+     )
+
+     if success:
+         print("Translation cached successfully")
+
+     # Retrieve cached translation
+     cached_translation = await get_cached_translation(
+         key="greeting.hello_world",
+         language="all"
+     )
+
+     if cached_translation:
+         print(f"Cached translation: {cached_translation}")
+
+
+ async def example_user_preference_caching():
+     """Example of caching user preferences."""
+     # Cache user preferences
+     user_prefs = {
+         "language": "en",
+         "theme": "dark",
+         "font_size": 16,
+         "reading_pace": "medium",
+         "show_code_examples": True
+     }
+
+     success = await cache_user_preference(
+         user_id="user_456",
+         preferences=user_prefs
+     )
+
+     if success:
+         print("User preferences cached successfully")
+
+     # Retrieve cached preferences
+     cached_prefs = await get_cached_user_preference("user_456")
+
+     if cached_prefs:
+         print(f"Cached preferences: {cached_prefs}")
+
+
+ async def example_api_response_caching():
+     """Example of caching API responses."""
+     # Cache API response
+     api_response = {
+         "status": "success",
+         "data": [
+             {"id": 1, "title": "Chapter 1"},
+             {"id": 2, "title": "Chapter 2"}
+         ],
+         "pagination": {
+             "page": 1,
+             "total_pages": 10
+         }
+     }
+
+     success = await cache_api_response(
+         endpoint="/api/v1/chapters",
+         params={"page": 1, "limit": 10},
+         response=api_response,
+         ttl=300  # 5 minutes
+     )
+
+     if success:
+         print("API response cached successfully")
+
+     # Retrieve cached API response
+     cached_response = await get_cached_api_response(
+         endpoint="/api/v1/chapters",
+         params={"page": 1, "limit": 10}
+     )
+
+     if cached_response:
+         print(f"Cached API response: {cached_response}")
+
+
+ async def example_cache_statistics():
+     """Example of retrieving cache statistics."""
+     cache = await get_cache_service()
+
+     # Get cache statistics
+     stats = cache.get_stats()
+
+     print("Cache Statistics:")
+     print(f"  Total requests: {stats['total_requests']}")
+     print(f"  Cache hits: {stats['hits']}")
+     print(f"  Cache misses: {stats['misses']}")
+     print(f"  Hit rate: {stats['hit_rate']}%")
+     print(f"  Redis hits: {stats['redis_hits']}")
+     print(f"  Local hits: {stats['local_hits']}")
+     print(f"  Errors: {stats['errors']}")
+     print(f"  Redis enabled: {stats['redis_enabled']}")
+     print(f"  Memory cache size: {stats['memory_cache_size']}")
+
+
+ async def example_cache_cleanup():
+     """Example of cleaning up expired cache entries."""
+     cache = await get_cache_service()
+
+     # Clean up expired entries
+     cleaned_count = await cache.cleanup_expired()
+     print(f"Cleaned up {cleaned_count} expired cache entries")
+
+     # Clear all cache entries for a specific type
+     cleared_count = await cache.clear(cache_type=CacheType.TRANSLATION)
+     print(f"Cleared {cleared_count} translation cache entries")
+
+     # Clear cache entries matching a pattern
+     cleared_count = await cache.clear(pattern="api:v1:user_*")
+     print(f"Cleared {cleared_count} entries matching pattern")
+
+
+ async def example_concurrent_access():
+     """Example demonstrating thread-safe concurrent access."""
+     async def worker(worker_id: int):
+         cache = await get_cache_service()
+
+         # Each worker uses its own key space
+         key = f"worker_{worker_id}:data"
+
+         for i in range(10):
+             # Set value
+             await cache.set(
+                 key=key,
+                 value={"worker": worker_id, "iteration": i},
+                 cache_type=CacheType.API_RESPONSE,
+                 ttl=60
+             )
+
+             # Get value
+             value = await cache.get(key, CacheType.API_RESPONSE)
+             print(f"Worker {worker_id}, iteration {i}: {value}")
+
+             # Small delay
+             await asyncio.sleep(0.1)
+
+     # Run multiple workers concurrently
+     tasks = [worker(i) for i in range(5)]
+     await asyncio.gather(*tasks)
+
+
+ async def main():
+     """Run all examples."""
+     print("=== Basic Usage ===")
+     await example_basic_usage()
+
+     print("\n=== Translation Caching ===")
+     await example_translation_caching()
+
+     print("\n=== User Preference Caching ===")
+     await example_user_preference_caching()
+
+     print("\n=== API Response Caching ===")
+     await example_api_response_caching()
+
+     print("\n=== Cache Statistics ===")
+     await example_cache_statistics()
+
+     print("\n=== Cache Cleanup ===")
+     await example_cache_cleanup()
+
+     print("\n=== Concurrent Access ===")
+     await example_concurrent_access()
+
+
+ if __name__ == "__main__":
+     asyncio.run(main())
src/services/cache_service.py ADDED
@@ -0,0 +1,690 @@
+ """
+ Cache service for server-side caching with a file-based local fallback.
+
+ Provides Redis caching with a file-backed fallback store, supporting different
+ TTLs for various cache types including translations, user preferences, and
+ API responses.
+ """
+
+ import json
+ import pickle
+ import asyncio
+ from datetime import datetime, timedelta
+ from typing import Any, Dict, List, Optional, Union
+ from enum import Enum
+ import hashlib
+ import os
+ from pathlib import Path
+
+ try:
+     import redis.asyncio as redis
+     REDIS_AVAILABLE = True
+ except ImportError:
+     REDIS_AVAILABLE = False
+     redis = None
+
+ from src.utils.errors import CacheError, ValidationError
+ from src.utils.logging import get_logger
+
+ logger = get_logger(__name__)
+
+
+ class CacheType(Enum):
+     """Cache types with different TTLs."""
+     TRANSLATION = "translation"
+     USER_PREFERENCE = "user_preference"
+     API_RESPONSE = "api_response"
+     PERSONALIZATION = "personalization"
+     PROGRESS = "progress"
+     SEARCH_RESULT = "search_result"
+     BOOKMARK = "bookmark"
+
+
+ class CacheService:
+     """
+     Cache service with Redis primary and a file-based local fallback.
+
+     Features:
+     - Redis as primary cache (if available)
+     - File-backed local store as fallback
+     - TTL support per cache type
+     - Compression for large objects
+     - Statistics tracking
+     - Error handling and logging
+     """
+
+     # TTL configurations (in seconds)
+     TTL_CONFIG = {
+         CacheType.TRANSLATION: 7 * 24 * 60 * 60,       # 7 days
+         CacheType.USER_PREFERENCE: 30 * 24 * 60 * 60,  # 30 days
+         CacheType.API_RESPONSE: 5 * 60,                # 5 minutes
+         CacheType.PERSONALIZATION: 1 * 60 * 60,        # 1 hour
+         CacheType.PROGRESS: 24 * 60 * 60,              # 24 hours
+         CacheType.SEARCH_RESULT: 10 * 60,              # 10 minutes
+         CacheType.BOOKMARK: 30 * 24 * 60 * 60,         # 30 days
+     }
+
+     # Statistics (class-level; the service is used as a process-wide singleton)
+     _stats = {
+         "hits": 0,
+         "misses": 0,
+         "errors": 0,
+         "redis_hits": 0,
+         "local_hits": 0,
+     }
+
+     def __init__(
+         self,
+         redis_url: Optional[str] = None,
+         localStorage_path: Optional[str] = None,
+         enable_redis: bool = True,
+         enable_compression: bool = True,
+         compression_threshold: int = 1024
+     ):
+         """
+         Initialize cache service.
+
+         Args:
+             redis_url: Redis connection URL
+             localStorage_path: Path to the local cache directory
+             enable_redis: Whether to use Redis if available
+             enable_compression: Whether to compress large objects
+             compression_threshold: Size threshold for compression (bytes)
+         """
+         self.redis_url = redis_url or os.getenv("REDIS_URL", "redis://localhost:6379/0")
+         self.localStorage_path = Path(localStorage_path or os.getenv("CACHE_LOCAL_PATH", "./cache_data"))
+         self.enable_redis = enable_redis and REDIS_AVAILABLE
+         self.enable_compression = enable_compression
+         self.compression_threshold = compression_threshold
+
+         self._redis_client = None
+         self._local_cache = {}
+
+         # Initialize the local cache directory
+         self.localStorage_path.mkdir(parents=True, exist_ok=True)
+
+         logger.info(
+             "Cache service initialized",
+             redis_enabled=self.enable_redis,
+             localStorage_path=str(self.localStorage_path),
+             compression_enabled=self.enable_compression
+         )
+
+     async def _get_redis_client(self):
+         """Get or create Redis client."""
+         if not self.enable_redis:
+             return None
+
+         if self._redis_client is None:
+             try:
+                 self._redis_client = redis.from_url(
+                     self.redis_url,
+                     encoding="utf-8",
+                     decode_responses=False,
+                     socket_connect_timeout=5,
+                     socket_timeout=5,
+                     retry_on_timeout=True,
+                     health_check_interval=30
+                 )
+                 # Test connection
+                 await self._redis_client.ping()
+                 logger.info("Redis connection established")
+             except Exception as e:
+                 logger.warning("Failed to connect to Redis", error=str(e))
+                 self.enable_redis = False
+                 self._redis_client = None
+
+         return self._redis_client
+
+     def _generate_cache_key(
+         self,
+         prefix: str,
+         identifier: str,
+         version: str = "v1",
+         **kwargs
+     ) -> str:
+         """
+         Generate a consistent cache key.
+
+         Args:
+             prefix: Cache type or prefix
+             identifier: Unique identifier for the cache entry
+             version: Version of the cache schema
+             **kwargs: Additional parameters to include in key
+
+         Returns:
+             Generated cache key
+         """
+         # Create a stable representation of parameters
+         params = sorted(kwargs.items())
+         param_str = json.dumps(params, sort_keys=True, separators=(',', ':'))
+
+         # Create hash of identifier and params
+         hash_input = f"{identifier}:{param_str}"
+         hash_value = hashlib.sha256(hash_input.encode()).hexdigest()[:16]
+
+         return f"{prefix}:{version}:{identifier}:{hash_value}"
+
+     async def get(
+         self,
+         key: str,
+         cache_type: CacheType = CacheType.API_RESPONSE,
+         use_compression: Optional[bool] = None
+     ) -> Optional[Any]:
+         """
+         Get value from cache.
+
+         Args:
+             key: Cache key
+             cache_type: Type of cache entry
+             use_compression: Override compression setting
+
+         Returns:
+             Cached value or None if not found
+         """
+         try:
+             # Try Redis first
+             if self.enable_redis:
+                 redis_client = await self._get_redis_client()
+                 if redis_client:
+                     value = await self._get_from_redis(
+                         redis_client,
+                         key,
+                         cache_type,
+                         use_compression
+                     )
+                     if value is not None:
+                         self._stats["hits"] += 1
+                         self._stats["redis_hits"] += 1
+                         return value
+
+             # Fall back to the local file cache
+             value = await self._get_from_local(key, cache_type, use_compression)
+             if value is not None:
+                 self._stats["hits"] += 1
+                 self._stats["local_hits"] += 1
+
+                 # If found locally but not in Redis, backfill to Redis
+                 if self.enable_redis:
+                     redis_client = await self._get_redis_client()
+                     if redis_client:
+                         ttl = self.TTL_CONFIG[cache_type]
+                         await self._set_to_redis(
+                             redis_client,
+                             key,
+                             value,
+                             ttl,
+                             use_compression
+                         )
+
+                 return value
+
+             # Cache miss
+             self._stats["misses"] += 1
+             return None
+
+         except Exception as e:
+             self._stats["errors"] += 1
+             logger.error("Cache get failed", key=key, error=str(e))
+             return None
+
+     async def set(
+         self,
+         key: str,
+         value: Any,
+         cache_type: CacheType = CacheType.API_RESPONSE,
+         ttl: Optional[int] = None,
+         use_compression: Optional[bool] = None
+     ) -> bool:
+         """
+         Set value in cache.
+
+         Args:
+             key: Cache key
+             value: Value to cache
+             cache_type: Type of cache entry
+             ttl: Time to live in seconds (overrides type TTL)
+             use_compression: Override compression setting
+
+         Returns:
+             True if successful, False otherwise
+         """
+         try:
+             success = True
+             ttl = ttl or self.TTL_CONFIG[cache_type]
+
+             # Set in Redis
+             if self.enable_redis:
+                 redis_client = await self._get_redis_client()
+                 if redis_client:
+                     success = await self._set_to_redis(
+                         redis_client,
+                         key,
+                         value,
+                         ttl,
+                         use_compression
+                     ) and success
+
+             # Set in the local file cache (always, as fallback)
+             local_success = await self._set_to_local(
+                 key,
+                 value,
+                 cache_type,
+                 ttl,
+                 use_compression
+             )
+             success = local_success and success
+
+             return success
+
+         except Exception as e:
+             self._stats["errors"] += 1
+             logger.error("Cache set failed", key=key, error=str(e))
+             return False
+
+     async def delete(self, key: str) -> bool:
+         """
+         Delete key from cache.
+
+         Args:
+             key: Cache key to delete
+
+         Returns:
+             True if successful, False otherwise
+         """
+         try:
+             # Delete from Redis
+             if self.enable_redis:
+                 redis_client = await self._get_redis_client()
+                 if redis_client:
+                     await redis_client.delete(key)
+
+             # Delete from the local file cache
+             local_file = self.localStorage_path / f"{key}.cache"
+             if local_file.exists():
+                 local_file.unlink()
+
+             # Remove from memory cache
+             if key in self._local_cache:
+                 del self._local_cache[key]
+
+             return True
+
+         except Exception as e:
+             self._stats["errors"] += 1
+             logger.error("Cache delete failed", key=key, error=str(e))
+             return False
+
319
+ async def clear(
320
+ self,
321
+ pattern: Optional[str] = None,
322
+ cache_type: Optional[CacheType] = None
323
+ ) -> int:
324
+ """
325
+ Clear cache entries.
326
+
327
+ Args:
328
+ pattern: Pattern to match keys (supports wildcards)
329
+ cache_type: Clear only this cache type
330
+
331
+ Returns:
332
+ Number of entries cleared
333
+ """
334
+ try:
335
+ cleared_count = 0
336
+
337
+ # Build pattern if cache_type specified
338
+ if cache_type and not pattern:
339
+ pattern = f"{cache_type.value}:*"
340
+
341
+ # Clear from Redis
342
+ if self.enable_redis:
343
+ redis_client = await self._get_redis_client()
344
+ if redis_client:
345
+ if pattern:
346
+ keys = await redis_client.keys(pattern)
347
+ if keys:
348
+ await redis_client.delete(*keys)
349
+ cleared_count += len(keys)
350
+ else:
351
+ await redis_client.flushdb()
352
+ cleared_count = -1 # Indicate full clear
353
+
354
+ # Clear from localStorage
355
+ if pattern:
356
+ # Convert pattern to file pattern
357
+ file_pattern = pattern.replace("*", "").replace(":", "_") + "*.cache"
358
+ for cache_file in self.localStorage_path.glob(file_pattern):
359
+ cache_file.unlink()
360
+ cleared_count += 1
361
+ else:
362
+ # Clear all files
363
+ for cache_file in self.localStorage_path.glob("*.cache"):
364
+ cache_file.unlink()
365
+ cleared_count += 1
366
+
367
+ # Clear memory cache
368
+ self._local_cache.clear()
369
+
370
+ logger.info("Cache cleared", pattern=pattern, count=cleared_count)
371
+ return cleared_count
372
+
373
+ except Exception as e:
374
+ self._stats["errors"] += 1
375
+ logger.error("Cache clear failed", pattern=pattern, error=str(e))
376
+ return 0
377
+
378
+ async def _get_from_redis(
379
+ self,
380
+ redis_client,
381
+ key: str,
382
+ cache_type: CacheType,
383
+ use_compression: Optional[bool]
384
+ ) -> Optional[Any]:
385
+ """Get value from Redis."""
386
+ try:
387
+ data = await redis_client.get(key)
388
+ if data is None:
389
+ return None
390
+
391
+ # Uncompress if needed
392
+ if use_compression or (use_compression is None and self.enable_compression):
393
+ if data.startswith(b"COMP:"):
394
+ import gzip
395
+ data = gzip.decompress(data[5:])
396
+
397
+ # Deserialize
398
+ return pickle.loads(data)
399
+
400
+ except Exception as e:
401
+ logger.warning("Redis get failed", key=key, error=str(e))
402
+ return None
403
+
404
+ async def _set_to_redis(
405
+ self,
406
+ redis_client,
407
+ key: str,
408
+ value: Any,
409
+ ttl: int,
410
+ use_compression: Optional[bool]
411
+ ) -> bool:
412
+ """Set value in Redis."""
413
+ try:
414
+ # Serialize
415
+ data = pickle.dumps(value)
416
+
417
+ # Compress if needed
418
+ if (use_compression or (use_compression is None and self.enable_compression)) \
419
+ and len(data) > self.compression_threshold:
420
+ import gzip
421
+ data = b"COMP:" + gzip.compress(data)
422
+
423
+ await redis_client.setex(key, ttl, data)
424
+ return True
425
+
426
+ except Exception as e:
427
+ logger.warning("Redis set failed", key=key, error=str(e))
428
+ return False
429
+
430
+ async def _get_from_local(
431
+ self,
432
+ key: str,
433
+ cache_type: CacheType,
434
+ use_compression: Optional[bool]
435
+ ) -> Optional[Any]:
436
+ """Get value from localStorage."""
437
+ try:
438
+ # Check memory cache first
439
+ cache_entry = self._local_cache.get(key)
440
+ if cache_entry:
441
+ # Check if expired
442
+ if cache_entry["expires"] > datetime.utcnow():
443
+ return cache_entry["value"]
444
+ else:
445
+ # Remove expired entry
446
+ del self._local_cache[key]
447
+
448
+ # Check file cache
449
+ cache_file = self.localStorage_path / f"{key}.cache"
450
+ if not cache_file.exists():
451
+ return None
452
+
453
+ # Read and validate file
454
+ data = cache_file.read_bytes()
455
+ cache_entry = json.loads(data.decode())
456
+
457
+ # Check if expired
458
+ expires = datetime.fromisoformat(cache_entry["expires"])
459
+ if expires <= datetime.utcnow():
460
+ cache_file.unlink()
461
+ return None
462
+
463
+ # Decode value
464
+ if cache_entry.get("compressed") and (use_compression or (use_compression is None and self.enable_compression)):
465
+ import gzip
466
+ value = pickle.loads(gzip.decompress(cache_entry["value"].encode()))
467
+ else:
468
+ value = pickle.loads(cache_entry["value"].encode())
469
+
470
+ # Update memory cache
471
+ self._local_cache[key] = {
472
+ "value": value,
473
+ "expires": expires
474
+ }
475
+
476
+ return value
477
+
478
+ except Exception as e:
479
+ logger.warning("Local cache get failed", key=key, error=str(e))
480
+ return None
481
+
482
+ async def _set_to_local(
483
+ self,
484
+ key: str,
485
+ value: Any,
486
+ cache_type: CacheType,
487
+ ttl: int,
488
+ use_compression: Optional[bool]
489
+ ) -> bool:
490
+ """Set value in localStorage."""
491
+ try:
492
+ expires = datetime.utcnow() + timedelta(seconds=ttl)
493
+
494
+ # Compress if needed
495
+ compressed = False
496
+ if (use_compression or (use_compression is None and self.enable_compression)):
497
+ serialized = pickle.dumps(value)
498
+ if len(serialized) > self.compression_threshold:
499
+ import gzip
500
+ value_serialized = gzip.compress(serialized).decode()
501
+ compressed = True
502
+ else:
503
+ value_serialized = serialized.decode()
504
+ else:
505
+ value_serialized = pickle.dumps(value).decode()
506
+
507
+ # Create cache entry
508
+ cache_entry = {
509
+ "value": value_serialized,
510
+ "expires": expires.isoformat(),
511
+ "compressed": compressed,
512
+ "cache_type": cache_type.value
513
+ }
514
+
515
+ # Write to file
516
+ cache_file = self.localStorage_path / f"{key}.cache"
517
+ cache_file.write_bytes(json.dumps(cache_entry).encode())
518
+
519
+ # Update memory cache
520
+ self._local_cache[key] = {
521
+ "value": value,
522
+ "expires": expires
523
+ }
524
+
525
+ return True
526
+
527
+ except Exception as e:
528
+ logger.warning("Local cache set failed", key=key, error=str(e))
529
+ return False
530
+
531
+ def get_stats(self) -> Dict[str, Any]:
532
+ """Get cache statistics."""
533
+ total_requests = self._stats["hits"] + self._stats["misses"]
534
+ hit_rate = self._stats["hits"] / max(total_requests, 1) * 100
535
+
536
+ return {
537
+ **self._stats,
538
+ "total_requests": total_requests,
539
+ "hit_rate": round(hit_rate, 2),
540
+ "redis_enabled": self.enable_redis,
541
+ "memory_cache_size": len(self._local_cache)
542
+ }
543
+
544
+ async def cleanup_expired(self) -> int:
545
+ """Clean up expired cache entries."""
546
+ cleaned = 0
547
+
548
+ try:
549
+ # Clean memory cache
550
+ now = datetime.utcnow()
551
+ expired_keys = [
552
+ key for key, entry in self._local_cache.items()
553
+ if entry["expires"] <= now
554
+ ]
555
+
556
+ for key in expired_keys:
557
+ del self._local_cache[key]
558
+ cleaned += 1
559
+
560
+ # Clean file cache
561
+ for cache_file in self.localStorage_path.glob("*.cache"):
562
+ try:
563
+ data = json.loads(cache_file.read_bytes().decode())
564
+ expires = datetime.fromisoformat(data["expires"])
565
+ if expires <= datetime.utcnow():
566
+ cache_file.unlink()
567
+ cleaned += 1
568
+ except:
569
+ # Invalid cache file, remove it
570
+ cache_file.unlink()
571
+ cleaned += 1
572
+
573
+ logger.info("Cache cleanup completed", cleaned_entries=cleaned)
574
+ return cleaned
575
+
576
+ except Exception as e:
577
+ logger.error("Cache cleanup failed", error=str(e))
578
+ return 0
579
+
580
+
581
+ # Global cache service instance
582
+ _cache_service: Optional[CacheService] = None
583
+
584
+
585
+ async def get_cache_service() -> CacheService:
586
+ """Get or create cache service instance."""
587
+ global _cache_service
588
+
589
+ if _cache_service is None:
590
+ _cache_service = CacheService()
591
+
592
+ return _cache_service
593
+
594
+
595
+ # Utility functions for specific cache types
596
+ async def cache_translation(
597
+ key: str,
598
+ translation: Dict[str, Any],
599
+ language: str
600
+ ) -> bool:
601
+ """Cache a translation entry."""
602
+ cache = await get_cache_service()
603
+ cache_key = cache._generate_cache_key(
604
+ "translation",
605
+ key,
606
+ lang=language
607
+ )
608
+ return await cache.set(
609
+ cache_key,
610
+ translation,
611
+ CacheType.TRANSLATION
612
+ )
613
+
614
+
615
+ async def get_cached_translation(
616
+ key: str,
617
+ language: str
618
+ ) -> Optional[Dict[str, Any]]:
619
+ """Get cached translation."""
620
+ cache = await get_cache_service()
621
+ cache_key = cache._generate_cache_key(
622
+ "translation",
623
+ key,
624
+ lang=language
625
+ )
626
+ return await cache.get(cache_key, CacheType.TRANSLATION)
627
+
628
+
629
+ async def cache_user_preference(
630
+ user_id: str,
631
+ preferences: Dict[str, Any]
632
+ ) -> bool:
633
+ """Cache user preferences."""
634
+ cache = await get_cache_service()
635
+ cache_key = cache._generate_cache_key(
636
+ "user_pref",
637
+ user_id
638
+ )
639
+ return await cache.set(
640
+ cache_key,
641
+ preferences,
642
+ CacheType.USER_PREFERENCE
643
+ )
644
+
645
+
646
+ async def get_cached_user_preference(
647
+ user_id: str
648
+ ) -> Optional[Dict[str, Any]]:
649
+ """Get cached user preferences."""
650
+ cache = await get_cache_service()
651
+ cache_key = cache._generate_cache_key(
652
+ "user_pref",
653
+ user_id
654
+ )
655
+ return await cache.get(cache_key, CacheType.USER_PREFERENCE)
656
+
657
+
658
+ async def cache_api_response(
659
+ endpoint: str,
660
+ params: Dict[str, Any],
661
+ response: Dict[str, Any],
662
+ ttl: Optional[int] = None
663
+ ) -> bool:
664
+ """Cache API response."""
665
+ cache = await get_cache_service()
666
+ cache_key = cache._generate_cache_key(
667
+ "api",
668
+ endpoint,
669
+ **params
670
+ )
671
+ return await cache.set(
672
+ cache_key,
673
+ response,
674
+ CacheType.API_RESPONSE,
675
+ ttl=ttl
676
+ )
677
+
678
+
679
+ async def get_cached_api_response(
680
+ endpoint: str,
681
+ params: Dict[str, Any]
682
+ ) -> Optional[Dict[str, Any]]:
683
+ """Get cached API response."""
684
+ cache = await get_cache_service()
685
+ cache_key = cache._generate_cache_key(
686
+ "api",
687
+ endpoint,
688
+ **params
689
+ )
690
+ return await cache.get(cache_key, CacheType.API_RESPONSE)
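The on-disk entry format here is a JSON wrapper around a base64 payload that may be gzip-compressed, because pickle output is binary and cannot round-trip through a plain str encode/decode. A standalone round-trip sketch of that framing (independent of the service; the helper names are hypothetical):

import base64, gzip, json, pickle

def encode_entry(value, compress_threshold: int = 1024) -> str:
    """Serialize a value the way _set_to_local stores it."""
    raw = pickle.dumps(value)
    compressed = len(raw) > compress_threshold
    if compressed:
        raw = gzip.compress(raw)
    return json.dumps({"value": base64.b64encode(raw).decode("ascii"), "compressed": compressed})

def decode_entry(payload: str):
    """Invert encode_entry, mirroring _get_from_local."""
    entry = json.loads(payload)
    raw = base64.b64decode(entry["value"])
    if entry["compressed"]:
        raw = gzip.decompress(raw)
    return pickle.loads(raw)

assert decode_entry(encode_entry({"a": 1})) == {"a": 1}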
src/services/code_block_handler.py ADDED
@@ -0,0 +1,630 @@
+ """
+ Code Block Handler for Translation System.
+
+ This module handles detection, preservation, and intelligent processing
+ of code blocks during translation.
+ """
+
+ import re
+ from typing import Dict, List, Optional, Any, Tuple, Set
+ from dataclasses import dataclass
+ from enum import Enum
+
+ from bs4 import BeautifulSoup, Tag
+ import markdown
+ from pygments import highlight
+ from pygments.lexers import get_lexer_by_name, guess_lexer
+ from pygments.formatters import HtmlFormatter
+
+ from src.utils.translation_logger import get_translation_logger
+
+ logger = get_translation_logger(__name__)
+
+
+ class CodeBlockType(Enum):
+     """Types of code blocks."""
+     MARKDOWN = "markdown"
+     HTML_PRE = "html_pre"
+     HTML_INLINE = "html_inline"
+     INDENTED = "indented"
+     FENCED = "fenced"
+
+
+ @dataclass
+ class CodeBlock:
+     """Represents a detected code block."""
+     block_type: CodeBlockType
+     language: Optional[str]
+     content: str
+     original_text: str
+     start_position: int
+     end_position: int
+     attributes: Dict[str, Any]
+     preserve_formatting: bool = True
+     add_urdu_comments: bool = False
+     translated: bool = False
+
+
+ class CodeBlockHandler:
+     """
+     Handles code block detection, preservation, and processing.
+
+     Features:
+     - Multi-format code block detection
+     - Language identification
+     - Format preservation
+     - Urdu comment injection
+     - Syntax highlighting
+     - Code validation
+     """
+
+     # Code block patterns
+     PATTERNS = {
+         CodeBlockType.MARKDOWN: [
+             re.compile(r'```(\w+)?\n(.*?)\n```', re.DOTALL),
+             re.compile(r'~~~(\w+)?\n(.*?)\n~~~', re.DOTALL),
+         ],
+         CodeBlockType.HTML_PRE: [
+             re.compile(r'<pre[^>]*>.*?<code[^>]*>(.*?)</code>.*?</pre>', re.DOTALL | re.IGNORECASE),
+         ],
+         CodeBlockType.HTML_INLINE: [
+             re.compile(r'<code[^>]*>(.*?)</code>', re.DOTALL | re.IGNORECASE),
+         ],
+         CodeBlockType.INDENTED: [
+             # Detect four spaces or a tab at start of line (the four literal
+             # spaces had collapsed to a single space in the previous pattern)
+             re.compile(r'^(    |\t).*$', re.MULTILINE),
+         ],
+     }
+
+     # Language patterns for detection
+     LANGUAGE_PATTERNS = {
+         'python': [r'import\s+\w+', r'def\s+\w+', r'class\s+\w+', r'if\s+__name__\s*=='],
+         'javascript': [r'function\s+\w+', r'const\s+\w+\s*=', r'let\s+\w+\s*=', r'var\s+\w+\s*='],
+         'java': [r'public\s+class\s+\w+', r'private\s+\w+\s+\w+', r'import\s+java\.'],
+         'cpp': [r'#include\s*<', r'using\s+namespace\s+', r'::\w+\s*\('],
+         'html': [r'<!DOCTYPE\s+html>', r'<html[^>]*>', r'<div[^>]*>'],
+         'css': [r'\.[\w-]+\s*{', r'#[\w-]+\s*{', r'@\w+\s*\w+\s*{'],
+         'sql': [r'SELECT\s+', r'FROM\s+', r'WHERE\s+', r'INSERT\s+INTO'],
+         'json': [r'^\s*{\s*"', r'^\s*\[', r'"[^"]*":\s*'],
+         'yaml': [r'^\s*\w+:', r'^\s+-\s+', r'^\s* \w+:'],
+         'bash': [r'#!/bin/bash', r'echo\s+', r'export\s+\w+='],
+         'powershell': [r'Write-Host\s+', r'$\w+\s*=', r'Get-'],
+         'dockerfile': [r'FROM\s+\w+', r'RUN\s+', r'CMD\s+'],
+     }
+
+     # Common programming keywords
+     PROGRAMMING_KEYWORDS = [
+         'function', 'class', 'import', 'export', 'return', 'if', 'else', 'for', 'while',
+         'def', 'var', 'let', 'const', 'try', 'catch', 'throw', 'new', 'this', 'super'
+     ]
+
+     def __init__(self):
+         """Initialize code block handler."""
+         self.detected_languages: Set[str] = set()
+         self.urdu_comments = {
+             'python': '#',
+             'javascript': '//',
+             'java': '//',
+             'cpp': '//',
+             'c': '//',
+             'css': '/*',
+             'sql': '--',
+             'bash': '#',
+             'powershell': '#',
+         }
+
+     def detect_code_blocks(
+         self,
+         content: str,
+         source_format: str = "html"
+     ) -> List[CodeBlock]:
+         """
+         Detect all code blocks in content.
+
+         Args:
+             content: Content to analyze
+             source_format: Format type (html, markdown, etc.)
+
+         Returns:
+             List of detected code blocks
+         """
+         logger.info(
+             "Detecting code blocks",
+             content_length=len(content),
+             source_format=source_format
+         )
+
+         blocks = []
+
+         # Try each pattern type
+         for block_type, patterns in self.PATTERNS.items():
+             for pattern in patterns:
+                 matches = list(pattern.finditer(content))
+                 for match in matches:
+                     block = self._create_code_block(
+                         match, block_type, content
+                     )
+                     if block:
+                         blocks.append(block)
+
+         # Remove duplicates (blocks that overlap)
+         blocks = self._remove_overlapping_blocks(blocks)
+
+         # Detect language for each block; keep an explicit fence language
+         # rather than overwriting it with a heuristic guess
+         for block in blocks:
+             if not block.language:
+                 block.language = self._detect_language(block.content)
+
+         logger.info(
+             "Code blocks detected",
+             total_blocks=len(blocks),
+             languages=list(set(b.language for b in blocks if b.language)),
+             block_types=[b.block_type.value for b in blocks]
+         )
+
+         return blocks
+
166
+ def _create_code_block(
167
+ self,
168
+ match: re.Match,
169
+ block_type: CodeBlockType,
170
+ content: str
171
+ ) -> Optional[CodeBlock]:
172
+ """Create a CodeBlock object from a regex match."""
173
+ start_pos = match.start()
174
+ end_pos = match.end()
175
+ original_text = match.group(0)
176
+
177
+ if block_type in [CodeBlockType.MARKDOWN, CodeBlockType.FENCED]:
178
+ # Extract language from fence
179
+ language = match.group(1) if match.groups() and match.group(1) else None
180
+ code_content = match.group(2) if match.groups() and len(match.groups()) > 1 else ""
181
+ elif block_type == CodeBlockType.HTML_PRE:
182
+ # Extract from HTML pre/code structure
183
+ soup = BeautifulSoup(original_text, 'html.parser')
184
+ code_tag = soup.find('code')
185
+ if code_tag:
186
+ language = self._extract_language_from_classes(code_tag.get('class', []))
187
+ code_content = code_tag.get_text()
188
+ else:
189
+ language = None
190
+ code_content = original_text
191
+ elif block_type == CodeBlockType.HTML_INLINE:
192
+ # Inline code
193
+ soup = BeautifulSoup(original_text, 'html.parser')
194
+ code_content = soup.get_text()
195
+ language = None
196
+ else:
197
+ # Other types
198
+ code_content = original_text
199
+ language = None
200
+
201
+ if not code_content.strip():
202
+ return None
203
+
204
+ return CodeBlock(
205
+ block_type=block_type,
206
+ language=language,
207
+ content=code_content,
208
+ original_text=original_text,
209
+ start_position=start_pos,
210
+ end_position=end_pos,
211
+ attributes={'match_groups': match.groups()},
212
+ preserve_formatting=True,
213
+ add_urdu_comments=self._should_add_urdu_comments(code_content, language)
214
+ )
215
+
216
+ def _remove_overlapping_blocks(self, blocks: List[CodeBlock]) -> List[CodeBlock]:
217
+ """Remove overlapping code blocks."""
218
+ if not blocks:
219
+ return []
220
+
221
+ # Sort by start position
222
+ blocks.sort(key=lambda x: x.start_position)
223
+
224
+ filtered_blocks = []
225
+ last_end = -1
226
+
227
+ for block in blocks:
228
+ if block.start_position >= last_end:
229
+ filtered_blocks.append(block)
230
+ last_end = block.end_position
231
+
232
+ return filtered_blocks
233
+
+     def _detect_language(self, code_content: str) -> Optional[str]:
+         """Detect the programming language of code content."""
+         # Try language hints first
+         language = self._detect_language_from_hints(code_content)
+         if language:
+             return language
+
+         # Try pattern matching
+         language = self._detect_language_from_patterns(code_content)
+         if language:
+             return language
+
+         # Use pygments as a fallback
+         try:
+             lexer = guess_lexer(code_content)
+             if lexer:
+                 return lexer.name.lower()
+         except Exception:
+             # pygments raises ClassNotFound when it cannot guess a lexer
+             pass
+
+         return None
+
+     def _detect_language_from_hints(self, code_content: str) -> Optional[str]:
+         """Detect language from explicit hints."""
+         # Check for a shebang line
+         shebang_match = re.match(r'^#!\s*/.*(?:python|node|bash|perl|ruby|php)\s*', code_content, re.MULTILINE)
+         if shebang_match:
+             shebang = shebang_match.group()
+             if 'python' in shebang:
+                 return 'python'
+             elif 'node' in shebang:
+                 return 'javascript'
+             elif 'bash' in shebang:
+                 return 'bash'
+             elif 'perl' in shebang:
+                 return 'perl'
+             elif 'ruby' in shebang:
+                 return 'ruby'
+             elif 'php' in shebang:
+                 return 'php'
+
+         # Fallback: any other shebang is most likely a shell script
+         if code_content.strip().startswith('#!'):
+             return 'bash'
+
+         return None
+
+     def _detect_language_from_patterns(self, code_content: str) -> Optional[str]:
+         """Detect language using pattern matching."""
+         scores = {}
+
+         for language, patterns in self.LANGUAGE_PATTERNS.items():
+             score = 0
+             for pattern in patterns:
+                 matches = len(list(re.finditer(pattern, code_content, re.MULTILINE)))
+                 score += matches
+
+             if score > 0:
+                 scores[language] = score
+
+         if scores:
+             return max(scores.items(), key=lambda x: x[1])[0]
+
+         return None
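+     # Scoring illustration (counts are hypothetical): for a snippet containing
+     # "def foo():" and "import os", the Python patterns might match twice while
+     # the JavaScript patterns match nothing, giving scores == {'python': 2};
+     # max() then picks 'python'. Ties resolve to whichever entry max() sees first.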
+
+     def _extract_language_from_classes(self, classes: List[str]) -> Optional[str]:
+         """Extract language from CSS classes."""
+         for cls in classes:
+             if isinstance(cls, str):
+                 # Check for language- prefixed classes
+                 if cls.startswith('language-'):
+                     return cls[9:]
+                 # Check for known language classes
+                 if cls.lower() in ['python', 'javascript', 'java', 'cpp', 'c', 'html', 'css', 'sql', 'json']:
+                     return cls.lower()
+                 # Check for highlight.js classes
+                 if cls.startswith('hljs-'):
+                     lang = cls[5:]
+                     if lang != 'language':
+                         return lang
+
+         return None
+
+     def _should_add_urdu_comments(self, code_content: str, language: Optional[str]) -> bool:
+         """Determine if Urdu comments should be added."""
+         if not language or language not in self.urdu_comments:
+             return False
+
+         # Don't add comments to very short code blocks
+         if len(code_content.split('\n')) < 3:
+             return False
+
+         # Don't add if there are already comments in the target language
+         comment_char = self.urdu_comments[language]
+         if comment_char and comment_char in code_content:
+             # Check for non-English characters in comments
+             comment_pattern = re.compile(f'{re.escape(comment_char)}.*[^\x00-\x7F]+')
+             if comment_pattern.search(code_content):
+                 return False
+
+         return True
+
+     def add_urdu_comments(self, code_block: CodeBlock) -> str:
+         """
+         Add Urdu explanatory comments to a code block.
+
+         Args:
+             code_block: Code block to enhance
+
+         Returns:
+             Code block content with Urdu comments added
+         """
+         if not code_block.language or not code_block.add_urdu_comments:
+             return code_block.content
+
+         language = code_block.language
+         comment_char = self.urdu_comments[language]
+
+         lines = code_block.content.split('\n')
+         enhanced_lines = []
+
+         for line in lines:
+             enhanced_lines.append(line)
+
+             # Never add comments after lines that are already comments
+             if self._is_comment_line(line, language):
+                 continue
+
+             # Add an Urdu comment after function/class definitions
+             if re.search(r'^(def|function|class|interface)\s+\w+', line):
+                 # Extract the function/class name
+                 match = re.search(r'(def|function|class|interface)\s+(\w+)', line)
+                 if match:
+                     name = match.group(2)
+                     urdu_translation = self._translate_code_name(name)
+                     enhanced_lines.append(f"{comment_char} {urdu_translation}")
+
+             # Add a comment after important statements
+             elif re.search(r'\b(return|break|continue|pass)\b', line):
+                 urdu_comment = self._translate_statement(line.strip())
+                 if urdu_comment:
+                     enhanced_lines.append(f"{comment_char} {urdu_comment}")
+
+             # Add a comment after imports
+             elif re.match(r'^(import|from|include)\s+', line):
+                 urdu_comment = self._translate_import(line.strip())
+                 if urdu_comment:
+                     enhanced_lines.append(f"{comment_char} {urdu_comment}")
+
+         return '\n'.join(enhanced_lines)
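+     # Illustrative result for Python code (comment char '#'): an input line
+     #     def calculate(x):
+     # is followed by an inserted line
+     #     # حساب لگانا
+     # while lines that are already comments pass through unchanged.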
+
+     def _is_comment_line(self, line: str, language: str) -> bool:
+         """Check if a line is already a comment."""
+         comment_char = self.urdu_comments.get(language, '')
+         return bool(comment_char) and line.strip().startswith(comment_char)
+
+     def _translate_code_name(self, name: str) -> str:
+         """Translate a code identifier to Urdu."""
+         # Common translations
+         translations = {
+             'main': 'مین',
+             'init': 'ابتدائی',
+             'start': 'شروع',
+             'setup': 'سیٹ اپ',
+             'run': 'چلائیں',
+             'process': 'پروسیس کرنا',
+             'handle': 'ہینڈل کریں',
+             'update': 'اپڈیٹ کرنا',
+             'get': 'حاصل کریں',
+             'set': 'سیٹ کرنا',
+             'create': 'بنانا',
+             'delete': 'حذف کرنا',
+             'calculate': 'حساب لگانا',
+             'validate': 'تصدیق کرنا',
+             'convert': 'تبدیل کرنا',
+             'transform': 'تبدیل کرنا',
+             'parse': 'پارس کرنا',
+             'render': 'رینڈر کرنا',
+             'fetch': 'لانا',
+             'send': 'بھیجنا',
+             'receive': 'وصول کرنا',
+             'connect': 'ربط جوڑنا',
+             'close': 'بند کرنا',
+             'open': 'کھولنا',
+             'save': 'محفوظ کرنا',
+             'load': 'لوڈ کرنا',
+             'read': 'پڑھنا',
+             'write': 'لکھنا',
+         }
+
+         return translations.get(name, name)
+
+     def _translate_statement(self, statement: str) -> Optional[str]:
+         """Translate a code statement to Urdu."""
+         # Common statement translations
+         translations = {
+             'return': 'واپس کریں',
+             'break': 'روک جائیں',
+             'continue': 'جاری رکھیں',
+             'pass': 'چھوڑ دیں',
+             'yield': 'دیں',
+             'raise': 'اٹھائیں',
+             'try': 'کوشش کریں',
+             'except': 'استثنا',
+             'finally': 'آخر میں',
+             'assert': 'تصدیق کریں',
+             'del': 'حذف کریں',
+         }
+
+         # Extract the keyword
+         match = re.search(r'\b(' + '|'.join(translations.keys()) + r')\b', statement)
+         if match:
+             keyword = match.group(1)
+             translated = translations.get(keyword, keyword)
+             return statement.replace(keyword, translated, 1)
+
+         return None
+
+     def _translate_import(self, import_statement: str) -> Optional[str]:
+         """Translate an import statement to Urdu."""
+         if 'import ' in import_statement:
+             return 'لائبریری امپورٹ کریں'
+         elif 'from ' in import_statement:
+             return 'سے امپورٹ کریں'
+         elif 'include ' in import_statement:
+             return 'شامل کریں'
+
+         return None
+
+     def preserve_code_blocks(
+         self,
+         original_content: str,
+         translated_content: str,
+         code_blocks: List[CodeBlock]
+     ) -> str:
+         """
+         Preserve code blocks in translated content.
+
+         Args:
+             original_content: Original content with code blocks
+             translated_content: Translated content
+             code_blocks: Detected code blocks
+
+         Returns:
+             Content with the original code blocks preserved
+         """
+         logger.info(
+             "Preserving code blocks",
+             original_blocks=len(code_blocks)
+         )
+
+         # Replace translated code blocks with the originals
+         result = translated_content
+         blocks_preserved = 0
+
+         for block in code_blocks:
+             # Find and replace the corresponding block in the translated content.
+             # This is simplified - in practice, you'd want more precise matching.
+             translated_block_content = self._find_translated_block(
+                 result, block, original_content
+             )
+
+             if translated_block_content is not None:
+                 # Replace with the original
+                 result = result.replace(
+                     translated_block_content,
+                     block.original_text,
+                     1
+                 )
+                 blocks_preserved += 1
+
+             # Add Urdu comments if configured
+             if block.add_urdu_comments:
+                 enhanced_code = self.add_urdu_comments(block)
+                 result = result.replace(
+                     block.original_text,
+                     enhanced_code,
+                     1
+                 )
+
+         logger.info(
+             "Code blocks preserved",
+             blocks_preserved=blocks_preserved,
+             blocks_total=len(code_blocks)
+         )
+
+         return result
+
+     def _find_translated_block(
+         self,
+         content: str,
+         original_block: CodeBlock,
+         original_content: str
+     ) -> Optional[str]:
+         """Find the translated version of a code block."""
+         # This is a simplified implementation; in practice, blocks would be
+         # tracked more precisely during translation.
+
+         # Look for the block content in the translated content.
+         # This may not match exactly, since translation can alter the text.
+         if original_block.content in content:
+             return original_block.content
+
+         # Try to find the block by looking for unique lines
+         original_lines = original_block.content.split('\n')
+         if len(original_lines) > 3:
+             # Use the first and last lines as markers
+             first_line = original_lines[0]
+             last_line = original_lines[-1]
+
+             if first_line in content and last_line in content:
+                 # Extract the content between the markers
+                 start = content.find(first_line)
+                 end = content.rfind(last_line) + len(last_line)
+                 return content[start:end]
+
+         return None
+
+     def add_syntax_highlighting(
+         self,
+         code_block: CodeBlock,
+         theme: str = "default"
+     ) -> str:
+         """
+         Add syntax highlighting to a code block.
+
+         Args:
+             code_block: Code block to highlight
+             theme: Highlighting theme
+
+         Returns:
+             HTML with syntax highlighting
+         """
+         try:
+             lexer = get_lexer_by_name(code_block.language or 'text')
+             formatter = HtmlFormatter(
+                 style=theme,
+                 linenos=True,
+                 cssclass="highlight"
+             )
+             return highlight(code_block.content, lexer, formatter)
+         except Exception:
+             # Fall back to a plain, HTML-escaped code block
+             escaped = (
+                 code_block.content
+                 .replace('&', '&amp;')
+                 .replace('<', '&lt;')
+                 .replace('>', '&gt;')
+             )
+             return f'<pre><code>{escaped}</code></pre>'
+
+     def validate_code_blocks(
+         self,
+         code_blocks: List[CodeBlock],
+         content: str
+     ) -> Dict[str, Any]:
+         """
+         Validate detected code blocks.
+
+         Args:
+             code_blocks: Detected code blocks
+             content: Original content
+
+         Returns:
+             Validation report
+         """
+         report = {
+             'valid': True,
+             'warnings': [],
+             'errors': [],
+             'stats': {
+                 'total_blocks': len(code_blocks),
+                 'languages_detected': list(set(b.language for b in code_blocks if b.language)),
+                 'blocks_with_languages': len([b for b in code_blocks if b.language])
+             }
+         }
+
+         for block in code_blocks:
+             # Check for empty blocks
+             if not block.content.strip():
+                 report['warnings'].append(
+                     f"Empty code block at position {block.start_position}"
+                 )
+
+             # Check for very long blocks
+             if len(block.content) > 10000:
+                 report['warnings'].append(
+                     f"Very long code block ({len(block.content)} chars) at position {block.start_position}"
+                 )
+
+             # Check for potential formatting issues
+             if block.block_type == CodeBlockType.INDENTED and block.content.strip():
+                 report['warnings'].append(
+                     f"Indented code block detected at position {block.start_position} - might be unintentional"
+                 )
+
+         logger.info(
+             "Code block validation complete",
+             total_warnings=len(report['warnings']),
+             total_errors=len(report['errors'])
+         )
+
+         return report
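+     # A minimal usage sketch (the enclosing class name and the detection entry
+     # point are assumed here, not shown in this diff):
+     #
+     #     handler = CodeBlockHandler()
+     #     blocks = handler.detect_code_blocks(markdown_text)  # hypothetical entry point
+     #     report = handler.validate_code_blocks(blocks, markdown_text)
+     #     for warning in report['warnings']:
+     #         logger.warning("Code block issue", detail=warning)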
src/services/content_reconstructor.py ADDED
@@ -0,0 +1,471 @@
+ """
+ Content Reconstructor for Translation System.
+
+ This module reconstructs HTML content from parsed elements,
+ injecting translated text while preserving original formatting
+ and structure.
+ """
+
+ from typing import Dict, List, Optional, Any
+ from dataclasses import dataclass
+
+ import re
+
+ from bs4 import BeautifulSoup, Tag
+
+ from src.services.html_parser import ContentElement, ContentType
+ from src.utils.translation_logger import get_translation_logger
+
+ logger = get_translation_logger(__name__)
+
+
+ @dataclass
+ class ReconstructionConfig:
+     """Configuration for content reconstruction."""
+     preserve_classes: bool = True
+     preserve_ids: bool = True
+     preserve_data_attributes: bool = False
+     preserve_style: bool = True
+     add_translation_markers: bool = False
+     cleanup_empty_elements: bool = True
+
+
+ class ContentReconstructor:
+     """
+     Reconstructs HTML content from parsed elements with translations.
+
+     Features:
+     - Recursive HTML reconstruction
+     - Formatting preservation
+     - Code block protection
+     - Translation marker injection
+     - Structure validation
+     """
+
+     def __init__(self, config: Optional[ReconstructionConfig] = None):
+         """
+         Initialize content reconstructor.
+
+         Args:
+             config: Reconstruction configuration
+         """
+         self.config = config or ReconstructionConfig()
+         self.translation_markers = {
+             'translated': 'data-translated="true"',
+             'original': 'data-original="',
+             'preserve': 'data-preserve="true"'
+         }
+
+     def reconstruct_html(
+         self,
+         elements: List[ContentElement],
+         translated_map: Dict[str, str],
+         base_format: str = "html"
+     ) -> str:
+         """
+         Reconstruct HTML from parsed elements with translations.
+
+         Args:
+             elements: Parsed content elements
+             translated_map: Mapping of original text to translated text
+             base_format: Base format (html, markdown, etc.)
+
+         Returns:
+             Reconstructed HTML content
+         """
+         logger.info(
+             "Reconstructing HTML content",
+             elements_count=len(elements),
+             translations_count=len(translated_map),
+             base_format=base_format
+         )
+
+         # Create the base document
+         soup = BeautifulSoup("", "html.parser")
+         if base_format == "html":
+             body = soup.new_tag("body")
+             soup.append(body)
+
+         # Reconstruct elements
+         container = soup.body if soup.body else soup
+         for element in elements:
+             reconstructed = self._reconstruct_element(element, translated_map, soup)
+             if reconstructed:
+                 container.append(reconstructed)
+
+         # Post-processing
+         html_content = str(soup)
+
+         if self.config.cleanup_empty_elements:
+             html_content = self._cleanup_empty_elements(html_content)
+
+         logger.info(
+             "HTML reconstruction complete",
+             output_length=len(html_content)
+         )
+
+         return html_content
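+     # A minimal usage sketch (input values are illustrative):
+     #
+     #     reconstructor = ContentReconstructor()
+     #     elements = HTMLParser().parse_html("<p>Hello</p>")
+     #     html = reconstructor.reconstruct_html(elements, {"Hello": "ہیلو"})
+     #     # -> "<body><p>ہیلو</p></body>" (modulo whitespace cleanup)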
+
+     def _reconstruct_element(
+         self,
+         element: ContentElement,
+         translated_map: Dict[str, str],
+         soup: BeautifulSoup
+     ) -> Optional[Tag]:
+         """
+         Reconstruct a single element.
+
+         Args:
+             element: Content element to reconstruct
+             translated_map: Translation mapping
+             soup: BeautifulSoup document
+
+         Returns:
+             Reconstructed HTML tag
+         """
+         # Handle special content types
+         if element.element_type == ContentType.CODE:
+             return self._reconstruct_code_element(element, soup)
+         elif element.element_type == ContentType.IMAGE:
+             return self._reconstruct_image_element(element, soup)
+         elif element.element_type == ContentType.LINK:
+             return self._reconstruct_link_element(element, soup)
+         elif element.element_type == ContentType.METADATA:
+             return None  # Skip metadata
+
+         # Create the appropriate tag
+         tag = self._create_tag(element.element_type, soup, element)
+
+         # Add attributes
+         self._add_attributes(tag, element)
+
+         # Add content or children
+         if element.should_translate and element.element_type == ContentType.TEXT:
+             # Add translated text
+             translated_text = translated_map.get(element.content, element.content)
+             tag.string = translated_text
+
+             # Add translation markers if configured
+             if self.config.add_translation_markers:
+                 tag['data-translated'] = 'true'
+                 tag['data-original'] = element.content
+
+         elif element.children:
+             # Reconstruct children
+             for child in element.children:
+                 child_tag = self._reconstruct_element(child, translated_map, soup)
+                 if child_tag:
+                     tag.append(child_tag)
+
+         elif element.content:
+             # Add original content for non-translatable elements
+             tag.string = element.content
+             if element.element_type != ContentType.CODE:
+                 tag['data-preserve'] = 'true'
+
+         return tag
+
+     def _reconstruct_code_element(
+         self,
+         element: ContentElement,
+         soup: BeautifulSoup
+     ) -> Tag:
+         """Reconstruct a code element."""
+         # Determine whether it's inline or block code
+         is_inline = (
+             element.element_type == ContentType.INLINE_CODE or
+             not element.attributes.get('class', [])
+         )
+
+         if is_inline:
+             outer = tag = soup.new_tag("code")
+         else:
+             # Keep a reference to the outer <pre> so it is what gets returned;
+             # returning the inner <code> would detach it from its wrapper
+             outer = soup.new_tag("pre")
+             tag = soup.new_tag("code")
+             outer.append(tag)
+
+         # Add a language class if specified
+         if 'language' in element.attributes:
+             tag['class'] = f"language-{element.attributes['language']}"
+
+         # Add the original content untouched
+         tag.string = element.content
+         tag['data-preserve'] = 'true'
+
+         return outer
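+     # Illustrative outputs: an inline element becomes
+     #     <code data-preserve="true">x = 1</code>
+     # while a block element with attributes {'language': 'python'} becomes
+     #     <pre><code class="language-python" data-preserve="true">x = 1</code></pre>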
+
+     def _reconstruct_image_element(
+         self,
+         element: ContentElement,
+         soup: BeautifulSoup
+     ) -> Tag:
+         """Reconstruct an image element."""
+         tag = soup.new_tag("img")
+
+         # Add attributes
+         for attr, value in element.attributes.items():
+             if attr in ['src', 'alt', 'title', 'width', 'height', 'class', 'id']:
+                 tag[attr] = value
+
+         # Ensure essential attributes
+         if 'src' not in element.attributes and 'data-src' in element.attributes:
+             tag['src'] = element.attributes['data-src']
+
+         tag['data-preserve'] = 'true'
+         return tag
+
+     def _reconstruct_link_element(
+         self,
+         element: ContentElement,
+         soup: BeautifulSoup
+     ) -> Tag:
+         """Reconstruct a link element."""
+         tag = soup.new_tag("a")
+
+         # Add attributes
+         for attr, value in element.attributes.items():
+             if attr in ['href', 'title', 'target', 'class', 'id']:
+                 tag[attr] = value
+
+         # Add content (URLs are typically left untranslated)
+         tag.string = element.content
+         tag['data-preserve'] = 'true'
+
+         return tag
+
+     def _create_tag(self, element_type: ContentType, soup: BeautifulSoup, element=None) -> Tag:
+         """Create the appropriate HTML tag for an element type."""
+         tag_mapping = {
+             ContentType.TEXT: "p",
+             ContentType.HEADING: "p",  # Overridden below when a level attribute is present
+             ContentType.LIST: "ul",  # Default to an unordered list
+             ContentType.QUOTE: "blockquote",
+             ContentType.EMPHASIS: "em",
+             ContentType.STRONG: "strong",
+             ContentType.TABLE: "table",
+             ContentType.CODE: "code",
+         }
+
+         tag_name = tag_mapping.get(element_type, "div")
+
+         if element_type == ContentType.HEADING and element and 'level' in element.attributes:
+             level = element.attributes['level']
+             if isinstance(level, int) and 1 <= level <= 6:
+                 tag_name = f"h{level}"
+
+         return soup.new_tag(tag_name)
+
+     def _add_attributes(self, tag: Tag, element: ContentElement) -> None:
+         """Add attributes to a reconstructed tag."""
+         for attr, value in element.attributes.items():
+             # Skip internal attributes
+             if attr.startswith('_'):
+                 continue
+
+             # Skip content attributes
+             if attr in ['content', 'text']:
+                 continue
+
+             # Attribute filtering based on config
+             if attr == 'class' and not self.config.preserve_classes:
+                 continue
+             elif attr == 'id' and not self.config.preserve_ids:
+                 continue
+             elif attr.startswith('data-') and not self.config.preserve_data_attributes:
+                 continue
+             elif attr == 'style' and not self.config.preserve_style:
+                 continue
+
+             tag[attr] = value
+
+     def _cleanup_empty_elements(self, html: str) -> str:
+         """Remove empty elements from HTML."""
+         # Remove empty tags
+         html = re.sub(r'<([a-z]+)[^>]*>\s*</\1>', '', html)
+
+         # Collapse extra whitespace
+         html = re.sub(r'\s+', ' ', html)
+
+         # Clean up around tags
+         html = re.sub(r'>\s+<', '><', html)
+
+         return html.strip()
+
+     def inject_translated_text(
+         self,
+         html_content: str,
+         translated_segments: List[Dict[str, Any]]
+     ) -> str:
+         """
+         Inject translated text segments into HTML content.
+
+         Args:
+             html_content: Original HTML content
+             translated_segments: List of translated text segments with positions
+
+         Returns:
+             HTML content with translated text injected
+         """
+         logger.info(
+             "Injecting translated text",
+             segments_count=len(translated_segments)
+         )
+
+         # Sort segments by start offset, in reverse order so earlier
+         # replacements do not shift the indices of later ones
+         segments = sorted(translated_segments, key=lambda x: x.get('start', 0), reverse=True)
+
+         result = html_content
+         for segment in segments:
+             start = segment.get('start', 0)
+             end = segment.get('end', len(result))
+             translated_text = segment.get('translated_text', '')
+
+             # Replace the segment
+             result = result[:start] + translated_text + result[end:]
+
+         return result
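+     # Segment example (offsets are illustrative): with html_content
+     # "<p>Hello</p>", the segment
+     #     {'start': 3, 'end': 8, 'translated_text': 'ہیلو'}
+     # replaces characters 3..8 ("Hello"), yielding "<p>ہیلو</p>".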
+     def create_translation_markers(
+         self,
+         elements: List[ContentElement]
+     ) -> List[Dict[str, Any]]:
+         """
+         Create marker positions for text segments to be translated.
+
+         Args:
+             elements: Parsed content elements
+
+         Returns:
+             List of marker positions
+         """
+         markers = []
+         current_position = 0
+
+         for element in elements:
+             if element.should_translate and element.element_type == ContentType.TEXT:
+                 text = element.content
+                 if text.strip():
+                     markers.append({
+                         'start': current_position,
+                         'end': current_position + len(text),
+                         'original_text': text,
+                         'element_id': id(element)
+                     })
+                 current_position += len(text)
+
+         logger.info(
+             "Created translation markers",
+             markers_count=len(markers),
+             text_length=current_position
+         )
+
+         return markers
+
+     def validate_reconstruction(
+         self,
+         original_html: str,
+         reconstructed_html: str,
+         original_elements: List[ContentElement],
+         reconstructed_elements: List[ContentElement]
+     ) -> Dict[str, Any]:
+         """
+         Validate the reconstruction process.
+
+         Args:
+             original_html: Original HTML content
+             reconstructed_html: Reconstructed HTML content
+             original_elements: Original parsed elements
+             reconstructed_elements: Reconstructed elements
+
+         Returns:
+             Validation report
+         """
+         report = {
+             'is_valid': True,
+             'errors': [],
+             'warnings': [],
+             'stats': {
+                 'original_length': len(original_html),
+                 'reconstructed_length': len(reconstructed_html),
+                 'original_elements': len(original_elements),
+                 'reconstructed_elements': len(reconstructed_elements)
+             }
+         }
+
+         # Check element counts
+         original_types = self._count_elements_by_type(original_elements)
+         reconstructed_types = self._count_elements_by_type(reconstructed_elements)
+
+         for element_type, count in original_types.items():
+             reconstructed_count = reconstructed_types.get(element_type, 0)
+             if count != reconstructed_count:
+                 report['errors'].append(
+                     f"Element count mismatch for {element_type.value}: "
+                     f"original={count}, reconstructed={reconstructed_count}"
+                 )
+                 report['is_valid'] = False
+
+         # Check code block preservation
+         original_code = len([e for e in original_elements if e.element_type == ContentType.CODE])
+         reconstructed_code = len([e for e in reconstructed_elements if e.element_type == ContentType.CODE])
+
+         if original_code != reconstructed_code:
+             report['errors'].append(
+                 f"Code blocks not preserved: original={original_code}, reconstructed={reconstructed_code}"
+             )
+             report['is_valid'] = False
+
+         # Check for preserved attributes
+         preserved_attributes = self._check_preserved_attributes(
+             original_elements,
+             reconstructed_elements
+         )
+         if not preserved_attributes['all_preserved']:
+             report['warnings'].extend(preserved_attributes['missing_attributes'])
+
+         logger.info(
+             "Reconstruction validation complete",
+             is_valid=report['is_valid'],
+             errors_count=len(report['errors']),
+             warnings_count=len(report['warnings'])
+         )
+
+         return report
+
+     def _count_elements_by_type(self, elements: List[ContentElement]) -> Dict[ContentType, int]:
+         """Count elements by type."""
+         counts = {}
+         for element in elements:
+             counts[element.element_type] = counts.get(element.element_type, 0) + 1
+         return counts
+
+     def _check_preserved_attributes(
+         self,
+         original_elements: List[ContentElement],
+         reconstructed_elements: List[ContentElement]
+     ) -> Dict[str, Any]:
+         """Check if important attributes are preserved."""
+         result = {
+             'all_preserved': True,
+             'missing_attributes': []
+         }
+
+         important_attrs = ['id', 'class', 'href', 'src', 'alt']
+
+         # This is a simplified check - in practice, you'd want a more
+         # sophisticated element-by-element comparison
+         for orig_elem in original_elements:
+             for attr in important_attrs:
+                 if attr in orig_elem.attributes:
+                     result['missing_attributes'].append(
+                         f"Attribute '{attr}' may not be preserved in element {orig_elem.element_type.value}"
+                     )
+
+         if result['missing_attributes']:
+             result['all_preserved'] = False
+
+         return result
src/services/html_parser.py ADDED
@@ -0,0 +1,565 @@
+ """
+ HTML Parser for Translation Formatting Preservation.
+
+ This module parses HTML content to extract structure, identify
+ different content types, and prepare for translation while preserving
+ formatting.
+ """
+
+ import re
+ from typing import Dict, List, Optional, Any
+ from dataclasses import dataclass
+ from enum import Enum
+
+ from bs4 import BeautifulSoup, Tag, NavigableString
+ import markdown
+
+ from src.utils.translation_logger import get_translation_logger
+
+ logger = get_translation_logger(__name__)
+
+
+ class ContentType(Enum):
+     """Content types for translation handling."""
+     TEXT = "text"
+     CODE = "code"
+     HEADING = "heading"
+     LIST = "list"
+     LINK = "link"
+     IMAGE = "image"
+     TABLE = "table"
+     QUOTE = "quote"
+     EMPHASIS = "emphasis"
+     STRONG = "strong"
+     INLINE_CODE = "inline_code"
+     MATH = "math"
+     METADATA = "metadata"
+
+
+ @dataclass
+ class ContentElement:
+     """Represents a parsed content element."""
+     element_type: ContentType
+     content: str
+     attributes: Dict[str, Any]
+     children: List['ContentElement']
+     parent: Optional['ContentElement'] = None
+     should_translate: bool = True
+     preserve_formatting: bool = True
+     position: int = 0
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to a dictionary for serialization."""
+         return {
+             "type": self.element_type.value,
+             "content": self.content,
+             "attributes": self.attributes,
+             "children": [child.to_dict() for child in self.children],
+             "should_translate": self.should_translate,
+             "preserve_formatting": self.preserve_formatting,
+             "position": self.position
+         }
+
+
+ class HTMLParser:
+     """
+     HTML parser for translation with formatting preservation.
+
+     Features:
+     - Recursive HTML parsing
+     - Content type identification
+     - Code block detection and preservation
+     - Formatting marker injection
+     - Structure reconstruction support
+     """
+
+     # Code block patterns
+     CODE_BLOCK_PATTERNS = [
+         re.compile(r'```(\w+)?\n(.*?)\n```', re.DOTALL),  # Markdown code blocks
+         re.compile(r'<pre><code[^>]*>(.*?)</code></pre>', re.DOTALL | re.IGNORECASE),  # HTML pre/code blocks
+         re.compile(r'<code[^>]*>(.*?)</code>', re.DOTALL | re.IGNORECASE),  # Inline code
+     ]
+
+     # Special tags that should not be translated
+     NON_TRANSLATABLE_TAGS = {
+         'script', 'style', 'noscript', 'iframe', 'object', 'embed',
+         'svg', 'math', 'canvas', 'video', 'audio'
+     }
+
+     # Tags that preserve inner structure
+     STRUCTURE_PRESERVING_TAGS = {
+         'pre', 'code', 'kbd', 'samp', 'var'
+     }
+
+     # Formatting tags
+     FORMATTING_TAGS = {
+         'em', 'i', 'strong', 'b', 'mark', 'small', 'del', 'ins',
+         'sub', 'sup', 'u', 'tt'
+     }
+
+     def __init__(self):
+         """Initialize HTML parser."""
+         self.position_counter = 0
+         self.translation_markers = {
+             'start': '{{TRANSLATE_START}}',
+             'end': '{{TRANSLATE_END}}',
+             'skip': '{{SKIP_TRANSLATION}}'
+         }
+
+     def parse_html(
+         self,
+         html_content: str,
+         source_format: str = "html"
+     ) -> List[ContentElement]:
+         """
+         Parse HTML content into structured elements.
+
+         Args:
+             html_content: HTML content to parse
+             source_format: Format type (html, markdown, etc.)
+
+         Returns:
+             List of parsed content elements
+         """
+         logger.info(
+             "Parsing HTML content",
+             content_length=len(html_content),
+             source_format=source_format
+         )
+
+         # Convert markdown to HTML if needed
+         if source_format == "markdown":
+             html_content = markdown.markdown(
+                 html_content,
+                 extensions=['codehilite', 'tables', 'toc']
+             )
+
+         # Parse with BeautifulSoup
+         soup = BeautifulSoup(html_content, 'html.parser')
+
+         # Extract and parse elements
+         elements = []
+         self.position_counter = 0
+
+         for child in (soup.body.children if soup.body else soup.children):
+             element = self._parse_node(child)
+             if element:
+                 elements.append(element)
+
+         logger.info(
+             "HTML parsing complete",
+             elements_count=len(elements),
+             translate_elements=len([e for e in self._flatten_elements(elements) if e.should_translate])
+         )
+
+         return elements
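+     # A minimal usage sketch:
+     #
+     #     parser = HTMLParser()
+     #     elements = parser.parse_html("# Title\nSome text", source_format="markdown")
+     #     report = parser.generate_structure_report(elements)
+     #     print(report["total_elements"], report["headings"])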
+
+     def _parse_node(self, node) -> Optional[ContentElement]:
+         """
+         Parse a BeautifulSoup node into a content element.
+
+         Args:
+             node: BeautifulSoup node
+
+         Returns:
+             Parsed content element or None
+         """
+         if isinstance(node, NavigableString):
+             # Handle text content
+             text = str(node).strip()
+             if text:
+                 return ContentElement(
+                     element_type=ContentType.TEXT,
+                     content=text,
+                     attributes={},
+                     children=[],
+                     should_translate=True,
+                     preserve_formatting=False,
+                     position=self.position_counter
+                 )
+             return None
+
+         elif isinstance(node, Tag):
+             tag_name = node.name.lower()
+             attributes = dict(node.attrs)
+
+             # Determine the content type
+             element_type = self._determine_content_type(node, tag_name)
+
+             # Check whether the content should be translated
+             should_translate = self._should_translate_content(node, tag_name)
+
+             # Parse children
+             children = []
+             for child in node.children:
+                 child_element = self._parse_node(child)
+                 if child_element:
+                     child_element.parent = node  # type: ignore
+                     children.append(child_element)
+
+             # Create the element
+             element = ContentElement(
+                 element_type=element_type,
+                 content=node.get_text(strip=True) if should_translate else "",
+                 attributes=attributes,
+                 children=children,
+                 should_translate=should_translate,
+                 preserve_formatting=self._should_preserve_formatting(tag_name),
+                 position=self.position_counter
+             )
+
+             self.position_counter += 1
+             return element
+
+         return None
+
+     def _determine_content_type(self, node: Tag, tag_name: str) -> ContentType:
+         """Determine the content type of a node."""
+         # Code blocks
+         if tag_name == 'pre' or self._has_code_class(node):
+             return ContentType.CODE
+
+         # Block vs inline <code>
+         elif tag_name == 'code':
+             return ContentType.CODE if self._is_block_code(node) else ContentType.INLINE_CODE
+
+         # Headings
+         elif tag_name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
+             return ContentType.HEADING
+
+         # Lists
+         elif tag_name in ['ul', 'ol', 'li', 'dl', 'dt', 'dd']:
+             return ContentType.LIST
+
+         # Links
+         elif tag_name == 'a':
+             return ContentType.LINK
+
+         # Images
+         elif tag_name == 'img':
+             return ContentType.IMAGE
+
+         # Tables
+         elif tag_name in ['table', 'thead', 'tbody', 'tr', 'td', 'th']:
+             return ContentType.TABLE
+
+         # Quotes
+         elif tag_name in ['blockquote', 'q']:
+             return ContentType.QUOTE
+
+         # Inline formatting
+         elif tag_name in self.FORMATTING_TAGS:
+             if tag_name in ['em', 'i']:
+                 return ContentType.EMPHASIS
+             elif tag_name in ['strong', 'b']:
+                 return ContentType.STRONG
+             # Other formatting tags (mark, small, del, ins, ...) fall back to
+             # text; without this the branch would implicitly return None
+             return ContentType.TEXT
+
+         # Math
+         elif tag_name in ['math', 'mrow', 'mfrac', 'msqrt', 'mroot']:
+             return ContentType.MATH
+
+         # Metadata
+         elif tag_name in ['meta', 'title', 'head', 'style', 'script']:
+             return ContentType.METADATA
+
+         # Default to text
+         else:
+             return ContentType.TEXT
+
+     def _should_translate_content(self, node: Tag, tag_name: str) -> bool:
+         """Determine if content should be translated."""
+         # Don't translate non-translatable tags
+         if tag_name in self.NON_TRANSLATABLE_TAGS:
+             return False
+
+         # Don't translate code blocks
+         if tag_name == 'code' and (node.parent and node.parent.name == 'pre'):
+             return False
+
+         if tag_name == 'pre':
+             return False
+
+         # Don't translate if a class indicates code
+         if self._has_code_class(node):
+             return False
+
+         # Don't translate image alt text that's purely technical
+         if tag_name == 'img' and self._is_technical_alt_text(node.get('alt', '')):
+             return False
+
+         return True
+
+     def _should_preserve_formatting(self, tag_name: str) -> bool:
+         """Check if formatting should be preserved."""
+         return tag_name in (self.STRUCTURE_PRESERVING_TAGS | self.FORMATTING_TAGS)
+
+     def _has_code_class(self, node: Tag) -> bool:
+         """Check if a node has code-related classes."""
+         classes = node.get('class', [])
+         if isinstance(classes, str):
+             classes = [classes]
+
+         code_indicators = [
+             'language-', 'highlight', 'code', 'hljs', 'chroma',
+             'source-code', 'pre', 'verbatim', 'literal'
+         ]
+
+         return any(
+             any(indicator in cls for indicator in code_indicators)
+             for cls in classes
+         )
+
+     def _is_block_code(self, node: Tag) -> bool:
+         """Check if a code element is block-level code."""
+         return (
+             node.name == 'code' and
+             node.parent is not None and
+             node.parent.name == 'pre'
+         )
+
+     def _is_technical_alt_text(self, alt_text: str) -> bool:
+         """Check if alt text is purely technical."""
+         technical_indicators = [
+             'diagram', 'chart', 'graph', 'formula', 'equation',
+             'algorithm', 'flowchart', 'schema', 'architecture'
+         ]
+
+         return any(indicator in alt_text.lower() for indicator in technical_indicators)
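+     # Classification examples: <h2> -> HEADING, <code> inside <pre> -> CODE,
+     # a bare inline <code> -> INLINE_CODE, <em> -> EMPHASIS, and a tag with
+     # class="highlight" -> CODE via the class check above.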
+
+     def _flatten_elements(self, elements: List[ContentElement]) -> List[ContentElement]:
+         """Flatten nested elements into a single list."""
+         flattened = []
+         for element in elements:
+             flattened.append(element)
+             flattened.extend(self._flatten_elements(element.children))
+         return flattened
+
+     def extract_translatable_text(self, elements: List[ContentElement]) -> str:
+         """
+         Extract only the translatable text content from elements.
+
+         Args:
+             elements: Parsed content elements
+
+         Returns:
+             Concatenated translatable text
+         """
+         translatable_parts = []
+
+         for element in self._flatten_elements(elements):
+             if element.should_translate and element.element_type != ContentType.CODE:
+                 if element.element_type == ContentType.TEXT:
+                     # Trailing newline keeps adjacent text nodes from running together
+                     translatable_parts.append(element.content + '\n')
+                 else:
+                     # Add extra spacing before block elements
+                     if element.element_type == ContentType.HEADING:
+                         translatable_parts.append('\n')
+
+         return ''.join(translatable_parts).strip()
+
+     def inject_translation_markers(
+         self,
+         elements: List[ContentElement],
+         translated_text: str
+     ) -> List[ContentElement]:
+         """
+         Inject translation markers into elements for reconstruction.
+
+         Args:
+             elements: Original parsed elements
+             translated_text: Translated text content
+
+         Returns:
+             Elements with markers injected
+         """
+         # This is a simplified version - in practice, you'd want a more
+         # sophisticated mapping of translated text back to elements
+         translatable_elements = [
+             e for e in self._flatten_elements(elements)
+             if e.should_translate and e.element_type != ContentType.CODE
+         ]
+
+         if translatable_elements:
+             # Inject markers around the whole content
+             first = translatable_elements[0]
+             last = translatable_elements[-1]
+
+             # Add the start marker
+             first.attributes['_translation_start'] = True
+
+             # Add the end marker
+             last.attributes['_translation_end'] = True
+
+         return elements
+
+     def extract_code_blocks(self, html_content: str) -> List[Dict[str, Any]]:
+         """
+         Extract code blocks from HTML content.
+
+         Args:
+             html_content: HTML content to parse
+
+         Returns:
+             List of code block information
+         """
+         code_blocks = []
+         soup = BeautifulSoup(html_content, 'html.parser')
+
+         # Find all block-level code. Iterating over <pre> only avoids
+         # recording the same block twice (once for the <pre>, once for its
+         # inner <code>).
+         for pre_element in soup.find_all('pre'):
+             # The language class usually sits on the inner <code>, if present
+             code_element = pre_element.find('code') or pre_element
+
+             language = None
+             classes = code_element.get('class', []) or []
+
+             # Extract the language from classes
+             for cls in classes:
+                 if isinstance(cls, str):
+                     if cls.startswith('language-'):
+                         language = cls[9:]
+                     elif cls in ['python', 'javascript', 'java', 'cpp', 'html', 'css', 'sql']:
+                         language = cls
+
+             code_content = pre_element.get_text()
+             code_html = str(pre_element)
+
+             code_blocks.append({
+                 'language': language or 'text',
+                 'content': code_content,
+                 'html': code_html,
+                 'position': html_content.find(code_html)
+             })
+
+         logger.info(
+             "Code blocks extracted",
+             total_blocks=len(code_blocks),
+             languages=[cb['language'] for cb in code_blocks]
+         )
+
+         return code_blocks
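+     # Example return value for '<pre><code class="language-python">x = 1</code></pre>':
+     #     [{'language': 'python', 'content': 'x = 1',
+     #       'html': '<pre><code class="language-python">x = 1</code></pre>',
+     #       'position': 0}]
+     # (the exact 'html' string depends on how BeautifulSoup re-serializes the node)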
+
+     def preserve_code_blocks(
+         self,
+         html_content: str,
+         translated_content: str
+     ) -> str:
+         """
+         Preserve code blocks in translated content.
+
+         Args:
+             html_content: Original HTML with code blocks
+             translated_content: Translated HTML (code blocks might be altered)
+
+         Returns:
+             HTML with the original code blocks preserved
+         """
+         # Extract code blocks from both versions and swap each (possibly
+         # altered) translated block for its original counterpart, by order
+         original_blocks = self.extract_code_blocks(html_content)
+         translated_blocks = self.extract_code_blocks(translated_content)
+
+         result = translated_content
+         for original, translated in zip(original_blocks, translated_blocks):
+             result = result.replace(translated['html'], original['html'], 1)
+
+         logger.info(
+             "Code blocks preserved",
+             blocks_count=len(original_blocks)
+         )
+
+         return result
+
+     def validate_structure(
+         self,
+         original_elements: List[ContentElement],
+         translated_elements: List[ContentElement]
+     ) -> List[str]:
+         """
+         Validate that structure is preserved between the original and the translation.
+
+         Args:
+             original_elements: Original parsed elements
+             translated_elements: Translated parsed elements
+
+         Returns:
+             List of validation errors
+         """
+         errors = []
+
+         # Compare structure counts
+         original_types = self._count_element_types(original_elements)
+         translated_types = self._count_element_types(translated_elements)
+
+         for element_type, count in original_types.items():
+             if element_type != ContentType.TEXT:  # Text counts may differ
+                 translated_count = translated_types.get(element_type, 0)
+                 if count != translated_count:
+                     errors.append(
+                         f"Element count mismatch for {element_type.value}: "
+                         f"original={count}, translated={translated_count}"
+                     )
+
+         # Check that code blocks are preserved
+         original_code_blocks = len([
+             e for e in self._flatten_elements(original_elements)
+             if e.element_type == ContentType.CODE
+         ])
+         translated_code_blocks = len([
+             e for e in self._flatten_elements(translated_elements)
+             if e.element_type == ContentType.CODE
+         ])
+
+         if original_code_blocks != translated_code_blocks:
+             errors.append(
+                 f"Code block count mismatch: "
+                 f"original={original_code_blocks}, translated={translated_code_blocks}"
+             )
+
+         logger.info(
+             "Structure validation complete",
+             errors_count=len(errors),
+             element_types_matched=len(set(original_types.keys()) & set(translated_types.keys()))
+         )
+
+         return errors
+
+     def _count_element_types(self, elements: List[ContentElement]) -> Dict[ContentType, int]:
+         """Count occurrences of each element type."""
+         counts = {}
+         for element in self._flatten_elements(elements):
+             counts[element.element_type] = counts.get(element.element_type, 0) + 1
+         return counts
+
+     def generate_structure_report(
+         self,
+         elements: List[ContentElement]
+     ) -> Dict[str, Any]:
+         """
+         Generate a report of the content structure.
+
+         Args:
+             elements: Parsed content elements
+
+         Returns:
+             Structure report
+         """
+         flattened = self._flatten_elements(elements)
+         type_counts = self._count_element_types(elements)
+
+         report = {
+             "total_elements": len(flattened),
+             "element_types": {
+                 type_name.value: count
+                 for type_name, count in type_counts.items()
+             },
+             "translatable_elements": len([e for e in flattened if e.should_translate]),
+             "code_blocks": type_counts.get(ContentType.CODE, 0),
+             "headings": type_counts.get(ContentType.HEADING, 0),
+             "lists": type_counts.get(ContentType.LIST, 0),
+             "links": type_counts.get(ContentType.LINK, 0),
+             "images": type_counts.get(ContentType.IMAGE, 0),
+             "tables": type_counts.get(ContentType.TABLE, 0)
+         }
+
+         return report
src/services/openai_translation/__init__.py ADDED
@@ -0,0 +1,10 @@
+ """
+ OpenAI Translation Service with Gemini API.
+
+ This package provides translation services using the OpenAI Agents SDK
+ with the Gemini API for high-quality English to Urdu translation.
+ """
+
+ from .service import OpenAITranslationService
+
+ __all__ = ["OpenAITranslationService"]
src/services/openai_translation/client.py ADDED
@@ -0,0 +1,59 @@
+ """
+ OpenAI Agents SDK Client for the Gemini API.
+ """
+
+ import os
+
+ # AsyncOpenAI lives in the openai package; the agents package provides the
+ # chat-completions model wrapper
+ from openai import AsyncOpenAI
+ from agents import OpenAIChatCompletionsModel
+
+
+ class GeminiOpenAIClient:
+     """OpenAI Agents SDK client for the Gemini API."""
+
+     def __init__(self):
+         """Initialize the Gemini OpenAI client."""
+         api_key = os.getenv("GEMINI_API_KEY")
+         if not api_key:
+             raise ValueError("GEMINI_API_KEY not configured")
+
+         # Initialize an AsyncOpenAI client pointed at Gemini's
+         # OpenAI-compatible endpoint
+         self.provider = AsyncOpenAI(
+             base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
+             api_key=api_key,
+         )
+
+         # Define the chat completions model backed by Gemini
+         self.model = OpenAIChatCompletionsModel(
+             openai_client=self.provider,
+             model="gemini-2.0-flash-lite",
+         )
+
+     def get_provider(self) -> AsyncOpenAI:
+         """Get the AsyncOpenAI provider."""
+         return self.provider
+
+     def get_client(self) -> AsyncOpenAI:
+         """Get the AsyncOpenAI client (alias for get_provider)."""
+         return self.provider
+
+     def get_model(self) -> OpenAIChatCompletionsModel:
+         """Get the OpenAI chat completions model."""
+         return self.model
+
+     async def test_connection(self) -> bool:
+         """Test the connection to the Gemini API."""
+         try:
+             # Try a minimal completion request
+             await self.provider.chat.completions.create(
+                 model="gemini-2.0-flash-lite",
+                 messages=[{"role": "user", "content": "test"}],
+                 max_tokens=1
+             )
+             return True
+         except Exception as e:
+             print(f"Connection test failed: {str(e)}")
+             return False
+
+
+ def get_gemini_client() -> GeminiOpenAIClient:
+     """Get a Gemini client instance."""
+     return GeminiOpenAIClient()
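+ # A minimal connectivity check (assumes GEMINI_API_KEY is set in the environment):
+ #
+ #     import asyncio
+ #
+ #     client = get_gemini_client()
+ #     print("Gemini reachable:", asyncio.run(client.test_connection()))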
src/services/openai_translation/openai_agent.py ADDED
@@ -0,0 +1,533 @@
+ """
+ OpenAI Agents SDK Implementation for Translation.
+
+ This module implements translation using the OpenAI Agents SDK
+ with Gemini API integration, including error handling for rate limits.
+ """
+
+ import asyncio
+ from typing import Dict, List, Optional, Any, AsyncGenerator
+ from dataclasses import dataclass
+ import time
+ import json
+
+ from agents import Agent, Runner, function_tool, RunContextWrapper
+ # OpenAI's 429 error type, aliased so it does not clash with the project's
+ # own RateLimitError imported below
+ from openai import RateLimitError as OpenAIRateLimitError
+ from src.services.openai_translation.client import GeminiOpenAIClient, get_gemini_client
+ from src.models.translation_openai import TranslationJob, TranslationChunk
+ from src.utils.translation_logger import get_translation_logger
+ from src.utils.translation_errors import (
+     TranslationError, RateLimitError, APIError,
+     retry_with_exponential_backoff, handle_api_error
+ )
+
+ logger = get_translation_logger(__name__)
+
+
+ @dataclass
+ class TranslationContext:
+     """Context information for translation."""
+     page_url: Optional[str] = None
+     page_title: Optional[str] = None
+     document_type: Optional[str] = None  # book, article, documentation, etc.
+     technical_domain: Optional[str] = None  # AI, robotics, programming, etc.
+     target_audience: Optional[str] = None  # students, professionals, general
+     previous_translations: Optional[List[str]] = None
+     glossary: Optional[Dict[str, str]] = None
+     chunk_index: Optional[int] = None
+     total_chunks: Optional[int] = None
+
+
+ class OpenAITranslationAgent:
+     """
+     OpenAI Agents SDK-based translation agent with rate-limit-aware error handling.
+
+     Uses the OpenAI Agents SDK with the Gemini API for context-aware
+     translation backed by specialized tools.
+     """
+
+     def __init__(
+         self,
+         gemini_client: GeminiOpenAIClient,
+         model: str = "gemini-2.0-flash-lite"
+     ):
+         """
+         Initialize the translation agent.
+
+         Args:
+             gemini_client: Configured Gemini OpenAI client
+             model: Model name used for translation
+         """
+         self.client = gemini_client
+         self.model = model
+         self.agent = self._create_agent()
+
+         logger.info(
+             "OpenAI Translation Agent initialized",
+             model=model
+         )
+
+     def _create_agent(self) -> Agent:
+         """Create the translation agent with tools and error handling guidance."""
+         instructions = """
+         You are a professional translator specializing in technical content translation from English to Urdu.
+
+         Your primary task is to translate English content to Urdu while:
+         1. Maintaining technical accuracy
+         2. Using appropriate Urdu terminology
+         3. Preserving code blocks and technical identifiers
+         4. Providing contextually appropriate translations
+         5. Using Urdu script (Nastaleeq) for all Urdu text
+
+         Key Translation Guidelines:
+         - Translate ALL content unless explicitly marked as code
+         - Use Urdu script for all translations
+         - For technical terms, use established Urdu translations where available
+         - For brand new terms, create appropriate Urdu equivalents
+         - Maintain the original document structure and formatting
+         - Code blocks remain in English, but add Urdu comments if helpful
+
+         Technical Term Examples:
+         - AI → مصنوعی ذہانت
+         - Machine Learning → مشین لرننگ
+         - Robotics → روبوٹکس
+         - Computer Vision → کمپیوٹر ویژن
+         - Neural Network → نیورل نیٹ ورک
+         - Algorithm → الگورتھم
+
+         Error Handling:
+         - If you encounter rate limiting errors, wait and retry automatically
+         - If translation fails for a chunk, note the error and continue
+         - Always provide meaningful error messages
+
+         Always strive for natural, fluent Urdu that accurately conveys the technical meaning.
+         """
+
+         return Agent(
+             name="UrduTechnicalTranslator",
+             instructions=instructions,
+             # Route through the Gemini-backed model object rather than a bare
+             # model-name string, so requests go to the custom client instead
+             # of the default OpenAI provider
+             model=self.client.get_model(),
+             tools=[
+                 self._create_translate_tool(),
+                 self._create_analyze_code_tool(),
+                 self._create_glossary_tool(),
+                 self._create_context_tool()
+             ]
+         )
+
+     async def _handle_rate_limit_error(self, error: Exception) -> None:
+         """
+         Handle rate limit errors with a backoff sleep.
+
+         Args:
+             error: The rate limit error
+         """
+         if isinstance(error, OpenAIRateLimitError):
+             retry_after = getattr(error, 'retry_after', 1) or 1
+             logger.warning(
+                 "Rate limit hit, implementing backoff",
+                 retry_after=retry_after
+             )
+             await asyncio.sleep(retry_after)
+
+         # Handle a raw HTTP 429 from the underlying client
+         elif hasattr(error, 'status_code') and error.status_code == 429:
+             retry_after = 1
+             if hasattr(error, 'response') and error.response:
+                 try:
+                     error_data = error.response.json()
+                     retry_after = error_data.get('retry_after', retry_after)
+                 except Exception:
+                     pass
+
+             logger.warning(
+                 "HTTP 429 rate limit hit",
+                 retry_after=retry_after
+             )
+             await asyncio.sleep(retry_after)
+
+         # Fall back to any retry_after hint carried by a custom error type
+         elif hasattr(error, 'retry_after'):
+             await asyncio.sleep(getattr(error, 'retry_after', None) or 1)
+
+     async def translate_with_agent(
+         self,
+         text: str,
+         context: Optional[TranslationContext] = None
+     ) -> Dict[str, Any]:
+         """
+         Translate text using the OpenAI Agents SDK with rate-limit-aware retries.
+
+         Args:
+             text: Text to translate
+             context: Translation context
+
+         Returns:
+             Translation result with metadata
+         """
+         logger.info(
+             "Starting translation with OpenAI Agents SDK",
+             text_length=len(text),
+             has_context=bool(context)
+         )
+
+         # Prepare the context prompt
+         context_info = ""
+         if context:
+             if context.technical_domain:
+                 context_info += f"\nDomain: {context.technical_domain}"
+             if context.document_type:
+                 context_info += f"\nDocument Type: {context.document_type}"
+             if context.target_audience:
+                 context_info += f"\nTarget Audience: {context.target_audience}"
+             if context.chunk_index is not None:
+                 context_info += f"\nChunk: {context.chunk_index + 1} of {context.total_chunks or '?'}"
+
+         # Create the translation prompt
+         prompt = f"""
+         Translate the following English text to Urdu:
+
+         {context_info}
+
+         Text:
+         {text}
+
+         Requirements:
+         - Use Urdu script (Nastaleeq)
+         - Translate all non-code content
+         - Preserve formatting and structure
+         - Use appropriate technical terminology
+         - Maintain consistency with previous translations
+         """
+
+         try:
+             # Execute with retry logic around rate limits
+             max_retries = 3
+             for attempt in range(max_retries):
+                 try:
+                     # Runner.run is a classmethod in the Agents SDK; the agent
+                     # is passed as the first argument rather than constructing
+                     # a Runner instance
+                     result = await Runner.run(self.agent, prompt)
+
+                     # Extract metadata
+                     tokens_used = 0
+                     if hasattr(result, 'usage') and result.usage:
+                         tokens_used = result.usage.total_tokens
+
+                     return {
+                         "translated_text": result.final_output.strip(),
+                         "original_text": text,
+                         "tokens_used": tokens_used,
+                         "model": self.model,
+                         "confidence_score": 0.9,  # Placeholder
+                         "attempt": attempt + 1,
+                         "context": context_info
+                     }
+
+                 except (OpenAIRateLimitError, RateLimitError) as e:
+                     if attempt < max_retries - 1:
+                         await self._handle_rate_limit_error(e)
+                         continue
+                     else:
+                         raise RateLimitError(
+                             f"Rate limit exceeded after {max_retries} attempts",
+                             retry_after=getattr(e, 'retry_after', None)
+                         )
+
+                 except Exception as e:
+                     # Check whether it's an HTTP 429 error
+                     if hasattr(e, 'status_code') and e.status_code == 429:
+                         if attempt < max_retries - 1:
+                             await self._handle_rate_limit_error(e)
+                             continue
+                         else:
+                             raise RateLimitError(
+                                 f"Rate limit exceeded after {max_retries} attempts",
+                                 retry_after=getattr(e, 'retry_after', 1)
+                             )
+                     else:
+                         # Re-raise non-rate-limit errors
+                         raise
+
+         except RateLimitError:
+             raise
+         except Exception as e:
+             logger.error(
+                 "Agent translation failed",
+                 error=str(e),
+                 error_type=type(e).__name__
+             )
+             raise TranslationError(
+                 f"Translation failed: {str(e)}",
+                 error_type="AGENT_ERROR",
+                 details={"original_error": str(e)}
+             )
+
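+     # A minimal usage sketch (assumes GEMINI_API_KEY is configured):
+     #
+     #     agent = OpenAITranslationAgent(get_gemini_client())
+     #     result = asyncio.run(agent.translate_with_agent(
+     #         "Neural networks learn from data.",
+     #         TranslationContext(technical_domain="ai", document_type="book"),
+     #     ))
+     #     print(result["translated_text"], result["tokens_used"])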
+ def _create_translate_tool(self):
265
+ """Create the translate tool for the agent."""
266
+ @function_tool
267
+ async def translate_text(
268
+ ctx: RunContextWrapper[Any],
269
+ text: str,
270
+ context: Optional[Dict[str, Any]] = None,
271
+ preserve_formatting: bool = True
272
+ ) -> str:
273
+ """
274
+ Translate text from English to Urdu using the OpenAI client directly.
275
+
276
+ This is a fallback tool used by the agent for complex translations.
277
+ """
278
+ logger.debug(
279
+ "Using translate_text tool",
280
+ text_length=len(text)
281
+ )
282
+
283
+ try:
284
+ # Use the Gemini OpenAI client directly
285
+ client = self.client.get_client()
286
+
287
+ response = await client.chat.completions.create(
288
+ model=self.model,
289
+ messages=[
290
+ {
291
+ "role": "system",
292
+ "content": "You are a professional translator for technical content."
293
+ },
294
+ {
295
+ "role": "user",
296
+ "content": f"Translate to Urdu: {text}"
297
+ }
298
+ ],
299
+ temperature=0.3,
300
+ max_tokens=4000
301
+ )
302
+
303
+ return response.choices[0].message.content.strip()
304
+
305
+ except Exception as e:
306
+ if hasattr(e, 'status_code') and e.status_code == 429:
307
+ # Convert to OpenAI Agents SDK rate limit error
308
+ raise OpenAIRateLimitError(
309
+ "Rate limit exceeded",
310
+ retry_after=getattr(e, 'retry_after', 1)
311
+ )
312
+ raise
313
+
314
+ return translate_text
315
+
316
+ def _create_analyze_code_tool(self):
317
+ """Create the code analysis tool for the agent."""
318
+ @function_tool
319
+ async def analyze_code_blocks(
320
+ ctx: RunContextWrapper[Any],
321
+ text: str
322
+ ) -> List[Dict[str, Any]]:
323
+ """
324
+ Analyze text to identify and extract code blocks.
325
+ """
326
+ import re
327
+
328
+ # Pattern to match code blocks
329
+ code_pattern = re.compile(r'```(\w+)?\n(.*?)\n```', re.DOTALL)
330
+
331
+ code_blocks = []
332
+ for match in code_pattern.finditer(text):
333
+ language = match.group(1) or "text"
334
+ code_content = match.group(2)
335
+ start_pos = match.start()
336
+ end_pos = match.end()
337
+
338
+ code_blocks.append({
339
+ "language": language,
340
+ "content": code_content,
341
+ "start_position": start_pos,
342
+ "end_position": end_pos,
343
+ "length": len(code_content)
344
+ })
345
+
346
+ return code_blocks
347
+
348
+ return analyze_code_blocks
349
+
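A quick standalone check of the fence pattern used by this tool (a sketch; note the regex requires a newline before the closing fence, so one-line fences are not matched):

```python
import re

# Same pattern as analyze_code_blocks above
code_pattern = re.compile(r'```(\w+)?\n(.*?)\n```', re.DOTALL)

sample = "Intro text.\n```python\nprint('hi')\n```\nMore text."
for match in code_pattern.finditer(sample):
    print(match.group(1) or "text", repr(match.group(2)))
# -> python "print('hi')"
```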
350
+ def _create_glossary_tool(self):
351
+ """Create the glossary tool for the agent."""
352
+ @function_tool
353
+ async def get_translation_glossary(
354
+ ctx: RunContextWrapper[Any],
355
+ domain: Optional[str] = None
356
+ ) -> Dict[str, str]:
357
+ """
358
+ Get domain-specific translation glossary.
359
+ """
360
+ glossaries = {
361
+ "ai": {
362
+ "Artificial Intelligence": "مصنوعی ذہانت",
363
+ "Machine Learning": "مشین لرننگ",
364
+ "Deep Learning": "ڈیپ لرننگ",
365
+ "Neural Network": "نیورل نیٹورک",
366
+ "Algorithm": "الگورتھم",
367
+ "Model": "ماڈل",
368
+ "Training": "تربیت",
369
+ "Inference": "استنتاج",
370
+ "Dataset": "ڈیٹاسیٹ",
371
+ "Feature": "خصوصیت"
372
+ },
373
+ "robotics": {
374
+ "Robot": "روبوٹ",
375
+ "Actuator": "ایکچویٹر",
376
+ "Sensor": "سینسر",
377
+ "Kinematics": "کائنیمیٹکس",
378
+ "Path Planning": "پاتھ پلاننگ",
379
+ "Control System": "کنٹرول سسٹم",
380
+ "Embedded": "ایمبیڈڈ",
381
+ "Autonomous": "خودکار"
382
+ },
383
+ "programming": {
384
+ "Function": "فنکشن",
385
+ "Variable": "متغیر",
386
+ "Class": "کلاس",
387
+ "Object": "آبجیکٹ",
388
+ "Method": "میٹھڈ",
389
+ "Library": "لائبریری",
390
+ "Framework": "فریم ورک",
391
+ "API": "API",
392
+ "Database": "ڈیٹا بیس",
393
+ "Server": "سرور"
394
+ }
395
+ }
396
+
397
+ if domain and domain.lower() in glossaries:
398
+ return glossaries[domain.lower()]
399
+
400
+ # Return combined glossary for general use
401
+ combined = {}
402
+ for gloss in glossaries.values():
403
+ combined.update(gloss)
404
+
405
+ return combined
406
+
407
+ return get_translation_glossary
408
+
409
+ def _create_context_tool(self):
410
+ """Create the context tool for the agent."""
411
+ @function_tool
412
+ async def set_translation_context(
413
+ ctx: RunContextWrapper[Any],
414
+ page_url: Optional[str] = None,
415
+ document_type: Optional[str] = None,
416
+ technical_domain: Optional[str] = None,
417
+ target_audience: Optional[str] = None
418
+ ) -> Dict[str, Any]:
419
+ """
420
+ Set context for translation decisions.
421
+ """
422
+ context = {
423
+ "page_url": page_url,
424
+ "document_type": document_type,
425
+ "technical_domain": technical_domain,
426
+ "target_audience": target_audience,
427
+ "set_at": time.time()
428
+ }
429
+
430
+ logger.info(
431
+ "Translation context set via tool",
432
+ context=context
433
+ )
434
+
435
+ return {
436
+ "success": True,
437
+ "message": "Translation context updated successfully",
438
+ "context": context
439
+ }
440
+
441
+ return set_translation_context
442
+
443
+ async def translate_chunk_sequence(
444
+ self,
445
+ chunks: List[str],
446
+ context: Optional[TranslationContext] = None
447
+ ) -> List[Dict[str, Any]]:
448
+ """
449
+ Translate a sequence of chunks maintaining consistency.
450
+
451
+ Args:
452
+ chunks: List of text chunks to translate
453
+ context: Translation context
454
+
455
+ Returns:
456
+ List of translation results
457
+ """
458
+ logger.info(
459
+ "Translating chunk sequence with OpenAI Agents SDK",
460
+ chunk_count=len(chunks),
461
+ has_context=bool(context)
462
+ )
463
+
464
+ results = []
465
+ total_tokens = 0
466
+
467
+ for i, chunk in enumerate(chunks):
468
+ logger.debug(
469
+ "Translating chunk",
470
+ chunk_index=i,
471
+ chunk_length=len(chunk)
472
+ )
473
+
474
+ # Update context with chunk info
475
+ chunk_context = context
476
+ if chunk_context:
477
+ chunk_context.chunk_index = i
478
+ chunk_context.total_chunks = len(chunks)
479
+
480
+ try:
481
+ result = await self.translate_with_agent(chunk, chunk_context)
482
+ result["chunk_index"] = i
483
+ results.append(result)
484
+ total_tokens += result.get("tokens_used", 0)
485
+
486
+ except RateLimitError as e:
487
+ logger.error(
488
+ "Rate limit hit for chunk",
489
+ chunk_index=i,
490
+ retry_after=e.retry_after
491
+ )
492
+ # Add rate limit error result
493
+ results.append({
494
+ "chunk_index": i,
495
+ "translated_text": f"[RATE LIMIT ERROR: {str(e)}]",
496
+ "original_text": chunk,
497
+ "error": str(e),
498
+ "error_type": "RATE_LIMIT",
499
+ "tokens_used": 0,
500
+ "model": self.model,
501
+ "confidence_score": 0.0,
502
+ "retry_after": e.retry_after
503
+ })
504
+
505
+ except Exception as e:
506
+ logger.error(
507
+ "Chunk translation failed",
508
+ chunk_index=i,
509
+ error=str(e)
510
+ )
511
+ # Add failed result
512
+ results.append({
513
+ "chunk_index": i,
514
+ "translated_text": chunk, # Fallback to original
515
+ "original_text": chunk,
516
+ "error": str(e),
517
+ "tokens_used": 0,
518
+ "model": self.model,
519
+ "confidence_score": 0.0
520
+ })
521
+
522
+ logger.info(
523
+ "Chunk sequence translation completed",
524
+ total_chunks=len(chunks),
525
+ successful_chunks=len([r for r in results if not r.get("error")]),
526
+ total_tokens=total_tokens
527
+ )
528
+
529
+ return results
530
+
531
+ async def get_agent(self) -> Agent:
532
+ """Get the configured translation agent."""
533
+ return self.agent
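A minimal usage sketch for the agent above. This is a sketch only: the class name `OpenAITranslationAgent` and the import path are assumptions (the class definition sits earlier in the file), `get_gemini_client()` is assumed to read a configured Gemini API key, and this module's `TranslationContext` is assumed to default its `chunk_index`/`total_chunks` fields, since `translate_chunk_sequence` assigns them per chunk.

```python
import asyncio

# Import path and class name are illustrative; adjust to the actual module layout
from src.services.openai_translation.openai_agent import (
    OpenAITranslationAgent,
    TranslationContext,
)


async def main() -> None:
    agent = OpenAITranslationAgent()
    context = TranslationContext(technical_domain="ai", document_type="tutorial")
    results = await agent.translate_chunk_sequence(
        ["What is machine learning?", "It finds patterns in data."],
        context=context,
    )
    for result in results:
        # Failed chunks carry an "error" key instead of a real translation
        print(result["chunk_index"], result.get("error") or result["translated_text"])


asyncio.run(main())
```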
src/services/openai_translation/service.py ADDED
@@ -0,0 +1,855 @@
1
+ """
2
+ OpenAI Translation Service using Gemini API.
3
+
4
+ This service implements the core translation functionality by calling
5
+ the OpenAI SDK against Gemini's OpenAI-compatible endpoint.
6
+ """
7
+
8
+ import asyncio
9
+ import hashlib
10
+ import json
11
+ import time
12
+ import uuid
13
+ from datetime import datetime, timedelta
14
+ from typing import Dict, List, Optional, Any, AsyncGenerator
15
+ from dataclasses import dataclass
16
+
17
+ from openai import AsyncOpenAI
18
+ from openai.types.chat import ChatCompletion
19
+
20
+ from src.models.translation_openai import (
21
+ TranslationJob, TranslationChunk, TranslationError, TranslationSession,
22
+ TranslationCache, TranslationJobStatus, ChunkStatus, ErrorSeverity
23
+ )
24
+ from src.services.openai_translation.client import GeminiOpenAIClient, get_gemini_client
25
+ from src.services.cache_service import CacheService, get_cache_service
26
+ from src.database.base import get_db
27
+ from src.utils.translation_errors import (
28
+ TranslationError as TranslationServiceError, APIError, RateLimitError,
29
+ with_translation_error_handling, retry_with_exponential_backoff
30
+ )
31
+ from src.utils.translation_logger import get_translation_logger, log_translation_performance
32
+
33
+ logger = get_translation_logger(__name__)
34
+
35
+
36
+ @dataclass
37
+ class OpenAITranslationRequest:
38
+ """Translation request with comprehensive parameters."""
39
+ text: str
40
+ source_language: str
41
+ target_language: str
42
+ page_url: Optional[str] = None
43
+ user_id: Optional[str] = None
44
+ session_id: Optional[str] = None
45
+
46
+ # OpenAI parameters
47
+ model: str = "gemini-2.0-flash-lite"
48
+ temperature: float = 0.3
49
+ max_tokens: int = 2048
50
+
51
+ # Processing options
52
+ preserve_code_blocks: bool = True
53
+ enable_transliteration: bool = True
54
+ chunk_size: int = 2000
55
+ max_chunks: int = 100
56
+
57
+ # Retry settings
58
+ max_retries: int = 3
59
+ retry_delay: float = 1.0
60
+
61
+ # Streaming
62
+ streaming: bool = False
63
+
64
+ # Session context
65
+ user_agent: Optional[str] = None
66
+ ip_address: Optional[str] = None
67
+
68
+
69
+ @dataclass
70
+ class OpenAITranslationResponse:
71
+ """Translation response with comprehensive metadata."""
72
+ job_id: str
73
+ translated_text: str
74
+ status: TranslationJobStatus
75
+ progress: float # 0-100
76
+ chunks: List[Dict[str, Any]]
77
+ processing_time_ms: int
78
+ cached: bool
79
+
80
+ # Cost tracking
81
+ input_tokens: int
82
+ output_tokens: int
83
+ estimated_cost_usd: float
84
+
85
+ # Quality metrics
86
+ confidence_score: Optional[float] = None
87
+ quality_score: Optional[float] = None
88
+
89
+ # Error information
90
+ error_message: Optional[str] = None
91
+ error_details: Optional[Dict[str, Any]] = None
92
+
93
+ # Cache information
94
+ cache_key: Optional[str] = None
95
+ cache_hit: bool = False
96
+
97
+
98
+ class OpenAITranslationService:
99
+ """
100
+ Translation service using the OpenAI SDK with the Gemini API.
101
+
102
+ Features:
103
+ - OpenAI SDK chat completions with the Gemini 2.0 Flash Lite model
104
+ - Content chunking for large texts
105
+ - Enhanced caching with page URL support
106
+ - Progress tracking and streaming
107
+ - Error handling and retries
108
+ - Session management
109
+ - Cost and quality tracking
110
+ """
111
+
112
+ # Translation prompt templates
113
+ TRANSLATION_PROMPT_TEMPLATE = """
114
+ You are a professional translator. Translate the following text from {source_lang} to {target_lang}.
115
+
116
+ CRITICAL REQUIREMENTS:
117
+ 1. Translate ALL text to {target_lang} - no English words should remain
118
+ 2. ONLY preserve code blocks marked with ```
119
+ 3. Translate technical terms with context (e.g., AI → مصنوعی ذہانت)
120
+ 4. Use Urdu script (Nastaleeq) for Urdu text
121
+ 5. Maintain formatting and structure
122
+ 6. Mix Urdu with Roman Urdu for technical terms where appropriate
123
+
124
+ Text to translate:
125
+ {text}
126
+
127
+ Translate only the content above.
128
+ """
129
+
130
+ CHUNK_TRANSLATION_PROMPT = """
131
+ Translate this text segment from {source_lang} to {target_lang}.
132
+
133
+ Context: This is part {current_part} of {total_parts} of a larger document.
134
+
135
+ Requirements:
136
+ - Maintain consistency with the overall document
137
+ - Translate accurately while preserving meaning
138
+ - Handle technical terms appropriately
139
+ - Keep the flow natural
140
+ - Use Urdu script (Nastaleeq)
141
+
142
+ Text:
143
+ {text}
144
+
145
+ Translation:
146
+ """
147
+
148
+ # Model pricing (approximate USD per 1K tokens)
149
+ MODEL_PRICING = {
150
+ "gemini-2.0-flash-lite": {
151
+ "input": 0.000075, # $0.075 per 1M input tokens
152
+ "output": 0.00015 # $0.15 per 1M output tokens
153
+ }
154
+ }
155
+
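As a worked example of the cost formula applied in `_translate_with_gemini` below: at these rates, a request consuming 10,000 input tokens and 4,000 output tokens costs about $0.00135.

```python
pricing = {"input": 0.000075, "output": 0.00015}  # USD per 1K tokens

input_tokens, output_tokens = 10_000, 4_000
cost = (input_tokens / 1000) * pricing["input"] + (output_tokens / 1000) * pricing["output"]
print(f"${cost:.5f}")  # $0.00135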
156
+ def __init__(
157
+ self,
158
+ gemini_client: Optional[GeminiOpenAIClient] = None,
159
+ cache_service: Optional[CacheService] = None,
160
+ enable_analytics: bool = True
161
+ ):
162
+ """
163
+ Initialize OpenAI translation service.
164
+
165
+ Args:
166
+ gemini_client: Gemini OpenAI client
167
+ cache_service: Cache service instance
168
+ enable_analytics: Whether to collect detailed analytics
169
+ """
170
+ self.gemini_client = gemini_client
171
+ self.cache_service = cache_service
172
+ self.enable_analytics = enable_analytics
173
+
174
+ # Initialize services if not provided
175
+ if not self.gemini_client:
176
+ self.gemini_client = get_gemini_client()
177
+
178
+ if not self.cache_service:
179
+ self.cache_service = get_cache_service()
180
+
181
+ logger.info(
182
+ "OpenAI Translation Service initialized",
183
+ model="gemini-2.0-flash-lite",
184
+ analytics_enabled=enable_analytics
185
+ )
186
+
187
+ def _generate_content_hash(self, text: str, source_lang: str, target_lang: str) -> str:
188
+ """Generate SHA-256 hash for content identification."""
189
+ content = f"{text}:{source_lang}:{target_lang}"
190
+ return hashlib.sha256(content.encode('utf-8')).hexdigest()
191
+
192
+ def _generate_cache_key(self, content_hash: str, page_url: Optional[str] = None) -> str:
193
+ """Generate comprehensive cache key including page URL."""
194
+ if page_url:
195
+ url_hash = hashlib.sha256(page_url.encode('utf-8')).hexdigest()[:16]
196
+ return f"openai_translation:{content_hash}:{url_hash}"
197
+ return f"openai_translation:{content_hash}"
198
+
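For illustration, the two key shapes these helpers produce (the URL is hypothetical; hash values will differ per input):

```python
import hashlib

text, src, tgt = "Hello world", "en", "ur"
content_hash = hashlib.sha256(f"{text}:{src}:{tgt}".encode("utf-8")).hexdigest()

# Page-scoped key: appends the first 16 hex chars of the URL hash
url_hash = hashlib.sha256(b"https://example.com/docs/intro").hexdigest()[:16]
print(f"openai_translation:{content_hash}:{url_hash}")

# Page-agnostic key
print(f"openai_translation:{content_hash}")
```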
199
+ async def _check_cache(
200
+ self,
201
+ content_hash: str,
202
+ page_url: Optional[str] = None
203
+ ) -> Optional[TranslationCache]:
204
+ """Check if translation is cached in database."""
205
+ cache_key = self._generate_cache_key(content_hash, page_url)
206
+
207
+ db = next(get_db())
208
+ try:
209
+ cache_entry = db.query(TranslationCache).filter(
210
+ TranslationCache.cache_key == cache_key,
211
+ TranslationCache.expires_at > datetime.utcnow()
212
+ ).first()
213
+
214
+ if cache_entry:
215
+ # Update hit statistics
216
+ cache_entry.hit_count += 1
217
+ cache_entry.last_hit_at = datetime.utcnow()
218
+ db.commit()
219
+ logger.info(
220
+ "Cache hit found",
221
+ cache_key=cache_key[:20],
222
+ hits=cache_entry.hit_count
223
+ )
224
+ return cache_entry
225
+
226
+ finally:
227
+ db.close()
228
+
229
+ return None
230
+
231
+ async def _cache_translation(
232
+ self,
233
+ job: TranslationJob,
234
+ cache_key: str,
235
+ quality_score: Optional[float] = None
236
+ ) -> bool:
237
+ """Cache a successful translation."""
238
+ db = next(get_db())
239
+ try:
240
+
241
+ # Determine TTL based on quality
242
+ if quality_score and quality_score >= 4.5:
243
+ ttl_hours = 30 * 24 # 30 days for high quality
244
+ elif quality_score and quality_score < 3.0:
245
+ ttl_hours = 24 # 1 day for low quality
246
+ else:
247
+ ttl_hours = 7 * 24 # 7 days default
248
+
249
+ expires_at = datetime.utcnow() + timedelta(hours=ttl_hours)
250
+
251
+ cache_entry = TranslationCache(
252
+ cache_key=cache_key,
253
+ job_id=job.id,
254
+ content_hash=job.content_hash,
255
+ page_url=job.page_url,
256
+ source_language=job.source_language,
257
+ target_language=job.target_language,
258
+ original_text=job.original_text,
259
+ translated_text=job.translated_text,
260
+ model_version=job.model_name,
261
+ processing_time_ms=job.processing_time_ms,
262
+ ttl_hours=ttl_hours,
263
+ expires_at=expires_at,
264
+ quality_score=quality_score,
265
+ is_validated=quality_score is not None
266
+ )
267
+
268
+ db.add(cache_entry)
269
+ db.commit()
270
+
271
+ logger.info(
272
+ "Translation cached",
273
+ cache_key=cache_key[:20],
274
+ ttl_hours=ttl_hours
275
+ )
276
+ return True
277
+
278
+ except Exception as e:
279
+ logger.error("Failed to cache translation", error=str(e))
280
+ return False
281
+ finally:
282
+ db.close()
283
+
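The TTL policy above, mirrored as a standalone function for clarity (hours; same branches as `_cache_translation`):

```python
from typing import Optional


def cache_ttl_hours(quality_score: Optional[float]) -> int:
    # High-quality translations are kept longest; low-quality ones expire fast
    if quality_score is not None and quality_score >= 4.5:
        return 30 * 24  # 30 days
    if quality_score is not None and quality_score < 3.0:
        return 24  # 1 day
    return 7 * 24  # 7-day default


assert cache_ttl_hours(4.8) == 720
assert cache_ttl_hours(2.0) == 24
assert cache_ttl_hours(None) == 168
```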
284
+ async def _translate_with_gemini(
285
+ self,
286
+ text: str,
287
+ source_lang: str,
288
+ target_lang: str,
289
+ model: str,
290
+ temperature: float,
291
+ max_tokens: int,
292
+ is_chunk: bool = False,
293
+ context: Optional[Dict[str, Any]] = None
294
+ ) -> Dict[str, Any]:
295
+ """
296
+ Translate text using Gemini via OpenAI SDK.
297
+
298
+ Returns:
299
+ Dict containing translated_text, tokens_used, and response metadata
300
+ """
301
+ client = self.gemini_client.get_client()
302
+
303
+ try:
304
+ # Select appropriate prompt
305
+ if is_chunk and context:
306
+ prompt = self.CHUNK_TRANSLATION_PROMPT.format(
307
+ source_lang=source_lang,
308
+ target_lang=target_lang,
309
+ current_part=context.get('current_part', 1),
310
+ total_parts=context.get('total_parts', 1),
311
+ text=text
312
+ )
313
+ else:
314
+ prompt = self.TRANSLATION_PROMPT_TEMPLATE.format(
315
+ source_lang=source_lang,
316
+ target_lang=target_lang,
317
+ text=text
318
+ )
319
+
320
+ # Call Gemini API via OpenAI SDK
321
+ response = await client.chat.completions.create(
322
+ model=model,
323
+ messages=[
324
+ {"role": "system", "content": "You are a professional translator."},
325
+ {"role": "user", "content": prompt}
326
+ ],
327
+ temperature=temperature,
328
+ max_tokens=max_tokens
329
+ )
330
+
331
+ # Extract translation and metrics
332
+ translated_text = response.choices[0].message.content
333
+ input_tokens = response.usage.prompt_tokens
334
+ output_tokens = response.usage.completion_tokens
335
+
336
+ # Calculate cost
337
+ pricing = self.MODEL_PRICING.get(model, self.MODEL_PRICING["gemini-2.0-flash-lite"])
338
+ estimated_cost = (
339
+ (input_tokens / 1000 * pricing["input"]) +
340
+ (output_tokens / 1000 * pricing["output"])
341
+ )
342
+
343
+ return {
344
+ "translated_text": translated_text.strip() if translated_text else "",
345
+ "input_tokens": input_tokens,
346
+ "output_tokens": output_tokens,
347
+ "total_tokens": input_tokens + output_tokens,
348
+ "estimated_cost": estimated_cost,
349
+ "model": model,
350
+ "response_id": response.id
351
+ }
352
+
353
+ except Exception as e:
354
+ logger.error("Gemini API error", error=str(e))
355
+ raise TranslationServiceError(
356
+ f"Translation failed: {str(e)}",
357
+ error_type="API_ERROR",
358
+ is_retriable=True
359
+ )
360
+
361
+ def _split_text_into_chunks(
362
+ self,
363
+ text: str,
364
+ chunk_size: int,
365
+ max_chunks: int,
366
+ preserve_code_blocks: bool = True
367
+ ) -> List[Dict[str, Any]]:
368
+ """
369
+ Split text into chunks for processing.
370
+
371
+ Returns:
372
+ List of chunks with text, position, and metadata
373
+ """
374
+ chunks = []
375
+
376
+ if preserve_code_blocks:
377
+ # Handle code blocks separately
378
+ import re
379
+ code_pattern = re.compile(r'```(\w+)?\n(.*?)\n```', re.DOTALL)
380
+
381
+ last_end = 0
382
+ chunk_index = 0
383
+
384
+ for match in code_pattern.finditer(text):
385
+ # Process text before code block
386
+ text_before = text[last_end:match.start()]
387
+ if text_before:
388
+ text_chunks = self._split_plain_text(text_before, chunk_size - 200)
389
+ for chunk_text in text_chunks:
390
+ if chunk_index >= max_chunks:
391
+ break
392
+ chunks.append({
393
+ "text": chunk_text,
394
+ "start": last_end,
395
+ "end": last_end + len(chunk_text),
396
+ "is_code_block": False,
397
+ "index": chunk_index
398
+ })
399
+ chunk_index += 1
400
+ last_end += len(chunk_text)
401
+
402
+ # Add code block as separate chunk
403
+ if chunk_index < max_chunks:
404
+ code_lang = match.group(1) or "unknown"
405
+ code_content = match.group(2)
406
+ full_code = f"```{code_lang}\n{code_content}\n```"
407
+ chunks.append({
408
+ "text": full_code,
409
+ "start": match.start(),
410
+ "end": match.end(),
411
+ "is_code_block": True,
412
+ "code_language": code_lang,
413
+ "index": chunk_index
414
+ })
415
+ chunk_index += 1
416
+ last_end = match.end()
417
+
418
+ # Process remaining text
419
+ if last_end < len(text) and chunk_index < max_chunks:
420
+ remaining_text = text[last_end:]
421
+ text_chunks = self._split_plain_text(remaining_text, chunk_size)
422
+ for chunk_text in text_chunks:
423
+ if chunk_index >= max_chunks:
424
+ break
425
+ chunks.append({
426
+ "text": chunk_text,
427
+ "start": last_end,
428
+ "end": last_end + len(chunk_text),
429
+ "is_code_block": False,
430
+ "index": chunk_index
431
+ })
432
+ chunk_index += 1
433
+ last_end += len(chunk_text)
434
+ else:
435
+ # Simple text splitting
436
+ text_chunks = self._split_plain_text(text, chunk_size)
437
+ chunks = [
438
+ {
439
+ "text": chunk,
440
+ "start": i * chunk_size,
441
+ "end": (i + 1) * chunk_size,
442
+ "is_code_block": False,
443
+ "index": i
444
+ }
445
+ for i, chunk in enumerate(text_chunks[:max_chunks])
446
+ ]
447
+
448
+ return chunks
449
+
450
+ def _split_plain_text(self, text: str, chunk_size: int) -> List[str]:
451
+ """Split plain text into chunks, trying to preserve sentences."""
452
+ import re
453
+
454
+ chunks = []
455
+ sentences = re.split(r'(?<=[.!?])\s+', text)
456
+
457
+ current_chunk = ""
458
+ for sentence in sentences:
459
+ if len(current_chunk) + len(sentence) + 1 <= chunk_size:
460
+ current_chunk = f"{current_chunk} {sentence}" if current_chunk else sentence
461
+ else:
462
+ if current_chunk:
463
+ chunks.append(current_chunk)
464
+ current_chunk = sentence
465
+
466
+ if current_chunk:
467
+ chunks.append(current_chunk)
468
+
469
+ return chunks
470
+
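A standalone run of the sentence-aware splitter above (same logic, including the single space restored between packed sentences, which `re.split` consumes):

```python
import re


def split_plain_text(text: str, chunk_size: int) -> list:
    sentences = re.split(r"(?<=[.!?])\s+", text)
    chunks, current = [], ""
    for sentence in sentences:
        if len(current) + len(sentence) + 1 <= chunk_size:
            current = f"{current} {sentence}" if current else sentence
        else:
            if current:
                chunks.append(current)
            current = sentence
    if current:
        chunks.append(current)
    return chunks


print(split_plain_text("One. Two! Three? Four.", chunk_size=12))
# ['One. Two!', 'Three? Four.']
```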
471
+ @log_translation_performance
472
+ async def translate(
473
+ self,
474
+ request: OpenAITranslationRequest
475
+ ) -> OpenAITranslationResponse:
476
+ """
477
+ Translate text with comprehensive tracking and caching.
478
+
479
+ Args:
480
+ request: Translation request with all parameters
481
+
482
+ Returns:
483
+ Translation response with metadata
484
+ """
485
+ start_time = time.time()
486
+ job_id = str(uuid.uuid4())
487
+ content_hash = self._generate_content_hash(
488
+ request.text,
489
+ request.source_language,
490
+ request.target_language
491
+ )
492
+ cache_key = self._generate_cache_key(content_hash, request.page_url)
493
+
494
+ logger.bind_request(request_id=job_id).log_translation_request(
495
+ text_length=len(request.text),
496
+ source_lang=request.source_language,
497
+ target_lang=request.target_language,
498
+ page_url=request.page_url
499
+ )
500
+
501
+ # Check cache first
502
+ cached_translation = await self._check_cache(content_hash, request.page_url)
503
+ if cached_translation:
504
+ processing_time = int((time.time() - start_time) * 1000)
505
+
506
+ logger.log_translation_response(
507
+ translated_length=len(cached_translation.translated_text),
508
+ chunks_count=1,
509
+ cached=True
510
+ )
511
+
512
+ return OpenAITranslationResponse(
513
+ job_id=job_id,
514
+ translated_text=cached_translation.translated_text,
515
+ status=TranslationJobStatus.COMPLETED,
516
+ progress=100.0,
517
+ chunks=[],
518
+ processing_time_ms=processing_time,
519
+ cached=True,
520
+ input_tokens=0,
521
+ output_tokens=0,
522
+ estimated_cost_usd=0.0,
523
+ cache_key=cache_key,
524
+ cache_hit=True
525
+ )
526
+
527
+ # Create translation job
528
+ db = next(get_db())
529
+ try:
530
+ job = TranslationJob(
531
+ job_id=job_id,
532
+ user_id=request.user_id,
533
+ session_id=request.session_id,
534
+ content_hash=content_hash,
535
+ page_url=request.page_url,
536
+ source_language=request.source_language,
537
+ target_language=request.target_language,
538
+ original_text=request.text,
539
+ model_name=request.model,
540
+ temperature=request.temperature,
541
+ max_tokens=request.max_tokens,
542
+ preserve_code_blocks=request.preserve_code_blocks,
543
+ enable_transliteration=request.enable_transliteration,
544
+ chunk_size=request.chunk_size,
545
+ max_chunks=request.max_chunks,
546
+ user_agent=request.user_agent,
547
+ ip_address=request.ip_address
548
+ )
549
+
550
+ db.add(job)
551
+ db.commit()
552
+
553
+ # Split text into chunks
554
+ chunks_data = self._split_text_into_chunks(
555
+ request.text,
556
+ request.chunk_size,
557
+ request.max_chunks,
558
+ request.preserve_code_blocks
559
+ )
560
+
561
+ job.chunks_total = len(chunks_data)
562
+ job.status = TranslationJobStatus.PROCESSING.value
563
+ job.started_at = datetime.utcnow()
564
+ db.commit()
565
+
566
+ # Process chunks
567
+ translated_chunks = []
568
+ total_input_tokens = 0
569
+ total_output_tokens = 0
570
+ total_cost = 0.0
571
+
572
+ for i, chunk_data in enumerate(chunks_data):
573
+ try:
574
+ # Create chunk record
575
+ chunk = TranslationChunk(
576
+ job_id=job.id,
577
+ chunk_index=i,
578
+ original_text=chunk_data["text"],
579
+ start_position=chunk_data["start"],
580
+ end_position=chunk_data["end"],
581
+ is_code_block=chunk_data["is_code_block"],
582
+ code_language=chunk_data.get("code_language"),
583
+ word_count=len(chunk_data["text"].split()),
584
+ status=ChunkStatus.PROCESSING.value,
585
+ started_at=datetime.utcnow()
586
+ )
587
+ db.add(chunk)
588
+ db.commit()
589
+
590
+ # Translate or skip code blocks
591
+ if chunk_data["is_code_block"] and request.preserve_code_blocks:
592
+ translated_text = chunk_data["text"]
593
+ chunk.status = ChunkStatus.COMPLETED.value
594
+ chunk.translated_text = translated_text
595
+ chunk.completed_at = datetime.utcnow()
596
+ else:
597
+ # Translate chunk with retry logic
598
+ async def translate_chunk():
599
+ return await self._translate_with_gemini(
600
+ chunk_data["text"],
601
+ request.source_language,
602
+ request.target_language,
603
+ request.model,
604
+ request.temperature,
605
+ request.max_tokens,
606
+ is_chunk=True,
607
+ context={
608
+ "current_part": i + 1,
609
+ "total_parts": len(chunks_data)
610
+ } if len(chunks_data) > 1 else None
611
+ )
612
+
613
+ result = await retry_with_exponential_backoff(
614
+ translate_chunk,
615
+ max_retries=request.max_retries
616
+ )
617
+
618
+ translated_text = result["translated_text"]
619
+ chunk.translated_text = translated_text
620
+ chunk.input_tokens = result["input_tokens"]
621
+ chunk.output_tokens = result["output_tokens"]
622
+ chunk.status = ChunkStatus.COMPLETED.value
623
+ chunk.completed_at = datetime.utcnow()
624
+
625
+ total_input_tokens += result["input_tokens"]
626
+ total_output_tokens += result["output_tokens"]
627
+ total_cost += result["estimated_cost"]
628
+
629
+ # Update job progress
630
+ job.chunks_completed += 1
631
+ job.progress_percentage = (job.chunks_completed / job.chunks_total) * 100
632
+ db.commit()
633
+
634
+ # Add to response chunks
635
+ translated_chunks.append({
636
+ "index": i,
637
+ "original_text": chunk_data["text"],
638
+ "translated_text": translated_text,
639
+ "start_position": chunk_data["start"],
640
+ "end_position": chunk_data["end"],
641
+ "is_code_block": chunk_data["is_code_block"],
642
+ "code_language": chunk_data.get("code_language")
643
+ })
644
+
645
+ except Exception as e:
646
+ # Handle chunk error
647
+ chunk.status = ChunkStatus.FAILED.value
648
+ chunk.last_error = str(e)
649
+ job.chunks_failed += 1
650
+
651
+ # Log error
652
+ logger.log_error(e, chunk_index=i)
653
+
654
+ db.commit()
655
+ logger.error(f"Chunk {i} translation failed", error=str(e))
656
+
657
+ # Reconstruct final translation
658
+ final_translation = " ".join(chunk["translated_text"] for chunk in translated_chunks)  # chunk splitting consumed inter-chunk whitespace
659
+
660
+ # Update job completion
661
+ job.translated_text = final_translation
662
+ job.input_tokens = total_input_tokens
663
+ job.output_tokens = total_output_tokens
664
+ job.estimated_cost_usd = total_cost
665
+ job.status = (
666
+ TranslationJobStatus.COMPLETED.value
667
+ if job.chunks_failed == 0
668
+ else TranslationJobStatus.FAILED.value
669
+ )
670
+ job.completed_at = datetime.utcnow()
671
+ job.processing_time_ms = int((time.time() - start_time) * 1000)
672
+ job.progress_percentage = 100.0
673
+ db.commit()
674
+
675
+ # Cache successful translation
676
+ if job.chunks_failed == 0:
677
+ await self._cache_translation(job, cache_key)
678
+
679
+ processing_time = int((time.time() - start_time) * 1000)
680
+
681
+ logger.log_translation_response(
682
+ translated_length=len(final_translation),
683
+ chunks_count=len(translated_chunks),
684
+ tokens_used=total_input_tokens + total_output_tokens,
685
+ cost_usd=total_cost,
686
+ cached=False
687
+ )
688
+
689
+ logger.info(
690
+ "Translation completed",
691
+ job_id=job_id,
692
+ chunks=len(chunks_data),
693
+ failed=job.chunks_failed,
694
+ processing_time_ms=processing_time,
695
+ total_cost=total_cost
696
+ )
697
+
698
+ return OpenAITranslationResponse(
699
+ job_id=job_id,
700
+ translated_text=final_translation,
701
+ status=TranslationJobStatus(job.status),
702
+ progress=100.0,
703
+ chunks=translated_chunks,
704
+ processing_time_ms=processing_time,
705
+ cached=False,
706
+ input_tokens=total_input_tokens,
707
+ output_tokens=total_output_tokens,
708
+ estimated_cost_usd=total_cost,
709
+ cache_key=cache_key,
710
+ cache_hit=False,
711
+ error_message=(
712
+ f"{job.chunks_failed} chunks failed"
713
+ if job.chunks_failed > 0
714
+ else None
715
+ )
716
+ )
717
+
718
+ except Exception as e:
719
+ # Update job status to failed
720
+ if 'job' in locals():
721
+ job.status = TranslationJobStatus.FAILED.value
722
+ job.completed_at = datetime.utcnow()
723
+ db.commit()
724
+
725
+ logger.log_error(e, job_id=job_id)
726
+ raise TranslationServiceError(
727
+ f"Translation failed: {str(e)}",
728
+ error_type="SYSTEM_ERROR"
729
+ )
730
+
731
+ finally:
732
+ db.close()
733
+
734
+ async def get_translation_status(self, job_id: str) -> Dict[str, Any]:
735
+ """Get the status of a translation job."""
736
+ db = next(get_db())
737
+ try:
738
+ job = db.query(TranslationJob).filter(
739
+ TranslationJob.job_id == job_id
740
+ ).first()
741
+
742
+ if not job:
743
+ raise TranslationServiceError(
744
+ "Translation job not found",
745
+ error_type="VALIDATION_ERROR"
746
+ )
747
+
748
+ return {
749
+ "job_id": job.job_id,
750
+ "status": job.status,
751
+ "progress": float(job.progress_percentage),
752
+ "chunks_total": job.chunks_total,
753
+ "chunks_completed": job.chunks_completed,
754
+ "chunks_failed": job.chunks_failed,
755
+ "processing_time_ms": job.processing_time_ms,
756
+ "estimated_cost_usd": float(job.estimated_cost_usd),
757
+ "created_at": job.created_at.isoformat(),
758
+ "started_at": job.started_at.isoformat() if job.started_at else None,
759
+ "completed_at": job.completed_at.isoformat() if job.completed_at else None
760
+ }
761
+
762
+ finally:
763
+ db.close()
764
+
765
+ async def stream_translation_status(self, job_id: str) -> AsyncGenerator[Dict[str, Any], None]:
766
+ """Stream translation status updates."""
767
+ # Implementation for streaming status updates
768
+ # This would typically check status periodically and yield updates
769
+ yield {"type": "start", "job_id": job_id, "message": "Starting stream..."}
770
+
771
+ # In a real implementation, you would:
772
+ # 1. Get initial job status
773
+ # 2. Poll status changes
774
+ # 3. Yield updates as they occur
775
+ # 4. Close stream when job completes
776
+
777
+ async def check_cache(self, content_hash: str, page_url: Optional[str] = None) -> Optional[TranslationCache]:
778
+ """Check cache for translation."""
779
+ return await self._check_cache(content_hash, page_url)
780
+
781
+ def generate_cache_key(self, content_hash: str, page_url: Optional[str] = None) -> str:
782
+ """Generate cache key."""
783
+ return self._generate_cache_key(content_hash, page_url)
784
+
785
+ async def clear_cache(self, page_url: Optional[str] = None, older_than_hours: Optional[int] = None) -> int:
786
+ """Clear translation cache entries."""
787
+ db = next(get_db())
788
+ try:
789
+ query = db.query(TranslationCache)
790
+
791
+ if page_url:
792
+ query = query.filter(TranslationCache.page_url == page_url)
793
+
794
+ if older_than_hours:
795
+ cutoff_time = datetime.utcnow() - timedelta(hours=older_than_hours)
796
+ query = query.filter(TranslationCache.created_at < cutoff_time)
797
+
798
+ # Get count before deleting
799
+ count = query.count()
800
+
801
+ # Delete entries
802
+ query.delete()
803
+ db.commit()
804
+
805
+ logger.info(
806
+ "Cache cleared",
807
+ entries_deleted=count,
808
+ page_url=page_url,
809
+ older_than_hours=older_than_hours
810
+ )
811
+
812
+ return count
813
+
814
+ finally:
815
+ db.close()
816
+
817
+ async def health_check(self) -> bool:
818
+ """Check if the service is healthy."""
819
+ try:
820
+ # Test Gemini connection
821
+ await self.gemini_client.test_connection()
822
+ return True
823
+ except Exception as e:
824
+ logger.error("Health check failed", error=str(e))
825
+ return False
826
+
827
+ async def get_metrics(self, period: str = "24h") -> Dict[str, Any]:
828
+ """Get translation metrics."""
829
+ # Implementation would aggregate metrics from database
830
+ # This is a placeholder
831
+ return {
832
+ "period": period,
833
+ "total_requests": 0,
834
+ "successful_requests": 0,
835
+ "failed_requests": 0,
836
+ "cache_hit_rate": 0.0,
837
+ "avg_processing_time_ms": 0.0,
838
+ "total_cost_usd": 0.0
839
+ }
840
+
841
+
842
+ # Global service instance
843
+ _translation_service: Optional[OpenAITranslationService] = None
844
+
845
+
846
+ async def get_translation_service() -> OpenAITranslationService:
847
+ """Get or create OpenAI translation service instance."""
848
+ global _translation_service
849
+
850
+ if _translation_service is None:
851
+ _translation_service = OpenAITranslationService()
852
+ # The constructor already wires a client; re-point at the shared singleton client
853
+ _translation_service.gemini_client = get_gemini_client()
854
+
855
+ return _translation_service
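A minimal end-to-end sketch of the service (assumes a configured Gemini API key and a reachable database behind `get_db`; the page URL is illustrative):

```python
import asyncio

from src.services.openai_translation.service import (
    OpenAITranslationRequest,
    get_translation_service,
)


async def main() -> None:
    service = await get_translation_service()
    request = OpenAITranslationRequest(
        text="Machine learning models learn patterns from data.",
        source_language="en",
        target_language="ur",
        page_url="https://example.com/docs/intro",  # enables page-scoped caching
    )
    response = await service.translate(request)
    print(response.status, response.cached, f"${response.estimated_cost_usd:.5f}")
    print(response.translated_text)


asyncio.run(main())
```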
src/services/openai_translation/translation_agent.py ADDED
@@ -0,0 +1,198 @@
1
+ """
2
+ Simplified OpenAI Translation Agent using proper Runner.run pattern.
3
+ """
4
+
5
+ import asyncio
6
+ from typing import Dict, Optional, Any
7
+ from dataclasses import dataclass
8
+
9
+ from agents import Agent, Runner
10
+ from src.services.openai_translation.client import GeminiOpenAIClient, get_gemini_client
11
+ from src.utils.translation_logger import get_translation_logger
12
+
13
+ logger = get_translation_logger(__name__)
14
+
15
+
16
+ @dataclass
17
+ class TranslationContext:
18
+ """Context information for translation."""
19
+ page_url: Optional[str] = None
20
+ document_type: Optional[str] = None
21
+ technical_domain: Optional[str] = None
22
+ target_audience: Optional[str] = None
23
+
24
+
25
+ class OpenAITranslationAgent:
26
+ """
27
+ OpenAI Agents SDK-based translation agent using proper Runner.run pattern.
28
+ """
29
+
30
+ def __init__(
31
+ self,
32
+ gemini_client: Optional[GeminiOpenAIClient] = None,
33
+ model: str = "gemini-2.0-flash-lite"
34
+ ):
35
+ """Initialize translation agent."""
36
+ self.client = gemini_client or get_gemini_client()
37
+ self.model = model
38
+
39
+ # Create the agent with translation instructions
40
+ self.agent = Agent(
41
+ name="Translation Agent",
42
+ instructions=self._get_translation_instructions(),
43
+ model=self.client.get_model()
44
+ )
45
+
46
+ def _get_translation_instructions(self) -> str:
47
+ """Get the base translation instructions for the agent."""
48
+ return """
49
+ You are a professional translator specializing in English to Urdu translation.
50
+
51
+ CRITICAL REQUIREMENTS:
52
+ 1. Translate ALL text to Urdu - no English words should remain
53
+ 2. ONLY preserve code blocks marked with ```
54
+ 3. Translate technical terms with context (e.g., AI -> مصنوعی ذہانت)
55
+ 4. Use Urdu script (Nastaleeq) for Urdu text
56
+ 5. Maintain formatting and structure
57
+ 6. Mix Urdu with Roman Urdu for technical terms where appropriate
58
+
59
+ When translating:
60
+ - Use appropriate honorifics and politeness levels
61
+ - Translate idioms and expressions to their Urdu equivalents
62
+ - Preserve the meaning and tone of the original text
63
+ - Handle technical terminology correctly
64
+ - Ensure grammatical correctness in Urdu
65
+
66
+ Additional context will be provided as needed for specific domains.
67
+ """
68
+
69
+ async def translate_with_agent(
70
+ self,
71
+ text: str,
72
+ context: Optional[TranslationContext] = None,
73
+ user_id: Optional[str] = None
74
+ ) -> Dict[str, Any]:
75
+ """
76
+ Translate text using OpenAI Agents SDK with proper Runner.run pattern.
77
+
78
+ Args:
79
+ text: Text to translate
80
+ context: Translation context information
81
+ user_id: User ID for tracking
82
+
83
+ Returns:
84
+ Dictionary containing translation result
85
+ """
86
+ try:
87
+ # Build the prompt with context
88
+ prompt = self._build_translation_prompt(text, context)
89
+
90
+ logger.info(
91
+ "Starting translation with agent",
92
+ text_length=len(text),
93
+ context=context.document_type if context else None,
94
+ model=self.model
95
+ )
96
+
97
+ # Run the agent using the proper Runner.run pattern
98
+ result = await Runner.run(
99
+ self.agent,
100
+ prompt,
101
+ max_turns=1 # Single turn for simple translation
102
+ )
103
+
104
+ # Extract the translated text
105
+ translated_text = result.final_output
106
+
107
+ # Try to extract tokens from usage if available
108
+ tokens_used = 0
109
+ model_used = self.model
110
+
111
+ # The result might have usage information in different formats
112
+ if hasattr(result, 'usage') and result.usage:
113
+ tokens_used = result.usage.total_tokens if hasattr(result.usage, 'total_tokens') else 0
114
+ model_used = result.usage.model if hasattr(result.usage, 'model') else self.model
115
+
116
+ # Check if the translation contains code blocks
117
+ has_code_blocks = "```" in translated_text
118
+
119
+ # Extract code blocks if present
120
+ code_blocks = []
121
+ if has_code_blocks:
122
+ import re
123
+ code_pattern = re.compile(r'```(\w+)?\n(.*?)\n```', re.DOTALL)
124
+ code_blocks = [
125
+ {
126
+ "language": match.group(1) or "unknown",
127
+ "code": match.group(2)
128
+ }
129
+ for match in code_pattern.finditer(translated_text)
130
+ ]
131
+
132
+ logger.info(
133
+ "Translation completed successfully",
134
+ original_length=len(text),
135
+ translated_length=len(translated_text),
136
+ tokens_used=tokens_used,
137
+ has_code_blocks=has_code_blocks
138
+ )
139
+
140
+ return {
141
+ "translated_text": translated_text.strip(),
142
+ "original_text": text,
143
+ "tokens_used": tokens_used,
144
+ "model": model_used,
145
+ "confidence_score": 0.95, # Agent typically produces high-quality translations
146
+ "has_code_blocks": has_code_blocks,
147
+ "code_blocks": code_blocks,
148
+ "context_used": context is not None,
149
+ "processing_time_ms": 0, # Could track this if needed
150
+ "cache_hit": False
151
+ }
152
+
153
+ except Exception as e:
154
+ logger.error(
155
+ "Agent translation failed",
156
+ error=str(e),
157
+ error_type=type(e).__name__,
158
+ text_length=len(text)
159
+ )
160
+
161
+ # Re-raise with context
162
+ raise Exception(f"Translation failed: {str(e)}") from e
163
+
164
+ def _build_translation_prompt(
165
+ self,
166
+ text: str,
167
+ context: Optional[TranslationContext]
168
+ ) -> str:
169
+ """Build the translation prompt with context."""
170
+ prompt_parts = ["Translate the following text from English to Urdu:"]
171
+
172
+ # Add context information if provided
173
+ if context:
174
+ context_parts = []
175
+ if context.document_type:
176
+ context_parts.append(f"Document Type: {context.document_type}")
177
+ if context.technical_domain:
178
+ context_parts.append(f"Technical Domain: {context.technical_domain}")
179
+ if context.target_audience:
180
+ context_parts.append(f"Target Audience: {context.target_audience}")
181
+
182
+ if context_parts:
183
+ prompt_parts.append("\nContext:")
184
+ prompt_parts.append("\n".join(f"- {part}" for part in context_parts))
185
+
186
+ # Add the text to translate
187
+ prompt_parts.append(f"\n\nText to translate:\n{text}")
188
+
189
+ # Add instruction to translate only the content
190
+ prompt_parts.append("\n\nTranslate only the text above.")
191
+
192
+ return "\n".join(prompt_parts)
193
+
194
+
195
+ # Factory function
196
+ def create_translation_agent(model: str = "gemini-2.0-flash-lite") -> OpenAITranslationAgent:
197
+ """Create a translation agent instance."""
198
+ return OpenAITranslationAgent(model=model)
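And a matching sketch for this simplified agent (again assuming a configured Gemini API key behind `get_gemini_client`):

```python
import asyncio

from src.services.openai_translation.translation_agent import (
    TranslationContext,
    create_translation_agent,
)


async def main() -> None:
    agent = create_translation_agent()
    result = await agent.translate_with_agent(
        "Neural networks are trained on large datasets.",
        context=TranslationContext(technical_domain="ai"),
    )
    print(result["translated_text"])


asyncio.run(main())
```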