GitHub Actions committed on
Commit 457b685
Parent(s): 84b0fa3

Deploy backend from GitHub Actions

🚀 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <[email protected]>

This view is limited to 50 files because it contains too many changes.
- .env.example +159 -15
- .gitignore +3 -1
- OPENAI_AGENTS_FIX.md +348 -0
- alembic/env.py +5 -1
- alembic/versions/001_reader_features_tables.py +179 -0
- alembic/versions/004_add_translation_tables.py +114 -0
- alembic/versions/005_add_openai_translation_tables.py +295 -0
- create_translation_tables.py +47 -0
- fix_async_client.py +44 -0
- fix_jsonb.py +28 -0
- fix_translation_endpoint.py +45 -0
- fix_user_id_issue.py +34 -0
- fix_user_model.py +53 -0
- main.py +58 -3
- migrate_user_id.py +63 -0
- migrate_user_id_fixed.py +53 -0
- migration_summary_translation_tables.md +124 -0
- migrations/versions/001_create_openai_translation_tables.py +297 -0
- pyproject.toml +7 -1
- requirements.txt +7 -0
- src/api/v1/progress.py +450 -0
- src/api/v1/reader_features.py +94 -0
- src/api/v1/translation.py +336 -0
- src/config/logging_config.py +442 -0
- src/config/translation_config.py +432 -0
- src/database/base.py +1 -1
- src/middleware/auth.py +302 -0
- src/middleware/cors.py +356 -0
- src/middleware/rate_limit.py +385 -0
- src/models/__init__.py +29 -0
- src/models/auth.py +3 -0
- src/models/base.py +26 -0
- src/models/bookmark.py +53 -0
- src/models/chat.py +1 -1
- src/models/content_localization.py +50 -0
- src/models/personalization.py +64 -0
- src/models/reading_progress.py +33 -0
- src/models/search_index.py +30 -0
- src/models/translation_openai.py +512 -0
- src/models/user_preferences.py +54 -0
- src/services/cache_examples.py +231 -0
- src/services/cache_service.py +690 -0
- src/services/code_block_handler.py +630 -0
- src/services/content_reconstructor.py +471 -0
- src/services/html_parser.py +565 -0
- src/services/openai_translation/__init__.py +10 -0
- src/services/openai_translation/client.py +59 -0
- src/services/openai_translation/openai_agent.py +533 -0
- src/services/openai_translation/service.py +855 -0
- src/services/openai_translation/translation_agent.py +198 -0
.env.example
CHANGED

@@ -1,4 +1,13 @@
+# ============================================
+# Environment Configuration
+# ============================================
+# Environment: development, testing, staging, production
+ENVIRONMENT=development
+DEBUG=true
+
+# ============================================
 # Google OAuth Configuration
+# ============================================
 GOOGLE_CLIENT_ID=your-google-client-id
 GOOGLE_CLIENT_SECRET=your-google-client-secret
 # For production:

@@ -7,47 +16,182 @@ GOOGLE_CLIENT_SECRET=your-google-client-secret
 AUTH_REDIRECT_URI=http://localhost:3000/auth/google/callback
 FRONTEND_URL=http://localhost:3000
 
+# ============================================
 # JWT Configuration
+# ============================================
 JWT_SECRET_KEY=your-super-secret-jwt-key-at-least-32-characters-long
 JWT_ALGORITHM=HS256
 JWT_EXPIRE_MINUTES=10080 # 7 days
 
+# ============================================
 # Database Configuration
+# ============================================
 DATABASE_URL=sqlite:///./database/auth.db
+DB_POOL_SIZE=5
+DB_MAX_OVERFLOW=10
+DB_POOL_TIMEOUT=30
+DB_POOL_RECYCLE=3600
+DB_AUTO_MIGRATE=true
+
+# ============================================
+# Gemini API Configuration (for OpenAI SDK)
+# ============================================
+GEMINI_API_KEY=your-gemini-api-key
+GEMINI_BASE_URL=https://generativelanguage.googleapis.com/v1beta/openai/
+GEMINI_MODEL=gemini-2.0-flash-lite
+GEMINI_TIMEOUT=60
+GEMINI_MAX_RETRIES=3
+GEMINI_RETRY_DELAY=1.0
+GEMINI_HTTP2=true
+GEMINI_RPM=60
+GEMINI_RPH=1000
+
+# ============================================
+# OpenAI Agents SDK Configuration
+# ============================================
+OPENAI_AGENTS_ENABLED=true
+OPENAI_AGENTS_TRACING=false
+OPENAI_AGENTS_VERBOSE=false
+AGENT_DEFAULT_TEMPERATURE=0.3
+AGENT_MAX_TOKENS=2048
+AGENT_MAX_TURNS=5
+AGENT_HTML_TOOL=true
+AGENT_CODE_TOOL=true
+AGENT_QUALITY_TOOL=true
+AGENT_QUALITY_CHECK=true
+AGENT_CONFIDENCE_THRESHOLD=0.8
+
+# ============================================
+# Legacy OpenAI Configuration (for RAG)
+# ============================================
+OPENAI_API_KEY=your-openai-api-key
+OPENAI_MODEL=gpt-4.1-nano
+OPENAI_EMBEDDING_MODEL=text-embedding-3-small
+
+# ============================================
+# Cache Configuration
+# ============================================
+CACHE_BACKEND=memory
+CACHE_DEFAULT_TTL=168
+CACHE_HIGH_QUALITY_TTL=720
+CACHE_LOW_QUALITY_TTL=24
+REDIS_URL=redis://localhost:6379
+REDIS_PREFIX=translation:
+REDIS_MAX_CONNECTIONS=10
+CACHE_MEMORY_MAX_SIZE=1000
+CACHE_CLEANUP_INTERVAL=3600
+
+# ============================================
+# Rate Limiting Configuration
+# ============================================
+RATE_LIMIT_ENABLED=true
+RATE_LIMIT_RPM=60
+RATE_LIMIT_RPH=1000
+RATE_LIMIT_RPD=10000
+TRANSLATION_RPM=10
+TRANSLATION_RPH=500
+RATE_LIMIT_BLOCK_DURATION=3600
+RATE_LIMIT_WARNING_THRESHOLD=0.8
+RATE_LIMIT_REDIS=false
 
+# ============================================
 # API Configuration
+# ============================================
 API_HOST=0.0.0.0
 API_PORT=7860
 LOG_LEVEL=INFO
 
-#
-RATE_LIMIT_REQUESTS=60
-RATE_LIMIT_WINDOW=60
-
+# ============================================
 # CORS Configuration
+# ============================================
-#
+CORS_ORIGINS=http://localhost:3000,http://localhost:8080,https://mrowaisabdullah.github.io,https://huggingface.co
+CORS_METHODS=GET,POST,PUT,DELETE
+CORS_HEADERS=*
 
+# ============================================
+# Security Configuration
+# ============================================
+SECURITY_REQUIRE_API_KEY=false
+SECURITY_API_KEY_HEADER=X-API-Key
+SECURITY_MAX_TEXT_LENGTH=100000
+SECURITY_MAX_CHUNKS=100
+SECURITY_CONTENT_FILTER=true
+SECURITY_BLOCKED_PATTERNS=
+SECURITY_IP_WHITELIST=
+SECURITY_IP_BLACKLIST=
+
+# ============================================
+# Logging Configuration
+# ============================================
+LOG_FILE_ENABLED=true
+LOG_FILE_PATH=logs/translation.log
+LOG_FILE_ROTATION=1 day
+LOG_FILE_RETENTION=30 days
+LOG_MAX_FILE_SIZE=100 MB
+LOG_JSON_FORMAT=false
+LOG_INCLUDE_REQUEST_ID=true
+LOG_FILTER_SENSITIVE=true
+SENSITIVE_FIELDS=api_key,password,token,authorization
 
+# ============================================
+# Monitoring Configuration
+# ============================================
+MONITORING_ENABLED=true
+METRICS_ENDPOINT=/metrics
+METRICS_PORT=9090
+HEALTH_ENDPOINT=/health
+HEALTH_DETAILED=true
+TRACK_PERFORMANCE=true
+SLOW_QUERY_THRESHOLD=1000
+TRACK_ERRORS=true
+ERROR_SAMPLE_RATE=1.0
+
+# External Monitoring
+SENTRY_DSN=
+PROMETHEUS_GATEWAY=
+
-#
+# ============================================
+# Qdrant Configuration (for RAG)
+# ============================================
 QDRANT_URL=http://localhost:6333
 QDRANT_API_KEY=your-qdrant-api-key-if-needed
 
-#
+# ============================================
+# Content Configuration (for RAG)
+# ============================================
 BOOK_CONTENT_PATH=./book_content
 CHUNK_SIZE=1000
 CHUNK_OVERLAP=200
 
-#
+# ============================================
+# Conversation Context (for RAG)
+# ============================================
 MAX_CONTEXT_MESSAGES=3
 CONTEXT_WINDOW_SIZE=4000
 
-#
+# ============================================
+# Ingestion Configuration (for RAG)
+# ============================================
 BATCH_SIZE=100
 MAX_CONCURRENT_REQUESTS=10
 
-#
-
+# ============================================
+# Health Monitoring
+# ============================================
+HEALTH_CHECK_INTERVAL=30
+
+# ============================================
+# Proxy Configuration (Optional)
+# ============================================
+HTTP_PROXY=
+HTTPS_PROXY=
+
+# ============================================
+# Feature Flags
+# ============================================
+FEATURE_STREAMING=true
+FEATURE_QUALITY_CHECK=true
+FEATURE_CHUNKING=true
+FEATURE_CODE_PRESERVATION=true
+FEATURE_HTML_PRESERVATION=true
+FEATURE_BATCH_TRANSLATION=true
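All of these values arrive as strings from the environment; the parsing lives in the new `src/config/translation_config.py`. As a rough illustration of the kind of parsing involved (a hypothetical helper, not the committed code):

```python
import os

def env_bool(name: str, default: bool = False) -> bool:
    """Interpret '1', 'true', 'yes' (any case) as True."""
    return os.getenv(name, str(default)).strip().lower() in ("1", "true", "yes")

# A few of the variables defined in .env.example above
ENVIRONMENT = os.getenv("ENVIRONMENT", "development")
DEBUG = env_bool("DEBUG")
GEMINI_MAX_RETRIES = int(os.getenv("GEMINI_MAX_RETRIES", "3"))
RATE_LIMIT_RPM = int(os.getenv("RATE_LIMIT_RPM", "60"))
# Comma-separated value -> list of allowed origins
CORS_ORIGINS = [o.strip() for o in os.getenv("CORS_ORIGINS", "").split(",") if o.strip()]
```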
.gitignore
CHANGED

@@ -149,4 +149,6 @@ Thumbs.db
 
 # Test files
 test_output/
-test_reports/
+test_reports/
+
+.playwright-mcp
OPENAI_AGENTS_FIX.md
ADDED

@@ -0,0 +1,348 @@

# OpenAI Agents SDK Implementation Fix for Gemini API Quota Errors

## Problem Summary

The translation system was experiencing Gemini API quota exceeded errors (HTTP 429) due to several issues with the OpenAI Agents SDK implementation:

1. **Incorrect Package Name**: The code was importing from the `agents` package instead of the correct `openai-agents-sdk`
2. **Not Actually Using OpenAI Agents SDK**: Despite claiming to use the SDK, the implementation was using the OpenAI client directly
3. **Insufficient Rate Limit Handling**: Basic error handling that didn't properly implement exponential backoff
4. **Missing Per-User Rate Limiting**: No per-user or per-IP rate limiting to prevent quota exhaustion

## Solution Implementation

### 1. Fixed Package Dependencies

Updated `pyproject.toml`:

```toml
# Before
"openai-agents>=0.1.0"

# After
"openai-agents-sdk>=0.2.9"
```

### 2. Created Proper OpenAI Agents SDK Implementation

**File**: `src/services/openai_translation/openai_agent.py`

- Correct imports from `openai_agents_sdk`
- Proper agent implementation with tools
- Enhanced error handling for rate limits
- Exponential backoff with jitter
- Detailed error reporting

Key features:

```python
from openai_agents_sdk import Agent, Runner, function_tool, RunContextWrapper
from openai_agents_sdk.errors import RateLimitError as OpenAIRateLimitError
```

### 3. Enhanced Error Handling

**File**: `src/services/openai_translation/enhanced_service.py`

- Per-user rate limiting
- Exponential backoff implementation
- Detailed rate limit error responses
- Retry attempt tracking
- Backoff time accumulation

Example retry logic:

```python
for attempt in range(request.max_retries + 1):
    try:
        # API call
        result = await api_call()
        return result
    except RateLimitError as e:
        if attempt < request.max_retries:
            delay = min(
                request.retry_delay * (request.backoff_factor ** attempt),
                request.max_retry_delay
            )
            # Add jitter
            delay *= (0.5 + random.random() * 0.5)
            await asyncio.sleep(delay)
            continue
        else:
            raise
```

### 4. Enhanced API Endpoints

**File**: `src/api/v1/enhanced_translation.py`

- Proper HTTP 429 status codes
- Retry-After headers
- Detailed rate limit information
- Per-endpoint rate limiting

Example response:

```json
{
  "error": "RATE_LIMIT_EXCEEDED",
  "message": "User rate limit exceeded. Please wait 45.2 seconds.",
  "retry_after": 45.2,
  "rate_limit_info": {
    "retry_after": 45.2,
    "limit_type": "quota_exceeded",
    "user_id": "user123"
  },
  "timestamp": 1703847123.45
}
```

### 5. Rate Limiting Middleware

**File**: `src/middleware/rate_limit.py`

- Per-IP rate limiting
- Per-user rate limiting (if authenticated)
- Sliding window algorithm (see the sketch below)
- Redis support for distributed systems
- In-memory fallback
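The core of a sliding-window limiter like the one described can be very small. This is an illustrative in-memory sketch only; the committed `src/middleware/rate_limit.py` is larger and adds the Redis and per-user pieces:

```python
import time
from collections import defaultdict, deque

class SlidingWindowLimiter:
    """Allow at most `limit` requests per `window` seconds for each key (IP or user)."""

    def __init__(self, limit: int = 60, window: float = 60.0):
        self.limit = limit
        self.window = window
        self._hits: defaultdict[str, deque] = defaultdict(deque)

    def allow(self, key: str) -> bool:
        now = time.monotonic()
        hits = self._hits[key]
        # Evict timestamps that have slid out of the window
        while hits and now - hits[0] >= self.window:
            hits.popleft()
        if len(hits) >= self.limit:
            return False  # caller should respond with HTTP 429 + Retry-After
        hits.append(now)
        return True

limiter = SlidingWindowLimiter(limit=10, window=60.0)  # 10 requests/minute
print(limiter.allow("203.0.113.7"))  # True until this key's budget is spent
```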
## How to Use the Enhanced System

### 1. Update Your Environment

```bash
cd backend
pip install -e .
```

### 2. Update Your `.env` File

Make sure you have:

```env
GEMINI_API_KEY=your_gemini_api_key_here
GEMINI_BASE_URL=https://generativelanguage.googleapis.com/v1beta/openai/
GEMINI_MODEL=gemini-2.0-flash-lite
```

### 3. Add Rate Limiting to Your App

In your FastAPI app initialization:

```python
from src.middleware.rate_limit import TranslationRateLimitMiddleware

app.add_middleware(TranslationRateLimitMiddleware)
```

### 4. Use Enhanced Endpoints

Instead of `/translation/translate`, use the enhanced endpoint:

```http
POST /translation/translate
```

This provides better error handling and rate limit information.

## Rate Limit Configuration

Default limits:

- **Per IP**: 60 requests per minute, 1000 per hour
- **Per User (if authenticated)**: 10 translations per minute, 500 per hour
- **Translation Endpoints**: Stricter limits (10/min, 500/hour)

These can be configured via environment variables or in the middleware initialization.

## Monitoring and Metrics

The enhanced system provides detailed metrics:

```json
{
  "period": "24h",
  "total_requests": 1250,
  "successful_requests": 1180,
  "failed_requests": 45,
  "rate_limited_requests": 25,
  "cache_hit_rate": 0.35,
  "avg_processing_time_ms": 2340,
  "total_cost_usd": 2.45,
  "active_users": 15,
  "user_rate_limits": {
    "user123": {
      "requests_last_minute": 3,
      "last_reset": 1703847123.45
    }
  }
}
```

## Best Practices

1. **Handle Rate Limit Errors Properly**

   ```python
   try:
       result = await translate_text(text)
   except RateLimitError as e:
       print(f"Rate limited. Retry after {e.retry_after} seconds")
       await asyncio.sleep(e.retry_after)
       # Retry with backoff
   ```

2. **Use Caching When Possible**

   - The system automatically caches successful translations
   - Cache hits don't count against rate limits
   - Provide `page_url` for better cache keys (see the sketch after this list)

3. **Batch Large Translations**

   - The system automatically chunks large texts
   - Configure `chunk_size` and `max_chunks` appropriately
   - Monitor processing time to optimize chunk size

4. **Monitor Your Usage**

   - Use `/translation/metrics` endpoint (admin only)
   - Watch for rate limit errors in logs
   - Adjust retry settings based on your quota
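A plausible shape for such a content-addressed cache key, shown only to make the `page_url` advice concrete (a hypothetical helper; the committed `cache_service.py` may derive keys differently):

```python
import hashlib

def make_cache_key(text: str, source_lang: str, target_lang: str,
                   page_url: str | None = None) -> str:
    """Content-addressed key; page_url disambiguates identical snippets on different pages."""
    h = hashlib.sha256()
    h.update(text.encode("utf-8"))
    h.update(f"|{source_lang}->{target_lang}".encode("utf-8"))
    if page_url:
        h.update(f"|{page_url}".encode("utf-8"))
    return f"translation:{h.hexdigest()}"  # prefix matches REDIS_PREFIX in .env.example

print(make_cache_key("Hello world", "en", "ur", page_url="https://example.com/chapter-1"))
```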
## Testing the Fix

To test the rate limiting:

```python
import asyncio
import httpx

async def test_rate_limit():
    async with httpx.AsyncClient() as client:
        # Make rapid requests to trigger rate limit
        for i in range(15):
            response = await client.post(
                "http://localhost:8000/translation/translate",
                json={
                    "text": f"Test translation {i}",
                    "source_language": "en",
                    "target_language": "ur"
                }
            )
            print(f"Request {i}: Status {response.status_code}")
            if response.status_code == 429:
                retry_after = response.headers.get("Retry-After")
                print(f"Rate limited. Retry after {retry_after} seconds")
                break

asyncio.run(test_rate_limit())
```

## Troubleshooting

### Still Getting 429 Errors?

1. **Check Your Gemini API Quota**

   - Visit Google AI Studio
   - Verify your daily/monthly quota
   - Request quota increase if needed

2. **Implement Client-Side Rate Limiting**

   ```python
   import asyncio
   from asyncio import Semaphore

   # Limit concurrent requests
   semaphore = Semaphore(5)  # Max 5 concurrent requests

   async def translate_with_limit(text):
       async with semaphore:
           return await translate_text(text)
   ```

3. **Use Backoff in Your Client**

   ```python
   import backoff

   @backoff.on_exception(backoff.expo, RateLimitError, max_tries=3)
   async def safe_translate(text):
       return await translate_text(text)
   ```

### Performance Issues?

1. **Reduce Chunk Size**

   - Smaller chunks process faster
   - Less chance of timeout
   - Better error recovery

2. **Enable Caching**

   - Set `page_url` for content-based caching
   - Cache hits are instant
   - Reduces API usage

3. **Monitor Memory Usage**

   - Large translations use more memory
   - Consider streaming for very large texts
   - Implement pagination for batch jobs

## Migration Guide

To migrate from the old implementation:

1. **Update Dependencies**

   ```bash
   pip install openai-agents-sdk>=0.2.9
   ```

2. **Update Imports**

   ```python
   # Old
   from agents import Agent, Runner

   # New
   from openai_agents_sdk import Agent, Runner
   ```

3. **Update Error Handling**

   ```python
   # Old
   except Exception as e:
       if "429" in str(e):
           # Handle rate limit

   # New
   except RateLimitError as e:
       retry_after = e.retry_after
       # Handle with proper backoff
   ```

4. **Add Rate Limiting**

   ```python
   from src.middleware.rate_limit import TranslationRateLimitMiddleware

   app.add_middleware(TranslationRateLimitMiddleware)
   ```

## Conclusion

The enhanced OpenAI Agents SDK implementation provides:

- ✅ Correct package usage and imports
- ✅ Proper agent implementation with tools
- ✅ Robust rate limit error handling
- ✅ Exponential backoff with jitter
- ✅ Per-user and per-IP rate limiting
- ✅ Detailed error reporting and metrics
- ✅ Caching to reduce API usage
- ✅ Monitoring and health checks

This should significantly reduce Gemini API quota errors and provide a better user experience with proper error handling and retry logic.
alembic/env.py
CHANGED

@@ -10,7 +10,11 @@ sys.path.append(os.path.dirname(os.path.dirname(__file__)))
 
 # Import models
 from src.models.auth import Base
-
+# Import other models to register them with the Base metadata
+import src.models.chat
+import src.models.translation
+import src.models.personalization
+import src.models.content_localization
 
 # this is the Alembic Config object, which provides
 # access to the values within the .ini file in use.
alembic/versions/001_reader_features_tables.py
ADDED

@@ -0,0 +1,179 @@

"""Create tables for reader experience features

Revision ID: 003_reader_features_tables
Revises: 002_add_onboarding_tables
Create Date: 2025-01-09

"""
from alembic import op
import sqlalchemy as sa

# revision identifiers
revision = '003_reader_features_tables'
down_revision = '002_add_onboarding_tables'
branch_labels = None
depends_on = None

def upgrade():
    # Create reading_progress table
    op.create_table('reading_progress',
        sa.Column('id', sa.String(), nullable=False),
        sa.Column('user_id', sa.String(), nullable=False),
        sa.Column('chapter_id', sa.String(), nullable=False),
        sa.Column('section_id', sa.String(), nullable=False),
        sa.Column('position', sa.Float(), nullable=False),
        sa.Column('completed', sa.Boolean(), nullable=False, server_default='false'),
        sa.Column('time_spent', sa.Integer(), nullable=False, server_default='0'),
        sa.Column('last_accessed', sa.DateTime(), nullable=False),
        sa.Column('created_at', sa.DateTime(), nullable=False),
        sa.Column('updated_at', sa.DateTime(), nullable=False),
        sa.ForeignKeyConstraint(['user_id'], ['users.id'], ondelete='CASCADE'),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('user_id', 'chapter_id', 'section_id')
    )
    op.create_index('idx_reading_progress_user_chapter', 'reading_progress', ['user_id', 'chapter_id'])
    op.create_index('idx_reading_progress_last_accessed', 'reading_progress', ['last_accessed'])

    # Create bookmarks table
    op.create_table('bookmarks',
        sa.Column('id', sa.String(), nullable=False),
        sa.Column('user_id', sa.String(), nullable=False),
        sa.Column('chapter_id', sa.String(), nullable=False),
        sa.Column('section_id', sa.String(), nullable=True),
        sa.Column('page_url', sa.String(), nullable=False),
        sa.Column('page_title', sa.String(length=255), nullable=False),
        sa.Column('snippet', sa.String(), nullable=True),
        sa.Column('note', sa.String(length=1000), nullable=True),
        sa.Column('is_private', sa.Boolean(), nullable=False, server_default='true'),
        sa.Column('created_at', sa.DateTime(), nullable=False),
        sa.Column('updated_at', sa.DateTime(), nullable=False),
        sa.ForeignKeyConstraint(['user_id'], ['users.id'], ondelete='CASCADE'),
        sa.PrimaryKeyConstraint('id')
    )
    op.create_index('idx_bookmarks_user_created', 'bookmarks', ['user_id', 'created_at'])
    op.create_index('idx_bookmarks_chapter', 'bookmarks', ['chapter_id'])

    # Create bookmark_tags table
    op.create_table('bookmark_tags',
        sa.Column('id', sa.String(), nullable=False),
        sa.Column('bookmark_id', sa.String(), nullable=False),
        sa.Column('tag', sa.String(length=50), nullable=False),
        sa.Column('created_at', sa.DateTime(), nullable=False),
        sa.ForeignKeyConstraint(['bookmark_id'], ['bookmarks.id'], ondelete='CASCADE'),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('bookmark_id', 'tag')
    )
    op.create_index('idx_bookmark_tags_tag', 'bookmark_tags', ['tag'])

    # Create user_preferences table
    op.create_table('user_preferences',
        sa.Column('id', sa.String(), nullable=False),
        sa.Column('user_id', sa.String(), nullable=False),
        sa.Column('language', sa.String(), nullable=False),
        sa.Column('reading_pace', sa.String(), nullable=False),
        sa.Column('preferred_depth', sa.String(), nullable=False),
        sa.Column('show_code_examples', sa.Boolean(), nullable=False, server_default='true'),
        sa.Column('adaptive_difficulty', sa.Boolean(), nullable=False, server_default='false'),
        sa.Column('theme', sa.String(), nullable=False),
        sa.Column('font_size', sa.Integer(), nullable=False, server_default='16'),
        sa.Column('line_height', sa.Float(), nullable=False, server_default='1.5'),
        sa.Column('created_at', sa.DateTime(), nullable=False),
        sa.Column('updated_at', sa.DateTime(), nullable=False),
        sa.ForeignKeyConstraint(['user_id'], ['users.id'], ondelete='CASCADE'),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('user_id')
    )

    # Create user_custom_notes table
    op.create_table('user_custom_notes',
        sa.Column('id', sa.String(), nullable=False),
        sa.Column('user_preference_id', sa.String(), nullable=False),
        sa.Column('key', sa.String(), nullable=False),
        sa.Column('value', sa.String(), nullable=False),
        sa.Column('created_at', sa.DateTime(), nullable=False),
        sa.Column('updated_at', sa.DateTime(), nullable=False),
        sa.ForeignKeyConstraint(['user_preference_id'], ['user_preferences.id'], ondelete='CASCADE'),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('user_preference_id', 'key')
    )

    # Create content_localization table
    op.create_table('content_localization',
        sa.Column('id', sa.String(), nullable=False),
        sa.Column('content_id', sa.String(), nullable=False),
        sa.Column('language', sa.String(), nullable=False),
        sa.Column('title', sa.String(length=255), nullable=False),
        sa.Column('content', sa.String(), nullable=False),
        sa.Column('word_count', sa.Integer(), nullable=False, server_default='0'),
        sa.Column('reading_time_minutes', sa.Integer(), nullable=False, server_default='0'),
        sa.Column('last_updated', sa.DateTime(), nullable=False),
        sa.Column('translator', sa.String(), nullable=True),
        sa.Column('reviewed', sa.Boolean(), nullable=False, server_default='false'),
        sa.Column('created_at', sa.DateTime(), nullable=False),
        sa.Column('updated_at', sa.DateTime(), nullable=False),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('content_id', 'language')
    )
    op.create_index('idx_content_localization_language', 'content_localization', ['language'])
    op.create_index('idx_content_localization_content', 'content_localization', ['content_id'])

    # Create search_index table
    op.create_table('search_index',
        sa.Column('id', sa.String(), nullable=False),
        sa.Column('content_id', sa.String(), nullable=False),
        sa.Column('language', sa.String(), nullable=False),
        sa.Column('content_type', sa.String(), nullable=False),
        sa.Column('title', sa.String(), nullable=False),
        sa.Column('content', sa.String(), nullable=False),
        sa.Column('chapter_id', sa.String(), nullable=False),
        sa.Column('section_id', sa.String(), nullable=True),
        sa.Column('rank', sa.Float(), nullable=False, server_default='0.5'),
        sa.Column('indexed_at', sa.DateTime(), nullable=False),
        sa.PrimaryKeyConstraint('id')
    )
    op.create_index('idx_search_index_language_rank', 'search_index', ['language', 'rank'])
    op.create_index('idx_search_index_chapter', 'search_index', ['chapter_id'])

    # Create FTS virtual table for search
    op.execute("""
        CREATE VIRTUAL TABLE search_index_fts USING fts5(
            title,
            content,
            keywords,
            content=search_index
        )
    """)

    # Create FTS triggers
    op.execute("""
        CREATE TRIGGER search_index_ai AFTER INSERT ON search_index BEGIN
            INSERT INTO search_index_fts(rowid, title, content, keywords)
            VALUES (new.id, new.title, new.content, new.title || ' ' || new.content);
        END
    """)

    op.execute("""
        CREATE TRIGGER search_index_ad AFTER DELETE ON search_index BEGIN
            INSERT INTO search_index_fts(search_index_fts, rowid, title, content, keywords)
            VALUES ('delete', old.id, old.title, old.content, NULL);
        END
    """)

    op.execute("""
        CREATE TRIGGER search_index_au AFTER UPDATE ON search_index BEGIN
            DELETE FROM search_index_fts WHERE rowid = old.id;
            INSERT INTO search_index_fts(rowid, title, content, keywords)
            VALUES (new.id, new.title, new.content, new.title || ' ' || new.content);
        END
    """)

def downgrade():
    # Drop tables in reverse order
    op.drop_table('search_index')
    op.execute('DROP TABLE IF EXISTS search_index_fts')
    op.drop_table('content_localization')
    op.drop_table('user_custom_notes')
    op.drop_table('user_preferences')
    op.drop_table('bookmark_tags')
    op.drop_table('bookmarks')
    op.drop_table('reading_progress')
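Once the triggers above have populated it, the external-content FTS table is queried with MATCH. A small sqlite3 sketch (note the triggers store `new.id` as the FTS rowid, so this join assumes integer-valued ids; the database path is the SQLite file from `.env.example`):

```python
import sqlite3

conn = sqlite3.connect("database/auth.db")
rows = conn.execute(
    """
    SELECT s.chapter_id, s.title, bm25(search_index_fts) AS score
    FROM search_index_fts
    JOIN search_index AS s ON s.id = search_index_fts.rowid
    WHERE search_index_fts MATCH ?
    ORDER BY score      -- bm25() returns lower values for better matches
    LIMIT 10
    """,
    ("translation OR agents",),
).fetchall()
for chapter_id, title, score in rows:
    print(chapter_id, title, round(score, 2))
```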
alembic/versions/004_add_translation_tables.py
ADDED

@@ -0,0 +1,114 @@

"""Add translation tables and personalization features

Revision ID: 004_add_translation_tables
Revises: 003_reader_features_tables
Create Date: 2025-01-10

"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import sqlite

# revision identifiers
revision = '004_add_translation_tables'
down_revision = '003_reader_features_tables'
branch_labels = None
depends_on = None


def upgrade():
    # Create translations table
    op.create_table('translations',
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('content_hash', sa.String(length=64), nullable=False),
        sa.Column('source_language', sa.String(length=10), nullable=False),
        sa.Column('target_language', sa.String(length=10), nullable=False),
        sa.Column('original_text', sa.Text(), nullable=False),
        sa.Column('translated_text', sa.Text(), nullable=False),
        sa.Column('created_at', sa.DateTime(), nullable=False),
        sa.Column('updated_at', sa.DateTime(), nullable=False),
        sa.Column('translation_model', sa.String(length=50), nullable=False),
        sa.Column('character_count', sa.Integer(), nullable=False),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('content_hash')
    )
    op.create_index('idx_content_lookup', 'translations', ['content_hash', 'source_language', 'target_language'], unique=False)
    op.create_index(op.f('ix_translations_content_hash'), 'translations', ['content_hash'], unique=True)

    # Create translation_feedback table
    op.create_table('translation_feedback',
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('translation_id', sa.Integer(), nullable=False),
        sa.Column('user_id', sa.String(length=36), nullable=False),
        sa.Column('rating', sa.SmallInteger(), nullable=False),
        sa.Column('comment', sa.Text(), nullable=True),
        sa.Column('created_at', sa.DateTime(), nullable=False),
        sa.ForeignKeyConstraint(['translation_id'], ['translations.id'], ),
        sa.PrimaryKeyConstraint('id'),
        sa.CheckConstraint('rating IN (-1, 1)', name='check_rating_range')
    )
    op.create_index('idx_user_translation', 'translation_feedback', ['user_id', 'translation_id'], unique=True)

    # Create personalization_profiles table
    op.create_table('personalization_profiles',
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('user_id', sa.String(length=36), nullable=False),
        sa.Column('reading_level', sa.String(length=20), nullable=True),
        sa.Column('preferred_language', sa.String(length=10), nullable=True),
        sa.Column('focus_areas', sa.JSON(), nullable=True),
        sa.Column('learning_style', sa.String(length=20), nullable=True),
        sa.Column('enable_transliteration', sa.Boolean(), nullable=True),
        sa.Column('technical_term_handling', sa.String(length=20), nullable=True),
        sa.Column('font_size', sa.Integer(), nullable=True),
        sa.Column('focus_mode_preferences', sa.JSON(), nullable=True),
        sa.Column('created_at', sa.DateTime(), nullable=False),
        sa.Column('updated_at', sa.DateTime(), nullable=False),
        sa.Column('last_active', sa.DateTime(), nullable=False),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('user_id')
    )
    op.create_index(op.f('ix_personalization_profiles_user_id'), 'personalization_profiles', ['user_id'], unique=False)

    # Check if content_localization table exists before creating
    conn = op.get_bind()
    inspector = sa.inspect(conn)
    tables = inspector.get_table_names()

    if 'content_localization' not in tables:
        # Create content_localization table
        op.create_table('content_localization',
            sa.Column('id', sa.Integer(), nullable=False),
            sa.Column('content_url', sa.String(length=500), nullable=False),
            sa.Column('content_hash', sa.String(length=64), nullable=False),
            sa.Column('is_translated', sa.Boolean(), nullable=True),
            sa.Column('last_translation_date', sa.DateTime(), nullable=True),
            sa.Column('translation_cache_key', sa.String(length=64), nullable=True),
            sa.Column('word_count', sa.Integer(), nullable=True),
            sa.Column('character_count', sa.Integer(), nullable=True),
            sa.Column('has_code_blocks', sa.Boolean(), nullable=True),
            sa.Column('detected_languages', sa.JSON(), nullable=True),
            sa.Column('chunk_count', sa.Integer(), nullable=True),
            sa.Column('processing_status', sa.String(length=20), nullable=True),
            sa.Column('created_at', sa.DateTime(), nullable=False),
            sa.Column('updated_at', sa.DateTime(), nullable=False),
            sa.PrimaryKeyConstraint('id')
        )
        op.create_index(op.f('ix_content_localization_content_hash'), 'content_localization', ['content_hash'], unique=False)
        op.create_index(op.f('ix_content_localization_content_url'), 'content_localization', ['content_url'], unique=False)


def downgrade():
    # Drop tables in reverse order
    op.drop_index(op.f('ix_content_localization_content_url'), table_name='content_localization')
    op.drop_index(op.f('ix_content_localization_content_hash'), table_name='content_localization')
    op.drop_table('content_localization')

    op.drop_index(op.f('ix_personalization_profiles_user_id'), table_name='personalization_profiles')
    op.drop_table('personalization_profiles')

    op.drop_index('idx_user_translation', table_name='translation_feedback')
    op.drop_table('translation_feedback')

    op.drop_index(op.f('ix_translations_content_hash'), table_name='translations')
    op.drop_index('idx_content_lookup', table_name='translations')
    op.drop_table('translations')
alembic/versions/005_add_openai_translation_tables.py
ADDED

@@ -0,0 +1,295 @@

"""Add OpenAI Translation System tables

Revision ID: 005_add_openai_translation_tables
Revises: 004_add_translation_tables
Create Date: 2025-12-12

"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql, sqlite
import uuid

# revision identifiers
revision = '005_add_openai_translation_tables'
down_revision = '004_add_translation_tables'
branch_labels = None
depends_on = None


def upgrade():
    # Create translation_jobs table
    op.create_table('translation_jobs',
        sa.Column('id', sa.UUID(), nullable=False, default=uuid.uuid4),
        sa.Column('job_id', sa.String(length=255), nullable=False),
        sa.Column('user_id', sa.String(length=255), nullable=True),
        sa.Column('session_id', sa.String(length=255), nullable=True),
        sa.Column('page_url', sa.String(length=2048), nullable=True),
        sa.Column('content_hash', sa.String(length=64), nullable=False),
        sa.Column('source_language', sa.String(length=10), nullable=False, default='en'),
        sa.Column('target_language', sa.String(length=10), nullable=False, default='ur'),
        sa.Column('model_name', sa.String(length=100), nullable=False),
        sa.Column('temperature', sa.Float(), nullable=True),
        sa.Column('max_tokens', sa.Integer(), nullable=True),
        sa.Column('original_text', sa.Text(), nullable=False),
        sa.Column('translated_text', sa.Text(), nullable=True),
        sa.Column('status', sa.String(length=20), nullable=False, default='PENDING'),
        sa.Column('chunks_total', sa.Integer(), nullable=False, default=0),
        sa.Column('chunks_completed', sa.Integer(), nullable=False, default=0),
        sa.Column('chunks_failed', sa.Integer(), nullable=False, default=0),
        sa.Column('progress_percentage', sa.Float(), nullable=False, default=0.0),
        sa.Column('input_tokens', sa.Integer(), nullable=False, default=0),
        sa.Column('output_tokens', sa.Integer(), nullable=False, default=0),
        sa.Column('total_tokens', sa.Integer(), nullable=False, default=0),
        sa.Column('estimated_cost_usd', sa.Float(), nullable=False, default=0.0),
        sa.Column('processing_time_ms', sa.Integer(), nullable=False, default=0),
        sa.Column('preserve_code_blocks', sa.Boolean(), nullable=False, default=True),
        sa.Column('enable_transliteration', sa.Boolean(), nullable=False, default=True),
        sa.Column('chunk_size', sa.Integer(), nullable=False, default=2000),
        sa.Column('max_chunks', sa.Integer(), nullable=False, default=100),
        sa.Column('max_retries', sa.Integer(), nullable=False, default=3),
        sa.Column('user_agent', sa.Text(), nullable=True),
        sa.Column('ip_address', sa.String(length=45), nullable=True),
        sa.Column('created_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
        sa.Column('updated_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
        sa.Column('started_at', sa.DateTime(), nullable=True),
        sa.Column('completed_at', sa.DateTime(), nullable=True),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('job_id'),
        sa.CheckConstraint('chunks_total >= 0', name='check_chunks_total_non_negative'),
        sa.CheckConstraint('chunks_completed >= 0', name='check_chunks_completed_non_negative'),
        sa.CheckConstraint('chunks_failed >= 0', name='check_chunks_failed_non_negative'),
        sa.CheckConstraint('progress_percentage >= 0.0 AND progress_percentage <= 100.0', name='check_progress_percentage_range'),
        sa.CheckConstraint('chunk_size > 0', name='check_chunk_size_positive'),
        sa.CheckConstraint('max_chunks > 0', name='check_max_chunks_positive'),
        sa.CheckConstraint('max_retries >= 0', name='check_max_retries_non_negative')
    )
    op.create_index('ix_translation_jobs_job_id', 'translation_jobs', ['job_id'], unique=False)
    op.create_index('ix_translation_jobs_user_id', 'translation_jobs', ['user_id'], unique=False)
    op.create_index('ix_translation_jobs_session_id', 'translation_jobs', ['session_id'], unique=False)
    op.create_index('ix_translation_jobs_page_url', 'translation_jobs', ['page_url'], unique=False)
    op.create_index('ix_translation_jobs_content_hash', 'translation_jobs', ['content_hash'], unique=False)
    op.create_index('ix_translation_jobs_status', 'translation_jobs', ['status'], unique=False)
    op.create_index('ix_translation_jobs_created_at', 'translation_jobs', ['created_at'], unique=False)

    # Create translation_chunks table
    op.create_table('translation_chunks',
        sa.Column('id', sa.UUID(), nullable=False, default=uuid.uuid4),
        sa.Column('job_id', sa.UUID(), nullable=False),
        sa.Column('chunk_index', sa.Integer(), nullable=False),
        sa.Column('original_text', sa.Text(), nullable=False),
        sa.Column('translated_text', sa.Text(), nullable=True),
        sa.Column('status', sa.String(length=20), nullable=False, default='PENDING'),
        sa.Column('retry_count', sa.Integer(), nullable=False, default=0),
        sa.Column('start_position', sa.Integer(), nullable=False),
        sa.Column('end_position', sa.Integer(), nullable=False),
        sa.Column('is_code_block', sa.Boolean(), nullable=False, default=False),
        sa.Column('code_language', sa.String(length=50), nullable=True),
        sa.Column('word_count', sa.Integer(), nullable=False, default=0),
        sa.Column('token_count', sa.Integer(), nullable=False, default=0),
        sa.Column('input_tokens', sa.Integer(), nullable=False, default=0),
        sa.Column('output_tokens', sa.Integer(), nullable=False, default=0),
        sa.Column('processing_time_ms', sa.Integer(), nullable=False, default=0),
        sa.Column('last_error', sa.Text(), nullable=True),
        sa.Column('created_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
        sa.Column('updated_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
        sa.Column('started_at', sa.DateTime(), nullable=True),
        sa.Column('completed_at', sa.DateTime(), nullable=True),
        sa.ForeignKeyConstraint(['job_id'], ['translation_jobs.id'], ondelete='CASCADE'),
        sa.PrimaryKeyConstraint('id'),
        sa.CheckConstraint('chunk_index >= 0', name='check_chunk_index_non_negative'),
        sa.CheckConstraint('start_position >= 0', name='check_start_position_non_negative'),
        sa.CheckConstraint('end_position >= start_position', name='check_end_position_after_start'),
        sa.CheckConstraint('word_count >= 0', name='check_word_count_non_negative'),
        sa.CheckConstraint('token_count >= 0', name='check_token_count_non_negative'),
        sa.CheckConstraint('retry_count >= 0', name='check_retry_count_non_negative'),
        sa.UniqueConstraint('job_id', 'chunk_index', name='uq_job_chunk_index')
    )
    op.create_index('ix_translation_chunks_job_id', 'translation_chunks', ['job_id'], unique=False)
    op.create_index('ix_translation_chunks_status', 'translation_chunks', ['status'], unique=False)
    op.create_index('ix_translation_chunks_is_code_block', 'translation_chunks', ['is_code_block'], unique=False)

    # Create translation_cache table
    op.create_table('translation_cache',
        sa.Column('id', sa.UUID(), nullable=False, default=uuid.uuid4),
        sa.Column('cache_key', sa.String(length=255), nullable=False),
        sa.Column('job_id', sa.UUID(), nullable=True),
        sa.Column('content_hash', sa.String(length=64), nullable=False),
        sa.Column('page_url', sa.String(length=2048), nullable=True),
        sa.Column('url_hash', sa.String(length=32), nullable=True),
        sa.Column('source_language', sa.String(length=10), nullable=False),
        sa.Column('target_language', sa.String(length=10), nullable=False),
        sa.Column('original_text', sa.Text(), nullable=False),
        sa.Column('translated_text', sa.Text(), nullable=False),
        sa.Column('model_version', sa.String(length=100), nullable=True),
        sa.Column('processing_time_ms', sa.Integer(), nullable=False, default=0),
        sa.Column('translation_metadata', sa.JSON(), nullable=True),
        sa.Column('quality_score', sa.Float(), nullable=True),
        sa.Column('confidence_score', sa.Float(), nullable=True),
        sa.Column('is_validated', sa.Boolean(), nullable=False, default=False),
        sa.Column('hit_count', sa.Integer(), nullable=False, default=0),
        sa.Column('last_hit_at', sa.DateTime(), nullable=True),
        sa.Column('ttl_hours', sa.Integer(), nullable=False, default=24),
        sa.Column('priority', sa.String(length=10), nullable=False, default='MEDIUM'),
        sa.Column('expires_at', sa.DateTime(), nullable=False),
        sa.Column('created_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
        sa.Column('updated_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
        sa.ForeignKeyConstraint(['job_id'], ['translation_jobs.id']),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('cache_key'),
        sa.CheckConstraint('quality_score >= 0.0 AND quality_score <= 5.0', name='check_quality_score_range'),
        sa.CheckConstraint('confidence_score >= 0.0 AND confidence_score <= 1.0', name='check_confidence_score_range'),
        sa.CheckConstraint('hit_count >= 0', name='check_hit_count_non_negative'),
        sa.CheckConstraint('ttl_hours > 0', name='check_ttl_hours_positive')
    )
    op.create_index('ix_translation_cache_cache_key', 'translation_cache', ['cache_key'], unique=False)
    op.create_index('ix_translation_cache_content_hash', 'translation_cache', ['content_hash'], unique=False)
    op.create_index('ix_translation_cache_page_url', 'translation_cache', ['page_url'], unique=False)
    op.create_index('ix_translation_cache_url_hash', 'translation_cache', ['url_hash'], unique=False)
    op.create_index('ix_translation_cache_expires_at', 'translation_cache', ['expires_at'], unique=False)
    op.create_index('ix_translation_cache_priority', 'translation_cache', ['priority'], unique=False)

    # Create translation_errors table
    op.create_table('translation_errors',
        sa.Column('id', sa.UUID(), nullable=False, default=uuid.uuid4),
        sa.Column('error_id', sa.String(length=255), nullable=False),
        sa.Column('job_id', sa.UUID(), nullable=True),
        sa.Column('chunk_id', sa.UUID(), nullable=True),
        sa.Column('error_type', sa.String(length=50), nullable=False),
        sa.Column('error_code', sa.String(length=100), nullable=True),
        sa.Column('error_message', sa.Text(), nullable=False),
        sa.Column('error_details', sa.JSON(), nullable=True),
        sa.Column('severity', sa.String(length=10), nullable=False),
        sa.Column('category', sa.String(length=50), nullable=False, default='translation'),
        sa.Column('is_retriable', sa.Boolean(), nullable=False, default=True),
        sa.Column('retry_count', sa.Integer(), nullable=False, default=0),
        sa.Column('max_retries', sa.Integer(), nullable=False, default=3),
        sa.Column('next_retry_at', sa.DateTime(), nullable=True),
        sa.Column('is_resolved', sa.Boolean(), nullable=False, default=False),
        sa.Column('resolution_notes', sa.Text(), nullable=True),
        sa.Column('resolved_at', sa.DateTime(), nullable=True),
        sa.Column('created_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
        sa.Column('updated_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
        sa.ForeignKeyConstraint(['job_id'], ['translation_jobs.id']),
        sa.ForeignKeyConstraint(['chunk_id'], ['translation_chunks.id']),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('error_id'),
        sa.CheckConstraint('retry_count >= 0', name='check_error_retry_count_non_negative'),
        sa.CheckConstraint('max_retries >= 0', name='check_error_max_retries_non_negative')
    )
    op.create_index('ix_translation_errors_error_id', 'translation_errors', ['error_id'], unique=False)
    op.create_index('ix_translation_errors_job_id', 'translation_errors', ['job_id'], unique=False)
    op.create_index('ix_translation_errors_chunk_id', 'translation_errors', ['chunk_id'], unique=False)
    op.create_index('ix_translation_errors_error_type', 'translation_errors', ['error_type'], unique=False)
    op.create_index('ix_translation_errors_severity', 'translation_errors', ['severity'], unique=False)
|
| 185 |
+
op.create_index('ix_translation_errors_created_at', 'translation_errors', ['created_at'], unique=False)
|
| 186 |
+
|
| 187 |
+
# Create translation_sessions table
|
| 188 |
+
op.create_table('translation_sessions',
|
| 189 |
+
sa.Column('id', sa.UUID(), nullable=False, default=uuid.uuid4),
|
| 190 |
+
sa.Column('session_id', sa.String(length=255), nullable=False),
|
| 191 |
+
sa.Column('user_id', sa.String(length=255), nullable=True),
|
| 192 |
+
sa.Column('source_language', sa.String(length=10), nullable=False, default='en'),
|
| 193 |
+
sa.Column('target_language', sa.String(length=10), nullable=False, default='ur'),
|
| 194 |
+
sa.Column('preferred_model', sa.String(length=100), nullable=True),
|
| 195 |
+
sa.Column('request_count', sa.Integer(), nullable=False, default=0),
|
| 196 |
+
sa.Column('character_count', sa.Integer(), nullable=False, default=0),
|
| 197 |
+
sa.Column('total_cost_usd', sa.Float(), nullable=False, default=0.0),
|
| 198 |
+
sa.Column('session_data', sa.JSON(), nullable=True),
|
| 199 |
+
sa.Column('user_agent', sa.Text(), nullable=True),
|
| 200 |
+
sa.Column('ip_address', sa.String(length=45), nullable=True),
|
| 201 |
+
sa.Column('created_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
|
| 202 |
+
sa.Column('updated_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
|
| 203 |
+
sa.Column('last_activity_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
|
| 204 |
+
sa.Column('expires_at', sa.DateTime(), nullable=False),
|
| 205 |
+
sa.PrimaryKeyConstraint('id'),
|
| 206 |
+
sa.UniqueConstraint('session_id'),
|
| 207 |
+
sa.CheckConstraint('request_count >= 0', name='check_session_request_count_non_negative'),
|
| 208 |
+
sa.CheckConstraint('character_count >= 0', name='check_session_character_count_non_negative'),
|
| 209 |
+
sa.CheckConstraint('total_cost_usd >= 0.0', name='check_session_total_cost_non_negative')
|
| 210 |
+
)
|
| 211 |
+
op.create_index('ix_translation_sessions_session_id', 'translation_sessions', ['session_id'], unique=False)
|
| 212 |
+
op.create_index('ix_translation_sessions_user_id', 'translation_sessions', ['user_id'], unique=False)
|
| 213 |
+
op.create_index('ix_translation_sessions_expires_at', 'translation_sessions', ['expires_at'], unique=False)
|
| 214 |
+
|
| 215 |
+
# Create translation_metrics table
|
| 216 |
+
op.create_table('translation_metrics',
|
| 217 |
+
sa.Column('id', sa.UUID(), nullable=False, default=uuid.uuid4),
|
| 218 |
+
sa.Column('metric_date', sa.DateTime(), nullable=False),
|
| 219 |
+
sa.Column('period_type', sa.String(length=20), nullable=False, default='daily'),
|
| 220 |
+
sa.Column('total_requests', sa.Integer(), nullable=False, default=0),
|
| 221 |
+
sa.Column('successful_requests', sa.Integer(), nullable=False, default=0),
|
| 222 |
+
sa.Column('failed_requests', sa.Integer(), nullable=False, default=0),
|
| 223 |
+
sa.Column('cached_requests', sa.Integer(), nullable=False, default=0),
|
| 224 |
+
sa.Column('avg_processing_time_ms', sa.Float(), nullable=False, default=0.0),
|
| 225 |
+
sa.Column('p95_processing_time_ms', sa.Float(), nullable=False, default=0.0),
|
| 226 |
+
sa.Column('p99_processing_time_ms', sa.Float(), nullable=False, default=0.0),
|
| 227 |
+
sa.Column('total_characters', sa.Integer(), nullable=False, default=0),
|
| 228 |
+
sa.Column('total_tokens', sa.Integer(), nullable=False, default=0),
|
| 229 |
+
sa.Column('total_cost_usd', sa.Float(), nullable=False, default=0.0),
|
| 230 |
+
sa.Column('avg_quality_score', sa.Float(), nullable=True),
|
| 231 |
+
sa.Column('cache_hit_rate', sa.Float(), nullable=False, default=0.0),
|
| 232 |
+
sa.Column('error_rate', sa.Float(), nullable=False, default=0.0),
|
| 233 |
+
sa.Column('top_error_types', sa.JSON(), nullable=True),
|
| 234 |
+
sa.Column('language_pairs', sa.JSON(), nullable=True),
|
| 235 |
+
sa.Column('created_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
|
| 236 |
+
sa.Column('updated_at', sa.DateTime(), nullable=False, server_default=sa.text('CURRENT_TIMESTAMP')),
|
| 237 |
+
sa.PrimaryKeyConstraint('id'),
|
| 238 |
+
sa.CheckConstraint('total_requests >= 0', name='check_metrics_total_requests_non_negative'),
|
| 239 |
+
sa.CheckConstraint('successful_requests >= 0', name='check_metrics_successful_requests_non_negative'),
|
| 240 |
+
sa.CheckConstraint('failed_requests >= 0', name='check_metrics_failed_requests_non_negative'),
|
| 241 |
+
sa.CheckConstraint('cached_requests >= 0', name='check_metrics_cached_requests_non_negative'),
|
| 242 |
+
sa.CheckConstraint('total_characters >= 0', name='check_metrics_total_characters_non_negative'),
|
| 243 |
+
sa.CheckConstraint('total_tokens >= 0', name='check_metrics_total_tokens_non_negative'),
|
| 244 |
+
sa.CheckConstraint('total_cost_usd >= 0.0', name='check_metrics_total_cost_non_negative'),
|
| 245 |
+
sa.CheckConstraint('avg_processing_time_ms >= 0.0', name='check_metrics_avg_processing_time_non_negative'),
|
| 246 |
+
sa.CheckConstraint('p95_processing_time_ms >= 0.0', name='check_metrics_p95_processing_time_non_negative'),
|
| 247 |
+
sa.CheckConstraint('p99_processing_time_ms >= 0.0', name='check_metrics_p99_processing_time_non_negative'),
|
| 248 |
+
sa.CheckConstraint('cache_hit_rate >= 0.0 AND cache_hit_rate <= 1.0', name='check_metrics_cache_hit_rate_range'),
|
| 249 |
+
sa.CheckConstraint('error_rate >= 0.0 AND error_rate <= 1.0', name='check_metrics_error_rate_range'),
|
| 250 |
+
sa.UniqueConstraint('metric_date', 'period_type', name='uq_metrics_date_period')
|
| 251 |
+
)
|
| 252 |
+
op.create_index('ix_translation_metrics_metric_date', 'translation_metrics', ['metric_date'], unique=False)
|
| 253 |
+
op.create_index('ix_translation_metrics_period_type', 'translation_metrics', ['period_type'], unique=False)
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
def downgrade():
|
| 257 |
+
# Drop tables in reverse order
|
| 258 |
+
op.drop_index('ix_translation_metrics_period_type', table_name='translation_metrics')
|
| 259 |
+
op.drop_index('ix_translation_metrics_metric_date', table_name='translation_metrics')
|
| 260 |
+
op.drop_table('translation_metrics')
|
| 261 |
+
|
| 262 |
+
op.drop_index('ix_translation_sessions_expires_at', table_name='translation_sessions')
|
| 263 |
+
op.drop_index('ix_translation_sessions_user_id', table_name='translation_sessions')
|
| 264 |
+
op.drop_index('ix_translation_sessions_session_id', table_name='translation_sessions')
|
| 265 |
+
op.drop_table('translation_sessions')
|
| 266 |
+
|
| 267 |
+
op.drop_index('ix_translation_errors_created_at', table_name='translation_errors')
|
| 268 |
+
op.drop_index('ix_translation_errors_severity', table_name='translation_errors')
|
| 269 |
+
op.drop_index('ix_translation_errors_error_type', table_name='translation_errors')
|
| 270 |
+
op.drop_index('ix_translation_errors_chunk_id', table_name='translation_errors')
|
| 271 |
+
op.drop_index('ix_translation_errors_job_id', table_name='translation_errors')
|
| 272 |
+
op.drop_index('ix_translation_errors_error_id', table_name='translation_errors')
|
| 273 |
+
op.drop_table('translation_errors')
|
| 274 |
+
|
| 275 |
+
op.drop_index('ix_translation_cache_priority', table_name='translation_cache')
|
| 276 |
+
op.drop_index('ix_translation_cache_expires_at', table_name='translation_cache')
|
| 277 |
+
op.drop_index('ix_translation_cache_url_hash', table_name='translation_cache')
|
| 278 |
+
op.drop_index('ix_translation_cache_page_url', table_name='translation_cache')
|
| 279 |
+
op.drop_index('ix_translation_cache_content_hash', table_name='translation_cache')
|
| 280 |
+
op.drop_index('ix_translation_cache_cache_key', table_name='translation_cache')
|
| 281 |
+
op.drop_table('translation_cache')
|
| 282 |
+
|
| 283 |
+
op.drop_index('ix_translation_chunks_is_code_block', table_name='translation_chunks')
|
| 284 |
+
op.drop_index('ix_translation_chunks_status', table_name='translation_chunks')
|
| 285 |
+
op.drop_index('ix_translation_chunks_job_id', table_name='translation_chunks')
|
| 286 |
+
op.drop_table('translation_chunks')
|
| 287 |
+
|
| 288 |
+
op.drop_index('ix_translation_jobs_created_at', table_name='translation_jobs')
|
| 289 |
+
op.drop_index('ix_translation_jobs_status', table_name='translation_jobs')
|
| 290 |
+
op.drop_index('ix_translation_jobs_content_hash', table_name='translation_jobs')
|
| 291 |
+
op.drop_index('ix_translation_jobs_page_url', table_name='translation_jobs')
|
| 292 |
+
op.drop_index('ix_translation_jobs_session_id', table_name='translation_jobs')
|
| 293 |
+
op.drop_index('ix_translation_jobs_user_id', table_name='translation_jobs')
|
| 294 |
+
op.drop_index('ix_translation_jobs_job_id', table_name='translation_jobs')
|
| 295 |
+
op.drop_table('translation_jobs')
|
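The `translation_cache` schema above implies how its hash columns are populated: `content_hash` is 64 characters (a hex SHA-256 digest), `url_hash` is 32 characters (a hex MD5 digest fits), and `cache_key` must be unique per content and language pair. A minimal sketch of a key derivation consistent with those column widths; the function name and the choice of MD5 for `url_hash` are assumptions for illustration, not part of the committed code:

```python
import hashlib

def derive_cache_fields(original_text: str, page_url: str | None,
                        source_language: str, target_language: str) -> dict:
    """Derive hash fields sized to the translation_cache columns (illustrative)."""
    # content_hash: hex SHA-256 -> 64 chars, matches String(64)
    content_hash = hashlib.sha256(original_text.encode("utf-8")).hexdigest()
    # url_hash: hex MD5 -> 32 chars, matches String(32) (assumed algorithm)
    url_hash = hashlib.md5(page_url.encode("utf-8")).hexdigest() if page_url else None
    # cache_key: one entry per (content, language pair); well under String(255)
    cache_key = f"{content_hash}:{source_language}:{target_language}"
    return {"content_hash": content_hash, "url_hash": url_hash, "cache_key": cache_key}
```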
create_translation_tables.py
ADDED
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+"""
+Create translation tables in the database.
+"""
+
+import sys
+import os
+from pathlib import Path
+
+# Add backend to path
+backend_path = Path(__file__).parent
+sys.path.insert(0, str(backend_path))
+
+from src.database.base import engine, Base
+from src.models import *  # Import all models
+
+def create_tables():
+    """Create all tables in the database."""
+    try:
+        # Import models to register them
+        from src.models.auth import User
+        from src.models.translation_openai import (
+            TranslationJob, TranslationChunk, TranslationError,
+            TranslationSession, TranslationCache, TranslationMetrics
+        )
+
+        # Create all tables
+        Base.metadata.create_all(bind=engine)
+        print("Translation tables created successfully!")
+
+        # List created tables
+        from sqlalchemy import inspect
+        inspector = inspect(engine)
+        tables = inspector.get_table_names()
+
+        print("\nAvailable tables:")
+        for table in sorted(tables):
+            if 'translation' in table.lower():
+                print(f"  - {table}")
+
+    except Exception as e:
+        print(f"Error creating tables: {e}")
+        import traceback
+        traceback.print_exc()
+
+if __name__ == "__main__":
+    create_tables()
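The wildcard model import matters here: `Base.metadata.create_all` only creates tables for models that have been imported, because importing a declarative class is what registers its table on `Base.metadata`. A minimal, self-contained illustration of that behavior, with a hypothetical `Demo` model:

```python
from sqlalchemy import Column, Integer, create_engine, inspect
from sqlalchemy.orm import declarative_base

Base = declarative_base()
engine = create_engine("sqlite:///:memory:")

Base.metadata.create_all(bind=engine)     # no models defined yet
print(inspect(engine).get_table_names())  # []

class Demo(Base):                         # defining the class registers its table
    __tablename__ = "demo"
    id = Column(Integer, primary_key=True)

Base.metadata.create_all(bind=engine)     # now the table exists
print(inspect(engine).get_table_names())  # ['demo']
```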
fix_async_client.py
ADDED
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+"""
+Fix the async client initialization in get_translation_service().
+"""
+
+import sys
+import os
+from pathlib import Path
+
+# Add backend to path
+backend_path = Path(__file__).parent
+sys.path.insert(0, str(backend_path))
+
+# Read the service.py file
+file_path = backend_path / "src" / "services" / "openai_translation" / "service.py"
+content = file_path.read_text(encoding='utf-8')
+
+# Find and replace the get_translation_service function
+old_function = """async def get_translation_service() -> OpenAITranslationService:
+    \"\"\"Get or create OpenAI translation service instance.\"\"\"
+    global _translation_service
+
+    if _translation_service is None:
+        _translation_service = OpenAITranslationService()
+
+    return _translation_service"""
+
+new_function = """async def get_translation_service() -> OpenAITranslationService:
+    \"\"\"Get or create OpenAI translation service instance.\"\"\"
+    global _translation_service
+
+    if _translation_service is None:
+        _translation_service = OpenAITranslationService()
+        # Initialize the async client
+        _translation_service.gemini_client = await get_gemini_client()
+
+    return _translation_service"""
+
+content = content.replace(old_function, new_function)
+
+# Write back to file
+file_path.write_text(content, encoding='utf-8')
+
+print("Fixed async client initialization in get_translation_service()")
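The patched initializer is a lazy async singleton; under concurrent first requests, two callers can both observe `_translation_service is None` and initialize the service twice. A hedged sketch of a lock-guarded variant (`OpenAITranslationService` and `get_gemini_client` are the commit's names; the lock is an addition, not what the script installs):

```python
import asyncio

_translation_service = None
_service_lock = asyncio.Lock()

async def get_translation_service():
    """Lazily build the singleton; the lock serializes first-time init."""
    global _translation_service
    if _translation_service is None:
        async with _service_lock:
            if _translation_service is None:  # re-check after acquiring the lock
                service = OpenAITranslationService()
                service.gemini_client = await get_gemini_client()
                _translation_service = service
    return _translation_service
```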
fix_jsonb.py
ADDED
@@ -0,0 +1,28 @@
+#!/usr/bin/env python
+"""
+Replace JSONB with JSON for SQLite compatibility.
+"""
+
+import sys
+import os
+from pathlib import Path
+
+# Add backend to path
+backend_path = Path(__file__).parent
+sys.path.insert(0, str(backend_path))
+
+# Read the translation_openai.py file
+model_file = backend_path / "src" / "models" / "translation_openai.py"
+content = model_file.read_text(encoding='utf-8')
+
+# Drop JSONB from the import first, before the global replace rewrites it
+content = content.replace('from sqlalchemy.dialects.postgresql import UUID, JSONB',
+                          'from sqlalchemy.dialects.postgresql import UUID')
+
+# Replace all remaining JSONB usages with JSON
+content = content.replace('JSONB', 'JSON')
+
+# Write back to file
+model_file.write_text(content, encoding='utf-8')
+
+print("Fixed JSONB to JSON conversion")
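Rewriting the model file is one option; SQLAlchemy can also keep JSONB on PostgreSQL while falling back to generic JSON elsewhere via `with_variant`, avoiding the edit entirely. A hedged sketch of that alternative (not what the committed script does):

```python
from sqlalchemy import JSON
from sqlalchemy.dialects.postgresql import JSONB

# Generic JSON everywhere, upgraded to JSONB when the dialect is PostgreSQL;
# SQLite simply uses the base JSON type.
PortableJSON = JSON().with_variant(JSONB(), "postgresql")

# Usage inside a model definition:
# translation_metadata = Column(PortableJSON, nullable=True)
```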
fix_translation_endpoint.py
ADDED
@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+"""
+Fix translation endpoint to handle User objects properly.
+"""
+
+import sys
+import os
+from pathlib import Path
+
+# Add backend to path
+backend_path = Path(__file__).parent
+sys.path.insert(0, str(backend_path))
+
+# Read the translation.py file
+file_path = backend_path / "src" / "api" / "v1" / "translation.py"
+content = file_path.read_text(encoding='utf-8')
+
+# Add User import
+if "from src.models.auth import User" not in content:
+    # Add User import after other imports
+    content = content.replace(
+        "from src.security.dependencies import get_current_user_or_anonymous",
+        "from src.security.dependencies import get_current_user_or_anonymous\nfrom src.models.auth import User"
+    )
+
+# Fix type hints
+content = content.replace(
+    "current_user: Optional[Dict] = Depends(get_current_user_or_anonymous),",
+    "current_user: Optional[User] = Depends(get_current_user_or_anonymous),"
+)
+
+# Fix current_user.get() calls
+content = content.replace(
+    'current_user.get("id") if current_user else None',
+    'current_user.id if current_user else None'
+)
+content = content.replace(
+    'current_user.get("is_admin", False)',
+    'getattr(current_user, "is_admin", False)'
+)
+
+# Write back to file
+file_path.write_text(content, encoding='utf-8')
+
+print("Fixed translation endpoint to handle User objects")
fix_user_id_issue.py
ADDED
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+"""
+Fix the user_id issue in translation service.
+The User.id is a string but the foreign key expects a UUID.
+"""
+
+import sys
+import os
+from pathlib import Path
+
+# Add backend to path
+backend_path = Path(__file__).parent
+sys.path.insert(0, str(backend_path))
+
+# Read the translation_openai.py file
+file_path = backend_path / "src" / "models" / "translation_openai.py"
+content = file_path.read_text(encoding='utf-8')
+
+# Change user_id from UUID to String to match the User model.
+# str.replace rewrites every occurrence, so this single call also covers the
+# identical user_id columns on TranslationSession and TranslationMetrics.
+content = content.replace(
+    'user_id = Column(UUID(as_uuid=True), ForeignKey("users.id"), nullable=True, index=True)',
+    'user_id = Column(String(36), ForeignKey("users.id"), nullable=True, index=True)'
+)
+
+# Write back to file
+file_path.write_text(content, encoding='utf-8')
+
+print("Fixed user_id to use String instead of UUID to match User.id field")
fix_user_model.py
ADDED
@@ -0,0 +1,53 @@
+#!/usr/bin/env python
+"""
+Fix the User model to add translation relationships.
+"""
+
+import sys
+import os
+from pathlib import Path
+
+# Add backend to path
+backend_path = Path(__file__).parent
+sys.path.insert(0, str(backend_path))
+
+# Read the auth.py file
+auth_file = backend_path / "src" / "models" / "auth.py"
+content = auth_file.read_text(encoding='utf-8')
+
+# Find the User model's relationships section
+import_start = content.find("    # Relationships")
+if import_start == -1:
+    print("Could not find relationships section in User model")
+    sys.exit(1)
+
+# Find where the relationships end
+relationships_end = content.find("\n\n", import_start)
+if relationships_end == -1:
+    relationships_end = content.find("\nclass", import_start)
+
+if relationships_end == -1:
+    print("Could not find end of relationships section")
+    sys.exit(1)
+
+# Extract the relationships section
+relationships_section = content[import_start:relationships_end]
+
+# Check if translation relationships already exist
+if "translation_jobs" in relationships_section:
+    print("Translation relationships already exist in User model")
+else:
+    # Add the translation relationships
+    new_relationships = relationships_section.rstrip()
+    if not new_relationships.endswith('\n'):
+        new_relationships += '\n'
+    new_relationships += """    translation_jobs = relationship("TranslationJob", back_populates="user", cascade="all, delete-orphan")
+    translation_sessions = relationship("TranslationSession", back_populates="user", cascade="all, delete-orphan")
+    translation_metrics = relationship("TranslationMetrics", back_populates="user", cascade="all, delete-orphan")"""
+
+    # Replace the old relationships section with the new one
+    new_content = content[:import_start] + new_relationships + content[relationships_end:]
+
+    # Write back to file
+    auth_file.write_text(new_content, encoding='utf-8')
+    print("✅ Added translation relationships to User model")
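`back_populates` is bidirectional, so this patch only resolves if the translation models declare the mirror side. A minimal sketch of what the counterpart would look like on `TranslationJob` (assumed for illustration; the actual model file is not shown in this view):

```python
from sqlalchemy import Column, ForeignKey, String
from sqlalchemy.orm import relationship

class TranslationJob(Base):  # assumes the declarative Base from src.database.base
    __tablename__ = "translation_jobs"
    # ...other columns elided...
    user_id = Column(String(36), ForeignKey("users.id"), nullable=True, index=True)
    # Mirror of User.translation_jobs; both sides must name each other,
    # otherwise SQLAlchemy raises a mapper configuration error at startup.
    user = relationship("User", back_populates="translation_jobs")
```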
main.py
CHANGED
@@ -25,6 +25,7 @@ from rag.chat import ChatHandler
 from rag.qdrant_client import QdrantManager
 from rag.tasks import TaskManager
 from api.exceptions import ContentNotFoundError, RAGException
+from src.services.translation_cache import cache_service
 
 # Import security middleware
 from middleware.csrf import CSRFMiddleware
@@ -60,6 +61,7 @@ logger = structlog.get_logger()
 
 # Load environment variables
 load_dotenv()
+print(f"*** Environment loaded. GEMINI_API_KEY exists: {bool(os.getenv('GEMINI_API_KEY'))} ***")
 
 
 class Settings(BaseSettings):
@@ -91,12 +93,15 @@ class Settings(BaseSettings):
     # CORS Configuration
     allowed_origins: str = os.getenv(
         "ALLOWED_ORIGINS",
-        "http://localhost:3000,http://localhost:8080,https://mrowaisabdullah.github.io,https://huggingface.co"
+        "http://localhost:3000,http://localhost:3001,http://localhost:8080,https://mrowaisabdullah.github.io,https://huggingface.co"
     )
 
     # JWT Configuration
     jwt_secret_key: str = os.getenv("JWT_SECRET_KEY", "your-super-secret-jwt-key")
 
+    # Google AI Configuration
+    google_ai_api_key: str = os.getenv("GEMINI_API_KEY", "")
+
     # Conversation Context
     max_context_messages: int = int(os.getenv("MAX_CONTEXT_MESSAGES", "3"))
     context_window_size: int = int(os.getenv("CONTEXT_WINDOW_SIZE", "4000"))
@@ -182,6 +187,9 @@ async def lifespan(app: FastAPI):
     )
     await task_manager.start()
 
+    # Start background task for cache cleanup (runs daily)
+    asyncio.create_task(schedule_cache_cleanup())
+
     logger.info("RAG backend initialized successfully")
 
     yield
@@ -237,13 +245,13 @@ app.add_middleware(
     httponly=False,
     samesite="lax",
     max_age=3600,
-    exempt_paths=["/health", "/docs", "/openapi.json", "/ingest/status", "/collections", "/auth/login", "/auth/register", "/api/chat", "/auth/logout", "/auth/me", "/auth/preferences", "/auth/refresh"],
+    exempt_paths=["/health", "/docs", "/openapi.json", "/ingest/status", "/collections", "/auth/login", "/auth/register", "/api/chat", "/auth/logout", "/auth/me", "/auth/preferences", "/auth/refresh", "/api/v1/translation"],
 )
 
 app.add_middleware(
     AuthMiddleware,
     anonymous_limit=3,
-    exempt_paths=["/health", "/docs", "/openapi.json", "/ingest/status", "/collections", "/auth"],
+    exempt_paths=["/health", "/docs", "/openapi.json", "/ingest/status", "/collections", "/auth", "/api/v1/translation"],
     anonymous_header="X-Anonymous-Session-ID",
 )
 
@@ -253,6 +261,14 @@ app.include_router(auth.router)
 # Include new chat routes
 app.include_router(chat.router)
 
+# Include reader features routes
+from src.api.v1 import reader_features
+app.include_router(reader_features.router, prefix="/api/v1")
+
+# Include translation routes
+from src.api.v1 import translation
+app.include_router(translation.router, prefix="/api/v1")
+
 
 # Optional API key security for higher rate limits
 security = HTTPBearer(auto_error=False)
@@ -887,6 +903,45 @@ async def create_chatkit_session(request: Request):
     # raise HTTPException(status_code=500, detail=f"ChatKit processing error: {str(e)}")
 
 
+async def schedule_cache_cleanup():
+    """
+    Schedule periodic cache cleanup task.
+    Runs every 24 hours to clear expired translation cache entries.
+    """
+    import logging
+
+    cache_logger = logging.getLogger(__name__)
+
+    while True:
+        try:
+            # Wait for 24 hours
+            await asyncio.sleep(86400)  # 24 hours in seconds
+
+            # Clean up expired cache entries
+            cleared_count = await cache_service.clear_expired_cache()
+
+            # Note: this is a stdlib logger, so structured fields go through
+            # %-style arguments rather than structlog-style keyword arguments.
+            if cleared_count > 0:
+                cache_logger.info(
+                    "Cache cleanup completed: cleared %d expired entries at %s",
+                    cleared_count,
+                    datetime.utcnow().isoformat()
+                )
+            else:
+                cache_logger.debug(
+                    "Cache cleanup completed - no expired entries found at %s",
+                    datetime.utcnow().isoformat()
+                )
+
+        except Exception as e:
+            cache_logger.error(
+                "Cache cleanup failed at %s: %s",
+                datetime.utcnow().isoformat(),
+                str(e)
+            )
+            # Wait 1 hour before retrying on error
+            await asyncio.sleep(3600)
+
+
 if __name__ == "__main__":
     import uvicorn
 
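One caveat with the cleanup hook in `lifespan`: `asyncio.create_task` only keeps a weak reference to the task, so a fire-and-forget background task can be garbage collected mid-run. A hedged hardening sketch (not part of the commit; the helper name is illustrative):

```python
import asyncio

_background_tasks: set[asyncio.Task] = set()

def spawn_background(coro) -> asyncio.Task:
    """Create a task and hold a strong reference until it finishes."""
    task = asyncio.create_task(coro)
    _background_tasks.add(task)
    # Drop the reference once the task completes so the set doesn't grow.
    task.add_done_callback(_background_tasks.discard)
    return task

# In lifespan(): spawn_background(schedule_cache_cleanup())
```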
migrate_user_id.py
ADDED
@@ -0,0 +1,63 @@
+#!/usr/bin/env python
+"""
+Migration script to change user_id from UUID to String in translation tables.
+"""
+
+import sys
+import os
+from pathlib import Path
+
+# Add backend to path
+backend_path = Path(__file__).parent
+sys.path.insert(0, str(backend_path))
+
+from sqlalchemy import text
+from src.database.base import engine
+
+def migrate_user_id_columns():
+    """Migrate user_id columns from UUID to String in translation tables."""
+
+    # Tables to modify
+    tables = [
+        'translation_jobs',
+        'translation_sessions',
+        'translation_metrics'
+    ]
+
+    with engine.connect() as connection:
+        # Begin transaction
+        trans = connection.begin()
+
+        try:
+            for table in tables:
+                print(f"Migrating {table}...")
+
+                # SQLite doesn't support ALTER COLUMN directly, so we need to:
+                # 1. Create new table with correct schema
+                # 2. Copy data
+                # 3. Drop old table
+                # 4. Rename new table
+
+                # For simplicity, let's just create new tables and drop the old ones
+                # since this is still development
+                connection.execute(text(f"DROP TABLE IF EXISTS {table}"))
+                print(f"  - Dropped {table}")
+
+            # Commit transaction
+            trans.commit()
+            print("\nMigration successful!")
+
+            # Recreate tables
+            from src.models import *  # Import all models
+            from src.database.base import Base
+            Base.metadata.create_all(bind=engine)
+            print("\nTables recreated with new schema!")
+
+        except Exception as e:
+            # Rollback on error
+            trans.rollback()
+            print(f"\nMigration failed: {e}")
+            raise
+
+if __name__ == "__main__":
+    migrate_user_id_columns()
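Two notes on this script. First, the comment block names the standard SQLite recipe (create new table, copy data, drop old, rename) but the script takes the destructive drop-and-recreate shortcut instead. Second, `from src.models import *` inside a function is a SyntaxError in Python 3, which is what the `_fixed` variant below addresses by moving the import to module level. For reference, a hedged sketch of the non-destructive 4-step recipe, with an illustrative, abbreviated column list:

```python
from sqlalchemy import create_engine, text

engine = create_engine("sqlite:///./database/auth.db")

# SQLite cannot ALTER a column's type, so rebuild the table around it.
with engine.begin() as conn:  # begin() commits on success, rolls back on error
    conn.execute(text("""
        CREATE TABLE translation_jobs_new (
            id CHAR(36) PRIMARY KEY,
            user_id VARCHAR(36) REFERENCES users(id)
            -- ...remaining columns as in the model...
        )
    """))
    conn.execute(text(
        "INSERT INTO translation_jobs_new (id, user_id) "
        "SELECT id, CAST(user_id AS TEXT) FROM translation_jobs"
    ))
    conn.execute(text("DROP TABLE translation_jobs"))
    conn.execute(text("ALTER TABLE translation_jobs_new RENAME TO translation_jobs"))
```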
migrate_user_id_fixed.py
ADDED
@@ -0,0 +1,53 @@
+#!/usr/bin/env python
+"""
+Migration script to change user_id from UUID to String in translation tables.
+"""
+
+import sys
+import os
+from pathlib import Path
+
+# Add backend to path
+backend_path = Path(__file__).parent
+sys.path.insert(0, str(backend_path))
+
+from sqlalchemy import text
+from src.database.base import engine, Base
+from src.models import *  # Import all models
+
+def migrate_user_id_columns():
+    """Migrate user_id columns from UUID to String in translation tables."""
+
+    # Tables to modify
+    tables = [
+        'translation_jobs',
+        'translation_sessions',
+        'translation_metrics'
+    ]
+
+    with engine.connect() as connection:
+        # Begin transaction
+        trans = connection.begin()
+
+        try:
+            for table in tables:
+                print(f"Dropping {table}...")
+                connection.execute(text(f"DROP TABLE IF EXISTS {table}"))
+
+            # Commit transaction
+            trans.commit()
+            print("\nDropped all translation tables successfully!")
+
+        except Exception as e:
+            # Rollback on error
+            trans.rollback()
+            print(f"\nMigration failed: {e}")
+            raise
+
+    # Recreate tables with new schema
+    print("Recreating tables with new schema...")
+    Base.metadata.create_all(bind=engine)
+    print("\nMigration completed successfully!")
+
+if __name__ == "__main__":
+    migrate_user_id_columns()
migration_summary_translation_tables.md
ADDED
@@ -0,0 +1,124 @@
+# Database Migration: Translation Tables (Phase 2, Task T010)
+
+## Overview
+Created Alembic migration `004_add_translation_tables.py` to add support for translation features, user feedback, personalization, and content localization.
+
+## Migration Details
+- **Revision ID**: `004_add_translation_tables`
+- **Revises**: `003_reader_features_tables`
+- **File**: `backend/alembic/versions/004_add_translation_tables.py`
+
+## Tables Created
+
+### 1. `translations` Table
+Stores cached translations with content hashing for deduplication.
+
+**Columns:**
+- `id` (Integer, Primary Key)
+- `content_hash` (String(64), Unique, Indexed) - SHA-256 hash for deduplication
+- `source_language` (String(10)) - Source language code
+- `target_language` (String(10)) - Target language code
+- `original_text` (Text) - Original text to translate
+- `translated_text` (Text) - Translated text
+- `created_at` (DateTime) - Creation timestamp
+- `updated_at` (DateTime) - Last update timestamp
+- `translation_model` (String(50)) - Model used for translation (e.g., "gemini-1.5-pro")
+- `character_count` (Integer) - Character count of the text
+
+**Indexes:**
+- Unique index on `content_hash`
+- Composite index `idx_content_lookup` on (`content_hash`, `source_language`, `target_language`)
+
+### 2. `translation_feedback` Table
+Stores user feedback on translations for quality improvement.
+
+**Columns:**
+- `id` (Integer, Primary Key)
+- `translation_id` (Integer, Foreign Key → translations.id)
+- `user_id` (String(36)) - User UUID from auth system
+- `rating` (SmallInteger) - -1 (downvote) or 1 (upvote)
+- `comment` (Text, Optional) - User comment on the translation
+- `created_at` (DateTime) - Feedback timestamp
+
+**Constraints:**
+- Check constraint: `rating IN (-1, 1)`
+- Unique composite index on (`user_id`, `translation_id`) - One feedback per user per translation
+
+### 3. `personalization_profiles` Table
+Stores user preferences for personalized content delivery.
+
+**Columns:**
+- `id` (Integer, Primary Key)
+- `user_id` (String(36), Unique, Indexed) - User UUID
+- `reading_level` (Enum: 'beginner', 'intermediate', 'advanced')
+- `preferred_language` (String(10)) - User's preferred language
+- `focus_areas` (JSON) - Array of topics the user cares about
+- `learning_style` (Enum: 'visual', 'practical', 'theoretical', 'balanced')
+- `enable_transliteration` (Boolean) - Whether to show transliterations
+- `technical_term_handling` (Enum: 'translate', 'transliterate', 'keep_english')
+- `font_size` (Integer) - Preferred font size
+- `focus_mode_preferences` (JSON) - Preferences for focus mode
+- `created_at` (DateTime)
+- `updated_at` (DateTime)
+- `last_active` (DateTime)
+
+### 4. `content_localization` Table (Conditional Creation)
+Tracks translation status and metadata for content pages.
+This table is only created if it doesn't already exist.
+
+**Columns:**
+- `id` (Integer, Primary Key)
+- `content_url` (String(500), Indexed) - URL of the content page
+- `content_hash` (String(64), Indexed) - Content hash for change detection
+- `is_translated` (Boolean) - Whether the content has been translated
+- `last_translation_date` (DateTime) - When the translation was last updated
+- `translation_cache_key` (String(64)) - Cache key for translations
+- `word_count` (Integer) - Number of words in the content
+- `character_count` (Integer) - Number of characters
+- `has_code_blocks` (Boolean) - Whether the content contains code blocks
+- `detected_languages` (JSON) - Array of languages detected in the content
+- `chunk_count` (Integer) - Number of chunks for processing
+- `processing_status` (Enum: 'pending', 'processing', 'completed', 'failed', 'partial')
+- `created_at` (DateTime)
+- `updated_at` (DateTime)
+
+**Indexes:**
+- Index on `content_hash`
+- Index on `content_url`
+
+## Database Compatibility
+The migration is designed to work with SQLite (the current database) but is compatible with PostgreSQL as well.
+
+## Foreign Key Relationships
+- `translation_feedback.translation_id` → `translations.id`
+- (Other foreign keys would point to the users table from the auth system)
+
+## Migration Usage
+
+### To apply the migration:
+```bash
+cd backend
+alembic upgrade head
+```
+
+### To revert the migration:
+```bash
+cd backend
+alembic downgrade -1
+```
+
+### To check current status:
+```bash
+cd backend
+alembic current
+```
+
+## Notes
+1. The migration uses SQLite-compatible syntax but will work with PostgreSQL
+2. Enum types are stored as strings with length constraints for compatibility
+3. JSON fields use SQLite's JSON extension (available in SQLite 3.38+)
+4. The content_localization table check prevents errors if it already exists
+
+## Updated Files
+1. `backend/alembic/versions/004_add_translation_tables.py` - Main migration file
+2. `backend/alembic/env.py` - Updated to import new models for metadata registration
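The dedup flow the `translations` table enables: hash the source text, look the hash up together with the language pair (the `idx_content_lookup` composite index serves exactly this query), and only call the model on a miss. A hedged sketch with illustrative names; it assumes the SHA-256 hashing the table documents, and is not code from this commit:

```python
import hashlib
from sqlalchemy import text

def get_or_translate(conn, original_text: str, src: str, tgt: str, translate_fn):
    """Return a cached translation if the same text/pair was seen before."""
    content_hash = hashlib.sha256(original_text.encode("utf-8")).hexdigest()
    row = conn.execute(
        text("SELECT translated_text FROM translations "
             "WHERE content_hash = :h AND source_language = :s AND target_language = :t"),
        {"h": content_hash, "s": src, "t": tgt},
    ).fetchone()
    if row:
        return row[0]                          # cache hit: no model call
    translated = translate_fn(original_text)  # cache miss: translate and store
    conn.execute(
        text("INSERT INTO translations (content_hash, source_language, target_language, "
             "original_text, translated_text) VALUES (:h, :s, :t, :o, :tr)"),
        {"h": content_hash, "s": src, "t": tgt, "o": original_text, "tr": translated},
    )
    return translated
```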
migrations/versions/001_create_openai_translation_tables.py
ADDED
@@ -0,0 +1,297 @@
+"""Create OpenAI translation system tables
+
+Revision ID: 001_create_openai_translation_tables
+Revises:
+Create Date: 2024-01-12 12:00:00.000000
+
+"""
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+# revision identifiers, used by Alembic.
+revision = '001_create_openai_translation_tables'
+down_revision = None
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # Create translation_jobs table
+    op.create_table('translation_jobs',
+        sa.Column('id', postgresql.UUID(as_uuid=True), nullable=False),
+        sa.Column('job_id', sa.String(length=64), nullable=False),
+        sa.Column('user_id', postgresql.UUID(as_uuid=True), nullable=True),
+        sa.Column('session_id', sa.String(length=128), nullable=True),
+        sa.Column('content_hash', sa.String(length=64), nullable=False),
+        sa.Column('page_url', sa.Text(), nullable=True),
+        sa.Column('source_language', sa.String(length=10), nullable=False),
+        sa.Column('target_language', sa.String(length=10), nullable=False),
+        sa.Column('original_text', sa.Text(), nullable=False),
+        sa.Column('translated_text', sa.Text(), nullable=True),
+        sa.Column('preserve_code_blocks', sa.Boolean(), nullable=False),
+        sa.Column('enable_transliteration', sa.Boolean(), nullable=False),
+        sa.Column('chunk_size', sa.Integer(), nullable=False),
+        sa.Column('max_chunks', sa.Integer(), nullable=False),
+        sa.Column('model_name', sa.String(length=50), nullable=False),
+        sa.Column('temperature', sa.Numeric(precision=3, scale=2), nullable=False),
+        sa.Column('max_tokens', sa.Integer(), nullable=False),
+        sa.Column('status', sa.String(length=20), nullable=False),
+        sa.Column('progress_percentage', sa.Numeric(precision=5, scale=2), nullable=False),
+        sa.Column('chunks_total', sa.Integer(), nullable=False),
+        sa.Column('chunks_completed', sa.Integer(), nullable=False),
+        sa.Column('chunks_failed', sa.Integer(), nullable=False),
+        sa.Column('retry_count', sa.Integer(), nullable=False),
+        sa.Column('max_retries', sa.Integer(), nullable=False),
+        sa.Column('started_at', sa.DateTime(timezone=True), nullable=True),
+        sa.Column('completed_at', sa.DateTime(timezone=True), nullable=True),
+        sa.Column('processing_time_ms', sa.BigInteger(), nullable=False),
+        sa.Column('input_tokens', sa.BigInteger(), nullable=False),
+        sa.Column('output_tokens', sa.BigInteger(), nullable=False),
+        sa.Column('estimated_cost_usd', sa.Numeric(precision=10, scale=6), nullable=False),
+        sa.Column('actual_cost_usd', sa.Numeric(precision=10, scale=6), nullable=True),
+        sa.Column('quality_score', sa.Numeric(precision=5, scale=2), nullable=True),
+        sa.Column('confidence_score', sa.Numeric(precision=5, scale=2), nullable=True),
+        sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False),
+        sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False),
+        sa.Column('last_activity_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False),
+        sa.Column('user_agent', sa.Text(), nullable=True),
+        sa.Column('ip_address', sa.String(length=45), nullable=True),
+        sa.ForeignKeyConstraint(['user_id'], ['users.id'], ),
+        sa.PrimaryKeyConstraint('id'),
+        sa.UniqueConstraint('job_id')
+    )
+
+    # Create indexes for translation_jobs
+    op.create_index('ix_translation_jobs_job_id', 'translation_jobs', ['job_id'], unique=True)
+    op.create_index('ix_translation_jobs_user_id', 'translation_jobs', ['user_id'])
+    op.create_index('ix_translation_jobs_session_id', 'translation_jobs', ['session_id'])
+    op.create_index('ix_translation_jobs_content_hash', 'translation_jobs', ['content_hash'])
+    op.create_index('ix_translation_jobs_page_url', 'translation_jobs', ['page_url'])
+    op.create_index('ix_translation_jobs_source_language', 'translation_jobs', ['source_language'])
+    op.create_index('ix_translation_jobs_target_language', 'translation_jobs', ['target_language'])
+    op.create_index('ix_translation_jobs_status', 'translation_jobs', ['status'])
+    op.create_index('ix_translation_jobs_status_created', 'translation_jobs', ['status', 'created_at'])
+    op.create_index('ix_translation_jobs_user_status', 'translation_jobs', ['user_id', 'status'])
+    op.create_index('ix_translation_jobs_content_lookup', 'translation_jobs', ['content_hash', 'source_language', 'target_language'])
+    op.create_index('ix_translation_jobs_page_cache', 'translation_jobs', ['page_url', 'content_hash'])
+    op.create_index('ix_translation_jobs_activity', 'translation_jobs', ['last_activity_at'])
+    op.create_index('ix_translation_jobs_progress', 'translation_jobs', ['status', 'progress_percentage'])
+
+    # Create translation_chunks table
+    op.create_table('translation_chunks',
+        sa.Column('id', postgresql.UUID(as_uuid=True), nullable=False),
+        sa.Column('job_id', postgresql.UUID(as_uuid=True), nullable=False),
+        sa.Column('chunk_index', sa.Integer(), nullable=False),
+        sa.Column('original_text', sa.Text(), nullable=False),
+        sa.Column('translated_text', sa.Text(), nullable=True),
+        sa.Column('start_position', sa.Integer(), nullable=False),
+        sa.Column('end_position', sa.Integer(), nullable=False),
+        sa.Column('is_code_block', sa.Boolean(), nullable=False),
+        sa.Column('code_language', sa.String(length=50), nullable=True),
+        sa.Column('word_count', sa.Integer(), nullable=False),
+        sa.Column('status', sa.String(length=20), nullable=False),
+        sa.Column('retry_count', sa.Integer(), nullable=False),
+        sa.Column('started_at', sa.DateTime(timezone=True), nullable=True),
+        sa.Column('completed_at', sa.DateTime(timezone=True), nullable=True),
+        sa.Column('processing_time_ms', sa.BigInteger(), nullable=False),
+        sa.Column('input_tokens', sa.Integer(), nullable=False),
+        sa.Column('output_tokens', sa.Integer(), nullable=False),
+        sa.Column('confidence_score', sa.Numeric(precision=5, scale=2), nullable=True),
+        sa.Column('requires_review', sa.Boolean(), nullable=False),
+        sa.Column('last_error', sa.Text(), nullable=True),
+        sa.Column('error_code', sa.String(length=50), nullable=True),
+        sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False),
+        sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False),
+        sa.ForeignKeyConstraint(['job_id'], ['translation_jobs.id'], ),
+        sa.PrimaryKeyConstraint('id'),
+        sa.UniqueConstraint('job_id', 'chunk_index', name='uq_translation_chunks_job_chunk')
+    )
+
+    # Create indexes for translation_chunks
+    op.create_index('ix_translation_chunks_job_id', 'translation_chunks', ['job_id'])
+    op.create_index('ix_translation_chunks_job_chunk', 'translation_chunks', ['job_id', 'chunk_index'], unique=True)
+    op.create_index('ix_translation_chunks_status', 'translation_chunks', ['status'])
+    op.create_index('ix_translation_chunks_status_created', 'translation_chunks', ['status', 'created_at'])
+    op.create_index('ix_translation_chunks_is_code_block', 'translation_chunks', ['is_code_block'])
+    op.create_index('ix_translation_chunks_code_language', 'translation_chunks', ['code_language'])
+
+    # Create translation_errors table
+    op.create_table('translation_errors',
+        sa.Column('id', postgresql.UUID(as_uuid=True), nullable=False),
+        sa.Column('job_id', postgresql.UUID(as_uuid=True), nullable=False),
+        sa.Column('chunk_id', postgresql.UUID(as_uuid=True), nullable=True),
+        sa.Column('error_id', sa.String(length=64), nullable=False),
+        sa.Column('error_type', sa.String(length=50), nullable=False),
+        sa.Column('error_code', sa.String(length=50), nullable=True),
+        sa.Column('error_message', sa.Text(), nullable=False),
+        sa.Column('error_details', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
+        sa.Column('severity', sa.String(length=20), nullable=False),
+        sa.Column('category', sa.String(length=50), nullable=False),
+        sa.Column('is_retriable', sa.Boolean(), nullable=False),
+        sa.Column('retry_attempt', sa.Integer(), nullable=False),
+        sa.Column('max_retries', sa.Integer(), nullable=False),
+        sa.Column('next_retry_at', sa.DateTime(timezone=True), nullable=True),
+        sa.Column('request_payload', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
+        sa.Column('response_payload', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
+        sa.Column('stack_trace', sa.Text(), nullable=True),
+        sa.Column('debug_info', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
+        sa.Column('resolved_at', sa.DateTime(timezone=True), nullable=True),
+        sa.Column('resolution', sa.String(length=200), nullable=True),
+        sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False),
+        sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False),
+        sa.ForeignKeyConstraint(['chunk_id'], ['translation_chunks.id'], ),
+        sa.ForeignKeyConstraint(['job_id'], ['translation_jobs.id'], ),
+        sa.PrimaryKeyConstraint('id'),
+        sa.UniqueConstraint('error_id')
+    )
+
+    # Create indexes for translation_errors
+    op.create_index('ix_translation_errors_error_id', 'translation_errors', ['error_id'], unique=True)
+    op.create_index('ix_translation_errors_job_id', 'translation_errors', ['job_id'])
+    op.create_index('ix_translation_errors_chunk_id', 'translation_errors', ['chunk_id'])
+    op.create_index('ix_translation_errors_error_type', 'translation_errors', ['error_type'])
+    op.create_index('ix_translation_errors_severity', 'translation_errors', ['severity'])
+    op.create_index('ix_translation_errors_error_type_created', 'translation_errors', ['error_type', 'created_at'])
+    op.create_index('ix_translation_errors_error_severity', 'translation_errors', ['severity', 'created_at'])
+    op.create_index('ix_translation_errors_job_errors', 'translation_errors', ['job_id', 'created_at'])
+    op.create_index('ix_translation_errors_retry_schedule', 'translation_errors', ['next_retry_at', 'is_retriable'])
+
+    # Create translation_sessions table
+    op.create_table('translation_sessions',
+        sa.Column('id', postgresql.UUID(as_uuid=True), nullable=False),
+        sa.Column('session_id', sa.String(length=128), nullable=False),
+        sa.Column('user_id', postgresql.UUID(as_uuid=True), nullable=True),
+        sa.Column('started_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False),
+        sa.Column('last_activity_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False),
+        sa.Column('expires_at', sa.DateTime(timezone=True), nullable=False),
+        sa.Column('is_active', sa.Boolean(), nullable=False),
+        sa.Column('request_count', sa.Integer(), nullable=False),
+        sa.Column('character_count', sa.Integer(), nullable=False),
+        sa.Column('total_cost_usd', sa.Numeric(precision=10, scale=6), nullable=False),
+        sa.Column('requests_per_minute', sa.Integer(), nullable=False),
+        sa.Column('characters_per_hour', sa.Integer(), nullable=False),
+        sa.Column('source_language', sa.String(length=10), nullable=True),
+        sa.Column('target_language', sa.String(length=10), nullable=True),
+        sa.Column('preferred_model', sa.String(length=50), nullable=True),
+        sa.Column('user_agent', sa.Text(), nullable=True),
+        sa.Column('ip_address', sa.String(length=45), nullable=True),
+        sa.Column('country_code', sa.String(length=2), nullable=True),
+        sa.Column('preferences', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
+        sa.ForeignKeyConstraint(['user_id'], ['users.id'], ),
+        sa.PrimaryKeyConstraint('id'),
+        sa.UniqueConstraint('session_id')
+    )
+
+    # Create indexes for translation_sessions
+    op.create_index('ix_translation_sessions_session_id', 'translation_sessions', ['session_id'], unique=True)
+    op.create_index('ix_translation_sessions_user_id', 'translation_sessions', ['user_id'])
+    op.create_index('ix_translation_sessions_is_active', 'translation_sessions', ['is_active'])
+    op.create_index('ix_translation_sessions_expires_at', 'translation_sessions', ['expires_at'])
+    op.create_index('ix_translation_sessions_user_sessions', 'translation_sessions', ['user_id', 'is_active'])
+    op.create_index('ix_translation_sessions_session_expiry', 'translation_sessions', ['expires_at', 'is_active'])
+    op.create_index('ix_translation_sessions_ip_address', 'translation_sessions', ['ip_address'])
+
+    # Create translation_cache table
+    op.create_table('translation_cache',
+        sa.Column('id', postgresql.UUID(as_uuid=True), nullable=False),
+        sa.Column('cache_key', sa.String(length=128), nullable=False),
+        sa.Column('job_id', postgresql.UUID(as_uuid=True), nullable=True),
+        sa.Column('content_hash', sa.String(length=64), nullable=False),
+        sa.Column('page_url', sa.Text(), nullable=True),
+        sa.Column('url_hash', sa.String(length=64), nullable=True),
+        sa.Column('source_language', sa.String(length=10), nullable=False),
+        sa.Column('target_language', sa.String(length=10), nullable=False),
+        sa.Column('original_text', sa.Text(), nullable=False),
+        sa.Column('translated_text', sa.Text(), nullable=False),
+        sa.Column('hit_count', sa.Integer(), nullable=False),
+        sa.Column('last_hit_at', sa.DateTime(timezone=True), nullable=True),
+        sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False),
+        sa.Column('expires_at', sa.DateTime(timezone=True), nullable=False),
+        sa.Column('quality_score', sa.Numeric(precision=5, scale=2), nullable=True),
+        sa.Column('processing_time_ms', sa.BigInteger(), nullable=False),
+        sa.Column('model_version', sa.String(length=50), nullable=False),
+        sa.Column('ttl_hours', sa.Integer(), nullable=False),
+        sa.Column('is_pinned', sa.Boolean(), nullable=False),
sa.Column('is_pinned', sa.Boolean(), nullable=False),
|
| 216 |
+
sa.Column('priority', sa.Integer(), nullable=False),
|
| 217 |
+
sa.Column('is_validated', sa.Boolean(), nullable=False),
|
| 218 |
+
sa.Column('validated_by', sa.String(length=50), nullable=True),
|
| 219 |
+
sa.ForeignKeyConstraint(['job_id'], ['translation_jobs.id'], ),
|
| 220 |
+
sa.PrimaryKeyConstraint('id'),
|
| 221 |
+
sa.UniqueConstraint('cache_key')
|
| 222 |
+
)
|
| 223 |
+
|
| 224 |
+
# Create indexes for translation_cache
|
| 225 |
+
op.create_index('ix_translation_cache_cache_key', 'translation_cache', ['cache_key'], unique=True)
|
| 226 |
+
op.create_index('ix_translation_cache_job_id', 'translation_cache', ['job_id'])
|
| 227 |
+
op.create_index('ix_translation_cache_content_hash', 'translation_cache', ['content_hash'])
|
| 228 |
+
op.create_index('ix_translation_cache_page_url', 'translation_cache', ['page_url'])
|
| 229 |
+
op.create_index('ix_translation_cache_url_hash', 'translation_cache', ['url_hash'])
|
| 230 |
+
op.create_index('ix_translation_cache_source_language', 'translation_cache', ['source_language'])
|
| 231 |
+
op.create_index('ix_translation_cache_target_language', 'translation_cache', ['target_language'])
|
| 232 |
+
op.create_index('ix_translation_cache_expires_at', 'translation_cache', ['expires_at'])
|
| 233 |
+
op.create_index('ix_translation_cache_cache_lookup', 'translation_cache', ['content_hash', 'source_language', 'target_language'])
|
| 234 |
+
op.create_index('ix_translation_cache_page_cache', 'translation_cache', ['url_hash', 'content_hash'])
|
| 235 |
+
op.create_index('ix_translation_cache_cache_expires', 'translation_cache', ['expires_at', 'priority'])
|
| 236 |
+
op.create_index('ix_translation_cache_cache_popularity', 'translation_cache', ['hit_count', 'last_hit_at'])
|
| 237 |
+
|
| 238 |
+
# Create translation_metrics table
|
| 239 |
+
op.create_table('translation_metrics',
|
| 240 |
+
sa.Column('id', postgresql.UUID(as_uuid=True), nullable=False),
|
| 241 |
+
sa.Column('job_id', postgresql.UUID(as_uuid=True), nullable=False),
|
| 242 |
+
sa.Column('user_id', postgresql.UUID(as_uuid=True), nullable=True),
|
| 243 |
+
sa.Column('metric_date', sa.DateTime(timezone=True), nullable=False),
|
| 244 |
+
sa.Column('period_type', sa.String(length=20), nullable=False),
|
| 245 |
+
sa.Column('total_requests', sa.Integer(), nullable=False),
|
| 246 |
+
sa.Column('total_characters', sa.BigInteger(), nullable=False),
|
| 247 |
+
sa.Column('total_chunks', sa.Integer(), nullable=False),
|
| 248 |
+
sa.Column('successful_translations', sa.Integer(), nullable=False),
|
| 249 |
+
sa.Column('failed_translations', sa.Integer(), nullable=False),
|
| 250 |
+
sa.Column('avg_processing_time_ms', sa.BigInteger(), nullable=False),
|
| 251 |
+
sa.Column('min_processing_time_ms', sa.BigInteger(), nullable=False),
|
| 252 |
+
sa.Column('max_processing_time_ms', sa.BigInteger(), nullable=False),
|
| 253 |
+
sa.Column('p95_processing_time_ms', sa.BigInteger(), nullable=False),
|
| 254 |
+
sa.Column('total_input_tokens', sa.BigInteger(), nullable=False),
|
| 255 |
+
sa.Column('total_output_tokens', sa.BigInteger(), nullable=False),
|
| 256 |
+
sa.Column('total_cost_usd', sa.Numeric(precision=12, scale=6), nullable=False),
|
| 257 |
+
sa.Column('avg_cost_per_char', sa.Numeric(precision=10, scale=8), nullable=False),
|
| 258 |
+
sa.Column('avg_quality_score', sa.Numeric(precision=5, scale=2), nullable=True),
|
| 259 |
+
sa.Column('avg_confidence_score', sa.Numeric(precision=5, scale=2), nullable=True),
|
| 260 |
+
sa.Column('cache_hits', sa.Integer(), nullable=False),
|
| 261 |
+
sa.Column('cache_misses', sa.Integer(), nullable=False),
|
| 262 |
+
sa.Column('cache_hit_rate', sa.Numeric(precision=5, scale=2), nullable=False),
|
| 263 |
+
sa.Column('error_count', sa.Integer(), nullable=False),
|
| 264 |
+
sa.Column('error_rate', sa.Numeric(precision=5, scale=2), nullable=False),
|
| 265 |
+
sa.Column('top_error_types', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
|
| 266 |
+
sa.Column('source_language', sa.String(length=10), nullable=True),
|
| 267 |
+
sa.Column('target_language', sa.String(length=10), nullable=True),
|
| 268 |
+
sa.Column('model_name', sa.String(length=50), nullable=True),
|
| 269 |
+
sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False),
|
| 270 |
+
sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False),
|
| 271 |
+
sa.ForeignKeyConstraint(['job_id'], ['translation_jobs.id'], ),
|
| 272 |
+
sa.ForeignKeyConstraint(['user_id'], ['users.id'], ),
|
| 273 |
+
sa.PrimaryKeyConstraint('id')
|
| 274 |
+
)
|
| 275 |
+
|
| 276 |
+
# Create indexes for translation_metrics
|
| 277 |
+
op.create_index('ix_translation_metrics_job_id', 'translation_metrics', ['job_id'])
|
| 278 |
+
op.create_index('ix_translation_metrics_user_id', 'translation_metrics', ['user_id'])
|
| 279 |
+
op.create_index('ix_translation_metrics_metric_date', 'translation_metrics', ['metric_date'])
|
| 280 |
+
op.create_index('ix_translation_metrics_period_type', 'translation_metrics', ['period_type'])
|
| 281 |
+
op.create_index('ix_translation_metrics_source_language', 'translation_metrics', ['source_language'])
|
| 282 |
+
op.create_index('ix_translation_metrics_target_language', 'translation_metrics', ['target_language'])
|
| 283 |
+
op.create_index('ix_translation_metrics_model_name', 'translation_metrics', ['model_name'])
|
| 284 |
+
op.create_index('ix_translation_metrics_date_period', 'translation_metrics', ['metric_date', 'period_type'])
|
| 285 |
+
op.create_index('ix_translation_metrics_user_metrics', 'translation_metrics', ['user_id', 'metric_date'])
|
| 286 |
+
op.create_index('ix_translation_metrics_job_metrics', 'translation_metrics', ['job_id', 'metric_date'])
|
| 287 |
+
op.create_index('ix_translation_metrics_lang_metrics', 'translation_metrics', ['source_language', 'target_language', 'metric_date'])
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
def downgrade() -> None:
|
| 291 |
+
# Drop tables in reverse order of creation
|
| 292 |
+
op.drop_table('translation_metrics')
|
| 293 |
+
op.drop_table('translation_cache')
|
| 294 |
+
op.drop_table('translation_sessions')
|
| 295 |
+
op.drop_table('translation_errors')
|
| 296 |
+
op.drop_table('translation_chunks')
|
| 297 |
+
op.drop_table('translation_jobs')
|
pyproject.toml
CHANGED

@@ -41,7 +41,8 @@ dependencies = [
     "authlib>=1.2.1",
     "itsdangerous>=2.1.0",
     # OpenAI Integration
-    "openai>=1.
+    "openai>=1.68.0",
+    "openai-agents>=0.2.9",
     "tiktoken>=0.5.2",
     # Vector Database
     "qdrant-client>=1.7.0",
@@ -59,11 +60,16 @@ dependencies = [
     # Logging and Monitoring
     "structlog>=23.2.0",
     "backoff>=2.2.1",
+    "python-json-logger>=2.0.7",
+    "PyYAML>=6.0.1",
     # Monitoring and Performance
     "psutil>=5.9.6",
     "openai-chatkit>=1.4.0",
     "email-validator>=2.3.0",
     "bcrypt==4.2.0",
+    "google-genai>=0.3.0",
+    "redis>=7.1.0",
+    "python-json-logger>=4.0.0",
 ]

 [project.optional-dependencies]
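
For orientation, `openai-agents` (added above) exposes the Agent/Runner API that the new translation endpoints build on. A minimal sketch of that pattern, with a placeholder name and instructions rather than this repository's actual agent:

# Minimal openai-agents sketch; the agent name and instructions are
# placeholders, not the agent defined in src/services/openai_translation.
from agents import Agent, Runner

agent = Agent(
    name="translator",
    instructions="Translate the user's message from English to Urdu.",
)

async def translate(text: str) -> str:
    result = await Runner.run(agent, text)  # async Runner.run entry point
    return result.final_output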
requirements.txt
CHANGED

@@ -16,6 +16,7 @@ aiosmtplib>=3.0.0
 jinja2>=3.1.0
 python-dotenv>=1.0.0
 structlog>=23.2.0
+python-json-logger>=2.0.7
 backoff>=2.2.1
 psutil>=5.9.6
 # ChatKit Python SDK
@@ -28,3 +29,9 @@ python-jose[cryptography]>=3.3.0
 passlib[bcrypt]>=1.7.4
 authlib>=1.2.1
 itsdangerous>=2.1.0
+
+# Cache dependencies
+redis[hiredis]>=5.0.0
+
+# Google Generative AI for Gemini integration
+google-generativeai>=0.8.0
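
The `redis[hiredis]` extra pulls in the C response parser, which redis-py 5.x picks up automatically when installed. A quick connectivity sketch for the new cache backend (the connection URL is a placeholder for your deployment):

# Sketch: async redis-py 5.x client; hiredis is used automatically when
# installed. The connection URL is a placeholder.
import asyncio
import redis.asyncio as redis

async def main():
    client = redis.from_url("redis://localhost:6379/0", decode_responses=True)
    await client.set("translation:ping", "pong", ex=60)  # 60-second TTL
    print(await client.get("translation:ping"))
    await client.aclose()

asyncio.run(main())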
src/api/v1/progress.py
ADDED

@@ -0,0 +1,450 @@
"""
Progress tracking API endpoints.

Manages user reading progress through chapters and sections.
"""

from datetime import datetime, timedelta
from typing import List, Optional, Dict, Any
from fastapi import APIRouter, Depends, HTTPException, Query, Body, BackgroundTasks
from sqlalchemy.orm import Session
from pydantic import BaseModel, Field, validator

from src.database.base import get_db
from src.middleware.auth import get_current_active_user, require_user
from src.models.auth import User
from src.models.reading_progress import ReadingProgress
from src.models.user_preferences import UserPreference
from src.services.progress import ReadingProgressService
from src.services.personalization import PersonalizationService
from src.utils.errors import handle_errors, NotFoundError, ValidationError
from src.utils.logging import get_logger

logger = get_logger(__name__)

router = APIRouter(
    prefix="/progress",
    tags=["progress"]
)

# Pydantic models for API
class SectionProgress(BaseModel):
    section_id: str = Field(..., description="Section identifier")
    position: float = Field(..., ge=0, le=100, description="Progress percentage (0-100)")
    time_spent: int = Field(0, ge=0, description="Time spent in minutes")
    completed: bool = Field(False, description="Whether section is completed")

    @validator('position')
    def validate_position(cls, v):
        if not 0 <= v <= 100:
            raise ValueError("Position must be between 0 and 100")
        return v

class ChapterProgressUpdate(BaseModel):
    chapter_id: str = Field(..., description="Chapter identifier")
    sections: List[SectionProgress] = Field(..., description="Section progress updates")

class ProgressResponse(BaseModel):
    chapter_id: str
    overall_progress: float
    sections_completed: int
    total_sections: int
    time_spent: int
    sections: List[Dict[str, Any]]
    last_accessed: Optional[str]
    estimated_completion: Optional[Dict[str, Any]]

class SessionStart(BaseModel):
    chapter_id: str = Field(..., description="Chapter identifier")
    section_id: Optional[str] = Field(None, description="Section identifier")

class SessionEnd(BaseModel):
    chapter_id: str = Field(..., description="Chapter identifier")
    section_id: Optional[str] = Field(None, description="Section identifier")
    position: float = Field(..., ge=0, le=100, description="Final position")
    time_spent: int = Field(..., ge=0, description="Time spent in minutes")


# Helper function to get services
def get_progress_service(db: Session = Depends(get_db)) -> ReadingProgressService:
    return ReadingProgressService(db)

def get_personalization_service(db: Session = Depends(get_db)) -> PersonalizationService:
    return PersonalizationService(db)


@router.get("/chapter/{chapter_id}")
@handle_errors
async def get_chapter_progress(
    chapter_id: str,
    current_user: User = Depends(get_current_active_user),
    service: ReadingProgressService = Depends(get_progress_service)
) -> ProgressResponse:
    """Get comprehensive progress for a specific chapter."""
    progress = await service.get_chapter_progress(current_user.id, chapter_id)

    if not progress["total_sections"]:
        raise NotFoundError("Chapter", chapter_id)

    return ProgressResponse(**progress)


@router.get("/summary")
@handle_errors
async def get_progress_summary(
    current_user: User = Depends(get_current_active_user),
    service: ReadingProgressService = Depends(get_progress_service)
) -> Dict[str, Any]:
    """Get overall reading progress summary for the user."""
    summary = await service.get_user_progress_summary(current_user.id)

    # Add personalization info
    personalization_service = PersonalizationService(service.db)
    personalization = await personalization_service.get_user_personalization(current_user.id)

    return {
        **summary,
        "personalization": personalization,
        "last_updated": datetime.utcnow().isoformat()
    }


@router.post("/session/start")
@handle_errors
async def start_reading_session(
    session_data: SessionStart,
    current_user: User = Depends(get_current_active_user),
    service: ReadingProgressService = Depends(get_progress_service)
) -> Dict[str, Any]:
    """Start a new reading session."""
    # Log session start
    logger.info(
        "Reading session started",
        user_id=current_user.id,
        chapter_id=session_data.chapter_id,
        section_id=session_data.section_id
    )

    # Get or create progress record
    progress = await service.update_section_progress(
        user_id=current_user.id,
        chapter_id=session_data.chapter_id,
        section_id=session_data.section_id or f"{session_data.chapter_id}_intro",
        position=0,
        time_spent_delta=0
    )

    return {
        "session_id": progress.id,
        "chapter_id": session_data.chapter_id,
        "section_id": session_data.section_id,
        "started_at": progress.last_accessed.isoformat(),
        "message": "Reading session started successfully"
    }


@router.post("/session/end")
@handle_errors
async def end_reading_session(
    session_data: SessionEnd,
    current_user: User = Depends(get_current_active_user),
    service: ReadingProgressService = Depends(get_progress_service)
) -> Dict[str, Any]:
    """End a reading session with final progress."""
    # Update progress with session data
    progress = await service.update_section_progress(
        user_id=current_user.id,
        chapter_id=session_data.chapter_id,
        section_id=session_data.section_id or f"{session_data.chapter_id}_intro",
        position=session_data.position,
        time_spent_delta=session_data.time_spent,
        completed=session_data.position >= 100
    )

    # Get updated chapter progress
    chapter_progress = await service.get_chapter_progress(current_user.id, session_data.chapter_id)

    # Generate session summary
    session_summary = {
        "chapter_id": session_data.chapter_id,
        "section_id": session_data.section_id,
        "final_position": session_data.position,
        "time_spent": session_data.time_spent,
        "chapter_progress": chapter_progress["overall_progress"],
        "sections_completed": chapter_progress["sections_completed"],
        "completed_at": datetime.utcnow().isoformat()
    }

    # Log session end
    logger.info(
        "Reading session ended",
        user_id=current_user.id,
        **session_summary
    )

    return {
        "session_id": progress.id,
        "summary": session_summary,
        "message": "Reading session completed successfully"
    }


@router.post("/update")
@handle_errors
async def update_progress(
    progress_update: ChapterProgressUpdate,
    background_tasks: BackgroundTasks,
    current_user: User = Depends(get_current_active_user),
    service: ReadingProgressService = Depends(get_progress_service)
) -> Dict[str, Any]:
    """Update progress for multiple sections in a chapter."""
    updated_sections = []
    errors = []

    for section in progress_update.sections:
        try:
            updated = await service.update_section_progress(
                user_id=current_user.id,
                chapter_id=progress_update.chapter_id,
                section_id=section.section_id,
                position=section.position,
                time_spent_delta=section.time_spent,
                completed=section.completed
            )
            updated_sections.append({
                "section_id": section.section_id,
                "position": updated.position,
                "completed": updated.completed,
                "updated_at": updated.updated_at.isoformat()
            })
        except Exception as e:
            logger.error(
                "Failed to update section progress",
                user_id=current_user.id,
                chapter_id=progress_update.chapter_id,
                section_id=section.section_id,
                error=str(e)
            )
            errors.append({
                "section_id": section.section_id,
                "error": str(e)
            })

    # Schedule background task to calculate recommendations
    if updated_sections:
        background_tasks.add_task(
            calculate_recommendations_delayed,
            current_user.id
        )

    return {
        "chapter_id": progress_update.chapter_id,
        "updated_sections": updated_sections,
        "errors": errors,
        "total_updated": len(updated_sections),
        "total_errors": len(errors),
        "message": f"Updated {len(updated_sections)} sections successfully"
    }


@router.post("/section/{section_id}/complete")
@handle_errors
async def complete_section(
    chapter_id: str,
    section_id: str,
    time_spent: int = Query(0, ge=0, description="Time spent in minutes"),
    current_user: User = Depends(get_current_active_user),
    service: ReadingProgressService = Depends(get_progress_service)
) -> Dict[str, Any]:
    """Mark a section as completed."""
    progress = await service.mark_section_complete(
        user_id=current_user.id,
        chapter_id=chapter_id,
        section_id=section_id,
        time_spent_delta=time_spent
    )

    # Get updated chapter progress
    chapter_progress = await service.get_chapter_progress(current_user.id, chapter_id)

    # Log completion
    logger.info(
        "Section completed",
        user_id=current_user.id,
        chapter_id=chapter_id,
        section_id=section_id,
        position=100,
        time_spent=time_spent
    )

    return {
        "section_id": section_id,
        "chapter_id": chapter_id,
        "completed_at": progress.updated_at.isoformat(),
        "time_spent": time_spent,
        "chapter_progress": chapter_progress["overall_progress"],
        "sections_completed": chapter_progress["sections_completed"],
        "message": "Section marked as completed"
    }


@router.get("/restore/{chapter_id}")
@handle_errors
async def restore_progress(
    chapter_id: str,
    current_user: User = Depends(get_current_active_user),
    service: ReadingProgressService = Depends(get_progress_service)
) -> Dict[str, Any]:
    """Restore user's last position in a chapter."""
    restored = await service.restore_progress(current_user.id, chapter_id)

    if restored["section_id"]:
        # Update last accessed
        progress = await service.update_section_progress(
            user_id=current_user.id,
            chapter_id=chapter_id,
            section_id=restored["section_id"],
            position=restored["position"],
            time_spent_delta=0
        )

        logger.info(
            "Progress restored",
            user_id=current_user.id,
            chapter_id=chapter_id,
            section_id=restored["section_id"],
            position=restored["position"]
        )

    return restored


@router.get("/analytics")
@handle_errors
async def get_progress_analytics(
    timeframe: str = Query("month", regex="^(day|week|month|year)$"),
    current_user: User = Depends(get_current_active_user),
    service: ReadingProgressService = Depends(get_progress_service)
) -> Dict[str, Any]:
    """Get detailed reading analytics."""
    analytics = await service.get_reading_analytics(current_user.id, timeframe)

    # Add additional user-specific analytics
    personalization_service = PersonalizationService(service.db)
    personalization = await personalization_service.get_user_personalization(current_user.id)

    return {
        **analytics,
        "user_experience_level": personalization["experience_level"],
        "user_preferences": personalization["preferences"],
        "generated_at": datetime.utcnow().isoformat()
    }


@router.post("/bulk")
@handle_errors
async def bulk_update_progress(
    updates: List[ChapterProgressUpdate],
    background_tasks: BackgroundTasks,
    current_user: User = Depends(get_current_active_user),
    service: ReadingProgressService = Depends(get_progress_service)
) -> Dict[str, Any]:
    """Bulk update progress for multiple chapters."""
    results = []
    total_updated = 0
    total_errors = 0

    for chapter_update in updates:
        try:
            chapter_result = await update_progress(
                progress_update=chapter_update,
                background_tasks=background_tasks,
                current_user=current_user,
                service=service
            )
            results.append(chapter_result)
            total_updated += chapter_result["total_updated"]
            total_errors += chapter_result["total_errors"]
        except Exception as e:
            logger.error(
                "Failed to bulk update chapter progress",
                user_id=current_user.id,
                chapter_id=chapter_update.chapter_id,
                error=str(e)
            )
            results.append({
                "chapter_id": chapter_update.chapter_id,
                "updated_sections": [],
                "errors": [{"error": str(e)}],
                "total_updated": 0,
                "total_errors": 1
            })
            total_errors += 1

    return {
        "results": results,
        "summary": {
            "total_chapters": len(updates),
            "total_updated": total_updated,
            "total_errors": total_errors,
            "success_rate": (total_updated / (total_updated + total_errors)) * 100 if (total_updated + total_errors) > 0 else 0
        },
        "message": f"Bulk update completed: {total_updated} sections updated, {total_errors} errors"
    }


@router.delete("/chapter/{chapter_id}")
@handle_errors
async def reset_chapter_progress(
    chapter_id: str,
    current_user: User = Depends(get_current_active_user),
    db: Session = Depends(get_db)
) -> Dict[str, Any]:
    """Reset all progress for a specific chapter."""
    # Delete all progress records for this chapter
    deleted = db.query(ReadingProgress).filter(
        ReadingProgress.user_id == current_user.id,
        ReadingProgress.chapter_id == chapter_id
    ).delete()

    db.commit()

    logger.info(
        "Chapter progress reset",
        user_id=current_user.id,
        chapter_id=chapter_id,
        deleted_sections=deleted
    )

    return {
        "chapter_id": chapter_id,
        "deleted_sections": deleted,
        "message": f"Progress for chapter {chapter_id} has been reset"
    }


# Background task helper
async def calculate_recommendations_delayed(user_id: str):
    """Background task to calculate recommendations after progress update."""
    try:
        from src.services.personalization import PersonalizationService
        from src.database.base import SessionLocal

        db = SessionLocal()
        try:
            service = PersonalizationService(db)
            recommendations = await service.generate_recommendations(user_id, limit=5)

            logger.info(
                "Recommendations calculated",
                user_id=user_id,
                recommendations_count=len(recommendations)
            )
        finally:
            db.close()
    except Exception as e:
        logger.error(
            "Failed to calculate recommendations in background task",
            user_id=user_id,
            error=str(e)
        )
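
A client-side walkthrough of the session endpoints above, using httpx; the base URL, /api/v1 mount prefix, and bearer token are assumptions about the deployment, not values taken from this diff:

# Illustrative client for the /progress endpoints; base URL, /api/v1 prefix,
# and the JWT are assumptions, not taken from this repository.
import httpx

BASE = "http://localhost:7860/api/v1/progress"
HEADERS = {"Authorization": "Bearer <jwt>"}

with httpx.Client(headers=HEADERS) as client:
    client.post(f"{BASE}/session/start",
                json={"chapter_id": "ch1", "section_id": "ch1_intro"})
    client.post(f"{BASE}/session/end",
                json={"chapter_id": "ch1", "section_id": "ch1_intro",
                      "position": 100, "time_spent": 12})
    print(client.get(f"{BASE}/summary").json()["last_updated"])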
src/api/v1/reader_features.py
ADDED

@@ -0,0 +1,94 @@
"""
Reader features API routes v1.

API endpoints for progress tracking, bookmarks, preferences, and search.
"""

from fastapi import APIRouter, Depends, HTTPException, Query
from sqlalchemy.orm import Session
from typing import List, Optional

from src.database.base import get_db
from src.middleware.auth import get_current_active_user, require_user
from src.models.auth import User
from src.utils.errors import handle_errors, NotFoundError, ValidationError
from src.utils.logging import get_logger

logger = get_logger(__name__)

router = APIRouter(
    prefix="/reader-features",
    tags=["reader-features"]
)

# Health check endpoint for reader features
@router.get("/health")
async def health_check():
    """Health check for reader features API."""
    return {
        "status": "healthy",
        "service": "reader-features",
        "version": "1.0.0"
    }

# Placeholder endpoints - will be implemented in user stories
@router.get("/progress")
@handle_errors
async def get_progress_summary(
    current_user: User = Depends(get_current_active_user),
    db: Session = Depends(get_db)
):
    """Get user's overall reading progress summary."""
    # TODO: Implement in User Story 1
    raise HTTPException(status_code=501, detail="Not implemented yet")

@router.get("/bookmarks")
@handle_errors
async def get_bookmarks(
    limit: int = Query(50, ge=1, le=100),
    offset: int = Query(0, ge=0),
    current_user: User = Depends(get_current_active_user),
    db: Session = Depends(get_db)
):
    """Get user's bookmarks."""
    # TODO: Implement in User Story 4
    raise HTTPException(status_code=501, detail="Not implemented yet")

@router.get("/preferences")
@handle_errors
async def get_preferences(
    current_user: User = Depends(get_current_active_user),
    db: Session = Depends(get_db)
):
    """Get user's reading preferences."""
    # TODO: Implement in User Story 1
    raise HTTPException(status_code=501, detail="Not implemented yet")

@router.get("/search")
@handle_errors
async def search_content(
    q: str = Query(..., min_length=1, description="Search query"),
    language: Optional[str] = Query(None, description="Filter by language"),
    chapter: Optional[str] = Query(None, description="Filter by chapter"),
    current_user: User = Depends(get_current_active_user),
    db: Session = Depends(get_db)
):
    """Search content across all languages."""
    # TODO: Implement in User Story 3
    raise HTTPException(status_code=501, detail="Not implemented yet")

# Import all routers from individual feature modules
# These will be added as we implement each user story
# from .progress import router as progress_router
# from .bookmarks import router as bookmarks_router
# from .preferences import router as preferences_router
# from .search import router as search_router
# from .analytics import router as analytics_router

# Combine all routers
# api_router = APIRouter()
# api_router.include_router(progress_router, prefix="/progress", tags=["progress"])
# api_router.include_router(bookmarks_router, prefix="/bookmarks", tags=["bookmarks"])
# api_router.include_router(preferences_router, prefix="/preferences", tags=["preferences"])
# api_router.include_router(search_router, prefix="/search", tags=["search"])
# api_router.include_router(analytics_router, prefix="/analytics", tags=["analytics"])
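
The commented-out include_router block above sketches the eventual wiring; until then the single router can be mounted directly. A minimal sketch, assuming an /api/v1 prefix in main.py (the actual mount point is not shown in this diff):

# Sketch of mounting the router; the /api/v1 prefix is an assumption and
# the actual mount point in main.py is not part of this listing.
from fastapi import FastAPI
from src.api.v1.reader_features import router as reader_features_router

app = FastAPI()
app.include_router(reader_features_router, prefix="/api/v1")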
src/api/v1/translation.py
ADDED

@@ -0,0 +1,336 @@
"""
Translation API endpoints using OpenAI Agents SDK.

Provides RESTful endpoints for translating text from English to Urdu
using the OpenAI Agents SDK with Gemini API integration.
"""

from fastapi import APIRouter, Depends, HTTPException, Request
from fastapi.responses import JSONResponse
from fastapi import status
from typing import Optional, Dict, Any
import time

from src.services.openai_translation.translation_agent import OpenAITranslationAgent, TranslationContext
from src.services.openai_translation.client import get_gemini_client
from src.services.translation_cache import cache_service
from src.models.auth import User
from src.security.dependencies import get_current_user_or_anonymous

router = APIRouter(prefix="/translation", tags=["translation"])


@router.post("/translate", response_model=dict)
async def translate_text(
    request: dict,
    http_request: Request,
    current_user: Optional[User] = Depends(get_current_user_or_anonymous)
) -> JSONResponse:
    """
    Legacy translation endpoint (for backward compatibility).

    This endpoint uses the OpenAI Agents SDK with the improved agent implementation.

    Args:
        request: Translation request with text and parameters
        http_request: FastAPI request object
        current_user: Optional current user

    Returns:
        Translation result
    """
    try:
        # Extract request data
        text = request.get("text", "")
        source_language = request.get("source_language", "en")
        target_language = request.get("target_language", "ur")
        document_type = request.get("document_type")
        technical_domain = request.get("technical_domain")
        target_audience = request.get("target_audience")
        model = request.get("model", "gemini-2.0-flash-lite")

        # Create translation context
        context = TranslationContext(
            document_type=document_type,
            technical_domain=technical_domain,
            target_audience=target_audience
        )

        # Create agent and translate
        agent = OpenAITranslationAgent(
            gemini_client=get_gemini_client(),
            model=model
        )

        result = await agent.translate_with_agent(
            text=text,
            context=context,
            user_id=current_user.id if current_user else None
        )

        return JSONResponse(
            status_code=status.HTTP_200_OK,
            content={
                "job_id": f"translate_{int(time.time())}",
                "translated_text": result["translated_text"],
                "status": "completed",
                "progress": 100.0,
                "chunks": [],
                "processing_time_ms": 0,
                "cached": False,
                "input_tokens": result.get("tokens_used", 0),
                "output_tokens": 0,
                "estimated_cost_usd": 0.0,
                "confidence_score": result.get("confidence_score", 0.95)
            }
        )

    except Exception as e:
        return JSONResponse(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            content={
                "error": "TRANSLATION_ERROR",
                "message": "Failed to translate text"
            }
        )


@router.post("/translate/agent")
async def translate_with_agent(
    request: dict,
    http_request: Request,
    current_user: Optional[User] = Depends(get_current_user_or_anonymous)
) -> JSONResponse:
    """
    Translate text using OpenAI Agents SDK directly with caching.

    This endpoint uses the OpenAI Agents SDK for translation with enhanced
    context awareness and proper Runner.run pattern. Translations are cached
    for 1 week to avoid redundant API calls.

    Args:
        request: Translation request
        http_request: FastAPI request object
        current_user: Optional current user

    Returns:
        Translation result with detailed metadata
    """
    try:
        # Extract request parameters
        text = request.get("text", "")
        source_language = request.get("source_language", "en")
        target_language = request.get("target_language", "ur")
        page_url = request.get("page_url")
        model = request.get("model", "gemini-2.0-flash-lite")

        # Check cache first
        cached_result = await cache_service.get_cached_translation(
            text=text,
            source_language=source_language,
            target_language=target_language,
            page_url=page_url
        )

        if cached_result:
            return JSONResponse(
                status_code=status.HTTP_200_OK,
                content={
                    "translated_text": cached_result["translated_text"],
                    "original_text": cached_result["original_text"],
                    "cached": True,
                    "cache_created_at": cached_result["cache_created_at"],
                    "cache_expires_at": cached_result["cache_expires_at"],
                    "hit_count": cached_result["hit_count"],
                    "tokens_used": 0,  # No tokens used for cached result
                    "model": cached_result["model"],
                    "confidence_score": cached_result["confidence_score"],
                    "has_code_blocks": False,  # Would need to be stored in cache
                    "code_blocks": []  # Would need to be stored in cache
                }
            )

        # Not in cache, proceed with translation
        # Create translation context
        context = TranslationContext(
            page_url=page_url,
            document_type=request.get("document_type"),
            technical_domain=request.get("technical_domain"),
            target_audience=request.get("target_audience")
        )

        # Create agent and translate
        agent = OpenAITranslationAgent(
            gemini_client=get_gemini_client(),
            model=model
        )

        start_time = time.time()
        result = await agent.translate_with_agent(
            text=text,
            context=context,
            user_id=current_user.id if current_user else None
        )
        processing_time_ms = int((time.time() - start_time) * 1000)

        # Cache the translation result
        await cache_service.cache_translation(
            text=text,
            translated_text=result["translated_text"],
            source_language=source_language,
            target_language=target_language,
            model=result.get("model", model),
            confidence_score=result.get("confidence_score", 0.95),
            processing_time_ms=processing_time_ms,
            page_url=page_url
        )

        return JSONResponse(
            status_code=status.HTTP_200_OK,
            content={
                "translated_text": result["translated_text"],
                "original_text": result["original_text"],
                "cached": False,
                "tokens_used": result.get("tokens_used", 0),
                "model": result.get("model", model),
                "confidence_score": result.get("confidence_score", 0.95),
                "has_code_blocks": result.get("has_code_blocks", False),
                "code_blocks": result.get("code_blocks", []),
                "processing_time_ms": processing_time_ms
            }
        )

    except Exception as e:
        return JSONResponse(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            content={
                "error": "AGENT_TRANSLATION_ERROR",
                "message": "Failed to translate text using agent"
            }
        )


@router.get("/health")
async def health_check() -> JSONResponse:
    """
    Simple health check endpoint for translation service.

    Returns:
        Health status
    """
    return JSONResponse(
        status_code=status.HTTP_200_OK,
        content={
            "status": "healthy",
            "service": "translation",
            "version": "2.0.0",
            "features": ["openai_agents_sdk", "gemini_api", "translation_cache"]
        }
    )


@router.post("/cache/clear-expired")
async def clear_expired_cache(
    current_user: Optional[User] = Depends(get_current_user_or_anonymous)
) -> JSONResponse:
    """
    Clear expired cache entries.

    Returns:
        Number of cleared entries
    """
    try:
        cleared_count = await cache_service.clear_expired_cache()
        return JSONResponse(
            status_code=status.HTTP_200_OK,
            content={
                "message": f"Cleared {cleared_count} expired cache entries",
                "cleared_count": cleared_count
            }
        )
    except Exception as e:
        return JSONResponse(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            content={
                "error": "CACHE_CLEAR_ERROR",
                "message": "Failed to clear expired cache"
            }
        )


@router.post("/cache/clear-url")
async def clear_cache_by_url(
    request: dict,
    current_user: Optional[User] = Depends(get_current_user_or_anonymous)
) -> JSONResponse:
    """
    Clear cache entries for a specific URL.

    Args:
        request: Dict containing 'url' and optional 'source_language' and 'target_language'

    Returns:
        Number of cleared entries
    """
    try:
        url = request.get("url")
        if not url:
            return JSONResponse(
                status_code=status.HTTP_400_BAD_REQUEST,
                content={
                    "error": "INVALID_REQUEST",
                    "message": "URL is required"
                }
            )

        source_language = request.get("source_language")
        target_language = request.get("target_language")

        cleared_count = await cache_service.clear_cache_by_url(
            page_url=url,
            source_language=source_language,
            target_language=target_language
        )

        return JSONResponse(
            status_code=status.HTTP_200_OK,
            content={
                "message": f"Cleared {cleared_count} cache entries for URL",
                "url": url,
                "cleared_count": cleared_count
            }
        )
    except Exception as e:
        return JSONResponse(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            content={
                "error": "CACHE_CLEAR_URL_ERROR",
                "message": "Failed to clear cache for URL"
            }
        )


@router.get("/cache/stats")
async def get_cache_stats(
    current_user: Optional[User] = Depends(get_current_user_or_anonymous)
) -> JSONResponse:
    """
    Get translation cache statistics.

    Returns:
        Cache statistics
    """
    try:
        stats = await cache_service.get_cache_stats()
        return JSONResponse(
            status_code=status.HTTP_200_OK,
            content=stats
        )
    except Exception as e:
        return JSONResponse(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            content={
                "error": "CACHE_STATS_ERROR",
                "message": "Failed to retrieve cache statistics"
            }
        )
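
The /translate/agent handler above implements a cache-aside flow: check the cache, translate on a miss, write the result back. A repeat of the same request should therefore report a cache hit. An illustrative probe, with a placeholder base URL and /api/v1 prefix:

# Illustrative probe of the cache-aside flow; the URL and /api/v1 prefix are
# placeholders. The second identical request should report cached=True.
import httpx

payload = {"text": "Hello, world!", "source_language": "en",
           "target_language": "ur", "page_url": "https://example.com/docs"}

with httpx.Client(base_url="http://localhost:7860") as client:
    first = client.post("/api/v1/translation/translate/agent", json=payload)
    second = client.post("/api/v1/translation/translate/agent", json=payload)
    print(first.json()["cached"], second.json()["cached"])  # expected: False True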
src/config/logging_config.py
ADDED
|
@@ -0,0 +1,442 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
Production-ready logging configuration.

Configures structured logging with multiple handlers, sensitive data filtering,
and integration with monitoring systems.
"""

import asyncio
import sys
import logging
import logging.handlers
from typing import Dict, Any, List, Optional, Union
from pathlib import Path
from datetime import datetime
from contextvars import ContextVar

import structlog
from pythonjsonlogger import jsonlogger
from structlog import stdlib, configure

from .translation_config import get_config, LogLevel


# Context variables for request tracking
request_id: ContextVar[Optional[str]] = ContextVar('request_id', default=None)
user_id: ContextVar[Optional[str]] = ContextVar('user_id', default=None)
session_id: ContextVar[Optional[str]] = ContextVar('session_id', default=None)


class SensitiveDataFilter(logging.Filter):
    """Filter to mask sensitive data in log records."""

    def __init__(self, sensitive_fields: Optional[List[str]] = None, mask_char: str = "*"):
        super().__init__()
        self.sensitive_fields = [field.lower() for field in (sensitive_fields or [])]
        self.mask_char = mask_char

    def filter(self, record):
        """Filter sensitive data from the log record."""
        # Filter message
        if hasattr(record, 'msg') and record.msg:
            record.msg = self._mask_sensitive_data(str(record.msg))

        # Filter args
        if hasattr(record, 'args') and record.args:
            record.args = tuple(
                self._mask_sensitive_data(str(arg)) if isinstance(arg, str) else arg
                for arg in record.args
            )

        # Filter extra attributes (everything that is not a standard LogRecord field)
        for attr_name in dir(record):
            if not attr_name.startswith('_') and attr_name not in {
                'name', 'msg', 'args', 'levelname', 'levelno', 'pathname',
                'filename', 'module', 'lineno', 'funcName', 'created',
                'msecs', 'relativeCreated', 'thread', 'threadName',
                'processName', 'process', 'getMessage', 'exc_info',
                'exc_text', 'stack_info'
            }:
                attr_value = getattr(record, attr_name)
                if isinstance(attr_value, str):
                    setattr(record, attr_name, self._mask_sensitive_data(attr_value))

        return True

    def _mask_sensitive_data(self, text: str) -> str:
        """Mask sensitive data in text."""
        import re

        # General patterns
        patterns = [
            (r'(?i)(api[_-]?key["\']?\s*[:=]\s*["\']?)([\w\-\.]+)', lambda m: f"{m.group(1)}{self.mask_char * len(m.group(2))}"),
            (r'(?i)(password["\']?\s*[:=]\s*["\']?)([\w\-\.]+)', lambda m: f"{m.group(1)}{self.mask_char * len(m.group(2))}"),
            (r'(?i)(token["\']?\s*[:=]\s*["\']?)([\w\-\.]+)', lambda m: f"{m.group(1)}{self.mask_char * len(m.group(2))}"),
            (r'(?i)(secret["\']?\s*[:=]\s*["\']?)([\w\-\.]+)', lambda m: f"{m.group(1)}{self.mask_char * len(m.group(2))}"),
            (r'(?i)(authorization["\']?\s*[:=]\s*["\']?)([\w\-\.]+)', lambda m: f"{m.group(1)}{self.mask_char * len(m.group(2))}"),
            (r'(Bearer\s+)([\w\-\.]+)', lambda m: f"{m.group(1)}{self.mask_char * len(m.group(2))}"),
        ]

        # Custom field patterns
        for field in self.sensitive_fields:
            patterns.append(
                (rf'(?i)({field}["\']?\s*[:=]\s*["\']?)([\w\-\.]+)',
                 lambda m, f=field: f"{m.group(1)}{self.mask_char * len(m.group(2))}")
            )

        # Apply patterns
        for pattern, replacement in patterns:
            text = re.sub(pattern, replacement, text)

        return text


class ContextFilter(logging.Filter):
    """Add context information to log records."""

    def filter(self, record):
        """Copy the context variables onto the log record."""
        record.request_id = request_id.get()
        record.user_id = user_id.get()
        record.session_id = session_id.get()
        return True


class JSONFormatter(jsonlogger.JsonFormatter):
    """Custom JSON formatter with additional fields."""

    def add_fields(self, log_record, record, message_dict):
        """Add custom fields to the JSON log record."""
        super().add_fields(log_record, record, message_dict)

        # Add timestamp
        if not log_record.get('timestamp'):
            log_record['timestamp'] = datetime.utcnow().isoformat()

        # Add context
        if hasattr(record, 'request_id') and record.request_id:
            log_record['request_id'] = record.request_id
        if hasattr(record, 'user_id') and record.user_id:
            log_record['user_id'] = record.user_id
        if hasattr(record, 'session_id') and record.session_id:
            log_record['session_id'] = record.session_id

        # Add exception details
        if record.exc_info:
            log_record['exception'] = {
                'type': record.exc_info[0].__name__,
                'message': str(record.exc_info[1]),
                'traceback': self.formatException(record.exc_info)
            }

        # Add source location
        log_record['source'] = {
            'file': record.filename,
            'line': record.lineno,
            'function': record.funcName,
            'module': record.module
        }


class ColoredFormatter(logging.Formatter):
    """Colored formatter for console output."""

    COLORS = {
        'DEBUG': '\033[36m',     # Cyan
        'INFO': '\033[32m',      # Green
        'WARNING': '\033[33m',   # Yellow
        'ERROR': '\033[31m',     # Red
        'CRITICAL': '\033[35m',  # Magenta
        'RESET': '\033[0m'       # Reset
    }

    def format(self, record):
        """Format the log record with colors."""
        log_color = self.COLORS.get(record.levelname, self.COLORS['RESET'])
        reset = self.COLORS['RESET']

        # Add color to the level name
        record.levelname = f"{log_color}{record.levelname}{reset}"

        # Prefix the message with the request ID if present
        if hasattr(record, 'request_id') and record.request_id:
            record.msg = f"[{record.request_id[:8]}] {record.msg}"

        return super().format(record)


def setup_logging() -> None:
    """Set up logging configuration based on the environment."""
    config = get_config()

    # Get root logger
    root_logger = logging.getLogger()
    root_logger.setLevel(getattr(logging, config.logging.level.value))

    # Clear existing handlers
    root_logger.handlers.clear()

    # Create formatters
    if config.logging.json_format:
        formatter = JSONFormatter(
            '%(asctime)s %(name)s %(levelname)s %(message)s'
        )
    else:
        formatter = logging.Formatter(
            config.logging.format,
            datefmt='%Y-%m-%d %H:%M:%S'
        )

    # Console handler
    console_handler = logging.StreamHandler(sys.stdout)
    if config.logging.json_format:
        console_handler.setFormatter(formatter)
    else:
        console_handler.setFormatter(ColoredFormatter(config.logging.format))
    console_handler.addFilter(ContextFilter())
    root_logger.addHandler(console_handler)

    # File handler (if enabled)
    if config.logging.file_logging:
        setup_file_handler(root_logger, formatter, config)

    # Apply sensitive data filter
    if config.logging.filter_sensitive_data:
        sensitive_filter = SensitiveDataFilter(config.logging.sensitive_fields)
        for handler in root_logger.handlers:
            handler.addFilter(sensitive_filter)

    # Configure structlog; only the final renderer differs between the
    # JSON and console configurations.
    if config.logging.json_format:
        renderer = structlog.processors.JSONRenderer()
    else:
        renderer = structlog.dev.ConsoleRenderer()

    configure(
        processors=[
            structlog.stdlib.filter_by_level,
            structlog.stdlib.add_logger_name,
            structlog.stdlib.add_log_level,
            structlog.stdlib.PositionalArgumentsFormatter(),
            structlog.processors.TimeStamper(fmt="iso"),
            structlog.processors.StackInfoRenderer(),
            structlog.processors.format_exc_info,
            structlog.processors.UnicodeDecoder(),
            renderer,
        ],
        context_class=dict,
        logger_factory=stdlib.LoggerFactory(),
        wrapper_class=stdlib.BoundLogger,
        cache_logger_on_first_use=True,
    )

    # Log configuration (stdlib loggers do not accept arbitrary keyword
    # arguments, so the settings are interpolated into the message)
    logger = logging.getLogger(__name__)
    logger.info(
        "Logging configured: level=%s json_format=%s file_logging=%s filter_sensitive=%s",
        config.logging.level.value,
        config.logging.json_format,
        config.logging.file_logging,
        config.logging.filter_sensitive_data,
    )


def setup_file_handler(
    logger: logging.Logger,
    formatter: Union[logging.Formatter, JSONFormatter],
    config
) -> None:
    """Set up a file handler with rotation."""
    # Create logs directory
    log_path = Path(config.logging.file_path)
    log_path.parent.mkdir(parents=True, exist_ok=True)

    # Parse rotation settings (used by the timed fallback handler)
    when = "midnight"
    if config.logging.file_rotation.endswith(" day"):
        when = "midnight"
    elif config.logging.file_rotation.endswith(" hour"):
        when = "H"
    elif config.logging.file_rotation.endswith(" minute"):
        when = "M"

    # Parse backup count from retention
    backup_count = 30  # Default
    if "days" in config.logging.file_retention:
        backup_count = int(config.logging.file_retention.split()[0])

    # Create size-based rotating file handler
    try:
        file_handler = logging.handlers.RotatingFileHandler(
            filename=log_path,
            maxBytes=_parse_size(config.logging.max_file_size),
            backupCount=backup_count,
            encoding='utf-8'
        )
    except Exception:
        # Fallback to TimedRotatingFileHandler
        file_handler = logging.handlers.TimedRotatingFileHandler(
            filename=log_path,
            when=when,
            backupCount=backup_count,
            encoding='utf-8'
        )

    file_handler.setFormatter(formatter)
    file_handler.addFilter(ContextFilter())
    logger.addHandler(file_handler)


def _parse_size(size_str: str) -> int:
    """Parse a size string such as "100 MB" to bytes."""
    size_str = size_str.upper().strip()
    multipliers = {
        'GB': 1024 ** 3,
        'MB': 1024 ** 2,
        'KB': 1024,
        'B': 1,
    }

    # Check the longer units first so that "MB" is not misparsed as "B"
    for unit, multiplier in multipliers.items():
        if size_str.endswith(unit):
            return int(float(size_str[:-len(unit)].strip()) * multiplier)

    return int(size_str)


def bind_context(
    req_id: Optional[str] = None,
    usr_id: Optional[str] = None,
    sess_id: Optional[str] = None
) -> Dict[str, Any]:
    """Bind context variables for logging.

    The parameters are deliberately named differently from the module-level
    ContextVars so that they do not shadow them.
    """
    context = {}

    if req_id:
        request_id.set(req_id)
        context['request_id'] = req_id

    if usr_id:
        user_id.set(usr_id)
        context['user_id'] = usr_id

    if sess_id:
        session_id.set(sess_id)
        context['session_id'] = sess_id

    return context


def unbind_context() -> None:
    """Clear all context variables."""
    request_id.set(None)
    user_id.set(None)
    session_id.set(None)


class LogContext:
    """Context manager that temporarily binds log context variables."""

    def __init__(
        self,
        request_id: Optional[str] = None,
        user_id: Optional[str] = None,
        session_id: Optional[str] = None,
        **kwargs
    ):
        # Collect the requested context; binding happens in __enter__ so the
        # previous values can be captured first and restored on exit.
        self.context = {
            key: value
            for key, value in (
                ('request_id', request_id),
                ('user_id', user_id),
                ('session_id', session_id),
            )
            if value is not None
        }
        self.context.update(kwargs)
        self.old_context = {}

    def __enter__(self):
        # Store the old context, then bind the new values
        for key, value in self.context.items():
            var = globals().get(key)
            if isinstance(var, ContextVar):
                self.old_context[key] = var.get()
                var.set(value)

        return self.context

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore the old context
        for key, value in self.old_context.items():
            var = globals().get(key)
            if isinstance(var, ContextVar):
                var.set(value)


def log_function_call(func):
    """Decorator to log function calls."""
    import functools

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        logger = logging.getLogger(func.__module__)
        logger.debug(
            "Function called: %s (args=%d, kwargs=%s)",
            func.__name__, len(args), list(kwargs.keys())
        )
        try:
            result = func(*args, **kwargs)
            logger.debug("Function completed: %s", func.__name__)
            return result
        except Exception as e:
            logger.error(
                "Function failed: %s (%s: %s)",
                func.__name__, type(e).__name__, e
            )
            raise

    @functools.wraps(func)
    async def async_wrapper(*args, **kwargs):
        logger = logging.getLogger(func.__module__)
        logger.debug(
            "Async function called: %s (args=%d, kwargs=%s)",
            func.__name__, len(args), list(kwargs.keys())
        )
        try:
            result = await func(*args, **kwargs)
            logger.debug("Async function completed: %s", func.__name__)
            return result
        except Exception as e:
            logger.error(
                "Async function failed: %s (%s: %s)",
                func.__name__, type(e).__name__, e
            )
            raise

    return async_wrapper if asyncio.iscoroutinefunction(func) else wrapper


# Initialize logging on import
setup_logging()
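A minimal usage sketch (not part of the commit) of how the pieces above compose. The import path assumes this repository's layout, `translate_chunk` and the logger name are hypothetical, and importing the module runs setup_logging(), which in turn needs a valid configuration (e.g. GEMINI_API_KEY set):

import logging

from src.config.logging_config import LogContext, log_function_call

logger = logging.getLogger("translation.demo")

@log_function_call
def translate_chunk(text: str) -> str:
    # Placeholder body; a real caller would invoke the translation service.
    return text.upper()

with LogContext(request_id="req-1234", user_id="user-42"):
    # Records emitted inside this block carry request_id/user_id, because
    # ContextFilter copies the bound ContextVars onto each log record.
    logger.info("Handling translation request")
    translate_chunk("hello")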
src/config/translation_config.py
ADDED
|
@@ -0,0 +1,432 @@
"""
Translation Service Configuration Management.

Centralized configuration for the OpenAI Translation Service with
environment-based overrides and validation.
"""

import os
import json
import yaml
from typing import Dict, Any, Optional, Union, List
from dataclasses import dataclass, field, asdict
from pathlib import Path
from enum import Enum

from pydantic import BaseModel, Field, validator
from src.utils.translation_logger import get_translation_logger

logger = get_translation_logger(__name__)


class LogLevel(str, Enum):
    """Log levels for the translation service."""
    DEBUG = "DEBUG"
    INFO = "INFO"
    WARNING = "WARNING"
    ERROR = "ERROR"
    CRITICAL = "CRITICAL"


class Environment(str, Enum):
    """Environment types."""
    DEVELOPMENT = "development"
    TESTING = "testing"
    STAGING = "staging"
    PRODUCTION = "production"


class CacheBackend(str, Enum):
    """Cache backend types."""
    MEMORY = "memory"
    REDIS = "redis"
    DATABASE = "database"


@dataclass
class GeminiConfig:
    """Configuration for the Gemini API."""
    api_key: str = field(default_factory=lambda: os.getenv("GEMINI_API_KEY", ""))
    base_url: str = field(
        default_factory=lambda: os.getenv(
            "GEMINI_BASE_URL",
            "https://generativelanguage.googleapis.com/v1beta/openai/"
        )
    )
    default_model: str = field(
        default_factory=lambda: os.getenv("GEMINI_MODEL", "gemini-2.0-flash-lite")
    )
    organization: Optional[str] = field(default_factory=lambda: os.getenv("OPENAI_ORGANIZATION"))

    # Connection settings
    timeout: float = field(default_factory=lambda: float(os.getenv("GEMINI_TIMEOUT", "60")))
    max_retries: int = field(default_factory=lambda: int(os.getenv("GEMINI_MAX_RETRIES", "3")))
    retry_delay: float = field(default_factory=lambda: float(os.getenv("GEMINI_RETRY_DELAY", "1.0")))

    # Advanced settings
    proxy: Optional[str] = field(default_factory=lambda: os.getenv("HTTP_PROXY"))
    custom_headers: Dict[str, str] = field(default_factory=dict)
    http2: bool = field(default_factory=lambda: os.getenv("GEMINI_HTTP2", "true").lower() == "true")

    # Rate limiting
    requests_per_minute: int = field(default_factory=lambda: int(os.getenv("GEMINI_RPM", "60")))
    requests_per_hour: int = field(default_factory=lambda: int(os.getenv("GEMINI_RPH", "1000")))

    # Model pricing (USD per 1K tokens)
    pricing: Dict[str, Dict[str, float]] = field(default_factory=lambda: {
        "gemini-2.0-flash-lite": {"input": 0.000075, "output": 0.00015},
        "gemini-2.5-pro": {"input": 0.00125, "output": 0.00375}
    })


@dataclass
class OpenAIAgentsConfig:
    """Configuration for the OpenAI Agents SDK."""
    enabled: bool = field(default_factory=lambda: os.getenv("OPENAI_AGENTS_ENABLED", "true").lower() == "true")
    enable_tracing: bool = field(default_factory=lambda: os.getenv("OPENAI_AGENTS_TRACING", "false").lower() == "true")
    verbose_logging: bool = field(default_factory=lambda: os.getenv("OPENAI_AGENTS_VERBOSE", "false").lower() == "true")

    # Agent settings
    default_temperature: float = field(default_factory=lambda: float(os.getenv("AGENT_DEFAULT_TEMPERATURE", "0.3")))
    default_max_tokens: int = field(default_factory=lambda: int(os.getenv("AGENT_MAX_TOKENS", "2048")))
    max_turns: int = field(default_factory=lambda: int(os.getenv("AGENT_MAX_TURNS", "5")))

    # Tool settings
    enable_html_tool: bool = field(default_factory=lambda: os.getenv("AGENT_HTML_TOOL", "true").lower() == "true")
    enable_code_tool: bool = field(default_factory=lambda: os.getenv("AGENT_CODE_TOOL", "true").lower() == "true")
    enable_quality_tool: bool = field(default_factory=lambda: os.getenv("AGENT_QUALITY_TOOL", "true").lower() == "true")

    # Quality settings
    quality_check_enabled: bool = field(default_factory=lambda: os.getenv("AGENT_QUALITY_CHECK", "true").lower() == "true")
    confidence_threshold: float = field(default_factory=lambda: float(os.getenv("AGENT_CONFIDENCE_THRESHOLD", "0.8")))


@dataclass
class CacheConfig:
    """Configuration for caching."""
    backend: CacheBackend = field(
        default_factory=lambda: CacheBackend(os.getenv("CACHE_BACKEND", "memory"))
    )

    # TTL settings
    default_ttl_hours: int = field(default_factory=lambda: int(os.getenv("CACHE_DEFAULT_TTL", "168")))  # 7 days
    high_quality_ttl_hours: int = field(default_factory=lambda: int(os.getenv("CACHE_HIGH_QUALITY_TTL", "720")))  # 30 days
    low_quality_ttl_hours: int = field(default_factory=lambda: int(os.getenv("CACHE_LOW_QUALITY_TTL", "24")))  # 1 day

    # Redis settings
    redis_url: str = field(default_factory=lambda: os.getenv("REDIS_URL", "redis://localhost:6379"))
    redis_prefix: str = field(default_factory=lambda: os.getenv("REDIS_PREFIX", "translation:"))
    redis_max_connections: int = field(default_factory=lambda: int(os.getenv("REDIS_MAX_CONNECTIONS", "10")))

    # Memory cache settings
    memory_max_size: int = field(default_factory=lambda: int(os.getenv("CACHE_MEMORY_MAX_SIZE", "1000")))
    memory_cleanup_interval: int = field(default_factory=lambda: int(os.getenv("CACHE_CLEANUP_INTERVAL", "3600")))


@dataclass
class DatabaseConfig:
    """Configuration for database connections."""
    url: str = field(default_factory=lambda: os.getenv(
        "DATABASE_URL",
        "sqlite:///./translation.db"
    ))
    pool_size: int = field(default_factory=lambda: int(os.getenv("DB_POOL_SIZE", "5")))
    max_overflow: int = field(default_factory=lambda: int(os.getenv("DB_MAX_OVERFLOW", "10")))
    pool_timeout: int = field(default_factory=lambda: int(os.getenv("DB_POOL_TIMEOUT", "30")))
    pool_recycle: int = field(default_factory=lambda: int(os.getenv("DB_POOL_RECYCLE", "3600")))

    # Migration settings
    auto_migrate: bool = field(default_factory=lambda: os.getenv("DB_AUTO_MIGRATE", "true").lower() == "true")
    migration_timeout: int = field(default_factory=lambda: int(os.getenv("DB_MIGRATION_TIMEOUT", "300")))


@dataclass
class LoggingConfig:
    """Configuration for logging."""
    level: LogLevel = field(default_factory=lambda: LogLevel(os.getenv("LOG_LEVEL", "INFO")))
    format: str = field(
        default_factory=lambda: os.getenv(
            "LOG_FORMAT",
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
        )
    )

    # File logging
    file_logging: bool = field(default_factory=lambda: os.getenv("LOG_FILE_ENABLED", "true").lower() == "true")
    file_path: str = field(default_factory=lambda: os.getenv("LOG_FILE_PATH", "logs/translation.log"))
    file_rotation: str = field(default_factory=lambda: os.getenv("LOG_FILE_ROTATION", "1 day"))
    file_retention: str = field(default_factory=lambda: os.getenv("LOG_FILE_RETENTION", "30 days"))
    max_file_size: str = field(default_factory=lambda: os.getenv("LOG_MAX_FILE_SIZE", "100 MB"))

    # Structured logging
    json_format: bool = field(default_factory=lambda: os.getenv("LOG_JSON_FORMAT", "false").lower() == "true")
    include_request_id: bool = field(default_factory=lambda: os.getenv("LOG_INCLUDE_REQUEST_ID", "true").lower() == "true")

    # Sensitive data filtering
    filter_sensitive_data: bool = field(default_factory=lambda: os.getenv("LOG_FILTER_SENSITIVE", "true").lower() == "true")
    sensitive_fields: List[str] = field(default_factory=lambda: [
        "api_key", "password", "token", "authorization"
    ])


@dataclass
class RateLimitConfig:
    """Configuration for rate limiting."""
    enabled: bool = field(default_factory=lambda: os.getenv("RATE_LIMIT_ENABLED", "true").lower() == "true")

    # Global limits
    requests_per_minute: int = field(default_factory=lambda: int(os.getenv("RATE_LIMIT_RPM", "60")))
    requests_per_hour: int = field(default_factory=lambda: int(os.getenv("RATE_LIMIT_RPH", "1000")))
    requests_per_day: int = field(default_factory=lambda: int(os.getenv("RATE_LIMIT_RPD", "10000")))

    # Translation-specific limits
    translation_rpm: int = field(default_factory=lambda: int(os.getenv("TRANSLATION_RPM", "10")))
    translation_rph: int = field(default_factory=lambda: int(os.getenv("TRANSLATION_RPH", "500")))

    # Enforcement
    block_duration: int = field(default_factory=lambda: int(os.getenv("RATE_LIMIT_BLOCK_DURATION", "3600")))
    warning_threshold: float = field(default_factory=lambda: float(os.getenv("RATE_LIMIT_WARNING_THRESHOLD", "0.8")))

    # Redis backend for distributed limiting
    redis_backend: bool = field(default_factory=lambda: os.getenv("RATE_LIMIT_REDIS", "false").lower() == "true")


@dataclass
class SecurityConfig:
    """Configuration for security settings."""
    # API key validation
    require_api_key: bool = field(default_factory=lambda: os.getenv("SECURITY_REQUIRE_API_KEY", "false").lower() == "true")
    api_key_header: str = field(default_factory=lambda: os.getenv("SECURITY_API_KEY_HEADER", "X-API-Key"))

    # Request validation
    max_text_length: int = field(default_factory=lambda: int(os.getenv("SECURITY_MAX_TEXT_LENGTH", "100000")))
    max_chunks: int = field(default_factory=lambda: int(os.getenv("SECURITY_MAX_CHUNKS", "100")))

    # CORS settings
    cors_origins: List[str] = field(default_factory=lambda: os.getenv("CORS_ORIGINS", "*").split(","))
    cors_methods: List[str] = field(default_factory=lambda: os.getenv("CORS_METHODS", "GET,POST").split(","))
    cors_headers: List[str] = field(default_factory=lambda: os.getenv("CORS_HEADERS", "*").split(","))

    # Content filtering
    enable_content_filter: bool = field(default_factory=lambda: os.getenv("SECURITY_CONTENT_FILTER", "true").lower() == "true")
    blocked_patterns: List[str] = field(default_factory=lambda: os.getenv(
        "SECURITY_BLOCKED_PATTERNS",
        ""
    ).split(",") if os.getenv("SECURITY_BLOCKED_PATTERNS") else [])

    # IP-based restrictions (empty env vars must yield empty lists, not [""])
    ip_whitelist: List[str] = field(default_factory=lambda: [
        ip.strip() for ip in os.getenv("SECURITY_IP_WHITELIST", "").split(",") if ip.strip()
    ])
    ip_blacklist: List[str] = field(default_factory=lambda: [
        ip.strip() for ip in os.getenv("SECURITY_IP_BLACKLIST", "").split(",") if ip.strip()
    ])


@dataclass
class MonitoringConfig:
    """Configuration for monitoring and metrics."""
    enabled: bool = field(default_factory=lambda: os.getenv("MONITORING_ENABLED", "true").lower() == "true")

    # Metrics
    metrics_endpoint: str = field(default_factory=lambda: os.getenv("METRICS_ENDPOINT", "/metrics"))
    metrics_port: int = field(default_factory=lambda: int(os.getenv("METRICS_PORT", "9090")))

    # Health checks
    health_endpoint: str = field(default_factory=lambda: os.getenv("HEALTH_ENDPOINT", "/health"))
    detailed_health: bool = field(default_factory=lambda: os.getenv("HEALTH_DETAILED", "true").lower() == "true")

    # Performance tracking
    track_performance: bool = field(default_factory=lambda: os.getenv("TRACK_PERFORMANCE", "true").lower() == "true")
    slow_query_threshold_ms: int = field(default_factory=lambda: int(os.getenv("SLOW_QUERY_THRESHOLD", "1000")))

    # Error tracking
    track_errors: bool = field(default_factory=lambda: os.getenv("TRACK_ERRORS", "true").lower() == "true")
    error_sample_rate: float = field(default_factory=lambda: float(os.getenv("ERROR_SAMPLE_RATE", "1.0")))

    # External integrations
    sentry_dsn: Optional[str] = field(default_factory=lambda: os.getenv("SENTRY_DSN"))
    prometheus_gateway: Optional[str] = field(default_factory=lambda: os.getenv("PROMETHEUS_GATEWAY"))


class TranslationConfig(BaseModel):
    """Main configuration for the translation service."""
    environment: Environment = Field(default=Environment.DEVELOPMENT)
    debug: bool = Field(default=False)

    # Component configurations
    gemini: GeminiConfig = Field(default_factory=GeminiConfig)
    openai_agents: OpenAIAgentsConfig = Field(default_factory=OpenAIAgentsConfig)
    cache: CacheConfig = Field(default_factory=CacheConfig)
    database: DatabaseConfig = Field(default_factory=DatabaseConfig)
    logging: LoggingConfig = Field(default_factory=LoggingConfig)
    rate_limit: RateLimitConfig = Field(default_factory=RateLimitConfig)
    security: SecurityConfig = Field(default_factory=SecurityConfig)
    monitoring: MonitoringConfig = Field(default_factory=MonitoringConfig)

    # Feature flags
    features: Dict[str, bool] = Field(default_factory=lambda: {
        "streaming": True,
        "quality_check": True,
        "chunking": True,
        "code_preservation": True,
        "html_preservation": True,
        "batch_translation": True
    })

    class Config:
        # Note: env_file is only honored by pydantic's BaseSettings; the
        # component dataclasses above read the environment via os.getenv.
        env_file = ".env"
        env_file_encoding = "utf-8"
        case_sensitive = False

    @validator("environment", pre=True)
    def parse_environment(cls, v):
        """Parse the environment from a string."""
        if isinstance(v, str):
            return Environment(v.lower())
        return v

    def __init__(self, **data):
        """Initialize configuration with environment detection."""
        # Auto-detect environment if not specified
        if "environment" not in data:
            env = os.getenv("ENVIRONMENT", os.getenv("ENV", "development")).lower()
            data["environment"] = Environment(env)

        # Set debug flag based on environment
        if "debug" not in data:
            data["debug"] = data["environment"] == Environment.DEVELOPMENT

        super().__init__(**data)

        # Validate configuration
        self.validate_config()

    def validate_config(self) -> None:
        """Validate the configuration."""
        errors = []

        # Validate Gemini configuration
        if not self.gemini.api_key:
            errors.append("GEMINI_API_KEY is required")

        if self.gemini.timeout <= 0:
            errors.append("GEMINI_TIMEOUT must be positive")

        if self.gemini.max_retries < 0:
            errors.append("GEMINI_MAX_RETRIES must be non-negative")

        # Validate database URL if provided
        if self.database.url and not self.database.url.startswith(("sqlite://", "postgresql://", "mysql://")):
            errors.append("DATABASE_URL must be a valid database connection string")

        # Validate cache configuration
        if self.cache.backend == CacheBackend.REDIS and not self.cache.redis_url:
            errors.append("REDIS_URL is required when using the Redis cache backend")

        # Validate rate limits
        if self.rate_limit.requests_per_minute <= 0:
            errors.append("RATE_LIMIT_RPM must be positive")

        # Log errors and raise if any
        if errors:
            for error in errors:
                logger.error(f"Configuration validation error: {error}")
            raise ValueError(f"Configuration validation failed: {'; '.join(errors)}")

        logger.info("Configuration validated successfully", environment=self.environment.value)

    @classmethod
    def from_file(cls, config_path: Union[str, Path]) -> "TranslationConfig":
        """Load configuration from a file."""
        config_path = Path(config_path)

        if not config_path.exists():
            raise FileNotFoundError(f"Configuration file not found: {config_path}")

        # Parse based on file extension
        with open(config_path, "r", encoding="utf-8") as f:
            if config_path.suffix.lower() in [".yaml", ".yml"]:
                data = yaml.safe_load(f)
            elif config_path.suffix.lower() == ".json":
                data = json.load(f)
            else:
                raise ValueError(f"Unsupported configuration file format: {config_path.suffix}")

        # Environment variables still apply to any field missing from the file,
        # via the dataclass default factories
        return cls(**data)

    def to_dict(self) -> Dict[str, Any]:
        """Convert the configuration to a dictionary."""
        return {
            "environment": self.environment.value,
            "debug": self.debug,
            "gemini": asdict(self.gemini),
            "openai_agents": asdict(self.openai_agents),
            "cache": asdict(self.cache),
            "database": asdict(self.database),
            "logging": {
                **asdict(self.logging),
                "level": self.logging.level.value
            },
            "rate_limit": asdict(self.rate_limit),
            "security": asdict(self.security),
            "monitoring": asdict(self.monitoring),
            "features": self.features
        }

    def save_to_file(self, config_path: Union[str, Path]) -> None:
        """Save the configuration to a file."""
        config_path = Path(config_path)
        config_path.parent.mkdir(parents=True, exist_ok=True)

        data = self.to_dict()

        with open(config_path, "w", encoding="utf-8") as f:
            if config_path.suffix.lower() in [".yaml", ".yml"]:
                yaml.dump(data, f, default_flow_style=False, indent=2)
            elif config_path.suffix.lower() == ".json":
                json.dump(data, f, indent=2)
            else:
                raise ValueError(f"Unsupported configuration file format: {config_path.suffix}")

        logger.info(f"Configuration saved to {config_path}")

    def get_model_pricing(self, model: str) -> Dict[str, float]:
        """Get pricing for a specific model."""
        return self.gemini.pricing.get(model, self.gemini.pricing["gemini-2.0-flash-lite"])

    def is_feature_enabled(self, feature: str) -> bool:
        """Check whether a feature is enabled."""
        return self.features.get(feature, False)

    def should_use_agents(self) -> bool:
        """Determine whether the OpenAI Agents SDK should be used."""
        return self.openai_agents.enabled and self.is_feature_enabled("quality_check")


# Global configuration instance
_config: Optional[TranslationConfig] = None


def get_config() -> TranslationConfig:
    """Get the global configuration instance."""
    global _config
    if _config is None:
        _config = TranslationConfig()
    return _config


def load_config(config_path: Optional[Union[str, Path]] = None) -> TranslationConfig:
    """Load configuration from a file or the environment."""
    global _config

    if config_path:
        _config = TranslationConfig.from_file(config_path)
    else:
        _config = TranslationConfig()

    return _config


def reload_config() -> TranslationConfig:
    """Reload configuration from the environment."""
    global _config
    _config = TranslationConfig()
    return _config
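A minimal usage sketch (not part of the commit) of the configuration API above. The GEMINI_API_KEY value is a placeholder set only so validate_config() does not raise, the YAML path is hypothetical, and the file round-trip assumes pydantic coerces the nested dicts back into the component dataclasses:

import os
os.environ.setdefault("GEMINI_API_KEY", "placeholder-key")

from src.config.translation_config import get_config, load_config

config = get_config()                           # singleton built from env vars
print(config.environment.value)                 # e.g. "development"
print(config.gemini.default_model)              # "gemini-2.0-flash-lite" by default
print(config.is_feature_enabled("chunking"))    # True

# Persist the resolved settings and load them back later
config.save_to_file("config/translation.yaml")
config = load_config("config/translation.yaml")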
src/database/base.py
CHANGED
@@ -7,7 +7,7 @@ from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import sessionmaker
 import os
 
-# Create the declarative base
+# Create the declarative base - this will be the single source of truth
 Base = declarative_base()
 
 # Database URL from environment
src/middleware/auth.py
ADDED
|
@@ -0,0 +1,302 @@
"""
Authentication middleware for API routes.

This module provides JWT-based authentication middleware for protecting API endpoints.
"""

import os
from datetime import datetime, timedelta
from typing import Optional, Dict, Any

from fastapi import HTTPException, status, Depends
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from jose import JWTError, jwt
from passlib.context import CryptContext
from sqlalchemy.orm import Session

from src.database.base import get_db
from src.models.auth import User

# Configuration (read from the environment; the fallback is for local development only)
SECRET_KEY = os.getenv("JWT_SECRET_KEY", "your-secret-key-here")
ALGORITHM = "HS256"
ACCESS_TOKEN_EXPIRE_MINUTES = 30

# Password hashing
pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")

# Security scheme for FastAPI
security = HTTPBearer(auto_error=False)


def verify_password(plain_password: str, hashed_password: str) -> bool:
    """Verify a password against its hash."""
    return pwd_context.verify(plain_password, hashed_password)


def get_password_hash(password: str) -> str:
    """Generate a password hash."""
    return pwd_context.hash(password)


def create_access_token(data: dict, expires_delta: Optional[timedelta] = None) -> str:
    """Create a JWT access token."""
    to_encode = data.copy()
    if expires_delta:
        expire = datetime.utcnow() + expires_delta
    else:
        expire = datetime.utcnow() + timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES)

    to_encode.update({"exp": expire})
    encoded_jwt = jwt.encode(to_encode, SECRET_KEY, algorithm=ALGORITHM)
    return encoded_jwt


def decode_token(token: str) -> Dict[str, Any]:
    """Decode and validate a JWT token."""
    try:
        payload = jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
        return payload
    except JWTError as e:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail=f"Could not validate credentials: {str(e)}",
            headers={"WWW-Authenticate": "Bearer"},
        )


async def get_current_user(
    credentials: Optional[HTTPAuthorizationCredentials] = Depends(security),
    db: Session = Depends(get_db)
) -> User:
    """Get the current authenticated user."""
    if not credentials:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Not authenticated",
            headers={"WWW-Authenticate": "Bearer"},
        )

    token = credentials.credentials
    payload = decode_token(token)

    user_id: Optional[str] = payload.get("sub")
    if user_id is None:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Could not validate credentials",
            headers={"WWW-Authenticate": "Bearer"},
        )

    user = db.query(User).filter(User.id == user_id).first()
    if user is None:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="User not found",
            headers={"WWW-Authenticate": "Bearer"},
        )

    return user


async def get_current_active_user(
    current_user: User = Depends(get_current_user)
) -> User:
    """Get the current active user."""
    if not current_user.is_active:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Inactive user"
        )
    return current_user


async def get_optional_current_user(
    credentials: Optional[HTTPAuthorizationCredentials] = Depends(security),
    db: Session = Depends(get_db)
) -> Optional[User]:
    """Get the current user if authenticated, otherwise return None."""
    if not credentials:
        return None

    try:
        token = credentials.credentials
        payload = decode_token(token)

        user_id: Optional[str] = payload.get("sub")
        if user_id is None:
            return None

        user = db.query(User).filter(User.id == user_id).first()
        return user if user and user.is_active else None

    except HTTPException:
        return None


# Role-based access control
class RoleChecker:
    """Check whether the user has a required role."""

    def __init__(self, allowed_roles: list):
        self.allowed_roles = allowed_roles

    def __call__(self, current_user: User = Depends(get_current_active_user)) -> User:
        if current_user.role not in self.allowed_roles:
            raise HTTPException(
                status_code=status.HTTP_403_FORBIDDEN,
                detail="Not enough permissions"
            )
        return current_user


# Pre-defined role checkers
require_admin = RoleChecker(["admin"])
require_user = RoleChecker(["user", "admin"])


# Authentication dependencies
def authenticate_user(email: str, password: str, db: Session) -> Optional[User]:
    """Authenticate a user with email and password."""
    user = db.query(User).filter(User.email == email).first()
    if not user:
        return None
    if not verify_password(password, user.hashed_password):
        return None
    return user


# Rate limiting middleware
from slowapi import Limiter
from slowapi.util import get_remote_address
from slowapi.errors import RateLimitExceeded

limiter = Limiter(key_func=get_remote_address)


class RateLimitMiddleware:
    """Rate limiting decorator for API endpoints."""

    def __init__(self, times: int, seconds: int):
        # The limits notation used by slowapi supports second/minute/hour
        # granularities, not milliseconds, so windows are expressed in seconds.
        self.times = times
        self.seconds = seconds

    def __call__(self, endpoint):
        return limiter.limit(f"{self.times}/{self.seconds} second")(endpoint)


# Pre-defined rate limiters
auth_rate_limit = RateLimitMiddleware(5, 60)       # 5 requests per minute
general_rate_limit = RateLimitMiddleware(100, 60)  # 100 requests per minute
upload_rate_limit = RateLimitMiddleware(10, 60)    # 10 requests per minute


# CORS middleware configuration
from fastapi.middleware.cors import CORSMiddleware

def create_cors_middleware(app, allow_origins: list = None) -> None:
    """Attach CORS middleware to the app with the specified origins.

    CORSMiddleware needs the ASGI app as its first argument, so it is
    registered via app.add_middleware() rather than instantiated directly.
    """
    app.add_middleware(
        CORSMiddleware,
        allow_origins=allow_origins or ["http://localhost:3000"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )


# Request logging middleware
import logging
import time
from fastapi import Request, Response

logger = logging.getLogger(__name__)

async def log_requests(request: Request, call_next):
    """Log all API requests with timing."""
    start_time = time.time()

    # Get client IP
    client_ip = request.client.host if request.client else "unknown"

    # Get user if authenticated
    user = getattr(request.state, 'user', None)
    user_id = user.id if user else "anonymous"

    # Log request (the Authorization header is redacted so tokens never hit the logs)
    headers = {k: v for k, v in request.headers.items() if k.lower() != "authorization"}
    logger.info(
        "Request started",
        extra={
            "method": request.method,
            "url": str(request.url),
            "client_ip": client_ip,
            "user_id": user_id,
            "headers": headers,
        }
    )

    # Process request
    response = await call_next(request)

    # Calculate duration
    process_time = time.time() - start_time

    # Log response
    logger.info(
        "Request completed",
        extra={
            "method": request.method,
            "url": str(request.url),
            "status_code": response.status_code,
            "process_time": process_time,
            "client_ip": client_ip,
            "user_id": user_id,
        }
    )

    # Add timing header
    response.headers["X-Process-Time"] = str(process_time)

    return response


# Security headers middleware
async def add_security_headers(request: Request, call_next):
    """Add security headers to responses."""
    response = await call_next(request)

    # Add security headers
    response.headers["X-Content-Type-Options"] = "nosniff"
    response.headers["X-Frame-Options"] = "DENY"
    response.headers["X-XSS-Protection"] = "1; mode=block"
    response.headers["Strict-Transport-Security"] = "max-age=31536000; includeSubDomains"
    response.headers["Referrer-Policy"] = "strict-origin-when-cross-origin"
    response.headers["Content-Security-Policy"] = "default-src 'self'"

    return response


# Token refresh endpoint dependencies
def create_refresh_token(data: dict) -> str:
    """Create a refresh token with longer expiry."""
    to_encode = data.copy()
    expire = datetime.utcnow() + timedelta(days=7)  # 7 days
    to_encode.update({"exp": expire, "type": "refresh"})
    return jwt.encode(to_encode, SECRET_KEY, algorithm=ALGORITHM)


async def verify_refresh_token(token: str) -> Dict[str, Any]:
    """Verify a refresh token and return its payload."""
    try:
        payload = decode_token(token)
        if payload.get("type") != "refresh":
            raise HTTPException(
                status_code=status.HTTP_401_UNAUTHORIZED,
                detail="Invalid refresh token"
            )
        return payload
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail=f"Could not validate refresh token: {str(e)}"
        )
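A minimal usage sketch (not part of the commit) showing how the dependencies above would protect routes; the router prefix, paths, and the "some-user-id" value are illustrative:

from fastapi import APIRouter, Depends

from src.middleware.auth import (
    create_access_token,
    get_current_active_user,
    require_admin,
)
from src.models.auth import User

router = APIRouter(prefix="/api/v1")

@router.get("/me")
async def read_me(current_user: User = Depends(get_current_active_user)):
    # Any request carrying a valid bearer token for an active user reaches here.
    return {"id": current_user.id, "email": current_user.email}

@router.delete("/users/{user_id}")
async def delete_user(user_id: str, admin: User = Depends(require_admin)):
    # Only users whose role is "admin" pass the RoleChecker dependency.
    return {"deleted": user_id}

# Issuing a token after a successful login:
token = create_access_token({"sub": "some-user-id"})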
src/middleware/cors.py
ADDED
|
@@ -0,0 +1,356 @@
| 1 |
+
"""
|
| 2 |
+
CORS middleware configuration for frontend-backend communication.
|
| 3 |
+
|
| 4 |
+
Provides configurable Cross-Origin Resource Sharing middleware.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
from typing import List, Optional
|
| 9 |
+
|
| 10 |
+
from fastapi import FastAPI, Request, Response
|
| 11 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 12 |
+
from fastapi.middleware.base import BaseHTTPMiddleware
|
| 13 |
+
from starlette.middleware.base import RequestResponseEndpoint
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class CustomCORSMiddleware(BaseHTTPMiddleware):
|
| 17 |
+
"""Custom CORS middleware with additional security features."""
|
| 18 |
+
|
| 19 |
+
def __init__(
|
| 20 |
+
self,
|
| 21 |
+
app: FastAPI,
|
| 22 |
+
allow_origins: List[str] = None,
|
| 23 |
+
allow_methods: List[str] = None,
|
| 24 |
+
allow_headers: List[str] = None,
|
| 25 |
+
expose_headers: List[str] = None,
|
| 26 |
+
allow_credentials: bool = True,
|
| 27 |
+
max_age: int = 86400, # 24 hours
|
| 28 |
+
strict_mode: bool = False
|
| 29 |
+
):
|
| 30 |
+
super().__init__(app)
|
| 31 |
+
self.allow_origins = allow_origins or self._get_default_origins()
|
| 32 |
+
self.allow_methods = allow_methods or ["GET", "POST", "PUT", "DELETE", "OPTIONS", "PATCH"]
|
| 33 |
+
self.allow_headers = allow_headers or ["*"]
|
| 34 |
+
self.expose_headers = expose_headers or []
|
| 35 |
+
self.allow_credentials = allow_credentials
|
| 36 |
+
self.max_age = max_age
|
| 37 |
+
self.strict_mode = strict_mode
|
| 38 |
+
|
| 39 |
+
# Apply FastAPI's CORS middleware
|
| 40 |
+
app.add_middleware(
|
| 41 |
+
CORSMiddleware,
|
| 42 |
+
allow_origins=self.allow_origins,
|
| 43 |
+
allow_credentials=self.allow_credentials,
|
| 44 |
+
allow_methods=self.allow_methods,
|
| 45 |
+
allow_headers=self.allow_headers,
|
| 46 |
+
expose_headers=self.expose_headers,
|
| 47 |
+
max_age=self.max_age
|
| 48 |
+
)
|
| 49 |
+
|
| 50 |
+
def _get_default_origins(self) -> List[str]:
|
| 51 |
+
"""Get default allowed origins from environment."""
|
| 52 |
+
env_origins = os.getenv("CORS_ORIGINS", "")
|
| 53 |
+
if env_origins:
|
| 54 |
+
return [origin.strip() for origin in env_origins.split(",")]
|
| 55 |
+
|
| 56 |
+
# Default origins for development
|
| 57 |
+
default_origins = [
|
| 58 |
+
"http://localhost:3000",
|
| 59 |
+
"http://localhost:3001",
|
| 60 |
+
"http://127.0.0.1:3000",
|
| 61 |
+
"http://127.0.0.1:3001",
|
| 62 |
+
]
|
| 63 |
+
|
| 64 |
+
# Add production URL if available
|
| 65 |
+
if os.getenv("FRONTEND_URL"):
|
| 66 |
+
default_origins.append(os.getenv("FRONTEND_URL"))
|
| 67 |
+
|
| 68 |
+
return default_origins
|
| 69 |
+
|
| 70 |
+
async def dispatch(self, request: Request, call_next: RequestResponseEndpoint) -> Response:
|
| 71 |
+
"""Add additional CORS security features."""
|
| 72 |
+
|
| 73 |
+
# Handle preflight requests
|
| 74 |
+
if request.method == "OPTIONS":
|
| 75 |
+
# Add additional security headers for preflight
|
| 76 |
+
response = await call_next(request)
|
| 77 |
+
else:
|
| 78 |
+
response = await call_next(request)
|
| 79 |
+
|
| 80 |
+
# Add security headers
|
| 81 |
+
self._add_security_headers(request, response)
|
| 82 |
+
|
| 83 |
+
# Log CORS requests in strict mode
|
| 84 |
+
if self.strict_mode:
|
| 85 |
+
self._log_cors_request(request, response)
|
| 86 |
+
|
| 87 |
+
return response
|
| 88 |
+
|
| 89 |
+
def _add_security_headers(self, request: Request, response: Response):
|
| 90 |
+
"""Add additional security headers."""
|
| 91 |
+
# Mask server identification (the header is overwritten, not removed)
|
| 92 |
+
response.headers["Server"] = ""
|
| 93 |
+
|
| 94 |
+
# CSP header (Content Security Policy)
|
| 95 |
+
csp_directives = [
|
| 96 |
+
"default-src 'self'",
|
| 97 |
+
"script-src 'self' 'unsafe-inline' 'unsafe-eval'",
|
| 98 |
+
"style-src 'self' 'unsafe-inline'",
|
| 99 |
+
"img-src 'self' data: https:",
|
| 100 |
+
"font-src 'self' data:",
|
| 101 |
+
"connect-src 'self'",
|
| 102 |
+
"frame-ancestors 'none'",
|
| 103 |
+
"base-uri 'self'",
|
| 104 |
+
"form-action 'self'",
|
| 105 |
+
]
|
| 106 |
+
response.headers["Content-Security-Policy"] = "; ".join(csp_directives)
|
| 107 |
+
|
| 108 |
+
# Additional security headers
|
| 109 |
+
response.headers["X-Content-Type-Options"] = "nosniff"
|
| 110 |
+
response.headers["X-Frame-Options"] = "DENY"
|
| 111 |
+
response.headers["X-XSS-Protection"] = "1; mode=block"
|
| 112 |
+
response.headers["Referrer-Policy"] = "strict-origin-when-cross-origin"
|
| 113 |
+
|
| 114 |
+
# HSTS (only in production with HTTPS)
|
| 115 |
+
if os.getenv("ENVIRONMENT") == "production" and request.url.scheme == "https":
|
| 116 |
+
response.headers["Strict-Transport-Security"] = "max-age=31536000; includeSubDomains"
|
| 117 |
+
|
| 118 |
+
# Permissions Policy
|
| 119 |
+
permissions_policy = [
|
| 120 |
+
"geolocation=()",
|
| 121 |
+
"microphone=()",
|
| 122 |
+
"camera=()",
|
| 123 |
+
"payment=()",
|
| 124 |
+
"usb=()",
|
| 125 |
+
"magnetometer=()",
|
| 126 |
+
"gyroscope=()",
|
| 127 |
+
"accelerometer=()",
|
| 128 |
+
]
|
| 129 |
+
response.headers["Permissions-Policy"] = ", ".join(permissions_policy)
|
| 130 |
+
|
| 131 |
+
def _log_cors_request(self, request: Request, response: Response):
|
| 132 |
+
"""Log CORS-related requests for monitoring."""
|
| 133 |
+
from src.utils.logging import get_logger
|
| 134 |
+
|
| 135 |
+
logger = get_logger("cors")
|
| 136 |
+
|
| 137 |
+
origin = request.headers.get("origin")
|
| 138 |
+
if origin:
|
| 139 |
+
if origin not in self.allow_origins:
|
| 140 |
+
logger.warning(
|
| 141 |
+
"Cross-origin request from unauthorized origin",
|
| 142 |
+
origin=origin,
|
| 143 |
+
path=request.url.path,
|
| 144 |
+
method=request.method,
|
| 145 |
+
)
|
| 146 |
+
else:
|
| 147 |
+
logger.info(
|
| 148 |
+
"Cross-origin request allowed",
|
| 149 |
+
origin=origin,
|
| 150 |
+
path=request.url.path,
|
| 151 |
+
method=request.method,
|
| 152 |
+
)
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
class RateLimitCORSMiddleware(BaseHTTPMiddleware):
|
| 156 |
+
"""CORS middleware with rate limiting per origin."""
|
| 157 |
+
|
| 158 |
+
def __init__(
|
| 159 |
+
self,
|
| 160 |
+
app: FastAPI,
|
| 161 |
+
requests_per_minute: int = 100,
|
| 162 |
+
burst_size: int = 200
|
| 163 |
+
):
|
| 164 |
+
super().__init__(app)
|
| 165 |
+
self.requests_per_minute = requests_per_minute
|
| 166 |
+
self.burst_size = burst_size
|
| 167 |
+
self.request_counts = {} # Simple in-memory tracking
|
| 168 |
+
|
| 169 |
+
async def dispatch(self, request: Request, call_next: RequestResponseEndpoint) -> Response:
|
| 170 |
+
"""Apply rate limiting based on origin."""
|
| 171 |
+
import time
|
| 172 |
+
from fastapi import HTTPException
|
| 173 |
+
|
| 174 |
+
origin = request.headers.get("origin")
|
| 175 |
+
if origin:
|
| 176 |
+
current_time = time.time()
|
| 177 |
+
minute_key = int(current_time // 60)
|
| 178 |
+
|
| 179 |
+
# Clean old entries
|
| 180 |
+
self._cleanup_old_entries(minute_key)
|
| 181 |
+
|
| 182 |
+
# Track requests
|
| 183 |
+
origin_key = f"{origin}:{minute_key}"
|
| 184 |
+
count = self.request_counts.get(origin_key, 0)
|
| 185 |
+
|
| 186 |
+
if count >= self.requests_per_minute:
|
| 187 |
+
raise HTTPException(
|
| 188 |
+
status_code=429,
|
| 189 |
+
detail="Too many requests from this origin",
|
| 190 |
+
headers={
|
| 191 |
+
"Retry-After": "60",
|
| 192 |
+
"X-RateLimit-Limit": str(self.requests_per_minute),
|
| 193 |
+
"X-RateLimit-Remaining": "0",
|
| 194 |
+
"X-RateLimit-Reset": str((minute_key + 1) * 60)
|
| 195 |
+
}
|
| 196 |
+
)
|
| 197 |
+
|
| 198 |
+
self.request_counts[origin_key] = count + 1
|
| 199 |
+
|
| 200 |
+
response = await call_next(request)
|
| 201 |
+
|
| 202 |
+
# Add rate limit headers
|
| 203 |
+
if origin:
|
| 204 |
+
response.headers["X-RateLimit-Limit"] = str(self.requests_per_minute)
|
| 205 |
+
remaining = max(0, self.requests_per_minute - self.request_counts.get(origin_key, 0))
|
| 206 |
+
response.headers["X-RateLimit-Remaining"] = str(remaining)
|
| 207 |
+
|
| 208 |
+
return response
|
| 209 |
+
|
| 210 |
+
def _cleanup_old_entries(self, current_minute: int):
|
| 211 |
+
"""Remove old entries from request counts."""
|
| 212 |
+
keys_to_remove = []
|
| 213 |
+
for key in self.request_counts.keys():
|
| 214 |
+
key_minute = int(key.split(":")[-1])
|
| 215 |
+
if current_minute - key_minute > 5: # Keep 5 minutes of history
|
| 216 |
+
keys_to_remove.append(key)
|
| 217 |
+
|
| 218 |
+
for key in keys_to_remove:
|
| 219 |
+
del self.request_counts[key]
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
def configure_cors(
|
| 223 |
+
app: FastAPI,
|
| 224 |
+
environment: str = "development"
|
| 225 |
+
) -> None:
|
| 226 |
+
"""Configure CORS based on environment."""
|
| 227 |
+
|
| 228 |
+
if environment == "production":
|
| 229 |
+
# Production CORS settings
|
| 230 |
+
origins = [o.strip() for o in os.getenv("CORS_ORIGINS", "").split(",") if o.strip()]
|
| 231 |
+
|
| 232 |
+
# Add production frontend URL
|
| 233 |
+
frontend_url = os.getenv("FRONTEND_URL")
|
| 234 |
+
if frontend_url and frontend_url not in origins:
|
| 235 |
+
origins.append(frontend_url)
|
| 236 |
+
|
| 237 |
+
# In production, be strict about origins
|
| 238 |
+
if origins:
|
| 239 |
+
app.add_middleware(
|
| 240 |
+
CustomCORSMiddleware,
|
| 241 |
+
allow_origins=origins,
|
| 242 |
+
allow_methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"],
|
| 243 |
+
allow_headers=["Authorization", "Content-Type", "X-Requested-With"],
|
| 244 |
+
expose_headers=["X-Total-Count", "X-Page-Count"],
|
| 245 |
+
strict_mode=True
|
| 246 |
+
)
|
| 247 |
+
|
| 248 |
+
# Add rate limiting
|
| 249 |
+
app.add_middleware(
|
| 250 |
+
RateLimitCORSMiddleware,
|
| 251 |
+
requests_per_minute=int(os.getenv("RATE_LIMIT_PER_MINUTE", "100"))
|
| 252 |
+
)
|
| 253 |
+
|
| 254 |
+
else:
|
| 255 |
+
# Development CORS settings - more permissive
|
| 256 |
+
app.add_middleware(
|
| 257 |
+
CustomCORSMiddleware,
|
| 258 |
+
allow_origins=[
|
| 259 |
+
"http://localhost:3000",
|
| 260 |
+
"http://localhost:3001",
|
| 261 |
+
"http://127.0.0.1:3000",
|
| 262 |
+
"http://127.0.0.1:3001",
|
| 263 |
+
"http://localhost:5173", # Vite dev server
|
| 264 |
+
"http://127.0.0.1:5173",
|
| 265 |
+
],
|
| 266 |
+
allow_credentials=True,
|
| 267 |
+
strict_mode=False
|
| 268 |
+
)
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
# CORS configuration for specific routes
|
| 272 |
+
class RouteSpecificCORSMiddleware(BaseHTTPMiddleware):
|
| 273 |
+
"""Apply different CORS settings to specific routes."""
|
| 274 |
+
|
| 275 |
+
def __init__(
|
| 276 |
+
self,
|
| 277 |
+
app: FastAPI,
|
| 278 |
+
path_prefix: str,
|
| 279 |
+
cors_config: dict
|
| 280 |
+
):
|
| 281 |
+
super().__init__(app)
|
| 282 |
+
self.path_prefix = path_prefix
|
| 283 |
+
self.cors_config = cors_config
|
| 284 |
+
|
| 285 |
+
async def dispatch(self, request: Request, call_next: RequestResponseEndpoint) -> Response:
|
| 286 |
+
"""Apply route-specific CORS configuration."""
|
| 287 |
+
if request.url.path.startswith(self.path_prefix):
|
| 288 |
+
# Apply custom CORS settings for this route
|
| 289 |
+
origin = request.headers.get("origin")
|
| 290 |
+
if origin and self.cors_config.get("allowed_origins"):
|
| 291 |
+
if origin in self.cors_config["allowed_origins"]:
|
| 292 |
+
response = await call_next(request)
|
| 293 |
+
response.headers["Access-Control-Allow-Origin"] = origin
|
| 294 |
+
response.headers["Access-Control-Allow-Credentials"] = "true"
|
| 295 |
+
|
| 296 |
+
methods = self.cors_config.get("allowed_methods", [])
|
| 297 |
+
response.headers["Access-Control-Allow-Methods"] = ", ".join(methods)
|
| 298 |
+
|
| 299 |
+
headers = self.cors_config.get("allowed_headers", [])
|
| 300 |
+
response.headers["Access-Control-Allow-Headers"] = ", ".join(headers)
|
| 301 |
+
|
| 302 |
+
return response
|
| 303 |
+
else:
|
| 304 |
+
# Use default CORS handling
|
| 305 |
+
return await call_next(request)
|
| 306 |
+
|
| 307 |
+
|
| 308 |
+
# Pre-configured CORS settings for different environments
|
| 309 |
+
CORS_CONFIGS = {
|
| 310 |
+
"development": {
|
| 311 |
+
"allowed_origins": ["http://localhost:3000", "http://localhost:5173"],
|
| 312 |
+
"allowed_methods": ["GET", "POST", "PUT", "DELETE", "OPTIONS"],
|
| 313 |
+
"allowed_headers": ["*"],
|
| 314 |
+
"allow_credentials": True,
|
| 315 |
+
"strict_mode": False
|
| 316 |
+
},
|
| 317 |
+
"staging": {
|
| 318 |
+
"allowed_origins": ["https://staging.example.com"],
|
| 319 |
+
"allowed_methods": ["GET", "POST", "PUT", "DELETE", "OPTIONS"],
|
| 320 |
+
"allowed_headers": ["Authorization", "Content-Type"],
|
| 321 |
+
"allow_credentials": True,
|
| 322 |
+
"strict_mode": True
|
| 323 |
+
},
|
| 324 |
+
"production": {
|
| 325 |
+
"allowed_origins": ["https://example.com"],
|
| 326 |
+
"allowed_methods": ["GET", "POST", "PUT", "DELETE", "OPTIONS"],
|
| 327 |
+
"allowed_headers": ["Authorization", "Content-Type"],
|
| 328 |
+
"allow_credentials": True,
|
| 329 |
+
"strict_mode": True
|
| 330 |
+
}
|
| 331 |
+
}
|
| 332 |
+
|
| 333 |
+
|
| 334 |
+
def setup_cors_with_config(
|
| 335 |
+
app: FastAPI,
|
| 336 |
+
config_name: str = "development"
|
| 337 |
+
) -> None:
|
| 338 |
+
"""Setup CORS using pre-configured settings."""
|
| 339 |
+
|
| 340 |
+
config = CORS_CONFIGS.get(config_name, CORS_CONFIGS["development"])
|
| 341 |
+
|
| 342 |
+
app.add_middleware(
|
| 343 |
+
CustomCORSMiddleware,
|
| 344 |
+
allow_origins=config["allowed_origins"], allow_methods=config["allowed_methods"], allow_headers=config["allowed_headers"], allow_credentials=config["allow_credentials"], strict_mode=config["strict_mode"]
|
| 345 |
+
)
|
| 346 |
+
|
| 347 |
+
# Log CORS configuration
|
| 348 |
+
from src.utils.logging import get_logger
|
| 349 |
+
|
| 350 |
+
logger = get_logger("cors")
|
| 351 |
+
logger.info(
|
| 352 |
+
"CORS configured",
|
| 353 |
+
environment=config_name,
|
| 354 |
+
allowed_origins=config["allowed_origins"],
|
| 355 |
+
allow_credentials=config["allow_credentials"]
|
| 356 |
+
)
|
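A minimal wiring sketch for the module above (assuming the import path src.middleware.cors and an ENVIRONMENT env var, per the conventions used in this file):

    import os
    from fastapi import FastAPI
    from src.middleware.cors import configure_cors

    app = FastAPI()
    # development -> permissive localhost origins; production -> strict origins + per-origin rate limiting
    configure_cors(app, environment=os.getenv("ENVIRONMENT", "development"))
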
src/middleware/rate_limit.py
ADDED
|
@@ -0,0 +1,385 @@
|
| 1 |
+
"""
|
| 2 |
+
Rate Limiting Middleware for Translation API.
|
| 3 |
+
|
| 4 |
+
This middleware implements per-IP and per-user rate limiting
|
| 5 |
+
to prevent abuse and manage OpenAI API quotas effectively.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import time
|
| 9 |
+
import asyncio
|
| 10 |
+
from typing import Any, Dict, Optional
|
| 11 |
+
from fastapi import Request, HTTPException, status
|
| 12 |
+
from fastapi.responses import JSONResponse
|
| 13 |
+
from starlette.middleware.base import BaseHTTPMiddleware
|
| 14 |
+
|
| 15 |
+
from src.utils.translation_logger import get_translation_logger
|
| 16 |
+
|
| 17 |
+
logger = get_translation_logger(__name__)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class RateLimitMiddleware(BaseHTTPMiddleware):
|
| 21 |
+
"""
|
| 22 |
+
Middleware for rate limiting API requests.
|
| 23 |
+
|
| 24 |
+
Implements:
|
| 25 |
+
- Per-IP rate limiting
|
| 26 |
+
- Per-user rate limiting (if authenticated)
|
| 27 |
+
- Sliding window algorithm
|
| 28 |
+
- Redis-based storage (if available)
|
| 29 |
+
- In-memory fallback
|
| 30 |
+
"""
|
| 31 |
+
|
| 32 |
+
def __init__(
|
| 33 |
+
self,
|
| 34 |
+
app,
|
| 35 |
+
*,
|
| 36 |
+
requests_per_minute: int = 60,
|
| 37 |
+
requests_per_hour: int = 1000,
|
| 38 |
+
redis_client=None
|
| 39 |
+
):
|
| 40 |
+
"""
|
| 41 |
+
Initialize rate limit middleware.
|
| 42 |
+
|
| 43 |
+
Args:
|
| 44 |
+
app: FastAPI application
|
| 45 |
+
requests_per_minute: Requests allowed per minute per client
|
| 46 |
+
requests_per_hour: Requests allowed per hour per client
|
| 47 |
+
redis_client: Optional Redis client for distributed rate limiting
|
| 48 |
+
"""
|
| 49 |
+
super().__init__(app)
|
| 50 |
+
self.requests_per_minute = requests_per_minute
|
| 51 |
+
self.requests_per_hour = requests_per_hour
|
| 52 |
+
self.redis_client = redis_client
|
| 53 |
+
|
| 54 |
+
# In-memory storage fallback
|
| 55 |
+
self.ip_rate_limits: Dict[str, Dict[str, Any]] = {}
|
| 56 |
+
self.user_rate_limits: Dict[str, Dict[str, Any]] = {}
|
| 57 |
+
|
| 58 |
+
logger.info(
|
| 59 |
+
"Rate limit middleware initialized",
|
| 60 |
+
requests_per_minute=requests_per_minute,
|
| 61 |
+
requests_per_hour=requests_per_hour,
|
| 62 |
+
redis_enabled=redis_client is not None
|
| 63 |
+
)
|
| 64 |
+
|
| 65 |
+
async def dispatch(self, request: Request, call_next):
|
| 66 |
+
"""
|
| 67 |
+
Process request with rate limiting.
|
| 68 |
+
|
| 69 |
+
Args:
|
| 70 |
+
request: Incoming request
|
| 71 |
+
call_next: Next middleware/endpoint
|
| 72 |
+
|
| 73 |
+
Returns:
|
| 74 |
+
Response or rate limit error
|
| 75 |
+
"""
|
| 76 |
+
# Skip rate limiting for health checks
|
| 77 |
+
if request.url.path in ["/health", "/health/enhanced", "/metrics/health"]:
|
| 78 |
+
return await call_next(request)
|
| 79 |
+
|
| 80 |
+
# Get client identifiers
|
| 81 |
+
client_ip = self._get_client_ip(request)
|
| 82 |
+
user_id = self._get_user_id(request)
|
| 83 |
+
|
| 84 |
+
# Check rate limits
|
| 85 |
+
await self._check_rate_limits(client_ip, user_id)
|
| 86 |
+
|
| 87 |
+
# Process request
|
| 88 |
+
response = await call_next(request)
|
| 89 |
+
|
| 90 |
+
# Add rate limit headers
|
| 91 |
+
await self._add_rate_limit_headers(response, client_ip, user_id)
|
| 92 |
+
|
| 93 |
+
return response
|
| 94 |
+
|
| 95 |
+
def _get_client_ip(self, request: Request) -> str:
|
| 96 |
+
"""Get client IP address from request."""
|
| 97 |
+
# Check for forwarded headers
|
| 98 |
+
forwarded_for = request.headers.get("X-Forwarded-For")
|
| 99 |
+
if forwarded_for:
|
| 100 |
+
return forwarded_for.split(",")[0].strip()
|
| 101 |
+
|
| 102 |
+
real_ip = request.headers.get("X-Real-IP")
|
| 103 |
+
if real_ip:
|
| 104 |
+
return real_ip
|
| 105 |
+
|
| 106 |
+
# Fall back to direct connection IP
|
| 107 |
+
return request.client.host if request.client else "unknown"
|
| 108 |
+
|
| 109 |
+
def _get_user_id(self, request: Request) -> Optional[str]:
|
| 110 |
+
"""Get user ID from request if authenticated."""
|
| 111 |
+
# This would extract from JWT token or session
|
| 112 |
+
# For now, return None to implement IP-based limiting only
|
| 113 |
+
return None
|
| 114 |
+
|
| 115 |
+
async def _check_rate_limits(self, client_ip: str, user_id: Optional[str]) -> None:
|
| 116 |
+
"""
|
| 117 |
+
Check if client has exceeded rate limits.
|
| 118 |
+
|
| 119 |
+
Args:
|
| 120 |
+
client_ip: Client IP address
|
| 121 |
+
user_id: Optional user ID
|
| 122 |
+
|
| 123 |
+
Raises:
|
| 124 |
+
HTTPException: If rate limit exceeded
|
| 125 |
+
"""
|
| 126 |
+
now = time.time()
|
| 127 |
+
|
| 128 |
+
# Check per-IP limits
|
| 129 |
+
ip_data = await self._get_rate_limit_data(f"ip:{client_ip}")
|
| 130 |
+
if self._is_rate_limited(ip_data, now):
|
| 131 |
+
retry_after = self._calculate_retry_after(ip_data, now)
|
| 132 |
+
logger.warning(
|
| 133 |
+
"IP rate limit exceeded",
|
| 134 |
+
client_ip=client_ip,
|
| 135 |
+
requests_in_minute=ip_data.get("minute_requests", 0),
|
| 136 |
+
retry_after=retry_after
|
| 137 |
+
)
|
| 138 |
+
raise HTTPException(
|
| 139 |
+
status_code=status.HTTP_429_TOO_MANY_REQUESTS,
|
| 140 |
+
detail={
|
| 141 |
+
"error": "RATE_LIMIT_EXCEEDED",
|
| 142 |
+
"message": f"IP rate limit exceeded. Please wait {retry_after:.1f} seconds.",
|
| 143 |
+
"retry_after": retry_after,
|
| 144 |
+
"limit_type": "ip"
|
| 145 |
+
}
|
| 146 |
+
)
|
| 147 |
+
|
| 148 |
+
# Check per-user limits if authenticated
|
| 149 |
+
if user_id:
|
| 150 |
+
user_data = await self._get_rate_limit_data(f"user:{user_id}")
|
| 151 |
+
if self._is_rate_limited(user_data, now):
|
| 152 |
+
retry_after = self._calculate_retry_after(user_data, now)
|
| 153 |
+
logger.warning(
|
| 154 |
+
"User rate limit exceeded",
|
| 155 |
+
user_id=user_id,
|
| 156 |
+
requests_in_minute=user_data.get("minute_requests", 0),
|
| 157 |
+
retry_after=retry_after
|
| 158 |
+
)
|
| 159 |
+
raise HTTPException(
|
| 160 |
+
status_code=status.HTTP_429_TOO_MANY_REQUESTS,
|
| 161 |
+
detail={
|
| 162 |
+
"error": "RATE_LIMIT_EXCEEDED",
|
| 163 |
+
"message": f"User rate limit exceeded. Please wait {retry_after:.1f} seconds.",
|
| 164 |
+
"retry_after": retry_after,
|
| 165 |
+
"limit_type": "user"
|
| 166 |
+
}
|
| 167 |
+
)
|
| 168 |
+
|
| 169 |
+
# Update rate limit data
|
| 170 |
+
await self._update_rate_limit_data(f"ip:{client_ip}", now)
|
| 171 |
+
if user_id:
|
| 172 |
+
await self._update_rate_limit_data(f"user:{user_id}", now)
|
| 173 |
+
|
| 174 |
+
async def _get_rate_limit_data(self, key: str) -> Dict[str, Any]:
|
| 175 |
+
"""Get rate limit data for a client."""
|
| 176 |
+
if self.redis_client:
|
| 177 |
+
try:
|
| 178 |
+
# Get data from Redis
|
| 179 |
+
data = await self.redis_client.hgetall(f"rate_limit:{key}")
|
| 180 |
+
if data:
|
| 181 |
+
return {
|
| 182 |
+
"minute_requests": int(data.get("minute_requests", 0)),
|
| 183 |
+
"minute_window": float(data.get("minute_window", 0)),
|
| 184 |
+
"hour_requests": int(data.get("hour_requests", 0)),
|
| 185 |
+
"hour_window": float(data.get("hour_window", 0)),
|
| 186 |
+
"last_request": float(data.get("last_request", 0))
|
| 187 |
+
}
|
| 188 |
+
except Exception as e:
|
| 189 |
+
logger.warning("Redis rate limit read failed", error=str(e))
|
| 190 |
+
|
| 191 |
+
# Fall back to in-memory
|
| 192 |
+
if key.startswith("ip:"):
|
| 193 |
+
storage = self.ip_rate_limits
|
| 194 |
+
key = key[3:] # Remove "ip:" prefix
|
| 195 |
+
else:
|
| 196 |
+
storage = self.user_rate_limits
|
| 197 |
+
key = key[5:] # Remove "user:" prefix
|
| 198 |
+
|
| 199 |
+
return storage.get(key, {
|
| 200 |
+
"minute_requests": 0,
|
| 201 |
+
"minute_window": 0,
|
| 202 |
+
"hour_requests": 0,
|
| 203 |
+
"hour_window": 0,
|
| 204 |
+
"last_request": 0
|
| 205 |
+
})
|
| 206 |
+
|
| 207 |
+
async def _update_rate_limit_data(self, key: str, now: float) -> None:
|
| 208 |
+
"""Update rate limit data for a client."""
|
| 209 |
+
# Get current data
|
| 210 |
+
data = await self._get_rate_limit_data(key)
|
| 211 |
+
|
| 212 |
+
# Update minute window
|
| 213 |
+
if now - data["minute_window"] > 60:
|
| 214 |
+
data["minute_requests"] = 1
|
| 215 |
+
data["minute_window"] = now
|
| 216 |
+
else:
|
| 217 |
+
data["minute_requests"] += 1
|
| 218 |
+
|
| 219 |
+
# Update hour window
|
| 220 |
+
if now - data["hour_window"] > 3600:
|
| 221 |
+
data["hour_requests"] = 1
|
| 222 |
+
data["hour_window"] = now
|
| 223 |
+
else:
|
| 224 |
+
data["hour_requests"] += 1
|
| 225 |
+
|
| 226 |
+
data["last_request"] = now
|
| 227 |
+
|
| 228 |
+
# Save updated data
|
| 229 |
+
if self.redis_client:
|
| 230 |
+
try:
|
| 231 |
+
# Save to Redis with TTL
|
| 232 |
+
await self.redis_client.hset(
|
| 233 |
+
f"rate_limit:{key}",
|
| 234 |
+
mapping={
|
| 235 |
+
"minute_requests": str(data["minute_requests"]),
|
| 236 |
+
"minute_window": str(data["minute_window"]),
|
| 237 |
+
"hour_requests": str(data["hour_requests"]),
|
| 238 |
+
"hour_window": str(data["hour_window"]),
|
| 239 |
+
"last_request": str(data["last_request"])
|
| 240 |
+
}
|
| 241 |
+
)
|
| 242 |
+
# Set TTL to 1 hour
|
| 243 |
+
await self.redis_client.expire(f"rate_limit:{key}", 3600)
|
| 244 |
+
except Exception as e:
|
| 245 |
+
logger.warning("Redis rate limit write failed", error=str(e))
|
| 246 |
+
|
| 247 |
+
# Fall back to in-memory
|
| 248 |
+
if key.startswith("ip:"):
|
| 249 |
+
storage = self.ip_rate_limits
|
| 250 |
+
key = key[3:] # Remove "ip:" prefix
|
| 251 |
+
else:
|
| 252 |
+
storage = self.user_rate_limits
|
| 253 |
+
key = key[5:] # Remove "user:" prefix
|
| 254 |
+
|
| 255 |
+
storage[key] = data
|
| 256 |
+
|
| 257 |
+
# Cleanup old entries (simple cleanup every 100 requests)
|
| 258 |
+
if data["minute_requests"] % 100 == 0:
|
| 259 |
+
await self._cleanup_old_entries(now)
|
| 260 |
+
|
| 261 |
+
async def _cleanup_old_entries(self, now: float) -> None:
|
| 262 |
+
"""Clean up old rate limit entries."""
|
| 263 |
+
cutoff = now - 3600 # 1 hour ago
|
| 264 |
+
|
| 265 |
+
# Cleanup IP entries
|
| 266 |
+
to_remove = []
|
| 267 |
+
for ip, data in self.ip_rate_limits.items():
|
| 268 |
+
if data["last_request"] < cutoff:
|
| 269 |
+
to_remove.append(ip)
|
| 270 |
+
for ip in to_remove:
|
| 271 |
+
del self.ip_rate_limits[ip]
|
| 272 |
+
|
| 273 |
+
# Cleanup user entries
|
| 274 |
+
to_remove = []
|
| 275 |
+
for user, data in self.user_rate_limits.items():
|
| 276 |
+
if data["last_request"] < cutoff:
|
| 277 |
+
to_remove.append(user)
|
| 278 |
+
for user in to_remove:
|
| 279 |
+
del self.user_rate_limits[user]
|
| 280 |
+
|
| 281 |
+
if to_remove:
|
| 282 |
+
logger.debug("Cleaned up old rate limit entries", count=len(to_remove))
|
| 283 |
+
|
| 284 |
+
def _is_rate_limited(self, data: Dict[str, Any], now: float) -> bool:
|
| 285 |
+
"""Check if client has exceeded rate limits."""
|
| 286 |
+
# Check minute limit
|
| 287 |
+
if now - data["minute_window"] < 60:
|
| 288 |
+
if data["minute_requests"] >= self.requests_per_minute:
|
| 289 |
+
return True
|
| 290 |
+
|
| 291 |
+
# Check hour limit
|
| 292 |
+
if now - data["hour_window"] < 3600:
|
| 293 |
+
if data["hour_requests"] >= self.requests_per_hour:
|
| 294 |
+
return True
|
| 295 |
+
|
| 296 |
+
return False
|
| 297 |
+
|
| 298 |
+
def _calculate_retry_after(self, data: Dict[str, Any], now: float) -> float:
|
| 299 |
+
"""Calculate retry-after time based on rate limit data."""
|
| 300 |
+
# Check minute limit
|
| 301 |
+
if now - data["minute_window"] < 60 and data["minute_requests"] >= self.requests_per_minute:
|
| 302 |
+
return 60 - (now - data["minute_window"])
|
| 303 |
+
|
| 304 |
+
# Check hour limit
|
| 305 |
+
if now - data["hour_window"] < 3600 and data["hour_requests"] >= self.requests_per_hour:
|
| 306 |
+
return 3600 - (now - data["hour_window"])
|
| 307 |
+
|
| 308 |
+
return 60.0 # Default retry after
|
| 309 |
+
|
| 310 |
+
async def _add_rate_limit_headers(
|
| 311 |
+
self,
|
| 312 |
+
response,
|
| 313 |
+
client_ip: str,
|
| 314 |
+
user_id: Optional[str]
|
| 315 |
+
) -> None:
|
| 316 |
+
"""Add rate limit headers to response."""
|
| 317 |
+
now = time.time()
|
| 318 |
+
|
| 319 |
+
# Get current limits
|
| 320 |
+
ip_data_result = await self._get_rate_limit_data(f"ip:{client_ip}")
|
| 322 |
+
|
| 323 |
+
# Add headers
|
| 324 |
+
response.headers["X-RateLimit-Limit-Minute"] = str(self.requests_per_minute)
|
| 325 |
+
response.headers["X-RateLimit-Limit-Hour"] = str(self.requests_per_hour)
|
| 326 |
+
response.headers["X-RateLimit-Remaining-Minute"] = str(
|
| 327 |
+
max(0, self.requests_per_minute - ip_data_result.get("minute_requests", 0))
|
| 328 |
+
)
|
| 329 |
+
response.headers["X-RateLimit-Remaining-Hour"] = str(
|
| 330 |
+
max(0, self.requests_per_hour - ip_data_result.get("hour_requests", 0))
|
| 331 |
+
)
|
| 332 |
+
|
| 333 |
+
# Add reset time
|
| 334 |
+
if ip_data_result.get("minute_window", 0):
|
| 335 |
+
reset_time = ip_data_result["minute_window"] + 60
|
| 336 |
+
response.headers["X-RateLimit-Reset"] = str(int(reset_time))
|
| 337 |
+
|
| 338 |
+
|
| 339 |
+
class TranslationRateLimitMiddleware(RateLimitMiddleware):
|
| 340 |
+
"""
|
| 341 |
+
Specialized rate limit middleware for translation endpoints.
|
| 342 |
+
|
| 343 |
+
Implements stricter limits for translation endpoints to manage
|
| 344 |
+
OpenAI API quotas effectively.
|
| 345 |
+
"""
|
| 346 |
+
|
| 347 |
+
def __init__(
|
| 348 |
+
self,
|
| 349 |
+
app,
|
| 350 |
+
*,
|
| 351 |
+
redis_client=None
|
| 352 |
+
):
|
| 353 |
+
"""
|
| 354 |
+
Initialize translation rate limit middleware.
|
| 355 |
+
|
| 356 |
+
Args:
|
| 357 |
+
app: FastAPI application
|
| 358 |
+
redis_client: Optional Redis client
|
| 359 |
+
"""
|
| 360 |
+
# Stricter limits for translation endpoints
|
| 361 |
+
super().__init__(
|
| 362 |
+
app,
|
| 363 |
+
requests_per_minute=10, # 10 translations per minute
|
| 364 |
+
requests_per_hour=500, # 500 translations per hour
|
| 365 |
+
redis_client=redis_client
|
| 366 |
+
)
|
| 367 |
+
|
| 368 |
+
logger.info(
|
| 369 |
+
"Translation rate limit middleware initialized",
|
| 370 |
+
requests_per_minute=10,
|
| 371 |
+
requests_per_hour=500
|
| 372 |
+
)
|
| 373 |
+
|
| 374 |
+
async def dispatch(self, request: Request, call_next):
|
| 375 |
+
"""
|
| 376 |
+
Process request with translation-specific rate limiting.
|
| 377 |
+
|
| 378 |
+
Only applies to translation endpoints.
|
| 379 |
+
"""
|
| 380 |
+
# Check if this is a translation endpoint
|
| 381 |
+
if not request.url.path.startswith("/translation/"):
|
| 382 |
+
return await call_next(request)
|
| 383 |
+
|
| 384 |
+
# Apply rate limiting
|
| 385 |
+
return await super().dispatch(request, call_next)
|
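A minimal sketch of stacking these limiters (the Redis client is omitted, so the in-memory fallback is used; the module path src.middleware.rate_limit is assumed):

    from fastapi import FastAPI
    from src.middleware.rate_limit import RateLimitMiddleware, TranslationRateLimitMiddleware

    app = FastAPI()
    app.add_middleware(RateLimitMiddleware, requests_per_minute=60, requests_per_hour=1000)
    app.add_middleware(TranslationRateLimitMiddleware)  # only applies to paths under /translation/
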
src/models/__init__.py
ADDED
|
@@ -0,0 +1,29 @@
|
| 1 |
+
"""
|
| 2 |
+
Import all models to ensure they are registered with SQLAlchemy.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
# Import all models to register them with SQLAlchemy
|
| 6 |
+
from .auth import (
|
| 7 |
+
User, Account, UserBackground, OnboardingResponse, Session,
|
| 8 |
+
PasswordResetToken, AnonymousSession, ChatSession, ChatMessage,
|
| 9 |
+
UserPreferences, MessageVersion, ChatFolder, ChatTag, MessageReaction
|
| 10 |
+
)
|
| 11 |
+
|
| 12 |
+
from .translation_openai import (
|
| 13 |
+
TranslationJob, TranslationChunk, TranslationError,
|
| 14 |
+
TranslationSession, TranslationCache, TranslationMetrics,
|
| 15 |
+
TranslationJobStatus, ChunkStatus, ErrorSeverity
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
# Export all models
|
| 19 |
+
__all__ = [
|
| 20 |
+
# Auth models
|
| 21 |
+
"User", "Account", "UserBackground", "OnboardingResponse", "Session",
|
| 22 |
+
"PasswordResetToken", "AnonymousSession", "ChatSession", "ChatMessage",
|
| 23 |
+
"UserPreferences", "MessageVersion", "ChatFolder", "ChatTag", "MessageReaction",
|
| 24 |
+
|
| 25 |
+
# Translation models
|
| 26 |
+
"TranslationJob", "TranslationChunk", "TranslationError",
|
| 27 |
+
"TranslationSession", "TranslationCache", "TranslationMetrics",
|
| 28 |
+
"TranslationJobStatus", "ChunkStatus", "ErrorSeverity"
|
| 29 |
+
]
|
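A sketch of why this module exists: importing the package once registers every model on Base.metadata, so table creation sees them all (engine URL taken from the default DATABASE_URL; sketch only):

    from sqlalchemy import create_engine
    import src.models  # noqa: F401 -- side-effect import registers all models
    from src.database.base import Base

    engine = create_engine("sqlite:///./database/auth.db")
    Base.metadata.create_all(engine)
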
src/models/auth.py
CHANGED
|
@@ -42,6 +42,9 @@ class User(Base):
|
|
| 42 |
chat_sessions = relationship("ChatSession", back_populates="user", cascade="all, delete-orphan")
|
| 43 |
folders = relationship("ChatFolder", back_populates="user", cascade="all, delete-orphan")
|
| 44 |
tags = relationship("ChatTag", back_populates="user", cascade="all, delete-orphan")
|
| 45 |
|
| 46 |
|
| 47 |
class Account(Base):
|
| 42 |
chat_sessions = relationship("ChatSession", back_populates="user", cascade="all, delete-orphan")
|
| 43 |
folders = relationship("ChatFolder", back_populates="user", cascade="all, delete-orphan")
|
| 44 |
tags = relationship("ChatTag", back_populates="user", cascade="all, delete-orphan")
|
| 45 |
+
translation_jobs = relationship("TranslationJob", back_populates="user", cascade="all, delete-orphan")
|
| 46 |
+
translation_sessions = relationship("TranslationSession", back_populates="user", cascade="all, delete-orphan")
|
| 47 |
+
translation_metrics = relationship("TranslationMetrics", back_populates="user", cascade="all, delete-orphan")
|
| 48 |
|
| 49 |
|
| 50 |
class Account(Base):
|
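With the three new back-references in place, translation rows can be reached from a User and are removed with it via the delete-orphan cascades; a hedged sketch (session is an open SQLAlchemy Session):

    from src.models import TranslationJob

    job = session.query(TranslationJob).filter_by(status="completed").first()
    if job and job.user:
        print(job.user.id, len(job.user.translation_jobs))
    # session.delete(user) would also delete that user's jobs, sessions, and metrics
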
src/models/base.py
ADDED
|
@@ -0,0 +1,26 @@
|
| 1 |
+
"""
|
| 2 |
+
Base model for reader features.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from datetime import datetime
|
| 6 |
+
import uuid
|
| 7 |
+
from sqlalchemy import Column, String, DateTime
|
| 8 |
+
from sqlalchemy.sql import func
|
| 9 |
+
from src.database.base import Base
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class BaseModel(Base):
|
| 13 |
+
"""Base model with common fields for reader features."""
|
| 14 |
+
|
| 15 |
+
__abstract__ = True
|
| 16 |
+
|
| 17 |
+
id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()))
|
| 18 |
+
created_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
|
| 19 |
+
updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now(), nullable=False)
|
| 20 |
+
|
| 21 |
+
def to_dict(self):
|
| 22 |
+
"""Convert model to dictionary."""
|
| 23 |
+
return {
|
| 24 |
+
column.name: getattr(self, column.name)
|
| 25 |
+
for column in self.__table__.columns
|
| 26 |
+
}
|
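A sketch of the intended use: any reader-feature table inherits id/created_at/updated_at and to_dict() by subclassing (Highlight is a hypothetical table, for illustration only):

    from sqlalchemy import Column, String
    from src.models.base import BaseModel

    class Highlight(BaseModel):  # hypothetical model, not part of this commit
        __tablename__ = "highlights"
        user_id = Column(String(36), nullable=False, index=True)
        text = Column(String(1000), nullable=False)

    # instances serialize via to_dict(): {"id": ..., "created_at": ..., "user_id": ..., "text": ...}
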
src/models/bookmark.py
ADDED
|
@@ -0,0 +1,53 @@
|
| 1 |
+
"""
|
| 2 |
+
Bookmark model for user-saved page references with optional metadata.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from sqlalchemy import Column, String, Boolean, DateTime, Text, ForeignKey
|
| 6 |
+
from sqlalchemy.orm import relationship
|
| 7 |
+
from sqlalchemy.sql import func
|
| 8 |
+
from src.models.base import BaseModel
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class Bookmark(BaseModel):
|
| 12 |
+
"""Represents user-saved page references with optional metadata."""
|
| 13 |
+
|
| 14 |
+
__tablename__ = "bookmarks"
|
| 15 |
+
|
| 16 |
+
user_id = Column(String(36), ForeignKey("users.id"), nullable=False, index=True)
|
| 17 |
+
chapter_id = Column(String(255), nullable=False, index=True)
|
| 18 |
+
section_id = Column(String(255), nullable=True)
|
| 19 |
+
page_url = Column(String(2048), nullable=False)
|
| 20 |
+
page_title = Column(String(255), nullable=False)
|
| 21 |
+
snippet = Column(Text, nullable=True)
|
| 22 |
+
note = Column(String(1000), nullable=True)
|
| 23 |
+
is_private = Column(Boolean, nullable=False, default=True)
|
| 24 |
+
|
| 25 |
+
# Relationships
|
| 26 |
+
user = relationship("User", back_populates="bookmarks")
|
| 27 |
+
tags = relationship("BookmarkTag", back_populates="bookmark", cascade="all, delete-orphan")
|
| 28 |
+
|
| 29 |
+
__table_args__ = (
|
| 30 |
+
{"extend_existing": True},
|
| 31 |
+
)
|
| 32 |
+
|
| 33 |
+
def __repr__(self):
|
| 34 |
+
return f"<Bookmark(id='{self.id}', user_id='{self.user_id}', title='{self.page_title}')>"
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class BookmarkTag(BaseModel):
|
| 38 |
+
"""Tags for organizing bookmarks."""
|
| 39 |
+
|
| 40 |
+
__tablename__ = "bookmark_tags"
|
| 41 |
+
|
| 42 |
+
bookmark_id = Column(String(36), ForeignKey("bookmarks.id"), nullable=False, index=True)
|
| 43 |
+
tag = Column(String(50), nullable=False, index=True)
|
| 44 |
+
|
| 45 |
+
# Relationships
|
| 46 |
+
bookmark = relationship("Bookmark", back_populates="tags")
|
| 47 |
+
|
| 48 |
+
__table_args__ = (
|
| 49 |
+
{"extend_existing": True},
|
| 50 |
+
)
|
| 51 |
+
|
| 52 |
+
def __repr__(self):
|
| 53 |
+
return f"<BookmarkTag(bookmark_id='{self.bookmark_id}', tag='{self.tag}')>"
|
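A usage sketch for this pair of models (illustrative values; session is an open SQLAlchemy Session):

    from src.models.bookmark import Bookmark, BookmarkTag

    bm = Bookmark(
        user_id=user.id,
        chapter_id="ch-03",
        page_url="https://example.com/book/ch-03",
        page_title="Chapter 3",
        tags=[BookmarkTag(tag="python"), BookmarkTag(tag="async")],
    )
    session.add(bm)
    session.commit()  # the delete-orphan cascade later removes tags together with the bookmark
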
src/models/chat.py
CHANGED
|
@@ -30,7 +30,7 @@ class ChatMessage(Base):
|
|
| 30 |
chat_session_id = Column(String(36), ForeignKey("chat_sessions.id"), nullable=False)
|
| 31 |
role = Column(SQLEnum(Role), nullable=False)
|
| 32 |
content = Column(Text, nullable=False)
|
| 33 |
-
|
| 34 |
created_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
|
| 35 |
|
| 36 |
# Relationships
|
| 30 |
chat_session_id = Column(String(36), ForeignKey("chat_sessions.id"), nullable=False)
|
| 31 |
role = Column(SQLEnum(Role), nullable=False)
|
| 32 |
content = Column(Text, nullable=False)
|
| 33 |
+
message_metadata = Column(JSON, nullable=True)
|
| 34 |
created_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
|
| 35 |
|
| 36 |
# Relationships
|
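For the new column to resolve, chat.py must also import JSON from sqlalchemy (the file's import block is not shown in this hunk); a hedged sketch of the assumed import and usage:

    from sqlalchemy import JSON  # assumed addition to chat.py's imports

    msg = ChatMessage(chat_session_id=chat_session.id, role=Role.USER, content="hi",
                      message_metadata={"client": "web", "model": "gpt-4"})
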
src/models/content_localization.py
ADDED
|
@@ -0,0 +1,50 @@
|
| 1 |
+
"""
|
| 2 |
+
Content localization model for tracking translation status of content pages.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from datetime import datetime
|
| 6 |
+
from enum import Enum
|
| 7 |
+
from sqlalchemy import Column, String, Integer, DateTime, Boolean, JSON, Index
|
| 8 |
+
from sqlalchemy import Enum as SQLEnum
|
| 9 |
+
from src.database.base import Base
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class ProcessingStatus(Enum):
|
| 13 |
+
"""Processing status for content localization."""
|
| 14 |
+
PENDING = "pending"
|
| 15 |
+
PROCESSING = "processing"
|
| 16 |
+
COMPLETED = "completed"
|
| 17 |
+
FAILED = "failed"
|
| 18 |
+
PARTIAL = "partial" # Some chunks failed
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class ContentLocalization(Base):
|
| 22 |
+
"""Tracks the translation status and metadata for content pages."""
|
| 23 |
+
|
| 24 |
+
__tablename__ = "content_localization"
|
| 25 |
+
|
| 26 |
+
id = Column(Integer, primary_key=True)
|
| 27 |
+
content_url = Column(String(500), nullable=False, index=True)
|
| 28 |
+
content_hash = Column(String(64), nullable=False, index=True)
|
| 29 |
+
|
| 30 |
+
# Localization status
|
| 31 |
+
is_translated = Column(Boolean, default=False)
|
| 32 |
+
last_translation_date = Column(DateTime)
|
| 33 |
+
translation_cache_key = Column(String(64))
|
| 34 |
+
|
| 35 |
+
# Content metadata
|
| 36 |
+
word_count = Column(Integer)
|
| 37 |
+
character_count = Column(Integer)
|
| 38 |
+
has_code_blocks = Column(Boolean, default=False)
|
| 39 |
+
detected_languages = Column(JSON) # Array of detected languages
|
| 40 |
+
|
| 41 |
+
# Processing metadata
|
| 42 |
+
chunk_count = Column(Integer, default=1)
|
| 43 |
+
processing_status = Column(SQLEnum(ProcessingStatus), default=ProcessingStatus.PENDING)
|
| 44 |
+
|
| 45 |
+
# Metadata
|
| 46 |
+
created_at = Column(DateTime, default=datetime.utcnow)
|
| 47 |
+
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
|
| 48 |
+
|
| 49 |
+
def __repr__(self):
|
| 50 |
+
return f"<ContentLocalization(url='{self.content_url}', status='{self.processing_status}', translated={self.is_translated})>"
|
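A query sketch against this table, e.g. to re-queue pages whose translation never completed (session is an open SQLAlchemy Session):

    from src.models.content_localization import ContentLocalization, ProcessingStatus

    stale = (session.query(ContentLocalization)
             .filter(ContentLocalization.processing_status != ProcessingStatus.COMPLETED)
             .all())
    for page in stale:
        print(page.content_url, page.processing_status)
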
src/models/personalization.py
ADDED
|
@@ -0,0 +1,64 @@
|
| 1 |
+
"""
|
| 2 |
+
PersonalizationProfile model for managing user preferences and learning styles.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from datetime import datetime
|
| 6 |
+
from enum import Enum
|
| 7 |
+
from sqlalchemy import Column, Integer, String, DateTime, Boolean, JSON
|
| 8 |
+
from sqlalchemy import Enum as SQLEnum
|
| 9 |
+
from src.database.base import Base
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class ReadingLevel(Enum):
|
| 13 |
+
"""Reading proficiency levels."""
|
| 14 |
+
BEGINNER = "beginner"
|
| 15 |
+
INTERMEDIATE = "intermediate"
|
| 16 |
+
ADVANCED = "advanced"
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class LearningStyle(Enum):
|
| 20 |
+
"""Learning style preferences."""
|
| 21 |
+
VISUAL = "visual" # More examples, diagrams
|
| 22 |
+
PRACTICAL = "practical" # Focus on code, implementation
|
| 23 |
+
THEORETICAL = "theoretical" # Focus on concepts, theory
|
| 24 |
+
BALANCED = "balanced"
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class TermHandling(Enum):
|
| 28 |
+
"""Technical term handling preferences."""
|
| 29 |
+
TRANSLATE = "translate" # Translate technical terms
|
| 30 |
+
TRANSLITERATE = "transliterate" # Keep in Urdu script
|
| 31 |
+
KEEP_ENGLISH = "keep_english" # Leave in English
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class PersonalizationProfile(Base):
|
| 35 |
+
"""Represents user preferences for personalized content delivery."""
|
| 36 |
+
|
| 37 |
+
__tablename__ = "personalization_profiles"
|
| 38 |
+
|
| 39 |
+
id = Column(Integer, primary_key=True)
|
| 40 |
+
user_id = Column(String(36), unique=True, nullable=False, index=True)
|
| 41 |
+
|
| 42 |
+
# Reading preferences
|
| 43 |
+
reading_level = Column(SQLEnum(ReadingLevel), default=ReadingLevel.INTERMEDIATE)
|
| 44 |
+
preferred_language = Column(String(10), default='en')
|
| 45 |
+
|
| 46 |
+
# Content preferences
|
| 47 |
+
focus_areas = Column(JSON) # Array of topics user cares about
|
| 48 |
+
learning_style = Column(SQLEnum(LearningStyle), default=LearningStyle.BALANCED)
|
| 49 |
+
|
| 50 |
+
# Translation preferences
|
| 51 |
+
enable_transliteration = Column(Boolean, default=True)
|
| 52 |
+
technical_term_handling = Column(SQLEnum(TermHandling), default=TermHandling.TRANSLITERATE)
|
| 53 |
+
|
| 54 |
+
# UI preferences
|
| 55 |
+
font_size = Column(Integer, default=16)
|
| 56 |
+
focus_mode_preferences = Column(JSON)
|
| 57 |
+
|
| 58 |
+
# Metadata
|
| 59 |
+
created_at = Column(DateTime, default=datetime.utcnow)
|
| 60 |
+
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
|
| 61 |
+
last_active = Column(DateTime, default=datetime.utcnow)
|
| 62 |
+
|
| 63 |
+
def __repr__(self):
|
| 64 |
+
return f"<PersonalizationProfile(user_id='{self.user_id}', reading_level='{self.reading_level}')>"
|
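A get-or-create sketch that leans on the column defaults declared above (uid is an existing user id; session is an open SQLAlchemy Session):

    from src.models.personalization import PersonalizationProfile

    profile = session.query(PersonalizationProfile).filter_by(user_id=uid).one_or_none()
    if profile is None:
        # defaults: intermediate level, balanced style, transliterated technical terms
        profile = PersonalizationProfile(user_id=uid)
        session.add(profile)
        session.commit()
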
src/models/reading_progress.py
ADDED
|
@@ -0,0 +1,33 @@
|
| 1 |
+
"""
|
| 2 |
+
Reading progress model for tracking user progress through chapters and sections.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from sqlalchemy import Column, String, Float, Boolean, Integer, DateTime, ForeignKey, UniqueConstraint
|
| 6 |
+
from sqlalchemy.orm import relationship
|
| 7 |
+
from sqlalchemy.sql import func
|
| 8 |
+
from src.models.base import BaseModel
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class ReadingProgress(BaseModel):
|
| 12 |
+
"""Stores user's reading progress through chapters and sections."""
|
| 13 |
+
|
| 14 |
+
__tablename__ = "reading_progress"
|
| 15 |
+
|
| 16 |
+
user_id = Column(String(36), ForeignKey("users.id"), nullable=False, index=True)
|
| 17 |
+
chapter_id = Column(String(255), nullable=False, index=True)
|
| 18 |
+
section_id = Column(String(255), nullable=False)
|
| 19 |
+
position = Column(Float, nullable=False, default=0.0) # 0-100 percentage
|
| 20 |
+
completed = Column(Boolean, nullable=False, default=False)
|
| 21 |
+
time_spent = Column(Integer, nullable=False, default=0) # Minutes
|
| 22 |
+
last_accessed = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
|
| 23 |
+
|
| 24 |
+
# Relationships
|
| 25 |
+
user = relationship("User", back_populates="reading_progress")
|
| 26 |
+
|
| 27 |
+
# Unique constraint to ensure one progress record per user per section
|
| 28 |
+
__table_args__ = (
|
| 29 |
+
{"extend_existing": True},
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
def __repr__(self):
|
| 33 |
+
return f"<ReadingProgress(user_id='{self.user_id}', chapter='{self.chapter_id}', position={self.position}%)>"
|
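An upsert sketch that keeps one row per (user, chapter, section), matching the unique constraint above (illustrative ids; session is an open SQLAlchemy Session):

    from src.models.reading_progress import ReadingProgress

    row = (session.query(ReadingProgress)
           .filter_by(user_id=uid, chapter_id="ch-03", section_id="3.2")
           .one_or_none())
    if row is None:
        row = ReadingProgress(user_id=uid, chapter_id="ch-03", section_id="3.2")
        session.add(row)
    row.position = 42.5                    # percent read
    row.completed = row.position >= 100.0
    session.commit()
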
src/models/search_index.py
ADDED
|
@@ -0,0 +1,30 @@
|
| 1 |
+
"""
|
| 2 |
+
Search index model for enabling fast content retrieval across languages.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from sqlalchemy import Column, String, Float, DateTime, Text
|
| 6 |
+
from sqlalchemy.sql import func
|
| 7 |
+
from src.models.base import BaseModel
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class SearchIndex(BaseModel):
|
| 11 |
+
"""Enables fast content retrieval across languages."""
|
| 12 |
+
|
| 13 |
+
__tablename__ = "search_index"
|
| 14 |
+
|
| 15 |
+
content_id = Column(String(255), nullable=False, index=True)
|
| 16 |
+
language = Column(String(10), nullable=False, index=True) # en, ur, ur-roman
|
| 17 |
+
content_type = Column(String(20), nullable=False, index=True) # chapter, section, bookmark
|
| 18 |
+
title = Column(String(255), nullable=False)
|
| 19 |
+
content = Column(Text, nullable=False)
|
| 20 |
+
chapter_id = Column(String(255), nullable=False, index=True)
|
| 21 |
+
section_id = Column(String(255), nullable=True)
|
| 22 |
+
rank = Column(Float, nullable=False, default=0.5) # 0-1 for result ranking
|
| 23 |
+
indexed_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
|
| 24 |
+
|
| 25 |
+
__table_args__ = (
|
| 26 |
+
{"extend_existing": True},
|
| 27 |
+
)
|
| 28 |
+
|
| 29 |
+
def __repr__(self):
|
| 30 |
+
return f"<SearchIndex(content_id='{self.content_id}', language='{self.language}', type='{self.content_type}')>"
|
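A lookup sketch: language-scoped substring search ordered by the stored rank (no external full-text engine is assumed; query_text is the user's query string):

    from src.models.search_index import SearchIndex

    hits = (session.query(SearchIndex)
            .filter(SearchIndex.language == "ur",
                    SearchIndex.content.contains(query_text))
            .order_by(SearchIndex.rank.desc())
            .limit(10)
            .all())
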
src/models/translation_openai.py
ADDED
|
@@ -0,0 +1,512 @@
|
| 1 |
+
"""
|
| 2 |
+
Comprehensive OpenAI Translation System Models.
|
| 3 |
+
|
| 4 |
+
Provides database models for:
|
| 5 |
+
- Translation jobs with progress tracking
|
| 6 |
+
- Chunk-based translation processing
|
| 7 |
+
- Enhanced caching with page URL + content hash
|
| 8 |
+
- Error logging and retry tracking
|
| 9 |
+
- User session management
|
| 10 |
+
- Translation quality metrics
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
from datetime import datetime, timedelta
|
| 14 |
+
from typing import Optional, Dict, Any, List
|
| 15 |
+
from enum import Enum
|
| 16 |
+
import uuid
|
| 17 |
+
|
| 18 |
+
from sqlalchemy import (
|
| 19 |
+
Column, Integer, String, Text, DateTime, SmallInteger, ForeignKey,
|
| 20 |
+
Index, Boolean, Numeric, JSON, BigInteger, CheckConstraint, UniqueConstraint
|
| 21 |
+
)
|
| 22 |
+
from sqlalchemy.orm import relationship
|
| 23 |
+
from sqlalchemy.dialects.postgresql import UUID
|
| 24 |
+
from sqlalchemy.sql import func
|
| 25 |
+
|
| 26 |
+
from src.database.base import Base
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class TranslationJobStatus(Enum):
|
| 30 |
+
"""Translation job status values."""
|
| 31 |
+
PENDING = "pending"
|
| 32 |
+
QUEUED = "queued"
|
| 33 |
+
PROCESSING = "processing"
|
| 34 |
+
CHUNK_PROCESSING = "chunk_processing"
|
| 35 |
+
COMPLETED = "completed"
|
| 36 |
+
FAILED = "failed"
|
| 37 |
+
CANCELLED = "cancelled"
|
| 38 |
+
RETRYING = "retrying"
|
| 39 |
+
TIMEOUT = "timeout"
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class ChunkStatus(Enum):
|
| 43 |
+
"""Translation chunk status values."""
|
| 44 |
+
PENDING = "pending"
|
| 45 |
+
PROCESSING = "processing"
|
| 46 |
+
COMPLETED = "completed"
|
| 47 |
+
FAILED = "failed"
|
| 48 |
+
RETRY = "retry"
|
| 49 |
+
SKIPPED = "skipped" # For code blocks
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class ErrorSeverity(Enum):
|
| 53 |
+
"""Error severity levels."""
|
| 54 |
+
LOW = "low"
|
| 55 |
+
MEDIUM = "medium"
|
| 56 |
+
HIGH = "high"
|
| 57 |
+
CRITICAL = "critical"
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
class TranslationJob(Base):
|
| 61 |
+
"""
|
| 62 |
+
Represents a translation job with comprehensive tracking.
|
| 63 |
+
|
| 64 |
+
Supports:
|
| 65 |
+
- Large text translation with chunking
|
| 66 |
+
- Progress tracking
|
| 67 |
+
- Error handling and retries
|
| 68 |
+
- Performance metrics
|
| 69 |
+
- Cost tracking
|
| 70 |
+
"""
|
| 71 |
+
|
| 72 |
+
__tablename__ = "translation_jobs"
|
| 73 |
+
|
| 74 |
+
# Primary key and identifiers
|
| 75 |
+
id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
|
| 76 |
+
job_id = Column(String(64), unique=True, nullable=False, index=True) # External job ID
|
| 77 |
+
user_id = Column(String(36), ForeignKey("users.id"), nullable=True, index=True)
|
| 78 |
+
session_id = Column(String(128), nullable=True, index=True)
|
| 79 |
+
|
| 80 |
+
# Content identifiers for caching
|
| 81 |
+
content_hash = Column(String(64), nullable=False, index=True)
|
| 82 |
+
page_url = Column(Text, nullable=True, index=True) # Source page URL for caching
|
| 83 |
+
|
| 84 |
+
# Translation parameters
|
| 85 |
+
source_language = Column(String(10), nullable=False, index=True)
|
| 86 |
+
target_language = Column(String(10), nullable=False, index=True)
|
| 87 |
+
|
| 88 |
+
# Content information
|
| 89 |
+
original_text = Column(Text, nullable=False)
|
| 90 |
+
translated_text = Column(Text, nullable=True)
|
| 91 |
+
|
| 92 |
+
# Processing options
|
| 93 |
+
preserve_code_blocks = Column(Boolean, default=True, nullable=False)
|
| 94 |
+
enable_transliteration = Column(Boolean, default=True, nullable=False)
|
| 95 |
+
chunk_size = Column(Integer, default=2000, nullable=False) # Characters per chunk
|
| 96 |
+
max_chunks = Column(Integer, default=100, nullable=False)
|
| 97 |
+
|
| 98 |
+
# OpenAI specific settings
|
| 99 |
+
model_name = Column(String(50), nullable=False, default="gpt-4-turbo-preview")
|
| 100 |
+
temperature = Column(Numeric(3, 2), default=0.3, nullable=False)
|
| 101 |
+
max_tokens = Column(Integer, default=2048, nullable=False)
|
| 102 |
+
|
| 103 |
+
# Status and progress
|
| 104 |
+
status = Column(String(20), default=TranslationJobStatus.PENDING.value, nullable=False, index=True)
|
| 105 |
+
progress_percentage = Column(Numeric(5, 2), default=0.0, nullable=False)
|
| 106 |
+
chunks_total = Column(Integer, default=0, nullable=False)
|
| 107 |
+
chunks_completed = Column(Integer, default=0, nullable=False)
|
| 108 |
+
chunks_failed = Column(Integer, default=0, nullable=False)
|
| 109 |
+
|
| 110 |
+
# Retry settings
|
| 111 |
+
retry_count = Column(Integer, default=0, nullable=False)
|
| 112 |
+
max_retries = Column(Integer, default=3, nullable=False)
|
| 113 |
+
|
| 114 |
+
# Performance metrics
|
| 115 |
+
started_at = Column(DateTime(timezone=True), nullable=True)
|
| 116 |
+
completed_at = Column(DateTime(timezone=True), nullable=True)
|
| 117 |
+
processing_time_ms = Column(BigInteger, default=0, nullable=False)
|
| 118 |
+
|
| 119 |
+
# Cost tracking
|
| 120 |
+
input_tokens = Column(BigInteger, default=0, nullable=False)
|
| 121 |
+
    output_tokens = Column(BigInteger, default=0, nullable=False)
    estimated_cost_usd = Column(Numeric(10, 6), default=0.000000, nullable=False)
    actual_cost_usd = Column(Numeric(10, 6), nullable=True)

    # Quality metrics
    quality_score = Column(Numeric(5, 2), nullable=True)  # 1-5 score
    confidence_score = Column(Numeric(5, 2), nullable=True)  # 1-5 score

    # Metadata
    created_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
    updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now(), nullable=False)
    last_activity_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
    user_agent = Column(Text, nullable=True)
    ip_address = Column(String(45), nullable=True)  # Supports IPv6

    # Relationships
    user = relationship("User", back_populates="translation_jobs")
    chunks = relationship("TranslationChunk", back_populates="job", cascade="all, delete-orphan")
    errors = relationship("TranslationError", back_populates="job", cascade="all, delete-orphan")
    metrics = relationship("TranslationMetrics", back_populates="job", cascade="all, delete-orphan")
    cache_entries = relationship("TranslationCache", back_populates="job", cascade="all, delete-orphan")

    # Constraints and indexes
    __table_args__ = (
        Index('idx_job_status_created', 'status', 'created_at'),
        Index('idx_user_status', 'user_id', 'status'),
        Index('idx_content_lookup', 'content_hash', 'source_language', 'target_language'),
        Index('idx_page_cache', 'page_url', 'content_hash'),
        Index('idx_activity', 'last_activity_at'),
        Index('idx_progress', 'status', 'progress_percentage'),
        CheckConstraint('progress_percentage >= 0 AND progress_percentage <= 100', name='check_progress_range'),
        CheckConstraint('temperature >= 0 AND temperature <= 2', name='check_temperature_range'),
        CheckConstraint('chunk_size > 0 AND chunk_size <= 10000', name='check_chunk_size'),
    )

    def __repr__(self):
        return f"<TranslationJob(id={self.id}, status={self.status}, progress={self.progress_percentage}%)>"


class TranslationChunk(Base):
    """
    Represents a chunk of text being translated.

    Supports:
    - Individual chunk status tracking
    - Retry mechanism
    - Performance metrics per chunk
    - Code block detection
    """

    __tablename__ = "translation_chunks"

    # Primary key and identifiers
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    job_id = Column(UUID(as_uuid=True), ForeignKey("translation_jobs.id"), nullable=False, index=True)
    chunk_index = Column(Integer, nullable=False)

    # Content
    original_text = Column(Text, nullable=False)
    translated_text = Column(Text, nullable=True)

    # Position in original text
    start_position = Column(Integer, nullable=False)
    end_position = Column(Integer, nullable=False)

    # Chunk properties
    is_code_block = Column(Boolean, default=False, nullable=False)
    code_language = Column(String(50), nullable=True)
    word_count = Column(Integer, nullable=False)

    # Status and processing
    status = Column(String(20), default=ChunkStatus.PENDING.value, nullable=False, index=True)
    retry_count = Column(Integer, default=0, nullable=False)

    # Processing metrics
    started_at = Column(DateTime(timezone=True), nullable=True)
    completed_at = Column(DateTime(timezone=True), nullable=True)
    processing_time_ms = Column(BigInteger, default=0, nullable=False)

    # Token usage
    input_tokens = Column(Integer, default=0, nullable=False)
    output_tokens = Column(Integer, default=0, nullable=False)

    # Quality indicators
    confidence_score = Column(Numeric(5, 2), nullable=True)
    requires_review = Column(Boolean, default=False, nullable=False)

    # Error information
    last_error = Column(Text, nullable=True)
    error_code = Column(String(50), nullable=True)

    # Metadata
    created_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
    updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now(), nullable=False)

    # Relationships
    job = relationship("TranslationJob", back_populates="chunks")

    # Constraints and indexes
    __table_args__ = (
        Index('idx_job_chunk', 'job_id', 'chunk_index', unique=True),
        Index('idx_chunk_status', 'status', 'created_at'),
        Index('idx_code_blocks', 'is_code_block', 'code_language'),
        CheckConstraint('chunk_index >= 0', name='check_chunk_index'),
        CheckConstraint('start_position >= 0 AND end_position >= start_position', name='check_positions'),
        CheckConstraint('word_count >= 0', name='check_word_count'),
    )

    def __repr__(self):
        return f"<TranslationChunk(job_id={self.job_id}, index={self.chunk_index}, status={self.status})>"
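To make the chunk bookkeeping concrete, here is a minimal sketch of how rows of this model could be created from pre-split text. It is illustrative only: session handling is omitted, and the `pieces` input and `build_chunks` helper are hypothetical, not part of this commit.

# Hypothetical helper (not in this commit): turn pre-split text into
# TranslationChunk rows for an existing TranslationJob. `pieces` is a
# list of (text, start, end) tuples from whatever chunker the service uses.
def build_chunks(job, pieces):
    chunks = []
    for index, (text, start, end) in enumerate(pieces):
        chunks.append(
            TranslationChunk(
                job_id=job.id,
                chunk_index=index,        # unique per job via idx_job_chunk
                original_text=text,
                start_position=start,
                end_position=end,
                word_count=len(text.split()),
                is_code_block=False,      # the real service detects code blocks
            )
        )
    return chunks                         # add to a session and commit to persist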

class TranslationError(Base):
    """
    Tracks errors during translation processing.

    Supports:
    - Detailed error logging
    - Error categorization
    - Retry tracking
    - Error analytics
    """

    __tablename__ = "translation_errors"

    # Primary key and identifiers
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    job_id = Column(UUID(as_uuid=True), ForeignKey("translation_jobs.id"), nullable=False, index=True)
    chunk_id = Column(UUID(as_uuid=True), ForeignKey("translation_chunks.id"), nullable=True, index=True)
    error_id = Column(String(64), unique=True, nullable=False, index=True)  # Unique error identifier

    # Error details
    error_type = Column(String(50), nullable=False, index=True)  # e.g., "api_error", "timeout", "rate_limit"
    error_code = Column(String(50), nullable=True)  # API error code
    error_message = Column(Text, nullable=False)
    error_details = Column(JSON, nullable=True)  # Additional error context

    # Severity and categorization
    severity = Column(String(20), default=ErrorSeverity.MEDIUM.value, nullable=False, index=True)
    category = Column(String(50), nullable=False, index=True)  # e.g., "network", "parsing", "validation"

    # Retry information
    is_retriable = Column(Boolean, default=True, nullable=False)
    retry_attempt = Column(Integer, default=1, nullable=False)
    max_retries = Column(Integer, default=3, nullable=False)
    next_retry_at = Column(DateTime(timezone=True), nullable=True, index=True)

    # Context information
    request_payload = Column(JSON, nullable=True)  # Sanitized request data
    response_payload = Column(JSON, nullable=True)  # Sanitized response data

    # Stack trace and debugging
    stack_trace = Column(Text, nullable=True)
    debug_info = Column(JSON, nullable=True)

    # Resolution
    resolved_at = Column(DateTime(timezone=True), nullable=True)
    resolution = Column(String(200), nullable=True)  # How the error was resolved

    # Metadata
    created_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
    updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now(), nullable=False)

    # Relationships
    job = relationship("TranslationJob", back_populates="errors")
    chunk = relationship("TranslationChunk")

    # Constraints and indexes
    __table_args__ = (
        Index('idx_error_type_created', 'error_type', 'created_at'),
        Index('idx_error_severity', 'severity', 'created_at'),
        Index('idx_job_errors', 'job_id', 'created_at'),
        Index('idx_retry_schedule', 'next_retry_at', 'is_retriable'),
    )

    def __repr__(self):
        return f"<TranslationError(id={self.id}, type={self.error_type}, severity={self.severity})>"


class TranslationSession(Base):
    """
    Manages user translation sessions.

    Supports:
    - Session-based tracking
    - Rate limiting
    - User preferences
    - Analytics
    """

    __tablename__ = "translation_sessions"

    # Primary key and identifiers
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    session_id = Column(String(128), unique=True, nullable=False, index=True)
    user_id = Column(String(36), ForeignKey("users.id"), nullable=True, index=True)

    # Session information
    started_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
    last_activity_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
    expires_at = Column(DateTime(timezone=True), nullable=False, index=True)
    is_active = Column(Boolean, default=True, nullable=False, index=True)

    # Usage tracking
    request_count = Column(Integer, default=0, nullable=False)
    character_count = Column(Integer, default=0, nullable=False)
    total_cost_usd = Column(Numeric(10, 6), default=0.000000, nullable=False)

    # Rate limiting
    requests_per_minute = Column(Integer, default=60, nullable=False)
    characters_per_hour = Column(Integer, default=100000, nullable=False)

    # Session context
    source_language = Column(String(10), nullable=True)
    target_language = Column(String(10), nullable=True)
    preferred_model = Column(String(50), nullable=True)

    # Client information
    user_agent = Column(Text, nullable=True)
    ip_address = Column(String(45), nullable=True, index=True)
    country_code = Column(String(2), nullable=True)

    # Session preferences (stored as JSON)
    preferences = Column(JSON, nullable=True)

    # Relationships
    user = relationship("User", back_populates="translation_sessions")

    # Constraints and indexes
    __table_args__ = (
        Index('idx_user_sessions', 'user_id', 'is_active'),
        Index('idx_session_expiry', 'expires_at', 'is_active'),
        Index('idx_ip_sessions', 'ip_address', 'started_at'),
        CheckConstraint('request_count >= 0', name='check_request_count'),
        CheckConstraint('character_count >= 0', name='check_character_count'),
        CheckConstraint('requests_per_minute > 0', name='check_rate_limit_requests'),
        CheckConstraint('characters_per_hour > 0', name='check_rate_limit_chars'),
    )

    def __repr__(self):
        return f"<TranslationSession(id={self.session_id}, active={self.is_active}, requests={self.request_count})>"


class TranslationCache(Base):
    """
    Enhanced translation caching with page URL support.

    Supports:
    - Page URL + content hash keys
    - Hierarchical caching
    - Cache invalidation
    - Cache analytics
    """

    __tablename__ = "translation_cache"

    # Primary key and identifiers
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    cache_key = Column(String(128), unique=True, nullable=False, index=True)
    job_id = Column(UUID(as_uuid=True), ForeignKey("translation_jobs.id"), nullable=True, index=True)

    # Cache keys
    content_hash = Column(String(64), nullable=False, index=True)
    page_url = Column(Text, nullable=True, index=True)
    url_hash = Column(String(64), nullable=True, index=True)  # Hash of URL for privacy

    # Translation data
    source_language = Column(String(10), nullable=False, index=True)
    target_language = Column(String(10), nullable=False, index=True)
    original_text = Column(Text, nullable=False)
    translated_text = Column(Text, nullable=False)

    # Cache metadata
    hit_count = Column(Integer, default=0, nullable=False)
    last_hit_at = Column(DateTime(timezone=True), nullable=True)
    created_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
    expires_at = Column(DateTime(timezone=True), nullable=False, index=True)

    # Quality and performance
    quality_score = Column(Numeric(5, 2), nullable=True)
    processing_time_ms = Column(BigInteger, nullable=False)
    model_version = Column(String(50), nullable=False)

    # Cache configuration
    ttl_hours = Column(Integer, default=168, nullable=False)  # 7 days default
    is_pinned = Column(Boolean, default=False, nullable=False)  # Never expires if pinned
    priority = Column(Integer, default=0, nullable=False)  # Higher-priority entries are less likely to be evicted

    # Validation
    is_validated = Column(Boolean, default=False, nullable=False)
    validated_by = Column(String(50), nullable=True)  # user_id or "system"

    # Relationships
    job = relationship("TranslationJob", back_populates="cache_entries")

    # Constraints and indexes
    __table_args__ = (
        Index('idx_cache_lookup', 'content_hash', 'source_language', 'target_language'),
        # Named distinctly from the 'idx_page_cache' index on translation_jobs:
        # index names must be unique within a database schema.
        Index('idx_cache_page_lookup', 'url_hash', 'content_hash'),
        Index('idx_cache_expires', 'expires_at', 'priority'),
        Index('idx_cache_popularity', 'hit_count', 'last_hit_at'),
        CheckConstraint('hit_count >= 0', name='check_hit_count'),
        CheckConstraint('processing_time_ms >= 0', name='check_processing_time'),
        CheckConstraint('ttl_hours > 0', name='check_ttl_hours'),
    )

    def __repr__(self):
        return f"<TranslationCache(key={self.cache_key[:20]}..., hits={self.hit_count})>"


class TranslationMetrics(Base):
    """
    Tracks detailed translation metrics and analytics.

    Supports:
    - Performance monitoring
    - Quality analytics
    - Cost tracking
    - Usage statistics
    """

    __tablename__ = "translation_metrics"

    # Primary key and identifiers
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    job_id = Column(UUID(as_uuid=True), ForeignKey("translation_jobs.id"), nullable=False, index=True)
    user_id = Column(String(36), ForeignKey("users.id"), nullable=True, index=True)

    # Time period
    metric_date = Column(DateTime(timezone=True), nullable=False, index=True)
    period_type = Column(String(20), nullable=False, index=True)  # hourly, daily, weekly, monthly

    # Usage metrics
    total_requests = Column(Integer, default=0, nullable=False)
    total_characters = Column(BigInteger, default=0, nullable=False)
    total_chunks = Column(Integer, default=0, nullable=False)
    successful_translations = Column(Integer, default=0, nullable=False)
    failed_translations = Column(Integer, default=0, nullable=False)

    # Performance metrics
    avg_processing_time_ms = Column(BigInteger, default=0, nullable=False)
    min_processing_time_ms = Column(BigInteger, default=0, nullable=False)
    max_processing_time_ms = Column(BigInteger, default=0, nullable=False)
    p95_processing_time_ms = Column(BigInteger, default=0, nullable=False)

    # Cost metrics
    total_input_tokens = Column(BigInteger, default=0, nullable=False)
    total_output_tokens = Column(BigInteger, default=0, nullable=False)
    total_cost_usd = Column(Numeric(12, 6), default=0.000000, nullable=False)
    avg_cost_per_char = Column(Numeric(10, 8), default=0.00000000, nullable=False)

    # Quality metrics
    avg_quality_score = Column(Numeric(5, 2), nullable=True)
    avg_confidence_score = Column(Numeric(5, 2), nullable=True)

    # Cache metrics
    cache_hits = Column(Integer, default=0, nullable=False)
    cache_misses = Column(Integer, default=0, nullable=False)
    cache_hit_rate = Column(Numeric(5, 2), default=0.0, nullable=False)

    # Error metrics
    error_count = Column(Integer, default=0, nullable=False)
    error_rate = Column(Numeric(5, 2), default=0.0, nullable=False)
    top_error_types = Column(JSON, nullable=True)  # Top 5 error types with counts

    # Additional dimensions
    source_language = Column(String(10), nullable=True, index=True)
    target_language = Column(String(10), nullable=True, index=True)
    model_name = Column(String(50), nullable=True, index=True)

    # Metadata
    created_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
    updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now(), nullable=False)

    # Relationships
    job = relationship("TranslationJob", back_populates="metrics")
    user = relationship("User", back_populates="translation_metrics")

    # Constraints and indexes
    __table_args__ = (
        Index('idx_metrics_date_period', 'metric_date', 'period_type'),
        Index('idx_user_metrics', 'user_id', 'metric_date'),
        Index('idx_job_metrics', 'job_id', 'metric_date'),
        Index('idx_lang_metrics', 'source_language', 'target_language', 'metric_date'),
        CheckConstraint('total_requests >= 0', name='check_total_requests'),
        CheckConstraint('total_characters >= 0', name='check_total_characters'),
        CheckConstraint('cache_hit_rate >= 0 AND cache_hit_rate <= 100', name='check_cache_hit_rate'),
        CheckConstraint('error_rate >= 0 AND error_rate <= 100', name='check_error_rate'),
    )

    def __repr__(self):
        return f"<TranslationMetrics(date={self.metric_date}, requests={self.total_requests})>"
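As a usage illustration for the cache table above, a lookup by content hash and language pair might look like the following sketch. It assumes a synchronous SQLAlchemy `session` and is not code from this commit; only the columns and index defined above are relied upon.

from datetime import datetime, timezone

# Hypothetical lookup (not in this commit): fetch a live cache entry and
# record the hit. The filter matches the idx_cache_lookup composite index.
def lookup_cached_translation(session, content_hash, source_lang, target_lang):
    now = datetime.now(timezone.utc)
    entry = (
        session.query(TranslationCache)
        .filter(
            TranslationCache.content_hash == content_hash,
            TranslationCache.source_language == source_lang,
            TranslationCache.target_language == target_lang,
            TranslationCache.expires_at > now,
        )
        .first()
    )
    if entry is None:
        return None
    entry.hit_count += 1
    entry.last_hit_at = now
    session.commit()
    return entry.translated_text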
src/models/user_preferences.py
ADDED
@@ -0,0 +1,54 @@
"""
User preferences model for storing personalization settings.
"""

from sqlalchemy import Column, String, Boolean, Integer, Float, ForeignKey, Text
from sqlalchemy.orm import relationship
from src.models.base import BaseModel


class UserPreference(BaseModel):
    """Stores user personalization settings."""

    __tablename__ = "user_preferences"

    user_id = Column(String(36), ForeignKey("users.id"), nullable=False, unique=True, index=True)
    language = Column(String(10), nullable=False, default='en')  # en, ur, ur-roman
    reading_pace = Column(String(20), nullable=False, default='medium')  # slow, medium, fast
    preferred_depth = Column(String(20), nullable=False, default='detailed')  # overview, detailed, comprehensive
    show_code_examples = Column(Boolean, nullable=False, default=True)
    adaptive_difficulty = Column(Boolean, nullable=False, default=False)
    theme = Column(String(20), nullable=False, default='auto')  # light, dark, auto
    font_size = Column(Integer, nullable=False, default=16)
    line_height = Column(Float, nullable=False, default=1.5)

    # Relationships
    user = relationship("User", back_populates="preferences")
    custom_notes = relationship("UserCustomNote", back_populates="preference", cascade="all, delete-orphan")

    __table_args__ = (
        {"extend_existing": True},
    )

    def __repr__(self):
        return f"<UserPreference(user_id='{self.user_id}', language='{self.language}', theme='{self.theme}')>"


class UserCustomNote(BaseModel):
    """Custom notes as key-value pairs for user preferences."""

    __tablename__ = "user_custom_notes"

    user_preference_id = Column(String(36), ForeignKey("user_preferences.id"), nullable=False)
    key = Column(String(100), nullable=False)
    value = Column(Text, nullable=False)

    # Relationships
    preference = relationship("UserPreference", back_populates="custom_notes")

    __table_args__ = (
        {"extend_existing": True},
    )

    def __repr__(self):
        return f"<UserCustomNote(key='{self.key}', preference_id='{self.user_preference_id}')>"
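A short sketch of how the one-to-many custom-notes design is meant to be used; the `session` and the note keys here are assumed for illustration, not part of this file.

# Hypothetical usage (not in this commit): the delete-orphan cascade on
# custom_notes means deleting the preference row also deletes its notes.
prefs = UserPreference(user_id="user-123", language="ur", theme="dark")
prefs.custom_notes.append(UserCustomNote(key="glossary", value="prefer formal Urdu"))
prefs.custom_notes.append(UserCustomNote(key="reading_goal", value="one chapter per day"))
session.add(prefs)   # `session`: an open SQLAlchemy session (assumed)
session.commit()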
src/services/cache_examples.py
ADDED
@@ -0,0 +1,231 @@
"""
Cache service usage examples.

This file demonstrates how to use the cache service for various scenarios
including translations, user preferences, and API response caching.
"""

import asyncio
from typing import Dict, Any
from src.services.cache_service import (
    get_cache_service,
    CacheType,
    cache_translation,
    get_cached_translation,
    cache_user_preference,
    get_cached_user_preference,
    cache_api_response,
    get_cached_api_response
)


async def example_basic_usage():
    """Basic cache service usage example."""
    # Get cache service instance
    cache = await get_cache_service()

    # Generate a cache key
    cache_key = cache._generate_cache_key(
        prefix="example",
        identifier="user_123",
        version="v1",
        param1="value1",
        param2="value2"
    )

    # Set a value
    await cache.set(
        key=cache_key,
        value={"message": "Hello, cached world!"},
        cache_type=CacheType.API_RESPONSE,
        ttl=60  # 1 minute
    )

    # Get the value
    cached_value = await cache.get(cache_key, CacheType.API_RESPONSE)
    print(f"Cached value: {cached_value}")

    # Delete the value
    await cache.delete(cache_key)


async def example_translation_caching():
    """Example of caching translations."""
    # Cache a translation
    translation_data = {
        "en": "Hello, World!",
        "ur": "ہیلو، دنیا!",
        "ur-roman": "Hello, Duniya!"
    }

    success = await cache_translation(
        key="greeting.hello_world",
        translation=translation_data,
        language="all"
    )

    if success:
        print("Translation cached successfully")

    # Retrieve cached translation
    cached_translation = await get_cached_translation(
        key="greeting.hello_world",
        language="all"
    )

    if cached_translation:
        print(f"Cached translation: {cached_translation}")


async def example_user_preference_caching():
    """Example of caching user preferences."""
    # Cache user preferences
    user_prefs = {
        "language": "en",
        "theme": "dark",
        "font_size": 16,
        "reading_pace": "medium",
        "show_code_examples": True
    }

    success = await cache_user_preference(
        user_id="user_456",
        preferences=user_prefs
    )

    if success:
        print("User preferences cached successfully")

    # Retrieve cached preferences
    cached_prefs = await get_cached_user_preference("user_456")

    if cached_prefs:
        print(f"Cached preferences: {cached_prefs}")


async def example_api_response_caching():
    """Example of caching API responses."""
    # Cache API response
    api_response = {
        "status": "success",
        "data": [
            {"id": 1, "title": "Chapter 1"},
            {"id": 2, "title": "Chapter 2"}
        ],
        "pagination": {
            "page": 1,
            "total_pages": 10
        }
    }

    success = await cache_api_response(
        endpoint="/api/v1/chapters",
        params={"page": 1, "limit": 10},
        response=api_response,
        ttl=300  # 5 minutes
    )

    if success:
        print("API response cached successfully")

    # Retrieve cached API response
    cached_response = await get_cached_api_response(
        endpoint="/api/v1/chapters",
        params={"page": 1, "limit": 10}
    )

    if cached_response:
        print(f"Cached API response: {cached_response}")


async def example_cache_statistics():
    """Example of retrieving cache statistics."""
    cache = await get_cache_service()

    # Get cache statistics
    stats = cache.get_stats()

    print("Cache Statistics:")
    print(f"  Total requests: {stats['total_requests']}")
    print(f"  Cache hits: {stats['hits']}")
    print(f"  Cache misses: {stats['misses']}")
    print(f"  Hit rate: {stats['hit_rate']}%")
    print(f"  Redis hits: {stats['redis_hits']}")
    print(f"  Local hits: {stats['local_hits']}")
    print(f"  Errors: {stats['errors']}")
    print(f"  Redis enabled: {stats['redis_enabled']}")
    print(f"  Memory cache size: {stats['memory_cache_size']}")


async def example_cache_cleanup():
    """Example of cleaning up expired cache entries."""
    cache = await get_cache_service()

    # Clean up expired entries
    cleaned_count = await cache.cleanup_expired()
    print(f"Cleaned up {cleaned_count} expired cache entries")

    # Clear all cache entries for a specific type
    cleared_count = await cache.clear(cache_type=CacheType.TRANSLATION)
    print(f"Cleared {cleared_count} translation cache entries")

    # Clear cache entries matching a pattern
    cleared_count = await cache.clear(pattern="api:v1:user_*")
    print(f"Cleared {cleared_count} entries matching pattern")


async def example_concurrent_access():
    """Example demonstrating thread-safe concurrent access."""
    async def worker(worker_id: int):
        cache = await get_cache_service()

        # Each worker uses its own key space
        key = f"worker_{worker_id}:data"

        for i in range(10):
            # Set value
            await cache.set(
                key=key,
                value={"worker": worker_id, "iteration": i},
                cache_type=CacheType.API_RESPONSE,
                ttl=60
            )

            # Get value
            value = await cache.get(key, CacheType.API_RESPONSE)
            print(f"Worker {worker_id}, iteration {i}: {value}")

            # Small delay
            await asyncio.sleep(0.1)

    # Run multiple workers concurrently
    tasks = [worker(i) for i in range(5)]
    await asyncio.gather(*tasks)


async def main():
    """Run all examples."""
    print("=== Basic Usage ===")
    await example_basic_usage()

    print("\n=== Translation Caching ===")
    await example_translation_caching()

    print("\n=== User Preference Caching ===")
    await example_user_preference_caching()

    print("\n=== API Response Caching ===")
    await example_api_response_caching()

    print("\n=== Cache Statistics ===")
    await example_cache_statistics()

    print("\n=== Cache Cleanup ===")
    await example_cache_cleanup()

    print("\n=== Concurrent Access ===")
    await example_concurrent_access()


if __name__ == "__main__":
    asyncio.run(main())
src/services/cache_service.py
ADDED
@@ -0,0 +1,690 @@
"""
Cache service for server-side caching with localStorage fallback.

Provides Redis caching with localStorage fallback, supporting different TTLs
for various cache types including translations, user preferences, and API responses.
"""

import base64
import gzip
import json
import pickle
import asyncio
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional, Union
from enum import Enum
import hashlib
import os
from pathlib import Path

try:
    import redis.asyncio as redis
    REDIS_AVAILABLE = True
except ImportError:
    REDIS_AVAILABLE = False
    redis = None

from src.utils.errors import CacheError, ValidationError
from src.utils.logging import get_logger

logger = get_logger(__name__)


class CacheType(Enum):
    """Cache types with different TTLs."""
    TRANSLATION = "translation"
    USER_PREFERENCE = "user_preference"
    API_RESPONSE = "api_response"
    PERSONALIZATION = "personalization"
    PROGRESS = "progress"
    SEARCH_RESULT = "search_result"
    BOOKMARK = "bookmark"


class CacheService:
    """
    Cache service with Redis primary and localStorage fallback.

    Features:
    - Redis as primary cache (if available)
    - localStorage as fallback
    - TTL support per cache type
    - Compression for large objects
    - Statistics tracking
    - Error handling and logging
    """

    # TTL configurations (in seconds)
    TTL_CONFIG = {
        CacheType.TRANSLATION: 7 * 24 * 60 * 60,       # 7 days
        CacheType.USER_PREFERENCE: 30 * 24 * 60 * 60,  # 30 days
        CacheType.API_RESPONSE: 5 * 60,                # 5 minutes
        CacheType.PERSONALIZATION: 1 * 60 * 60,        # 1 hour
        CacheType.PROGRESS: 24 * 60 * 60,              # 24 hours
        CacheType.SEARCH_RESULT: 10 * 60,              # 10 minutes
        CacheType.BOOKMARK: 30 * 24 * 60 * 60,         # 30 days
    }

    # Statistics (class-level, shared across all instances of the service)
    _stats = {
        "hits": 0,
        "misses": 0,
        "errors": 0,
        "redis_hits": 0,
        "local_hits": 0,
    }

    def __init__(
        self,
        redis_url: Optional[str] = None,
        localStorage_path: Optional[str] = None,
        enable_redis: bool = True,
        enable_compression: bool = True,
        compression_threshold: int = 1024
    ):
        """
        Initialize cache service.

        Args:
            redis_url: Redis connection URL
            localStorage_path: Path to localStorage directory
            enable_redis: Whether to use Redis if available
            enable_compression: Whether to compress large objects
            compression_threshold: Size threshold for compression (bytes)
        """
        self.redis_url = redis_url or os.getenv("REDIS_URL", "redis://localhost:6379/0")
        self.localStorage_path = Path(localStorage_path or os.getenv("CACHE_LOCAL_PATH", "./cache_data"))
        self.enable_redis = enable_redis and REDIS_AVAILABLE
        self.enable_compression = enable_compression
        self.compression_threshold = compression_threshold

        self._redis_client = None
        self._local_cache = {}

        # Initialize localStorage
        self.localStorage_path.mkdir(parents=True, exist_ok=True)

        logger.info(
            "Cache service initialized",
            redis_enabled=self.enable_redis,
            localStorage_path=str(self.localStorage_path),
            compression_enabled=self.enable_compression
        )

    async def _get_redis_client(self):
        """Get or create Redis client."""
        if not self.enable_redis:
            return None

        if self._redis_client is None:
            try:
                self._redis_client = redis.from_url(
                    self.redis_url,
                    encoding="utf-8",
                    decode_responses=False,
                    socket_connect_timeout=5,
                    socket_timeout=5,
                    retry_on_timeout=True,
                    health_check_interval=30
                )
                # Test connection
                await self._redis_client.ping()
                logger.info("Redis connection established")
            except Exception as e:
                logger.warning("Failed to connect to Redis", error=str(e))
                self.enable_redis = False
                self._redis_client = None

        return self._redis_client

    def _generate_cache_key(
        self,
        prefix: str,
        identifier: str,
        version: str = "v1",
        **kwargs
    ) -> str:
        """
        Generate a consistent cache key.

        Args:
            prefix: Cache type or prefix
            identifier: Unique identifier for the cache entry
            version: Version of the cache schema
            **kwargs: Additional parameters to include in key

        Returns:
            Generated cache key
        """
        # Create a stable representation of parameters
        params = sorted(kwargs.items())
        param_str = json.dumps(params, sort_keys=True, separators=(',', ':'))

        # Create hash of identifier and params
        hash_input = f"{identifier}:{param_str}"
        hash_value = hashlib.sha256(hash_input.encode()).hexdigest()[:16]

        return f"{prefix}:{version}:{identifier}:{hash_value}"
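Because the keyword parameters are sorted before hashing, argument order never changes the key; the resulting shape is always <prefix>:<version>:<identifier>:<16-hex-digest>. The digest below is elided rather than shown, since the exact value depends on the JSON serialization:

# These two calls produce the identical key (kwargs are sorted first):
#   cache._generate_cache_key("api", "/v1/chapters", page=1, limit=10)
#   cache._generate_cache_key("api", "/v1/chapters", limit=10, page=1)
# Both return "api:v1:/v1/chapters:<16-hex-digest>" with the same digest.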

    async def get(
        self,
        key: str,
        cache_type: CacheType = CacheType.API_RESPONSE,
        use_compression: Optional[bool] = None
    ) -> Optional[Any]:
        """
        Get value from cache.

        Args:
            key: Cache key
            cache_type: Type of cache entry
            use_compression: Override compression setting

        Returns:
            Cached value or None if not found
        """
        try:
            # Try Redis first
            if self.enable_redis:
                redis_client = await self._get_redis_client()
                if redis_client:
                    value = await self._get_from_redis(
                        redis_client, key, cache_type, use_compression
                    )
                    if value is not None:
                        self._stats["hits"] += 1
                        self._stats["redis_hits"] += 1
                        return value

            # Fall back to localStorage
            value = await self._get_from_local(key, cache_type, use_compression)
            if value is not None:
                self._stats["hits"] += 1
                self._stats["local_hits"] += 1

                # If found locally but not in Redis, backfill to Redis
                if self.enable_redis:
                    redis_client = await self._get_redis_client()
                    if redis_client:
                        ttl = self.TTL_CONFIG[cache_type]
                        await self._set_to_redis(
                            redis_client, key, value, ttl, use_compression
                        )

                return value

            # Cache miss
            self._stats["misses"] += 1
            return None

        except Exception as e:
            self._stats["errors"] += 1
            logger.error("Cache get failed", key=key, error=str(e))
            return None

    async def set(
        self,
        key: str,
        value: Any,
        cache_type: CacheType = CacheType.API_RESPONSE,
        ttl: Optional[int] = None,
        use_compression: Optional[bool] = None
    ) -> bool:
        """
        Set value in cache.

        Args:
            key: Cache key
            value: Value to cache
            cache_type: Type of cache entry
            ttl: Time to live in seconds (overrides type TTL)
            use_compression: Override compression setting

        Returns:
            True if successful, False otherwise
        """
        try:
            success = True
            ttl = ttl or self.TTL_CONFIG[cache_type]

            # Set in Redis
            if self.enable_redis:
                redis_client = await self._get_redis_client()
                if redis_client:
                    success = await self._set_to_redis(
                        redis_client, key, value, ttl, use_compression
                    ) and success

            # Set in localStorage (always set as fallback)
            local_success = await self._set_to_local(
                key, value, cache_type, ttl, use_compression
            )
            success = local_success and success

            return success

        except Exception as e:
            self._stats["errors"] += 1
            logger.error("Cache set failed", key=key, error=str(e))
            return False

    async def delete(self, key: str) -> bool:
        """
        Delete key from cache.

        Args:
            key: Cache key to delete

        Returns:
            True if successful, False otherwise
        """
        try:
            # Delete from Redis
            if self.enable_redis:
                redis_client = await self._get_redis_client()
                if redis_client:
                    await redis_client.delete(key)

            # Delete from localStorage
            local_file = self.localStorage_path / f"{key}.cache"
            if local_file.exists():
                local_file.unlink()

            # Remove from memory cache
            if key in self._local_cache:
                del self._local_cache[key]

            return True

        except Exception as e:
            self._stats["errors"] += 1
            logger.error("Cache delete failed", key=key, error=str(e))
            return False

    async def clear(
        self,
        pattern: Optional[str] = None,
        cache_type: Optional[CacheType] = None
    ) -> int:
        """
        Clear cache entries.

        Args:
            pattern: Pattern to match keys (supports wildcards)
            cache_type: Clear only this cache type

        Returns:
            Number of entries cleared
        """
        try:
            cleared_count = 0

            # Build pattern if cache_type specified
            if cache_type and not pattern:
                pattern = f"{cache_type.value}:*"

            # Clear from Redis
            if self.enable_redis:
                redis_client = await self._get_redis_client()
                if redis_client:
                    if pattern:
                        keys = await redis_client.keys(pattern)
                        if keys:
                            await redis_client.delete(*keys)
                            cleared_count += len(keys)
                    else:
                        await redis_client.flushdb()
                        cleared_count = -1  # Indicate full clear

            # Clear from localStorage; cache files are named "<key>.cache",
            # so the key pattern can be used directly as a glob pattern
            if pattern:
                for cache_file in self.localStorage_path.glob(f"{pattern}.cache"):
                    cache_file.unlink()
                    cleared_count += 1
            else:
                # Clear all files
                for cache_file in self.localStorage_path.glob("*.cache"):
                    cache_file.unlink()
                    cleared_count += 1

            # Clear memory cache
            self._local_cache.clear()

            logger.info("Cache cleared", pattern=pattern, count=cleared_count)
            return cleared_count

        except Exception as e:
            self._stats["errors"] += 1
            logger.error("Cache clear failed", pattern=pattern, error=str(e))
            return 0

    async def _get_from_redis(
        self,
        redis_client,
        key: str,
        cache_type: CacheType,
        use_compression: Optional[bool]
    ) -> Optional[Any]:
        """Get value from Redis."""
        try:
            data = await redis_client.get(key)
            if data is None:
                return None

            # Compressed payloads are self-describing via the "COMP:" prefix,
            # so decompress whenever the prefix is present
            if data.startswith(b"COMP:"):
                data = gzip.decompress(data[5:])

            # Deserialize
            return pickle.loads(data)

        except Exception as e:
            logger.warning("Redis get failed", key=key, error=str(e))
            return None

    async def _set_to_redis(
        self,
        redis_client,
        key: str,
        value: Any,
        ttl: int,
        use_compression: Optional[bool]
    ) -> bool:
        """Set value in Redis."""
        try:
            # Serialize
            data = pickle.dumps(value)

            # Compress if enabled and the payload is large enough
            if (use_compression or (use_compression is None and self.enable_compression)) \
                    and len(data) > self.compression_threshold:
                data = b"COMP:" + gzip.compress(data)

            await redis_client.setex(key, ttl, data)
            return True

        except Exception as e:
            logger.warning("Redis set failed", key=key, error=str(e))
            return False

    async def _get_from_local(
        self,
        key: str,
        cache_type: CacheType,
        use_compression: Optional[bool]
    ) -> Optional[Any]:
        """Get value from localStorage."""
        try:
            # Check memory cache first
            cache_entry = self._local_cache.get(key)
            if cache_entry:
                # Check if expired
                if cache_entry["expires"] > datetime.utcnow():
                    return cache_entry["value"]
                else:
                    # Remove expired entry
                    del self._local_cache[key]

            # Check file cache
            cache_file = self.localStorage_path / f"{key}.cache"
            if not cache_file.exists():
                return None

            # Read and validate file
            cache_entry = json.loads(cache_file.read_bytes().decode())

            # Check if expired
            expires = datetime.fromisoformat(cache_entry["expires"])
            if expires <= datetime.utcnow():
                cache_file.unlink()
                return None

            # Decode value: the payload is base64-encoded pickle data,
            # gzip-compressed when the entry's "compressed" flag says so
            raw = base64.b64decode(cache_entry["value"])
            if cache_entry.get("compressed"):
                raw = gzip.decompress(raw)
            value = pickle.loads(raw)

            # Update memory cache
            self._local_cache[key] = {
                "value": value,
                "expires": expires
            }

            return value

        except Exception as e:
            logger.warning("Local cache get failed", key=key, error=str(e))
            return None

    async def _set_to_local(
        self,
        key: str,
        value: Any,
        cache_type: CacheType,
        ttl: int,
        use_compression: Optional[bool]
    ) -> bool:
        """Set value in localStorage."""
        try:
            expires = datetime.utcnow() + timedelta(seconds=ttl)

            # Serialize, compressing large payloads if enabled; pickle output
            # is binary, so base64-encode it before embedding it in JSON
            serialized = pickle.dumps(value)
            compressed = False
            if (use_compression or (use_compression is None and self.enable_compression)) \
                    and len(serialized) > self.compression_threshold:
                serialized = gzip.compress(serialized)
                compressed = True
            value_serialized = base64.b64encode(serialized).decode("ascii")

            # Create cache entry
            cache_entry = {
                "value": value_serialized,
                "expires": expires.isoformat(),
                "compressed": compressed,
                "cache_type": cache_type.value
            }

            # Write to file
            cache_file = self.localStorage_path / f"{key}.cache"
            cache_file.write_bytes(json.dumps(cache_entry).encode())

            # Update memory cache
            self._local_cache[key] = {
                "value": value,
                "expires": expires
            }

            return True

        except Exception as e:
            logger.warning("Local cache set failed", key=key, error=str(e))
            return False

    def get_stats(self) -> Dict[str, Any]:
        """Get cache statistics."""
        total_requests = self._stats["hits"] + self._stats["misses"]
        hit_rate = self._stats["hits"] / max(total_requests, 1) * 100

        return {
            **self._stats,
            "total_requests": total_requests,
            "hit_rate": round(hit_rate, 2),
            "redis_enabled": self.enable_redis,
            "memory_cache_size": len(self._local_cache)
        }

    async def cleanup_expired(self) -> int:
        """Clean up expired cache entries."""
        cleaned = 0

        try:
            # Clean memory cache
            now = datetime.utcnow()
            expired_keys = [
                key for key, entry in self._local_cache.items()
                if entry["expires"] <= now
            ]

            for key in expired_keys:
                del self._local_cache[key]
                cleaned += 1

            # Clean file cache
            for cache_file in self.localStorage_path.glob("*.cache"):
                try:
                    data = json.loads(cache_file.read_bytes().decode())
                    expires = datetime.fromisoformat(data["expires"])
                    if expires <= datetime.utcnow():
                        cache_file.unlink()
                        cleaned += 1
                except Exception:
                    # Invalid cache file, remove it
                    cache_file.unlink()
                    cleaned += 1

            logger.info("Cache cleanup completed", cleaned_entries=cleaned)
            return cleaned

        except Exception as e:
            logger.error("Cache cleanup failed", error=str(e))
            return 0


# Global cache service instance
_cache_service: Optional[CacheService] = None


async def get_cache_service() -> CacheService:
    """Get or create cache service instance."""
    global _cache_service

    if _cache_service is None:
        _cache_service = CacheService()

    return _cache_service


# Utility functions for specific cache types
async def cache_translation(
    key: str,
    translation: Dict[str, Any],
    language: str
) -> bool:
    """Cache a translation entry."""
    cache = await get_cache_service()
    cache_key = cache._generate_cache_key("translation", key, lang=language)
    return await cache.set(cache_key, translation, CacheType.TRANSLATION)


async def get_cached_translation(
    key: str,
    language: str
) -> Optional[Dict[str, Any]]:
    """Get cached translation."""
    cache = await get_cache_service()
    cache_key = cache._generate_cache_key("translation", key, lang=language)
    return await cache.get(cache_key, CacheType.TRANSLATION)


async def cache_user_preference(
    user_id: str,
    preferences: Dict[str, Any]
) -> bool:
    """Cache user preferences."""
    cache = await get_cache_service()
    cache_key = cache._generate_cache_key("user_pref", user_id)
    return await cache.set(cache_key, preferences, CacheType.USER_PREFERENCE)


async def get_cached_user_preference(
    user_id: str
) -> Optional[Dict[str, Any]]:
    """Get cached user preferences."""
    cache = await get_cache_service()
    cache_key = cache._generate_cache_key("user_pref", user_id)
    return await cache.get(cache_key, CacheType.USER_PREFERENCE)


async def cache_api_response(
    endpoint: str,
    params: Dict[str, Any],
    response: Dict[str, Any],
    ttl: Optional[int] = None
) -> bool:
    """Cache API response."""
    cache = await get_cache_service()
    cache_key = cache._generate_cache_key("api", endpoint, **params)
    return await cache.set(cache_key, response, CacheType.API_RESPONSE, ttl=ttl)


async def get_cached_api_response(
    endpoint: str,
    params: Dict[str, Any]
) -> Optional[Dict[str, Any]]:
    """Get cached API response."""
    cache = await get_cache_service()
    cache_key = cache._generate_cache_key("api", endpoint, **params)
    return await cache.get(cache_key, CacheType.API_RESPONSE)
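Nothing in this file schedules `cleanup_expired`; a minimal way to run it periodically in an asyncio application, using the module's own helpers, is sketched below. The function name and the 10-minute interval are assumptions, not part of this commit.

import asyncio

# Hypothetical background task (not in this commit): sweep expired entries
# every `interval_seconds`. In a FastAPI app this could be started once at
# startup with asyncio.create_task(cache_cleanup_loop()).
async def cache_cleanup_loop(interval_seconds: int = 600) -> None:
    cache = await get_cache_service()
    while True:
        await asyncio.sleep(interval_seconds)
        cleaned = await cache.cleanup_expired()
        logger.info("Periodic cache cleanup", cleaned_entries=cleaned)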
src/services/code_block_handler.py
ADDED
@@ -0,0 +1,630 @@
+"""
+Code Block Handler for Translation System.
+
+This module handles detection, preservation, and intelligent processing
+of code blocks during translation.
+"""
+
+import re
+from typing import Dict, List, Optional, Any, Tuple, Set
+from dataclasses import dataclass
+from enum import Enum
+
+from bs4 import BeautifulSoup, Tag
+import markdown
+from pygments import highlight
+from pygments.lexers import get_lexer_by_name, guess_lexer
+from pygments.formatters import HtmlFormatter
+
+from src.utils.translation_logger import get_translation_logger
+
+logger = get_translation_logger(__name__)
+
+
+class CodeBlockType(Enum):
+    """Types of code blocks."""
+    MARKDOWN = "markdown"
+    HTML_PRE = "html_pre"
+    HTML_INLINE = "html_inline"
+    INDENTED = "indented"
+    FENCED = "fenced"
+
+
+@dataclass
+class CodeBlock:
+    """Represents a detected code block."""
+    block_type: CodeBlockType
+    language: Optional[str]
+    content: str
+    original_text: str
+    start_position: int
+    end_position: int
+    attributes: Dict[str, Any]
+    preserve_formatting: bool = True
+    add_urdu_comments: bool = False
+    translated: bool = False
+
+
+class CodeBlockHandler:
+    """
+    Handles code block detection, preservation, and processing.
+
+    Features:
+    - Multi-format code block detection
+    - Language identification
+    - Format preservation
+    - Urdu comment injection
+    - Syntax highlighting
+    - Code validation
+    """
+
+    # Code block patterns
+    PATTERNS = {
+        CodeBlockType.MARKDOWN: [
+            re.compile(r'```(\w+)?\n(.*?)\n```', re.DOTALL),
+            re.compile(r'~~~(\w+)?\n(.*?)\n~~~', re.DOTALL),
+        ],
+        CodeBlockType.HTML_PRE: [
+            re.compile(r'<pre[^>]*>.*?<code[^>]*>(.*?)</code>.*?</pre>', re.DOTALL | re.IGNORECASE),
+        ],
+        CodeBlockType.HTML_INLINE: [
+            re.compile(r'<code[^>]*>(.*?)</code>', re.DOTALL | re.IGNORECASE),
+        ],
+        CodeBlockType.INDENTED: [
+            # Detect 4+ spaces or tabs at start of line
+            re.compile(r'^(    |\t).*$', re.MULTILINE),
+        ],
+    }
+
+    # Language patterns for detection
+    LANGUAGE_PATTERNS = {
+        'python': [r'import\s+\w+', r'def\s+\w+', r'class\s+\w+', r'if\s+__name__\s*=='],
+        'javascript': [r'function\s+\w+', r'const\s+\w+\s*=', r'let\s+\w+\s*=', r'var\s+\w+\s*='],
+        'java': [r'public\s+class\s+\w+', r'private\s+\w+\s+\w+', r'import\s+java\.'],
+        'cpp': [r'#include\s*<', r'using\s+namespace\s+', r'::\w+\s*\('],
+        'html': [r'<!DOCTYPE\s+html>', r'<html[^>]*>', r'<div[^>]*>'],
+        'css': [r'\.[\w-]+\s*{', r'#[\w-]+\s*{', r'@\w+\s*\w+\s*{'],
+        'sql': [r'SELECT\s+', r'FROM\s+', r'WHERE\s+', r'INSERT\s+INTO'],
+        'json': [r'^\s*{\s*"', r'^\s*\[', r'"[^"]*":\s*'],
+        'yaml': [r'^\s*\w+:', r'^\s+-\s+', r'^\s* \w+:'],
+        'bash': [r'#!/bin/bash', r'echo\s+', r'export\s+\w+='],
+        'powershell': [r'Write-Host\s+', r'\$\w+\s*=', r'Get-'],
+        'dockerfile': [r'FROM\s+\w+', r'RUN\s+', r'CMD\s+'],
+    }
+
+    # Common programming keywords
+    PROGRAMMING_KEYWORDS = [
+        'function', 'class', 'import', 'export', 'return', 'if', 'else', 'for', 'while',
+        'def', 'var', 'let', 'const', 'try', 'catch', 'throw', 'new', 'this', 'super'
+    ]
+
+    def __init__(self):
+        """Initialize code block handler."""
+        self.detected_languages: Set[str] = set()
+        self.urdu_comments = {
+            'python': '#',
+            'javascript': '//',
+            'java': '//',
+            'cpp': '//',
+            'c': '//',
+            'css': '/*',
+            'sql': '--',
+            'bash': '#',
+            'powershell': '#',
+        }
+
+    def detect_code_blocks(
+        self,
+        content: str,
+        source_format: str = "html"
+    ) -> List[CodeBlock]:
+        """
+        Detect all code blocks in content.
+
+        Args:
+            content: Content to analyze
+            source_format: Format type (html, markdown, etc.)
+
+        Returns:
+            List of detected code blocks
+        """
+        logger.info(
+            "Detecting code blocks",
+            content_length=len(content),
+            source_format=source_format
+        )
+
+        blocks = []
+
+        # Try each pattern type
+        for block_type, patterns in self.PATTERNS.items():
+            for pattern in patterns:
+                matches = list(pattern.finditer(content))
+                for match in matches:
+                    block = self._create_code_block(
+                        match, block_type, content
+                    )
+                    if block:
+                        blocks.append(block)
+
+        # Remove duplicates (blocks that overlap)
+        blocks = self._remove_overlapping_blocks(blocks)
+
+        # Detect language for each block
+        for block in blocks:
+            block.language = self._detect_language(block.content)
+
+        logger.info(
+            "Code blocks detected",
+            total_blocks=len(blocks),
+            languages=list(set(b.language for b in blocks if b.language)),
+            block_types=[b.block_type.value for b in blocks]
+        )
+
+        return blocks
+
+    def _create_code_block(
+        self,
+        match: re.Match,
+        block_type: CodeBlockType,
+        content: str
+    ) -> Optional[CodeBlock]:
+        """Create a CodeBlock object from a regex match."""
+        start_pos = match.start()
+        end_pos = match.end()
+        original_text = match.group(0)
+
+        if block_type in [CodeBlockType.MARKDOWN, CodeBlockType.FENCED]:
+            # Extract language from fence
+            language = match.group(1) if match.groups() and match.group(1) else None
+            code_content = match.group(2) if match.groups() and len(match.groups()) > 1 else ""
+        elif block_type == CodeBlockType.HTML_PRE:
+            # Extract from HTML pre/code structure
+            soup = BeautifulSoup(original_text, 'html.parser')
+            code_tag = soup.find('code')
+            if code_tag:
+                language = self._extract_language_from_classes(code_tag.get('class', []))
+                code_content = code_tag.get_text()
+            else:
+                language = None
+                code_content = original_text
+        elif block_type == CodeBlockType.HTML_INLINE:
+            # Inline code
+            soup = BeautifulSoup(original_text, 'html.parser')
+            code_content = soup.get_text()
+            language = None
+        else:
+            # Other types
+            code_content = original_text
+            language = None
+
+        if not code_content.strip():
+            return None
+
+        return CodeBlock(
+            block_type=block_type,
+            language=language,
+            content=code_content,
+            original_text=original_text,
+            start_position=start_pos,
+            end_position=end_pos,
+            attributes={'match_groups': match.groups()},
+            preserve_formatting=True,
+            add_urdu_comments=self._should_add_urdu_comments(code_content, language)
+        )
+
+    def _remove_overlapping_blocks(self, blocks: List[CodeBlock]) -> List[CodeBlock]:
+        """Remove overlapping code blocks."""
+        if not blocks:
+            return []
+
+        # Sort by start position
+        blocks.sort(key=lambda x: x.start_position)
+
+        filtered_blocks = []
+        last_end = -1
+
+        for block in blocks:
+            if block.start_position >= last_end:
+                filtered_blocks.append(block)
+                last_end = block.end_position
+
+        return filtered_blocks
+
+    def _detect_language(self, code_content: str) -> Optional[str]:
+        """Detect the programming language of code content."""
+        # Try language hints first
+        language = self._detect_language_from_hints(code_content)
+        if language:
+            return language
+
+        # Try pattern matching
+        language = self._detect_language_from_patterns(code_content)
+        if language:
+            return language
+
+        # Use pygments as fallback
+        try:
+            lexer = guess_lexer(code_content)
+            if lexer:
+                return lexer.name.lower()
+        except Exception:
+            pass
+
+        return None
+
+    def _detect_language_from_hints(self, code_content: str) -> Optional[str]:
+        """Detect language from explicit hints."""
+        # Check for shebang
+        shebang_match = re.match(r'^#!\s*/.*(?:python|node|bash|perl|ruby|php)\s*', code_content, re.MULTILINE)
+        if shebang_match:
+            shebang = shebang_match.group()
+            if 'python' in shebang:
+                return 'python'
+            elif 'node' in shebang:
+                return 'javascript'
+            elif 'bash' in shebang:
+                return 'bash'
+            elif 'perl' in shebang:
+                return 'perl'
+            elif 'ruby' in shebang:
+                return 'ruby'
+            elif 'php' in shebang:
+                return 'php'
+
+        # Check for language comments
+        if code_content.strip().startswith('#!'):
+            return 'bash'  # Likely shell script
+
+        return None
+
+    def _detect_language_from_patterns(self, code_content: str) -> Optional[str]:
+        """Detect language using pattern matching."""
+        scores = {}
+
+        for language, patterns in self.LANGUAGE_PATTERNS.items():
+            score = 0
+            for pattern in patterns:
+                matches = len(list(re.finditer(pattern, code_content, re.MULTILINE)))
+                score += matches
+
+            if score > 0:
+                scores[language] = score
+
+        if scores:
+            return max(scores.items(), key=lambda x: x[1])[0]
+
+        return None
+
+    def _extract_language_from_classes(self, classes: List[str]) -> Optional[str]:
+        """Extract language from CSS classes."""
+        for cls in classes:
+            if isinstance(cls, str):
+                # Check for language- prefixed classes
+                if cls.startswith('language-'):
+                    return cls[9:]
+                # Check for known language classes
+                if cls.lower() in ['python', 'javascript', 'java', 'cpp', 'c', 'html', 'css', 'sql', 'json']:
+                    return cls.lower()
+                # Check for highlight.js classes
+                if cls.startswith('hljs-'):
+                    lang = cls[5:]
+                    if lang != 'language':
+                        return lang
+
+        return None
+
+    def _should_add_urdu_comments(self, code_content: str, language: Optional[str]) -> bool:
+        """Determine if Urdu comments should be added."""
+        if not language or language not in self.urdu_comments:
+            return False
+
+        # Don't add comments to very short code blocks
+        if len(code_content.split('\n')) < 3:
+            return False
+
+        # Don't add if there are already comments in the target language
+        comment_char = self.urdu_comments[language]
+        if comment_char and comment_char in code_content:
+            # Check for non-English characters in comments
+            comment_pattern = re.compile(f'{re.escape(comment_char)}.*[^\x00-\x7F]+')
+            if comment_pattern.search(code_content):
+                return False
+
+        return True
+
+    def add_urdu_comments(self, code_block: CodeBlock) -> str:
+        """
+        Add Urdu explanatory comments to code block.
+
+        Args:
+            code_block: Code block to enhance
+
+        Returns:
+            Code block with Urdu comments added
+        """
+        if not code_block.language or not code_block.add_urdu_comments:
+            return code_block.content
+
+        language = code_block.language
+        comment_char = self.urdu_comments[language]
+
+        lines = code_block.content.split('\n')
+        enhanced_lines = []
+
+        for i, line in enumerate(lines):
+            enhanced_lines.append(line)
+
+            # Add comments after key lines
+            if self._is_comment_line(line, language):
+                continue
+
+            # Add Urdu comment after function definitions
+            if re.search(r'^(def|function|class|interface)\s+\w+', line):
+                # Extract function/class name
+                match = re.search(r'(def|function|class|interface)\s+(\w+)', line)
+                if match:
+                    name = match.group(2)
+                    urdu_translation = self._translate_code_name(name)
+                    enhanced_lines.append(f"{comment_char} {urdu_translation}")
+
+            # Add comment after important statements
+            elif re.search(r'\b(return|break|continue|pass)\b', line):
+                urdu_comment = self._translate_statement(line.strip())
+                if urdu_comment:
+                    enhanced_lines.append(f"{comment_char} {urdu_comment}")
+
+            # Add comment after imports
+            elif re.match(r'^(import|from|include)\s+', line):
+                urdu_comment = self._translate_import(line.strip())
+                if urdu_comment:
+                    enhanced_lines.append(f"{comment_char} {urdu_comment}")
+
+        return '\n'.join(enhanced_lines)
+
+    def _is_comment_line(self, line: str, language: str) -> bool:
+        """Check if line is already a comment."""
+        comment_char = self.urdu_comments.get(language, '')
+        return bool(comment_char and line.strip().startswith(comment_char))
+
+    def _translate_code_name(self, name: str) -> str:
+        """Translate a code identifier to Urdu."""
+        # Common translations
+        translations = {
+            'main': 'مین',
+            'init': 'ابتدائی',
+            'start': 'شروع',
+            'setup': 'سیٹ اپ',
+            'run': 'چلائیں',
+            'process': 'عملدرس',
+            'handle': 'ہینڈل کریں',
+            'update': 'اپڈیٹ کرنا',
+            'get': 'حاصل کریں',
+            'set': 'سیٹ کرنا',
+            'create': 'بنانا',
+            'delete': 'حذف کرنا',
+            'calculate': 'حساب لگانا',
+            'validate': 'تصدیق کرنا',
+            'convert': 'تبدیل کرنا',
+            'transform': 'تبدیل کرنا',
+            'parse': 'پارس کرنا',
+            'render': 'رینڈر کرنا',
+            'fetch': 'لانا',
+            'send': 'بھیجنا',
+            'receive': 'وصول کرنا',
+            'connect': 'ربط جوڑنا',
+            'close': 'بند کرنا',
+            'open': 'کھولنا',
+            'save': 'محفوظ کرنا',
+            'load': 'لوڈ کرنا',
+            'read': 'پڑھنا',
+            'write': 'لکھنا',
+        }
+
+        return translations.get(name, name)
+
+    def _translate_statement(self, statement: str) -> Optional[str]:
+        """Translate a code statement to Urdu."""
+        # Common statement translations
+        translations = {
+            'return': 'واپس کریں',
+            'break': 'روک جائیں',
+            'continue': 'جاری رکھیں',
+            'pass': 'چھوٹ دیں',
+            'yield': 'دیں',
+            'raise': 'پھلاؤ',
+            'try': 'کوشش کریں',
+            'except': 'چھوٹ',
+            'finally': 'آخر میں',
+            'assert': 'تصدیق کریں',
+            'del': 'حذف کریں',
+        }
+
+        # Extract keyword
+        match = re.search(r'\b(' + '|'.join(translations.keys()) + r')\b', statement)
+        if match:
+            keyword = match.group(1)
+            translated = translations.get(keyword, keyword)
+            return statement.replace(keyword, translated, 1)
+
+        return None
+
+    def _translate_import(self, import_statement: str) -> Optional[str]:
+        """Translate an import statement to Urdu."""
+        if 'import ' in import_statement:
+            return 'لائبریری امپورٹ کریں'
+        elif 'from ' in import_statement:
+            return 'سے امپورٹ کریں'
+        elif 'include ' in import_statement:
+            return 'شامل کریں'
+
+        return None
+
+    def preserve_code_blocks(
+        self,
+        original_content: str,
+        translated_content: str,
+        code_blocks: List[CodeBlock]
+    ) -> str:
+        """
+        Preserve code blocks in translated content.
+
+        Args:
+            original_content: Original content with code blocks
+            translated_content: Translated content
+            code_blocks: Detected code blocks
+
+        Returns:
+            Content with original code blocks preserved
+        """
+        logger.info(
+            "Preserving code blocks",
+            original_blocks=len(code_blocks)
+        )
+
+        # Replace translated code blocks with original ones
+        result = translated_content
+        blocks_preserved = 0
+
+        for block in code_blocks:
+            # Find and replace the corresponding block in translated content
+            # This is simplified - in practice, you'd want more precise matching
+            translated_block_content = self._find_translated_block(
+                result, block, original_content
+            )
+
+            if translated_block_content is not None:
+                # Replace with original
+                result = result.replace(
+                    translated_block_content,
+                    block.original_text,
+                    1
+                )
+                blocks_preserved += 1
+
+                # Add Urdu comments if configured
+                if block.add_urdu_comments:
+                    enhanced_code = self.add_urdu_comments(block)
+                    result = result.replace(
+                        block.original_text,
+                        enhanced_code,
+                        1
+                    )
+
+        logger.info(
+            "Code blocks preserved",
+            blocks_preserved=blocks_preserved,
+            blocks_total=len(code_blocks)
+        )
+
+        return result
+
+    def _find_translated_block(
+        self,
+        content: str,
+        original_block: CodeBlock,
+        original_content: str
+    ) -> Optional[str]:
+        """Find the translated version of a code block."""
+        # This is a simplified implementation
+        # In practice, you'd track blocks more precisely during translation
+
+        # Look for the block content in the translated content
+        # This might not work perfectly due to translation changes
+        if original_block.content in content:
+            return original_block.content
+
+        # Try to find by looking for unique lines
+        original_lines = original_block.content.split('\n')
+        if len(original_lines) > 3:
+            # Use first and last lines as markers
+            first_line = original_lines[0]
+            last_line = original_lines[-1]
+
+            if first_line in content and last_line in content:
+                # Extract content between markers
+                start = content.find(first_line)
+                end = content.rfind(last_line) + len(last_line)
+                return content[start:end]
+
+        return None
+
+    def add_syntax_highlighting(
+        self,
+        code_block: CodeBlock,
+        theme: str = "default"
+    ) -> str:
+        """
+        Add syntax highlighting to a code block.
+
+        Args:
+            code_block: Code block to highlight
+            theme: Highlighting theme
+
+        Returns:
+            HTML with syntax highlighting
+        """
+        try:
+            lexer = get_lexer_by_name(code_block.language or 'text')
+            formatter = HtmlFormatter(
+                style=theme,
+                linenos=True,
+                cssclass="highlight"
+            )
+            return highlight(code_block.content, lexer, formatter)
+        except Exception:
+            # Fallback to plain code block
+            return f'<pre><code>{code_block.content}</code></pre>'
+
+    def validate_code_blocks(
+        self,
+        code_blocks: List[CodeBlock],
+        content: str
+    ) -> Dict[str, Any]:
+        """
+        Validate detected code blocks.
+
+        Args:
+            code_blocks: Detected code blocks
+            content: Original content
+
+        Returns:
+            Validation report
+        """
+        report = {
+            'valid': True,
+            'warnings': [],
+            'errors': [],
+            'stats': {
+                'total_blocks': len(code_blocks),
+                'languages_detected': list(set(b.language for b in code_blocks if b.language)),
+                'blocks_with_languages': len([b for b in code_blocks if b.language])
+            }
+        }
+
+        for block in code_blocks:
+            # Check for empty blocks
+            if not block.content.strip():
+                report['warnings'].append(
+                    f"Empty code block at position {block.start_position}"
+                )
+
+            # Check for very long blocks
+            if len(block.content) > 10000:
+                report['warnings'].append(
+                    f"Very long code block ({len(block.content)} chars) at position {block.start_position}"
+                )
+
+            # Check for potential formatting issues
+            if block.block_type == CodeBlockType.INDENTED and block.content.strip():
+                report['warnings'].append(
+                    f"Indented code block detected at position {block.start_position} - might be unintentional"
+                )
+
+        logger.info(
+            "Code block validation complete",
+            total_warnings=len(report['warnings']),
+            total_errors=len(report['errors'])
+        )
+
+        return report
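
A short sketch of how CodeBlockHandler is meant to be driven, based only on the signatures in the listing above; the markdown snippet and its "translation" are invented for illustration, and running it assumes the src.utils.translation_logger module imported by the handler is available:

# Hypothetical usage (not part of this commit).
from src.services.code_block_handler import CodeBlockHandler

handler = CodeBlockHandler()

original = "Intro text.\n```python\ndef greet():\n    return 'hi'\n```"
translated = "Translated intro.\n```python\ndef greet():\n    return 'hi'\n```"

# Detect fenced/HTML/indented blocks and guess their languages.
blocks = handler.detect_code_blocks(original, source_format="markdown")
print([(b.block_type.value, b.language) for b in blocks])

# Swap any translated block bodies back to the originals.
restored = handler.preserve_code_blocks(original, translated, blocks)

# Sanity-check the detected blocks.
report = handler.validate_code_blocks(blocks, original)
print(report["stats"])

Note that _find_translated_block falls back to first/last-line matching when a translated block no longer matches byte-for-byte, so restoration is best-effort rather than guaranteed.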
src/services/content_reconstructor.py
ADDED
@@ -0,0 +1,471 @@
+"""
+Content Reconstructor for Translation System.
+
+This module reconstructs HTML content from parsed elements,
+injecting translated text while preserving original formatting
+and structure.
+"""
+
+from typing import Dict, List, Optional, Any
+from dataclasses import dataclass
+
+from bs4 import BeautifulSoup, Tag, NavigableString
+import re
+import markdown
+
+from src.services.html_parser import ContentElement, ContentType
+from src.utils.translation_logger import get_translation_logger
+
+logger = get_translation_logger(__name__)
+
+
+@dataclass
+class ReconstructionConfig:
+    """Configuration for content reconstruction."""
+    preserve_classes: bool = True
+    preserve_ids: bool = True
+    preserve_data_attributes: bool = False
+    preserve_style: bool = True
+    add_translation_markers: bool = False
+    cleanup_empty_elements: bool = True
+
+
+class ContentReconstructor:
+    """
+    Reconstructs HTML content from parsed elements with translations.
+
+    Features:
+    - Recursive HTML reconstruction
+    - Formatting preservation
+    - Code block protection
+    - Translation marker injection
+    - Structure validation
+    """
+
+    def __init__(self, config: Optional[ReconstructionConfig] = None):
+        """
+        Initialize content reconstructor.
+
+        Args:
+            config: Reconstruction configuration
+        """
+        self.config = config or ReconstructionConfig()
+        self.translation_markers = {
+            'translated': 'data-translated="true"',
+            'original': 'data-original="',
+            'preserve': 'data-preserve="true"'
+        }
+
+    def reconstruct_html(
+        self,
+        elements: List[ContentElement],
+        translated_map: Dict[str, str],
+        base_format: str = "html"
+    ) -> str:
+        """
+        Reconstruct HTML from parsed elements with translations.
+
+        Args:
+            elements: Parsed content elements
+            translated_map: Mapping of original text to translated text
+            base_format: Base format (html, markdown, etc.)
+
+        Returns:
+            Reconstructed HTML content
+        """
+        logger.info(
+            "Reconstructing HTML content",
+            elements_count=len(elements),
+            translations_count=len(translated_map),
+            base_format=base_format
+        )
+
+        # Create base document
+        if base_format == "html":
+            soup = BeautifulSoup("", "html.parser")
+            body = soup.new_tag("body")
+            soup.append(body)
+        else:
+            soup = BeautifulSoup("", "html.parser")
+
+        # Reconstruct elements
+        container = soup.body if soup.body else soup
+        for element in elements:
+            reconstructed = self._reconstruct_element(element, translated_map, soup)
+            if reconstructed:
+                container.append(reconstructed)
+
+        # Post-processing
+        html_content = str(soup)
+
+        if self.config.cleanup_empty_elements:
+            html_content = self._cleanup_empty_elements(html_content)
+
+        logger.info(
+            "HTML reconstruction complete",
+            output_length=len(html_content)
+        )
+
+        return html_content
+
+    def _reconstruct_element(
+        self,
+        element: ContentElement,
+        translated_map: Dict[str, str],
+        soup: BeautifulSoup
+    ) -> Optional[Tag]:
+        """
+        Reconstruct a single element.
+
+        Args:
+            element: Content element to reconstruct
+            translated_map: Translation mapping
+            soup: BeautifulSoup document
+
+        Returns:
+            Reconstructed HTML tag
+        """
+        # Handle special content types
+        if element.element_type == ContentType.CODE:
+            return self._reconstruct_code_element(element, soup)
+        elif element.element_type == ContentType.IMAGE:
+            return self._reconstruct_image_element(element, soup)
+        elif element.element_type == ContentType.LINK:
+            return self._reconstruct_link_element(element, soup)
+        elif element.element_type == ContentType.METADATA:
+            return None  # Skip metadata
+
+        # Create appropriate tag
+        tag = self._create_tag(element.element_type, soup, element)
+
+        # Add attributes
+        self._add_attributes(tag, element)
+
+        # Add content or children
+        if element.should_translate and element.element_type == ContentType.TEXT:
+            # Add translated text
+            translated_text = translated_map.get(element.content, element.content)
+            tag.string = translated_text
+
+            # Add translation marker if configured
+            if self.config.add_translation_markers:
+                tag['data-translated'] = 'true'
+                tag['data-original'] = element.content
+
+        elif element.children:
+            # Reconstruct children
+            for child in element.children:
+                child_tag = self._reconstruct_element(child, translated_map, soup)
+                if child_tag:
+                    tag.append(child_tag)
+
+        elif element.content:
+            # Add original content for non-translatable elements
+            tag.string = element.content
+            if element.element_type != ContentType.CODE:
+                tag['data-preserve'] = 'true'
+
+        return tag
+
+    def _reconstruct_code_element(
+        self,
+        element: ContentElement,
+        soup: BeautifulSoup
+    ) -> Tag:
+        """Reconstruct a code element."""
+        # Determine if it's inline or block code
+        is_inline = (
+            element.element_type == ContentType.INLINE_CODE or
+            not element.attributes.get('class', [])
+        )
+
+        if is_inline:
+            tag = soup.new_tag("code")
+        else:
+            tag = soup.new_tag("pre")
+            code_tag = soup.new_tag("code")
+            tag.append(code_tag)
+            tag = code_tag
+
+        # Add language class if specified
+        if 'language' in element.attributes:
+            tag['class'] = f"language-{element.attributes['language']}"
+
+        # Add original content
+        tag.string = element.content
+        tag['data-preserve'] = 'true'
+
+        return tag
+
+    def _reconstruct_image_element(
+        self,
+        element: ContentElement,
+        soup: BeautifulSoup
+    ) -> Tag:
+        """Reconstruct an image element."""
+        tag = soup.new_tag("img")
+
+        # Add attributes
+        for attr, value in element.attributes.items():
+            if attr in ['src', 'alt', 'title', 'width', 'height', 'class', 'id']:
+                tag[attr] = value
+
+        # Ensure essential attributes
+        if 'src' not in element.attributes and 'data-src' in element.attributes:
+            tag['src'] = element.attributes['data-src']
+
+        tag['data-preserve'] = 'true'
+        return tag
+
+    def _reconstruct_link_element(
+        self,
+        element: ContentElement,
+        soup: BeautifulSoup
+    ) -> Tag:
+        """Reconstruct a link element."""
+        tag = soup.new_tag("a")
+
+        # Add attributes
+        for attr, value in element.attributes.items():
+            if attr in ['href', 'title', 'target', 'class', 'id']:
+                tag[attr] = value
+
+        # Add content (typically don't translate URLs)
+        tag.string = element.content
+        tag['data-preserve'] = 'true'
+
+        return tag
+
+    def _create_tag(self, element_type: ContentType, soup: BeautifulSoup, element=None) -> Tag:
+        """Create appropriate HTML tag for element type."""
+        tag_mapping = {
+            ContentType.TEXT: "p",
+            ContentType.HEADING: "p",  # Will be updated based on attributes
+            ContentType.LIST: "ul",  # Default to unordered list
+            ContentType.QUOTE: "blockquote",
+            ContentType.EMPHASIS: "em",
+            ContentType.STRONG: "strong",
+            ContentType.TABLE: "table",
+            ContentType.CODE: "code",
+        }
+
+        tag_name = tag_mapping.get(element_type, "div")
+
+        if element_type == ContentType.HEADING and element and 'level' in element.attributes:
+            level = element.attributes['level']
+            if isinstance(level, int) and 1 <= level <= 6:
+                tag_name = f"h{level}"
+
+        return soup.new_tag(tag_name)
+
+    def _add_attributes(self, tag: Tag, element: ContentElement) -> None:
+        """Add attributes to reconstructed tag."""
+        for attr, value in element.attributes.items():
+            # Skip internal attributes
+            if attr.startswith('_'):
+                continue
+
+            # Skip content attributes
+            if attr in ['content', 'text']:
+                continue
+
+            # Attribute filtering based on config
+            if attr == 'class' and not self.config.preserve_classes:
+                continue
+            elif attr == 'id' and not self.config.preserve_ids:
+                continue
+            elif attr.startswith('data-') and not self.config.preserve_data_attributes:
+                continue
+            elif attr == 'style' and not self.config.preserve_style:
+                continue
+
+            tag[attr] = value
+
+    def _cleanup_empty_elements(self, html: str) -> str:
+        """Remove empty elements from HTML."""
+        # Remove empty tags
+        html = re.sub(r'<([a-z]+)[^>]*>\s*</\1>', '', html)
+
+        # Remove extra whitespace
+        html = re.sub(r'\s+', ' ', html)
+
+        # Clean up around tags
+        html = re.sub(r'>\s+<', '><', html)
+        html = re.sub(r'\s+', ' ', html)
+
+        return html.strip()
+
+    def inject_translated_text(
+        self,
+        html_content: str,
+        translated_segments: List[Dict[str, Any]]
+    ) -> str:
+        """
+        Inject translated text segments into HTML content.
+
+        Args:
+            html_content: Original HTML content
+            translated_segments: List of translated text segments with positions
+
+        Returns:
+            HTML content with translated text injected
+        """
+        logger.info(
+            "Injecting translated text",
+            segments_count=len(translated_segments)
+        )
+
+        # Sort segments by position (reverse order to maintain indices)
+        segments = sorted(translated_segments, key=lambda x: x.get('position', 0), reverse=True)
+
+        result = html_content
+        for segment in segments:
+            start = segment.get('start', 0)
+            end = segment.get('end', len(result))
+            translated_text = segment.get('translated_text', '')
+
+            # Replace the segment
+            result = result[:start] + translated_text + result[end:]
+
+        return result
+
+    def create_translation_markers(
+        self,
+        elements: List[ContentElement]
+    ) -> List[Dict[str, Any]]:
+        """
+        Create marker positions for text segments to be translated.
+
+        Args:
+            elements: Parsed content elements
+
+        Returns:
+            List of marker positions
+        """
+        markers = []
+        current_position = 0
+
+        for element in elements:
+            if element.should_translate and element.element_type == ContentType.TEXT:
+                text = element.content
+                if text.strip():
+                    markers.append({
+                        'start': current_position,
+                        'end': current_position + len(text),
+                        'original_text': text,
+                        'element_id': id(element)
+                    })
+                    current_position += len(text)
+
+        logger.info(
+            "Created translation markers",
+            markers_count=len(markers),
+            text_length=current_position
+        )
+
+        return markers
+
+    def validate_reconstruction(
+        self,
+        original_html: str,
+        reconstructed_html: str,
+        original_elements: List[ContentElement],
+        reconstructed_elements: List[ContentElement]
+    ) -> Dict[str, Any]:
+        """
+        Validate the reconstruction process.
+
+        Args:
+            original_html: Original HTML content
+            reconstructed_html: Reconstructed HTML content
+            original_elements: Original parsed elements
+            reconstructed_elements: Reconstructed elements
+
+        Returns:
+            Validation report
+        """
+        report = {
+            'is_valid': True,
+            'errors': [],
+            'warnings': [],
+            'stats': {
+                'original_length': len(original_html),
+                'reconstructed_length': len(reconstructed_html),
+                'original_elements': len(original_elements),
+                'reconstructed_elements': len(reconstructed_elements)
+            }
+        }
+
+        # Check element counts
+        original_types = self._count_elements_by_type(original_elements)
+        reconstructed_types = self._count_elements_by_type(reconstructed_elements)
+
+        for element_type, count in original_types.items():
+            reconstructed_count = reconstructed_types.get(element_type, 0)
+            if count != reconstructed_count:
+                report['errors'].append(
+                    f"Element count mismatch for {element_type.value}: "
+                    f"original={count}, reconstructed={reconstructed_count}"
+                )
+                report['is_valid'] = False
+
+        # Check code blocks preservation
+        original_code = len([e for e in original_elements if e.element_type == ContentType.CODE])
+        reconstructed_code = len([e for e in reconstructed_elements if e.element_type == ContentType.CODE])
+
+        if original_code != reconstructed_code:
+            report['errors'].append(
+                f"Code blocks not preserved: original={original_code}, reconstructed={reconstructed_code}"
+            )
+            report['is_valid'] = False
+
+        # Check for preserved attributes
+        preserved_attributes = self._check_preserved_attributes(
+            original_elements,
+            reconstructed_elements
+        )
+        if not preserved_attributes['all_preserved']:
+            report['warnings'].extend(preserved_attributes['missing_attributes'])
+
+        logger.info(
+            "Reconstruction validation complete",
+            is_valid=report['is_valid'],
+            errors_count=len(report['errors']),
+            warnings_count=len(report['warnings'])
+        )
+
+        return report
+
+    def _count_elements_by_type(self, elements: List[ContentElement]) -> Dict[ContentType, int]:
+        """Count elements by type."""
+        counts = {}
+        for element in elements:
+            counts[element.element_type] = counts.get(element.element_type, 0) + 1
+        return counts
+
+    def _check_preserved_attributes(
+        self,
+        original_elements: List[ContentElement],
+        reconstructed_elements: List[ContentElement]
+    ) -> Dict[str, Any]:
+        """Check if important attributes are preserved."""
+        result = {
+            'all_preserved': True,
+            'missing_attributes': []
+        }
+
+        important_attrs = ['id', 'class', 'href', 'src', 'alt']
+
+        # This is a simplified check
+        # In practice, you'd want more sophisticated comparison
+        for orig_elem in original_elements:
+            for attr in important_attrs:
+                if attr in orig_elem.attributes:
+                    result['missing_attributes'].append(
+                        f"Attribute '{attr}' may not be preserved in element {orig_elem.element_type.value}"
+                    )
+
+        if result['missing_attributes']:
+            result['all_preserved'] = False
+
+        return result
src/services/html_parser.py
ADDED
@@ -0,0 +1,565 @@
+"""
+HTML Parser for Translation Formatting Preservation.
+
+This module parses HTML content to extract structure, identify
+different content types, and prepare for translation while preserving
+formatting.
+"""
+
+import re
+from typing import Dict, List, Optional, Any, Tuple
+from dataclasses import dataclass
+from enum import Enum
+
+from bs4 import BeautifulSoup, Tag, NavigableString
+import markdown
+
+from src.utils.translation_logger import get_translation_logger
+
+logger = get_translation_logger(__name__)
+
+
+class ContentType(Enum):
+    """Content types for translation handling."""
+    TEXT = "text"
+    CODE = "code"
+    HEADING = "heading"
+    LIST = "list"
+    LINK = "link"
+    IMAGE = "image"
+    TABLE = "table"
+    QUOTE = "quote"
+    EMPHASIS = "emphasis"
+    STRONG = "strong"
+    INLINE_CODE = "inline_code"
+    MATH = "math"
+    METADATA = "metadata"
+
+
+@dataclass
+class ContentElement:
+    """Represents a parsed content element."""
+    element_type: ContentType
+    content: str
+    attributes: Dict[str, Any]
+    children: List['ContentElement']
+    parent: Optional['ContentElement'] = None
+    should_translate: bool = True
+    preserve_formatting: bool = True
+    position: int = 0
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary for serialization."""
+        return {
+            "type": self.element_type.value,
+            "content": self.content,
+            "attributes": self.attributes,
+            "children": [child.to_dict() for child in self.children],
+            "should_translate": self.should_translate,
+            "preserve_formatting": self.preserve_formatting,
+            "position": self.position
+        }
+
+
+class HTMLParser:
+    """
+    HTML parser for translation with formatting preservation.
+
+    Features:
+    - Recursive HTML parsing
+    - Content type identification
+    - Code block detection and preservation
+    - Formatting marker injection
+    - Structure reconstruction support
+    """
+
+    # Code block patterns
+    CODE_BLOCK_PATTERNS = [
+        re.compile(r'```(\w+)?\n(.*?)\n```', re.DOTALL),  # Markdown code blocks
+        re.compile(r'<pre><code[^>]*>(.*?)</code></pre>', re.DOTALL | re.IGNORECASE),  # HTML pre/code blocks
+        re.compile(r'<code[^>]*>(.*?)</code>', re.DOTALL | re.IGNORECASE),  # Inline code
+    ]
+
+    # Special tags that should not be translated
+    NON_TRANSLATABLE_TAGS = {
+        'script', 'style', 'noscript', 'iframe', 'object', 'embed',
+        'svg', 'math', 'canvas', 'video', 'audio'
+    }
+
+    # Tags that preserve inner structure
+    STRUCTURE_PRESERVING_TAGS = {
+        'pre', 'code', 'kbd', 'samp', 'var'
+    }
+
+    # Formatting tags
+    FORMATTING_TAGS = {
+        'em', 'i', 'strong', 'b', 'mark', 'small', 'del', 'ins',
+        'sub', 'sup', 'u', 'tt'
+    }
+
+    def __init__(self):
+        """Initialize HTML parser."""
+        self.position_counter = 0
+        self.translation_markers = {
+            'start': '{{TRANSLATE_START}}',
+            'end': '{{TRANSLATE_END}}',
+            'skip': '{{SKIP_TRANSLATION}}'
+        }
+
+    def parse_html(
+        self,
+        html_content: str,
+        source_format: str = "html"
+    ) -> List[ContentElement]:
+        """
+        Parse HTML content into structured elements.
+
+        Args:
+            html_content: HTML content to parse
+            source_format: Format type (html, markdown, etc.)
+
+        Returns:
+            List of parsed content elements
+        """
+        logger.info(
+            "Parsing HTML content",
+            content_length=len(html_content),
+            source_format=source_format
+        )
+
+        # Convert markdown to HTML if needed
+        if source_format == "markdown":
+            html_content = markdown.markdown(
+                html_content,
+                extensions=['codehilite', 'tables', 'toc']
+            )
+
+        # Parse with BeautifulSoup
+        soup = BeautifulSoup(html_content, 'html.parser')
+
+        # Extract and parse elements
+        elements = []
+        self.position_counter = 0
+
+        for child in soup.body.children if soup.body else soup.children:
+            element = self._parse_node(child)
+            if element:
+                elements.append(element)
+
+        logger.info(
+            "HTML parsing complete",
+            elements_count=len(elements),
+            translate_elements=len([e for e in self._flatten_elements(elements) if e.should_translate])
+        )
+
+        return elements
+
+    def _parse_node(self, node) -> Optional[ContentElement]:
+        """
+        Parse a BeautifulSoup node into a content element.
+
+        Args:
+            node: BeautifulSoup node
+
+        Returns:
+            Parsed content element or None
+        """
+        if isinstance(node, NavigableString):
+            # Handle text content
+            text = str(node).strip()
+            if text:
+                return ContentElement(
+                    element_type=ContentType.TEXT,
+                    content=text,
+                    attributes={},
+                    children=[],
+                    should_translate=True,
+                    preserve_formatting=False,
+                    position=self.position_counter
+                )
+            return None
+
+        elif isinstance(node, Tag):
+            tag_name = node.name.lower()
+            attributes = dict(node.attrs)
+
+            # Determine content type
+            element_type = self._determine_content_type(node, tag_name)
+
+            # Check if should translate
+            should_translate = self._should_translate_content(node, tag_name)
+
+            # Parse children
+            children = []
+            for child in node.children:
+                child_element = self._parse_node(child)
+                if child_element:
+                    child_element.parent = node  # type: ignore
+                    children.append(child_element)
+
+            # Create element
+            element = ContentElement(
+                element_type=element_type,
+                content=node.get_text(strip=True) if should_translate else "",
+                attributes=attributes,
+                children=children,
+                should_translate=should_translate,
+                preserve_formatting=self._should_preserve_formatting(tag_name),
+                position=self.position_counter
+            )
+
+            self.position_counter += 1
+            return element
+
+        return None
+
+    def _determine_content_type(self, node: Tag, tag_name: str) -> ContentType:
+        """Determine the content type of a node."""
+        # Code blocks
+        if tag_name in ['pre', 'code'] or self._has_code_class(node):
+            return ContentType.CODE
+
+        # Headings
+        elif tag_name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
+            return ContentType.HEADING
+
+        # Lists
+        elif tag_name in ['ul', 'ol', 'li', 'dl', 'dt', 'dd']:
+            return ContentType.LIST
+
+        # Links
+        elif tag_name == 'a':
+            return ContentType.LINK
+
+        # Images
+        elif tag_name == 'img':
+            return ContentType.IMAGE
+
+        # Tables
+        elif tag_name in ['table', 'thead', 'tbody', 'tr', 'td', 'th']:
+            return ContentType.TABLE
+
+        # Quotes
+        elif tag_name in ['blockquote', 'q']:
+            return ContentType.QUOTE
+
+        # Inline formatting
+        elif tag_name in self.FORMATTING_TAGS:
+            if tag_name in ['em', 'i']:
+                return ContentType.EMPHASIS
+            elif tag_name in ['strong', 'b']:
+                return ContentType.STRONG
+            elif tag_name == 'code' and not self._is_block_code(node):
+                return ContentType.INLINE_CODE
+
+        # Math
+        elif tag_name in ['math', 'mrow', 'mfrac', 'msqrt', 'mroot']:
+            return ContentType.MATH
+
+        # Metadata
+        elif tag_name in ['meta', 'title', 'head', 'style', 'script']:
+            return ContentType.METADATA
+
+        # Default to text
+        else:
+            return ContentType.TEXT
+
+    def _should_translate_content(self, node: Tag, tag_name: str) -> bool:
+        """Determine if content should be translated."""
+        # Don't translate non-translatable tags
+        if tag_name in self.NON_TRANSLATABLE_TAGS:
+            return False
+
+        # Don't translate code blocks
+        if tag_name == 'code' and (node.parent and node.parent.name == 'pre'):
+            return False
+
+        if tag_name == 'pre':
+            return False
+
+        # Don't translate if class indicates code
+        if self._has_code_class(node):
+            return False
+
+        # Don't translate image alt text that's purely technical
+        if tag_name == 'img' and self._is_technical_alt_text(node.get('alt', '')):
+            return False
+
+        return True
+
+    def _should_preserve_formatting(self, tag_name: str) -> bool:
+        """Check if formatting should be preserved."""
+        return tag_name in (self.STRUCTURE_PRESERVING_TAGS | self.FORMATTING_TAGS)
+
+    def _has_code_class(self, node: Tag) -> bool:
+        """Check if node has code-related classes."""
+        classes = node.get('class', [])
+        if isinstance(classes, str):
+            classes = [classes]
+
+        code_indicators = [
+            'language-', 'highlight', 'code', ' hljs', 'chroma',
+            'source-code', 'pre', 'verbatim', 'literal'
|
| 303 |
+
]
|
| 304 |
+
|
| 305 |
+
return any(
|
| 306 |
+
any(indicator in cls for indicator in code_indicators)
|
| 307 |
+
for cls in classes
|
| 308 |
+
)
|
| 309 |
+
|
| 310 |
+
def _is_block_code(self, node: Tag) -> bool:
|
| 311 |
+
"""Check if code element is a block code."""
|
| 312 |
+
return (
|
| 313 |
+
node.name == 'code' and
|
| 314 |
+
node.parent and
|
| 315 |
+
node.parent.name == 'pre'
|
| 316 |
+
)
|
| 317 |
+
|
| 318 |
+
def _is_technical_alt_text(self, alt_text: str) -> bool:
|
| 319 |
+
"""Check if alt text is purely technical."""
|
| 320 |
+
technical_indicators = [
|
| 321 |
+
'diagram', 'chart', 'graph', 'formula', 'equation',
|
| 322 |
+
'algorithm', 'flowchart', 'schema', 'architecture'
|
| 323 |
+
]
|
| 324 |
+
|
| 325 |
+
return any(indicator in alt_text.lower() for indicator in technical_indicators)
|
| 326 |
+
|
| 327 |
+
def _flatten_elements(self, elements: List[ContentElement]) -> List[ContentElement]:
|
| 328 |
+
"""Flatten nested elements into a single list."""
|
| 329 |
+
flattened = []
|
| 330 |
+
for element in elements:
|
| 331 |
+
flattened.append(element)
|
| 332 |
+
flattened.extend(self._flatten_elements(element.children))
|
| 333 |
+
return flattened
|
| 334 |
+
|
| 335 |
+
def extract_translatable_text(self, elements: List[ContentElement]) -> str:
|
| 336 |
+
"""
|
| 337 |
+
Extract only translatable text content from elements.
|
| 338 |
+
|
| 339 |
+
Args:
|
| 340 |
+
elements: Parsed content elements
|
| 341 |
+
|
| 342 |
+
Returns:
|
| 343 |
+
Concatenated translatable text
|
| 344 |
+
"""
|
| 345 |
+
translatable_parts = []
|
| 346 |
+
|
| 347 |
+
for element in self._flatten_elements(elements):
|
| 348 |
+
if element.should_translate and element.element_type != ContentType.CODE:
|
| 349 |
+
if element.element_type == ContentType.TEXT:
|
| 350 |
+
translatable_parts.append(element.content)
|
| 351 |
+
else:
|
| 352 |
+
# Add spacing for block elements
|
| 353 |
+
if element.element_type == ContentType.HEADING:
|
| 354 |
+
translatable_parts.append('\n\n')
|
| 355 |
+
|
| 356 |
+
return ''.join(translatable_parts).strip()
|
| 357 |
+
|
| 358 |
+
def inject_translation_markers(
|
| 359 |
+
self,
|
| 360 |
+
elements: List[ContentElement],
|
| 361 |
+
translated_text: str
|
| 362 |
+
) -> List[ContentElement]:
|
| 363 |
+
"""
|
| 364 |
+
Inject translation markers into elements for reconstruction.
|
| 365 |
+
|
| 366 |
+
Args:
|
| 367 |
+
elements: Original parsed elements
|
| 368 |
+
translated_text: Translated text content
|
| 369 |
+
|
| 370 |
+
Returns:
|
| 371 |
+
Elements with markers injected
|
| 372 |
+
"""
|
| 373 |
+
# This is a simplified version - in practice, you'd want
|
| 374 |
+
# more sophisticated mapping of translated text to elements
|
| 375 |
+
translatable_elements = [
|
| 376 |
+
e for e in self._flatten_elements(elements)
|
| 377 |
+
if e.should_translate and e.element_type != ContentType.CODE
|
| 378 |
+
]
|
| 379 |
+
|
| 380 |
+
if translatable_elements:
|
| 381 |
+
# Inject markers around the whole content
|
| 382 |
+
first = translatable_elements[0]
|
| 383 |
+
last = translatable_elements[-1]
|
| 384 |
+
|
| 385 |
+
# Add start marker
|
| 386 |
+
first.attributes['_translation_start'] = True
|
| 387 |
+
|
| 388 |
+
# Add end marker
|
| 389 |
+
last.attributes['_translation_end'] = True
|
| 390 |
+
|
| 391 |
+
return elements
|
| 392 |
+
|
| 393 |
+
def extract_code_blocks(self, html_content: str) -> List[Dict[str, Any]]:
|
| 394 |
+
"""
|
| 395 |
+
Extract code blocks from HTML content.
|
| 396 |
+
|
| 397 |
+
Args:
|
| 398 |
+
html_content: HTML content to parse
|
| 399 |
+
|
| 400 |
+
Returns:
|
| 401 |
+
List of code block information
|
| 402 |
+
"""
|
| 403 |
+
code_blocks = []
|
| 404 |
+
soup = BeautifulSoup(html_content, 'html.parser')
|
| 405 |
+
|
| 406 |
+
# Find all code blocks
|
| 407 |
+
for code_element in soup.find_all(['pre', 'code']):
|
| 408 |
+
if code_element.name == 'pre' or (
|
| 409 |
+
code_element.name == 'code' and
|
| 410 |
+
code_element.parent and
|
| 411 |
+
code_element.parent.name == 'pre'
|
| 412 |
+
):
|
| 413 |
+
language = None
|
| 414 |
+
classes = code_element.get('class', [])
|
| 415 |
+
|
| 416 |
+
# Extract language from classes
|
| 417 |
+
if classes:
|
| 418 |
+
for cls in classes:
|
| 419 |
+
if isinstance(cls, str):
|
| 420 |
+
if cls.startswith('language-'):
|
| 421 |
+
language = cls[9:]
|
| 422 |
+
elif cls in ['python', 'javascript', 'java', 'cpp', 'html', 'css', 'sql']:
|
| 423 |
+
language = cls
|
| 424 |
+
|
| 425 |
+
code_content = code_element.get_text()
|
| 426 |
+
code_html = str(code_element)
|
| 427 |
+
|
| 428 |
+
code_blocks.append({
|
| 429 |
+
'language': language or 'text',
|
| 430 |
+
'content': code_content,
|
| 431 |
+
'html': code_html,
|
| 432 |
+
'position': html_content.find(code_html)
|
| 433 |
+
})
|
| 434 |
+
|
| 435 |
+
logger.info(
|
| 436 |
+
"Code blocks extracted",
|
| 437 |
+
total_blocks=len(code_blocks),
|
| 438 |
+
languages=[cb['language'] for cb in code_blocks]
|
| 439 |
+
)
|
| 440 |
+
|
| 441 |
+
return code_blocks
|
| 442 |
+
|
| 443 |
+
def preserve_code_blocks(
|
| 444 |
+
self,
|
| 445 |
+
html_content: str,
|
| 446 |
+
translated_content: str
|
| 447 |
+
) -> str:
|
| 448 |
+
"""
|
| 449 |
+
Preserve code blocks in translated content.
|
| 450 |
+
|
| 451 |
+
Args:
|
| 452 |
+
html_content: Original HTML with code blocks
|
| 453 |
+
translated_content: Translated HTML (code blocks might be altered)
|
| 454 |
+
|
| 455 |
+
Returns:
|
| 456 |
+
HTML with original code blocks preserved
|
| 457 |
+
"""
|
| 458 |
+
# Extract code blocks from original
|
| 459 |
+
original_blocks = self.extract_code_blocks(html_content)
|
| 460 |
+
|
| 461 |
+
# Replace code blocks in translated content with originals
|
| 462 |
+
result = translated_content
|
| 463 |
+
for block in original_blocks:
|
| 464 |
+
result = result.replace(block['html'], block['html'], 1)
|
| 465 |
+
|
| 466 |
+
logger.info(
|
| 467 |
+
"Code blocks preserved",
|
| 468 |
+
blocks_count=len(original_blocks)
|
| 469 |
+
)
|
| 470 |
+
|
| 471 |
+
return result
|
| 472 |
+
|
| 473 |
+
def validate_structure(
|
| 474 |
+
self,
|
| 475 |
+
original_elements: List[ContentElement],
|
| 476 |
+
translated_elements: List[ContentElement]
|
| 477 |
+
) -> List[str]:
|
| 478 |
+
"""
|
| 479 |
+
Validate that structure is preserved between original and translated.
|
| 480 |
+
|
| 481 |
+
Args:
|
| 482 |
+
original_elements: Original parsed elements
|
| 483 |
+
translated_elements: Translated parsed elements
|
| 484 |
+
|
| 485 |
+
Returns:
|
| 486 |
+
List of validation errors
|
| 487 |
+
"""
|
| 488 |
+
errors = []
|
| 489 |
+
|
| 490 |
+
# Compare structure counts
|
| 491 |
+
original_types = self._count_element_types(original_elements)
|
| 492 |
+
translated_types = self._count_element_types(translated_elements)
|
| 493 |
+
|
| 494 |
+
for element_type, count in original_types.items():
|
| 495 |
+
if element_type != ContentType.TEXT: # Text count may differ
|
| 496 |
+
translated_count = translated_types.get(element_type, 0)
|
| 497 |
+
if count != translated_count:
|
| 498 |
+
errors.append(
|
| 499 |
+
f"Element count mismatch for {element_type.value}: "
|
| 500 |
+
f"original={count}, translated={translated_count}"
|
| 501 |
+
)
|
| 502 |
+
|
| 503 |
+
# Check that code blocks are preserved
|
| 504 |
+
original_code_blocks = len([
|
| 505 |
+
e for e in self._flatten_elements(original_elements)
|
| 506 |
+
if e.element_type == ContentType.CODE
|
| 507 |
+
])
|
| 508 |
+
translated_code_blocks = len([
|
| 509 |
+
e for e in self._flatten_elements(translated_elements)
|
| 510 |
+
if e.element_type == ContentType.CODE
|
| 511 |
+
])
|
| 512 |
+
|
| 513 |
+
if original_code_blocks != translated_code_blocks:
|
| 514 |
+
errors.append(
|
| 515 |
+
f"Code block count mismatch: "
|
| 516 |
+
f"original={original_code_blocks}, translated={translated_code_blocks}"
|
| 517 |
+
)
|
| 518 |
+
|
| 519 |
+
logger.info(
|
| 520 |
+
"Structure validation complete",
|
| 521 |
+
errors_count=len(errors),
|
| 522 |
+
element_types_matched=len(set(original_types.keys()) & set(translated_types.keys()))
|
| 523 |
+
)
|
| 524 |
+
|
| 525 |
+
return errors
|
| 526 |
+
|
| 527 |
+
def _count_element_types(self, elements: List[ContentElement]) -> Dict[ContentType, int]:
|
| 528 |
+
"""Count occurrences of each element type."""
|
| 529 |
+
counts = {}
|
| 530 |
+
for element in self._flatten_elements(elements):
|
| 531 |
+
counts[element.element_type] = counts.get(element.element_type, 0) + 1
|
| 532 |
+
return counts
|
| 533 |
+
|
| 534 |
+
def generate_structure_report(
|
| 535 |
+
self,
|
| 536 |
+
elements: List[ContentElement]
|
| 537 |
+
) -> Dict[str, Any]:
|
| 538 |
+
"""
|
| 539 |
+
Generate a report of the content structure.
|
| 540 |
+
|
| 541 |
+
Args:
|
| 542 |
+
elements: Parsed content elements
|
| 543 |
+
|
| 544 |
+
Returns:
|
| 545 |
+
Structure report
|
| 546 |
+
"""
|
| 547 |
+
flattened = self._flatten_elements(elements)
|
| 548 |
+
type_counts = self._count_element_types(elements)
|
| 549 |
+
|
| 550 |
+
report = {
|
| 551 |
+
"total_elements": len(flattened),
|
| 552 |
+
"element_types": {
|
| 553 |
+
type_name.value: count
|
| 554 |
+
for type_name, count in type_counts.items()
|
| 555 |
+
},
|
| 556 |
+
"translatable_elements": len([e for e in flattened if e.should_translate]),
|
| 557 |
+
"code_blocks": type_counts.get(ContentType.CODE, 0),
|
| 558 |
+
"headings": type_counts.get(ContentType.HEADING, 0),
|
| 559 |
+
"lists": type_counts.get(ContentType.LIST, 0),
|
| 560 |
+
"links": type_counts.get(ContentType.LINK, 0),
|
| 561 |
+
"images": type_counts.get(ContentType.IMAGE, 0),
|
| 562 |
+
"tables": type_counts.get(ContentType.TABLE, 0)
|
| 563 |
+
}
|
| 564 |
+
|
| 565 |
+
return report
|
src/services/openai_translation/__init__.py
ADDED
@@ -0,0 +1,10 @@
"""
OpenAI Translation Service with Gemini API.

This package provides translation services using OpenAI Agents SDK
with Gemini API for high-quality English to Urdu translation.
"""

from .service import OpenAITranslationService

__all__ = ["OpenAITranslationService"]
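The re-export above lets callers import the service from the package root:

from src.services.openai_translation import OpenAITranslationService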
src/services/openai_translation/client.py
ADDED
@@ -0,0 +1,59 @@
"""
OpenAI Agents SDK Client for Gemini API.
"""

import os

# AsyncOpenAI lives in the openai package; the agents package only provides
# the chat-completions model wrapper (the original imported both from agents)
from openai import AsyncOpenAI
from agents import OpenAIChatCompletionsModel


class GeminiOpenAIClient:
    """OpenAI Agents SDK client for Gemini API."""

    def __init__(self):
        """Initialize Gemini OpenAI client."""
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("GEMINI_API_KEY not configured")

        # Initialize AsyncOpenAI client pointed at Gemini's OpenAI-compatible endpoint
        self.provider = AsyncOpenAI(
            base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
            api_key=api_key,
        )

        # Define the chat completions model using Gemini
        self.model = OpenAIChatCompletionsModel(
            openai_client=self.provider,
            model="gemini-2.0-flash-lite",
        )

    def get_provider(self) -> AsyncOpenAI:
        """Get the AsyncOpenAI provider."""
        return self.provider

    def get_client(self) -> AsyncOpenAI:
        """Get the AsyncOpenAI client (alias for get_provider)."""
        return self.provider

    def get_model(self) -> OpenAIChatCompletionsModel:
        """Get the OpenAI chat completions model."""
        return self.model

    async def test_connection(self) -> bool:
        """Test the connection to Gemini API."""
        try:
            # Try a minimal one-token completion request
            await self.provider.chat.completions.create(
                model="gemini-2.0-flash-lite",
                messages=[{"role": "user", "content": "test"}],
                max_tokens=1
            )
            return True
        except Exception as e:
            print(f"Connection test failed: {str(e)}")
            return False


def get_gemini_client() -> GeminiOpenAIClient:
    """Get the Gemini client instance."""
    return GeminiOpenAIClient()
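A minimal usage sketch for this client (assumes GEMINI_API_KEY is set in the environment):

import asyncio
from src.services.openai_translation.client import get_gemini_client

async def main() -> None:
    client = get_gemini_client()
    print("Gemini reachable:", await client.test_connection())

asyncio.run(main())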
src/services/openai_translation/openai_agent.py
ADDED
@@ -0,0 +1,533 @@
"""
OpenAI Agents SDK Implementation for Translation.

This module properly implements translation using the OpenAI Agents SDK
with Gemini API integration, including proper error handling for rate limits.
"""

import asyncio
from typing import Dict, List, Optional, Any, AsyncGenerator
from dataclasses import dataclass
import time
import json

from agents import Agent, Runner, function_tool, RunContextWrapper
# This import was missing; OpenAIRateLimitError is referenced throughout
from openai import RateLimitError as OpenAIRateLimitError
from src.services.openai_translation.client import GeminiOpenAIClient, get_gemini_client
from src.models.translation_openai import TranslationJob, TranslationChunk
from src.utils.translation_logger import get_translation_logger
from src.utils.translation_errors import (
    TranslationError, RateLimitError, APIError,
    retry_with_exponential_backoff, handle_api_error
)

logger = get_translation_logger(__name__)


@dataclass
class TranslationContext:
    """Context information for translation."""
    page_url: Optional[str] = None
    page_title: Optional[str] = None
    document_type: Optional[str] = None  # book, article, documentation, etc.
    technical_domain: Optional[str] = None  # AI, robotics, programming, etc.
    target_audience: Optional[str] = None  # students, professionals, general
    previous_translations: Optional[List[str]] = None
    glossary: Optional[Dict[str, str]] = None
    chunk_index: Optional[int] = None
    total_chunks: Optional[int] = None


class OpenAITranslationAgent:
    """
    OpenAI Agents SDK-based translation agent with proper error handling.

    Uses the official OpenAI Agents SDK with Gemini API for intelligent translation
    with context awareness and specialized tools.
    """

    def __init__(
        self,
        gemini_client: GeminiOpenAIClient,
        model: str = "gemini-2.0-flash-lite"
    ):
        """
        Initialize translation agent.

        Args:
            gemini_client: Configured Gemini OpenAI client
            model: Model to use for translation
        """
        self.client = gemini_client
        self.model = model
        self.agent = self._create_agent()

        logger.info(
            "OpenAI Translation Agent initialized",
            model=model
        )

    def _create_agent(self) -> Agent:
        """Create the translation agent with tools and proper error handling."""
        instructions = """
        You are a professional translator specializing in technical content translation from English to Urdu.

        Your primary task is to translate English content to Urdu while:
        1. Maintaining technical accuracy
        2. Using appropriate Urdu terminology
        3. Preserving code blocks and technical identifiers
        4. Providing contextually appropriate translations
        5. Using Urdu script (Nastaleeq) for all Urdu text

        Key Translation Guidelines:
        - Translate ALL content unless explicitly marked as code
        - Use Urdu script for all translations
        - For technical terms, use established Urdu translations where available
        - For brand new terms, create appropriate Urdu equivalents
        - Maintain the original document structure and formatting
        - Code blocks remain in English but add Urdu comments if helpful

        Technical Term Examples:
        - AI → مصنوعی ذہانت
        - Machine Learning → مشین لرننگ
        - Robotics → روبوٹکس
        - Computer Vision → کمپیوٹر ویژن
        - Neural Network → نیورل نیٹورک
        - Algorithm → الگورتھم

        Error Handling:
        - If you encounter rate limiting errors, wait and retry automatically
        - If translation fails for a chunk, note the error and continue
        - Always provide meaningful error messages

        Always strive for natural, fluent Urdu that accurately conveys the technical meaning.
        """

        return Agent(
            name="UrduTechnicalTranslator",
            instructions=instructions,
            # Use the Gemini-backed model object so the agent calls Gemini's
            # endpoint; passing the bare model-name string would route through
            # the default OpenAI client
            model=self.client.get_model(),
            tools=[
                self._create_translate_tool(),
                self._create_analyze_code_tool(),
                self._create_glossary_tool(),
                self._create_context_tool()
            ]
        )

    async def _handle_rate_limit_error(self, error: Exception) -> None:
        """
        Handle rate limit errors with proper backoff.

        Args:
            error: The rate limit error
        """
        if isinstance(error, OpenAIRateLimitError):
            logger.warning(
                "Rate limit hit, implementing backoff",
                retry_after=error.retry_after if hasattr(error, 'retry_after') else None
            )

            # Back off; fall back to 1 second when no retry_after is provided
            retry_after = getattr(error, 'retry_after', None) or 1
            await asyncio.sleep(retry_after)

        # Handle HTTP 429 from OpenAI client
        elif hasattr(error, 'status_code') and error.status_code == 429:
            retry_after = 1
            if hasattr(error, 'response') and error.response:
                try:
                    error_data = error.response.json()
                    retry_after = error_data.get('retry_after', retry_after)
                except Exception:
                    pass

            logger.warning(
                "HTTP 429 rate limit hit",
                retry_after=retry_after
            )
            await asyncio.sleep(retry_after)

    async def translate_with_agent(
        self,
        text: str,
        context: Optional[TranslationContext] = None
    ) -> Dict[str, Any]:
        """
        Translate text using OpenAI Agents SDK with proper error handling.

        Args:
            text: Text to translate
            context: Translation context

        Returns:
            Translation result with metadata
        """
        logger.info(
            "Starting translation with OpenAI Agents SDK",
            text_length=len(text),
            has_context=bool(context)
        )

        # Prepare context prompt
        context_info = ""
        if context:
            if context.technical_domain:
                context_info += f"\nDomain: {context.technical_domain}"
            if context.document_type:
                context_info += f"\nDocument Type: {context.document_type}"
            if context.target_audience:
                context_info += f"\nTarget Audience: {context.target_audience}"
            if context.chunk_index is not None:
                context_info += f"\nChunk: {context.chunk_index + 1} of {context.total_chunks or '?'}"

        # Create the translation prompt
        prompt = f"""
        Translate the following English text to Urdu:

        {context_info}

        Text:
        {text}

        Requirements:
        - Use Urdu script (Nastaleeq)
        - Translate all non-code content
        - Preserve formatting and structure
        - Use appropriate technical terminology
        - Maintain consistency with previous translations
        """

        try:
            # Execute with retry and rate-limit handling. Runner.run is a
            # classmethod taking the agent and the input; the previous
            # Runner(self.agent) / runner.run(prompt) calls did not match the
            # Agents SDK API.
            max_retries = 3
            for attempt in range(max_retries):
                try:
                    result = await Runner.run(self.agent, prompt)

                    # Extract metadata
                    tokens_used = 0
                    if hasattr(result, 'usage') and result.usage:
                        tokens_used = result.usage.total_tokens

                    return {
                        "translated_text": result.final_output.strip(),
                        "original_text": text,
                        "tokens_used": tokens_used,
                        "model": self.model,
                        "confidence_score": 0.9,  # Placeholder
                        "attempt": attempt + 1,
                        "context": context_info
                    }

                except OpenAIRateLimitError as e:
                    if attempt < max_retries - 1:
                        await self._handle_rate_limit_error(e)
                        continue
                    else:
                        raise RateLimitError(
                            f"Rate limit exceeded after {max_retries} attempts",
                            retry_after=getattr(e, 'retry_after', None)
                        )

                except Exception as e:
                    # Check if it's an HTTP 429 error
                    if hasattr(e, 'status_code') and e.status_code == 429:
                        if attempt < max_retries - 1:
                            await self._handle_rate_limit_error(e)
                            continue
                        else:
                            raise RateLimitError(
                                f"Rate limit exceeded after {max_retries} attempts",
                                retry_after=getattr(e, 'retry_after', 1)
                            )
                    else:
                        # Re-raise non-rate-limit errors
                        raise

        except RateLimitError:
            raise
        except Exception as e:
            logger.error(
                "Agent translation failed",
                error=str(e),
                error_type=type(e).__name__
            )
            raise TranslationError(
                f"Translation failed: {str(e)}",
                error_type="AGENT_ERROR",
                details={"original_error": str(e)}
            )

    def _create_translate_tool(self):
        """Create the translate tool for the agent."""
        @function_tool
        async def translate_text(
            ctx: RunContextWrapper[Any],
            text: str,
            context: Optional[Dict[str, Any]] = None,
            preserve_formatting: bool = True
        ) -> str:
            """
            Translate text from English to Urdu using the OpenAI client directly.

            This is a fallback tool used by the agent for complex translations.
            """
            logger.debug(
                "Using translate_text tool",
                text_length=len(text)
            )

            try:
                # Use the Gemini OpenAI client directly
                client = self.client.get_client()

                response = await client.chat.completions.create(
                    model=self.model,
                    messages=[
                        {
                            "role": "system",
                            "content": "You are a professional translator for technical content."
                        },
                        {
                            "role": "user",
                            "content": f"Translate to Urdu: {text}"
                        }
                    ],
                    temperature=0.3,
                    max_tokens=4000
                )

                return response.choices[0].message.content.strip()

            except Exception:
                # HTTP 429 responses already surface as openai.RateLimitError,
                # which translate_with_agent's retry loop handles, so errors
                # are re-raised unchanged here (the previous version tried to
                # construct OpenAIRateLimitError with a retry_after kwarg the
                # class does not accept)
                raise

        return translate_text

    def _create_analyze_code_tool(self):
        """Create the code analysis tool for the agent."""
        @function_tool
        async def analyze_code_blocks(
            ctx: RunContextWrapper[Any],
            text: str
        ) -> List[Dict[str, Any]]:
            """
            Analyze text to identify and extract code blocks.
            """
            import re

            # Pattern to match fenced code blocks
            code_pattern = re.compile(r'```(\w+)?\n(.*?)\n```', re.DOTALL)

            code_blocks = []
            for match in code_pattern.finditer(text):
                language = match.group(1) or "text"
                code_content = match.group(2)
                start_pos = match.start()
                end_pos = match.end()

                code_blocks.append({
                    "language": language,
                    "content": code_content,
                    "start_position": start_pos,
                    "end_position": end_pos,
                    "length": len(code_content)
                })

            return code_blocks

        return analyze_code_blocks

    def _create_glossary_tool(self):
        """Create the glossary tool for the agent."""
        @function_tool
        async def get_translation_glossary(
            ctx: RunContextWrapper[Any],
            domain: Optional[str] = None
        ) -> Dict[str, str]:
            """
            Get domain-specific translation glossary.
            """
            glossaries = {
                "ai": {
                    "Artificial Intelligence": "مصنوعی ذہانت",
                    "Machine Learning": "مشین لرننگ",
                    "Deep Learning": "ڈیپ لرننگ",
                    "Neural Network": "نیورل نیٹورک",
                    "Algorithm": "الگورتھم",
                    "Model": "ماڈل",
                    "Training": "تربیت",
                    "Inference": "استنتاج",
                    "Dataset": "ڈیٹاسیٹ",
                    "Feature": "خصوصیت"
                },
                "robotics": {
                    "Robot": "روبوٹ",
                    "Actuator": "ایکچویٹر",
                    "Sensor": "سینسر",
                    "Kinematics": "کائنیمیٹکس",
                    "Path Planning": "پاتھ پلاننگ",
                    "Control System": "کنٹرول سسٹم",
                    "Embedded": "ایمبیڈڈ",
                    "Autonomous": "خودکار"
                },
                "programming": {
                    "Function": "فنکشن",
                    "Variable": "متغیر",
                    "Class": "کلاس",
                    "Object": "آبجیکٹ",
                    "Method": "میٹھڈ",
                    "Library": "لائبریری",
                    "Framework": "فریم ورک",
                    "API": "API",
                    "Database": "ڈیٹا بیس",
                    "Server": "سرور"
                }
            }

            if domain and domain.lower() in glossaries:
                return glossaries[domain.lower()]

            # Return combined glossary for general use
            combined = {}
            for gloss in glossaries.values():
                combined.update(gloss)

            return combined

        return get_translation_glossary

    def _create_context_tool(self):
        """Create the context tool for the agent."""
        @function_tool
        async def set_translation_context(
            ctx: RunContextWrapper[Any],
            page_url: Optional[str] = None,
            document_type: Optional[str] = None,
            technical_domain: Optional[str] = None,
            target_audience: Optional[str] = None
        ) -> Dict[str, Any]:
            """
            Set context for translation decisions.
            """
            context = {
                "page_url": page_url,
                "document_type": document_type,
                "technical_domain": technical_domain,
                "target_audience": target_audience,
                "set_at": time.time()
            }

            logger.info(
                "Translation context set via tool",
                context=context
            )

            return {
                "success": True,
                "message": "Translation context updated successfully",
                "context": context
            }

        return set_translation_context

    async def translate_chunk_sequence(
        self,
        chunks: List[str],
        context: Optional[TranslationContext] = None
    ) -> List[Dict[str, Any]]:
        """
        Translate a sequence of chunks maintaining consistency.

        Args:
            chunks: List of text chunks to translate
            context: Translation context

        Returns:
            List of translation results
        """
        logger.info(
            "Translating chunk sequence with OpenAI Agents SDK",
            chunk_count=len(chunks),
            has_context=bool(context)
        )

        results = []
        total_tokens = 0

        for i, chunk in enumerate(chunks):
            logger.debug(
                "Translating chunk",
                chunk_index=i,
                chunk_length=len(chunk)
            )

            # Update context with chunk info
            chunk_context = context
            if chunk_context:
                chunk_context.chunk_index = i
                chunk_context.total_chunks = len(chunks)

            try:
                result = await self.translate_with_agent(chunk, chunk_context)
                result["chunk_index"] = i
                results.append(result)
                total_tokens += result.get("tokens_used", 0)

            except RateLimitError as e:
                logger.error(
                    "Rate limit hit for chunk",
                    chunk_index=i,
                    retry_after=e.retry_after
                )
                # Add rate limit error result
                results.append({
                    "chunk_index": i,
                    "translated_text": f"[RATE LIMIT ERROR: {str(e)}]",
                    "original_text": chunk,
                    "error": str(e),
                    "error_type": "RATE_LIMIT",
                    "tokens_used": 0,
                    "model": self.model,
                    "confidence_score": 0.0,
                    "retry_after": e.retry_after
                })

            except Exception as e:
                logger.error(
                    "Chunk translation failed",
                    chunk_index=i,
                    error=str(e)
                )
                # Add failed result
                results.append({
                    "chunk_index": i,
                    "translated_text": chunk,  # Fallback to original
                    "original_text": chunk,
                    "error": str(e),
                    "tokens_used": 0,
                    "model": self.model,
                    "confidence_score": 0.0
                })

        logger.info(
            "Chunk sequence translation completed",
            total_chunks=len(chunks),
            successful_chunks=len([r for r in results if not r.get("error")]),
            total_tokens=total_tokens
        )

        return results

    async def get_agent(self) -> Agent:
        """Get the configured translation agent."""
        return self.agent
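A minimal usage sketch for the agent (context fields are optional; the result keys follow translate_with_agent above):

import asyncio
from src.services.openai_translation.client import get_gemini_client
from src.services.openai_translation.openai_agent import (
    OpenAITranslationAgent,
    TranslationContext,
)

async def main() -> None:
    agent = OpenAITranslationAgent(get_gemini_client())
    result = await agent.translate_with_agent(
        "Neural networks learn from data.",
        TranslationContext(technical_domain="AI", target_audience="students"),
    )
    print(result["translated_text"], result["tokens_used"])

asyncio.run(main())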
src/services/openai_translation/service.py
ADDED
|
@@ -0,0 +1,855 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
OpenAI Translation Service using Gemini API.
|
| 3 |
+
|
| 4 |
+
This service implements the core translation functionality using
|
| 5 |
+
OpenAI Agents SDK with Gemini's OpenAI-compatible endpoint.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import asyncio
|
| 9 |
+
import hashlib
|
| 10 |
+
import json
|
| 11 |
+
import time
|
| 12 |
+
import uuid
|
| 13 |
+
from datetime import datetime, timedelta
|
| 14 |
+
from typing import Dict, List, Optional, Any, AsyncGenerator
|
| 15 |
+
from dataclasses import dataclass
|
| 16 |
+
|
| 17 |
+
from openai import AsyncOpenAI
|
| 18 |
+
from openai.types.chat import ChatCompletion
|
| 19 |
+
|
| 20 |
+
from src.models.translation_openai import (
|
| 21 |
+
TranslationJob, TranslationChunk, TranslationError, TranslationSession,
|
| 22 |
+
TranslationCache, TranslationJobStatus, ChunkStatus, ErrorSeverity
|
| 23 |
+
)
|
| 24 |
+
from src.services.openai_translation.client import GeminiOpenAIClient, get_gemini_client
|
| 25 |
+
from src.services.cache_service import CacheService, get_cache_service
|
| 26 |
+
from src.database.base import get_db
|
| 27 |
+
from src.utils.translation_errors import (
|
| 28 |
+
TranslationError as TranslationServiceError, APIError, RateLimitError,
|
| 29 |
+
with_translation_error_handling, retry_with_exponential_backoff
|
| 30 |
+
)
|
| 31 |
+
from src.utils.translation_logger import get_translation_logger, log_translation_performance
|
| 32 |
+
|
| 33 |
+
logger = get_translation_logger(__name__)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
@dataclass
|
| 37 |
+
class OpenAITranslationRequest:
|
| 38 |
+
"""Translation request with comprehensive parameters."""
|
| 39 |
+
text: str
|
| 40 |
+
source_language: str
|
| 41 |
+
target_language: str
|
| 42 |
+
page_url: Optional[str] = None
|
| 43 |
+
user_id: Optional[str] = None
|
| 44 |
+
session_id: Optional[str] = None
|
| 45 |
+
|
| 46 |
+
# OpenAI parameters
|
| 47 |
+
model: str = "gemini-2.0-flash-lite"
|
| 48 |
+
temperature: float = 0.3
|
| 49 |
+
max_tokens: int = 2048
|
| 50 |
+
|
| 51 |
+
# Processing options
|
| 52 |
+
preserve_code_blocks: bool = True
|
| 53 |
+
enable_transliteration: bool = True
|
| 54 |
+
chunk_size: int = 2000
|
| 55 |
+
max_chunks: int = 100
|
| 56 |
+
|
| 57 |
+
# Retry settings
|
| 58 |
+
max_retries: int = 3
|
| 59 |
+
retry_delay: float = 1.0
|
| 60 |
+
|
| 61 |
+
# Streaming
|
| 62 |
+
streaming: bool = False
|
| 63 |
+
|
| 64 |
+
# Session context
|
| 65 |
+
user_agent: Optional[str] = None
|
| 66 |
+
ip_address: Optional[str] = None
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
@dataclass
|
| 70 |
+
class OpenAITranslationResponse:
|
| 71 |
+
"""Translation response with comprehensive metadata."""
|
| 72 |
+
job_id: str
|
| 73 |
+
translated_text: str
|
| 74 |
+
status: TranslationJobStatus
|
| 75 |
+
progress: float # 0-100
|
| 76 |
+
chunks: List[Dict[str, Any]]
|
| 77 |
+
processing_time_ms: int
|
| 78 |
+
cached: bool
|
| 79 |
+
|
| 80 |
+
# Cost tracking
|
| 81 |
+
input_tokens: int
|
| 82 |
+
output_tokens: int
|
| 83 |
+
estimated_cost_usd: float
|
| 84 |
+
|
| 85 |
+
# Quality metrics
|
| 86 |
+
confidence_score: Optional[float] = None
|
| 87 |
+
quality_score: Optional[float] = None
|
| 88 |
+
|
| 89 |
+
# Error information
|
| 90 |
+
error_message: Optional[str] = None
|
| 91 |
+
error_details: Optional[Dict[str, Any]] = None
|
| 92 |
+
|
| 93 |
+
# Cache information
|
| 94 |
+
cache_key: Optional[str] = None
|
| 95 |
+
cache_hit: bool = False
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
class OpenAITranslationService:
|
| 99 |
+
"""
|
| 100 |
+
Translation service using OpenAI Agents SDK with Gemini API.
|
| 101 |
+
|
| 102 |
+
Features:
|
| 103 |
+
- OpenAI Agents SDK with Gemini 2.0 Flash model
|
| 104 |
+
- Content chunking for large texts
|
| 105 |
+
- Enhanced caching with page URL support
|
| 106 |
+
- Progress tracking and streaming
|
| 107 |
+
- Error handling and retries
|
| 108 |
+
- Session management
|
| 109 |
+
- Cost and quality tracking
|
| 110 |
+
"""
|
| 111 |
+
|
| 112 |
+
# Translation prompt templates
|
| 113 |
+
TRANSLATION_PROMPT_TEMPLATE = """
|
| 114 |
+
You are a professional translator. Translate the following text from {source_lang} to {target_lang}.
|
| 115 |
+
|
| 116 |
+
CRITICAL REQUIREMENTS:
|
| 117 |
+
1. Translate ALL text to {target_lang} - no English words should remain
|
| 118 |
+
2. ONLY preserve code blocks marked with ```
|
| 119 |
+
3. Translate technical terms with context (e.g., AI → مصنوعی ذہانت)
|
| 120 |
+
4. Use Urdu script (Nastaleeq) for Urdu text
|
| 121 |
+
5. Maintain formatting and structure
|
| 122 |
+
6. Mix Urdu with Roman Urdu for technical terms where appropriate
|
| 123 |
+
|
| 124 |
+
Text to translate:
|
| 125 |
+
{text}
|
| 126 |
+
|
| 127 |
+
Translate only the content above.
|
| 128 |
+
"""
|
| 129 |
+
|
| 130 |
+
CHUNK_TRANSLATION_PROMPT = """
|
| 131 |
+
Translate this text segment from {source_lang} to {target_lang}.
|
| 132 |
+
|
| 133 |
+
Context: This is part {current_part} of {total_parts} of a larger document.
|
| 134 |
+
|
| 135 |
+
Requirements:
|
| 136 |
+
- Maintain consistency with the overall document
|
| 137 |
+
- Translate accurately while preserving meaning
|
| 138 |
+
- Handle technical terms appropriately
|
| 139 |
+
- Keep the flow natural
|
| 140 |
+
- Use Urdu script (Nastaleeq)
|
| 141 |
+
|
| 142 |
+
Text:
|
| 143 |
+
{text}
|
| 144 |
+
|
| 145 |
+
Translation:
|
| 146 |
+
"""
|
| 147 |
+
|
| 148 |
+
# Model pricing (approximate USD per 1K tokens)
|
| 149 |
+
MODEL_PRICING = {
|
| 150 |
+
"gemini-2.0-flash-lite": {
|
| 151 |
+
"input": 0.000075, # $0.075 per 1M input tokens
|
| 152 |
+
"output": 0.00015 # $0.15 per 1M output tokens
|
| 153 |
+
}
|
| 154 |
+
}
|
| 155 |
+
|
| 156 |
+
def __init__(
|
| 157 |
+
self,
|
| 158 |
+
gemini_client: Optional[GeminiOpenAIClient] = None,
|
| 159 |
+
cache_service: Optional[CacheService] = None,
|
| 160 |
+
enable_analytics: bool = True
|
| 161 |
+
):
|
| 162 |
+
"""
|
| 163 |
+
Initialize OpenAI translation service.
|
| 164 |
+
|
| 165 |
+
Args:
|
| 166 |
+
gemini_client: Gemini OpenAI client
|
| 167 |
+
cache_service: Cache service instance
|
| 168 |
+
enable_analytics: Whether to collect detailed analytics
|
| 169 |
+
"""
|
| 170 |
+
self.gemini_client = gemini_client
|
| 171 |
+
self.cache_service = cache_service
|
| 172 |
+
self.enable_analytics = enable_analytics
|
| 173 |
+
|
| 174 |
+
# Initialize services if not provided
|
| 175 |
+
if not self.gemini_client:
|
| 176 |
+
self.gemini_client = get_gemini_client()
|
| 177 |
+
|
| 178 |
+
if not self.cache_service:
|
| 179 |
+
self.cache_service = get_cache_service()
|
| 180 |
+
|
| 181 |
+
logger.info(
|
| 182 |
+
"OpenAI Translation Service initialized",
|
| 183 |
+
model="gemini-2.0-flash-lite",
|
| 184 |
+
analytics_enabled=enable_analytics
|
| 185 |
+
)
|
| 186 |
+
|
| 187 |
+
def _generate_content_hash(self, text: str, source_lang: str, target_lang: str) -> str:
|
| 188 |
+
"""Generate SHA-256 hash for content identification."""
|
| 189 |
+
content = f"{text}:{source_lang}:{target_lang}"
|
| 190 |
+
return hashlib.sha256(content.encode('utf-8')).hexdigest()
|
| 191 |
+
|
| 192 |
+
def _generate_cache_key(self, content_hash: str, page_url: Optional[str] = None) -> str:
|
| 193 |
+
"""Generate comprehensive cache key including page URL."""
|
| 194 |
+
if page_url:
|
| 195 |
+
url_hash = hashlib.sha256(page_url.encode('utf-8')).hexdigest()[:16]
|
| 196 |
+
return f"openai_translation:{content_hash}:{url_hash}"
|
| 197 |
+
return f"openai_translation:{content_hash}"
|
| 198 |
+
|
| 199 |
+
async def _check_cache(
|
| 200 |
+
self,
|
| 201 |
+
content_hash: str,
|
| 202 |
+
page_url: Optional[str] = None
|
| 203 |
+
) -> Optional[TranslationCache]:
|
| 204 |
+
"""Check if translation is cached in database."""
|
| 205 |
+
cache_key = self._generate_cache_key(content_hash, page_url)
|
| 206 |
+
|
| 207 |
+
db = next(get_db())
|
| 208 |
+
try:
|
| 209 |
+
cache_entry = db.query(TranslationCache).filter(
|
| 210 |
+
TranslationCache.cache_key == cache_key,
|
| 211 |
+
TranslationCache.expires_at > datetime.utcnow()
|
| 212 |
+
).first()
|
| 213 |
+
|
| 214 |
+
if cache_entry:
|
| 215 |
+
# Update hit statistics
|
| 216 |
+
cache_entry.hit_count += 1
|
| 217 |
+
cache_entry.last_hit_at = datetime.utcnow()
|
| 218 |
+
db.commit()
|
| 219 |
+
logger.info(
|
| 220 |
+
"Cache hit found",
|
| 221 |
+
cache_key=cache_key[:20],
|
| 222 |
+
hits=cache_entry.hit_count
|
| 223 |
+
)
|
| 224 |
+
return cache_entry
|
| 225 |
+
|
| 226 |
+
finally:
|
| 227 |
+
db.close()
|
| 228 |
+
|
| 229 |
+
return None
|
| 230 |
+
|
| 231 |
+
async def _cache_translation(
|
| 232 |
+
self,
|
| 233 |
+
job: TranslationJob,
|
| 234 |
+
cache_key: str,
|
| 235 |
+
quality_score: Optional[float] = None
|
| 236 |
+
) -> bool:
|
| 237 |
+
"""Cache a successful translation."""
|
| 238 |
+
try:
|
| 239 |
+
db = next(get_db())
|
| 240 |
+
|
| 241 |
+
# Determine TTL based on quality
|
| 242 |
+
if quality_score and quality_score >= 4.5:
|
| 243 |
+
ttl_hours = 30 * 24 # 30 days for high quality
|
| 244 |
+
elif quality_score and quality_score < 3.0:
|
| 245 |
+
ttl_hours = 24 # 1 day for low quality
|
| 246 |
+
else:
|
| 247 |
+
ttl_hours = 7 * 24 # 7 days default
|
| 248 |
+
|
| 249 |
+
expires_at = datetime.utcnow() + timedelta(hours=ttl_hours)
|
| 250 |
+
|
| 251 |
+
cache_entry = TranslationCache(
|
| 252 |
+
cache_key=cache_key,
|
| 253 |
+
job_id=job.id,
|
| 254 |
+
content_hash=job.content_hash,
|
| 255 |
+
page_url=job.page_url,
|
| 256 |
+
                source_language=job.source_language,
                target_language=job.target_language,
                original_text=job.original_text,
                translated_text=job.translated_text,
                model_version=job.model_name,
                processing_time_ms=job.processing_time_ms,
                ttl_hours=ttl_hours,
                expires_at=expires_at,
                quality_score=quality_score,
                is_validated=quality_score is not None
            )

            db.add(cache_entry)
            db.commit()

            logger.info(
                "Translation cached",
                cache_key=cache_key[:20],
                ttl_hours=ttl_hours
            )
            return True

        except Exception as e:
            logger.error("Failed to cache translation", error=str(e))
            return False
        finally:
            db.close()

    async def _translate_with_gemini(
        self,
        text: str,
        source_lang: str,
        target_lang: str,
        model: str,
        temperature: float,
        max_tokens: int,
        is_chunk: bool = False,
        context: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """
        Translate text using Gemini via OpenAI SDK.

        Returns:
            Dict containing translated_text, tokens_used, and response metadata
        """
        client = self.gemini_client.get_client()

        try:
            # Select appropriate prompt
            if is_chunk and context:
                prompt = self.CHUNK_TRANSLATION_PROMPT.format(
                    source_lang=source_lang,
                    target_lang=target_lang,
                    current_part=context.get('current_part', 1),
                    total_parts=context.get('total_parts', 1),
                    text=text
                )
            else:
                prompt = self.TRANSLATION_PROMPT_TEMPLATE.format(
                    source_lang=source_lang,
                    target_lang=target_lang,
                    text=text
                )

            # Call Gemini API via OpenAI SDK
            response = await client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "You are a professional translator."},
                    {"role": "user", "content": prompt}
                ],
                temperature=temperature,
                max_tokens=max_tokens
            )

            # Extract translation and metrics
            translated_text = response.choices[0].message.content
            input_tokens = response.usage.prompt_tokens
            output_tokens = response.usage.completion_tokens

            # Calculate cost
            pricing = self.MODEL_PRICING.get(model, self.MODEL_PRICING["gemini-2.0-flash-lite"])
            estimated_cost = (
                (input_tokens / 1000 * pricing["input"]) +
                (output_tokens / 1000 * pricing["output"])
            )

            return {
                "translated_text": translated_text.strip() if translated_text else "",
                "input_tokens": input_tokens,
                "output_tokens": output_tokens,
                "total_tokens": input_tokens + output_tokens,
                "estimated_cost": estimated_cost,
                "model": model,
                "response_id": response.id
            }

        except Exception as e:
            logger.error("Gemini API error", error=str(e))
            raise TranslationServiceError(
                f"Translation failed: {str(e)}",
                error_type="API_ERROR",
                is_retriable=True
            )

    def _split_text_into_chunks(
        self,
        text: str,
        chunk_size: int,
        max_chunks: int,
        preserve_code_blocks: bool = True
    ) -> List[Dict[str, Any]]:
        """
        Split text into chunks for processing.

        Returns:
            List of chunks with text, position, and metadata
        """
        chunks = []

        if preserve_code_blocks:
            # Handle code blocks separately
            import re
            code_pattern = re.compile(r'```(\w+)?\n(.*?)\n```', re.DOTALL)

            last_end = 0
            chunk_index = 0

            for match in code_pattern.finditer(text):
                # Process text before code block
                text_before = text[last_end:match.start()]
                if text_before:
                    text_chunks = self._split_plain_text(text_before, chunk_size - 200)
                    for chunk_text in text_chunks:
                        if chunk_index >= max_chunks:
                            break
                        chunks.append({
                            "text": chunk_text,
                            "start": last_end,
                            "end": last_end + len(chunk_text),
                            "is_code_block": False,
                            "index": chunk_index
                        })
                        chunk_index += 1
                        last_end += len(chunk_text)

                # Add code block as separate chunk
                if chunk_index < max_chunks:
                    code_lang = match.group(1) or "unknown"
                    code_content = match.group(2)
                    full_code = f"```{code_lang}\n{code_content}\n```"
                    chunks.append({
                        "text": full_code,
                        "start": match.start(),
                        "end": match.end(),
                        "is_code_block": True,
                        "code_language": code_lang,
                        "index": chunk_index
                    })
                    chunk_index += 1
                last_end = match.end()

            # Process remaining text
            if last_end < len(text) and chunk_index < max_chunks:
                remaining_text = text[last_end:]
                text_chunks = self._split_plain_text(remaining_text, chunk_size)
                for chunk_text in text_chunks:
                    if chunk_index >= max_chunks:
                        break
                    chunks.append({
                        "text": chunk_text,
                        "start": last_end,
                        "end": last_end + len(chunk_text),
                        "is_code_block": False,
                        "index": chunk_index
                    })
                    chunk_index += 1
                    last_end += len(chunk_text)
        else:
            # Simple text splitting
            text_chunks = self._split_plain_text(text, chunk_size)
            chunks = [
                {
                    "text": chunk,
                    "start": i * chunk_size,
                    "end": (i + 1) * chunk_size,
                    "is_code_block": False,
                    "index": i
                }
                for i, chunk in enumerate(text_chunks[:max_chunks])
            ]

        return chunks

    def _split_plain_text(self, text: str, chunk_size: int) -> List[str]:
        """Split plain text into chunks, trying to preserve sentences."""
        import re

        chunks = []
        sentences = re.split(r'(?<=[.!?])\s+', text)

        current_chunk = ""
        for sentence in sentences:
            # Re-insert the separator consumed by the sentence split so
            # sentences are not glued together when chunks are rejoined.
            candidate = f"{current_chunk} {sentence}" if current_chunk else sentence
            if len(candidate) <= chunk_size:
                current_chunk = candidate
            else:
                if current_chunk:
                    chunks.append(current_chunk)
                current_chunk = sentence

        if current_chunk:
            chunks.append(current_chunk)

        return chunks

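    # Worked example (hypothetical input): with chunk_size=30,
    #     _split_plain_text("It works. It scales well. Ship it.", 30)
    # yields ["It works. It scales well.", "Ship it."] -- each chunk ends on
    # a sentence boundary and stays within the size limit.
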
    @log_translation_performance
    async def translate(
        self,
        request: OpenAITranslationRequest
    ) -> OpenAITranslationResponse:
        """
        Translate text with comprehensive tracking and caching.

        Args:
            request: Translation request with all parameters

        Returns:
            Translation response with metadata
        """
        start_time = time.time()
        job_id = str(uuid.uuid4())
        content_hash = self._generate_content_hash(
            request.text,
            request.source_language,
            request.target_language
        )
        cache_key = self._generate_cache_key(content_hash, request.page_url)

        logger.bind_request(request_id=job_id).log_translation_request(
            text_length=len(request.text),
            source_lang=request.source_language,
            target_lang=request.target_language,
            page_url=request.page_url
        )

        # Check cache first
        cached_translation = await self._check_cache(content_hash, request.page_url)
        if cached_translation:
            processing_time = int((time.time() - start_time) * 1000)

            logger.log_translation_response(
                translated_length=len(cached_translation.translated_text),
                chunks_count=1,
                cached=True
            )

            return OpenAITranslationResponse(
                job_id=job_id,
                translated_text=cached_translation.translated_text,
                status=TranslationJobStatus.COMPLETED,
                progress=100.0,
                chunks=[],
                processing_time_ms=processing_time,
                cached=True,
                input_tokens=0,
                output_tokens=0,
                estimated_cost_usd=0.0,
                cache_key=cache_key,
                cache_hit=True
            )

        # Create translation job
        db = next(get_db())
        try:
            job = TranslationJob(
                job_id=job_id,
                user_id=request.user_id,
                session_id=request.session_id,
                content_hash=content_hash,
                page_url=request.page_url,
                source_language=request.source_language,
                target_language=request.target_language,
                original_text=request.text,
                model_name=request.model,
                temperature=request.temperature,
                max_tokens=request.max_tokens,
                preserve_code_blocks=request.preserve_code_blocks,
                enable_transliteration=request.enable_transliteration,
                chunk_size=request.chunk_size,
                max_chunks=request.max_chunks,
                user_agent=request.user_agent,
                ip_address=request.ip_address
            )

            db.add(job)
            db.commit()

            # Split text into chunks
            chunks_data = self._split_text_into_chunks(
                request.text,
                request.chunk_size,
                request.max_chunks,
                request.preserve_code_blocks
            )

            job.chunks_total = len(chunks_data)
            job.status = TranslationJobStatus.PROCESSING.value
            job.started_at = datetime.utcnow()
            db.commit()

            # Process chunks
            translated_chunks = []
            total_input_tokens = 0
            total_output_tokens = 0
            total_cost = 0.0

            for i, chunk_data in enumerate(chunks_data):
                try:
                    # Create chunk record
                    chunk = TranslationChunk(
                        job_id=job.id,
                        chunk_index=i,
                        original_text=chunk_data["text"],
                        start_position=chunk_data["start"],
                        end_position=chunk_data["end"],
                        is_code_block=chunk_data["is_code_block"],
                        code_language=chunk_data.get("code_language"),
                        word_count=len(chunk_data["text"].split()),
                        status=ChunkStatus.PROCESSING.value,
                        started_at=datetime.utcnow()
                    )
                    db.add(chunk)
                    db.commit()

                    # Translate or skip code blocks
                    if chunk_data["is_code_block"] and request.preserve_code_blocks:
                        translated_text = chunk_data["text"]
                        chunk.status = ChunkStatus.COMPLETED.value
                        chunk.translated_text = translated_text
                        chunk.completed_at = datetime.utcnow()
                    else:
                        # Translate chunk with retry logic
                        async def translate_chunk():
                            return await self._translate_with_gemini(
                                chunk_data["text"],
                                request.source_language,
                                request.target_language,
                                request.model,
                                request.temperature,
                                request.max_tokens,
                                is_chunk=True,
                                context={
                                    "current_part": i + 1,
                                    "total_parts": len(chunks_data)
                                } if len(chunks_data) > 1 else None
                            )

                        result = await retry_with_exponential_backoff(
                            translate_chunk,
                            max_retries=request.max_retries
                        )

                        translated_text = result["translated_text"]
                        chunk.translated_text = translated_text
                        chunk.input_tokens = result["input_tokens"]
                        chunk.output_tokens = result["output_tokens"]
                        chunk.status = ChunkStatus.COMPLETED.value
                        chunk.completed_at = datetime.utcnow()

                        total_input_tokens += result["input_tokens"]
                        total_output_tokens += result["output_tokens"]
                        total_cost += result["estimated_cost"]

                    # Update job progress
                    job.chunks_completed += 1
                    job.progress_percentage = (job.chunks_completed / job.chunks_total) * 100
                    db.commit()

                    # Add to response chunks
                    translated_chunks.append({
                        "index": i,
                        "original_text": chunk_data["text"],
                        "translated_text": translated_text,
                        "start_position": chunk_data["start"],
                        "end_position": chunk_data["end"],
                        "is_code_block": chunk_data["is_code_block"],
                        "code_language": chunk_data.get("code_language")
                    })

                except Exception as e:
                    # Handle chunk error
                    chunk.status = ChunkStatus.FAILED.value
                    chunk.last_error = str(e)
                    job.chunks_failed += 1

                    # Log error
                    logger.log_error(e, chunk_index=i)

                    db.commit()
                    logger.error(f"Chunk {i} translation failed", error=str(e))

            # Reconstruct final translation; join with a space so sentence
            # chunks (whose separating whitespace was consumed during
            # splitting) are not glued back together.
            final_translation = ' '.join(chunk["translated_text"] for chunk in translated_chunks)

            # Update job completion
            job.translated_text = final_translation
            job.input_tokens = total_input_tokens
            job.output_tokens = total_output_tokens
            job.estimated_cost_usd = total_cost
            job.status = (
                TranslationJobStatus.COMPLETED.value
                if job.chunks_failed == 0
                else TranslationJobStatus.FAILED.value
            )
            job.completed_at = datetime.utcnow()
            job.processing_time_ms = int((time.time() - start_time) * 1000)
            job.progress_percentage = 100.0
            db.commit()

            # Cache successful translation
            if job.chunks_failed == 0:
                await self._cache_translation(job, cache_key)

            processing_time = int((time.time() - start_time) * 1000)

            logger.log_translation_response(
                translated_length=len(final_translation),
                chunks_count=len(translated_chunks),
                tokens_used=total_input_tokens + total_output_tokens,
                cost_usd=total_cost,
                cached=False
            )

            logger.info(
                "Translation completed",
                job_id=job_id,
                chunks=len(chunks_data),
                failed=job.chunks_failed,
                processing_time_ms=processing_time,
                total_cost=total_cost
            )

            return OpenAITranslationResponse(
                job_id=job_id,
                translated_text=final_translation,
                status=TranslationJobStatus(job.status),
                progress=100.0,
                chunks=translated_chunks,
                processing_time_ms=processing_time,
                cached=False,
                input_tokens=total_input_tokens,
                output_tokens=total_output_tokens,
                estimated_cost_usd=total_cost,
                cache_key=cache_key,
                cache_hit=False,
                error_message=(
                    f"{job.chunks_failed} chunks failed"
                    if job.chunks_failed > 0
                    else None
                )
            )

        except Exception as e:
            # Update job status to failed
            if 'job' in locals():
                job.status = TranslationJobStatus.FAILED.value
                job.completed_at = datetime.utcnow()
                db.commit()

            logger.log_error(e, job_id=job_id)
            raise TranslationServiceError(
                f"Translation failed: {str(e)}",
                error_type="SYSTEM_ERROR"
            )

        finally:
            db.close()

    async def get_translation_status(self, job_id: str) -> Dict[str, Any]:
        """Get the status of a translation job."""
        db = next(get_db())
        try:
            job = db.query(TranslationJob).filter(
                TranslationJob.job_id == job_id
            ).first()

            if not job:
                raise TranslationServiceError(
                    "Translation job not found",
                    error_type="VALIDATION_ERROR"
                )

            return {
                "job_id": job.job_id,
                "status": job.status,
                "progress": float(job.progress_percentage),
                "chunks_total": job.chunks_total,
                "chunks_completed": job.chunks_completed,
                "chunks_failed": job.chunks_failed,
                "processing_time_ms": job.processing_time_ms,
                "estimated_cost_usd": float(job.estimated_cost_usd),
                "created_at": job.created_at.isoformat(),
                "started_at": job.started_at.isoformat() if job.started_at else None,
                "completed_at": job.completed_at.isoformat() if job.completed_at else None
            }

        finally:
            db.close()

    async def stream_translation_status(self, job_id: str) -> AsyncGenerator[Dict[str, Any], None]:
        """Stream translation status updates."""
        # Placeholder implementation for streaming status updates.
        # This would typically check status periodically and yield updates.
        yield {"type": "start", "job_id": job_id, "message": "Starting stream..."}

        # In a real implementation, you would:
        # 1. Get initial job status
        # 2. Poll status changes
        # 3. Yield updates as they occur
        # 4. Close stream when job completes

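    # A minimal polling sketch for the loop described above (an illustrative
    # assumption, not part of the original implementation; it reuses
    # get_translation_status() and assumes asyncio is imported):
    #
    #     status = await self.get_translation_status(job_id)
    #     while status["status"] not in (
    #         TranslationJobStatus.COMPLETED.value,
    #         TranslationJobStatus.FAILED.value,
    #     ):
    #         yield {"type": "progress", **status}
    #         await asyncio.sleep(1)  # assumed poll interval
    #         status = await self.get_translation_status(job_id)
    #     yield {"type": "done", **status}
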
    async def check_cache(self, content_hash: str, page_url: Optional[str] = None) -> Optional[TranslationCache]:
        """Check cache for translation."""
        return await self._check_cache(content_hash, page_url)

    def generate_cache_key(self, content_hash: str, page_url: Optional[str] = None) -> str:
        """Generate cache key."""
        return self._generate_cache_key(content_hash, page_url)

    async def clear_cache(self, page_url: Optional[str] = None, older_than_hours: Optional[int] = None) -> int:
        """Clear translation cache entries."""
        db = next(get_db())
        try:
            query = db.query(TranslationCache)

            if page_url:
                query = query.filter(TranslationCache.page_url == page_url)

            if older_than_hours:
                cutoff_time = datetime.utcnow() - timedelta(hours=older_than_hours)
                query = query.filter(TranslationCache.created_at < cutoff_time)

            # Get count before deleting
            count = query.count()

            # Delete entries
            query.delete()
            db.commit()

            logger.info(
                "Cache cleared",
                entries_deleted=count,
                page_url=page_url,
                older_than_hours=older_than_hours
            )

            return count

        finally:
            db.close()

    async def health_check(self) -> bool:
        """Check if the service is healthy."""
        try:
            # Test Gemini connection
            await self.gemini_client.test_connection()
            return True
        except Exception as e:
            logger.error("Health check failed", error=str(e))
            return False

    async def get_metrics(self, period: str = "24h") -> Dict[str, Any]:
        """Get translation metrics."""
        # Implementation would aggregate metrics from database.
        # This is a placeholder.
        return {
            "period": period,
            "total_requests": 0,
            "successful_requests": 0,
            "failed_requests": 0,
            "cache_hit_rate": 0.0,
            "avg_processing_time_ms": 0.0,
            "total_cost_usd": 0.0
        }
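
    # One possible aggregation for the placeholder above (a sketch, assuming
    # SQLAlchemy's func is imported; the column names come from the queries
    # already used in this service):
    #
    #     from sqlalchemy import func
    #     total = db.query(func.count(TranslationJob.id)).scalar()
    #     failed = db.query(func.count(TranslationJob.id)).filter(
    #         TranslationJob.status == TranslationJobStatus.FAILED.value
    #     ).scalar()
    #     avg_ms = db.query(func.avg(TranslationJob.processing_time_ms)).scalar()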


# Global service instance
_translation_service: Optional[OpenAITranslationService] = None


async def get_translation_service() -> OpenAITranslationService:
    """Get or create OpenAI translation service instance."""
    global _translation_service

    if _translation_service is None:
        _translation_service = OpenAITranslationService()
        # Initialize the async client
        _translation_service.gemini_client = get_gemini_client()

    return _translation_service
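
# Usage sketch (illustrative only -- the OpenAITranslationRequest fields shown
# are the ones this module itself reads; its full constructor is defined
# elsewhere):
#
#     service = await get_translation_service()
#     response = await service.translate(OpenAITranslationRequest(
#         text="Hello world. See you soon.",
#         source_language="en",
#         target_language="ur",
#         page_url="https://example.com/docs",
#     ))
#     print(response.translated_text, response.estimated_cost_usd)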
src/services/openai_translation/translation_agent.py
ADDED
@@ -0,0 +1,198 @@
"""
|
| 2 |
+
Simplified OpenAI Translation Agent using proper Runner.run pattern.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import asyncio
|
| 6 |
+
from typing import Dict, Optional, Any
|
| 7 |
+
from dataclasses import dataclass
|
| 8 |
+
|
| 9 |
+
from agents import Agent, Runner
|
| 10 |
+
from src.services.openai_translation.client import GeminiOpenAIClient, get_gemini_client
|
| 11 |
+
from src.utils.translation_logger import get_translation_logger
|
| 12 |
+
|
| 13 |
+
logger = get_translation_logger(__name__)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
@dataclass
|
| 17 |
+
class TranslationContext:
|
| 18 |
+
"""Context information for translation."""
|
| 19 |
+
page_url: Optional[str] = None
|
| 20 |
+
document_type: Optional[str] = None
|
| 21 |
+
technical_domain: Optional[str] = None
|
| 22 |
+
target_audience: Optional[str] = None
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class OpenAITranslationAgent:
|
| 26 |
+
"""
|
| 27 |
+
OpenAI Agents SDK-based translation agent using proper Runner.run pattern.
|
| 28 |
+
"""
|
| 29 |
+
|
| 30 |
+
def __init__(
|
| 31 |
+
self,
|
| 32 |
+
gemini_client: Optional[GeminiOpenAIClient] = None,
|
| 33 |
+
model: str = "gemini-2.0-flash-lite"
|
| 34 |
+
):
|
| 35 |
+
"""Initialize translation agent."""
|
| 36 |
+
self.client = gemini_client or get_gemini_client()
|
| 37 |
+
self.model = model
|
| 38 |
+
|
| 39 |
+
# Create the agent with translation instructions
|
| 40 |
+
self.agent = Agent(
|
| 41 |
+
name="Translation Agent",
|
| 42 |
+
instructions=self._get_translation_instructions(),
|
| 43 |
+
model=self.client.get_model()
|
| 44 |
+
)
|
| 45 |
+
|
| 46 |
+
def _get_translation_instructions(self) -> str:
|
| 47 |
+
"""Get the base translation instructions for the agent."""
|
| 48 |
+
return """
|
| 49 |
+
You are a professional translator specializing in English to Urdu translation.
|
| 50 |
+
|
| 51 |
+
CRITICAL REQUIREMENTS:
|
| 52 |
+
1. Translate ALL text to Urdu - no English words should remain
|
| 53 |
+
2. ONLY preserve code blocks marked with ```
|
| 54 |
+
3. Translate technical terms with context (e.g., AI -> مصنوعی ذہانت)
|
| 55 |
+
4. Use Urdu script (Nastaleeq) for Urdu text
|
| 56 |
+
5. Maintain formatting and structure
|
| 57 |
+
6. Mix Urdu with Roman Urdu for technical terms where appropriate
|
| 58 |
+
|
| 59 |
+
When translating:
|
| 60 |
+
- Use appropriate honorifics and politeness levels
|
| 61 |
+
- Translate idioms and expressions to their Urdu equivalents
|
| 62 |
+
- Preserve the meaning and tone of the original text
|
| 63 |
+
- Handle technical terminology correctly
|
| 64 |
+
- Ensure grammatical correctness in Urdu
|
| 65 |
+
|
| 66 |
+
Additional context will be provided as needed for specific domains.
|
| 67 |
+
"""
|
| 68 |
+
|
| 69 |
+
async def translate_with_agent(
|
| 70 |
+
self,
|
| 71 |
+
text: str,
|
| 72 |
+
context: Optional[TranslationContext] = None,
|
| 73 |
+
user_id: Optional[str] = None
|
| 74 |
+
) -> Dict[str, Any]:
|
| 75 |
+
"""
|
| 76 |
+
Translate text using OpenAI Agents SDK with proper Runner.run pattern.
|
| 77 |
+
|
| 78 |
+
Args:
|
| 79 |
+
text: Text to translate
|
| 80 |
+
context: Translation context information
|
| 81 |
+
user_id: User ID for tracking
|
| 82 |
+
|
| 83 |
+
Returns:
|
| 84 |
+
Dictionary containing translation result
|
| 85 |
+
"""
|
| 86 |
+
try:
|
| 87 |
+
# Build the prompt with context
|
| 88 |
+
prompt = self._build_translation_prompt(text, context)
|
| 89 |
+
|
| 90 |
+
logger.info(
|
| 91 |
+
"Starting translation with agent",
|
| 92 |
+
text_length=len(text),
|
| 93 |
+
context=context.document_type if context else None,
|
| 94 |
+
model=self.model
|
| 95 |
+
)
|
| 96 |
+
|
| 97 |
+
# Run the agent using the proper Runner.run pattern
|
| 98 |
+
result = await Runner.run(
|
| 99 |
+
self.agent,
|
| 100 |
+
prompt,
|
| 101 |
+
max_turns=1 # Single turn for simple translation
|
| 102 |
+
)
|
| 103 |
+
|
| 104 |
+
# Extract the translated text
|
| 105 |
+
translated_text = result.final_output
|
| 106 |
+
|
| 107 |
+
# Try to extract tokens from usage if available
|
| 108 |
+
tokens_used = 0
|
| 109 |
+
model_used = self.model
|
| 110 |
+
|
| 111 |
+
# The result might have usage information in different formats
|
| 112 |
+
if hasattr(result, 'usage') and result.usage:
|
| 113 |
+
tokens_used = result.usage.total_tokens if hasattr(result.usage, 'total_tokens') else 0
|
| 114 |
+
model_used = result.usage.model if hasattr(result.usage, 'model') else self.model
|
| 115 |
+
|
| 116 |
+
# Check if the translation contains code blocks
|
| 117 |
+
has_code_blocks = "```" in translated_text
|
| 118 |
+
|
| 119 |
+
# Extract code blocks if present
|
| 120 |
+
code_blocks = []
|
| 121 |
+
if has_code_blocks:
|
| 122 |
+
import re
|
| 123 |
+
code_pattern = re.compile(r'```(\w+)?\n(.*?)\n```', re.DOTALL)
|
| 124 |
+
code_blocks = [
|
| 125 |
+
{
|
| 126 |
+
"language": match.group(1) or "unknown",
|
| 127 |
+
"code": match.group(2)
|
| 128 |
+
}
|
| 129 |
+
for match in code_pattern.finditer(translated_text)
|
| 130 |
+
]
|
| 131 |
+
|
| 132 |
+
logger.info(
|
| 133 |
+
"Translation completed successfully",
|
| 134 |
+
original_length=len(text),
|
| 135 |
+
translated_length=len(translated_text),
|
| 136 |
+
tokens_used=tokens_used,
|
| 137 |
+
has_code_blocks=has_code_blocks
|
| 138 |
+
)
|
| 139 |
+
|
| 140 |
+
return {
|
| 141 |
+
"translated_text": translated_text.strip(),
|
| 142 |
+
"original_text": text,
|
| 143 |
+
"tokens_used": tokens_used,
|
| 144 |
+
"model": model_used,
|
| 145 |
+
"confidence_score": 0.95, # Agent typically produces high-quality translations
|
| 146 |
+
"has_code_blocks": has_code_blocks,
|
| 147 |
+
"code_blocks": code_blocks,
|
| 148 |
+
"context_used": context is not None,
|
| 149 |
+
"processing_time_ms": 0, # Could track this if needed
|
| 150 |
+
"cache_hit": False
|
| 151 |
+
}
|
| 152 |
+
|
| 153 |
+
except Exception as e:
|
| 154 |
+
logger.error(
|
| 155 |
+
"Agent translation failed",
|
| 156 |
+
error=str(e),
|
| 157 |
+
error_type=type(e).__name__,
|
| 158 |
+
text_length=len(text)
|
| 159 |
+
)
|
| 160 |
+
|
| 161 |
+
# Re-raise with context
|
| 162 |
+
raise Exception(f"Translation failed: {str(e)}") from e
|
| 163 |
+
|
| 164 |
+
def _build_translation_prompt(
|
| 165 |
+
self,
|
| 166 |
+
text: str,
|
| 167 |
+
context: Optional[TranslationContext]
|
| 168 |
+
) -> str:
|
| 169 |
+
"""Build the translation prompt with context."""
|
| 170 |
+
prompt_parts = ["Translate the following text from English to Urdu:"]
|
| 171 |
+
|
| 172 |
+
# Add context information if provided
|
| 173 |
+
if context:
|
| 174 |
+
context_parts = []
|
| 175 |
+
if context.document_type:
|
| 176 |
+
context_parts.append(f"Document Type: {context.document_type}")
|
| 177 |
+
if context.technical_domain:
|
| 178 |
+
context_parts.append(f"Technical Domain: {context.technical_domain}")
|
| 179 |
+
if context.target_audience:
|
| 180 |
+
context_parts.append(f"Target Audience: {context.target_audience}")
|
| 181 |
+
|
| 182 |
+
if context_parts:
|
| 183 |
+
prompt_parts.append("\nContext:")
|
| 184 |
+
prompt_parts.append("\n".join(f"- {part}" for part in context_parts))
|
| 185 |
+
|
| 186 |
+
# Add the text to translate
|
| 187 |
+
prompt_parts.append(f"\n\nText to translate:\n{text}")
|
| 188 |
+
|
| 189 |
+
# Add instruction to translate only the content
|
| 190 |
+
prompt_parts.append("\n\nTranslate only the text above.")
|
| 191 |
+
|
| 192 |
+
return "\n".join(prompt_parts)
|
| 193 |
+
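
    # Example (hypothetical values): for text "Hello" and a context with only
    # document_type="tutorial" set, the assembled prompt is:
    #
    #     Translate the following text from English to Urdu:
    #
    #     Context:
    #     - Document Type: tutorial
    #
    #
    #     Text to translate:
    #     Hello
    #
    #
    #     Translate only the text above.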


# Factory function
def create_translation_agent(model: str = "gemini-2.0-flash-lite") -> OpenAITranslationAgent:
    """Create a translation agent instance."""
    return OpenAITranslationAgent(model=model)
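
# Usage sketch (illustrative only; run inside an event loop, e.g. via
# asyncio.run):
#
#     agent = create_translation_agent()
#     result = await agent.translate_with_agent(
#         "Machine learning models need data.",
#         context=TranslationContext(document_type="tutorial",
#                                    technical_domain="machine learning"),
#     )
#     print(result["translated_text"], result["tokens_used"])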