lanny xu committed on
Commit
ddd99a5
·
1 Parent(s): 5ad083c

delete vectara

Browse files
kaggle_multimodal_test.py ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Kaggle多模态测试脚本
3
+ 用于在Kaggle环境中上传PDF和图片并测试多模态功能
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ import subprocess
9
+ import time
10
+ import ipywidgets as widgets
11
+ from IPython.display import display, HTML
12
+ from io import BytesIO
13
+ import base64
14
+ from typing import List, Dict, Any
15
+
16
+ # 添加项目路径
17
+ sys.path.insert(0, '/kaggle/working/adaptive_RAG')
18
+
19
+ # 导入项目模块
20
+ from document_processor import DocumentProcessor
21
+ from main import AdaptiveRAGSystem
22
+ from config import ENABLE_MULTIMODAL, SUPPORTED_IMAGE_FORMATS
23
+
24
class KaggleMultimodalUploader:
    """Notebook UI for uploading a PDF and images and querying the RAG system.

    The class builds a set of ipywidgets (two file-upload controls, a
    "process" button, a query text box, a "query" button and an output
    area), stores uploaded files on disk under ``UPLOAD_DIR`` and wires the
    resulting retriever into an ``AdaptiveRAGSystem`` instance.

    Attributes set by ``__init__``:
        uploaded_files: dict with optional keys ``'pdf'`` (path of the saved
            PDF) and ``'images'`` (list of saved image paths).
        doc_processor: the ``DocumentProcessor`` used to split and index.
        rag_system: the ``AdaptiveRAGSystem`` that answers queries.
    """

    # Directory uploaded files are written to.
    UPLOAD_DIR = '/kaggle/working'

    def __init__(self):
        """Create empty upload state and initialise the processing back end."""
        self.uploaded_files = {}
        self.doc_processor = None
        self.rag_system = None
        self.setup_system()

    def setup_system(self):
        """Instantiate the document processor and the RAG system."""
        print("🔧 正在初始化自适应RAG系统...")

        self.doc_processor = DocumentProcessor()
        self.rag_system = AdaptiveRAGSystem()

        print("✅ 系统初始化完成")

    def create_upload_widgets(self):
        """Build, wire up and display all widgets of the test UI.

        References to the created widgets are kept on the instance
        (``pdf_upload``, ``image_upload``, ``process_button``,
        ``query_input``, ``query_button``, ``output_area``) so the event
        handlers can read them later.
        """
        # Single-file PDF upload control.
        pdf_upload = widgets.FileUpload(
            accept='.pdf',
            multiple=False,
            description='上传PDF',
            style={'description_width': 'initial'}
        )

        # Multi-file image upload control.
        image_upload = widgets.FileUpload(
            accept='.jpg,.jpeg,.png,.gif,.bmp',
            multiple=True,
            description='上传图片',
            style={'description_width': 'initial'}
        )

        # Button that triggers indexing of the uploaded PDF.
        process_button = widgets.Button(
            description='处理文件',
            button_style='success',
            tooltip='点击处理上传的文件'
        )

        # Free-text question input.
        query_input = widgets.Text(
            value='',
            placeholder='输入您的问题...',
            description='问题:',
            style={'description_width': 'initial'}
        )

        # Button that submits the question.
        query_button = widgets.Button(
            description='查询',
            button_style='info',
            tooltip='点击提交查询'
        )

        # Area that captures the output of the two button handlers.
        output_area = widgets.Output()

        # Keep references first so handlers can rely on them as soon as
        # they are registered.
        self.pdf_upload = pdf_upload
        self.image_upload = image_upload
        self.process_button = process_button
        self.query_input = query_input
        self.query_button = query_button
        self.output_area = output_area

        # Register event handlers.
        pdf_upload.observe(self.on_pdf_upload, names='value')
        image_upload.observe(self.on_image_upload, names='value')
        process_button.on_click(self.on_process_click)
        query_button.on_click(self.on_query_click)

        # Render the UI section by section.
        display(HTML("<h2>📄 PDF上传</h2>"))
        display(pdf_upload)

        display(HTML("<h2>🖼️ 图片上传</h2>"))
        display(image_upload)

        display(HTML("<h2>🔧 文件处理</h2>"))
        display(process_button)

        display(HTML("<h2>❓ 查询</h2>"))
        display(query_input)
        display(query_button)

        display(HTML("<h2>📋 输出</h2>"))
        display(output_area)

    @staticmethod
    def _iter_uploads(value):
        """Yield ``(filename, content)`` pairs from a ``FileUpload`` value.

        Two shapes are accepted, because the widget's ``value`` format
        differs between ipywidgets major versions:

        * a mapping ``{filename: {'content': ...}}`` (ipywidgets 7.x);
        * a sequence of records ``{'name': ..., 'content': ...}``
          (ipywidgets 8.x).

        An empty or ``None`` value yields nothing.
        """
        if not value:
            return
        if isinstance(value, dict):
            for name, info in value.items():
                yield name, info['content']
        else:
            for info in value:
                yield info['name'], info['content']

    def _save_upload(self, filename, content):
        """Write *content* to ``UPLOAD_DIR`` and return the resulting path.

        Only the base name of *filename* is used: the name is supplied by
        the browser, so directory components must not be able to steer the
        write outside ``UPLOAD_DIR``.

        Raises:
            ValueError: if *filename* has no usable base name.
        """
        safe_name = os.path.basename(filename)
        if safe_name in ('', '.', '..'):
            raise ValueError(f"invalid upload filename: {filename!r}")
        target = os.path.join(self.UPLOAD_DIR, safe_name)
        with open(target, 'wb') as f:
            f.write(content)
        return target

    def on_pdf_upload(self, change):
        """Save the uploaded PDF and remember its path.

        Args:
            change: traitlets change dict; ``change['new']`` is the new
                ``FileUpload`` value.
        """
        uploads = list(self._iter_uploads(change['new']))
        if not uploads:
            # The value was cleared; there is nothing to save.
            return

        filename, content = uploads[0]
        self.uploaded_files['pdf'] = self._save_upload(filename, content)
        print(f"✅ PDF已上传: {filename}")

    def on_image_upload(self, change):
        """Save every uploaded image and remember the list of paths.

        Args:
            change: traitlets change dict; ``change['new']`` is the new
                ``FileUpload`` value.
        """
        image_paths = [
            self._save_upload(filename, content)
            for filename, content in self._iter_uploads(change['new'])
        ]

        self.uploaded_files['images'] = image_paths
        print(f"✅ 已上传 {len(image_paths)} 张图片")

    def on_process_click(self, b):
        """Index the uploaded PDF and attach the retriever to the RAG system.

        Args:
            b: the clicked button (unused).
        """
        with self.output_area:
            self.output_area.clear_output()

            if 'pdf' not in self.uploaded_files:
                print("⚠️ 请先上传PDF文件")
                return

            print("🔧 正在处理PDF文件...")
            pdf_path = self.uploaded_files['pdf']

            try:
                # Imported lazily so the module loads without this package.
                from langchain_community.document_loaders import PyPDFLoader
                loader = PyPDFLoader(pdf_path)
                docs = loader.load()

                # Split the loaded pages into chunks.
                doc_splits = self.doc_processor.split_documents(docs)

                # Build the vector store; only the retriever is needed here.
                _vectorstore, retriever = self.doc_processor.create_vectorstore(doc_splits)

                # Point both the system and its workflow nodes at the new index.
                self.rag_system.retriever = retriever
                self.rag_system.doc_processor = self.doc_processor
                self.rag_system.workflow_nodes.retriever = retriever
                self.rag_system.workflow_nodes.doc_processor = self.doc_processor

                print(f"✅ PDF处理完成,共 {len(doc_splits)} 个文档块")

            except Exception as e:
                # UI boundary: report the failure instead of killing the callback.
                print(f"❌ PDF处理失败: {e}")

    def on_query_click(self, b):
        """Run the entered question through the RAG system and print the result.

        Args:
            b: the clicked button (unused).
        """
        with self.output_area:
            self.output_area.clear_output()

            # Treat a whitespace-only question as empty.
            query = self.query_input.value.strip()
            if not query:
                print("⚠️ 请输入查询内容")
                return

            print(f"🔍 查询: {query}")

            try:
                image_paths = self.uploaded_files.get('images', [])

                if ENABLE_MULTIMODAL and image_paths:
                    print(f"🖼️ 使用 {len(image_paths)} 张图片进行多模态查询")
                else:
                    print("📄 使用文本查询")

                # NOTE(review): the uploaded image paths are not forwarded;
                # both modes issue the same text-only call. Confirm whether
                # the RAG system exposes an image-aware query entry point.
                result = self.rag_system.query(query)

                print("\n🎯 答案:")
                print(result['answer'])

                # Retrieval metrics are optional in the result.
                if result.get('retrieval_metrics'):
                    metrics = result['retrieval_metrics']
                    print("\n📊 检索评估:")
                    print(f" - 检索耗时: {metrics.get('latency', 0):.4f}秒")
                    print(f" - 检索文档数: {metrics.get('retrieved_docs_count', 0)}")
                    print(f" - Precision@3: {metrics.get('precision_at_3', 0):.4f}")
                    print(f" - Recall@3: {metrics.get('recall_at_3', 0):.4f}")
                    print(f" - MAP: {metrics.get('map_score', 0):.4f}")

            except Exception as e:
                # UI boundary: report the failure instead of killing the callback.
                print(f"❌ 查询失败: {e}")
224
+
225
+
226
def setup_kaggle_environment():
    """Install the notebook dependencies and enable widget support if possible.

    Runs ``pip install`` for the packages the test UI needs, using the
    current interpreter. A failing install is reported but does not abort
    the setup. When the ``google.colab`` package is importable, its custom
    widget manager is enabled; otherwise that step is skipped.
    """
    print("🔧 设置Kaggle环境...")

    # Install the required packages with the running interpreter's pip.
    completed = subprocess.run([sys.executable, '-m', 'pip', 'install', '-q',
                                'ipywidgets', 'PyPDF2', 'pdfplumber', 'Pillow'])
    if completed.returncode != 0:
        # Surface the failure instead of silently claiming success.
        print(f"⚠️ 依赖安装失败 (pip 退出码 {completed.returncode})")

    # Enabling the widget manager is best-effort and only applies on Colab.
    try:
        from google.colab import output
    except ImportError:
        pass
    else:
        try:
            output.enable_custom_widget_manager()
        except Exception as e:
            print(f"⚠️ 启用自定义小部件管理器失败: {e}")

    print("✅ 环境设置完成")
242
+
243
+
244
def main():
    """Prepare the environment, show the upload UI and print usage hints."""
    setup_kaggle_environment()

    # Build the uploader (this also initialises the RAG system) and render it.
    ui = KaggleMultimodalUploader()
    ui.create_upload_widgets()

    # Usage instructions, emitted one line per print call.
    usage_lines = (
        "\n🎉 多模态测试界面已准备就绪!",
        "💡 使用说明:",
        " 1. 上传PDF文件",
        " 2. (可选) 上传相关图片",
        " 3. 点击'处理文件'按钮",
        " 4. 输入问题并点击'查询'",
    )
    for line in usage_lines:
        print(line)
262
+
263
+ if __name__ == "__main__":
264
+ main()
kaggle_simple_multimodal.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Kaggle简化多模态测试脚本
3
+ 用于在Kaggle环境中直接处理已上传的PDF和图片文件
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ import subprocess
9
+ import time
10
+ from typing import List, Dict, Any
11
+
12
+ # 添加项目路径
13
+ sys.path.insert(0, '/kaggle/working/adaptive_RAG')
14
+
15
+ # 导入项目模块
16
+ from document_processor import DocumentProcessor
17
+ from main import AdaptiveRAGSystem
18
+ from config import ENABLE_MULTIMODAL, SUPPORTED_IMAGE_FORMATS
19
+
20
def setup_kaggle_environment():
    """Install the packages the simplified multimodal test depends on.

    Runs ``pip install`` with the current interpreter. A failing install is
    reported but does not abort the setup.
    """
    print("🔧 设置Kaggle环境...")

    # Install the required packages with the running interpreter's pip.
    completed = subprocess.run([sys.executable, '-m', 'pip', 'install', '-q',
                                'PyPDF2', 'pdfplumber', 'Pillow'])
    if completed.returncode != 0:
        # Surface the failure instead of silently claiming success.
        print(f"⚠️ 依赖安装失败 (pip 退出码 {completed.returncode})")

    print("✅ 环境设置完成")
29
+
30
def process_uploaded_files(pdf_path: str = None, image_paths: List[str] = None):
    """Build a RAG system backed by the given PDF or the default knowledge base.

    Args:
        pdf_path: path of a PDF to index. When it is falsy or does not
            exist, the document processor's default knowledge base is used.
        image_paths: list of image paths. Accepted for interface symmetry;
            not used by this function.

    Returns:
        A ``(rag_system, doc_processor)`` tuple. When loading or indexing
        the PDF fails, ``(None, None)`` is returned so callers that unpack
        the result keep working and can test ``rag_system`` for failure.
    """
    print("🔧 正在初始化文档处理器...")
    doc_processor = DocumentProcessor()

    if pdf_path and os.path.exists(pdf_path):
        print(f"📄 处理PDF文件: {pdf_path}")
        try:
            # Imported lazily so the module loads without this package.
            from langchain_community.document_loaders import PyPDFLoader
            loader = PyPDFLoader(pdf_path)
            docs = loader.load()

            # Split the loaded pages into chunks.
            doc_splits = doc_processor.split_documents(docs)

            # Build the vector store; only the retriever is needed here.
            _vectorstore, retriever = doc_processor.create_vectorstore(doc_splits)

            print(f"✅ PDF处理完成,共 {len(doc_splits)} 个文档块")
        except Exception as e:
            print(f"❌ PDF处理失败: {e}")
            # Same arity as the success path, so tuple unpacking is safe.
            return None, None
    else:
        # No usable PDF: fall back to the default knowledge base.
        print("📄 使用默认知识库...")
        _vectorstore, retriever, doc_splits = doc_processor.setup_knowledge_base()

    print("🤖 正在初始化自适应RAG系统...")
    rag_system = AdaptiveRAGSystem()

    # Point both the system and its workflow nodes at the new retriever.
    rag_system.retriever = retriever
    rag_system.doc_processor = doc_processor
    rag_system.workflow_nodes.retriever = retriever
    rag_system.workflow_nodes.doc_processor = doc_processor

    return rag_system, doc_processor
76
+
77
def query_with_multimodal(rag_system: AdaptiveRAGSystem, query: str, image_paths: List[str] = None):
    """Send *query* to the RAG system and print the answer and metrics.

    Args:
        rag_system: system whose ``query`` method is called with *query*.
        query: the question text.
        image_paths: list of image paths; accepted but not used here.

    Returns:
        The result returned by ``rag_system.query``, or ``None`` when any
        step raised an exception (the error is printed).
    """
    print(f"🔍 查询: {query}")

    try:
        outcome = rag_system.query(query)

        # Answer section.
        print("\n🎯 答案:")
        print(outcome['answer'])

        # Metrics section, shown only when metrics are present and non-empty.
        if outcome.get('retrieval_metrics'):
            stats = outcome['retrieval_metrics']
            read = stats.get
            print("\n📊 检索评估:")
            print(f" - 检索耗时: {read('latency', 0):.4f}秒")
            print(f" - 检索文档数: {read('retrieved_docs_count', 0)}")
            print(f" - Precision@3: {read('precision_at_3', 0):.4f}")
            print(f" - Recall@3: {read('recall_at_3', 0):.4f}")
            print(f" - MAP: {read('map_score', 0):.4f}")
    except Exception as err:
        print(f"❌ 查询失败: {err}")
        return None
    else:
        return outcome
110
+
111
def main():
    """Discover uploaded files, build the RAG system and run sample queries.

    Scans ``/kaggle/working`` for one PDF and any images, initialises the
    system through ``process_uploaded_files`` and runs a text query, plus a
    second query when images are present and ``ENABLE_MULTIMODAL`` is set.
    """
    print("🚀 Kaggle简化多模态测试")
    print("="*50)

    setup_kaggle_environment()

    working_dir = '/kaggle/working'
    image_exts = ('.jpg', '.jpeg', '.png', '.gif', '.bmp')

    # Sorted so the PDF picked below does not depend on listdir order.
    try:
        entries = sorted(os.listdir(working_dir))
    except OSError as e:
        # Missing/unreadable directory: continue with no uploaded files so
        # the default knowledge base is used.
        print(f"⚠️ 无法读取目录 {working_dir}: {e}")
        entries = []

    # Extensions are matched case-insensitively for both file kinds.
    pdf_files = [f for f in entries if f.lower().endswith('.pdf')]
    image_files = [f for f in entries if f.lower().endswith(image_exts)]

    print("\n📁 发现文件:")
    print(f" - PDF文件: {len(pdf_files)} 个")
    for pdf in pdf_files:
        print(f" * {pdf}")

    print(f" - 图片文件: {len(image_files)} 个")
    for img in image_files:
        print(f" * {img}")

    # Only the first PDF is processed; images are passed as full paths.
    pdf_path = os.path.join(working_dir, pdf_files[0]) if pdf_files else None
    image_paths = [os.path.join(working_dir, img) for img in image_files] if image_files else None

    # The processing step may signal failure with None or with a tuple
    # whose first element is None; handle both without unpacking errors.
    outcome = process_uploaded_files(pdf_path, image_paths)
    rag_system = outcome[0] if outcome else None

    if not rag_system:
        print("❌ 系统初始化失败")
        return

    print("\n" + "="*50)
    print("🧪 示例查询测试")
    print("="*50)

    # Sample text query.
    query1 = "请总结文档的主要内容"
    query_with_multimodal(rag_system, query1, image_paths)

    # Second sample query, only when images exist and multimodal is enabled.
    if image_paths and ENABLE_MULTIMODAL:
        print("\n" + "="*50)
        print("🖼️ 多模态查询测试")
        print("="*50)

        query2 = "请结合图片内容,解释文档中的相关概念"
        query_with_multimodal(rag_system, query2, image_paths)

    print("\n" + "="*50)
    print("✅ 测试完成")
    print("="*50)
    print("\n💡 您可以继续使用以下代码进行自定义查询:")
    print("```python")
    print("# 自定义查询")
    print("custom_query = '您的问题'")
    print("query_with_multimodal(rag_system, custom_query, image_paths)")
    print("```")
170
+
171
+ if __name__ == "__main__":
172
+ main()