Spaces:

ahaahaaha
/

adaptive_rag

Paused

App Files Files Community

lanny xu commited on Oct 22

Commit

52f92a4

1 Parent(s): 2cb7544

resolve conflict

Browse files

Files changed (4) hide show

COLAB_CONTINUE_FROM_TIMEOUT.py +229 -0
COLAB_QUICK_CONTINUE.py +121 -0
TIMEOUT_QUICK_FIX_CN.md +324 -0
fix_timeout_issue.py +144 -0

COLAB_CONTINUE_FROM_TIMEOUT.py ADDED Viewed

	@@ -0,0 +1,229 @@

+"""
+在 Colab 中从超时处继续处理的完整脚本
+直接复制到 Colab 代码单元格运行
+"""
+print("🚀 GraphRAG 超时恢复脚本")
+print("="*60)
+# ==================== 步骤 0: 检查前置条件 ====================
+print("\n📋 步骤 0: 检查前置条件...")
+import sys
+import os
+# 挂载 Google Drive（如果还没有挂载）
+try:
+    from google.colab import drive
+    if not os.path.exists('/content/drive'):
+        print("   挂载 Google Drive...")
+        drive.mount('/content/drive')
+    else:
+        print("   ✅ Google Drive 已挂载")
+except:
+    print("   ⚠️ 不在 Colab 环境中")
+# 设置路径
+project_path = '/content/drive/MyDrive/adaptive_RAG'
+sys.path.insert(0, project_path)
+print(f"   项目路径: {project_path}")
+# ==================== 步骤 1: 重启 Ollama ====================
+print("\n🔄 步骤 1: 重启 Ollama 服务...")
+import subprocess
+import time
+# 杀掉旧进程
+!pkill -9 ollama 2>/dev/null
+time.sleep(2)
+# 启动新进程
+print("   启动 Ollama 服务...")
+ollama_process = subprocess.Popen(
+    ["ollama", "serve"],
+    stdout=subprocess.PIPE,
+    stderr=subprocess.PIPE,
+    preexec_fn=os.setpgrp
+)
+time.sleep(5)
+# 验证服务
+import requests
+try:
+    response = requests.get('http://localhost:11434/api/tags', timeout=5)
+    if response.status_code == 200:
+        print("   ✅ Ollama 服务运行正常")
+    else:
+        print(f"   ⚠️ Ollama 响应异常: {response.status_code}")
+except Exception as e:
+    print(f"   ❌ Ollama 服务未响应: {e}")
+    print("   请检查 Ollama 是否正确安装")
+# ==================== 步骤 2: 加载配置和文档 ====================
+print("\n📚 步骤 2: 加载配置和文档...")
+# 导入配置
+from config import setup_environment
+try:
+    setup_environment()
+    print("   ✅ 环境配置加载成功")
+except Exception as e:
+    print(f"   ⚠️ 环境配置警告: {e}")
+# 检查是否已经有 doc_splits 变量
+if 'doc_splits' in dir():
+    print(f"   ✅ 检测到已有 doc_splits: {len(doc_splits)} 个文档")
+    use_existing_docs = True
+else:
+    print("   ⚠️ 未检测到 doc_splits，需要重新加载文档")
+    use_existing_docs = False
+# 如果没有 doc_splits，重新加载
+if not use_existing_docs:
+    print("\n   正在加载文档...")
+    from document_processor import DocumentProcessor
+    doc_processor = DocumentProcessor()
+    # 使用默认 URL 或自定义 URL
+    urls = [
+        "https://lilianweng.github.io/posts/2023-06-23-agent/",
+        "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
+        "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/"
+    ]
+    vectorstore, retriever, doc_splits = doc_processor.setup_knowledge_base(
+        urls=urls,
+        enable_graphrag=True
+    )
+    print(f"   ✅ 文档加载完成: {len(doc_splits)} 个文档片段")
+# ==================== 步骤 3: 修复超时配置 ====================
+print("\n⚙️ 步骤 3: 修复超时配置...")
+# 方案：直接修改 entity_extractor.py 文件内容
+entity_extractor_path = os.path.join(project_path, 'entity_extractor.py')
+# 读取原文件
+with open(entity_extractor_path, 'r', encoding='utf-8') as f:
+    content = f.read()
+# 检查是否已经修改过
+if 'timeout: int = 180' in content:
+    print("   ✅ entity_extractor.py 已经包含超时修复")
+else:
+    print("   📝 修改 entity_extractor.py...")
+    # 替换初始化方法的签名
+    content = content.replace(
+        'def __init__(self, timeout: int = 60, max_retries: int = 3):',
+        'def __init__(self, timeout: int = 180, max_retries: int = 5):'
+    )
+    # 保存修改
+    with open(entity_extractor_path, 'w', encoding='utf-8') as f:
+        f.write(content)
+    print("   ✅ 已将默认超时时间改为 180 秒，重试次数改为 5 次")
+# 重新加载模块
+import importlib
+if 'entity_extractor' in sys.modules:
+    importlib.reload(sys.modules['entity_extractor'])
+    print("   🔄 entity_extractor 模块已重新加载")
+if 'graph_indexer' in sys.modules:
+    importlib.reload(sys.modules['graph_indexer'])
+    print("   🔄 graph_indexer 模块已重新加载")
+# ==================== 步骤 4: 确定继续处理的起点 ====================
+print("\n📊 步骤 4: 确定处理起点...")
+# 让用户选择从哪里开始
+print("\n请选择继续处理的方式:")
+print("  1. 从文档 #56 重新开始（包含 #56）")
+print("  2. 跳过文档 #56，从 #57 开始")
+print("  3. 从头开始处理所有文档")
+print("  4. 自定义起始位置")
+# 默认选项（可以修改）
+choice = 1  # 👈 修改这里来选择不同的选项
+if choice == 1:
+    start_index = 55  # 文档 #56 的索引
+    print(f"\n   ✅ 选择: 从文档 #56 开始（索引 {start_index}）")
+elif choice == 2:
+    start_index = 56  # 跳过 #56
+    print(f"\n   ✅ 选择: 跳过文档 #56，从 #57 开始（索引 {start_index}）")
+elif choice == 3:
+    start_index = 0
+    print(f"\n   ✅ 选择: 从头开始处理所有文档")
+else:
+    # 自定义
+    start_index = 55  # 👈 修改这里来自定义起始位置
+    print(f"\n   ✅ 选择: 自定义起始位置（索引 {start_index}）")
+remaining_docs = doc_splits[start_index:]
+print(f"   待处理文档数: {len(remaining_docs)} 个")
+# ==================== 步骤 5: 开始处理 ====================
+print("\n🚀 步骤 5: 开始处理文档...")
+print("="*60)
+from graph_indexer import GraphRAGIndexer
+# 创建索引器
+indexer = GraphRAGIndexer()
+# 开始索引
+try:
+    graph = indexer.index_documents(
+        documents=remaining_docs,
+        batch_size=3,  # 👈 可以调整批次大小（1-5 推荐）
+        save_path=os.path.join(project_path, "knowledge_graph_recovered.pkl")
+    )
+    print("\n" + "="*60)
+    print("✅ 处理完成！")
+    print("="*60)
+    # 显示统计信息
+    stats = graph.get_statistics()
+    print(f"\n📊 知识图谱统计:")
+    print(f"   • 节点数: {stats['num_nodes']}")
+    print(f"   • 边数: {stats['num_edges']}")
+    print(f"   • 社区数: {stats['num_communities']}")
+    print(f"   • 图密度: {stats['density']:.4f}")
+except KeyboardInterrupt:
+    print("\n⚠️ 处理被用户中断")
+    print("   可以记录当前进度，稍后继续")
+except Exception as e:
+    print(f"\n❌ 处理过程中出现错误:")
+    print(f"   {type(e).__name__}: {e}")
+    print("\n建议:")
+    print("   1. 检查上面的错误信息")
+    print("   2. 如果是某个文档超时，尝试跳过它")
+    print("   3. 如果是 Ollama 问题，重启服务")
+    import traceback
+    print("\n完整错误堆栈:")
+    traceback.print_exc()
+# ==================== 完成 ====================
+print("\n" + "="*60)
+print("脚本执行完成")
+print("="*60)
+print("\n💡 提示:")
+print("   • 如果遇到超时，检查上面的错误信息")
+print("   • 可以修改 choice 变量来跳过问题文档")
+print("   • 可以修改 batch_size 来调整处理速度")
+print("   • 图谱已保存到: knowledge_graph_recovered.pkl")

COLAB_QUICK_CONTINUE.py ADDED Viewed

	@@ -0,0 +1,121 @@

+"""
+Colab 快速继续脚本 - 从超时处恢复
+复制到 Colab 运行，会自动检测并继续处理
+"""
+print("🚀 GraphRAG 恢复脚本 v2.0")
+print("="*60)
+import sys
+import os
+# ==================== 1. Environment setup ====================
+print("\n1️⃣ 设置环境...")
+# Project directory; added to the front of sys.path only if it is not already listed
+project_path = '/content/drive/MyDrive/adaptive_RAG'
+if project_path not in sys.path:
+    sys.path.insert(0, project_path)
+print(f"   ✅ 项目路径: {project_path}")
+# ==================== 2. 重启 Ollama ====================
+print("\n2️⃣ 重启 Ollama...")
+import subprocess
+import time
+subprocess.run(['pkill', '-9', 'ollama'], stderr=subprocess.DEVNULL)
+time.sleep(2)
+ollama_process = subprocess.Popen(
+    ["ollama", "serve"],
+    stdout=subprocess.PIPE,
+    stderr=subprocess.PIPE
+)
+time.sleep(5)
+import requests
+try:
+    r = requests.get('http://localhost:11434/api/tags', timeout=5)
+    print(f"   ✅ Ollama 运行正常" if r.status_code == 200 else f"   ⚠️ 状态码: {r.status_code}")
+except:
+    print("   ❌ Ollama 未响应")
+# ==================== 3. Load documents ====================
+print("\n3️⃣ 加载文档...")
+from config import setup_environment
+from document_processor import DocumentProcessor
+setup_environment()
+# Create the document processor
+doc_processor = DocumentProcessor()
+# Load the documents; no urls argument is passed, so the processor's own defaults apply
+vectorstore, retriever, doc_splits = doc_processor.setup_knowledge_base(
+    enable_graphrag=True
+)
+print(f"   ✅ 已加载 {len(doc_splits)} 个文档")
+# ==================== 4. 修改超时配置 ====================
+print("\n4️⃣ 增加超时时间...")
+entity_file = os.path.join(project_path, 'entity_extractor.py')
+with open(entity_file, 'r', encoding='utf-8') as f:
+    content = f.read()
+# 修改默认参数
+if 'timeout: int = 60' in content:
+    content = content.replace(
+        'timeout: int = 60, max_retries: int = 3',
+        'timeout: int = 180, max_retries: int = 5'
+    )
+    with open(entity_file, 'w', encoding='utf-8') as f:
+        f.write(content)
+    print("   ✅ 超时已改为 180 秒，重试改为 5 次")
+else:
+    print("   ℹ️ 已经是修改后的配置")
+# 重新加载模块
+import importlib
+for mod in ['entity_extractor', 'graph_indexer']:
+    if mod in sys.modules:
+        importlib.reload(sys.modules[mod])
+# ==================== 5. 继续处理 ====================
+print("\n5️⃣ 继续处理文档...")
+print("="*60)
+from graph_indexer import GraphRAGIndexer
+# 配置起始位置
+START_INDEX = 55  # 👈 从文档 #56 开始，修改这里可以跳过某些文档
+BATCH_SIZE = 3    # 👈 批次大小，可以改为 1-5
+print(f"\n   起始位置: 文档 #{START_INDEX + 1}")
+print(f"   批次大小: {BATCH_SIZE}")
+print(f"   待处理: {len(doc_splits) - START_INDEX} 个文档\n")
+remaining_docs = doc_splits[START_INDEX:]
+indexer = GraphRAGIndexer()
+try:
+    graph = indexer.index_documents(
+        documents=remaining_docs,
+        batch_size=BATCH_SIZE,
+        save_path=f"{project_path}/knowledge_graph_recovered.pkl"
+    )
+    print("\n✅ 处理完成！")
+    stats = graph.get_statistics()
+    print(f"📊 节点: {stats['num_nodes']}, 边: {stats['num_edges']}, 社区: {stats['num_communities']}")
+except Exception as e:
+    print(f"\n❌ 错误: {e}")
+    print("\n建议:")
+    print("   • 如果文档 #56 超时，修改 START_INDEX = 56 跳过它")
+    print("   • 如果 Ollama 崩溃，重新运行此脚本")
+    print("   • 减小 BATCH_SIZE 到 1 或 2")

TIMEOUT_QUICK_FIX_CN.md ADDED Viewed

	@@ -0,0 +1,324 @@

+# 超时问题快速修复指南
+## 🚨 当前问题
+您遇到了这个错误：
+```
+🔄 提取实体 (尝试 1/3)... ❌ 错误: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)
+```
+**原因**: 文档 #56 处理时间超过60秒，Ollama 没有在规定时间内返回结果。
+## ⚡ 立即修复（3步搞定）
+### 步骤 1: 重启 Ollama 服务
+在 Colab 中运行：
+```bash
+!pkill -9 ollama
+!sleep 2
+!nohup ollama serve > /tmp/ollama.log 2>&1 &
+!sleep 5
+!curl http://localhost:11434/api/tags
+```
+### 步骤 2: 增加超时时间
+在您的 Colab 笔记本中，修改初始化代码：
+```python
+# 找到 entity_extractor.py 的导入位置，修改为：
+from entity_extractor import EntityExtractor
+# 创建带更长超时的提取器
+# 直接在 Python 中猴子补丁修复
+import entity_extractor
+# 保存原始初始化方法
+_original_init = entity_extractor.EntityExtractor.__init__
+# 创建新的初始化方法，默认使用更长的超时
+def _new_init(self, timeout=180, max_retries=5):
+    _original_init(self, timeout=timeout, max_retries=max_retries)
+# 替换初始化方法
+entity_extractor.EntityExtractor.__init__ = _new_init
+print("✅ 已将超时时间增加到 180 秒（3分钟）")
+```
+### 步骤 3: 继续处理（跳过已完成的）
+```python
+# 从文档 #56 继续（索引 55）
+processed_count = 55
+remaining_docs = doc_splits[processed_count:]
+graph = indexer.index_documents(
+    documents=remaining_docs,
+    batch_size=3,  # 减小批次大小
+    save_path="/content/drive/MyDrive/knowledge_graph.pkl"
+)
+```
+## 🎯 完整的 Colab 代码块
+直接复制粘贴到 Colab 新的代码单元格：
+```python
+print("🔧 开始修复超时问题...")
+print("="*60)
+# ========== 第1步: 重启 Ollama ==========
+print("\n1️⃣ 重启 Ollama 服务...")
+!pkill -9 ollama
+!sleep 2
+!nohup ollama serve > /tmp/ollama.log 2>&1 &
+!sleep 5
+# 验证 Ollama 已启动
+import requests
+try:
+    response = requests.get('http://localhost:11434/api/tags', timeout=5)
+    if response.status_code == 200:
+        print("✅ Ollama 服务运行正常")
+    else:
+        print("⚠️ Ollama 可能未正常启动")
+except:
+    print("❌ Ollama 服务未响应，请检查日志")
+# ========== 第2步: 增加超时时间 ==========
+print("\n2️⃣ 修改超时配置...")
+import sys
+sys.path.insert(0, '/content/drive/MyDrive/adaptive_RAG')
+import entity_extractor
+# 保存原始初始化
+_original_init = entity_extractor.EntityExtractor.__init__
+# 新的初始化方法：默认3分钟超时，5次重试
+def _new_init(self, timeout=180, max_retries=5):
+    from langchain_community.chat_models import ChatOllama
+    from langchain_core.output_parsers import JsonOutputParser
+    from config import LOCAL_LLM
+    try:
+        from langchain_core.prompts import PromptTemplate
+    except ImportError:
+        from langchain.prompts import PromptTemplate
+    import time
+    self.llm = ChatOllama(
+        model=LOCAL_LLM,
+        format="json",
+        temperature=0,
+        timeout=timeout
+    )
+    self.max_retries = max_retries
+    # 实体提取提示模板
+    self.entity_prompt = PromptTemplate(
+        template="""你是一个专业的实体识别专家。从以下文本中提取所有重要的实体。
+实体类型包括:
+- PERSON: 人物、作者、研究者
+- ORGANIZATION: 组织、机构、公司
+- CONCEPT: 技术概念、算法、方法论
+- TECHNOLOGY: 具体技术、工具、框架
+- PAPER: 论文、出版物
+- EVENT: 事件、会议
+文本内容:
+{text}
+请以JSON格式返回，包含以下字段:
+{{
+    "entities": [
+        {{
+            "name": "实体名称",
+            "type": "实体类型",
+            "description": "简短描述"
+        }}
+    ]
+}}
+不要包含前言或解释，只返回JSON。
+""",
+        input_variables=["text"]
+    )
+    # 关系提取提示模板
+    self.relation_prompt = PromptTemplate(
+        template="""你是一个关系抽取专家。从文本中识别实体之间的关系。
+已识别的实体:
+{entities}
+文本内容:
+{text}
+请识别实体之间的关系，以JSON格式返回:
+{{
+    "relations": [
+        {{
+            "source": "源实体名称",
+            "target": "目标实体名称",
+            "relation_type": "关系类型",
+            "description": "关系描述"
+        }}
+    ]
+}}
+关系类型包括: AUTHOR_OF, USES, BASED_ON, RELATED_TO, PART_OF, APPLIES_TO, IMPROVES, CITES
+不要包含前言或解释，只返回JSON。
+""",
+        input_variables=["text", "entities"]
+    )
+    self.entity_chain = self.entity_prompt | self.llm | JsonOutputParser()
+    self.relation_chain = self.relation_prompt | self.llm | JsonOutputParser()
+# 应用补丁
+entity_extractor.EntityExtractor.__init__ = _new_init
+print("✅ 超时时间已增加到 180 秒（3分钟）")
+print("✅ 重试次数已增加到 5 次")
+# ========== 第3步: 继续处理 ==========
+print("\n3️⃣ 准备继续处理...")
+# 重新导入模块以应用更改
+import importlib
+if 'graph_indexer' in sys.modules:
+    importlib.reload(sys.modules['graph_indexer'])
+from graph_indexer import GraphRAGIndexer
+# 创建新的索引器
+indexer = GraphRAGIndexer()
+print("\n📋 当前状态:")
+print(f"  • 总文档数: {len(doc_splits)}")
+print(f"  • 已处理: 55 个文档（索引 0-54）")
+print(f"  • 待处理: {len(doc_splits) - 55} 个文档（索引 55-{len(doc_splits)-1}）")
+# 从文档 #56 继续
+processed_count = 55
+remaining_docs = doc_splits[processed_count:]
+print("\n🚀 开始处理剩余文档...")
+print("="*60)
+graph = indexer.index_documents(
+    documents=remaining_docs,
+    batch_size=3,  # 减小批次大小以降低负载
+    save_path="/content/drive/MyDrive/knowledge_graph_partial.pkl"
+)
+print("\n✅ 处理完成！")
+```
+## 📊 如果文档 #56 仍然超时
+如果增加超时后，文档 #56 仍然失败，可能是该文档内容特别复杂。可以选择跳过它：
+```python
+# 方案A: 跳过文档 #56
+print("跳过文档 #56，从 #57 继续...")
+processed_count = 56  # 跳过 #56
+remaining_docs = doc_splits[processed_count:]
+graph = indexer.index_documents(
+    documents=remaining_docs,
+    batch_size=3,
+    save_path="/content/drive/MyDrive/knowledge_graph_partial.pkl"
+)
+```
+或者单独检查该文档：
+```python
+# 方案B: 检查文档 #56 的内容
+problem_doc = doc_splits[55]  # 文档 #56（索引55）
+print(f"文档 #56 信息:")
+print(f"  长度: {len(problem_doc.page_content)} 字符")
+print(f"  前500字符:")
+print(f"  {problem_doc.page_content[:500]}")
+print(f"\n  后500字符:")
+print(f"  {problem_doc.page_content[-500:]}")
+# 如果文档太长，可以考虑分割它
+if len(problem_doc.page_content) > 3000:
+    print("\n⚠️ 文档较长，可能需要更多处理时间或分割处理")
+```
+## 🔍 监控进度
+修复后，您将看到更详细的输出：
+```
+⚙️  === 批次 19/20 (文档 56-58) ===
+🔍 文档 #56: 开始提取...
+   🔄 提取实体 (尝试 1/5)... ✅ 提取到 8 个实体
+   🔄 提取关系 (尝试 1/5)... ✅ 提取到 5 个关系
+📊 文档 #56 完成: 8 实体, 5 关系
+```
+## 📌 参数说明
+| 参数 | 原值 | 新值 | 说明 |
+|-----|------|------|------|
+| `timeout` | 60秒 | 180秒 | 单次请求最大等待时间 |
+| `max_retries` | 3次 | 5次 | 失败后重试次数 |
+| `batch_size` | 10 | 3 | 每批次处理的文档数 |
+## ⏱️ 预计时间
+- **每个文档**: 10-180秒（取决于复杂度）
+- **批次间隔**: 重试时有2-10秒等待
+- **总时间**: 对于100个文档，预计20-60分钟
+## 🆘 如果问题持续
+### 检查 Ollama 日志
+```bash
+!tail -n 50 /tmp/ollama.log
+```
+### 检查系统资源
+```python
+# 检查 GPU 内存
+!nvidia-smi
+# 检查 RAM
+import psutil
+print(f"内存使用: {psutil.virtual_memory().percent}%")
+```
+### 使用更小的模型
+如果 Mistral 太慢，可以在 `config.py` 中切换到更快的模型：
+```python
+LOCAL_LLM = "phi:latest"  # 更快但质量稍低
+# 或
+LOCAL_LLM = "llama2:7b"   # 平衡选择
+```
+## 📝 总结
+**最可能的解决方案**:
+1. ✅ 重启 Ollama（清理内存）
+2. ✅ 增加超时到 180 秒
+3. ✅ 减小批次大小到 3
+4. ✅ 从断点继续处理
+**紧急情况**:
+- 如果某个文档持续失败 → 跳过它
+- 如果 Ollama 崩溃 → 重启服务
+- 如果内存不足 → 使用更小的模型
+现在请运行上面的"完整 Colab 代码块"，应该就能解决问题了！ 🚀

fix_timeout_issue.py ADDED Viewed

	@@ -0,0 +1,144 @@

+"""
+临时修复超时问题的脚本
+在 Colab 中运行此脚本来增加超时时间
+"""
+import sys
+import os
+# Put the project directory first on sys.path so its modules can be imported
+sys.path.insert(0, '/content/drive/MyDrive/adaptive_RAG')
+print("🔧 修复超时问题...")
+print("="*60)
+# Option 1: raise the timeout settings used by entity_extractor
+print("\n📝 方案 1: 增加超时时间和重试次数")
+print("-"*60)
+# Project classes used by the subclass defined below
+from entity_extractor import EntityExtractor, EntityDeduplicator
+from graph_indexer import GraphRAGIndexer
+# Custom GraphRAG indexer that uses a longer timeout
+class GraphRAGIndexerWithLongerTimeout(GraphRAGIndexer):
+    """GraphRAG indexer whose entity extractor is created with a longer timeout."""
+    def __init__(self, timeout=180, max_retries=5):
+        """
+        Initialize the indexer with a longer extraction timeout.
+        NOTE(review): GraphRAGIndexer.__init__ is not called; the attributes are
+        assigned here directly - confirm they match what the parent sets up.
+        Args:
+            timeout: timeout in seconds, default 180 (3 minutes)
+            max_retries: maximum number of retries, default 5
+        """
+        print(f"🚀 初始化GraphRAG索引器（超时: {timeout}秒, 重试: {max_retries}次）...")
+        # Create the entity extractor with the longer timeout and retry count
+        self.entity_extractor = EntityExtractor(
+            timeout=timeout,
+            max_retries=max_retries
+        )
+        self.entity_deduplicator = EntityDeduplicator()
+        # Import the remaining project classes only when an instance is created
+        from knowledge_graph import KnowledgeGraph, CommunitySummarizer
+        self.knowledge_graph = KnowledgeGraph()
+        self.community_summarizer = CommunitySummarizer()
+        # Starts as False; nothing in this class changes it
+        self.indexed = False
+        print("✅ GraphRAG索引器初始化完成")
+# 方案 2: 提供快速重启脚本
+print("\n📝 方案 2: 重启 Ollama 服务")
+print("-"*60)
+print("运行以下命令:")
+print("  !pkill -9 ollama")
+print("  !sleep 2")
+print("  !nohup ollama serve > /tmp/ollama.log 2>&1 &")
+print("  !sleep 5")
+print("  !curl http://localhost:11434/api/tags")
+# 方案 3: 跳过当前文档
+print("\n📝 方案 3: 跳过问题文档并继续")
+print("-"*60)
+print("如果某个文档持续失败，可以跳过它:")
+print("""
+# 示例：从文档 #57 开始继续处理
+problem_doc_index = 55  # 文档 #56 的索引
+doc_splits_filtered = doc_splits[:problem_doc_index] + doc_splits[problem_doc_index+1:]
+# 使用过滤后的文档列表
+graph = indexer.index_documents(
+    documents=doc_splits_filtered,
+    batch_size=3
+)
+""")
+# Usage example: print the recommended way to use the indexer defined in this file
+print("\n" + "="*60)
+print("✅ 修复方案准备完成")
+print("="*60)
+print("\n💡 推荐的使用方法:")
+print("-"*60)
+usage_example = """
+# 1. 导入修复后的索引器
+from fix_timeout_issue import GraphRAGIndexerWithLongerTimeout
+# 2. 使用更长的超时时间（3分钟）创建索引器
+indexer = GraphRAGIndexerWithLongerTimeout(
+    timeout=180,      # 3分钟超时
+    max_retries=5     # 5次重试
+)
+# 3. 减小批次大小，继续处理
+# 如果已经处理了部分文档，可以跳过它们
+processed_count = 55  # 已处理到文档 #55
+remaining_docs = doc_splits[processed_count:]
+graph = indexer.index_documents(
+    documents=remaining_docs,
+    batch_size=3,  # 更小的批次
+    save_path="/content/drive/MyDrive/knowledge_graph.pkl"
+)
+# 4. 如果还是超时，考虑跳过问题文档
+# problem_indices = [55]  # 文档 #56 的索引
+# remaining_docs_filtered = [doc for i, doc in enumerate(doc_splits[processed_count:])
+#                            if (processed_count + i) not in problem_indices]
+"""
+print(usage_example)
+print("\n" + "="*60)
+print("🎯 立即执行的步骤:")
+print("="*60)
+print("""
+1️⃣ 首先重启 Ollama 服务:
+   !pkill -9 ollama && sleep 2 && nohup ollama serve > /tmp/ollama.log 2>&1 & && sleep 5
+2️⃣ 然后使用更长的超时时间继续:
+   from fix_timeout_issue import GraphRAGIndexerWithLongerTimeout
+   indexer = GraphRAGIndexerWithLongerTimeout(timeout=180, max_retries=5)
+3️⃣ 从文档 #56 继续处理（减小批次大小）:
+   remaining_docs = doc_splits[55:]  # 从文档 #56 开始
+   graph = indexer.index_documents(remaining_docs, batch_size=3)
+4️⃣ 如果文档 #56 仍然超时，跳过它:
+   remaining_docs = doc_splits[56:]  # 跳过文档 #56，从 #57 开始
+   graph = indexer.index_documents(remaining_docs, batch_size=3)
+""")
+print("\n⚠️ 注意:")
+print("  • 超时通常说明文档内容复杂或 Ollama 负载过重")
+print("  • 重启 Ollama 通常能解决负载问题")
+print("  • 增加超时时间（180秒）能处理复杂文档")
+print("  • 减小批次大小（3个文档/批次）能减轻负载")
+print("  • 如果某个文档持续失败，可以考虑跳过它")