lanny xu committed
Commit · 52f92a4
1 Parent(s): 2cb7544

resolve conflict

Files changed:
- COLAB_CONTINUE_FROM_TIMEOUT.py +229 -0
- COLAB_QUICK_CONTINUE.py +121 -0
- TIMEOUT_QUICK_FIX_CN.md +324 -0
- fix_timeout_issue.py +144 -0
COLAB_CONTINUE_FROM_TIMEOUT.py ADDED
@@ -0,0 +1,229 @@
"""
Complete script for resuming processing after a timeout in Colab.
Copy it directly into a Colab code cell and run.
"""

print("🚀 GraphRAG timeout recovery script")
print("="*60)

# ==================== Step 0: check prerequisites ====================
print("\n📋 Step 0: checking prerequisites...")

import sys
import os

# Mount Google Drive (if it is not mounted yet)
try:
    from google.colab import drive
    if not os.path.exists('/content/drive'):
        print("   Mounting Google Drive...")
        drive.mount('/content/drive')
    else:
        print("   ✅ Google Drive already mounted")
except ImportError:
    print("   ⚠️ Not running in a Colab environment")

# Set the project path
project_path = '/content/drive/MyDrive/adaptive_RAG'
sys.path.insert(0, project_path)

print(f"   Project path: {project_path}")

# ==================== Step 1: restart Ollama ====================
print("\n🔄 Step 1: restarting the Ollama service...")

import subprocess
import time

# Kill any old process (IPython shell escape; run this file in a Colab cell)
!pkill -9 ollama 2>/dev/null

time.sleep(2)

# Start a new process
print("   Starting the Ollama service...")
ollama_process = subprocess.Popen(
    ["ollama", "serve"],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    preexec_fn=os.setpgrp
)

time.sleep(5)

# Verify the service
import requests
try:
    response = requests.get('http://localhost:11434/api/tags', timeout=5)
    if response.status_code == 200:
        print("   ✅ Ollama service is running")
    else:
        print(f"   ⚠️ Unexpected Ollama response: {response.status_code}")
except Exception as e:
    print(f"   ❌ Ollama service is not responding: {e}")
    print("   Please check that Ollama is installed correctly")

# ==================== Step 2: load config and documents ====================
print("\n📚 Step 2: loading config and documents...")

# Import the configuration
from config import setup_environment

try:
    setup_environment()
    print("   ✅ Environment configuration loaded")
except Exception as e:
    print(f"   ⚠️ Environment configuration warning: {e}")

# Check whether a doc_splits variable already exists in this session
if 'doc_splits' in dir():
    print(f"   ✅ Found existing doc_splits: {len(doc_splits)} documents")
    use_existing_docs = True
else:
    print("   ⚠️ No doc_splits found; documents need to be reloaded")
    use_existing_docs = False

# If there is no doc_splits, reload the documents
if not use_existing_docs:
    print("\n   Loading documents...")
    from document_processor import DocumentProcessor

    doc_processor = DocumentProcessor()

    # Use the default URLs or your own
    urls = [
        "https://lilianweng.github.io/posts/2023-06-23-agent/",
        "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
        "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/"
    ]

    vectorstore, retriever, doc_splits = doc_processor.setup_knowledge_base(
        urls=urls,
        enable_graphrag=True
    )

    print(f"   ✅ Documents loaded: {len(doc_splits)} document chunks")

# ==================== Step 3: fix the timeout configuration ====================
print("\n⚙️ Step 3: fixing the timeout configuration...")

# Approach: patch entity_extractor.py on disk
entity_extractor_path = os.path.join(project_path, 'entity_extractor.py')

# Read the original file
with open(entity_extractor_path, 'r', encoding='utf-8') as f:
    content = f.read()

# Check whether it has already been patched
if 'timeout: int = 180' in content:
    print("   ✅ entity_extractor.py already contains the timeout fix")
else:
    print("   📝 Patching entity_extractor.py...")

    # Replace the __init__ signature
    content = content.replace(
        'def __init__(self, timeout: int = 60, max_retries: int = 3):',
        'def __init__(self, timeout: int = 180, max_retries: int = 5):'
    )

    # Save the changes
    with open(entity_extractor_path, 'w', encoding='utf-8') as f:
        f.write(content)

    print("   ✅ Default timeout changed to 180 s, retries to 5")

# Reload the modules so the patch takes effect
import importlib

if 'entity_extractor' in sys.modules:
    importlib.reload(sys.modules['entity_extractor'])
    print("   🔄 entity_extractor module reloaded")

if 'graph_indexer' in sys.modules:
    importlib.reload(sys.modules['graph_indexer'])
    print("   🔄 graph_indexer module reloaded")

# ==================== Step 4: pick the restart point ====================
print("\n📊 Step 4: picking the restart point...")

# Let the user choose where to resume
print("\nChoose how to continue:")
print("  1. Restart from document #56 (inclusive)")
print("  2. Skip document #56 and start from #57")
print("  3. Reprocess all documents from the beginning")
print("  4. Custom start position")

# Default choice (edit as needed)
choice = 1  # 👈 change this to pick a different option

if choice == 1:
    start_index = 55  # index of document #56
    print(f"\n   ✅ Selected: start from document #56 (index {start_index})")
elif choice == 2:
    start_index = 56  # skip #56
    print(f"\n   ✅ Selected: skip document #56, start from #57 (index {start_index})")
elif choice == 3:
    start_index = 0
    print("\n   ✅ Selected: reprocess all documents from the beginning")
else:
    # Custom
    start_index = 55  # 👈 change this to set a custom start position
    print(f"\n   ✅ Selected: custom start position (index {start_index})")

remaining_docs = doc_splits[start_index:]
print(f"   Documents left to process: {len(remaining_docs)}")

# ==================== Step 5: start processing ====================
print("\n🚀 Step 5: processing documents...")
print("="*60)

from graph_indexer import GraphRAGIndexer

# Create the indexer
indexer = GraphRAGIndexer()

# Start indexing
try:
    graph = indexer.index_documents(
        documents=remaining_docs,
        batch_size=3,  # 👈 adjust the batch size (1-5 recommended)
        save_path=os.path.join(project_path, "knowledge_graph_recovered.pkl")
    )

    print("\n" + "="*60)
    print("✅ Processing complete!")
    print("="*60)

    # Show statistics
    stats = graph.get_statistics()
    print("\n📊 Knowledge graph statistics:")
    print(f"  • Nodes: {stats['num_nodes']}")
    print(f"  • Edges: {stats['num_edges']}")
    print(f"  • Communities: {stats['num_communities']}")
    print(f"  • Graph density: {stats['density']:.4f}")

except KeyboardInterrupt:
    print("\n⚠️ Processing interrupted by the user")
    print("   Note the current position and resume later")

except Exception as e:
    print("\n❌ An error occurred during processing:")
    print(f"   {type(e).__name__}: {e}")
    print("\nSuggestions:")
    print("  1. Check the error message above")
    print("  2. If a single document timed out, try skipping it")
    print("  3. If Ollama is the problem, restart the service")

    import traceback
    print("\nFull traceback:")
    traceback.print_exc()

# ==================== Done ====================
print("\n" + "="*60)
print("Script finished")
print("="*60)
print("\n💡 Tips:")
print("  • On a timeout, check the error message above")
print("  • Change the choice variable to skip problem documents")
print("  • Change batch_size to tune processing speed")
print("  • The graph is saved to: knowledge_graph_recovered.pkl")
COLAB_QUICK_CONTINUE.py ADDED
@@ -0,0 +1,121 @@
"""
Colab quick-continue script - recover from a timeout.
Copy into Colab and run; it sets up the environment and continues processing.
"""

print("🚀 GraphRAG recovery script v2.0")
print("="*60)

import sys
import os

# ==================== 1. Set up the environment ====================
print("\n1️⃣ Setting up the environment...")

# Set the project path
project_path = '/content/drive/MyDrive/adaptive_RAG'
if project_path not in sys.path:
    sys.path.insert(0, project_path)
print(f"   ✅ Project path: {project_path}")

# ==================== 2. Restart Ollama ====================
print("\n2️⃣ Restarting Ollama...")

import subprocess
import time

subprocess.run(['pkill', '-9', 'ollama'], stderr=subprocess.DEVNULL)
time.sleep(2)

ollama_process = subprocess.Popen(
    ["ollama", "serve"],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE
)
time.sleep(5)

import requests
try:
    r = requests.get('http://localhost:11434/api/tags', timeout=5)
    print("   ✅ Ollama is running" if r.status_code == 200 else f"   ⚠️ Status code: {r.status_code}")
except requests.RequestException:
    print("   ❌ Ollama is not responding")

# ==================== 3. Load documents ====================
print("\n3️⃣ Loading documents...")

from config import setup_environment
from document_processor import DocumentProcessor

setup_environment()

# Create the document processor
doc_processor = DocumentProcessor()

# Load documents (using the default URLs)
vectorstore, retriever, doc_splits = doc_processor.setup_knowledge_base(
    enable_graphrag=True
)

print(f"   ✅ Loaded {len(doc_splits)} documents")

# ==================== 4. Patch the timeout configuration ====================
print("\n4️⃣ Increasing the timeout...")

entity_file = os.path.join(project_path, 'entity_extractor.py')
with open(entity_file, 'r', encoding='utf-8') as f:
    content = f.read()

# Patch the default parameters
if 'timeout: int = 60' in content:
    content = content.replace(
        'timeout: int = 60, max_retries: int = 3',
        'timeout: int = 180, max_retries: int = 5'
    )
    with open(entity_file, 'w', encoding='utf-8') as f:
        f.write(content)
    print("   ✅ Timeout changed to 180 s, retries to 5")
else:
    print("   ℹ️ Configuration is already patched")

# Reload the modules so the patch takes effect
import importlib
for mod in ['entity_extractor', 'graph_indexer']:
    if mod in sys.modules:
        importlib.reload(sys.modules[mod])

# ==================== 5. Continue processing ====================
print("\n5️⃣ Continuing document processing...")
print("="*60)

from graph_indexer import GraphRAGIndexer

# Configure the start position
START_INDEX = 55  # 👈 start from document #56; change this to skip documents
BATCH_SIZE = 3    # 👈 batch size; can be set to 1-5

print(f"\n   Start position: document #{START_INDEX + 1}")
print(f"   Batch size: {BATCH_SIZE}")
print(f"   To process: {len(doc_splits) - START_INDEX} documents\n")

remaining_docs = doc_splits[START_INDEX:]

indexer = GraphRAGIndexer()

try:
    graph = indexer.index_documents(
        documents=remaining_docs,
        batch_size=BATCH_SIZE,
        save_path=f"{project_path}/knowledge_graph_recovered.pkl"
    )

    print("\n✅ Processing complete!")
    stats = graph.get_statistics()
    print(f"📊 Nodes: {stats['num_nodes']}, edges: {stats['num_edges']}, communities: {stats['num_communities']}")

except Exception as e:
    print(f"\n❌ Error: {e}")
    print("\nSuggestions:")
    print("  • If document #56 times out, set START_INDEX = 56 to skip it")
    print("  • If Ollama crashed, rerun this script")
    print("  • Reduce BATCH_SIZE to 1 or 2")
TIMEOUT_QUICK_FIX_CN.md ADDED
@@ -0,0 +1,324 @@
# Timeout Quick-Fix Guide

## 🚨 The problem

You are hitting this error:
```
🔄 Extracting entities (attempt 1/3)... ❌ Error: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)
```

**Cause**: document #56 took longer than 60 seconds to process, and Ollama did not return a result within the time limit.

## ⚡ Immediate fix (3 steps)

### Step 1: restart the Ollama service

Run in Colab:
```bash
!pkill -9 ollama
!sleep 2
!nohup ollama serve > /tmp/ollama.log 2>&1 &
!sleep 5
!curl http://localhost:11434/api/tags
```

### Step 2: increase the timeout

In your Colab notebook, modify the initialization code:

```python
# Find where entity_extractor is imported and change it to:
from entity_extractor import EntityExtractor

# Create an extractor with a longer timeout
# by monkey-patching directly in Python
import entity_extractor

# Save the original __init__
_original_init = entity_extractor.EntityExtractor.__init__

# New __init__ that defaults to a longer timeout
def _new_init(self, timeout=180, max_retries=5):
    _original_init(self, timeout=timeout, max_retries=max_retries)

# Replace the __init__ method
entity_extractor.EntityExtractor.__init__ = _new_init

print("✅ Timeout increased to 180 seconds (3 minutes)")
```

### Step 3: continue processing (skip what is already done)

```python
# Continue from document #56 (index 55)
processed_count = 55

remaining_docs = doc_splits[processed_count:]

graph = indexer.index_documents(
    documents=remaining_docs,
    batch_size=3,  # smaller batch size
    save_path="/content/drive/MyDrive/knowledge_graph.pkl"
)
```

## 🎯 Complete Colab code block

Copy and paste into a new Colab code cell:

```python
print("🔧 Fixing the timeout problem...")
print("="*60)

# ========== Step 1: restart Ollama ==========
print("\n1️⃣ Restarting the Ollama service...")
!pkill -9 ollama
!sleep 2
!nohup ollama serve > /tmp/ollama.log 2>&1 &
!sleep 5

# Verify that Ollama is up
import requests
try:
    response = requests.get('http://localhost:11434/api/tags', timeout=5)
    if response.status_code == 200:
        print("✅ Ollama service is running")
    else:
        print("⚠️ Ollama may not have started correctly")
except requests.RequestException:
    print("❌ Ollama service is not responding; check the log")

# ========== Step 2: increase the timeout ==========
print("\n2️⃣ Patching the timeout configuration...")

import sys
sys.path.insert(0, '/content/drive/MyDrive/adaptive_RAG')

import entity_extractor

# Save the original __init__
_original_init = entity_extractor.EntityExtractor.__init__

# New __init__: 3-minute timeout and 5 retries by default
def _new_init(self, timeout=180, max_retries=5):
    from langchain_community.chat_models import ChatOllama
    from langchain_core.output_parsers import JsonOutputParser
    from config import LOCAL_LLM
    try:
        from langchain_core.prompts import PromptTemplate
    except ImportError:
        from langchain.prompts import PromptTemplate

    self.llm = ChatOllama(
        model=LOCAL_LLM,
        format="json",
        temperature=0,
        timeout=timeout
    )
    self.max_retries = max_retries

    # Entity-extraction prompt template
    self.entity_prompt = PromptTemplate(
        template="""You are an expert at entity recognition. Extract all important entities from the text below.

Entity types:
- PERSON: people, authors, researchers
- ORGANIZATION: organizations, institutions, companies
- CONCEPT: technical concepts, algorithms, methodologies
- TECHNOLOGY: concrete technologies, tools, frameworks
- PAPER: papers, publications
- EVENT: events, conferences

Text:
{text}

Return JSON with the following fields:
{{
    "entities": [
        {{
            "name": "entity name",
            "type": "entity type",
            "description": "short description"
        }}
    ]
}}

Return only the JSON, with no preamble or explanation.
""",
        input_variables=["text"]
    )

    # Relation-extraction prompt template
    self.relation_prompt = PromptTemplate(
        template="""You are an expert at relation extraction. Identify relations between entities in the text.

Known entities:
{entities}

Text:
{text}

Identify the relations between the entities and return JSON:
{{
    "relations": [
        {{
            "source": "source entity name",
            "target": "target entity name",
            "relation_type": "relation type",
            "description": "relation description"
        }}
    ]
}}

Relation types: AUTHOR_OF, USES, BASED_ON, RELATED_TO, PART_OF, APPLIES_TO, IMPROVES, CITES

Return only the JSON, with no preamble or explanation.
""",
        input_variables=["text", "entities"]
    )

    self.entity_chain = self.entity_prompt | self.llm | JsonOutputParser()
    self.relation_chain = self.relation_prompt | self.llm | JsonOutputParser()

# Apply the patch
entity_extractor.EntityExtractor.__init__ = _new_init

print("✅ Timeout increased to 180 seconds (3 minutes)")
print("✅ Retry count increased to 5")

# ========== Step 3: continue processing ==========
print("\n3️⃣ Preparing to continue...")

# Reload the module so the changes take effect
import importlib
if 'graph_indexer' in sys.modules:
    importlib.reload(sys.modules['graph_indexer'])

from graph_indexer import GraphRAGIndexer

# Create a fresh indexer
indexer = GraphRAGIndexer()

print("\n📋 Current state:")
print(f"  • Total documents: {len(doc_splits)}")
print("  • Processed: 55 documents (indices 0-54)")
print(f"  • Remaining: {len(doc_splits) - 55} documents (indices 55-{len(doc_splits)-1})")

# Continue from document #56
processed_count = 55
remaining_docs = doc_splits[processed_count:]

print("\n🚀 Processing the remaining documents...")
print("="*60)

graph = indexer.index_documents(
    documents=remaining_docs,
    batch_size=3,  # smaller batch size to reduce load
    save_path="/content/drive/MyDrive/knowledge_graph_partial.pkl"
)

print("\n✅ Processing complete!")
```

## 📊 If document #56 still times out

If document #56 still fails after the longer timeout, its content is probably unusually complex. You can skip it:

```python
# Option A: skip document #56
print("Skipping document #56, continuing from #57...")
processed_count = 56  # skip #56
remaining_docs = doc_splits[processed_count:]

graph = indexer.index_documents(
    documents=remaining_docs,
    batch_size=3,
    save_path="/content/drive/MyDrive/knowledge_graph_partial.pkl"
)
```

Or inspect the document on its own:

```python
# Option B: inspect the content of document #56
problem_doc = doc_splits[55]  # document #56 (index 55)

print("Document #56 info:")
print(f"  Length: {len(problem_doc.page_content)} characters")
print("  First 500 characters:")
print(f"  {problem_doc.page_content[:500]}")
print("\n  Last 500 characters:")
print(f"  {problem_doc.page_content[-500:]}")

# If the document is very long, consider splitting it (see the sketch below)
if len(problem_doc.page_content) > 3000:
    print("\n⚠️ The document is long; it may need more time or should be split")
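If splitting is the way to go, here is a minimal sketch using LangChain's `RecursiveCharacterTextSplitter`. This commit does not show which splitter `DocumentProcessor` uses, so the splitter choice and the chunk sizes are assumptions; only the `Document`/`page_content` shape is taken from the code above.

```python
# Hedged sketch: split the oversized chunk before re-indexing.
# chunk_size / chunk_overlap values are illustrative, not from this repo.
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)
sub_docs = splitter.split_documents([problem_doc])
print(f"Split document #56 into {len(sub_docs)} smaller chunks")

# Index the smaller pieces instead of the original chunk
graph = indexer.index_documents(documents=sub_docs, batch_size=1)
```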
## 🔍 Monitoring progress

After the fix, you will see more detailed output:

```
⚙️ === Batch 19/20 (documents 56-58) ===

🔍 Document #56: starting extraction...
🔄 Extracting entities (attempt 1/5)... ✅ extracted 8 entities
🔄 Extracting relations (attempt 1/5)... ✅ extracted 5 relations
📊 Document #56 done: 8 entities, 5 relations
```

## 📌 Parameters

| Parameter | Old value | New value | Meaning |
|-----------|-----------|-----------|---------|
| `timeout` | 60 s | 180 s | maximum wait per request |
| `max_retries` | 3 | 5 | retries after a failure |
| `batch_size` | 10 | 3 | documents processed per batch |

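For reference, these three values map onto the helpers added in this commit; a minimal usage sketch (assuming `remaining_docs` is defined as in Step 3):

```python
# timeout / max_retries are applied inside the EntityExtractor;
# batch_size is passed per call to index_documents().
from fix_timeout_issue import GraphRAGIndexerWithLongerTimeout

indexer = GraphRAGIndexerWithLongerTimeout(timeout=180, max_retries=5)
graph = indexer.index_documents(documents=remaining_docs, batch_size=3)
```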
## ⏱️ Expected runtime

- **Per document**: 10-180 seconds (depending on complexity)
- **Between batches**: 2-10 second waits on retries
- **Total**: for 100 documents, expect 20-60 minutes

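As a rough sanity check of the total above, a back-of-envelope sketch; the 15-35 s average per document is an assumption chosen to sit inside the 10-180 s range, not a measured value:

```python
# Hedged estimate: the per-document averages are assumed, not measured.
n_docs = 100
avg_low, avg_high = 15, 35  # assumed typical seconds per document

print(f"Estimated total: {n_docs * avg_low / 60:.0f}-{n_docs * avg_high / 60:.0f} minutes")
# -> "Estimated total: 25-58 minutes", consistent with the 20-60 minute figure
```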
## 🆘 If the problem persists

### Check the Ollama log
```bash
!tail -n 50 /tmp/ollama.log
```

### Check system resources
```python
# Check GPU memory
!nvidia-smi

# Check RAM
import psutil
print(f"Memory usage: {psutil.virtual_memory().percent}%")
```

### Use a smaller model
If Mistral is too slow, switch to a faster model in `config.py`:
```python
LOCAL_LLM = "phi:latest"  # faster, slightly lower quality
# or
LOCAL_LLM = "llama2:7b"  # a balanced choice
```

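Note that whichever model you pick must already be pulled into Ollama; if it is not, something like this (run once in a Colab cell) fetches it:

```python
# One-time model download; use the tag that matches LOCAL_LLM in config.py
!ollama pull phi
```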
## 📝 Summary

**Most likely fixes**:
1. ✅ Restart Ollama (clears memory)
2. ✅ Increase the timeout to 180 seconds
3. ✅ Reduce the batch size to 3
4. ✅ Resume from the breakpoint

**Emergencies**:
- If one document keeps failing → skip it
- If Ollama crashes → restart the service
- If memory runs out → use a smaller model

Now run the "Complete Colab code block" above; it should resolve the problem! 🚀
fix_timeout_issue.py ADDED
@@ -0,0 +1,144 @@
"""
Temporary workaround for the timeout problem.
Run this script in Colab to increase the timeout.
"""

import sys
import os

# Make sure the project path is set
sys.path.insert(0, '/content/drive/MyDrive/adaptive_RAG')

print("🔧 Fixing the timeout problem...")
print("="*60)

# Option 1: change the entity_extractor timeout settings
print("\n📝 Option 1: increase the timeout and retry count")
print("-"*60)

# Re-import and subclass
from entity_extractor import EntityExtractor, EntityDeduplicator
from graph_indexer import GraphRAGIndexer

# Custom GraphRAG indexer that uses a longer timeout
class GraphRAGIndexerWithLongerTimeout(GraphRAGIndexer):
    """GraphRAG indexer with an increased timeout."""

    def __init__(self, timeout=180, max_retries=5):
        """
        Initialize the indexer with a longer timeout.

        Args:
            timeout: timeout in seconds, default 180 (3 minutes)
            max_retries: maximum number of retries, default 5
        """
        print(f"🚀 Initializing GraphRAG indexer (timeout: {timeout} s, retries: {max_retries})...")

        # Initialize the entity extractor with the longer timeout
        self.entity_extractor = EntityExtractor(
            timeout=timeout,
            max_retries=max_retries
        )
        self.entity_deduplicator = EntityDeduplicator()

        # Import the remaining required classes
        from knowledge_graph import KnowledgeGraph, CommunitySummarizer
        self.knowledge_graph = KnowledgeGraph()
        self.community_summarizer = CommunitySummarizer()

        self.indexed = False

        print("✅ GraphRAG indexer initialized")


# Option 2: quick restart instructions
print("\n📝 Option 2: restart the Ollama service")
print("-"*60)
print("Run the following commands:")
print("  !pkill -9 ollama")
print("  !sleep 2")
print("  !nohup ollama serve > /tmp/ollama.log 2>&1 &")
print("  !sleep 5")
print("  !curl http://localhost:11434/api/tags")


# Option 3: skip the current document
print("\n📝 Option 3: skip the problem document and continue")
print("-"*60)
print("If a document keeps failing, you can skip it:")
print("""
# Example: continue processing from document #57
problem_doc_index = 55  # index of document #56
doc_splits_filtered = doc_splits[:problem_doc_index] + doc_splits[problem_doc_index+1:]

# Use the filtered document list
graph = indexer.index_documents(
    documents=doc_splits_filtered,
    batch_size=3
)
""")


# Usage example
print("\n" + "="*60)
print("✅ Fix options ready")
print("="*60)
print("\n💡 Recommended usage:")
print("-"*60)

usage_example = """
# 1. Import the patched indexer
from fix_timeout_issue import GraphRAGIndexerWithLongerTimeout

# 2. Create an indexer with the longer timeout (3 minutes)
indexer = GraphRAGIndexerWithLongerTimeout(
    timeout=180,    # 3-minute timeout
    max_retries=5   # 5 retries
)

# 3. Continue with a smaller batch size
# If some documents have already been processed, skip them
processed_count = 55  # processed up to document #55

remaining_docs = doc_splits[processed_count:]

graph = indexer.index_documents(
    documents=remaining_docs,
    batch_size=3,  # smaller batches
    save_path="/content/drive/MyDrive/knowledge_graph.pkl"
)

# 4. If it still times out, consider skipping the problem document
# problem_indices = [55]  # index of document #56
# remaining_docs_filtered = [doc for i, doc in enumerate(doc_splits[processed_count:])
#                            if (processed_count + i) not in problem_indices]
"""

print(usage_example)

print("\n" + "="*60)
print("🎯 Steps to run right now:")
print("="*60)
print("""
1️⃣ First restart the Ollama service:
   !pkill -9 ollama; sleep 2; nohup ollama serve > /tmp/ollama.log 2>&1 & sleep 5

2️⃣ Then continue with the longer timeout:
   from fix_timeout_issue import GraphRAGIndexerWithLongerTimeout
   indexer = GraphRAGIndexerWithLongerTimeout(timeout=180, max_retries=5)

3️⃣ Continue from document #56 (with a smaller batch size):
   remaining_docs = doc_splits[55:]  # start from document #56
   graph = indexer.index_documents(remaining_docs, batch_size=3)

4️⃣ If document #56 still times out, skip it:
   remaining_docs = doc_splits[56:]  # skip document #56, start from #57
   graph = indexer.index_documents(remaining_docs, batch_size=3)
""")

print("\n⚠️ Notes:")
print("  • A timeout usually means the document is complex or Ollama is overloaded")
print("  • Restarting Ollama usually resolves load problems")
print("  • A longer timeout (180 s) handles complex documents")
print("  • A smaller batch size (3 documents/batch) reduces load")
print("  • If a document keeps failing, consider skipping it")