Spaces:
Paused
Paused
lanny xu
committed on
Commit
·
e427a94
1
Parent(s):
69629dd
delete files
Browse files- KAGGLE_CHECK_OLLAMA.py +0 -181
- KAGGLE_FIX_OLLAMA_CONNECTION.py +0 -233
- KAGGLE_LOAD_OLLAMA.py +0 -268
- KAGGLE_QUICK_START.py +0 -197
- KAGGLE_SAVE_OLLAMA.py +0 -282
KAGGLE_CHECK_OLLAMA.py
DELETED
|
@@ -1,181 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Kaggle Ollama 备份与加载 - 快速验证脚本
|
| 3 |
-
|
| 4 |
-
这个脚本帮助你验证 Ollama 和模型的位置,确保备份方案正确
|
| 5 |
-
|
| 6 |
-
在 Kaggle Notebook 中运行此脚本,检查环境
|
| 7 |
-
"""
|
| 8 |
-
|
| 9 |
-
import os
|
| 10 |
-
import subprocess
|
| 11 |
-
import shutil
|
| 12 |
-
|
| 13 |
-
print("="*70)
|
| 14 |
-
print("🔍 Kaggle Ollama 环境检查")
|
| 15 |
-
print("="*70)
|
| 16 |
-
|
| 17 |
-
# ==================== 检查 Ollama 安装 ====================
|
| 18 |
-
print("\n📍 步骤 1: 检查 Ollama 安装位置")
|
| 19 |
-
|
| 20 |
-
ollama_bin = shutil.which('ollama')
|
| 21 |
-
if ollama_bin:
|
| 22 |
-
print(f" ✅ Ollama 已安装")
|
| 23 |
-
print(f" 📂 位置: {ollama_bin}")
|
| 24 |
-
|
| 25 |
-
# 检查文件信息
|
| 26 |
-
file_size = os.path.getsize(ollama_bin) / (1024**2)
|
| 27 |
-
print(f" 📊 大小: {file_size:.2f} MB")
|
| 28 |
-
|
| 29 |
-
# 检查版本
|
| 30 |
-
version_result = subprocess.run(['ollama', '--version'], capture_output=True, text=True)
|
| 31 |
-
if version_result.returncode == 0:
|
| 32 |
-
print(f" 📌 版本: {version_result.stdout.strip()}")
|
| 33 |
-
else:
|
| 34 |
-
print(" ❌ Ollama 未安装")
|
| 35 |
-
print(" 💡 请先运行安装:")
|
| 36 |
-
print(" !curl -fsSL https://ollama.com/install.sh | sh")
|
| 37 |
-
|
| 38 |
-
# ==================== 检查 Ollama 服务 ====================
|
| 39 |
-
print("\n📍 步骤 2: 检查 Ollama 服务状态")
|
| 40 |
-
|
| 41 |
-
ps_check = subprocess.run(['pgrep', '-f', 'ollama serve'], capture_output=True)
|
| 42 |
-
if ps_check.returncode == 0:
|
| 43 |
-
print(" ✅ Ollama 服务正在运行")
|
| 44 |
-
else:
|
| 45 |
-
print(" ⚠️ Ollama 服务未运行")
|
| 46 |
-
print(" 💡 请启动服务:")
|
| 47 |
-
print(" import subprocess, time")
|
| 48 |
-
print(" subprocess.Popen(['ollama', 'serve'])")
|
| 49 |
-
print(" time.sleep(15)")
|
| 50 |
-
|
| 51 |
-
# ==================== 检查模型位置 ====================
|
| 52 |
-
print("\n📍 步骤 3: 检查模型存储位置")
|
| 53 |
-
|
| 54 |
-
possible_dirs = [
|
| 55 |
-
"~/.ollama",
|
| 56 |
-
"/root/.ollama",
|
| 57 |
-
"~/.ollama/models",
|
| 58 |
-
"/root/.ollama/models"
|
| 59 |
-
]
|
| 60 |
-
|
| 61 |
-
found_dirs = []
|
| 62 |
-
for dir_path in possible_dirs:
|
| 63 |
-
expanded_path = os.path.expanduser(dir_path)
|
| 64 |
-
if os.path.exists(expanded_path):
|
| 65 |
-
# 计算目录大小
|
| 66 |
-
total_size = 0
|
| 67 |
-
file_count = 0
|
| 68 |
-
|
| 69 |
-
for dirpath, dirnames, filenames in os.walk(expanded_path):
|
| 70 |
-
for filename in filenames:
|
| 71 |
-
fp = os.path.join(dirpath, filename)
|
| 72 |
-
if os.path.exists(fp):
|
| 73 |
-
total_size += os.path.getsize(fp)
|
| 74 |
-
file_count += 1
|
| 75 |
-
|
| 76 |
-
size_gb = total_size / (1024**3)
|
| 77 |
-
print(f"\n ✅ 找到: {expanded_path}")
|
| 78 |
-
print(f" 📊 大小: {size_gb:.2f} GB")
|
| 79 |
-
print(f" 📁 文件数: {file_count}")
|
| 80 |
-
|
| 81 |
-
# 显示目录结构
|
| 82 |
-
print(f" 📂 内容:")
|
| 83 |
-
for item in os.listdir(expanded_path)[:10]: # 只显示前10个
|
| 84 |
-
item_path = os.path.join(expanded_path, item)
|
| 85 |
-
if os.path.isdir(item_path):
|
| 86 |
-
print(f" • {item}/ (目录)")
|
| 87 |
-
else:
|
| 88 |
-
size = os.path.getsize(item_path) / (1024**2)
|
| 89 |
-
print(f" • {item} ({size:.2f} MB)")
|
| 90 |
-
|
| 91 |
-
found_dirs.append((expanded_path, size_gb))
|
| 92 |
-
|
| 93 |
-
if not found_dirs:
|
| 94 |
-
print("\n ❌ 未找到模型目录")
|
| 95 |
-
print(" 💡 请先下载模型:")
|
| 96 |
-
print(" !ollama pull mistral")
|
| 97 |
-
|
| 98 |
-
# ==================== 检查已下载的模型 ====================
|
| 99 |
-
print("\n📍 步骤 4: 检查已下载的模型")
|
| 100 |
-
|
| 101 |
-
if ollama_bin and ps_check.returncode == 0:
|
| 102 |
-
list_result = subprocess.run(['ollama', 'list'], capture_output=True, text=True)
|
| 103 |
-
if list_result.returncode == 0:
|
| 104 |
-
print("\n 已下载的模型:")
|
| 105 |
-
print(" " + "-"*60)
|
| 106 |
-
print(" " + list_result.stdout)
|
| 107 |
-
else:
|
| 108 |
-
print(" ⚠️ 无法获取模型列表")
|
| 109 |
-
print(" 请确保 Ollama 服务正在运行")
|
| 110 |
-
else:
|
| 111 |
-
print(" ⚠️ Ollama 服务未运行,无法检查模型")
|
| 112 |
-
|
| 113 |
-
# ==================== 推荐备份方案 ====================
|
| 114 |
-
print("\n" + "="*70)
|
| 115 |
-
print("📋 推荐备份方案")
|
| 116 |
-
print("="*70)
|
| 117 |
-
|
| 118 |
-
if found_dirs:
|
| 119 |
-
# 选择最大的目录(通常是完整的 .ollama 目录)
|
| 120 |
-
backup_dir = max(found_dirs, key=lambda x: x[1])[0]
|
| 121 |
-
backup_size = max(found_dirs, key=lambda x: x[1])[1]
|
| 122 |
-
|
| 123 |
-
print(f"\n推荐备份目录: {backup_dir}")
|
| 124 |
-
print(f"预计压缩包大小: ~{backup_size:.2f} GB")
|
| 125 |
-
|
| 126 |
-
print(f"\n💾 备份步骤:")
|
| 127 |
-
print(f"""
|
| 128 |
-
1. 使用 KAGGLE_SAVE_OLLAMA.py 脚本
|
| 129 |
-
exec(open('KAGGLE_SAVE_OLLAMA.py').read())
|
| 130 |
-
|
| 131 |
-
2. 脚本会自动:
|
| 132 |
-
• 找到 Ollama 二进制文件: {ollama_bin if ollama_bin else '未找到'}
|
| 133 |
-
• 打包模型目录: {backup_dir}
|
| 134 |
-
• 生成压缩包: /kaggle/working/ollama_backup/
|
| 135 |
-
|
| 136 |
-
3. 下载并创建 Dataset:
|
| 137 |
-
• 在 Notebook 右侧 Output 下载 ollama_backup 目录
|
| 138 |
-
• 访问 https://www.kaggle.com/datasets 创建 Dataset
|
| 139 |
-
• 上传 ollama 和 ollama_models.tar.gz
|
| 140 |
-
|
| 141 |
-
4. 后续使用:
|
| 142 |
-
• 添加 Dataset 到 Notebook
|
| 143 |
-
• 运行 KAGGLE_LOAD_OLLAMA.py
|
| 144 |
-
• 40-50秒完成加载!
|
| 145 |
-
""")
|
| 146 |
-
|
| 147 |
-
# 估算上传时间
|
| 148 |
-
upload_time_min = int(backup_size * 2) # 假设 2 分钟/GB
|
| 149 |
-
upload_time_max = int(backup_size * 5) # 假设 5 分钟/GB
|
| 150 |
-
|
| 151 |
-
print(f"⏱️ 预计时间:")
|
| 152 |
-
print(f" • 压缩时间: {int(backup_size * 0.5)}-{int(backup_size)} 分钟")
|
| 153 |
-
print(f" • 下载时间: {int(backup_size * 1)}-{int(backup_size * 3)} 分钟(取决于网络)")
|
| 154 |
-
print(f" • 上传时间: {upload_time_min}-{upload_time_max} 分钟(取决于网络)")
|
| 155 |
-
print(f" • 首次总计: ~{int(backup_size * 4)}-{int(backup_size * 10)} 分钟(一次性)")
|
| 156 |
-
print(f" • 后续加载: 40-50 秒(每次)")
|
| 157 |
-
|
| 158 |
-
else:
|
| 159 |
-
print("\n⚠️ 未找到模型目录,无法提供备份方案")
|
| 160 |
-
print("请先安装 Ollama 并下载模型")
|
| 161 |
-
|
| 162 |
-
# ==================== 环境摘要 ====================
|
| 163 |
-
print("\n" + "="*70)
|
| 164 |
-
print("📊 环境摘要")
|
| 165 |
-
print("="*70)
|
| 166 |
-
|
| 167 |
-
print(f"""
|
| 168 |
-
Ollama 安装: {'✅ 是' if ollama_bin else '❌ 否'}
|
| 169 |
-
Ollama 服务: {'✅ 运行中' if ps_check.returncode == 0 else '❌ 未运行'}
|
| 170 |
-
模型目录: {'✅ 找到 ' + str(len(found_dirs)) + ' 个' if found_dirs else '❌ 未找到'}
|
| 171 |
-
已下载模型: {'✅ 有' if ollama_bin and ps_check.returncode == 0 else '⚠️ 无法确认'}
|
| 172 |
-
|
| 173 |
-
准备就绪: {'✅ 可以开始备份' if (ollama_bin and found_dirs) else '❌ 请先完成安装和模型下载'}
|
| 174 |
-
""")
|
| 175 |
-
|
| 176 |
-
if ollama_bin and found_dirs:
|
| 177 |
-
print("💡 下一步: 运行 KAGGLE_SAVE_OLLAMA.py 开始备份")
|
| 178 |
-
else:
|
| 179 |
-
print("💡 下一步: 完成 Ollama 安装和模型下载")
|
| 180 |
-
|
| 181 |
-
print("\n" + "="*70)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
KAGGLE_FIX_OLLAMA_CONNECTION.py
DELETED
|
@@ -1,233 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
Kaggle Ollama 连接问题诊断和修复脚本
|
| 4 |
-
解决 GraphRAG 异步处理时的连接错误
|
| 5 |
-
"""
|
| 6 |
-
|
| 7 |
-
import subprocess
|
| 8 |
-
import time
|
| 9 |
-
import requests
|
| 10 |
-
import os
|
| 11 |
-
|
| 12 |
-
def check_ollama_service():
|
| 13 |
-
"""检查 Ollama 服务状态"""
|
| 14 |
-
print("="*70)
|
| 15 |
-
print("🔍 Ollama 服务诊断")
|
| 16 |
-
print("="*70)
|
| 17 |
-
|
| 18 |
-
# 1. 检查进程
|
| 19 |
-
print("\n1️⃣ 检查 Ollama 进程...")
|
| 20 |
-
ps_check = subprocess.run(['pgrep', '-f', 'ollama serve'], capture_output=True)
|
| 21 |
-
|
| 22 |
-
if ps_check.returncode == 0:
|
| 23 |
-
print(" ✅ Ollama 进程正在运行")
|
| 24 |
-
pids = ps_check.stdout.decode().strip().split('\n')
|
| 25 |
-
print(f" 📊 进程 PID: {', '.join(pids)}")
|
| 26 |
-
else:
|
| 27 |
-
print(" ❌ Ollama 进程未运行")
|
| 28 |
-
return False
|
| 29 |
-
|
| 30 |
-
# 2. 检查端口
|
| 31 |
-
print("\n2️⃣ 检查端口 11434...")
|
| 32 |
-
port_check = subprocess.run(
|
| 33 |
-
['netstat', '-tuln'],
|
| 34 |
-
capture_output=True,
|
| 35 |
-
text=True
|
| 36 |
-
)
|
| 37 |
-
|
| 38 |
-
if '11434' in port_check.stdout:
|
| 39 |
-
print(" ✅ 端口 11434 已监听")
|
| 40 |
-
else:
|
| 41 |
-
print(" ❌ 端口 11434 未监听")
|
| 42 |
-
return False
|
| 43 |
-
|
| 44 |
-
# 3. 测试 API 连接
|
| 45 |
-
print("\n3️⃣ 测试 API 连接...")
|
| 46 |
-
try:
|
| 47 |
-
response = requests.get('http://localhost:11434/api/tags', timeout=5)
|
| 48 |
-
if response.status_code == 200:
|
| 49 |
-
print(" ✅ API 连接正常")
|
| 50 |
-
models = response.json().get('models', [])
|
| 51 |
-
print(f" 📦 可用模型: {len(models)}")
|
| 52 |
-
for model in models:
|
| 53 |
-
print(f" • {model.get('name', 'unknown')}")
|
| 54 |
-
return True
|
| 55 |
-
else:
|
| 56 |
-
print(f" ❌ API 返回错误: {response.status_code}")
|
| 57 |
-
return False
|
| 58 |
-
except Exception as e:
|
| 59 |
-
print(f" ❌ API 连接失败: {e}")
|
| 60 |
-
return False
|
| 61 |
-
|
| 62 |
-
def start_ollama_service():
|
| 63 |
-
"""启动 Ollama 服务"""
|
| 64 |
-
print("\n"+"="*70)
|
| 65 |
-
print("🚀 启动 Ollama 服务")
|
| 66 |
-
print("="*70)
|
| 67 |
-
|
| 68 |
-
# 先杀死可能存在的僵尸进程
|
| 69 |
-
print("\n1️⃣ 清理旧进程...")
|
| 70 |
-
subprocess.run(['pkill', '-9', 'ollama'], capture_output=True)
|
| 71 |
-
time.sleep(2)
|
| 72 |
-
|
| 73 |
-
# 启动服务
|
| 74 |
-
print("\n2️⃣ 启动新服务...")
|
| 75 |
-
process = subprocess.Popen(
|
| 76 |
-
['ollama', 'serve'],
|
| 77 |
-
stdout=subprocess.PIPE,
|
| 78 |
-
stderr=subprocess.PIPE,
|
| 79 |
-
env=os.environ.copy()
|
| 80 |
-
)
|
| 81 |
-
|
| 82 |
-
print(f" ✅ 服务进程已启动 (PID: {process.pid})")
|
| 83 |
-
|
| 84 |
-
# 等待服务就绪
|
| 85 |
-
print("\n3️⃣ 等待服务就绪...")
|
| 86 |
-
max_wait = 30
|
| 87 |
-
for i in range(max_wait):
|
| 88 |
-
try:
|
| 89 |
-
response = requests.get('http://localhost:11434/api/tags', timeout=2)
|
| 90 |
-
if response.status_code == 200:
|
| 91 |
-
print(f" ✅ 服务就绪!(耗时 {i+1} 秒)")
|
| 92 |
-
return True
|
| 93 |
-
except:
|
| 94 |
-
pass
|
| 95 |
-
|
| 96 |
-
if i < max_wait - 1:
|
| 97 |
-
print(f" ⏳ 等待中... ({i+1}/{max_wait})", end='\r')
|
| 98 |
-
time.sleep(1)
|
| 99 |
-
|
| 100 |
-
print(f"\n ⚠️ 服务启动超时,但可能仍在初始化中")
|
| 101 |
-
return False
|
| 102 |
-
|
| 103 |
-
def test_generation():
|
| 104 |
-
"""测试生成功能"""
|
| 105 |
-
print("\n"+"="*70)
|
| 106 |
-
print("🧪 测试文本生成")
|
| 107 |
-
print("="*70)
|
| 108 |
-
|
| 109 |
-
print("\n ℹ️ 首次调用会加载模型到内存,需要 30-60 秒...")
|
| 110 |
-
print(" ⏳ 请耐心等待...\n")
|
| 111 |
-
|
| 112 |
-
try:
|
| 113 |
-
response = requests.post(
|
| 114 |
-
'http://localhost:11434/api/generate',
|
| 115 |
-
json={
|
| 116 |
-
"model": "mistral",
|
| 117 |
-
"prompt": "Say 'Hello' in one word",
|
| 118 |
-
"stream": False
|
| 119 |
-
},
|
| 120 |
-
timeout=120 # 增加到 120 秒,首次加载模型需要时间
|
| 121 |
-
)
|
| 122 |
-
|
| 123 |
-
if response.status_code == 200:
|
| 124 |
-
result = response.json()
|
| 125 |
-
print(f" ✅ 生成成功")
|
| 126 |
-
print(f" 📝 响应: {result.get('response', '')[:100]}")
|
| 127 |
-
return True
|
| 128 |
-
else:
|
| 129 |
-
print(f" ❌ 生成失败: {response.status_code}")
|
| 130 |
-
return False
|
| 131 |
-
except requests.exceptions.Timeout:
|
| 132 |
-
print(f" ⚠️ 生成超时(但这可能是模型加载中)")
|
| 133 |
-
print(f" 💡 建议:再等待 30 秒后重试")
|
| 134 |
-
return False
|
| 135 |
-
except Exception as e:
|
| 136 |
-
print(f" ❌ 生成错误: {e}")
|
| 137 |
-
return False
|
| 138 |
-
|
| 139 |
-
def main():
|
| 140 |
-
"""主函数"""
|
| 141 |
-
print("\n" + "="*70)
|
| 142 |
-
print("🔧 Kaggle Ollama 连接问题修复工具")
|
| 143 |
-
print("="*70)
|
| 144 |
-
print("\n解决问题: Cannot connect to host localhost:11434")
|
| 145 |
-
print("场景: GraphRAG 异步批处理时")
|
| 146 |
-
|
| 147 |
-
# 检查服务
|
| 148 |
-
is_running = check_ollama_service()
|
| 149 |
-
|
| 150 |
-
if not is_running:
|
| 151 |
-
print("\n⚠️ Ollama 服务未正常运行,正在修复...")
|
| 152 |
-
start_ollama_service()
|
| 153 |
-
|
| 154 |
-
# 再次检查
|
| 155 |
-
print("\n"+"="*70)
|
| 156 |
-
print("🔍 验证修复结果")
|
| 157 |
-
print("="*70)
|
| 158 |
-
is_running = check_ollama_service()
|
| 159 |
-
|
| 160 |
-
# 测试生成
|
| 161 |
-
if is_running:
|
| 162 |
-
test_generation()
|
| 163 |
-
|
| 164 |
-
# 输出建议
|
| 165 |
-
print("\n"+"="*70)
|
| 166 |
-
print("💡 使用建议")
|
| 167 |
-
print("="*70)
|
| 168 |
-
|
| 169 |
-
if is_running:
|
| 170 |
-
if test_generation():
|
| 171 |
-
print("""
|
| 172 |
-
✅ Ollama 服务完全就绪!现在可以运行 GraphRAG 了
|
| 173 |
-
|
| 174 |
-
📝 在 Kaggle Notebook 中运行:
|
| 175 |
-
|
| 176 |
-
from document_processor import DocumentProcessor
|
| 177 |
-
from graph_indexer import GraphRAGIndexer
|
| 178 |
-
|
| 179 |
-
# 初始化
|
| 180 |
-
processor = DocumentProcessor()
|
| 181 |
-
vectorstore, retriever, doc_splits = processor.setup_knowledge_base(
|
| 182 |
-
enable_graphrag=True
|
| 183 |
-
)
|
| 184 |
-
|
| 185 |
-
# GraphRAG 索引(异步处理)
|
| 186 |
-
indexer = GraphRAGIndexer(
|
| 187 |
-
enable_async=True, # 启用异步
|
| 188 |
-
async_batch_size=8 # 并发处理 8 个文档
|
| 189 |
-
)
|
| 190 |
-
|
| 191 |
-
graph = indexer.index_documents(doc_splits)
|
| 192 |
-
""")
|
| 193 |
-
else:
|
| 194 |
-
print("""
|
| 195 |
-
⚠️ Ollama 服务运行中,但模型可能还在加载
|
| 196 |
-
|
| 197 |
-
💡 解决方案:
|
| 198 |
-
|
| 199 |
-
1. 等待 30-60 秒让模型完全加载
|
| 200 |
-
2. 再次运行此脚本验证
|
| 201 |
-
3. 或者直接运行一次简单测试:
|
| 202 |
-
!curl http://localhost:11434/api/generate -d '{
|
| 203 |
-
"model": "mistral",
|
| 204 |
-
"prompt": "Hello",
|
| 205 |
-
"stream": false
|
| 206 |
-
}'
|
| 207 |
-
|
| 208 |
-
4. 如果上述测试成功,就可以运行 GraphRAG 了
|
| 209 |
-
""")
|
| 210 |
-
else:
|
| 211 |
-
print("""
|
| 212 |
-
❌ Ollama 服务仍然异常
|
| 213 |
-
|
| 214 |
-
🔧 手动修复步骤:
|
| 215 |
-
|
| 216 |
-
1. 在 Kaggle Notebook 新单元格运行:
|
| 217 |
-
!pkill -9 ollama
|
| 218 |
-
!ollama serve &
|
| 219 |
-
|
| 220 |
-
2. 等待 15 秒后,运行:
|
| 221 |
-
!curl http://localhost:11434/api/tags
|
| 222 |
-
|
| 223 |
-
3. 如果成功,重新运行此脚本验证
|
| 224 |
-
|
| 225 |
-
4. 如果失败,检查 Ollama 是否正确安装:
|
| 226 |
-
!which ollama
|
| 227 |
-
!ollama --version
|
| 228 |
-
""")
|
| 229 |
-
|
| 230 |
-
print("="*70)
|
| 231 |
-
|
| 232 |
-
if __name__ == "__main__":
|
| 233 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
KAGGLE_LOAD_OLLAMA.py
DELETED
|
@@ -1,268 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Kaggle Ollama 加载脚本
|
| 3 |
-
从 Kaggle Dataset 快速加载 Ollama 和模型,无需重新下载
|
| 4 |
-
|
| 5 |
-
前置条件:
|
| 6 |
-
1. 已使用 KAGGLE_SAVE_OLLAMA.py 创建备份
|
| 7 |
-
2. 已在 Kaggle 上传 Dataset
|
| 8 |
-
3. 已在 Notebook 中添加该 Dataset
|
| 9 |
-
|
| 10 |
-
使用方法:
|
| 11 |
-
在 Kaggle Notebook 第一个单元格运行:
|
| 12 |
-
exec(open('/kaggle/working/adaptive_RAG/KAGGLE_LOAD_OLLAMA.py').read())
|
| 13 |
-
"""
|
| 14 |
-
|
| 15 |
-
import os
|
| 16 |
-
import subprocess
|
| 17 |
-
import tarfile
|
| 18 |
-
import shutil
|
| 19 |
-
import time
|
| 20 |
-
|
| 21 |
-
print("="*70)
|
| 22 |
-
print("📦 从 Dataset 加载 Ollama(快速启动)")
|
| 23 |
-
print("="*70)
|
| 24 |
-
|
| 25 |
-
# ==================== 配置 ====================
|
| 26 |
-
# 修改为你的 Dataset 名称
|
| 27 |
-
# 常见名称: ollama-mistral-backup, ollama-phi-backup, ollama-backup 等
|
| 28 |
-
DATASET_NAME = "ollama-mistral-backup" # 👈 修改这里为你的实际 Dataset 名称
|
| 29 |
-
DATASET_PATH = f"/kaggle/input/{DATASET_NAME}"
|
| 30 |
-
|
| 31 |
-
print(f"💡 提示: 如果 Dataset 不存在,请检查:")
|
| 32 |
-
print(f" 1. Dataset 是否已添加到 Notebook")
|
| 33 |
-
print(f" 2. Dataset 名称是否正确")
|
| 34 |
-
print(f" 3. 可用的 Datasets:")
|
| 35 |
-
import os
|
| 36 |
-
if os.path.exists("/kaggle/input"):
|
| 37 |
-
available = os.listdir("/kaggle/input")
|
| 38 |
-
if available:
|
| 39 |
-
for ds in available:
|
| 40 |
-
print(f" • {ds}")
|
| 41 |
-
else:
|
| 42 |
-
print(f" (无)")
|
| 43 |
-
print()
|
| 44 |
-
|
| 45 |
-
print(f"\n📋 配置:")
|
| 46 |
-
print(f" Dataset 路径: {DATASET_PATH}")
|
| 47 |
-
|
| 48 |
-
# ==================== 检查 Dataset ====================
|
| 49 |
-
print(f"\n🔍 步骤 1/5: 检查 Dataset...")
|
| 50 |
-
|
| 51 |
-
if not os.path.exists(DATASET_PATH):
|
| 52 |
-
print(f" ❌ Dataset 不存在: {DATASET_PATH}")
|
| 53 |
-
print(f"\n💡 请检查:")
|
| 54 |
-
print(f" 1. Dataset 是否已添加到 Notebook")
|
| 55 |
-
print(f" 2. Dataset 名称是否正确")
|
| 56 |
-
print(f" 3. 可用的 Datasets:")
|
| 57 |
-
|
| 58 |
-
if os.path.exists("/kaggle/input"):
|
| 59 |
-
for item in os.listdir("/kaggle/input"):
|
| 60 |
-
print(f" • {item}")
|
| 61 |
-
|
| 62 |
-
print(f"\n📝 如何添加 Dataset:")
|
| 63 |
-
print(f" 1. 点击右侧 'Add data' 按钮")
|
| 64 |
-
print(f" 2. 选择 'Your Datasets'")
|
| 65 |
-
print(f" 3. 找到你的 ollama 备份 Dataset")
|
| 66 |
-
print(f" 4. 点击 'Add'")
|
| 67 |
-
|
| 68 |
-
exit(1)
|
| 69 |
-
|
| 70 |
-
print(f" ✅ Dataset 存在")
|
| 71 |
-
|
| 72 |
-
# 列出 Dataset 内容
|
| 73 |
-
print(f"\n Dataset 内容:")
|
| 74 |
-
for item in os.listdir(DATASET_PATH):
|
| 75 |
-
item_path = os.path.join(DATASET_PATH, item)
|
| 76 |
-
if os.path.isfile(item_path):
|
| 77 |
-
size = os.path.getsize(item_path)
|
| 78 |
-
size_str = f"{size / (1024**3):.2f} GB" if size > 1024**3 else f"{size / (1024**2):.2f} MB"
|
| 79 |
-
print(f" • {item}: {size_str}")
|
| 80 |
-
|
| 81 |
-
# ==================== 安装 Ollama 二进制文件 ====================
|
| 82 |
-
print(f"\n🔧 步骤 2/5: 安装 Ollama 二进制文件...")
|
| 83 |
-
|
| 84 |
-
ollama_bin_source = os.path.join(DATASET_PATH, "ollama")
|
| 85 |
-
|
| 86 |
-
if os.path.exists(ollama_bin_source):
|
| 87 |
-
# 先停止可能正在运行的 Ollama 服务
|
| 88 |
-
print(f" 🛑 检查并停止现有 Ollama 进程...")
|
| 89 |
-
subprocess.run(['pkill', '-9', 'ollama'], capture_output=True)
|
| 90 |
-
time.sleep(2)
|
| 91 |
-
|
| 92 |
-
# 复制到系统路径
|
| 93 |
-
ollama_bin_dest = "/usr/local/bin/ollama"
|
| 94 |
-
|
| 95 |
-
try:
|
| 96 |
-
shutil.copy2(ollama_bin_source, ollama_bin_dest)
|
| 97 |
-
|
| 98 |
-
# 设置执行权限
|
| 99 |
-
os.chmod(ollama_bin_dest, 0o755)
|
| 100 |
-
|
| 101 |
-
print(f" ✅ Ollama 已安装到: {ollama_bin_dest}")
|
| 102 |
-
|
| 103 |
-
# 验证版本
|
| 104 |
-
version_result = subprocess.run(['ollama', '--version'], capture_output=True, text=True)
|
| 105 |
-
if version_result.returncode == 0:
|
| 106 |
-
print(f" 📌 {version_result.stdout.strip()}")
|
| 107 |
-
except OSError as e:
|
| 108 |
-
if "Text file busy" in str(e):
|
| 109 |
-
print(f" ⚠️ 文件被占用,尝试强制停止...")
|
| 110 |
-
subprocess.run(['killall', '-9', 'ollama'], capture_output=True)
|
| 111 |
-
time.sleep(3)
|
| 112 |
-
# 重试
|
| 113 |
-
shutil.copy2(ollama_bin_source, ollama_bin_dest)
|
| 114 |
-
os.chmod(ollama_bin_dest, 0o755)
|
| 115 |
-
print(f" ✅ Ollama 已安装(重试成功)")
|
| 116 |
-
else:
|
| 117 |
-
raise
|
| 118 |
-
else:
|
| 119 |
-
print(f" ❌ 未找到 Ollama 二进制文件")
|
| 120 |
-
exit(1)
|
| 121 |
-
|
| 122 |
-
# ==================== 解压模型文件 ====================
|
| 123 |
-
print(f"\n📦 步骤 3/5: 恢复模型文件...")
|
| 124 |
-
|
| 125 |
-
models_archive = os.path.join(DATASET_PATH, "ollama_models.tar.gz")
|
| 126 |
-
ollama_home = os.path.expanduser("~")
|
| 127 |
-
|
| 128 |
-
# 检查是否有压缩包
|
| 129 |
-
if os.path.exists(models_archive):
|
| 130 |
-
# 情况1: 有压缩包,需要解压
|
| 131 |
-
print(f" 找到模型压缩包: {os.path.getsize(models_archive) / (1024**3):.2f} GB")
|
| 132 |
-
print(f" 📦 开始解压(这可能需要 10-30 秒)...")
|
| 133 |
-
|
| 134 |
-
start_time = time.time()
|
| 135 |
-
|
| 136 |
-
with tarfile.open(models_archive, 'r:gz') as tar:
|
| 137 |
-
tar.extractall(ollama_home) # 会自动创建 ~/.ollama 目录
|
| 138 |
-
|
| 139 |
-
elapsed = time.time() - start_time
|
| 140 |
-
print(f" ✅ 解压完成(耗时: {int(elapsed)}秒)")
|
| 141 |
-
|
| 142 |
-
else:
|
| 143 |
-
# 情况2: 没有压缩包,检查是否已解压
|
| 144 |
-
print(f" ⚠️ 未找到压缩包,检查是否有解压后的文件...")
|
| 145 |
-
|
| 146 |
-
# 检查常见的解压后文件/目录
|
| 147 |
-
possible_sources = [
|
| 148 |
-
os.path.join(DATASET_PATH, ".ollama"), # 直接在根目录
|
| 149 |
-
os.path.join(DATASET_PATH, "ollama_model", ".ollama"), # 在 ollama_model 文件夹内(嵌套结构)
|
| 150 |
-
os.path.join(DATASET_PATH, "ollama_models", ".ollama"), # 在 ollama_models 文件夹内
|
| 151 |
-
os.path.join(DATASET_PATH, "ollama"), # 备用路径
|
| 152 |
-
os.path.join(DATASET_PATH, "models") # 备用路径
|
| 153 |
-
]
|
| 154 |
-
|
| 155 |
-
found = False
|
| 156 |
-
for source in possible_sources:
|
| 157 |
-
if os.path.exists(source):
|
| 158 |
-
print(f" ✅ 找到解压后的目录: {source}")
|
| 159 |
-
|
| 160 |
-
# 确定目标目录
|
| 161 |
-
if source.endswith(".ollama"):
|
| 162 |
-
# 直接复制整个 .ollama 目录
|
| 163 |
-
dest = os.path.join(ollama_home, ".ollama")
|
| 164 |
-
else:
|
| 165 |
-
# 创建 .ollama/models 目录
|
| 166 |
-
dest = os.path.join(ollama_home, ".ollama", "models")
|
| 167 |
-
os.makedirs(os.path.dirname(dest), exist_ok=True)
|
| 168 |
-
|
| 169 |
-
print(f" 📋 复制到: {dest}")
|
| 170 |
-
|
| 171 |
-
# 复制文件
|
| 172 |
-
if os.path.isdir(source):
|
| 173 |
-
shutil.copytree(source, dest, dirs_exist_ok=True)
|
| 174 |
-
else:
|
| 175 |
-
shutil.copy2(source, dest)
|
| 176 |
-
|
| 177 |
-
found = True
|
| 178 |
-
break
|
| 179 |
-
|
| 180 |
-
if not found:
|
| 181 |
-
print(f" ❌ 未找到模型文件")
|
| 182 |
-
print(f"\n Dataset 内容:")
|
| 183 |
-
for item in os.listdir(DATASET_PATH):
|
| 184 |
-
print(f" • {item}")
|
| 185 |
-
exit(1)
|
| 186 |
-
|
| 187 |
-
# 检查模型目录
|
| 188 |
-
models_dir = os.path.join(ollama_home, ".ollama")
|
| 189 |
-
if os.path.exists(models_dir):
|
| 190 |
-
total_size = sum(
|
| 191 |
-
os.path.getsize(os.path.join(dirpath, filename))
|
| 192 |
-
for dirpath, dirnames, filenames in os.walk(models_dir)
|
| 193 |
-
for filename in filenames
|
| 194 |
-
)
|
| 195 |
-
print(f" 📊 模型总大小: {total_size / (1024**3):.2f} GB")
|
| 196 |
-
else:
|
| 197 |
-
print(f" ❌ 未找到模型压缩包")
|
| 198 |
-
exit(1)
|
| 199 |
-
|
| 200 |
-
# ==================== 启动 Ollama 服务 ====================
|
| 201 |
-
print(f"\n🚀 步骤 4/5: 启动 Ollama 服务...")
|
| 202 |
-
|
| 203 |
-
# 检查是否已运行
|
| 204 |
-
ps_check = subprocess.run(['pgrep', '-f', 'ollama serve'], capture_output=True)
|
| 205 |
-
|
| 206 |
-
if ps_check.returncode == 0:
|
| 207 |
-
print(f" ✅ Ollama 服务已在运行")
|
| 208 |
-
else:
|
| 209 |
-
print(f" 🔄 启动服务...")
|
| 210 |
-
subprocess.Popen(
|
| 211 |
-
['ollama', 'serve'],
|
| 212 |
-
stdout=subprocess.PIPE,
|
| 213 |
-
stderr=subprocess.PIPE
|
| 214 |
-
)
|
| 215 |
-
|
| 216 |
-
print(f" ⏳ 等待服务启动(15秒)...")
|
| 217 |
-
time.sleep(15)
|
| 218 |
-
|
| 219 |
-
# 验证服务
|
| 220 |
-
import requests
|
| 221 |
-
try:
|
| 222 |
-
response = requests.get('http://localhost:11434/api/tags', timeout=10)
|
| 223 |
-
if response.status_code == 200:
|
| 224 |
-
print(f" ✅ Ollama 服务运行正常")
|
| 225 |
-
except Exception as e:
|
| 226 |
-
print(f" ⚠️ 服务验证失败: {e}")
|
| 227 |
-
print(f" 但可能仍在启动中...")
|
| 228 |
-
|
| 229 |
-
# ==================== 验证模型 ====================
|
| 230 |
-
print(f"\n✅ 步骤 5/5: 验证模型...")
|
| 231 |
-
|
| 232 |
-
list_result = subprocess.run(['ollama', 'list'], capture_output=True, text=True)
|
| 233 |
-
print(f"\n 可用模型:")
|
| 234 |
-
print(f" {list_result.stdout}")
|
| 235 |
-
|
| 236 |
-
# ==================== 完成 ====================
|
| 237 |
-
print("="*70)
|
| 238 |
-
print("✅ Ollama 加载完成!")
|
| 239 |
-
print("="*70)
|
| 240 |
-
|
| 241 |
-
print(f"\n📊 加载总结:")
|
| 242 |
-
print(f" • Ollama 服务: ✅ 运行中")
|
| 243 |
-
print(f" • 模型: ✅ 已加载")
|
| 244 |
-
print(f" • 总耗时: < 1 分钟")
|
| 245 |
-
|
| 246 |
-
print(f"\n💡 对比:")
|
| 247 |
-
print(f" • 传统方式: 5-10 分钟(重新下载)")
|
| 248 |
-
print(f" • Dataset 方式: < 1 分钟(直接加载)")
|
| 249 |
-
print(f" • 节省时间: 约 90%!")
|
| 250 |
-
|
| 251 |
-
print(f"\n🧪 快速测试:")
|
| 252 |
-
print(f" 在新单元格运行:")
|
| 253 |
-
print(f" !ollama run mistral 'Hi, respond in one word'")
|
| 254 |
-
|
| 255 |
-
print(f"\n📝 下一步:")
|
| 256 |
-
print(f" 继续运行你的 GraphRAG 索引:")
|
| 257 |
-
print(f"""
|
| 258 |
-
from document_processor import DocumentProcessor
|
| 259 |
-
from graph_indexer import GraphRAGIndexer
|
| 260 |
-
|
| 261 |
-
processor = DocumentProcessor()
|
| 262 |
-
vectorstore, retriever, doc_splits = processor.setup_knowledge_base(enable_graphrag=True)
|
| 263 |
-
|
| 264 |
-
indexer = GraphRAGIndexer(async_batch_size=8)
|
| 265 |
-
graph = indexer.index_documents(doc_splits)
|
| 266 |
-
""")
|
| 267 |
-
|
| 268 |
-
print("\n" + "="*70)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
KAGGLE_QUICK_START.py
DELETED
|
@@ -1,197 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Kaggle 快速启动脚本 - 避免重复下载大模型
|
| 3 |
-
使用优化的小模型配置,大幅减少启动时间
|
| 4 |
-
|
| 5 |
-
使用方法:
|
| 6 |
-
在 Kaggle Notebook 第一个单元格运行:
|
| 7 |
-
exec(open('/kaggle/working/adaptive_RAG/KAGGLE_QUICK_START.py').read())
|
| 8 |
-
"""
|
| 9 |
-
|
| 10 |
-
import os
|
| 11 |
-
import subprocess
|
| 12 |
-
import sys
|
| 13 |
-
import time
|
| 14 |
-
|
| 15 |
-
print("🚀 Kaggle 快速启动(优化版)")
|
| 16 |
-
print("="*70)
|
| 17 |
-
|
| 18 |
-
# ==================== 配置区域 ====================
|
| 19 |
-
REPO_URL = "https://github.com/LannyCodes/adaptive_RAG.git"
|
| 20 |
-
PROJECT_DIR = "/kaggle/working/adaptive_RAG"
|
| 21 |
-
|
| 22 |
-
# 模型选择(根据需求修改)
|
| 23 |
-
# "phi" - 1.6GB, 2-3分钟下载,质量好 ⭐⭐⭐⭐ (推荐)
|
| 24 |
-
# "tinyllama" - 600MB, 1分钟下载,质量中等 ⭐⭐⭐
|
| 25 |
-
# "qwen:0.5b" - 350MB, 30秒下载,质量较低 ⭐⭐
|
| 26 |
-
# "mistral" - 4GB, 5-10分钟下载,质量最好 ⭐⭐⭐⭐⭐ (慢)
|
| 27 |
-
|
| 28 |
-
PREFERRED_MODEL = "phi" # 👈 修改这里选择模型
|
| 29 |
-
|
| 30 |
-
print(f"\n📌 配置:")
|
| 31 |
-
print(f" • 仓库: {REPO_URL}")
|
| 32 |
-
print(f" • 模型: {PREFERRED_MODEL}")
|
| 33 |
-
print()
|
| 34 |
-
|
| 35 |
-
# ==================== 步骤 1: 克隆项目 ====================
|
| 36 |
-
print("📦 步骤 1/6: 克隆项目...")
|
| 37 |
-
|
| 38 |
-
os.chdir('/kaggle/working')
|
| 39 |
-
|
| 40 |
-
if os.path.exists(PROJECT_DIR):
|
| 41 |
-
print(" ✅ 项目已存在")
|
| 42 |
-
else:
|
| 43 |
-
result = subprocess.run(['git', 'clone', REPO_URL], capture_output=True, text=True)
|
| 44 |
-
if result.returncode == 0:
|
| 45 |
-
print(" ✅ 项目克隆成功")
|
| 46 |
-
else:
|
| 47 |
-
print(f" ❌ 克隆失败: {result.stderr}")
|
| 48 |
-
sys.exit(1)
|
| 49 |
-
|
| 50 |
-
os.chdir(PROJECT_DIR)
|
| 51 |
-
|
| 52 |
-
# ==================== 步骤 2: 修改配置使用小模型 ====================
|
| 53 |
-
print("\n⚙️ 步骤 2/6: 优化模型配置...")
|
| 54 |
-
|
| 55 |
-
config_file = 'config.py'
|
| 56 |
-
|
| 57 |
-
with open(config_file, 'r', encoding='utf-8') as f:
|
| 58 |
-
content = f.read()
|
| 59 |
-
|
| 60 |
-
# 替换模型配置
|
| 61 |
-
if 'LOCAL_LLM = "mistral"' in content:
|
| 62 |
-
content = content.replace(
|
| 63 |
-
'LOCAL_LLM = "mistral"',
|
| 64 |
-
f'LOCAL_LLM = "{PREFERRED_MODEL}" # Kaggle优化: 使用更小的模型'
|
| 65 |
-
)
|
| 66 |
-
|
| 67 |
-
with open(config_file, 'w', encoding='utf-8') as f:
|
| 68 |
-
f.write(content)
|
| 69 |
-
|
| 70 |
-
print(f" ✅ 已切换到 {PREFERRED_MODEL} 模型")
|
| 71 |
-
else:
|
| 72 |
-
print(f" ℹ️ 配置已是优化模式")
|
| 73 |
-
|
| 74 |
-
# ==================== 步骤 3: 检查并安装 Ollama ====================
|
| 75 |
-
print("\n🔧 步骤 3/6: 检查 Ollama...")
|
| 76 |
-
|
| 77 |
-
ollama_check = subprocess.run(['which', 'ollama'], capture_output=True)
|
| 78 |
-
|
| 79 |
-
if ollama_check.returncode == 0:
|
| 80 |
-
print(" ✅ Ollama 已安装")
|
| 81 |
-
else:
|
| 82 |
-
print(" 📥 安装 Ollama...")
|
| 83 |
-
subprocess.run('curl -fsSL https://ollama.com/install.sh | sh', shell=True)
|
| 84 |
-
time.sleep(3)
|
| 85 |
-
print(" ✅ Ollama 安装完成")
|
| 86 |
-
|
| 87 |
-
# 验证安装
|
| 88 |
-
version_result = subprocess.run(['ollama', '--version'], capture_output=True, text=True)
|
| 89 |
-
if version_result.returncode == 0:
|
| 90 |
-
print(f" 📌 {version_result.stdout.strip()}")
|
| 91 |
-
|
| 92 |
-
# ==================== 步骤 4: 启动 Ollama 服务 ====================
|
| 93 |
-
print("\n🚀 步骤 4/6: 启动 Ollama 服务...")
|
| 94 |
-
|
| 95 |
-
# 检查是否已运行
|
| 96 |
-
ps_check = subprocess.run(['pgrep', '-f', 'ollama serve'], capture_output=True)
|
| 97 |
-
|
| 98 |
-
if ps_check.returncode == 0:
|
| 99 |
-
print(" ✅ Ollama 服务已运行")
|
| 100 |
-
else:
|
| 101 |
-
print(" 🔄 启动服务...")
|
| 102 |
-
subprocess.Popen(['ollama', 'serve'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
| 103 |
-
time.sleep(15)
|
| 104 |
-
|
| 105 |
-
# 验证
|
| 106 |
-
import requests
|
| 107 |
-
try:
|
| 108 |
-
response = requests.get('http://localhost:11434/api/tags', timeout=10)
|
| 109 |
-
if response.status_code == 200:
|
| 110 |
-
print(" ✅ 服务运行正常")
|
| 111 |
-
except:
|
| 112 |
-
print(" ⚠️ 服务验证失败,但可能仍在启动中...")
|
| 113 |
-
|
| 114 |
-
# ==================== 步骤 5: 下载优化的模型 ====================
|
| 115 |
-
print(f"\n📦 步骤 5/6: 下载 {PREFERRED_MODEL} 模型...")
|
| 116 |
-
|
| 117 |
-
# 检查模型是否已存在
|
| 118 |
-
list_result = subprocess.run(['ollama', 'list'], capture_output=True, text=True)
|
| 119 |
-
|
| 120 |
-
if PREFERRED_MODEL in list_result.stdout:
|
| 121 |
-
print(f" ✅ {PREFERRED_MODEL} 模型已存在")
|
| 122 |
-
else:
|
| 123 |
-
# 显示预计时间
|
| 124 |
-
time_estimates = {
|
| 125 |
-
"qwen:0.5b": "约30秒",
|
| 126 |
-
"tinyllama": "约1分钟",
|
| 127 |
-
"phi": "约2-3分钟",
|
| 128 |
-
"mistral": "约5-10分钟"
|
| 129 |
-
}
|
| 130 |
-
|
| 131 |
-
estimated_time = time_estimates.get(PREFERRED_MODEL, "未知")
|
| 132 |
-
|
| 133 |
-
print(f" 📥 开始下载(预计时间: {estimated_time})...")
|
| 134 |
-
print(f" ⏳ 请稍候...")
|
| 135 |
-
|
| 136 |
-
start_time = time.time()
|
| 137 |
-
|
| 138 |
-
pull_result = subprocess.run(
|
| 139 |
-
['ollama', 'pull', PREFERRED_MODEL],
|
| 140 |
-
capture_output=True,
|
| 141 |
-
text=True
|
| 142 |
-
)
|
| 143 |
-
|
| 144 |
-
elapsed = time.time() - start_time
|
| 145 |
-
|
| 146 |
-
if pull_result.returncode == 0:
|
| 147 |
-
print(f" ✅ 模型下载完成(耗时: {int(elapsed)}秒)")
|
| 148 |
-
else:
|
| 149 |
-
print(f" ⚠️ 下载警告: {pull_result.stderr[:200]}")
|
| 150 |
-
|
| 151 |
-
# ==================== 步骤 6: 安装 Python 依赖 ====================
|
| 152 |
-
print("\n📦 步骤 6/6: 安装 Python 依赖...")
|
| 153 |
-
|
| 154 |
-
subprocess.run([sys.executable, '-m', 'pip', 'install', '-r', 'requirements_graphrag.txt', '-q'])
|
| 155 |
-
subprocess.run([sys.executable, '-m', 'pip', 'install', '-U',
|
| 156 |
-
'langchain', 'langchain-core', 'langchain-community',
|
| 157 |
-
'langchain-text-splitters', '-q'])
|
| 158 |
-
|
| 159 |
-
print(" ✅ 依赖安装完成")
|
| 160 |
-
|
| 161 |
-
# ==================== 设置 Python 路径 ====================
|
| 162 |
-
if PROJECT_DIR not in sys.path:
|
| 163 |
-
sys.path.insert(0, PROJECT_DIR)
|
| 164 |
-
|
| 165 |
-
# ==================== 完成 ====================
|
| 166 |
-
print("\n" + "="*70)
|
| 167 |
-
print("✅ 环境准备完成!")
|
| 168 |
-
print("="*70)
|
| 169 |
-
|
| 170 |
-
print(f"\n📊 配置摘要:")
|
| 171 |
-
print(f" • 工作目录: {os.getcwd()}")
|
| 172 |
-
print(f" • 使用模型: {PREFERRED_MODEL}")
|
| 173 |
-
print(f" • Python路径: 已添加")
|
| 174 |
-
|
| 175 |
-
# 显示模型对比
|
| 176 |
-
print(f"\n📌 模型选择说明:")
|
| 177 |
-
print(" • phi (当前) - 平衡速度和质量,推荐日常使用")
|
| 178 |
-
print(" • tinyllama - 最快下载,适合快速测试")
|
| 179 |
-
print(" • mistral - 质量最高,但下载慢(不推荐Kaggle)")
|
| 180 |
-
|
| 181 |
-
print(f"\n💡 下一步:")
|
| 182 |
-
print(" 1. 开始 GraphRAG 索引:")
|
| 183 |
-
print(" from document_processor import DocumentProcessor")
|
| 184 |
-
print(" from graph_indexer import GraphRAGIndexer")
|
| 185 |
-
print(" ")
|
| 186 |
-
print(" doc_processor = DocumentProcessor()")
|
| 187 |
-
print(" vectorstore, retriever, doc_splits = doc_processor.setup_knowledge_base(enable_graphrag=True)")
|
| 188 |
-
print(" ")
|
| 189 |
-
print(" indexer = GraphRAGIndexer()")
|
| 190 |
-
print(" graph = indexer.index_documents(doc_splits, batch_size=3)")
|
| 191 |
-
print()
|
| 192 |
-
print(" 2. 如需切换模型,修改脚本顶部的 PREFERRED_MODEL 变量")
|
| 193 |
-
|
| 194 |
-
print("\n⚠️ 提示:")
|
| 195 |
-
print(f" • 当前使用 {PREFERRED_MODEL} 模型,比 Mistral 快 {2 if PREFERRED_MODEL == 'phi' else 5}x")
|
| 196 |
-
print(" • 会话结束后仍需重新下载(但速度已大幅提升)")
|
| 197 |
-
print(" • 如需最佳质量,本地开发时可用 Mistral")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
KAGGLE_SAVE_OLLAMA.py
DELETED
|
@@ -1,282 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Kaggle Ollama 保存脚本
|
| 3 |
-
将 Ollama 和模型保存到 Kaggle Dataset,下次直接使用
|
| 4 |
-
|
| 5 |
-
使用步骤:
|
| 6 |
-
1. 首次运行: 安装 Ollama 和下载模型后,运行本脚本保存
|
| 7 |
-
2. 后续使用: 使用 KAGGLE_LOAD_OLLAMA.py 从 Dataset 加载
|
| 8 |
-
|
| 9 |
-
注意: 需要手动创建 Kaggle Dataset 并上传
|
| 10 |
-
"""
|
| 11 |
-
|
| 12 |
-
import os
|
| 13 |
-
import subprocess
|
| 14 |
-
import shutil
|
| 15 |
-
import tarfile
|
| 16 |
-
import time
|
| 17 |
-
from pathlib import Path
|
| 18 |
-
|
| 19 |
-
print("="*70)
|
| 20 |
-
print("💾 Kaggle Ollama 保存工具")
|
| 21 |
-
print("="*70)
|
| 22 |
-
|
| 23 |
-
# ==================== 配置 ====================
|
| 24 |
-
OUTPUT_DIR = "/kaggle/working/ollama_backup"
|
| 25 |
-
MODEL_NAME = "mistral" # 或者 "phi", "tinyllama" 等
|
| 26 |
-
|
| 27 |
-
print(f"\n📋 配置:")
|
| 28 |
-
print(f" 模型: {MODEL_NAME}")
|
| 29 |
-
print(f" 输出目录: {OUTPUT_DIR}")
|
| 30 |
-
|
| 31 |
-
# ==================== 步骤 1: 创建输出目录 ====================
|
| 32 |
-
print(f"\n📁 步骤 1/4: 创建备份目录...")
|
| 33 |
-
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
| 34 |
-
print(f" ✅ 目录创建成功")
|
| 35 |
-
|
| 36 |
-
# ==================== 步骤 2: 备份 Ollama 二进制文件 ====================
|
| 37 |
-
print(f"\n📦 步骤 2/4: 备份 Ollama 二进制文件...")
|
| 38 |
-
|
| 39 |
-
ollama_bin = shutil.which('ollama')
|
| 40 |
-
if ollama_bin:
|
| 41 |
-
print(f" 找到 Ollama: {ollama_bin}")
|
| 42 |
-
|
| 43 |
-
# 复制二进制文件
|
| 44 |
-
shutil.copy2(ollama_bin, os.path.join(OUTPUT_DIR, 'ollama'))
|
| 45 |
-
print(f" ✅ Ollama 二进制文件已备份")
|
| 46 |
-
else:
|
| 47 |
-
print(f" ❌ 未找到 Ollama,请先安装")
|
| 48 |
-
exit(1)
|
| 49 |
-
|
| 50 |
-
# ==================== 步骤 3: 备份模型文件 ====================
|
| 51 |
-
print(f"\n🤖 步骤 3/4: 备份 {MODEL_NAME} 模型...")
|
| 52 |
-
|
| 53 |
-
# Ollama 模型存储位置(可能在不同位置)
|
| 54 |
-
possible_model_dirs = [
|
| 55 |
-
os.path.expanduser("~/.ollama/models"),
|
| 56 |
-
"/root/.ollama/models",
|
| 57 |
-
os.path.expanduser("~/.ollama")
|
| 58 |
-
]
|
| 59 |
-
|
| 60 |
-
ollama_models_dir = None
|
| 61 |
-
for dir_path in possible_model_dirs:
|
| 62 |
-
if os.path.exists(dir_path) and os.path.isdir(dir_path):
|
| 63 |
-
# 检查是否有内容
|
| 64 |
-
if os.listdir(dir_path):
|
| 65 |
-
ollama_models_dir = os.path.dirname(dir_path) if dir_path.endswith('models') else dir_path
|
| 66 |
-
break
|
| 67 |
-
|
| 68 |
-
if ollama_models_dir and os.path.exists(ollama_models_dir):
|
| 69 |
-
print(f" 找到模型目录: {ollama_models_dir}")
|
| 70 |
-
|
| 71 |
-
# 计算目录大小
|
| 72 |
-
total_size = sum(
|
| 73 |
-
os.path.getsize(os.path.join(dirpath, filename))
|
| 74 |
-
for dirpath, dirnames, filenames in os.walk(ollama_models_dir)
|
| 75 |
-
for filename in filenames
|
| 76 |
-
)
|
| 77 |
-
print(f" 模型总大小: {total_size / (1024**3):.2f} GB")
|
| 78 |
-
|
| 79 |
-
# 创建压缩包(整个 .ollama 目录)
|
| 80 |
-
models_archive = os.path.join(OUTPUT_DIR, 'ollama_models.tar.gz')
|
| 81 |
-
print(f" 📦 创建压缩包(这可能需要几分钟)...")
|
| 82 |
-
print(f" 正在压缩: {ollama_models_dir}")
|
| 83 |
-
|
| 84 |
-
start_time = time.time()
|
| 85 |
-
with tarfile.open(models_archive, 'w:gz') as tar:
|
| 86 |
-
tar.add(ollama_models_dir, arcname='.ollama')
|
| 87 |
-
|
| 88 |
-
elapsed = time.time() - start_time
|
| 89 |
-
archive_size = os.path.getsize(models_archive) / (1024**3)
|
| 90 |
-
|
| 91 |
-
print(f" ✅ 压缩完成")
|
| 92 |
-
print(f" 耗时: {int(elapsed)}秒")
|
| 93 |
-
print(f" 压缩包大小: {archive_size:.2f} GB")
|
| 94 |
-
else:
|
| 95 |
-
print(f" ❌ 未找到模型目录")
|
| 96 |
-
print(f" 请先运行: ollama pull {MODEL_NAME}")
|
| 97 |
-
exit(1)
|
| 98 |
-
|
| 99 |
-
# ==================== 步骤 4: 生成说明文件 ====================
|
| 100 |
-
print(f"\n📝 步骤 4/4: 生成说明文件...")
|
| 101 |
-
|
| 102 |
-
readme_content = f"""# Ollama 备份包
|
| 103 |
-
|
| 104 |
-
## 内容
|
| 105 |
-
- `ollama`: Ollama 二进制文件
|
| 106 |
-
- `ollama_models.tar.gz`: 模型文件压缩包(包含 {MODEL_NAME})
|
| 107 |
-
|
| 108 |
-
## 备份信息
|
| 109 |
-
- 备份时间: {time.strftime('%Y-%m-%d %H:%M:%S')}
|
| 110 |
-
- 模型: {MODEL_NAME}
|
| 111 |
-
- 压缩包大小: {archive_size:.2f} GB
|
| 112 |
-
|
| 113 |
-
## 使用方法
|
| 114 |
-
|
| 115 |
-
### 1. 创建 Kaggle Dataset
|
| 116 |
-
|
| 117 |
-
1. 下载此目录中的所有文件到本地
|
| 118 |
-
2. 在 Kaggle 网站创建新 Dataset:
|
| 119 |
-
- 访问: https://www.kaggle.com/datasets
|
| 120 |
-
- 点击 "New Dataset"
|
| 121 |
-
- 上传 `ollama` 和 `ollama_models.tar.gz`
|
| 122 |
-
- 命名为: `ollama-{MODEL_NAME}-backup`
|
| 123 |
-
- 设置为 Private
|
| 124 |
-
- 点击 "Create"
|
| 125 |
-
|
| 126 |
-
### 2. 在 Notebook 中加载
|
| 127 |
-
|
| 128 |
-
在 Kaggle Notebook 中:
|
| 129 |
-
|
| 130 |
-
1. 添加 Dataset:
|
| 131 |
-
- 点击右侧 "Add data" → "Your Datasets"
|
| 132 |
-
- 选择你创建的 `ollama-{MODEL_NAME}-backup`
|
| 133 |
-
|
| 134 |
-
2. 运行加载脚本:
|
| 135 |
-
```python
|
| 136 |
-
# 使用项目中的 KAGGLE_LOAD_OLLAMA.py
|
| 137 |
-
exec(open('/kaggle/working/adaptive_RAG/KAGGLE_LOAD_OLLAMA.py').read())
|
| 138 |
-
```
|
| 139 |
-
|
| 140 |
-
### 3. 验证
|
| 141 |
-
|
| 142 |
-
```bash
|
| 143 |
-
# 检查 Ollama
|
| 144 |
-
ollama --version
|
| 145 |
-
|
| 146 |
-
# 检查模型
|
| 147 |
-
ollama list
|
| 148 |
-
|
| 149 |
-
# 测试运行
|
| 150 |
-
ollama run {MODEL_NAME} "Hello"
|
| 151 |
-
```
|
| 152 |
-
|
| 153 |
-
## 文件大小参考
|
| 154 |
-
|
| 155 |
-
不同模型的压缩包大小(近似值):
|
| 156 |
-
- qwen:0.5b: ~350 MB
|
| 157 |
-
- tinyllama: ~600 MB
|
| 158 |
-
- phi: ~1.6 GB
|
| 159 |
-
- mistral: ~4 GB
|
| 160 |
-
- llama2:7b: ~3.8 GB
|
| 161 |
-
|
| 162 |
-
## 注意事项
|
| 163 |
-
|
| 164 |
-
1. ⚠️ Dataset 大小限制:
|
| 165 |
-
- 免费用户: 每个 Dataset 最大 20GB
|
| 166 |
-
- 需要确保压缩包 < 20GB
|
| 167 |
-
|
| 168 |
-
2. ⚠️ 上传时间:
|
| 169 |
-
- 取决于你的网络速度
|
| 170 |
-
- 4GB 文件可能需要 10-30 分钟
|
| 171 |
-
|
| 172 |
-
3. ✅ 优势:
|
| 173 |
-
- 只需上传一次
|
| 174 |
-
- 每次 Notebook 启动时直接加载(秒级)
|
| 175 |
-
- 节省大量时间
|
| 176 |
-
|
| 177 |
-
## 故障排除
|
| 178 |
-
|
| 179 |
-
### 问题: 上传失败
|
| 180 |
-
解决: 检查网络连接,或分多次上传
|
| 181 |
-
|
| 182 |
-
### 问题: Dataset 过大
|
| 183 |
-
解决: 使用更小的模型(如 phi 或 tinyllama)
|
| 184 |
-
|
| 185 |
-
### 问题: 加载后 Ollama 无法运行
|
| 186 |
-
解决: 检查文件权限,运行 `chmod +x /usr/local/bin/ollama`
|
| 187 |
-
"""
|
| 188 |
-
|
| 189 |
-
readme_file = os.path.join(OUTPUT_DIR, 'README.md')
|
| 190 |
-
with open(readme_file, 'w', encoding='utf-8') as f:
|
| 191 |
-
f.write(readme_content)
|
| 192 |
-
|
| 193 |
-
print(f" ✅ 说明文件已生成")
|
| 194 |
-
|
| 195 |
-
# ==================== 生成加载脚本(供参考) ====================
|
| 196 |
-
loader_script = os.path.join(OUTPUT_DIR, 'load_example.py')
|
| 197 |
-
with open(loader_script, 'w', encoding='utf-8') as f:
|
| 198 |
-
f.write(f'''"""
|
| 199 |
-
示例: 从 Kaggle Dataset 加载 Ollama
|
| 200 |
-
"""
|
| 201 |
-
import os
|
| 202 |
-
import subprocess
|
| 203 |
-
import tarfile
|
| 204 |
-
import shutil
|
| 205 |
-
|
| 206 |
-
# Dataset 路径(根据你的 Dataset 名称修改)
|
| 207 |
-
DATASET_PATH = "/kaggle/input/ollama-{MODEL_NAME}-backup"
|
| 208 |
-
|
| 209 |
-
print("📦 从 Dataset 加载 Ollama...")
|
| 210 |
-
|
| 211 |
-
# 1. 复制 Ollama 二进制文件
|
| 212 |
-
ollama_bin = os.path.join(DATASET_PATH, "ollama")
|
| 213 |
-
if os.path.exists(ollama_bin):
|
| 214 |
-
shutil.copy2(ollama_bin, "/usr/local/bin/ollama")
|
| 215 |
-
os.chmod("/usr/local/bin/ollama", 0o755)
|
| 216 |
-
print("✅ Ollama 二进制文件已安装")
|
| 217 |
-
|
| 218 |
-
# 2. 解压模型文件
|
| 219 |
-
models_archive = os.path.join(DATASET_PATH, "ollama_models.tar.gz")
|
| 220 |
-
if os.path.exists(models_archive):
|
| 221 |
-
print("📦 解压模型文件...")
|
| 222 |
-
with tarfile.open(models_archive, 'r:gz') as tar:
|
| 223 |
-
tar.extractall(os.path.expanduser("~/.ollama"))
|
| 224 |
-
print("✅ 模型已解压")
|
| 225 |
-
|
| 226 |
-
# 3. 启动 Ollama 服务
|
| 227 |
-
print("🚀 启动 Ollama 服务...")
|
| 228 |
-
subprocess.Popen(['ollama', 'serve'])
|
| 229 |
-
import time
|
| 230 |
-
time.sleep(15)
|
| 231 |
-
|
| 232 |
-
print("✅ Ollama 已准备就绪!")
|
| 233 |
-
print("\\n验证:")
|
| 234 |
-
subprocess.run(['ollama', 'list'])
|
| 235 |
-
''')
|
| 236 |
-
|
| 237 |
-
print(f" ✅ 示例脚本已生成")
|
| 238 |
-
|
| 239 |
-
# ==================== 显示文件列表 ====================
|
| 240 |
-
print(f"\n📊 备份内容:")
|
| 241 |
-
for item in os.listdir(OUTPUT_DIR):
|
| 242 |
-
item_path = os.path.join(OUTPUT_DIR, item)
|
| 243 |
-
size = os.path.getsize(item_path)
|
| 244 |
-
size_str = f"{size / (1024**3):.2f} GB" if size > 1024**3 else f"{size / (1024**2):.2f} MB"
|
| 245 |
-
print(f" • {item}: {size_str}")
|
| 246 |
-
|
| 247 |
-
# ==================== 后续步骤说明 ====================
|
| 248 |
-
print("\n" + "="*70)
|
| 249 |
-
print("✅ 备份完成!")
|
| 250 |
-
print("="*70)
|
| 251 |
-
|
| 252 |
-
print(f"\n📋 后续步骤:")
|
| 253 |
-
print(f"""
|
| 254 |
-
1. 下载备份文件到本地:
|
| 255 |
-
在 Kaggle Notebook 右侧 Output 中下载 {OUTPUT_DIR} 目录
|
| 256 |
-
|
| 257 |
-
2. 创建 Kaggle Dataset:
|
| 258 |
-
• 访问: https://www.kaggle.com/datasets
|
| 259 |
-
• 点击 "New Dataset"
|
| 260 |
-
• 上传以下文件:
|
| 261 |
-
- ollama (二进制文件)
|
| 262 |
-
- ollama_models.tar.gz (模型压缩包)
|
| 263 |
-
• 命名: ollama-{MODEL_NAME}-backup
|
| 264 |
-
• 点击 "Create"
|
| 265 |
-
|
| 266 |
-
3. 下次使用:
|
| 267 |
-
• 在 Notebook 中添加你的 Dataset
|
| 268 |
-
• 运行 KAGGLE_LOAD_OLLAMA.py 脚本
|
| 269 |
-
• 即可秒级加载,无需重新下载!
|
| 270 |
-
|
| 271 |
-
⏱️ 时间对比:
|
| 272 |
-
• 传统方式: 每次启动需要 5-10 分钟下载
|
| 273 |
-
• Dataset 方式: 每次启动只需 10-20 秒加载
|
| 274 |
-
• 节省时间: 每次节省 5-10 分钟!
|
| 275 |
-
|
| 276 |
-
💡 提示:
|
| 277 |
-
• 上传 Dataset 是一次性工作
|
| 278 |
-
• 之后每次 Notebook 启动都能快速加载
|
| 279 |
-
• 强烈推荐!
|
| 280 |
-
""")
|
| 281 |
-
|
| 282 |
-
print("\n查看详细说明: cat {}/README.md".format(OUTPUT_DIR))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|