Spaces:
Paused
Paused
lanny xu
committed on
Commit
·
94f5b16
1
Parent(s):
df14376
delete vectara
Browse files
- kaggle_simple_multimodal.py +28 -6
kaggle_simple_multimodal.py
CHANGED
|
@@ -127,8 +127,8 @@ def scan_and_copy_files():
|
|
| 127 |
# 递归扫描所有文件
|
| 128 |
for root, dirs, files in os.walk(input_dir):
|
| 129 |
for file in files:
|
| 130 |
-
#
|
| 131 |
-
if not file or file.startswith('.')
|
| 132 |
continue
|
| 133 |
|
| 134 |
# 调试:显示所有文件
|
|
@@ -175,15 +175,14 @@ def main():
|
|
| 175 |
# 检查文件
|
| 176 |
working_dir = '/kaggle/working'
|
| 177 |
|
| 178 |
-
# 过滤有效的PDF
|
| 179 |
try:
|
| 180 |
all_files = os.listdir(working_dir)
|
| 181 |
|
| 182 |
-
#
|
| 183 |
pdf_files = [
|
| 184 |
f for f in all_files
|
| 185 |
-
if f.lower().endswith('.pdf') #
|
| 186 |
-
and len(f) > 4 # 确保不只是 '.pdf'
|
| 187 |
and not f.startswith('.') # 排除隐藏文件
|
| 188 |
and os.path.isfile(os.path.join(working_dir, f)) # 确保是文件
|
| 189 |
]
|
|
@@ -197,8 +196,31 @@ def main():
|
|
| 197 |
print(f"❌ 扫描文件时出错: {e}")
|
| 198 |
pdf_files = []
|
| 199 |
image_files = []
|
|
|
|
| 200 |
|
| 201 |
print(f"\n📁 /kaggle/working/ 中的文件:")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
print(f" - PDF文件: {len(pdf_files)} 个")
|
| 203 |
for pdf in pdf_files:
|
| 204 |
pdf_path = os.path.join(working_dir, pdf)
|
|
|
|
| 127 |
# 递归扫描所有文件
|
| 128 |
for root, dirs, files in os.walk(input_dir):
|
| 129 |
for file in files:
|
| 130 |
+
# 跳过隐藏文件和空文件名
|
| 131 |
+
if not file or file.startswith('.'):
|
| 132 |
continue
|
| 133 |
|
| 134 |
# 调试:显示所有文件
|
|
|
|
| 175 |
# 检查文件
|
| 176 |
working_dir = '/kaggle/working'
|
| 177 |
|
| 178 |
+
# 过滤有效的PDF文件(排除隐藏文件)
|
| 179 |
try:
|
| 180 |
all_files = os.listdir(working_dir)
|
| 181 |
|
| 182 |
+
# 修复:移除文件名长度限制,支持 .pdf 等短文件名
|
| 183 |
pdf_files = [
|
| 184 |
f for f in all_files
|
| 185 |
+
if f.lower().endswith('.pdf') # 小写比较
|
|
|
|
| 186 |
and not f.startswith('.') # 排除隐藏文件
|
| 187 |
and os.path.isfile(os.path.join(working_dir, f)) # 确保是文件
|
| 188 |
]
|
|
|
|
| 196 |
print(f"❌ 扫描文件时出错: {e}")
|
| 197 |
pdf_files = []
|
| 198 |
image_files = []
|
| 199 |
+
all_files = []
|
| 200 |
|
| 201 |
print(f"\n📁 /kaggle/working/ 中的文件:")
|
| 202 |
+
|
| 203 |
+
# 调试:详细显示所有文件和过滤过程
|
| 204 |
+
print("\n🔍 详细调试信息:")
|
| 205 |
+
print(f" 目录中总共 {len(all_files)} 个项目")
|
| 206 |
+
for f in all_files:
|
| 207 |
+
f_path = os.path.join(working_dir, f)
|
| 208 |
+
is_file = os.path.isfile(f_path)
|
| 209 |
+
is_dir = os.path.isdir(f_path)
|
| 210 |
+
f_lower = f.lower()
|
| 211 |
+
|
| 212 |
+
# 检查 PDF
|
| 213 |
+
if f_lower.endswith('.pdf'):
|
| 214 |
+
file_size = os.path.getsize(f_path) if is_file else 0
|
| 215 |
+
print(f" 📄 {f}: 是文件={is_file}, 大小={file_size/1024:.1f}KB, 长度={len(f)}")
|
| 216 |
+
# 检查图片
|
| 217 |
+
elif any(f_lower.endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp']):
|
| 218 |
+
file_size = os.path.getsize(f_path) if is_file else 0
|
| 219 |
+
print(f" 🖼️ {f}: 是文件={is_file}, 大小={file_size/1024:.1f}KB")
|
| 220 |
+
else:
|
| 221 |
+
print(f" ⚪ {f}: 类型={'[目录]' if is_dir else '[文件]'}")
|
| 222 |
+
|
| 223 |
+
print(f"\n📊 过滤结果:")
|
| 224 |
print(f" - PDF文件: {len(pdf_files)} 个")
|
| 225 |
for pdf in pdf_files:
|
| 226 |
pdf_path = os.path.join(working_dir, pdf)
|