Spaces:
Runtime error
Runtime error
| import os | |
# Optional extra fields (for instance a source URL) attached to individual
# PDFs. Keys are "/"-separated file paths starting at the "data" directory;
# each value is a dict that get_sources() merges into that file's entry.
metadata_by_file_path = {
    "data/Daoism/Tao_Te_Ching.pdf": {
        "url": "https://www.with.org/tao_te_ching_en.pdf",
    },
    "data/Confucianism/Analects of Confucius.pdf": {
        "url": "https://chinatxt.sitehost.iu.edu/Analects_of_Confucius_(Eno-2015).pdf",
    },
}
def get_domains():
    """Return the names of all directories found beneath ``data``.

    The tree is walked recursively with ``os.walk``, so directory names from
    every depth are included, in the order the walk reports them. Names are
    returned as they appear on disk. An empty list is returned when ``data``
    does not exist or has no subdirectories.
    """
    return [
        subdir_name
        for _root, subdir_names, _files in os.walk("data")
        for subdir_name in subdir_names
    ]
def get_sources():
    """Collect one index entry for every PDF found beneath ``data``.

    The tree is walked recursively. Each file whose name ends with ``.pdf``
    produces a dict with the keys ``"domain"`` (from ``parse_domain``),
    ``"name"`` (from ``parse_name``) and ``"file_path"`` (the path as built
    by ``os.path.join``), plus any extra fields registered for that file in
    ``metadata_by_file_path``.

    Returns:
        A list of entry dicts, in the order ``os.walk`` reports the files.
    """
    res = []
    for root, _dirs, files in os.walk("data"):
        for file in files:
            if not file.endswith(".pdf"):
                continue
            file_path = os.path.join(root, file)
            print("file_path", file_path)
            # The metadata table is keyed with "/" separators, while
            # os.path.join uses the platform separator. Convert for the
            # lookup only, so entries are still found on Windows.
            metadata_key = file_path.replace(os.sep, "/")
            res.append(
                {
                    "domain": parse_domain(file_path),
                    "name": parse_name(file_path),
                    "file_path": file_path,
                    **metadata_by_file_path.get(metadata_key, {}),
                }
            )
    return res
def parse_name(source: str) -> str:
    """Return a display name for the file at *source*.

    The name is the final path component with its last extension removed and
    every underscore replaced by a space.
    """
    stem = os.path.splitext(os.path.basename(source))[0]
    return stem.replace("_", " ")
def parse_domain(source: str) -> str:
    """Return the domain for the file at *source*.

    The domain is the second component of *source* when it is split on the
    platform path separator (``Daoism`` for ``data/Daoism/book.pdf`` on
    POSIX), with every underscore replaced by a space.

    Raises:
        IndexError: If *source* contains no path separator.
    """
    components = source.split(os.sep)
    second_component = components[1]
    return second_component.replace("_", " ")