Unverified Commit 5528dd9d authored by dcasota's avatar dcasota Committed by GitHub
Browse files

Error handling load_single_document() in ingest.py (#4852)

load_single_document() handles
- corrupt files
- empty (zero byte) files
- unsupported file extensions
parent 943172cb
......@@ -77,13 +77,21 @@ LOADER_MAPPING = {
def load_single_document(file_path: str) -> List[Document]:
ext = "." + file_path.rsplit(".", 1)[-1]
if ext in LOADER_MAPPING:
loader_class, loader_args = LOADER_MAPPING[ext]
loader = loader_class(file_path, **loader_args)
return loader.load()
if os.path.getsize(file_path) != 0:
filename, ext = os.path.splitext(file_path)
if ext in LOADER_MAPPING:
loader_class, loader_args = LOADER_MAPPING[ext]
try:
loader = loader_class(file_path, **loader_args)
if loader:
return loader.load()
except:
print(f"Corrupted file {file_path}. Ignoring it.")
else:
print(f"Unsupported file {file_path}. Ignoring it.")
else:
print(f"Empty file {file_path}. Ignoring it.")
raise ValueError(f"Unsupported file extension '{ext}'")
def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Document]:
"""
......@@ -100,7 +108,8 @@ def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Docum
results = []
with tqdm(total=len(filtered_files), desc='Loading new documents', ncols=80) as pbar:
for i, docs in enumerate(pool.imap_unordered(load_single_document, filtered_files)):
results.extend(docs)
if docs:
results.extend(docs)
pbar.update()
return results
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment