"dlib/vscode:/vscode.git/clone" did not exist on "6f353f30be43b3dac77a1ad9c0253440542564bc"
Unverified commit dd96663c, authored by Xiaomeng Zhao and committed by GitHub
Browse files

Merge pull request #2088 from opendatalab/dev

fix: support non-pdf file in batch mode
parents efbd00bf bb40b9b6
...@@ -137,6 +137,17 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id): ...@@ -137,6 +137,17 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
doc_paths = [] doc_paths = []
for doc_path in Path(path).glob('*'): for doc_path in Path(path).glob('*'):
if doc_path.suffix in pdf_suffixes + image_suffixes + ms_office_suffixes: if doc_path.suffix in pdf_suffixes + image_suffixes + ms_office_suffixes:
if doc_path.suffix in ms_office_suffixes:
convert_file_to_pdf(str(doc_path), temp_dir)
doc_path = Path(os.path.join(temp_dir, f'{doc_path.stem}.pdf'))
elif doc_path.suffix in image_suffixes:
with open(str(doc_path), 'rb') as f:
bits = f.read()
pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
fn = os.path.join(temp_dir, f'{doc_path.stem}.pdf')
with open(fn, 'wb') as f:
f.write(pdf_bytes)
doc_path = Path(fn)
doc_paths.append(doc_path) doc_paths.append(doc_path)
datasets = batch_build_dataset(doc_paths, 4, lang) datasets = batch_build_dataset(doc_paths, 4, lang)
batch_do_parse(output_dir, [str(doc_path.stem) for doc_path in doc_paths], datasets, method, debug_able, lang=lang) batch_do_parse(output_dir, [str(doc_path.stem) for doc_path in doc_paths], datasets, method, debug_able, lang=lang)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment