Unverified Commit e32704f1 authored by Xiaomeng Zhao, committed by GitHub

Merge branch 'opendatalab:dev' into dev

parents 786da939 a881ee89
@@ -107,50 +107,13 @@ def batch_build_dataset(pdf_paths, k, lang=None):
     pdf_info = []
     total_pages = 0
+    results = []
     for pdf_path in pdf_paths:
         try:
-            doc = fitz.open(pdf_path)
-            num_pages = len(doc)
-            pdf_info.append((pdf_path, num_pages))
-            total_pages += num_pages
-            doc.close()
+            with open(pdf_path, 'rb') as f:
+                bits = f.read()
+            results.append(PymuDocDataset(bits, lang))
         except Exception as e:
             print(f'Error opening {pdf_path}: {e}')
-
-    # Partition the jobs based on page count. Each job has 1 page
-    partitions = partition_array_greedy(pdf_info, k)
-
-    # Process each partition in parallel
-    all_images_h = {}
-
-    with concurrent.futures.ProcessPoolExecutor(max_workers=k) as executor:
-        # Submit one task per partition
-        futures = []
-        for sn, partition in enumerate(partitions):
-            # Get the jobs for this partition
-            partition_jobs = [pdf_info[idx] for idx in partition]
-
-            # Submit the task
-            future = executor.submit(
-                process_pdf_batch,
-                partition_jobs,
-                sn
-            )
-            futures.append(future)
-
-        # Process results as they complete
-        for i, future in enumerate(concurrent.futures.as_completed(futures)):
-            try:
-                idx, images = future.result()
-                all_images_h[idx] = images
-            except Exception as e:
-                print(f'Error processing partition: {e}')
-
-    results = [None] * len(pdf_paths)
-    for i in range(len(partitions)):
-        partition = partitions[i]
-        for j in range(len(partition)):
-            with open(pdf_info[partition[j]][0], 'rb') as f:
-                pdf_bytes = f.read()
-            dataset = PymuDocDataset(pdf_bytes, lang=lang)
-            dataset.set_images(all_images_h[i][j])
-            results[partition[j]] = dataset
     return results
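For context, a minimal usage sketch of the slimmed-down batch_build_dataset: it now simply wraps each PDF's raw bytes in a PymuDocDataset, so the k worker count no longer drives any pre-rendering. The import path, file names, and lang value below are illustrative assumptions, not part of this change.

```python
# Hypothetical usage of the simplified batch_build_dataset.
# The module path, file paths and lang value are placeholders.
from magic_pdf.data.batch_build_dataset import batch_build_dataset

pdf_paths = ['/tmp/a.pdf', '/tmp/b.pdf']
datasets = batch_build_dataset(pdf_paths, k=4, lang=None)  # k stays in the signature but no longer triggers pre-rendering
for ds in datasets:
    print(len(ds))  # page count; len(dataset) is used the same way in doc_analyze below
```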
@@ -342,17 +342,8 @@ class Doc(PageableData):
                 height: int
             }
         """
-        if self._img is None:
-            self._img = fitz_doc_to_image(self._doc)
-        return self._img
-
-    def set_image(self, img):
-        """
-        Args:
-            img (np.ndarray): the image
-        """
-        if self._img is None:
-            self._img = img
+        return fitz_doc_to_image(self._doc)

     def get_doc(self) -> fitz.Page:
         """Get the pymudoc object.
...
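With the _img cache and set_image() removed, Doc.get_image() re-renders the page on every call. A caller that needs the same bitmap several times can memoize on its own side; the sketch below is illustrative only and assumes get_image() returns the {'img', 'width', 'height'} dict documented above.

```python
# Illustrative caller-side memoization, assuming page.get_image() returns
# {'img': np.ndarray, 'width': int, 'height': int} as in the docstring above.
from functools import lru_cache

class PageImageCache:
    """Keep the most recently rendered pages so repeated access stays cheap."""

    def __init__(self, dataset, max_pages=16):
        self._dataset = dataset
        # Wrap the per-index render in an LRU cache bound to this instance.
        self._render = lru_cache(maxsize=max_pages)(self._render_uncached)

    def _render_uncached(self, index):
        return self._dataset.get_page(index).get_image()

    def get(self, index):
        return self._render(index)
```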
@@ -138,30 +138,31 @@ def doc_analyze(
     )

     MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 200))
+    batch_size = MIN_BATCH_INFERENCE_SIZE
     images = []
     page_wh_list = []
+    images_with_extra_info = []
+    results = []
     for index in range(len(dataset)):
         if start_page_id <= index <= end_page_id:
             page_data = dataset.get_page(index)
             img_dict = page_data.get_image()
             images.append(img_dict['img'])
             page_wh_list.append((img_dict['width'], img_dict['height']))
+            if lang is None or lang == 'auto':
+                images_with_extra_info.append((images[index], ocr, dataset._lang))
+            else:
+                images_with_extra_info.append((images[index], ocr, lang))

-    if lang is None or lang == 'auto':
-        images_with_extra_info = [(images[index], ocr, dataset._lang) for index in range(len(images))]
-    else:
-        images_with_extra_info = [(images[index], ocr, lang) for index in range(len(images))]
-
-    if len(images) >= MIN_BATCH_INFERENCE_SIZE:
-        batch_size = MIN_BATCH_INFERENCE_SIZE
-        batch_images = [images_with_extra_info[i:i+batch_size] for i in range(0, len(images_with_extra_info), batch_size)]
-    else:
-        batch_images = [images_with_extra_info]
-
-    results = []
-    for sn, batch_image in enumerate(batch_images):
-        _, result = may_batch_image_analyze(batch_image, sn, ocr, show_log, layout_model, formula_enable, table_enable)
-        results.extend(result)
+            if len(images_with_extra_info) == batch_size:
+                _, result = may_batch_image_analyze(images_with_extra_info, 0, ocr, show_log, layout_model, formula_enable, table_enable)
+                results.extend(result)
+                images_with_extra_info = []
+
+    if len(images_with_extra_info) > 0:
+        _, result = may_batch_image_analyze(images_with_extra_info, 0, ocr, show_log, layout_model, formula_enable, table_enable)
+        results.extend(result)
+        images_with_extra_info = []

     model_json = []
     for index in range(len(dataset)):
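The rewritten loop above accumulates (image, ocr, lang) tuples and flushes them through may_batch_image_analyze whenever the buffer reaches batch_size, with one final flush for the remainder. The same accumulate-and-flush pattern in isolation, with generic placeholder names (flush_in_batches, process_chunk) that are not project APIs, looks like this:

```python
# Generic accumulate-and-flush pattern mirroring the two branches in doc_analyze above.
from typing import Callable, Iterable, List, TypeVar

T = TypeVar('T')
R = TypeVar('R')

def flush_in_batches(items: Iterable[T], batch_size: int,
                     process_chunk: Callable[[List[T]], List[R]]) -> List[R]:
    results: List[R] = []
    buffer: List[T] = []
    for item in items:
        buffer.append(item)
        if len(buffer) == batch_size:   # full batch: run inference and reset the buffer
            results.extend(process_chunk(buffer))
            buffer = []
    if buffer:                          # final partial batch
        results.extend(process_chunk(buffer))
    return results
```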
@@ -193,6 +194,7 @@ def batch_doc_analyze(
     batch_size = MIN_BATCH_INFERENCE_SIZE
     images = []
     page_wh_list = []
+    results = []
     images_with_extra_info = []
     for dataset in datasets:
@@ -211,11 +213,15 @@ def batch_doc_analyze(
             else:
                 images_with_extra_info.append((images[-1], parse_method == 'ocr', _lang))

-    batch_images = [images_with_extra_info[i:i+batch_size] for i in range(0, len(images_with_extra_info), batch_size)]
-    results = []
-    for sn, batch_image in enumerate(batch_images):
-        _, result = may_batch_image_analyze(batch_image, sn, True, show_log, layout_model, formula_enable, table_enable)
+            if len(images_with_extra_info) == batch_size:
+                _, result = may_batch_image_analyze(images_with_extra_info, 0, True, show_log, layout_model, formula_enable, table_enable)
+                results.extend(result)
+                images_with_extra_info = []
+
+    if len(images_with_extra_info) > 0:
+        _, result = may_batch_image_analyze(images_with_extra_info, 0, True, show_log, layout_model, formula_enable, table_enable)
         results.extend(result)
+        images_with_extra_info = []

     infer_results = []
     from magic_pdf.operators.models import InferenceResult
...
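Taken together, the two changed entry points can be driven as below. The module paths, file names, and the environment override are assumptions inferred from the hunks above, not verified against the repository layout.

```python
# Hedged end-to-end sketch; import paths, file names and the env override are assumptions.
import os
os.environ['MINERU_MIN_BATCH_INFERENCE_SIZE'] = '100'  # lower the flush threshold (default 200)

from magic_pdf.data.batch_build_dataset import batch_build_dataset             # assumed path
from magic_pdf.model.doc_analyze_by_custom_model import batch_doc_analyze      # assumed path

datasets = batch_build_dataset(['/tmp/a.pdf', '/tmp/b.pdf'], k=4, lang=None)
infer_results = batch_doc_analyze(datasets, parse_method='ocr', show_log=True)
```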