"examples/vscode:/vscode.git/clone" did not exist on "a83cc0c0bc2c5f4bbb55beb0132de03e222dd199"
Commit c2ad4c75 authored by zhougaofeng
Browse files

Update pdf_extract_kit.py

parent 0135861f
...@@ -247,7 +247,7 @@ class CustomPEKModel: ...@@ -247,7 +247,7 @@ class CustomPEKModel:
logger.info('DocAnalysis init done!') logger.info('DocAnalysis init done!')
def __call__(self, image): def __call__(self, image,index,end_page_id):
latex_filling_list = [] latex_filling_list = []
mf_image_list = [] mf_image_list = []
...@@ -256,8 +256,8 @@ class CustomPEKModel: ...@@ -256,8 +256,8 @@ class CustomPEKModel:
layout_start = time.time() layout_start = time.time()
layout_res = self.layout_model(image, ignore_catids=[]) layout_res = self.layout_model(image, ignore_catids=[])
layout_cost = round(time.time() - layout_start, 2) layout_cost = round(time.time() - layout_start, 2)
logger.info(f"layout detection cost: {layout_cost}") # logger.info(f"layout detection cost: {layout_cost}")
total_cost = layout_cost
if self.apply_formula: if self.apply_formula:
# 公式检测 # 公式检测
mfd_res = self.mfd_model.predict(image, imgsz=1888, conf=0.25, iou=0.45, verbose=True)[0] mfd_res = self.mfd_model.predict(image, imgsz=1888, conf=0.25, iou=0.45, verbose=True)[0]
...@@ -286,7 +286,7 @@ class CustomPEKModel: ...@@ -286,7 +286,7 @@ class CustomPEKModel:
for res, latex in zip(latex_filling_list, mfr_res): for res, latex in zip(latex_filling_list, mfr_res):
res['latex'] = latex_rm_whitespace(latex) res['latex'] = latex_rm_whitespace(latex)
mfr_cost = round(time.time() - mfr_start, 2) mfr_cost = round(time.time() - mfr_start, 2)
logger.info(f"formula nums: {len(mf_image_list)}, mfr time: {mfr_cost}") # logger.info(f"formula nums: {len(mf_image_list)}, mfr time: {mfr_cost}")
# Select regions for OCR / formula regions / table regions # Select regions for OCR / formula regions / table regions
ocr_res_list = [] ocr_res_list = []
...@@ -369,7 +369,11 @@ class CustomPEKModel: ...@@ -369,7 +369,11 @@ class CustomPEKModel:
}) })
ocr_cost = round(time.time() - ocr_start, 2) ocr_cost = round(time.time() - ocr_start, 2)
logger.info(f"ocr cost: {ocr_cost}") # logger.info(f"ocr cost: {ocr_cost}")
total_cost = total_cost + ocr_cost
index = index + 1
end_page_id = end_page_id + 1
logger.info(f'当前解析第【{index} / {end_page_id}】页, 耗时:{total_cost}')
#logger.info(f'是否表格识别:{self.apply_table}') #logger.info(f'是否表格识别:{self.apply_table}')
# 表格识别 table recognition # 表格识别 table recognition
if self.apply_table: if self.apply_table:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment