"scripts/git@developer.sourcefind.cn:change/sglang.git" did not exist on "f6af3a6561b2528531bcb4815012b085280d4ec7"
Commit 97bcc8b2 authored by myhloli's avatar myhloli
Browse files

refactor(pdf_parse): improve code readability and maintainability

parent 034c59a8
...@@ -179,16 +179,15 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang ...@@ -179,16 +179,15 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
useful_spans = [] useful_spans = []
unuseful_spans = [] unuseful_spans = []
for span in spans: for span in spans:
for block in all_bboxes + all_discarded_blocks: for block in all_bboxes + all_discarded_blocks:
if block[7] in [BlockType.ImageBody, BlockType.TableBody, BlockType.InterlineEquation]: if block[7] in [BlockType.ImageBody, BlockType.TableBody, BlockType.InterlineEquation]:
continue continue
overlap_ratio = calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
if overlap_ratio > 0.5: if block in all_bboxes:
if block in all_bboxes: useful_spans.append(span)
useful_spans.append(span) else:
else: unuseful_spans.append(span)
unuseful_spans.append(span) break
break
new_spans = [] new_spans = []
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment