Commit be505a95 authored by myhloli
Browse files

fix(pre_proc): improve character overlap handling in OCR processing

- Add condition to check for identical or space characters when resolving overlaps
- Skip non-conflicting character pairs to prevent unnecessary removals
parent 59e99fcf
...@@ -71,15 +71,17 @@ def remove_x_overlapping_chars(span, median_width): ...@@ -71,15 +71,17 @@ def remove_x_overlapping_chars(span, median_width):
overlap_width = x_right - x_left overlap_width = x_right - x_left
if overlap_width > overlap_threshold: if overlap_width > overlap_threshold:
# Determine which character to remove if char1['c'] == char2['c'] or char1['c'] == ' ' or char2['c'] == ' ':
width1 = char1['bbox'][2] - char1['bbox'][0] # Determine which character to remove
width2 = char2['bbox'][2] - char2['bbox'][0] width1 = char1['bbox'][2] - char1['bbox'][0]
width2 = char2['bbox'][2] - char2['bbox'][0]
if width1 < width2: if width1 < width2:
# Remove the narrower character # Remove the narrower character
span['chars'].pop(i) span['chars'].pop(i)
else:
span['chars'].pop(i + 1)
else: else:
span['chars'].pop(i + 1) i += 1
# Don't increment i since we need to check the new pair # Don't increment i since we need to check the new pair
else: else:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment