Commit be505a95 authored by myhloli's avatar myhloli
Browse files

fix(pre_proc): improve character overlap handling in OCR processing

- Add condition to check for identical or space characters when resolving overlaps
- Skip non-conflicting character pairs to prevent unnecessary removals
parent 59e99fcf
......@@ -71,15 +71,17 @@ def remove_x_overlapping_chars(span, median_width):
overlap_width = x_right - x_left
if overlap_width > overlap_threshold:
# Determine which character to remove
width1 = char1['bbox'][2] - char1['bbox'][0]
width2 = char2['bbox'][2] - char2['bbox'][0]
if width1 < width2:
# Remove the narrower character
span['chars'].pop(i)
if char1['c'] == char2['c'] or char1['c'] == ' ' or char2['c'] == ' ':
# Determine which character to remove
width1 = char1['bbox'][2] - char1['bbox'][0]
width2 = char2['bbox'][2] - char2['bbox'][0]
if width1 < width2:
# Remove the narrower character
span['chars'].pop(i)
else:
span['chars'].pop(i + 1)
else:
span['chars'].pop(i + 1)
i += 1
# Don't increment i since we need to check the new pair
else:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment