Commit 315adbce authored by myhloli
Browse files

feat(ocr_mkcontent): add full-width to half-width character conversion

- Implement full_to_half function to convert full-width characters to half-width
- Apply conversion to span content before merging paragraphs
- Improve text processing for better readability and consistency
parent 6753df8d
...@@ -126,11 +126,35 @@ def detect_language(text): ...@@ -126,11 +126,35 @@ def detect_language(text):
return 'empty' return 'empty'
# Translation table built once at import time: every full-width ASCII variant
# (U+FF01..U+FF5E) maps to the code point 0xFEE0 below it, which lands in the
# printable ASCII range U+0021..U+007E; the ideographic space U+3000 maps to
# an ordinary ASCII space.
_FULL_TO_HALF_TABLE = {code: code - 0xFEE0 for code in range(0xFF01, 0xFF5F)}
_FULL_TO_HALF_TABLE[0x3000] = 0x20


def full_to_half(text: str) -> str:
    """Convert full-width characters to their half-width equivalents.

    Full-width ASCII variants (U+FF01 through U+FF5E) are shifted down by
    0xFEE0 to the matching ASCII character, and the full-width space
    (U+3000) becomes a regular space. Every other character is returned
    unchanged.

    Args:
        text: String that may contain full-width characters.

    Returns:
        A new string with the full-width characters replaced by their
        half-width counterparts.
    """
    # str.translate performs the whole substitution in a single pass instead
    # of a Python-level loop with per-character ord()/chr() calls.
    return text.translate(_FULL_TO_HALF_TABLE)
def merge_para_with_text(para_block): def merge_para_with_text(para_block):
block_text = '' block_text = ''
for line in para_block['lines']: for line in para_block['lines']:
for span in line['spans']: for span in line['spans']:
if span['type'] in [ContentType.Text]: if span['type'] in [ContentType.Text]:
span['content'] = full_to_half(span['content'])
block_text += span['content'] block_text += span['content']
block_lang = detect_lang(block_text) block_lang = detect_lang(block_text)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment