Unverified Commit e0b74b86 authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #1778 from opendatalab/release-1.2.0

Release 1.2.0
parents 6e0d9a3e d3c822f8
...@@ -126,11 +126,35 @@ def detect_language(text): ...@@ -126,11 +126,35 @@ def detect_language(text):
return 'empty' return 'empty'
# Precomputed translation table for str.translate: full-width ASCII variants
# (U+FF01-U+FF5E) map to their ASCII counterparts (a fixed -0xFEE0 offset),
# and the ideographic space (U+3000) maps to a plain space. Building the
# table once lets conversion run in a single C-level pass per call instead
# of a Python-level loop over every character.
_FULL_TO_HALF_TABLE = {code: code - 0xFEE0 for code in range(0xFF01, 0xFF5F)}
_FULL_TO_HALF_TABLE[0x3000] = 0x20


def full_to_half(text: str) -> str:
    """Convert full-width characters to half-width characters.

    Args:
        text: String possibly containing full-width characters.

    Returns:
        String with full-width ASCII variants (U+FF01-U+FF5E) and the
        full-width space (U+3000) converted to their half-width
        equivalents; all other characters are left unchanged.
    """
    return text.translate(_FULL_TO_HALF_TABLE)
def merge_para_with_text(para_block): def merge_para_with_text(para_block):
block_text = '' block_text = ''
for line in para_block['lines']: for line in para_block['lines']:
for span in line['spans']: for span in line['spans']:
if span['type'] in [ContentType.Text]: if span['type'] in [ContentType.Text]:
span['content'] = full_to_half(span['content'])
block_text += span['content'] block_text += span['content']
block_lang = detect_lang(block_text) block_lang = detect_lang(block_text)
......
...@@ -157,6 +157,7 @@ def doc_analyze( ...@@ -157,6 +157,7 @@ def doc_analyze(
) )
batch_analyze = False batch_analyze = False
batch_ratio = 1
device = get_device() device = get_device()
npu_support = False npu_support = False
...@@ -181,7 +182,6 @@ def doc_analyze( ...@@ -181,7 +182,6 @@ def doc_analyze(
batch_ratio = 2 batch_ratio = 2
logger.info(f'gpu_memory: {gpu_memory} GB, batch_ratio: {batch_ratio}') logger.info(f'gpu_memory: {gpu_memory} GB, batch_ratio: {batch_ratio}')
batch_model = BatchAnalyze(model=custom_model, batch_ratio=batch_ratio)
batch_analyze = True batch_analyze = True
model_json = [] model_json = []
...@@ -190,24 +190,26 @@ def doc_analyze( ...@@ -190,24 +190,26 @@ def doc_analyze(
if batch_analyze: if batch_analyze:
# batch analyze # batch analyze
images = [] images = []
page_wh_list = []
for index in range(len(dataset)): for index in range(len(dataset)):
if start_page_id <= index <= end_page_id: if start_page_id <= index <= end_page_id:
page_data = dataset.get_page(index) page_data = dataset.get_page(index)
img_dict = page_data.get_image() img_dict = page_data.get_image()
images.append(img_dict['img']) images.append(img_dict['img'])
page_wh_list.append((img_dict['width'], img_dict['height']))
batch_model = BatchAnalyze(model=custom_model, batch_ratio=batch_ratio)
analyze_result = batch_model(images) analyze_result = batch_model(images)
for index in range(len(dataset)): for index in range(len(dataset)):
page_data = dataset.get_page(index)
img_dict = page_data.get_image()
page_width = img_dict['width']
page_height = img_dict['height']
if start_page_id <= index <= end_page_id: if start_page_id <= index <= end_page_id:
result = analyze_result.pop(0) result = analyze_result.pop(0)
page_width, page_height = page_wh_list.pop(0)
else: else:
result = [] result = []
page_height = 0
page_width = 0
page_info = {'page_no': index, 'height': page_height, 'width': page_width} page_info = {'page_no': index, 'width': page_width, 'height': page_height}
page_dict = {'layout_dets': result, 'page_info': page_info} page_dict = {'layout_dets': result, 'page_info': page_info}
model_json.append(page_dict) model_json.append(page_dict)
...@@ -227,7 +229,7 @@ def doc_analyze( ...@@ -227,7 +229,7 @@ def doc_analyze(
else: else:
result = [] result = []
page_info = {'page_no': index, 'height': page_height, 'width': page_width} page_info = {'page_no': index, 'width': page_width, 'height': page_height}
page_dict = {'layout_dets': result, 'page_info': page_info} page_dict = {'layout_dets': result, 'page_info': page_info}
model_json.append(page_dict) model_json.append(page_dict)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment