Commit d9b5d004 authored by myhloli's avatar myhloli
Browse files

refactor: update content type references in pipeline and VLM processing scripts

parent e6f817fe
......@@ -193,12 +193,12 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
para_content = {}
if para_type in [BlockType.TEXT, BlockType.LIST, BlockType.INDEX]:
para_content = {
'type': 'text',
'type': ContentType.TEXT,
'text': merge_para_with_text(para_block),
}
elif para_type == BlockType.TITLE:
para_content = {
'type': 'text',
'type': ContentType.TEXT,
'text': merge_para_with_text(para_block),
}
title_level = get_title_level(para_block)
......@@ -208,14 +208,14 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
if len(para_block['lines']) == 0 or len(para_block['lines'][0]['spans']) == 0:
return None
para_content = {
'type': 'equation',
'type': ContentType.EQUATION,
'img_path': f"{img_buket_path}/{para_block['lines'][0]['spans'][0].get('image_path', '')}",
}
if para_block['lines'][0]['spans'][0].get('content', ''):
para_content['text'] = merge_para_with_text(para_block)
para_content['text_format'] = 'latex'
elif para_type == BlockType.IMAGE:
para_content = {'type': 'image', 'img_path': '', 'img_caption': [], 'img_footnote': []}
para_content = {'type': ContentType.IMAGE, 'img_path': '', BlockType.IMAGE_CAPTION: [], BlockType.IMAGE_FOOTNOTE: []}
for block in para_block['blocks']:
if block['type'] == BlockType.IMAGE_BODY:
for line in block['lines']:
......@@ -224,29 +224,26 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
if span.get('image_path', ''):
para_content['img_path'] = f"{img_buket_path}/{span['image_path']}"
if block['type'] == BlockType.IMAGE_CAPTION:
para_content['img_caption'].append(merge_para_with_text(block))
para_content[BlockType.IMAGE_CAPTION].append(merge_para_with_text(block))
if block['type'] == BlockType.IMAGE_FOOTNOTE:
para_content['img_footnote'].append(merge_para_with_text(block))
para_content[BlockType.IMAGE_FOOTNOTE].append(merge_para_with_text(block))
elif para_type == BlockType.TABLE:
para_content = {'type': 'table', 'img_path': '', 'table_caption': [], 'table_footnote': []}
para_content = {'type': ContentType.TABLE, 'img_path': '', BlockType.TABLE_CAPTION: [], BlockType.TABLE_FOOTNOTE: []}
for block in para_block['blocks']:
if block['type'] == BlockType.TABLE_BODY:
for line in block['lines']:
for span in line['spans']:
if span['type'] == ContentType.TABLE:
if span.get('latex', ''):
para_content['table_body'] = f"{span['latex']}"
elif span.get('html', ''):
para_content['table_body'] = f"{span['html']}"
if span.get('html', ''):
para_content[BlockType.TABLE_BODY] = f"{span['html']}"
if span.get('image_path', ''):
para_content['img_path'] = f"{img_buket_path}/{span['image_path']}"
if block['type'] == BlockType.TABLE_CAPTION:
para_content['table_caption'].append(merge_para_with_text(block))
para_content[BlockType.TABLE_CAPTION].append(merge_para_with_text(block))
if block['type'] == BlockType.TABLE_FOOTNOTE:
para_content['table_footnote'].append(merge_para_with_text(block))
para_content[BlockType.TABLE_FOOTNOTE].append(merge_para_with_text(block))
para_content['page_idx'] = page_idx
......
......@@ -130,25 +130,25 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
para_content = {}
if para_type in [BlockType.TEXT, BlockType.LIST, BlockType.INDEX]:
para_content = {
'type': 'text',
'type': ContentType.TEXT,
'text': merge_para_with_text(para_block),
}
elif para_type == BlockType.TITLE:
title_level = get_title_level(para_block)
para_content = {
'type': 'text',
'type': ContentType.TEXT,
'text': merge_para_with_text(para_block),
}
if title_level != 0:
para_content['text_level'] = title_level
elif para_type == BlockType.INTERLINE_EQUATION:
para_content = {
'type': 'equation',
'type': ContentType.EQUATION,
'text': merge_para_with_text(para_block),
'text_format': 'latex',
}
elif para_type == BlockType.IMAGE:
para_content = {'type': 'image', 'img_path': '', 'img_caption': [], 'img_footnote': []}
para_content = {'type': ContentType.IMAGE, 'img_path': '', BlockType.IMAGE_CAPTION: [], BlockType.IMAGE_FOOTNOTE: []}
for block in para_block['blocks']:
if block['type'] == BlockType.IMAGE_BODY:
for line in block['lines']:
......@@ -157,11 +157,11 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
if span.get('image_path', ''):
para_content['img_path'] = f"{img_buket_path}/{span['image_path']}"
if block['type'] == BlockType.IMAGE_CAPTION:
para_content['img_caption'].append(merge_para_with_text(block))
para_content[BlockType.IMAGE_CAPTION].append(merge_para_with_text(block))
if block['type'] == BlockType.IMAGE_FOOTNOTE:
para_content['img_footnote'].append(merge_para_with_text(block))
para_content[BlockType.IMAGE_FOOTNOTE].append(merge_para_with_text(block))
elif para_type == BlockType.TABLE:
para_content = {'type': 'table', 'img_path': '', 'table_caption': [], 'table_footnote': []}
para_content = {'type': ContentType.TABLE, 'img_path': '', BlockType.TABLE_CAPTION: [], BlockType.TABLE_FOOTNOTE: []}
for block in para_block['blocks']:
if block['type'] == BlockType.TABLE_BODY:
for line in block['lines']:
......@@ -169,15 +169,15 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
if span['type'] == ContentType.TABLE:
if span.get('html', ''):
para_content['table_body'] = f"{span['html']}"
para_content[BlockType.TABLE_BODY] = f"{span['html']}"
if span.get('image_path', ''):
para_content['img_path'] = f"{img_buket_path}/{span['image_path']}"
if block['type'] == BlockType.TABLE_CAPTION:
para_content['table_caption'].append(merge_para_with_text(block))
para_content[BlockType.TABLE_CAPTION].append(merge_para_with_text(block))
if block['type'] == BlockType.TABLE_FOOTNOTE:
para_content['table_footnote'].append(merge_para_with_text(block))
para_content[BlockType.TABLE_FOOTNOTE].append(merge_para_with_text(block))
para_content['page_idx'] = page_idx
......
......@@ -21,6 +21,7 @@ class ContentType:
TEXT = 'text'
INTERLINE_EQUATION = 'interline_equation'
INLINE_EQUATION = 'inline_equation'
EQUATION = 'equation'
class CategoryId:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment