Unverified commit 9d689790, authored by linfeng, committed by GitHub

Merge branch 'opendatalab:dev' into dev

parents bcef0868 fb383ba6
@@ -17,7 +17,7 @@ class AbsPipe(ABC):
     PIP_TXT = "txt"
 
     def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
-                 start_page_id=0, end_page_id=None):
+                 start_page_id=0, end_page_id=None, lang=None):
         self.pdf_bytes = pdf_bytes
         self.model_list = model_list
         self.image_writer = image_writer
@@ -25,6 +25,7 @@ class AbsPipe(ABC):
         self.is_debug = is_debug
         self.start_page_id = start_page_id
         self.end_page_id = end_page_id
+        self.lang = lang
 
     def get_compress_pdf_mid_data(self):
         return JsonCompressor.compress_json(self.pdf_mid_data)
@@ -94,7 +95,9 @@ class AbsPipe(ABC):
         """
         pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
         pdf_info_list = pdf_mid_data["pdf_info"]
-        content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, img_buket_path)
+        parse_type = pdf_mid_data["_parse_type"]
+        lang = pdf_mid_data.get("_lang", None)
+        content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, img_buket_path, parse_type, lang)
         return content_list
 
     @staticmethod
@@ -104,7 +107,9 @@ class AbsPipe(ABC):
         """
         pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
         pdf_info_list = pdf_mid_data["pdf_info"]
-        md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_buket_path)
+        parse_type = pdf_mid_data["_parse_type"]
+        lang = pdf_mid_data.get("_lang", None)
+        md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_buket_path, parse_type, lang)
         return md_content
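
The two static helpers above now unpack two extra fields from the decompressed middle data before calling `union_make`. A minimal sketch of the shape they expect, with hypothetical values that are not part of this commit:

```python
# Hypothetical middle-data shape after JsonCompressor.decompress_json;
# the values here are illustrative only.
pdf_mid_data = {
    "pdf_info": [],        # per-page parse results
    "_parse_type": "ocr",  # written by the parse_*_pdf functions ("txt" or "ocr")
    "_lang": "en",         # written only when a language hint was supplied
}

parse_type = pdf_mid_data["_parse_type"]
lang = pdf_mid_data.get("_lang", None)  # .get() keeps data without "_lang" working
```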
@@ -10,19 +10,21 @@ from magic_pdf.user_api import parse_ocr_pdf
 class OCRPipe(AbsPipe):
 
     def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
-                 start_page_id=0, end_page_id=None):
-        super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id)
+                 start_page_id=0, end_page_id=None, lang=None):
+        super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang)
 
     def pipe_classify(self):
         pass
 
     def pipe_analyze(self):
         self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
-                                      start_page_id=self.start_page_id, end_page_id=self.end_page_id)
+                                      start_page_id=self.start_page_id, end_page_id=self.end_page_id,
+                                      lang=self.lang)
 
     def pipe_parse(self):
         self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug,
-                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id)
+                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id,
+                                          lang=self.lang)
 
     def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
         result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
...
@@ -11,19 +11,21 @@ from magic_pdf.user_api import parse_txt_pdf
 class TXTPipe(AbsPipe):
 
     def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
-                 start_page_id=0, end_page_id=None):
-        super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id)
+                 start_page_id=0, end_page_id=None, lang=None):
+        super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang)
 
     def pipe_classify(self):
         pass
 
     def pipe_analyze(self):
         self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
-                                      start_page_id=self.start_page_id, end_page_id=self.end_page_id)
+                                      start_page_id=self.start_page_id, end_page_id=self.end_page_id,
+                                      lang=self.lang)
 
     def pipe_parse(self):
         self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug,
-                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id)
+                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id,
+                                          lang=self.lang)
 
     def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
         result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
...
@@ -14,9 +14,9 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
 class UNIPipe(AbsPipe):
 
     def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: AbsReaderWriter, is_debug: bool = False,
-                 start_page_id=0, end_page_id=None):
+                 start_page_id=0, end_page_id=None, lang=None):
         self.pdf_type = jso_useful_key["_pdf_type"]
-        super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug, start_page_id, end_page_id)
+        super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug, start_page_id, end_page_id, lang)
         if len(self.model_list) == 0:
             self.input_model_is_empty = True
         else:
@@ -28,22 +28,26 @@ class UNIPipe(AbsPipe):
     def pipe_analyze(self):
         if self.pdf_type == self.PIP_TXT:
             self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
-                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id)
+                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id,
+                                          lang=self.lang)
         elif self.pdf_type == self.PIP_OCR:
             self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
-                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id)
+                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id,
+                                          lang=self.lang)
 
     def pipe_parse(self):
         if self.pdf_type == self.PIP_TXT:
             self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer,
                                                 is_debug=self.is_debug, input_model_is_empty=self.input_model_is_empty,
-                                                start_page_id=self.start_page_id, end_page_id=self.end_page_id)
+                                                start_page_id=self.start_page_id, end_page_id=self.end_page_id,
+                                                lang=self.lang)
         elif self.pdf_type == self.PIP_OCR:
             self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer,
                                               is_debug=self.is_debug,
-                                              start_page_id=self.start_page_id, end_page_id=self.end_page_id)
+                                              start_page_id=self.start_page_id, end_page_id=self.end_page_id,
+                                              lang=self.lang)
 
-    def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
+    def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.NONE_WITH_REASON):
         result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
         logger.info("uni_pipe mk content list finished")
         return result
...
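
Taken together, the pipe changes let a caller thread a language hint end to end. A minimal usage sketch, assuming this repo's module paths and a hypothetical demo.pdf; pipe_mk_markdown is the markdown helper inherited from AbsPipe:

```python
# Sketch: thread a language hint through UNIPipe (file paths are hypothetical).
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter

with open("demo.pdf", "rb") as f:
    pdf_bytes = f.read()

image_writer = DiskReaderWriter("/tmp/mineru_images")
jso_useful_key = {"_pdf_type": "", "model_list": []}  # empty list -> pipe_analyze runs the models

pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True, lang="en")
pipe.pipe_classify()  # decide txt vs ocr
pipe.pipe_analyze()   # lang is forwarded to doc_analyze here
pipe.pipe_parse()     # ...and on to parse_union_pdf / parse_ocr_pdf here
md_content = pipe.pipe_mk_markdown("/tmp/mineru_images")
```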
@@ -2,13 +2,13 @@ model:
   arch: unimernet
   model_type: unimernet
   model_config:
-    model_name: ./models
-    max_seq_len: 1024
+    model_name: ./models/unimernet_base
+    max_seq_len: 1536
+    length_aware: False
     load_pretrained: True
-    pretrained: ./models/pytorch_model.bin
+    pretrained: './models/unimernet_base/pytorch_model.pth'
   tokenizer_config:
-    path: ./models
+    path: ./models/unimernet_base
 
 datasets:
   formula_rec_eval:
@@ -18,7 +18,7 @@ datasets:
     image_size:
       - 192
       - 672
 
 run:
   runner: runner_iter
   task: unimernet_train
@@ -43,4 +43,4 @@ run:
   distributed_type: ddp  # or fsdp when train llm
   generate_cfg:
     temperature: 0.0
\ No newline at end of file
@@ -10,6 +10,6 @@ config:
 weights:
   layout: Layout/model_final.pth
   mfd: MFD/weights.pt
-  mfr: MFR/UniMERNet
+  mfr: MFR/unimernet_base
   struct_eqtable: TabRec/StructEqTable
   TableMaster: TabRec/TableMaster
\ No newline at end of file
@@ -44,6 +44,18 @@ auto: automatically choose the best method for parsing pdf from ocr and txt.
 without method specified, auto will be used by default.""",
     default='auto',
 )
+@click.option(
+    '-l',
+    '--lang',
+    'lang',
+    type=str,
+    help="""
+    Input the language of the PDF (if known) to improve OCR accuracy. Optional.
+    Use the language abbreviations listed at:
+    https://paddlepaddle.github.io/PaddleOCR/en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations
+    """,
+    default=None,
+)
 @click.option(
     '-d',
     '--debug',
@@ -68,7 +80,7 @@ without method specified, auto will be used by default.""",
     help='The ending page for PDF parsing, beginning from 0.',
     default=None,
 )
-def cli(path, output_dir, method, debug_able, start_page_id, end_page_id):
+def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
     model_config.__use_inside_model__ = True
     model_config.__model_mode__ = 'full'
     os.makedirs(output_dir, exist_ok=True)
@@ -90,6 +102,7 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
             debug_able,
             start_page_id=start_page_id,
             end_page_id=end_page_id,
+            lang=lang
         )
     except Exception as e:
...
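
A hedged sketch of exercising the new flag without a shell, using click's CliRunner; the -p/--path, -o/--output-dir and -m/--method options are defined earlier in this file and are assumed here:

```python
# Sketch only: invoke the CLI in-process with the new --lang option.
# The "-p", "-o" and "-m" flags are assumed from options defined earlier in cli.py.
from click.testing import CliRunner

runner = CliRunner()
result = runner.invoke(cli, ["-p", "demo.pdf", "-o", "./output", "-m", "auto", "-l", "en"])
print(result.exit_code, result.output)
```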
@@ -44,9 +44,10 @@ def do_parse(
     f_draw_model_bbox=False,
     start_page_id=0,
     end_page_id=None,
+    lang=None,
 ):
     if debug_able:
-        logger.warning("debug mode is on")
+        logger.warning('debug mode is on')
         f_dump_content_list = True
         f_draw_model_bbox = True
@@ -61,13 +62,13 @@ def do_parse(
     if parse_method == 'auto':
         jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
         pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True,
-                       start_page_id=start_page_id, end_page_id=end_page_id)
+                       start_page_id=start_page_id, end_page_id=end_page_id, lang=lang)
     elif parse_method == 'txt':
         pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True,
-                       start_page_id=start_page_id, end_page_id=end_page_id)
+                       start_page_id=start_page_id, end_page_id=end_page_id, lang=lang)
     elif parse_method == 'ocr':
         pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True,
-                       start_page_id=start_page_id, end_page_id=end_page_id)
+                       start_page_id=start_page_id, end_page_id=end_page_id, lang=lang)
     else:
         logger.error('unknown parse method')
         exit(1)
...
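
The CLI reaches this function with lang=lang as shown above. A minimal sketch of calling do_parse directly; the leading positional parameters (output_dir, pdf_file_name, pdf_bytes, model_list, parse_method, debug_able) are assumed from the file's full signature, which the hunk truncates:

```python
# Sketch: pass a language hint straight to do_parse (parameter order assumed
# from the surrounding file; only lang is new in this commit).
with open("demo.pdf", "rb") as f:
    pdf_bytes = f.read()

do_parse(
    "./output",  # output_dir
    "demo",      # pdf_file_name (assumed parameter name)
    pdf_bytes,   # raw PDF bytes
    [],          # model_list: empty, so the chosen pipe runs doc_analyze itself
    "auto",      # parse_method -> the UNIPipe branch above
    False,       # debug_able
    lang="en",   # forwarded to the pipe and on to doc_analyze / parse_*_pdf
)
```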
@@ -26,7 +26,7 @@ PARSE_TYPE_OCR = "ocr"
 def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
-                  start_page_id=0, end_page_id=None,
+                  start_page_id=0, end_page_id=None, lang=None,
                   *args, **kwargs):
     """
     Parse a text-based PDF.
@@ -44,11 +44,14 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
     pdf_info_dict["_version_name"] = __version__
 
+    if lang is not None:
+        pdf_info_dict["_lang"] = lang
+
     return pdf_info_dict
 
 
 def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
-                  start_page_id=0, end_page_id=None,
+                  start_page_id=0, end_page_id=None, lang=None,
                   *args, **kwargs):
     """
     Parse an OCR-based PDF.
@@ -66,12 +69,15 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
     pdf_info_dict["_version_name"] = __version__
 
+    if lang is not None:
+        pdf_info_dict["_lang"] = lang
+
     return pdf_info_dict
 
 
 def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
                     input_model_is_empty: bool = False,
-                    start_page_id=0, end_page_id=None,
+                    start_page_id=0, end_page_id=None, lang=None,
                     *args, **kwargs):
     """
     Parse a PDF that mixes text and OCR content, extracting everything.
@@ -95,9 +101,11 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
     if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
         logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
         if input_model_is_empty:
-            pdf_models = doc_analyze(pdf_bytes, ocr=True,
+            pdf_models = doc_analyze(pdf_bytes,
+                                     ocr=True,
                                      start_page_id=start_page_id,
-                                     end_page_id=end_page_id)
+                                     end_page_id=end_page_id,
+                                     lang=lang)
         pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
         if pdf_info_dict is None:
             raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
@@ -108,4 +116,7 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
     pdf_info_dict["_version_name"] = __version__
 
+    if lang is not None:
+        pdf_info_dict["_lang"] = lang
+
     return pdf_info_dict
@@ -3,4 +3,6 @@
 ## Project List
 
 - [llama_index_rag](./llama_index_rag/README.md): Build a lightweight RAG system based on llama_index
+- [gradio_app](./gradio_app/README.md): Build a web app based on gradio
@@ -3,3 +3,5 @@
 ## Project List
 
 - [llama_index_rag](./llama_index_rag/README_zh-CN.md): Build a lightweight RAG system based on llama_index
+- [gradio_app](./gradio_app/README_zh-CN.md): A web app based on Gradio
## Installation

MinerU (>= 0.8.0)

> If you already have a functioning MinerU environment, you can skip this step.

[Deploy in CPU environment](https://github.com/opendatalab/MinerU?tab=readme-ov-file#quick-cpu-demo)

[Deploy in GPU environment](https://github.com/opendatalab/MinerU?tab=readme-ov-file#using-gpu)

Third-party software:

```bash
pip install gradio gradio-pdf
```

## Start Gradio App

```bash
python app.py
```

## Use Gradio App

Access http://127.0.0.1:7860 in your web browser
## Installation

MinerU (>= 0.8.0)

> If you already have a functioning MinerU environment, you can skip this step.

[Deploy in CPU environment](https://github.com/opendatalab/MinerU/blob/master/README_zh-CN.md#%E4%BD%BF%E7%94%A8cpu%E5%BF%AB%E9%80%9F%E4%BD%93%E9%AA%8C)

[Deploy in GPU environment](https://github.com/opendatalab/MinerU/blob/master/README_zh-CN.md#%E4%BD%BF%E7%94%A8gpu)

Third-party software:

```bash
pip install gradio gradio-pdf
```

## Start Gradio App

```bash
python app.py
```

## Use Gradio App

Access http://127.0.0.1:7860 in your web browser
@@ -14,8 +14,6 @@ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
 from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
 from magic_pdf.tools.common import do_parse, prepare_env
 
-os.system("pip install gradio")
-os.system("pip install gradio-pdf")
 import gradio as gr
 from gradio_pdf import PDF
@@ -25,13 +23,16 @@ def read_fn(path):
     return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
 
 
-def parse_pdf(doc_path, output_dir, end_page_id):
+def parse_pdf(doc_path, output_dir, end_page_id, is_ocr):
     os.makedirs(output_dir, exist_ok=True)
 
     try:
         file_name = f"{str(Path(doc_path).stem)}_{time.time()}"
         pdf_data = read_fn(doc_path)
-        parse_method = "auto"
+        if is_ocr:
+            parse_method = "ocr"
+        else:
+            parse_method = "auto"
         local_image_dir, local_md_dir = prepare_env(output_dir, file_name, parse_method)
         do_parse(
             output_dir,
@@ -92,9 +93,9 @@ def replace_image_with_base64(markdown_text, image_dir_path):
     return re.sub(pattern, replace, markdown_text)
 
 
-def to_markdown(file_path, end_pages):
+def to_markdown(file_path, end_pages, is_ocr):
     # Get the path of the converted md file and the zip archive
-    local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1)
+    local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1, is_ocr)
     archive_zip_path = os.path.join("./output", compute_sha256(local_md_dir) + ".zip")
     zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
     if zip_archive_success == 0:
@@ -111,14 +112,6 @@ def to_markdown(file_path, end_pages, is_ocr):
     return md_content, txt_content, archive_zip_path, new_pdf_path
 
 
-# def show_pdf(file_path):
-#     with open(file_path, "rb") as f:
-#         base64_pdf = base64.b64encode(f.read()).decode('utf-8')
-#     pdf_display = f'<embed src="data:application/pdf;base64,{base64_pdf}" ' \
-#                   f'width="100%" height="1000" type="application/pdf">'
-#     return pdf_display
-
 latex_delimiters = [{"left": "$$", "right": "$$", "display": True},
                     {"left": '$', "right": '$', "display": False}]
@@ -141,16 +134,29 @@ model_init = init_model()
 logger.info(f"model_init: {model_init}")
 
+
+with open("header.html", "r") as file:
+    header = file.read()
+
 if __name__ == "__main__":
     with gr.Blocks() as demo:
+        gr.HTML(header)
         with gr.Row():
             with gr.Column(variant='panel', scale=5):
                 pdf_show = gr.Markdown()
                 max_pages = gr.Slider(1, 10, 5, step=1, label="Max convert pages")
                 with gr.Row() as bu_flow:
+                    is_ocr = gr.Checkbox(label="Force enable OCR")
                     change_bu = gr.Button("Convert")
                     clear_bu = gr.ClearButton([pdf_show], value="Clear")
                 pdf_show = PDF(label="Please upload pdf", interactive=True, height=800)
+                with gr.Accordion("Examples:"):
+                    example_root = os.path.join(os.path.dirname(__file__), "examples")
+                    gr.Examples(
+                        examples=[os.path.join(example_root, _) for _ in os.listdir(example_root) if
                                   _.endswith("pdf")],
+                        inputs=pdf_show,
+                    )
 
             with gr.Column(variant='panel', scale=5):
                 output_file = gr.File(label="convert result", interactive=False)
@@ -160,8 +166,7 @@ if __name__ == "__main__":
                     latex_delimiters=latex_delimiters, line_breaks=True)
                 with gr.Tab("Markdown text"):
                     md_text = gr.TextArea(lines=45, show_copy_button=True)
-        change_bu.click(fn=to_markdown, inputs=[pdf_show, max_pages], outputs=[md, md_text, output_file, pdf_show])
-        clear_bu.add([md, pdf_show, md_text, output_file])
+        change_bu.click(fn=to_markdown, inputs=[pdf_show, max_pages, is_ocr], outputs=[md, md_text, output_file, pdf_show])
+        clear_bu.add([md, pdf_show, md_text, output_file, is_ocr])
 
-    demo.launch()
+    demo.launch()
\ No newline at end of file