Commit ef39b825 authored by myhloli's avatar myhloli
Browse files

fix(UNIPipe): add language parameter support for document analysis

Introduce a new parameter `lang` to the `UNIPipe` class's `parse_union_pdf` function to allow
specifying the language for document analysis. This enhancement enables users to customizethe language setting for better processing of multilingual documents.
parent f9841a0c
......@@ -39,7 +39,8 @@ class UNIPipe(AbsPipe):
if self.pdf_type == self.PIP_TXT:
self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer,
is_debug=self.is_debug, input_model_is_empty=self.input_model_is_empty,
start_page_id=self.start_page_id, end_page_id=self.end_page_id)
start_page_id=self.start_page_id, end_page_id=self.end_page_id,
lang=self.lang)
elif self.pdf_type == self.PIP_OCR:
self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer,
is_debug=self.is_debug,
......
......@@ -71,7 +71,7 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
input_model_is_empty: bool = False,
start_page_id=0, end_page_id=None,
start_page_id=0, end_page_id=None, lang=None,
*args, **kwargs):
"""
ocr和文本混合的pdf,全部解析出来
......@@ -95,9 +95,11 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
if input_model_is_empty:
pdf_models = doc_analyze(pdf_bytes, ocr=True,
pdf_models = doc_analyze(pdf_bytes,
ocr=True,
start_page_id=start_page_id,
end_page_id=end_page_id)
end_page_id=end_page_id,
lang=lang)
pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
if pdf_info_dict is None:
raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment