Unverified commit c5a4150e, authored by Xiaomeng Zhao, committed by GitHub
Browse files

Merge pull request #1228 from icecraft/fix/pipe_result

fix: add parse_pdf_type and version
parents 8f266869 57f9f9dc
...@@ -3,17 +3,17 @@ import json ...@@ -3,17 +3,17 @@ import json
import os import os
from typing import Callable from typing import Callable
from magic_pdf.config.constants import PARSE_TYPE_OCR, PARSE_TYPE_TXT
from magic_pdf.config.enums import SupportedPdfParseMethod from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.data_reader_writer import DataWriter from magic_pdf.data.data_reader_writer import DataWriter
from magic_pdf.data.dataset import Dataset from magic_pdf.data.dataset import Dataset
from magic_pdf.libs.version import __version__
from magic_pdf.filter import classify from magic_pdf.filter import classify
from magic_pdf.libs.draw_bbox import draw_model_bbox from magic_pdf.libs.draw_bbox import draw_model_bbox
from magic_pdf.libs.version import __version__
from magic_pdf.model import InferenceResultBase
from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
from magic_pdf.pipe.operators import PipeResult from magic_pdf.pipe.operators import PipeResult
from magic_pdf.model import InferenceResultBase
from magic_pdf.libs.version import __version__
from magic_pdf.config.constants import PARSE_TYPE_TXT, PARSE_TYPE_OCR
class InferenceResult(InferenceResultBase): class InferenceResult(InferenceResultBase):
def __init__(self, inference_results: list, dataset: Dataset): def __init__(self, inference_results: list, dataset: Dataset):
...@@ -129,6 +129,10 @@ class InferenceResult(InferenceResultBase): ...@@ -129,6 +129,10 @@ class InferenceResult(InferenceResultBase):
def proc(*args, **kwargs) -> PipeResult: def proc(*args, **kwargs) -> PipeResult:
res = pdf_parse_union(*args, **kwargs) res = pdf_parse_union(*args, **kwargs)
res['_parse_type'] = PARSE_TYPE_TXT
res['_version_name'] = __version__
if 'lang' in kwargs and kwargs['lang'] is not None:
res['lang'] = kwargs['lang']
return PipeResult(res, self._dataset) return PipeResult(res, self._dataset)
res = self.apply( res = self.apply(
...@@ -141,11 +145,7 @@ class InferenceResult(InferenceResultBase): ...@@ -141,11 +145,7 @@ class InferenceResult(InferenceResultBase):
debug_mode=debug_mode, debug_mode=debug_mode,
lang=lang, lang=lang,
) )
res['_parse_type'] = PARSE_TYPE_TXT
res['_version_name'] = __version__
return res return res
def pipe_ocr_mode( def pipe_ocr_mode(
self, self,
...@@ -171,19 +171,20 @@ class InferenceResult(InferenceResultBase): ...@@ -171,19 +171,20 @@ class InferenceResult(InferenceResultBase):
def proc(*args, **kwargs) -> PipeResult: def proc(*args, **kwargs) -> PipeResult:
res = pdf_parse_union(*args, **kwargs) res = pdf_parse_union(*args, **kwargs)
res['_parse_type'] = PARSE_TYPE_OCR
res['_version_name'] = __version__
if 'lang' in kwargs and kwargs['lang'] is not None:
res['lang'] = kwargs['lang']
return PipeResult(res, self._dataset) return PipeResult(res, self._dataset)
res = self.apply( res = self.apply(
proc, proc,
self._dataset, self._dataset,
imageWriter, imageWriter,
SupportedPdfParseMethod.TXT, SupportedPdfParseMethod.OCR,
start_page_id=start_page_id, start_page_id=start_page_id,
end_page_id=end_page_id, end_page_id=end_page_id,
debug_mode=debug_mode, debug_mode=debug_mode,
lang=lang, lang=lang,
) )
res['_parse_type'] = PARSE_TYPE_OCR return res
res['_version_name'] = __version__
return res
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment