Unverified commit c5a4150e, authored by Xiaomeng Zhao, committed by GitHub
Browse files

Merge pull request #1228 from icecraft/fix/pipe_result

fix: add parse_pdf_type and version
parents 8f266869 57f9f9dc
......@@ -3,17 +3,17 @@ import json
import os
from typing import Callable
from magic_pdf.config.constants import PARSE_TYPE_OCR, PARSE_TYPE_TXT
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.data_reader_writer import DataWriter
from magic_pdf.data.dataset import Dataset
from magic_pdf.libs.version import __version__
from magic_pdf.filter import classify
from magic_pdf.libs.draw_bbox import draw_model_bbox
from magic_pdf.libs.version import __version__
from magic_pdf.model import InferenceResultBase
from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
from magic_pdf.pipe.operators import PipeResult
from magic_pdf.model import InferenceResultBase
from magic_pdf.libs.version import __version__
from magic_pdf.config.constants import PARSE_TYPE_TXT, PARSE_TYPE_OCR
class InferenceResult(InferenceResultBase):
def __init__(self, inference_results: list, dataset: Dataset):
......@@ -129,6 +129,10 @@ class InferenceResult(InferenceResultBase):
def proc(*args, **kwargs) -> PipeResult:
    """Parse via ``pdf_parse_union`` and wrap the tagged result.

    All positional and keyword arguments are forwarded unchanged to
    ``pdf_parse_union``. The returned mapping is then annotated with:

    * ``'_parse_type'``   -- set to ``PARSE_TYPE_TXT``
    * ``'_version_name'`` -- set to the package ``__version__``
    * ``'lang'``          -- copied from the ``lang`` keyword argument,
      but only when that keyword was supplied and is not ``None``

    Returns:
        A ``PipeResult`` built from the annotated mapping and
        ``self._dataset``.
    """
    # NOTE(review): ``self`` is not a parameter here; it is presumably
    # captured from the enclosing method's scope -- confirm against the
    # full file.
    parsed = pdf_parse_union(*args, **kwargs)
    parsed['_parse_type'] = PARSE_TYPE_TXT
    parsed['_version_name'] = __version__

    # A missing key and an explicit None are treated the same way:
    # neither one writes a 'lang' entry into the result.
    requested_lang = kwargs.get('lang')
    if requested_lang is not None:
        parsed['lang'] = requested_lang

    return PipeResult(parsed, self._dataset)
res = self.apply(
......@@ -141,11 +145,7 @@ class InferenceResult(InferenceResultBase):
debug_mode=debug_mode,
lang=lang,
)
res['_parse_type'] = PARSE_TYPE_TXT
res['_version_name'] = __version__
return res
def pipe_ocr_mode(
self,
......@@ -171,19 +171,20 @@ class InferenceResult(InferenceResultBase):
def proc(*args, **kwargs) -> PipeResult:
    """Parse via ``pdf_parse_union`` and wrap the tagged result.

    All positional and keyword arguments are forwarded unchanged to
    ``pdf_parse_union``. The returned mapping is then annotated with:

    * ``'_parse_type'``   -- set to ``PARSE_TYPE_OCR``
    * ``'_version_name'`` -- set to the package ``__version__``
    * ``'lang'``          -- copied from the ``lang`` keyword argument,
      but only when that keyword was supplied and is not ``None``

    Returns:
        A ``PipeResult`` built from the annotated mapping and
        ``self._dataset``.
    """
    # NOTE(review): ``self`` is not a parameter here; it is presumably
    # captured from the enclosing method's scope -- confirm against the
    # full file.
    parsed = pdf_parse_union(*args, **kwargs)
    parsed['_parse_type'] = PARSE_TYPE_OCR
    parsed['_version_name'] = __version__

    # A missing key and an explicit None are treated the same way:
    # neither one writes a 'lang' entry into the result.
    requested_lang = kwargs.get('lang')
    if requested_lang is not None:
        parsed['lang'] = requested_lang

    return PipeResult(parsed, self._dataset)
res = self.apply(
proc,
self._dataset,
imageWriter,
SupportedPdfParseMethod.TXT,
SupportedPdfParseMethod.OCR,
start_page_id=start_page_id,
end_page_id=end_page_id,
debug_mode=debug_mode,
lang=lang,
)
res['_parse_type'] = PARSE_TYPE_OCR
res['_version_name'] = __version__
return res
\ No newline at end of file
return res
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment