Commit 87af738a authored by sawmice
Browse files

fix: 1. error in OCR/txt mode 2. missing pdf_parse_type field

parent fa113b57
...@@ -51,3 +51,8 @@ class MODEL_NAME: ...@@ -51,3 +51,8 @@ class MODEL_NAME:
UniMerNet_v2_Small = 'unimernet_small' UniMerNet_v2_Small = 'unimernet_small'
RAPID_TABLE = 'rapid_table' RAPID_TABLE = 'rapid_table'
PARSE_TYPE_TXT = 'txt'
PARSE_TYPE_OCR = 'ocr'
...@@ -6,12 +6,14 @@ from typing import Callable ...@@ -6,12 +6,14 @@ from typing import Callable
from magic_pdf.config.enums import SupportedPdfParseMethod from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.data_reader_writer import DataWriter from magic_pdf.data.data_reader_writer import DataWriter
from magic_pdf.data.dataset import Dataset from magic_pdf.data.dataset import Dataset
from magic_pdf.libs.version import __version__
from magic_pdf.filter import classify from magic_pdf.filter import classify
from magic_pdf.libs.draw_bbox import draw_model_bbox from magic_pdf.libs.draw_bbox import draw_model_bbox
from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
from magic_pdf.pipe.operators import PipeResult from magic_pdf.pipe.operators import PipeResult
from magic_pdf.model import InferenceResultBase from magic_pdf.model import InferenceResultBase
from magic_pdf.libs.version import __version__
from magic_pdf.config.constants import PARSE_TYPE_TXT, PARSE_TYPE_OCR
class InferenceResult(InferenceResultBase): class InferenceResult(InferenceResultBase):
def __init__(self, inference_results: list, dataset: Dataset): def __init__(self, inference_results: list, dataset: Dataset):
...@@ -129,7 +131,7 @@ class InferenceResult(InferenceResultBase): ...@@ -129,7 +131,7 @@ class InferenceResult(InferenceResultBase):
res = pdf_parse_union(*args, **kwargs) res = pdf_parse_union(*args, **kwargs)
return PipeResult(res, self._dataset) return PipeResult(res, self._dataset)
return self.apply( res = self.apply(
proc, proc,
self._dataset, self._dataset,
imageWriter, imageWriter,
...@@ -139,6 +141,11 @@ class InferenceResult(InferenceResultBase): ...@@ -139,6 +141,11 @@ class InferenceResult(InferenceResultBase):
debug_mode=debug_mode, debug_mode=debug_mode,
lang=lang, lang=lang,
) )
res['_parse_type'] = PARSE_TYPE_TXT
res['_version_name'] = __version__
return res
def pipe_ocr_mode( def pipe_ocr_mode(
self, self,
...@@ -166,7 +173,7 @@ class InferenceResult(InferenceResultBase): ...@@ -166,7 +173,7 @@ class InferenceResult(InferenceResultBase):
res = pdf_parse_union(*args, **kwargs) res = pdf_parse_union(*args, **kwargs)
return PipeResult(res, self._dataset) return PipeResult(res, self._dataset)
return self.apply( res = self.apply(
proc, proc,
self._dataset, self._dataset,
imageWriter, imageWriter,
...@@ -176,3 +183,7 @@ class InferenceResult(InferenceResultBase): ...@@ -176,3 +183,7 @@ class InferenceResult(InferenceResultBase):
debug_mode=debug_mode, debug_mode=debug_mode,
lang=lang, lang=lang,
) )
res['_parse_type'] = PARSE_TYPE_OCR
res['_version_name'] = __version__
return res
\ No newline at end of file
...@@ -15,9 +15,7 @@ from magic_pdf.libs.version import __version__ ...@@ -15,9 +15,7 @@ from magic_pdf.libs.version import __version__
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
from magic_pdf.config.constants import PARSE_TYPE_TXT, PARSE_TYPE_OCR
PARSE_TYPE_TXT = 'txt'
PARSE_TYPE_OCR = 'ocr'
def parse_txt_pdf( def parse_txt_pdf(
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment