Unverified commit 8a0aa7a4, authored by Xiaomeng Zhao, committed by GitHub
Browse files

Merge branch 'dev' into dev

parents 2e1bf881 ad9abc32
import os
from magic_pdf.config.exceptions import InvalidConfig, InvalidParams from magic_pdf.config.exceptions import InvalidConfig, InvalidParams
from magic_pdf.data.data_reader_writer.base import DataReader, DataWriter from magic_pdf.data.data_reader_writer.base import DataReader, DataWriter
from magic_pdf.data.io.s3 import S3Reader, S3Writer from magic_pdf.data.io.s3 import S3Reader, S3Writer
...@@ -22,10 +22,10 @@ class MultiS3Mixin: ...@@ -22,10 +22,10 @@ class MultiS3Mixin:
""" """
if len(default_prefix) == 0: if len(default_prefix) == 0:
raise InvalidConfig('default_prefix must be provided') raise InvalidConfig('default_prefix must be provided')
arr = default_prefix.strip("/").split("/") arr = default_prefix.strip('/').split('/')
self.default_bucket = arr[0] self.default_bucket = arr[0]
self.default_prefix = "/".join(arr[1:]) self.default_prefix = '/'.join(arr[1:])
found_default_bucket_config = False found_default_bucket_config = False
for conf in s3_configs: for conf in s3_configs:
...@@ -103,7 +103,8 @@ class MultiBucketS3DataReader(DataReader, MultiS3Mixin): ...@@ -103,7 +103,8 @@ class MultiBucketS3DataReader(DataReader, MultiS3Mixin):
s3_reader = self.__get_s3_client(bucket_name) s3_reader = self.__get_s3_client(bucket_name)
else: else:
s3_reader = self.__get_s3_client(self.default_bucket) s3_reader = self.__get_s3_client(self.default_bucket)
path = os.path.join(self.default_prefix, path) if self.default_prefix:
path = self.default_prefix + '/' + path
return s3_reader.read_at(path, offset, limit) return s3_reader.read_at(path, offset, limit)
...@@ -139,5 +140,6 @@ class MultiBucketS3DataWriter(DataWriter, MultiS3Mixin): ...@@ -139,5 +140,6 @@ class MultiBucketS3DataWriter(DataWriter, MultiS3Mixin):
s3_writer = self.__get_s3_client(bucket_name) s3_writer = self.__get_s3_client(bucket_name)
else: else:
s3_writer = self.__get_s3_client(self.default_bucket) s3_writer = self.__get_s3_client(self.default_bucket)
path = os.path.join(self.default_prefix, path) if self.default_prefix:
path = self.default_prefix + '/' + path
return s3_writer.write(path, data) return s3_writer.write(path, data)
...@@ -91,7 +91,8 @@ def chars_to_content(span): ...@@ -91,7 +91,8 @@ def chars_to_content(span):
content = '' content = ''
for char in span['chars']: for char in span['chars']:
# 如果下一个char的x0和上一个char的x1距离超过一个字符宽度,则需要在中间插入一个空格
# 如果下一个char的x0和上一个char的x1距离超过0.25个字符宽度,则需要在中间插入一个空格
char1 = char char1 = char
char2 = span['chars'][span['chars'].index(char) + 1] if span['chars'].index(char) + 1 < len(span['chars']) else None char2 = span['chars'][span['chars'].index(char) + 1] if span['chars'].index(char) + 1 < len(span['chars']) else None
if char2 and char2['bbox'][0] - char1['bbox'][2] > char_avg_width * 0.25 and char['c'] != ' ' and char2['c'] != ' ': if char2 and char2['bbox'][0] - char1['bbox'][2] > char_avg_width * 0.25 and char['c'] != ' ' and char2['c'] != ' ':
......
...@@ -13,7 +13,7 @@ from magic_pdf.config.enums import SupportedPdfParseMethod ...@@ -13,7 +13,7 @@ from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.data_reader_writer import FileBasedDataWriter from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.data.dataset import PymuDocDataset from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.model.operators import InferenceResult from magic_pdf.operators.models import InferenceResult
model_config.__use_inside_model__ = True model_config.__use_inside_model__ = True
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment