"...python/git@developer.sourcefind.cn:zhaoyu6/sglang.git" did not exist on "de1350ea20530e0744b48d0d50415fa2ff5122cd"
Commit e36a083d authored by icecraft
Browse files

fix: image dataset add lang field

parent f442adfc
...@@ -232,7 +232,7 @@ class PymuDocDataset(Dataset): ...@@ -232,7 +232,7 @@ class PymuDocDataset(Dataset):
self._records[i].set_image(images[i]) self._records[i].set_image(images[i])
class ImageDataset(Dataset): class ImageDataset(Dataset):
def __init__(self, bits: bytes): def __init__(self, bits: bytes, lang=None):
"""Initialize the dataset, which wraps the pymudoc documents. """Initialize the dataset, which wraps the pymudoc documents.
Args: Args:
...@@ -244,6 +244,17 @@ class ImageDataset(Dataset): ...@@ -244,6 +244,17 @@ class ImageDataset(Dataset):
self._raw_data = bits self._raw_data = bits
self._data_bits = pdf_bytes self._data_bits = pdf_bytes
if lang == '':
self._lang = None
elif lang == 'auto':
from magic_pdf.model.sub_modules.language_detection.utils import \
auto_detect_lang
self._lang = auto_detect_lang(bits)
logger.info(f'lang: {lang}, detect_lang: {self._lang}')
else:
self._lang = lang
logger.info(f'lang: {lang}')
def __len__(self) -> int: def __len__(self) -> int:
"""The length of the dataset.""" """The length of the dataset."""
return len(self._records) return len(self._records)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment