Unverified commit dfb3cbfb, authored by Xiaomeng Zhao and committed by GitHub
Browse files

Merge pull request #2126 from icecraft/fix/image_ds_add_lang

fix: image dataset add lang field
parents f442adfc e36a083d
......@@ -232,7 +232,7 @@ class PymuDocDataset(Dataset):
self._records[i].set_image(images[i])
class ImageDataset(Dataset):
def __init__(self, bits: bytes):
def __init__(self, bits: bytes, lang=None):
"""Initialize the dataset, which wraps the pymudoc documents.
Args:
......@@ -244,6 +244,17 @@ class ImageDataset(Dataset):
self._raw_data = bits
self._data_bits = pdf_bytes
if lang == '':
self._lang = None
elif lang == 'auto':
from magic_pdf.model.sub_modules.language_detection.utils import \
auto_detect_lang
self._lang = auto_detect_lang(bits)
logger.info(f'lang: {lang}, detect_lang: {self._lang}')
else:
self._lang = lang
logger.info(f'lang: {lang}')
def __len__(self) -> int:
    """Return the number of records held by this dataset.

    Returns:
        int: the size of the internal ``_records`` collection.
    """
    record_count = len(self._records)
    return record_count
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment