Commit e36a083d authored by icecraft
Browse files

fix: image dataset add lang field

parent f442adfc
......@@ -232,7 +232,7 @@ class PymuDocDataset(Dataset):
self._records[i].set_image(images[i])
class ImageDataset(Dataset):
def __init__(self, bits: bytes):
def __init__(self, bits: bytes, lang=None):
"""Initialize the dataset, which wraps the pymudoc documents.
Args:
......@@ -244,6 +244,17 @@ class ImageDataset(Dataset):
self._raw_data = bits
self._data_bits = pdf_bytes
if lang == '':
self._lang = None
elif lang == 'auto':
from magic_pdf.model.sub_modules.language_detection.utils import \
auto_detect_lang
self._lang = auto_detect_lang(bits)
logger.info(f'lang: {lang}, detect_lang: {self._lang}')
else:
self._lang = lang
logger.info(f'lang: {lang}')
def __len__(self) -> int:
    """Return the number of records held by this dataset."""
    records = self._records
    return len(records)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment