# Copyright 2025 Bytedance Ltd. and/or its affiliates.
# SPDX-License-Identifier: Apache-2.0

import glob
import json
import os
import os.path as osp

from .edit_dataset_jsonl import EditJSONLIterableDataset
from .interleave_datasets import UnifiedEditIterableDataset
from .t2i_dataset import T2IIterableDataset
from .t2i_dataset_jsonl import T2IJSONLIterableDataset
from .vlm_dataset import SftJSONLIterableDataset

# Maps a registry name to the iterable dataset class associated with it.
DATASET_REGISTRY = {
    "sensenova_si_800K": SftJSONLIterableDataset,
    "sensenova_si_8M": SftJSONLIterableDataset,
}

# Populated at import time below: one entry per JSON file found in the
# dataset_info/ directory, keyed by the file's base name (without ".json").
DATASET_INFO = {}

# load additional dataset info from the dataset_info/ directory
dataset_info_path = osp.join(osp.dirname(__file__), "dataset_info")
# glob.glob() returns results in arbitrary order; sort so that the insertion
# order of DATASET_INFO (and which collision is reported first) is the same
# on every machine.
dataset_info_files = sorted(glob.glob(osp.join(dataset_info_path, "*.json")))

# Root used to expand the "__TRAINING_ROOT__" placeholder. Taken from the
# TRAINING_ROOT environment variable when set; otherwise the directory three
# levels above the directory containing this file.
training_root = os.environ.get(
    "TRAINING_ROOT",
    osp.abspath(osp.join(osp.dirname(__file__), "..", "..", "..")),
)


def _resolve_training_root_path(value):
    """Return *value* with every "__TRAINING_ROOT__" placeholder expanded.

    Strings have each occurrence of the placeholder replaced by the
    module-level ``training_root``. Lists and dicts are rebuilt recursively
    (dict keys are kept as-is; only values are processed). Any other type is
    returned unchanged. The input object is never mutated.
    """
    if isinstance(value, str):
        return value.replace("__TRAINING_ROOT__", training_root)
    if isinstance(value, list):
        return [_resolve_training_root_path(v) for v in value]
    if isinstance(value, dict):
        return {k: _resolve_training_root_path(v) for k, v in value.items()}
    return value


for dataset_info_file in dataset_info_files:
    base_name = osp.splitext(osp.basename(dataset_info_file))[0]
    # JSON interchange text is UTF-8; be explicit instead of relying on the
    # platform's locale encoding. Keep the file open only while parsing.
    with open(dataset_info_file, "r", encoding="utf-8") as f:
        dataset_info = _resolve_training_root_path(json.load(f))
    # NOTE(review): this compares the file's top-level keys against
    # DATASET_INFO, whose own keys are file base names. Behaviour is kept
    # as-is; confirm whether the intent was to detect duplicate dataset names
    # across files instead.
    for key in dataset_info:
        if key in DATASET_INFO:
            raise ValueError(f"Key {key} already exists in DATASET_INFO")
    DATASET_INFO.update({base_name: dataset_info})