Commit 7c19b3a8 authored by wangsen's avatar wangsen
Browse files

Initial commit

parents
Pipeline #1721 failed with stages
in 0 seconds
# Build the pybind11 `helpers` extension module from helpers.cpp.

# Optimized, position-independent shared object compiled as C++11.
CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color
# Header search paths reported by pybind11.
CPPFLAGS += $(shell python3 -m pybind11 --includes)

LIBNAME = helpers
# Extension suffix reported by python3-config for the active interpreter.
LIBEXT = $(shell python3-config --extension-suffix)

# `default` does not name a file, so keep make from ever treating it as one.
.PHONY: default
default: $(LIBNAME)$(LIBEXT)

# Pattern rule: <name><suffix> is built from <name>.cpp.
# The recipe line must start with a tab character.
%$(LIBEXT): %.cpp
	$(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from dataclasses import dataclass
from typing import Dict, List, Optional, Union
import numpy
from megatron.core.datasets.indexed_dataset import IndexedDataset
from megatron.core.datasets.masked_dataset import (
MaskedWordPieceDataset,
MaskedWordPieceDatasetConfig,
)
from megatron.core.datasets.utils import Split
@dataclass
class BERTMaskedWordPieceDatasetConfig(MaskedWordPieceDatasetConfig):
    """Configuration object for Megatron Core BERT WordPiece datasets"""

    # Annotated Optional because the default is None; __post_init__ requires the
    # caller to replace it with an explicit True/False.
    classification_head: Optional[bool] = None
    """Option to perform the next sequence prediction during sampling"""

    def __post_init__(self) -> None:
        """Run the parent checks, then require that classification_head was set

        Raises:
            AssertionError: When classification_head is left as None
        """
        super().__post_init__()

        assert (
            self.classification_head is not None
        ), "classification_head must be set to True or False"
class BERTMaskedWordPieceDataset(MaskedWordPieceDataset):
    """The BERT dataset that assumes WordPiece tokenization

    Args:
        indexed_dataset (IndexedDataset): The IndexedDataset around which to build the MegatronDataset

        dataset_path (str): The real path on disk to the dataset, for bookkeeping

        indexed_indices (numpy.ndarray): The set of the documents indices to expose

        num_samples (Optional[int]): The number of samples to draw from the indexed dataset. When None, build as many samples as correspond to one epoch.

        index_split (Split): The indexed_indices Split

        config (BERTMaskedWordPieceDatasetConfig): The config
    """

    def __init__(
        self,
        indexed_dataset: IndexedDataset,
        dataset_path: str,
        indexed_indices: numpy.ndarray,
        num_samples: Optional[int],
        index_split: Split,
        config: BERTMaskedWordPieceDatasetConfig,
    ) -> None:
        super().__init__(
            indexed_dataset, dataset_path, indexed_indices, num_samples, index_split, config
        )

        # Candidate ids for the "replace with a random token" masking branch
        self.token_lookup = list(self.config.tokenizer.inv_vocab.keys())

        # Account for the single <cls> and two <sep> token ids
        self.sample_index = self._build_sample_index(
            self.config.sequence_length - 3, 2 if self.config.classification_head else 1
        )

    @staticmethod
    def _key_config_attributes() -> List[str]:
        """Inherited method implementation

        Returns:
            List[str]: The key config attributes
        """
        # Zero-argument super() is unavailable inside a staticmethod, hence the explicit form
        return super(
            BERTMaskedWordPieceDataset, BERTMaskedWordPieceDataset
        )._key_config_attributes() + ["classification_head"]

    def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]:
        """Abstract method implementation

        Args:
            idx (int): The index into the dataset

        Returns:
            Dict[str, Union[int, numpy.ndarray]]: The sample, with keys "text" (padded token
            ids), "types" (segment assignments), "labels" (original ids at the masked positions,
            -1 elsewhere), "is_random" (1 when segments A and B were swapped), "padding_mask"
            (1 over real tokens, 0 over padding), "loss_mask" (1 at the masked positions) and
            "truncated" (1 when the segments had to be trimmed)
        """
        idx_beg, idx_end, target_sequence_length = self.sample_index[idx]
        sample = [self.dataset[i] for i in range(idx_beg, idx_end)]

        # Seed per index so that a given sample is reproducible
        numpy_random_state = numpy.random.RandomState(
            seed=(self.config.random_seed + idx) % 2 ** 32
        )

        assert target_sequence_length <= self.config.sequence_length

        # Split the sample into contiguous subsegments A and B
        pivot = len(sample)
        is_next_random = False
        if self.config.classification_head:
            assert len(sample) > 1, "the sample must contain at least two sentences"
            pivot = 1
            if len(sample) >= 3:
                pivot = numpy_random_state.randint(low=1, high=len(sample))
            is_next_random = numpy_random_state.random() < 0.5

        split_A = []
        for sample_a in sample[:pivot]:
            split_A.extend(sample_a)
        split_B = []
        for sample_b in sample[pivot:]:
            split_B.extend(sample_b)
        if is_next_random:
            split_A, split_B = split_B, split_A

        # Trim the subsegments from either end to a desired joint length
        length_A = len(split_A)
        length_B = len(split_B)
        truncated = length_A + length_B > target_sequence_length
        while length_A + length_B > target_sequence_length:
            # Always shorten the longer subsegment, from a randomly chosen end
            split = split_A if length_A > length_B else split_B
            if numpy_random_state.random() < 0.5:
                del split[0]
            else:
                del split[-1]
            length_A = len(split_A)
            length_B = len(split_B)

        # Merge the subsegments and create the token assignment labels
        tokens = [
            self.config.tokenizer.cls,
            *split_A,
            self.config.tokenizer.sep,
        ]
        assignments = [0 for _ in range(1 + len(split_A) + 1)]
        if split_B:
            tokens += [*split_B, self.config.tokenizer.sep]
            assignments += [1 for _ in range(len(split_B) + 1)]

        # Masking
        tokens, masked_positions, masked_labels, _, _ = self._create_masked_lm_predictions(
            tokens, target_sequence_length, numpy_random_state
        )

        # Pad the sequences and convert to NumPy
        length_toks = len(tokens)
        length_pads = self.config.sequence_length - length_toks
        assert length_pads >= 0

        tokens = numpy.array(tokens, dtype=numpy.int64)
        tokens = numpy.pad(tokens, (0, length_pads), constant_values=self.config.tokenizer.pad)

        # NOTE(review): the segment ids are padded with the pad token id, kept as in the
        # original; confirm this is intended for tokenizers whose pad id is not 0
        assignments = numpy.array(assignments, dtype=numpy.int64)
        assignments = numpy.pad(
            assignments, (0, length_pads), constant_values=self.config.tokenizer.pad
        )

        # Get the padding mask. Padding positions must be 0 regardless of the pad token id,
        # otherwise the mask is not binary when the tokenizer's pad id is nonzero
        mask_pads = numpy.ones(length_toks, dtype=numpy.int64)
        mask_pads = numpy.pad(mask_pads, (0, length_pads), constant_values=0)

        # Mask the labels
        labels = numpy.zeros(self.config.sequence_length, dtype=numpy.int64) - 1
        labels[masked_positions] = masked_labels

        # Get the loss mask
        mask_loss = numpy.zeros(self.config.sequence_length, dtype=numpy.int64)
        mask_loss[masked_positions] = 1

        return {
            "text": tokens,
            "types": assignments,
            "labels": labels,
            "is_random": int(is_next_random),
            "padding_mask": mask_pads,
            "loss_mask": mask_loss,
            "truncated": int(truncated),
        }

    def _get_token_mask(self, numpy_random_state: numpy.random.RandomState) -> Optional[int]:
        """Abstract method implementation

        80% of the time, replace the token id with mask token id. 10% of the time, replace token id
        with a random token id from the vocabulary. 10% of the time, do nothing.

        Args:
            numpy_random_state (RandomState): The NumPy random state

        Returns:
            Optional[int]: The replacement token id or None
        """
        if numpy_random_state.random() < 0.8:
            return self.config.tokenizer.mask
        else:
            # The remaining 20% is halved between a random token and no replacement
            if numpy_random_state.random() >= 0.5:
                return self.token_lookup[numpy_random_state.randint(0, len(self.token_lookup))]
        return None
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
import hashlib
import json
import logging
import os
import time
from collections import OrderedDict
from typing import Dict, List, Optional, Tuple, Union
import numpy
import torch
from megatron.core.datasets.blended_megatron_dataset_config import BlendedMegatronDatasetConfig
from megatron.core.datasets.megatron_dataset import MegatronDataset
from megatron.core.datasets.utils import normalize
from megatron.core.utils import log_single_rank
# Module-level logger, named after this module
logger = logging.getLogger(__name__)

# Verbosity flag forwarded to helpers.build_blending_indices
_VERBOSE = False
class BlendedDataset(torch.utils.data.Dataset):
    """Conjugating class for a set of MegatronDataset instances

    Args:
        datasets (List[MegatronDataset]): The MegatronDataset instances to blend

        weights (List[Union[int, float]]): The weights that determine the dataset blend ratios

        size (Optional[int]): The number of samples to draw from the blend. If None, for each dataset index idx draw exactly weights[idx] samples from datasets[idx].

        config (BlendedMegatronDatasetConfig): The config
    """

    def __init__(
        self,
        datasets: List[MegatronDataset],
        weights: List[Union[int, float]],
        size: Optional[int],
        config: BlendedMegatronDatasetConfig,
    ) -> None:
        assert len(datasets) == len(weights)
        # The dataset index is stored as int16, so dataset ids must fit in it
        assert len(datasets) < 32767
        assert all(map(lambda _: type(_) == type(datasets[0]), datasets))
        assert all(map(lambda _: _.index_split == datasets[0].index_split, datasets))
        assert all(map(lambda _: _ > 0, weights))
        assert all(map(lambda _: type(_) == type(weights[0]), weights))
        if size is None and isinstance(weights[0], float):
            # Without a size, the weights are sample counts and so must be whole numbers
            assert all(map(lambda _: _ == int(_), weights))

        # Alert user to unnecessary blending
        if len(datasets) == 1:
            log_single_rank(
                logger, logging.WARNING, f"Building a BlendedDataset for a single MegatronDataset"
            )

        if size is not None:
            weights = normalize(weights)

        self.datasets = datasets
        self.split = self.datasets[0].index_split
        self.weights = weights
        self.size = size
        self.config = config

        # Everything that determines the indices goes into the description, whose hash
        # names the cache files
        unique_identifiers = OrderedDict()
        unique_identifiers["class"] = type(self).__name__
        unique_identifiers["datasets"] = [dataset.unique_identifiers for dataset in self.datasets]
        unique_identifiers["split"] = self.split.name
        unique_identifiers["weights"] = self.weights
        unique_identifiers["size"] = self.size

        self.unique_description = json.dumps(
            unique_identifiers, indent=4, default=lambda obj: obj.unique_identifiers
        )
        self.unique_description_hash = hashlib.md5(
            self.unique_description.encode("utf-8")
        ).hexdigest()

        self.dataset_index, self.dataset_sample_index = self._build_indices()

    def __len__(self) -> int:
        """Return the number of samples in the blend"""
        return self.dataset_index.shape[0]

    def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]:
        """Return the sample at idx from the dataset it maps to, tagged with "dataset_id"

        Args:
            idx (int): The index into the blend

        Returns:
            Dict[str, Union[int, numpy.ndarray]]: The underlying sample plus its "dataset_id"
        """
        dataset_id = self.dataset_index[idx]
        dataset_sample_id = self.dataset_sample_index[idx]
        return {
            "dataset_id": dataset_id,
            **self.datasets[dataset_id][dataset_sample_id],
        }

    def _build_indices(self) -> Tuple[numpy.ndarray, numpy.ndarray]:
        """Build and optionally cache the dataset index and the dataset sample index

        The dataset index is a 1-D mapping which determines the dataset to query. The dataset
        sample index is a 1-D mapping which determines the sample to request from the queried
        dataset.

        Returns:
            Tuple[numpy.ndarray, numpy.ndarray]: The dataset index and the dataset sample index
        """
        path_to_cache = self.config.path_to_cache

        if path_to_cache:
            get_path_to = lambda suffix: os.path.join(
                path_to_cache,
                f"{self.unique_description_hash}-{type(self).__name__}-{self.split.name}-{suffix}",
            )
            path_to_description = get_path_to("description.txt")
            path_to_dataset_index = get_path_to("dataset_index.npy")
            path_to_dataset_sample_index = get_path_to("dataset_sample_index.npy")
            cache_hit = all(
                map(
                    os.path.isfile,
                    [path_to_description, path_to_dataset_index, path_to_dataset_sample_index],
                )
            )
        else:
            cache_hit = False

        # Build when there is no cache at all, or on a cache miss when this process is the
        # one responsible for writing (rank 0, or any process when torch.distributed is not
        # initialized, in which case get_rank() cannot be called)
        is_writer = not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0
        if not path_to_cache or (not cache_hit and is_writer):
            log_single_rank(
                logger, logging.INFO, f"Build and save the {type(self).__name__} indices",
            )

            # Build the dataset and dataset sample indexes
            log_single_rank(
                logger, logging.INFO, f"\tBuild and save the dataset and dataset sample indexes"
            )
            t_beg = time.time()
            from megatron.core.datasets import helpers

            if self.size is not None:
                dataset_index = numpy.zeros(self.size, dtype=numpy.int16)
                dataset_sample_index = numpy.zeros(self.size, dtype=numpy.int64)
                helpers.build_blending_indices(
                    dataset_index,
                    dataset_sample_index,
                    self.weights,
                    len(self.datasets),
                    self.size,
                    _VERBOSE,
                )
            else:
                # The weights may be integral-valued floats (see __init__), but the array
                # length passed to numpy.zeros must be an int
                size = int(sum(self.weights))
                dataset_index = numpy.zeros(size, dtype=numpy.int16)
                dataset_sample_index = numpy.zeros(size, dtype=numpy.int64)
                helpers.build_exhaustive_blending_indices(
                    dataset_index, dataset_sample_index, self.weights, len(self.datasets)
                )

            if path_to_cache:
                os.makedirs(path_to_cache, exist_ok=True)
                # Write the description
                with open(path_to_description, "wt") as writer:
                    writer.write(self.unique_description)
                # Save the indexes
                numpy.save(path_to_dataset_index, dataset_index, allow_pickle=True)
                numpy.save(path_to_dataset_sample_index, dataset_sample_index, allow_pickle=True)
            else:
                log_single_rank(
                    logger,
                    logging.WARNING,
                    f"Unable to save the {type(self).__name__} indexes because path_to_cache is None",
                )

            t_end = time.time()
            log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:.4f} seconds")

            return dataset_index, dataset_sample_index

        log_single_rank(logger, logging.INFO, f"Load the {type(self).__name__} indices")

        log_single_rank(
            logger, logging.INFO, f"\tLoad the dataset index from {path_to_dataset_index}"
        )
        t_beg = time.time()
        # NOTE(review): allow_pickle=True lets numpy unpickle objects from the cache file;
        # the arrays saved above are plain integer arrays, so consider dropping it
        dataset_index = numpy.load(path_to_dataset_index, allow_pickle=True, mmap_mode='r')
        t_end = time.time()
        log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:.4f} seconds")

        log_single_rank(
            logger,
            logging.INFO,
            f"\tLoad the dataset sample index from {path_to_dataset_sample_index}",
        )
        t_beg = time.time()
        dataset_sample_index = numpy.load(
            path_to_dataset_sample_index, allow_pickle=True, mmap_mode='r'
        )
        t_end = time.time()
        log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:.4f} seconds")

        return dataset_index, dataset_sample_index
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
import logging
import math
from concurrent.futures import ThreadPoolExecutor
from typing import Any, Callable, Iterable, List, Optional, Type, Union
import numpy
import torch
from megatron.core.datasets.blended_dataset import BlendedDataset
from megatron.core.datasets.blended_megatron_dataset_config import BlendedMegatronDatasetConfig
from megatron.core.datasets.megatron_dataset import LowLevelDataset, MegatronDataset
from megatron.core.datasets.utils import Split, normalize
from megatron.core.parallel_state import get_virtual_pipeline_model_parallel_rank
from megatron.core.utils import log_single_rank
# Module-level logger, named after this module
logger = logging.getLogger(__name__)

# A mid-level dataset is a MegatronDataset
MidLevelDataset = MegatronDataset

# A top-level dataset is either a BlendedDataset or a mid-level dataset on its own
TopLevelDataset = Union[BlendedDataset, MidLevelDataset]

# Any dataset type that build_generic_dataset is annotated to construct
DistributedDataset = Union[
    TopLevelDataset, MidLevelDataset, LowLevelDataset, torch.utils.data.Dataset
]
class BlendedMegatronDatasetBuilder(object):
    """Builder class for the BlendedDataset and MegatronDataset classes

    Args:
        cls (Type[MegatronDataset]): The class to instantiate, must inherit from MegatronDataset

        sizes (List[Optional[int]]): The minimum total number of samples to draw, or None, per split

        is_built_on_rank (Callable): A callable which returns True if the dataset should be built on the current rank and False otherwise. It should be Megatron Core parallelism aware i.e. global rank, local group rank, and virtual rank may inform its return value.

        config (BlendedMegatronDatasetConfig): The config object which informs dataset creation
    """

    def __init__(
        self,
        cls: Type[MidLevelDataset],
        sizes: List[int],
        is_built_on_rank: Callable,
        config: BlendedMegatronDatasetConfig,
    ):
        self.cls = cls
        self.sizes = sizes
        self.is_built_on_rank = is_built_on_rank
        self.config = config

        log_single_rank(
            logger,
            logging.INFO,
            f"Building dataset splits with cls={cls.__name__}, sizes={self.sizes}, and config={self.config}",
        )

        if not self.config.mock:
            # A split without a size can only be built when its blend has no weights
            for split in Split:
                size_is_none = self.sizes[split.value] is None
                if self.config.blend_per_split is None:
                    weights_are_none = self.config.blend[1] is None
                else:
                    if self.config.blend_per_split[split.value] is None:
                        continue
                    weights_are_none = self.config.blend_per_split[split.value][1] is None
                if size_is_none:
                    assert (
                        weights_are_none
                    ), f"size_is_none => weights_are_none fails for {split.name} split"

        if torch.distributed.is_initialized():
            gb_rank = torch.distributed.get_rank()
            vp_rank = get_virtual_pipeline_model_parallel_rank()
            if gb_rank == 0 and (vp_rank == 0 or vp_rank is None):
                assert (
                    self.is_built_on_rank()
                ), "is_built_on_rank must return True when global rank = 0 and vp rank = 0"

    def build(self) -> List[Optional[TopLevelDataset]]:
        """Build all dataset splits according to the provided blend(s)

        This method is distributed-aware and must be called on all ranks.

        The dataset splits returned can vary according to the config. Supply config.blend and
        config.split to build BlendedDataset and/or MegatronDataset splits from the same
        distribution. Supply config.blend_per_split to build BlendedDataset and/or MegatronDataset
        splits from separate distributions. In either case, for each split, handle the following
        cases:

        (1) The split is None
            - do nothing

        (2) The split has one contributing dataset, and...

            (a) 'size' is not None
                - Build a mid-level dataset with low-level dataset sampling in proportion to the size

            (b) 'size' is None
                - Build mid-level datasets with no excess low-level dataset sampling

        (3) The split has multiple contributing datasets, and...

            (a) 'weights' is not None and 'size' is not None
                - Build mid-level datasets with low-level dataset sampling in proportion to their weights and the size
                - Build a top-level dataset of length marginally greater than 'size' with mid-level dataset sampling in proportion to their weights and the size

            (b) 'weights' is not None and 'size' is None
                - Error

            (c) 'weights' is None and 'size' is not None
                - Build mid-level datasets with no excess low-level dataset sampling
                - Build a top-level dataset of length 'size' with mid-level dataset sampling in proportion to their lengths and the size
                - The 'size' of the top-level dataset is capped at the sum of the mid-level dataset lengths

            (d) 'weights' is None and 'size' is None
                - Build mid-level datasets with no excess low-level dataset sampling
                - Build a top-level dataset with no excess mid-level dataset sampling

        Returns:
            List[Optional[TopLevelDataset]]: A list containing a dataset instance (or None) per split

        Raises:
            IndexError: When a blend requests more samples from a mid-level dataset than it holds
        """
        datasets = self._build_blended_dataset_splits()

        for dataset in datasets:
            if dataset is not None and len(dataset) > 0:
                if isinstance(dataset, BlendedDataset):
                    # Check blend size
                    assert dataset.size is None or dataset.size == dataset.dataset_index.shape[0]
                    # Check blend access of mid-level datasets. numpy.unique only reports the
                    # dataset ids that actually occur, so pair each count with its own id
                    # rather than with its position in dataset.datasets
                    dataset_indices, dataset_sizes = numpy.unique(
                        dataset.dataset_index, return_counts=True
                    )
                    for index, size in zip(dataset_indices, dataset_sizes):
                        mid_level_dataset = dataset.datasets[index]
                        if len(mid_level_dataset) < size:
                            raise IndexError(
                                f"{type(dataset).__name__} blend goes out of bounds for "
                                f"{type(mid_level_dataset).__name__} {index} "
                                f"for {dataset.split.name} split"
                            )

        return datasets

    def _build_blended_dataset_splits(self) -> List[Optional[TopLevelDataset]]:
        """Build all dataset splits according to the provided blend(s)

        See the BlendedMegatronDatasetBuilder.build alias for more information.

        Returns:
            List[Optional[TopLevelDataset]]: A list containing a dataset instance (or None) per split
        """
        ##
        # Return fake "mock" datasets
        ##
        if self.config.mock:
            split = self.config.split_matrix
            try:
                return self._build_megatron_dataset_splits(None, split, self.sizes)
            except Exception as error:
                raise Exception(
                    f"{self.cls.__name__} failed to build as a mock data generator"
                ) from error

        ##
        # All splits come from the same distribution
        ##
        elif self.config.blend:
            prefixes, weights = self.config.blend
            if weights is not None:
                weights = normalize(weights)

            split = self.config.split_matrix

            # Blend consists of a single prefix
            if len(prefixes) == 1 and weights is None:
                return self._build_megatron_dataset_splits(prefixes[0], split, self.sizes)

            # Build the mid-level datasets
            if weights is None:
                sizes_per_dataset = [[None for _ in Split] for _ in prefixes]
            else:
                sizes_per_dataset = _get_size_per_split_per_dataset(weights, self.sizes)

            # build each dataset in parallel
            megatron_datasets = self._build_megatron_datasets_parallel(
                prefixes, split, sizes_per_dataset
            )

            # Build the top-level datasets
            blended_datasets = [None] * len(Split)
            for i in range(len(Split)):
                if split[i] is not None:
                    weights_i = weights
                    if weights_i is not None and self.sizes[i] is not None:
                        # Total requested over all datasets for this split
                        size_i = sum(list(zip(*sizes_per_dataset))[i])
                    elif weights_i is None:
                        try:
                            weights_i = [
                                len(megatron_dataset) for megatron_dataset in megatron_datasets[i]
                            ]
                        except TypeError:
                            # len() failed on an entry, e.g. a dataset that is None because
                            # it was not built on this rank
                            weights_i = [0 for _ in prefixes]
                        if self.sizes[i] is not None:
                            size_i = min(self.sizes[i], sum(weights_i))
                        else:
                            size_i = None  # => the size will be sum(weights_i)
                    else:
                        raise RuntimeError
                    blended_datasets[i] = self.build_generic_dataset(
                        BlendedDataset,
                        self.is_built_on_rank,
                        True,  # synchronize_ranks, default behavior to build on rank-0 first
                        megatron_datasets[i],
                        weights_i,
                        size_i,
                        self.config,
                    )

            return blended_datasets

        ##
        # Each split comes from a separate distribution
        ##
        else:
            blended_datasets = [None] * len(Split)
            for i in range(len(Split)):
                # Expose the whole underlying dataset to split i and nothing to the others
                split_spoof = [None] * len(Split)
                split_spoof[i] = (0.0, 1.0)
                sizes_spoof = [0] * len(Split)
                sizes_spoof[i] = self.sizes[i]

                # Blend is provided for the split
                blend = self.config.blend_per_split[i]
                if blend is not None:
                    prefixes, weights = blend
                    if weights is not None:
                        weights = normalize(weights)

                    # Blend consists of a single prefix
                    if len(prefixes) == 1:
                        blended_datasets[i] = self._build_megatron_dataset_splits(
                            prefixes[0], split_spoof, sizes_spoof
                        )[i]
                        continue

                    # Build mid-level datasets
                    if weights is None:
                        sizes_per_dataset = [[None for _ in Split] for _ in prefixes]
                    else:
                        sizes_per_dataset = _get_size_per_split_per_dataset(weights, sizes_spoof)

                    # build each dataset in parallel
                    megatron_datasets = self._build_megatron_datasets_parallel(
                        prefixes, split_spoof, sizes_per_dataset
                    )[i]

                    # Build top-level dataset
                    if weights is not None and self.sizes[i] is not None:
                        size = list(map(sum, zip(*sizes_per_dataset)))[i]
                    elif weights is None:
                        try:
                            weights = [
                                len(megatron_dataset) for megatron_dataset in megatron_datasets
                            ]
                        except TypeError:
                            weights = [0 for _ in prefixes]
                        if self.sizes[i] is not None:
                            size = min(self.sizes[i], sum(weights))
                        else:
                            size = None  # => the size will be sum(weights)
                    else:
                        raise RuntimeError
                    blended_datasets[i] = self.build_generic_dataset(
                        BlendedDataset,
                        self.is_built_on_rank,
                        True,  # synchronize_ranks, default behavior to build on rank-0 first
                        megatron_datasets,
                        weights,
                        size,
                        self.config,
                    )

            return blended_datasets

    def _build_megatron_datasets_parallel(
        self, prefixes: List[str], split: List[float], sizes_per_dataset: List[List[int]],
    ) -> List[List[Optional[MegatronDataset]]]:
        """Build the megatron datasets for a list of prefixes in parallel

        Args:
            prefixes (List[str]): The list of prefix strings

            split (List[float]): The dataset split ratios (must sum to 1.00)

            sizes_per_dataset (List[List[int]]): The number of samples to request
            per MegatronDataset per split

        Returns:
            List[List[Optional[MegatronDataset]]]: For each split, have a list of
            MegatronDataset per prefix
        """

        # Helper function to wrap the threading logic
        def _threading_helper(
            megatron_datasets: List[List[Optional[MegatronDataset]]],
            num_workers: int,
            prefixes: List[str],
            split: List[float],
            sizes_per_dataset: List[List[int]],
        ) -> None:
            with ThreadPoolExecutor(max_workers=num_workers) as executor:
                all_futures = []
                for i, prefix in enumerate(prefixes):
                    all_futures.append(
                        executor.submit(
                            self._build_megatron_dataset_splits,
                            prefix,
                            split,
                            sizes_per_dataset[i],
                            False,  # synchronize_ranks, barrier is called in this function
                        )
                    )
                # Collect in submission order so that each per-split list follows the
                # prefix order; future.result() re-raises any worker exception
                for future in all_futures:
                    megatron_datasets_split = future.result()
                    for j, megatron_dataset in enumerate(megatron_datasets_split):
                        megatron_datasets[j].append(megatron_dataset)

        megatron_datasets = [[] for _ in range(len(Split))]
        num_dataset_builder_threads = self.config.num_dataset_builder_threads

        if torch.distributed.is_initialized():
            rank = torch.distributed.get_rank()
            # First, build on rank 0
            if rank == 0:
                num_workers = num_dataset_builder_threads
                if num_workers > 1:
                    # since only rank 0 is running, scale up the thread count
                    # but not too much to avoid overloading storage on miss path.
                    # if user set num_dataset_builder_threads to 1,
                    # i.e. meant for serial build, do not scale up.
                    num_workers *= min(2, max(1, torch.cuda.device_count()))
                _threading_helper(
                    megatron_datasets, num_workers, prefixes, split, sizes_per_dataset,
                )

            torch.distributed.barrier()

            # Then, build on other ranks; guaranteed to be data_cache hit
            if rank != 0:
                _threading_helper(
                    megatron_datasets,
                    num_dataset_builder_threads,
                    prefixes,
                    split,
                    sizes_per_dataset,
                )
        else:
            _threading_helper(
                megatron_datasets, num_dataset_builder_threads, prefixes, split, sizes_per_dataset,
            )

        return megatron_datasets

    def _build_megatron_dataset_splits(
        self,
        dataset_path: Optional[str],
        split: List[float],
        sizes: List[int],
        synchronize_ranks: bool = True,
    ) -> List[Optional[MidLevelDataset]]:
        """Build each MidLevelDataset split from a single LowLevelDataset

        Args:
            dataset_path (Optional[str]): The path on disk which defines the underlying LowLevelDataset, or None for mock dataset classes

            split (List[Tuple[float, float]]): The dataset split matrix

            sizes (List[int]): The number of total samples to draw from each split

            synchronize_ranks (bool): Whether to call barrier for rank-0 / barrier / other-ranks behavior. Set to False when we enforce this behavior at higher level.

        Returns:
            List[Optional[MidLevelDataset]]: The MidLevelDataset (or None) per split
        """
        # Build the low level dataset
        low_level_dataset = self.cls.build_low_level_dataset(dataset_path, self.config)

        # Build the split indices for the low level dataset
        num_elements = self.cls.numel_low_level_dataset(low_level_dataset)
        split_indices = []
        for i, _ in enumerate(Split):
            if split[i] is not None:
                # Scale the fractional book-ends to element offsets
                beg = int(round(split[i][0] * float(num_elements)))
                end = int(round(split[i][1] * float(num_elements)))
                split_indices.append(numpy.arange(start=beg, stop=end, step=1, dtype=numpy.int32))
            else:
                split_indices.append(None)

        # Build the mid level dataset
        mid_level_datasets = []
        for i, _split in enumerate(Split):
            if split[i] is None:
                mid_level_datasets.append(None)
            else:
                mid_level_datasets.append(
                    self.build_generic_dataset(
                        self.cls,
                        self.is_built_on_rank,
                        synchronize_ranks,
                        low_level_dataset,
                        dataset_path,
                        split_indices[i],
                        sizes[i],
                        _split,
                        self.config,
                    )
                )

        return mid_level_datasets

    @staticmethod
    def build_generic_dataset(
        cls: Union[Type[DistributedDataset], Callable],
        is_built_on_rank: Callable,
        synchronize_ranks: bool,
        *args: Any,
    ) -> Optional[Union[DistributedDataset, Iterable]]:
        """Build the DistributedDataset

        Return None if and only if the underlying dataset class is not built on the current rank
        and torch.distributed is initialized.

        Args:
            cls (Union[Type[DistributedDataset], Callable]): The DistributedDataset class to be built. In special cases, e.g. when we are building the low level dataset for a RawMegatronDataset instance, we can accept a Callable which returns an Iterable.

            is_built_on_rank (Callable): Called with no arguments; the dataset is only built on the current rank when it returns True

            synchronize_ranks (bool): Whether to call barrier for rank-0 / barrier / other-ranks behavior. Set to False when we enforce this behavior at higher level.

            args (Tuple[Any]): The positional arguments used to build the provided DistributedDataset class

        Raises:
            Exception: When the dataset constructor raises an OSError

        Returns:
            Optional[Union[DistributedDataset, Iterable]]: The DistributedDataset instantiation, the Iterable instantiation, or None
        """
        if torch.distributed.is_initialized():
            rank = torch.distributed.get_rank()

            dataset = None

            # First, build on rank 0
            if rank == 0 and is_built_on_rank():
                try:
                    dataset = cls(*args)
                except OSError as err:
                    log = (
                        f"Failed to write dataset materials to the data cache directory. "
                        + f"Please supply a directory to which you have write access via "
                        + f"the path_to_cache attribute in BlendedMegatronDatasetConfig and "
                        + f"retry. Refer to the preserved traceback above for more information."
                    )
                    raise Exception(log) from err

            if synchronize_ranks:
                torch.distributed.barrier()

            # After, build on other ranks
            if rank != 0 and is_built_on_rank():
                dataset = cls(*args)

            return dataset

        return cls(*args)
def _get_size_per_split_per_dataset(
normalized_weights: List[float], target_size_per_split: List[int]
) -> List[List[int]]:
"""Determine the contribution of the MegatronDataset splits to the BlendedDataset splits
Args:
normalized_weights (List[float]): e.g. [0.3, 0.7]
target_size_per_split (List[int]): The number of samples to target for each BlendedDataset split
Returns:
List[List[int]]: The number of samples to request per MegatronDataset per split
"""
assert numpy.isclose(sum(normalized_weights), 1.0)
# Use 0.5% target margin to ensure we satiate the request
sizes_per_dataset = [
[int(math.ceil(target_size * weight * 1.005)) for target_size in target_size_per_split]
for weight in normalized_weights
]
return sizes_per_dataset
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
import functools
import logging
import re
from dataclasses import dataclass, field
from typing import List, Optional, Tuple
from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer
from megatron.core.datasets.utils import Split, log_single_rank, normalize
# Module-level logger, named after this module
logger = logging.getLogger(__name__)
@dataclass
class BlendedMegatronDatasetConfig:
    """Configuration object for Megatron Core datasets"""

    random_seed: int
    """The seed for all RNG during dataset creation."""

    sequence_length: int
    """The sequence length."""

    blend: Optional[Tuple[List[str], Optional[List[float]]]] = None
    """The blend, consisting of a list of dataset prefixes and optionally a list of dataset
       weights. For example, [["dataset-path1", "dataset-path2"], [0.3, 0.7]]. When the weights are
       None, they are inferred from the lengths of the contributing datasets. Not to be used with
       'blend_per_split'. Defaults to None.
    """

    blend_per_split: Optional[List[Optional[Tuple[List[str], Optional[List[float]]]]]] = None
    """A set of blends, as defined above, one for each split distribution. Not to be used with
       'blend'. Defaults to None.
    """

    split: Optional[str] = None
    """The split string, a comma separated weighting for the dataset splits when drawing samples
       from a single distribution. Not to be used with 'blend_per_split'. Defaults to None.
    """

    split_matrix: Optional[List[Tuple[float, float]]] = field(init=False, default=None)
    """The split matrix consisting of non-overlapping book-ends of each split in order. For more
       information, refer to 'convert_split_vector_to_split_matrix'. Created automatically from
       'split'. Not to be passed in to the constructor.
    """

    num_dataset_builder_threads: int = 1
    """The number of threads to use for dataset building."""

    path_to_cache: Optional[str] = None
    """Where all re-useable dataset indices are to be cached."""

    mmap_bin_files: bool = True
    """Whether to mmap the .bin files or use file pointers."""

    mock: bool = field(init=False, default=False)
    """Whether to bypass real data loading and validation in favor of mock data generation.
       Created automatically from 'blend' and 'blend_per_split'. Not to be passed in to the
       constructor.
    """

    tokenizer: Optional[MegatronTokenizer] = None
    """The MegatronTokenizer instance or None. Required for datasets which do online tokenization."""

    def __post_init__(self) -> None:
        """Validate the blend options and derive 'mock', 'split' and 'split_matrix'"""
        per_split = self.blend_per_split

        # Per-split blends: validate them and stop, no split matrix is derived in this mode
        if per_split is not None and any(per_split):
            assert self.blend is None, "blend and blend_per_split are incompatible"
            assert self.split is None, "split and blend_per_split are incompatible"
            assert len(per_split) == len(Split), f"blend_per_split must contain {len(Split)} blends"
            for split in Split:
                blend = per_split[split.value]
                if blend is None:
                    log_single_rank(
                        logger, logging.INFO, f"blend not provided for {split.name} split"
                    )
                    continue
                weights = blend[1]
                assert weights is None or len(blend[0]) == len(
                    weights
                ), "blend per split prefixes and weights must be equal in number"
            return

        if self.blend is None:
            # Neither blend option was given: fall back to mock data with an even split
            self.mock = True
            log_single_rank(
                logger,
                logging.INFO,
                "Let mock = True, as both blend and blend_per_split are None",
            )
            self.split = "1,1,1"
            log_single_rank(
                logger,
                logging.INFO,
                f"Let split = {self.split}, an arbitrarily even split, as mock is True",
            )
        else:
            prefixes, weights = self.blend[0], self.blend[1]
            assert weights is None or len(prefixes) == len(
                weights
            ), "blend prefixes and weights must be equal in number"
            assert self.split is not None, "split must be provided when blend is not None"

        self.split_matrix = convert_split_vector_to_split_matrix(
            parse_and_normalize_split(self.split)
        )
        log_single_rank(logger, logging.INFO, f"Let split_matrix = {self.split_matrix}")
def parse_and_normalize_split(split: str) -> List[float]:
    """Turn a split string into normalized per-split ratios

    Args:
        split (str): The train valid test split string e.g. "99,1,0"

    Returns:
        List[float]: The train valid test split ratios e.g. [0.99, 0.01, 0.0]
    """
    ratios = [float(token) for token in re.findall(r"[.0-9]+", split)]

    # Splits that the string leaves out get a zero share
    ratios.extend(0.0 for _ in range(len(Split) - len(ratios)))

    assert len(ratios) == len(Split)
    assert all(ratio >= 0.0 for ratio in ratios)

    return normalize(ratios)
def convert_split_vector_to_split_matrix(
    vector_a: List[float], vector_b: Optional[List[float]] = None
) -> List[Optional[Tuple[float, float]]]:
    """Convert one or two split vectors into per-split (begin, end) intervals

    Each vector is expanded into consecutive intervals by accumulating its entries from 0.
    The result for a split is the overlap between its interval in vector_a and its interval
    in vector_b, or None when the two do not overlap (or the overlap is empty).

    Ex. a standard conversion:

    [0.99, 0.01, 0.0] -> [(0, 0.99), (0.99, 1.0), None]

    Ex. a conversion for Retro when Retro pretraining uses a [0.99, 0.01, 0.0] split and Retro
    preprocessing used a [0.98, 0.02, 0.0] split:

    [0.99, 0.01, 0.0], [0.98, 0.02, 0.0] -> [(0, 0.98), (0.99, 1.0), None]

    Args:
        vector_a (List[float]): The primary split vector

        vector_b (Optional[List[float]]): An optional secondary split vector which constrains the primary split vector. Defaults to None, in which case vector_a is used for both.

    Returns:
        List[Optional[Tuple[float, float]]]: The split matrix consisting of book-ends of each split in order
    """
    if vector_b is None:
        vector_b = vector_a

    def _bookends(vector):
        # [.900, .090, .010] -> [0, .900, .990, 1.00]
        edges = [0]
        for width in vector:
            edges.append(edges[-1] + width)
        # [0, .900, .990, 1.00] -> [(0, .900), (.900, .990), (.990, 1.00)]
        return list(zip(edges[:-1], edges[1:]))

    intervals_a = _bookends(vector_a)
    intervals_b = _bookends(vector_b)

    # Gather the per-split overlap or None
    matrix = []
    for (begin_a, end_a), (begin_b, end_b) in zip(intervals_a, intervals_b):
        begin = max(begin_a, begin_b)
        end = min(end_a, end_b)
        matrix.append(None if end <= begin else (begin, end))

    return matrix
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
import logging
import os
import time
from dataclasses import dataclass
from typing import Dict, Optional, Tuple
import numpy
import torch
from megatron.core.datasets.blended_megatron_dataset_config import BlendedMegatronDatasetConfig
from megatron.core.datasets.indexed_dataset import IndexedDataset
from megatron.core.datasets.megatron_dataset import MegatronDataset
from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer
from megatron.core.datasets.utils import Split
from megatron.core.utils import log_single_rank
logger = logging.getLogger(__name__)
_PAD_TOKEN_ID = -1
@dataclass
class GPTDatasetConfig(BlendedMegatronDatasetConfig):
    """Configuration object for Megatron Core GPT datasets

    The three reset/mask options default to None only so that ``__post_init__`` can detect
    that they were never set; they must be given explicitly.
    """

    reset_position_ids: Optional[bool] = None
    """Option to reset the position IDs in the dataset at an interval"""

    reset_attention_mask: Optional[bool] = None
    """Option to reset the attention mask from the dataset"""

    eod_mask_loss: Optional[bool] = None
    """Option to enable the EOD mask loss"""

    create_attention_mask: bool = True
    """Option to enable the attention masks generation. Can be disabled if attention kernel
       generates masks by itself.
    """

    drop_last_partial_validation_sequence: bool = True
    """Option to drop the last partial validation sequence"""

    add_extra_token_to_sequence: bool = True
    """Option to draw sequences with one extra token to ensure the sample input tokens and sample
       output tokens are both of the desired sequence length
    """

    def __post_init__(self) -> None:
        """Do asserts and set fields post init

        Raises:
            AssertionError: If the tokenizer or one of the required options was left as None
        """
        super().__post_init__()

        assert self.tokenizer is not None, "tokenizer must be provided"

        assert self.reset_position_ids is not None, "reset_position_ids must be provided"
        assert self.reset_attention_mask is not None, "reset_attention_mask must be provided"
        assert self.eod_mask_loss is not None, "eod_mask_loss must be provided"
class GPTDataset(MegatronDataset):
    """The base GPT dataset

    Args:
        indexed_dataset (IndexedDataset): The IndexedDataset around which to build the GPTDataset

        dataset_path (Optional[str]): The real path on disk to the dataset, for bookkeeping

        indexed_indices (numpy.ndarray): The set of the documents indices to expose

        num_samples (Optional[int]): The number of samples to draw from the indexed dataset. When None, build as many samples as correspond to one epoch.

        index_split (Split): The indexed_indices Split

        config (GPTDatasetConfig): The config
    """

    def __init__(
        self,
        indexed_dataset: IndexedDataset,
        dataset_path: Optional[str],
        indexed_indices: numpy.ndarray,
        num_samples: Optional[int],
        index_split: Split,
        config: GPTDatasetConfig,
    ) -> None:
        super().__init__(
            indexed_dataset, dataset_path, indexed_indices, num_samples, index_split, config
        )

        # The masks and position ids only depend on the token content when one of these options
        # is enabled; otherwise the first computed set can be reused for every sample
        self.masks_and_position_ids_are_cacheable = not any(
            [
                self.config.reset_position_ids,
                self.config.reset_attention_mask,
                self.config.eod_mask_loss,
            ]
        )
        self.masks_and_position_ids_are_cached = False
        self.cached_attention_mask = None
        self.cached_loss_mask = None
        self.cached_position_ids = None

        try:
            self._pad_token_id = self.config.tokenizer.pad
        except Exception:
            # Best effort: tokenizers that cannot supply a pad id get the module sentinel.
            # `Exception` (rather than a bare except) keeps KeyboardInterrupt and SystemExit
            # propagating.
            self._pad_token_id = _PAD_TOKEN_ID

        (
            self.document_index,
            self.sample_index,
            self.shuffle_index,
        ) = self._build_document_sample_shuffle_indices()

    @staticmethod
    def numel_low_level_dataset(low_level_dataset: IndexedDataset) -> int:
        """Abstract method implementation

        For GPT, the underlying IndexedDataset should be split by sequence, as opposed to, say,
        BERT, which should be split by document

        Args:
            low_level_dataset (IndexedDataset): The underlying IndexedDataset

        Returns:
            int: The number of unique elements in the underlying IndexedDataset
        """
        return low_level_dataset.sequence_lengths.shape[0]

    @staticmethod
    def build_low_level_dataset(dataset_path: str, config: GPTDatasetConfig) -> IndexedDataset:
        """Abstract method implementation

        Args:
            dataset_path (str): The real path prefix to the IndexedDataset .bin and .idx files

            config (GPTDatasetConfig): The config

        Returns:
            IndexedDataset: The underlying IndexedDataset
        """
        return IndexedDataset(dataset_path, multimodal=False, mmap=config.mmap_bin_files)

    def __len__(self) -> int:
        """Abstract method implementation

        Sample i is delimited by rows i and i + 1 of the sample index, hence the minus one.

        Returns:
            int: The length of the dataset
        """
        return self.sample_index.shape[0] - 1

    def __getitem__(self, idx: Optional[int]) -> Dict[str, torch.Tensor]:
        """Abstract method implementation

        Args:
            idx (Optional[int]): The index into the dataset. None requests a batch padding
                sample: sample 0 is returned with an all-zero loss mask.

        Returns:
            Dict[str, torch.Tensor]: The sample information wrapped in a dictionary. The
                "attention_mask" key is only present when config.create_attention_mask is set.
        """
        # For a batch padding sequence the index does not matter, so sample 0 is used
        text, _ = self._query_document_sample_shuffle_indices(0 if idx is None else idx)

        text = torch.from_numpy(text).long()
        if self.config.add_extra_token_to_sequence:
            tokens = text[:-1].contiguous()
            labels = text[1:].contiguous()
        else:
            tokens = text
            labels = torch.roll(text, shifts=-1, dims=0)
            # The rolled-in first token is not a valid label for the last position
            labels[-1] = self._pad_token_id

        if self.masks_and_position_ids_are_cacheable and self.masks_and_position_ids_are_cached:
            attention_mask = self.cached_attention_mask
            # Clone: the loss mask is edited in place below and those per-sample edits must
            # not accumulate in the cached copy which later samples reuse
            loss_mask = self.cached_loss_mask.clone()
            position_ids = self.cached_position_ids
        else:
            attention_mask, loss_mask, position_ids = _get_ltor_masks_and_position_ids(
                tokens,
                self.config.tokenizer.eod,
                self.config.reset_position_ids,
                self.config.reset_attention_mask,
                self.config.eod_mask_loss,
                self.config.create_attention_mask,
            )
            if self.masks_and_position_ids_are_cacheable:
                self.cached_attention_mask = attention_mask
                # Cache a pristine copy for the same reason as above
                self.cached_loss_mask = loss_mask.clone()
                self.cached_position_ids = position_ids
                self.masks_and_position_ids_are_cached = True

        # For padded sequences, mask the loss
        loss_mask[labels == self._pad_token_id] = 0.0

        # For padded sequences, ensure the embedding layer can map the token ID
        tokens[tokens == self._pad_token_id] = 0
        labels[labels == self._pad_token_id] = 0

        # Batch padding sequence so we mask the loss
        if idx is None:
            loss_mask = torch.zeros_like(loss_mask)

        sample = {
            "tokens": tokens,
            "labels": labels,
            "loss_mask": loss_mask,
            "position_ids": position_ids,
        }
        if self.config.create_attention_mask:
            sample["attention_mask"] = attention_mask
        return sample

    def _query_document_sample_shuffle_indices(
        self, idx: int
    ) -> Tuple[numpy.ndarray, numpy.ndarray]:
        """Get the text (token ids) and document ids for a given index

        The result is padded with the pad token id when fewer tokens than
        config.sequence_length (plus the optional extra token) were read.

        Args:
            idx (int): The index into the dataset

        Returns:
            Tuple[numpy.ndarray, numpy.ndarray]: The text ids and document ids
        """
        # Do the shuffle mapping
        idx = self.shuffle_index[idx]

        # Get the beginning and end documents and offsets
        doc_index_beg, doc_index_beg_offset = self.sample_index[idx]
        doc_index_end, doc_index_end_offset = self.sample_index[idx + 1]

        document_ids = []
        sample_parts = []

        # Sample spans a single document
        if doc_index_beg == doc_index_end:
            # Add the document id
            document_ids.append(self.document_index[doc_index_beg])

            # Add the entire sample
            sample_parts.append(
                self.dataset.get(
                    self.document_index[doc_index_beg],
                    offset=doc_index_beg_offset,
                    length=doc_index_end_offset
                    - doc_index_beg_offset
                    + self.config.add_extra_token_to_sequence,
                )
            )

        # Sample spans multiple documents
        else:
            for i in range(doc_index_beg, doc_index_end + 1):
                # Add the document id
                document_ids.append(self.document_index[i])

                # Add the sample part: only the first document starts at an offset and only
                # the last one is cut short (length None is passed for all the others)
                offset = 0 if i > doc_index_beg else doc_index_beg_offset
                length = (
                    None
                    if i < doc_index_end
                    else doc_index_end_offset + self.config.add_extra_token_to_sequence
                )
                sample_parts.append(
                    self.dataset.get(self.document_index[i], offset=offset, length=length)
                )
        assert len(document_ids) == len(
            sample_parts
        ), f"len(document_ids) ({len(document_ids)}) != len(sample_parts) ({len(sample_parts)})"

        length = sum(map(len, sample_parts))

        # Pad the sample if necessary
        if length < (self.config.sequence_length + self.config.add_extra_token_to_sequence):
            sample_parts.append(
                [self._pad_token_id]
                * (self.config.sequence_length + self.config.add_extra_token_to_sequence - length)
            )

        return (
            numpy.concatenate(sample_parts, dtype=numpy.int64),
            numpy.array(document_ids, dtype=numpy.int64),
        )

    def _build_document_sample_shuffle_indices(
        self,
    ) -> Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]:
        """Build the document index, the sample index, and the shuffle index

        The document index:
            -- 1-D
            -- An ordered array of document ids

        The sample index:
            -- 2-D
            -- The document indices and offsets which mark the start of every sample

        The shuffle index:
            -- 1-D
            -- A random permutation of index range of the sample index

        The indices are built (and saved, when a cache path is available) if there is no cache
        path, or if the cache misses and this is either a non-distributed run or rank 0.
        In every other case they are loaded from the cache files.

        Returns:
            Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]: The document index, the sample index, and the shuffle index
        """
        path_to_cache = self.config.path_to_cache
        if path_to_cache is None and not self.config.mock:
            path_to_cache = os.path.join(
                self.dataset.path_prefix, "cache", f"{type(self).__name__}_indices"
            )

        if path_to_cache:
            get_path_to = lambda suffix: os.path.join(
                path_to_cache,
                f"{self.unique_description_hash}-{type(self).__name__}-{self.index_split.name}-{suffix}",
            )
            path_to_description = get_path_to("description.txt")
            path_to_document_index = get_path_to("document_index.npy")
            path_to_sample_index = get_path_to("sample_index.npy")
            path_to_shuffle_index = get_path_to("shuffle_index.npy")
            # All four files must exist for the cache to count as a hit
            cache_hit = all(
                map(
                    os.path.isfile,
                    [
                        path_to_description,
                        path_to_document_index,
                        path_to_sample_index,
                        path_to_shuffle_index,
                    ],
                )
            )
        else:
            cache_hit = False

        if not path_to_cache or (
            not cache_hit
            and (not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0)
        ):

            log_single_rank(
                logger,
                logging.INFO,
                f"Build and save the {type(self).__name__} {self.index_split.name} indices",
            )
            t_beg = time.time()

            sequence_length = self.config.sequence_length
            num_tokens_per_epoch = self._get_num_tokens_per_epoch()
            num_epochs = self._get_num_epochs(num_tokens_per_epoch)

            if num_epochs == 1:
                separate_final_epoch = False
            else:
                # Get the number of samples for the last epoch
                num_samples_sans_final_epoch = (
                    (num_epochs - 1) * num_tokens_per_epoch
                    - self.config.add_extra_token_to_sequence
                ) // sequence_length
                num_samples_from_final_epoch = self.num_samples - num_samples_sans_final_epoch
                num_samples_per_epoch = (
                    num_tokens_per_epoch - self.config.add_extra_token_to_sequence
                ) // sequence_length

                # num_samples_from_final_epoch should be non-negative
                assert num_samples_from_final_epoch >= 0

                # num_samples_from_final_epoch should not exceed max value
                assert num_samples_from_final_epoch <= num_samples_per_epoch + 1

                # Separate the final epoch if it falls below the threshold
                threshold = 0.80
                separate_final_epoch = num_samples_from_final_epoch < int(
                    threshold * num_samples_per_epoch
                )

                log_single_rank(
                    logger,
                    logging.DEBUG,
                    f"> num_samples_from_final_epoch: {num_samples_from_final_epoch}",
                )
                log_single_rank(logger, logging.DEBUG, f"> threshold: {threshold}")
                log_single_rank(
                    logger, logging.DEBUG, f"> num_samples_per_epoch: {num_samples_per_epoch}"
                )

            log_single_rank(
                logger, logging.DEBUG, f"> separate_final_epoch: {separate_final_epoch}"
            )

            numpy_random_state = numpy.random.RandomState(self.config.random_seed)

            # Build the document index
            document_index = _build_document_index(
                self.indices, num_epochs, numpy_random_state, separate_final_epoch
            )

            # Only the validation split may keep its last partial sequence
            if self.index_split == Split.valid:
                drop_last_partial_sequence = self.config.drop_last_partial_validation_sequence
            else:
                drop_last_partial_sequence = True

            # Build the sample index
            from megatron.core.datasets import helpers

            assert document_index.dtype == numpy.int32
            assert self.dataset.sequence_lengths.dtype == numpy.int32
            if len(document_index) * 2 > len(self.dataset.sequence_lengths):
                # Heuristic: if "access density" of sequence_lengths is relatively high,
                # force loading the mmap-ed array into memory by taking a copy.
                # System performance benefits come from two aspects:
                # 1. **sequentially** pre-loading the whole file if we're gonna read a large fraction anyways.
                # 2. GIL is held when calling into c++ code; making the c++ func faster improves parallelism.
                sequence_lengths_for_cpp = self.dataset.sequence_lengths.copy()
            else:
                sequence_lengths_for_cpp = self.dataset.sequence_lengths
            sample_index = helpers.build_sample_idx(
                sequence_lengths_for_cpp,
                document_index,
                sequence_length,
                num_epochs,
                num_tokens_per_epoch,
                drop_last_partial_sequence,
                self.config.add_extra_token_to_sequence,
            )

            # Build the shuffle index. When the final epoch is separated, its samples are
            # shuffled among themselves only.
            if separate_final_epoch:
                shuffle_index = _build_shuffle_index(
                    num_samples_sans_final_epoch, sample_index.shape[0] - 1, numpy_random_state
                )
            else:
                shuffle_index = _build_shuffle_index(
                    sample_index.shape[0] - 1, sample_index.shape[0] - 1, numpy_random_state
                )

            if path_to_cache:
                os.makedirs(path_to_cache, exist_ok=True)
                # Write the description
                with open(path_to_description, "wt") as writer:
                    writer.write(self.unique_description)
                numpy.save(path_to_document_index, document_index, allow_pickle=True)
                numpy.save(path_to_sample_index, sample_index, allow_pickle=True)
                numpy.save(path_to_shuffle_index, shuffle_index, allow_pickle=True)
            else:
                log_single_rank(
                    logger,
                    logging.WARNING,
                    f"Unable to save the {type(self).__name__} indexes because path_to_cache is None",
                )

            t_end = time.time()
            # NOTE(review): ":4f" is a minimum field width, not a precision; ".4f" may have
            # been intended here and in the load path below
            log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")

            log_single_rank(
                logger, logging.INFO, f"> total number of samples: {sample_index.shape[0] - 1}"
            )
            log_single_rank(logger, logging.INFO, f"> total number of epochs: {num_epochs}")

            return document_index, sample_index, shuffle_index

        log_single_rank(
            logger, logging.INFO, f"Load the {type(self).__name__} {self.index_split.name} indices"
        )

        # NOTE(review): the loads below pass allow_pickle=True, which lets numpy unpickle
        # object arrays from the cache files; confirm the cache directory is trusted or
        # whether the flag is needed at all for these integer arrays

        log_single_rank(
            logger,
            logging.INFO,
            f"\tLoad the document index from {os.path.basename(path_to_document_index)}",
        )
        t_beg = time.time()
        document_index = numpy.load(path_to_document_index, allow_pickle=True, mmap_mode='r')
        t_end = time.time()
        log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")

        log_single_rank(
            logger,
            logging.INFO,
            f"\tLoad the sample index from {os.path.basename(path_to_sample_index)}",
        )
        t_beg = time.time()
        sample_index = numpy.load(path_to_sample_index, allow_pickle=True, mmap_mode='r')
        t_end = time.time()
        log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")

        log_single_rank(
            logger,
            logging.INFO,
            f"\tLoad the shuffle index from {os.path.basename(path_to_shuffle_index)}",
        )
        t_beg = time.time()
        shuffle_index = numpy.load(path_to_shuffle_index, allow_pickle=True, mmap_mode='r')
        t_end = time.time()
        log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")

        log_single_rank(
            logger, logging.INFO, f"> total number of samples: {sample_index.shape[0] - 1}"
        )

        return document_index, sample_index, shuffle_index

    def _get_num_tokens_per_epoch(self) -> int:
        """Calculate the number of tokens in a single epoch

        Returns:
            int: The number of tokens in a single epoch
        """
        return int(numpy.sum(self.dataset.sequence_lengths[self.indices]))

    def _get_num_epochs(self, num_tokens_per_epoch: int) -> int:
        """Calculate the number of epochs

        Args:
            num_tokens_per_epoch (int): The number of tokens in a single epoch

        Returns:
            int: The number of epochs. Always 1 when self.num_samples is None, otherwise the
                smallest count whose tokens cover the requested samples.
        """
        num_epochs = 1
        num_tokens = num_tokens_per_epoch
        if self.num_samples is None:
            return num_epochs
        else:
            num_tokens_requested = (
                self.num_samples * self.config.sequence_length
            ) + self.config.add_extra_token_to_sequence
            while num_tokens < num_tokens_requested:
                num_epochs += 1
                num_tokens += num_tokens_per_epoch
        return num_epochs
def _build_document_index(
    documents: numpy.ndarray,
    num_epochs: int,
    numpy_random_state: numpy.random.RandomState,
    separate_final_epoch: bool,
) -> numpy.ndarray:
    """Build a shuffled int32 array with length = num epochs * num documents

    Args:
        documents (numpy.ndarray): the subset of exposed document indices

        num_epochs (int): The number of epochs

        numpy_random_state (numpy.random.RandomState): The NumPy random state

        separate_final_epoch (bool): Whether to exclude the last epoch from the global shuffle

    Returns:
        numpy.ndarray: The document index
    """
    if separate_final_epoch and num_epochs != 1:
        # Shuffle the first num_epochs - 1 epochs together and the final epoch on its own
        leading_epochs = _build_document_index(
            documents, num_epochs - 1, numpy_random_state, False
        )
        final_epoch = _build_document_index(documents, 1, numpy_random_state, False)
        return numpy.concatenate((leading_epochs, final_epoch))

    # One row per epoch, every row overwritten with the document ids
    per_epoch = numpy.mgrid[0:num_epochs, 0 : len(documents)][1]
    per_epoch[:] = documents

    flattened = per_epoch.reshape(-1).astype(numpy.int32)
    numpy_random_state.shuffle(flattened)
    return flattened
def _build_shuffle_index(
    num_samples: int, total_size: int, numpy_random_state: numpy.random.RandomState
) -> numpy.ndarray:
    """Build the range [0, total_size) and shuffle it in up to two independent parts

    Args:
        num_samples (int): The size of the first shuffle range [0, num_samples)

        total_size (int): The size of the entire index. If larger than 'num_samples', it defines the second shuffle range [num_samples, total_size)

        numpy_random_state (numpy.random.RandomState): The NumPy random state

    Returns:
        numpy.ndarray: The shuffle index
    """
    # uint32 is used unless the index is too large for it
    too_large_for_uint32 = total_size >= (numpy.iinfo(numpy.uint32).max - 1)
    index_dtype = numpy.int64 if too_large_for_uint32 else numpy.uint32

    def _shuffled_range(begin: int, end: int) -> numpy.ndarray:
        values = numpy.arange(start=begin, stop=end, step=1, dtype=index_dtype)
        numpy_random_state.shuffle(values)
        return values

    head = _shuffled_range(0, num_samples)
    if num_samples == total_size:
        return head

    tail = _shuffled_range(num_samples, total_size)
    return numpy.concatenate((head, tail))
def _get_ltor_masks_and_position_ids(
    data: torch.Tensor,
    eod_token: int,
    reset_position_ids: bool,
    reset_attention_mask: bool,
    eod_mask_loss: bool,
    create_attention_mask: bool,
):
    """Build masks and position id for left to right model.

    Args:
        data (torch.Tensor): The data tensor that holds the tokens from the dataset

        eod_token (int): ID of the token that is considered the EOD

        reset_position_ids (bool): Switch to restart the position IDs at 0 after every EOD token

        reset_attention_mask (bool): Switch to stop tokens after an EOD token from attending to the tokens up to and including it

        eod_mask_loss (bool): Switch to zero the loss mask at EOD tokens

        create_attention_mask (bool): Switch to enable the attention masks generation. Can be disabled if attention kernel generates masks by itself.

    Returns:
        torch.Tensor: Attention mask needed to be used for Attention, boolean with True marking the masked-out positions, or None when create_attention_mask is False

        torch.Tensor: The mask used for loss value during training

        torch.Tensor: The position ID's of the token
    """
    seq_length = data.numel()
    is_eod = data == eod_token

    # Lower-triangular ones: every token may see itself and the tokens before it
    attention_mask = None
    if create_attention_mask:
        all_ones = torch.ones((seq_length, seq_length), device=data.device)
        attention_mask = torch.tril(all_ones).unsqueeze(0)

    # Loss mask.
    loss_mask = torch.ones(seq_length, dtype=torch.float, device=data.device)
    if eod_mask_loss:
        loss_mask[is_eod] = 0.0

    # Position ids.
    position_ids = torch.arange(seq_length, dtype=torch.long, device=data.device)

    if reset_position_ids or reset_attention_mask:
        block_attention = reset_attention_mask and attention_mask is not None
        document_start = 0
        # The EOD positions are read out before position_ids is modified below
        for eod_position in position_ids[is_eod].tolist():
            next_start = eod_position + 1
            # Mask attention across the document boundary.
            if block_attention:
                attention_mask[0, next_start:, :next_start] = 0
            # Reset positions.
            if reset_position_ids:
                position_ids[next_start:] -= next_start - document_start
                document_start = next_start

    if attention_mask is not None:
        # Convert attention mask to binary:
        attention_mask = attention_mask < 0.5

    return attention_mask, loss_mask, position_ids
class MockGPTLowLevelDataset:
    """A synthetic low level dataset with randomly drawn sequence lengths

    Sequence idx consists of the tokens 1, 2, ..., length - 1 followed by the tokenizer's EOD
    token, where length is sequence_lengths[idx].

    Args:
        tokenizer (MegatronTokenizer): The tokenizer whose `eod` attribute terminates every sequence
    """

    # Seed for the generator which draws the sequence lengths
    seed: int = 0

    # Number of sequences in the dataset
    size: int = 100000

    # Exclusive upper bound on the drawn sequence lengths
    max_sequence_length: int = 4096

    def __init__(self, tokenizer: MegatronTokenizer) -> None:
        self.tokenizer = tokenizer
        rng = numpy.random.default_rng(seed=self.seed)
        self.sequence_lengths = rng.integers(
            low=1, high=self.max_sequence_length, size=self.size, dtype=numpy.int32
        )

    def __len__(self) -> int:
        """Return the number of sequences

        Returns:
            int: The number of sequences
        """
        return self.size

    def __getitem__(self, idx: int) -> numpy.ndarray:
        """Generate the sequence at the given index

        Args:
            idx (int): The index of the sequence

        Returns:
            numpy.ndarray: The int64 token ids 1 .. length - 1 followed by the EOD token
        """
        length = self.sequence_lengths[idx]
        sample = numpy.concatenate([numpy.arange(length - 1) + 1, [self.tokenizer.eod]])
        return sample.astype(numpy.int64)

    def get(self, idx: int, offset: int = 0, length: Optional[int] = None) -> numpy.ndarray:
        """Return a slice of the sequence at the given index

        Args:
            idx (int): The index of the sequence

            offset (int): The position of the first token to return. Defaults to 0.

            length (Optional[int]): The number of tokens to return. Defaults to None, meaning
                everything from the offset to the end of the sequence.

        Returns:
            numpy.ndarray: The requested tokens
        """
        if length is None:
            length = self.sequence_lengths[idx] - offset
        return self[idx][offset : offset + length]
class MockGPTDataset(GPTDataset):
    """A GPTDataset built around a MockGPTLowLevelDataset

    Args:
        dataset (MockGPTLowLevelDataset): The MockGPTLowLevelDataset around which to build the MockGPTDataset

        dataset_path (Optional[str]): This argument is of no consequence for the MockGPTDataset

        indices (numpy.ndarray): The set of the dataset indices to expose

        num_samples (int): The number of samples to draw from the dataset

        index_split (Split): The indices Split

        config (GPTDatasetConfig): The config, which must have its mock field set
    """

    def __init__(
        self,
        dataset: MockGPTLowLevelDataset,
        dataset_path: Optional[str],
        indices: numpy.ndarray,
        num_samples: int,
        index_split: Split,
        config: GPTDatasetConfig,
    ) -> None:
        # This class is only meant to be used for mock configurations
        assert config.mock

        super().__init__(dataset, dataset_path, indices, num_samples, index_split, config)

    @staticmethod
    def numel_low_level_dataset(low_level_dataset: MockGPTLowLevelDataset) -> int:
        """Abstract method implementation

        Args:
            low_level_dataset (MockGPTLowLevelDataset): The underlying MockGPTLowLevelDataset

        Returns:
            int: The number of unique elements in the underlying MockGPTLowLevelDataset
        """
        num_elements = len(low_level_dataset)
        return num_elements

    @staticmethod
    def build_low_level_dataset(
        dataset_path: Optional[str], config: GPTDatasetConfig
    ) -> MockGPTLowLevelDataset:
        """Abstract method implementation

        Args:
            dataset_path (Optional[str]): This argument is of no consequence for the MockGPTLowLevelDataset

            config (GPTDatasetConfig): The config, whose tokenizer is handed to the MockGPTLowLevelDataset

        Returns:
            MockGPTLowLevelDataset: The underlying MockGPTLowLevelDataset
        """
        tokenizer = config.tokenizer
        return MockGPTLowLevelDataset(tokenizer)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment