Commit 2c63b5cd authored by wangxj

Upgrade to version 0.12

parent c271aaae
Pipeline #2451 passed
File mode changed from 100755 to 100644 (multiple files)
@@ -29,7 +29,8 @@ class BlendedDataset(torch.utils.data.Dataset):
weights (List[Union[int, float]]): The weights that determine the dataset blend ratios
size (Optional[int]): The number of samples to draw from the blend. If None, for each dataset index idx draw exactly weights[idx] samples from datasets[idx].
size (Optional[int]): The number of samples to draw from the blend. If None, for each
dataset index idx draw exactly weights[idx] samples from datasets[idx].
config (BlendedMegatronDatasetConfig): The config
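For intuition, here is a minimal sketch (plain Python, illustrative values, not part of this commit) of the two `size` modes the docstring above describes:

```python
# Toy stand-ins for the mid-level datasets in a blend.
datasets = [["a0", "a1"], ["b0", "b1", "b2"]]

# Mode 1: size=None with integer weights. Draw exactly weights[idx] samples
# from datasets[idx], so this blend holds 2 + 3 = 5 samples in total.
weights_none_size = ([2, 3], None)

# Mode 2: an explicit size with fractional weights. Draw roughly
# weight * size samples per dataset, e.g. 30 and 70 here.
weights_with_size = ([0.3, 0.7], 100)
```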
@@ -74,7 +75,6 @@ class BlendedDataset(torch.utils.data.Dataset):
unique_identifiers["split"] = self.split.name
unique_identifiers["weights"] = self.weights
unique_identifiers["size"] = self.size
unique_identifiers["renormalize_blend_weights"] = self.config.renormalize_blend_weights
self.unique_description = json.dumps(
unique_identifiers, indent=4, default=lambda obj: obj.unique_identifiers
@@ -168,7 +168,7 @@ class BlendedDataset(torch.utils.data.Dataset):
log_single_rank(
logger,
logging.WARNING,
f"Unable to save the {type(self).__name__} indexes because path_to_cache is None",
f"Cannot save the {type(self).__name__} indexes because path_to_cache is None",
)
t_end = time.time()
@@ -34,7 +34,9 @@ class BlendedMegatronDatasetBuilder(object):
sizes (List[Optional[int]]): The minimum total number of samples to draw, or None, per split
is_built_on_rank (Callable): A callable which returns True if the dataset should be built on the current rank and False otherwise. It should be Megatron Core parallelism aware, i.e., global rank, local group rank, and virtual rank may inform its return value.
is_built_on_rank (Callable): A callable which returns True if the dataset should be built on
the current rank and False otherwise. It should be Megatron Core parallelism aware, i.e.,
global rank, local group rank, and virtual rank may inform its return value.
config (BlendedMegatronDatasetConfig): The config object which informs dataset creation
"""
@@ -54,7 +56,7 @@ class BlendedMegatronDatasetBuilder(object):
log_single_rank(
logger,
logging.INFO,
f"Building dataset splits with cls={cls.__name__}, sizes={self.sizes}, and config={self.config}",
f"Building {cls.__name__} splits with sizes={self.sizes} and config={self.config}",
)
if not self.config.mock:
@@ -96,7 +98,8 @@ class BlendedMegatronDatasetBuilder(object):
(2) The split has one contributing dataset, and...
(a) 'size' is not None
- Build a mid-level dataset with low-level dataset sampling in proportion to the size
- Build a mid-level dataset with low-level dataset sampling in proportion to the
size
(b) 'size' is None
- Build mid-level datasets with no excess low-level dataset sampling
@@ -104,24 +107,27 @@ class BlendedMegatronDatasetBuilder(object):
(3) The split has multiple contributing datasets, and...
(a) 'weights' is not None and 'size' is not None
- Build mid-level datasets with low-level dataset sampling in proportion to their weights and the size
- Build a top-level dataset of length marginally greater than 'size' with mid-level dataset sampling in proportion to their weights and the size
- Build mid-level datasets with low-level dataset sampling in proportion to their
weights and the size
- Build a top-level dataset of length marginally greater than 'size' with mid-level
dataset sampling in proportion to their weights and the size
(b) 'weights' is not None and 'size' is None
- Error
(c) 'weights' is None and 'size' is not None
- Build mid-level datasets with no excess low-level dataset sampling
- Build a top-level dataset of length 'size' with mid-level dataset sampling in proportion to their lengths and the size
- The 'size' of the top-level dataset is capped at the sum of the mid-level dataset lengths
- Build a top-level dataset of length 'size' (capped at the sum of the mid-level
dataset lengths) with mid-level dataset sampling in proportion to their lengths
and the size
(d) 'weights' is None and 'size' is None
- Build mid-level datasets with no excess low-level dataset sampling
- Build a top-level dataset with no excess mid-level dataset sampling
Returns:
List[Optional[TopLevelDataset]]: A list containing a dataset instance (or None) per split
List[Optional[TopLevelDataset]]: A list containing a dataset instance (or None) per
split
"""
datasets = self._build_blended_dataset_splits()
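To make the case analysis above concrete, a small worked sketch (plain Python, illustrative numbers) of the arithmetic behind cases (3)(a) and (3)(c):

```python
import math

# Case (3)(a): client weights and size give per-dataset sample targets for
# the mid-level datasets, summing to roughly the requested size.
weights, size = [0.3, 0.7], 1000
targets = [math.ceil(size * w) for w in weights]  # [300, 700]

# Case (3)(c): no weights, client size. Blend in proportion to dataset
# lengths and cap the top-level size at the sum of those lengths.
lengths = [120, 480]
size_capped = min(size, sum(lengths))  # 600
```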
@@ -134,24 +140,35 @@ class BlendedMegatronDatasetBuilder(object):
log_single_rank(
logger,
logging.INFO,
f"Verifying NumPy indices for {type(dataset).__name__} {dataset.split.name} split",
(
f"Verifying NumPy indices for {type(dataset).__name__} "
f"{dataset.split.name} split"
),
)
else:
log_single_rank(
logger,
logging.INFO,
f"NumPy indices for {type(dataset).__name__} {dataset.split.name} split are fully cached, skipping verification",
(
f"NumPy indices for {type(dataset).__name__} {dataset.split.name} "
f"split are fully cached, skipping verification"
),
)
continue
# Check blend size
assert dataset.size is None or dataset.size == dataset.dataset_index.shape[0]
# Check blend access of mid-level datasets
_, sizes = numpy.unique(dataset.dataset_index, return_counts=True)
for i, dataset_and_size in enumerate(zip(dataset.datasets, sizes)):
if len(dataset_and_size[0]) < dataset_and_size[1]:
dataset_indices, dataset_sizes = numpy.unique(
dataset.dataset_index, return_counts=True
)
for i, (index, size) in enumerate(zip(dataset_indices, dataset_sizes)):
if len(dataset.datasets[index]) < size:
raise IndexError(
f"The {dataset.split.name} blend oversamples (N = {dataset_and_size[1]}) {type(dataset_and_size[0]).__name__} {i} (len = {len(dataset_and_size[0])}). "
f"Set renormalize_blend_weights to True and re-run. File an issue if the problem is not resolved."
f"The {dataset.split.name} blend oversamples the contributing "
f"datasets and, e.g., requests {size} samples from "
f"{type(dataset.datasets[index]).__name__} {i} with size "
f"{len(dataset.datasets[index])}. This is unexpected. "
f"Please file an issue."
)
return datasets
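The reworked oversampling check can be reproduced standalone. A toy sketch (not the builder's actual state) of why the `numpy.unique` counts expose an oversampled blend:

```python
import numpy

# dataset_index maps each blended sample to a mid-level dataset; the blend
# oversamples when a dataset is requested more times than it has samples.
dataset_index = numpy.array([0, 1, 1, 0, 1])
dataset_lengths = [2, 3]  # toy lengths of the mid-level datasets

indices, counts = numpy.unique(dataset_index, return_counts=True)
for index, count in zip(indices, counts):
    if dataset_lengths[index] < count:
        raise IndexError(f"blend requests {count} samples from dataset {index}")
```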
@@ -162,7 +179,8 @@ class BlendedMegatronDatasetBuilder(object):
See the BlendedMegatronDatasetBuilder.build alias for more information.
Returns:
List[Optional[TopLevelDataset]]: A list containing a dataset instance (or None) per split
List[Optional[TopLevelDataset]]: A list containing a dataset instance (or None) per
split
"""
##
# Return fake "mock" datasets
@@ -192,13 +210,19 @@ class BlendedMegatronDatasetBuilder(object):
# Build the mid-level datasets
if weights is None:
sizes_per_dataset = [[None for split in Split] for prefix in prefixes]
# Build only one "epoch"
sizes_per_dataset_buffer = [[None for split in Split] for prefix in prefixes]
else:
sizes_per_dataset = _get_size_per_split_per_dataset(weights, self.sizes)
# The number of samples we plan to use per dataset
sizes_per_dataset_target = _get_size_per_split_per_dataset(weights, self.sizes)
# The number of samples we plan to build per dataset
sizes_per_dataset_buffer = _get_size_per_split_per_dataset(
weights, self.sizes, margin=0.5
)
# build each dataset in parallel
# Build each dataset in parallel
megatron_datasets = self._build_megatron_datasets_parallel(
prefixes, split, sizes_per_dataset
prefixes, split, sizes_per_dataset_buffer
)
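A rough stand-in for the parallel build step (Megatron's actual implementation differs; `build_one` here is a toy function):

```python
from concurrent.futures import ThreadPoolExecutor
from typing import List

def build_one(prefix: str, sizes: List[int]) -> str:
    # Toy build: the real builder constructs a MidLevelDataset per prefix.
    return f"dataset({prefix}, sizes={sizes})"

prefixes = ["corpus-a", "corpus-b"]
sizes_per_dataset_buffer = [[302, 30, 3], [704, 71, 8]]

with ThreadPoolExecutor() as pool:
    megatron_datasets = list(pool.map(build_one, prefixes, sizes_per_dataset_buffer))
```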
# Build the top-level datasets
@@ -207,11 +231,11 @@ class BlendedMegatronDatasetBuilder(object):
if split[i] is not None:
weights_i = weights
if weights_i is not None and self.sizes[i] is not None:
size_per_dataset = list(zip(*sizes_per_dataset))[i]
# Blend according to client-specified weights and client-specified size
size_per_dataset = list(zip(*sizes_per_dataset_target))[i]
size_i = sum(size_per_dataset)
if self.config.renormalize_blend_weights:
weights_i = list(map(lambda _size: _size / size_i, size_per_dataset))
elif weights_i is None:
# Blend according to dataset sizes as-is and (maybe) client-specified size
try:
weights_i = [
len(megatron_dataset) for megatron_dataset in megatron_datasets[i]
@@ -221,9 +245,12 @@
if self.sizes[i] is not None:
size_i = min(self.sizes[i], sum(weights_i))
else:
size_i = None # => the size will be sum(weights_i)
# Build exhaustive indices
size_i = None
else:
raise RuntimeError
raise ValueError(
"Using client-specified weights requires client-specified size"
)
blended_datasets[i] = self.build_generic_dataset(
BlendedDataset,
self.is_built_on_rank,
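Condensing the branches above, a simplified sketch of how `weights_i` and `size_i` are resolved before the top-level `BlendedDataset` is built (simplification: in the real code the size for case (3)(a) is the sum of the per-dataset targets, not the raw request):

```python
from typing import List, Optional, Tuple

def resolve_blend(
    weights: Optional[List[float]], size: Optional[int], lengths: List[int]
) -> Tuple[List[float], Optional[int]]:
    """Sketch of the dispatch above; not the real builder method."""
    if weights is not None and size is not None:
        return weights, size                         # case (3)(a)
    if weights is None:
        if size is not None:
            return lengths, min(size, sum(lengths))  # case (3)(c): cap the size
        return lengths, None                         # case (3)(d): exhaustive
    raise ValueError("Using client-specified weights requires client-specified size")

print(resolve_blend(None, 1200, [250, 750]))  # ([250, 750], 1000)
```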
@@ -263,22 +290,31 @@ class BlendedMegatronDatasetBuilder(object):
# Build mid-level datasets
if weights is None:
sizes_per_dataset = [[None for split in Split] for prefix in prefixes]
sizes_per_dataset_buffer = [
[None for split in Split] for prefix in prefixes
]
else:
sizes_per_dataset = _get_size_per_split_per_dataset(weights, sizes_spoof)
# The number of samples we plan to use per dataset
sizes_per_dataset_target = _get_size_per_split_per_dataset(
weights, sizes_spoof
)
# The number of samples we plan to build per dataset
sizes_per_dataset_buffer = _get_size_per_split_per_dataset(
weights, sizes_spoof, margin=0.5
)
# build each dataset in parallel
# Build each dataset in parallel
megatron_datasets = self._build_megatron_datasets_parallel(
prefixes, split_spoof, sizes_per_dataset
prefixes, split_spoof, sizes_per_dataset_buffer
)[i]
# Build top-level dataset
if weights is not None and self.sizes[i] is not None:
size_per_dataset = list(zip(*sizes_per_dataset))[i]
# Blend according to client-specified weights and client-specified size
size_per_dataset = list(zip(*sizes_per_dataset_target))[i]
size = sum(size_per_dataset)
if self.config.renormalize_blend_weights:
weights = list(map(lambda _size: _size / size, size_per_dataset))
elif weights is None:
# Blend according to dataset sizes as-is and (maybe) client-specified size
try:
weights = [
len(megatron_dataset) for megatron_dataset in megatron_datasets
@@ -288,7 +324,8 @@
if self.sizes[i] is not None:
size = min(self.sizes[i], sum(weights))
else:
size = None # => the size will be sum(weights)
# Build exhaustive indices
size = None
else:
raise RuntimeError
blended_datasets[i] = self.build_generic_dataset(
@@ -395,13 +432,15 @@ class BlendedMegatronDatasetBuilder(object):
"""Build each MidLevelDataset split from a single LowLevelDataset
Args:
dataset_path (Optional[str]): The path on disk which defines the underlying LowLevelDataset, or None for mock dataset classes
dataset_path (Optional[str]): The path on disk which defines the underlying
LowLevelDataset, or None for mock dataset classes
split (List[Tuple[float, float]]): The dataset split matrix
sizes (List[int]): The number of total samples to draw from each split
synchronize_ranks (bool): Whether to call barrier for rank-0 / barrier / other-ranks behavior. Set to False when we enforce this behavior at a higher level.
synchronize_ranks (bool): Whether to call barrier for rank-0 / barrier / other-ranks
behavior. Set to False when we enforce this behavior at a higher level.
Returns:
List[Optional[MidLevelDataset]]: The MidLevelDataset (or None) per split
@@ -462,17 +501,22 @@ class BlendedMegatronDatasetBuilder(object):
and torch.distributed is initialized.
Args:
cls (Union[Type[DistributedDataset], Callable]): The DistributedDataset class to be built. In special cases, e.g. when we are building the low level dataset for a RawMegatronDataset instance, we can accept a Callable which returns an Iterable.
cls (Union[Type[DistributedDataset], Callable]): The DistributedDataset class to be
built. In special cases, e.g. when we are building the low level dataset for a
RawMegatronDataset instance, we can accept a Callable which returns an Iterable.
synchronize_ranks (bool): Whether to call barrier for rank-0 / barrier / other-ranks behavior. Set to False when we enforce this behavior at a higher level.
synchronize_ranks (bool): Whether to call barrier for rank-0 / barrier / other-ranks
behavior. Set to False when we enforce this behavior at a higher level.
args (Tuple[Any]): The positional arguments used to build the provided DistributedDataset class
args (Tuple[Any]): The positional arguments used to build the provided
DistributedDataset class
Raises:
Exception: When the dataset constructor raises an OSError
Returns:
Optional[Union[DistributedDataset, Iterable]]: The DistributedDataset instantiation, the Iterable instantiation, or None
Optional[Union[DistributedDataset, Iterable]]: The DistributedDataset instantiation, the
Iterable instantiation, or None
"""
if torch.distributed.is_initialized():
rank = torch.distributed.get_rank()
@@ -485,10 +529,10 @@ class BlendedMegatronDatasetBuilder(object):
dataset = cls(*args)
except OSError as err:
log = (
f"Failed to write dataset materials to the data cache directory. "
+ f"Please supply a directory to which you have write access via "
+ f"the path_to_cache attribute in BlendedMegatronDatasetConfig and "
+ f"retry. Refer to the preserved traceback above for more information."
f"Failed to write dataset materials to the data cache directory. Please "
f"supply a directory to which you have write access via the path_to_cache "
f"attribute in BlendedMegatronDatasetConfig and retry. Refer to the "
f"preserved traceback above for more information."
)
raise Exception(log) from err
@@ -505,23 +549,30 @@
def _get_size_per_split_per_dataset(
normalized_weights: List[float], target_size_per_split: List[int]
normalized_weights: List[float], target_size_per_split: List[int], margin: float = 0.0
) -> List[List[int]]:
"""Determine the contribution of the MegatronDataset splits to the BlendedDataset splits
Args:
normalized_weights (List[float]): e.g. [0.3, 0.7]
target_size_per_split (List[int]): The number of samples to target for each BlendedDataset split
target_size_per_split (List[int]): The number of samples to target for each BlendedDataset
split
margin (float): The relative quantity of extra samples to build per split per dataset,
as a percentage
Returns:
List[List[int]]: The number of samples to request per MegatronDataset per split
"""
assert numpy.isclose(sum(normalized_weights), 1.0)
# Use 0.5% target margin to ensure we satiate the request
# Use margin as buffer to ensure we satiate the request
sizes_per_dataset = [
[int(math.ceil(target_size * weight * 1.005)) for target_size in target_size_per_split]
[
int(math.ceil(math.ceil(target_size * weight) * (1 + margin / 100)))
for target_size in target_size_per_split
]
for weight in normalized_weights
]
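A worked example of the new `margin` parameter (illustrative numbers): the buffer sizes come out slightly above the targets, which is what lets the blend request always be satisfied:

```python
import math
from typing import List

def sizes_sketch(
    weights: List[float], targets: List[int], margin: float = 0.0
) -> List[List[int]]:
    # Mirrors the formula above: ceil the weighted target, then inflate it
    # by `margin` percent and ceil again.
    return [
        [int(math.ceil(math.ceil(t * w) * (1 + margin / 100))) for t in targets]
        for w in weights
    ]

print(sizes_sketch([0.3, 0.7], [1000]))       # [[300], [700]] -> blend targets
print(sizes_sketch([0.3, 0.7], [1000], 0.5))  # [[302], [704]] -> build buffer
```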
@@ -34,12 +34,6 @@ class BlendedMegatronDatasetConfig:
'blend'. Defaults to None.
"""
renormalize_blend_weights: bool = False
"""Renormalize the blend weights to account for mid-level dataset oversampling done to ensure
fulfillment of the requested number of samples. Defaults to False for backward
compatibility in the data sample order.
"""
split: Optional[str] = None
"""The split string, a comma separated weighting for the dataset splits when drawing samples
from a single distribution. Not to be used with 'blend_per_split'. Defaults to None.
@@ -67,7 +61,7 @@ class BlendedMegatronDatasetConfig:
"""
tokenizer: Optional[MegatronTokenizer] = None
"""The MegatronTokenizer instance or None. Required for datasets which do online tokenization."""
"""The MegatronTokenizer instance. Required for datasets that do online tokenization."""
def __post_init__(self) -> None:
"""Do asserts and set fields post init"""
@@ -149,7 +143,8 @@ def convert_split_vector_to_split_matrix(
Args:
vector_a (List[float]): The primary split vector
vector_b (Optional[List[float]]): An optional secondary split vector which constrains the primary split vector. Defaults to None.
vector_b (Optional[List[float]]): An optional secondary split vector which constrains the
primary split vector. Defaults to None.
Returns:
List[Tuple[float, float]]: The split matrix consisting of book-ends of each split in order
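A worked sketch of the conversion (the real function also threads through the optional constraining `vector_b`, which this toy version omits):

```python
import itertools
from typing import List, Tuple

def split_vector_to_matrix_sketch(vector: List[float]) -> List[Tuple[float, float]]:
    # The book-ends are cumulative sums bracketing each split on [0, 1].
    bookends = [0.0] + list(itertools.accumulate(vector))
    return list(zip(bookends[:-1], bookends[1:]))

print(split_vector_to_matrix_sketch([0.8, 0.1, 0.1]))
# [(0.0, 0.8), (0.8, 0.9), (0.9, 1.0)]  (modulo floating-point rounding)
```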
File mode changed from 100755 to 100644 (multiple files)