Commit d444a97a authored by yangzhong

First upload
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import dataclasses
import json
import os
import torch
import torch.nn as nn
from megatron.core import parallel_state
def get_config_logger_path(config):
return getattr(config, 'config_logger_dir', '')
def has_config_logger_enabled(config):
return get_config_logger_path(config) != ''
# For each prefix, holds a counter and increases it every time we dump with this
# prefix.
__config_logger_path_counts = {}
def get_path_count(path):
"""
keeps tracks of number of times we've seen the input `path` and return count-1
"""
global __config_logger_path_counts
if not path in __config_logger_path_counts:
__config_logger_path_counts[path] = 0
count = __config_logger_path_counts[path]
__config_logger_path_counts[path] += 1
return count
def get_path_with_count(path):
"""
    Calls get_path_count and appends the returned count to the path
"""
return f'{path}.iter{get_path_count(path)}'
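# Illustrative behavior (not part of the module): repeated calls with the same
# path yield increasing .iterN suffixes, e.g. for a hypothetical path
#   get_path_with_count('/tmp/cfg/attn.rank_0')  # -> '/tmp/cfg/attn.rank_0.iter0'
#   get_path_with_count('/tmp/cfg/attn.rank_0')  # -> '/tmp/cfg/attn.rank_0.iter1'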
class JSONEncoderWithMcoreTypes(json.JSONEncoder):
def default(self, o):
if type(o).__name__ in ['function', 'ProcessGroup']:
return str(o)
if type(o).__name__ in ['dict', 'OrderedDict']:
return {k: self.default(v) for k, v in o.items()}
if type(o).__name__ in ['list', 'ModuleList']:
return [self.default(val) for val in o]
if type(o).__name__ == 'UniqueDescriptor':
return {
attr: self.default(getattr(o, attr))
for attr in filter(lambda x: not x.startswith('__'), dir(o))
}
if type(o) is torch.dtype:
return str(o)
# if it's a Float16Module, add "Float16Module" to the output dict
if type(o).__name__ == 'Float16Module':
return {'Float16Module': {'module': self.default(o.module)}}
        # If it's an nn.Module subclass, encode its children, or the module itself if it is a leaf.
if issubclass(type(o), nn.Module):
if len(getattr(o, '_modules', {})) > 0:
return {key: self.default(val) for key, val in o._modules.items()}
else:
return str(o)
if type(o).__name__ in ['ABCMeta', 'type', 'AttnMaskType']:
return str(o)
if dataclasses.is_dataclass(o) or type(o).__name__ in ['ModuleSpec', 'TransformerConfig']:
return dataclasses.asdict(o)
        try:
            return super().default(o)
        except Exception:
            # Fall back to the string representation for anything the base
            # encoder cannot handle.
            return str(o)
def log_config_to_disk(config, dict_data, prefix=''):
"""
Encodes the input dict (dict_data) using the JSONEncoderWithMcoreTypes
and dumps to disk, as specified via path
"""
path = get_config_logger_path(config)
assert path is not None, 'Expected config_logger_dir to be non-empty in config.'
if 'self' in dict_data:
if prefix == '':
prefix = type(dict_data['self']).__name__
del dict_data['self']
if not os.path.exists(path):
os.makedirs(path, exist_ok=True)
rank = parallel_state.get_all_ranks()
path = get_path_with_count(os.path.join(path, f'{prefix}.rank_{rank}'))
if type(dict_data).__name__ == 'OrderedDict':
torch.save(dict_data, f'{path}.pth')
else:
with open(f'{path}.json', 'w') as fp:
json.dump(dict_data, fp, cls=JSONEncoderWithMcoreTypes)
__all__ = ['has_config_logger_enabled', 'log_config_to_disk']
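# Usage sketch (illustrative, not part of the module; assumes `config` is any
# object with a `config_logger_dir` attribute, e.g. a TransformerConfig):
#
#   if has_config_logger_enabled(config):
#       log_config_to_disk(config, locals(), prefix=type(self).__name__)
#
# Each call writes <config_logger_dir>/<prefix>.rank_<ranks>.iterN.json, or a
# .pth file when the payload is an OrderedDict (it is saved via torch.save).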
# Makefile for the C++ dataset helpers (a pybind11 extension module).
CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color
CPPFLAGS += $(shell python3 -m pybind11 --includes)

LIBNAME = helpers_cpp
LIBEXT = $(shell python3-config --extension-suffix)

OUT = $(LIBNAME)$(LIBEXT)
SRC = helpers.cpp

default: $(OUT)

$(OUT): $(SRC)
	$(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@
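# Build sketch (assumes pybind11 and a C++ toolchain are installed):
#   $ make
#   $ python3 -c "import helpers_cpp"   # import check, run from this directory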
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from dataclasses import dataclass
from typing import Dict, List, Optional, Union
import numpy
from megatron.core.datasets.indexed_dataset import IndexedDataset
from megatron.core.datasets.masked_dataset import (
MaskedWordPieceDataset,
MaskedWordPieceDatasetConfig,
)
from megatron.core.datasets.utils import Split
@dataclass
class BERTMaskedWordPieceDatasetConfig(MaskedWordPieceDatasetConfig):
"""Configuration object for Megatron Core BERT WordPiece datasets"""
    classification_head: Optional[bool] = None
"""Option to perform the next sequence prediction during sampling"""
def __post_init__(self) -> None:
"""Do asserts and set fields post init"""
super().__post_init__()
assert self.classification_head is not None
class BERTMaskedWordPieceDataset(MaskedWordPieceDataset):
"""The BERT dataset that assumes WordPiece tokenization
Args:
indexed_dataset (IndexedDataset): The IndexedDataset around which to build the MegatronDataset
dataset_path (str): The real path on disk to the dataset, for bookkeeping
indexed_indices (numpy.ndarray): The set of the documents indices to expose
num_samples (Optional[int]): The number of samples to draw from the indexed dataset. When None, build as many samples as correspond to one epoch.
index_split (Split): The indexed_indices Split
config (BERTMaskedWordPieceDatasetConfig): The config
"""
def __init__(
self,
indexed_dataset: IndexedDataset,
dataset_path: str,
indexed_indices: numpy.ndarray,
num_samples: Optional[int],
index_split: Split,
config: BERTMaskedWordPieceDatasetConfig,
) -> None:
super().__init__(
indexed_dataset, dataset_path, indexed_indices, num_samples, index_split, config
)
self.token_lookup = list(self.config.tokenizer.inv_vocab.keys())
        # Reserve space for the single <cls> and (up to) two <sep> token ids; require
        # at least two sentences per sample when the classification (next sentence
        # prediction) head is enabled
self.sample_index = self._build_sample_index(
self.config.sequence_length - 3, 2 if self.config.classification_head else 1
)
@staticmethod
def _key_config_attributes() -> List[str]:
"""Inherited method implementation
Returns:
List[str]: The key config attributes
"""
        # super(cls, cls) lets a staticmethod dispatch to the parent class's
        # staticmethod implementation
        return super(
            BERTMaskedWordPieceDataset, BERTMaskedWordPieceDataset
        )._key_config_attributes() + ["classification_head"]
def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]:
"""Abstract method implementation
Args:
idx (int): The index into the dataset
Returns:
            Dict[str, Union[int, numpy.ndarray]]: The token ids, token type ids, labels, masks, and sample metadata
"""
idx_beg, idx_end, target_sequence_length = self.sample_index[idx]
sample = [self.dataset[i] for i in range(idx_beg, idx_end)]
numpy_random_state = numpy.random.RandomState(seed=(self.config.random_seed + idx) % 2**32)
assert target_sequence_length <= self.config.sequence_length
# Split the sample into contiguous subsegments A and B
pivot = len(sample)
is_next_random = False
if self.config.classification_head:
assert len(sample) > 1, "the sample must contain at least two sentences"
pivot = 1
if len(sample) >= 3:
pivot = numpy_random_state.randint(low=1, high=len(sample))
is_next_random = numpy_random_state.random() < 0.5
split_A = []
for sample_a in sample[:pivot]:
split_A.extend(sample_a)
split_B = []
for sample_b in sample[pivot:]:
split_B.extend(sample_b)
if is_next_random:
split_A, split_B = split_B, split_A
# Trim the subsegments from either end to a desired joint length
length_A = len(split_A)
length_B = len(split_B)
if length_A + length_B <= target_sequence_length:
truncated = False
else:
while length_A + length_B > target_sequence_length:
split = split_A if length_A > length_B else split_B
if numpy_random_state.random() < 0.5:
del split[0]
else:
del split[-1]
length_A = len(split_A)
length_B = len(split_B)
truncated = True
# Merge the subsegments and create the token assignment labels
tokens = [self.config.tokenizer.cls, *split_A, self.config.tokenizer.sep]
assignments = [0 for _ in range(1 + len(split_A) + 1)]
if split_B:
tokens += [*split_B, self.config.tokenizer.sep]
assignments += [1 for _ in range(len(split_B) + 1)]
# Masking
tokens, masked_positions, masked_labels, _, _ = self._create_masked_lm_predictions(
tokens, target_sequence_length, numpy_random_state
)
# Pad the sequences and convert to NumPy
length_toks = len(tokens)
length_pads = self.config.sequence_length - length_toks
assert length_pads >= 0
tokens = numpy.array(tokens, dtype=numpy.int64)
tokens = numpy.pad(tokens, (0, length_pads), constant_values=self.config.tokenizer.pad)
assignments = numpy.array(assignments, dtype=numpy.int64)
assignments = numpy.pad(
assignments, (0, length_pads), constant_values=self.config.tokenizer.pad
)
# Get the padding mask
mask_pads = numpy.ones(length_toks, dtype=numpy.int64)
mask_pads = numpy.pad(
mask_pads, (0, length_pads), constant_values=self.config.tokenizer.pad
)
# Mask the labels
labels = numpy.zeros(self.config.sequence_length, dtype=numpy.int64) - 1
labels[masked_positions] = masked_labels
# Get the loss mask
mask_loss = numpy.zeros(self.config.sequence_length, dtype=numpy.int64)
mask_loss[masked_positions] = 1
return {
"text": tokens,
"types": assignments,
"labels": labels,
"is_random": int(is_next_random),
"padding_mask": mask_pads,
"loss_mask": mask_loss,
"truncated": int(truncated),
}
def _get_token_mask(self, numpy_random_state: numpy.random.RandomState) -> Optional[int]:
"""Abstract method implementation
80% of the time, replace the token id with mask token id. 10% of the time, replace token id
with a random token id from the vocabulary. 10% of the time, do nothing.
Args:
numpy_random_state (RandomState): The NumPy random state
Returns:
Optional[int]: The replacement token id or None
"""
if numpy_random_state.random() < 0.8:
return self.config.tokenizer.mask
else:
if numpy_random_state.random() >= 0.5:
return self.token_lookup[numpy_random_state.randint(0, len(self.token_lookup))]
return None
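# Standalone sanity-check sketch (illustrative, not part of the file) for the
# 80/10/10 rule implemented in _get_token_mask above: the first draw selects
# the mask token with probability 0.8; the remaining 0.2 splits evenly between
# a random vocabulary token and no replacement.
if __name__ == '__main__':
    rng = numpy.random.RandomState(0)
    counts = {'mask': 0, 'random': 0, 'keep': 0}
    for _ in range(100_000):
        if rng.random() < 0.8:
            counts['mask'] += 1
        elif rng.random() >= 0.5:
            counts['random'] += 1
        else:
            counts['keep'] += 1
    print(counts)  # approximately {'mask': 80000, 'random': 10000, 'keep': 10000}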
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
import hashlib
import json
import logging
import os
import time
from collections import OrderedDict
from typing import Dict, List, Optional, Tuple, Union
import numpy
import torch
from megatron.core.datasets.blended_megatron_dataset_config import BlendedMegatronDatasetConfig
from megatron.core.datasets.megatron_dataset import MegatronDataset
from megatron.core.datasets.utils import normalize
from megatron.core.utils import log_single_rank
logger = logging.getLogger(__name__)
_VERBOSE = False
class BlendedDataset(torch.utils.data.Dataset):
"""Conjugating class for a set of MegatronDataset instances
Args:
datasets (List[MegatronDataset]): The MegatronDataset instances to blend
weights (List[Union[int, float]]): The weights that determine the dataset blend ratios
size (Optional[int]): The number of samples to draw from the blend. If None, for each dataset index idx draw exactly weights[idx] samples from datasets[idx].
config (BlendedMegatronDatasetConfig): The config
Raises:
RuntimeError: When the dataset has fewer or more samples than 'size' post-initialization
"""
def __init__(
self,
datasets: List[MegatronDataset],
weights: List[Union[int, float]],
size: Optional[int],
config: BlendedMegatronDatasetConfig,
) -> None:
assert len(datasets) == len(weights)
        # The dataset index is stored as int16, which caps the dataset count
        assert len(datasets) < 32767
assert all(map(lambda _: type(_) == type(datasets[0]), datasets))
assert all(map(lambda _: _.index_split == datasets[0].index_split, datasets))
assert all(map(lambda _: _ > 0, weights))
assert all(map(lambda _: type(_) == type(weights[0]), weights))
        # Un-sized blends draw exactly weights[i] samples from dataset i, so the
        # weights must be integer-valued
        if size is None and isinstance(weights[0], float):
            assert all(map(lambda _: _ == int(_), weights))
# Alert user to unnecessary blending
if len(datasets) == 1:
log_single_rank(
                logger, logging.WARNING, "Building a BlendedDataset for a single MegatronDataset"
)
if size is not None:
weights = normalize(weights)
self.datasets = datasets
self.split = self.datasets[0].index_split
self.weights = weights
self.size = size
self.config = config
unique_identifiers = OrderedDict()
unique_identifiers["class"] = type(self).__name__
unique_identifiers["datasets"] = [dataset.unique_identifiers for dataset in self.datasets]
unique_identifiers["split"] = self.split.name
unique_identifiers["weights"] = self.weights
unique_identifiers["size"] = self.size
unique_identifiers["renormalize_blend_weights"] = self.config.renormalize_blend_weights
self.unique_description = json.dumps(
unique_identifiers, indent=4, default=lambda obj: obj.unique_identifiers
)
self.unique_description_hash = hashlib.md5(
self.unique_description.encode("utf-8")
).hexdigest()
self.built_anew_on_cache_miss = False
self.dataset_index, self.dataset_sample_index = self._build_indices()
def __len__(self) -> int:
return self.dataset_index.shape[0]
def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]:
dataset_id = self.dataset_index[idx]
dataset_sample_id = self.dataset_sample_index[idx]
return {"dataset_id": dataset_id, **self.datasets[dataset_id][dataset_sample_id]}
def _build_indices(self) -> Tuple[numpy.ndarray, numpy.ndarray]:
"""Build and optionally cache the dataset index and the dataset sample index
The dataset index is a 1-D mapping which determines the dataset to query. The dataset
sample index is a 1-D mapping which determines the sample to request from the queried
dataset.
Returns:
Tuple[numpy.ndarray, numpy.ndarray]: The dataset index and the dataset sample index
"""
path_to_cache = self.config.path_to_cache
if path_to_cache:
get_path_to = lambda suffix: os.path.join(
path_to_cache,
f"{self.unique_description_hash}-{type(self).__name__}-{self.split.name}-{suffix}",
)
path_to_description = get_path_to("description.txt")
path_to_dataset_index = get_path_to("dataset_index.npy")
path_to_dataset_sample_index = get_path_to("dataset_sample_index.npy")
cache_hit = all(
map(
os.path.isfile,
[path_to_description, path_to_dataset_index, path_to_dataset_sample_index],
)
)
else:
cache_hit = False
        # Build the indices when caching is disabled, or on rank 0 when the
        # cache is missing
        if not path_to_cache or (not cache_hit and torch.distributed.get_rank() == 0):
log_single_rank(
logger, logging.INFO, f"Build and save the {type(self).__name__} indices"
)
self.built_anew_on_cache_miss = True
# Build the dataset and dataset sample indexes
log_single_rank(
logger, logging.INFO, f"\tBuild and save the dataset and dataset sample indexes"
)
t_beg = time.time()
from megatron.core.datasets import helpers
if self.size is not None:
dataset_index = numpy.zeros(self.size, dtype=numpy.int16)
dataset_sample_index = numpy.zeros(self.size, dtype=numpy.int64)
helpers.build_blending_indices(
dataset_index,
dataset_sample_index,
self.weights,
len(self.datasets),
self.size,
_VERBOSE,
)
else:
size = sum(self.weights)
dataset_index = numpy.zeros(size, dtype=numpy.int16)
dataset_sample_index = numpy.zeros(size, dtype=numpy.int64)
helpers.build_exhaustive_blending_indices(
dataset_index, dataset_sample_index, self.weights, len(self.datasets)
)
if path_to_cache:
os.makedirs(path_to_cache, exist_ok=True)
# Write the description
with open(path_to_description, "wt") as writer:
writer.write(self.unique_description)
# Save the indexes
numpy.save(path_to_dataset_index, dataset_index, allow_pickle=True)
numpy.save(path_to_dataset_sample_index, dataset_sample_index, allow_pickle=True)
else:
log_single_rank(
logger,
logging.WARNING,
f"Unable to save the {type(self).__name__} indexes because path_to_cache is None",
)
t_end = time.time()
            log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:.4f} seconds")
return dataset_index, dataset_sample_index
log_single_rank(logger, logging.INFO, f"Load the {type(self).__name__} indices")
log_single_rank(
logger, logging.INFO, f"\tLoad the dataset index from {path_to_dataset_index}"
)
t_beg = time.time()
dataset_index = numpy.load(path_to_dataset_index, allow_pickle=True, mmap_mode='r')
t_end = time.time()
        log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:.4f} seconds")
log_single_rank(
logger,
logging.INFO,
f"\tLoad the dataset sample index from {path_to_dataset_sample_index}",
)
t_beg = time.time()
dataset_sample_index = numpy.load(
path_to_dataset_sample_index, allow_pickle=True, mmap_mode='r'
)
t_end = time.time()
        log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:.4f} seconds")
return dataset_index, dataset_sample_index
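# Illustrative pure-Python sketch (an assumption for exposition, NOT the real
# helpers API; the real build_blending_indices is the C++ extension compiled by
# the Makefile above) of the error-feedback blending: at each step, pick the
# dataset whose realized sample count lags its weighted target the most.
if __name__ == '__main__':

    def build_blending_indices_py(weights, size):
        """Return (dataset_index, dataset_sample_index) for the given blend."""
        weights = numpy.array(weights, dtype=numpy.float64)
        dataset_index = numpy.zeros(size, dtype=numpy.int16)
        dataset_sample_index = numpy.zeros(size, dtype=numpy.int64)
        current_samples = numpy.zeros(len(weights), dtype=numpy.int64)
        for i in range(size):
            # Error = weighted target count so far minus samples actually drawn
            errors = weights * max(i, 1) - current_samples
            chosen = int(numpy.argmax(errors))
            dataset_index[i] = chosen
            dataset_sample_index[i] = current_samples[chosen]
            current_samples[chosen] += 1
        return dataset_index, dataset_sample_index

    dataset_index, dataset_sample_index = build_blending_indices_py([0.7, 0.3], 10)
    print(dataset_index)         # [0 1 0 0 1 0 0 1 0 0] -- a ~70/30 interleave
    print(dataset_sample_index)  # [0 0 1 2 1 3 4 2 5 6] -- per-dataset counters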