首次上传

d444a97a · yangzhong · d444a97a · d444a97a · d444a97a · d444a97a
Commit d444a97a authored Oct 30, 2025 by yangzhong
20 changed files
--- a/hostfile_gptmoe
+++ b/hostfile_gptmoe
+node007 slots=8
+node008 slots=8
+node009 slots=8
+node010 slots=8
+node011 slots=8
+node012 slots=8
+node013 slots=8
+node014 slots=8
+node015 slots=8
+node016 slots=8
+node017 slots=8
+node018 slots=8
+node019 slots=8
+node020 slots=8
+node021 slots=8
+node022 slots=8
--- a/hostfile_mixtral8x7B
+++ b/hostfile_mixtral8x7B
+node007 slots=8
+node008 slots=8
--- a/images/model_table.png
+++ b/images/model_table.png
--- a/images/strong_scaling.png
+++ b/images/strong_scaling.png
--- a/images/weak_scaling.png
+++ b/images/weak_scaling.png
--- a/kill.sh
+++ b/kill.sh
+ps -ef | grep python3 | grep -v grep | awk '{print "kill "$2}' >& .tmp
+bash .tmp
+
--- a/megatron/core/QuickStart.md
+++ b/megatron/core/QuickStart.md
+## Quick Start
+
+The following guide is a short getting started guide for Megatron Core. In it you:
+
+* Initialize Megatron Core on 2 GPUS. 
+* Build a GPT model with tensor model parallel size 2, pipeline parallel size 1
+* Train it for a five iterations using Megatron Core schedules
+* Save the model using the distributed checkpointing format
+* Load the model saved above. 
+
+**NOTE:** The following sample was tested using Megatron Core version 0.8.0 and NGC PyTorch Container version 24.02. 
+
+### Environment Setup
+
+```
+docker run --ipc=host --shm-size=512m --gpus 2 -it nvcr.io/nvidia/pytorch:24.02-py3
+
+git clone https://github.com/NVIDIA/Megatron-LM.git && cd Megatron-LM
+```
+<br>
+
+### Writing Your First Training Loop
+
+In the following steps you create a sample GPT model split across tensors (Tensor model parallel) on 2 GPUS, and run a forward pass through it using a MockGPT dataset helper class that we created in Megatron Core. 
+
+<br>
+
+**NOTE:** All of the following steps are in the [run_simple_mcore_train_loop.py](https://github.com/NVIDIA/Megatron-LM/tree/main/examples/run_simple_mcore_train_loop.py) script.
+
+To run the ``run_simple_mcore_train_loop.py`` script:
+
+```
+PYTHONPATH=$PYTHON_PATH:./megatron torchrun --nproc-per-node 2 examples/run_simple_mcore_train_loop.py
+```
+
+<br>
+
+**STEP 1 - Initialize Distributed Training and Model Parallel Setup**
+
+The following utility, when called, initializes your distributed setup. 
+
+```python
+import os
+import torch
+from megatron.core import parallel_state
+
+def initialize_distributed(tensor_model_parallel_size = 1, pipeline_model_parallel_size = 1):
+    # Torch setup for distributed training
+    rank = int(os.environ['LOCAL_RANK'])
+    world_size = torch.cuda.device_count()
+    torch.cuda.set_device(rank)
+    torch.distributed.init_process_group(world_size=world_size, rank=rank)
+
+    # Megatron core distributed training initialization
+    parallel_state.initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size)
+```
+<br>
+
+**STEP 2 - GPT Model Setup**
+
+In this step, you create a GPT model. For a list of other configurations that you can pass into the model open and review [transformer_config.py](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core/transformer/transformer_config.py).
+
+```
+from megatron.core.transformer.transformer_config import TransformerConfig
+from megatron.core.models.gpt.gpt_model import GPTModel
+from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec
+
+def model_provider():
+    """Build the model."""
+
+    transformer_config = TransformerConfig(
+        num_layers=2, 
+        hidden_size=12, 
+        num_attention_heads=4, 
+        use_cpu_initialization=True, 
+        pipeline_dtype=torch.float32)
+
+    gpt_model = GPTModel(
+        config=transformer_config, 
+        transformer_layer_spec=get_gpt_layer_local_spec(), 
+        vocab_size=100, 
+        max_sequence_length=64)
+
+    return gpt_model
+```
+<br>
+
+**STEP 3 - GPT Mock Dataset Setup**
+
+In the following step, you explore the mock dataset utility.
+
+* To train the model using your data, use the GPTDataset class in [gpt_dataset.py](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core/datasets/gpt_dataset.py).
+
+* To find more information about Megatron Core data pipeline, see the [data pipeline readme.md](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core/datasets/readme.md?ref_type=heads).
+
+```
+import torch
+from torch.utils.data import DataLoader
+
+from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
+from megatron.core.datasets.gpt_dataset import GPTDatasetConfig, MockGPTDataset
+from megatron.training.tokenizer.tokenizer import _NullTokenizer
+from megatron.core.datasets.utils import compile_helpers
+
+_SEQUENCE_LENGTH = 64
+
+def get_train_data_iterator():
+    if torch.distributed.is_available() and torch.distributed.is_initialized():
+        if torch.distributed.get_rank() == 0:
+            compile_helpers()
+        torch.distributed.barrier()
+    else:
+        compile_helpers()
+
+    config = GPTDatasetConfig(
+        random_seed=0,
+        sequence_length=_SEQUENCE_LENGTH,
+        reset_position_ids=False,
+        reset_attention_mask=False,
+        eod_mask_loss=False,
+        tokenizer=_NullTokenizer(vocab_size=_SEQUENCE_LENGTH),
+    )
+
+    datasets = BlendedMegatronDatasetBuilder(
+        MockGPTDataset, [1000, None, None], lambda: True, config
+    ).build()
+
+    train_dataloader = DataLoader(datasets[0], batch_size=8, shuffle=True)
+
+    train_iterator = iter(train_dataloader)
+
+    return train_iterator
+
+```
+<br>
+
+**STEP 4 - Forward Step Function**
+
+Megatron Core uses [schedules.py](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core/pipeline_parallel/schedules.py) to run the model. It is sufficient to define a forward step function, which takes as input the data iterator and the model and produces as output the output tensor and a loss function.
+
+```python
+from functools import partial
+
+def forward_step_func(data_iterator, model):
+   
+    def loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor):
+
+        losses = output_tensor.float()
+        loss_mask = loss_mask.view(-1).float()
+        loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()
+        # If you have data parallel reduce loss across data parallel groups. 
+        # If pipeline parallel, loss computation is done only in last stage.
+
+        return loss, {'lm loss': loss}
+
+    data = next(data_iterator)
+    tokens = data['tokens'].to(device)
+    attention_mask = data['attention_mask'].to(device)
+    position_ids = data['position_ids'].to(device)
+    labels = data['labels'].to(device)
+    loss_mask = data['loss_mask'].to(device)
+   
+    output_tensor = model(tokens, position_ids, attention_mask,
+                          labels=labels)
+
+    return output_tensor, partial(loss_func, loss_mask)   
+```
+<br>
+
+**STEP 5 - Load and Save Distributed Checkpoint**
+
+Megatron Core uses distributed checkpoints for loading and saving models. This gives you the flexibility to convert the model from one model parallel setting to another when you load a model. For example, a model trained with tensor parallel size 2, can be loaded again as tensor model parallel size 4, and so forth.
+
+```python
+from megatron.core import dist_checkpointing
+
+def save_distributed_checkpoint(checkpoint_path, gpt_model):
+    sharded_state_dict = gpt_model.sharded_state_dict(prefix='')
+    dist_checkpointing.save(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path)
+
+def load_distributed_checkpoint(checkpoint_path, gpt_model):
+    sharded_state_dict=gpt_model.sharded_state_dict(prefix='')
+    checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path)
+    gpt_model.load_state_dict(checkpoint)
+    return gpt_model
+```
+<br>
+
+**STEP 6 - Main Function**
+
+The following code snippet is the main function that needs to go into your script. It runs the model for 5 iterations, saves the model, and loads the data model.  
+
+```python
+from pathlib import Path
+from torch.optim import Adam
+from megatron.core.pipeline_parallel.schedules import get_forward_backward_func
+from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
+
+if __name__ == "__main__":
+    initialize_distributed(tensor_model_parallel_size=2, pipeline_model_parallel_size=1)
+    model_parallel_cuda_manual_seed(123)
+
+    gpt_model = model_provider()
+    device = torch.device("cuda")
+    gpt_model.to(device)
+
+    optim = Adam(gpt_model.parameters())
+    
+    train_iterator = get_train_data_iterator()
+    
+    forward_backward_func = get_forward_backward_func()
+
+    # Running the model for 5 iterations
+    for _ in range(5):
+        optim.zero_grad()
+        
+        losses_reduced = forward_backward_func(
+            forward_step_func=forward_step_func,
+            data_iterator=train_iterator,
+            model=gpt_model,
+            num_microbatches=1,
+            seq_length=64,
+            micro_batch_size=8,
+            decoder_seq_length=64,
+            forward_only=False)
+    
+        optim.step()
+
+        print(f'Losses reduced :  {losses_reduced}')
+
+    # Saving the model
+    save_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path='/workspace/ckpt')
+
+    # Loading the model
+    gpt_model = load_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path='/workspace/ckpt')
+    gpt_model.to(device)
+    print('Successfully loaded the model')  
+```
+<br>
+
+
+
+### Extending Further
+
+The example you explored here is a basic training loop in Megatron Core. To review more advanced examples, explore [pretrain_gpt.py]. ``pretrain_gpt.py`` has more complex training loops that includes the following and other Megatron Core features:
+
+* pipeline parallel
+* context parallel
+* rope embeddings
+* mixture of experts
--- a/megatron/core/README.md
+++ b/megatron/core/README.md
+# Megatron-Core
+
+Megatron-Core is an open-source PyTorch-based library that contains GPU-optimized techniques and cutting-edge system-level optimizations. It abstracts them into composable and modular APIs, allowing full flexibility for developers and model researchers to train custom transformers at-scale on NVIDIA accelerated computing infrastructure. This library is compatible with all NVIDIA Tensor Core GPUs, including FP8 acceleration support for [NVIDIA Hopper architectures](https://www.nvidia.com/en-us/data-center/technologies/hopper-architecture/).
+
+Megatron-Core offers core building blocks such as attention mechanisms, transformer blocks and layers, normalization layers, and embedding techniques. Additional functionality like activation re-computation, distributed checkpointing is also natively built-in to the library. The building blocks and functionality are all GPU optimized, and can be built with advanced parallelization strategies for optimal training speed and stability on NVIDIA Accelerated Computing Infrastructure. Another key component of the Megatron-Core library includes advanced model parallelism techniques (tensor, sequence, pipeline, context, and MoE expert parallelism).
+
+Megatron-Core can be used with [NVIDIA NeMo](https://www.nvidia.com/en-us/ai-data-science/products/nemo/), an enterprise-grade AI platform. Alternatively, you can explore Megatron-Core with the native PyTorch training loop [here](https://github.com/NVIDIA/Megatron-LM/tree/main/examples). Visit [Megatron-Core documentation](https://docs.nvidia.com/megatron-core/developer-guide/latest/index.html) to learn more.
+
+## Quick links
+
+- [Benchmark using NVIDIA NeMo](https://docs.nvidia.com/nemo-framework/user-guide/latest/overview.html#performance-benchmarks)
+- [Multimodal example (LLaVA training pipeline)](https://github.com/NVIDIA/Megatron-LM/tree/main/examples/multimodal)
+- [Mixture-of-Experts](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core/transformer/moe)
+- [Training Mamba-based Language Models](https://github.com/NVIDIA/Megatron-LM/tree/main/examples/mamba)
--- a/megatron/core/README_STRAGGLER.md
+++ b/megatron/core/README_STRAGGLER.md
+## StragglerDetector for a TP Group
+
+The file `megatron/core/utils.py` has a class named `StragglerDetector` which supports Python Contexts.
+It can be used to find straggling TP group based on the RTT of the ranks in the TP Group. It also collects
+Power/Temp/Utilization for GPUs, which can additionally be used to narrow down to the exact GPU in the TP Group,
+assuming the straggling was caused by hardware anomaly in a given GPU.<br>
+This class supports collecting timing events for various steps of a given iteration. It
+keeps collecting such timing events on a per rank basis, and when the reporter is invoked
+during a logging interval, it computes the min and max of certain metric across all
+ranks and logs the observed metric and the rank as follows
+
+```
+ 0: INFO:megatron.core.utils:[2024-03-14 23:07:56] | MnRtt/Rnk: 3453.08ms/8 | MxRtt/Rnk: 3468.20ms/0 | MnPwr/Rnk: 601796W/8 | MxPwr/Rnk: 683801W/18 | MnTmp/Rnk: 52C/0 | MxTmp/Rnk: 65C/21 | MnUtl/Rnk: 97%/8 | MxUtl/Rnk: 100%/6 | MnClk/Rnk: 1950MHz/28 | MxClk/Rnk: 1980MHz/0 | MnDRtt/Rnk: 14.27ms/23 | MxDRtt/Rnk: 34.65ms/3 | MnEtpt/Rnk: 296.02TF/0 | MxEtpt/Rnk: 297.32TF/8
+```
+<hr>
+
+### Description of the metrics
+
+Each metric is prefixed with `Mn` or `Mx` to represent `Minimum` or `Maximum`. Each metric is also suffixed with the rank where the metric was measured. The metrics are averaged over the logging interval. Between the prefix and the rank is the name of the metric as follows
+
+- Rtt : RoundTrip Time (time spent in all the traced ops per iteration)
+- Pwr : GPU Power
+- Tmp : GPU Temperature
+- Utl : GPU Utilization
+- Clk : GPU Clock
+- DRtt: get_batch latency
+- Etpt: Estimated throughput. This is derived from actual computed throughput dividied by Rtt. Since we do not collect timing for backward pass, the value is further divided by three to come up with estimated throughput. 
+<hr>
+
+### Command Line activation
+To start using the StragglerDetector, need to pass the following argument `--log-straggler`. It optionally also takes two additional parameters. Default disabled
+- `--disable-straggler-on-startup` - whether to keept the StragglerDetector disabled on startup and enable later. Default enabled
+- `--straggler-ctrlr-port` - The StragglerDetector can toggle between on/off just by sending `curl Rank0Host:port`. Default port is 65535. Every time it is turned 
+- `--straggler-minmax-count` - If set to > 1 (N), it prints N Top and Bottom Etpt/Rank pairs as shown below
+```
+ 0: INFO:megatron.core.utils:^^^^ Bottom 4 Ranks with lowest  Etpt(TF): 296.02/0, 296.17/2, 296.23/1, 296.23/4,
+ 0: INFO:megatron.core.utils:^^^^ Top    4 Ranks with highest Etpt(TF): 297.28/15, 297.28/11, 297.32/12, 297.32/8,
+```
+<hr>
+
+### Programming the StragglerDetector
+The StragglerDetector class supports context, and its implementation is a Singleton.
+- Initialization 
+
+```
+ # initialization, where StragglerDetector will be used
+   from megatron.core.utils import StragglerDetector
+   stimer = StragglerDetector()
+```
+
+- One time for each rank
+
+```
+ # one time before the training loop starts
+ stimer.configure(world, rank, enabled=True, port=65545)
+
+ # Arguments to configure 
+ #     world   : World Size
+ #     rank    : The rank of this trainer
+ #     mmcnt   : (Optional) Number of ranks to print for showing Min/Max Etpt
+ #     amp     : (Optional) Set to 3.0 if we only use timers in fwd pass
+ #     port    : (Optional) control port, useful only for rank-0
+ #     prefill : (Optional) howmany Events to pre-populate
+ #     enabled : (Optional) whether or not collection is enabled on startup
+```
+
+- To Capture time
+
+```
+ # whereever timing need to be captured
+ with stimer:
+     do_operation()
+
+ # special case for get_batch
+ with stimer(bdata=True):
+      input,... = get_batch(iterator,...)
+```
+
+- Logging in main training loop
+
+```
+ # logging
+   total_flops = 0.0
+   iteration = 0
+   # inside the main training loop
+   while training:
+        iteration += 1
+        do_step()
+        total_flops += get_computed_flops()
+        if iteration % log_interval:
+           stimer.report(total_flops, log_interval)
+           total_flops = 0.0
+```
--- a/megatron/core/__init__.py
+++ b/megatron/core/__init__.py
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+import megatron.core.tensor_parallel
+import megatron.core.utils
+from megatron.core import parallel_state
+from megatron.core.distributed import DistributedDataParallel
+from megatron.core.inference_params import InferenceParams
+from megatron.core.model_parallel_config import ModelParallelConfig
+from megatron.core.package_info import (
+    __contact_emails__,
+    __contact_names__,
+    __description__,
+    __download_url__,
+    __homepage__,
+    __keywords__,
+    __license__,
+    __package_name__,
+    __repository_url__,
+    __shortversion__,
+    __version__,
+)
+from megatron.core.timers import Timers
+
+# Alias parallel_state as mpu, its legacy name
+mpu = parallel_state
+
+__all__ = [
+    "parallel_state",
+    "tensor_parallel",
+    "utils",
+    "DistributedDataParallel",
+    "InferenceParams",
+    "ModelParallelConfig",
+    "Timers",
+]
--- a/megatron/core/__pycache__/__init__.cpython-310.pyc
+++ b/megatron/core/__pycache__/__init__.cpython-310.pyc
--- a/megatron/core/__pycache__/config_logger.cpython-310.pyc
+++ b/megatron/core/__pycache__/config_logger.cpython-310.pyc
--- a/megatron/core/__pycache__/enums.cpython-310.pyc
+++ b/megatron/core/__pycache__/enums.cpython-310.pyc
--- a/megatron/core/__pycache__/inference_params.cpython-310.pyc
+++ b/megatron/core/__pycache__/inference_params.cpython-310.pyc
--- a/megatron/core/__pycache__/jit.cpython-310.pyc
+++ b/megatron/core/__pycache__/jit.cpython-310.pyc
--- a/megatron/core/__pycache__/model_parallel_config.cpython-310.pyc
+++ b/megatron/core/__pycache__/model_parallel_config.cpython-310.pyc
--- a/megatron/core/__pycache__/num_microbatches_calculator.cpython-310.pyc
+++ b/megatron/core/__pycache__/num_microbatches_calculator.cpython-310.pyc
--- a/megatron/core/__pycache__/optimizer_param_scheduler.cpython-310.pyc
+++ b/megatron/core/__pycache__/optimizer_param_scheduler.cpython-310.pyc
--- a/megatron/core/__pycache__/package_info.cpython-310.pyc
+++ b/megatron/core/__pycache__/package_info.cpython-310.pyc
--- a/megatron/core/__pycache__/packed_seq_params.cpython-310.pyc
+++ b/megatron/core/__pycache__/packed_seq_params.cpython-310.pyc