Commit bc5c7fa7 authored by wxj

First test commit

parent 70fddd0f
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Pipeline-parallel size options = [1, 2, 4, 8].
PP=1
# Batch size (global batch size) options = [8, 128].
GBS=8
# Set pipeline-parallel size options.
NLS=$((3*PP))
NNODES=${PP}
# Other params.
TP=8
MBS=1
HS=20480
NAH=128
DDP=local
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
# Name of the job.
export JOB_NAME=results_figure_11_pipeline_parallel_size_${PP}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Interleaved schedule options = [YES, NO].
INTERLEAVED=YES
# Batch size (global batch size) options = [12, 24, 36, ..., 60].
GBS=12
# Set interleaved schedule options.
if [ ${INTERLEAVED} == "YES" ]; then
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 "
elif [ ${INTERLEAVED} == "NO" ]; then
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
else
echo "Invalid configuration"
exit 1
fi
# Other params.
TP=8
PP=12
MBS=1
NLS=96
HS=12288
NAH=96
DDP=local
NNODES=12
# Name of the job.
export JOB_NAME=results_figure_12_interleaved_${INTERLEAVED}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Pipeline-parallel size options = [2, 4, 8, 16, 32].
PP=2
# Batch size (global batch size) options = [32, 128].
GBS=32
# Set pipeline-parallel and tensor-parallel size options.
TP=$((64/PP))
# Other params.
MBS=1
NLS=32
HS=20480
NAH=128
DDP=local
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
NNODES=8
# Name of the job.
export JOB_NAME=results_figure_13_pipeline_parallel_size_${PP}_tensor_parallel_size_${TP}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Pipeline-parallel size options = [2, 4, 8, 16, 32].
PP=2
# Batch size (global batch size) options = [32, 512].
GBS=32
# Set pipeline-parallel and data-parallel size options.
DP=$((64/PP))
# Other params.
TP=1
MBS=1
NLS=32
HS=3840
NAH=32
DDP=local
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
NNODES=8
# Name of the job.
export JOB_NAME=results_figure_14_pipeline_parallel_size_${PP}_data_parallel_size_${DP}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Tensor-parallel size options = [2, 4, 8, 16, 32].
TP=2
# Batch size (global batch size) options = [32, 128, 512].
GBS=32
# Set tensor-parallel and data-parallel size options.
DP=$((64/TP))
# Other params.
PP=1
MBS=1
NLS=32
HS=3840
NAH=32
DDP=local
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
NNODES=8
# Name of the job.
export JOB_NAME=results_figure_15_tensor_parallel_size_${TP}_data_parallel_size_${DP}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Microbatch size options = [1, 2, 4, 8].
MBS=1
# Batch size (global batch size) options = [128, 512].
GBS=128
# Other params.
TP=8
PP=8
NLS=32
HS=15360
NAH=128
DDP=local
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
NNODES=8
# Name of the job.
export JOB_NAME=results_figure_16_microbatch_size_${MBS}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Activation recomputation options = [YES, NO].
ACTIVATION_RECOMPUTATION=YES
# Batch size (global batch size) options = [1, 2, 4, ..., 256].
GBS=1
# Set activation recomputation.
if [ ${ACTIVATION_RECOMPUTATION} == "YES" ]; then
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ ${ACTIVATION_RECOMPUTATION} == "NO" ]; then
MEGATRON_EXTRA_PARAMS=""
else
echo "Invalid configuration"
exit 1
fi
# Other params.
TP=8
PP=16
MBS=1
NLS=80
HS=12288
NAH=96
DDP=local
NNODES=16
# Name of the job.
export JOB_NAME=results_figure_17_activation_recomputation_${ACTIVATION_RECOMPUTATION}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Scatter-gather communication optimization options = [YES, NO].
SCATTER_GATHER=YES
# Batch size (global batch size) options = [12, 24, 36, ..., 60].
GBS=12
# Set scatter-gather communication optimization options.
if [ ${SCATTER_GATHER} == "YES" ]; then
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 "
elif [ ${SCATTER_GATHER} == "NO" ]; then
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 --no-scatter-gather-tensors-in-pipeline "
else
echo "Invalid configuration"
exit 1
fi
# Other params.
TP=8
PP=12
MBS=1
NLS=96
HS=12288
NAH=96
DDP=local
NNODES=12
# Name of the job.
export JOB_NAME=results_figure_18_scatter_gather_${SCATTER_GATHER}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# model size options = [1.7B, 3.6B, 7.5B, 18B, 39B, 76B, 145B, 310B, 530B, 1T]
MODEL_SIZE=1.7B
if [ ${MODEL_SIZE} == "1.7B" ]; then
TP=1
PP=1
MBS=16
GBS=512
NLS=24
HS=2304
NAH=24
DDP=torch
NNODES=4
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ ${MODEL_SIZE} == "3.6B" ]; then
TP=2
PP=1
MBS=16
GBS=512
NLS=30
HS=3072
NAH=32
DDP=torch
NNODES=8
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ ${MODEL_SIZE} == "7.5B" ]; then
TP=4
PP=1
MBS=16
GBS=512
NLS=36
HS=4096
NAH=32
DDP=torch
NNODES=16
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ ${MODEL_SIZE} == "18B" ]; then
TP=8
PP=1
MBS=8
GBS=1024
NLS=40
HS=6144
NAH=48
DDP=torch
NNODES=32
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ ${MODEL_SIZE} == "39B" ]; then
TP=8
PP=2
MBS=4
GBS=1536
NLS=48
HS=8192
NAH=64
DDP=local
NNODES=64
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ ${MODEL_SIZE} == "76B" ]; then
TP=8
PP=4
MBS=2
GBS=1792
NLS=60
HS=10240
NAH=80
DDP=local
NNODES=128
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 5"
elif [ ${MODEL_SIZE} == "145B" ]; then
TP=8
PP=8
MBS=2
GBS=2304
NLS=80
HS=12288
NAH=96
DDP=local
NNODES=192
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 5 "
elif [ ${MODEL_SIZE} == "310B" ]; then
TP=8
PP=16
MBS=1
GBS=2160
NLS=96
HS=16384
NAH=128
DDP=local
NNODES=240
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 3 "
elif [ ${MODEL_SIZE} == "530B" ]; then
TP=8
PP=35
MBS=1
GBS=2520
NLS=105
HS=20480
NAH=128
DDP=local
NNODES=315
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 1 "
elif [ ${MODEL_SIZE} == "1T" ]; then
TP=8
PP=64
MBS=1
GBS=3072
NLS=128
HS=25600
NAH=160
DDP=local
NNODES=384
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
else
echo "Invalid configuration"
exit 1
fi
# Name of the job
export JOB_NAME=results_table_1_model_size_${MODEL_SIZE}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
# T5 MODEL
## Table of contents
- [1. Training Setup](#1-training-setup)
- [2. Configurations](#2-configurations)
- [3. Training Results](#3-training-results)
## 1. Training setup
<a id="markdown-training-setup" name="training-setup"></a>
To run the model on a Slurm-based cluster:
```
PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:23.09-py3
ACCOUNT_NAME=""
PARTITION=""
JOB_NAME=""
NUM_NODES=1
CHECKPOINT_PATH="" #<Specify path to checkpoint>
TENSORBOARD_LOGS_PATH="" #<Specify path to tensorboard log>
VOCAB_FILE="" #<Specify path to file>/bert-large-cased-vocab.txt
DATA_PATH="" #<Specify path and file prefix>_text_document
srun -N $NUM_NODES --container-image $PYTORCH_IMAGE --container-mounts "/path/to/data:/path/to/data,/path/to/megatron-lm:/workspace/megatron-lm" --account $ACCOUNT_NAME -J $JOB_NAME -p $PARTITION --no-container-mount-home bash -c "
cd /workspace/megatron-lm
./examples/t5/train_t5_220m_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH $VOCAB_FILE $DATA_PATH"
```
## 2. Configurations
<a id="markdown-configurations" name="configurations"></a>
The architecture arguments below show the configuration for the T5 220M model.
### 220M
```
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--kv-channels 64 \
--ffn-hidden-size 3072 \
--encoder-seq-length 512 \
--decoder-seq-length 128 \
--max-position-embeddings 512 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
```
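As a quick sanity check on the "220M" name, the parameter count implied by these arguments can be estimated from the layer counts, hidden size, and FFN size. The sketch below is a rough estimate only: the ~29k vocabulary (bert-large-cased plus the 100 sentinel ids added by `--vocab-extra-ids 100` in the training script) is an assumption, and biases, layernorms, and relative-position parameters are ignored.
```python
# Rough parameter estimate for the T5 220M configuration above (assumption-laden sketch).
hidden, ffn, enc_layers, dec_layers = 768, 3072, 12, 12
vocab = 28996 + 100          # assumed: bert-large-cased vocab + --vocab-extra-ids 100

attn = 4 * hidden * hidden                 # Q, K, V and output projections
mlp = 2 * hidden * ffn                     # up- and down-projections
encoder = enc_layers * (attn + mlp)        # self-attention + MLP per encoder layer
decoder = dec_layers * (2 * attn + mlp)    # self-attention + cross-attention + MLP
embeddings = vocab * hidden

total = encoder + decoder + embeddings
print(f"~{total / 1e6:.0f}M parameters")   # prints roughly ~221M
```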
## 3. Training Results
<a id="markdown-training-results" name="training-results"></a>
Below is the training curve for the 220M model on the Pile dataset. Training takes 4 days on 32 GPUs with a batch size of 2048.
After finetuning on the SQuAD dataset, the validation result is 63.44%.
<p align="center">
<img src="./t5_mcore_train_curve.png" width="800" height="400">
</p>
#!/bin/bash
# Runs the "220M" parameter model
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NUM_NODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))
CHECKPOINT_PATH=$1 #<Specify path>
TENSORBOARD_DIR=$2 #<Specify path>
VOCAB_FILE=$3 #<Specify path to file>/bert-large-cased-vocab.txt
DATA_PATH=$4 #<Specify path and file prefix>_text_document
DISTRIBUTED_ARGS="
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NUM_NODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
T5_ARGS="
--encoder-num-layers 12 \
--decoder-num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--kv-channels 64 \
--ffn-hidden-size 3072 \
--encoder-seq-length 512 \
--decoder-seq-length 128 \
--max-position-embeddings 512 \
--micro-batch-size 64 \
--global-batch-size 512 \
--lr 0.0001 \
--train-iters 1000000 \
--lr-decay-iters 1000000 \
--lr-decay-style linear \
--min-lr 0.00001 \
--weight-decay 1e-2 \
--lr-warmup-fraction .01 \
--clip-grad 1.0 \
--bf16 \
--vocab-extra-ids 100 \
--init-method-std 0.015 \
--transformer-impl transformer_engine \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--use-mcore-models \
"
DATA_ARGS="
--data-path $DATA_PATH \
--vocab-file $VOCAB_FILE \
--tokenizer-type BertWordPieceCase \
--split 99982,9,9 \
"
OUTPUT_ARGS="
--log-interval 100 \
--tensorboard-dir ${TENSORBOARD_DIR} \
--save-interval 500 \
--eval-interval 1000 \
--eval-iters 10
"
torchrun $DISTRIBUTED_ARGS pretrain_t5.py \
    $T5_ARGS \
    $DATA_ARGS \
    $OUTPUT_ARGS \
    --distributed-backend nccl \
    --save $CHECKPOINT_PATH \
    --load $CHECKPOINT_PATH
## Quick Start
The following guide will show you how to quickly get started with Megatron Core. It will show you the following:
* We will initialize Megatron Core on 2 GPUs.
* We will build a GPT model with tensor model parallel size 2 and pipeline parallel size 1.
* We will train it for a few iterations using Megatron Core schedules.
* We will save the model using the distributed checkpointing format.
* We will load the model saved above.

*NOTE: The following has been tested for Megatron Core version 0.5 and NGC PyTorch Container version 24.02.*
### Environment Setup
```
docker run --ipc=host --shm-size=512m --gpus all -it nvcr.io/nvidia/pytorch:24.02-py3
pip install megatron_core
pip install tensorstore==0.1.45
pip install zarr
```
<br>
### Writing Your First Training Loop
The following steps will walk you through how you can create a sample GPT model split across tensors (tensor model parallelism) on 2 GPUs, and run a forward pass through it using a MockGPT dataset helper class that we created in Megatron Core.
<br>
**NOTE: All of the following steps need to be put into a script, which is then run as explained in the last step.**
<br>
**STEP 1 - Initialize Distributed Training and Model parallel setup**
The following utility, when called, initializes your distributed setup.
```python
import os
import torch
from megatron.core import parallel_state
def initialize_distributed(tensor_model_parallel_size = 1, pipeline_model_parallel_size = 1):
    # Torch setup for distributed training
    rank = int(os.environ['LOCAL_RANK'])
    world_size = torch.cuda.device_count()
    torch.cuda.set_device(rank)
    torch.distributed.init_process_group(world_size=world_size, rank=rank)

    # Megatron core distributed training initialization
    parallel_state.initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size)
```
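Once this has run on every rank, the `parallel_state` helpers can be used to confirm the layout each rank ended up with. The optional check below is purely illustrative and only uses query functions from `megatron.core.parallel_state`:
```python
def report_parallel_layout():
    # Optional sanity check: print how this rank fits into the model-parallel grid.
    tp_rank = parallel_state.get_tensor_model_parallel_rank()
    tp_size = parallel_state.get_tensor_model_parallel_world_size()
    pp_rank = parallel_state.get_pipeline_model_parallel_rank()
    dp_size = parallel_state.get_data_parallel_world_size()
    print(f"global rank {torch.distributed.get_rank()}: "
          f"tp rank {tp_rank}/{tp_size}, pp stage {pp_rank}, dp size {dp_size}")
```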
<br>
**STEP 2 - GPT Model Setup**
The following step shows you how you can quickly create a GPT model. For a list of other configs that you can pass into the model, see [transformer_config.py](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core/transformer/transformer_config.py).
```python
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.models.gpt.gpt_model import GPTModel
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec
def model_provider():
    """Build the model."""

    transformer_config = TransformerConfig(
        num_layers=2,
        hidden_size=12,
        num_attention_heads=4,
        use_cpu_initialization=True,
        pipeline_dtype=torch.float32)

    gpt_model = GPTModel(
        config=transformer_config,
        transformer_layer_spec=get_gpt_layer_local_spec(),
        vocab_size=100,
        max_sequence_length=64)

    return gpt_model
```
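With `hidden_size=12`, `num_layers=2`, and a 100-token vocabulary this is deliberately a toy model. If you want to confirm what was built before wiring up data and schedules, a small, purely illustrative check is:
```python
gpt_model = model_provider()
num_params = sum(p.numel() for p in gpt_model.parameters())
# Each tensor-parallel rank holds its own shard, so this is the per-rank count.
print(f"parameters on this rank: {num_params}")
```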
<br>
**STEP 3 - GPT Mock dataset setup**
The following shows you how you can quickly get started with a mock dataset utility we created. In order to train with your own data, please use the actual GPTDataset class in [gpt_dataset.py](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core/datasets/gpt_dataset.py).
To find more information about the Megatron Core data pipeline, please refer to [this readme](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core/datasets/readme.md).
```python
from torch.utils.data import DataLoader
from megatron.core.datasets.utils import Split
from megatron.core.datasets.gpt_dataset import GPTDatasetConfig, MockGPTDataset
def get_train_data_iterator():
    config = GPTDatasetConfig(
        random_seed=0,
        sequence_length=64,
        blend=[],
        mock=True,
        reset_position_ids=False,
        reset_attention_mask=False,
        eod_mask_loss=False,
        tokenizer="dummy")

    training_data = MockGPTDataset(Split.train, config)

    train_dataloader = DataLoader(training_data, batch_size=8, shuffle=True)

    train_iterator = iter(train_dataloader)
    return train_iterator
```
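Each element yielded by this iterator is a dictionary of batched tensors whose keys (`tokens`, `labels`, `loss_mask`, `attention_mask`, `position_ids`) are exactly what the forward step in the next section consumes. A quick, illustrative way to inspect one batch:
```python
train_iterator = get_train_data_iterator()
batch = next(train_iterator)
for key, value in batch.items():
    # Print each field and its shape; guard in case a field is not a tensor.
    print(key, tuple(value.shape) if hasattr(value, "shape") else type(value))
```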
<br>
**STEP 4 - Forward Step Function**
In Megatron Core, we use [schedules.py](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core/pipeline_parallel/schedules.py) to run the model, so it is sufficient to define a forward step function that takes the data iterator and the model as input and produces the output tensor and a loss function.
```python
from functools import partial
def forward_step_func(data_iterator, model):

    def loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor):
        losses = output_tensor.float()
        loss_mask = loss_mask.view(-1).float()
        loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()
        # If you have data parallel reduce loss across data parallel groups.
        # If pipeline parallel, loss computation is done only in last stage.
        return loss, {'lm loss': loss}

    data = next(data_iterator)
    tokens = data['tokens'].to(device)
    attention_mask = data['attention_mask'].to(device)
    position_ids = data['position_ids'].to(device)
    labels = data['labels'].to(device)
    loss_mask = data['loss_mask'].to(device)

    output_tensor = model(tokens, position_ids, attention_mask,
                          labels=labels)

    return output_tensor, partial(loss_func, loss_mask)
```
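The comments inside `loss_func` note that with data parallelism the loss should also be reduced across the data-parallel group. This example runs with a single data-parallel replica, so nothing extra is needed, but a minimal sketch of such a reduction (the helper name is ours, not a Megatron Core API) could look like:
```python
def average_loss_across_data_parallel_group(loss: torch.Tensor) -> torch.Tensor:
    # Average a scalar loss over all data-parallel replicas.
    averaged = loss.clone().detach()
    torch.distributed.all_reduce(averaged, group=parallel_state.get_data_parallel_group())
    averaged /= parallel_state.get_data_parallel_world_size()
    return averaged
```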
<br>
**STEP 5 - Load and Save Distributed Checkpoint**
Megatron Core uses distributed checkpointing for loading and saving models. This gives you the flexibility to convert a model from one model-parallel setting to another when you load it (e.g. a model trained with tensor model parallel size 2 can later be loaded with tensor model parallel size 4).
*NOTE: Make sure you have the zarr and tensorstore pip packages installed, as shown in the environment setup.*
```python
from megatron.core import dist_checkpointing
def save_distributed_checkpoint(checkpoint_path, gpt_model):
    sharded_state_dict = gpt_model.sharded_state_dict(prefix='')
    dist_checkpointing.save(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path)

def load_distributed_checkpoint(checkpoint_path, gpt_model):
    sharded_state_dict = gpt_model.sharded_state_dict(prefix='')
    checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path)
    gpt_model.load_state_dict(checkpoint)
    return gpt_model
```
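Because the checkpoint is keyed by the model's sharded state rather than by the rank layout it was saved from, the same files can later be loaded into a job with a different parallel configuration. A sketch of such a follow-up run, reusing the helpers defined in this guide (the 4-GPU layout and path are placeholders):
```python
# Hypothetical follow-up job launched on 4 GPUs: same architecture, new parallel layout.
initialize_distributed(tensor_model_parallel_size=4, pipeline_model_parallel_size=1)
model_parallel_cuda_manual_seed(123)
gpt_model = model_provider()           # freshly initialized weights...
gpt_model = load_distributed_checkpoint('/workspace/ckpt', gpt_model)  # ...then overwritten
```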
<br>
**STEP 6 - Main Function**
The following is the main function that needs to go into your script.
```python
from pathlib import Path
from torch.optim import Adam
from megatron.core.pipeline_parallel.schedules import get_forward_backward_func
from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
if __name__ == "__main__":
    initialize_distributed(tensor_model_parallel_size=2, pipeline_model_parallel_size=1)
    model_parallel_cuda_manual_seed(123)

    gpt_model = model_provider()
    device = torch.device("cuda")
    gpt_model.to(device)

    optim = Adam(gpt_model.parameters())

    train_iterator = get_train_data_iterator()

    forward_backward_func = get_forward_backward_func()

    # Running the model for 5 iterations
    for _ in range(5):
        optim.zero_grad()

        losses_reduced = forward_backward_func(
            forward_step_func=forward_step_func,
            data_iterator=train_iterator,
            model=gpt_model,
            num_microbatches=1,
            seq_length=64,
            micro_batch_size=8,
            decoder_seq_length=64,
            forward_only=False)

        optim.step()

        print(f'Losses reduced : {losses_reduced}')

    # Saving the model
    save_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path='/workspace/ckpt')

    # Loading the model
    gpt_model = load_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path='/workspace/ckpt')
    gpt_model.to(device)
    print('Successfully loaded the model')
```
<br>
**STEP 7 - Running the full example**
All of the above steps are put together in the [run_simple_mcore_train_loop.py](https://github.com/NVIDIA/Megatron-LM/tree/main/examples/run_simple_mcore_train_loop.py) script in the examples folder of Megatron-LM. You can run it as follows:
```
git clone https://github.com/NVIDIA/Megatron-LM.git
cd Megatron-LM/examples
NUM_GPUS=2
torchrun --nproc-per-node $NUM_GPUS run_simple_mcore_train_loop.py
```
<br>
### Extending Further
The above example introduced you to a basic training loop in MCore. To see more advanced examples, please look at [pretrain_gpt.py](https://github.com/NVIDIA/Megatron-LM/tree/main/pretrain_gpt.py), which shows how to write more complex training loops involving pipeline parallelism, context parallelism, RoPE embeddings, mixture of experts, and the other functionality present in MCore.
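As a taste of what some of those knobs look like at the Megatron Core API level, the toy `model_provider` above could, for example, request rotary position embeddings and a two-stage pipeline. Treat the snippet below as an unverified sketch of the relevant arguments (names as of roughly the 0.5 release), not a tuned or complete configuration:
```python
from megatron.core import parallel_state
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.models.gpt.gpt_model import GPTModel
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec

def advanced_model_provider():
    config = TransformerConfig(
        num_layers=4,
        hidden_size=64,
        num_attention_heads=4,
        pipeline_model_parallel_size=2,   # split the 4 layers over two pipeline stages
        pipeline_dtype=torch.float32,
        use_cpu_initialization=True)
    return GPTModel(
        config=config,
        transformer_layer_spec=get_gpt_layer_local_spec(),
        vocab_size=100,
        max_sequence_length=64,
        pre_process=parallel_state.is_pipeline_first_stage(),
        post_process=parallel_state.is_pipeline_last_stage(),
        position_embedding_type='rope')   # rotary embeddings instead of learned absolute
```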
Megatron Core is a library for efficient and scalable training of transformer based models.
## StragglerDetector
The file `megatron/core/utils.py` has a class named `StragglerDetector` which supports Python contexts.
This class collects timing events for various steps of a given iteration. It keeps collecting
such timing events on a per-rank basis, and when the reporter is invoked during a logging
interval, it computes the min and max of certain metrics across all ranks and logs the
observed metric and the rank as follows:
```
0: INFO:megatron.core.utils:[2024-03-14 23:07:56] | MnRtt/Rnk: 3453.08ms/8 | MxRtt/Rnk: 3468.20ms/0 | MnPwr/Rnk: 601796W/8 | MxPwr/Rnk: 683801W/18 | MnTmp/Rnk: 52C/0 | MxTmp/Rnk: 65C/21 | MnUtl/Rnk: 97%/8 | MxUtl/Rnk: 100%/6 | MnClk/Rnk: 1950MHz/28 | MxClk/Rnk: 1980MHz/0 | MnDRtt/Rnk: 14.27us/23 | MxDRtt/Rnk: 34.65us/3 | MnEtpt/Rnk: 296.02TF/0 | MxEtpt/Rnk: 297.32TF/8
```
<hr>
### Description of the metrics
Each metric is prefixed with `Mn` or `Mx` to represent `Minimum` or `Maximum`. Each metric is also suffixed with the rank where the metric was measured. The metrics are averaged over the logging interval. Between the prefix and the rank is the name of the metric as follows
- Rtt : RoundTrip Time (time spent in all the traced ops per iteration)
- Pwr : GPU Power
- Tmp : GPU Temperature
- Utl : GPU Utilization
- Clk : GPU Clock
- DRtt: get_batch latency
- Etpt: Estimated throughput. This is derived from the computed flops divided by Rtt. Since we do not collect timing for the backward pass, the value is further divided by three to arrive at the estimated throughput.
<hr>
### Command Line activation
To start using the StragglerDetector, pass the `--log-straggler` argument (collection is disabled by default). It also accepts the following optional arguments:
- `--disable-straggler-on-startup` - keep the StragglerDetector disabled on startup and enable it later via the control port. Default: enabled on startup.
- `--straggler-ctrlr-port` - The StragglerDetector can be toggled on/off just by sending `curl Rank0Host:port`. Default port is 65535.
- `--straggler-minmax-count` - If set to N > 1, prints the N top and bottom Etpt/rank pairs as shown below:
```
0: INFO:megatron.core.utils:^^^^ Bottom 4 Ranks with lowest Etpt(TF): 296.02/0, 296.17/2, 296.23/1, 296.23/4,
0: INFO:megatron.core.utils:^^^^ Top 4 Ranks with highest Etpt(TF): 297.28/15, 297.28/11, 297.32/12, 297.32/8,
```
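Putting those flags together, a launch that enables the detector but keeps it idle until toggled might look like the sketch below; the training script and its other arguments are placeholders, and the final line shows the runtime toggle described above:
```
torchrun --nproc-per-node 8 pretrain_gpt.py \
    ... \
    --log-straggler \
    --disable-straggler-on-startup \
    --straggler-ctrlr-port 65535 \
    --straggler-minmax-count 4

# Toggle collection on/off at runtime from any host that can reach rank 0.
curl Rank0Host:65535
```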
<hr>
### Programming the StragglerDetector
The StragglerDetector class supports the Python context-manager protocol, and its implementation is a singleton.
- Initialization
```
# initialization, where StragglerDetector will be used
from megatron.core.utils import StragglerDetector
stimer = StragglerDetector()
```
- One time for each rank
```
# one time before the training loop starts
stimer.configure(world, rank, enabled=True, port=65535)
# Arguments to configure
# world : World Size
# rank : The rank of this trainer
# mmcnt : (Optional) Number of ranks to print for showing Min/Max Etpt
# amp : (Optional) Set to 3.0 if we only use timers in fwd pass
# port : (Optional) control port, useful only for rank-0
# prefill : (Optional) how many events to pre-populate
# enabled : (Optional) whether or not collection is enabled on startup
```
- To Capture time
```
# wherever timing needs to be captured
with stimer:
    do_operation()

# special case for get_batch
with stimer(bdata=True):
    input, ... = get_batch(iterator, ...)
```
- Logging in main training loop
```
# logging
total_flops = 0.0
iteration = 0
# inside the main training loop
while training:
    iteration += 1
    do_step()
    total_flops += get_computed_flops()
    if iteration % log_interval == 0:
        stimer.report(total_flops, log_interval)
        total_flops = 0.0
```
import megatron.core.tensor_parallel
import megatron.core.utils
from megatron.core import parallel_state
from megatron.core.distributed import DistributedDataParallel
from megatron.core.inference_params import InferenceParams
from megatron.core.model_parallel_config import ModelParallelConfig
from megatron.core.timers import Timers
# Alias parallel_state as mpu, its legacy name
mpu = parallel_state
__all__ = [
"parallel_state",
"tensor_parallel",
"utils",
"DistributedDataParallel",
"InferenceParams",
"ModelParallelConfig",
"Timers",
]
CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color
CPPFLAGS += $(shell python3 -m pybind11 --includes)
LIBNAME = helpers
LIBEXT = $(shell python3-config --extension-suffix)
default: $(LIBNAME)$(LIBEXT)
%$(LIBEXT): %.cpp
	$(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from dataclasses import dataclass
from typing import Dict, List, Optional, Union
import numpy
from megatron.core.datasets.indexed_dataset import IndexedDataset
from megatron.core.datasets.masked_dataset import (
MaskedWordPieceDataset,
MaskedWordPieceDatasetConfig,
)
from megatron.core.datasets.utils import Split
@dataclass
class BERTMaskedWordPieceDatasetConfig(MaskedWordPieceDatasetConfig):
    """Configuration object for Megatron Core BERT WordPiece datasets"""

    classification_head: bool = None
    """Option to perform the next sequence prediction during sampling"""

    def __post_init__(self) -> None:
        """Do asserts and set fields post init"""
        super().__post_init__()

        assert self.classification_head is not None


class BERTMaskedWordPieceDataset(MaskedWordPieceDataset):
    """The BERT dataset that assumes WordPiece tokenization

    Args:
        indexed_dataset (IndexedDataset): The IndexedDataset around which to build the MegatronDataset

        dataset_path (str): The real path on disk to the dataset, for bookkeeping

        indexed_indices (numpy.ndarray): The set of the documents indices to expose

        num_samples (int): The number of samples to draw from the indexed dataset

        index_split (Split): The indexed_indices Split

        config (BERTMaskedWordPieceDatasetConfig): The config
    """

    def __init__(
        self,
        indexed_dataset: IndexedDataset,
        dataset_path: str,
        indexed_indices: numpy.ndarray,
        num_samples: int,
        index_split: Split,
        config: BERTMaskedWordPieceDatasetConfig,
    ) -> None:
        super().__init__(
            indexed_dataset, dataset_path, indexed_indices, num_samples, index_split, config
        )

    def _finalize(self) -> None:
        """Abstract method implementation"""
        self.token_lookup = list(self.config.tokenizer.inv_vocab.keys())
        # Account for the single <cls> and two <sep> token ids
        self.sample_index = self._build_sample_index(
            self.config.sequence_length - 3, 2 if self.config.classification_head else 1
        )

    @staticmethod
    def _key_config_attributes() -> List[str]:
        """Inherited method implementation

        Returns:
            List[str]: The key config attributes
        """
        return super(
            BERTMaskedWordPieceDataset, BERTMaskedWordPieceDataset
        )._key_config_attributes() + ["classification_head",]

    def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]:
        """Abstract method implementation

        Args:
            idx (int): The index into the dataset

        Returns:
            Dict[str, Union[int, numpy.ndarray]]: The
        """
        idx_beg, idx_end, target_sequence_length = self.sample_index[idx]
        sample = [self.dataset[i] for i in range(idx_beg, idx_end)]
        numpy_random_state = numpy.random.RandomState(
            seed=(self.config.random_seed + idx) % 2 ** 32
        )

        assert target_sequence_length <= self.config.sequence_length

        # Split the sample into contiguous subsegments A and B
        pivot = len(sample)
        is_next_random = False
        if self.config.classification_head:
            assert len(sample) > 1, "the sample must contain at least two sentences"
            pivot = 1
            if len(sample) >= 3:
                pivot = numpy_random_state.randint(low=1, high=len(sample))
            is_next_random = numpy_random_state.random() < 0.5
        split_A = []
        for sample_a in sample[:pivot]:
            split_A.extend(sample_a)
        split_B = []
        for sample_b in sample[pivot:]:
            split_B.extend(sample_b)
        if is_next_random:
            split_A, split_B = split_B, split_A

        # Trim the subsegments from either end to a desired joint length
        length_A = len(split_A)
        length_B = len(split_B)
        if length_A + length_B <= target_sequence_length:
            truncated = False
        else:
            while length_A + length_B > target_sequence_length:
                split = split_A if length_A > length_B else split_B
                if numpy_random_state.random() < 0.5:
                    del split[0]
                else:
                    del split[-1]
                length_A = len(split_A)
                length_B = len(split_B)
            truncated = True

        # Merge the subsegments and create the token assignment labels
        tokens = [
            self.config.tokenizer.cls,
            *split_A,
            self.config.tokenizer.sep,
        ]
        assignments = [0 for _ in range(1 + len(split_A) + 1)]
        if split_B:
            tokens += [*split_B, self.config.tokenizer.sep]
            assignments += [1 for _ in range(len(split_B) + 1)]

        # Masking
        tokens, masked_positions, masked_labels, _, _ = self._create_masked_lm_predictions(
            tokens, target_sequence_length, numpy_random_state
        )

        # Pad the sequences and convert to NumPy
        length_toks = len(tokens)
        length_pads = self.config.sequence_length - length_toks
        assert length_pads >= 0

        tokens = numpy.array(tokens, dtype=numpy.int64)
        tokens = numpy.pad(tokens, (0, length_pads), constant_values=self.config.tokenizer.pad)

        assignments = numpy.array(assignments, dtype=numpy.int64)
        assignments = numpy.pad(
            assignments, (0, length_pads), constant_values=self.config.tokenizer.pad
        )

        # Get the padding mask
        mask_pads = numpy.ones(length_toks, dtype=numpy.int64)
        mask_pads = numpy.pad(
            mask_pads, (0, length_pads), constant_values=self.config.tokenizer.pad
        )

        # Mask the labels
        labels = numpy.zeros(self.config.sequence_length, dtype=numpy.int64) - 1
        labels[masked_positions] = masked_labels

        # Get the loss mask
        mask_loss = numpy.zeros(self.config.sequence_length, dtype=numpy.int64)
        mask_loss[masked_positions] = 1

        return {
            "text": tokens,
            "types": assignments,
            "labels": labels,
            "is_random": int(is_next_random),
            "padding_mask": mask_pads,
            "loss_mask": mask_loss,
            "truncated": int(truncated),
        }

    def _get_token_mask(self, numpy_random_state: numpy.random.RandomState) -> Optional[int]:
        """Abstract method implementation

        80% of the time, replace the token id with mask token id. 10% of the time, replace token id
        with a random token id from the vocabulary. 10% of the time, do nothing.

        Args:
            numpy_random_state (RandomState): The NumPy random state

        Returns:
            Optional[int]: The replacement token id or None
        """
        if numpy_random_state.random() < 0.8:
            return self.config.tokenizer.mask
        else:
            if numpy_random_state.random() >= 0.5:
                return self.token_lookup[numpy_random_state.randint(0, len(self.token_lookup))]
        return None