Commit a8f4edcb authored by Jared Casper

Merge branch 'github_small_prs' into 'main'

some small PRs from github

See merge request ADLR/megatron-lm!305
parents 6ee3d210 f69f699f
@@ -127,7 +127,7 @@ Further command line arguments are described in the source file [`preprocess_dat
 ## BERT Pretraining
-The `examples/pretrain_bert.sh` script runs single GPU 345M parameter BERT pretraining. Debugging is the primary use for single GPU training, as the code base and command line arguments are optimized for highly distributed training. Most of the arguments are fairly self-explanatory. By default, the learning rate decays linearly over the training iterations starting at `--lr` to a minimum set by `--min-lr` over `--lr-decay-iters` iterations. The fraction of training iterations used for warmup is set by `--lr-warmup-fraction`. While this is single GPU training, the batch size specified by `--micro-batch-size` is a single forward-backward path batch-size and the code will perform gradient accumulation steps until it reaches `global-batch-size` whcih is the batch size per iteration. The data is partitioned into a 949:50:1 ratio for training/validation/test sets (default is 969:30:1). This partitioning happens on the fly, but is consistent across runs with the same random seed (1234 by default, or specified manually with `--seed`). We use `train-iters` as the training iterations requested. Alternatively, one can provide `--train-samples` which is total number of samples to train on. If this option is present, then instead of providing `--lr-decay-iters`, one will need to provide `--lr-decay-samples`.
+The `examples/pretrain_bert.sh` script runs single GPU 345M parameter BERT pretraining. Debugging is the primary use for single GPU training, as the code base and command line arguments are optimized for highly distributed training. Most of the arguments are fairly self-explanatory. By default, the learning rate decays linearly over the training iterations starting at `--lr` to a minimum set by `--min-lr` over `--lr-decay-iters` iterations. The fraction of training iterations used for warmup is set by `--lr-warmup-fraction`. While this is single GPU training, the batch size specified by `--micro-batch-size` is a single forward-backward path batch-size and the code will perform gradient accumulation steps until it reaches `global-batch-size` which is the batch size per iteration. The data is partitioned into a 949:50:1 ratio for training/validation/test sets (default is 969:30:1). This partitioning happens on the fly, but is consistent across runs with the same random seed (1234 by default, or specified manually with `--seed`). We use `train-iters` as the training iterations requested. Alternatively, one can provide `--train-samples` which is total number of samples to train on. If this option is present, then instead of providing `--lr-decay-iters`, one will need to provide `--lr-decay-samples`.
 The logging, checkpoint-saving, and evaluation intervals are specified. Checkpointing the activations facilitates the training of larger models and/or batches. Note that the `--data-path` now includes the additional `_text_sentence` suffix added in preprocessing, but does not include the file extensions.
...
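The relationship between `--micro-batch-size` and `--global-batch-size` described above can be made concrete with a small helper. This is an illustrative sketch, not code from the repository; the function name and the single data-parallel rank are assumptions.

```python
# Illustrative only (not part of Megatron-LM): number of gradient
# accumulation steps implied by the batch-size arguments above.
def gradient_accumulation_steps(global_batch_size, micro_batch_size,
                                data_parallel_size=1):
    per_step = micro_batch_size * data_parallel_size
    assert global_batch_size % per_step == 0, \
        "global batch size must be divisible by micro batch size * data parallel size"
    return global_batch_size // per_step


# With the single-GPU settings in the script below (--micro-batch-size 2,
# --global-batch-size 16), each iteration accumulates over 8 micro-batches.
assert gradient_accumulation_steps(16, 2, 1) == 8
```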
@@ -23,6 +23,7 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
        --num-attention-heads 16 \
        --micro-batch-size 2 \
        --global-batch-size 16 \
+       --seq-length 512 \
        --max-position-embeddings 512 \
        --train-iters 1000000 \
        --save $CHECKPOINT_PATH \
...
@@ -53,8 +53,7 @@ class ParallelMLP(MegatronModule):
     MLP will take the input with h hidden state, project it to 4*h
     hidden dimension, perform nonlinear transformation, and project the
-    state back into h hidden dimension. At the end, dropout is also
-    applied.
+    state back into h hidden dimension.
     """
     def __init__(self, init_method, output_layer_init_method):
@@ -84,7 +83,6 @@ class ParallelMLP(MegatronModule):
             init_method=output_layer_init_method,
             skip_bias_add=True)
     def forward(self, hidden_states):
         # [s, b, 4hp]
...
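For reference, the pattern the corrected docstring describes (project h to 4h, apply a nonlinearity, project back to h) looks roughly like the sketch below. It is a plain PyTorch stand-in under assumed shapes, not the parallel implementation from `ParallelMLP`.

```python
import torch
import torch.nn.functional as F


class SimpleMLP(torch.nn.Module):
    """Sketch of the h -> 4h -> h pattern; the real ParallelMLP uses
    column-/row-parallel linear layers and fused bias kernels."""

    def __init__(self, hidden_size):
        super().__init__()
        self.dense_h_to_4h = torch.nn.Linear(hidden_size, 4 * hidden_size)
        self.dense_4h_to_h = torch.nn.Linear(4 * hidden_size, hidden_size)

    def forward(self, hidden_states):
        # [s, b, h] -> [s, b, 4h], nonlinear transformation
        intermediate = F.gelu(self.dense_h_to_4h(hidden_states))
        # [s, b, 4h] -> [s, b, h]; no dropout here, matching the updated docstring
        return self.dense_4h_to_h(intermediate)
```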
@@ -356,9 +356,13 @@ def get_data_parallel_rank():
 def destroy_model_parallel():
     """Set the groups to none."""
+    global _MODEL_PARALLEL_GROUP
+    _MODEL_PARALLEL_GROUP = None
     global _TENSOR_MODEL_PARALLEL_GROUP
     _TENSOR_MODEL_PARALLEL_GROUP = None
     global _PIPELINE_MODEL_PARALLEL_GROUP
     _PIPELINE_MODEL_PARALLEL_GROUP = None
     global _DATA_PARALLEL_GROUP
     _DATA_PARALLEL_GROUP = None
+    global _EMBEDDING_GROUP
+    _EMBEDDING_GROUP = None
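The added lines ensure every cached group, including `_MODEL_PARALLEL_GROUP` and `_EMBEDDING_GROUP`, is cleared. A hedged sketch of the re-initialization pattern this matters for; the scaffolding below is assumed (e.g. for unit tests) and is not code from the repository.

```python
# Assumed test-style scaffolding; only destroy_model_parallel(),
# model_parallel_is_initialized() and initialize_model_parallel()
# are taken from megatron.mpu.
from megatron import mpu


def reinitialize_model_parallel(tensor_model_parallel_size):
    # Without the added resets, a stale _MODEL_PARALLEL_GROUP or
    # _EMBEDDING_GROUP could survive into the next initialization.
    if mpu.model_parallel_is_initialized():
        mpu.destroy_model_parallel()
    mpu.initialize_model_parallel(tensor_model_parallel_size)
```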
@@ -256,7 +256,7 @@ class ColumnParallelLinear(torch.nn.Module):
                 device=torch.cuda.current_device(), dtype=args.params_dtype))
             _initialize_affine_weight_gpu(self.weight, init_method,
                                           partition_dim=0, stride=stride)
         if bias:
             if args.use_cpu_initialization:
                 self.bias = Parameter(torch.empty(
@@ -286,7 +286,7 @@ class ColumnParallelLinear(torch.nn.Module):
             # All-gather across the partitions.
             output = gather_from_tensor_model_parallel_region(output_parallel)
         else:
             output = output_parallel
         output_bias = self.bias if self.skip_bias_add else None
         return output, output_bias
@@ -316,8 +316,8 @@ class RowParallelLinear(torch.nn.Module):
     keep_master_weight_for_test: This was added for testing and should be
                                  set to False. It returns the master weights
                                  used for initialization.
-    skip_bias_add: This was added to enable performance optimations where bias
-                   can be fused with other elementwise operations. we skip
+    skip_bias_add: This was added to enable performance optimization where bias
+                   can be fused with other elementwise operations. We skip
                    adding bias but instead return it.
     """
...
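The `skip_bias_add` docstring above is terse; the sketch below shows the kind of fusion it enables, assuming a caller that folds the returned bias into a later elementwise pass (Megatron uses fused bias-dropout-add / bias-GeLU kernels for this; the function here is a plain-PyTorch illustration, not the library's API).

```python
import torch.nn.functional as F


def bias_dropout_add_sketch(x, bias, residual, prob, training):
    """Illustrative fused elementwise step: add the bias returned by a
    skip_bias_add=True layer, apply dropout, then add the residual."""
    return residual + F.dropout(x + bias, p=prob, training=training)


# Assumed call pattern around a RowParallelLinear-style layer:
#   output, bias = layer(hidden_states)          # bias is returned, not added
#   hidden_states = bias_dropout_add_sketch(output, bias, residual, 0.1, True)
```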
@@ -20,7 +20,7 @@ from .utils import split_tensor_along_last_dim
 def _reduce(input_):
-    """All-reduce the the input tensor across model parallel group."""
+    """All-reduce the input tensor across model parallel group."""
     # Bypass the function if we are using only 1 GPU.
     if get_tensor_model_parallel_world_size()==1:
...
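The body of `_reduce` is cut off above; the usual shape of such a wrapper, sketched under the assumption that `torch.distributed` is initialized and that the group accessors behave as their names suggest, is:

```python
import torch


def _reduce_sketch(input_):
    """Sketch of an all-reduce across the tensor-model-parallel group
    (not copied from the repository)."""
    # Bypass the collective when only one GPU participates.
    if get_tensor_model_parallel_world_size() == 1:
        return input_
    # All-reduce in place across the tensor-model-parallel group.
    torch.distributed.all_reduce(
        input_, group=get_tensor_model_parallel_group())
    return input_
```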