Unverified Commit dd5a6329 authored by Yuting Jiang's avatar Yuting Jiang Committed by GitHub
Browse files

Benchmarks: Add benchmark: Megatron-LM/Megatron-Deepspeed GPT pretrain benchmark (#582)

**Description**
Megatron-LM/Megatron-Deepspeed GPT pretrain benchmark
parent 254ea7fe
......@@ -41,6 +41,7 @@ RUN apt-get update && \
libtinfo5 \
libtool \
lshw \
python3-mpi4py \
net-tools \
openssh-client \
openssh-server \
......
......@@ -41,6 +41,7 @@ RUN apt-get update && \
libtinfo5 \
libtool \
lshw \
python3-mpi4py \
net-tools \
openssh-client \
openssh-server \
......
......@@ -41,6 +41,7 @@ RUN apt-get update && \
libtinfo5 \
libtool \
lshw \
python3-mpi4py \
net-tools \
numactl \
openssh-client \
......@@ -136,7 +137,7 @@ RUN echo PATH="$PATH" > /etc/environment && \
WORKDIR ${SB_HOME}
ADD third_party third_party
RUN make -C third_party rocm -o rocm_hipblaslt
RUN make -C third_party rocm -o rocm_hipblaslt -o megatron_deepspeed -o megatron_lm
ADD . .
RUN python3 -m pip install --upgrade setuptools==65.7 && \
......
......@@ -41,6 +41,7 @@ RUN apt-get update && \
libtinfo5 \
libtool \
lshw \
python3-mpi4py \
net-tools \
numactl \
openssh-client \
......@@ -141,7 +142,7 @@ RUN echo PATH="$PATH" > /etc/environment && \
WORKDIR ${SB_HOME}
ADD third_party third_party
RUN make ROCBLAS_BRANCH=release/rocm-rel-5.1 -C third_party rocm -o rocm_hipblaslt
RUN make ROCBLAS_BRANCH=release/rocm-rel-5.1 -C third_party rocm -o rocm_hipblaslt -o megatron_deepspeed -o megatron_lm
ADD . .
RUN python3 -m pip install --no-cache-dir .[amdworker] && \
......
......@@ -37,8 +37,29 @@ For inference, supported percentiles include
| Name | Unit | Description |
|-----------------------------------------------------------------------------------------|------------------------|------------------------------------------------------------------------------|
| model-benchmarks/pytorch-${model_name}/${precision}_train_step_time | time (ms) | The average training step time with fp32/fp16 precision. |
| model-benchmarks/pytorch-${model_name}/${precision}_train_throughput | throughput (samples/s) | The average training throughput with fp32/fp16 precision. |
| model-benchmarks/pytorch-${model_name}/${precision}_train_throughput | throughput (samples/s) | The average training throughput with fp32/fp16 precision per GPU. |
| model-benchmarks/pytorch-${model_name}/${precision}_inference_step_time | time (ms) | The average inference step time with fp32/fp16 precision. |
| model-benchmarks/pytorch-${model_name}/${precision}_inference_throughput | throughput (samples/s) | The average inference throughput with fp32/fp16 precision. |
| model-benchmarks/pytorch-${model_name}/${precision}_inference_step_time\_${percentile} | time (ms) | The n<sup>th</sup> percentile inference step time with fp32/fp16 precision. |
| model-benchmarks/pytorch-${model_name}/${precision}_inference_throughput\_${percentile} | throughput (samples/s) | The n<sup>th</sup> percentile inference throughput with fp32/fp16 precision. |
## Megatron Model benchmarks
### `megatron-gpt`
#### Introduction
Run GPT pretrain tasks with float32, float16, bfloat16 precisions with [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) or [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed).
`Tip: batch_size in this benchmark is the global batch size; the batch size on each GPU instance is micro_batch_size.`
#### Metrics
| Name | Unit | Description |
|---------------------------------------------------|------------------------|---------------------------------------------------------|
| megatron-gpt/${precision}_train_step_time | time (ms) | The average training step time per iteration. |
| megatron-gpt/${precision}_train_throughput | throughput (samples/s) | The average training throughput per iteration. |
| megatron-gpt/${precision}_train_tflops            | tflops/s               | The average training TFLOPS per iteration.               |
| megatron-gpt/${precision}_train_mem_allocated | GB | The average GPU memory allocated per iteration. |
| megatron-gpt/${precision}_train_max_mem_allocated | GB | The average maximum GPU memory allocated per iteration. |
......@@ -177,6 +177,7 @@ def run(self):
'xlrd>=2.0.1',
'xlsxwriter>=1.3.8',
'xmltodict>=0.12.0',
'types-requests',
],
extras_require=(
lambda x: {
......
......@@ -8,5 +8,6 @@
from superbench.benchmarks.model_benchmarks.pytorch_gpt2 import PytorchGPT2
from superbench.benchmarks.model_benchmarks.pytorch_cnn import PytorchCNN
from superbench.benchmarks.model_benchmarks.pytorch_lstm import PytorchLSTM
from superbench.benchmarks.model_benchmarks.megatron_gpt3 import MegatronGPT
__all__ = ['ModelBenchmark', 'PytorchBERT', 'PytorchGPT2', 'PytorchCNN', 'PytorchLSTM']
__all__ = ['ModelBenchmark', 'PytorchBERT', 'PytorchGPT2', 'PytorchCNN', 'PytorchLSTM', 'MegatronGPT']
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""Module of the megatron deepspeed GPT pretrain class."""
import json
import os
import statistics
import numpy as np
import requests
import torch
from pathlib import Path
import re
from superbench.benchmarks import BenchmarkRegistry
from superbench.benchmarks.context import Platform, Precision
from superbench.benchmarks.model_benchmarks.model_base import ModelBenchmark
from superbench.benchmarks.return_code import ReturnCode
from superbench.common.utils import logger, run_command
def download_file(url, path):
    """Download a file from a URL to a local path.

    Args:
        url (str): source URL.
        path (str): destination file path.

    Raises:
        requests.HTTPError: if the server responds with an error status.
        requests.Timeout: if the server does not respond within the timeout.
    """
    # Stream in chunks so large artifacts (vocab/merges/dataset files) are not
    # buffered entirely in memory; the timeout prevents hanging forever, and
    # raise_for_status avoids silently writing an HTML error page to disk.
    response = requests.get(url, stream=True, timeout=300)
    response.raise_for_status()
    with open(path, 'wb') as file:
        for chunk in response.iter_content(chunk_size=1 << 20):
            file.write(chunk)
class MegatronGPT(ModelBenchmark):
    """The Megatron DeepSpeed GPT pretrain benchmark class."""
    def __init__(self, name, parameters=''):
        """Constructor.

        Args:
            name (str): benchmark name.
            parameters (str): parameters of the benchmark.
        """
        super().__init__(name, parameters)
        # Only these precisions map onto Megatron command-line flags
        # (fp32 is the default, fp16 -> --fp16, bf16 -> --bf16);
        # anything else is rejected in _preprocess().
        self._supported_precision = [Precision.FLOAT32, Precision.FLOAT16, Precision.BFLOAT16]
def add_parser_arguments(self):
"""Add the specified arguments."""
super().add_parser_arguments()
self._parser.add_argument('--code_base', type=str, required=False, default='', help='Code base.')
self._parser.add_argument('--dataset_url', type=str, required=False, default=None, help='Dataset URL.')
self._parser.add_argument(
'--vocab_url',
type=str,
required=False,
default='https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json',
help='Vocab URL.'
)
self._parser.add_argument(
'--merges_url',
type=str,
required=False,
default='https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt',
help='Merges URL.'
)
self._parser.add_argument(
'--tokenizer_type', type=str, required=False, default='GPT2BPETokenizer', help='Tokenizer type.'
)
self._parser.add_argument('--model_size', type=int, required=False, default=6.7, help='Model size.')
self._parser.add_argument('--num_layers', type=int, required=False, default=32, help='Number of layers.')
self._parser.add_argument('--hidden_size', type=int, required=False, default=4096, help='Hidden size.')
self._parser.add_argument(
'--num_attn_heads', type=int, required=False, default=32, help='Number of attention heads.'
)
self._parser.add_argument('--micro_batch_size', type=int, required=False, default=2, help='micro batch size.')
self._parser.add_argument('--lr', type=float, required=False, default=1.2e-4, help='Learning rate.')
self._parser.add_argument('--min_lr', type=float, required=False, default=1.0e-6, help='Minimum learning rate.')
self._parser.add_argument('--init_std', type=float, required=False, default=0.009, help='Init std.')
self._parser.add_argument('--seq_len', type=int, required=False, default=2048, help='Sequence length.')
self._parser.add_argument(
'--tensor_model_parallel_size', type=int, required=False, default=1, help='Tensor model parallel size.'
)
self._parser.add_argument(
'--pipeline_model_parallel_size', type=int, required=False, default=1, help='Pipeline model parallel size.'
)
self._parser.add_argument(
'--num_gpus', type=int, required=False, default=8, help='Number of GPUs per node to run the benchmark.'
)
self._parser.add_argument(
'--num_nodes', type=int, required=False, default=1, help='Number of nodes to run the benchmark.'
)
self._parser.add_argument('--sequence_parallel', action='store_true', help='Enable Sequence parallel.')
self._parser.add_argument(
'--no_async_tensor_model_parallel_allreduce',
action='store_true',
help='No async tensor model parallel allreduce.'
)
self._parser.add_argument(
'--use_rotary_position_embeddings', action='store_true', help='Use rotary position embeddings.'
)
self._parser.add_argument(
'--no_gradient_accumulation_fusion', action='store_true', help='No gradient accumulation fusion.'
)
self._parser.add_argument('--use_flash_attn', action='store_true', help='Use flash attention.')
self._parser.add_argument('--no_masked_softmax_fusion', action='store_true', help='No masked softmax fusion.')
self._parser.add_argument('--no_bias_gelu_fusion', action='store_true', help='No bias gelu fusion.')
self._parser.add_argument('--no_bias_dropout_fusion', action='store_true', help='No bias dropout fusion.')
self._parser.add_argument(
'--train_tokens', type=int, required=False, default=300000000000, help='Train tokens.'
)
# lr configs
# Parallelism configs
self._parser.add_argument('--zero_stage', type=int, default=1, help='Zero stage.')
# Misc configs
self._parser.add_argument('--log-interval', type=int, required=False, default=1, help='Log interval.')
self._parser.add_argument('--eval_iters', type=int, default=0, help='Eval iters.')
self._parser.add_argument('--eval_interval', type=int, default=10, help='Eval interval.')
self._parser.add_argument('--num_save', type=int, default=10000, help='Num save.')
self._parser.add_argument('--save_interval', type=int, default=10000, help='Save interval.')
# Output and data configs
self._parser.add_argument('--seed', type=int, default=1234, help='Seed.')
self._parser.add_argument('--data_home', type=str, default='/tmp', help='Data home.')
self._parser.add_argument('--vocab_path', type=str, default='/tmp/gpt2-vocab.json', help='Vocab path.')
self._parser.add_argument('--merge_path', type=str, default='/tmp/gpt2-merges.txt', help='Merge path.')
self._parser.add_argument('--prescale_grad', action='store_true', help='Prescale grad.')
self._parser.add_argument(
'--hostfile', type=str, default=None, help='Hostfile to run the mutli-node benchmark.'
)
self._parser.add_argument('--data_impl', type=str, default='mmap', help='Data impl.')
self._parser.add_argument('--data_prefix', type=str, default='dataset_text_document', help='Data prefix.')
self._parser.add_argument('--deepspeed', action='store_true', help='Use deepspeed.')
self._parser.add_argument('--extra', type=str, default=None, help='Extra options for Megatron.')
def _preprocess(self):
if not super()._preprocess():
return False
if not os.path.exists(self._args.code_base) or \
not os.path.exists(os.path.join(self._args.code_base, 'pretrain_gpt.py')):
logger.error('Code base is not valid.')
self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
return False
data_parallel_size = self._args.num_gpus * self._num_nodes \
// self._args.pipeline_model_parallel_size // self._args.tensor_model_parallel_size
if self._args.micro_batch_size < 1 or \
self._args.micro_batch_size > (self._args.batch_size // data_parallel_size):
logger.error('Micro Batch size * data parallel size is larger than global batch size.')
self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
return False
for precision in self._args.precision:
if precision not in self._supported_precision:
logger.error('Precision %s is not supported.' % precision)
self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
return False
if not os.path.exists(self._args.data_home):
os.makedirs(self._args.data_home)
return True
def _parse_log(self, output):
"""Parse log output and get the performance."""
tflops_pattern = re.compile(r'TFLOPs: (\d+\.\d+)')
elapsed_time_pattern = re.compile(r'elapsed time per iteration \(ms\): (\d+\.\d+)')
mem_allocated_pattern = re.compile(r'MemAllocated=([\d.]+)[KMGTPEZY]?B')
max_mem_allocated_pattern = re.compile(r'MaxMemAllocated=([\d.]+)[KMGTPEZY]?B')
lines = output.splitlines()
tflops = []
mem_allocated = []
max_mem_allocated = []
iteration_times = []
for line in lines:
if 'TFLOPs' in line:
tflops_matches = tflops_pattern.search(line)
elapsed_time_match = elapsed_time_pattern.search(line)
if tflops_matches:
tflops_values = float(tflops_matches.group(1))
tflops.append(tflops_values)
if elapsed_time_match:
elapsed_time_value = float(elapsed_time_match.group(1))
iteration_times.append(elapsed_time_value)
if 'MaxMemAllocated' in line:
mem_allocated_match = mem_allocated_pattern.search(line)
max_mem_allocated_match = max_mem_allocated_pattern.search(line)
if mem_allocated_match:
mem_allocated_value = float(mem_allocated_match.group(1))
mem_allocated.append(mem_allocated_value)
if max_mem_allocated_match:
max_mem_allocated_value = float(max_mem_allocated_match.group(1))
max_mem_allocated.append(max_mem_allocated_value)
return iteration_times, tflops, mem_allocated, max_mem_allocated
    def __prepare_deespeed_config(self, precision_megatron):
        """Prepare deepspeed configs.

        Writes the DeepSpeed config JSON into data_home and returns the extra
        command-line options that point the launcher at it.

        Args:
            precision_megatron (str): precision key without leading dashes
                ('fp16', 'bf16', or '' for fp32).

        Return:
            str: DeepSpeed-related command-line options.
        """
        # NOTE(review): the method name contains a typo ('deespeed'); kept
        # as-is because the name-mangled attribute is referenced elsewhere.
        self._config_json_path = os.path.join(self._args.data_home, 'ds_config_gpt.json')
        # Load deepspeed config template json file
        # Loss-scaling section applied only when fp16/bf16 is enabled.
        precision_template = {
            'enabled': True,
            'loss_scale': 0,
            'loss_scale_window': 500,
            'hysteresis': 2,
            'min_loss_scale': 1,
            'initial_scale_power': 11
        }
        ds_config_template = {
            'train_batch_size': self._args.batch_size,
            'train_micro_batch_size_per_gpu': self._args.micro_batch_size,
            'steps_per_print': self._args.log_interval,
            'zero_optimization': {
                'stage': self._args.zero_stage
            },
            'gradient_clipping': 1.0,
            'prescale_gradients': self._args.prescale_grad,
        }
        if len(precision_megatron) > 0:
            ds_config_template[precision_megatron] = precision_template
        # Write to config json file
        with open(self._config_json_path, 'w') as file:
            json.dump(ds_config_template, file, indent=4)
        deepspeed_options = f'\
            --deepspeed \
            --deepspeed_config {self._config_json_path} \
            --zero-stage {self._args.zero_stage} \
            --pipeline-model-parallel-size {self._args.pipeline_model_parallel_size}'
        if self._args.pipeline_model_parallel_size <= 1:
            # Disable the pipeline engine when pipeline parallelism is off.
            deepspeed_options = f'{deepspeed_options} --no-pipeline-parallel'
        return deepspeed_options
    def _megatron_command(self, precision):    # noqa: C901
        """Generate megatron command.

        Args:
            precision (Precision): precision of the run; must be one of the
                supported precisions (validated in _preprocess()).

        Return:
            str: the full launch command (torchrun or deepspeed).
        """
        # Map the precision onto Megatron's flag; fp32 needs no flag.
        # precision is guaranteed to be one of these three by _preprocess().
        if precision == Precision.FLOAT32:
            precision_megatron = ''
        elif precision == Precision.FLOAT16:
            precision_megatron = '--fp16'
        elif precision == Precision.BFLOAT16:
            precision_megatron = '--bf16'

        # Warmup/total samples are derived from step counts x global batch size.
        megatron_options = f'\
            --override-opt_param-scheduler \
            --adam-beta1 0.9 \
            --adam-beta2 0.95 \
            --tensor-model-parallel-size {self._args.tensor_model_parallel_size} \
            --init-method-std {self._args.init_std} \
            --lr-decay-samples 43945312 \
            --lr-warmup-samples {self._args.num_warmup * self._args.batch_size} \
            --lr-decay-style cosine \
            --micro-batch-size {self._args.micro_batch_size} \
            --global-batch-size {self._args.batch_size} \
            --num-layers {self._args.num_layers} \
            --hidden-size {self._args.hidden_size} \
            --num-attention-heads {self._args.num_attn_heads} \
            --seq-length {self._args.seq_len} \
            --max-position-embeddings {self._args.seq_len} \
            --train-tokens {self._args.train_tokens} \
            --train-samples {self._args.num_steps * self._args.batch_size} \
            --lr {self._args.lr} \
            --min-lr {self._args.min_lr} \
            --split 949,50,1 \
            --log-interval {self._args.log_interval} \
            --eval-interval {self._args.eval_interval} \
            --eval-iters {self._args.eval_iters} \
            --save-interval {self._args.save_interval} \
            --weight-decay 0.1 \
            --clip-grad 1.0 \
            --hysteresis 2 \
            --num-workers {self._args.num_workers} \
            --attention-dropout 0.0 \
            --hidden-dropout 0.0 \
            --optimizer adam \
            --use-distributed-optimizer \
            {precision_megatron} \
            --seed {self._args.seed}'

        # Append optional feature flags that were requested on the CLI.
        if self._args.sequence_parallel:
            megatron_options = f'{megatron_options} --sequence-parallel'
        if self._args.no_async_tensor_model_parallel_allreduce:
            megatron_options = f'{megatron_options} --no-async-tensor-model-parallel-allreduce'
        if self._args.use_rotary_position_embeddings:
            megatron_options = f'{megatron_options} --use-rotary-position-embeddings'
        if self._args.no_gradient_accumulation_fusion:
            megatron_options = f'{megatron_options} --no-gradient-accumulation-fusion'
        if self._args.use_flash_attn:
            megatron_options = f'{megatron_options} --use-flash-attn'
        if self._args.no_masked_softmax_fusion:
            megatron_options = f'{megatron_options} --no-masked-softmax-fusion'
        if self._args.no_bias_gelu_fusion:
            megatron_options = f'{megatron_options} --no-bias-gelu-fusion'
        if self._args.no_bias_dropout_fusion:
            megatron_options = f'{megatron_options} --no-bias-dropout-fusion'
        if self._args.extra:
            megatron_options = f'{megatron_options} {self._args.extra}'

        command = ''
        script_path = os.path.join(self._args.code_base, 'pretrain_gpt.py')
        if self._args.deepspeed:
            # Strip leading dashes so the flag name doubles as the DS config key.
            deepspeed_option = self.__prepare_deespeed_config(precision_megatron.lstrip('--'))
            if self._num_nodes > 1:
                # Multi-node: launch via torchrun; MPI supplies per-node ranks.
                command = f'torchrun {self._distributed_args} ' + \
                    f'{script_path} {megatron_options} {self._data_options} {deepspeed_option}'
            else:
                command = f'deepspeed {script_path} {megatron_options} {self._data_options} {deepspeed_option}'
        else:
            command = f'torchrun {self._distributed_args} {script_path} {megatron_options} {self._data_options}'
        return command
def _train_step(self, precision): # noqa: E501
"""Train the model and get the performance."""
command = self._megatron_command(precision)
local_rank = os.environ.pop('OMPI_COMM_WORLD_LOCAL_RANK', None)
logger.info('Running command: {}.'.format(command))
output = run_command(command, flush_output=True)
os.environ['OMPI_COMM_WORLD_LOCAL_RANK'] = local_rank
iteration_times = []
info = {}
# last rank will print the result, first rank will print the memory usage
if self._num_nodes == 1 or \
int(os.environ['OMPI_COMM_WORLD_RANK']) == int(os.environ['OMPI_COMM_WORLD_SIZE']) - 1 \
or int(os.environ['OMPI_COMM_WORLD_RANK']) == 0:
iteration_times, tflops, mem_allocated, max_mem_allocated = self._parse_log(output.stdout)
if len(tflops) > 0:
info['tflops'] = tflops
if len(mem_allocated) > 0:
info['mem_allocated'] = mem_allocated
if len(max_mem_allocated) > 0:
info['max_mem_allocated'] = max_mem_allocated
if not iteration_times:
iteration_times = [-1 for i in range(self._args.num_steps)]
return iteration_times, info
def _sync_result(self, data):
"""Sync the result of model benchmarking.
Args:
data (list): the data to be reduced.
"""
from mpi4py import MPI
comm = MPI.COMM_WORLD
data = np.array(data, dtype=np.float64)
# Reduce the data to a single value on rank 0
result = np.zeros_like(data)
comm.Allreduce([data, MPI.DOUBLE], [result, MPI.DOUBLE], op=MPI.MAX)
return result.tolist()
def _process_info(self, model_action, precision, info):
"""Process the result of model benchmarking."""
precision_metric = {'float16': 'fp16', 'float32': 'fp32', 'bfloat16': 'bf16'}
if precision.value in precision_metric.keys():
precision = precision_metric[precision.value]
for key, values in info.items():
metric = '{}_{}_{}'.format(precision, model_action, key)
self._result.add_raw_data(metric, values, self._args.log_raw_data)
self._result.add_result(metric, statistics.mean(values))
logger.info(
'Average {} - round: {}, model: {}, precision: {}, value: {:.6f}.'.format(
key, self._curr_run_index, self._name, precision, statistics.mean(values)
)
)
def _judge_gpu_availability(self):
"""Judge GPUs' availability according to arguments and running environment."""
self._gpu_available = not self._args.no_gpu and torch.cuda.is_available()
def _init_distributed_setting(self):
"""Initialize the distributed library and bind the worker to GPU.
Return:
True if distributed library is initialized successfully.
"""
if not os.getenv('OMPI_COMM_WORLD_SIZE'):
logger.error('MPI is not enabled.')
return False
self._num_nodes = int(os.getenv('OMPI_COMM_WORLD_SIZE')) // int(os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE'))
if self._num_nodes > 1:
if not self._args.hostfile:
sb_hostfile = os.path.join(os.environ.get('SB_WORKSPACE', '.'), 'hostfile')
if os.path.exists(sb_hostfile):
hosts = open(sb_hostfile).read().split('\n')
hosts = [f'{host} slots={self._args.num_gpus}' for host in hosts if host != '']
self._args.hostfile = os.path.join(self._args.data_home, 'hostfile')
with open(self._args.hostfile, 'w') as file:
file.write('\n'.join(hosts))
if not os.path.exists(self._args.hostfile):
logger.error('Hostfile not found.')
return False
hosts = open(self._args.hostfile, 'r').readlines()
if self._num_nodes != len(hosts):
logger.error('MPI init failed since hostfile not match the MPI setting.')
return False
addr = os.getenv('MASTER_ADDR', hosts[0].split()[0])
port = os.getenv('MASTER_PORT', '29500')
node_rank = int(os.environ['OMPI_COMM_WORLD_RANK']) // int(os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'])
self._distributed_args = f'--nproc_per_node {self._args.num_gpus} --nnodes {self._num_nodes} ' + \
f'--node_rank {node_rank} --master_addr {addr} --master_port {port}'
return True
def _generate_dataset(self):
"""Generate dataset for benchmarking.
Return:
True if dataset is created successfully.
"""
self._vocab_path = str(Path(self._args.data_home) / 'gpt2-vocab.json')
download_file(self._args.vocab_url, self._vocab_path)
self._merges_path = str(Path(self._args.data_home) / 'gpt2-merges.txt')
download_file(self._args.merges_url, self._merges_path)
if not os.path.exists(os.path.join(self._args.data_home, f'{self._args.data_prefix}.bin')) \
or not os.path.exists(os.path.join(self._args.data_home, f'{self._args.data_prefix}.idx')):
if self._args.dataset_url:
self._raw_data_path = str(Path(self._args.data_home) / 'data.json')
download_file(self._args.dataset_url, self._raw_data_path)
command = (
'python3 '
f'{os.path.join(self._args.code_base, "tools/preprocess_data.py")} '
f'--input {self._raw_data_path} '
f'--tokenizer-type {self._args.tokenizer_type} '
f'--output-prefix {os.path.join(self._args.data_home, "dataset")} '
f'--workers {str(self._args.num_workers)} '
f'--vocab-file {self._vocab_path} '
f'--merge-file {self._merges_path}'
)
# split documents
run_command(command, flush_output=True)
# binarize dataset
run_command(command, flush_output=True)
if not os.path.exists(os.path.join(self._args.data_home, f'{self._args.data_prefix}.bin')) \
or not os.path.exists(os.path.join(self._args.data_home, f'{self._args.data_prefix}.idx')):
logger.error('Dataset failed to generate.')
self._result.set_return_code(ReturnCode.DATASET_GENERATION_FAILURE)
return False
else:
logger.error('No dataset or dataset url provided.')
self._result.set_return_code(ReturnCode.DATASET_GENERATION_FAILURE)
return False
self._data_path = os.path.join(self._args.data_home, f'{self._args.data_prefix}')
self._data_options = f'\
--vocab-file {self._vocab_path} \
--merge-file {self._merges_path} \
--data-path {self._data_path} \
--data-impl {self._args.data_impl}'
logger.info('Dataset preparation successfully.')
return True
    def _set_force_fp32(self):
        """Set force FP32.

        No-op for this benchmark: precision is controlled through the Megatron
        command line instead of torch settings.
        """
        pass

    def _init_dataloader(self):
        """Initialize the dataloader.

        No-op: the Megatron pretrain script builds its own data pipeline.

        Return:
            True if dataloader is created successfully.
        """
        return True

    def _create_optimizer(self):
        """Create the optimizer instance used for training and wrap with distributed library if need.

        No-op: the optimizer is created inside the Megatron pretrain script.

        Return:
            True if optimizer instance is created successfully.
        """
        return True

    def _create_model(self, precision):
        """Construct the model for benchmarking.

        No-op: the model is constructed by the Megatron pretrain script.

        Args:
            precision (Precision): precision of model and input data, such as float32, float16.
        """
        return True

    def _inference_step(self, precision):
        """Define the inference process.

        Not supported by this benchmark; only training is measured.

        Args:
            precision (Precision): precision of model and input data,
                such as float32, float16.

        Return:
            The latency list of every inference operation.
        """
        pass

    def _cal_params_count(self):
        """Calculate the parameters scale of the model.

        Not implemented: the parameter count is not reported by this benchmark.

        Return:
            The count of trainable parameters.
        """
        pass
# Register the Megatron GPT pretrain benchmark for both CUDA and ROCm platforms.
BenchmarkRegistry.register_benchmark('megatron-gpt', MegatronGPT, parameters='', platform=Platform.CUDA)
BenchmarkRegistry.register_benchmark('megatron-gpt', MegatronGPT, parameters='', platform=Platform.ROCM)
......@@ -7,6 +7,7 @@
import time
import statistics
from abc import abstractmethod
from typing import Union
from superbench.common.utils import logger, stdout_logger
from superbench.benchmarks import Precision, ModelAction, DistributedImpl, DistributedBackend, BenchmarkType, ReturnCode
......@@ -263,6 +264,10 @@ def __train(self, precision):
# The unit of step time should be millisecond.
step_times = self._train_step(precision)
if isinstance(step_times, tuple):
step_times = step_times[0]
info = step_times[1]
self._process_info(ModelAction.TRAIN, precision, info)
step_times = self.__process_model_result(ModelAction.TRAIN, precision, step_times)
if not step_times:
self._result.set_return_code(ReturnCode.INVALID_BENCHMARK_RESULT)
......@@ -302,7 +307,7 @@ def __inference(self, precision):
return True
@abstractmethod
def _train_step(self, precision):
def _train_step(self, precision) -> Union[list, tuple]:
"""Define the training process.
Args:
......@@ -418,6 +423,7 @@ def __process_model_result(self, model_action, precision, step_times):
precision_metric = {'float16': 'fp16', 'float32': 'fp32', 'float64': 'fp64', 'bfloat16': 'bf16'}
if precision.value in precision_metric.keys():
precision = precision_metric[precision.value]
metric_s = '{}_{}_step_time'.format(precision, model_action)
metric_t = '{}_{}_throughput'.format(precision, model_action)
# The unit of step time is millisecond, use it to calculate the throughput with the unit samples/sec.
......@@ -428,7 +434,7 @@ def __process_model_result(self, model_action, precision, step_times):
if model_action == ModelAction.TRAIN:
step_times = self._sync_result(step_times)
if not step_times:
if not step_times or statistics.mean(step_times) < 0:
return None
if self._local_rank is None or self._global_rank == 0:
self._result.add_result(metric_s, statistics.mean(step_times))
......@@ -468,3 +474,13 @@ def _log_step_time(self, curr_step, precision, duration):
step_time = statistics.mean(duration) if len(duration) < self._args.log_n_steps \
else statistics.mean(duration[-self._args.log_n_steps:])
stdout_logger.log(f'{self._name} - {precision.value}: step {curr_step}, step time {step_time}\n')
    def _process_info(self, model_action, precision, info):
        """Process other info.

        Hook for subclasses that report auxiliary metrics beyond step time and
        throughput; the base implementation intentionally does nothing.

        Args:
            model_action (ModelAction): train or inference.
            precision (Precision): precision of model.
            info (dict): other info.
        """
        pass
......@@ -207,6 +207,23 @@ superbench:
seq_length: 224
batch_size: 1
precision: int8
megatron-gpt:
modes:
- name: mpi
proc_num: 1
node_num: all
parameters:
code_base: /opt/superbench/third_party/Megatron/Megatron-DeepSpeed/
dataset_url: https://huggingface.co/datasets/suolyer/pile_bookcorpus2/raw/main/test.json
batch_size: 2048
num_warmup: 0
num_steps: 10
precision:
- float16
- bfloat16
deepspeed: yes
sequence_parallel: yes
use_rotary_position_embeddings: yes
gpt_models:
<<: *default_pytorch_mode
models:
......
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""Tests for BERT model benchmarks."""
import os
from pathlib import Path
import statistics
from unittest import mock
import unittest
from superbench.benchmarks.context import ModelAction, Precision
from tests.helper import decorator
from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
from tests.helper.testcase import BenchmarkTestCase
class MegatronGPTTest(BenchmarkTestCase, unittest.TestCase):
    """Tests for the megatron-gpt benchmark."""
    @classmethod
    def setUpClass(cls):
        """Hook method for setting up class fixture before running tests in the class."""
        super().setUpClass()
        cls.benchmark_name = 'megatron-gpt'
        cls.createMockEnvs(cls)
        # Shared hostfile path inside the temp dir; individual tests (re)write it.
        cls.hostfile_path = os.path.join(cls._tmp_dir, 'hostfile')
@classmethod
def tearDownClass(cls):
"""Hook method for deconstructing the class fixture after running all tests in the class."""
for p in [
Path(cls._tmp_dir) / 'pretrain_gpt.py',
Path(cls._tmp_dir) / 'customdataset_text_document.bin',
Path(cls._tmp_dir) / 'customdataset_text_document.idx',
Path(cls._tmp_dir) / 'hostfile'
]:
if p.is_file():
p.unlink()
super().tearDownClass()
    @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset')
    def test_megatron_gpt_preprocess(self, mock_generate_dataset):
        """Test megatron-gpt benchmark preprocessing and distributed-setting init."""
        # Check registry.
        (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA)
        assert (benchmark_cls)
        benchmark = benchmark_cls(
            self.benchmark_name,
            parameters=f'--hostfile {self.hostfile_path} --batch_size 2048',
        )
        # Check init distributed setting.
        # 2 MPI ranks but 3 hosts in the hostfile -> mismatch -> init failure.
        os.environ['OMPI_COMM_WORLD_SIZE'] = '2'
        os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'] = '1'
        os.environ['OMPI_COMM_WORLD_RANK'] = '0'
        os.environ['MASTER_ADDR'] = 'localhost'
        os.environ['MASTER_PORT'] = '12345'
        with open(self.hostfile_path, 'w') as f:
            f.write('host1\n')
            f.write('host2\n')
            f.write('host3\n')
        mock_generate_dataset.return_value = True
        ret = benchmark._preprocess()
        assert (ret is False)
        assert (benchmark.return_code == ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE)
        # Nonexistent hostfile path -> init failure.
        benchmark = benchmark_cls(
            self.benchmark_name,
            parameters='--hostfile xxx --batch_size 2048',
        )
        mock_generate_dataset.return_value = True
        ret = benchmark._preprocess()
        assert (ret is False)
        assert (benchmark.return_code == ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE)
        # 3 ranks matching 3 hosts -> torchrun args should be populated.
        os.environ['OMPI_COMM_WORLD_SIZE'] = '3'
        os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'] = '1'
        benchmark = benchmark_cls(
            self.benchmark_name,
            parameters=f'--hostfile {self.hostfile_path} --batch_size 2048',
        )
        mock_generate_dataset.return_value = True
        benchmark._preprocess()
        self.assertEqual(benchmark._num_nodes, 3)
        self.assertEqual(
            benchmark._distributed_args,
            '--nproc_per_node {0} --nnodes {1} --node_rank {2} --master_addr {3} --master_port {4}'.format(
                benchmark._args.num_gpus, benchmark._num_nodes, 0, 'localhost', '12345'
            )
        )
        # Check preprocessing.
        # Negative cases
        # code_base directory exists but pretrain_gpt.py is missing.
        benchmark = benchmark_cls(
            self.benchmark_name,
            parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} --batch_size 2048',
        )
        mock_generate_dataset.return_value = True
        ret = benchmark._preprocess()
        assert (ret is False)
        self.createMockFiles(['pretrain_gpt.py'])
        # invalid micro batch size
        benchmark = benchmark_cls(
            self.benchmark_name,
            parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} --micro_batch_size -1',
        )
        mock_generate_dataset.return_value = True
        ret = benchmark._preprocess()
        assert (ret is False)
        # micro batch size exceeding global batch size / data parallel size.
        benchmark = benchmark_cls(
            self.benchmark_name,
            parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} --micro_batch_size 4096',
        )
        mock_generate_dataset.return_value = True
        ret = benchmark._preprocess()
        assert (ret is False)
        # invalid precision
        benchmark = benchmark_cls(
            self.benchmark_name,
            parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} \
            --batch_size 2048 --precision int8',
        )
        mock_generate_dataset.return_value = True
        ret = benchmark._preprocess()
        assert (ret is False)
        # Positive cases
        benchmark = benchmark_cls(
            self.benchmark_name,
            parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} --batch_size 2048',
        )
        mock_generate_dataset.return_value = True
        ret = benchmark._preprocess()
        assert (ret is True)
    def test_megatron_gpt_dataset(self):
        """Test dataset generation."""
        (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA)
        assert (benchmark_cls)
        # Single-node MPI environment.
        os.environ['OMPI_COMM_WORLD_SIZE'] = '1'
        os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'] = '1'
        os.environ['OMPI_COMM_WORLD_RANK'] = '0'
        os.environ['MASTER_ADDR'] = 'localhost'
        os.environ['MASTER_PORT'] = '12345'
        # use existing dataset
        # Pre-create the .bin/.idx pair so _generate_dataset takes the
        # "dataset already exists" path and skips preprocessing.
        self.createMockFiles(['customdataset_text_document.bin', 'customdataset_text_document.idx'])
        benchmark = benchmark_cls(
            self.benchmark_name,
            parameters=f'--code_base /root/Megatron-DeepSpeed --data_home {self._tmp_dir} \
            --batch_size 2048 --data_prefix customdataset_text_document',
        )
        # _preprocess return value is intentionally unchecked here; only the
        # dataset-generation path is under test.
        ret = benchmark._preprocess()
        ret = benchmark._generate_dataset()
        assert (ret is True)
    @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset')
    def test_megatron_gpt_command(self, mock_generate_dataset):
        """Test command generation.

        Checks the full launcher command produced by ``_megatron_command`` in
        two modes:
        * multi-node (world size 2, hostfile with 2 hosts) -> ``torchrun``
          launcher, exercised for fp32/fp16/bf16 precision flags;
        * single-node with ``--deepspeed`` -> ``deepspeed`` launcher with the
          extra DeepSpeed/ZeRO options appended.

        ``_generate_dataset`` is mocked so no real data is downloaded, and
        ``_data_options`` is overwritten with a deterministic string so the
        expected command can be compared byte-for-byte.
        """
        (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA)
        assert (benchmark_cls)
        # Fake a 2-node (1 process per node) distributed environment.
        os.environ['OMPI_COMM_WORLD_SIZE'] = '2'
        os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'] = '1'
        os.environ['OMPI_COMM_WORLD_RANK'] = '0'
        os.environ['MASTER_ADDR'] = 'localhost'
        os.environ['MASTER_PORT'] = '12345'
        with open(self.hostfile_path, 'w') as f:
            f.write('host1\n')
            f.write('host2\n')
        # use url to process dataset
        benchmark = benchmark_cls(
            self.benchmark_name,
            parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} \
                --num_warmup 0 --num_steps 10 --batch_size 2048 --data_prefix dataset_text_document',
        )
        mock_generate_dataset.return_value = True
        benchmark._preprocess()
        # Pin the data options so the expected command below is deterministic.
        benchmark._data_options = f'\
            --vocab-file {self._tmp_dir}/gpt2-vocab.json \
            --merge-file {self._tmp_dir}/gpt2-merges.txt \
            --data-path {self._tmp_dir}/dataset_text_document \
            --data-impl mmap'
        script_path = str(Path(self._tmp_dir) / 'pretrain_gpt.py')
        # Template of the torchrun command; {precision} is filled per case.
        expected_command = 'torchrun {distributed_args} {script_path} \
            --override-opt_param-scheduler \
            --adam-beta1 0.9 \
            --adam-beta2 0.95 \
            --tensor-model-parallel-size 1 \
            --init-method-std 0.009 \
            --lr-decay-samples 43945312 \
            --lr-warmup-samples 0 \
            --lr-decay-style cosine \
            --micro-batch-size 2 \
            --global-batch-size 2048 \
            --num-layers 32 \
            --hidden-size 4096 \
            --num-attention-heads 32 \
            --seq-length 2048 \
            --max-position-embeddings 2048 \
            --train-tokens 300000000000 \
            --train-samples 20480 \
            --lr 0.00012 \
            --min-lr 1e-06 \
            --split 949,50,1 \
            --log-interval 1 \
            --eval-interval 10 \
            --eval-iters 0 \
            --save-interval 10000 \
            --weight-decay 0.1 \
            --clip-grad 1.0 \
            --hysteresis 2 \
            --num-workers 8 \
            --attention-dropout 0.0 \
            --hidden-dropout 0.0 \
            --optimizer adam \
            --use-distributed-optimizer \
            {precision} \
            --seed 1234 {data_options}'
        # fp32 adds no precision flag (empty placeholder).
        precision = Precision.FLOAT32
        command = benchmark._megatron_command(precision)
        self.assertEqual(
            command,
            expected_command.format(
                precision='',
                data_options=benchmark._data_options,
                distributed_args=benchmark._distributed_args,
                script_path=script_path
            )
        )
        precision = Precision.FLOAT16
        command = benchmark._megatron_command(precision)
        self.assertEqual(
            command,
            expected_command.format(
                precision='--fp16',
                data_options=benchmark._data_options,
                distributed_args=benchmark._distributed_args,
                script_path=script_path
            )
        )
        precision = Precision.BFLOAT16
        command = benchmark._megatron_command(precision)
        self.assertEqual(
            command,
            expected_command.format(
                precision='--bf16',
                data_options=benchmark._data_options,
                distributed_args=benchmark._distributed_args,
                script_path=script_path
            )
        )
        # Single-node + --deepspeed: the launcher switches to `deepspeed`
        # and the DeepSpeed/ZeRO options are appended to the command.
        os.environ['OMPI_COMM_WORLD_SIZE'] = '1'
        benchmark = benchmark_cls(
            self.benchmark_name,
            parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} \
                --num_warmup 0 --num_steps 10 --batch_size 2048 --data_prefix dataset_text_document --deepspeed',
        )
        mock_generate_dataset.return_value = True
        benchmark._preprocess()
        benchmark._data_options = f'\
            --vocab-file {self._tmp_dir}/gpt2-vocab.json \
            --merge-file {self._tmp_dir}/gpt2-merges.txt \
            --data-path {self._tmp_dir}/dataset_text_document \
            --data-impl mmap'
        command = benchmark._megatron_command(Precision.BFLOAT16)
        # NOTE(review): 'deepseed_options' below is a typo for
        # 'deepspeed_options'; harmless since it is only a local format key,
        # but worth renaming for clarity.
        expected_command = 'deepspeed {script_path} \
            --override-opt_param-scheduler \
            --adam-beta1 0.9 \
            --adam-beta2 0.95 \
            --tensor-model-parallel-size 1 \
            --init-method-std 0.009 \
            --lr-decay-samples 43945312 \
            --lr-warmup-samples 0 \
            --lr-decay-style cosine \
            --micro-batch-size 2 \
            --global-batch-size 2048 \
            --num-layers 32 \
            --hidden-size 4096 \
            --num-attention-heads 32 \
            --seq-length 2048 \
            --max-position-embeddings 2048 \
            --train-tokens 300000000000 \
            --train-samples 20480 \
            --lr 0.00012 \
            --min-lr 1e-06 \
            --split 949,50,1 \
            --log-interval 1 \
            --eval-interval 10 \
            --eval-iters 0 \
            --save-interval 10000 \
            --weight-decay 0.1 \
            --clip-grad 1.0 \
            --hysteresis 2 \
            --num-workers 8 \
            --attention-dropout 0.0 \
            --hidden-dropout 0.0 \
            --optimizer adam \
            --use-distributed-optimizer \
            {precision} \
            --seed 1234 {data_options} {deepseed_options}'
        expect_ds_options = f'\
            --deepspeed \
            --deepspeed_config {benchmark._config_json_path} \
            --zero-stage 1 \
            --pipeline-model-parallel-size 1 --no-pipeline-parallel'
        self.assertEqual(
            command,
            expected_command.format(
                precision='--bf16',
                data_options=benchmark._data_options,
                script_path=script_path,
                deepseed_options=expect_ds_options
            )
        )
    @decorator.load_data('tests/data/megatron_deepspeed.log')
    @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset')
    def test_megatron_parse_log(self, raw_output, mock_generate_dataset):
        """Test parse log function.

        Feeds a captured Megatron-DeepSpeed training log (loaded by the
        ``decorator.load_data`` fixture) to ``_parse_log`` and checks the
        extracted per-iteration metrics, then verifies ``_process_info``
        publishes them under the fp16 training result keys. The expected
        means (75239.24 ms, 149.136 TFLOPS, 17.54/66.97 GB) are pinned to
        the fixture's contents.
        """
        (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA)
        assert (benchmark_cls)
        # Fake a single-node, single-process distributed environment.
        os.environ['OMPI_COMM_WORLD_SIZE'] = '1'
        os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'] = '1'
        os.environ['OMPI_COMM_WORLD_RANK'] = '0'
        os.environ['MASTER_ADDR'] = 'localhost'
        os.environ['MASTER_PORT'] = '12345'
        # use url to process dataset
        benchmark = benchmark_cls(
            self.benchmark_name,
            parameters=f'--code_base {self._tmp_dir} --num_warmup 0 --num_steps 10 --batch_size 2048',
        )
        mock_generate_dataset.return_value = True
        # NOTE(review): _preprocess() return value is not asserted here;
        # the test only exercises the log-parsing path below.
        benchmark._preprocess()
        benchmark._data_options = f'\
            --vocab-file {self._tmp_dir}/gpt2-vocab.json \
            --merge-file {self._tmp_dir}/gpt2-merges.txt \
            --data-path {self._tmp_dir}/dataset_text_document \
            --data-impl mmap'
        iteration_times, tflops, mem_allocated, max_mem_allocated = benchmark._parse_log(raw_output)
        assert (statistics.mean(iteration_times) == 75239.24)
        assert (statistics.mean(tflops) == 149.136)
        assert (statistics.mean(mem_allocated) == 17.54)
        assert (statistics.mean(max_mem_allocated) == 66.97)
        info = {'tflops': tflops, 'mem_allocated': mem_allocated, 'max_mem_allocated': max_mem_allocated}
        benchmark._process_info(ModelAction.TRAIN, Precision.FLOAT16, info)
        assert (benchmark.result is not None)
        assert (benchmark.result['fp16_train_tflops'][0] == 149.136)
        assert (benchmark.result['fp16_train_mem_allocated'][0] == 17.54)
        assert (benchmark.result['fp16_train_max_mem_allocated'][0] == 66.97)
[2023-11-29 08:50:44,619] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2023-11-29 08:50:46,231] [INFO] [runner.py:463:main] Using IP address of 10.218.187.178 for node vm-07-05
[2023-11-29 08:50:46,232] [INFO] [multinode_runner.py:72:get_cmd] Running on the following workers: vm-07-05,vm-07-14
[2023-11-29 08:50:46,232] [INFO] [runner.py:570:main] cmd = pdsh -S -f 1024 -w vm-07-05,vm-07-14 export PYTHONPATH=/root/Megatron-DeepSpeed/examples_deepspeed/rebase::/root/Megatron-DeepSpeed; export UCX_HOME=/opt/ucx; cd /root/Megatron-DeepSpeed/examples_deepspeed/rebase; /opt/conda/envs/py_3.9/bin/python -u -m deepspeed.launcher.launch --world_info=eyJ2bS0wNy0wNSI6IFswLCAxLCAyLCAzLCA0LCA1LCA2LCA3XSwgInZtLTA3LTE0IjogWzAsIDEsIDIsIDMsIDQsIDUsIDYsIDddfQ== --node_rank=%n --master_addr=10.218.187.178 --master_port=29500 /root/Megatron-DeepSpeed/examples_deepspeed/rebase/../../pretrain_gpt.py --override-opt_param-scheduler --adam-beta1 '0.9' --adam-beta2 '0.95' --tensor-model-parallel-size '1' --init-method-std '0.009' --lr-decay-samples '43945312' --lr-warmup-samples '2048000' --lr-decay-style 'cosine' --micro-batch-size '2' --exit-duration-in-mins '30000000' --global-batch-size '2048' --num-layers '32' --hidden-size '4096' --num-attention-heads '32' --seq-length '2048' --max-position-embeddings '2048' --train-tokens '300000000000' --train-samples '10240' --lr '1.2e-4' --min-lr '1.0e-6' --split '949,50,1' --log-interval '1' --eval-interval '500' --eval-iters '10' --save-interval '10000' --weight-decay '0.1' --clip-grad '1.0' --hysteresis '2' --num-workers '2' --attention-dropout '0.0' --hidden-dropout '0.0' --optimizer 'adam' --use-distributed-optimizer --sequence-parallel --fp16 --seed '1234' --load './/checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase' --save './/checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase' --no-async-tensor-model-parallel-allreduce --use-rotary-position-embeddings --no-gradient-accumulation-fusion --vocab-file 'gpt2-vocab.json' --merge-file 'gpt2-merges.txt' --data-path '/root//dataset_text_sentence' --data-impl 'mmap' --deepspeed --deepspeed_config 'ds_config_gbs2048_mbs2_log1_zero1.json' --zero-stage '1' --pipeline-model-parallel-size '1' --no-pipeline-parallel
vm-07-05: [2023-11-29 08:50:48,288] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
vm-07-14: [2023-11-29 08:50:48,369] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
vm-07-05: [2023-11-29 08:50:49,536] [INFO] [launch.py:145:main] WORLD INFO DICT: {'vm-07-05': [0, 1, 2, 3, 4, 5, 6, 7], 'vm-07-14': [0, 1, 2, 3, 4, 5, 6, 7]}
vm-07-05: [2023-11-29 08:50:49,536] [INFO] [launch.py:151:main] nnodes=2, num_local_procs=8, node_rank=0
vm-07-05: [2023-11-29 08:50:49,536] [INFO] [launch.py:162:main] global_rank_mapping=defaultdict(<class 'list'>, {'vm-07-05': [0, 1, 2, 3, 4, 5, 6, 7], 'vm-07-14': [8, 9, 10, 11, 12, 13, 14, 15]})
vm-07-05: [2023-11-29 08:50:49,536] [INFO] [launch.py:163:main] dist_world_size=16
vm-07-05: [2023-11-29 08:50:49,536] [INFO] [launch.py:165:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
vm-07-14: [2023-11-29 08:50:49,657] [INFO] [launch.py:145:main] WORLD INFO DICT: {'vm-07-05': [0, 1, 2, 3, 4, 5, 6, 7], 'vm-07-14': [0, 1, 2, 3, 4, 5, 6, 7]}
vm-07-14: [2023-11-29 08:50:49,657] [INFO] [launch.py:151:main] nnodes=2, num_local_procs=8, node_rank=1
vm-07-14: [2023-11-29 08:50:49,657] [INFO] [launch.py:162:main] global_rank_mapping=defaultdict(<class 'list'>, {'vm-07-05': [0, 1, 2, 3, 4, 5, 6, 7], 'vm-07-14': [8, 9, 10, 11, 12, 13, 14, 15]})
vm-07-14: [2023-11-29 08:50:49,657] [INFO] [launch.py:163:main] dist_world_size=16
vm-07-14: [2023-11-29 08:50:49,657] [INFO] [launch.py:165:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
vm-07-05: [2023-11-29 08:50:51,594] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
vm-07-05: [2023-11-29 08:50:51,640] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
vm-07-05: [2023-11-29 08:50:51,644] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
vm-07-05: [2023-11-29 08:50:51,644] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
vm-07-14: [2023-11-29 08:50:51,660] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
vm-07-05: [2023-11-29 08:50:51,675] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
vm-07-05: [2023-11-29 08:50:51,684] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
vm-07-05: [2023-11-29 08:50:51,705] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
vm-07-05: [2023-11-29 08:50:51,713] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
vm-07-14: [2023-11-29 08:50:51,724] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
vm-07-14: [2023-11-29 08:50:51,777] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
vm-07-14: [2023-11-29 08:50:51,780] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
vm-07-14: [2023-11-29 08:50:51,784] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
vm-07-14: [2023-11-29 08:50:51,820] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
vm-07-14: [2023-11-29 08:50:51,820] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
vm-07-14: [2023-11-29 08:50:51,820] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
vm-07-05: Deterministic: False
vm-07-05: Performance Mode: True
vm-07-05: Using QLoop: True
vm-07-05: Deterministic: False
vm-07-05: Performance Mode: True
vm-07-05: Using QLoop: True
vm-07-05: Deterministic: False
vm-07-05: Performance Mode: True
vm-07-05: Using QLoop: True
vm-07-14: Deterministic: False
vm-07-14: Performance Mode: True
vm-07-14: Using QLoop: True
vm-07-05: Deterministic: False
vm-07-05: Performance Mode: True
vm-07-05: Using QLoop: True
vm-07-05: Deterministic: False
vm-07-05: Performance Mode: True
vm-07-05: Using QLoop: True
vm-07-05: Deterministic: False
vm-07-05: Performance Mode: True
vm-07-05: Using QLoop: True
vm-07-14: Deterministic: False
vm-07-14: Performance Mode: True
vm-07-14: Using QLoop: True
vm-07-05: Deterministic: False
vm-07-05: Performance Mode: True
vm-07-05: Using QLoop: True
vm-07-05: Deterministic: False
vm-07-05: Performance Mode: True
vm-07-05: Using QLoop: True
vm-07-14: Deterministic: False
vm-07-14: Performance Mode: True
vm-07-14: Using QLoop: True
vm-07-14: Deterministic: False
vm-07-14: Performance Mode: True
vm-07-14: Using QLoop: True
vm-07-14: Deterministic: False
vm-07-14: Performance Mode: True
vm-07-14: Using QLoop: True
vm-07-14: Deterministic: False
vm-07-14: Performance Mode: True
vm-07-14: Using QLoop: True
vm-07-14: Deterministic: False
vm-07-14: Performance Mode: True
vm-07-14: Using QLoop: True
vm-07-14: Deterministic: False
vm-07-14: Performance Mode: True
vm-07-14: Using QLoop: True
vm-07-05: --------------------------------------------------
vm-07-05: DeepSpeed C++/CUDA extension op report
vm-07-05: --------------------------------------------------
vm-07-05: NOTE: Ops not installed will be just-in-time (JIT) compiled at
vm-07-05: runtime if needed. Op compatibility means that your system
vm-07-05: meet the required dependencies to JIT install the op.
vm-07-05: --------------------------------------------------
vm-07-05: JIT compiled ops requires ninja
vm-07-05: ninja .................. [OKAY]
vm-07-05: --------------------------------------------------
vm-07-05: op name ................ installed .. compatible
vm-07-05: --------------------------------------------------
vm-07-05: --------------------------------------------------
vm-07-05: DeepSpeed C++/CUDA extension op report
vm-07-05: --------------------------------------------------
vm-07-05: NOTE: Ops not installed will be just-in-time (JIT) compiled at
vm-07-05: runtime if needed. Op compatibility means that your system
vm-07-05: meet the required dependencies to JIT install the op.
vm-07-05: --------------------------------------------------
vm-07-05: JIT compiled ops requires ninja
vm-07-05: ninja .................. [OKAY]
vm-07-05: --------------------------------------------------
vm-07-05: op name ................ installed .. compatible
vm-07-05: --------------------------------------------------
vm-07-05: --------------------------------------------------
vm-07-05: DeepSpeed C++/CUDA extension op report
vm-07-05: --------------------------------------------------
vm-07-05: NOTE: Ops not installed will be just-in-time (JIT) compiled at
vm-07-05: runtime if needed. Op compatibility means that your system
vm-07-05: meet the required dependencies to JIT install the op.
vm-07-05: --------------------------------------------------
vm-07-05: JIT compiled ops requires ninja
vm-07-05: ninja .................. [OKAY]
vm-07-05: --------------------------------------------------
vm-07-05: op name ................ installed .. compatible
vm-07-05: --------------------------------------------------
vm-07-05: async_io ............... [NO] ....... [OKAY]
vm-07-05: fused_adam ............. [NO] ....... [OKAY]
vm-07-05: cpu_adam ............... [NO] ....... [OKAY]
vm-07-05: cpu_adagrad ............ [NO] ....... [OKAY]
vm-07-05: cpu_lion ............... [NO] ....... [OKAY]
vm-07-05:  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
vm-07-05: evoformer_attn ......... [NO] ....... [NO]
vm-07-05: fused_lamb ............. [NO] ....... [OKAY]
vm-07-05: fused_lion ............. [NO] ....... [OKAY]
vm-07-05: inference_core_ops ..... [NO] ....... [OKAY]
vm-07-05: cutlass_ops ............ [NO] ....... [OKAY]
vm-07-05: quantizer .............. [NO] ....... [OKAY]
vm-07-05: ragged_device_ops ...... [NO] ....... [OKAY]
vm-07-05: ragged_ops ............. [NO] ....... [OKAY]
vm-07-05: random_ltd ............. [NO] ....... [OKAY]
vm-07-05:  [WARNING]  sparse_attn is not compatible with ROCM
vm-07-05: sparse_attn ............ [NO] ....... [NO]
vm-07-05: spatial_inference ...... [NO] ....... [OKAY]
vm-07-05: transformer ............ [NO] ....... [OKAY]
vm-07-05: stochastic_transformer . [NO] ....... [OKAY]
vm-07-05: transformer_inference .. [NO] ....... [OKAY]
vm-07-05: --------------------------------------------------
vm-07-05: DeepSpeed general environment info:
vm-07-05: torch install path ............... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch']
vm-07-05: torch version .................... 2.1.0a0+gita09f30a
vm-07-05: deepspeed install path ........... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed']
vm-07-05: deepspeed info ................... 0.12.3, unknown, unknown
vm-07-05: torch cuda version ............... None
vm-07-05: torch hip version ................ 5.7.31920-f5021ed14
vm-07-05: nvcc version ..................... None
vm-07-05: deepspeed wheel compiled w. ...... torch 2.1, hip 5.7
vm-07-05: shared memory (/dev/shm) size .... 865.10 GB
vm-07-05: async_io ............... [NO] ....... [OKAY]
vm-07-05: fused_adam ............. [NO] ....... [OKAY]
vm-07-05: cpu_adam ............... [NO] ....... [OKAY]
vm-07-05: cpu_adagrad ............ [NO] ....... [OKAY]
vm-07-05: cpu_lion ............... [NO] ....... [OKAY]
vm-07-05:  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
vm-07-05: evoformer_attn ......... [NO] ....... [NO]
vm-07-05: fused_lamb ............. [NO] ....... [OKAY]
vm-07-05: fused_lion ............. [NO] ....... [OKAY]
vm-07-05: inference_core_ops ..... [NO] ....... [OKAY]
vm-07-05: cutlass_ops ............ [NO] ....... [OKAY]
vm-07-05: quantizer .............. [NO] ....... [OKAY]
vm-07-05: ragged_device_ops ...... [NO] ....... [OKAY]
vm-07-05: ragged_ops ............. [NO] ....... [OKAY]
vm-07-05: random_ltd ............. [NO] ....... [OKAY]
vm-07-05:  [WARNING]  sparse_attn is not compatible with ROCM
vm-07-05: sparse_attn ............ [NO] ....... [NO]
vm-07-05: spatial_inference ...... [NO] ....... [OKAY]
vm-07-05: transformer ............ [NO] ....... [OKAY]
vm-07-05: stochastic_transformer . [NO] ....... [OKAY]
vm-07-05: transformer_inference .. [NO] ....... [OKAY]
vm-07-05: --------------------------------------------------
vm-07-05: DeepSpeed general environment info:
vm-07-05: torch install path ............... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch']
vm-07-05: torch version .................... 2.1.0a0+gita09f30a
vm-07-05: deepspeed install path ........... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed']
vm-07-05: deepspeed info ................... 0.12.3, unknown, unknown
vm-07-05: torch cuda version ............... None
vm-07-05: torch hip version ................ 5.7.31920-f5021ed14
vm-07-05: nvcc version ..................... None
vm-07-05: deepspeed wheel compiled w. ...... torch 2.1, hip 5.7
vm-07-05: shared memory (/dev/shm) size .... 865.10 GB
vm-07-14: --------------------------------------------------
vm-07-14: DeepSpeed C++/CUDA extension op report
vm-07-14: --------------------------------------------------
vm-07-14: NOTE: Ops not installed will be just-in-time (JIT) compiled at
vm-07-14: runtime if needed. Op compatibility means that your system
vm-07-14: meet the required dependencies to JIT install the op.
vm-07-14: --------------------------------------------------
vm-07-14: JIT compiled ops requires ninja
vm-07-14: ninja .................. [OKAY]
vm-07-14: --------------------------------------------------
vm-07-14: op name ................ installed .. compatible
vm-07-14: --------------------------------------------------
vm-07-05: async_io ............... [NO] ....... [OKAY]
vm-07-05: fused_adam ............. [NO] ....... [OKAY]
vm-07-05: cpu_adam ............... [NO] ....... [OKAY]
vm-07-05: cpu_adagrad ............ [NO] ....... [OKAY]
vm-07-05: cpu_lion ............... [NO] ....... [OKAY]
vm-07-05:  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
vm-07-05: evoformer_attn ......... [NO] ....... [NO]
vm-07-05: fused_lamb ............. [NO] ....... [OKAY]
vm-07-05: fused_lion ............. [NO] ....... [OKAY]
vm-07-05: inference_core_ops ..... [NO] ....... [OKAY]
vm-07-05: cutlass_ops ............ [NO] ....... [OKAY]
vm-07-05: quantizer .............. [NO] ....... [OKAY]
vm-07-05: ragged_device_ops ...... [NO] ....... [OKAY]
vm-07-05: ragged_ops ............. [NO] ....... [OKAY]
vm-07-05: random_ltd ............. [NO] ....... [OKAY]
vm-07-05:  [WARNING]  sparse_attn is not compatible with ROCM
vm-07-05: sparse_attn ............ [NO] ....... [NO]
vm-07-05: spatial_inference ...... [NO] ....... [OKAY]
vm-07-05: transformer ............ [NO] ....... [OKAY]
vm-07-05: stochastic_transformer . [NO] ....... [OKAY]
vm-07-05: transformer_inference .. [NO] ....... [OKAY]
vm-07-05: --------------------------------------------------
vm-07-05: DeepSpeed general environment info:
vm-07-05: torch install path ............... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch']
vm-07-05: torch version .................... 2.1.0a0+gita09f30a
vm-07-05: deepspeed install path ........... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed']
vm-07-05: deepspeed info ................... 0.12.3, unknown, unknown
vm-07-05: torch cuda version ............... None
vm-07-05: torch hip version ................ 5.7.31920-f5021ed14
vm-07-05: nvcc version ..................... None
vm-07-05: deepspeed wheel compiled w. ...... torch 2.1, hip 5.7
vm-07-05: shared memory (/dev/shm) size .... 865.10 GB
vm-07-05: **** Git info for Megatron: git_hash=82d83b8 git_branch=main ****
vm-07-05: **** Git info for Megatron: git_hash=82d83b8 git_branch=main ****
vm-07-05: INFO: overriding default arguments for tokenizer_type:None with tokenizer_type:GPT2BPETokenizer
vm-07-05: using world size: 16, data-parallel-size: 16, sequence-parallel size: 1, tensor-model-parallel size: 1, pipeline-model-parallel size: 1
vm-07-05: using torch.float16 for parameters ...
vm-07-05: ------------------------ arguments ------------------------
vm-07-05: accumulate_allreduce_grads_in_fp32 .............. False
vm-07-05: adam_beta1 ...................................... 0.9
vm-07-05: adam_beta2 ...................................... 0.95
vm-07-05: adam_eps ........................................ 1e-08
vm-07-05: add_bias_linear ................................. True
vm-07-05: add_position_embedding .......................... False
vm-07-05: adlr_autoresume ................................. False
vm-07-05: adlr_autoresume_interval ........................ 1000
vm-07-05: aml_data_download_path .......................... None
vm-07-05: apply_layernorm_1p .............................. False
vm-07-05: apply_query_key_layer_scaling ................... True
vm-07-05: apply_residual_connection_post_layernorm ........ False
vm-07-05: async_tensor_model_parallel_allreduce ........... False
vm-07-05: attention_dropout ............................... 0.0
vm-07-05: attention_softmax_in_fp32 ....................... False
vm-07-05: barrier_with_L1_time ............................ True
vm-07-05: bert_binary_head ................................ True
vm-07-05: bert_embedder_type .............................. megatron
vm-07-05: bert_load ....................................... None
vm-07-05: bf16 ............................................ False
vm-07-05: bias_dropout_fusion ............................. True
vm-07-05: bias_gelu_fusion ................................ True
vm-07-05: biencoder_projection_dim ........................ 0
vm-07-05: biencoder_shared_query_context_model ............ False
vm-07-05: block_data_path ................................. None
vm-07-05: checkpoint_activations .......................... False
vm-07-05: checkpoint_in_cpu ............................... False
vm-07-05: checkpoint_num_layers ........................... 1
vm-07-05: classes_fraction ................................ 1.0
vm-07-05: clip_grad ....................................... 1.0
vm-07-05: compression_training ............................ False
vm-07-05: consumed_train_samples .......................... 0
vm-07-05: consumed_train_tokens ........................... 0
vm-07-05: consumed_valid_samples .......................... 0
vm-07-05: contagious_checkpointing ........................ False
vm-07-05: cpu_optimizer ................................... False
vm-07-05: cpu_torch_adam .................................. False
vm-07-05: create_moe_param_group .......................... False
vm-07-05: curriculum_learning_legacy ...................... False
vm-07-05: data_cache_path ................................. None
vm-07-05: data_efficiency_curriculum_learning ............. False
vm-07-05: data_impl ....................................... mmap
vm-07-05: data_parallel_random_init ....................... False
vm-07-05: data_parallel_size .............................. 16
vm-07-05: data_path ....................................... ['/root//dataset_text_sentence']
vm-07-05: data_per_class_fraction ......................... 1.0
vm-07-05: data_sharding ................................... True
vm-07-05: dataloader_type ................................. single
vm-07-05: DDP_impl ........................................ local
vm-07-05: decoder_num_layers .............................. None
vm-07-05: decoder_seq_length .............................. None
vm-07-05: deepscale ....................................... False
vm-07-05: deepscale_config ................................ None
vm-07-05: deepspeed ....................................... True
vm-07-05: deepspeed_activation_checkpointing .............. False
vm-07-05: deepspeed_config ................................ ds_config_gbs2048_mbs2_log1_zero1.json
vm-07-05: deepspeed_mpi ................................... False
vm-07-05: dino_bottleneck_size ............................ 256
vm-07-05: dino_freeze_last_layer .......................... 1
vm-07-05: dino_head_hidden_size ........................... 2048
vm-07-05: dino_local_crops_number ......................... 10
vm-07-05: dino_local_img_size ............................. 96
vm-07-05: dino_norm_last_layer ............................ False
vm-07-05: dino_teacher_temp ............................... 0.07
vm-07-05: dino_warmup_teacher_temp ........................ 0.04
vm-07-05: dino_warmup_teacher_temp_epochs ................. 30
vm-07-05: distribute_checkpointed_activations ............. False
vm-07-05: distribute_saved_activations .................... False
vm-07-05: distributed_backend ............................. nccl
vm-07-05: distributed_timeout_minutes ..................... 10
vm-07-05: ds_inference .................................... False
vm-07-05: ds_pipeline_enabled ............................. False
vm-07-05: ds_sequence_parallel_size ....................... 1
vm-07-05: embedding_path .................................. None
vm-07-05: embedding_weights_in_fp32 ....................... False
vm-07-05: empty_unused_memory_level ....................... 0
vm-07-05: enable_expert_tensor_parallelism ................ False
vm-07-05: encoder_num_layers .............................. 32
vm-07-05: encoder_seq_length .............................. 2048
vm-07-05: end_weight_decay ................................ 0.1
vm-07-05: eod_mask_loss ................................... False
vm-07-05: eval_interval ................................... 500
vm-07-05: eval_iters ...................................... 10
vm-07-05: evidence_data_path .............................. None
vm-07-05: exit_duration_in_mins ........................... 30000000
vm-07-05: exit_interval ................................... None
vm-07-05: exit_on_missing_checkpoint ...................... False
vm-07-05: exit_signal_handler ............................. False
vm-07-05: expert_interval ................................. 2
vm-07-05: ffn_hidden_size ................................. 16384
vm-07-05: finetune ........................................ False
vm-07-05: force_ds_sequence_parallel ...................... False
vm-07-05: fp16 ............................................ True
vm-07-05: fp16_lm_cross_entropy ........................... False
vm-07-05: fp32_residual_connection ........................ False
vm-07-05: fp8_amax_compute_algo ........................... most_recent
vm-07-05: fp8_amax_history_len ............................ 1
vm-07-05: fp8_e4m3 ........................................ False
vm-07-05: fp8_hybrid ...................................... False
vm-07-05: fp8_interval .................................... 1
vm-07-05: fp8_margin ...................................... 0
vm-07-05: fp8_wgrad ....................................... True
vm-07-05: global_batch_size ............................... 2048
vm-07-05: gradient_accumulation_fusion .................... False
vm-07-05: head_lr_mult .................................... 1.0
vm-07-05: hidden_dropout .................................. 0.0
vm-07-05: hidden_size ..................................... 4096
vm-07-05: hidden_size_teacher ............................. None
vm-07-05: hysteresis ...................................... 2
vm-07-05: ict_head_size ................................... None
vm-07-05: ict_load ........................................ None
vm-07-05: img_h ........................................... 224
vm-07-05: img_w ........................................... 224
vm-07-05: indexer_batch_size .............................. 128
vm-07-05: indexer_log_interval ............................ 1000
vm-07-05: inference ....................................... False
vm-07-05: inference_batch_times_seqlen_threshold .......... 512
vm-07-05: init_method_std ................................. 0.009
vm-07-05: init_method_xavier_uniform ...................... False
vm-07-05: initial_loss_scale .............................. 4294967296
vm-07-05: iter_per_epoch .................................. 1250
vm-07-05: kd .............................................. False
vm-07-05: kd_alpha_ce ..................................... 1
vm-07-05: kd_beta_ce ...................................... 1
vm-07-05: kd_temp ......................................... 1.0
vm-07-05: kv_channels ..................................... 128
vm-07-05: layernorm_epsilon ............................... 1e-05
vm-07-05: lazy_mpu_init ................................... None
vm-07-05: load ............................................ .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase
vm-07-05: load_teacher .................................... None
vm-07-05: local_rank ...................................... 0
vm-07-05: log_batch_size_to_tensorboard ................... False
vm-07-05: log_interval .................................... 1
vm-07-05: log_learning_rate_to_tensorboard ................ True
vm-07-05: log_loss_scale_to_tensorboard ................... True
vm-07-05: log_memory_to_tensorboard ....................... False
vm-07-05: log_num_zeros_in_grad ........................... False
vm-07-05: log_optimizer_states_to_tensorboard ............. False
vm-07-05: log_params_norm ................................. False
vm-07-05: log_timers_to_tensorboard ....................... False
vm-07-05: log_validation_ppl_to_tensorboard ............... False
vm-07-05: log_world_size_to_tensorboard ................... False
vm-07-05: loss_scale ...................................... None
vm-07-05: loss_scale_window ............................... 1000
vm-07-05: lr .............................................. 0.00012
vm-07-05: lr_decay_iters .................................. None
vm-07-05: lr_decay_samples ................................ 43945312
vm-07-05: lr_decay_style .................................. cosine
vm-07-05: lr_decay_tokens ................................. None
vm-07-05: lr_warmup_fraction .............................. None
vm-07-05: lr_warmup_iters ................................. 0
vm-07-05: lr_warmup_samples ............................... 2048000
vm-07-05: lr_warmup_tokens ................................ None
vm-07-05: make_vocab_size_divisible_by .................... 128
vm-07-05: mask_factor ..................................... 1.0
vm-07-05: mask_prob ....................................... 0.15
vm-07-05: mask_type ....................................... random
vm-07-05: masked_softmax_fusion ........................... True
vm-07-05: max_position_embeddings ......................... 2048
vm-07-05: max_tokens_to_oom ............................... 12000
vm-07-05: mem_efficient_ln ................................ True
vm-07-05: memory_centric_tiled_linear ..................... False
vm-07-05: merge_file ...................................... gpt2-merges.txt
vm-07-05: micro_batch_size ................................ 2
vm-07-05: min_loss_scale .................................. 1.0
vm-07-05: min_lr .......................................... 1e-06
vm-07-05: mlp_type ........................................ standard
vm-07-05: mmap_warmup ..................................... False
vm-07-05: moe_eval_capacity_factor ........................ 1.0
vm-07-05: moe_expert_parallel_size ........................ 1
vm-07-05: moe_loss_coeff .................................. 0.1
vm-07-05: moe_min_capacity ................................ 4
vm-07-05: moe_token_dropping .............................. True
vm-07-05: moe_train_capacity_factor ....................... 1.0
vm-07-05: mos ............................................. False
vm-07-05: no_load_lr_state ................................ False
vm-07-05: no_load_optim ................................... None
vm-07-05: no_load_rng ..................................... None
vm-07-05: no_persist_layer_norm ........................... False
vm-07-05: no_pipeline_parallel ............................ True
vm-07-05: no_save_optim ................................... None
vm-07-05: no_save_rng ..................................... None
vm-07-05: normalization ................................... layernorm
vm-07-05: num_attention_heads ............................. 32
vm-07-05: num_attention_heads_teacher ..................... None
vm-07-05: num_channels .................................... 3
vm-07-05: num_classes ..................................... 1000
vm-07-05: num_experts ..................................... [1]
vm-07-05: num_experts_switch .............................. None
vm-07-05: num_experts_teacher ............................. [1]
vm-07-05: num_key_value_heads ............................. 32
vm-07-05: num_layers ...................................... 32
vm-07-05: num_layers_per_virtual_pipeline_stage ........... None
vm-07-05: num_layers_teacher .............................. None
vm-07-05: num_workers ..................................... 2
vm-07-05: onnx_safe ....................................... None
vm-07-05: openai_gelu ..................................... False
vm-07-05: optimizer ....................................... adam
vm-07-05: output_bert_embeddings .......................... False
vm-07-05: overlap_p2p_comm ................................ False
vm-07-05: override_opt_param_scheduler .................... True
vm-07-05: params_dtype .................................... torch.float16
vm-07-05: partition_activations ........................... False
vm-07-05: patch_dim ....................................... 16
vm-07-05: perform_initialization .......................... True
vm-07-05: pipeline_model_parallel_size .................... 1
vm-07-05: pipeline_model_parallel_split_rank .............. None
vm-07-05: profile_backward ................................ False
vm-07-05: query_in_block_prob ............................. 0.1
vm-07-05: rampup_batch_size ............................... None
vm-07-05: random_ltd ...................................... False
vm-07-05: rank ............................................ 0
vm-07-05: recompute_granularity ........................... None
vm-07-05: recompute_method ................................ None
vm-07-05: recompute_num_layers ............................ 1
vm-07-05: remote_device ................................... none
vm-07-05: reset_attention_mask ............................ False
vm-07-05: reset_iteration ................................. False
vm-07-05: reset_position_ids .............................. False
vm-07-05: retriever_report_topk_accuracies ................ []
vm-07-05: retriever_score_scaling ......................... False
vm-07-05: retriever_seq_length ............................ 256
vm-07-05: retro_add_retriever ............................. False
vm-07-05: retro_cyclic_train_iters ........................ None
vm-07-05: retro_encoder_attention_dropout ................. 0.1
vm-07-05: retro_encoder_hidden_dropout .................... 0.1
vm-07-05: retro_encoder_layers ............................ 2
vm-07-05: retro_num_neighbors ............................. 2
vm-07-05: retro_num_retrieved_chunks ...................... 2
vm-07-05: retro_return_doc_ids ............................ False
vm-07-05: retro_workdir ................................... None
vm-07-05: return_data_index ............................... False
vm-07-05: rotary_percent .................................. 1.0
vm-07-05: sample_rate ..................................... 1.0
vm-07-05: save ............................................ .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase
vm-07-05: save_interval ................................... 10000
vm-07-05: scatter_gather_tensors_in_pipeline .............. True
vm-07-05: scattered_embeddings ............................ False
vm-07-05: seed ............................................ 1234
vm-07-05: seq_length ...................................... 2048
vm-07-05: sequence_parallel ............................... False
vm-07-05: sgd_momentum .................................... 0.9
vm-07-05: short_seq_prob .................................. 0.1
vm-07-05: skip_train ...................................... False
vm-07-05: split ........................................... 949,50,1
vm-07-05: split_transformers .............................. False
vm-07-05: squared_relu .................................... False
vm-07-05: standalone_embedding_stage ...................... False
vm-07-05: start_weight_decay .............................. 0.1
vm-07-05: swiglu .......................................... False
vm-07-05: swin_backbone_type .............................. tiny
vm-07-05: synchronize_each_layer .......................... False
vm-07-05: tensor_model_parallel_size ...................... 1
vm-07-05: tensorboard_dir ................................. None
vm-07-05: tensorboard_log_interval ........................ 1
vm-07-05: tensorboard_queue_size .......................... 1000
vm-07-05: test_data_path .................................. None
vm-07-05: tile_factor ..................................... 1
vm-07-05: timing_log_level ................................ 0
vm-07-05: timing_log_option ............................... minmax
vm-07-05: titles_data_path ................................ None
vm-07-05: tokenizer_model ................................. None
vm-07-05: tokenizer_type .................................. GPT2BPETokenizer
vm-07-05: topk ............................................ 1
vm-07-05: train_data_exact_num_epochs ..................... None
vm-07-05: train_data_path ................................. None
vm-07-05: train_desc_path ................................. None
vm-07-05: train_doc_idx_path .............................. None
vm-07-05: train_idx_path .................................. None
vm-07-05: train_iters ..................................... None
vm-07-05: train_sample_idx_path ........................... None
vm-07-05: train_samples ................................... 10240
vm-07-05: train_shuffle_idx_path .......................... None
vm-07-05: train_tokens .................................... 300000000000
vm-07-05: transformer_impl ................................ local
vm-07-05: transformer_pipeline_model_parallel_size ........ 1
vm-07-05: universal_checkpoint ............................ False
vm-07-05: untie_embeddings_and_output_weights ............. False
vm-07-05: use_checkpoint_args ............................. False
vm-07-05: use_checkpoint_opt_param_scheduler .............. False
vm-07-05: use_contiguous_buffers_in_local_ddp ............. True
vm-07-05: use_cpu_initialization .......................... None
vm-07-05: use_dataset_only ................................ False
vm-07-05: use_distributed_optimizer ....................... True
vm-07-05: use_flash_attn .................................. False
vm-07-05: use_flash_attn_triton ........................... False
vm-07-05: use_flash_attn_v1 ............................... False
vm-07-05: use_flash_attn_v2 ............................... False
vm-07-05: use_one_sent_docs ............................... False
vm-07-05: use_pin_memory .................................. False
vm-07-05: use_ring_exchange_p2p ........................... False
vm-07-05: use_rotary_position_embeddings .................. True
vm-07-05: use_tutel ....................................... False
vm-07-05: valid_data_path ................................. None
vm-07-05: variable_seq_lengths ............................ False
vm-07-05: virtual_pipeline_model_parallel_size ............ None
vm-07-05: vision_backbone_type ............................ vit
vm-07-05: vision_pretraining .............................. False
vm-07-05: vision_pretraining_type ......................... classify
vm-07-05: vocab_extra_ids ................................. 0
vm-07-05: vocab_file ...................................... gpt2-vocab.json
vm-07-05: vocab_size ...................................... None
vm-07-05: weight_decay .................................... 0.1
vm-07-05: weight_decay_incr_style ......................... constant
vm-07-05: world_size ...................................... 16
vm-07-05: zero_allgather_bucket_size ...................... 0.0
vm-07-05: zero_contagious_gradients ....................... False
vm-07-05: zero_reduce_bucket_size ......................... 0.0
vm-07-05: zero_reduce_scatter ............................. False
vm-07-05: zero_stage ...................................... 1
vm-07-05: -------------------- end of arguments ---------------------
vm-07-05: setting number of micro-batches to constant 64
vm-07-05: > building GPT2BPETokenizer tokenizer ...
vm-07-05: **** Git info for Megatron: git_hash=82d83b8 git_branch=main ****
vm-07-14: async_io ............... [NO] ....... [OKAY]
vm-07-14: fused_adam ............. [NO] ....... [OKAY]
vm-07-14: cpu_adam ............... [NO] ....... [OKAY]
vm-07-14: cpu_adagrad ............ [NO] ....... [OKAY]
vm-07-14: cpu_lion ............... [NO] ....... [OKAY]
vm-07-14:  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
vm-07-14: evoformer_attn ......... [NO] ....... [NO]
vm-07-14: fused_lamb ............. [NO] ....... [OKAY]
vm-07-14: fused_lion ............. [NO] ....... [OKAY]
vm-07-14: inference_core_ops ..... [NO] ....... [OKAY]
vm-07-14: cutlass_ops ............ [NO] ....... [OKAY]
vm-07-14: quantizer .............. [NO] ....... [OKAY]
vm-07-14: ragged_device_ops ...... [NO] ....... [OKAY]
vm-07-14: ragged_ops ............. [NO] ....... [OKAY]
vm-07-14: random_ltd ............. [NO] ....... [OKAY]
vm-07-14:  [WARNING]  sparse_attn is not compatible with ROCM
vm-07-14: sparse_attn ............ [NO] ....... [NO]
vm-07-14: spatial_inference ...... [NO] ....... [OKAY]
vm-07-14: transformer ............ [NO] ....... [OKAY]
vm-07-14: stochastic_transformer . [NO] ....... [OKAY]
vm-07-14: transformer_inference .. [NO] ....... [OKAY]
vm-07-14: --------------------------------------------------
vm-07-14: DeepSpeed general environment info:
vm-07-14: torch install path ............... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch']
vm-07-14: torch version .................... 2.1.0a0+gita09f30a
vm-07-14: deepspeed install path ........... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed']
vm-07-14: deepspeed info ................... 0.12.3, unknown, unknown
vm-07-14: torch cuda version ............... None
vm-07-14: torch hip version ................ 5.7.31920-f5021ed14
vm-07-14: nvcc version ..................... None
vm-07-14: deepspeed wheel compiled w. ...... torch 2.1, hip 5.7
vm-07-14: shared memory (/dev/shm) size .... 865.10 GB
vm-07-05: --------------------------------------------------
vm-07-05: DeepSpeed C++/CUDA extension op report
vm-07-05: --------------------------------------------------
vm-07-05: NOTE: Ops not installed will be just-in-time (JIT) compiled at
vm-07-05: runtime if needed. Op compatibility means that your system
vm-07-05: meet the required dependencies to JIT install the op.
vm-07-05: --------------------------------------------------
vm-07-05: JIT compiled ops requires ninja
vm-07-05: ninja .................. [OKAY]
vm-07-05: --------------------------------------------------
vm-07-05: op name ................ installed .. compatible
vm-07-05: --------------------------------------------------
vm-07-05: [2023-11-29 08:50:53,882] [INFO] [comm.py:637:init_distributed] cdb=None
vm-07-05: --------------------------------------------------
vm-07-05: DeepSpeed C++/CUDA extension op report
vm-07-05: --------------------------------------------------
vm-07-05: NOTE: Ops not installed will be just-in-time (JIT) compiled at
vm-07-05: runtime if needed. Op compatibility means that your system
vm-07-05: meet the required dependencies to JIT install the op.
vm-07-05: --------------------------------------------------
vm-07-05: JIT compiled ops requires ninja
vm-07-05: ninja .................. [OKAY]
vm-07-05: --------------------------------------------------
vm-07-05: op name ................ installed .. compatible
vm-07-05: --------------------------------------------------
vm-07-05: > padded vocab (size: 50257) with 47 dummy tokens (new size: 50304)
vm-07-05: > initializing torch distributed ...
vm-07-05: [2023-11-29 08:50:53,893] [INFO] [comm.py:637:init_distributed] cdb=None
vm-07-05: [2023-11-29 08:50:53,893] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
vm-07-05: [2023-11-29 08:50:53,901] [INFO] [comm.py:637:init_distributed] cdb=None
vm-07-14: **** Git info for Megatron: git_hash=82d83b8 git_branch=main ****
vm-07-05: --------------------------------------------------
vm-07-05: DeepSpeed C++/CUDA extension op report
vm-07-05: --------------------------------------------------
vm-07-05: NOTE: Ops not installed will be just-in-time (JIT) compiled at
vm-07-05: runtime if needed. Op compatibility means that your system
vm-07-05: meet the required dependencies to JIT install the op.
vm-07-05: --------------------------------------------------
vm-07-05: JIT compiled ops requires ninja
vm-07-05: ninja .................. [OKAY]
vm-07-05: --------------------------------------------------
vm-07-05: op name ................ installed .. compatible
vm-07-05: --------------------------------------------------
vm-07-05: async_io ............... [NO] ....... [OKAY]
vm-07-05: fused_adam ............. [NO] ....... [OKAY]
vm-07-05: cpu_adam ............... [NO] ....... [OKAY]
vm-07-05: cpu_adagrad ............ [NO] ....... [OKAY]
vm-07-05: cpu_lion ............... [NO] ....... [OKAY]
vm-07-05:  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
vm-07-05: evoformer_attn ......... [NO] ....... [NO]
vm-07-05: fused_lamb ............. [NO] ....... [OKAY]
vm-07-05: fused_lion ............. [NO] ....... [OKAY]
vm-07-05: inference_core_ops ..... [NO] ....... [OKAY]
vm-07-05: cutlass_ops ............ [NO] ....... [OKAY]
vm-07-05: quantizer .............. [NO] ....... [OKAY]
vm-07-05: ragged_device_ops ...... [NO] ....... [OKAY]
vm-07-05: ragged_ops ............. [NO] ....... [OKAY]
vm-07-05: random_ltd ............. [NO] ....... [OKAY]
vm-07-05:  [WARNING]  sparse_attn is not compatible with ROCM
vm-07-05: sparse_attn ............ [NO] ....... [NO]
vm-07-05: spatial_inference ...... [NO] ....... [OKAY]
vm-07-05: transformer ............ [NO] ....... [OKAY]
vm-07-05: stochastic_transformer . [NO] ....... [OKAY]
vm-07-05: transformer_inference .. [NO] ....... [OKAY]
vm-07-05: --------------------------------------------------
vm-07-05: DeepSpeed general environment info:
vm-07-05: torch install path ............... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch']
vm-07-05: torch version .................... 2.1.0a0+gita09f30a
vm-07-05: deepspeed install path ........... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed']
vm-07-05: deepspeed info ................... 0.12.3, unknown, unknown
vm-07-05: torch cuda version ............... None
vm-07-05: torch hip version ................ 5.7.31920-f5021ed14
vm-07-05: nvcc version ..................... None
vm-07-05: deepspeed wheel compiled w. ...... torch 2.1, hip 5.7
vm-07-05: shared memory (/dev/shm) size .... 865.10 GB
vm-07-05: async_io ............... [NO] ....... [OKAY]
vm-07-05: fused_adam ............. [NO] ....... [OKAY]
vm-07-05: cpu_adam ............... [NO] ....... [OKAY]
vm-07-05: cpu_adagrad ............ [NO] ....... [OKAY]
vm-07-05: cpu_lion ............... [NO] ....... [OKAY]
vm-07-05:  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
vm-07-05: evoformer_attn ......... [NO] ....... [NO]
vm-07-05: fused_lamb ............. [NO] ....... [OKAY]
vm-07-05: fused_lion ............. [NO] ....... [OKAY]
vm-07-05: inference_core_ops ..... [NO] ....... [OKAY]
vm-07-05: cutlass_ops ............ [NO] ....... [OKAY]
vm-07-05: quantizer .............. [NO] ....... [OKAY]
vm-07-05: ragged_device_ops ...... [NO] ....... [OKAY]
vm-07-05: ragged_ops ............. [NO] ....... [OKAY]
vm-07-05: random_ltd ............. [NO] ....... [OKAY]
vm-07-05:  [WARNING]  sparse_attn is not compatible with ROCM
vm-07-05: sparse_attn ............ [NO] ....... [NO]
vm-07-05: spatial_inference ...... [NO] ....... [OKAY]
vm-07-05: transformer ............ [NO] ....... [OKAY]
vm-07-05: stochastic_transformer . [NO] ....... [OKAY]
vm-07-05: transformer_inference .. [NO] ....... [OKAY]
vm-07-05: --------------------------------------------------
vm-07-05: DeepSpeed general environment info:
vm-07-05: torch install path ............... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch']
vm-07-05: torch version .................... 2.1.0a0+gita09f30a
vm-07-05: deepspeed install path ........... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed']
vm-07-05: deepspeed info ................... 0.12.3, unknown, unknown
vm-07-05: torch cuda version ............... None
vm-07-05: torch hip version ................ 5.7.31920-f5021ed14
vm-07-05: nvcc version ..................... None
vm-07-05: deepspeed wheel compiled w. ...... torch 2.1, hip 5.7
vm-07-05: shared memory (/dev/shm) size .... 865.10 GB
vm-07-05: --------------------------------------------------
vm-07-05: DeepSpeed C++/CUDA extension op report
vm-07-05: --------------------------------------------------
vm-07-05: NOTE: Ops not installed will be just-in-time (JIT) compiled at
vm-07-05: runtime if needed. Op compatibility means that your system
vm-07-05: meet the required dependencies to JIT install the op.
vm-07-05: --------------------------------------------------
vm-07-05: JIT compiled ops requires ninja
vm-07-05: ninja .................. [OKAY]
vm-07-05: --------------------------------------------------
vm-07-05: op name ................ installed .. compatible
vm-07-05: --------------------------------------------------
vm-07-14: --------------------------------------------------
vm-07-14: DeepSpeed C++/CUDA extension op report
vm-07-14: --------------------------------------------------
vm-07-14: NOTE: Ops not installed will be just-in-time (JIT) compiled at
vm-07-14: runtime if needed. Op compatibility means that your system
vm-07-14: meet the required dependencies to JIT install the op.
vm-07-14: --------------------------------------------------
vm-07-14: JIT compiled ops requires ninja
vm-07-14: ninja .................. [OKAY]
vm-07-14: --------------------------------------------------
vm-07-14: op name ................ installed .. compatible
vm-07-14: --------------------------------------------------
vm-07-14: [2023-11-29 08:50:53,958] [INFO] [comm.py:637:init_distributed] cdb=None
vm-07-05: --------------------------------------------------
vm-07-05: DeepSpeed C++/CUDA extension op report
vm-07-05: --------------------------------------------------
vm-07-05: NOTE: Ops not installed will be just-in-time (JIT) compiled at
vm-07-05: runtime if needed. Op compatibility means that your system
vm-07-05: meet the required dependencies to JIT install the op.
vm-07-05: --------------------------------------------------
vm-07-05: JIT compiled ops requires ninja
vm-07-05: ninja .................. [OKAY]
vm-07-05: --------------------------------------------------
vm-07-05: op name ................ installed .. compatible
vm-07-05: --------------------------------------------------
vm-07-05: **** Git info for Megatron: git_hash=82d83b8 git_branch=main ****
vm-07-05: **** Git info for Megatron: git_hash=82d83b8 git_branch=main ****
vm-07-05: async_io ............... [NO] ....... [OKAY]
vm-07-05: fused_adam ............. [NO] ....... [OKAY]
vm-07-05: cpu_adam ............... [NO] ....... [OKAY]
vm-07-05: cpu_adagrad ............ [NO] ....... [OKAY]
vm-07-05: cpu_lion ............... [NO] ....... [OKAY]
vm-07-05:  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
vm-07-05: evoformer_attn ......... [NO] ....... [NO]
vm-07-05: fused_lamb ............. [NO] ....... [OKAY]
vm-07-05: fused_lion ............. [NO] ....... [OKAY]
vm-07-05: inference_core_ops ..... [NO] ....... [OKAY]
vm-07-05: cutlass_ops ............ [NO] ....... [OKAY]
vm-07-05: quantizer .............. [NO] ....... [OKAY]
vm-07-05: ragged_device_ops ...... [NO] ....... [OKAY]
vm-07-05: ragged_ops ............. [NO] ....... [OKAY]
vm-07-05: random_ltd ............. [NO] ....... [OKAY]
vm-07-05:  [WARNING]  sparse_attn is not compatible with ROCM
vm-07-05: sparse_attn ............ [NO] ....... [NO]
vm-07-05: spatial_inference ...... [NO] ....... [OKAY]
vm-07-05: transformer ............ [NO] ....... [OKAY]
vm-07-05: stochastic_transformer . [NO] ....... [OKAY]
vm-07-05: transformer_inference .. [NO] ....... [OKAY]
vm-07-05: --------------------------------------------------
vm-07-05: DeepSpeed general environment info:
vm-07-05: torch install path ............... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch']
vm-07-05: torch version .................... 2.1.0a0+gita09f30a
vm-07-05: deepspeed install path ........... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed']
vm-07-05: deepspeed info ................... 0.12.3, unknown, unknown
vm-07-05: torch cuda version ............... None
vm-07-05: torch hip version ................ 5.7.31920-f5021ed14
vm-07-05: nvcc version ..................... None
vm-07-05: deepspeed wheel compiled w. ...... torch 2.1, hip 5.7
vm-07-05: shared memory (/dev/shm) size .... 865.10 GB
vm-07-14: async_io ............... [NO] ....... [OKAY]
vm-07-14: fused_adam ............. [NO] ....... [OKAY]
vm-07-14: cpu_adam ............... [NO] ....... [OKAY]
vm-07-14: cpu_adagrad ............ [NO] ....... [OKAY]
vm-07-14: cpu_lion ............... [NO] ....... [OKAY]
vm-07-14:  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
vm-07-14: evoformer_attn ......... [NO] ....... [NO]
vm-07-14: fused_lamb ............. [NO] ....... [OKAY]
vm-07-14: fused_lion ............. [NO] ....... [OKAY]
vm-07-14: inference_core_ops ..... [NO] ....... [OKAY]
vm-07-14: cutlass_ops ............ [NO] ....... [OKAY]
vm-07-14: quantizer .............. [NO] ....... [OKAY]
vm-07-14: ragged_device_ops ...... [NO] ....... [OKAY]
vm-07-14: ragged_ops ............. [NO] ....... [OKAY]
vm-07-14: random_ltd ............. [NO] ....... [OKAY]
vm-07-14:  [WARNING]  sparse_attn is not compatible with ROCM
vm-07-14: sparse_attn ............ [NO] ....... [NO]
vm-07-14: spatial_inference ...... [NO] ....... [OKAY]
vm-07-14: transformer ............ [NO] ....... [OKAY]
vm-07-14: stochastic_transformer . [NO] ....... [OKAY]
vm-07-14: transformer_inference .. [NO] ....... [OKAY]
vm-07-14: --------------------------------------------------
vm-07-14: DeepSpeed general environment info:
vm-07-14: torch install path ............... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch']
vm-07-14: torch version .................... 2.1.0a0+gita09f30a
vm-07-14: deepspeed install path ........... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed']
vm-07-14: deepspeed info ................... 0.12.3, unknown, unknown
vm-07-14: torch cuda version ............... None
vm-07-14: torch hip version ................ 5.7.31920-f5021ed14
vm-07-14: nvcc version ..................... None
vm-07-14: deepspeed wheel compiled w. ...... torch 2.1, hip 5.7
vm-07-14: shared memory (/dev/shm) size .... 865.10 GB
vm-07-05: async_io ............... [NO] ....... [OKAY]
vm-07-05: fused_adam ............. [NO] ....... [OKAY]
vm-07-05: cpu_adam ............... [NO] ....... [OKAY]
vm-07-05: cpu_adagrad ............ [NO] ....... [OKAY]
vm-07-05: cpu_lion ............... [NO] ....... [OKAY]
vm-07-05:  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
vm-07-05: evoformer_attn ......... [NO] ....... [NO]
vm-07-05: fused_lamb ............. [NO] ....... [OKAY]
vm-07-05: fused_lion ............. [NO] ....... [OKAY]
vm-07-05: inference_core_ops ..... [NO] ....... [OKAY]
vm-07-05: cutlass_ops ............ [NO] ....... [OKAY]
vm-07-05: quantizer .............. [NO] ....... [OKAY]
vm-07-05: ragged_device_ops ...... [NO] ....... [OKAY]
vm-07-05: ragged_ops ............. [NO] ....... [OKAY]
vm-07-05: random_ltd ............. [NO] ....... [OKAY]
vm-07-05:  [WARNING]  sparse_attn is not compatible with ROCM
vm-07-05: sparse_attn ............ [NO] ....... [NO]
vm-07-05: spatial_inference ...... [NO] ....... [OKAY]
vm-07-05: transformer ............ [NO] ....... [OKAY]
vm-07-05: stochastic_transformer . [NO] ....... [OKAY]
vm-07-05: transformer_inference .. [NO] ....... [OKAY]
vm-07-05: --------------------------------------------------
vm-07-05: DeepSpeed general environment info:
vm-07-05: torch install path ............... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch']
vm-07-05: torch version .................... 2.1.0a0+gita09f30a
vm-07-05: deepspeed install path ........... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed']
vm-07-05: deepspeed info ................... 0.12.3, unknown, unknown
vm-07-05: torch cuda version ............... None
vm-07-05: torch hip version ................ 5.7.31920-f5021ed14
vm-07-05: nvcc version ..................... None
vm-07-05: deepspeed wheel compiled w. ...... torch 2.1, hip 5.7
vm-07-05: shared memory (/dev/shm) size .... 865.10 GB
vm-07-14: --------------------------------------------------
vm-07-14: DeepSpeed C++/CUDA extension op report
vm-07-14: --------------------------------------------------
vm-07-14: NOTE: Ops not installed will be just-in-time (JIT) compiled at
vm-07-14: runtime if needed. Op compatibility means that your system
vm-07-14: meet the required dependencies to JIT install the op.
vm-07-14: --------------------------------------------------
vm-07-14: JIT compiled ops requires ninja
vm-07-14: ninja .................. [OKAY]
vm-07-14: --------------------------------------------------
vm-07-14: op name ................ installed .. compatible
vm-07-14: --------------------------------------------------
vm-07-05: async_io ............... [NO] ....... [OKAY]
vm-07-05: fused_adam ............. [NO] ....... [OKAY]
vm-07-05: cpu_adam ............... [NO] ....... [OKAY]
vm-07-05: cpu_adagrad ............ [NO] ....... [OKAY]
vm-07-05: cpu_lion ............... [NO] ....... [OKAY]
vm-07-05:  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
vm-07-05: evoformer_attn ......... [NO] ....... [NO]
vm-07-05: fused_lamb ............. [NO] ....... [OKAY]
vm-07-05: fused_lion ............. [NO] ....... [OKAY]
vm-07-05: inference_core_ops ..... [NO] ....... [OKAY]
vm-07-05: cutlass_ops ............ [NO] ....... [OKAY]
vm-07-05: quantizer .............. [NO] ....... [OKAY]
vm-07-05: ragged_device_ops ...... [NO] ....... [OKAY]
vm-07-05: ragged_ops ............. [NO] ....... [OKAY]
vm-07-05: random_ltd ............. [NO] ....... [OKAY]
vm-07-05:  [WARNING]  sparse_attn is not compatible with ROCM
vm-07-05: sparse_attn ............ [NO] ....... [NO]
vm-07-05: spatial_inference ...... [NO] ....... [OKAY]
vm-07-05: transformer ............ [NO] ....... [OKAY]
vm-07-05: stochastic_transformer . [NO] ....... [OKAY]
vm-07-05: transformer_inference .. [NO] ....... [OKAY]
vm-07-05: --------------------------------------------------
vm-07-05: DeepSpeed general environment info:
vm-07-05: torch install path ............... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch']
vm-07-05: torch version .................... 2.1.0a0+gita09f30a
vm-07-05: deepspeed install path ........... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed']
vm-07-05: deepspeed info ................... 0.12.3, unknown, unknown
vm-07-05: torch cuda version ............... None
vm-07-05: torch hip version ................ 5.7.31920-f5021ed14
vm-07-05: nvcc version ..................... None
vm-07-05: deepspeed wheel compiled w. ...... torch 2.1, hip 5.7
vm-07-05: shared memory (/dev/shm) size .... 865.10 GB
vm-07-05: [2023-11-29 08:50:54,034] [INFO] [comm.py:637:init_distributed] cdb=None
vm-07-14: --------------------------------------------------
vm-07-14: DeepSpeed C++/CUDA extension op report
vm-07-14: --------------------------------------------------
vm-07-14: NOTE: Ops not installed will be just-in-time (JIT) compiled at
vm-07-14: runtime if needed. Op compatibility means that your system
vm-07-14: meet the required dependencies to JIT install the op.
vm-07-14: --------------------------------------------------
vm-07-14: JIT compiled ops requires ninja
vm-07-14: ninja .................. [OKAY]
vm-07-14: --------------------------------------------------
vm-07-14: op name ................ installed .. compatible
vm-07-14: --------------------------------------------------
vm-07-05: **** Git info for Megatron: git_hash=82d83b8 git_branch=main ****
vm-07-14: --------------------------------------------------
vm-07-14: DeepSpeed C++/CUDA extension op report
vm-07-14: --------------------------------------------------
vm-07-14: NOTE: Ops not installed will be just-in-time (JIT) compiled at
vm-07-14: runtime if needed. Op compatibility means that your system
vm-07-14: meet the required dependencies to JIT install the op.
vm-07-14: --------------------------------------------------
vm-07-14: JIT compiled ops requires ninja
vm-07-14: ninja .................. [OKAY]
vm-07-14: --------------------------------------------------
vm-07-14: op name ................ installed .. compatible
vm-07-14: --------------------------------------------------
vm-07-05: [2023-11-29 08:50:54,051] [INFO] [comm.py:637:init_distributed] cdb=None
vm-07-05: **** Git info for Megatron: git_hash=82d83b8 git_branch=main ****
vm-07-14: **** Git info for Megatron: git_hash=82d83b8 git_branch=main ****
vm-07-05: **** Git info for Megatron: git_hash=82d83b8 git_branch=main ****
vm-07-14: async_io ............... [NO] ....... [OKAY]
vm-07-14: fused_adam ............. [NO] ....... [OKAY]
vm-07-14: cpu_adam ............... [NO] ....... [OKAY]
vm-07-14: cpu_adagrad ............ [NO] ....... [OKAY]
vm-07-14: cpu_lion ............... [NO] ....... [OKAY]
vm-07-14:  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
vm-07-14: evoformer_attn ......... [NO] ....... [NO]
vm-07-14: fused_lamb ............. [NO] ....... [OKAY]
vm-07-14: fused_lion ............. [NO] ....... [OKAY]
vm-07-14: inference_core_ops ..... [NO] ....... [OKAY]
vm-07-14: cutlass_ops ............ [NO] ....... [OKAY]
vm-07-14: quantizer .............. [NO] ....... [OKAY]
vm-07-14: ragged_device_ops ...... [NO] ....... [OKAY]
vm-07-14: ragged_ops ............. [NO] ....... [OKAY]
vm-07-14: random_ltd ............. [NO] ....... [OKAY]
vm-07-14:  [WARNING]  sparse_attn is not compatible with ROCM
vm-07-14: sparse_attn ............ [NO] ....... [NO]
vm-07-14: spatial_inference ...... [NO] ....... [OKAY]
vm-07-14: transformer ............ [NO] ....... [OKAY]
vm-07-14: stochastic_transformer . [NO] ....... [OKAY]
vm-07-14: transformer_inference .. [NO] ....... [OKAY]
vm-07-14: --------------------------------------------------
vm-07-14: DeepSpeed general environment info:
vm-07-14: torch install path ............... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch']
vm-07-14: torch version .................... 2.1.0a0+gita09f30a
vm-07-14: deepspeed install path ........... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed']
vm-07-14: deepspeed info ................... 0.12.3, unknown, unknown
vm-07-14: torch cuda version ............... None
vm-07-14: torch hip version ................ 5.7.31920-f5021ed14
vm-07-14: nvcc version ..................... None
vm-07-14: deepspeed wheel compiled w. ...... torch 2.1, hip 5.7
vm-07-14: shared memory (/dev/shm) size .... 865.10 GB
vm-07-05: [2023-11-29 08:50:54,091] [INFO] [comm.py:637:init_distributed] cdb=None
vm-07-14: --------------------------------------------------
vm-07-14: DeepSpeed C++/CUDA extension op report
vm-07-14: --------------------------------------------------
vm-07-14: NOTE: Ops not installed will be just-in-time (JIT) compiled at
vm-07-14: runtime if needed. Op compatibility means that your system
vm-07-14: meet the required dependencies to JIT install the op.
vm-07-14: --------------------------------------------------
vm-07-14: JIT compiled ops requires ninja
vm-07-14: ninja .................. [OKAY]
vm-07-14: --------------------------------------------------
vm-07-14: op name ................ installed .. compatible
vm-07-14: --------------------------------------------------
vm-07-05: [2023-11-29 08:50:54,104] [INFO] [comm.py:637:init_distributed] cdb=None
vm-07-14: async_io ............... [NO] ....... [OKAY]
vm-07-14: fused_adam ............. [NO] .......async_io [OKAY]
vm-07-14: ............... [NO] ....... cpu_adam[OKAY]
vm-07-14: ............... [NO] ....... [OKAY]
vm-07-14: cpu_adagrad ............ [NO] ....... [OKAY]
vm-07-14: fused_adam cpu_lion............. ...............[NO] [NO]....... .......[OKAY]
vm-07-14: [OKAY]
vm-07-14: cpu_adam ............... [NO] ....... [OKAY]
vm-07-14: cpu_adagrad ............ [NO] ....... [OKAY]
vm-07-14:  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
vm-07-14: evoformer_attncpu_lion ........................ [NO][NO] .............. [NO][OKAY]
vm-07-14:
vm-07-14: fused_lamb ............. [NO] ....... [OKAY]
vm-07-14: fused_lion ............. [NO] ....... [OKAY]
vm-07-14:  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
vm-07-14: evoformer_attn ......... [NO] ....... [NO]
vm-07-14: fused_lamb ............. [NO] ....... [OKAY]
vm-07-14: inference_core_ops ..... [NO] ....... fused_lion[OKAY]
vm-07-14: ............. [NO] ....... [OKAY]
vm-07-14: cutlass_ops ............ [NO] ....... [OKAY]
vm-07-14: quantizer .............. [NO] ....... [OKAY]
vm-07-14: inference_core_ops ..... [NO] ....... [OKAY]
vm-07-14: ragged_device_ops ...... [NO] ....... [OKAY]
vm-07-14: cutlass_ops ............ [NO] ....... ragged_ops[OKAY]
vm-07-14: ............. [NO]quantizer ..................... [OKAY][NO]
vm-07-14: ....... [OKAY]random_ltd
vm-07-14: ............. [NO] ....... [OKAY]
vm-07-14: ragged_device_ops ...... [NO] ....... [WARNING]  sparse_attn is not compatible with ROCM
vm-07-14: [OKAY]
vm-07-14: sparse_attn ............ [NO]ragged_ops .................... [NO][NO]
vm-07-14: ....... [OKAY]
vm-07-14: random_ltdspatial_inference ................... [NO][NO] .............. [OKAY][OKAY]
vm-07-14:
vm-07-14: transformer ............ [WARNING]  sparse_attn is not compatible with ROCM
vm-07-14: [NO] sparse_attn....... ............[OKAY]
vm-07-14: [NO] ....... [NO]stochastic_transformer
vm-07-14: . [NO] ....... [OKAY]
vm-07-14: spatial_inference ...... [NO] ....... [OKAY]
vm-07-14: transformer ............transformer_inference [NO].. .......[NO] [OKAY].......
vm-07-14: [OKAY]
vm-07-14: --------------------------------------------------stochastic_transformer
vm-07-14: . [NO] ....... [OKAY]
vm-07-14: transformer_inference .. [NO] ....... [OKAY]
vm-07-14: --------------------------------------------------
vm-07-14: DeepSpeed general environment info:
vm-07-14: torch install path ............... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch']
vm-07-14: torch version .................... 2.1.0a0+gita09f30a
vm-07-14: deepspeed install path ........... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed']
vm-07-14: deepspeed info DeepSpeed general environment info:...................
vm-07-14: 0.12.3, unknown, unknown
vm-07-14: torch install pathtorch cuda version .............................. None
vm-07-14: torch hip version ................ 5.7.31920-f5021ed14['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch']
vm-07-14:
vm-07-14: nvcc version ..................... Nonetorch version
vm-07-14: deepspeed wheel compiled w..................... ...... 2.1.0a0+gita09f30atorch 2.1, hip 5.7
vm-07-14:
vm-07-14: deepspeed install pathshared memory (/dev/shm) size ............... 865.10 GB['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed']
vm-07-14:
vm-07-14: deepspeed info ................... 0.12.3, unknown, unknown
vm-07-14: torch cuda version ............... None
vm-07-14: torch hip version ................ 5.7.31920-f5021ed14
vm-07-14: nvcc version ..................... None
vm-07-14: deepspeed wheel compiled w. ...... torch 2.1, hip 5.7
vm-07-14: shared memory (/dev/shm) size .... 865.10 GB
vm-07-14: --------------------------------------------------
vm-07-14: DeepSpeed C++/CUDA extension op report
vm-07-14: --------------------------------------------------
vm-07-14: NOTE: Ops not installed will be just-in-time (JIT) compiled at
vm-07-14: runtime if needed. Op compatibility means that your system
vm-07-14: meet the required dependencies to JIT install the op.
vm-07-14: --------------------------------------------------
vm-07-14: JIT compiled ops requires ninja
vm-07-14: ninja .................. [OKAY]
vm-07-14: --------------------------------------------------
vm-07-14: op name ................ installed .. compatible
vm-07-14: --------------------------------------------------
vm-07-14: [2023-11-29 08:50:54,109] [INFO] [comm.py:637:init_distributed] cdb=None
vm-07-14: **** Git info for Megatron: git_hash=82d83b8 git_branch=main ****
vm-07-05: [2023-11-29 08:50:54,130] [INFO] [comm.py:637:init_distributed] cdb=None
vm-07-14: --------------------------------------------------
vm-07-14: DeepSpeed C++/CUDA extension op report
vm-07-14: --------------------------------------------------
vm-07-14: NOTE: Ops not installed will be just-in-time (JIT) compiled at
vm-07-14: runtime if needed. Op compatibility means that your system
vm-07-14: meet the required dependencies to JIT install the op.
vm-07-14: --------------------------------------------------
vm-07-14: JIT compiled ops requires ninja
vm-07-14: ninja .................. [OKAY]
vm-07-14: --------------------------------------------------
vm-07-14: op name ................ installed .. compatible
vm-07-14: --------------------------------------------------
vm-07-14: **** Git info for Megatron: git_hash=82d83b8 git_branch=main ****
vm-07-14: **** Git info for Megatron: git_hash=82d83b8 git_branch=main ****
vm-07-14: async_io ............... [NO] ....... [OKAY]
vm-07-14: fused_adam ............. [NO] ....... [OKAY]
vm-07-14: cpu_adam ............... [NO] ....... [OKAY]
vm-07-14: cpu_adagrad ............ [NO] ....... [OKAY]
vm-07-14: cpu_lion ............... [NO] ....... [OKAY]
vm-07-14:  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
vm-07-14: evoformer_attn ......... [NO] ....... [NO]
vm-07-14: fused_lamb ............. [NO] ....... [OKAY]
vm-07-14: fused_lion ............. [NO] ....... [OKAY]
vm-07-14: inference_core_ops ..... [NO] ....... [OKAY]
vm-07-14: cutlass_ops ............ [NO] ....... [OKAY]
vm-07-14: quantizer .............. [NO] ....... [OKAY]
vm-07-14: ragged_device_ops ...... [NO] ....... [OKAY]
vm-07-14: ragged_ops ............. [NO] ....... [OKAY]
vm-07-14: random_ltd ............. [NO] ....... [OKAY]
vm-07-14:  [WARNING]  sparse_attn is not compatible with ROCM
vm-07-14: sparse_attn ............ [NO] ....... [NO]
vm-07-14: spatial_inference ...... [NO] ....... [OKAY]
vm-07-14: transformer ............ [NO] ....... [OKAY]
vm-07-14: stochastic_transformer . [NO] ....... [OKAY]
vm-07-14: transformer_inference .. [NO] ....... [OKAY]
vm-07-14: --------------------------------------------------
vm-07-14: DeepSpeed general environment info:
vm-07-14: torch install path ............... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch']
vm-07-14: torch version .................... 2.1.0a0+gita09f30a
vm-07-14: deepspeed install path ........... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed']
vm-07-14: deepspeed info ................... 0.12.3, unknown, unknown
vm-07-14: torch cuda version ............... None
vm-07-14: torch hip version ................ 5.7.31920-f5021ed14
vm-07-14: nvcc version ..................... None
vm-07-14: deepspeed wheel compiled w. ...... torch 2.1, hip 5.7
vm-07-14: shared memory (/dev/shm) size .... 865.10 GB
vm-07-14: async_io ............... [NO] ....... [OKAY]
vm-07-14: fused_adam ............. [NO] ....... [OKAY]
vm-07-14: cpu_adam ............... [NO] ....... [OKAY]
vm-07-14: cpu_adagrad ............ [NO] ....... [OKAY]
vm-07-14: cpu_lion ............... [NO] ....... [OKAY]
vm-07-14:  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
vm-07-14: evoformer_attn ......... [NO] ....... [NO]
vm-07-14: fused_lamb ............. [NO] ....... [OKAY]
vm-07-14: fused_lion ............. [NO] ....... [OKAY]
vm-07-14: inference_core_ops ..... [NO] ....... [OKAY]
vm-07-14: cutlass_ops ............ [NO] ....... [OKAY]
vm-07-14: quantizer .............. [NO] ....... [OKAY]
vm-07-14: ragged_device_ops ...... [NO] ....... [OKAY]
vm-07-14: ragged_ops ............. [NO] ....... [OKAY]
vm-07-14: random_ltd ............. [NO] ....... [OKAY]
vm-07-14:  [WARNING]  sparse_attn is not compatible with ROCM
vm-07-14: sparse_attn ............ [NO] ....... [NO]
vm-07-14: spatial_inference ...... [NO] ....... [OKAY]
vm-07-14: transformer ............ [NO] ....... [OKAY]
vm-07-14: stochastic_transformer . [NO] ....... [OKAY]
vm-07-14: transformer_inference .. [NO] ....... [OKAY]
vm-07-14: --------------------------------------------------
vm-07-14: DeepSpeed general environment info:
vm-07-14: torch install path ............... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch']
vm-07-14: torch version .................... 2.1.0a0+gita09f30a
vm-07-14: deepspeed install path ........... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed']
vm-07-14: deepspeed info ................... 0.12.3, unknown, unknown
vm-07-14: torch cuda version ............... None
vm-07-14: torch hip version ................ 5.7.31920-f5021ed14
vm-07-14: nvcc version ..................... None
vm-07-14: deepspeed wheel compiled w. ...... torch 2.1, hip 5.7
vm-07-14: shared memory (/dev/shm) size .... 865.10 GB
vm-07-14: [2023-11-29 08:50:54,175] [INFO] [comm.py:637:init_distributed] cdb=None
vm-07-14: async_io ............... [NO] ....... [OKAY]
vm-07-14: fused_adam ............. [NO] ....... [OKAY]
vm-07-14: cpu_adam ............... [NO] ....... [OKAY]
vm-07-14: cpu_adagrad ............ [NO] ....... [OKAY]
vm-07-14: cpu_lion ............... [NO] ....... [OKAY]
vm-07-14: **** Git info for Megatron: git_hash=82d83b8 git_branch=main ****
vm-07-14:  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
vm-07-14: evoformer_attn ......... [NO] ....... [NO]
vm-07-14: fused_lamb ............. [NO] ....... [OKAY]
vm-07-14: fused_lion ............. [NO] ....... [OKAY]
vm-07-14: inference_core_ops ..... [NO] ....... [OKAY]
vm-07-14: cutlass_ops ............ [NO] ....... [OKAY]
vm-07-14: quantizer .............. [NO] ....... [OKAY]
vm-07-14: ragged_device_ops ...... [NO] ....... [OKAY]
vm-07-14: ragged_ops ............. [NO] ....... [OKAY]
vm-07-14: random_ltd ............. [NO] ....... [OKAY]
vm-07-14:  [WARNING]  sparse_attn is not compatible with ROCM
vm-07-14: sparse_attn ............ [NO] ....... [NO]
vm-07-14: spatial_inference ...... [NO] ....... [OKAY]
vm-07-14: transformer ............ [NO] ....... [OKAY]
vm-07-14: stochastic_transformer . [NO] ....... [OKAY]
vm-07-14: transformer_inference .. [NO] ....... [OKAY]
vm-07-14: --------------------------------------------------
vm-07-14: DeepSpeed general environment info:
vm-07-14: torch install path ............... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch']
vm-07-14: torch version .................... 2.1.0a0+gita09f30a
vm-07-14: deepspeed install path ........... ['/opt/conda/envs/py_3.9/lib/python3.9/site-packages/deepspeed']
vm-07-14: deepspeed info ................... 0.12.3, unknown, unknown
vm-07-14: torch cuda version ............... None
vm-07-14: torch hip version ................ 5.7.31920-f5021ed14
vm-07-14: nvcc version ..................... None
vm-07-14: deepspeed wheel compiled w. ...... torch 2.1, hip 5.7
vm-07-14: shared memory (/dev/shm) size .... 865.10 GB
vm-07-14: [2023-11-29 08:50:54,199] [INFO] [comm.py:637:init_distributed] cdb=None
vm-07-14: [2023-11-29 08:50:54,203] [INFO] [comm.py:637:init_distributed] cdb=None
vm-07-14: **** Git info for Megatron: git_hash=82d83b8 git_branch=main ****
vm-07-14: **** Git info for Megatron: git_hash=82d83b8 git_branch=main ****
vm-07-14: [2023-11-29 08:50:54,248] [INFO] [comm.py:637:init_distributed] cdb=None
vm-07-14: [2023-11-29 08:50:54,255] [INFO] [comm.py:637:init_distributed] cdb=None
vm-07-14: [2023-11-29 08:50:54,282] [INFO] [comm.py:637:init_distributed] cdb=None
vm-07-05: > initialized tensor model parallel with size 1
vm-07-05: > initialized pipeline model parallel with size 1
vm-07-05: > setting random seeds to 1234 ...
vm-07-05: > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234
vm-07-05: > compiling dataset index builder ...
vm-07-05: make: Entering directory '/root/Megatron-DeepSpeed/megatron/data'
vm-07-05: make: Nothing to be done for 'default'.
vm-07-05: make: Leaving directory '/root/Megatron-DeepSpeed/megatron/data'
vm-07-05: >>> done with dataset index builder. Compilation time: 0.047 seconds
vm-07-05: > compiling and loading fused kernels ...
vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp -> /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.cpp [skipped, already hipified]
vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified]
vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /root/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes]
vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /root/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes]
vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu -> /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.hip [skipped, already hipified]
vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /root/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes]
vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /root/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes]
vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified]
vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified]
vm-07-05: Total number of unsupported CUDA function calls: 0
vm-07-05:
vm-07-05:
vm-07-05: Total number of replaced kernel launches: 99
vm-07-05: [1/1] c++ scaled_upper_triang_masked_softmax_hip.o scaled_upper_triang_masked_softmax_hip.cuda.o -shared -L/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch/lib -lc10 -lc10_hip -ltorch_cpu -ltorch_hip -ltorch -ltorch_python -L/opt/rocm-6.0.0-12660/lib -lamdhip64 -o scaled_upper_triang_masked_softmax_cuda.so
vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.cpp -> /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.cpp [skipped, already hipified]
vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_cuda.cu -> /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.hip [skipped, already hipified]
vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /root/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes]
vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /root/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes]
vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified]
vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified]
vm-07-05: Total number of unsupported CUDA function calls: 0
vm-07-05:
vm-07-05:
vm-07-05: Total number of replaced kernel launches: 69
vm-07-05: [1/1] c++ scaled_masked_softmax_hip.o scaled_masked_softmax_hip.cuda.o -shared -L/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch/lib -lc10 -lc10_hip -ltorch_cpu -ltorch_hip -ltorch -ltorch_python -L/opt/rocm-6.0.0-12660/lib -lamdhip64 -o scaled_masked_softmax_cuda.so
vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_softmax.cpp -> /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_softmax_hip.cpp [skipped, already hipified]
vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_softmax_cuda.cu -> /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_softmax_hip.hip [skipped, already hipified]
vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /root/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes]
vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /root/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes]
vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified]
vm-07-05: /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /root/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified]
vm-07-05: Total number of unsupported CUDA function calls: 0
vm-07-05:
vm-07-05:
vm-07-05: Total number of replaced kernel launches: 69
vm-07-05: ninja: no work to do.
vm-07-05: >>> done with compiling and loading fused kernels. Compilation time: 2.092 seconds
vm-07-05: time to initialize megatron (seconds): 4.207
vm-07-05: [after megatron is initialized] datetime: 2023-11-29 08:50:57
vm-07-05: building GPT model ...
vm-07-05: [2023-11-29 08:50:57,577] [INFO] [utils.py:802:see_memory_usage] Before Building Model
vm-07-05: [2023-11-29 08:50:57,578] [INFO] [utils.py:803:see_memory_usage] MA 0.0 GB Max_MA 2.13 GB CA 0.0 GB Max_CA 2 GB
vm-07-05: [2023-11-29 08:50:57,578] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 45.89 GB, percent = 2.7%
vm-07-05: [2023-11-29 08:50:57,670] [INFO] [utils.py:802:see_memory_usage] After Building Model
vm-07-05: [2023-11-29 08:50:57,671] [INFO] [utils.py:803:see_memory_usage] MA 12.39 GB Max_MA 12.39 GB CA 12.39 GB Max_CA 12 GB
vm-07-05: [2023-11-29 08:50:57,671] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 45.9 GB, percent = 2.7%
vm-07-05: > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 6650208256
vm-07-05: setting training iterations to 5
vm-07-05: > learning rate decay style: cosine
vm-07-05: DeepSpeed is enabled.
vm-07-05: [2023-11-29 08:50:57,673] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.12.3, git-hash=unknown, git-branch=unknown
vm-07-05: [2023-11-29 08:50:57,862] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
vm-07-05: [2023-11-29 08:50:57,863] [INFO] [logging.py:96:log_dist] [Rank 0] Using client Optimizer as basic optimizer
vm-07-05: [2023-11-29 08:50:57,863] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer
vm-07-05: [2023-11-29 08:50:57,874] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam
vm-07-05: [2023-11-29 08:50:57,874] [INFO] [utils.py:56:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type=<class 'apex.optimizers.fused_adam.FusedAdam'>
vm-07-05: [2023-11-29 08:50:57,874] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.float16 ZeRO stage 1 optimizer
vm-07-05: [2023-11-29 08:50:57,874] [INFO] [stage_1_and_2.py:147:__init__] Reduce bucket size 500,000,000
vm-07-05: [2023-11-29 08:50:57,874] [INFO] [stage_1_and_2.py:148:__init__] Allgather bucket size 500,000,000
vm-07-05: [2023-11-29 08:50:57,874] [INFO] [stage_1_and_2.py:149:__init__] CPU Offload: False
vm-07-05: [2023-11-29 08:50:57,874] [INFO] [stage_1_and_2.py:150:__init__] Round robin gradient partitioning: False
vm-07-05: [2023-11-29 08:51:11,210] [INFO] [utils.py:802:see_memory_usage] Before initializing optimizer states
vm-07-05: [2023-11-29 08:51:11,211] [INFO] [utils.py:803:see_memory_usage] MA 13.94 GB Max_MA 13.94 GB CA 13.96 GB Max_CA 14 GB
vm-07-05: [2023-11-29 08:51:11,211] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 217.2 GB, percent = 12.6%
vm-07-05: [2023-11-29 08:51:11,357] [INFO] [utils.py:802:see_memory_usage] After initializing optimizer states
vm-07-05: [2023-11-29 08:51:11,358] [INFO] [utils.py:803:see_memory_usage] MA 17.04 GB Max_MA 18.58 GB CA 18.6 GB Max_CA 19 GB
vm-07-05: [2023-11-29 08:51:11,358] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 218.51 GB, percent = 12.6%
vm-07-05: [2023-11-29 08:51:11,358] [INFO] [stage_1_and_2.py:514:__init__] optimizer state initialized
vm-07-05: [2023-11-29 08:51:11,648] [INFO] [utils.py:802:see_memory_usage] After initializing ZeRO optimizer
vm-07-05: [2023-11-29 08:51:11,648] [INFO] [utils.py:803:see_memory_usage] MA 17.04 GB Max_MA 17.04 GB CA 18.6 GB Max_CA 19 GB
vm-07-05: [2023-11-29 08:51:11,648] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 221.2 GB, percent = 12.8%
vm-07-05: [2023-11-29 08:51:11,650] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam
vm-07-05: [2023-11-29 08:51:11,650] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client LR scheduler
vm-07-05: [2023-11-29 08:51:11,650] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = <megatron.optimizer_param_scheduler.OptimizerParamScheduler object at 0x7f4c65d55c70>
vm-07-05: [2023-11-29 08:51:11,650] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0], mom=[(0.9, 0.95), (0.9, 0.95)]
vm-07-05: [2023-11-29 08:51:11,651] [INFO] [config.py:974:print] DeepSpeedEngine configuration:
vm-07-05: [2023-11-29 08:51:11,651] [INFO] [config.py:978:print] activation_checkpointing_config {
vm-07-05: "partition_activations": false,
vm-07-05: "contiguous_memory_optimization": false,
vm-07-05: "cpu_checkpointing": false,
vm-07-05: "number_checkpoints": null,
vm-07-05: "synchronize_checkpoint_boundary": false,
vm-07-05: "profile": false
vm-07-05: }
vm-07-05: [2023-11-29 08:51:11,651] [INFO] [config.py:978:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
vm-07-05: [2023-11-29 08:51:11,651] [INFO] [config.py:978:print] amp_enabled .................. False
vm-07-05: [2023-11-29 08:51:11,651] [INFO] [config.py:978:print] amp_params ................... False
vm-07-05: [2023-11-29 08:51:11,651] [INFO] [config.py:978:print] autotuning_config ............ {
vm-07-05: "enabled": false,
vm-07-05: "start_step": null,
vm-07-05: "end_step": null,
vm-07-05: "metric_path": null,
vm-07-05: "arg_mappings": null,
vm-07-05: "metric": "throughput",
vm-07-05: "model_info": null,
vm-07-05: "results_dir": "autotuning_results",
vm-07-05: "exps_dir": "autotuning_exps",
vm-07-05: "overwrite": true,
vm-07-05: "fast": true,
vm-07-05: "start_profile_step": 3,
vm-07-05: "end_profile_step": 5,
vm-07-05: "tuner_type": "gridsearch",
vm-07-05: "tuner_early_stopping": 5,
vm-07-05: "tuner_num_trials": 50,
vm-07-05: "model_info_path": null,
vm-07-05: "mp_size": 1,
vm-07-05: "max_train_batch_size": null,
vm-07-05: "min_train_batch_size": 1,
vm-07-05: "max_train_micro_batch_size_per_gpu": 1.024000e+03,
vm-07-05: "min_train_micro_batch_size_per_gpu": 1,
vm-07-05: "num_tuning_micro_batch_sizes": 3
vm-07-05: }
vm-07-05: [2023-11-29 08:51:11,651] [INFO] [config.py:978:print] bfloat16_enabled ............. False
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] checkpoint_parallel_write_pipeline False
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] checkpoint_tag_validation_enabled True
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] checkpoint_tag_validation_fail False
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] comms_config ................. <deepspeed.comm.config.DeepSpeedCommsConfig object at 0x7f4c43035bb0>
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] communication_data_type ...... None
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}}
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] curriculum_enabled_legacy .... False
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] curriculum_params_legacy ..... False
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}}
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] data_efficiency_enabled ...... False
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] dataloader_drop_last ......... False
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] disable_allgather ............ False
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] dump_state ................... False
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] dynamic_loss_scale_args ...... {'init_scale': 2048, 'scale_window': 500, 'delayed_shift': 2, 'consecutive_hysteresis': False, 'min_scale': 1}
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] eigenvalue_enabled ........... False
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] eigenvalue_gas_boundary_resolution 1
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] eigenvalue_layer_name ........ bert.encoder.layer
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] eigenvalue_layer_num ......... 0
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] eigenvalue_max_iter .......... 100
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] eigenvalue_stability ......... 1e-06
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] eigenvalue_tol ............... 0.01
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] eigenvalue_verbose ........... False
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] elasticity_enabled ........... False
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] flops_profiler_config ........ {
vm-07-05: "enabled": false,
vm-07-05: "recompute_fwd_factor": 0.0,
vm-07-05: "profile_step": 1,
vm-07-05: "module_depth": -1,
vm-07-05: "top_modules": 1,
vm-07-05: "detailed": true,
vm-07-05: "output_file": null
vm-07-05: }
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] fp16_auto_cast ............... False
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] fp16_enabled ................. True
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] fp16_master_weights_and_gradients False
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] global_rank .................. 0
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] grad_accum_dtype ............. None
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] gradient_accumulation_steps .. 64
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] gradient_clipping ............ 1.0
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] gradient_predivide_factor .... 1.0
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] initial_dynamic_scale ........ 2048
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] load_universal_checkpoint .... False
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] loss_scale ................... 0
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] memory_breakdown ............. False
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] mics_hierarchial_params_gather False
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] mics_shard_size .............. -1
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] nebula_config ................ {
vm-07-05: "enabled": false,
vm-07-05: "persistent_storage_path": null,
vm-07-05: "persistent_time_interval": 100,
vm-07-05: "num_of_version_in_retention": 2,
vm-07-05: "enable_nebula_load": true,
vm-07-05: "load_path": null
vm-07-05: }
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] optimizer_legacy_fusion ...... False
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] optimizer_name ............... None
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] optimizer_params ............. None
vm-07-05: [2023-11-29 08:51:11,652] [INFO] [config.py:978:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True}
vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] pld_enabled .................. False
vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] pld_params ................... False
vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] prescale_gradients ........... False
vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] scheduler_name ............... None
vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] scheduler_params ............. None
vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] seq_parallel_communication_data_type torch.float32
vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] sparse_attention ............. None
vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] sparse_gradients_enabled ..... False
vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] steps_per_print .............. 1
vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] train_batch_size ............. 2048
vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] train_micro_batch_size_per_gpu 2
vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] use_node_local_storage ....... False
vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] wall_clock_breakdown ......... False
vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] weight_quantization_config ... None
vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] world_size ................... 16
vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] zero_allow_untested_optimizer False
vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] zero_config .................. stage=1 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500,000,000 allgather_partitions=True allgather_bucket_size=500,000,000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50,000,000 param_persistence_threshold=100,000 model_persistence_threshold=sys.maxsize max_live_parameters=1,000,000,000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True
vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] zero_enabled ................. True
vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] zero_force_ds_cpu_optimizer .. True
vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:978:print] zero_optimization_stage ...... 1
vm-07-05: [2023-11-29 08:51:11,653] [INFO] [config.py:964:print_user_config] json = {
vm-07-05: "train_batch_size": 2.048000e+03,
vm-07-05: "train_micro_batch_size_per_gpu": 2,
vm-07-05: "steps_per_print": 1,
vm-07-05: "zero_optimization": {
vm-07-05: "stage": 1
vm-07-05: },
vm-07-05: "gradient_clipping": 1.0,
vm-07-05: "prescale_gradients": false,
vm-07-05: "fp16": {
vm-07-05: "enabled": true,
vm-07-05: "loss_scale": 0,
vm-07-05: "loss_scale_window": 500,
vm-07-05: "hysteresis": 2,
vm-07-05: "min_loss_scale": 1,
vm-07-05: "initial_scale_power": 11
vm-07-05: },
vm-07-05: "wall_clock_breakdown": false
vm-07-05: }
vm-07-05: [2023-11-29 08:51:14,000] [WARNING] [engine.py:2699:load_checkpoint] Unable to find latest file at .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint.
vm-07-05: [2023-11-29 08:51:14,000] [WARNING] [engine.py:2699:load_checkpoint] Unable to find latest file at .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint.
vm-07-05: [2023-11-29 08:51:14,000] [WARNING] [engine.py:2699:load_checkpoint] Unable to find latest file at .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint.
vm-07-05: [2023-11-29 08:51:14,000] [WARNING] [engine.py:2699:load_checkpoint] Unable to find latest file at .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint.
vm-07-05: [2023-11-29 08:51:14,000] [WARNING] [engine.py:2699:load_checkpoint] Unable to find latest file at .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint.
vm-07-05: [2023-11-29 08:51:14,001] [WARNING] [engine.py:2699:load_checkpoint] Unable to find latest file at .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint.
vm-07-14: [2023-11-29 08:51:13,999] [WARNING] [engine.py:2699:load_checkpoint] Unable to find latest file at .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint.
vm-07-05: WARNING: could not find the metadata file .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase
vm-07-14: [2023-11-29 08:51:13,999] [WARNING] [engine.py:2699:load_checkpoint] Unable to find latest file at .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint.
vm-07-05: will not load any checkpoints and will start from random
vm-07-14: [2023-11-29 08:51:13,999] [WARNING] [engine.py:2699:load_checkpoint] Unable to find latest file at .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint.
vm-07-05: [2023-11-29 08:51:14,001] [WARNING] [engine.py:2699:load_checkpoint] Unable to find latest file at .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint.
vm-07-05: [2023-11-29 08:51:14,001] [WARNING] [engine.py:2699:load_checkpoint] Unable to find latest file at .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint.
vm-07-14: [2023-11-29 08:51:14,013] [WARNING] [engine.py:2699:load_checkpoint] Unable to find latest file at .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint.
vm-07-14: [2023-11-29 08:51:14,013] [WARNING] [engine.py:2699:load_checkpoint] Unable to find latest file at .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint.
vm-07-14: [2023-11-29 08:51:14,013] [WARNING] [engine.py:2699:load_checkpoint] Unable to find latest file at .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint.
vm-07-14: [2023-11-29 08:51:14,014] [WARNING] [engine.py:2699:load_checkpoint] Unable to find latest file at .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint.
vm-07-14: [2023-11-29 08:51:14,014] [WARNING] [engine.py:2699:load_checkpoint] Unable to find latest file at .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint.
vm-07-05: [after model, optimizer, and learning rate scheduler are built] datetime: 2023-11-29 08:51:14
vm-07-14: (min, max) time across ranks (ms):
vm-07-14: load-checkpoint ................................: (1.15, 16.61)
vm-07-05: > building train, validation, and test datasets ...
vm-07-05: > datasets target sizes (minimum size):
vm-07-05: train: 10240
vm-07-05: validation: 20480
vm-07-05: test: 20480
vm-07-05: > building train, validation, and test datasets for GPT ...
vm-07-05: Single data path provided for train, valid & test
vm-07-05: > building dataset index ...
vm-07-05: reading sizes...
vm-07-05: reading pointers...
vm-07-05: reading document index...
vm-07-05: creating numpy buffer of mmap...
vm-07-05: creating memory view of numpy buffer...
vm-07-05: > finished creating indexed dataset in 0.000362 seconds
vm-07-05: number of documents: 115876
vm-07-05: > dataset split:
vm-07-05: train:
vm-07-05: document indices in [0, 109966) total of 109966 documents
vm-07-05: validation:
vm-07-05: document indices in [109966, 115760) total of 5794 documents
vm-07-05: test:
vm-07-05: document indices in [115760, 115876) total of 116 documents
vm-07-05: > loading doc-idx mapping from /root/index-cache/06115e84e99e3b6ca4187cde686826c9_doc_idx.npy
vm-07-05: > loading sample-idx mapping from /root/index-cache/06115e84e99e3b6ca4187cde686826c9_sample_idx.npy
vm-07-05: > loading shuffle-idx mapping from /root/index-cache/06115e84e99e3b6ca4187cde686826c9_shuffle_idx.npy
vm-07-05: loaded indexed file in 0.001 seconds
vm-07-05: total number of samples: 10575
vm-07-05: total number of epochs: 10
vm-07-05: > loading doc-idx mapping from /root/index-cache/c9410219284a5371a54555ffc4190827_doc_idx.npy
vm-07-05: > loading sample-idx mapping from /root/index-cache/c9410219284a5371a54555ffc4190827_sample_idx.npy
vm-07-05: > loading shuffle-idx mapping from /root/index-cache/c9410219284a5371a54555ffc4190827_shuffle_idx.npy
vm-07-05: loaded indexed file in 0.001 seconds
vm-07-05: total number of samples: 20485
vm-07-05: total number of epochs: 530
vm-07-05: > loading doc-idx mapping from /root/index-cache/087bcc1d515023256208907c78e6a640_doc_idx.npy
vm-07-05: > loading sample-idx mapping from /root/index-cache/087bcc1d515023256208907c78e6a640_sample_idx.npy
vm-07-05: > loading shuffle-idx mapping from /root/index-cache/087bcc1d515023256208907c78e6a640_shuffle_idx.npy
vm-07-05: loaded indexed file in 0.001 seconds
vm-07-05: total number of samples: 20481
vm-07-05: total number of epochs: 22018
vm-07-05: > finished creating GPT datasets ...
vm-07-05: [after dataloaders are built] datetime: 2023-11-29 08:51:14
vm-07-05: done with setup ...
vm-07-05: training ...
vm-07-14: (min, max) time across ranks (ms):
vm-07-14: model-and-optimizer-setup ......................: (16584.81, 16588.33)
vm-07-14: train/valid/test-data-iterators-setup ..........: (298.83, 342.60)
vm-07-05: [before the start of training step] datetime: 2023-11-29 08:51:14
vm-07-05: [2023-11-29 08:52:27,711] [INFO] [logging.py:96:log_dist] [Rank 0] step=1, skipped=0, lr=[1.2000000000000002e-07, 1.2000000000000002e-07], mom=[(0.9, 0.95), (0.9, 0.95)]
vm-07-14: iteration 1/ 5 | consumed samples: 2048 | consumed tokens: 4194304 | elapsed time per iteration (ms): 76015.3 | learning rate: 1.200E-07 | global batch size: 2048 | lm loss: 1.100492E+01 | loss scale: 2048.0 | actual seqlen: 2048 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 26.942 | TFLOPs: 147.61 |
vm-07-05: [Rank 0] (after 1 iterations) memory (MB) | allocated: 17956.4931640625 | max allocated: 68581.8095703125 | reserved: 90178.0 | max reserved: 90178.0
vm-07-05: [2023-11-29 08:53:42,835] [INFO] [logging.py:96:log_dist] [Rank 0] step=2, skipped=0, lr=[2.4000000000000003e-07, 2.4000000000000003e-07], mom=[(0.9, 0.95), (0.9, 0.95)]
vm-07-14: iteration 2/ 5 | consumed samples: 4096 | consumed tokens: 8388608 | elapsed time per iteration (ms): 75254.0 | learning rate: 2.400E-07 | global batch size: 2048 | lm loss: 1.100421E+01 | loss scale: 2048.0 | actual seqlen: 2048 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 27.215 | TFLOPs: 149.10 |
vm-07-05: [2023-11-29 08:54:58,114] [INFO] [logging.py:96:log_dist] [Rank 0] step=3, skipped=0, lr=[3.6000000000000005e-07, 3.6000000000000005e-07], mom=[(0.9, 0.95), (0.9, 0.95)]
vm-07-05: [2023-11-29 08:55:00,932] [INFO] [timer.py:260:stop] epoch=0/micro_step=3/global_step=3, RunningAvgSamplesPerSec=291.6169137604138, CurrSamplesPerSec=291.6169137604138, MemAllocated=17.54GB, MaxMemAllocated=66.97GB
vm-07-14: iteration 3/ 5 | consumed samples: 6144 | consumed tokens: 12582912 | elapsed time per iteration (ms): 75249.6 | learning rate: 3.600E-07 | global batch size: 2048 | lm loss: 1.096316E+01 | loss scale: 2048.0 | actual seqlen: 2048 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 27.216 | TFLOPs: 149.11 |
vm-07-05: [2023-11-29 08:56:13,554] [INFO] [logging.py:96:log_dist] [Rank 0] step=4, skipped=0, lr=[4.800000000000001e-07, 4.800000000000001e-07], mom=[(0.9, 0.95), (0.9, 0.95)]
vm-07-05: [2023-11-29 08:56:16,103] [INFO] [timer.py:260:stop] epoch=0/micro_step=4/global_step=4, RunningAvgSamplesPerSec=303.1183419363961, CurrSamplesPerSec=315.5642580603495, MemAllocated=17.54GB, MaxMemAllocated=66.97GB
vm-07-14: iteration 4/ 5 | consumed samples: 8192 | consumed tokens: 16777216 | elapsed time per iteration (ms): 75182.4 | learning rate: 4.800E-07 | global batch size: 2048 | lm loss: 1.055474E+01 | loss scale: 2048.0 | actual seqlen: 2048 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 27.240 | TFLOPs: 149.24 |
vm-07-05: [2023-11-29 08:57:28,159] [INFO] [logging.py:96:log_dist] [Rank 0] step=5, skipped=0, lr=[6.000000000000001e-07, 6.000000000000001e-07], mom=[(0.9, 0.95), (0.9, 0.95)]
vm-07-05: [2023-11-29 08:57:30,611] [INFO] [timer.py:260:stop] epoch=0/micro_step=5/global_step=5, RunningAvgSamplesPerSec=311.41529524062446, CurrSamplesPerSec=329.45074244342015, MemAllocated=17.54GB, MaxMemAllocated=66.97GB
vm-07-14: iteration 5/ 5 | consumed samples: 10240 | consumed tokens: 20971520 | elapsed time per iteration (ms): 74494.9 | learning rate: 6.000E-07 | global batch size: 2048 | lm loss: 9.927882E+00 | loss scale: 2048.0 | actual seqlen: 2048 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 27.492 | TFLOPs: 150.62 |
vm-07-05: [after training is done] datetime: 2023-11-29 08:57:30
vm-07-05: saving checkpoint at iteration 5 to .//checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase
vm-07-05: [2023-11-29 08:57:32,086] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 48040
vm-07-05: [2023-11-29 08:57:32,114] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 48041
vm-07-05: [2023-11-29 08:57:32,248] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 48042
vm-07-14: [2023-11-29 08:57:32,266] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 49031
vm-07-14: [2023-11-29 08:57:32,269] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 49032
vm-07-14: [2023-11-29 08:57:32,270] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 49033
vm-07-14: [2023-11-29 08:57:32,272] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 49034
vm-07-14: [2023-11-29 08:57:32,272] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 49035
vm-07-14: [2023-11-29 08:57:32,273] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 49036
vm-07-14: [2023-11-29 08:57:32,275] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 49037
vm-07-05: [2023-11-29 08:57:32,302] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 48043
vm-07-05: [2023-11-29 08:57:32,303] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 48044
vm-07-05: [2023-11-29 08:57:32,305] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 48045
vm-07-14: [2023-11-29 08:57:32,303] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 49038
vm-07-14: [2023-11-29 08:57:32,304] [ERROR] [launch.py:321:sigkill_handler] ['/opt/conda/envs/py_3.9/bin/python', '-u', '/root/Megatron-DeepSpeed/examples_deepspeed/rebase/../../pretrain_gpt.py', '--local_rank=7', '--override-opt_param-scheduler', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--tensor-model-parallel-size', '1', '--init-method-std', '0.009', '--lr-decay-samples', '43945312', '--lr-warmup-samples', '2048000', '--lr-decay-style', 'cosine', '--micro-batch-size', '2', '--exit-duration-in-mins', '30000000', '--global-batch-size', '2048', '--num-layers', '32', '--hidden-size', '4096', '--num-attention-heads', '32', '--seq-length', '2048', '--max-position-embeddings', '2048', '--train-tokens', '300000000000', '--train-samples', '10240', '--lr', '1.2e-4', '--min-lr', '1.0e-6', '--split', '949,50,1', '--log-interval', '1', '--eval-interval', '500', '--eval-iters', '10', '--save-interval', '10000', '--weight-decay', '0.1', '--clip-grad', '1.0', '--hysteresis', '2', '--num-workers', '2', '--attention-dropout', '0.0', '--hidden-dropout', '0.0', '--optimizer', 'adam', '--use-distributed-optimizer', '--sequence-parallel', '--fp16', '--seed', '1234', '--load', './/checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase', '--save', './/checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase', '--no-async-tensor-model-parallel-allreduce', '--use-rotary-position-embeddings', '--no-gradient-accumulation-fusion', '--vocab-file', 'gpt2-vocab.json', '--merge-file', 'gpt2-merges.txt', '--data-path', '/root//dataset_text_sentence', '--data-impl', 'mmap', '--deepspeed', '--deepspeed_config', 'ds_config_gbs2048_mbs2_log1_zero1.json', '--zero-stage', '1', '--pipeline-model-parallel-size', '1', '--no-pipeline-parallel'] exits with return code = 1
vm-07-05: [2023-11-29 08:57:32,306] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 48046
vm-07-05: [2023-11-29 08:57:32,306] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 48047
vm-07-05: [2023-11-29 08:57:32,308] [ERROR] [launch.py:321:sigkill_handler] ['/opt/conda/envs/py_3.9/bin/python', '-u', '/root/Megatron-DeepSpeed/examples_deepspeed/rebase/../../pretrain_gpt.py', '--local_rank=7', '--override-opt_param-scheduler', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--tensor-model-parallel-size', '1', '--init-method-std', '0.009', '--lr-decay-samples', '43945312', '--lr-warmup-samples', '2048000', '--lr-decay-style', 'cosine', '--micro-batch-size', '2', '--exit-duration-in-mins', '30000000', '--global-batch-size', '2048', '--num-layers', '32', '--hidden-size', '4096', '--num-attention-heads', '32', '--seq-length', '2048', '--max-position-embeddings', '2048', '--train-tokens', '300000000000', '--train-samples', '10240', '--lr', '1.2e-4', '--min-lr', '1.0e-6', '--split', '949,50,1', '--log-interval', '1', '--eval-interval', '500', '--eval-iters', '10', '--save-interval', '10000', '--weight-decay', '0.1', '--clip-grad', '1.0', '--hysteresis', '2', '--num-workers', '2', '--attention-dropout', '0.0', '--hidden-dropout', '0.0', '--optimizer', 'adam', '--use-distributed-optimizer', '--sequence-parallel', '--fp16', '--seed', '1234', '--load', './/checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase', '--save', './/checkpoint/gpt_6.7B_tok300B_lr1.2e-4_min1.0e-6_wM_dB__gbs2048_mbs2_g16_z1_seed1234_rebase', '--no-async-tensor-model-parallel-allreduce', '--use-rotary-position-embeddings', '--no-gradient-accumulation-fusion', '--vocab-file', 'gpt2-vocab.json', '--merge-file', 'gpt2-merges.txt', '--data-path', '/root//dataset_text_sentence', '--data-impl', 'mmap', '--deepspeed', '--deepspeed_config', 'ds_config_gbs2048_mbs2_log1_zero1.json', '--zero-stage', '1', '--pipeline-model-parallel-size', '1', '--no-pipeline-parallel'] exits with return code = 1
......@@ -11,12 +11,12 @@ HPCX_HOME ?= /opt/hpcx
CUDA_VER ?= $(shell nvcc --version | grep 'release' | awk '{print $$6}' | cut -c2- | cut -d '.' -f1-2)
ROCBLAS_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3)
.PHONY: all cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt
.PHONY: all cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed
# Build all targets.
all: cuda rocm
cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn
rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt
cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed
rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed
cpu: common cpu_perftest
common: cpu_hpl cpu_stream fio
directx_amd: directx_amf_encoding_latency
......@@ -171,3 +171,20 @@ directx_amf_encoding_latency:
del vs_buildtools.exe && echo "Deleted vs_buildtools.exe" && \
"C:\temp\BuildTools\MSBuild\Current\Bin\MSBuild.exe" "AMF\amf\public\samples\CPPSamples_vs2019.sln" /t:EncoderLatency /p:Platform=x64 /p:Configuration=Release /p:OutDir="%SB_MICRO_PATH%\bin" \
)
# Install Megatron-LM
# Clones NVIDIA's Megatron-LM into Megatron/Megatron-LM (skipped when the
# checkout already exists) and installs the shared Python requirements kept
# under the local Megatron directory.
megatron_lm:
	test -d "Megatron/Megatron-LM" || git clone "https://github.com/NVIDIA/Megatron-LM.git" "Megatron/Megatron-LM"
	cd Megatron && python -m pip install -r requirements.txt
# Install Megatron-DeepSpeed
# Clones Microsoft's Megatron-DeepSpeed into Megatron/Megatron-DeepSpeed
# (skipped when the checkout already exists), installs the shared Python
# requirements kept under the local Megatron directory, then installs the
# DeepSpeed package itself.
megatron_deepspeed:
	test -d "Megatron/Megatron-DeepSpeed" || git clone "https://github.com/microsoft/Megatron-DeepSpeed.git" "Megatron/Megatron-DeepSpeed"
	cd Megatron && \
	python -m pip install -r requirements.txt && \
	python -m pip install DeepSpeed
nltk
parameterized
pybind11
regex
six
# versions from HF transformers
black==21.4b0
isort>=5.5.4
tqdm
sentencepiece
wandb
einops
typing_extensions==4.5.0
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment