Commit bf95e032 authored by hepj987

dtk23.04 initialization

{
"train_micro_batch_size_per_gpu": 1,
"train_batch_size": 16,
"gradient_clipping": 1.0,
"zero_optimization": {
"stage": 1
},
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 12
},
"zero_allow_untested_optimizer": true,
"steps_per_print": 2000,
"wall_clock_breakdown": false
}
{
"train_micro_batch_size_per_gpu": 1,
"train_batch_size": 16,
"gradient_clipping": 1.0,
"zero_optimization": {
"stage": 0
},
"bf16": {
"enabled": true
},
"zero_allow_untested_optimizer": true,
"steps_per_print": 2000,
"wall_clock_breakdown": false
}
{
"train_micro_batch_size_per_gpu": 1,
"train_batch_size": 16,
"gradient_clipping": 1.0,
"zero_optimization": {
"stage": 1
},
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 12
},
"curriculum_learning": {
"enabled": true,
"curriculum_type": "seqlen",
"min_difficulty": 8,
"max_difficulty": 128,
"schedule_type": "fixed_linear",
"schedule_config": {
"total_curriculum_step": 30,
"difficulty_step": 4
}
},
"steps_per_print": 2000,
"wall_clock_breakdown": false
}
{
"train_micro_batch_size_per_gpu": 1,
"train_batch_size": 16,
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 12
},
"zero_allow_untested_optimizer": false,
"steps_per_print": 2000,
"wall_clock_breakdown": false
}
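# The JSON blocks above are DeepSpeed config files used by the tests below (fp16 + ZeRO-1,
# bf16 + ZeRO-0, fp16 + curriculum learning, and a plain fp16 config). As a minimal,
# purely illustrative sketch (the file name "ds_config.json" and the helper below are
# assumptions, not part of this commit), a config like these can be sanity-checked before
# being passed to the training scripts via --deepspeed_config:
import json

def check_ds_config(path="ds_config.json"):
    """Load a DeepSpeed JSON config and verify the basic batch-size invariant."""
    with open(path) as f:
        cfg = json.load(f)
    # In DeepSpeed, train_batch_size must equal
    # train_micro_batch_size_per_gpu * gradient_accumulation_steps * data_parallel_size,
    # so at minimum it has to be a multiple of the per-GPU micro batch size.
    assert cfg["train_batch_size"] % cfg["train_micro_batch_size_per_gpu"] == 0
    return cfg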
import random
import unittest
import torch
from torch.nn import functional as F
from megatron.model.glu_activations import GLU_ACTIVATIONS, geglu, liglu, reglu, swiglu
from megatron.testing_utils import set_seed, torch_assert_equal
class TestActivations(unittest.TestCase):
def setUp(self):
"""setup an input of reasonable size"""
set_seed()
self.batch_size = random.randint(2, 64)
self.seq_len = random.randint(256, 1025)
self.num_channels = random.randint(1, 384) * 2
self.x = torch.randn(self.batch_size, self.seq_len, self.num_channels)
self.x1, self.x2 = self.x.chunk(2, dim=-1)
# glu should halve the last dimension
self.output_shape = [self.batch_size, self.seq_len, self.num_channels // 2]
def test_shapes(self):
for activation_fn in GLU_ACTIVATIONS.values():
output = activation_fn(self.x)
self.assertEqual(list(output.shape), self.output_shape)
def test_liglu(self):
expected = self.x1 * self.x2
torch_assert_equal(liglu(self.x), expected)
def test_geglu(self):
expected = self.x1 * F.gelu(self.x2)
torch_assert_equal(geglu(self.x), expected)
def test_reglu(self):
expected = self.x1 * F.relu(self.x2)
torch_assert_equal(reglu(self.x), expected)
def test_swiglu(self):
expected = self.x1 * F.silu(self.x2)
torch_assert_equal(swiglu(self.x), expected)
# from megatron.testing_utils import require_torch_bf16
# @require_torch_bf16
# def test_bf16_jit(self):
# x_bf16 = self.x.to(torch.bfloat16)
# for activation_fn in GLU_ACTIVATIONS.values():
# output = activation_fn(x_bf16)
# self.assertEqual(list(output.shape), self.output_shape)
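# For reference, the behaviour asserted in the tests above can be summarised by a small
# standalone sketch (an illustration of the GLU family, not the megatron implementation
# itself): a GLU-style activation splits the last dimension into two halves and uses one
# half to gate the other, which is why the output has half the input channels.
import torch
from torch.nn import functional as F

def reference_glu(x: torch.Tensor, act=F.gelu) -> torch.Tensor:
    """Split the last dim in two and gate: act=F.gelu ~ geglu, F.silu ~ swiglu, F.relu ~ reglu,
    and the identity function ~ liglu."""
    x1, x2 = x.chunk(2, dim=-1)
    return x1 * act(x2)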
def test_import():
import megatron
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import os
import pytest
from pathlib import Path
from parameterized import parameterized
from megatron.testing_utils import (
CaptureStdout,
TestCasePlus,
execute_subprocess_async,
get_gpu_count,
require_deepspeed,
require_torch_gpu,
require_torch_multi_gpu,
set_seed
)
set_seed(42)
def parameterized_custom_name_func(func, param_num, param):
# customize the test name generator function as we want both params to appear in the sub-test
# name, as by default it shows only the first param
param_based_name = parameterized.to_safe_name("_to_".join(str(x) for x in param.args))
return f"{func.__name__}_{param_based_name}"
params = [
# each entry is [source, target] topology, encoded as TP_PP_DP
["1_1_1", "1_1_1"],
["2_1_1", "1_1_1"],
["1_2_1", "1_1_1"],
["1_1_2", "1_1_1"],
["2_1_1", "2_1_1"],
["1_1_1", "2_1_1"],
["1_1_1", "1_2_1"],
["1_1_1", "1_1_2"],
["1_1_2", "1_1_2"],
["1_1_2", "2_1_1"],
["1_1_2", "1_2_1"],
["1_2_1", "1_2_1"],
["1_2_1", "2_1_1"],
["1_2_1", "1_1_2"],
["2_1_1", "2_1_1"],
["2_1_1", "1_2_1"],
["2_1_1", "1_1_2"],
["2_2_2", "1_1_1"],
["2_2_2", "2_2_2"],
["1_1_1", "2_2_2"],
["1_1_8", "2_2_2"],
]
def get_launcher(num_gpus):
# 1. explicitly set --num_nodes=1 just in case these tests end up running on a multi-node setup
# - it won't be able to handle that
return f"deepspeed --num_nodes 1 --num_gpus {num_gpus}".split()
@require_deepspeed
@require_torch_gpu
class MegDSTestCheckpoints(TestCasePlus):
""" """
def setUp(self):
super().setUp()
# at times megatron fails to build kernels and doesn't remove the lock file, which makes
# subsequent runs hang - so make sure there is no lock when starting the testing
meg_lock_file_path = self.repo_root_dir_str + "/megatron/fused_kernels/build/lock"
if os.path.exists(meg_lock_file_path):
os.unlink(meg_lock_file_path)
def get_config(self, output_dir, tp_size, pp_size, dp_size):
data_dir = f"{self.data_dir}/gpt2"
num_gpus = pp_size * tp_size * dp_size
print(f"Using {num_gpus} GPUs")
n_samples = 300 # about 56 iterations
exit_interval = 20 # some samples in the first half and then some more in the 2nd half after resume
seq_len = 128
# XXX: for now while testing shapes make it really short and fast
exit_interval = 1
seq_len = 8
# common/shared configs
ds_args = f"""
--deepspeed
--deepspeed_config {self.test_file_dir_str}/ds_config_bf16.json
--zero-stage 0
--deepspeed-activation-checkpointing
""".split()
args = f"""
--tensor-model-parallel-size {tp_size}
--pipeline-model-parallel-size {pp_size}
--distributed-backend nccl
--log-interval 1
--save-interval 1
--eval-interval 10
--eval-iters 1
--checkpoint-activations
--partition-activations
--exit-interval {exit_interval}
--merge-file {data_dir}/gpt2-tiny-merges.txt
--vocab-file {data_dir}/gpt2-tiny-vocab.json
--save {output_dir}/checkpoints
--load {output_dir}/checkpoints
--data-path {data_dir}/meg-gpt2-openwebtext_text_document
--tensorboard-dir {output_dir}/tensorboard
--tensorboard-queue-size 5
--log-timers-to-tensorboard
--log-batch-size-to-tensorboard
--log-validation-ppl-to-tensorboard
--num-layers 2
--hidden-size 8
--num-attention-heads 2
--seq-length {seq_len}
--max-position-embeddings 8
--micro-batch-size 1
--global-batch-size 16
--train-samples {n_samples}
--embed-layernorm
--position-embedding-type alibi
--optimizer adam
--adam-beta1 0.9
--adam-beta2 0.95
--adam-eps 1e-8
--lr 1e-4
--lr-warmup-samples 5
--lr-decay-samples 6
--clip-grad 1.0
--weight-decay 1e-1
--bf16
--log-level debug
--log-level-replica info
""".split()
# XXX: fails to handle:
#--embed-layernorm
#
# stderr: RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding:
# stderr: size mismatch for norm.weight: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).
# stderr: size mismatch for norm.bias: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).
return args, ds_args, num_gpus
def train_checkpoint(self, output_dir, tp_size=1, pp_size=1, dp_size=1):
src_dir = self.src_dir
script = [f"{src_dir}/pretrain_gpt.py"]
args, ds_args, num_gpus = self.get_config(output_dir, tp_size, pp_size, dp_size)
launcher = get_launcher(num_gpus)
cmd = launcher + script + args + ds_args
# keep for quick debug
#print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
# 1. test training from scratch (no checkpoint)
with CaptureStdout() as cs:
execute_subprocess_async(cmd, env=self.get_env())
# test deepspeed is running
self.assertIn("DeepSpeed info", cs.out)
# test reports
self.assertIn("consumed samples", cs.out)
# test there should be no checkpoint this round
self.assertIn(f"Unable to find latest file at {output_dir}/checkpoints/latest", cs.out)
# test checkpoint saving
self.assertIn("successfully saved checkpoint at iteration", cs.out)
def convert_checkpoint_to_universal(self, output_dir, step):
cmd = f"""
python tools/convert_checkpoint/ds_to_universal.py
--input_folder {output_dir}/checkpoints/global_step{step}
--output_folder {output_dir}/checkpoints/global_step{step}_universal
""".split()
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
with CaptureStdout() as cs:
execute_subprocess_async(cmd, env=self.get_env())
self.assertIn("Convert DeepSpeed Checkpoint to Universal Checkpoint", cs.out)
def resume_from_checkpoint(self, output_dir, tp_size=1, pp_size=1, dp_size=1):
src_dir = self.src_dir
script = [f"{src_dir}/pretrain_gpt.py"]
args, ds_args, num_gpus = self.get_config(output_dir, tp_size, pp_size, dp_size)
launcher = get_launcher(num_gpus)
cmd = launcher + script + args + ds_args
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
with CaptureStdout() as cs:
execute_subprocess_async(cmd, env=self.get_env())
# test checkpoint loading
self.assertIn(f"successfully loaded checkpoint from {output_dir}/checkpoints", cs.out)
# test reports
self.assertIn("consumed samples", cs.out)
# test checkpoint saving
self.assertIn("successfully saved checkpoint at iteration", cs.out)
def resume_from_universal_checkpoint(self, output_dir, tp_size=1, pp_size=1, dp_size=1):
src_dir = self.src_dir
script = [f"{src_dir}/pretrain_gpt.py"]
args, ds_args, num_gpus = self.get_config(output_dir, tp_size, pp_size, dp_size)
launcher = get_launcher(num_gpus)
cmd = launcher + script + args + ds_args + ["--universal-checkpoint"]
# keep for quick debug
#print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
with CaptureStdout() as cs:
execute_subprocess_async(cmd, env=self.get_env())
# test checkpoint loading
self.assertIn(f"successfully loaded checkpoint from {output_dir}/checkpoints", cs.out)
# test reports
self.assertIn("consumed samples", cs.out)
# test checkpoint saving
self.assertIn("successfully saved checkpoint at iteration", cs.out)
@require_torch_multi_gpu
@parameterized.expand(params, name_func=parameterized_custom_name_func)
def test_checkpoint_reshaping_main(self, src, tgt):
# this test needs at least 2 gpus - if there are more gpus it will do more extensive testing
tp_size_src, pp_size_src, dp_size_src = list(map(int, src.split('_')))
tp_size_tgt, pp_size_tgt, dp_size_tgt = list(map(int, tgt.split('_')))
n_gpus = get_gpu_count()
n_gpus_src = tp_size_src * pp_size_src * dp_size_src
n_gpus_tgt = tp_size_tgt * pp_size_tgt * dp_size_tgt
if n_gpus_src > n_gpus:
pytest.skip(f"the test requires {n_gpus_src} gpus for source topology but have only {n_gpus}")
if n_gpus_tgt > n_gpus:
pytest.skip(f"the test requires {n_gpus_tgt} gpus for target topology but have only {n_gpus}")
output_dir = self.get_auto_remove_tmp_dir("./xxx", after=False)
# 1. train with initial topology defined in the first arg of params
self.train_checkpoint(output_dir, tp_size=tp_size_src , pp_size=pp_size_src , dp_size=dp_size_src )
# 2. convert the checkpoint to a universal checkpoint
self.convert_checkpoint_to_universal(output_dir=output_dir, step=1)
# 3. check we can resume training from a reshaped checkpoint to the target topology - the last arg of params
self.resume_from_universal_checkpoint(output_dir, tp_size=tp_size_tgt, pp_size=pp_size_tgt, dp_size=dp_size_tgt)
@require_torch_multi_gpu
def test_checkpoint_reshaping_empty_dir(self):
output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False)
with self.assertRaises(RuntimeError) as context:
self.convert_checkpoint_to_universal(output_dir=output_dir, step=1)
import itertools
import os
import shutil
from typing import Set
from unittest.mock import patch
import deepspeed
import torch
import finetune_t0_non_causal_decoder
from megatron import global_vars, get_tokenizer, initialize_megatron, get_args
from megatron.data import mlm_dataset, mtf_dataset, decoder_packed_mtf_dataset
from megatron.data.data_samplers import build_pretraining_data_loader
from megatron.testing_utils import TestCasePlus, flatten_arguments, mockenv_context, torch_assert_equal
def get_default_args():
"""return a dictionary with key as argument name and value as additional arguments"""
return {
# GPT_ARGS
"--num-layers": "2",
"--hidden-size": "128",
"--num-attention-heads": "4",
"--seq-length": "512",
"--max-position-embeddings": "512",
"--micro-batch-size": "4",
"--global-batch-size": "8",
"--lr-decay-iters": "320000",
"--lr-decay-style": "cosine",
"--lr": "0.00015",
"--min-lr": "1.0e-5",
"--train-iters": "5000",
"--tokenizer-type": "PretrainedFromHF",
"--tokenizer-name-or-path": "gpt2",
"--data-impl": "mmap",
"--split": "949,50,1",
"--distributed-backend": "nccl",
"--weight-decay": "1e-2",
"--clip-grad": "1.0",
"--lr-warmup-fraction": ".01",
"--fp16": "",
"--attention-dropout": "0",
"--hidden-dropout": "0",
# OUTPUT_ARGS
"--log-interval": "10",
"--save-interval": "500",
"--eval-interval": "100",
"--eval-iters": "10",
"--checkpoint-activations": "",
# DATA_ARGS
}
def get_dummy_mtf_decoder_packed_data(micro_batch_size: int, seq_length: int, vocab_size: int, special_tokens_ids: Set[int]):
seq_length += 1
num_segments = torch.randint(1, 5, ())
segment_ids = torch.zeros(micro_batch_size, seq_length, dtype=torch.long)
is_inputs = torch.zeros(micro_batch_size, seq_length, dtype=torch.bool)
for batch_id in range(micro_batch_size):
# - `*2`: Hack to ensure two start_new_segments are separated by at least two tokens
# - `+1`: Hack to ensure start_new_segments never starts at 0
start_new_segments = torch.sort(torch.randperm((seq_length - 2) // 2, )[:num_segments]).values * 2 + 1
segment_ids[batch_id, start_new_segments] = 1
end_inputs = [
torch.randint(low=start_segment, high=end_segment, size=())
for start_segment, end_segment in zip([0, *start_new_segments], [*start_new_segments, seq_length])
]
for end_input, start_segment in zip(end_inputs, [0, *start_new_segments]):
is_inputs[batch_id][start_segment: end_input + 1] = True
segment_ids = torch.cumsum(segment_ids, dim=-1) + 1
tokens = torch.randint(high=vocab_size, size=(micro_batch_size, seq_length), dtype=torch.long)
flatten_token_view = tokens.view(-1,)
for token_id in range(len(flatten_token_view)):
token = flatten_token_view[token_id]
# While the token is a special token, change it to another token
while token in special_tokens_ids:
flatten_token_view[token_id] = (token + 1) % vocab_size
token = flatten_token_view[token_id]
return {
"decoder_token_ids": tokens,
"decoder_segment_ids": segment_ids,
"decoder_is_inputs": is_inputs
}
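# A minimal usage sketch of the helper above (the concrete sizes and the special token id
# below are arbitrary assumptions for illustration). The returned tensors all have
# seq_length + 1 positions because the extra position is later split into inputs vs labels.
def _example_dummy_packed_batch():
    batch = get_dummy_mtf_decoder_packed_data(
        micro_batch_size=2, seq_length=16, vocab_size=128, special_tokens_ids={0}
    )
    assert batch["decoder_token_ids"].shape == (2, 17)      # torch.long token ids
    assert batch["decoder_segment_ids"].shape == (2, 17)    # torch.long, 1-based segment ids
    assert batch["decoder_is_inputs"].dtype == torch.bool   # True where tokens are inputs
    return batch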
class TestDataLoading(TestCasePlus):
def setUp(self) -> None:
super().setUp()
# We reset all global variables
global_vars._GLOBAL_ARGS = None
global_vars._GLOBAL_NUM_MICROBATCHES_CALCULATOR = None
global_vars._GLOBAL_TOKENIZER = None
global_vars._GLOBAL_TENSORBOARD_WRITER = None
global_vars._GLOBAL_ADLR_AUTORESUME = None
global_vars._GLOBAL_TIMERS = None
self.dist_env_1_gpu = dict(
MASTER_ADDR="localhost", MASTER_PORT="9994", RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
)
def copy_data_to_temp(self, root_dir, prefix):
"""copy data to temp, and return paths to temp version"""
src_path = os.path.join(root_dir, prefix)
src_dirname = os.path.dirname(src_path)
tmp_dir = self.get_auto_remove_tmp_dir()
dest_path = os.path.join(tmp_dir, prefix)
dest_dirname = os.path.dirname(dest_path)
os.makedirs(dest_dirname, exist_ok=True)
for folder in os.listdir(src_dirname):
src_folder = os.path.join(src_dirname, folder)
dest_folder = os.path.join(dest_dirname, folder)
if src_folder.startswith(src_path):
if os.path.isdir(src_folder):
shutil.copytree(src_folder, dest_folder)
else:
shutil.copy2(src_folder, dest_folder)
return dest_path
def test_mlm_dataset(self):
command_args = get_default_args()
data_path = self.copy_data_to_temp(self.data_dir, "gpt2/meg-gpt2-openwebtext_text_document")
command_args["--data-path"] = data_path
command_args["--noise-density"] = "0.15"
command_args["--mean-noise-span-length"] = "3"
command_args["--vocab-extra-ids"] = "100"
with patch('sys.argv', flatten_arguments(command_args)):
with mockenv_context(**self.dist_env_1_gpu):
deepspeed.init_distributed()
initialize_megatron()
# tokenizer
tokenizer = get_tokenizer()
# A SEP token is required for the MLM preprocessing.
tokenizer.tokenizer.add_special_tokens({"sep_token": "<s>"})
args = get_args()
train_val_test_num_samples = [
args.train_iters * args.global_batch_size,
args.eval_iters * args.global_batch_size,
0
]
train_ds, valid_ds, test_ds = mlm_dataset.build_train_valid_test_datasets(
data_prefix=args.data_path,
data_impl=args.data_impl,
splits_string=args.split,
# TODO @thomasw21 figure out how that value works
train_valid_test_num_samples=train_val_test_num_samples,
sequence_length=args.seq_length,
noise_density=args.noise_density,
mean_noise_span_length=args.mean_noise_span_length,
seed=args.seed,
skip_warmup=(not args.mmap_warmup)
)
sample = train_ds[0]
# +1 is needed to compute labels, as inputs and targets are simply concatenated.
self.assertEqual(len(sample["input_tokens"]) + len(sample["target_tokens"]), args.seq_length + 1)
# We make sure that inputs/targets end with <sep>
self.assertEqual(sample["input_tokens"][-1], tokenizer.sep)
self.assertEqual(sample["target_tokens"][-1], tokenizer.sep)
def test_decoder_packed_mtf_dataloader(self):
command_args = get_default_args()
data_path = self.copy_data_to_temp(self.data_dir, "gpt2/ag_news_prompt")
command_args["--data-path"] = data_path
with patch('sys.argv', flatten_arguments(command_args)):
with mockenv_context(**self.dist_env_1_gpu):
deepspeed.init_distributed()
initialize_megatron()
args = get_args()
tokenizer = get_tokenizer()
# Hack: `gpt2` doesn't have a padding token, so we override that value.
tokenizer.tokenizer.pad_token_id = tokenizer.tokenizer.eos_token_id
train_val_test_num_samples = [
args.train_iters * args.global_batch_size,
args.eval_iters * args.global_batch_size,
0
]
train_ds, valid_ds, test_ds = decoder_packed_mtf_dataset.build_train_valid_test_datasets(
data_prefix=args.data_path,
data_impl=args.data_impl,
splits_string=args.split,
# TODO @thomasw21 figure out how that value works
train_valid_test_num_samples=train_val_test_num_samples,
seq_length=args.seq_length + 1,
pad_token=tokenizer.pad,
eos_token=tokenizer.eos,
seed=args.seed,
skip_warmup=(not args.mmap_warmup)
)
batch_iterator = build_pretraining_data_loader(
train_ds, consumed_samples=0, num_workers=4
)
last_padding_size = 0
for i, items in enumerate(batch_iterator):
micro_batch_size, seq_length = items["decoder_token_ids"].shape
# Check dtypes
self.assertEqual(items["decoder_token_ids"].dtype, torch.int64)
self.assertEqual(items["decoder_segment_ids"].dtype, torch.int64)
self.assertEqual(items["decoder_is_inputs"].dtype, torch.bool)
# `micro_batch_size` corresponds to the one passed as an argument
self.assertEqual(micro_batch_size, args.micro_batch_size)
# `seq_length` corresponds to the one passed as an argument, + 1 in order to get tokens/labels
self.assertEqual(seq_length, args.seq_length + 1)
original_samples_count = 0
for batch_id in range(micro_batch_size):
segment_ids = [k for k, _ in itertools.groupby(items["decoder_segment_ids"][batch_id])]
# `segment_ids` is [1,2,...]
self.assertEqual(segment_ids[:-1], list(range(1, len(segment_ids))))
# `0` signifies that the tokens are padding
self.assertIn(segment_ids[-1], [0, len(segment_ids)])
original_samples_count += len([segment_id for segment_id in segment_ids if segment_id != 0])
# Test that we actually pack, i.e. we have more samples than the `batch_size`
self.assertGreater(original_samples_count, micro_batch_size)
# Test that the first sample of each batch couldn't fit inside the previous batch
first_sample_segment_ids = next(itertools.groupby(items["decoder_segment_ids"][0]))[1]
first_sample_size = len(list(first_sample_segment_ids))
self.assertGreater(first_sample_size, last_padding_size)
# update `last_padding_size`
last_padding_size = len([None for segment_id in items["decoder_segment_ids"][micro_batch_size - 1] if segment_id == 0])
def test_finetune_t0_non_causal_decoder_get_batch_pipe(self):
command_args = get_default_args()
command_args["--position-embedding-type"] = "alibi"
with patch('sys.argv', flatten_arguments(command_args)):
with mockenv_context(**self.dist_env_1_gpu):
deepspeed.init_distributed()
initialize_megatron()
args = get_args()
tokenizer = get_tokenizer()
# Hack: `gpt2` doesn't have a padding token, so we override that value.
tokenizer.tokenizer.pad_token_id = tokenizer.tokenizer.eos_token_id
# Dummy data
data = get_dummy_mtf_decoder_packed_data(
micro_batch_size=args.micro_batch_size,
seq_length=args.seq_length,
vocab_size=args.padded_vocab_size,
special_tokens_ids={tokenizer.pad}
)
(tokens, position_ids, attention_mask), (labels, loss_mask) = finetune_t0_non_causal_decoder.get_batch_pipe(data)
tokens = tokens.cpu()
position_ids = position_ids.cpu()
attention_mask = attention_mask.cpu()
labels = labels.cpu()
loss_mask = loss_mask.cpu()
self.assertEqual(loss_mask.dtype, torch.float)
torch_assert_equal(loss_mask.bool(), ~data["decoder_is_inputs"][:, 1:] * (data["decoder_token_ids"][:, :-1] != tokenizer.pad))
torch_assert_equal(tokens, data["decoder_token_ids"][:, :-1])
torch_assert_equal(labels, data["decoder_token_ids"][:, 1:])
for batch_id in range(args.micro_batch_size):
segment_cuts = torch.nonzero(data["decoder_segment_ids"][batch_id, 1:] - data["decoder_segment_ids"][batch_id, :-1]) + 1
for segment_start, segment_end in zip([0, *segment_cuts], [*segment_cuts, args.seq_length]):
self.assertTrue(torch.all(attention_mask[batch_id, 0, segment_start: segment_end, :segment_start]))
self.assertTrue(torch.all(attention_mask[batch_id, 0, segment_start: segment_end, segment_end:]))
# TODO @thomasw21 make sure that we reset `position_ids`
from random import randint
from typing import Set
from unittest.mock import patch
import deepspeed
import torch
from parameterized import parameterized
from torch import nn
import torch.nn.functional as F
from megatron.enums import AttnMaskType
from megatron.model.fused_layer_norm import MixedFusedLayerNorm
from packaging import version
from megatron import initialize_megatron, get_args, get_tokenizer, global_vars
from megatron.model.fused_softmax import ScaledMaskedSoftmax, FusedScaleMaskSoftmax
from megatron.model.utils import attention_mask_func
from megatron.testing_utils import TestCasePlus, mockenv_context, flatten_arguments, torch_assert_equal, \
torch_assert_close, require_torch_bf16
from megatron.training import setup_model_and_optimizer
import pretrain_gpt
import pretrain_prefix_lm
import finetune_t0_non_causal_decoder
def get_default_args(test_file_dir: str):
"""return a dictionary with key as argument name and value as additional arguments"""
return {
# GPT_ARGS
"--num-layers": "2",
"--hidden-size": "128",
"--num-attention-heads": "4",
"--seq-length": "256",
"--max-position-embeddings": "256",
"--micro-batch-size": "2",
"--global-batch-size": "2",
"--lr-decay-iters": "320000",
"--lr-decay-style": "cosine",
"--lr": "0.00015",
"--min-lr": "1.0e-5",
"--train-iters": "5000",
"--tokenizer-type": "PretrainedFromHF",
"--tokenizer-name-or-path": "gpt2",
"--data-impl": "mmap",
"--split": "949,50,1",
"--distributed-backend": "nccl",
"--weight-decay": "1e-2",
"--clip-grad": "1.0",
"--lr-warmup-fraction": ".01",
"--fp16": "",
"--inference": "",
"--attention-dropout": "0",
"--hidden-dropout": "0",
# OUTPUT_ARGS
"--log-interval": "10",
"--save-interval": "500",
"--eval-interval": "100",
"--eval-iters": "10",
"--checkpoint-activations": "",
# DATA_ARGS
# DeepSpeed args
"--deepspeed": "",
"--deepspeed_config": f"{test_file_dir}/ds_config_inference.json",
"--zero-stage": "0",
}
def equal_vectors(tensor1, tensor2, dim=-1):
"""View tensor1 and tensor2 as a list of vectors, and compute equality"""
return torch.linalg.norm(tensor1 - tensor2, dim=dim) == 0
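# e.g. equal_vectors(torch.zeros(2, 3), torch.zeros(2, 3)) -> tensor([True, True]);
# any differing component in a row makes that row's entry False.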
def iter_out_of_one(one):
return iter([one])
def get_dummy_mtf_decoder_packed_data(micro_batch_size: int, seq_length: int, vocab_size: int, special_tokens_ids: Set[int]):
"""Code from `tests/test_dataloaders.py"""
seq_length += 1
num_segments = torch.randint(1, 5, ())
segment_ids = torch.zeros(micro_batch_size, seq_length, dtype=torch.long)
is_inputs = torch.zeros(micro_batch_size, seq_length, dtype=torch.bool)
for batch_id in range(micro_batch_size):
# - `*2`: Hack to ensure two start_new_segments are separated by at least two tokens
# - `+1`: Hack to ensure start_new_segments never starts at 0
start_new_segments = torch.sort(torch.randperm((seq_length - 2) // 2, )[:num_segments]).values * 2 + 1
segment_ids[batch_id, start_new_segments] = 1
end_inputs = [
torch.randint(low=start_segment, high=end_segment - 1, size=())
for start_segment, end_segment in zip([0, *start_new_segments], [*start_new_segments, seq_length])
]
for end_input, start_segment in zip(end_inputs, [0, *start_new_segments]):
is_inputs[batch_id][start_segment: end_input + 1] = True
segment_ids = torch.cumsum(segment_ids, dim=-1) + 1
tokens = torch.randint(high=vocab_size, size=(micro_batch_size, seq_length), dtype=torch.long)
flatten_token_view = tokens.view(-1,)
for token_id in range(len(flatten_token_view)):
token = flatten_token_view[token_id]
# While the token is a special token, change it to another token
while token in special_tokens_ids:
flatten_token_view[token_id] = (token + 1) % vocab_size
token = flatten_token_view[token_id]
return {
"decoder_token_ids": tokens,
"decoder_segment_ids": segment_ids,
"decoder_is_inputs": is_inputs
}
class MyTestCase(TestCasePlus):
def setUp(self) -> None:
super().setUp()
# We reset all global variables
global_vars._GLOBAL_ARGS = None
global_vars._GLOBAL_NUM_MICROBATCHES_CALCULATOR = None
global_vars._GLOBAL_TOKENIZER = None
global_vars._GLOBAL_TENSORBOARD_WRITER = None
global_vars._GLOBAL_ADLR_AUTORESUME = None
global_vars._GLOBAL_TIMERS = None
self.dist_env_1_gpu = dict(
MASTER_ADDR="localhost", MASTER_PORT="9994", RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
)
def test_gpt(self):
"""Test causal invariance, ie past token don't depend on future tokens."""
command_args = get_default_args(self.test_file_dir_str)
with patch('sys.argv', flatten_arguments(command_args)):
with mockenv_context(**self.dist_env_1_gpu):
deepspeed.init_distributed()
initialize_megatron()
args = get_args()
tokenizer = get_tokenizer()
model, _, _ = setup_model_and_optimizer(pretrain_gpt.model_provider)
model = model[0]
model._config.train_micro_batch_size_per_gpu = args.micro_batch_size
model.set_train_batch_size(args.micro_batch_size)
token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))
# eod is a special token
token_ids[token_ids == tokenizer.eod] += 1
token_ids[token_ids == tokenizer.eod] %= args.padded_vocab_size
# get a modified version of the first batch, we change a specific index
changed_index = randint(0, args.seq_length - 2)
token_ids_changed = token_ids.clone()
# We increment the token_id by one for that index in order to artificially change the sequence.
token_ids_changed[:, changed_index] = \
(token_ids_changed[:, changed_index] + 1) % args.padded_vocab_size
output = model.eval_batch(iter_out_of_one({"text": token_ids}), compute_loss=False)
output_changed = model.eval_batch(iter_out_of_one({"text": token_ids_changed}), compute_loss=False)
# All tokens in the past should be unchanged
torch_assert_equal(output[:, :changed_index], output_changed[:, :changed_index])
# All tokens in the future should have changed
self.assertFalse(
torch.any(equal_vectors(output[:, changed_index:], output_changed[:, changed_index:]))
)
def test_prefix_lm_reset_attention_mask(self):
"""
Test prefix invariances when `reset_attention_mask=True`:
- Past target tokens don't depend on future target tokens.
- Target tokens depend on input tokens.
- Input tokens depend on all other input tokens, but never target tokens.
"""
command_args = get_default_args(self.test_file_dir_str)
command_args["--reset-attention-mask"] = ""
command_args["--loss-on-targets-only"] = ""
with patch('sys.argv', flatten_arguments(command_args)):
with mockenv_context(**self.dist_env_1_gpu):
deepspeed.init_distributed()
initialize_megatron()
args = get_args()
tokenizer = get_tokenizer()
model, _, _ = setup_model_and_optimizer(pretrain_prefix_lm.model_provider)
model = model[0]
model._config.train_micro_batch_size_per_gpu = args.micro_batch_size
model.set_train_batch_size(args.micro_batch_size)
# we disable batch_fn and preprocess the batch manually
model.set_batch_fn(None)
token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))
# eod is a special token; this also guarantees that the whole row is considered as a single document.
token_ids[token_ids == tokenizer.eod] += 1
token_ids[token_ids == tokenizer.eod] %= args.padded_vocab_size
# process batch to have non empty prefix
input_batch, (labels, loss_mask), prefix_indices = pretrain_prefix_lm.get_batch_pipe({"text": token_ids})
for batch_id in range(len(prefix_indices)):
for id in prefix_indices[batch_id]:
self.assertTrue(loss_mask[batch_id, id] == 1)
self.assertTrue(id > 0)
# Make sure that the last prefix token predicts the first token.
self.assertTrue(loss_mask[batch_id, id -1] == 1)
output = model.eval_batch(iter_out_of_one((input_batch, (labels, loss_mask), prefix_indices)), compute_loss=False)
## --------------- CHANGE A TARGET TOKEN ---------------------------
# get a modified version of the first batch
# guaranteed to exist as each row has at least one partial document
changed_target_index = prefix_indices[0][0]
token_ids_changed_target = input_batch[0].clone()
# We increment the token id on the changed index.
token_ids_changed_target[0, changed_target_index] = \
(token_ids_changed_target[0, changed_target_index] + 1) % args.padded_vocab_size
# make sure we're not changing a token to eod as it's a special token
token_ids_changed_target[token_ids_changed_target == tokenizer.eod] += 1
token_ids_changed_target[token_ids_changed_target == tokenizer.eod] %= args.padded_vocab_size
# Test change
output_changed_target = model.eval_batch(iter_out_of_one(((token_ids_changed_target, *input_batch[1:]), (labels, loss_mask), prefix_indices)), compute_loss=False)
# All tokens in the past should be unchanged
torch_assert_equal(output[0, :changed_target_index], output_changed_target[0, :changed_target_index])
# All tokens in the future should have changed
self.assertFalse(
torch.any(
equal_vectors(output[0, changed_target_index:], output_changed_target[0, changed_target_index:])
)
)
# Rows that were not modified should not change either
torch_assert_equal(output[1, :], output_changed_target[1, :])
## --------------- CHANGE AN INPUT TOKEN ---------------------------
# Let's change the last prefix token and make sure that the first token changes
# guaranteed to be positive as we avoid pathological case previously
last_prefix_index = prefix_indices[0][0] - 1
token_ids_changed_input = input_batch[0].clone()
# We increment the token id on the changed index.
token_ids_changed_input[0, last_prefix_index] = \
(token_ids_changed_input[0, last_prefix_index] + 1) % args.padded_vocab_size
# make sure we're not changing a token to eod as it's a special token
token_ids_changed_input[token_ids_changed_input == tokenizer.eod] += 1
token_ids_changed_input[token_ids_changed_input == tokenizer.eod] %= args.padded_vocab_size
output_changed_input = model.eval_batch(iter_out_of_one(((token_ids_changed_input, *input_batch[1:]), (labels, loss_mask), prefix_indices)), compute_loss=False)
# All tokens should be changed
self.assertFalse(
torch.any(
equal_vectors(output[0, :], output_changed_input[0, :])
)
)
# Rows that were not modified should not change either
torch_assert_equal(output[1, :], output_changed_input[1, :])
def test_prefix_lm_wo_reset_attention_mask(self):
"""
Test prefix invariances when `reset_attention_mask=False`:
- Past target tokens don't depend on future target tokens.
- Target tokens depend on input tokens.
- Input tokens depend on all other input tokens, but never target tokens.
"""
command_args = get_default_args(self.test_file_dir_str)
command_args["--loss-on-targets-only"] = ""
with patch('sys.argv', flatten_arguments(command_args)):
with mockenv_context(**self.dist_env_1_gpu):
deepspeed.init_distributed()
initialize_megatron()
args = get_args()
model, _, _ = setup_model_and_optimizer(pretrain_prefix_lm.model_provider)
model = model[0]
model._config.train_micro_batch_size_per_gpu = args.micro_batch_size
model.set_train_batch_size(args.micro_batch_size)
# we disable batch_fn and preprocess the batch manually
model.set_batch_fn(None)
token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))
input_batch, (labels, loss_mask), prefix_indices = pretrain_prefix_lm.get_batch_pipe({"text": token_ids})
for batch_id in range(len(prefix_indices)):
id = prefix_indices[batch_id]
self.assertTrue(loss_mask[batch_id, id] == 1)
self.assertTrue(id > 0)
# Make sure that the last prefix token predicts the first token.
self.assertTrue(loss_mask[batch_id, id -1] == 1)
model.eval_batch(iter_out_of_one((input_batch, (labels, loss_mask), prefix_indices)), compute_loss=False)
#TODO: Check all invariants
def test_gpt_rotary_embeddings(self):
"""Test rotary embeddings"""
command_args = get_default_args(self.test_file_dir_str)
del command_args["--max-position-embeddings"]
command_args["--position-embedding-type"] = "rotary"
with patch('sys.argv', flatten_arguments(command_args)):
with mockenv_context(**self.dist_env_1_gpu):
deepspeed.init_distributed()
initialize_megatron()
args = get_args()
tokenizer = get_tokenizer()
model, _, _ = setup_model_and_optimizer(pretrain_gpt.model_provider)
model = model[0]
model._config.train_micro_batch_size_per_gpu = args.micro_batch_size
model.set_train_batch_size(args.micro_batch_size)
token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))
# eod is a special token
token_ids[token_ids == tokenizer.eod] += 1
token_ids[token_ids == tokenizer.eod] %= args.padded_vocab_size
model.eval_batch(iter_out_of_one({"text": token_ids}), compute_loss=False)
#TODO: Check all invariants
@require_torch_bf16
def test_fused_layer_norm(self):
command_args = get_default_args(self.test_file_dir_str)
# Condition to use custom cuda kernel
command_args["--bf16"] = ""
del command_args["--fp16"]
with patch('sys.argv', flatten_arguments(command_args)):
with mockenv_context(**self.dist_env_1_gpu):
initialize_megatron()
args = get_args()
dummy_input = torch.randn(args.micro_batch_size, args.seq_length, args.hidden_size, device="cuda", dtype=torch.bfloat16)
normalized_shape = (args.hidden_size,)
epsilon = 1e-5
mfln = MixedFusedLayerNorm(normalized_shape, eps=epsilon)
self.assertTrue(mfln.use_meg_ds_fused_layer_norm, "Expected model to use Megatron-DeepSpeed custom cuda kernel for LayerNorm.")
self.assertTrue(args.bf16, "Test has to be done in half precision.")
# We set the weight manually to simulate a state that differs from the initialisation
weight = torch.randn(args.hidden_size, device="cuda", dtype=torch.bfloat16)
bias = torch.randn(args.hidden_size, device="cuda", dtype=torch.bfloat16)
mfln.weight = nn.Parameter(weight)
mfln.bias = nn.Parameter(bias)
mfln_output = mfln(dummy_input)
# We check that our layernorm matches pytorch 1.11 onwards
if version.parse(torch.__version__) >= version.parse("1.11.0"):
torch_layer_norm_output = F.layer_norm(dummy_input, normalized_shape, weight, bias, eps=epsilon)
else:
# In this case we can check that it basically corresponds to the fp32 version
torch_layer_norm_output = F.layer_norm(dummy_input.float(), normalized_shape, weight.float(), bias.float(), eps=epsilon).to(torch.bfloat16)
torch_assert_equal(mfln_output, torch_layer_norm_output)
@parameterized.expand([(attn_mask_type,) for attn_mask_type in AttnMaskType])
def test_fused_masked_softmax(self, attn_mask_type: AttnMaskType):
command_args = get_default_args(self.test_file_dir_str)
with patch('sys.argv', flatten_arguments(command_args)):
with mockenv_context(**self.dist_env_1_gpu):
initialize_megatron()
args = get_args()
dummy_input = torch.randn(
args.micro_batch_size,
args.num_attention_heads,
args.seq_length,
args.seq_length,
device="cuda",
dtype=args.params_dtype
)
if attn_mask_type == AttnMaskType.causal:
dummy_attention_mask = None
else:
dummy_attention_mask = torch.randn(
args.micro_batch_size,
1, # `args.num_attention_heads` not implemented in our cuda kernel
args.seq_length,
args.seq_length,
device="cuda",
dtype=args.params_dtype
) < 0
scale = torch.rand(())
fused_scaled_softmax = FusedScaleMaskSoftmax(
input_in_fp16=args.params_dtype == torch.float16,
input_in_bf16=args.params_dtype == torch.bfloat16,
attn_mask_type=attn_mask_type,
scaled_masked_softmax_fusion=True,
mask_func=attention_mask_func,
softmax_in_fp32=True,
scale=scale,
)
unfused_scaled_softmax = FusedScaleMaskSoftmax(
input_in_fp16=args.params_dtype == torch.float16,
input_in_bf16=args.params_dtype == torch.bfloat16,
attn_mask_type=attn_mask_type,
scaled_masked_softmax_fusion=False,
mask_func=attention_mask_func,
softmax_in_fp32=True,
scale=scale,
)
self.assertTrue(fused_scaled_softmax.is_kernel_available(dummy_attention_mask, *dummy_input.size()))
fused_output = fused_scaled_softmax(dummy_input, dummy_attention_mask)
self.assertFalse(unfused_scaled_softmax.is_kernel_available(dummy_attention_mask, *dummy_input.size()))
unfused_output = unfused_scaled_softmax(dummy_input, dummy_attention_mask)
# Test that the nonzeros are the same with the mask
for i in range(args.num_attention_heads):
if dummy_attention_mask is None:
# Make sure it's causal: values in the lower triangle should be non-zero.
non_zero_values = torch.tril(torch.ones_like(fused_output[:, i]))
torch_assert_equal(torch.nonzero(fused_output[:, i]), torch.nonzero(non_zero_values))
else:
torch_assert_equal(torch.nonzero(fused_output[:, i]), torch.nonzero(~dummy_attention_mask[:, 0]))
# Cuda kernel produces slightly different results
torch_assert_close(fused_output, unfused_output)
def test_non_causal_decoder_model_with_packed_input_passed_with_attention_mask_is_not_causal_across_segments(self):
command_args = get_default_args(self.test_file_dir_str)
command_args["--position-embedding-type"] = "alibi"
with patch('sys.argv', flatten_arguments(command_args)):
with mockenv_context(**self.dist_env_1_gpu):
deepspeed.init_distributed()
initialize_megatron()
args = get_args()
tokenizer = get_tokenizer()
# Hack: `gpt2` doesn't have a padding token, so we override that value.
tokenizer.tokenizer.pad_token_id = tokenizer.tokenizer.eos_token_id
data = get_dummy_mtf_decoder_packed_data(
micro_batch_size=args.micro_batch_size,
seq_length=args.seq_length,
vocab_size=args.padded_vocab_size,
special_tokens_ids={tokenizer.pad}
)
model, _, _ = setup_model_and_optimizer(finetune_t0_non_causal_decoder.model_provider)
model = model[0]
model._config.train_micro_batch_size_per_gpu = args.micro_batch_size
model.set_train_batch_size(args.micro_batch_size)
output = model.eval_batch(iter_out_of_one(data), compute_loss=False)
## --------------- CHANGE A TARGET TOKEN ---------------------------
# change the first token in the first batch to a random value
change_batch_id = 0
change_token_id = 0
token_ids_changed = data["decoder_token_ids"].clone()
# We increment the token id on the changed index.
token_ids_changed[change_batch_id, change_token_id] = (token_ids_changed[change_batch_id, change_token_id] + 1) % args.padded_vocab_size
while token_ids_changed[change_batch_id, change_token_id] in {tokenizer.eod, tokenizer.pad}:
token_ids_changed[change_batch_id, change_token_id] = (token_ids_changed[change_batch_id, change_token_id] + 1) % args.padded_vocab_size
# Test change
output_changed_target = model.eval_batch(iter_out_of_one({**data, "decoder_token_ids": token_ids_changed}), compute_loss=False)
first_segment_first_batch_id_end = (torch.nonzero(data["decoder_segment_ids"][change_batch_id, 1:] - data["decoder_segment_ids"][change_batch_id, :-1]) + 1)[0]
# Check that values changed in segment 1 of batch_id 0
self.assertFalse(torch.any(
equal_vectors(
output[change_batch_id, change_token_id:first_segment_first_batch_id_end],
output_changed_target[change_batch_id, change_token_id:first_segment_first_batch_id_end]
)
))
# Check that values did not change in other segments of batch_id 0
torch_assert_equal(
output[change_batch_id, first_segment_first_batch_id_end:],
output_changed_target[change_batch_id, first_segment_first_batch_id_end:]
)
# Check that values did not change in other segments in other batches
non_change_ids = torch.arange(output.shape[0]) != change_batch_id
torch_assert_equal(output[non_change_ids], output_changed_target[non_change_ids])
## --------------- CHANGE A TARGET TOKEN ---------------------------
# change the last token in the first batch to a pad
token_ids_changed_pad = data["decoder_token_ids"].clone()
segment_ids_changed_pad = data["decoder_segment_ids"].clone()
# We replace the last token with the pad token and mark its segment id as padding.
token_ids_changed_pad[change_batch_id, -1] = tokenizer.pad
segment_ids_changed_pad[change_batch_id, -1] = 0
# Test model handles padding correctly
output_changed_pad = model.eval_batch(iter_out_of_one({**data, "decoder_token_ids": token_ids_changed_pad, "decoder_segment_ids": segment_ids_changed_pad}), compute_loss=False)
self.assertFalse(torch.any(torch.isnan(output_changed_pad)))
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import filecmp
import io
import json
import re
import os
import unittest
import functools
from pathlib import Path
from megatron.testing_utils import (
TestCasePlus,
execute_subprocess_async,
set_seed
)
from datasets import load_dataset
set_seed(42)
def write_jsonl(path, lines_num=1000, line_length=1024):
def get_text_line(line_length):
# XXX: fix to generate line_length
return "It's a wonderful world. I'm just walking on air. Talk of heaven on earth. I've got more than my share. Haven't got a care. Happy all day through. It's a wonderful world. Loving wonderful you!"
with io.open(path, "w", encoding="utf-8") as f:
for i in range(lines_num):
rec = dict(text=get_text_line(line_length))
x = json.dumps(rec, indent=0, ensure_ascii=False)
x = re.sub(r'\n', ' ', x, 0, re.M)
f.write(x + "\n")
@functools.lru_cache()
def download_hf_dataset(dsetname):
return load_dataset(dsetname)
class MegDSTestPreprocessing(TestCasePlus):
""" """
def setUp(self):
super().setUp()
def test_preprocess_data(self):
src_dir = self.src_dir
data_dir = f"{self.data_dir}/gpt2"
output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False)
# autogenerate "input.jsonl"
input_path = f"{output_dir}/input.jsonl"
write_jsonl(input_path)
output_prefix =f"{output_dir}/test-ds"
cmd = f"""
python {src_dir}/tools/preprocess_data.py
--input {input_path}
--output-prefix {output_prefix}
--dataset-impl mmap
--tokenizer-type GPT2BPETokenizer
--merge-file {data_dir}/gpt2-tiny-merges.txt
--vocab {data_dir}/gpt2-tiny-vocab.json
--append-eod
--workers 2
""".split()
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
execute_subprocess_async(cmd, env=self.get_env())
for ext in ["bin", "idx"]:
tgt_path = f"{output_prefix}_text_document.{ext}"
self.assertTrue(Path(tgt_path).exists(), )
def compare_meg_data_files(self, tgt, ref):
for ext in ["bin", "idx"]:
tgt_path = f"{tgt}.{ext}"
ref_path = f"{ref}.{ext}"
self.assertTrue(Path(tgt_path).exists(), )
self.assertTrue(filecmp.cmp(tgt_path, ref_path, shallow=False))
def preprocess_partitioned_dataset(self, output_dir, dsetname, splitname, linelimit, numparts):
"""Preprocess a dataset as a whole and in shards to prepare environment for merge test.
Load specified HF dataset using given split and record limit.
Write the dataset to a jsonl file and preprocess.
Also split dataset into numparts contiguous shards, write each shard to its own jsonl, and preprocess each.
Return path to the full dataset and a list of paths for each shard."""
src_dir = self.src_dir
data_dir = f"{self.data_dir}/gpt2"
# preprocess_data_dist requires one to have already downloaded the input HF dataset.
# We do that by running this script before the test.
dset = download_hf_dataset(dsetname)[splitname]
# limit the test to use the first linelimit entries to be faster
dset = dset.select(range(linelimit))
# write jsonl file of full dataset
json_ds = f"{output_dir}/ds-full.jsonl"
dset.to_json(json_ds)
# process full jsonl into indexed dataset file
ds_full = f"{output_dir}/ds-full"
cmd = f"""
python {src_dir}/tools/preprocess_data.py
--input {json_ds}
--output-prefix {ds_full}
--dataset-impl mmap
--tokenizer-type GPT2BPETokenizer
--merge-file {data_dir}/gpt2-tiny-merges.txt
--vocab {data_dir}/gpt2-tiny-vocab.json
--append-eod
""".split()
ds_full += '_text_document'
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
execute_subprocess_async(cmd, env=self.get_env())
# write each part to its own json file
ds_parts = []
for i in range(numparts):
json_part = f"{output_dir}/ds-part-{i}.jsonl"
dset.shard(numparts, i, contiguous=True).to_json(json_part)
ds_part = f"{output_dir}/ds-part-{i}"
ds_parts.append(ds_part + '_text_document')
cmd = f"""
python {src_dir}/tools/preprocess_data.py
--input {json_part}
--output-prefix {ds_part}
--dataset-impl mmap
--tokenizer-type GPT2BPETokenizer
--merge-file {data_dir}/gpt2-tiny-merges.txt
--vocab {data_dir}/gpt2-tiny-vocab.json
--append-eod
""".split()
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
execute_subprocess_async(cmd, env=self.get_env())
return ds_full, ds_parts
def test_merge_serial(self):
"""Check that serial merge of partial dataset files produces the same file as the full dataset."""
src_dir = self.src_dir
output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False)
# process full dataset, and process the full dataset as 3 contiguous chunks
ds_full, ds_parts = self.preprocess_partitioned_dataset(output_dir, 'stas/openwebtext-10k', 'train', 100, 3)
# merge the part files into a single indexed dataset
ds_merged = f"{output_dir}/ds-merged"
cmd = f"""
python {src_dir}/tools/merge_preprocessed_data.py
--datasets {" ".join(ds_parts)}
--output-prefix {ds_merged}
""".split()
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
execute_subprocess_async(cmd, env=self.get_env())
# the full dataset and the merged dataset should be identical
self.compare_meg_data_files(ds_full, ds_merged)
def test_merge_distributed(self):
"""Check that serial merge of partial dataset files produces the same file as the full dataset."""
src_dir = self.src_dir
output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False)
# process full dataset, and process the full dataset as 3 contiguous chunks
ds_full, ds_parts = self.preprocess_partitioned_dataset(output_dir, 'stas/openwebtext-10k', 'train', 100, 3)
# merge the part files into a single indexed dataset
ds_merged = f"{output_dir}/ds-merged"
cmd = f"""
python -m torch.distributed.launch --nproc_per_node 6 {src_dir}/tools/merge_preprocessed_data.py
--merge distributed
--datasets {" ".join(ds_parts)}
--output-prefix {ds_merged}
--torch-backend gloo
""".split()
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
execute_subprocess_async(cmd, env=self.get_env())
# the full dataset and the merged dataset should be identical
self.compare_meg_data_files(ds_full, ds_merged)
def test_process_data_microsoft(self):
"""We want to be stable to Microsoft version."""
src_dir = self.src_dir
data_dir = f"{self.data_dir}/gpt2"
output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False)
input_path = f"{self.tests_dir}/data/gpt2/openwebtext-1000.jsonl"
output_prefix = f"{output_dir}/test-ds-meg-gpt2-openwebtext"
cmd = f"""
python {src_dir}/tools/preprocess_data.py
--input {input_path}
--output-prefix {output_prefix}
--dataset-impl mmap
--tokenizer-type GPT2BPETokenizer
--merge-file {data_dir}/gpt2-tiny-merges.txt
--vocab {data_dir}/gpt2-tiny-vocab.json
--append-eod
--workers 2
""".split()
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
execute_subprocess_async(cmd, env=self.get_env())
self.compare_meg_data_files(f"{output_prefix}_text_document", f"{data_dir}/meg-gpt2-openwebtext_text_document")
def test_process_data_dist_microsoft(self):
"""We want to be stable to Microsoft version."""
src_dir = self.src_dir
data_dir = f"{self.data_dir}/gpt2"
output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False)
output_prefix = f"{output_dir}/test-ds-meg-gpt2-openwebtext_1k"
# preprocess_data_dist requires one to have already downloaded the input HF dataset.
# We do that by running this script before the test.
dsetname = 'stas/openwebtext-10k'
download_hf_dataset(dsetname)
cmd = f"""
python -m torch.distributed.launch --nproc_per_node 2 {src_dir}/tools/preprocess_data_dist.py
--input {dsetname}
--count 1000
--output-prefix {output_prefix}
--dataset-impl mmap
--tokenizer-type GPT2BPETokenizer
--merge-file {data_dir}/gpt2-tiny-merges.txt
--vocab {data_dir}/gpt2-tiny-vocab.json
--append-eod
""".split()
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
execute_subprocess_async(cmd, env=self.get_env())
self.compare_meg_data_files(f"{output_prefix}_text_document", f"{data_dir}/meg-gpt2-openwebtext_text_document")
def test_process_data_dist_serial_microsoft(self):
"""We want to be stable to Microsoft version."""
src_dir = self.src_dir
data_dir = f"{self.data_dir}/gpt2"
output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False)
output_prefix = f"{output_dir}/test-ds-meg-gpt2-openwebtext_1k"
# preprocess_data_dist requires one to have already downloaded the input HF dataset.
# We do that by running this script before the test.
dsetname = 'stas/openwebtext-10k'
download_hf_dataset(dsetname)
cmd = f"""
python -m torch.distributed.launch --nproc_per_node 2 {src_dir}/tools/preprocess_data_dist.py
--input {dsetname}
--count 1000
--merge serial
--output-prefix {output_prefix}
--dataset-impl mmap
--tokenizer-type GPT2BPETokenizer
--merge-file {data_dir}/gpt2-tiny-merges.txt
--vocab {data_dir}/gpt2-tiny-vocab.json
--append-eod
""".split()
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
execute_subprocess_async(cmd, env=self.get_env())
self.compare_meg_data_files(f"{output_prefix}_text_document", f"{data_dir}/meg-gpt2-openwebtext_text_document")
import unittest
from random import randint
from unittest.mock import patch
import deepspeed
import torch
import logging
import numpy as np
import pytest
from megatron import initialize_megatron, get_args, get_tokenizer, global_vars
from megatron.testing_utils import TestCasePlus, mockenv_context, flatten_arguments, require_deepspeed, require_torch_multi_gpu
from megatron.training import setup_model_and_optimizer
from megatron.mpu.mappings import gather_from_tensor_model_parallel_region
from pretrain_gpt import model_provider as gpt_model_provider, get_batch_pipe as get_gpt_batch_pipe
from pretrain_prefix_lm import model_provider as prefix_lm_model_provider, get_batch_pipe as get_prefix_lm_batch_pipe
import multiprocessing as mp
from multiprocessing import Pool
from megatron.checkpointing import save_checkpoint
from megatron.utils import get_ltor_masks_and_position_ids
@require_deepspeed
@require_torch_multi_gpu
class MegDSTestTP(TestCasePlus):
def get_default_args(self):
"""return a dictionary with key as argument name and value as additional arguments"""
data_dir = f"{self.data_dir}/gpt2"
return {
# GPT_ARGS
"--num-layers": "2",
"--hidden-size": "128",
"--num-attention-heads": "4",
"--seq-length": "256",
"--max-position-embeddings": "256",
"--micro-batch-size": "4",
"--global-batch-size": "8",
"--lr-decay-iters": "320000",
"--lr-decay-style": "cosine",
"--lr": "0.00015",
"--min-lr": "1.0e-5",
"--train-iters": "5000",
"--tokenizer-type": "GPT2BPETokenizer",
"--merge-file": f"{data_dir}/gpt2-tiny-merges.txt",
"--vocab-file": f"{data_dir}/gpt2-tiny-vocab.json",
"--data-impl": "mmap",
"--split": "949,50,1",
"--distributed-backend": "nccl",
"--weight-decay": "1e-2",
"--clip-grad": "1.0",
"--lr-warmup-fraction": ".01",
"--fp16": "",
"--attention-dropout": "0",
"--hidden-dropout": "0",
# OUTPUT_ARGS
"--log-interval": "10",
"--save-interval": "500",
"--eval-interval": "100",
"--eval-iters": "10",
"--checkpoint-activations": "",
#ds args
"--deepspeed": "",
"--deepspeed_config":f"{self.test_file_dir_str}/ds_config.json",
"--zero-stage": "1",
"--deepspeed-activation-checkpointing": ""
# DATA_ARGS
}
def setUp(self) -> None:
super().setUp()
# We reset all global variables
global_vars._GLOBAL_ARGS = None
global_vars._GLOBAL_NUM_MICROBATCHES_CALCULATOR = None
global_vars._GLOBAL_TOKENIZER = None
global_vars._GLOBAL_TENSORBOARD_WRITER = None
global_vars._GLOBAL_ADLR_AUTORESUME = None
global_vars._GLOBAL_TIMERS = None
def infer_model(args):
tp_index, tp_size, command_args, token_ids, save, load = args
dist_env = dict(
MASTER_ADDR="localhost", MASTER_PORT="9991", RANK=str(tp_index), LOCAL_RANK=str(tp_index), WORLD_SIZE=str(tp_size)
)
logging.getLogger().critical("Process: starting")
#Hack
import megatron.initialize as init
init.git_ds_info = lambda: None
with patch('sys.argv', flatten_arguments(command_args)):
with mockenv_context(**dist_env):
def create_model_inputs(tokens):
args = get_args()
attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
tokens,
tokenizer.eod,
args.reset_position_ids,
args.reset_attention_mask,
args.eod_mask_loss,
prefix_indices=None,
loss_on_targets_only=False)
return (tokens, position_ids, attention_mask), (tokens, loss_mask)
deepspeed.init_distributed()
initialize_megatron()
args = get_args()
tokenizer = get_tokenizer()
model, _, _ = setup_model_and_optimizer(gpt_model_provider)
model = model[0]
if load is not None:
# Hack (same as in eval_harness/evaluate.py)
# Loading pipelined models in deepspeed with a different TP than they were originally trained on fails
# due to a sanity check that makes sure all state_dicts that we merge contain attention layers.
# This, however, is not true for pipelining, where we also merge the state_dict for the embeddings,
# which does not contain these attention-specific keys.
#
# Deepspeed does however manage to load the model if we just turn off this sanity check.
deepspeed.runtime.state_dict_factory.MegatronSDLoader.sanity_check = lambda self, ckpt_file_name: None
zero_enabled = model._config.zero_enabled
model._config.zero_enabled = False
_, _ = model.load_checkpoint(load, load_optimizer_states=False, load_lr_scheduler_states=False, load_module_only=True)
model._config.zero_enabled = zero_enabled
if token_ids is None:
token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))
# eod is a special token
token_ids[token_ids == tokenizer.eod] += 1
token_ids[token_ids == tokenizer.eod] %= args.padded_vocab_size
else:
token_ids = torch.tensor(token_ids)
model.micro_batches = 1
model.set_batch_fn(create_model_inputs)
# process batch
input_batch = get_gpt_batch_pipe({"text": token_ids})[0]
# get a modified version of the first batch, we change a specific index
changed_index = randint(0, args.seq_length - 2)
input_token_ids_changed = input_batch[0].clone()
# We increment the token_id by one for that index in order to artificially change the sequence.
input_token_ids_changed[:, changed_index] = \
(input_token_ids_changed[:,changed_index] + 1) % args.padded_vocab_size
output = model.eval_batch(iter([token_ids]), compute_loss = False, reduce_output = None)[0]
output = gather_from_tensor_model_parallel_region(output)
if save is not None:
args.save = save
save_checkpoint(0, [model], None, None)
return (output[0].detach().cpu().numpy(), token_ids.detach().cpu().numpy())
def test_alibi_tp(self):
mp.set_start_method('spawn', force=True)
cp_dir = self.get_auto_remove_tmp_dir()
command_args = self.get_default_args()
command_args["--pad-vocab-size-to"] = "5120" # This is equal to 128 * 40 which is above the len of gp2-tiny vocabulary
command_args["--position-embedding-type"] = "alibi"
command_args["--tensor-model-parallel-size"] = "1"
pool = Pool(1)
result = pool.map(MegDSTestTP.infer_model, [((0, 1, command_args, None, cp_dir, None))])
pool.close()
pool.join()
output, tokens = result[0]
logging.getLogger().info("First done!")
command_args["--tensor-model-parallel-size"] = "2"
pool = Pool(2)
result = pool.map(MegDSTestTP.infer_model, [((0, 2, command_args, tokens, None, cp_dir)), ((1, 2, command_args, tokens, None, cp_dir))])
pool.close()
pool.join()
output2, tokens = result[0]
logging.getLogger().critical(output-output2)
self.assertTrue(np.allclose(output,output2, atol=5e-3, rtol=0), "Different results when running with TP=1 and TP=2")
def test_embedding_matrix_tp(self):
mp.set_start_method('spawn', force=True)
cp_dir = self.get_auto_remove_tmp_dir()
command_args = self.get_default_args()
command_args["--pad-vocab-size-to"] = "5120" # This is equal to 128 * 40 which is above the len of gp2-tiny vocabulary
command_args["--seq-length"] = "4"
command_args["--micro-batch-size"] = "2"
tokens = [[5119, 0, 1, 5100],[0, 1, 5111, 5101]]
command_args["--tensor-model-parallel-size"] = "1"
pool = Pool(1)
# tp_index, tp_size, command_args, token_ids, save, load
result = pool.map(MegDSTestTP.infer_model, [((0, 1, command_args, tokens, cp_dir, None))])
pool.close()
pool.join()
output, _ = result[0]
logging.getLogger().info("First done!")
command_args["--tensor-model-parallel-size"] = "2"
pool = Pool(2)
result = pool.map(MegDSTestTP.infer_model, [((0, 2, command_args, tokens, None, cp_dir)), ((1, 2, command_args, tokens, None, cp_dir))])
pool.close()
pool.join()
output2, _ = result[0]
logging.getLogger().critical(output-output2)
self.assertTrue(np.allclose(output,output2, atol=5e-3, rtol=0), "Different results when running with TP=1 and TP=2")
def test_embedding_matrix_tp_with_invalid_tokens_ids(self):
mp.set_start_method('spawn', force=True)
command_args = self.get_default_args()
command_args["--pad-vocab-size-to"] = "5120" # This is equal to 128 * 40 which is above the len of gp2-tiny vocabulary
command_args["--seq-length"] = "4"
command_args["--micro-batch-size"] = "2"
tokens = [[5120, 0, 1, 2],[0, 1, 3, 4]]
command_args["--tensor-model-parallel-size"] = "1"
pool = Pool(1)
with pytest.raises(Exception) as exc_info:
_ = pool.map(MegDSTestTP.infer_model, [((0, 1, command_args, tokens, None, None))])
pool.close()
pool.join()
self.assertIn("There is an input id in the input that is greater than the highest possible input id" , str(exc_info.value))
logging.getLogger().info("First done!")
command_args["--tensor-model-parallel-size"] = "2"
pool = Pool(2)
with pytest.raises(Exception) as exc_info:
_ = pool.map(MegDSTestTP.infer_model, [((0, 2, command_args, tokens, None, None)), ((1, 2, command_args, tokens, None, None))])
pool.close()
pool.join()
self.assertIn("There is an input id in the input that is greater than the highest possible input id", str(exc_info.value))
def test_tokenizer_vocab_size_multiple_of_tp_size(self):
mp.set_start_method('spawn', force=True)
command_args = self.get_default_args()
command_args["--pad-vocab-size-to"] = "5121" # This is equal to 128 * 40 + 1 which is above the len of gp2-tiny vocabulary
command_args["--micro-batch-size"] = "4"
command_args["--tensor-model-parallel-size"] = "2"
command_args["--make-vocab-size-divisible-by"] = "1"
pool = Pool(2)
with pytest.raises(Exception) as exc_info:
_ = pool.map(MegDSTestTP.infer_model, [((0, 2, command_args, None, None, None)), ((1, 2, command_args, None, None, None))])
pool.close()
pool.join()
self.assertEqual(str(exc_info.value), "5121 is not divisible by 2")
def test_tokenizer_raise_error_make_vocab_size_divisible_by(self):
mp.set_start_method('spawn', force=True)
command_args = self.get_default_args()
command_args["--pad-vocab-size-to"] = "5121" # This is equal to 128 * 40 + 1 which is above the len of gp2-tiny vocabulary
command_args["--micro-batch-size"] = "4"
pool = Pool(2)
with pytest.raises(Exception) as exc_info:
_ = pool.map(MegDSTestTP.infer_model, [((0, 2, command_args, None, None, None)), ((1, 2, command_args, None, None, None))])
pool.close()
pool.join()
self.assertEqual(str(exc_info.value), "5121 is not divisible by 128")
if __name__ == '__main__':
unittest.main()
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import dataclasses
import io
import json
import os
import glob
import re
import shutil
import unittest
from pathlib import Path
from parameterized import parameterized
from megatron.testing_utils import (
CaptureStdout,
CaptureStd,
TestCasePlus,
execute_subprocess_async,
get_gpu_count,
require_bnb_non_decorator,
require_deepspeed,
require_torch_gpu,
set_seed
)
set_seed(42)
def get_launcher(num_gpus):
# 1. explicitly set --num_nodes=1 just in case these tests end up being run on a multi-node setup
# - it won't be able to handle that
return f"deepspeed --num_nodes 1 --num_gpus {num_gpus}".split()
def get_3d_dimensions():
num_gpus = get_gpu_count()
# with fewer gpus the preference is first to do PP>1, then TP>1, then DP>1
if num_gpus >= 8:
dp_size = 2
pp_size = 2
tp_size = 2
elif num_gpus >= 4:
dp_size = 1
pp_size = 2
tp_size = 2
elif num_gpus >= 2:
dp_size = 1
pp_size = 2
tp_size = 1
else:
dp_size = 1
pp_size = 1
tp_size = 1
return pp_size, tp_size, dp_size
@require_deepspeed
@require_torch_gpu
class MegDSTestTraining(TestCasePlus):
""" """
def setUp(self):
super().setUp()
# at times megatron fails to build kernels and doesn't remove the lock file, which makes
# subsequent runs hang - so make sure there is no lock when starting the testing
meg_lock_file_path = self.repo_root_dir_str + "/megatron/fused_kernels/build/lock"
if os.path.exists(meg_lock_file_path):
os.unlink(meg_lock_file_path)
def copy_data_to_temp(self, root_dir, prefix):
"""copy data to temp, and return paths to temp version"""
src_path = os.path.join(root_dir, prefix)
src_dirname = os.path.dirname(src_path)
tmp_dir = self.get_auto_remove_tmp_dir()
dest_path = os.path.join(tmp_dir, prefix)
dest_dirname = os.path.dirname(dest_path)
os.makedirs(dest_dirname, exist_ok=True)
for folder in os.listdir(src_dirname):
src_folder = os.path.join(src_dirname, folder)
dest_folder = os.path.join(dest_dirname, folder)
if src_folder.startswith(src_path):
if os.path.isdir(src_folder):
shutil.copytree(src_folder, dest_folder)
else:
shutil.copy2(src_folder, dest_folder)
return dest_path
def get_variation_config(self, variation, output_dir, n_samples=None):
data_dir = self.copy_data_to_temp(self.data_dir,"gpt2")
pp_size, tp_size, dp_size = get_3d_dimensions()
num_gpus = pp_size * tp_size * dp_size
print(f"Using {num_gpus} GPUs")
if variation == "bnb":
# we want to make sure at least tp=2 is used, so we swap tp and pp
pp_size, tp_size = tp_size, pp_size
if n_samples is None:
n_samples = 300 # about 56 iterations
exit_interval = 20 # some samples in the first half and then some more in the 2nd half after resume
seq_len = 128
# common/shared configs
ds_args = f"""
--deepspeed
--deepspeed_config {self.test_file_dir_str}/ds_config.json
--zero-stage 1
--deepspeed-activation-checkpointing
""".split()
args = f"""
--tensor-model-parallel-size {tp_size}
--pipeline-model-parallel-size {pp_size}
--distributed-backend nccl
--log-interval 1
--save-interval 10
--eval-interval 10
--eval-iters 5
--checkpoint-activations
--partition-activations
--exit-interval {exit_interval}
--merge-file {data_dir}/gpt2-tiny-merges.txt
--vocab-file {data_dir}/gpt2-tiny-vocab.json
--save {output_dir}/checkpoints
--load {output_dir}/checkpoints
--data-path {data_dir}/meg-gpt2-openwebtext_text_document
--tensorboard-dir {output_dir}/tensorboard
--tensorboard-queue-size 5
--log-timers-to-tensorboard
--log-batch-size-to-tensorboard
--log-validation-ppl-to-tensorboard
--num-layers 2
--hidden-size 64
--num-attention-heads 2
--seq-length {seq_len}
--max-position-embeddings 1024
--micro-batch-size 1
--global-batch-size 16
--optimizer adam
--adam-beta1 0.9
--adam-beta2 0.95
--adam-eps 1e-8
--lr 1e-4
--lr-warmup-samples 5
--clip-grad 1.0
--weight-decay 1e-1
--embed-layernorm
--sync-tp-duplicated-parameters
--fp16
--log-level debug
--log-level-replica info
""".split()
if variation == "base":
new_args = f"""
--rampup-batch-size 2 2 {n_samples}
--train-samples {n_samples}
--lr-decay-samples 6
""".split()
new_ds_args = f"""
--deepspeed_config {self.test_file_dir_str}/ds_config.json
""".split()
elif variation == "bnb":
# BitsAndBytes - 8-bit optimizer
new_args = f"""
--rampup-batch-size 2 2 {n_samples}
--train-samples {n_samples}
--lr-decay-samples 6
--use-bnb-optimizer
""".split()
new_ds_args = f"""
--deepspeed_config {self.test_file_dir_str}/ds_config.json
""".split()
elif variation == "cl":
# CurriculumLearning
lr_decay_samples = 6
lr_decay_tokens = lr_decay_samples * seq_len
train_tokens = n_samples * seq_len
# XXX: if changing seq_len from 128, must adjust ds config to:
# curriculum_learning.max_difficulty: $SEQLEN
# XXX: probably we should write the ds config on the fly to keep everything in sync,
# rather than using the pre-saved config
new_args = f"""
--train-samples {n_samples*2}
--train-tokens {train_tokens}
--lr-decay-tokens {lr_decay_tokens}
""".split()
new_ds_args = f"""
--deepspeed_config {self.test_file_dir_str}/ds_config_cl.json
""".split()
elif variation == "glu":
new_args = f"""
--rampup-batch-size 2 2 {n_samples}
--train-samples {n_samples}
--lr-decay-samples 6
--no-bias-gelu-fusion
--glu-activation geglu
""".split()
new_ds_args = f"""
--deepspeed_config {self.test_file_dir_str}/ds_config.json
""".split()
elif variation == "alibi":
new_args = f"""
--rampup-batch-size 2 2 {n_samples}
--train-samples {n_samples}
--lr-decay-samples 6
--position-embedding-type alibi
""".split()
new_ds_args = f"""
--deepspeed_config {self.test_file_dir_str}/ds_config.json
""".split()
else:
raise ValueError(f"Don't know of variation {variation}")
args.extend(new_args)
ds_args.extend(new_ds_args)
return args, ds_args, num_gpus
def test_kill_switch(self):
variation = "base"
src_dir = self.src_dir
output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False)
kill_switch_path = os.path.join(output_dir, "kill-switch-xyz")
args, ds_args, num_gpus = self.get_variation_config(variation, output_dir)
args += f"--kill-switch-path {kill_switch_path}".split()
script = [f"{src_dir}/pretrain_gpt.py"]
launcher = get_launcher(num_gpus)
cmd = launcher + script + args + ds_args
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
# 1. kill switch armed but not triggered
with CaptureStdout() as cs:
execute_subprocess_async(cmd, env=self.get_env())
# test deepspeed is running
self.assertIn("DeepSpeed info", cs.out)
# 2. trigger kill switch
open(kill_switch_path, "w").close()
with CaptureStd() as cs:
execute_subprocess_async(cmd, env=self.get_env())
self.assertIn(f"Detected kill switch at {kill_switch_path}", cs.out)
# test deepspeed wasn't run
self.assertNotIn("DeepSpeed info", cs.out)
@parameterized.expand(["base", "cl", "bnb", "glu", "alibi"])
def test_training_all(self, variation):
# optional runs
if variation == "bnb":
require_bnb_non_decorator()
# all in one test
src_dir = self.src_dir
output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False)
args, ds_args, num_gpus = self.get_variation_config(variation, output_dir)
script = [f"{src_dir}/pretrain_gpt.py"]
launcher = get_launcher(num_gpus)
cmd = launcher + script + args + ds_args
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
# 1. test training from scratch (no checkpoint)
with CaptureStdout() as cs:
execute_subprocess_async(cmd, env=self.get_env())
# test deepspeed is running
self.assertIn("DeepSpeed info", cs.out)
# test reports
self.assertIn("consumed samples", cs.out)
# test there should be no checkpoint this round
self.assertIn(f"Unable to find latest file at {output_dir}/checkpoints/latest", cs.out)
# test checkpoint saving
self.assertIn("successfully saved checkpoint at iteration", cs.out)
# test tensorboard
tensorboard_files = glob.glob(f"{output_dir}/tensorboard/events*")
self.assertEqual(len(tensorboard_files), 1, "tensorboard files")
if variation == "glu":
self.assertIn("Using GLU activation: GELU", cs.out)
if variation == "alibi":
self.assertIn("Using Alibi", cs.out)
# 2. test training from checkpoint: resume
# now do it again, this time resuming from the checkpoint
with CaptureStdout() as cs:
execute_subprocess_async(cmd, env=self.get_env())
# test checkpoint loading
self.assertIn(f"successfully loaded checkpoint from {output_dir}/checkpoints", cs.out)
# test reports
self.assertIn("consumed samples", cs.out)
# test checkpoint saving
self.assertIn("successfully saved checkpoint at iteration", cs.out)
# test tensorboard (1 file from the first run, plus 1 now)
tensorboard_files = glob.glob(f"{output_dir}/tensorboard/events*")
self.assertEqual(len(tensorboard_files), 2, "tensorboard files")
if variation == "glu":
self.assertIn("Using GLU activation: GELU", cs.out)
@parameterized.expand([(True, True), (False, False), (True, False), (False, True)])
def test_training_prefix_lm_all(self, loss_on_targets_only, reweight_loss_based_on_position_frequency):
# all in one test
src_dir = self.src_dir
data_dir = self.copy_data_to_temp(self.data_dir,"gpt2")
output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False)
logs_dir = f"{output_dir}/logs"
Path(logs_dir).mkdir(parents=True, exist_ok=True)
pp_size, tp_size, dp_size = get_3d_dimensions()
num_gpus = pp_size * tp_size * dp_size
n_samples = 200 # about 37 iterations
exit_interval = 20 # some samples in the first half and then some more in the 2nd half after resume
args = f"""
--tensor-model-parallel-size {tp_size}
--pipeline-model-parallel-size {pp_size}
--distributed-backend nccl
--num-layers 2
--hidden-size 64
--num-attention-heads 2
--seq-length 128
--max-position-embeddings 1024
--micro-batch-size 1
--rampup-batch-size 2 2 {n_samples}
--global-batch-size 16
--train-samples {n_samples}
{"--loss-on-targets-only" if loss_on_targets_only else ""}
{"--reweight-loss-based-on-position-frequency" if reweight_loss_based_on_position_frequency else ""}
--optimizer adam
--adam-beta1 0.9
--adam-beta2 0.95
--adam-eps 1e-8
--lr 1e-4
--lr-warmup-samples 5
--clip-grad 1.0
--weight-decay 1e-1
--fp16
--log-interval 5
--save-interval 10
--eval-interval 10
--eval-iters 5
--checkpoint-activations
--exit-interval {exit_interval}
--merge-file {data_dir}/gpt2-tiny-merges.txt
--vocab-file {data_dir}/gpt2-tiny-vocab.json
--log-path {logs_dir}
--save {output_dir}/checkpoints
--load {output_dir}/checkpoints
--data-path {data_dir}/meg-gpt2-openwebtext_text_document
--tensorboard-dir {output_dir}/tensorboard
--tensorboard-queue-size 5
--log-timers-to-tensorboard
--log-batch-size-to-tensorboard
--log-validation-ppl-to-tensorboard
--log-level debug
""".split()
ds_args = f"""
--deepspeed
--deepspeed_config {self.test_file_dir_str}/ds_config.json
--zero-stage 1
--deepspeed-activation-checkpointing
""".split()
script = [f"{src_dir}/pretrain_prefix_lm.py"]
launcher = get_launcher(num_gpus)
cmd = launcher + script + args + ds_args
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
# 1. test training from scratch (no checkpoint)
with CaptureStdout() as cs:
execute_subprocess_async(cmd, env=self.get_env())
# test deepspeed is running
self.assertIn("DeepSpeed info", cs.out)
# test reports
self.assertIn("consumed samples", cs.out)
# test there should be no checkpoint this round
self.assertIn(f"Unable to find latest file at {output_dir}/checkpoints/latest", cs.out)
# test checkpoint saving
self.assertIn("successfully saved checkpoint at iteration", cs.out)
# test tensorboard
tensorboard_files = glob.glob(f"{output_dir}/tensorboard/events*")
self.assertEqual(len(tensorboard_files), 1, "tensorboard files")
if reweight_loss_based_on_position_frequency:
self.assertIn("Using loss reweighting", cs.out)
# 2. test training from checkpoint: resume
# now do it again, this time resuming from the checkpoint
with CaptureStdout() as cs:
execute_subprocess_async(cmd, env=self.get_env())
# test checkpoint loading
self.assertIn(f"successfully loaded checkpoint from {output_dir}/checkpoints", cs.out)
# test reports
self.assertIn("consumed samples", cs.out)
# test checkpoint saving
self.assertIn("successfully saved checkpoint at iteration", cs.out)
# test tensorboard (1 file from the first run, plus 1 now)
tensorboard_files = glob.glob(f"{output_dir}/tensorboard/events*")
self.assertEqual(len(tensorboard_files), 2, "tensorboard files")
def test_training_t0(self):
data_path = self.copy_data_to_temp(self.data_dir, "gpt2/ag_news_prompt")
output_dir = self.get_auto_remove_tmp_dir()
logs_dir = f"{output_dir}/logs"
Path(logs_dir).mkdir(parents=True, exist_ok=True)
pp_size, tp_size, dp_size = get_3d_dimensions()
num_gpus = pp_size * tp_size * dp_size
n_samples = 200 # about 37 iterations
exit_interval = 10 # some samples in the first half and then some more in the 2nd half after resume
args = f"""
--tensor-model-parallel-size {tp_size}
--pipeline-model-parallel-size {pp_size}
--distributed-backend nccl
--num-layers 2
--hidden-size 64
--num-attention-heads 2
--seq-length 128
--max-position-embeddings 1024
--position-embedding-type alibi
--micro-batch-size 1
--rampup-batch-size 2 2 {n_samples}
--global-batch-size 16
--train-samples {n_samples}
--optimizer adam
--adam-beta1 0.9
--adam-beta2 0.95
--adam-eps 1e-8
--lr 1e-4
--lr-warmup-samples 5
--clip-grad 1.0
--weight-decay 1e-1
--fp16
--log-interval 5
--save-interval 10
--eval-interval 10
--eval-iters 5
--checkpoint-activations
--exit-interval {exit_interval}
--tokenizer-type PretrainedFromHF
--tokenizer-name-or-path bigscience/tokenizer
--log-path {logs_dir}
--save {output_dir}/checkpoints
--load {output_dir}/checkpoints
--data-path {data_path}
--split 90,10,0
--tensorboard-dir {output_dir}/tensorboard
--tensorboard-queue-size 5
--log-timers-to-tensorboard
--log-batch-size-to-tensorboard
--log-validation-ppl-to-tensorboard
--log-level debug
""".split()
ds_args = f"""
--deepspeed
--deepspeed_config {self.test_file_dir_str}/ds_config.json
--zero-stage 1
--deepspeed-activation-checkpointing
""".split()
script = [f"{self.src_dir}/finetune_t0_non_causal_decoder.py"]
launcher = get_launcher(num_gpus)
cmd = launcher + script + args + ds_args
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
# 1. test training from scratch (no checkpoint)
with CaptureStdout() as cs:
execute_subprocess_async(cmd, env=self.get_env())
# test deepspeed is running
self.assertIn("DeepSpeed info", cs.out)
# test reports
self.assertIn("consumed samples", cs.out)
# test there should be no checkpoint this round
self.assertIn(f"Unable to find latest file at {output_dir}/checkpoints/latest", cs.out)
# test checkpoint saving
self.assertIn("successfully saved checkpoint at iteration", cs.out)
# test tensorboard
tensorboard_files = glob.glob(f"{output_dir}/tensorboard/events*")
self.assertEqual(len(tensorboard_files), 1, "tensorboard files")
# 2. test training from checkpoint: resume
# now do it again, this time resuming from the checkpoint
with CaptureStdout() as cs:
execute_subprocess_async(cmd, env=self.get_env())
# test checkpoint loading
self.assertIn(f"successfully loaded checkpoint from {output_dir}/checkpoints", cs.out)
# test reports
self.assertIn("consumed samples", cs.out)
# test checkpoint saving
self.assertIn("successfully saved checkpoint at iteration", cs.out)
# test tensorboard (1 file from the first run, plus 1 now)
tensorboard_files = glob.glob(f"{output_dir}/tensorboard/events*")
self.assertEqual(len(tensorboard_files), 2, "tensorboard files")
@parameterized.expand(["gpt", "prefix", "no_eval"])
def test_mode2_dataloading(self, variation):
src_dir = self.src_dir
data_dir = self.copy_data_to_temp(self.data_dir, "gpt2")
output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False)
logs_dir = f"{output_dir}/logs"
Path(logs_dir).mkdir(parents=True, exist_ok=True)
pp_size, tp_size, dp_size = get_3d_dimensions()
num_gpus = pp_size * tp_size * dp_size
n_samples = 200 # about 37 iterations
exit_interval = 20 # some samples in the first half and then some more in the 2nd half after resume
args = f"""
--tensor-model-parallel-size {tp_size}
--pipeline-model-parallel-size {pp_size}
--distributed-backend nccl
--num-layers 2
--hidden-size 64
--num-attention-heads 2
--seq-length 128
--max-position-embeddings 1024
--micro-batch-size 1
--rampup-batch-size 2 2 {n_samples}
--global-batch-size 16
--train-samples {n_samples}
--loss-on-targets-only
--optimizer adam
--adam-beta1 0.9
--adam-beta2 0.95
--adam-eps 1e-8
--lr 1e-4
--lr-warmup-samples 5
--clip-grad 1.0
--weight-decay 1e-1
--fp16
--log-interval 5
--save-interval 10
--eval-interval 10
--eval-iters 5
--checkpoint-activations
--exit-interval {exit_interval}
--merge-file {data_dir}/gpt2-tiny-merges.txt
--vocab-file {data_dir}/gpt2-tiny-vocab.json
--log-path {logs_dir}
--save {output_dir}/checkpoints
--tensorboard-dir {output_dir}/tensorboard
--tensorboard-queue-size 5
--log-timers-to-tensorboard
--log-batch-size-to-tensorboard
--log-validation-ppl-to-tensorboard
""".split()
data_args = [
"--train-weighted-split-paths", f'TRAIN: 1 0:0.95 {data_dir}/meg-gpt2-openwebtext_text_document, 0.3 0:0.90 {data_dir}/meg-gpt2-openwebtext_text_document']
if variation != "no_eval":
data_args += ["--valid-weighted-split-paths", f'VALID1: 1 0.95:0.98 {data_dir}/meg-gpt2-openwebtext_text_document, 0.3 0.90:0.99 {data_dir}/meg-gpt2-openwebtext_text_document',
f'VALID2: 0.5 0.95:0.97 {data_dir}/meg-gpt2-openwebtext_text_document, 0.5 0.90:0.98 {data_dir}/meg-gpt2-openwebtext_text_document']
ds_args = f"""
--deepspeed
--deepspeed_config {self.test_file_dir_str}/ds_config.json
--zero-stage 1
--deepspeed-activation-checkpointing
""".split()
if variation == "prefix":
script = [f"{src_dir}/pretrain_prefix_lm.py"]
else:
script = [f"{src_dir}/pretrain_gpt.py"]
launcher = get_launcher(num_gpus)
cmd = launcher + script + args + data_args + ds_args
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
# 1. test training from scratch (no checkpoint)
with CaptureStdout() as cs:
execute_subprocess_async(cmd, env=self.get_env())
# test deepspeed is running
self.assertIn("DeepSpeed info", cs.out)
# test reports
self.assertIn("consumed samples", cs.out)
# test checkpoint saving
self.assertIn("successfully saved checkpoint at iteration", cs.out)
# test tensorboard
tensorboard_files = glob.glob(f"{output_dir}/tensorboard/events*")
self.assertEqual(len(tensorboard_files), 1, "tensorboard files")
def test_skip_train_iteration(self):
# skip iterations setup
extra_args = f"""
--skip-train-iteration-range 2-2 4-7
""".split()
src_dir = self.src_dir
output_dir = self.get_auto_remove_tmp_dir()
args, ds_args, num_gpus = self.get_variation_config("base", output_dir, n_samples=200)
args.extend(extra_args)
script = [f"{src_dir}/pretrain_gpt.py"]
launcher = get_launcher(num_gpus)
cmd = launcher + script + args + ds_args
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
with CaptureStdout() as cs:
execute_subprocess_async(cmd, env=self.get_env())
# check skipped iterations
self.assertIn("Skipped iterations 2 to 2 due to --skip-train-iteration-range flag", cs.out)
self.assertIn("Skipped iterations 4 to 7 due to --skip-train-iteration-range flag", cs.out)
train_iterations = range(1,10)
for i in train_iterations:
self.assertTrue(f"iteration {i:8d}/" in cs.out)
# Test suite tools
## Make tiny tokenizer files
Currently, for gpt2, run:
```
./shrink-tokenizer.py
```
This generates tiny vocab and merge files under the `tiny` dir, which we then add to the repo under `data/gpt2`:
```
cp tiny/merges.txt ../data/gpt2/gpt2-tiny-merges.txt
cp tiny/vocab.json ../data/gpt2/gpt2-tiny-vocab.json
```
Note: the tiny vocab was set to 5000 items after experimenting with the size of the resulting index files. A tiny vocab of 500 (with adjusted merge entries) generated very large index files, so it actually ended up costing more in final file size. A vocab of 5000 generated index files almost identical in size to those produced with the original 50k vocab.
## Make tiny pre-processed index
This pre-processed index is used in the test training runs.
```
./openwebtext-to-jsonl.py
```
This generates:
```
openwebtext-1000.jsonl
```
We don't store the jsonl file in the repo, to keep the repo small, so it's a temporary file.
Now we pre-process it:
```
cd ../..
input=tests/tools/openwebtext-1000.jsonl
python tools/preprocess_data.py \
--input $input \
--output-prefix tests/data/gpt2/meg-gpt2-openwebtext \
--dataset-impl mmap \
--tokenizer-type GPT2BPETokenizer \
--merge-file tests/data/gpt2/gpt2-tiny-merges.txt \
--vocab tests/data/gpt2/gpt2-tiny-vocab.json \
--append-eod \
--workers 6
```
And voilà, we now have:
```
ls -sh1 tests/data/gpt2/meg-gpt2-openwebtext*
2.6M tests/data/gpt2/meg-gpt2-openwebtext_text_document.bin
20K tests/data/gpt2/meg-gpt2-openwebtext_text_document.idx
```
which we can now commit and use in tests.
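To double-check that the generated `.bin`/`.idx` pair loads correctly, here is a minimal sketch (assuming the `megatron.data.indexed_dataset` API available in this repo, run from the repo root):
```
from megatron.data import indexed_dataset

prefix = "tests/data/gpt2/meg-gpt2-openwebtext_text_document"
# memory-map the .bin/.idx pair produced by preprocess_data.py
ds = indexed_dataset.make_dataset(prefix, impl="mmap", skip_warmup=True)
print(f"number of documents: {len(ds)}")
print(f"first 16 token ids of doc 0: {ds[0][:16]}")
```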
#!/usr/bin/env python
# generate a jsonl version of a small slice of a dataset that can be fed to megatron-lm preprocessor
import sys
from datasets import load_dataset
dataset_name = "stas/openwebtext-10k"
# subset to jsonlines
n_samples = 1000
ds = load_dataset(dataset_name, split='train')
ds_small = ds.select(range(n_samples))
path = f"openwebtext-{n_samples}.jsonl"
ds_small.to_json(path, orient="records", lines=True)
#!/usr/bin/env python
# produce a tiny tokenizer which we can use in testing (so that it won't take much space in the repo)
import json
from transformers import AutoTokenizer
from tokenizers import Tokenizer
mname = "gpt2"
vocab_keep_items = 5000
tokenizer = AutoTokenizer.from_pretrained(mname, use_fast=True)
assert tokenizer.is_fast, "This only works for fast tokenizers."
tokenizer_json = json.loads(tokenizer._tokenizer.to_str())
vocab = tokenizer_json["model"]["vocab"]
if tokenizer_json["model"]["type"] == "BPE":
if "gpt2" in mname:
new_vocab = { token: i for token, i in vocab.items() if i < vocab_keep_items-1 }
new_vocab["<|endoftext|>"] = vocab_keep_items-1
else:
new_vocab = { token: i for token, i in vocab.items() if i < vocab_keep_items }
merges = tokenizer_json["model"]["merges"]
new_merges = []
for i in range(len(merges)):
a, b = merges[i].split()
new_token = "".join((a, b))
if a in new_vocab and b in new_vocab and new_token in new_vocab:
new_merges.append(merges[i])
tokenizer_json["model"]["merges"] = new_merges
elif tokenizer_json["model"]["type"] == "Unigram":
new_vocab = vocab[:vocab_keep_items]
elif tokenizer_json["model"]["type"] == "WordPiece" or tokenizer_json["model"]["type"] == "WordLevel":
new_vocab = { token: i for token, i in vocab.items() if i < vocab_keep_items }
else:
raise ValueError(f"don't know how to handle {tokenizer_json['model']['type']}")
tokenizer_json["model"]["vocab"] = new_vocab
tokenizer._tokenizer = Tokenizer.from_str(json.dumps(tokenizer_json))
tokenizer.save_pretrained("tiny")
# Tools
- [sample_idxs_to_text.py](./sample_idxs_to_text.py) - want to see which text was fed at specific iterations, for example to understand why the training went astray? Then use this script. The preamble of the script contains the documentation and usage examples.
## A few notes on how we created the datasets:
### Creating the Json Lines text file
First you need to create a jsonl file containing your dataset. For this we export from the HF datasets format. For example, for C4:
```
from datasets import load_dataset
c4 = load_dataset("c4", "en")
c4["train"].to_json("c4_en_train.jsonl")
c4["validation"].to_json("c4_en_valid.jsonl")
```
This creates quite a large file compared to the size of the HF dataset on disk (810GB vs 305GB for C4, for example).
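For reference, the jsonl file is simply one JSON object per line, with the text under the `text` key (which is what `preprocess_data.py` reads by default via `--json-keys`). A minimal hand-rolled sketch, with `tiny_corpus.jsonl` as a purely illustrative name:
```
import json

# two toy documents in the format preprocess_data.py expects by default
records = [
    {"text": "First document goes here."},
    {"text": "Second document goes here."},
]
with open("tiny_corpus.jsonl", "w") as f:
    for rec in records:
        f.write(json.dumps(rec) + "\n")
```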
### Megatron pre-processing
Then you need to pass that text file to the `preprocess_data.py` script for tokenization and memory-mapping. This creates two files: one storing the token indices and one storing the document starts and ends. The result will be slightly bigger than the text dataset (360GB vs 305GB for C4, for example). You can choose one of the default Megatron tokenizers (but then you have to pass merges and vocab files) or one from HF tokenizers. For example, for our GPT-like models, reusing a T5 sentencepiece-bpe tokenizer:
`python tools/preprocess_data.py --input ~/c4_en_train.jsonl --output-prefix c4_en_train --dataset-impl mmap --tokenizer-type PretrainedFromHF --tokenizer-name-or-path t5-small --workers 30 --append-eod`
Do note that adding too many workers can be counterproductive for a very large dataset: as disk writing becomes the bottleneck, the intermediate results from the worker processes pile up and can flood the RAM. In our experiments on GCP machines, running with 60 workers on C4 inevitably led the program to fail.
# Introduction
This folder is a collection of scripts for converting checkpoints of one training framework (e.g., DeepSpeed) into that of a different framework (e.g., Megatron-LM, HF Transformers).
The folder also contains scripts for inspecting checkpoint files and folders, which can be useful when developing checkpoint conversion logic. At the time of creation, this folder contains scripts to convert DeepSpeed checkpoints to Megatron-LM and HF Transformers checkpoints (the need for this, as part of the BigScience project, motivated the effort).
Here is the list of checkpoint conversions provided by the available scripts:
1. [Megatron-DeepSpeed to Megatron-LM](#Megatron-DeepSpeed-to-Megatron)
1. [Megatron-DeepSpeed to HF Transformers](#Megatron-DeepSpeed-to-HF-Transformers)
## Megatron-DeepSpeed to Megatron
The (current implementation of the) converter extracts the args and model parameters from a DeepSpeed checkpoint (i.e., it excludes other training state such as the optimizer and scheduler) and converts them into a Megatron-LM checkpoint that similarly contains only model parameters. The converter also provides a best-effort attempt to reshape the tensor-parallelism and pipeline-parallelism degrees of the checkpoint. The resulting Megatron-LM checkpoint can be loaded into the Megatron-LM framework for finetuning or inference. Tensor parallelism (TP) and pipeline parallelism (PP) are supported in the sense that, by default, the generated Megatron-LM checkpoint (folders and files) will have the same TP and PP as the training that created the input DeepSpeed checkpoint. The entry point of the converter is `deepspeed_to_megatron.py`, which has the following usage:
```bash
python tools/convert_checkpoint/deepspeed_to_megatron.py -h
Convert DeepSpeed Checkpoint to Megatron Checkpoint
usage: deepspeed_to_megatron.py [-h] [--input_folder INPUT_FOLDER]
[--output_folder OUTPUT_FOLDER]
[--target_tp TARGET_TP]
[--target_pp TARGET_PP] [--for_release]
optional arguments:
-h, --help show this help message and exit
--input_folder INPUT_FOLDER
Input DeepSpeed Checkpoint folder
--output_folder OUTPUT_FOLDER
Output Megatron checkpoint folder
--target_tp TARGET_TP
Target TP degree
--target_pp TARGET_PP
Target PP degree
--for_release Convert for release purpose, reset some (progress)
counters.
```
The following scripts, which proved useful for debugging, are also included; a rough single-file sketch follows the list:
1. `inspect_deepspeed_checkpoint.py`: view the contents of a DeepSpeed checkpoint folder.
2. `inspect_checkpoint.py`: view the contents of a PyTorch checkpoint file.
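If you only need a quick peek at a single checkpoint file, here is a rough single-file sketch along the same lines (not the actual helper script):
```
import sys
import torch

# load the checkpoint on CPU and print top-level keys with tensor shapes
sd = torch.load(sys.argv[1], map_location="cpu")
for key, value in sd.items():
    if torch.is_tensor(value):
        print(f"{key}: tensor {tuple(value.shape)} {value.dtype}")
    else:
        print(f"{key}: {type(value).__name__}")
```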
## Megatron-DeepSpeed to HF Transformers
In order to convert from Megatron-DeepSpeed to HF Transformers, you can do so directly using:
```bash
python tools/convert_checkpoint/deepspeed_to_transformers.py \
--input_folder /path/to/Megatron-Deepspeed/checkpoint/global_step97500 \
--output_folder /path/to/transformers/checkpoint
```
Since `transformers` currently only works with PP=1/TP=1, we use the defaults `--target_tp 1 --target_pp 1`.
The script taps into `transformers` and, as of this writing, requires `transformers@master` (or `transformers==4.11` if you are reading this later and that version has been released).
Note that you may run into problems with `megatron.enums` not being defined, since `Megatron-Deepspeed` in the `bigscience-workshop` tree has diverged from the `microsoft` tree. In such cases you can fix this on the fly by ensuring the former appears first in `sys.path`. For example:
```bash
PYTHONPATH=/hf/Megatron-DeepSpeed-bigscience:/hf/Megatron-DeepSpeed-microsoft \
python tools/convert_checkpoint/deepspeed_to_transformers.py \
--input_folder /path/to/Megatron-Deepspeed/checkpoint/global_step97500 \
--output_folder /path/to/transformers/checkpoint
```
Alternatively, you can convert first from Megatron-DeepSpeed to Megatron and then to HF Transformers:
```bash
# 1. Megatron-DeepSpeed to Megatron
cd /hf/Megatron-DeepSpeed-bigscience
python tools/convert_checkpoint/deepspeed_to_megatron.py --target_tp 1 --target_pp 1 \
--input_folder /path/to/Megatron-Deepspeed/checkpoint/global_step97500 \
--output_folder /path/to/Megatron/checkpoint
# 2. Megatron to HF Transformers
cd /hf/transformers
python src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py \
/path/to/Megatron/checkpoint/iter_0097500/mp_rank_00/model_optim_rng.pt
```
#!/usr/bin/env python
import sys
import argparse
import os
import torch
from pathlib import Path
# insert megatron's root dir into sys.path
root_repo_path = str(Path(__file__).resolve().parents[2])
if root_repo_path not in sys.path:
sys.path.insert(0, root_repo_path)
from megatron.tokenizer.tokenizer import _vocab_size_with_padding
from deepspeed.checkpoint.deepspeed_checkpoint import (
ARGS_KEY,
CHECKPOINT_INFO_KEY,
)
from deepspeed.checkpoint import (
DeepSpeedCheckpoint,
get_model_ckpt_name_for_rank,
get_zero_ckpt_name_for_rank,
get_layer_ckpt_name_for_rank
)
CHECKPOINT_FILE_SUFFIX = '_model_states.pt'
MP_WORLD_SIZE = 'mp_world_size'
WORD_EMBEDDINGS_KEY = 'word_embeddings.weight'
ORIGINAL_VOCAB_SIZE = 'original_vocab_size'
PADDED_VOCAB_SIZE = 'padded_vocab_size'
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument('--input_folder',
default=None,
type=str,
help='Input DeepSpeed Checkpoint folder')
parser.add_argument('--output_folder',
default=None,
type=str,
help='Output Megatron checkpoint folder')
parser.add_argument('--target_tp',
default=None,
type=int,
help='Target TP degree')
parser.add_argument('--target_pp',
default=None,
type=int,
help='Target PP degree')
parser.add_argument('--target_dp',
default=None,
type=int,
help='Target DP degree')
args = parser.parse_args()
print(f'args = {args}')
return args
def _save_checkpoint(file_path, chkpt_sd):
dir, _ = os.path.split(file_path)
os.makedirs(dir, exist_ok=True)
torch.save(chkpt_sd, file_path)
def _create_transformer_layer_checkpoint(ds_checkpoint, base_folder, tp_index, pp_index):
sd_list = ds_checkpoint.get_transformer_state(tp_index, pp_index)
layer_id_list = ds_checkpoint.get_pp_transformer_map(pp_index)
assert len(sd_list) == len(layer_id_list)
for sd, layer_id in zip(sd_list, layer_id_list):
ckpt_path = get_layer_ckpt_name_for_rank(
base_folder=base_folder,
layer_id=layer_id,
tp_rank=tp_index)
_save_checkpoint(ckpt_path, sd)
def _strip_vocab_padding(ds_checkpoint, padded_vocab_tensor):
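# Recompute the padded vocab size for the target TP degree and drop the extra
# padding rows from the word-embedding tensor, keeping only the rows that are needed.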
target_args = ds_checkpoint.get_args()
checkpoint_info = ds_checkpoint.get_checkpoint_info()
target_args.tensor_model_parallel_size = ds_checkpoint.tp_degree
target_args.padded_vocab_size = _vocab_size_with_padding(checkpoint_info[ORIGINAL_VOCAB_SIZE], target_args)
assert target_args.padded_vocab_size <= padded_vocab_tensor.numel()
checkpoint_info[PADDED_VOCAB_SIZE] = target_args.padded_vocab_size
unpadded_vocab_tensor = torch.narrow(padded_vocab_tensor, 0, 0, target_args.padded_vocab_size)
return unpadded_vocab_tensor.clone()
def _create_embedding_layer_checkpoint(ds_checkpoint, base_folder, tp_index):
sd = ds_checkpoint.get_embedding_state(tp_index)
if ds_checkpoint.is_change_tp_degree():
sd[WORD_EMBEDDINGS_KEY] = _strip_vocab_padding(ds_checkpoint, sd[WORD_EMBEDDINGS_KEY])
layer_id = ds_checkpoint.get_embedding_layer_id()
ckpt_path = get_layer_ckpt_name_for_rank(
base_folder=base_folder,
tp_rank=tp_index,
layer_id=layer_id)
_save_checkpoint(ckpt_path, sd)
def _create_final_norm_layer_checkpoint(ds_checkpoint, base_folder, tp_index):
sd = ds_checkpoint.get_final_norm_state(tp_index)
layer_id = ds_checkpoint.get_final_norm_layer_id()
ckpt_path = get_layer_ckpt_name_for_rank(
base_folder=base_folder,
tp_rank=tp_index,
layer_id=layer_id)
_save_checkpoint(ckpt_path, sd)
def _create_2d_parallel_checkpoint(ds_checkpoint, base_folder, tp_index,
pp_index):
sd = ds_checkpoint.get_2d_parallel_state(tp_index=tp_index,
pp_index=pp_index)
sd[MP_WORLD_SIZE] = ds_checkpoint.tp_degree
file_id = pp_index * ds_checkpoint.tp_degree + tp_index
ckpt_path = get_model_ckpt_name_for_rank(base_folder, f'{file_id:02d}')
# Adjust specific fields
sd[ARGS_KEY] = ds_checkpoint.get_args()
sd[ARGS_KEY].tensor_model_parallel_size = ds_checkpoint.tp_degree
sd[ARGS_KEY].pipeline_model_parallel_size = ds_checkpoint.pp_degree
sd[CHECKPOINT_INFO_KEY][PADDED_VOCAB_SIZE] = sd[ARGS_KEY].padded_vocab_size
_save_checkpoint(ckpt_path, sd)
def _create_zero_checkpoint(ds_checkpoint, base_folder, dp_index, pp_index, tp_index):
_2d_rank = (pp_index * ds_checkpoint.tp_degree) + tp_index
sd = ds_checkpoint.get_zero_checkpoint_state(
pp_index=pp_index,
tp_index=tp_index,
dp_index=dp_index)
ckpt_path = get_zero_ckpt_name_for_rank(base_folder=base_folder,
dp_rank=dp_index,
mp_rank=_2d_rank)
_save_checkpoint(ckpt_path, sd)
def _create_latest_file(base_folder, file_name, latest_tag):
file_path = os.path.join(base_folder, file_name)
os.makedirs(base_folder, exist_ok=True)
with open(file_path, 'w') as f:
f.write(str(latest_tag))
def main():
print(f'Convert DeepSpeed Checkpoint to DeepSpeed Checkpoint')
args = parse_arguments()
print(
f'Converting DeepSpeed checkpoint in {args.input_folder} to DeepSpeed checkpoint in {args.output_folder}'
)
ds_checkpoint = DeepSpeedCheckpoint(
args.input_folder,
args.target_tp,
args.target_pp,
args.target_dp)
iteration = ds_checkpoint.get_iteration()
latest_tag = f'global_step{iteration}'
_create_latest_file(args.output_folder,
'latest_checkpointed_iteration.txt', iteration)
_create_latest_file(args.output_folder, 'latest', latest_tag)
base_folder = os.path.join(args.output_folder, latest_tag)
for i in range(ds_checkpoint.tp_degree):
_create_embedding_layer_checkpoint(ds_checkpoint, base_folder, i)
_create_final_norm_layer_checkpoint(ds_checkpoint, base_folder, i)
for j in range(ds_checkpoint.pp_degree):
_create_transformer_layer_checkpoint(ds_checkpoint, base_folder, i, j)
_create_2d_parallel_checkpoint(ds_checkpoint, base_folder, i, j)
for i in range(ds_checkpoint.dp_degree):
for j in range(ds_checkpoint.pp_degree):
for k in range(ds_checkpoint.tp_degree):
_create_zero_checkpoint(ds_checkpoint, base_folder, i, j, k)
if __name__ == "__main__":
main()
#!/usr/bin/env python
import argparse
import os
import torch
from collections import OrderedDict
from .deepspeed_checkpoint import ARGS_KEY, DeepSpeedCheckpoint
MODEL_KEY = 'model'
ARGS_KEY = 'args'
LANGUGAGE_MODEL_KEY = 'language_model'
EMBEDDING_KEY = 'embedding'
ENCODER_KEY = 'encoder'
WORD_EMBEDDINGS_FOR_HEAD_KEY = 'word_embeddings_for_head'
WORD_EMBEDDINGS_KEY = 'word_embeddings'
FINAL_LAYER_NORM_KEY = 'final_layernorm'
CHECKPOINT_VERSION_KEY = 'checkpoint_version'
CHECKPOINT_VERSION_VALUE = 3.0
ITERATION_KEY = 'iteration'
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument('--input_folder',
default=None,
type=str,
help='Input DeepSpeed Checkpoint folder')
parser.add_argument('--output_folder',
default=None,
type=str,
help='Output Megatron checkpoint folder')
parser.add_argument('--target_tp',
default=1,
type=int,
help='Target TP degree')
parser.add_argument('--target_pp',
default=1,
type=int,
help='Target PP degree')
parser.add_argument(
'--for_release',
action='store_true',
help='Convert for release purpose, reset some (progress) counters.')
args = parser.parse_args()
print(f'args = {args}')
return args
def _convert_ds_transformer_state(sd_list):
new_sd = OrderedDict()
for i, sd in enumerate(sd_list):
for key, value in sd.items():
new_key = f'layers.{i}.{key}'
new_sd[new_key] = value
return new_sd
def _create_checkpoint_paths(base_folder, iteration, tp_degree, pp_degree):
path_list = []
iter_folder = f'iter_{iteration:07d}'
for i in range(0, tp_degree):
path_list.append([])
for j in range(0, pp_degree):
rank_folder = f'mp_rank_{i:02d}' if pp_degree == 1 else f'mp_rank_{i:02d}_{j:03d}'
ckpt_path = os.path.join(rank_folder, 'model_optim_rng.pt')
path_list[i].append(
os.path.join(base_folder, iter_folder, ckpt_path))
return path_list
def _create_megatron_dict():
language_model_dict = {EMBEDDING_KEY: {}, ENCODER_KEY: {}}
megatron_dict = {
MODEL_KEY: {
LANGUGAGE_MODEL_KEY: language_model_dict
},
CHECKPOINT_VERSION_KEY: CHECKPOINT_VERSION_VALUE
}
return megatron_dict
def _save_checkpoint(file_path, chkpt_sd):
dir, _ = os.path.split(file_path)
os.makedirs(dir, exist_ok=True)
torch.save(chkpt_sd, file_path)
def _renest_sd(sd):
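# turn flat keys like 'word_embeddings.weight' into a nested dict {'word_embeddings': {'weight': value}}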
new_sd = OrderedDict()
for key, value in sd.items():
a, b = key.split('.')
new_sd[a] = {b: value}
return new_sd
def _create_rank_checkpoint(ds_checkpoint, tp_index, pp_index, for_release=False):
meg_encoder_sd = OrderedDict()
meg_embedding_sd = OrderedDict()
meg_embedding_for_head_sd = OrderedDict()
transformer_sd = ds_checkpoint.get_transformer_state(tp_index, pp_index)
meg_encoder_sd.update(_convert_ds_transformer_state(transformer_sd))
if pp_index in [0, ds_checkpoint.pp_degree - 1]:
embedding_sd = ds_checkpoint.get_embedding_state(tp_index)
nested_embedding_sd = _renest_sd(embedding_sd)
if pp_index == 0:
meg_embedding_sd.update(nested_embedding_sd)
if pp_index == ds_checkpoint.pp_degree - 1:
for key, value in embedding_sd.items():
if key.startswith(WORD_EMBEDDINGS_KEY):
fields = key.split('.')
new_fields = fields[1:]
new_key = '.'.join(new_fields)
meg_embedding_for_head_sd[new_key] = value
final_norm_sd = ds_checkpoint.get_final_norm_state(tp_index)
new_final_norm_sd = {
f'{FINAL_LAYER_NORM_KEY}.{key}': value
for key, value in final_norm_sd.items()
}
meg_encoder_sd.update(new_final_norm_sd)
checkpoint_sd = _create_megatron_dict()
iteration = ds_checkpoint.get_iteration()
checkpoint_sd[ITERATION_KEY] = iteration
if pp_index == 0:
checkpoint_sd[MODEL_KEY][LANGUGAGE_MODEL_KEY][
EMBEDDING_KEY] = meg_embedding_sd
checkpoint_sd[MODEL_KEY][LANGUGAGE_MODEL_KEY][ENCODER_KEY] = meg_encoder_sd
if pp_index == ds_checkpoint.pp_degree - 1:
checkpoint_sd[MODEL_KEY][
WORD_EMBEDDINGS_FOR_HEAD_KEY] = meg_embedding_for_head_sd
checkpoint_sd[ARGS_KEY] = ds_checkpoint.get_args()
# Adjust specific fields
checkpoint_sd[
ARGS_KEY].tensor_model_parallel_size = ds_checkpoint.tp_degree
checkpoint_sd[
ARGS_KEY].pipeline_model_parallel_size = ds_checkpoint.pp_degree
if for_release:
checkpoint_sd[ARGS_KEY].consumed_train_samples = 0
checkpoint_sd[ARGS_KEY].consumed_valid_samples = 0
return checkpoint_sd
def _create_latest_file(base_folder, iteration):
file_path = os.path.join(base_folder, 'latest_checkpointed_iteration.txt')
os.makedirs(base_folder, exist_ok=True)
with open(file_path, 'w') as f:
f.write(str(iteration))
def main():
print(f'Convert DeepSpeed Checkpoint to Megatron Checkpoint')
args = parse_arguments()
print(
f'Converting DeepSpeed checkpoint in {args.input_folder} to Megatron checkpoint in {args.output_folder}'
)
ds_checkpoint = DeepSpeedCheckpoint(args.input_folder, args.target_tp,
args.target_pp)
iteration = ds_checkpoint.get_iteration()
_create_latest_file(args.output_folder, iteration)
checkpoint_paths = _create_checkpoint_paths(args.output_folder, iteration,
ds_checkpoint.tp_degree,
ds_checkpoint.pp_degree)
for i in range(0, ds_checkpoint.tp_degree):
for j in range(0, ds_checkpoint.pp_degree):
sd = _create_rank_checkpoint(ds_checkpoint, i, j, args.for_release)
_save_checkpoint(checkpoint_paths[i][j], sd)
if __name__ == "__main__":
main()
#!/usr/bin/env python
import os
import torch
import json
import sys
from pathlib import Path
# insert megatron's root dir into sys.path
root_repo_path = str(Path(__file__).resolve().parents[2])
if root_repo_path not in sys.path:
sys.path.insert(0, root_repo_path)
from deepspeed.checkpoint import DeepSpeedCheckpoint
from deepspeed_to_megatron import _create_rank_checkpoint, parse_arguments
# the import was tested to work with this version
# https://github.com/huggingface/transformers/commit/0af901e83 if it diverges we may consider
# copying that version here instead
from transformers.models.megatron_gpt2.convert_megatron_gpt2_checkpoint import convert_megatron_checkpoint
from transformers import GPT2Config
def main():
# this first part comes mainly from deepspeed_to_megatron.main
args = parse_arguments()
print(
f'Converting DeepSpeed checkpoint in {args.input_folder} to HF Transformers checkpoint in {args.output_folder}'
)
ds_checkpoint = DeepSpeedCheckpoint(args.input_folder, args.target_tp,
args.target_pp)
iteration = ds_checkpoint.get_iteration()
input_state_dict = _create_rank_checkpoint(ds_checkpoint, 0, 0,
args.for_release)
# the 2nd part comes from transformers.models.megatron_gpt2.convert_megatron_gpt2_checkpoint.main
# Spell out all parameters in case the defaults change.
config = GPT2Config(
vocab_size=50257,
n_positions=1024,
n_ctx=1024,
n_embd=1024,
n_layer=24,
n_head=16,
n_inner=4096,
activation_function="gelu", # used to be "gelu_new" in earlier versions
resid_pdrop=0.1,
embd_pdrop=0.1,
attn_pdrop=0.1,
layer_norm_epsilon=1e-5,
initializer_range=0.02,
summary_type="cls_index",
summary_use_proj=True,
summary_activation=None,
summary_proj_to_labels=True,
summary_first_dropout=0.1,
scale_attn_weights=True,
gradient_checkpointing=False,
use_cache=True,
bos_token_id=50256,
eos_token_id=50256,
)
# Convert.
print("Converting to HF Checkpoint")
output_state_dict = convert_megatron_checkpoint(args, input_state_dict,
config)
basename = args.output_folder
os.makedirs(basename, exist_ok=True)
# Print the structure of converted state dict.
#if args.print_checkpoint_structure:
# recursive_print(None, output_state_dict)
# Store the config to file.
output_config_file = os.path.join(basename, "config.json")
output_config = config.to_dict()
output_config["architectures"] = ["GPT2LMHeadModel"]
output_config["model_type"] = "gpt2"
print(f'Saving config to "{output_config_file}"')
with open(output_config_file, "w") as f:
json.dump(output_config, f)
# Store the state_dict to file.
output_checkpoint_file = os.path.join(basename, "pytorch_model.bin")
print(f'Saving checkpoint to "{output_checkpoint_file}"')
torch.save(output_state_dict, output_checkpoint_file)
print("Now add tokenizer files and upload to the hub")
if __name__ == "__main__":
main()