"vscode:/vscode.git/clone" did not exist on "228ee2ad82541fe0a7e0c05dcc0dfbff99567b32"
Commit 8ec5d678 authored by hepj987's avatar hepj987
Browse files

GPT2 based on Megatron-DeepSpeed

from random import randint
from typing import Set
from unittest.mock import patch
import deepspeed
import torch
from parameterized import parameterized
from torch import nn
import torch.nn.functional as F
from megatron.enums import AttnMaskType
from megatron.model.fused_layer_norm import MixedFusedLayerNorm
from packaging import version
from megatron import initialize_megatron, get_args, get_tokenizer, global_vars
from megatron.model.fused_softmax import ScaledMaskedSoftmax, FusedScaleMaskSoftmax
from megatron.model.utils import attention_mask_func
from megatron.testing_utils import TestCasePlus, mockenv_context, flatten_arguments, torch_assert_equal, \
torch_assert_close, require_torch_bf16
from megatron.training import setup_model_and_optimizer
import pretrain_gpt
import pretrain_prefix_lm
import finetune_t0_non_causal_decoder
def get_default_args(test_file_dir: str):
"""return a dictionary with key as argument name and value as additional arguments"""
return {
# GPT_ARGS
"--num-layers": "2",
"--hidden-size": "128",
"--num-attention-heads": "4",
"--seq-length": "256",
"--max-position-embeddings": "256",
"--micro-batch-size": "2",
"--global-batch-size": "2",
"--lr-decay-iters": "320000",
"--lr-decay-style": "cosine",
"--lr": "0.00015",
"--min-lr": "1.0e-5",
"--train-iters": "5000",
"--tokenizer-type": "PretrainedFromHF",
"--tokenizer-name-or-path": "gpt2",
"--data-impl": "mmap",
"--split": "949,50,1",
"--distributed-backend": "nccl",
"--weight-decay": "1e-2",
"--clip-grad": "1.0",
"--lr-warmup-fraction": ".01",
"--fp16": "",
"--inference": "",
"--attention-dropout": "0",
"--hidden-dropout": "0",
# OUTPUT_ARGS
"--log-interval": "10",
"--save-interval": "500",
"--eval-interval": "100",
"--eval-iters": "10",
"--checkpoint-activations": "",
# DATA_ARGS
# DeepSpeed args
"--deepspeed": "",
"--deepspeed_config": f"{test_file_dir}/ds_config_inference.json",
"--zero-stage": "0",
}
def equal_vectors(tensor1, tensor2, dim=-1):
"""View tensor1 and tensor2 as a list of vectors, and compute equality"""
return torch.linalg.norm(tensor1 - tensor2, dim=dim) == 0
def iter_out_of_one(one):
return iter([one])
def get_dummy_mtf_decoder_packed_data(micro_batch_size: int, seq_length: int, vocab_size: int, special_tokens_ids: Set[int]):
"""Code from `tests/test_dataloaders.py"""
seq_length += 1
num_segments = torch.randint(1, 5, ())
segment_ids = torch.zeros(micro_batch_size, seq_length, dtype=torch.long)
is_inputs = torch.zeros(micro_batch_size, seq_length, dtype=torch.bool)
for batch_id in range(micro_batch_size):
# - `*2`: hack to ensure that two start_new_segments indices are separated by at least two tokens
# - `+1`: hack to ensure that the start_new_segments indices are never 0
start_new_segments = torch.sort(torch.randperm((seq_length - 2) // 2, )[:num_segments]).values * 2 + 1
segment_ids[batch_id, start_new_segments] = 1
end_inputs = [
torch.randint(low=start_segment, high=end_segment - 1, size=())
for start_segment, end_segment in zip([0, *start_new_segments], [*start_new_segments, seq_length])
]
for end_input, start_segment in zip(end_inputs, [0, *start_new_segments]):
is_inputs[batch_id][start_segment: end_input + 1] = True
segment_ids = torch.cumsum(segment_ids, dim=-1) + 1
tokens = torch.randint(high=vocab_size, size=(micro_batch_size, seq_length), dtype=torch.long)
flatten_token_view = tokens.view(-1,)
for token_id in range(len(flatten_token_view)):
token = flatten_token_view[token_id]
# While the token is a special token, replace it with another one
while token in special_tokens_ids:
flatten_token_view[token_id] = (token + 1) % vocab_size
token = flatten_token_view[token_id]
return {
"decoder_token_ids": tokens,
"decoder_segment_ids": segment_ids,
"decoder_is_inputs": is_inputs
}
class MyTestCase(TestCasePlus):
def setUp(self) -> None:
super().setUp()
# We reset all global variables
global_vars._GLOBAL_ARGS = None
global_vars._GLOBAL_NUM_MICROBATCHES_CALCULATOR = None
global_vars._GLOBAL_TOKENIZER = None
global_vars._GLOBAL_TENSORBOARD_WRITER = None
global_vars._GLOBAL_ADLR_AUTORESUME = None
global_vars._GLOBAL_TIMERS = None
self.dist_env_1_gpu = dict(
MASTER_ADDR="localhost", MASTER_PORT="9994", RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
)
def test_gpt(self):
"""Test causal invariance, ie past token don't depend on future tokens."""
command_args = get_default_args(self.test_file_dir_str)
with patch('sys.argv', flatten_arguments(command_args)):
with mockenv_context(**self.dist_env_1_gpu):
deepspeed.init_distributed()
initialize_megatron()
args = get_args()
tokenizer = get_tokenizer()
model, _, _ = setup_model_and_optimizer(pretrain_gpt.model_provider)
model = model[0]
model._config.train_micro_batch_size_per_gpu = args.micro_batch_size
model.set_train_batch_size(args.micro_batch_size)
token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))
# eod is a special token
token_ids[token_ids == tokenizer.eod] += 1
token_ids[token_ids == tokenizer.eod] %= args.padded_vocab_size
# get a modified version of the first batch by changing a specific index
changed_index = randint(0, args.seq_length - 2)
token_ids_changed = token_ids.clone()
# We increment the token_id by one for that index in order to artificially change the sequence.
token_ids_changed[:, changed_index] = \
(token_ids_changed[:, changed_index] + 1) % args.padded_vocab_size
output = model.eval_batch(iter_out_of_one({"text": token_ids}), compute_loss=False)
output_changed = model.eval_batch(iter_out_of_one({"text": token_ids_changed}), compute_loss=False)
# All tokens in the past should be unchanged
torch_assert_equal(output[:, :changed_index], output_changed[:, :changed_index])
# All tokens in the future should have changed
self.assertFalse(
torch.any(equal_vectors(output[:, changed_index:], output_changed[:, changed_index:]))
)
def test_prefix_lm_reset_attention_mask(self):
"""
Test prefix invariances when `reset_attention_mask=True`:
- Past target tokens don't depend on future target tokens.
- Target tokens depend on input tokens.
- Input tokens depend on all other input tokens, but never target tokens.
"""
command_args = get_default_args(self.test_file_dir_str)
command_args["--reset-attention-mask"] = ""
command_args["--loss-on-targets-only"] = ""
with patch('sys.argv', flatten_arguments(command_args)):
with mockenv_context(**self.dist_env_1_gpu):
deepspeed.init_distributed()
initialize_megatron()
args = get_args()
tokenizer = get_tokenizer()
model, _, _ = setup_model_and_optimizer(pretrain_prefix_lm.model_provider)
model = model[0]
model._config.train_micro_batch_size_per_gpu = args.micro_batch_size
model.set_train_batch_size(args.micro_batch_size)
# we apply the batch preprocessing manually instead of via batch_fn
model.set_batch_fn(None)
token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))
# eod is a special token; this also guarantees that the whole row is treated as a single document.
token_ids[token_ids == tokenizer.eod] += 1
token_ids[token_ids == tokenizer.eod] %= args.padded_vocab_size
# process the batch so that it has a non-empty prefix
input_batch, (labels, loss_mask), prefix_indices = pretrain_prefix_lm.get_batch_pipe({"text": token_ids})
for batch_id in range(len(prefix_indices)):
for id in prefix_indices[batch_id]:
self.assertTrue(loss_mask[batch_id, id] == 1)
self.assertTrue(id > 0)
# Make sure that the last prefix token predicts the first token.
self.assertTrue(loss_mask[batch_id, id -1] == 1)
output = model.eval_batch(iter_out_of_one((input_batch, (labels, loss_mask), prefix_indices)), compute_loss=False)
## --------------- CHANGE A TARGET TOKEN ---------------------------
# get a modified version of the first batch
# guaranteed to exist as each row has at least one partial document
changed_target_index = prefix_indices[0][0]
token_ids_changed_target = input_batch[0].clone()
# We increment the token id on the changed index.
token_ids_changed_target[0, changed_target_index] = \
(token_ids_changed_target[0, changed_target_index] + 1) % args.padded_vocab_size
# make sure we're not changing a token to eod as it's a special token
token_ids_changed_target[token_ids_changed_target == tokenizer.eod] += 1
token_ids_changed_target[token_ids_changed_target == tokenizer.eod] %= args.padded_vocab_size
# Test change
output_changed_target = model.eval_batch(iter_out_of_one(((token_ids_changed_target, *input_batch[1:]), (labels, loss_mask), prefix_indices)), compute_loss=False)
# All tokens in the past should be unchanged
torch_assert_equal(output[0, :changed_target_index], output_changed_target[0, :changed_target_index])
# All tokens in the future should have changed
self.assertFalse(
torch.any(
equal_vectors(output[0, changed_target_index:], output_changed_target[0, changed_target_index:])
)
)
# Rows that were not modified should not change either
torch_assert_equal(output[1, :], output_changed_target[1, :])
## --------------- CHANGE AN INPUT TOKEN ---------------------------
# Let's change the last prefix token and make sure that the first target token changes
# guaranteed to be non-negative as we ruled out the pathological case above
last_prefix_index = prefix_indices[0][0] - 1
token_ids_changed_input = input_batch[0].clone()
# We increment the token id on the changed index.
token_ids_changed_input[0, last_prefix_index] = \
(token_ids_changed_input[0, last_prefix_index] + 1) % args.padded_vocab_size
# make sure we're not changing a token to eod as it's a special token
token_ids_changed_input[token_ids_changed_input == tokenizer.eod] += 1
token_ids_changed_input[token_ids_changed_input == tokenizer.eod] %= args.padded_vocab_size
output_changed_input = model.eval_batch(iter_out_of_one(((token_ids_changed_input, *input_batch[1:]), (labels, loss_mask), prefix_indices)), compute_loss=False)
# All tokens should be changed
self.assertFalse(
torch.any(
equal_vectors(output[0, :], output_changed_input[0, :])
)
)
# Rows that were not modified should not change either
torch_assert_equal(output[1, :], output_changed_input[1, :])
def test_prefix_lm_wo_reset_attention_mask(self):
"""
Test prefix invariances when `reset_attention_mask=False`:
- Past target tokens don't depend on future target tokens.
- Target tokens depend on input tokens.
- Input tokens depend on all other input tokens, but never target tokens.
"""
command_args = get_default_args(self.test_file_dir_str)
command_args["--loss-on-targets-only"] = ""
with patch('sys.argv', flatten_arguments(command_args)):
with mockenv_context(**self.dist_env_1_gpu):
deepspeed.init_distributed()
initialize_megatron()
args = get_args()
model, _, _ = setup_model_and_optimizer(pretrain_prefix_lm.model_provider)
model = model[0]
model._config.train_micro_batch_size_per_gpu = args.micro_batch_size
model.set_train_batch_size(args.micro_batch_size)
# we apply the batch preprocessing manually instead of via batch_fn
model.set_batch_fn(None)
token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))
input_batch, (labels, loss_mask), prefix_indices = pretrain_prefix_lm.get_batch_pipe({"text": token_ids})
for batch_id in range(len(prefix_indices)):
id = prefix_indices[batch_id]
self.assertTrue(loss_mask[batch_id, id] == 1)
self.assertTrue(id > 0)
# Make sure that the last prefix token predicts the first token.
self.assertTrue(loss_mask[batch_id, id -1] == 1)
model.eval_batch(iter_out_of_one((input_batch, (labels, loss_mask), prefix_indices)), compute_loss=False)
#TODO: Check all invariants
def test_gpt_rotary_embeddings(self):
"""Test rotary embeddings"""
command_args = get_default_args(self.test_file_dir_str)
del command_args["--max-position-embeddings"]
command_args["--position-embedding-type"] = "rotary"
with patch('sys.argv', flatten_arguments(command_args)):
with mockenv_context(**self.dist_env_1_gpu):
deepspeed.init_distributed()
initialize_megatron()
args = get_args()
tokenizer = get_tokenizer()
model, _, _ = setup_model_and_optimizer(pretrain_gpt.model_provider)
model = model[0]
model._config.train_micro_batch_size_per_gpu = args.micro_batch_size
model.set_train_batch_size(args.micro_batch_size)
token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))
# eod is a special token
token_ids[token_ids == tokenizer.eod] += 1
token_ids[token_ids == tokenizer.eod] %= args.padded_vocab_size
model.eval_batch(iter_out_of_one({"text": token_ids}), compute_loss=False)
#TODO: Check all invariants
@require_torch_bf16
def test_fused_layer_norm(self):
command_args = get_default_args(self.test_file_dir_str)
# Condition to use custom cuda kernel
command_args["--bf16"] = ""
del command_args["--fp16"]
with patch('sys.argv', flatten_arguments(command_args)):
with mockenv_context(**self.dist_env_1_gpu):
initialize_megatron()
args = get_args()
dummy_input = torch.randn(args.micro_batch_size, args.seq_length, args.hidden_size, device="cuda", dtype=torch.bfloat16)
normalized_shape = (args.hidden_size,)
epsilon = 1e-5
mfln = MixedFusedLayerNorm(normalized_shape, eps=epsilon)
self.assertTrue(mfln.use_meg_ds_fused_layer_norm, "Expected model to use Megatron-DeepSpeed custom cuda kernel for LayerNorm.")
self.assertTrue(args.bf16, "Test has to be done in half precision.")
# We set the weights manually to simulate a state that differs from the initialization
weight = torch.randn(args.hidden_size, device="cuda", dtype=torch.bfloat16)
bias = torch.randn(args.hidden_size, device="cuda", dtype=torch.bfloat16)
mfln.weight = nn.Parameter(weight)
mfln.bias = nn.Parameter(bias)
mfln_output = mfln(dummy_input)
# We check that our layer norm matches PyTorch's from 1.11 onwards
if version.parse(torch.__version__) >= version.parse("1.11.0"):
torch_layer_norm_output = F.layer_norm(dummy_input, normalized_shape, weight, bias, eps=epsilon)
else:
# Otherwise we can only check that it corresponds to the fp32 reference cast back to bf16
torch_layer_norm_output = F.layer_norm(dummy_input.float(), normalized_shape, weight.float(), bias.float(), eps=epsilon).to(torch.bfloat16)
torch_assert_equal(mfln_output, torch_layer_norm_output)
@parameterized.expand([(attn_mask_type,) for attn_mask_type in AttnMaskType])
def test_fused_masked_softmax(self, attn_mask_type: AttnMaskType):
command_args = get_default_args(self.test_file_dir_str)
with patch('sys.argv', flatten_arguments(command_args)):
with mockenv_context(**self.dist_env_1_gpu):
initialize_megatron()
args = get_args()
dummy_input = torch.randn(
args.micro_batch_size,
args.num_attention_heads,
args.seq_length,
args.seq_length,
device="cuda",
dtype=args.params_dtype
)
if attn_mask_type == AttnMaskType.causal:
dummy_attention_mask = None
else:
dummy_attention_mask = torch.randn(
args.micro_batch_size,
1, # `args.num_attention_heads` not implemented in our cuda kernel
args.seq_length,
args.seq_length,
device="cuda",
dtype=args.params_dtype
) < 0
scale = torch.rand(())
fused_scaled_softmax = FusedScaleMaskSoftmax(
input_in_fp16=args.params_dtype == torch.float16,
input_in_bf16=args.params_dtype == torch.bfloat16,
attn_mask_type=attn_mask_type,
scaled_masked_softmax_fusion=True,
mask_func=attention_mask_func,
softmax_in_fp32=True,
scale=scale,
)
unfused_scaled_softmax = FusedScaleMaskSoftmax(
input_in_fp16=args.params_dtype == torch.float16,
input_in_bf16=args.params_dtype == torch.bfloat16,
attn_mask_type=attn_mask_type,
scaled_masked_softmax_fusion=False,
mask_func=attention_mask_func,
softmax_in_fp32=True,
scale=scale,
)
self.assertTrue(fused_scaled_softmax.is_kernel_available(dummy_attention_mask, *dummy_input.size()))
fused_output = fused_scaled_softmax(dummy_input, dummy_attention_mask)
self.assertFalse(unfused_scaled_softmax.is_kernel_available(dummy_attention_mask, *dummy_input.size()))
unfused_output = unfused_scaled_softmax(dummy_input, dummy_attention_mask)
# Test that the nonzero positions match the mask
for i in range(args.num_attention_heads):
if dummy_attention_mask is None:
# Make sure it's causal: values in the lower triangle should be nonzero.
non_zero_values = torch.tril(torch.ones_like(fused_output[:, i]))
torch_assert_equal(torch.nonzero(fused_output[:, i]), torch.nonzero(non_zero_values))
else:
torch_assert_equal(torch.nonzero(fused_output[:, i]), torch.nonzero(~dummy_attention_mask[:, 0]))
# Cuda kernel produces slightly different results
torch_assert_close(fused_output, unfused_output)
def test_non_causal_decoder_model_with_packed_input_passed_with_attention_mask_is_not_causal_across_segments(self):
command_args = get_default_args(self.test_file_dir_str)
command_args["--position-embedding-type"] = "alibi"
with patch('sys.argv', flatten_arguments(command_args)):
with mockenv_context(**self.dist_env_1_gpu):
deepspeed.init_distributed()
initialize_megatron()
args = get_args()
tokenizer = get_tokenizer()
# Hack: `gpt2` doesn't have a padding token, so we override that value.
tokenizer.tokenizer.pad_token_id = tokenizer.tokenizer.eos_token_id
data = get_dummy_mtf_decoder_packed_data(
micro_batch_size=args.micro_batch_size,
seq_length=args.seq_length,
vocab_size=args.padded_vocab_size,
special_tokens_ids={tokenizer.pad}
)
model, _, _ = setup_model_and_optimizer(finetune_t0_non_causal_decoder.model_provider)
model = model[0]
model._config.train_micro_batch_size_per_gpu = args.micro_batch_size
model.set_train_batch_size(args.micro_batch_size)
output = model.eval_batch(iter_out_of_one(data), compute_loss=False)
## --------------- CHANGE A TARGET TOKEN ---------------------------
# change the first token in the first batch to a random value
change_batch_id = 0
change_token_id = 0
token_ids_changed = data["decoder_token_ids"].clone()
# We increment the token id on the changed index.
token_ids_changed[change_batch_id, change_token_id] = (token_ids_changed[change_batch_id, change_token_id] + 1) % args.padded_vocab_size
while token_ids_changed[change_batch_id, change_token_id] in {tokenizer.eod, tokenizer.pad}:
token_ids_changed[change_batch_id, change_token_id] = (token_ids_changed[change_batch_id, change_token_id] + 1) % args.padded_vocab_size
# Test change
output_changed_target = model.eval_batch(iter_out_of_one({**data, "decoder_token_ids": token_ids_changed}), compute_loss=False)
first_segment_first_batch_id_end = (torch.nonzero(data["decoder_segment_ids"][change_batch_id, 1:] - data["decoder_segment_ids"][change_batch_id, :-1]) + 1)[0]
# Check that values changed in segment 1 of batch_id 0
self.assertFalse(torch.any(
equal_vectors(
output[change_batch_id, change_token_id:first_segment_first_batch_id_end],
output_changed_target[change_batch_id, change_token_id:first_segment_first_batch_id_end]
)
))
# Check that values did not change in other segments of batch_id 0
torch_assert_equal(
output[change_batch_id, first_segment_first_batch_id_end:],
output_changed_target[change_batch_id, first_segment_first_batch_id_end:]
)
# Check that values did not change in other segments in other batches
non_change_ids = torch.arange(output.shape[0]) != change_batch_id
torch_assert_equal(output[non_change_ids], output_changed_target[non_change_ids])
## --------------- CHANGE THE LAST TOKEN TO PAD ---------------------------
# change the last token in the first batch to a pad
token_ids_changed_pad = data["decoder_token_ids"].clone()
segment_ids_changed_pad = data["decoder_segment_ids"].clone()
# We set the last token to pad and give it segment id 0.
token_ids_changed_pad[change_batch_id, -1] = tokenizer.pad
segment_ids_changed_pad[change_batch_id, -1] = 0
# Test model handles padding correctly
output_changed_pad = model.eval_batch(iter_out_of_one({**data, "decoder_token_ids": token_ids_changed_pad, "decoder_segment_ids": segment_ids_changed_pad}), compute_loss=False)
self.assertFalse(torch.any(torch.isnan(output_changed_pad)))
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import filecmp
import io
import json
import re
import os
import unittest
import functools
from pathlib import Path
from megatron.testing_utils import (
TestCasePlus,
execute_subprocess_async,
set_seed
)
from datasets import load_dataset
set_seed(42)
def write_jsonl(path, lines_num=1000, line_length=1024):
def get_text_line(line_length):
# XXX: fix to generate line_length
return "It's a wonderful world. I'm just walking on air. Talk of heaven on earth. I've got more than my share. Haven't got a care. Happy all day through. It's a wonderful world. Loving wonderful you!"
with io.open(path, "w", encoding="utf-8") as f:
for i in range(lines_num):
rec = dict(text=get_text_line(line_length))
x = json.dumps(rec, indent=0, ensure_ascii=False)
x = re.sub(r'\n', ' ', x, 0, re.M)
f.write(x + "\n")
@functools.lru_cache()
def download_hf_dataset(dsetname):
return load_dataset(dsetname)
class MegDSTestPreprocessing(TestCasePlus):
""" """
def setUp(self):
super().setUp()
def test_preprocess_data(self):
src_dir = self.src_dir
data_dir = f"{self.data_dir}/gpt2"
output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False)
# autogenerate "input.jsonl"
input_path = f"{output_dir}/input.jsonl"
write_jsonl(input_path)
output_prefix =f"{output_dir}/test-ds"
cmd = f"""
python {src_dir}/tools/preprocess_data.py
--input {input_path}
--output-prefix {output_prefix}
--dataset-impl mmap
--tokenizer-type GPT2BPETokenizer
--merge-file {data_dir}/gpt2-tiny-merges.txt
--vocab {data_dir}/gpt2-tiny-vocab.json
--append-eod
--workers 2
""".split()
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
execute_subprocess_async(cmd, env=self.get_env())
for ext in ["bin", "idx"]:
tgt_path = f"{output_prefix}_text_document.{ext}"
self.assertTrue(Path(tgt_path).exists(), )
def compare_meg_data_files(self, tgt, ref):
for ext in ["bin", "idx"]:
tgt_path = f"{tgt}.{ext}"
ref_path = f"{ref}.{ext}"
self.assertTrue(Path(tgt_path).exists(), )
self.assertTrue(filecmp.cmp(tgt_path, ref_path, shallow=False))
def preprocess_partitioned_dataset(self, output_dir, dsetname, splitname, linelimit, numparts):
"""Preprocess a dataset as a whole and in shards to prepare environment for merge test.
Load specified HF dataset using given split and record limit.
Write the dataset to a jsonl file and preprocess.
Also split dataset into numparts contiguous shards, write each shard to its own jsonl, and preprocess each.
Return path to the full dataset and a list of paths for each shard."""
src_dir = self.src_dir
data_dir = f"{self.data_dir}/gpt2"
# preprocess_data_dist requires one to have already downloaded the input HF dataset.
# We do that by running this script before the test.
dset = download_hf_dataset(dsetname)[splitname]
# limit the test to use the first linelimit entries to be faster
dset = dset.select(range(linelimit))
# write jsonl file of full dataset
json_ds = f"{output_dir}/ds-full.jsonl"
dset.to_json(json_ds)
# process full jsonl into indexed dataset file
ds_full = f"{output_dir}/ds-full"
cmd = f"""
python {src_dir}/tools/preprocess_data.py
--input {json_ds}
--output-prefix {ds_full}
--dataset-impl mmap
--tokenizer-type GPT2BPETokenizer
--merge-file {data_dir}/gpt2-tiny-merges.txt
--vocab {data_dir}/gpt2-tiny-vocab.json
--append-eod
""".split()
ds_full += '_text_document'
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
execute_subprocess_async(cmd, env=self.get_env())
# write each part to its own json file
ds_parts = []
for i in range(numparts):
json_part = f"{output_dir}/ds-part-{i}.jsonl"
dset.shard(numparts, i, contiguous=True).to_json(json_part)
ds_part = f"{output_dir}/ds-part-{i}"
ds_parts.append(ds_part + '_text_document')
cmd = f"""
python {src_dir}/tools/preprocess_data.py
--input {json_part}
--output-prefix {ds_part}
--dataset-impl mmap
--tokenizer-type GPT2BPETokenizer
--merge-file {data_dir}/gpt2-tiny-merges.txt
--vocab {data_dir}/gpt2-tiny-vocab.json
--append-eod
""".split()
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
execute_subprocess_async(cmd, env=self.get_env())
return ds_full, ds_parts
def test_merge_serial(self):
"""Check that serial merge of partial dataset files produces the same file as the full dataset."""
src_dir = self.src_dir
output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False)
# process full dataset, and process the full dataset as 3 contiguous chunks
ds_full, ds_parts = self.preprocess_partitioned_dataset(output_dir, 'stas/openwebtext-10k', 'train', 100, 3)
# merge the part files into a single indexed dataset
ds_merged = f"{output_dir}/ds-merged"
cmd = f"""
python {src_dir}/tools/merge_preprocessed_data.py
--datasets {" ".join(ds_parts)}
--output-prefix {ds_merged}
""".split()
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
execute_subprocess_async(cmd, env=self.get_env())
# the full dataset and the merged dataset should be identical
self.compare_meg_data_files(ds_full, ds_merged)
def test_merge_distributed(self):
"""Check that serial merge of partial dataset files produces the same file as the full dataset."""
src_dir = self.src_dir
output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False)
# process full dataset, and process the full dataset as 3 contiguous chunks
ds_full, ds_parts = self.preprocess_partitioned_dataset(output_dir, 'stas/openwebtext-10k', 'train', 100, 3)
# merge the part files into a single indexed dataset
ds_merged = f"{output_dir}/ds-merged"
cmd = f"""
python -m torch.distributed.launch --nproc_per_node 6 {src_dir}/tools/merge_preprocessed_data.py
--merge distributed
--datasets {" ".join(ds_parts)}
--output-prefix {ds_merged}
--torch-backend gloo
""".split()
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
execute_subprocess_async(cmd, env=self.get_env())
# the full dataset and the merged dataset should be identical
self.compare_meg_data_files(ds_full, ds_merged)
def test_process_data_microsoft(self):
"""We want to be stable to Microsoft version."""
src_dir = self.src_dir
data_dir = f"{self.data_dir}/gpt2"
output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False)
input_path = f"{self.tests_dir}/data/gpt2/openwebtext-1000.jsonl"
output_prefix = f"{output_dir}/test-ds-meg-gpt2-openwebtext"
cmd = f"""
python {src_dir}/tools/preprocess_data.py
--input {input_path}
--output-prefix {output_prefix}
--dataset-impl mmap
--tokenizer-type GPT2BPETokenizer
--merge-file {data_dir}/gpt2-tiny-merges.txt
--vocab {data_dir}/gpt2-tiny-vocab.json
--append-eod
--workers 2
""".split()
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
execute_subprocess_async(cmd, env=self.get_env())
self.compare_meg_data_files(f"{output_prefix}_text_document", f"{data_dir}/meg-gpt2-openwebtext_text_document")
def test_process_data_dist_microsoft(self):
"""We want to be stable to Microsoft version."""
src_dir = self.src_dir
data_dir = f"{self.data_dir}/gpt2"
output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False)
output_prefix = f"{output_dir}/test-ds-meg-gpt2-openwebtext_1k"
# preprocess_data_dist requires one to have already downloaded the input HF dataset.
# We do that by running this script before the test.
dsetname = 'stas/openwebtext-10k'
download_hf_dataset(dsetname)
cmd = f"""
python -m torch.distributed.launch --nproc_per_node 2 {src_dir}/tools/preprocess_data_dist.py
--input {dsetname}
--count 1000
--output-prefix {output_prefix}
--dataset-impl mmap
--tokenizer-type GPT2BPETokenizer
--merge-file {data_dir}/gpt2-tiny-merges.txt
--vocab {data_dir}/gpt2-tiny-vocab.json
--append-eod
""".split()
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
execute_subprocess_async(cmd, env=self.get_env())
self.compare_meg_data_files(f"{output_prefix}_text_document", f"{data_dir}/meg-gpt2-openwebtext_text_document")
def test_process_data_dist_serial_microsoft(self):
"""We want to be stable to Microsoft version."""
src_dir = self.src_dir
data_dir = f"{self.data_dir}/gpt2"
output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False)
output_prefix = f"{output_dir}/test-ds-meg-gpt2-openwebtext_1k"
# preprocess_data_dist requires one to have already downloaded the input HF dataset.
# We do that by running this script before the test.
dsetname = 'stas/openwebtext-10k'
download_hf_dataset(dsetname)
cmd = f"""
python -m torch.distributed.launch --nproc_per_node 2 {src_dir}/tools/preprocess_data_dist.py
--input {dsetname}
--count 1000
--merge serial
--output-prefix {output_prefix}
--dataset-impl mmap
--tokenizer-type GPT2BPETokenizer
--merge-file {data_dir}/gpt2-tiny-merges.txt
--vocab {data_dir}/gpt2-tiny-vocab.json
--append-eod
""".split()
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
execute_subprocess_async(cmd, env=self.get_env())
self.compare_meg_data_files(f"{output_prefix}_text_document", f"{data_dir}/meg-gpt2-openwebtext_text_document")
import unittest
from random import randint
from unittest.mock import patch
import deepspeed
import torch
import logging
import numpy as np
import pytest
from megatron import initialize_megatron, get_args, get_tokenizer, global_vars
from megatron.testing_utils import TestCasePlus, mockenv_context, flatten_arguments, require_deepspeed, require_torch_multi_gpu
from megatron.training import setup_model_and_optimizer
from megatron.mpu.mappings import gather_from_tensor_model_parallel_region
from pretrain_gpt import model_provider as gpt_model_provider, get_batch_pipe as get_gpt_batch_pipe
from pretrain_prefix_lm import model_provider as prefix_lm_model_provider, get_batch_pipe as get_prefix_lm_batch_pipe
import multiprocessing as mp
from multiprocessing import Pool
from megatron.checkpointing import save_checkpoint
from megatron.utils import get_ltor_masks_and_position_ids
@require_deepspeed
@require_torch_multi_gpu
class MegDSTestTP(TestCasePlus):
def get_default_args(self):
"""return a dictionary with key as argument name and value as additional arguments"""
data_dir = f"{self.data_dir}/gpt2"
return {
# GPT_ARGS
"--num-layers": "2",
"--hidden-size": "128",
"--num-attention-heads": "4",
"--seq-length": "256",
"--max-position-embeddings": "256",
"--micro-batch-size": "4",
"--global-batch-size": "8",
"--lr-decay-iters": "320000",
"--lr-decay-style": "cosine",
"--lr": "0.00015",
"--min-lr": "1.0e-5",
"--train-iters": "5000",
"--tokenizer-type": "GPT2BPETokenizer",
"--merge-file": f"{data_dir}/gpt2-tiny-merges.txt",
"--vocab-file": f"{data_dir}/gpt2-tiny-vocab.json",
"--data-impl": "mmap",
"--split": "949,50,1",
"--distributed-backend": "nccl",
"--weight-decay": "1e-2",
"--clip-grad": "1.0",
"--lr-warmup-fraction": ".01",
"--fp16": "",
"--attention-dropout": "0",
"--hidden-dropout": "0",
# OUTPUT_ARGS
"--log-interval": "10",
"--save-interval": "500",
"--eval-interval": "100",
"--eval-iters": "10",
"--checkpoint-activations": "",
#ds args
"--deepspeed": "",
"--deepspeed_config":f"{self.test_file_dir_str}/ds_config.json",
"--zero-stage": "1",
"--deepspeed-activation-checkpointing": ""
# DATA_ARGS
}
def setUp(self) -> None:
super().setUp()
# We reset all global variables
global_vars._GLOBAL_ARGS = None
global_vars._GLOBAL_NUM_MICROBATCHES_CALCULATOR = None
global_vars._GLOBAL_TOKENIZER = None
global_vars._GLOBAL_TENSORBOARD_WRITER = None
global_vars._GLOBAL_ADLR_AUTORESUME = None
global_vars._GLOBAL_TIMERS = None
def infer_model(args):
tp_index, tp_size, command_args, token_ids, save, load = args
dist_env = dict(
MASTER_ADDR="localhost", MASTER_PORT="9991", RANK=str(tp_index), LOCAL_RANK=str(tp_index), WORLD_SIZE=str(tp_size)
)
logging.getLogger().critical("Process: starting")
#Hack
import megatron.initialize as init
init.git_ds_info = lambda: None
with patch('sys.argv', flatten_arguments(command_args)):
with mockenv_context(**dist_env):
def create_model_inputs(tokens):
args = get_args()
attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
tokens,
tokenizer.eod,
args.reset_position_ids,
args.reset_attention_mask,
args.eod_mask_loss,
prefix_indices=None,
loss_on_targets_only=False)
return (tokens, position_ids, attention_mask), (tokens, loss_mask)
deepspeed.init_distributed()
initialize_megatron()
args = get_args()
tokenizer = get_tokenizer()
model, _, _ = setup_model_and_optimizer(gpt_model_provider)
model = model[0]
if load is not None:
# Hack (same as in eval_harness/evaluate.py)
# Loading pipelined models in deepspeed with a different TP than they were originally trained with fails
# due to a sanity check that makes sure that all state_dicts being merged contain attention layers.
# This, however, does not hold for pipelining, where we also merge the state_dict of the embeddings,
# which does not contain these attention-specific keys.
#
# Deepspeed does however manage to load the model if we just turn off this sanity check.
deepspeed.runtime.state_dict_factory.MegatronSDLoader.sanity_check = lambda self, ckpt_file_name: None
zero_enabled = model._config.zero_enabled
model._config.zero_enabled = False
_, _ = model.load_checkpoint(load, load_optimizer_states=False, load_lr_scheduler_states=False, load_module_only=True)
model._config.zero_enabled = zero_enabled
if token_ids is None:
token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))
# eod is a special token
token_ids[token_ids == tokenizer.eod] += 1
token_ids[token_ids == tokenizer.eod] %= args.padded_vocab_size
else:
token_ids = torch.tensor(token_ids)
model.micro_batches = 1
model.set_batch_fn(create_model_inputs)
# process batch
input_batch = get_gpt_batch_pipe({"text": token_ids})[0]
# get a modified version of the first batch, we change a specific index
changed_index = randint(0, args.seq_length - 2)
input_token_ids_changed = input_batch[0].clone()
# We increment the token_id by one for that index in order to artificially change the sequence.
input_token_ids_changed[:, changed_index] = \
(input_token_ids_changed[:,changed_index] + 1) % args.padded_vocab_size
output = model.eval_batch(iter([token_ids]), compute_loss = False, reduce_output = None)[0]
output = gather_from_tensor_model_parallel_region(output)
if save is not None:
args.save = save
save_checkpoint(0, [model], None, None)
return (output[0].detach().cpu().numpy(), token_ids.detach().cpu().numpy())
def test_alibi_tp(self):
mp.set_start_method('spawn', force=True)
cp_dir = self.get_auto_remove_tmp_dir()
command_args = self.get_default_args()
command_args["--pad-vocab-size-to"] = "5120" # This is equal to 128 * 40 which is above the len of gp2-tiny vocabulary
command_args["--position-embedding-type"] = "alibi"
command_args["--tensor-model-parallel-size"] = "1"
pool = Pool(1)
result = pool.map(MegDSTestTP.infer_model, [((0, 1, command_args, None, cp_dir, None))])
pool.close()
pool.join()
output, tokens = result[0]
logging.getLogger().info("First done!")
command_args["--tensor-model-parallel-size"] = "2"
pool = Pool(2)
result = pool.map(MegDSTestTP.infer_model, [((0, 2, command_args, tokens, None, cp_dir)), ((1, 2, command_args, tokens, None, cp_dir))])
pool.close()
pool.join()
output2, tokens = result[0]
logging.getLogger().critical(output-output2)
self.assertTrue(np.allclose(output,output2, atol=5e-3, rtol=0), "Different results when running with TP=1 and TP=2")
def test_embedding_matrix_tp(self):
mp.set_start_method('spawn', force=True)
cp_dir = self.get_auto_remove_tmp_dir()
command_args = self.get_default_args()
command_args["--pad-vocab-size-to"] = "5120" # This is equal to 128 * 40 which is above the len of gp2-tiny vocabulary
command_args["--seq-length"] = "4"
command_args["--micro-batch-size"] = "2"
tokens = [[5119, 0, 1, 5100],[0, 1, 5111, 5101]]
command_args["--tensor-model-parallel-size"] = "1"
pool = Pool(1)
# tp_index, tp_size, command_args, token_ids, save, load
result = pool.map(MegDSTestTP.infer_model, [((0, 1, command_args, tokens, cp_dir, None))])
pool.close()
pool.join()
output, _ = result[0]
logging.getLogger().info("First done!")
command_args["--tensor-model-parallel-size"] = "2"
pool = Pool(2)
result = pool.map(MegDSTestTP.infer_model, [((0, 2, command_args, tokens, None, cp_dir)), ((1, 2, command_args, tokens, None, cp_dir))])
pool.close()
pool.join()
output2, _ = result[0]
logging.getLogger().critical(output-output2)
self.assertTrue(np.allclose(output,output2, atol=5e-3, rtol=0), "Different results when running with TP=1 and TP=2")
def test_embedding_matrix_tp_with_invalid_tokens_ids(self):
mp.set_start_method('spawn', force=True)
command_args = self.get_default_args()
command_args["--pad-vocab-size-to"] = "5120" # This is equal to 128 * 40 which is above the len of gp2-tiny vocabulary
command_args["--seq-length"] = "4"
command_args["--micro-batch-size"] = "2"
tokens = [[5120, 0, 1, 2],[0, 1, 3, 4]]
command_args["--tensor-model-parallel-size"] = "1"
pool = Pool(1)
with pytest.raises(Exception) as exc_info:
_ = pool.map(MegDSTestTP.infer_model, [((0, 1, command_args, tokens, None, None))])
pool.close()
pool.join()
self.assertIn("There is an input id in the input that is greater than the highest possible input id" , str(exc_info.value))
logging.getLogger().info("First done!")
command_args["--tensor-model-parallel-size"] = "2"
pool = Pool(2)
with pytest.raises(Exception) as exc_info:
_ = pool.map(MegDSTestTP.infer_model, [((0, 2, command_args, tokens, None, None)), ((1, 2, command_args, tokens, None, None))])
pool.close()
pool.join()
self.assertIn("There is an input id in the input that is greater than the highest possible input id", str(exc_info.value))
def test_tokenizer_vocab_size_multiple_of_tp_size(self):
mp.set_start_method('spawn', force=True)
command_args = self.get_default_args()
command_args["--pad-vocab-size-to"] = "5121" # This is equal to 128 * 40 + 1 which is above the len of gp2-tiny vocabulary
command_args["--micro-batch-size"] = "4"
command_args["--tensor-model-parallel-size"] = "2"
command_args["--make-vocab-size-divisible-by"] = "1"
pool = Pool(2)
with pytest.raises(Exception) as exc_info:
_ = pool.map(MegDSTestTP.infer_model, [((0, 2, command_args, None, None, None)), ((1, 2, command_args, None, None, None))])
pool.close()
pool.join()
self.assertEqual(str(exc_info.value), "5121 is not divisible by 2")
def test_tokenizer_raise_error_make_vocab_size_divisible_by(self):
mp.set_start_method('spawn', force=True)
command_args = self.get_default_args()
command_args["--pad-vocab-size-to"] = "5121" # This is equal to 128 * 40 + 1 which is above the len of gp2-tiny vocabulary
command_args["--micro-batch-size"] = "4"
pool = Pool(2)
with pytest.raises(Exception) as exc_info:
_ = pool.map(MegDSTestTP.infer_model, [((0, 2, command_args, None, None, None)), ((1, 2, command_args, None, None, None))])
pool.close()
pool.join()
self.assertEqual(str(exc_info.value), "5121 is not divisible by 128")
if __name__ == '__main__':
unittest.main()
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import dataclasses
import io
import json
import os
import glob
import re
import shutil
import unittest
from pathlib import Path
from parameterized import parameterized
from megatron.testing_utils import (
CaptureStdout,
CaptureStd,
TestCasePlus,
execute_subprocess_async,
get_gpu_count,
require_bnb_non_decorator,
require_deepspeed,
require_torch_gpu,
set_seed
)
set_seed(42)
def get_launcher(num_gpus):
# 1. explicitly set --num_nodes=1 just in case these tests end up being run on a multi-node setup
# - it won't be able to handle that
return f"deepspeed --num_nodes 1 --num_gpus {num_gpus}".split()
def get_3d_dimensions():
num_gpus = get_gpu_count()
# with fewer gpus the preference is to first do PP>1, then TP>1, then DP>1
if num_gpus >= 8:
dp_size = 2
pp_size = 2
tp_size = 2
elif num_gpus >= 4:
dp_size = 1
pp_size = 2
tp_size = 2
elif num_gpus >= 2:
dp_size = 1
pp_size = 2
tp_size = 1
else:
dp_size = 1
pp_size = 1
tp_size = 1
return pp_size, tp_size, dp_size
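# Resulting 3D layout by GPU count (pp_size, tp_size, dp_size):
#   8+ GPUs -> (2, 2, 2); 4-7 GPUs -> (2, 2, 1); 2-3 GPUs -> (2, 1, 1); 1 GPU -> (1, 1, 1)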
@require_deepspeed
@require_torch_gpu
class MegDSTestTraining(TestCasePlus):
""" """
def setUp(self):
super().setUp()
# at times megatron fails to build kernels and doesn't remove the lock file, which makes
# subsequent runs hang - so make sure there is no lock when starting the testing
meg_lock_file_path = self.repo_root_dir_str + "/megatron/fused_kernels/build/lock"
if os.path.exists(meg_lock_file_path):
os.unlink(meg_lock_file_path)
def copy_data_to_temp(self, root_dir, prefix):
"""copy data to temp, and return paths to temp version"""
src_path = os.path.join(root_dir, prefix)
src_dirname = os.path.dirname(src_path)
tmp_dir = self.get_auto_remove_tmp_dir()
dest_path = os.path.join(tmp_dir, prefix)
dest_dirname = os.path.dirname(dest_path)
os.makedirs(dest_dirname, exist_ok=True)
for folder in os.listdir(src_dirname):
src_folder = os.path.join(src_dirname, folder)
dest_folder = os.path.join(dest_dirname, folder)
if src_folder.startswith(src_path):
if os.path.isdir(src_folder):
shutil.copytree(src_folder, dest_folder)
else:
shutil.copy2(src_folder, dest_folder)
return dest_path
def get_variation_config(self, variation, output_dir, n_samples=None):
data_dir = self.copy_data_to_temp(self.data_dir,"gpt2")
pp_size, tp_size, dp_size = get_3d_dimensions()
num_gpus = pp_size * tp_size * dp_size
print(f"Using {num_gpus} GPUs")
if variation == "bnb":
# we want to make sure at least tp=2 is used, so we swap tp and pp
pp_size, tp_size = tp_size, pp_size
if n_samples is None:
n_samples = 300 # about 56 iterations
exit_interval = 20 # some samples in the first half and then some more in the 2nd half after resume
seq_len = 128
# common/shared configs
ds_args = f"""
--deepspeed
--deepspeed_config {self.test_file_dir_str}/ds_config.json
--zero-stage 1
--deepspeed-activation-checkpointing
""".split()
args = f"""
--tensor-model-parallel-size {tp_size}
--pipeline-model-parallel-size {pp_size}
--distributed-backend nccl
--log-interval 1
--save-interval 10
--eval-interval 10
--eval-iters 5
--checkpoint-activations
--partition-activations
--exit-interval {exit_interval}
--merge-file {data_dir}/gpt2-tiny-merges.txt
--vocab-file {data_dir}/gpt2-tiny-vocab.json
--save {output_dir}/checkpoints
--load {output_dir}/checkpoints
--data-path {data_dir}/meg-gpt2-openwebtext_text_document
--tensorboard-dir {output_dir}/tensorboard
--tensorboard-queue-size 5
--log-timers-to-tensorboard
--log-batch-size-to-tensorboard
--log-validation-ppl-to-tensorboard
--num-layers 2
--hidden-size 64
--num-attention-heads 2
--seq-length {seq_len}
--max-position-embeddings 1024
--micro-batch-size 1
--global-batch-size 16
--optimizer adam
--adam-beta1 0.9
--adam-beta2 0.95
--adam-eps 1e-8
--lr 1e-4
--lr-warmup-samples 5
--clip-grad 1.0
--weight-decay 1e-1
--embed-layernorm
--sync-tp-duplicated-parameters
--fp16
--log-level debug
--log-level-replica info
""".split()
if variation == "base":
new_args = f"""
--rampup-batch-size 2 2 {n_samples}
--train-samples {n_samples}
--lr-decay-samples 6
""".split()
new_ds_args = f"""
--deepspeed_config {self.test_file_dir_str}/ds_config.json
""".split()
elif variation == "bnb":
# BitsAndBytes - 8-bit optimizer
new_args = f"""
--rampup-batch-size 2 2 {n_samples}
--train-samples {n_samples}
--lr-decay-samples 6
--use-bnb-optimizer
""".split()
new_ds_args = f"""
--deepspeed_config {self.test_file_dir_str}/ds_config.json
""".split()
elif variation == "cl":
# CurriculumLearning
lr_decay_samples = 6
lr_decay_tokens = lr_decay_samples * seq_len
train_tokens = n_samples * seq_len
# XXX: if changing seq_len from 128, must adjust ds config to:
# curriculum_learning.max_difficulty: $SEQLEN
# XXX: probably we should write the ds config on the fly to keep everything in sync,
# rather than using the pre-saved config
new_args = f"""
--train-samples {n_samples*2}
--train-tokens {train_tokens}
--lr-decay-tokens {lr_decay_tokens}
""".split()
new_ds_args = f"""
--deepspeed_config {self.test_file_dir_str}/ds_config_cl.json
""".split()
elif variation == "glu":
new_args = f"""
--rampup-batch-size 2 2 {n_samples}
--train-samples {n_samples}
--lr-decay-samples 6
--no-bias-gelu-fusion
--glu-activation geglu
""".split()
new_ds_args = f"""
--deepspeed_config {self.test_file_dir_str}/ds_config.json
""".split()
elif variation == "alibi":
new_args = f"""
--rampup-batch-size 2 2 {n_samples}
--train-samples {n_samples}
--lr-decay-samples 6
--position-embedding-type alibi
""".split()
new_ds_args = f"""
--deepspeed_config {self.test_file_dir_str}/ds_config.json
""".split()
else:
raise ValueError(f"Don't know of variation {variation}")
args.extend(new_args)
ds_args.extend(new_ds_args)
return args, ds_args, num_gpus
def test_kill_switch(self):
variation = "base"
src_dir = self.src_dir
output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False)
kill_switch_path = os.path.join(output_dir, "kill-switch-xyz")
args, ds_args, num_gpus = self.get_variation_config(variation, output_dir)
args += f"--kill-switch-path {kill_switch_path}".split()
script = [f"{src_dir}/pretrain_gpt.py"]
launcher = get_launcher(num_gpus)
cmd = launcher + script + args + ds_args
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
# 1. kill switch armed but not triggered
with CaptureStdout() as cs:
execute_subprocess_async(cmd, env=self.get_env())
# test deepspeed is running
self.assertIn("DeepSpeed info", cs.out)
# 2. trigger kill switch
open(kill_switch_path, "w").close()  # create the kill-switch file
with CaptureStd() as cs:
execute_subprocess_async(cmd, env=self.get_env())
self.assertIn(f"Detected kill switch at {kill_switch_path}", cs.out)
# test deepspeed wasn't run
self.assertNotIn("DeepSpeed info", cs.out)
@parameterized.expand(["base", "cl", "bnb", "glu", "alibi"])
def test_training_all(self, variation):
# optional runs
if variation == "bnb":
require_bnb_non_decorator()
# all in one test
src_dir = self.src_dir
output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False)
args, ds_args, num_gpus = self.get_variation_config(variation, output_dir)
script = [f"{src_dir}/pretrain_gpt.py"]
launcher = get_launcher(num_gpus)
cmd = launcher + script + args + ds_args
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
# 1. test training from scratch (no checkpoint)
with CaptureStdout() as cs:
execute_subprocess_async(cmd, env=self.get_env())
# test deepspeed is running
self.assertIn("DeepSpeed info", cs.out)
# test reports
self.assertIn("consumed samples", cs.out)
# test there should be no checkpoint this round
self.assertIn(f"Unable to find latest file at {output_dir}/checkpoints/latest", cs.out)
# test checkpoint saving
self.assertIn("successfully saved checkpoint at iteration", cs.out)
# test tensorboard
tensorboard_files = glob.glob(f"{output_dir}/tensorboard/events*")
self.assertEqual(len(tensorboard_files), 1, "tensorboard files")
if variation == "glu":
self.assertIn("Using GLU activation: GELU", cs.out)
if variation == "alibi":
self.assertIn("Using Alibi", cs.out)
# 2. test training from checkpoint: resume
# now do it again, this time resuming from the checkpoint
with CaptureStdout() as cs:
execute_subprocess_async(cmd, env=self.get_env())
# test checkpoint loading
self.assertIn(f"successfully loaded checkpoint from {output_dir}/checkpoints", cs.out)
# test reports
self.assertIn("consumed samples", cs.out)
# test checkpoint saving
self.assertIn("successfully saved checkpoint at iteration", cs.out)
# test tensorboard (1 file from the first run, plus 1 now)
tensorboard_files = glob.glob(f"{output_dir}/tensorboard/events*")
self.assertEqual(len(tensorboard_files), 2, "tensorboard files")
if variation == "glu":
self.assertIn("Using GLU activation: GELU", cs.out)
@parameterized.expand([(True, True), (False, False), (True, False), (False, True)])
def test_training_prefix_lm_all(self, loss_on_targets_only, reweight_loss_based_on_position_frequency):
# all in one test
src_dir = self.src_dir
data_dir = self.copy_data_to_temp(self.data_dir,"gpt2")
output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False)
logs_dir = f"{output_dir}/logs"
Path(logs_dir).mkdir(parents=True, exist_ok=True)
pp_size, tp_size, dp_size = get_3d_dimensions()
num_gpus = pp_size * tp_size * dp_size
n_samples = 200 # about 37 iterations
exit_interval = 20 # some samples in the first half and then some more in the 2nd half after resume
args = f"""
--tensor-model-parallel-size {tp_size}
--pipeline-model-parallel-size {pp_size}
--distributed-backend nccl
--num-layers 2
--hidden-size 64
--num-attention-heads 2
--seq-length 128
--max-position-embeddings 1024
--micro-batch-size 1
--rampup-batch-size 2 2 {n_samples}
--global-batch-size 16
--train-samples {n_samples}
{"--loss-on-targets-only" if loss_on_targets_only else ""}
{"--reweight-loss-based-on-position-frequency" if reweight_loss_based_on_position_frequency else ""}
--optimizer adam
--adam-beta1 0.9
--adam-beta2 0.95
--adam-eps 1e-8
--lr 1e-4
--lr-warmup-samples 5
--clip-grad 1.0
--weight-decay 1e-1
--fp16
--log-interval 5
--save-interval 10
--eval-interval 10
--eval-iters 5
--checkpoint-activations
--exit-interval {exit_interval}
--merge-file {data_dir}/gpt2-tiny-merges.txt
--vocab-file {data_dir}/gpt2-tiny-vocab.json
--log-path {logs_dir}
--save {output_dir}/checkpoints
--load {output_dir}/checkpoints
--data-path {data_dir}/meg-gpt2-openwebtext_text_document
--tensorboard-dir {output_dir}/tensorboard
--tensorboard-queue-size 5
--log-timers-to-tensorboard
--log-batch-size-to-tensorboard
--log-validation-ppl-to-tensorboard
--log-level debug
""".split()
ds_args = f"""
--deepspeed
--deepspeed_config {self.test_file_dir_str}/ds_config.json
--zero-stage 1
--deepspeed-activation-checkpointing
""".split()
script = [f"{src_dir}/pretrain_prefix_lm.py"]
launcher = get_launcher(num_gpus)
cmd = launcher + script + args + ds_args
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
# 1. test training from scratch (no checkpoint)
with CaptureStdout() as cs:
execute_subprocess_async(cmd, env=self.get_env())
# test deepspeed is running
self.assertIn("DeepSpeed info", cs.out)
# test reports
self.assertIn("consumed samples", cs.out)
# test there should be no checkpoint this round
self.assertIn(f"Unable to find latest file at {output_dir}/checkpoints/latest", cs.out)
# test checkpoint saving
self.assertIn("successfully saved checkpoint at iteration", cs.out)
# test tensorboard
tensorboard_files = glob.glob(f"{output_dir}/tensorboard/events*")
self.assertEqual(len(tensorboard_files), 1, "tensorboard files")
if reweight_loss_based_on_position_frequency:
self.assertIn("Using loss reweighting", cs.out)
# 2. test training from checkpoint: resume
# now do it again, this time resuming from the checkpoint
with CaptureStdout() as cs:
execute_subprocess_async(cmd, env=self.get_env())
# test checkpoint loading
self.assertIn(f"successfully loaded checkpoint from {output_dir}/checkpoints", cs.out)
# test reports
self.assertIn("consumed samples", cs.out)
# test checkpoint saving
self.assertIn("successfully saved checkpoint at iteration", cs.out)
# test tensorboard (1 file from the first run, plus 1 now)
tensorboard_files = glob.glob(f"{output_dir}/tensorboard/events*")
self.assertEqual(len(tensorboard_files), 2, "tensorboard files")
def test_training_t0(self):
data_path = self.copy_data_to_temp(self.data_dir, "gpt2/ag_news_prompt")
output_dir = self.get_auto_remove_tmp_dir()
logs_dir = f"{output_dir}/logs"
Path(logs_dir).mkdir(parents=True, exist_ok=True)
pp_size, tp_size, dp_size = get_3d_dimensions()
num_gpus = pp_size * tp_size * dp_size
n_samples = 200 # about 37 iterations
exit_interval = 10 # some samples in the first half and then some more in the 2nd half after resume
args = f"""
--tensor-model-parallel-size {tp_size}
--pipeline-model-parallel-size {pp_size}
--distributed-backend nccl
--num-layers 2
--hidden-size 64
--num-attention-heads 2
--seq-length 128
--max-position-embeddings 1024
--position-embedding-type alibi
--micro-batch-size 1
--rampup-batch-size 2 2 {n_samples}
--global-batch-size 16
--train-samples {n_samples}
--optimizer adam
--adam-beta1 0.9
--adam-beta2 0.95
--adam-eps 1e-8
--lr 1e-4
--lr-warmup-samples 5
--clip-grad 1.0
--weight-decay 1e-1
--fp16
--log-interval 5
--save-interval 10
--eval-interval 10
--eval-iters 5
--checkpoint-activations
--exit-interval {exit_interval}
--tokenizer-type PretrainedFromHF
--tokenizer-name-or-path bigscience/tokenizer
--log-path {logs_dir}
--save {output_dir}/checkpoints
--load {output_dir}/checkpoints
--data-path {data_path}
--split 90,10,0
--tensorboard-dir {output_dir}/tensorboard
--tensorboard-queue-size 5
--log-timers-to-tensorboard
--log-batch-size-to-tensorboard
--log-validation-ppl-to-tensorboard
--log-level debug
""".split()
ds_args = f"""
--deepspeed
--deepspeed_config {self.test_file_dir_str}/ds_config.json
--zero-stage 1
--deepspeed-activation-checkpointing
""".split()
script = [f"{self.src_dir}/finetune_t0_non_causal_decoder.py"]
launcher = get_launcher(num_gpus)
cmd = launcher + script + args + ds_args
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
# 1. test training from scratch (no checkpoint)
with CaptureStdout() as cs:
execute_subprocess_async(cmd, env=self.get_env())
# test deepspeed is running
self.assertIn("DeepSpeed info", cs.out)
# test reports
self.assertIn("consumed samples", cs.out)
# test there should be no checkpoint this round
self.assertIn(f"Unable to find latest file at {output_dir}/checkpoints/latest", cs.out)
# test checkpoint saving
self.assertIn("successfully saved checkpoint at iteration", cs.out)
# test tensorboard
tensorboard_files = glob.glob(f"{output_dir}/tensorboard/events*")
self.assertEqual(len(tensorboard_files), 1, "tensorboard files")
# 2. test training from checkpoint: resume
# now do it again, this time resuming from the checkpoint
with CaptureStdout() as cs:
execute_subprocess_async(cmd, env=self.get_env())
# test checkpoint loading
self.assertIn(f"successfully loaded checkpoint from {output_dir}/checkpoints", cs.out)
# test reports
self.assertIn("consumed samples", cs.out)
# test checkpoint saving
self.assertIn("successfully saved checkpoint at iteration", cs.out)
# test tensorboard (1 file from the first run, plus 1 now)
tensorboard_files = glob.glob(f"{output_dir}/tensorboard/events*")
self.assertEqual(len(tensorboard_files), 2, "tensorboard files")
@parameterized.expand(["gpt", "prefix", "no_eval"])
def test_mode2_dataloading(self, variation):
src_dir = self.src_dir
data_dir = self.copy_data_to_temp(self.data_dir, "gpt2")
output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False)
logs_dir = f"{output_dir}/logs"
Path(logs_dir).mkdir(parents=True, exist_ok=True)
pp_size, tp_size, dp_size = get_3d_dimensions()
num_gpus = pp_size * tp_size * dp_size
n_samples = 200 # about 37 iterations
exit_interval = 20 # some samples in the first half and then some more in the 2nd half after resume
args = f"""
--tensor-model-parallel-size {tp_size}
--pipeline-model-parallel-size {pp_size}
--distributed-backend nccl
--num-layers 2
--hidden-size 64
--num-attention-heads 2
--seq-length 128
--max-position-embeddings 1024
--micro-batch-size 1
--rampup-batch-size 2 2 {n_samples}
--global-batch-size 16
--train-samples {n_samples}
--loss-on-targets-only
--optimizer adam
--adam-beta1 0.9
--adam-beta2 0.95
--adam-eps 1e-8
--lr 1e-4
--lr-warmup-samples 5
--clip-grad 1.0
--weight-decay 1e-1
--fp16
--log-interval 5
--save-interval 10
--eval-interval 10
--eval-iters 5
--checkpoint-activations
--exit-interval {exit_interval}
--merge-file {data_dir}/gpt2-tiny-merges.txt
--vocab-file {data_dir}/gpt2-tiny-vocab.json
--log-path {logs_dir}
--save {output_dir}/checkpoints
--tensorboard-dir {output_dir}/tensorboard
--tensorboard-queue-size 5
--log-timers-to-tensorboard
--log-batch-size-to-tensorboard
--log-validation-ppl-to-tensorboard
""".split()
data_args = [
"--train-weighted-split-paths", f'TRAIN: 1 0:0.95 {data_dir}/meg-gpt2-openwebtext_text_document, 0.3 0:0.90 {data_dir}/meg-gpt2-openwebtext_text_document']
if variation != "no_eval":
data_args += ["--valid-weighted-split-paths", f'VALID1: 1 0.95:0.98 {data_dir}/meg-gpt2-openwebtext_text_document, 0.3 0.90:0.99 {data_dir}/meg-gpt2-openwebtext_text_document',
f'VALID2: 0.5 0.95:0.97 {data_dir}/meg-gpt2-openwebtext_text_document, 0.5 0.90:0.98 {data_dir}/meg-gpt2-openwebtext_text_document']
ds_args = f"""
--deepspeed
--deepspeed_config {self.test_file_dir_str}/ds_config.json
--zero-stage 1
--deepspeed-activation-checkpointing
""".split()
if variation == "prefix":
script = [f"{src_dir}/pretrain_prefix_lm.py"]
else:
script = [f"{src_dir}/pretrain_gpt.py"]
launcher = get_launcher(num_gpus)
cmd = launcher + script + args + data_args + ds_args
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
# 1. test training from scratch (no checkpoint)
with CaptureStdout() as cs:
execute_subprocess_async(cmd, env=self.get_env())
# test deepspeed is running
self.assertIn("DeepSpeed info", cs.out)
# test reports
self.assertIn("consumed samples", cs.out)
# test checkpoint saving
self.assertIn("successfully saved checkpoint at iteration", cs.out)
# test tensorboard
tensorboard_files = glob.glob(f"{output_dir}/tensorboard/events*")
self.assertEqual(len(tensorboard_files), 1, "tensorboard files")
def test_skip_train_iteration(self):
# skip iterations setup
extra_args = f"""
--skip-train-iteration-range 2-2 4-7
""".split()
src_dir = self.src_dir
output_dir = self.get_auto_remove_tmp_dir()
args, ds_args, num_gpus = self.get_variation_config("base", output_dir, n_samples=200)
args.extend(extra_args)
script = [f"{src_dir}/pretrain_gpt.py"]
launcher = get_launcher(num_gpus)
cmd = launcher + script + args + ds_args
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
with CaptureStdout() as cs:
execute_subprocess_async(cmd, env=self.get_env())
# check skipped iterations
self.assertIn("Skipped iterations 2 to 2 due to --skip-train-iteration-range flag", cs.out)
self.assertIn("Skipped iterations 4 to 7 due to --skip-train-iteration-range flag", cs.out)
train_iterations = range(1,10)
for i in train_iterations:
self.assertTrue(f"iteration {i:8d}/" in cs.out)
# Test suite tools
# Make tiny tokenizer files
Currently, for gpt2, run:
```
./shrink-tokenizer.py
```
This leaves tiny vocab and merges files under the generated dir `tiny`, which we add to the repo under `data/gpt2`:
```
cp tiny/merges.txt ../data/gpt2/gpt2-tiny-merges.txt
cp tiny/vocab.json ../data/gpt2/gpt2-tiny-vocab.json
```
Note: the tiny vocab was set to 5000 items after experimenting with the resulting index file sizes. A tiny vocab of 500 (with correspondingly adjusted merge entries) generated very large index files, so it actually ends up costing more in final file size. A vocab of 5000 produced index files almost identical in size to those built with the original 50k vocab.
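As a quick sanity check, here is a minimal sketch (it only assumes `transformers` is installed and uses the file names copied above, relative to this `tests/tools` dir) that verifies the shrunken files still load and report the expected vocab size:
```
from transformers import GPT2Tokenizer

# load the tiny vocab/merges copied into tests/data/gpt2 above
tok = GPT2Tokenizer(
    vocab_file="../data/gpt2/gpt2-tiny-vocab.json",
    merges_file="../data/gpt2/gpt2-tiny-merges.txt",
)
print(len(tok))                      # expected: 5000
print(tok.tokenize("hello world"))   # splits into (possibly more) sub-word pieces than full gpt2
```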
# Make tiny pre-processed index
To be used in test training, run:
```
./openwebtext-to-jsonl.py
```
generates:
```
openwebtext-1000.jsonl
```
We don't store the jsonl file in the repo, to keep the repo size small, so it's a temporary file.
Now we pre-process it:
```
cd ../..
input=tests/tools/openwebtext-1000.jsonl
python tools/preprocess_data.py \
--input $input \
--output-prefix tests/data/gpt2/meg-gpt2-openwebtext \
--dataset-impl mmap \
--tokenizer-type GPT2BPETokenizer \
--merge-file tests/data/gpt2/gpt2-tiny-merges.txt \
--vocab tests/data/gpt2/gpt2-tiny-vocab.json \
--append-eod \
--workers 6
```
and voila we now have:
```
ls -sh1 tests/data/gpt2/meg-gpt2-openwebtext*
2.6M tests/data/gpt2/meg-gpt2-openwebtext_text_document.bin
20K tests/data/gpt2/meg-gpt2-openwebtext_text_document.idx
```
which we can now commit and use in tests.
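To peek at what was generated before committing it, here is a minimal sketch (run from the repo root; it assumes the `megatron.data.indexed_dataset` helpers that ship with this repo):
```
from megatron.data.indexed_dataset import make_dataset

# open the memory-mapped dataset by its prefix (no .bin/.idx extension)
ds = make_dataset("tests/data/gpt2/meg-gpt2-openwebtext_text_document", impl="mmap")
print(len(ds))       # number of documents
print(ds[0][:20])    # first 20 token ids of the first document
```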
#!/usr/bin/env python
# generate a jsonl version of a small slice of a dataset that can be fed to megatron-lm preprocessor
import sys
from datasets import load_dataset
dataset_name = "stas/openwebtext-10k"
# subset to jsonlines
n_samples = 1000
ds = load_dataset(dataset_name, split='train')
ds_small = ds.select(range(n_samples))
path = f"openwebtext-{n_samples}.jsonl"
ds_small.to_json(path, orient="records", lines=True)
#!/usr/bin/env python
# produce a tiny tokenizer which we can use in testing (so that it won't take much space in the repo)
import json
from transformers import AutoTokenizer
from tokenizers import Tokenizer
mname = "gpt2"
vocab_keep_items = 5000
tokenizer = AutoTokenizer.from_pretrained(mname, use_fast=True)
assert tokenizer.is_fast, "This only works for fast tokenizers."
tokenizer_json = json.loads(tokenizer._tokenizer.to_str())
vocab = tokenizer_json["model"]["vocab"]
if tokenizer_json["model"]["type"] == "BPE":
if "gpt2" in mname:
new_vocab = { token: i for token, i in vocab.items() if i < vocab_keep_items-1 }
new_vocab["<|endoftext|>"] = vocab_keep_items-1
else:
new_vocab = { token: i for token, i in vocab.items() if i < vocab_keep_items }
merges = tokenizer_json["model"]["merges"]
new_merges = []
for i in range(len(merges)):
a, b = merges[i].split()
new_token = "".join((a, b))
if a in new_vocab and b in new_vocab and new_token in new_vocab:
new_merges.append(merges[i])
tokenizer_json["model"]["merges"] = new_merges
elif tokenizer_json["model"]["type"] == "Unigram":
new_vocab = vocab[:vocab_keep_items]
elif tokenizer_json["model"]["type"] == "WordPiece" or tokenizer_json["model"]["type"] == "WordLevel":
new_vocab = { token: i for token, i in vocab.items() if i < vocab_keep_items }
else:
raise ValueError(f"don't know how to handle {tokenizer_json['model']['type']}")
tokenizer_json["model"]["vocab"] = new_vocab
tokenizer._tokenizer = Tokenizer.from_str(json.dumps(tokenizer_json))
tokenizer.save_pretrained("tiny")
# Tools
- [sample_idxs_to_text.py](./sample_idxs_to_text.py) - want to see which text was fed at specific iterations, for example to understand why the training went astray? Then use this script. The preamble of the script contains the documentation and usage examples.
## A few notes on how we created the datasets:
### Creating the Json Lines text file
First you need to create a jsonl file containing your dataset. For this we exported from the HF-datasets format. For example for C4:
```
from datasets import load_dataset
c4 = load_dataset("c4", "en")
c4["train"].to_json("c4_en_train.jsonl")
c4["validation"].to_json("c4_en_valid.jsonl")
```
This creates quite a large file compared to the size of the HF dataset on disk (810GB vs 305GB for C4, for example).
### Megatron pre-processing
Then you need to pass that text file to the `preprocess_data.py` script for tokenization and memory-mapping. This creates two files: one storing the token indices and one storing the document starts and ends. The result will be slightly bigger than the text dataset (360GB vs 305GB for C4, for example). You can choose one of the default Megatron tokenizers (but then you have to pass merges and vocab files) or one from HF-tokenizers. For example, for our GPT-like models we reused a T5 sentencepiece-bpe tokenizer:
`python tools/preprocess_data.py --input ~/c4_en_train.jsonl --output-prefix c4_en_train --dataset-impl mmap --tokenizer-type PretrainedFromHF --tokenizer-name-or-path t5-small --workers 30 --append-eod`
Do note that adding too many workers can be counterproductive for very large datasets: as the bottleneck becomes disk writing, the intermediate per-process results pile up and can flood the RAM. In our experiments on GCP machines, running with 60 workers on C4 inevitably led the program to fail.
# Introduction
This folder is a collection of scripts for converting checkpoints of one training framework (e.g., DeepSpeed) into that of a different framework (e.g., Megatron-LM, HF Transformers).
The folder also contains scripts for inspecting checkpoint files and folders, which could be useful when developing checkpoint conversion logic. At the time of creation, this folder contains scripts to convert DeepSpeed checkpoints to Megatron-LM and HF Transformers checkpoints, the conversion that motivated this effort as part of the BigScience project.
Here is the list of checkpoint conversions provided by the available scripts:
1. [Megatron-DeepSpeed to Megatron-LM](#Megatron-DeepSpeed-to-Megatron)
1. [Megatron-DeepSpeed to HF Transformers](#Megatron-DeepSpeed-to-HF-Transformers)
## Megatron-DeepSpeed to Megatron
The (current implementation of the) converter extracts args and model parameters from a DeepSpeed checkpoint (i.e., it excludes other training states such as optimizer, scheduler, etc.) and converts them into a Megatron-LM checkpoint that similarly contains only model parameters. The converter also provides a best-effort attempt to reshape the tensor-parallelism and pipeline-parallelism degrees of the checkpoint. The resulting Megatron-LM checkpoint can be loaded into the Megatron-LM framework for finetuning or inference. Tensor parallelism (TP) and pipeline parallelism (PP) are supported in the sense that, by default, the generated Megatron-LM checkpoint (folders and files) will have the same TP and PP degrees as the training that created the input DeepSpeed checkpoint. The entry point of the converter is `deepspeed_to_megatron.py`, which has the following usage:
```bash
python tools/convert_checkpoint/deepspeed_to_megatron.py -h
Convert DeepSpeed Checkpoint to Megatron Checkpoint
usage: deepspeed_to_megatron.py [-h] [--input_folder INPUT_FOLDER]
[--output_folder OUTPUT_FOLDER]
[--target_tp TARGET_TP]
[--target_pp TARGET_PP] [--for_release]
optional arguments:
-h, --help show this help message and exit
--input_folder INPUT_FOLDER
Input DeepSpeed Checkpoint folder
--output_folder OUTPUT_FOLDER
Output Megatron checkpoint folder
--target_tp TARGET_TP
Target TP degree
--target_pp TARGET_PP
Target PP degree
--for_release Convert for release purpose, reset some (progress)
counters.
```
The following scripts, which proved useful for debugging, are also included (a minimal programmatic sketch follows the list):
1. `inspect_deepspeed_checkpoint.py`: view the contents of a DeepSpeed checkpoint folder.
2. `inspect_checkpoint.py`: view the contents of a PyTorch checkpoint file.
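For instance, the same `DeepSpeedCheckpoint` API that the converters are built on can be driven interactively; here is a minimal sketch (the checkpoint path is a placeholder):
```python
from deepspeed.checkpoint import DeepSpeedCheckpoint

# point at a DeepSpeed checkpoint folder, e.g. .../checkpoints/global_step1000
ds_checkpoint = DeepSpeedCheckpoint("/path/to/checkpoint/global_step1000")
print(ds_checkpoint.get_iteration())                     # iteration stored in the checkpoint
print(ds_checkpoint.tp_degree, ds_checkpoint.pp_degree)  # parallelism layout of the run
print(ds_checkpoint.get_args())                          # Megatron args the run was launched with
```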
## Megatron-DeepSpeed to HF Transformers
To convert from Megatron-DeepSpeed to HF Transformers directly, use:
```bash
python tools/convert_checkpoint/deepspeed_to_transformers.py \
--input_folder /path/to/Megatron-Deepspeed/checkpoint/global_step97500 \
--output_folder /path/to/transformers/checkpoint
```
Since `transformers` currently only works with PP=1/TP=1, we use the defaults `--target_tp 1 --target_pp 1`.
The script taps into `transformers` and as of this writing requires `transformers@master` (or `transformers==4.11` or later once that version is released).
Note that you may run into problems with `megatron.enums` not being defined, since `Megatron-Deepspeed` in the `bigscience-workshop` tree diverged from the `microsoft` tree. In such cases you can fix this on the fly by ensuring the former appears first in `sys.path`. For example:
```bash
PYTHONPATH=/hf/Megatron-DeepSpeed-bigscience:/hf/Megatron-DeepSpeed-microsoft \
python tools/convert_checkpoint/deepspeed_to_transformers.py \
--input_folder /path/to/Megatron-Deepspeed/checkpoint/global_step97500 \
--output_folder /path/to/transformers/checkpoint
```
Alternatively, you can convert first from Megatron-DeepSpeed to Megatron and then to HF Transformers:
```bash
# 1. Megatron-DeepSpeed to Megatron
cd /hf/Megatron-DeepSpeed-bigscience
python tools/convert_checkpoint/deepspeed_to_megatron.py --target_tp 1 --target_pp 1 \
--input_folder /path/to/Megatron-Deepspeed/checkpoint/global_step97500 \
--output_folder /path/to/Megatron/checkpoint
# 2. Megatron to HF Transformers
cd /hf/transformers
python src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py \
/path/to/Megatron/checkpoint/iter_0097500/mp_rank_00/model_optim_rng.pt
```
#!/usr/bin/env python
import sys
import argparse
import os
import torch
from pathlib import Path
# insert megatron's root dir into sys.path
root_repo_path = str(Path(__file__).resolve().parents[2])
if root_repo_path not in sys.path:
sys.path.insert(0, root_repo_path)
from megatron.tokenizer.tokenizer import _vocab_size_with_padding
from deepspeed.checkpoint.deepspeed_checkpoint import (
ARGS_KEY,
CHECKPOINT_INFO_KEY,
)
from deepspeed.checkpoint import (
DeepSpeedCheckpoint,
get_model_ckpt_name_for_rank,
get_zero_ckpt_name_for_rank,
get_layer_ckpt_name_for_rank
)
CHECKPOINT_FILE_SUFFIX = '_model_states.pt'
MP_WORLD_SIZE ='mp_world_size'
WORD_EMBEDDINGS_KEY = 'word_embeddings.weight'
ORIGINAL_VOCAB_SIZE = 'original_vocab_size'
PADDED_VOCAB_SIZE = 'padded_vocab_size'
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument('--input_folder',
default=None,
type=str,
help='Input DeepSpeed Checkpoint folder')
parser.add_argument('--output_folder',
default=None,
type=str,
help='Output Megatron checkpoint folder')
parser.add_argument('--target_tp',
default=None,
type=int,
help='Target TP degree')
parser.add_argument('--target_pp',
default=None,
type=int,
help='Target PP degree')
parser.add_argument('--target_dp',
default=None,
type=int,
help='Target DP degree')
args = parser.parse_args()
print(f'args = {args}')
return args
def _save_checkpoint(file_path, chkpt_sd):
dir, _ = os.path.split(file_path)
os.makedirs(dir, exist_ok=True)
torch.save(chkpt_sd, file_path)
def _create_transformer_layer_checkpoint(ds_checkpoint, base_folder, tp_index, pp_index):
sd_list = ds_checkpoint.get_transformer_state(tp_index, pp_index)
layer_id_list = ds_checkpoint.get_pp_transformer_map(pp_index)
assert len(sd_list) == len(layer_id_list)
for sd, layer_id in zip(sd_list, layer_id_list):
ckpt_path = get_layer_ckpt_name_for_rank(
base_folder=base_folder,
layer_id=layer_id,
tp_rank=tp_index)
_save_checkpoint(ckpt_path, sd)
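# The word-embedding matrix is padded so that the vocab divides evenly across tensor-parallel
# ranks; when the target TP degree changes, recompute the padded size for the new degree and
# drop the now-superfluous padding rows.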
def _strip_vocab_padding(ds_checkpoint, padded_vocab_tensor):
target_args = ds_checkpoint.get_args()
checkpoint_info = ds_checkpoint.get_checkpoint_info()
target_args.tensor_model_parallel_size = ds_checkpoint.tp_degree
target_args.padded_vocab_size = _vocab_size_with_padding(checkpoint_info[ORIGINAL_VOCAB_SIZE], target_args)
assert target_args.padded_vocab_size <= padded_vocab_tensor.numel()
checkpoint_info[PADDED_VOCAB_SIZE] = target_args.padded_vocab_size
unpadded_vocab_tensor = torch.narrow(padded_vocab_tensor, 0, 0, target_args.padded_vocab_size)
return unpadded_vocab_tensor.clone()
def _create_embedding_layer_checkpoint(ds_checkpoint, base_folder, tp_index):
sd = ds_checkpoint.get_embedding_state(tp_index)
if ds_checkpoint.is_change_tp_degree():
sd[WORD_EMBEDDINGS_KEY] = _strip_vocab_padding(ds_checkpoint, sd[WORD_EMBEDDINGS_KEY])
layer_id = ds_checkpoint.get_embedding_layer_id()
ckpt_path = get_layer_ckpt_name_for_rank(
base_folder=base_folder,
tp_rank=tp_index,
layer_id=layer_id)
_save_checkpoint(ckpt_path, sd)
def _create_final_norm_layer_checkpoint(ds_checkpoint, base_folder, tp_index):
sd = ds_checkpoint.get_final_norm_state(tp_index)
layer_id = ds_checkpoint.get_final_norm_layer_id()
ckpt_path = get_layer_ckpt_name_for_rank(
base_folder=base_folder,
tp_rank=tp_index,
layer_id=layer_id)
_save_checkpoint(ckpt_path, sd)
def _create_2d_parallel_checkpoint(ds_checkpoint, base_folder, tp_index,
pp_index):
sd = ds_checkpoint.get_2d_parallel_state(tp_index=tp_index,
pp_index=pp_index)
sd[MP_WORLD_SIZE] = ds_checkpoint.tp_degree
file_id = pp_index * ds_checkpoint.tp_degree + tp_index
ckpt_path = get_model_ckpt_name_for_rank(base_folder, f'{file_id:02d}')
# Adjust specific fields
sd[ARGS_KEY] = ds_checkpoint.get_args()
sd[ARGS_KEY].tensor_model_parallel_size = ds_checkpoint.tp_degree
sd[ARGS_KEY].pipeline_model_parallel_size = ds_checkpoint.pp_degree
sd[CHECKPOINT_INFO_KEY][PADDED_VOCAB_SIZE] = sd[ARGS_KEY].padded_vocab_size
_save_checkpoint(ckpt_path, sd)
def _create_zero_checkpoint(ds_checkpoint, base_folder, dp_index, pp_index, tp_index):
_2d_rank = (pp_index * ds_checkpoint.tp_degree) + tp_index
sd = ds_checkpoint.get_zero_checkpoint_state(
pp_index=pp_index,
tp_index=tp_index,
dp_index=dp_index)
ckpt_path = get_zero_ckpt_name_for_rank(base_folder=base_folder,
dp_rank=dp_index,
mp_rank=_2d_rank)
_save_checkpoint(ckpt_path, sd)
def _create_latest_file(base_folder, file_name, latest_tag):
file_path = os.path.join(base_folder, file_name)
os.makedirs(base_folder, exist_ok=True)
with open(file_path, 'w') as f:
f.write(str(latest_tag))
def main():
print(f'Convert DeepSpeed Checkpoint to DeepSpeed Checkpoint')
args = parse_arguments()
print(
f'Converting DeepSpeed checkpoint in {args.input_folder} to DeepSpeed checkpoint in {args.output_folder}'
)
ds_checkpoint = DeepSpeedCheckpoint(
args.input_folder,
args.target_tp,
args.target_pp,
args.target_dp)
iteration = ds_checkpoint.get_iteration()
latest_tag = f'global_step{iteration}'
_create_latest_file(args.output_folder,
'latest_checkpointed_iteration.txt', iteration)
_create_latest_file(args.output_folder, 'latest', latest_tag)
base_folder = os.path.join(args.output_folder, latest_tag)
for i in range(ds_checkpoint.tp_degree):
_create_embedding_layer_checkpoint(ds_checkpoint, base_folder, i)
_create_final_norm_layer_checkpoint(ds_checkpoint, base_folder, i)
for j in range(ds_checkpoint.pp_degree):
_create_transformer_layer_checkpoint(ds_checkpoint, base_folder, i, j)
_create_2d_parallel_checkpoint(ds_checkpoint, base_folder, i, j)
for i in range(ds_checkpoint.dp_degree):
for j in range(ds_checkpoint.pp_degree):
for k in range(ds_checkpoint.tp_degree):
_create_zero_checkpoint(ds_checkpoint, base_folder, i, j, k)
if __name__ == "__main__":
main()
#!/usr/bin/env python
import argparse
import os
import torch
from collections import OrderedDict
from deepspeed.checkpoint.deepspeed_checkpoint import ARGS_KEY, DeepSpeedCheckpoint
MODEL_KEY = 'model'
ARGS_KEY = 'args'
LANGUGAGE_MODEL_KEY = 'language_model'
EMBEDDING_KEY = 'embedding'
ENCODER_KEY = 'encoder'
WORD_EMBEDDINGS_FOR_HEAD_KEY = 'word_embeddings_for_head'
WORD_EMBEDDINGS_KEY = 'word_embeddings'
FINAL_LAYER_NORM_KEY = 'final_layernorm'
CHECKPOINT_VERSION_KEY = 'checkpoint_version'
CHECKPOINT_VERSION_VALUE = 3.0
ITERATION_KEY = 'iteration'
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument('--input_folder',
default=None,
type=str,
help='Input DeepSpeed Checkpoint folder')
parser.add_argument('--output_folder',
default=None,
type=str,
help='Output Megatron checkpoint folder')
parser.add_argument('--target_tp',
default=1,
type=int,
help='Target TP degree')
parser.add_argument('--target_pp',
default=1,
type=int,
help='Target PP degree')
parser.add_argument(
'--for_release',
action='store_true',
help='Convert for release purpose, reset some (progress) counters.')
args = parser.parse_args()
print(f'args = {args}')
return args
def _convert_ds_transformer_state(sd_list):
new_sd = OrderedDict()
for i, sd in enumerate(sd_list):
for key, value in sd.items():
new_key = f'layers.{i}.{key}'
new_sd[new_key] = value
return new_sd
def _create_checkpoint_paths(base_folder, iteration, tp_degree, pp_degree):
path_list = []
iter_folder = f'iter_{iteration:07d}'
for i in range(0, tp_degree):
path_list.append([])
for j in range(0, pp_degree):
rank_folder = f'mp_rank_{i:02d}' if pp_degree == 1 else f'mp_rank_{i:02d}_{j:03d}'
ckpt_path = os.path.join(rank_folder, 'model_optim_rng.pt')
path_list[i].append(
os.path.join(base_folder, iter_folder, ckpt_path))
return path_list
def _create_megatron_dict():
language_model_dict = {EMBEDDING_KEY: {}, ENCODER_KEY: {}}
megatron_dict = {
MODEL_KEY: {
LANGUGAGE_MODEL_KEY: language_model_dict
},
CHECKPOINT_VERSION_KEY: CHECKPOINT_VERSION_VALUE
}
return megatron_dict
def _save_checkpoint(file_path, chkpt_sd):
dir, _ = os.path.split(file_path)
os.makedirs(dir, exist_ok=True)
torch.save(chkpt_sd, file_path)
def _renest_sd(sd):
new_sd = OrderedDict()
for key, value in sd.items():
a, b = key.split('.')
new_sd[a] = {b: value}
return new_sd
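# Assemble a single Megatron-LM rank checkpoint from the DeepSpeed layer state dicts:
# transformer layers are re-keyed as 'layers.<i>.*' under 'encoder', the first PP stage also
# receives the 'embedding' block, and the last PP stage receives a copy of the word embeddings
# used by the tied LM head ('word_embeddings_for_head').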
def _create_rank_checkpoint(ds_checkpoint,
checkpoint_path,
tp_index,
pp_index,
for_release=False):
meg_encoder_sd = OrderedDict()
meg_embedding_sd = OrderedDict()
meg_embedding_for_head_sd = OrderedDict()
transformer_sd = ds_checkpoint.get_transformer_state(tp_index, pp_index)
meg_encoder_sd.update(_convert_ds_transformer_state(transformer_sd))
if pp_index in [0, ds_checkpoint.pp_degree - 1]:
embedding_sd = ds_checkpoint.get_embedding_state(tp_index)
nested_embedding_sd = _renest_sd(embedding_sd)
if pp_index == 0:
meg_embedding_sd.update(nested_embedding_sd)
if pp_index == ds_checkpoint.pp_degree - 1:
for key, value in embedding_sd.items():
if key.startswith(WORD_EMBEDDINGS_KEY):
fields = key.split('.')
new_fields = fields[1:]
new_key = '.'.join(new_fields)
meg_embedding_for_head_sd[new_key] = value
final_norm_sd = ds_checkpoint.get_final_norm_state(tp_index)
new_final_norm_sd = {
f'{FINAL_LAYER_NORM_KEY}.{key}': value
for key, value in final_norm_sd.items()
}
meg_encoder_sd.update(new_final_norm_sd)
checkpoint_sd = _create_megatron_dict()
iteration = ds_checkpoint.get_iteration()
checkpoint_sd[ITERATION_KEY] = iteration
if pp_index == 0:
checkpoint_sd[MODEL_KEY][LANGUGAGE_MODEL_KEY][
EMBEDDING_KEY] = meg_embedding_sd
checkpoint_sd[MODEL_KEY][LANGUGAGE_MODEL_KEY][ENCODER_KEY] = meg_encoder_sd
if pp_index == ds_checkpoint.pp_degree - 1:
checkpoint_sd[MODEL_KEY][
WORD_EMBEDDINGS_FOR_HEAD_KEY] = meg_embedding_for_head_sd
checkpoint_sd[ARGS_KEY] = ds_checkpoint.get_args()
# Adjust specific fields
checkpoint_sd[
ARGS_KEY].tensor_model_parallel_size = ds_checkpoint.tp_degree
checkpoint_sd[
ARGS_KEY].pipeline_model_parallel_size = ds_checkpoint.pp_degree
if for_release:
checkpoint_sd[ARGS_KEY].consumed_train_samples = 0
checkpoint_sd[ARGS_KEY].consumed_valid_samples = 0
return checkpoint_sd
def _create_latest_file(base_folder, iteration):
file_path = os.path.join(base_folder, 'latest_checkpointed_iteration.txt')
os.makedirs(base_folder, exist_ok=True)
with open(file_path, 'w') as f:
f.write(str(iteration))
def main():
print(f'Convert DeepSpeed Checkpoint to Megatron Checkpoint')
args = parse_arguments()
print(
f'Converting DeepSpeed checkpoint in {args.input_folder} to Megatron checkpoint in {args.output_folder}'
)
ds_checkpoint = DeepSpeedCheckpoint(args.input_folder, args.target_tp,
args.target_pp)
iteration = ds_checkpoint.get_iteration()
_create_latest_file(args.output_folder, iteration)
checkpoint_paths = _create_checkpoint_paths(args.output_folder, iteration,
ds_checkpoint.tp_degree,
ds_checkpoint.pp_degree)
for i in range(0, ds_checkpoint.tp_degree):
for j in range(0, ds_checkpoint.pp_degree):
            sd = _create_rank_checkpoint(ds_checkpoint, checkpoint_paths[i][j], i, j, args.for_release)
_save_checkpoint(checkpoint_paths[i][j], sd)
if __name__ == "__main__":
main()
#!/usr/bin/env python
import os
import torch
import json
import sys
from pathlib import Path
# insert megatron's root dir into sys.path
root_repo_path = str(Path(__file__).resolve().parents[2])
if root_repo_path not in sys.path:
sys.path.insert(0, root_repo_path)
from deepspeed.checkpoint import DeepSpeedCheckpoint
from deepspeed_to_megatron import _create_rank_checkpoint, parse_arguments
# the import was tested to work with this version
# https://github.com/huggingface/transformers/commit/0af901e83 if it diverges we may consider
# copying that version here instead
from transformers.models.megatron_gpt2.convert_megatron_gpt2_checkpoint import convert_megatron_checkpoint
from transformers import GPT2Config
def main():
# this first part comes mainly from deepspeed_to_megatron.main
args = parse_arguments()
print(
f'Converting DeepSpeed checkpoint in {args.input_folder} to HF Transformers checkpoint in {args.output_folder}'
)
ds_checkpoint = DeepSpeedCheckpoint(args.input_folder, args.target_tp,
args.target_pp)
iteration = ds_checkpoint.get_iteration()
    # `checkpoint_path` is unused by the helper, so pass None explicitly; convert TP/PP rank (0, 0)
    input_state_dict = _create_rank_checkpoint(ds_checkpoint, None, 0, 0, args.for_release)
# the 2nd part comes from transformers.models.megatron_gpt2.convert_megatron_gpt2_checkpoint.main
# Spell out all parameters in case the defaults change.
config = GPT2Config(
vocab_size=50257,
n_positions=1024,
n_ctx=1024,
n_embd=1024,
n_layer=24,
n_head=16,
n_inner=4096,
activation_function="gelu", # used to be "gelu_new" in earlier versions
resid_pdrop=0.1,
embd_pdrop=0.1,
attn_pdrop=0.1,
layer_norm_epsilon=1e-5,
initializer_range=0.02,
summary_type="cls_index",
summary_use_proj=True,
summary_activation=None,
summary_proj_to_labels=True,
summary_first_dropout=0.1,
scale_attn_weights=True,
gradient_checkpointing=False,
use_cache=True,
bos_token_id=50256,
eos_token_id=50256,
)
# Convert.
print("Converting to HF Checkpoint")
output_state_dict = convert_megatron_checkpoint(args, input_state_dict,
config)
basename = args.output_folder
os.makedirs(basename, exist_ok=True)
# Print the structure of converted state dict.
#if args.print_checkpoint_structure:
# recursive_print(None, output_state_dict)
# Store the config to file.
output_config_file = os.path.join(basename, "config.json")
output_config = config.to_dict()
output_config["architectures"] = ["GPT2LMHeadModel"]
output_config["model_type"] = "gpt2"
print(f'Saving config to "{output_config_file}"')
with open(output_config_file, "w") as f:
json.dump(output_config, f)
# Store the state_dict to file.
output_checkpoint_file = os.path.join(basename, "pytorch_model.bin")
print(f'Saving checkpoint to "{output_checkpoint_file}"')
torch.save(output_state_dict, output_checkpoint_file)
print("Now add tokenizer files and upload to the hub")
if __name__ == "__main__":
main()
#!/usr/bin/env python
from collections import OrderedDict
from copy import deepcopy
from functools import partial
from pathlib import Path
from pprint import pprint
import argparse
import glob
import itertools
import logging
import multiprocessing
import os
import re
import shutil
import sys
import torch
import tqdm
# insert megatron's root dir into sys.path
root_repo_path = str(Path(__file__).resolve().parents[2])
if root_repo_path not in sys.path:
sys.path.insert(0, root_repo_path)
from deepspeed.checkpoint import DeepSpeedCheckpoint
MODEL_KEY = 'model'
ARGS_KEY = 'args'
LANGUGAGE_MODEL_KEY = 'language_model'
EMBEDDING_KEY = 'embedding'
ENCODER_KEY = 'encoder'
WORD_EMBEDDINGS_FOR_HEAD_KEY = 'word_embeddings_for_head'
WORD_EMBEDDINGS_KEY = 'word_embeddings'
FINAL_LAYER_NORM_KEY = 'final_layernorm'
CHECKPOINT_VERSION_KEY = 'checkpoint_version'
CHECKPOINT_VERSION_VALUE = 3.0
ITERATION_KEY = 'iteration'
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument('--input_folder',
type=str,
help='Input DeepSpeed Checkpoint folder')
parser.add_argument('--output_folder',
type=str,
help='Output Megatron checkpoint folder')
parser.add_argument('--target_tp',
default=1,
type=int,
help='Target TP degree')
parser.add_argument('--target_pp',
default=1,
type=int,
help='Target PP degree')
parser.add_argument('--num_extract_workers',
default=4,
type=int,
help='How many parallel processes to extract zero shards')
parser.add_argument('--num_merge_workers',
default=2,
type=int,
                        help='How many parallel processes to merge tp slices (more memory intensive, use much fewer than --num_extract_workers)')
parser.add_argument(
'--for_release',
action='store_true',
help='Convert for release purpose, reset some (progress) counters.')
args = parser.parse_args()
print(f'args = {args}')
return args
def _convert_ds_transformer_state(sd_list):
new_sd = OrderedDict()
for i, sd in enumerate(sd_list):
for key, value in sd.items():
new_key = f'layers.{i}.{key}'
new_sd[new_key] = value
return new_sd
def _create_checkpoint_paths(base_folder, iteration, tp_degree, pp_degree):
path_list = []
iter_folder = f'iter_{iteration:07d}'
for i in range(0, tp_degree):
path_list.append([])
for j in range(0, pp_degree):
rank_folder = f'mp_rank_{i:02d}' if pp_degree == 1 else f'mp_rank_{i:02d}_{j:03d}'
ckpt_path = os.path.join(rank_folder, 'model_optim_rng.pt')
path_list[i].append(
os.path.join(base_folder, iter_folder, ckpt_path))
return path_list
def _create_megatron_dict():
language_model_dict = {EMBEDDING_KEY: {}, ENCODER_KEY: {}}
megatron_dict = {
MODEL_KEY: {
LANGUGAGE_MODEL_KEY: language_model_dict
},
CHECKPOINT_VERSION_KEY: CHECKPOINT_VERSION_VALUE
}
return megatron_dict
def _save_checkpoint(file_path, chkpt_sd):
dir, _ = os.path.split(file_path)
os.makedirs(dir, exist_ok=True)
torch.save(chkpt_sd, file_path)
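# Phase 1 of the universal-checkpoint conversion: for every (pp, tp, dp) rank, slice the flat
# optimizer state (fp32 weights, exp_avg, exp_avg_sq) back into per-parameter fragments and dump
# each fragment to <temp_dir>/<param_name>/<tp_index>/<state>.<dp_index> for later merging.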
def extract_zero_shards(dir, slice_shapes, ds_checkpoint, indices_3D):
pp_index, tp_index, dp_index = indices_3D
sd = ds_checkpoint.get_zero_checkpoint_state(
pp_index=pp_index,
tp_index=tp_index,
dp_index=dp_index)
#pprint(f"Processing {dp_index=} {pp_index=}, {tp_index=}")
optim_sd = sd["optimizer_state_dict"]
param_slice_mappings = optim_sd["param_slice_mappings"]
# dict
state_groups = optim_sd["base_optimizer_state"]["state"]
# list
fp32_groups = optim_sd["single_partition_of_fp32_groups"]
param_groups_cnt = len(state_groups)
for param_group_id in range(param_groups_cnt):
flat_state = dict(
exp_avg=state_groups[param_group_id]["exp_avg"],
exp_avg_sq=state_groups[param_group_id]["exp_avg_sq"],
fp32=fp32_groups[param_group_id],
)
for name,fragment_mapping in param_slice_mappings[param_group_id].items():
if "word_embeddings.weight" in name and pp_index > 0:
# Skip tied weights that are replicated in first and last pp stages
continue
#print(f"{param_group_id} {name} => {fragment_mapping.start}:{fragment_mapping.numel}")
for state_key in flat_state.keys():
dump_param_fragment(dir, tp_index, dp_index, state_key, flat_state[state_key], name, fragment_mapping.start, fragment_mapping.numel)
cnt = 0
def dump_param_fragment(dir, tp_index, dp_index, state_name, state_flat_tensor, param_name, offset, numel):
global cnt # temp hack
param_base_path = os.path.join(dir, param_name, str(tp_index))
os.makedirs(param_base_path, exist_ok=True)
cnt += 1
counter = f"{dp_index:0>2d}"
path = os.path.join(param_base_path, f"{state_name}.{counter}")
#print(f"{param_name}: {offset}: {numel} => {path}")
t = state_flat_tensor.narrow(0, offset, numel)
_save_checkpoint(path, t)
def _merge_zero_shards(param_base_path, state, tp_degree, slice_shape):
slices = []
for tp_index in range(tp_degree):
prefix_path = os.path.join(param_base_path, str(tp_index), f"{state}")
paths = sorted(list(glob.glob(f"{prefix_path}.0*")))
#print(paths)
shards = [torch.load(p) for p in paths]
slice = torch.cat(shards, dim=0).reshape(slice_shape)
slices.append(slice)
return slices
ORIGINAL_VOCAB_SIZE = 'original_vocab_size'
def _strip_vocab_padding(ds_checkpoint, padded_vocab_tensor):
checkpoint_info = ds_checkpoint.get_checkpoint_info()
padding_tensor = padded_vocab_tensor.narrow(0, checkpoint_info[ORIGINAL_VOCAB_SIZE], padded_vocab_tensor.shape[0]-checkpoint_info[ORIGINAL_VOCAB_SIZE])
#print(f'{padded_vocab_tensor[checkpoint_info[ORIGINAL_VOCAB_SIZE]-3:,:]=}')
return padded_vocab_tensor.narrow(0, 0, checkpoint_info[ORIGINAL_VOCAB_SIZE])
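# Parameters whose names match these patterns are replicated across TP ranks (layernorm
# weights/biases and some biases), so their TP slices are averaged rather than concatenated.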
WEIGHTS_TO_AVERAGE_PATTERNS = [
r"tied_modules.embed.word_embeddings.norm.weight",
r"tied_modules.embed.word_embeddings.norm.bias",
r"\d+.input_layernorm.weight",
r"\d+.input_layernorm.bias",
r"\d+.post_attention_layernorm.weight",
r"\d+.post_attention_layernorm.bias",
r"\d+.self_attention.dense.bias",
r"\d+.mlp.dense_4h_to_h.bias",
r"\d+.weight",
r"\d+.bias",
]
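# Row-parallel weights are split along their input dimension, so their TP slices are
# concatenated along dim 1; all other non-replicated weights are concatenated along dim 0.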
WEIGHTS_WITH_ROW_PARALLELISM_CONTAIN = [
"dense_4h_to_h.weight",
"self_attention.dense.weight",
]
def _get_vocab_divisibility_padding_tensor(ds_checkpoint, padded_vocab_tensor):
checkpoint_info = ds_checkpoint.get_checkpoint_info()
if padded_vocab_tensor.shape[0] > checkpoint_info[ORIGINAL_VOCAB_SIZE]:
return padded_vocab_tensor[-1]
else:
return torch.zeros(padded_vocab_tensor.shape[1])
def merge_tp_slices(ds_checkpoint, dir, slice_dir, tp_degree, name_and_shape):
name, shape = name_and_shape
slice_base_path = os.path.join(slice_dir, name)
param_base_path = os.path.join(dir, name)
for state in ("fp32", "exp_avg", "exp_avg_sq"):
slices = _merge_zero_shards(slice_base_path, state, tp_degree, shape)
final_path = os.path.join(param_base_path, f"{state}.pt")
#print(f"Expected shape: {shape}")
#print(f"Fragment sizes:", list(frag.shape for frag in slices))
ckpt_dict = {}
if any(re.match(pattern, name) for pattern in WEIGHTS_TO_AVERAGE_PATTERNS):
param = sum(slices) / len(slices)
else:
cat_dim = 1 if any(text in name for text in WEIGHTS_WITH_ROW_PARALLELISM_CONTAIN) else 0
#print(f"CAT DIM: {cat_dim}")
param = torch.cat(slices, dim=cat_dim)
ckpt_dict['cat_dim'] = cat_dim
if "word_embeddings.weight" in name:
#print(f"Before {param.shape=}")
# strip padding
#param = _strip_vocab_padding(ds_checkpoint, param)
ckpt_dict['vocab_divisibility_padding_tensor'] = _get_vocab_divisibility_padding_tensor(ds_checkpoint, param)
#print(f"After {param.shape=}")
#print(f"Final shape: {param.shape}")
ckpt_dict['param'] = param
_save_checkpoint(final_path, ckpt_dict)
def _get_chunks(l, n):
for i in range(0, len(l), n):
yield l[i:i + n]
def _do_parallel_work(do_work, work_chunks, num_workers):
pool = multiprocessing.Pool(num_workers)
for batch in tqdm.tqdm(work_chunks):
pool.map(do_work, batch)
pool.close()
pool.join()
def _extract_zero_shard_files(args, ds_checkpoint, slice_shapes, temp_dir):
_3d_range_list = list(itertools.product(range(ds_checkpoint.pp_degree), range(ds_checkpoint.tp_degree), range(ds_checkpoint.dp_degree)))
#pprint(_3d_range_list)
work_chunks = list(_get_chunks(_3d_range_list, args.num_extract_workers))
#pprint(work_chunks)
do_work = partial(extract_zero_shards, temp_dir, slice_shapes, ds_checkpoint)
_do_parallel_work(do_work, work_chunks, args.num_extract_workers)
def _merge_tp_slice_files(args, ds_checkpoint, slice_shapes, temp_dir):
work_chunks = list(_get_chunks(list(slice_shapes.items()), args.num_merge_workers))
#pprint(work_chunks)
zero_output_folder = os.path.join(args.output_folder, "zero")
do_work = partial(merge_tp_slices, ds_checkpoint, zero_output_folder, temp_dir, ds_checkpoint.tp_degree)
_do_parallel_work(do_work, work_chunks, args.num_merge_workers)
def main():
print(f'Convert DeepSpeed Checkpoint to Universal Checkpoint')
args = parse_arguments()
print(
f'Converting DeepSpeed checkpoint in {args.input_folder} to Universal checkpoint in {args.output_folder}'
)
ds_checkpoint = DeepSpeedCheckpoint(args.input_folder)#, 1, 2) # args.target_tp, args.target_pp)
iteration = ds_checkpoint.get_iteration()
#_create_latest_file(args.output_folder, iteration)
checkpoint_paths = _create_checkpoint_paths(args.output_folder, iteration,
ds_checkpoint.tp_degree,
ds_checkpoint.pp_degree)
slice_shapes = []
for mp_rank_file in ds_checkpoint.mp_rank_files:
mp_sd = torch.load(mp_rank_file, map_location=torch.device('cpu'))
slice_shapes += mp_sd["param_shapes"]
# fix back to normal flat dict, merge duplicates for tp>1
slice_shapes = dict((k,v) for d in slice_shapes for k,v in d.items() )
temp_dir = os.path.join(args.output_folder, 'tmp')
print('*** 1. Extracting ZeRO fragments')
_extract_zero_shard_files(args, ds_checkpoint, slice_shapes, temp_dir)
print('*** 2. Merging slices')
_merge_tp_slice_files(args, ds_checkpoint, slice_shapes, temp_dir)
shutil.rmtree(temp_dir, ignore_errors=True)
# Copy mp* files into output folder
for f in glob.glob(os.path.join(args.input_folder, 'mp*')):
shutil.copy2(f, args.output_folder)
# Update latest to output folder
checkpoint_root_folder, step_folder = os.path.split(args.output_folder)
latest_file = os.path.join(checkpoint_root_folder, 'latest_universal')
with open(latest_file, "w") as f:
f.write(step_folder)
print('*** Done!')
if __name__ == "__main__":
main()
import sys
import torch
import os
from collections import OrderedDict
from pathlib import Path
# insert megatron's root dir into sys.path
root_repo_path = str(Path(__file__).resolve().parents[2])
if root_repo_path not in sys.path:
sys.path.insert(0, root_repo_path)
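# Recursively walk a (possibly nested) checkpoint state dict, printing the dotted key path of
# every tensor together with its shape; non-tensor leaves are printed verbatim.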
def dump_data(datum, name_list=[]):
if type(datum) in (dict, OrderedDict):
for k, v in datum.items():
dump_data(v, name_list + [str(k)])
elif type(datum) in (list, tuple):
for v in datum:
dump_data(v, name_list)
elif torch.is_tensor(datum):
prefix = '.'.join(name_list)
print(f'[tensor] {prefix} = {datum.shape}')
else:
#pass
prefix = '.'.join(name_list)
print(f'[other] {prefix} = {datum}')
def main():
if len(sys.argv) < 2:
print(f'Usage: {sys.argv[0]} <checkpoint file>')
exit(1)
ckpt_file = sys.argv[1]
if not os.path.isfile(ckpt_file):
print(f'{ckpt_file} is not a valid file')
exit(1)
print(f'loading checkpoint file: {ckpt_file}')
sd = torch.load(ckpt_file, map_location=torch.device('cpu'))
dump_data(sd)
quit()
if __name__ == "__main__":
main()
import sys
from pathlib import Path
# insert megatron's root dir into sys.path
root_repo_path = str(Path(__file__).resolve().parents[2])
if root_repo_path not in sys.path:
sys.path.insert(0, root_repo_path)
import argparse
from deepspeed.checkpoint import DeepSpeedCheckpoint
def list_files(file_list, tag):
print(f'Listing files: {tag}')
for i, file in enumerate(file_list):
print(f'{i+1}: {file}')
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument('--folder',
default=None,
type=str,
help='DeepSpeed Checkpoint folder')
parser.add_argument('--target_tp',
default=None,
type=int,
help='Target TP degree')
parser.add_argument('--target_pp',
default=None,
type=int,
help='Target PP degree')
args = parser.parse_args()
print(f'args = {args}')
return args
def show_input_files(ds_checkpoint):
list_files(ds_checkpoint.file_list, 'all')
list_files(ds_checkpoint.zero_files, 'zero')
list_files(ds_checkpoint.layer_files, 'layer')
list_files(ds_checkpoint.mp_rank_files, 'mp rank')
def show_simple_state(ds_checkpoint):
print(f'layer keys = {ds_checkpoint.layer_keys}')
print(f'layer count = {ds_checkpoint.layer_count}')
print(
f'tp_degree_count = {ds_checkpoint.original_tp_degree} ------> {ds_checkpoint.tp_degree}'
)
print(
f'pp_degree_count = {ds_checkpoint.original_pp_degree} ------> {ds_checkpoint.pp_degree}'
)
print(f'dp_degree_count = {ds_checkpoint.dp_degree}')
ds_checkpoint.old_2d_map.print_data('old 2d map ==>')
ds_checkpoint.new_2d_map.print_data('new 2d map ==>')
def show_mappings(ds_checkpoint):
ds_checkpoint.show_pp_tranformer_map()
ds_checkpoint.show_transformer_file_map()
ds_checkpoint.show_tp_embedding_map()
ds_checkpoint.show_tp_final_norm_map()
ds_checkpoint.show_2d_mapping()
def show_state_summary(tag, sd):
summary = {k: v.shape for k, v in sd.items()}
print(f'{tag} = {summary}')
def show_embedding_states(ds_checkpoint):
for i in range(0, ds_checkpoint.tp_degree):
sd = ds_checkpoint.get_embedding_state(i)
show_state_summary(f'embedding[{i}]', sd)
def show_final_norm_states(ds_checkpoint):
for i in range(0, ds_checkpoint.tp_degree):
sd = ds_checkpoint.get_final_norm_state(i)
show_state_summary(f'final_norm[{i}]', sd)
def show_transformer_states(ds_checkpoint):
for i in range(0, ds_checkpoint.tp_degree):
for j in range(0, ds_checkpoint.pp_degree):
state_list = ds_checkpoint.get_transformer_state(tp_index=i,
pp_index=j)
print(f'tp_pp_rank[{i},{j}] = ')
for k, sd in enumerate(state_list):
show_state_summary(f' block[{k}]', sd)
print("")
def main():
print(f'Inspecting DeepSpeed Checkpoint')
args = parse_arguments()
ds_checkpoint = DeepSpeedCheckpoint(args.folder, args.target_tp,
args.target_pp)
ds_checkpoint.validate_files()
show_simple_state(ds_checkpoint)
show_input_files(ds_checkpoint)
show_simple_state(ds_checkpoint)
show_mappings(ds_checkpoint)
show_embedding_states(ds_checkpoint)
show_final_norm_states(ds_checkpoint)
show_transformer_states(ds_checkpoint)
checkpoint_args = ds_checkpoint.get_args()
print(f'checkpoint args = {checkpoint_args}')
if __name__ == "__main__":
main()
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
os.path.pardir)))
from megatron import print_rank_0
from megatron.indexer import IndexBuilder
from megatron.initialize import initialize_megatron
def main():
"""Create a BlockData data structure by running an IndexBuilder over an ICT Dataset
- Include all args needed for initial model specification
Other key args:
--block-data-path: path to write to
--ict-load or --realm-load: path to checkpoint with which to embed
--data-path and --titles-data-path: paths for dataset
--indexer-log-interval: reporting interval
--indexer-batch-size: size specific for indexer jobs
Check README.md for example script
"""
initialize_megatron(extra_args_provider=None,
args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'})
index_builder = IndexBuilder()
index_builder.build_and_save_index()
print_rank_0("Build and save indices: done!")
if __name__ == "__main__":
main()
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Sample Generate GPT"""
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
os.path.pardir)))
from megatron import get_args
from megatron import print_rank_0
from megatron import get_tokenizer
from megatron import mpu
from megatron.checkpointing import load_checkpoint
from megatron.initialize import initialize_megatron
from megatron.model.gpt_model import GPTModel
from megatron.training import get_model
from megatron.text_generation_utils import generate_and_write_samples_unconditional
from megatron.text_generation_utils import generate_samples_input_from_file
from megatron.text_generation_utils import generate_samples_interactive
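# Pin generation to specific ROCm devices; adjust or remove this for your own setup.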
os.environ["HIP_VISIBLE_DEVICES"] = "4,5,6,7"
def model_provider(pre_process=True, post_process=True):
"""Build the model."""
print_rank_0('building GPT model ...')
model = GPTModel(num_tokentypes=0, parallel_output=False,
pre_process=pre_process, post_process=post_process)
return model
def add_text_generate_args(parser):
"""Text generation arguments."""
group = parser.add_argument_group(title='text generation')
group.add_argument("--temperature", type=float, default=1.0,
help='Sampling temperature.')
group.add_argument("--greedy", action='store_true', default=False,
help='Use greedy sampling.')
group.add_argument("--top_p", type=float, default=0.0,
help='Top p sampling.')
group.add_argument("--top_k", type=int, default=0,
help='Top k sampling.')
group.add_argument("--out-seq-length", type=int, default=1024,
help='Size of the output generated text.')
group.add_argument("--sample-input-file", type=str, default=None,
help='Get input from file instead of interactive mode, '
'each line is an input.')
group.add_argument("--sample-output-file", type=str, default=None,
help='Output file got from --sample-input-file')
group.add_argument("--num-samples", type=int, default=0,
help='Number of samples to generate unconditionally, '
'defaults to 0 and interactive conditional sampling')
group.add_argument("--genfile", type=str,
help='Output file when generating unconditionally')
group.add_argument("--recompute", action='store_true',
help='During generation recompute all attention '
'instead of using previously computed keys/values.')
return parser
def main():
"""Main program."""
initialize_megatron(extra_args_provider=add_text_generate_args,
args_defaults={'tokenizer_type': 'GPT2BPETokenizer',
'no_load_rng': True,
'no_load_optim': True})
args = get_args()
if args.num_layers_per_virtual_pipeline_stage is not None:
print("Interleaved pipeline schedule is not yet supported for text generation.")
exit()
# Set up model and load checkpoint.
model = get_model(model_provider)
if args.load is not None:
_ = load_checkpoint(model, None, None)
assert len(model) == 1, "Above condition should have caught this"
model = model[0]
# Generate samples.
if args.num_samples == 0:
args.micro_batch_size = 1
        if args.sample_input_file is not None:
generate_samples_input_from_file(model)
else:
generate_samples_interactive(model)
else:
generate_and_write_samples_unconditional(model)
if __name__ == "__main__":
main()
import os
import os.path as osp
import pathlib
import subprocess
def recursively_lint_files():
"""Recursively lint all python files in chosen subdirectories of megatron-lm"""
try:
import autopep8
except ModuleNotFoundError:
print("Please first install autopep8 via `pip install autopep8`")
return
# get all python file paths from top level directory
file_dir = str(pathlib.Path(__file__).parent.absolute())
working_dir = osp.join(file_dir, os.pardir)
all_py_paths = set(os.path.join(working_dir, fname)
for fname in os.listdir(working_dir) if ".py" in fname)
# get all python file paths from chosen subdirectories
check_dirs = ['docker', 'megatron', 'openwebtext', 'scripts', 'tasks']
for sub_dir in check_dirs:
for path, _, fnames in os.walk(osp.join(working_dir, sub_dir)):
all_py_paths.update(set(osp.join(path, fname) for fname in fnames if ".py" in fname))
print("Linting the following: ")
for py_path in all_py_paths:
print(py_path)
        command = ['autopep8', '--max-line-length', '100', '--aggressive', '--in-place', py_path]
        subprocess.check_call(command)
if __name__ == "__main__":
recursively_lint_files()
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# This script fixes up BigScience log files by adjusting and fixing
# units of logged values to be seconds instead of milliseconds.
# It does the modification in-place (so make back ups!).
#
# Example:
#
# find . -name "*.out*" -print0 | xargs -0 -P 8 rescale-logs.py
#
# See also the discussion in
# https://github.com/bigscience-workshop/Megatron-DeepSpeed/issues/236.
#
# This script is derived from https://stackoverflow.com/a/60080531/9201239
# and https://gist.github.com/stas00/4cd1651d1c8f01196ea322c733bde46c.
import os
import re
import sys
LINE_START_RE = re.compile(' ?iteration')
ELAPSED_TIME_RE = re.compile(r'elapsed time per iteration \(ms\): ([0-9.]+)')
SAMPLES_PER_SEC_RE = re.compile('samples per second: ([0-9.]+)')
def rescale_logs(log_file_path):
new_log_file_path = log_file_path + '.new'
with open(log_file_path, 'r') as log_file:
with open(new_log_file_path, 'w') as new_log_file:
for line in log_file.readlines():
if LINE_START_RE.match(line):
match = ELAPSED_TIME_RE.search(line)
if match:
# Logged time is in ms, so convert the match.
time_in_sec = float(match[1]) / 1000
replacement = (
f'elapsed time per iteration (s): '
f'{time_in_sec:.2f}'
)
# We only need to replace once per line.
line = ELAPSED_TIME_RE.sub(replacement, line, count=1)
match = SAMPLES_PER_SEC_RE.search(line)
if match:
                        # The logged samples/sec was scaled down by the
                        # same factor of 1000, so scale it back up.
                        samples_per_sec = float(match[1]) * 1000
                        # As the values are already logged up to 3
                        # numbers after the decimal point and we scale
                        # by exactly that amount, we log them without
                        # decimal point here in order to not seem more
                        # exact than we are.
                        replacement = f'samples per second: {samples_per_sec:.0f}'
# We only need to replace once per line.
line = SAMPLES_PER_SEC_RE.sub(
replacement,
line,
count=1,
)
new_log_file.write(line)
os.rename(new_log_file_path, log_file_path)
if __name__ == '__main__':
if len(sys.argv) < 2:
print(f'{sys.argv[0]} <input file>',
file=sys.stderr)
sys.exit(1)
input_file = sys.argv[1]
rescale_logs(input_file)
print('Done')
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# This script renames event names in TensorBoard log files.
# It does the renaming in-place (so make back ups!).
#
# Example:
#
# find . -name "*.tfevents*" -exec tb-rename-events.py {} "iteration-time" "iteration-time/iteration-time" \;
#
# More than one old tag can be remapped to one new tag – use ";" as a separator:
#
# tb-rename-events.py events.out.tfevents.1 "training loss;validation loss" "loss"
#
# This script is derived from https://stackoverflow.com/a/60080531/9201239
# and https://gist.github.com/stas00/4cd1651d1c8f01196ea322c733bde46c.
import os
import sys
# Use this if you want to avoid using the GPU
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
import tensorflow as tf
from tensorflow.core.util.event_pb2 import Event
def rename_events(input_file, old_tags, new_tag):
new_file = input_file + '.new'
# Make a record writer
with tf.io.TFRecordWriter(new_file) as writer:
# Iterate event records
for rec in tf.data.TFRecordDataset([input_file]):
# Read event
ev = Event()
ev.MergeFromString(rec.numpy())
# Check if it is a summary
if ev.summary:
# Iterate summary values
for v in ev.summary.value:
# Check if the tag should be renamed
if v.tag in old_tags:
# Rename with new tag name
v.tag = new_tag
writer.write(ev.SerializeToString())
os.rename(new_file, input_file)
if __name__ == '__main__':
if len(sys.argv) != 4:
print(f'{sys.argv[0]} <input file> <old tags> <new tag>',
file=sys.stderr)
sys.exit(1)
input_file, old_tags, new_tag = sys.argv[1:]
old_tags = old_tags.split(';')
rename_events(input_file, old_tags, new_tag)
print('Done')