convert_megatron_gpt2_checkpoint.py

####################################################################################################

# Copyright (c) 2021-, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

####################################################################################################

#
# Note: If when running this conversion script you're getting an exception:
#     ModuleNotFoundError: No module named 'megatron.model.enums'
# you need to tell python where to find the clone of Megatron-LM, e.g.:
#
# cd /tmp
# git clone https://github.com/NVIDIA/Megatron-LM
# PYTHONPATH=/tmp/Megatron-LM python src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py ...
#
# if you already have it cloned elsewhere, simply adjust the path to the existing path
#
# If the training was done using a Megatron-LM fork, e.g.,
# https://github.com/microsoft/Megatron-DeepSpeed/ then chances are that you need to have that one
# in your path, i.e., /path/to/Megatron-DeepSpeed/
#

import argparse
import os
import re
import zipfile

import torch

from transformers import AutoTokenizer, GPT2Config
import pdb

####################################################################################################


def recursive_print(name, val, spaces=0):
    # Format the message.
    if name is None:
        msg = None
    else:
        fmt = "." * max(0, spaces - 2) + "# {:" + str(50 - spaces) + "s}"
        msg = fmt.format(name)

    # Print and recurse (if needed).
    if isinstance(val, dict):
        if msg is not None:
            print(msg)
        for k in val.keys():
            recursive_print(k, val[k], spaces + 2)
    elif isinstance(val, torch.Tensor):
        print(msg, ":", val.size())
    else:
        print(msg, ":", val)


def fix_query_key_value_ordering(param, checkpoint_version, num_splits, num_heads, hidden_size):
    # Permutes layout of param tensor to [num_splits * num_heads * hidden_size, :]
    # for compatibility with later versions of NVIDIA Megatron-LM.
    # The inverse operation is performed inside Megatron-LM to read checkpoints:
    # https://github.com/NVIDIA/Megatron-LM/blob/v2.4/megatron/checkpointing.py#L209
    # If param is the weight tensor of the self-attention block, the returned tensor
    # will have to be transposed one more time to be read by HuggingFace GPT2.
    input_shape = param.size()
    if checkpoint_version == 1.0:
        # version 1.0 stores [num_heads * hidden_size * num_splits, :]
        saved_shape = (num_heads, hidden_size, num_splits) + input_shape[1:]
        param = param.view(*saved_shape)
        param = param.transpose(0, 2)
        param = param.transpose(1, 2).contiguous()
    elif checkpoint_version >= 2.0:
        # other versions store [num_heads * num_splits * hidden_size, :]
        saved_shape = (num_heads, num_splits, hidden_size) + input_shape[1:]
        param = param.view(*saved_shape)
        param = param.transpose(0, 1).contiguous()
    param = param.view(*input_shape)
    return param


####################################################################################################


def convert_megatron_checkpoint(args, input_state_dict, config, origin_tp_degree=1):
    # The converted output model.
    output_state_dict = {}

    # old versions did not store training args
    ds_args = input_state_dict.get("args", None)
    if ds_args is not None:
        # do not make the user write a config file when the exact dimensions/sizes are already in the checkpoint
        # from pprint import pprint
        # pprint(vars(ds_args))

        config.vocab_size = ds_args.padded_vocab_size
        config.n_positions = ds_args.max_position_embeddings
        config.n_embd = ds_args.hidden_size
        config.n_layer = ds_args.num_layers
        config.n_head = ds_args.num_attention_heads
        config.n_inner = ds_args.ffn_hidden_size
        # pprint(config)

    # The number of heads.
    heads = config.n_head
    # The hidden_size per head.
    hidden_size_per_head = config.n_embd // config.n_head
    # Megatron-LM checkpoint version
    if "checkpoint_version" in input_state_dict.keys():
        checkpoint_version = input_state_dict["checkpoint_version"]
    else:
        checkpoint_version = 0.0

    # The model.
    model = input_state_dict["model"] if "model" in input_state_dict else input_state_dict["module"]
    for key in model.keys():
        print(f">> {key} in model: {model[key].keys()}")
        for sub_key in model[key].keys():
            print(f"\t>> {sub_key} in {key} in model: {model[key][sub_key].keys()}")
    # The language model.
    lm = model["language_model"]
    # The embeddings.
    embeddings = lm["embedding"]

    # The word embeddings.
    word_embeddings = embeddings["word_embeddings"]["weight"]
    # Truncate the embedding table to vocab_size rows.
    word_embeddings = word_embeddings[: config.vocab_size, :]
    #output_state_dict["transformer.wte.weight"] = word_embeddings
    output_state_dict["model.embed_tokens.weight"] = word_embeddings

    # for LLAMA2
    lm_head = lm['output_layer']['weight'] if 'output_layer' in lm else word_embeddings

    # The position embeddings.
    #pos_embeddings = embeddings["position_embeddings"]["weight"]
    # Read the causal mask dimension (seqlen). [max_sequence_length, hidden_size]
    #n_positions = pos_embeddings.size(0)
    n_positions = config.n_positions
    if n_positions != config.n_positions:
        raise ValueError(
            f"pos_embeddings.max_sequence_length={n_positions} and config.n_positions={config.n_positions} don't match"
        )
    # Store the position embeddings.
    #output_state_dict["transformer.wpe.weight"] = pos_embeddings

    # The transformer.
    transformer = lm["transformer"] if "transformer" in lm.keys() else lm["encoder"]

    # The regex to extract layer names.
    layer_re = re.compile(r"layers\.(\d+)\.([a-z0-9_.]+)\.([a-z]+)")

    # The simple map of names for "automated" rules.
    megatron_to_transformers = {
        #"attention.dense": ".attn.c_proj.",
        #"self_attention.dense": ".attn.c_proj.",
        "attention.dense": ".self_attn.o_proj.",
        "self_attention.dense": ".self_attn.o_proj.",
        "mlp.dense_h_to_4h": ".mlp.c_fc.",
        #"mlp.dense_4h_to_h": ".mlp.c_proj.",
        "mlp.dense_4h_to_h": ".mlp.down_proj.",
    }

    # Extract the layers.
    for key, val in transformer.items():
        # Match the name.
        m = layer_re.match(key)

        # Stop if that's not a layer
        if m is None:
            break

        # The index of the layer.
        layer_idx = int(m.group(1))
        # The name of the operation.
        op_name = m.group(2)
        # Is it a weight or a bias?
        weight_or_bias = m.group(3)

        # The name of the layer.
        layer_name = f"transformer.h.{layer_idx}"
        layer_name = f"model.layers.{layer_idx}"

        # For layernorm(s), simply store the layer norm.
        if op_name.endswith("layernorm"):
            #ln_name = "ln_1" if op_name.startswith("input") else "ln_2"
            ln_name = op_name
            output_state_dict[layer_name + "." + ln_name + "." + weight_or_bias] = val

        # Transpose the QKV matrix.
        elif (
            op_name == "attention.query_key_value" or op_name == "self_attention.query_key_value"
        ) and weight_or_bias == "weight":
            # Insert a tensor of 1x1xDxD bias.
            #causal_mask = torch.tril(torch.ones((n_positions, n_positions), dtype=torch.float16)).view(
            #    1, 1, n_positions, n_positions
            #)
            #output_state_dict[layer_name + ".attn.bias"] = causal_mask

            # Insert a "dummy" tensor for masked_bias.
            #masked_bias = torch.tensor(-1e4, dtype=torch.float16)
            #output_state_dict[layer_name + ".attn.masked_bias"] = masked_bias

            out_val = fix_query_key_value_ordering(val, checkpoint_version, 3, heads, hidden_size_per_head)
            # Megatron stores (3*D) x D but transformers-GPT2 expects D x 3*D.
            #out_val = out_val.transpose(0, 1).contiguous()
            out_val = out_val.contiguous()
            # Store.
            #output_state_dict[layer_name + ".attn.c_attn.weight"] = out_val
            output_state_dict[layer_name + ".self_attn.q_proj.weight"] = out_val[:config.n_embd, :]
            output_state_dict[layer_name + ".self_attn.k_proj.weight"] = out_val[config.n_embd:config.n_embd * 2, :]
            output_state_dict[layer_name + ".self_attn.v_proj.weight"] = out_val[config.n_embd * 2 :, :]
        elif (
            op_name == "self_attention.query"
        ) and weight_or_bias == "weight":
            out_val = fix_query_key_value_ordering(val, checkpoint_version, 1, heads, hidden_size_per_head)
            #out_val = out_val.transpose(0, 1).contiguous()
            out_val = out_val.contiguous()
            output_state_dict[layer_name + ".self_attn.q_proj.weight"] = out_val
        elif (
            op_name == "self_attention.key_value"
        )   and weight_or_bias == "weight":
            #print(f">> key_value origin size: {val.size()}")
            size_per_weight = val.size(0) // 2
            #please set the NUM_KV_HEADS used to replace number "4" in fix_query_key_value_ordering function
            out_val = fix_query_key_value_ordering(val, checkpoint_version, 2, 4, hidden_size_per_head)
            #print(f">> key_value output size: {out_val.size()}")
            out_val = out_val.contiguous()
            output_state_dict[layer_name + ".self_attn.k_proj.weight"] = out_val[:size_per_weight, :]
            output_state_dict[layer_name + ".self_attn.v_proj.weight"] = out_val[size_per_weight:, :]
        # Transpose the bias.
        elif (
            op_name == "attention.query_key_value" or op_name == "self_attention.query_key_value"
        ) and weight_or_bias == "bias":
            out_val = fix_query_key_value_ordering(val, checkpoint_version, 3, heads, hidden_size_per_head)
            # Store. No change of shape.
            output_state_dict[layer_name + ".attn.c_attn.bias"] = out_val
        elif op_name == "mlp.dense_h_to_4h":
            # this 2 lines for TP=1 (swiglu)
            if origin_tp_degree == 1:
                output_state_dict[layer_name + ".mlp.gate_proj.weight"] = val[:config.n_inner, :]
                output_state_dict[layer_name + ".mlp.up_proj.weight"] = val[config.n_inner:, :]
            elif origin_tp_degree == 2:
            # this 2 lines for TP=2 (swiglu)
                output_state_dict[layer_name + ".mlp.gate_proj.weight"] = torch.cat([val[:config.n_inner//2, :], val[config.n_inner:config.n_inner + config.n_inner // 2, :]])
                output_state_dict[layer_name + ".mlp.up_proj.weight"] = torch.cat([val[config.n_inner//2:config.n_inner, :], val[config.n_inner + config.n_inner // 2:, :]])
            elif origin_tp_degree == 4:
                output_state_dict[layer_name + ".mlp.gate_proj.weight"] = torch.cat([val[:config.n_inner//4, :],val[config.n_inner//2:config.n_inner//2+config.n_inner//4, :],val[config.n_inner:config.n_inner+config.n_inner//4, :],val[config.n_inner+config.n_inner//2:config.n_inner+config.n_inner//4*3, :] ])
                output_state_dict[layer_name + ".mlp.up_proj.weight"] = torch.cat([val[config.n_inner//4:config.n_inner//2,:], val[config.n_inner//2+config.n_inner//4:config.n_inner,:],val[config.n_inner+config.n_inner//4:config.n_inner+config.n_inner//2,:],val[config.n_inner+config.n_inner//4*3:config.n_inner*2, :] ])
            else:
                raise ValueError("Not Implemented Yet for TP /= 1 && 2 && 4.")
        # Transpose the weights.
        elif weight_or_bias == "weight":
            out_name = megatron_to_transformers[op_name]
            output_state_dict[layer_name + out_name + "weight"] = val#.transpose(0, 1)

        # Copy the bias.
        elif weight_or_bias == "bias":
            out_name = megatron_to_transformers[op_name]
            output_state_dict[layer_name + out_name + "bias"] = val

    # DEBUG.
    assert config.n_layer == layer_idx + 1

    # The final layernorm.
    #output_state_dict["transformer.ln_f.weight"] = transformer["final_layernorm.weight"]
    #pdb.set_trace()
    output_state_dict["model.norm.weight"] = transformer["final_layernorm.weight"]
    #output_state_dict["transformer.ln_f.bias"] = transformer["final_layernorm.bias"]

    # For LM head, transformers' wants the matrix to weight embeddings.
    output_state_dict["lm_head.weight"] = lm_head

    # transform the key for LLAMA2
    transform_dict = {
        "transformer.h": "model.layers",

    }
    # It should be done!
    return output_state_dict


####################################################################################################


def main():
    # Create the argument parser.
    parser = argparse.ArgumentParser()
    parser.add_argument("--print-checkpoint-structure", action="store_true")
    parser.add_argument(
        "path_to_checkpoint",
        type=str,
        help="Path to the checkpoint file (.zip archive or direct .pt file)",
    )
    parser.add_argument(
        "--config_file",
        default="",
        type=str,
        help="An optional config json file describing the pre-trained model.",
    )
    args = parser.parse_args()

    # Extract the basename.
    basename = os.path.dirname(args.path_to_checkpoint)

    # Load the model.
    # the .zip is very optional, let's keep it for backward compatibility
    print(f"Extracting PyTorch state dictionary from {args.path_to_checkpoint}")
    if args.path_to_checkpoint.endswith(".zip"):
        with zipfile.ZipFile(args.path_to_checkpoint, "r") as checkpoint:
            with checkpoint.open("release/mp_rank_00/model_optim_rng.pt") as pytorch_dict:
                input_state_dict = torch.load(pytorch_dict, map_location="cpu")
    else:
        input_state_dict = torch.load(args.path_to_checkpoint, map_location="cpu")
    print(f">> keys: {input_state_dict.keys()}")
    ds_args = input_state_dict.get("args", None)
    #print(f">> ds_args: {ds_args}")
    # Read the config, or default to the model released by NVIDIA.
    if args.config_file == "":
        if ds_args is not None:
            if ds_args.bias_gelu_fusion:
                activation_function = "gelu_fast"
            elif ds_args.openai_gelu:
                activation_function = "gelu_new"
            else:
                activation_function = "gelu"
        else:
            # in the very early days this used to be "gelu_new"
            activation_function = "gelu_new"

        # Spell out all parameters in case the defaults change.
        config = GPT2Config(
            vocab_size=50257,
            n_positions=1024,
            n_embd=1024,
            n_layer=24,
            n_head=16,
            n_inner=4096,
            activation_function=activation_function,
            resid_pdrop=0.1,
            embd_pdrop=0.1,
            attn_pdrop=0.1,
            layer_norm_epsilon=1e-5,
            initializer_range=0.02,
            summary_type="cls_index",
            summary_use_proj=True,
            summary_activation=None,
            summary_proj_to_labels=True,
            summary_first_dropout=0.1,
            scale_attn_weights=True,
            use_cache=True,
            bos_token_id=50256,
            eos_token_id=50256,
        )
    else:
        config = GPT2Config.from_json_file(args.config_file)

    config.architectures = ["GPT2LMHeadModel"]

    # Convert.
    print("Converting")
    output_state_dict = convert_megatron_checkpoint(args, input_state_dict, config)

    # Print the structure of converted state dict.
    if args.print_checkpoint_structure:
        recursive_print(None, output_state_dict)

    tokenizer_model_name = ""
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_model_name)
    tokenizer_class = type(tokenizer).__name__
    config.tokenizer_class = tokenizer_class

    # Store the config to file.
    print("Saving config")
    #config.save_pretrained(basename)
    print(f">> here is the local converter")
    # Save tokenizer based on args
    print(f"Adding {tokenizer_class} tokenizer files")
    #tokenizer.save_pretrained(basename)

    # Store the state_dict to file.
    output_checkpoint_file = os.path.join(basename, "pytorch_model.bin")
    print(f'Saving checkpoint to "{output_checkpoint_file}"')
    torch.save(output_state_dict, output_checkpoint_file)


####################################################################################################

if __name__ == "__main__":
    main()

####################################################################################################