ModelZoo / megatron-deepspeed-llama_pytorch
Commit 6709039a, authored Feb 04, 2024 by liangjing
Commit message: update convert_scripts
Parent: cf8450ec
Showing 8 changed files with 921 additions and 0 deletions (+921 -0)
tools/convert_scripts/convert_scripts/Megatron-Deepspeed/__pycache__/convert_megatron_gpt2_checkpoint.cpython-38.pyc  (+0 -0)
tools/convert_scripts/convert_scripts/Megatron-Deepspeed/convert_megatron_gpt2_checkpoint.py  (+402 -0)
tools/convert_scripts/convert_scripts/Megatron-Deepspeed/tools/covert_checkpoint/__pycache__/deepspeed_checkpoint.cpython-38.pyc  (+0 -0)
tools/convert_scripts/convert_scripts/Megatron-Deepspeed/tools/covert_checkpoint/__pycache__/deepspeed_to_megatron.cpython-38.pyc  (+0 -0)
tools/convert_scripts/convert_scripts/Megatron-Deepspeed/tools/covert_checkpoint/deepspeed_checkpoint.py  (+248 -0)
tools/convert_scripts/convert_scripts/Megatron-Deepspeed/tools/covert_checkpoint/deepspeed_to_megatron.py  (+178 -0)
tools/convert_scripts/convert_scripts/Megatron-Deepspeed/tools/covert_checkpoint/deepspeed_to_transformers.py  (+81 -0)
tools/convert_scripts/convert_scripts/convert.sh  (+12 -0)
tools/convert_scripts/convert_scripts/Megatron-Deepspeed/__pycache__/convert_megatron_gpt2_checkpoint.cpython-38.pyc (new file, mode 100644)
File added
tools/convert_scripts/convert_scripts/Megatron-Deepspeed/convert_megatron_gpt2_checkpoint.py (new file, mode 100644)
####################################################################################################
# Copyright (c) 2021-, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
####################################################################################################
#
# Note: If when running this conversion script you're getting an exception:
# ModuleNotFoundError: No module named 'megatron.model.enums'
# you need to tell python where to find the clone of Megatron-LM, e.g.:
#
# cd /tmp
# git clone https://github.com/NVIDIA/Megatron-LM
# PYTHONPATH=/tmp/Megatron-LM python src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py ...
#
# if you already have it cloned elsewhere, simply adjust the path to the existing path
#
# If the training was done using a Megatron-LM fork, e.g.,
# https://github.com/microsoft/Megatron-DeepSpeed/ then chances are that you need to have that one
# in your path, i.e., /path/to/Megatron-DeepSpeed/
#

import argparse
import os
import re
import zipfile

import torch

from transformers import AutoTokenizer, GPT2Config
import pdb


####################################################################################################


def recursive_print(name, val, spaces=0):
    # Format the message.
    if name is None:
        msg = None
    else:
        fmt = "." * max(0, spaces - 2) + "# {:" + str(50 - spaces) + "s}"
        msg = fmt.format(name)

    # Print and recurse (if needed).
    if isinstance(val, dict):
        if msg is not None:
            print(msg)
        for k in val.keys():
            recursive_print(k, val[k], spaces + 2)
    elif isinstance(val, torch.Tensor):
        print(msg, ":", val.size())
    else:
        print(msg, ":", val)


def fix_query_key_value_ordering(param, checkpoint_version, num_splits, num_heads, hidden_size):
    # Permutes layout of param tensor to [num_splits * num_heads * hidden_size, :]
    # for compatibility with later versions of NVIDIA Megatron-LM.
    # The inverse operation is performed inside Megatron-LM to read checkpoints:
    # https://github.com/NVIDIA/Megatron-LM/blob/v2.4/megatron/checkpointing.py#L209
    # If param is the weight tensor of the self-attention block, the returned tensor
    # will have to be transposed one more time to be read by HuggingFace GPT2.
    input_shape = param.size()
    if checkpoint_version == 1.0:
        # version 1.0 stores [num_heads * hidden_size * num_splits, :]
        saved_shape = (num_heads, hidden_size, num_splits) + input_shape[1:]
        param = param.view(*saved_shape)
        param = param.transpose(0, 2)
        param = param.transpose(1, 2).contiguous()
    elif checkpoint_version >= 2.0:
        # other versions store [num_heads * num_splits * hidden_size, :]
        saved_shape = (num_heads, num_splits, hidden_size) + input_shape[1:]
        param = param.view(*saved_shape)
        param = param.transpose(0, 1).contiguous()
    param = param.view(*input_shape)
    return param


####################################################################################################


def convert_megatron_checkpoint(args, input_state_dict, config, origin_tp_degree=1):
    # The converted output model.
    output_state_dict = {}

    # old versions did not store training args
    ds_args = input_state_dict.get("args", None)
    if ds_args is not None:
        # do not make the user write a config file when the exact dimensions/sizes are already in the checkpoint
        # from pprint import pprint
        # pprint(vars(ds_args))
        config.vocab_size = ds_args.padded_vocab_size
        config.n_positions = ds_args.max_position_embeddings
        config.n_embd = ds_args.hidden_size
        config.n_layer = ds_args.num_layers
        config.n_head = ds_args.num_attention_heads
        config.n_inner = ds_args.ffn_hidden_size
        # pprint(config)

    # The number of heads.
    heads = config.n_head
    # The hidden_size per head.
    hidden_size_per_head = config.n_embd // config.n_head
    # Megatron-LM checkpoint version
    if "checkpoint_version" in input_state_dict.keys():
        checkpoint_version = input_state_dict["checkpoint_version"]
    else:
        checkpoint_version = 0.0

    # The model.
    model = input_state_dict["model"] if "model" in input_state_dict else input_state_dict["module"]
    for key in model.keys():
        print(f">> {key} in model: {model[key].keys()}")
        for sub_key in model[key].keys():
            print(f"\t>> {sub_key} in {key} in model: {model[key][sub_key].keys()}")

    # The language model.
    lm = model["language_model"]
    # The embeddings.
    embeddings = lm["embedding"]

    # The word embeddings.
    word_embeddings = embeddings["word_embeddings"]["weight"]
    # Truncate the embedding table to vocab_size rows.
    word_embeddings = word_embeddings[: config.vocab_size, :]
    #output_state_dict["transformer.wte.weight"] = word_embeddings
    output_state_dict["model.embed_tokens.weight"] = word_embeddings
    # for LLAMA2
    lm_head = lm['output_layer']['weight'] if 'output_layer' in lm else word_embeddings

    # The position embeddings.
    #pos_embeddings = embeddings["position_embeddings"]["weight"]
    # Read the causal mask dimension (seqlen). [max_sequence_length, hidden_size]
    #n_positions = pos_embeddings.size(0)
    n_positions = config.n_positions
    if n_positions != config.n_positions:
        raise ValueError(
            f"pos_embeddings.max_sequence_length={n_positions} and config.n_positions={config.n_positions} don't match"
        )
    # Store the position embeddings.
    #output_state_dict["transformer.wpe.weight"] = pos_embeddings

    # The transformer.
    transformer = lm["transformer"] if "transformer" in lm.keys() else lm["encoder"]

    # The regex to extract layer names.
    layer_re = re.compile(r"layers\.(\d+)\.([a-z0-9_.]+)\.([a-z]+)")

    # The simple map of names for "automated" rules.
    megatron_to_transformers = {
        #"attention.dense": ".attn.c_proj.",
        #"self_attention.dense": ".attn.c_proj.",
        "attention.dense": ".self_attn.o_proj.",
        "self_attention.dense": ".self_attn.o_proj.",
        "mlp.dense_h_to_4h": ".mlp.c_fc.",
        #"mlp.dense_4h_to_h": ".mlp.c_proj.",
        "mlp.dense_4h_to_h": ".mlp.down_proj.",
    }

    # Extract the layers.
    for key, val in transformer.items():
        # Match the name.
        m = layer_re.match(key)

        # Stop if that's not a layer
        if m is None:
            break

        # The index of the layer.
        layer_idx = int(m.group(1))
        # The name of the operation.
        op_name = m.group(2)
        # Is it a weight or a bias?
        weight_or_bias = m.group(3)

        # The name of the layer.
        layer_name = f"transformer.h.{layer_idx}"
        layer_name = f"model.layers.{layer_idx}"

        # For layernorm(s), simply store the layer norm.
        if op_name.endswith("layernorm"):
            #ln_name = "ln_1" if op_name.startswith("input") else "ln_2"
            ln_name = op_name
            output_state_dict[layer_name + "." + ln_name + "." + weight_or_bias] = val

        # Transpose the QKV matrix.
        elif (
            op_name == "attention.query_key_value" or op_name == "self_attention.query_key_value"
        ) and weight_or_bias == "weight":
            # Insert a tensor of 1x1xDxD bias.
            #causal_mask = torch.tril(torch.ones((n_positions, n_positions), dtype=torch.float16)).view(
            #    1, 1, n_positions, n_positions
            #)
            #output_state_dict[layer_name + ".attn.bias"] = causal_mask

            # Insert a "dummy" tensor for masked_bias.
            #masked_bias = torch.tensor(-1e4, dtype=torch.float16)
            #output_state_dict[layer_name + ".attn.masked_bias"] = masked_bias

            out_val = fix_query_key_value_ordering(val, checkpoint_version, 3, heads, hidden_size_per_head)
            # Megatron stores (3*D) x D but transformers-GPT2 expects D x 3*D.
            #out_val = out_val.transpose(0, 1).contiguous()
            out_val = out_val.contiguous()
            # Store.
            #output_state_dict[layer_name + ".attn.c_attn.weight"] = out_val
            output_state_dict[layer_name + ".self_attn.q_proj.weight"] = out_val[: config.n_embd, :]
            output_state_dict[layer_name + ".self_attn.k_proj.weight"] = out_val[config.n_embd : config.n_embd * 2, :]
            output_state_dict[layer_name + ".self_attn.v_proj.weight"] = out_val[config.n_embd * 2 :, :]

        elif (op_name == "self_attention.query") and weight_or_bias == "weight":
            out_val = fix_query_key_value_ordering(val, checkpoint_version, 1, heads, hidden_size_per_head)
            #out_val = out_val.transpose(0, 1).contiguous()
            out_val = out_val.contiguous()
            output_state_dict[layer_name + ".self_attn.q_proj.weight"] = out_val

        elif (op_name == "self_attention.key_value") and weight_or_bias == "weight":
            #print(f">> key_value origin size: {val.size()}")
            size_per_weight = val.size(0) // 2
            #please set the NUM_KV_HEADS used to replace number "4" in fix_query_key_value_ordering function
            out_val = fix_query_key_value_ordering(val, checkpoint_version, 2, 4, hidden_size_per_head)
            #print(f">> key_value output size: {out_val.size()}")
            out_val = out_val.contiguous()
            output_state_dict[layer_name + ".self_attn.k_proj.weight"] = out_val[: size_per_weight, :]
            output_state_dict[layer_name + ".self_attn.v_proj.weight"] = out_val[size_per_weight :, :]

        # Transpose the bias.
        elif (
            op_name == "attention.query_key_value" or op_name == "self_attention.query_key_value"
        ) and weight_or_bias == "bias":
            out_val = fix_query_key_value_ordering(val, checkpoint_version, 3, heads, hidden_size_per_head)
            # Store. No change of shape.
            output_state_dict[layer_name + ".attn.c_attn.bias"] = out_val

        elif op_name == "mlp.dense_h_to_4h":
            # this 2 lines for TP=1 (swiglu)
            if origin_tp_degree == 1:
                output_state_dict[layer_name + ".mlp.gate_proj.weight"] = val[: config.n_inner, :]
                output_state_dict[layer_name + ".mlp.up_proj.weight"] = val[config.n_inner :, :]
            elif origin_tp_degree == 2:
                # this 2 lines for TP=2 (swiglu)
                output_state_dict[layer_name + ".mlp.gate_proj.weight"] = torch.cat(
                    [val[: config.n_inner // 2, :], val[config.n_inner : config.n_inner + config.n_inner // 2, :]]
                )
                output_state_dict[layer_name + ".mlp.up_proj.weight"] = torch.cat(
                    [val[config.n_inner // 2 : config.n_inner, :], val[config.n_inner + config.n_inner // 2 :, :]]
                )
            elif origin_tp_degree == 4:
                output_state_dict[layer_name + ".mlp.gate_proj.weight"] = torch.cat([
                    val[: config.n_inner // 4, :],
                    val[config.n_inner // 2 : config.n_inner // 2 + config.n_inner // 4, :],
                    val[config.n_inner : config.n_inner + config.n_inner // 4, :],
                    val[config.n_inner + config.n_inner // 2 : config.n_inner + config.n_inner // 4 * 3, :]
                ])
                output_state_dict[layer_name + ".mlp.up_proj.weight"] = torch.cat([
                    val[config.n_inner // 4 : config.n_inner // 2, :],
                    val[config.n_inner // 2 + config.n_inner // 4 : config.n_inner, :],
                    val[config.n_inner + config.n_inner // 4 : config.n_inner + config.n_inner // 2, :],
                    val[config.n_inner + config.n_inner // 4 * 3 : config.n_inner * 2, :]
                ])
            else:
                raise ValueError("Not Implemented Yet for TP /= 1 && 2 && 4.")

        # Transpose the weights.
        elif weight_or_bias == "weight":
            out_name = megatron_to_transformers[op_name]
            output_state_dict[layer_name + out_name + "weight"] = val  #.transpose(0, 1)

        # Copy the bias.
        elif weight_or_bias == "bias":
            out_name = megatron_to_transformers[op_name]
            output_state_dict[layer_name + out_name + "bias"] = val

    # DEBUG.
    assert config.n_layer == layer_idx + 1

    # The final layernorm.
    #output_state_dict["transformer.ln_f.weight"] = transformer["final_layernorm.weight"]
    #pdb.set_trace()
    output_state_dict["model.norm.weight"] = transformer["final_layernorm.weight"]
    #output_state_dict["transformer.ln_f.bias"] = transformer["final_layernorm.bias"]

    # For LM head, transformers' wants the matrix to weight embeddings.
    output_state_dict["lm_head.weight"] = lm_head
    # transform the key for LLAMA2
    transform_dict = {
        "transformer.h": "model.layers",
    }

    # It should be done!
    return output_state_dict


####################################################################################################


def main():
    # Create the argument parser.
    parser = argparse.ArgumentParser()
    parser.add_argument("--print-checkpoint-structure", action="store_true")
    parser.add_argument(
        "path_to_checkpoint",
        type=str,
        help="Path to the checkpoint file (.zip archive or direct .pt file)",
    )
    parser.add_argument(
        "--config_file",
        default="",
        type=str,
        help="An optional config json file describing the pre-trained model.",
    )
    args = parser.parse_args()

    # Extract the basename.
    basename = os.path.dirname(args.path_to_checkpoint)

    # Load the model.
    # the .zip is very optional, let's keep it for backward compatibility
    print(f"Extracting PyTorch state dictionary from {args.path_to_checkpoint}")
    if args.path_to_checkpoint.endswith(".zip"):
        with zipfile.ZipFile(args.path_to_checkpoint, "r") as checkpoint:
            with checkpoint.open("release/mp_rank_00/model_optim_rng.pt") as pytorch_dict:
                input_state_dict = torch.load(pytorch_dict, map_location="cpu")
    else:
        input_state_dict = torch.load(args.path_to_checkpoint, map_location="cpu")

    print(f">> keys: {input_state_dict.keys()}")
    ds_args = input_state_dict.get("args", None)
    #print(f">> ds_args: {ds_args}")

    # Read the config, or default to the model released by NVIDIA.
    if args.config_file == "":
        if ds_args is not None:
            if ds_args.bias_gelu_fusion:
                activation_function = "gelu_fast"
            elif ds_args.openai_gelu:
                activation_function = "gelu_new"
            else:
                activation_function = "gelu"
        else:
            # in the very early days this used to be "gelu_new"
            activation_function = "gelu_new"
        # Spell out all parameters in case the defaults change.
        config = GPT2Config(
            vocab_size=50257,
            n_positions=1024,
            n_embd=1024,
            n_layer=24,
            n_head=16,
            n_inner=4096,
            activation_function=activation_function,
            resid_pdrop=0.1,
            embd_pdrop=0.1,
            attn_pdrop=0.1,
            layer_norm_epsilon=1e-5,
            initializer_range=0.02,
            summary_type="cls_index",
            summary_use_proj=True,
            summary_activation=None,
            summary_proj_to_labels=True,
            summary_first_dropout=0.1,
            scale_attn_weights=True,
            use_cache=True,
            bos_token_id=50256,
            eos_token_id=50256,
        )
    else:
        config = GPT2Config.from_json_file(args.config_file)

    config.architectures = ["GPT2LMHeadModel"]

    # Convert.
    print("Converting")
    output_state_dict = convert_megatron_checkpoint(args, input_state_dict, config)

    # Print the structure of converted state dict.
    if args.print_checkpoint_structure:
        recursive_print(None, output_state_dict)

    tokenizer_model_name = ""
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_model_name)
    tokenizer_class = type(tokenizer).__name__
    config.tokenizer_class = tokenizer_class

    # Store the config to file.
    print("Saving config")
    #config.save_pretrained(basename)
    print(f">> here is the local converter")

    # Save tokenizer based on args
    print(f"Adding {tokenizer_class} tokenizer files")
    #tokenizer.save_pretrained(basename)

    # Store the state_dict to file.
    output_checkpoint_file = os.path.join(basename, "pytorch_model.bin")
    print(f'Saving checkpoint to "{output_checkpoint_file}"')
    torch.save(output_state_dict, output_checkpoint_file)


####################################################################################################

if __name__ == "__main__":
    main()

####################################################################################################
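
The QKV handling is the step most easily broken when adapting this converter to a new head layout, so here is a minimal illustrative sketch (toy sizes, names assumed) of what the checkpoint_version >= 2.0 branch of fix_query_key_value_ordering does before the q/k/v slices are taken:

# Illustrative sketch only (not from the conversion scripts); toy sizes.
# For checkpoint_version >= 2.0 the fused QKV weight is saved as
# [num_heads * num_splits * hidden_size, :]; the permutation below turns it into
# [num_splits * num_heads * hidden_size, :] so q/k/v become contiguous row blocks.
import torch

num_heads, num_splits, hidden_size, cols = 2, 3, 4, 5
w = torch.randn(num_heads * num_splits * hidden_size, cols)

saved_shape = (num_heads, num_splits, hidden_size) + w.size()[1:]
out = w.view(*saved_shape).transpose(0, 1).contiguous().view(*w.size())

# After the permutation the three row blocks are exactly q, k and v, which is
# what the q_proj/k_proj/v_proj slicing in the converter relies on.
q, k, v = out.chunk(num_splits, dim=0)
print(q.shape, k.shape, v.shape)  # each: (num_heads * hidden_size, cols)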
tools/convert_scripts/convert_scripts/Megatron-Deepspeed/tools/covert_checkpoint/__pycache__/deepspeed_checkpoint.cpython-38.pyc (new file, mode 100644)
File added
tools/convert_scripts/convert_scripts/Megatron-Deepspeed/tools/covert_checkpoint/__pycache__/deepspeed_to_megatron.cpython-38.pyc (new file, mode 100644)
File added
tools/convert_scripts/convert_scripts/Megatron-Deepspeed/tools/covert_checkpoint/deepspeed_checkpoint.py (new file, mode 100644)
import os
from typing import Dict
import torch

ZERO_FILE_PREFIX = 'zero_pp_rank_'
LAYER_FILE_PREFIX = 'layer_'
MP_RANK_FILE_PREFIX = 'mp_rank_'
EMBEDDING_LAYER_INDEX = 0
LM_HEAD_LAYER_INDEX = -1
FINAL_LAYER_NORM_INDEX = -2  # original -1 but -1 is the output_layer
ARGS_KEY = 'args'
ITERATION_KEY = 'iteration'
SEQUENTIAL_LAYERS = [
    'input_layernorm.weight', 'input_layernorm.bias',
    'self_attention.dense.bias',
    'post_attention_layernorm.weight', 'post_attention_layernorm.bias',
    'mlp.dense_4h_to_h.bias',
    'position_embeddings.weight'
]

LAYER_CONCAT_DIM = {
    'self_attention.dense.weight': 1,
    'mlp.dense_4h_to_h.weight': 1
}


class DeepSpeedCheckpoint(object):
    def __init__(self, dir, tp_degree=None, pp_degree=None, no_pp=False, share_emb=False):
        self.dir = dir
        self.no_pp = no_pp
        self.file_list = self._get_files(dir)
        self.zero_files = self._get_files_with_context(self.file_list, ZERO_FILE_PREFIX)
        self.layer_files = self._get_files_with_prefix(self.file_list, LAYER_FILE_PREFIX)
        self.mp_rank_files = self._get_files_with_prefix(self.file_list, MP_RANK_FILE_PREFIX)
        self.layer_keys = self._get_layer_keys()
        self.layer_count = len(self.layer_keys)
        if not self.no_pp:
            self.original_tp_degree = len(self._get_files_with_prefix(self.layer_files, f'{LAYER_FILE_PREFIX}01'))
            self.original_pp_degree = len(self.mp_rank_files) // self.original_tp_degree
        else:
            self.original_tp_degree = len(self.mp_rank_files)
            self.original_pp_degree = 1
        self.dp_degree = len(self.zero_files) // (self.original_pp_degree * self.original_tp_degree)
        self.tp_degree = self.original_tp_degree if tp_degree is None else tp_degree
        self.pp_degree = self.original_pp_degree if pp_degree is None else pp_degree

        print(f">> layer_files:")
        for item in self.layer_files[:2]:
            print(f"> {item}")
        print("")
        print(f">> mp_rank_files:")
        for item in self.mp_rank_files:
            print(f"> {item}")
        print("")
        print(f">> layer_keys: {self.layer_keys}")
        print(f">> dp_degree: {self.dp_degree}, tp_degree: {self.tp_degree}, pp_degree: {self.pp_degree}")
        print("")

        self.global_state = {}

        self._sanity_check()
        self.pp_to_transformer_map = self._build_pp_transformer_map()
        if self.original_pp_degree == 1 and self.original_tp_degree == 1:
            self.transformer_file_map = self._build_transformer_file_map_nopp()
        else:
            self.transformer_file_map = self._build_transformer_file_map()
        #self.show_transformer_file_map()
        if not self.no_pp:
            self.tp_to_embedding_map = self._build_tp_other_layer_map(EMBEDDING_LAYER_INDEX)
            if share_emb:
                FINAL_LAYER_NORM_INDEX = -1
                self.tp_to_final_norm_map = self._build_tp_other_layer_map(FINAL_LAYER_NORM_INDEX)
                self.tp_to_lm_head_map = {}
            else:
                FINAL_LAYER_NORM_INDEX = -2
                self.tp_to_lm_head_map = self._build_tp_other_layer_map(LM_HEAD_LAYER_INDEX)
                self.tp_to_final_norm_map = self._build_tp_other_layer_map(FINAL_LAYER_NORM_INDEX)
        self._build_global_state()

    def show_tp_embedding_map(self):
        self._dump_mapping(self.tp_to_embedding_map, 'tp_to_embedding_layers')

    def show_tp_final_norm_map(self):
        self._dump_mapping(self.tp_to_final_norm_map, 'tp_to_final_norm_layers')

    def show_pp_tranformer_map(self):
        self._dump_mapping(self.pp_to_transformer_map, 'pp_to_tranformer_layers')

    def show_transformer_file_map(self):
        self._dump_mapping(self.transformer_file_map, 'rank_to_tranformer_files')

    def _build_global_state(self):
        sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu'))
        self.global_state[ITERATION_KEY] = sd.get(ITERATION_KEY, 0)
        self.global_state[ARGS_KEY] = sd.get(ARGS_KEY, None)

    def get_iteration(self):
        if not ITERATION_KEY in self.global_state:
            sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu'))
            self.global_state[ITERATION_KEY] = sd.get(ITERATION_KEY, 0)
        return self.global_state[ITERATION_KEY]

    def get_embedding_state(self, tp_index: int) -> Dict:
        print(f">> tp_index: {tp_index} searched keys: {self.tp_to_embedding_map.keys()}")
        assert tp_index in self.tp_to_embedding_map.keys()
        for name in self.tp_to_embedding_map[tp_index]:
            print(f">> deepspeed checkpoint.py, name in tp_to_embedding_map: {name}")
        sd_list = [torch.load(fname, map_location=torch.device('cpu')) for fname in self.tp_to_embedding_map[tp_index]]
        sd = self._merge_state_dicts(sd_list)
        return sd

    def get_lm_head_state(self, tp_index: int) -> Dict:
        assert tp_index in self.tp_to_embedding_map.keys()
        for name in self.tp_to_lm_head_map[tp_index]:
            print(f">> deepspeed checkpoint.py, name in tp_to_lm_head_map: {name}")
        sd_list = [torch.load(fname, map_location=torch.device('cpu')) for fname in self.tp_to_lm_head_map[tp_index]]
        print(f">> lm head sd_list: {sd_list}")
        sd = self._merge_state_dicts(sd_list)
        return sd

    def get_args(self):
        if not ARGS_KEY in self.global_state:
            sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu'))
            self.global_state[ARGS_KEY] = sd.get(ARGS_KEY, None)
        return self.global_state[ARGS_KEY]

    def get_transformer_state(self, tp_index: int, pp_index: int) -> list:
        assert tp_index < self.tp_degree
        assert pp_index < self.pp_degree
        t_list = []
        for fname_list in self.transformer_file_map[(tp_index, pp_index)]:
            sd_list = [torch.load(fname, map_location=torch.device('cpu')) for fname in fname_list]
            sd = self._merge_state_dicts(sd_list)
            t_list.append(sd)
        return t_list

    def get_final_norm_state(self, tp_index: int) -> Dict:
        assert tp_index in self.tp_to_final_norm_map.keys()
        sd = torch.load(self.tp_to_final_norm_map[tp_index][0], map_location=torch.device('cpu'))
        return sd

    def _build_tp_other_layer_map(self, layer_index: int):
        assert layer_index < len(self.layer_files)
        layer_files = self._get_files_with_prefix(self.layer_files, self.layer_keys[layer_index])
        layer_file_partitions = self._partition_data(layer_files, self.tp_degree)
        data_map = {i: flist for i, flist in enumerate(layer_file_partitions)}
        return data_map

    def _build_pp_transformer_map(self):
        data_map = {}
        transformer_layers = self.layer_keys[1:-1]
        layers_per_pp = len(transformer_layers) // self.pp_degree
        data_map = {i: transformer_layers[i * layers_per_pp:(i + 1) * layers_per_pp] for i in range(0, self.pp_degree)}
        return data_map

    def _dump_mapping(self, data_map, map_tag=None):
        if map_tag is not None:
            print(f'Dump mapping: {map_tag}')
        for k, v in data_map.items():
            print(f'{k} = {v}')

    # xianfzeng
    def _build_transformer_file_map_nopp(self):
        file_map = {(0, 0): [self.mp_rank_files]}
        return file_map

    def _build_transformer_file_map(self):
        transformer_layer_keys = self.layer_keys[1:-1]
        file_map = {}
        layers_per_pp = len(transformer_layer_keys) // self.pp_degree
        for key_index, layer_key in enumerate(transformer_layer_keys):
            pp_index = key_index // layers_per_pp
            layer_files = self._get_files_with_prefix(self.layer_files, layer_key)
            layer_file_partitions = self._partition_data(layer_files, self.tp_degree)
            for tp_index in range(self.tp_degree):
                map_key = (tp_index, pp_index)
                if not map_key in file_map.keys():
                    file_map[map_key] = []
                file_map[map_key].append(layer_file_partitions[tp_index])
        return file_map

    def _sanity_check(self):
        assert len(self.mp_rank_files) % self.tp_degree == 0
        assert len(self.zero_files) % (self.pp_degree * self.tp_degree) == 0
        if not self.no_pp:
            assert len(self.layer_keys) > 2
            assert (len(self.layer_keys) - 2) % self.pp_degree == 0

    def _get_files_with_prefix(self, all_files, prefix):
        file_list = []
        for file_path in all_files:
            _, fname = os.path.split(file_path)
            if fname.startswith(prefix):
                file_list.append(file_path)
        return sorted(file_list)

    # @ xianfzeng
    def _get_files_with_context(self, all_files, context):
        file_list = []
        for file_path in all_files:
            _, fname = os.path.split(file_path)
            if context in fname:
                file_list.append(file_path)
        return sorted(file_list)

    def validate_files(self):
        for file in self.file_list:
            if not os.path.isfile(file):
                print(f'Error: {file} is not existent')

    def _get_files(self, dir):
        file_list = []
        for root, dirs, files in os.walk(dir):
            for file in files:
                file_list.append(os.path.join(root, file))
        return file_list

    def _get_layer_keys(self):
        key_set = set()
        key_len = len(LAYER_FILE_PREFIX) + 2
        for file_path in self.layer_files:
            _, fname = os.path.split(file_path)
            key_set.add(fname[:key_len])
        return sorted(list(key_set))

    def _partition_data(self, data_list, num_partitions):
        num_elems = len(data_list)
        assert num_elems % num_partitions == 0
        partition_size = num_elems // num_partitions
        partitions_list = [data_list[i:i + partition_size] for i in range(0, num_elems, partition_size)]
        return partitions_list

    def _merge_state_dicts(self, sd_list):
        print(f">> sd_List: {sd_list[0].keys()}")
        #print(f">> sd_list: {sd_list[0]['module']['language_model'].keys()}")
        merged_sd = {}
        for key in sd_list[0].keys():
            if not key in SEQUENTIAL_LAYERS:
                cat_dim = LAYER_CONCAT_DIM.get(key, 0)
                merged_sd[key] = torch.cat([sd[key] for sd in sd_list], dim=cat_dim)
            else:
                merged_sd[key] = sd_list[0][key]
        return merged_sd
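
For orientation, a short illustrative usage sketch (the path is a placeholder) of how DeepSpeedCheckpoint is typically driven by the converter scripts below:

# Illustrative usage sketch only (not from the commit); the folder is a placeholder
# and must contain the usual DeepSpeed files (layer_*, mp_rank_*, zero_pp_rank_*).
from deepspeed_checkpoint import DeepSpeedCheckpoint

ds_checkpoint = DeepSpeedCheckpoint('/path_to_checkpoint/global_step1xxxx',
                                    tp_degree=1, pp_degree=1)
print(ds_checkpoint.get_iteration())        # training iteration read from the mp_rank file
ds_checkpoint.show_pp_tranformer_map()      # which layer_* keys land on each PP rank
embedding_sd = ds_checkpoint.get_embedding_state(0)  # merged word embeddings for TP rank 0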
tools/convert_scripts/convert_scripts/Megatron-Deepspeed/tools/covert_checkpoint/deepspeed_to_megatron.py (new file, mode 100644)
#!/usr/bin/env python

import argparse
import os
import torch
from collections import OrderedDict
from deepspeed_checkpoint import ARGS_KEY, DeepSpeedCheckpoint

MODEL_KEY = 'model'
ARGS_KEY = 'args'
LANGUAGE_MODEL_KEY = 'language_model'
EMBEDDING_KEY = 'embedding'
ENCODER_KEY = 'encoder'
WORD_EMBEDDINGS_FOR_HEAD_KEY = 'word_embeddings_for_head'
WORD_EMBEDDINGS_FOR_HEAD_KEY = 'output_layer'
WORD_EMBEDDINGS_FOR_HEAD_LLAMA_KEY = 'lm_head'
WORD_EMBEDDINGS_KEY = 'word_embeddings'
FINAL_LAYER_NORM_KEY = 'final_layernorm'
CHECKPOINT_VERSION_KEY = 'checkpoint_version'
CHECKPOINT_VERSION_VALUE = 3.0
ITERATION_KEY = 'iteration'


def parse_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_folder', default=None, type=str, help='Input DeepSpeed Checkpoint folder')
    parser.add_argument('--output_folder', default=None, type=str, help='Output Megatron checkpoint folder')
    parser.add_argument('--target_tp', default=1, type=int, help='Target TP degree')
    parser.add_argument('--target_pp', default=1, type=int, help='Target PP degree')
    parser.add_argument('--no_pp', default=False, action='store_true', help='Target PP degree')
    parser.add_argument('--share_emb', default=False, action='store_true', help='Share embedding and lm_head')
    parser.add_argument('--for_release', action='store_true', help='Convert for release purpose, reset some (progress) counters.')
    args = parser.parse_args()
    print(f'args = {args}')
    return args


def _convert_ds_transformer_state(sd_list):
    new_sd = OrderedDict()
    for i, sd in enumerate(sd_list):
        for key, value in sd.items():
            if key == "weight":
                continue
            new_key = f'layers.{i}.{key}'
            new_sd[new_key] = value
    return new_sd


def _create_checkpoint_paths(base_folder, iteration, tp_degree, pp_degree):
    path_list = []
    iter_folder = f'iter_{iteration:07d}'
    for i in range(0, tp_degree):
        path_list.append([])
        for j in range(0, pp_degree):
            rank_folder = f'mp_rank_{i:02d}' if pp_degree == 1 else f'mp_rank_{i:02d}_{j:03d}'
            ckpt_path = os.path.join(rank_folder, 'model_optim_rng.pt')
            path_list[i].append(os.path.join(base_folder, iter_folder, ckpt_path))
    return path_list


def _create_megatron_dict():
    language_model_dict = {
        EMBEDDING_KEY: {},
        ENCODER_KEY: {}
    }
    megatron_dict = {
        MODEL_KEY: {LANGUAGE_MODEL_KEY: language_model_dict},
        CHECKPOINT_VERSION_KEY: CHECKPOINT_VERSION_VALUE
    }
    return megatron_dict


def _save_checkpoint(file_path, chkpt_sd):
    dir, _ = os.path.split(file_path)
    os.makedirs(dir, exist_ok=True)
    torch.save(chkpt_sd, file_path)


def _renest_sd(sd):
    new_sd = OrderedDict()
    for key, value in sd.items():
        a, b = key.split('.')
        new_sd[a] = {b: value}
    return new_sd


def _create_rank_checkpoint(ds_checkpoint, tp_index, pp_index, for_release=False, share_emb=False):
    meg_encoder_sd = OrderedDict()
    meg_embedding_sd = OrderedDict()
    meg_embedding_for_head_sd = OrderedDict()

    transformer_sd = ds_checkpoint.get_transformer_state(tp_index, pp_index)
    meg_encoder_sd.update(_convert_ds_transformer_state(transformer_sd))

    if pp_index in [0, ds_checkpoint.pp_degree - 1]:
        embedding_sd = ds_checkpoint.get_embedding_state(tp_index)
        nested_embedding_sd = _renest_sd(embedding_sd)
        if pp_index == 0:
            meg_embedding_sd.update(nested_embedding_sd)

        if pp_index == ds_checkpoint.pp_degree - 1:
            for key, value in embedding_sd.items():
                print(f">> deepspeed to megatron.py, key: {key}, value: {value}")
                if key.startswith(WORD_EMBEDDINGS_KEY):
                    fields = key.split('.')
                    new_fields = fields[1:]
                    new_key = '.'.join(new_fields)
                    #new_key = "weight"
                    meg_embedding_sd[new_key] = value
                    print(f">> embedding sd: {new_key} {key}")

            if not share_emb:
                for key, value in ds_checkpoint.get_lm_head_state(tp_index).items():
                    if WORD_EMBEDDINGS_FOR_HEAD_LLAMA_KEY in key:
                        fields = key.split('.')
                        new_fields = fields[1:]
                        new_key = '.'.join(new_fields)
                        #new_key = "weight"
                        meg_embedding_for_head_sd[new_key] = value
                        print(f">> embedding for head sd: {new_key} {key}")

            final_norm_sd = ds_checkpoint.get_final_norm_state(tp_index)
            new_final_norm_sd = {}
            for key, value in final_norm_sd.items():
                print(f">> final_norm_sd, key: {key}, value: {value.size()}")
                new_final_norm_sd[f'{FINAL_LAYER_NORM_KEY}.{key}'] = value
            #new_final_norm_sd = {f'{FINAL_LAYER_NORM_KEY}.{key}': value for key, value in final_norm_sd.items()}
            meg_encoder_sd.update(new_final_norm_sd)

    checkpoint_sd = _create_megatron_dict()

    iteration = ds_checkpoint.get_iteration()
    checkpoint_sd[ITERATION_KEY] = iteration
    if pp_index == 0:
        checkpoint_sd[MODEL_KEY][LANGUAGE_MODEL_KEY][EMBEDDING_KEY] = meg_embedding_sd
        print(f">> deepspeed to megatron.py, meg_embedding_sd: {meg_embedding_sd}")
    checkpoint_sd[MODEL_KEY][LANGUAGE_MODEL_KEY][ENCODER_KEY] = meg_encoder_sd
    print(f">> pp_index: {pp_index} pp_degree: {ds_checkpoint.pp_degree}")
    if pp_index == ds_checkpoint.pp_degree - 1 and not share_emb:
        #checkpoint_sd[MODEL_KEY][WORD_EMBEDDINGS_FOR_HEAD_KEY] = meg_embedding_for_head_sd
        checkpoint_sd[MODEL_KEY][LANGUAGE_MODEL_KEY][WORD_EMBEDDINGS_FOR_HEAD_KEY] = meg_embedding_for_head_sd
        print(f">> deepspeed to megatron.py, meg_embedding_for_head_sd: {meg_embedding_for_head_sd}")
    #print(f">> checkpoint_sd[model][output_layer]: {checkpoint_sd[MODEL_KEY][LANGUAGE_MODEL_KEY][WORD_EMBEDDINGS_FOR_HEAD_KEY]}")

    checkpoint_sd[ARGS_KEY] = ds_checkpoint.get_args()
    # Adjust specific fields
    checkpoint_sd[ARGS_KEY].tensor_model_parallel_size = ds_checkpoint.tp_degree
    checkpoint_sd[ARGS_KEY].pipeline_model_parallel_size = ds_checkpoint.pp_degree
    if for_release:
        checkpoint_sd[ARGS_KEY].consumed_train_samples = 0
        checkpoint_sd[ARGS_KEY].consumed_valid_samples = 0

    return checkpoint_sd


def _create_latest_file(base_folder, iteration):
    file_path = os.path.join(base_folder, 'latest_checkpointed_iteration.txt')
    os.makedirs(base_folder, exist_ok=True)
    with open(file_path, 'w') as f:
        f.write(str(iteration))


def main():
    print(f'Convert DeepSpeed Checkpoint to Megatron Checkpoint')
    args = parse_arguments()
    print(f'Converting DeepSpeed checkpoint in {args.input_folder} to Megatron checkpoint in {args.output_folder}')

    ds_checkpoint = DeepSpeedCheckpoint(args.input_folder, args.target_tp, args.target_pp)
    iteration = ds_checkpoint.get_iteration()
    _create_latest_file(args.output_folder, iteration)
    checkpoint_paths = _create_checkpoint_paths(args.output_folder, iteration, ds_checkpoint.tp_degree, ds_checkpoint.pp_degree)
    for i in range(0, ds_checkpoint.tp_degree):
        for j in range(0, ds_checkpoint.pp_degree):
            sd = _create_rank_checkpoint(ds_checkpoint, i, j, args.for_release)
            _save_checkpoint(checkpoint_paths[i][j], sd)


if __name__ == "__main__":
    main()
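
As an illustration of the output layout (iteration, TP and PP values are assumed example numbers), this is what _create_checkpoint_paths produces before _save_checkpoint writes each rank file:

# Illustrative sketch only; iteration/degrees are assumed example values.
from deepspeed_to_megatron import _create_checkpoint_paths

paths = _create_checkpoint_paths('/path_to_output', iteration=10000, tp_degree=2, pp_degree=2)
for tp_paths in paths:
    for p in tp_paths:
        print(p)
# /path_to_output/iter_0010000/mp_rank_00_000/model_optim_rng.pt
# /path_to_output/iter_0010000/mp_rank_00_001/model_optim_rng.pt
# /path_to_output/iter_0010000/mp_rank_01_000/model_optim_rng.pt
# /path_to_output/iter_0010000/mp_rank_01_001/model_optim_rng.pt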
tools/convert_scripts/convert_scripts/Megatron-Deepspeed/tools/covert_checkpoint/deepspeed_to_transformers.py (new file, mode 100644)
#!/usr/bin/env python

import os
import torch
import json

from deepspeed_checkpoint import DeepSpeedCheckpoint
from deepspeed_to_megatron import _create_rank_checkpoint, parse_arguments

# the import was tested to work with this version
# https://github.com/huggingface/transformers/commit/0af901e83 if it diverges we may consider
# copying that version here instead
#from transformers.models.megatron_gpt2.convert_megatron_gpt2_checkpoint import convert_megatron_checkpoint
from convert_megatron_gpt2_checkpoint import convert_megatron_checkpoint
from transformers import GPT2Config, AutoTokenizer, LlamaTokenizer


def main():
    # this first part comes mainly from deepspeed_to_megatron.main
    args = parse_arguments()
    print(f'Converting DeepSpeed checkpoint in {args.input_folder} to HF Transformers checkpoint in {args.output_folder}')

    ds_checkpoint = DeepSpeedCheckpoint(args.input_folder, args.target_tp, args.target_pp, args.no_pp)
    iteration = ds_checkpoint.get_iteration()
    input_state_dict = _create_rank_checkpoint(ds_checkpoint, 0, 0, args.for_release)

    # the 2nd part comes from transformers.models.megatron_gpt2.convert_megatron_gpt2_checkpoint.main
    # Spell out all parameters in case the defaults change.
    config = None
    config = GPT2Config(
        vocab_size=50257,
        n_positions=1024,
        n_ctx=1024,
        n_embd=1024,
        n_layer=24,
        n_head=16,
        n_inner=4096,
        activation_function="gelu",  # used to be "gelu_new" in earlier versions
        resid_pdrop=0.1,
        embd_pdrop=0.1,
        attn_pdrop=0.1,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        summary_type="cls_index",
        summary_use_proj=True,
        summary_activation=None,
        summary_proj_to_labels=True,
        summary_first_dropout=0.1,
        scale_attn_weights=True,
        gradient_checkpointing=False,
        use_cache=True,
        bos_token_id=50256,
        eos_token_id=50256,
    )

    # Convert.
    print("Converting to HF Checkpoint")
    print(f"input_state_dict: {input_state_dict['model'].keys()}")
    print(f"language model: {input_state_dict['model']['language_model'].keys()}")
    print(f"language model: {input_state_dict['model']['language_model']['encoder']['layers.23.mlp.dense_h_to_4h.weight'].size()}")
    output_state_dict = convert_megatron_checkpoint(args, input_state_dict, config, ds_checkpoint.original_tp_degree)

    basename = args.output_folder
    os.makedirs(basename, exist_ok=True)

    # Print the structure of converted state dict.
    #if args.print_checkpoint_structure:
    #    recursive_print(None, output_state_dict)

    # Store the config to file.
    #please prepare your config.json according to your model

    # Store the state_dict to file.
    output_checkpoint_file = os.path.join(basename, "pytorch_model.bin")
    print(f'Saving checkpoint to "{output_checkpoint_file}"')
    torch.save(output_state_dict, output_checkpoint_file)


if __name__ == "__main__":
    main()
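
A hedged follow-up sketch of loading the converted pytorch_model.bin with Transformers; it assumes you have already written a matching config.json (and tokenizer files) into the output folder, as the comment in the script above asks, and that the target architecture is LLaMA. Paths and class choices are assumptions, not part of the commit:

# Illustrative sketch only; paths, config.json and the LLaMA classes are assumptions,
# not part of the conversion scripts.
import torch
from transformers import LlamaConfig, LlamaForCausalLM

output_folder = '/path_to_checkpoint/global_step1xxxx_hf'  # placeholder
config = LlamaConfig.from_pretrained(output_folder)        # needs a prepared config.json
model = LlamaForCausalLM(config)

state_dict = torch.load(f'{output_folder}/pytorch_model.bin', map_location='cpu')
missing, unexpected = model.load_state_dict(state_dict, strict=False)
print('missing keys:', missing)
print('unexpected keys:', unexpected)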
tools/convert_scripts/convert_scripts/convert.sh (new file, mode 100644)
# please change the relating path according to your env
WORK_DIR=/path_to_convert_scripts/Megatron-Deepspeed
CODE_DIR=$WORK_DIR
export PYTHONPATH=${WORK_DIR}:$PYTHONPATH
export PYTHONPATH=/path_to_Megatron-DeepSpeed-main:$PYTHONPATH

checkpoint_dir=/path_to_checkpoint/global_step1xxxx

python3 $WORK_DIR/tools/covert_checkpoint/deepspeed_to_transformers.py \
    --input_folder ${checkpoint_dir} \
    --output_folder ${checkpoint_dir}_hf \