Project: evt_fugx1 / dcu_megatron — Commits

Commit 475293db, authored May 21, 2025 by wxj
Commit message: A100 adaptation ("A100适配")
Parent: cd1a6dfe

Showing 3 changed files, with 89 additions and 92 deletions:
  dcu_megatron/adaptor/megatron_adaptor.py    +77  −77
  examples/llama/run_llama2_7B.sh             +8   −12
  examples/llama/train_llama2_7b_1nodes.sh    +4   −3
dcu_megatron/adaptor/megatron_adaptor.py

@@ -100,39 +100,39 @@ class CoreAdaptation(MegatronAdaptationABC):
         from ..core.models.gpt.gpt_model import gpt_model_init_wrapper, gpt_model_forward

         # GPT Model
-        MegatronAdaptation.register('megatron.core.models.gpt.gpt_model.GPTModel.__init__',
-                                    gpt_model_init_wrapper,
-                                    apply_wrapper=True)
-        MegatronAdaptation.register('megatron.core.models.gpt.gpt_model.GPTModel.forward',
-                                    gpt_model_forward)
+        # MegatronAdaptation.register('megatron.core.models.gpt.gpt_model.GPTModel.__init__',
+        #                             gpt_model_init_wrapper,
+        #                             apply_wrapper=True)
+        # MegatronAdaptation.register('megatron.core.models.gpt.gpt_model.GPTModel.forward',
+        #                             gpt_model_forward)

     def patch_core_transformers(self):
         from ..core import transformer_block_init_wrapper
         from ..core.transformer.transformer_config import TransformerConfigPatch, MLATransformerConfigPatch

         # Transformer block. If mtp_num_layers > 0, move final_layernorm outside
-        MegatronAdaptation.register('megatron.core.transformer.transformer_block.TransformerBlock.__init__',
-                                    transformer_block_init_wrapper)
-        # Transformer config
-        MegatronAdaptation.register('megatron.core.transformer.transformer_config.TransformerConfig',
-                                    TransformerConfigPatch)
-        MegatronAdaptation.register('megatron.core.transformer.transformer_config.MLATransformerConfig',
-                                    MLATransformerConfigPatch)
-        # Moe
-        # MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.topk_softmax_with_capacity',
-        #                             torch.compile(options={"triton.cudagraphs": True, "triton.cudagraph_trees": False}),
-        #                             apply_wrapper=True)
-        # MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.switch_load_balancing_loss_func',
-        #                             torch.compile(options={"triton.cudagraphs": True, "triton.cudagraph_trees": False, "triton.cudagraph_support_input_mutation":True}),
-        #                             apply_wrapper=True)
-        MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.permute',
-                                    torch.compile(mode='max-autotune-no-cudagraphs'),
-                                    apply_wrapper=True)
-        MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.unpermute',
-                                    torch.compile(mode='max-autotune-no-cudagraphs'),
-                                    apply_wrapper=True)
+        # MegatronAdaptation.register('megatron.core.transformer.transformer_block.TransformerBlock.__init__',
+        #                             transformer_block_init_wrapper)
+        # # Transformer config
+        # MegatronAdaptation.register('megatron.core.transformer.transformer_config.TransformerConfig',
+        #                             TransformerConfigPatch)
+        # MegatronAdaptation.register('megatron.core.transformer.transformer_config.MLATransformerConfig',
+        #                             MLATransformerConfigPatch)
+        # # Moe
+        # # MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.topk_softmax_with_capacity',
+        # #                             torch.compile(options={"triton.cudagraphs": True, "triton.cudagraph_trees": False}),
+        # #                             apply_wrapper=True)
+        # # MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.switch_load_balancing_loss_func',
+        # #                             torch.compile(options={"triton.cudagraphs": True, "triton.cudagraph_trees": False, "triton.cudagraph_support_input_mutation":True}),
+        # #                             apply_wrapper=True)
+        # MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.permute',
+        #                             torch.compile(mode='max-autotune-no-cudagraphs'),
+        #                             apply_wrapper=True)
+        # MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.unpermute',
+        #                             torch.compile(mode='max-autotune-no-cudagraphs'),
+        #                             apply_wrapper=True)

     def patch_core_extentions(self):
         import transformer_engine as te

@@ -152,31 +152,31 @@ class CoreAdaptation(MegatronAdaptationABC):
         from ..core.tensor_parallel.cross_entropy import VocabParallelCrossEntropy

         # VocabParallelEmbedding
-        MegatronAdaptation.register('megatron.core.tensor_parallel.layers.VocabParallelEmbedding.forward',
-                                    torch.compile(mode='max-autotune-no-cudagraphs'),
-                                    apply_wrapper=True)
-        # VocabParallelCrossEntropy
-        MegatronAdaptation.register('megatron.core.tensor_parallel.cross_entropy.VocabParallelCrossEntropy.calculate_predicted_logits',
-                                    VocabParallelCrossEntropy.calculate_predicted_logits)
-        # _VocabParallelCrossEntropy
-        MegatronAdaptation.register('megatron.core.tensor_parallel.cross_entropy._VocabParallelCrossEntropy.forward',
-                                    remove_origin_wrappers=True)
-        MegatronAdaptation.register('megatron.core.tensor_parallel.cross_entropy._VocabParallelCrossEntropy.forward',
-                                    torch.compile(mode='max-autotune-no-cudagraphs'),
-                                    apply_wrapper=True)
-        MegatronAdaptation.register('megatron.core.tensor_parallel.cross_entropy._VocabParallelCrossEntropy.forward',
-                                    staticmethod,
-                                    apply_wrapper=True)
-        # reduce_scatter_to_sequence_parallel_region
-        MegatronAdaptation.register('megatron.core.tensor_parallel.mappings.reduce_scatter_to_sequence_parallel_region',
-                                    torch._dynamo.disable,
-                                    apply_wrapper=True)
-        # reduce_from_tensor_model_parallel_region
-        MegatronAdaptation.register('megatron.core.tensor_parallel.mappings.reduce_from_tensor_model_parallel_region',
-                                    torch._dynamo.disable,
-                                    apply_wrapper=True)
+        # MegatronAdaptation.register('megatron.core.tensor_parallel.layers.VocabParallelEmbedding.forward',
+        #                             torch.compile(mode='max-autotune-no-cudagraphs'),
+        #                             apply_wrapper=True)
+        # # VocabParallelCrossEntropy
+        # MegatronAdaptation.register('megatron.core.tensor_parallel.cross_entropy.VocabParallelCrossEntropy.calculate_predicted_logits',
+        #                             VocabParallelCrossEntropy.calculate_predicted_logits)
+        # # _VocabParallelCrossEntropy
+        # MegatronAdaptation.register('megatron.core.tensor_parallel.cross_entropy._VocabParallelCrossEntropy.forward',
+        #                             remove_origin_wrappers=True)
+        # MegatronAdaptation.register('megatron.core.tensor_parallel.cross_entropy._VocabParallelCrossEntropy.forward',
+        #                             torch.compile(mode='max-autotune-no-cudagraphs'),
+        #                             apply_wrapper=True)
+        # MegatronAdaptation.register('megatron.core.tensor_parallel.cross_entropy._VocabParallelCrossEntropy.forward',
+        #                             staticmethod,
+        #                             apply_wrapper=True)
+        # # reduce_scatter_to_sequence_parallel_region
+        # MegatronAdaptation.register('megatron.core.tensor_parallel.mappings.reduce_scatter_to_sequence_parallel_region',
+        #                             torch._dynamo.disable,
+        #                             apply_wrapper=True)
+        # # reduce_from_tensor_model_parallel_region
+        # MegatronAdaptation.register('megatron.core.tensor_parallel.mappings.reduce_from_tensor_model_parallel_region',
+        #                             torch._dynamo.disable,
+        #                             apply_wrapper=True)

         # flux
         if int(os.getenv("USE_FLUX_OVERLAP", "0")):

@@ -186,12 +186,12 @@ class CoreAdaptation(MegatronAdaptationABC):
             )
             from ..core.models.gpt.gpt_layer_specs import get_gpt_layer_with_flux_spec
-            MegatronAdaptation.register("megatron.core.extensions.transformer_engine.TEColumnParallelLinear",
-                                        FluxColumnParallelLinear)
-            MegatronAdaptation.register("megatron.core.extensions.transformer_engine.TERowParallelLinear",
-                                        FluxRowParallelLinear)
-            MegatronAdaptation.register("megatron.core.models.gpt.gpt_layer_specs.get_gpt_layer_with_transformer_engine_spec",
-                                        get_gpt_layer_with_flux_spec)
+            # MegatronAdaptation.register("megatron.core.extensions.transformer_engine.TEColumnParallelLinear",
+            #                             FluxColumnParallelLinear)
+            # MegatronAdaptation.register("megatron.core.extensions.transformer_engine.TERowParallelLinear",
+            #                             FluxRowParallelLinear)
+            # MegatronAdaptation.register("megatron.core.models.gpt.gpt_layer_specs.get_gpt_layer_with_transformer_engine_spec",
+            #                             get_gpt_layer_with_flux_spec)

     def patch_training(self):
         from ..training.tokenizer import build_tokenizer

@@ -209,9 +209,9 @@ class CoreAdaptation(MegatronAdaptationABC):
         MegatronAdaptation.register('megatron.training.initialize._compile_dependencies',
                                     _compile_dependencies)
-        # 添加固定seed
-        MegatronAdaptation.register('megatron.training.initialize._set_random_seed',
-                                    _set_random_seed)
+        # # 添加固定seed
+        # MegatronAdaptation.register('megatron.training.initialize._set_random_seed',
+        #                             _set_random_seed)
         # add trace_handler
         MegatronAdaptation.register('megatron.training.training.train',

@@ -239,24 +239,24 @@ class LegacyAdaptation(MegatronAdaptationABC):
         )
         from ..legacy.model.utils import get_norm

-        # ParallecMLP
-        MegatronAdaptation.register('megatron.legacy.model.transformer.ParallelMLP.__init__',
-                                    parallel_mlp_init_wrapper,
-                                    apply_wrapper=True)
-        # ParallelAttention
-        MegatronAdaptation.register('megatron.legacy.model.transformer.ParallelAttention.__init__',
-                                    parallel_attention_init_wrapper,
-                                    apply_wrapper=True)
-        MegatronAdaptation.register('megatron.legacy.model.transformer.ParallelAttention.forward',
-                                    ParallelAttentionPatch.forward)
-        # rms_norm.RMSNorm
-        MegatronAdaptation.register('megatron.legacy.model.rms_norm.RMSNorm.forward',
-                                    torch.compile(mode="max-autotune-no-cudagraphs"),
-                                    apply_wrapper=True)
-        MegatronAdaptation.register('megatron.legacy.model.utils.get_norm',
-                                    get_norm)
+        # # ParallecMLP
+        # MegatronAdaptation.register('megatron.legacy.model.transformer.ParallelMLP.__init__',
+        #                             parallel_mlp_init_wrapper,
+        #                             apply_wrapper=True)
+        # # ParallelAttention
+        # MegatronAdaptation.register('megatron.legacy.model.transformer.ParallelAttention.__init__',
+        #                             parallel_attention_init_wrapper,
+        #                             apply_wrapper=True)
+        # MegatronAdaptation.register('megatron.legacy.model.transformer.ParallelAttention.forward',
+        #                             ParallelAttentionPatch.forward)
+        # # rms_norm.RMSNorm
+        # MegatronAdaptation.register('megatron.legacy.model.rms_norm.RMSNorm.forward',
+        #                             torch.compile(mode="max-autotune-no-cudagraphs"),
+        #                             apply_wrapper=True)
+        # MegatronAdaptation.register('megatron.legacy.model.utils.get_norm',
+        #                             get_norm)

 MegatronAdaptation.execute()
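The changes in this file are mechanical: the `MegatronAdaptation.register(...)` calls that patch Megatron-LM internals for the DCU build are commented out for the A100 run, leaving only `MegatronAdaptation.execute()` active. The registry itself is not part of this commit, so the following is only a minimal sketch under the assumption that `register()` queues a monkey patch for a dotted attribute path and `execute()` applies the queue; `PatchRegistry` and `_resolve` are hypothetical names, and options seen above such as `remove_origin_wrappers=True` are not modeled.

import importlib


class PatchRegistry:
    """Illustrative register/execute patch registry. A sketch only, not the
    actual MegatronAdaptation class from dcu_megatron."""

    _patches = []  # queued (dotted_path, replacement, apply_wrapper) triples

    @classmethod
    def register(cls, dotted_path, replacement=None, apply_wrapper=False):
        # Queue a patch; nothing is modified until execute() runs.
        cls._patches.append((dotted_path, replacement, apply_wrapper))

    @classmethod
    def execute(cls):
        for dotted_path, replacement, apply_wrapper in cls._patches:
            owner, attr = cls._resolve(dotted_path)
            if apply_wrapper:
                # Treat the replacement as a wrapper/decorator around the
                # existing attribute (e.g. torch.compile or a *_wrapper function).
                replacement = replacement(getattr(owner, attr))
            setattr(owner, attr, replacement)

    @staticmethod
    def _resolve(dotted_path):
        # Import the longest importable module prefix, then walk the remaining
        # attributes (module -> class) down to the owner of the final name.
        parts = dotted_path.split('.')
        for i in range(len(parts) - 1, 0, -1):
            try:
                obj = importlib.import_module('.'.join(parts[:i]))
            except ModuleNotFoundError:
                continue
            for name in parts[i:-1]:
                obj = getattr(obj, name)
            return obj, parts[-1]
        raise ImportError(f"cannot resolve {dotted_path!r}")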
examples/llama/run_llama2_7B.sh

@@ -6,24 +6,20 @@
 done

 # Those variables need to modify
-GPUS=""                     # how many gpus to use
-DTK_ENV=""                  # where env.sh of dtk
-NCCL_ENV=""                 # where env.sh of nccl (requirements/nccl_wz/env.sh or requirements/nccl_zz/env.sh)
-HOST=""                     # hostname
-PORT=""                     # port id
-DATA_PATH=""                # path to oscar-1GB_head-llama2_text_document
-TOKENIZER_MODEL_PATH=""     # path to tokenizer.model
-CHECKPOINT_PATH=""          # path to ckpt
+GPUS="4"                                                        # how many gpus to use
+HOST="localhost"                                                # hostname
+PORT="11451"                                                    # port id
+DATA_PATH="/data/datasets/oscar-1GB_head-llama2_text_document"  # path to oscar-1GB_head-llama2_text_document
+TOKENIZER_MODEL_PATH="/data/models/llama2/tokenizer.model"      # path to tokenizer.model
+CHECKPOINT_PATH="./ckpt"                                        # path to ckpt

 # Runs Llama2 7B model
-mpirun -np ${GPUS} --hostfile hostfile_llama2_7B \
+mpirun -np ${GPUS} --hostfile hostfile \
        --allow-run-as-root \
        --bind-to none \
        --mca plm_rsh_no_tree_spawn 1 \
        bash -c "
-       source ${DTK_ENV} && \
-       source ${NCCL_ENV} && \
-       ./train_llama2_7b_$((${GPUS} / 8))nodes.sh \
+       ./train_llama2_7b_1nodes.sh \
        ${HOST} \
        ${PORT} \
        --data_path=$DATA_PATH \
...
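After this change the launcher carries working single-node defaults instead of blank placeholders, and it no longer sources the DTK/NCCL environment files before calling the training script. A hypothetical invocation on one node with four A100s might look like the sketch below; the hostfile contents are an assumption for illustration and are not part of the commit.

# Assumed usage of the updated launcher (not shown in the commit).
cd examples/llama
echo "localhost slots=4" > hostfile   # consumed by mpirun --hostfile hostfile
bash run_llama2_7B.sh                 # uses the GPUS=4, HOST=localhost, PORT=11451 defaults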
examples/llama/train_llama2_7b_1nodes.sh

@@ -66,7 +66,7 @@ GPT_MODEL_ARGS=(
     --ffn-hidden-size 11008
     --num-attention-heads 32
     --max-position-embeddings 4096
-    --normalization LightopRMSNorm
+    --normalization RMSNorm
     --position-embedding-type rope
     --untie-embeddings-and-output-weights
 )

@@ -75,7 +75,7 @@ TRAINING_ARGS=(
     --transformer-impl local
     --use-legacy-models
     --micro-batch-size 1
-    --global-batch-size 256
+    --global-batch-size 64
     --train-iters 50
     --weight-decay 0.1
     --adam-beta1 0.9

@@ -159,4 +159,5 @@ elif [[ $profiling == "hip" ]]; then
 fi
 #for hygon cpu
-${MEGATRON_PATH}/requirements/launch_with_binding.sh ${LOCAL_RANK} ${APP}
\ No newline at end of file
+export CUDA_VISIBLE_DEVICES=4,5,6,7
+${APP}
\ No newline at end of file
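The last hunk replaces the Hygon CPU core-binding wrapper with a direct launch pinned to devices 4-7 via CUDA_VISIBLE_DEVICES, which assumes an 8-GPU host. A more portable tail might look like the sketch below; this is an assumption for illustration, not what the commit does.

# Hypothetical alternative tail: respect an externally provided device list,
# otherwise default to the first four GPUs.
export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0,1,2,3}
${APP}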