Commit 475293db authored by wxj

A100 adaptation

parent cd1a6dfe
@@ -100,39 +100,39 @@ class CoreAdaptation(MegatronAdaptationABC):
         from ..core.models.gpt.gpt_model import gpt_model_init_wrapper, gpt_model_forward
         # GPT Model
-        MegatronAdaptation.register('megatron.core.models.gpt.gpt_model.GPTModel.__init__',
-                                    gpt_model_init_wrapper,
-                                    apply_wrapper=True)
-        MegatronAdaptation.register('megatron.core.models.gpt.gpt_model.GPTModel.forward',
-                                    gpt_model_forward)
+        # MegatronAdaptation.register('megatron.core.models.gpt.gpt_model.GPTModel.__init__',
+        #                             gpt_model_init_wrapper,
+        #                             apply_wrapper=True)
+        # MegatronAdaptation.register('megatron.core.models.gpt.gpt_model.GPTModel.forward',
+        #                             gpt_model_forward)
 
     def patch_core_transformers(self):
         from ..core import transformer_block_init_wrapper
         from ..core.transformer.transformer_config import TransformerConfigPatch, MLATransformerConfigPatch
         # Transformer block. If mtp_num_layers > 0, move final_layernorm outside
-        MegatronAdaptation.register('megatron.core.transformer.transformer_block.TransformerBlock.__init__',
-                                    transformer_block_init_wrapper)
-        # Transformer config
-        MegatronAdaptation.register('megatron.core.transformer.transformer_config.TransformerConfig',
-                                    TransformerConfigPatch)
-        MegatronAdaptation.register('megatron.core.transformer.transformer_config.MLATransformerConfig',
-                                    MLATransformerConfigPatch)
-        # Moe
-        # MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.topk_softmax_with_capacity',
-        #                             torch.compile(options={"triton.cudagraphs": True, "triton.cudagraph_trees": False}),
-        #                             apply_wrapper=True)
-        # MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.switch_load_balancing_loss_func',
-        #                             torch.compile(options={"triton.cudagraphs": True, "triton.cudagraph_trees": False, "triton.cudagraph_support_input_mutation":True}),
-        #                             apply_wrapper=True)
-        MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.permute',
-                                    torch.compile(mode='max-autotune-no-cudagraphs'),
-                                    apply_wrapper=True)
-        MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.unpermute',
-                                    torch.compile(mode='max-autotune-no-cudagraphs'),
-                                    apply_wrapper=True)
+        # MegatronAdaptation.register('megatron.core.transformer.transformer_block.TransformerBlock.__init__',
+        #                             transformer_block_init_wrapper)
+        # # Transformer config
+        # MegatronAdaptation.register('megatron.core.transformer.transformer_config.TransformerConfig',
+        #                             TransformerConfigPatch)
+        # MegatronAdaptation.register('megatron.core.transformer.transformer_config.MLATransformerConfig',
+        #                             MLATransformerConfigPatch)
+        # # Moe
+        # # MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.topk_softmax_with_capacity',
+        # #                             torch.compile(options={"triton.cudagraphs": True, "triton.cudagraph_trees": False}),
+        # #                             apply_wrapper=True)
+        # # MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.switch_load_balancing_loss_func',
+        # #                             torch.compile(options={"triton.cudagraphs": True, "triton.cudagraph_trees": False, "triton.cudagraph_support_input_mutation":True}),
+        # #                             apply_wrapper=True)
+        # MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.permute',
+        #                             torch.compile(mode='max-autotune-no-cudagraphs'),
+        #                             apply_wrapper=True)
+        # MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.unpermute',
+        #                             torch.compile(mode='max-autotune-no-cudagraphs'),
+        #                             apply_wrapper=True)
 
     def patch_core_extentions(self):
         import transformer_engine as te
@@ -152,31 +152,31 @@ class CoreAdaptation(MegatronAdaptationABC):
         from ..core.tensor_parallel.cross_entropy import VocabParallelCrossEntropy
         # VocabParallelEmbedding
-        MegatronAdaptation.register('megatron.core.tensor_parallel.layers.VocabParallelEmbedding.forward',
-                                    torch.compile(mode='max-autotune-no-cudagraphs'),
-                                    apply_wrapper=True)
-        # VocabParallelCrossEntropy
-        MegatronAdaptation.register('megatron.core.tensor_parallel.cross_entropy.VocabParallelCrossEntropy.calculate_predicted_logits',
-                                    VocabParallelCrossEntropy.calculate_predicted_logits)
-        # _VocabParallelCrossEntropy
-        MegatronAdaptation.register('megatron.core.tensor_parallel.cross_entropy._VocabParallelCrossEntropy.forward',
-                                    remove_origin_wrappers=True)
-        MegatronAdaptation.register('megatron.core.tensor_parallel.cross_entropy._VocabParallelCrossEntropy.forward',
-                                    torch.compile(mode='max-autotune-no-cudagraphs'),
-                                    apply_wrapper=True)
-        MegatronAdaptation.register('megatron.core.tensor_parallel.cross_entropy._VocabParallelCrossEntropy.forward',
-                                    staticmethod,
-                                    apply_wrapper=True)
-        # reduce_scatter_to_sequence_parallel_region
-        MegatronAdaptation.register('megatron.core.tensor_parallel.mappings.reduce_scatter_to_sequence_parallel_region',
-                                    torch._dynamo.disable,
-                                    apply_wrapper=True)
-        # reduce_from_tensor_model_parallel_region
-        MegatronAdaptation.register('megatron.core.tensor_parallel.mappings.reduce_from_tensor_model_parallel_region',
-                                    torch._dynamo.disable,
-                                    apply_wrapper=True)
+        # MegatronAdaptation.register('megatron.core.tensor_parallel.layers.VocabParallelEmbedding.forward',
+        #                             torch.compile(mode='max-autotune-no-cudagraphs'),
+        #                             apply_wrapper=True)
+        # # VocabParallelCrossEntropy
+        # MegatronAdaptation.register('megatron.core.tensor_parallel.cross_entropy.VocabParallelCrossEntropy.calculate_predicted_logits',
+        #                             VocabParallelCrossEntropy.calculate_predicted_logits)
+        # # _VocabParallelCrossEntropy
+        # MegatronAdaptation.register('megatron.core.tensor_parallel.cross_entropy._VocabParallelCrossEntropy.forward',
+        #                             remove_origin_wrappers=True)
+        # MegatronAdaptation.register('megatron.core.tensor_parallel.cross_entropy._VocabParallelCrossEntropy.forward',
+        #                             torch.compile(mode='max-autotune-no-cudagraphs'),
+        #                             apply_wrapper=True)
+        # MegatronAdaptation.register('megatron.core.tensor_parallel.cross_entropy._VocabParallelCrossEntropy.forward',
+        #                             staticmethod,
+        #                             apply_wrapper=True)
+        # # reduce_scatter_to_sequence_parallel_region
+        # MegatronAdaptation.register('megatron.core.tensor_parallel.mappings.reduce_scatter_to_sequence_parallel_region',
+        #                             torch._dynamo.disable,
+        #                             apply_wrapper=True)
+        # # reduce_from_tensor_model_parallel_region
+        # MegatronAdaptation.register('megatron.core.tensor_parallel.mappings.reduce_from_tensor_model_parallel_region',
+        #                             torch._dynamo.disable,
+        #                             apply_wrapper=True)
         # flux
         if int(os.getenv("USE_FLUX_OVERLAP", "0")):
@@ -186,12 +186,12 @@ class CoreAdaptation(MegatronAdaptationABC):
             )
             from ..core.models.gpt.gpt_layer_specs import get_gpt_layer_with_flux_spec
-            MegatronAdaptation.register("megatron.core.extensions.transformer_engine.TEColumnParallelLinear",
-                                        FluxColumnParallelLinear)
-            MegatronAdaptation.register("megatron.core.extensions.transformer_engine.TERowParallelLinear",
-                                        FluxRowParallelLinear)
-            MegatronAdaptation.register("megatron.core.models.gpt.gpt_layer_specs.get_gpt_layer_with_transformer_engine_spec",
-                                        get_gpt_layer_with_flux_spec)
+            # MegatronAdaptation.register("megatron.core.extensions.transformer_engine.TEColumnParallelLinear",
+            #                             FluxColumnParallelLinear)
+            # MegatronAdaptation.register("megatron.core.extensions.transformer_engine.TERowParallelLinear",
+            #                             FluxRowParallelLinear)
+            # MegatronAdaptation.register("megatron.core.models.gpt.gpt_layer_specs.get_gpt_layer_with_transformer_engine_spec",
+            #                             get_gpt_layer_with_flux_spec)
 
     def patch_training(self):
         from ..training.tokenizer import build_tokenizer
@@ -209,9 +209,9 @@ class CoreAdaptation(MegatronAdaptationABC):
         MegatronAdaptation.register('megatron.training.initialize._compile_dependencies',
                                     _compile_dependencies)
-        # Add a fixed seed
-        MegatronAdaptation.register('megatron.training.initialize._set_random_seed',
-                                    _set_random_seed)
+        # # Add a fixed seed
+        # MegatronAdaptation.register('megatron.training.initialize._set_random_seed',
+        #                             _set_random_seed)
         # add trace_handler
         MegatronAdaptation.register('megatron.training.training.train',
@@ -239,24 +239,24 @@ class LegacyAdaptation(MegatronAdaptationABC):
         )
         from ..legacy.model.utils import get_norm
-        # ParallecMLP
-        MegatronAdaptation.register('megatron.legacy.model.transformer.ParallelMLP.__init__',
-                                    parallel_mlp_init_wrapper,
-                                    apply_wrapper=True)
-        # ParallelAttention
-        MegatronAdaptation.register('megatron.legacy.model.transformer.ParallelAttention.__init__',
-                                    parallel_attention_init_wrapper,
-                                    apply_wrapper=True)
-        MegatronAdaptation.register('megatron.legacy.model.transformer.ParallelAttention.forward',
-                                    ParallelAttentionPatch.forward)
-        # rms_norm.RMSNorm
-        MegatronAdaptation.register('megatron.legacy.model.rms_norm.RMSNorm.forward',
-                                    torch.compile(mode="max-autotune-no-cudagraphs"),
-                                    apply_wrapper=True)
-        MegatronAdaptation.register('megatron.legacy.model.utils.get_norm',
-                                    get_norm)
+        # # ParallecMLP
+        # MegatronAdaptation.register('megatron.legacy.model.transformer.ParallelMLP.__init__',
+        #                             parallel_mlp_init_wrapper,
+        #                             apply_wrapper=True)
+        # # ParallelAttention
+        # MegatronAdaptation.register('megatron.legacy.model.transformer.ParallelAttention.__init__',
+        #                             parallel_attention_init_wrapper,
+        #                             apply_wrapper=True)
+        # MegatronAdaptation.register('megatron.legacy.model.transformer.ParallelAttention.forward',
+        #                             ParallelAttentionPatch.forward)
+        # # rms_norm.RMSNorm
+        # MegatronAdaptation.register('megatron.legacy.model.rms_norm.RMSNorm.forward',
+        #                             torch.compile(mode="max-autotune-no-cudagraphs"),
+        #                             apply_wrapper=True)
+        # MegatronAdaptation.register('megatron.legacy.model.utils.get_norm',
+        #                             get_norm)
 
 MegatronAdaptation.execute()
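
The diff above only touches call sites of MegatronAdaptation.register(...) and the final MegatronAdaptation.execute(); the adaptation class itself is defined elsewhere in the repository. As a rough sketch of how a register-then-execute monkey-patching helper of this kind usually works (the class name, method signatures, and behaviour below are assumptions for illustration, not this project's actual implementation):

```python
# Illustrative sketch only: the real MegatronAdaptation is not shown in this
# commit, so everything here is an assumption about how such a patcher behaves.
import importlib


def _resolve(target):
    """Split a dotted path like 'pkg.mod.Class.attr' into (owner object, attr name)."""
    parts = target.split('.')
    # import the longest prefix that is an importable module, then getattr the rest
    for i in range(len(parts) - 1, 0, -1):
        try:
            owner = importlib.import_module('.'.join(parts[:i]))
            break
        except ModuleNotFoundError:
            continue
    else:
        raise ImportError(f"cannot import any prefix of {target!r}")
    for name in parts[i:-1]:
        owner = getattr(owner, name)
    return owner, parts[-1]


class SketchAdaptation:
    """Collects (target, patch) pairs via register() and applies them in execute()."""

    def __init__(self):
        self._patches = []

    def register(self, target, patch, apply_wrapper=False):
        # apply_wrapper=True means "wrap the original" (e.g. torch.compile or a
        # decorator such as staticmethod); otherwise the patch replaces it outright.
        self._patches.append((target, patch, apply_wrapper))

    def execute(self):
        for target, patch, apply_wrapper in self._patches:
            owner, attr = _resolve(target)
            original = getattr(owner, attr)
            setattr(owner, attr, patch(original) if apply_wrapper else patch)
```

Under that reading, commenting a register() call out, as this commit does for the A100 run, simply leaves the upstream Megatron implementation in place.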
@@ -6,24 +6,20 @@ do
 done
 
 # Those variables need to modify
-GPUS=""                     # how many gpus to use
-DTK_ENV=""                  # where env.sh of dtk
-NCCL_ENV=""                 # where env.sh of nccl (requirements/nccl_wz/env.sh or requirements/nccl_zz/env.sh)
-HOST=""                     # hostname
-PORT=""                     # port id
-DATA_PATH=""                # path to oscar-1GB_head-llama2_text_document
-TOKENIZER_MODEL_PATH=""     # path to tokenizer.model
-CHECKPOINT_PATH=""          # path to ckpt
+GPUS="4"                                                          # how many gpus to use
+HOST="localhost"                                                  # hostname
+PORT="11451"                                                      # port id
+DATA_PATH="/data/datasets/oscar-1GB_head-llama2_text_document"    # path to oscar-1GB_head-llama2_text_document
+TOKENIZER_MODEL_PATH="/data/models/llama2/tokenizer.model"        # path to tokenizer.model
+CHECKPOINT_PATH="./ckpt"                                          # path to ckpt
 
 # Runs Llama2 7B model
-mpirun -np ${GPUS} --hostfile hostfile_llama2_7B \
+mpirun -np ${GPUS} --hostfile hostfile \
        --allow-run-as-root \
        --bind-to none \
        --mca plm_rsh_no_tree_spawn 1 \
        bash -c "
-       source ${DTK_ENV} && \
-       source ${NCCL_ENV} && \
-       ./train_llama2_7b_$((${GPUS} / 8))nodes.sh \
+       ./train_llama2_7b_1nodes.sh \
        ${HOST} \
        ${PORT} \
        --data_path=$DATA_PATH \
@@ -66,7 +66,7 @@ GPT_MODEL_ARGS=(
     --ffn-hidden-size 11008
     --num-attention-heads 32
     --max-position-embeddings 4096
-    --normalization LightopRMSNorm
+    --normalization RMSNorm
     --position-embedding-type rope
     --untie-embeddings-and-output-weights
 )
@@ -75,7 +75,7 @@ TRAINING_ARGS=(
     --transformer-impl local
     --use-legacy-models
     --micro-batch-size 1
-    --global-batch-size 256
+    --global-batch-size 64
     --train-iters 50
     --weight-decay 0.1
     --adam-beta1 0.9
@@ -159,4 +159,5 @@ elif [[ $profiling == "hip" ]]; then
 fi
 
 #for hygon cpu
-${MEGATRON_PATH}/requirements/launch_with_binding.sh ${LOCAL_RANK} ${APP}
\ No newline at end of file
+export CUDA_VISIBLE_DEVICES=4,5,6,7
+${APP}
\ No newline at end of file
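
For context on the --global-batch-size 64 change above: Megatron-LM asserts that the global batch size is divisible by micro-batch size times data-parallel size. A quick sanity check, assuming the four GPUs exposed by CUDA_VISIBLE_DEVICES=4,5,6,7 are used purely for data parallelism (an assumption; the tensor/pipeline parallel flags are not visible in these hunks):

```python
# Hedged sanity check: batch sizes are the values from the diff;
# data_parallel_size=4 is an assumption (one rank per visible GPU).
micro_batch_size = 1
global_batch_size = 64
data_parallel_size = 4

assert global_batch_size % (micro_batch_size * data_parallel_size) == 0
grad_accum_steps = global_batch_size // (micro_batch_size * data_parallel_size)
print(grad_accum_steps)  # 16 micro-batches accumulated per optimizer step on each rank
```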