Commit 475293db authored by wxj

A100 adaptation

parent cd1a6dfe
@@ -100,39 +100,39 @@ class CoreAdaptation(MegatronAdaptationABC):
from ..core.models.gpt.gpt_model import gpt_model_init_wrapper, gpt_model_forward
# GPT Model
MegatronAdaptation.register('megatron.core.models.gpt.gpt_model.GPTModel.__init__',
gpt_model_init_wrapper,
apply_wrapper=True)
MegatronAdaptation.register('megatron.core.models.gpt.gpt_model.GPTModel.forward',
gpt_model_forward)
# MegatronAdaptation.register('megatron.core.models.gpt.gpt_model.GPTModel.__init__',
# gpt_model_init_wrapper,
# apply_wrapper=True)
# MegatronAdaptation.register('megatron.core.models.gpt.gpt_model.GPTModel.forward',
# gpt_model_forward)
def patch_core_transformers(self):
from ..core import transformer_block_init_wrapper
from ..core.transformer.transformer_config import TransformerConfigPatch, MLATransformerConfigPatch
# Transformer block. If mtp_num_layers > 0, move final_layernorm outside
MegatronAdaptation.register('megatron.core.transformer.transformer_block.TransformerBlock.__init__',
transformer_block_init_wrapper)
# Transformer config
MegatronAdaptation.register('megatron.core.transformer.transformer_config.TransformerConfig',
TransformerConfigPatch)
MegatronAdaptation.register('megatron.core.transformer.transformer_config.MLATransformerConfig',
MLATransformerConfigPatch)
# Moe
# MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.topk_softmax_with_capacity',
# torch.compile(options={"triton.cudagraphs": True, "triton.cudagraph_trees": False}),
# MegatronAdaptation.register('megatron.core.transformer.transformer_block.TransformerBlock.__init__',
# transformer_block_init_wrapper)
# # Transformer config
# MegatronAdaptation.register('megatron.core.transformer.transformer_config.TransformerConfig',
# TransformerConfigPatch)
# MegatronAdaptation.register('megatron.core.transformer.transformer_config.MLATransformerConfig',
# MLATransformerConfigPatch)
# # Moe
# # MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.topk_softmax_with_capacity',
# # torch.compile(options={"triton.cudagraphs": True, "triton.cudagraph_trees": False}),
# # apply_wrapper=True)
# # MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.switch_load_balancing_loss_func',
# # torch.compile(options={"triton.cudagraphs": True, "triton.cudagraph_trees": False, "triton.cudagraph_support_input_mutation":True}),
# # apply_wrapper=True)
# MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.permute',
# torch.compile(mode='max-autotune-no-cudagraphs'),
# apply_wrapper=True)
# MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.switch_load_balancing_loss_func',
# torch.compile(options={"triton.cudagraphs": True, "triton.cudagraph_trees": False, "triton.cudagraph_support_input_mutation":True}),
# MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.unpermute',
# torch.compile(mode='max-autotune-no-cudagraphs'),
# apply_wrapper=True)
MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.permute',
torch.compile(mode='max-autotune-no-cudagraphs'),
apply_wrapper=True)
MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.unpermute',
torch.compile(mode='max-autotune-no-cudagraphs'),
apply_wrapper=True)
def patch_core_extentions(self):
import transformer_engine as te
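Note on the pattern being toggled in this hunk: a call such as MegatronAdaptation.register(path, patch, apply_wrapper=True) patches the object at the dotted path, and with apply_wrapper=True the patch acts as a wrapper around the original callable (torch.compile(mode='max-autotune-no-cudagraphs') called without a function returns exactly such a wrapper). The following is a minimal, hypothetical sketch of that mechanism for module-level functions only; apply_patch is an invented helper, not the project's actual implementation.

import importlib

import torch

def apply_patch(dotted_path, patch, apply_wrapper=False):
    # Hypothetical sketch: resolve "package.module.attr" and patch it in place.
    # Class attributes such as "GPTModel.forward" would need extra handling.
    module_path, attr_name = dotted_path.rsplit('.', 1)
    module = importlib.import_module(module_path)
    original = getattr(module, attr_name)
    # apply_wrapper=True: treat the patch as a decorator applied to the original.
    setattr(module, attr_name, patch(original) if apply_wrapper else patch)

# Assumed usage, mirroring the moe_utils.permute registration above:
# apply_patch('megatron.core.transformer.moe.moe_utils.permute',
#             torch.compile(mode='max-autotune-no-cudagraphs'),
#             apply_wrapper=True)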
@@ -152,31 +152,31 @@ class CoreAdaptation(MegatronAdaptationABC):
from ..core.tensor_parallel.cross_entropy import VocabParallelCrossEntropy
# VocabParallelEmbedding
MegatronAdaptation.register('megatron.core.tensor_parallel.layers.VocabParallelEmbedding.forward',
torch.compile(mode='max-autotune-no-cudagraphs'),
apply_wrapper=True)
# VocabParallelCrossEntropy
MegatronAdaptation.register('megatron.core.tensor_parallel.cross_entropy.VocabParallelCrossEntropy.calculate_predicted_logits',
VocabParallelCrossEntropy.calculate_predicted_logits)
# _VocabParallelCrossEntropy
MegatronAdaptation.register('megatron.core.tensor_parallel.cross_entropy._VocabParallelCrossEntropy.forward',
remove_origin_wrappers=True)
MegatronAdaptation.register('megatron.core.tensor_parallel.cross_entropy._VocabParallelCrossEntropy.forward',
torch.compile(mode='max-autotune-no-cudagraphs'),
apply_wrapper=True)
MegatronAdaptation.register('megatron.core.tensor_parallel.cross_entropy._VocabParallelCrossEntropy.forward',
staticmethod,
apply_wrapper=True)
# reduce_scatter_to_sequence_parallel_region
MegatronAdaptation.register('megatron.core.tensor_parallel.mappings.reduce_scatter_to_sequence_parallel_region',
torch._dynamo.disable,
apply_wrapper=True)
# reduce_from_tensor_model_parallel_region
MegatronAdaptation.register('megatron.core.tensor_parallel.mappings.reduce_from_tensor_model_parallel_region',
torch._dynamo.disable,
apply_wrapper=True)
# MegatronAdaptation.register('megatron.core.tensor_parallel.layers.VocabParallelEmbedding.forward',
# torch.compile(mode='max-autotune-no-cudagraphs'),
# apply_wrapper=True)
# # VocabParallelCrossEntropy
# MegatronAdaptation.register('megatron.core.tensor_parallel.cross_entropy.VocabParallelCrossEntropy.calculate_predicted_logits',
# VocabParallelCrossEntropy.calculate_predicted_logits)
# # _VocabParallelCrossEntropy
# MegatronAdaptation.register('megatron.core.tensor_parallel.cross_entropy._VocabParallelCrossEntropy.forward',
# remove_origin_wrappers=True)
# MegatronAdaptation.register('megatron.core.tensor_parallel.cross_entropy._VocabParallelCrossEntropy.forward',
# torch.compile(mode='max-autotune-no-cudagraphs'),
# apply_wrapper=True)
# MegatronAdaptation.register('megatron.core.tensor_parallel.cross_entropy._VocabParallelCrossEntropy.forward',
# staticmethod,
# apply_wrapper=True)
# # reduce_scatter_to_sequence_parallel_region
# MegatronAdaptation.register('megatron.core.tensor_parallel.mappings.reduce_scatter_to_sequence_parallel_region',
# torch._dynamo.disable,
# apply_wrapper=True)
# # reduce_from_tensor_model_parallel_region
# MegatronAdaptation.register('megatron.core.tensor_parallel.mappings.reduce_from_tensor_model_parallel_region',
# torch._dynamo.disable,
# apply_wrapper=True)
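Aside on the two mappings registrations above: wrapping the sequence-parallel and tensor-parallel reduction helpers with torch._dynamo.disable (via apply_wrapper=True) tells TorchDynamo not to trace into the collective calls when their callers are compiled, avoiding graph capture problems around torch.distributed ops. A small, hypothetical stand-in illustrating the same wrapper usage (all_reduce_across_tp_group is an invented name, not the Megatron function):

import torch
import torch.distributed as dist

@torch._dynamo.disable
def all_reduce_across_tp_group(tensor, group=None):
    # Invented stand-in for reduce_from_tensor_model_parallel_region:
    # an in-place all-reduce that Dynamo is told to skip tracing.
    if dist.is_initialized() and dist.get_world_size(group) > 1:
        dist.all_reduce(tensor, group=group)
    return tensor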
# flux
if int(os.getenv("USE_FLUX_OVERLAP", "0")):
@@ -186,12 +186,12 @@ class CoreAdaptation(MegatronAdaptationABC):
)
from ..core.models.gpt.gpt_layer_specs import get_gpt_layer_with_flux_spec
MegatronAdaptation.register("megatron.core.extensions.transformer_engine.TEColumnParallelLinear",
FluxColumnParallelLinear)
MegatronAdaptation.register("megatron.core.extensions.transformer_engine.TERowParallelLinear",
FluxRowParallelLinear)
MegatronAdaptation.register("megatron.core.models.gpt.gpt_layer_specs.get_gpt_layer_with_transformer_engine_spec",
get_gpt_layer_with_flux_spec)
# MegatronAdaptation.register("megatron.core.extensions.transformer_engine.TEColumnParallelLinear",
# FluxColumnParallelLinear)
# MegatronAdaptation.register("megatron.core.extensions.transformer_engine.TERowParallelLinear",
# FluxRowParallelLinear)
# MegatronAdaptation.register("megatron.core.models.gpt.gpt_layer_specs.get_gpt_layer_with_transformer_engine_spec",
# get_gpt_layer_with_flux_spec)
def patch_training(self):
from ..training.tokenizer import build_tokenizer
@@ -209,9 +209,9 @@ class CoreAdaptation(MegatronAdaptationABC):
MegatronAdaptation.register('megatron.training.initialize._compile_dependencies',
_compile_dependencies)
# Add fixed seed
MegatronAdaptation.register('megatron.training.initialize._set_random_seed',
_set_random_seed)
# # Add fixed seed
# MegatronAdaptation.register('megatron.training.initialize._set_random_seed',
# _set_random_seed)
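Since the body of the fixed-seed override is not shown in this diff, the following is a generic, hypothetical sketch of what such a helper usually does (the project's _set_random_seed may differ, e.g. by offsetting the seed per rank):

import random

import numpy as np
import torch

def set_fixed_seed(seed=1234):
    # Hypothetical helper: pin every RNG source for reproducible runs.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)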
# add trace_handler
MegatronAdaptation.register('megatron.training.training.train',
@@ -239,24 +239,24 @@ class LegacyAdaptation(MegatronAdaptationABC):
)
from ..legacy.model.utils import get_norm
# ParallelMLP
MegatronAdaptation.register('megatron.legacy.model.transformer.ParallelMLP.__init__',
parallel_mlp_init_wrapper,
apply_wrapper=True)
# ParallelAttention
MegatronAdaptation.register('megatron.legacy.model.transformer.ParallelAttention.__init__',
parallel_attention_init_wrapper,
apply_wrapper=True)
MegatronAdaptation.register('megatron.legacy.model.transformer.ParallelAttention.forward',
ParallelAttentionPatch.forward)
# rms_norm.RMSNorm
MegatronAdaptation.register('megatron.legacy.model.rms_norm.RMSNorm.forward',
torch.compile(mode="max-autotune-no-cudagraphs"),
apply_wrapper=True)
MegatronAdaptation.register('megatron.legacy.model.utils.get_norm',
get_norm)
# # ParallelMLP
# MegatronAdaptation.register('megatron.legacy.model.transformer.ParallelMLP.__init__',
# parallel_mlp_init_wrapper,
# apply_wrapper=True)
# # ParallelAttention
# MegatronAdaptation.register('megatron.legacy.model.transformer.ParallelAttention.__init__',
# parallel_attention_init_wrapper,
# apply_wrapper=True)
# MegatronAdaptation.register('megatron.legacy.model.transformer.ParallelAttention.forward',
# ParallelAttentionPatch.forward)
# # rms_norm.RMSNorm
# MegatronAdaptation.register('megatron.legacy.model.rms_norm.RMSNorm.forward',
# torch.compile(mode="max-autotune-no-cudagraphs"),
# apply_wrapper=True)
# MegatronAdaptation.register('megatron.legacy.model.utils.get_norm',
# get_norm)
MegatronAdaptation.execute()
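Several registrations above (core and legacy alike) wrap small element-wise forwards such as RMSNorm.forward with torch.compile(mode="max-autotune-no-cudagraphs"). A self-contained, hypothetical illustration of the same idea on a stand-in module (SimpleRMSNorm is not Megatron's implementation):

import torch

class SimpleRMSNorm(torch.nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.ones(hidden_size))
        self.eps = eps

    def forward(self, x):
        # RMS normalization followed by a learned scale.
        variance = x.pow(2).mean(-1, keepdim=True)
        return self.weight * x * torch.rsqrt(variance + self.eps)

# Mirror the RMSNorm.forward registration: compile only the forward method.
SimpleRMSNorm.forward = torch.compile(mode="max-autotune-no-cudagraphs")(SimpleRMSNorm.forward)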
@@ -6,24 +6,20 @@ do
done
# These variables need to be modified
GPUS="" # how many gpus to use
DTK_ENV="" # path to the dtk env.sh
NCCL_ENV="" # path to the nccl env.sh (requirements/nccl_wz/env.sh or requirements/nccl_zz/env.sh)
HOST="" # hostname
PORT="" # port id
DATA_PATH="" # path to oscar-1GB_head-llama2_text_document
TOKENIZER_MODEL_PATH="" # path to tokenizer.model
CHECKPOINT_PATH="" # path to ckpt
GPUS="4" # how many gpus to use
HOST="localhost" # hostname
PORT="11451" # port id
DATA_PATH="/data/datasets/oscar-1GB_head-llama2_text_document" # path to oscar-1GB_head-llama2_text_document
TOKENIZER_MODEL_PATH="/data/models/llama2/tokenizer.model" # path to tokenizer.model
CHECKPOINT_PATH="./ckpt" # path to ckpt
# Runs Llama2 7B model
mpirun -np ${GPUS} --hostfile hostfile_llama2_7B \
mpirun -np ${GPUS} --hostfile hostfile \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
bash -c "
source ${DTK_ENV} && \
source ${NCCL_ENV} && \
./train_llama2_7b_$((${GPUS} / 8))nodes.sh \
./train_llama2_7b_1nodes.sh \
${HOST} \
${PORT} \
--data_path=$DATA_PATH \
......
@@ -66,7 +66,7 @@ GPT_MODEL_ARGS=(
--ffn-hidden-size 11008
--num-attention-heads 32
--max-position-embeddings 4096
--normalization LightopRMSNorm
--normalization RMSNorm
--position-embedding-type rope
--untie-embeddings-and-output-weights
)
@@ -75,7 +75,7 @@ TRAINING_ARGS=(
--transformer-impl local
--use-legacy-models
--micro-batch-size 1
--global-batch-size 256
--global-batch-size 64
--train-iters 50
--weight-decay 0.1
--adam-beta1 0.9
@@ -159,4 +159,5 @@ elif [[ $profiling == "hip" ]]; then
fi
# for Hygon CPU
${MEGATRON_PATH}/requirements/launch_with_binding.sh ${LOCAL_RANK} ${APP}
\ No newline at end of file
export CUDA_VISIBLE_DEVICES=4,5,6,7
${APP}
\ No newline at end of file