Commit 688448db authored by silencealiang

Update code

parent a02a5490
Pipeline #2503 passed with stage
File mode changed from 100644 to 100755 (6 files)
# WARNING: Yaml configs is currently an experimental feature
language_model:
  # model architecture
  num_layers: 24
  hidden_size: 1024
  num_attention_heads: 16
  num_query_groups: null
  ffn_hidden_size: null
  kv_channels: null
  hidden_dropout: 0.0
  attention_dropout: 0.0
  fp32_residual_connection: False
  apply_residual_connection_post_layernorm: False
  layernorm_epsilon: 1.e-5
  layernorm_zero_centered_gamma: True
  add_bias_linear: False
  bias_activation_fusion: False
  add_qkv_bias: False
  gated_linear_unit: False
  activation_func: swiglu
  num_moe_experts: null
  rotary_interleaved: False
  window_size: null

  # initialization
  init_method: null
  init_method_std: 0.02
  output_layer_init_method: null

  # mixed-precision
  apply_query_key_layer_scaling: False
  attention_softmax_in_fp32: False

  # fusion
  bias_swiglu_fusion: True
  masked_softmax_fusion: True
  persist_layer_norm: False
  memory_efficient_layer_norm: False
  bias_dropout_fusion: True
  apply_rope_fusion: True

  # activation recomputation
  recompute_granularity: null
  recompute_method: null
  recompute_num_layers: null
  distribute_saved_activations: null

  # fp8 related
  fp8: null
  fp8_margin: 0
  fp8_interval: 1
  fp8_amax_history_len: 1
  fp8_amax_compute_algo: "most_recent"
  fp8_wgrad: True

  # miscellaneous
  clone_scatter_output_in_embedding: True
  normalization: "LayerNorm" # alt value supported by TE: "RMSNorm"

  # MoE related
  moe_router_load_balancing_type: "aux_loss"
  moe_router_topk: 2
-  moe_router_topk_limited_devices: null
+  moe_router_group_topk: null
+  moe_router_num_groups: null
  moe_grouped_gemm: False
  moe_aux_loss_coeff: 0 # 1e-2 would be a good start value for load balance loss.
  moe_z_loss_coeff: null # 1e-3 would be a good start value for z-loss
  moe_input_jitter_eps: null
  moe_token_dropping: False

model_parallel:
  # Model parallelism
  tensor_model_parallel_size: 1
  context_parallel_size: 1
  pipeline_model_parallel_size: 1
  virtual_pipeline_model_parallel_size: null
  sequence_parallel: True
  expert_model_parallel_size: 1

  # Initialization
  perform_initialization: True
  use_cpu_initialization: null

  # Training
  fp16: False
  bf16: True
  params_dtype: null # Set from above arguments for core
  timers: null

  # Optimizations
  gradient_accumulation_fusion: True
  async_tensor_model_parallel_allreduce: True
  tp_comm_overlap: False

  # Debug Options
  tp_comm_split_ag: True
  tp_comm_atomic_ag: True
  tp_comm_split_rs: True
  tp_comm_atomic_rs: True
  tp_comm_bulk_wgrad: True
  tp_comm_bulk_dgrad: True

  # Parallelism
  finalize_model_grads_func: null

  # Pipeline Parallel
  pipeline_dtype: null
  grad_scale_func: null
  enable_autocast: False
  autocast_dtype: null
  variable_seq_lengths: False
  num_microbatches_with_partial_activation_checkpoints: null
  overlap_p2p_comm: False
  batch_p2p_comm: True
  batch_p2p_sync: True
  use_ring_exchange_p2p: False
  deallocate_pipeline_outputs: False
  no_sync_func: null
  grad_sync_func: null
  param_sync_func: null
  pipeline_model_parallel_split_rank: null

  # CPU Offloading
  cpu_offloading: False
  cpu_offloading_num_layers: 0
  _cpu_offloading_context: null
  cpu_offloading_weights: False
  cpu_offloading_activations: True

  # Timing
  barrier_with_L1_time: True

# training:
use_legacy_models: False
spec: null
micro_batch_size: 2
global_batch_size: 128
rampup_batch_size: [32, 32, 65324160]
check_for_nan_in_loss_and_grad: True
num_layers_per_virtual_pipeline_stage: null

encoder_num_layers: null
decoder_num_layers: null
rotary_seq_len_interpolation_factor: null
add_position_embedding: False
make_vocab_size_divisible_by: 128
group_query_attention: False

exit_signal_handler: False
exit_duration_in_mins: null
exit_interval: null

untie_embeddings_and_output_weights: True
position_embedding_type: rope
rotary_percent: 0.5
openai_gelu: False
squared_relu: False
swiglu: True
onnx_safe: null
bert_binary_head: True
max_position_embeddings: 4096

transformer_impl: local
use_flash_attn: False
seed: 1234
data_parallel_random_init: False

# Optimizer
optimizer: adam
lr: 2.5e-4
lr_decay_style: cosine
lr_decay_iters: null
lr_decay_samples: 255126953
lr_warmup_fraction: null
lr_warmup_iters: 0
lr_warmup_samples: 81381
lr_warmup_init: 0.0
min_lr: 2.5e-5
weight_decay: 0.1
start_weight_decay: null
end_weight_decay: null
weight_decay_incr_style: constant
clip_grad: 1.0
adam_beta1: 0.9
adam_beta2: 0.95
adam_eps: 1.e-08
sgd_momentum: 0.9
override_opt_param_scheduler: False
use_checkpoint_opt_param_scheduler: False

# checkpointing arguments
save: null
save_interval: 20000
no_save_optim: null
no_save_rng: null
load: null
no_load_optim: null
no_load_rng: null
finetune: False
use_checkpoint_args: False
exit_on_missing_checkpoint: False

# loss arguments
loss_scale: null
initial_loss_scale: 4294967296
min_loss_scale: 1.0
loss_scale_window: 1000
hysteresis: 2
accumulate_allreduce_grads_in_fp32: False
fp16_lm_cross_entropy: False

# distributed arguments
distributed_backend: nccl
distributed_timeout_minutes: 10
overlap_grad_reduce: False
align_grad_reduce: True
overlap_param_gather: False
align_param_gather: False
scatter_gather_tensors_in_pipeline: True
local_rank: null
lazy_mpu_init: null
empty_unused_memory_level: 0
standalone_embedding_stage: False
use_distributed_optimizer: False
nccl_communicator_config_path: null

train_iters: null
eval_iters: 32
eval_interval: 2000
skip_train: False

adlr_autoresume: False
adlr_autoresume_interval: 1000

# garbage collection
manual_gc: False
manual_gc_interval: 0
manual_gc_eval: True

tp_comm_overlap_cfg: null

#data
data_path: null
split: '99,1,0'
train_data_path: null
valid_data_path: null
test_data_path: null
data_cache_path: null
mock_data: False
vocab_size: null
vocab_file: null
merge_file: null
vocab_extra_ids: 0
seq_length: 4096
encoder_seq_length: null
decoder_seq_length: null
retriever_seq_length: 256
sample_rate: 1.0
mask_prob: 0.15
short_seq_prob: 0.1
num_workers: 2
tokenizer_type: GPTSentencePieceTokenizer
tokenizer_model: null
reset_position_ids: False
reset_attention_mask: False
eod_mask_loss: False
train_samples: 268554688
dataloader_type: null

#profile:
profile: False
profile_ranks: [0]
profile_step_end: 12
profile_step_start: 10

#logging:
log_params_norm: True
log_num_zeros_in_grad: True
log_throughput: False
log_progress: False
timing_log_level: 0
timing_log_option: minmax
tensorboard_log_interval: 1
tensorboard_queue_size: 1000
log_timers_to_tensorboard: False
log_validation_ppl_to_tensorboard: False
log_memory_to_tensorboard: False
log_world_size_to_tensorboard: False
log_loss_scale_to_tensorboard: True
wandb_project: ''
wandb_exp_name: ''
wandb_save_dir: ''
enable_one_logger: True
one_logger_project: megatron-lm
one_logger_run_name: null
log_interval: 100
tensorboard_dir: null
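For a quick sanity check before wiring a config like the one above into a launch, a minimal sketch (the file name gpt_config.yaml is illustrative, and PyYAML is assumed to be installed):

# Minimal sketch: load the experimental YAML config and print a few fields.
# "gpt_config.yaml" is a hypothetical path for the config shown above.
import yaml

with open("gpt_config.yaml") as f:
    cfg = yaml.safe_load(f)

lm = cfg["language_model"]
mp = cfg["model_parallel"]
print("layers:", lm["num_layers"], "hidden:", lm["hidden_size"], "heads:", lm["num_attention_heads"])
print("tp:", mp["tensor_model_parallel_size"], "pp:", mp["pipeline_model_parallel_size"])
print("global batch:", cfg["global_batch_size"], "seq length:", cfg["seq_length"])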
@@ -7,10 +7,10 @@ do
    fi
done

mpirun -np 8 --allow-run-as-root \
-    train_GPT-MOE_567B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1
+    train_gpt_567B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1
wait

rm -rf CKPT
rm -rf mixtral_dataset/my-mixtral_text_document
\ No newline at end of file
@@ -7,13 +7,13 @@ do
    fi
done

-mpirun -np 16 --hostfile mixtralnodes \
+mpirun -np 512 --hostfile hostfile_gpt_567B \
    --allow-run-as-root \
    --bind-to none \
    --mca plm_rsh_no_tree_spawn 1 \
-    train_mixtral_8x7B_2nodes.sh node021 --profiling=$profiling > output.log 2>&1
+    train_gpt_567B_multinodes.sh node002 --profiling=$profiling > output.log 2>&1
wait

rm -rf CKPT
#rm -rf mixtral_dataset/my-mixtral_text_document
\ No newline at end of file
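The multi-node launch above relies on an OpenMPI hostfile (hostfile_gpt_567B) listing the worker nodes. A small sketch that writes one for 64 nodes with 8 slots each, matching mpirun -np 512; the node names are placeholders for the real cluster:

# Sketch: generate an OpenMPI-style hostfile for 64 nodes x 8 GPUs (= 512 ranks).
# Hostnames "node001".."node064" are hypothetical.
with open("hostfile_gpt_567B", "w") as f:
    for i in range(1, 65):
        f.write(f"node{i:03d} slots=8\n")  # slots=8: eight MPI ranks (one per GPU) per node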
#!/bin/bash
# Runs the "175B" parameter model

export CUDA_DEVICE_MAX_CONNECTIONS=1

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NUM_NODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))

CHECKPOINT_PATH=$1 #<Specify path>
TENSORBOARD_LOGS_PATH=$2 #<Specify path>
VOCAB_FILE=$3 #<Specify path to file>/gpt2-vocab.json
MERGE_FILE=$4 #<Specify path to file>/gpt2-merges.txt
DATA_PATH=$5 #<Specify path and file prefix>_text_document

DISTRIBUTED_ARGS=(
    --nproc_per_node $GPUS_PER_NODE
    --nnodes $NUM_NODES
    --master_addr $MASTER_ADDR
    --master_port $MASTER_PORT
)

GPT_MODEL_ARGS=(
    --num-layers 96
    --hidden-size 12288
    --num-attention-heads 96
    --seq-length 2048
    --max-position-embeddings 2048
    --attention-backend auto # Can use (flash/fused/unfused/local)
)

TRAINING_ARGS=(
    --micro-batch-size 1
    --global-batch-size 1536
    --rampup-batch-size 16 16 5859375
    --train-iters 500000
    --weight-decay 0.1
    --adam-beta1 0.9
    --adam-beta2 0.95
    --init-method-std 0.006
    --clip-grad 1.0
    --fp16
    --lr 6.0e-5
    --lr-decay-style cosine
    --min-lr 6.0e-6
    --lr-warmup-fraction .001
    --lr-decay-iters 430000
)

MODEL_PARALLEL_ARGS=(
    --tensor-model-parallel-size 8
    --pipeline-model-parallel-size 16
)

DATA_ARGS=(
    --data-path $DATA_PATH
    --vocab-file $VOCAB_FILE
    --merge-file $MERGE_FILE
    --split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
    --log-interval 100
    --save-interval 10000
    --eval-interval 1000
    --save $CHECKPOINT_PATH
    --load $CHECKPOINT_PATH
    --eval-iters 10
    --tensorboard-dir $TENSORBOARD_LOGS_PATH
)

torchrun ${DISTRIBUTED_ARGS[@]} pretrain_gpt.py \
    ${GPT_MODEL_ARGS[@]} \
    ${TRAINING_ARGS[@]} \
    ${MODEL_PARALLEL_ARGS[@]} \
    ${DATA_ARGS[@]} \
    ${EVAL_AND_LOGGING_ARGS[@]}
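As a quick consistency check on the parallel layout above, a short sketch of the arithmetic; the node count used here is an illustrative value, since the script ships with NUM_NODES=1 and expects it to be changed for multi-node runs:

# Tensor-parallel 8 x pipeline-parallel 16 needs at least 128 GPUs per model replica;
# whatever is left over in the world size becomes data parallelism.
TP, PP = 8, 16
GPUS_PER_NODE = 8
num_nodes = 16                      # example value, not the script default of 1
world_size = GPUS_PER_NODE * num_nodes
assert world_size % (TP * PP) == 0, "world size must be divisible by TP*PP"
dp = world_size // (TP * PP)
print(f"GPUs per model replica: {TP * PP}, data-parallel degree: {dp}")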
@@ -4,18 +4,23 @@ for para in $*
do
    if [[ $para == --profiling* ]];then
        profiling=${para#*=}
+       export GPU_FLUSH_ON_EXECUTION=1
+       export HIP_DIRECT_DISPATCH=0
    fi
done

+# Runs GPT 567B model
source /opt/dtk/env.sh
-# Runs Mixtral 8x7B model

+# default env
+CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
+MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
+export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH
+export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
+# nccl env
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
@@ -23,9 +28,10 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
-#export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
+export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
+# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
-export GLOG_minloglevel=3

RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
@@ -96,7 +102,6 @@ TRAINING_ARGS=(
    --bf16
    --overlap-param-gather
    --overlap-grad-reduce
-   #--tp-comm-overlap
)

TORCH_PROFIE_ARGS=(
@@ -104,18 +109,10 @@ TORCH_PROFIE_ARGS=(
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 3
    --profile-step-end 4
-   --profile-dir torch_prof_gpt_1nodes
+   --profile-dir torch_prof_gpt_1nodes_tp2-pp1-ep8-ep_tp1
    --use-pytorch-profiler
)

-HIP_PROFIE_ARGS=(
-   --profile
-   --profile-ranks 0 1 2 3 4 5 6 7
-   --profile-step-start 4
-   --profile-step-end 5
-   --use-hip-profiler
-)

MODEL_PARALLEL_ARGS=(
    --tensor-model-parallel-size 2
    --pipeline-model-parallel-size 1
@@ -157,10 +154,6 @@ APP="python3 -u pretrain_gpt.py \
if [[ $profiling == "torch" ]]; then
    APP+=" ${TORCH_PROFIE_ARGS[@]}"
-elif [[ $profiling == "hip" ]]; then
-    mkdir -p hip_prof_data
-    APP+=" ${HIP_PROFIE_ARGS[@]}"
-    APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi

#for hygon cpu
@@ -205,4 +198,4 @@ case ${LOCAL_RANK} in
        ${APP}
        #numactl --cpunodebind=7 --membind=7 ${APP}
        ;;
esac
\ No newline at end of file
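The --use-pytorch-profiler flags above hand the step window to Megatron's built-in profiling hooks. Purely as a standalone illustration of what that window corresponds to (not the script's actual code path), the same steps could be captured directly with torch.profiler; train_step below is a stand-in workload, not a real training iteration:

# Rough illustration of profiling the step range implied by
# --profile-step-start 3 / --profile-step-end 4 into the --profile-dir directory.
import torch
from torch.profiler import ProfilerActivity, profile, schedule, tensorboard_trace_handler

def train_step():
    # stand-in workload; in the real script this is one Megatron training iteration
    x = torch.randn(1024, 1024)
    (x @ x).sum().item()

prof = profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    schedule=schedule(wait=3, warmup=0, active=1),  # skip 3 steps, record the next one
    on_trace_ready=tensorboard_trace_handler("torch_prof_gpt_1nodes_tp2-pp1-ep8-ep_tp1"),
)
prof.start()
for step in range(10):
    train_step()
    prof.step()  # advance the profiler schedule once per training step
prof.stop()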
@@ -4,18 +4,23 @@ for para in $*
do
    if [[ $para == --profiling* ]];then
        profiling=${para#*=}
+       export GPU_FLUSH_ON_EXECUTION=1
+       export HIP_DIRECT_DISPATCH=0
    fi
done

+# Runs GPT 567B model
source /opt/dtk/env.sh
-# Runs Mixtral 8x7B model

+# default env
+CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
+MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
+export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH
+export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
+# nccl env
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
@@ -23,9 +28,10 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
-#export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
+export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
+# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
-export GLOG_minloglevel=3

RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
@@ -49,7 +55,7 @@ MODEL_ARGS=(
    --disable-bias-linear
    --seq-length 8192
    --max-position-embeddings 32768
-   --num-layers 64
+   --num-layers 32 #64
    --hidden-size 8192
    --ffn-hidden-size 32768
    --num-attention-heads 64
@@ -72,7 +78,7 @@ MOE_ARGS=(
    --moe-token-dispatcher-type alltoall
    --moe-expert-capacity-factor 0.5
    --moe-pad-expert-input-to-capacity
-   --moe-grouped-gemm
+   #--moe-grouped-gemm
)

DATA_ARGS=(
@@ -84,7 +90,7 @@ DATA_ARGS=(
TRAINING_ARGS=(
    --micro-batch-size 1
-   --global-batch-size 4096
+   --global-batch-size 1024
    --lr 1e-4
    --train-iters 10
    --lr-decay-iters 320000
@@ -96,7 +102,6 @@ TRAINING_ARGS=(
    --bf16
    --overlap-param-gather
    --overlap-grad-reduce
-   #--tp-comm-overlap
)

TORCH_PROFIE_ARGS=(
@@ -104,23 +109,16 @@ TORCH_PROFIE_ARGS=(
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 3
    --profile-step-end 4
-   --profile-dir torch_prof_gpt
+   --profile-dir torch_prof_gpt_64nodes_tp2-pp16-ep16-ep_tp1-cp2
    --use-pytorch-profiler
)

-HIP_PROFIE_ARGS=(
-   --profile
-   --profile-ranks 0 1 2 3 4 5 6 7
-   --profile-step-start 4
-   --profile-step-end 5
-   --use-hip-profiler
-)

MODEL_PARALLEL_ARGS=(
    --tensor-model-parallel-size 2
    --pipeline-model-parallel-size 16
    --expert-model-parallel-size 16
    --expert-tensor-parallel-size 1
+   --context-parallel-size 2
    --use-distributed-optimizer
    --sequence-parallel
)
@@ -157,10 +155,6 @@ APP="python3 -u pretrain_gpt.py \
if [[ $profiling == "torch" ]]; then
    APP+=" ${TORCH_PROFIE_ARGS[@]}"
-elif [[ $profiling == "hip" ]]; then
-    mkdir -p hip_prof_data
-    APP+=" ${HIP_PROFIE_ARGS[@]}"
-    APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi

#for hygon cpu
...
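A quick arithmetic check of the 512-rank layout used by this multi-node variant, with the numbers taken from the mpirun command, the profile directory name, and MODEL_PARALLEL_ARGS above:

# 64 nodes x 8 GPUs = 512 ranks; TP=2, PP=16, CP=2 leave an 8-way data-parallel dimension,
# and dp * cp = 16 lines up with the 16-way expert parallelism (expert-tensor-parallel-size 1).
world = 64 * 8
tp, pp, cp, ep = 2, 16, 2, 16
dp = world // (tp * pp * cp)
print("world size:", world)        # 512, matches mpirun -np 512
print("data-parallel size:", dp)   # 8
print("dp * cp:", dp * cp)         # 16, matches --expert-model-parallel-size 16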
import os
from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
    InferenceWrapperConfig,
)
from pretrain_gpt import model_provider
import torch
import sys
import time
import tqdm
import warnings
from argparse import Namespace
from megatron.core.inference.engines.abstract_engine import AbstractEngine
from megatron.core.inference.engines.mcore_engine import MCoreEngine
from megatron.core.inference.sampling_params import SamplingParams
from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import (
    GPTInferenceWrapper,
)
from megatron.core.inference.inference_request import InferenceRequest
from megatron.core.inference.text_generation_controllers.text_generation_controller import (
    TextGenerationController,
)
from megatron.core.transformer.module import MegatronModule

sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))
)

from megatron.training import get_args
from megatron.training import get_tokenizer
from megatron.training.checkpointing import load_checkpoint
from megatron.core import mpu
from megatron.training.initialize import initialize_megatron
from megatron.training import get_model
import asyncio
from typing import AsyncIterator, List


def add_text_generate_args(parser):
    """Text generation arguments."""
    group = parser.add_argument_group(title='text generation')

    group.add_argument("--temperature", type=float, default=1.0, help='Sampling temperature.')
    group.add_argument("--top_k", type=int, default=1, help='Top k sampling.')
    group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.')
    group.add_argument(
        "--return-log-probs",
        action='store_true',
        default=False,
        help='Return the log probabilities of the final output tokens',
    )
    group.add_argument(
        "--num-tokens-to-generate",
        type=int,
        default=30,
        help='Number of tokens to generate for each prompt',
    )
    group.add_argument(
        "--prompts",
        metavar='N',
        type=str,
        nargs='+',
        help='Input prompts with each prompt within quotes and separated by space',
    )
    group.add_argument(
        "--max-batch-size", type=int, default=8, dest="inference_max_requests",
        help='Max number of prompts to process at once'
    )
    group.add_argument("--stream", action="store_true", default=False, help="Stream output tokens")
    return parser


def get_inference_engine(args: Namespace, model: MegatronModule) -> AbstractEngine:
    """Utility to get the relevant backend for running inference

    This function will automatically choose the TRTLLMBackend when possible, and if not revert to the Mcore backend if the user does not specify any backend. The TRT LLM backend is not implemented yet.

    Args:
        args (Namespace): The user arguments parsed from command line
        model (MegatronModule): The megatron model.

    Returns:
        AbstractBackend: The chosen backend
    """
    tokenizer = get_tokenizer()

    inference_wrapper_config = InferenceWrapperConfig(
        hidden_size=args.hidden_size,
        inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold,
        fp32_residual_connection=args.fp32_residual_connection,
        params_dtype=args.params_dtype,
        padded_vocab_size=args.padded_vocab_size,
        inference_max_requests=args.inference_max_requests,
        inference_max_seq_length=args.inference_max_seq_length,
    )

    inference_wrapped_model = GPTInferenceWrapper(model, inference_wrapper_config)
    text_generation_controller = TextGenerationController(
        inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer
    )
    return MCoreEngine(text_generation_controller=text_generation_controller)


async def generate(
    inference_engine: MCoreEngine,
    sampling_params: SamplingParams,
    prompts: List[str],
) -> List[InferenceRequest]:
    async def collect_stream(prompt, request_id, stream_generator):
        print(f"Request {request_id}: {prompt}", end="", flush=True)
        prev_idx = 0
        async for output in stream_generator:
            print(output.generated_text[prev_idx:], end="", flush=True)
            prev_idx = len(output.generated_text)
        print()

    request_ids: List[str] = [
        inference_engine.add_request(
            prompt=prompt, inference_parameters=sampling_params, streaming=True
        )
        for prompt in prompts
    ]
    stream_generators = [inference_engine.get_stream_generator(request_id) for request_id in request_ids]

    tasks = [
        asyncio.create_task(collect_stream(prompt, request_id, stream_generator))
        for (prompt, request_id, stream_generator) in zip(prompts, request_ids, stream_generators)
    ]

    await inference_engine.run_engine_async()
    await asyncio.gather(*tasks)

    results: List[InferenceRequest] = [
        inference_engine.scheduler.completed_request_pool[request_id] for request_id in request_ids
    ]
    return results


def main():
    """Main program."""

    # Note: The default args passed here can be overwritten by using appropriate params (check arguments.py file)
    # Micro batch size is not needed to be set by user. (It is calculated based on inference-batch-times-seqlen-threshold argument)
    initialize_megatron(
        extra_args_provider=add_text_generate_args,
        args_defaults={
            'no_load_rng': True,
            'no_load_optim': True,
            'micro_batch_size': 1,
            'exit_on_missing_checkpoint': True,
        },
    )

    # Set up model and load checkpoint
    model = get_model(model_provider, wrap_with_ddp=False)
    load_checkpoint(model, None, None)
    model = model[0]

    args = get_args()

    inference_engine = get_inference_engine(args, model)

    sampling_params = SamplingParams(
        temperature=args.temperature,
        top_k=args.top_k,
        top_p=args.top_p,
        return_log_probs=args.return_log_probs,
        num_tokens_to_generate=args.num_tokens_to_generate,
    )

    if args.enable_cuda_graph:
        print(f"Running warmup for CUDA graphs...")
        inference_engine.generate(
            prompts=args.prompts, sampling_params=sampling_params
        )

    start_time = time.perf_counter()
    if args.stream:
        results: List[InferenceRequest] = asyncio.run(
            generate(inference_engine, sampling_params, args.prompts)
        )
    else:
        results: List[InferenceRequest] = inference_engine.generate(
            prompts=args.prompts, sampling_params=sampling_params,
        )
    end_time = time.perf_counter()
    latency = end_time - start_time

    if torch.distributed.get_rank() == 0:
        for idx, result in enumerate(results):
            print(f' \n------------- RESULT FOR PROMPT {idx} --------------- ')
            result = {
                'id': result.request_id,
                'input_prompt': result.prompt,
                'generated_text': result.generated_text,
                'generated_tokens': result.generated_tokens,
                'latency': latency,
            }
            print(result)

    torch.distributed.destroy_process_group()


if __name__ == "__main__":
    main()
#!/bin/bash
# This example will start serving the Llama3.1-8B model

export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_APPLY_QK_LAYER_SCALING=0

DISTRIBUTED_ARGS="--nproc_per_node 1 \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr 0.0.0.0 \
                  --master_port 6000"

# Ensure CHECKPOINT and TOKENIZER_MODEL are provided
if [ -z "$1" ] || [ -z "$2" ]; then
    echo "Error: You must provide CHECKPOINT and TOKENIZER_MODEL as command-line arguments."
    echo "Usage: $0 /path/to/checkpoint /path/to/tokenizer_model"
    exit 1
fi

# Assign command-line arguments to variables
CHECKPOINT=$1
TOKENIZER_MODEL=$2

pip install flask-restful

torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
    --use-checkpoint-args \
    --disable-bias-linear \
    --tokenizer-type HuggingFaceTokenizer \
    --tokenizer-model ${TOKENIZER_MODEL} \
    --transformer-impl transformer_engine \
    --normalization RMSNorm \
    --group-query-attention \
    --num-query-groups 8 \
    --no-masked-softmax-fusion \
    --attention-softmax-in-fp32 \
    --attention-dropout 0.0 \
    --hidden-dropout 0.0 \
    --untie-embeddings-and-output-weights \
    --position-embedding-type rope \
    --rotary-percent 1.0 \
    --rotary-base 500000 \
    --use-rope-scaling \
    --use-rotary-position-embeddings \
    --swiglu \
    --tensor-model-parallel-size 1 \
    --pipeline-model-parallel-size 1 \
    --num-layers 32 \
    --hidden-size 4096 \
    --ffn-hidden-size 14336 \
    --load ${CHECKPOINT} \
    --num-attention-heads 32 \
    --max-position-embeddings 131072 \
    --bf16 \
    --micro-batch-size 1 \
    --seq-length 8192
#!/bin/bash
# This example will start serving the Llama3-8B model

export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_APPLY_QK_LAYER_SCALING=0

DISTRIBUTED_ARGS="--nproc_per_node 1 \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr 0.0.0.0 \
                  --master_port 6000"

# Ensure CHECKPOINT and TOKENIZER_MODEL are provided
if [ -z "$1" ] || [ -z "$2" ]; then
    echo "Error: You must provide CHECKPOINT and TOKENIZER_MODEL as command-line arguments."
    echo "Usage: $0 /path/to/checkpoint /path/to/tokenizer_model"
    exit 1
fi

# Assign command-line arguments to variables
CHECKPOINT=$1
TOKENIZER_MODEL=$2

pip install flask-restful

torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
    --use-checkpoint-args \
    --disable-bias-linear \
    --tokenizer-type HuggingFaceTokenizer \
    --tokenizer-model ${TOKENIZER_MODEL} \
    --transformer-impl transformer_engine \
    --normalization RMSNorm \
    --group-query-attention \
    --num-query-groups 8 \
    --no-masked-softmax-fusion \
    --attention-softmax-in-fp32 \
    --attention-dropout 0.0 \
    --hidden-dropout 0.0 \
    --untie-embeddings-and-output-weights \
    --position-embedding-type rope \
    --rotary-percent 1.0 \
    --rotary-base 500000 \
    --use-rotary-position-embeddings \
    --swiglu \
    --tensor-model-parallel-size 1 \
    --pipeline-model-parallel-size 1 \
    --num-layers 32 \
    --hidden-size 4096 \
    --ffn-hidden-size 14336 \
    --load ${CHECKPOINT} \
    --num-attention-heads 32 \
    --max-position-embeddings 8192 \
    --bf16 \
    --micro-batch-size 1 \
    --seq-length 8192
#!/bin/bash
# This example will start serving the Mistral-7B-v0.3 model

export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1

DISTRIBUTED_ARGS="--nproc_per_node 1 \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr 0.0.0.0 \
                  --master_port 6000"

# Ensure CHECKPOINT and TOKENIZER_MODEL are provided
if [ -z "$1" ] || [ -z "$2" ]; then
    echo "Error: You must provide CHECKPOINT and TOKENIZER_MODEL as command-line arguments."
    echo "Usage: $0 /path/to/checkpoint /path/to/tokenizer_model"
    exit 1
fi

# Assign command-line arguments to variables
CHECKPOINT=$1
TOKENIZER_MODEL=$2

pip install flask-restful

torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
    --tokenizer-type HuggingFaceTokenizer \
    --tokenizer-model ${TOKENIZER_MODEL} \
    --use-checkpoint-args \
    --apply-layernorm-1p \
    --transformer-impl transformer_engine \
    --normalization RMSNorm \
    --group-query-attention \
    --num-query-groups 8 \
    --no-masked-softmax-fusion \
    --use-flash-attn \
    --untie-embeddings-and-output-weights \
    --disable-bias-linear \
    --position-embedding-type rope \
    --rotary-percent 1.0 \
    --rotary-base 1000000 \
    --swiglu \
    --ffn-hidden-size 14336 \
    --tensor-model-parallel-size 1 \
    --pipeline-model-parallel-size 1 \
    --num-layers 32 \
    --hidden-size 4096 \
    --load ${CHECKPOINT} \
    --num-attention-heads 32 \
    --max-position-embeddings 4096 \
    --bf16 \
    --micro-batch-size 1 \
    --seq-length 4096 \
    --seed 101
#!/bin/bash
# This example will start serving the 345M model.

DISTRIBUTED_ARGS="--nproc_per_node 1 \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"

CHECKPOINT=<Path to checkpoint (e.g /345m)>
VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>

export CUDA_DEVICE_MAX_CONNECTIONS=1

pip install flask-restful

torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
    --tensor-model-parallel-size 1 \
    --pipeline-model-parallel-size 1 \
    --num-layers 24 \
    --hidden-size 1024 \
    --load ${CHECKPOINT} \
    --num-attention-heads 16 \
    --max-position-embeddings 1024 \
    --tokenizer-type GPT2BPETokenizer \
    --fp16 \
    --micro-batch-size 1 \
    --seq-length 1024 \
    --vocab-file $VOCAB_FILE \
    --merge-file $MERGE_FILE \
    --seed 42
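Once one of these server scripts is running, it prints the URL to query. A minimal client sketch, assuming the default Flask port 5000 and the /api route used by Megatron's text generation server (adjust the address to whatever the server actually prints):

# Minimal client for the text generation server started above.
import json
import urllib.request

payload = json.dumps({"prompts": ["Hello, my name is"], "tokens_to_generate": 32}).encode()
req = urllib.request.Request(
    "http://localhost:5000/api",            # assumed default address; use the URL the server prints
    data=payload,
    headers={"Content-Type": "application/json; charset=UTF-8"},
    method="PUT",
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read()))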
#!/bin/bash
# This example will start serving the 345M model that is partitioned 8 way tensor parallel

DISTRIBUTED_ARGS="--nproc_per_node 8 \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"

CHECKPOINT=<Path to checkpoint (e.g /345m)>
VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>

pip install flask-restful

python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
    --tensor-model-parallel-size 8 \
    --pipeline-model-parallel-size 1 \
    --num-layers 24 \
    --hidden-size 1024 \
    --load ${CHECKPOINT} \
    --num-attention-heads 16 \
    --max-position-embeddings 1024 \
    --tokenizer-type GPT2BPETokenizer \
    --fp16 \
    --micro-batch-size 1 \
    --seq-length 1024 \
    --vocab-file $VOCAB_FILE \
    --merge-file $MERGE_FILE \
    --seed 42
#!/bin/bash
# Use: ./run_text_gen_server_8b.sh <checkpoint-path> <tokenizer-path>
# To launch the client: python ../../tools/text_generation_cli.py <URL-provided-by-server>

CHECKPOINT_PATH=$1
TOKENIZER_PATH=$2

DISTRIBUTED_ARGS="--nproc_per_node 1 \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"

export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_TIMEOUT=19
export NCCL_IB_QPS_PER_CONNECTION=4

export TRITON_CACHE_DIR="./triton-cache/"
export TRITON_CACHE_MANAGER="megatron.core.ssm.triton_cache_manager:ParallelFileCacheManager"

torchrun $DISTRIBUTED_ARGS ../../tools/run_mamba_text_generation_server.py \
    --tensor-model-parallel-size 1 \
    --pipeline-model-parallel-size 1 \
    --untie-embeddings-and-output-weights \
    --num-layers 56 \
    --hidden-size 4096 \
    --load ${CHECKPOINT_PATH} \
    --num-attention-heads 32 \
    --group-query-attention \
    --num-query-groups 8 \
    --hybrid-attention-ratio 0.08 \
    --hybrid-mlp-ratio 0.5 \
    --attention-dropout 0.0 \
    --hidden-dropout 0.0 \
    --disable-bias-linear \
    --normalization RMSNorm \
    --seq-length 4096 \
    --max-position-embeddings 4096 \
    --position-embedding-type none \
    --tokenizer-type GPTSentencePieceTokenizer \
    --tokenizer-model ${TOKENIZER_PATH} \
    --distributed-backend nccl \
    --distributed-timeout-minutes 1440 \
    --bf16 \
    --micro-batch-size 1 \
    --use-mcore-models \
    --spec megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec \
    --seed 42