Commit 688448db authored by silencealiang's avatar silencealiang

Update code

parent a02a5490
Pipeline #2503 passed with stage
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
# WARNING: YAML configs are currently an experimental feature
language_model:
# model architecture
num_layers: 24
hidden_size: 1024
num_attention_heads: 16
num_query_groups: null
ffn_hidden_size: null
kv_channels: null
hidden_dropout: 0.0
attention_dropout: 0.0
fp32_residual_connection: False
apply_residual_connection_post_layernorm: False
layernorm_epsilon: 1.e-5
layernorm_zero_centered_gamma: True
add_bias_linear: False
bias_activation_fusion: False
add_qkv_bias: False
gated_linear_unit: False
activation_func: swiglu
num_moe_experts: null
rotary_interleaved: False
window_size: null
# initialization
init_method: null
init_method_std: 0.02
output_layer_init_method: null
# mixed-precision
apply_query_key_layer_scaling: False
attention_softmax_in_fp32: False
# fusion
bias_swiglu_fusion: True
masked_softmax_fusion: True
persist_layer_norm: False
memory_efficient_layer_norm: False
bias_dropout_fusion: True
apply_rope_fusion: True
# activation recomputation
recompute_granularity: null
recompute_method: null
recompute_num_layers: null
distribute_saved_activations: null
# fp8 related
fp8: null
fp8_margin: 0
fp8_interval: 1
fp8_amax_history_len: 1
fp8_amax_compute_algo: "most_recent"
fp8_wgrad: True
# miscellaneous
clone_scatter_output_in_embedding: True
normalization: "LayerNorm" # alt value supported by TE: "RMSNorm"
# MoE related
moe_router_load_balancing_type: "aux_loss"
moe_router_topk: 2
moe_router_topk_limited_devices: null
moe_grouped_gemm: False
moe_aux_loss_coeff: 0 # 1e-2 would be a good start value for load balance loss.
moe_z_loss_coeff: null # 1e-3 would be a good start value for z-loss
moe_input_jitter_eps: null
moe_token_dropping: False
model_parallel:
# Model parallelism
tensor_model_parallel_size: 1
context_parallel_size: 1
pipeline_model_parallel_size: 1
virtual_pipeline_model_parallel_size: null
sequence_parallel: True
expert_model_parallel_size: 1
# Initialization
perform_initialization: True
use_cpu_initialization: null
# Training
fp16: False
bf16: True
params_dtype: null # Set from above arguments for core
timers: null
# Optimizations
gradient_accumulation_fusion: True
async_tensor_model_parallel_allreduce: True
tp_comm_overlap: False
# Debug Options
tp_comm_split_ag: True
tp_comm_atomic_ag: True
tp_comm_split_rs: True
tp_comm_atomic_rs: True
tp_comm_bulk_wgrad: True
tp_comm_bulk_dgrad: True
# Parallelism
finalize_model_grads_func: null
# Pipeline Parallel
pipeline_dtype: null
grad_scale_func: null
enable_autocast: False
autocast_dtype: null
variable_seq_lengths: False
num_microbatches_with_partial_activation_checkpoints: null
overlap_p2p_comm: False
batch_p2p_comm: True
batch_p2p_sync: True
use_ring_exchange_p2p: False
deallocate_pipeline_outputs: False
no_sync_func: null
grad_sync_func: null
param_sync_func: null
pipeline_model_parallel_split_rank: null
# CPU Offloading
cpu_offloading: False
cpu_offloading_num_layers: 0
_cpu_offloading_context: null
cpu_offloading_weights: False
cpu_offloading_activations: True
# Timing
barrier_with_L1_time: True
# training:
use_legacy_models: False
spec: null
micro_batch_size: 2
global_batch_size: 128
rampup_batch_size: [32, 32, 65324160]
check_for_nan_in_loss_and_grad: True
num_layers_per_virtual_pipeline_stage: null
encoder_num_layers: null
decoder_num_layers: null
rotary_seq_len_interpolation_factor: null
add_position_embedding: False
make_vocab_size_divisible_by: 128
group_query_attention: False
exit_signal_handler: False
exit_duration_in_mins: null
exit_interval: null
untie_embeddings_and_output_weights: True
position_embedding_type: rope
rotary_percent: 0.5
openai_gelu: False
squared_relu: False
swiglu: True
onnx_safe: null
bert_binary_head: True
max_position_embeddings: 4096
transformer_impl: local
use_flash_attn: False
seed: 1234
data_parallel_random_init: False
# Optimizer
optimizer: adam
lr: 2.5e-4
lr_decay_style: cosine
lr_decay_iters: null
lr_decay_samples: 255126953
lr_warmup_fraction: null
lr_warmup_iters: 0
lr_warmup_samples: 81381
lr_warmup_init: 0.0
min_lr: 2.5e-5
weight_decay: 0.1
start_weight_decay: null
end_weight_decay: null
weight_decay_incr_style: constant
clip_grad: 1.0
adam_beta1: 0.9
adam_beta2: 0.95
adam_eps: 1.e-08
sgd_momentum: 0.9
override_opt_param_scheduler: False
use_checkpoint_opt_param_scheduler: False
# checkpointing arguments
save: null
save_interval: 20000
no_save_optim: null
no_save_rng: null
load: null
no_load_optim: null
no_load_rng: null
finetune: False
use_checkpoint_args: False
exit_on_missing_checkpoint: False
# loss arguments
loss_scale: null
initial_loss_scale: 4294967296
min_loss_scale: 1.0
loss_scale_window: 1000
hysteresis: 2
accumulate_allreduce_grads_in_fp32: False
fp16_lm_cross_entropy: False
# distributed arguments
distributed_backend: nccl
distributed_timeout_minutes: 10
overlap_grad_reduce: False
align_grad_reduce: True
overlap_param_gather: False
align_param_gather: False
scatter_gather_tensors_in_pipeline: True
local_rank: null
lazy_mpu_init: null
empty_unused_memory_level: 0
standalone_embedding_stage: False
use_distributed_optimizer: False
nccl_communicator_config_path: null
train_iters: null
eval_iters: 32
eval_interval: 2000
skip_train: False
adlr_autoresume: False
adlr_autoresume_interval: 1000
# garbage collection
manual_gc: False
manual_gc_interval: 0
manual_gc_eval: True
tp_comm_overlap_cfg: null
#data
data_path: null
split: '99,1,0'
train_data_path: null
valid_data_path: null
test_data_path: null
data_cache_path: null
mock_data: False
vocab_size: null
vocab_file: null
merge_file: null
vocab_extra_ids: 0
seq_length: 4096
encoder_seq_length: null
decoder_seq_length: null
retriever_seq_length: 256
sample_rate: 1.0
mask_prob: 0.15
short_seq_prob: 0.1
num_workers: 2
tokenizer_type: GPTSentencePieceTokenizer
tokenizer_model: null
reset_position_ids: False
reset_attention_mask: False
eod_mask_loss: False
train_samples: 268554688
dataloader_type: null
#profile:
profile: False
profile_ranks: [0]
profile_step_end: 12
profile_step_start: 10
#logging:
log_params_norm: True
log_num_zeros_in_grad: True
log_throughput: False
log_progress: False
timing_log_level: 0
timing_log_option: minmax
tensorboard_log_interval: 1
tensorboard_queue_size: 1000
log_timers_to_tensorboard: False
log_validation_ppl_to_tensorboard: False
log_memory_to_tensorboard: False
log_world_size_to_tensorboard: False
log_loss_scale_to_tensorboard: True
wandb_project: ''
wandb_exp_name: ''
wandb_save_dir: ''
enable_one_logger: True
one_logger_project: megatron-lm
one_logger_run_name: null
log_interval: 100
tensorboard_dir: null
# WARNING: YAML configs are currently an experimental feature
language_model:
# model architecture
num_layers: 24
hidden_size: 1024
num_attention_heads: 16
num_query_groups: null
ffn_hidden_size: null
kv_channels: null
hidden_dropout: 0.0
attention_dropout: 0.0
fp32_residual_connection: False
apply_residual_connection_post_layernorm: False
layernorm_epsilon: 1.e-5
layernorm_zero_centered_gamma: True
add_bias_linear: False
bias_activation_fusion: False
add_qkv_bias: False
gated_linear_unit: False
activation_func: swiglu
num_moe_experts: null
rotary_interleaved: False
window_size: null
# initialization
init_method: null
init_method_std: 0.02
output_layer_init_method: null
# mixed-precision
apply_query_key_layer_scaling: False
attention_softmax_in_fp32: False
# fusion
bias_swiglu_fusion: True
masked_softmax_fusion: True
persist_layer_norm: False
memory_efficient_layer_norm: False
bias_dropout_fusion: True
apply_rope_fusion: True
# activation recomputation
recompute_granularity: null
recompute_method: null
recompute_num_layers: null
distribute_saved_activations: null
# fp8 related
fp8: null
fp8_margin: 0
fp8_interval: 1
fp8_amax_history_len: 1
fp8_amax_compute_algo: "most_recent"
fp8_wgrad: True
# miscellaneous
clone_scatter_output_in_embedding: True
normalization: "LayerNorm" # alt value supported by TE: "RMSNorm"
# MoE related
moe_router_load_balancing_type: "aux_loss"
moe_router_topk: 2
moe_router_group_topk: null
moe_router_num_groups: null
moe_grouped_gemm: False
moe_aux_loss_coeff: 0 # 1e-2 would be a good start value for load balance loss.
moe_z_loss_coeff: null # 1e-3 would be a good start value for z-loss
moe_input_jitter_eps: null
moe_token_dropping: False
model_parallel:
# Model parallelism
tensor_model_parallel_size: 1
context_parallel_size: 1
pipeline_model_parallel_size: 1
virtual_pipeline_model_parallel_size: null
sequence_parallel: True
expert_model_parallel_size: 1
# Initialization
perform_initialization: True
use_cpu_initialization: null
# Training
fp16: False
bf16: True
params_dtype: null # Set from above arguments for core
timers: null
# Optimizations
gradient_accumulation_fusion: True
async_tensor_model_parallel_allreduce: True
tp_comm_overlap: False
# Debug Options
tp_comm_split_ag: True
tp_comm_atomic_ag: True
tp_comm_split_rs: True
tp_comm_atomic_rs: True
tp_comm_bulk_wgrad: True
tp_comm_bulk_dgrad: True
# Parallelism
finalize_model_grads_func: null
# Pipeline Parallel
pipeline_dtype: null
grad_scale_func: null
enable_autocast: False
autocast_dtype: null
variable_seq_lengths: False
num_microbatches_with_partial_activation_checkpoints: null
overlap_p2p_comm: False
batch_p2p_comm: True
batch_p2p_sync: True
use_ring_exchange_p2p: False
deallocate_pipeline_outputs: False
no_sync_func: null
grad_sync_func: null
param_sync_func: null
pipeline_model_parallel_split_rank: null
# CPU Offloading
cpu_offloading: False
cpu_offloading_num_layers: 0
_cpu_offloading_context: null
cpu_offloading_weights: False
cpu_offloading_activations: True
# Timing
barrier_with_L1_time: True
# training:
use_legacy_models: False
spec: null
micro_batch_size: 2
global_batch_size: 128
rampup_batch_size: [32, 32, 65324160]
check_for_nan_in_loss_and_grad: True
num_layers_per_virtual_pipeline_stage: null
encoder_num_layers: null
decoder_num_layers: null
rotary_seq_len_interpolation_factor: null
add_position_embedding: False
make_vocab_size_divisible_by: 128
group_query_attention: False
exit_signal_handler: False
exit_duration_in_mins: null
exit_interval: null
untie_embeddings_and_output_weights: True
position_embedding_type: rope
rotary_percent: 0.5
openai_gelu: False
squared_relu: False
swiglu: True
onnx_safe: null
bert_binary_head: True
max_position_embeddings: 4096
transformer_impl: local
use_flash_attn: False
seed: 1234
data_parallel_random_init: False
# Optimizer
optimizer: adam
lr: 2.5e-4
lr_decay_style: cosine
lr_decay_iters: null
lr_decay_samples: 255126953
lr_warmup_fraction: null
lr_warmup_iters: 0
lr_warmup_samples: 81381
lr_warmup_init: 0.0
min_lr: 2.5e-5
weight_decay: 0.1
start_weight_decay: null
end_weight_decay: null
weight_decay_incr_style: constant
clip_grad: 1.0
adam_beta1: 0.9
adam_beta2: 0.95
adam_eps: 1.e-08
sgd_momentum: 0.9
override_opt_param_scheduler: False
use_checkpoint_opt_param_scheduler: False
# checkpointing arguments
save: null
save_interval: 20000
no_save_optim: null
no_save_rng: null
load: null
no_load_optim: null
no_load_rng: null
finetune: False
use_checkpoint_args: False
exit_on_missing_checkpoint: False
# loss arguments
loss_scale: null
initial_loss_scale: 4294967296
min_loss_scale: 1.0
loss_scale_window: 1000
hysteresis: 2
accumulate_allreduce_grads_in_fp32: False
fp16_lm_cross_entropy: False
# distributed arguments
distributed_backend: nccl
distributed_timeout_minutes: 10
overlap_grad_reduce: False
align_grad_reduce: True
overlap_param_gather: False
align_param_gather: False
scatter_gather_tensors_in_pipeline: True
local_rank: null
lazy_mpu_init: null
empty_unused_memory_level: 0
standalone_embedding_stage: False
use_distributed_optimizer: False
nccl_communicator_config_path: null
train_iters: null
eval_iters: 32
eval_interval: 2000
skip_train: False
adlr_autoresume: False
adlr_autoresume_interval: 1000
# garbage collection
manual_gc: False
manual_gc_interval: 0
manual_gc_eval: True
tp_comm_overlap_cfg: null
#data
data_path: null
split: '99,1,0'
train_data_path: null
valid_data_path: null
test_data_path: null
data_cache_path: null
mock_data: False
vocab_size: null
vocab_file: null
merge_file: null
vocab_extra_ids: 0
seq_length: 4096
encoder_seq_length: null
decoder_seq_length: null
retriever_seq_length: 256
sample_rate: 1.0
mask_prob: 0.15
short_seq_prob: 0.1
num_workers: 2
tokenizer_type: GPTSentencePieceTokenizer
tokenizer_model: null
reset_position_ids: False
reset_attention_mask: False
eod_mask_loss: False
train_samples: 268554688
dataloader_type: null
#profile:
profile: False
profile_ranks: [0]
profile_step_end: 12
profile_step_start: 10
#logging:
log_params_norm: True
log_num_zeros_in_grad: True
log_throughput: False
log_progress: False
timing_log_level: 0
timing_log_option: minmax
tensorboard_log_interval: 1
tensorboard_queue_size: 1000
log_timers_to_tensorboard: False
log_validation_ppl_to_tensorboard: False
log_memory_to_tensorboard: False
log_world_size_to_tensorboard: False
log_loss_scale_to_tensorboard: True
wandb_project: ''
wandb_exp_name: ''
wandb_save_dir: ''
enable_one_logger: True
one_logger_project: megatron-lm
one_logger_run_name: null
log_interval: 100
tensorboard_dir: null
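Both YAML variants above share the same layout: a language_model block, a model_parallel block, and then flat training/data/logging keys (the commented-out "# training:", "#data", and "#logging:" headers mark the groups). A minimal sketch for loading and sanity-checking such a config outside of Megatron follows; the file name, the assumption that the training keys sit at the top level, and the specific checks are illustrative, not part of Megatron's own loader.

# Minimal sketch: load one of the experimental YAML configs above and check a
# few derived quantities. File name and checks are illustrative assumptions.
import yaml  # pip install pyyaml

with open("gpt_yaml_config.yaml") as f:  # hypothetical file name
    cfg = yaml.safe_load(f)

lm = cfg["language_model"]
hidden, heads = lm["hidden_size"], lm["num_attention_heads"]
assert hidden % heads == 0, "hidden_size must be divisible by num_attention_heads"

# kv_channels is typically derived as hidden_size // num_attention_heads when null.
kv_channels = lm["kv_channels"] or hidden // heads
print(f"per-head dim (kv_channels): {kv_channels}")  # 1024 // 16 = 64

# rampup_batch_size is [start, increment, ramp-up samples]: the global batch
# grows from 32 in steps of 32 until it reaches global_batch_size.
start, incr, ramp_samples = cfg["rampup_batch_size"]
print(f"batch ramp: {start} -> {cfg['global_batch_size']} in steps of {incr} "
      f"over {ramp_samples:,} samples")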
@@ -7,10 +7,10 @@ do
fi
done
mpirun -np 8 --allow-run-as-root \
train_GPT-MOE_567B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1
mpirun -np 8 --allow-run-as-root \
train_gpt_567B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1
wait
rm -rf CKPT
rm -rf mixtral_dataset/my-mixtral_text_document
\ No newline at end of file
rm -rf mixtral_dataset/my-mixtral_text_document
@@ -7,13 +7,13 @@ do
fi
done
mpirun -np 16 --hostfile mixtralnodes \
mpirun -np 512 --hostfile hostfile_gpt_567B \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
train_mixtral_8x7B_2nodes.sh node021 --profiling=$profiling > output.log 2>&1
train_gpt_567B_multinodes.sh node002 --profiling=$profiling > output.log 2>&1
wait
rm -rf CKPT
#rm -rf mixtral_dataset/my-mixtral_text_document
\ No newline at end of file
#rm -rf mixtral_dataset/my-mixtral_text_document
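The multinode launcher above passes -np 512 with --hostfile hostfile_gpt_567B, which at 8 GPUs per node implies 64 nodes. A small sketch for generating such a hostfile is shown below; the node-name pattern and the OpenMPI slots= syntax are assumptions for illustration, so substitute the cluster's real hostnames.

# Sketch: write an OpenMPI-style hostfile for 512 ranks (64 nodes x 8 slots).
# Hostnames are hypothetical placeholders.
NUM_NODES = 64
SLOTS_PER_NODE = 8  # one MPI rank per GPU
assert NUM_NODES * SLOTS_PER_NODE == 512

with open("hostfile_gpt_567B", "w") as f:
    for i in range(1, NUM_NODES + 1):
        f.write(f"node{i:03d} slots={SLOTS_PER_NODE}\n")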
#!/bin/bash
# Runs the "175B" parameter model
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NUM_NODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))
CHECKPOINT_PATH=$1 #<Specify path>
TENSORBOARD_LOGS_PATH=$2 #<Specify path>
VOCAB_FILE=$3 #<Specify path to file>/gpt2-vocab.json
MERGE_FILE=$4 #<Specify path to file>/gpt2-merges.txt
DATA_PATH=$5 #<Specify path and file prefix>_text_document
DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)
GPT_MODEL_ARGS=(
--num-layers 96
--hidden-size 12288
--num-attention-heads 96
--seq-length 2048
--max-position-embeddings 2048
--attention-backend auto # Can use (flash/fused/unfused/local)
)
TRAINING_ARGS=(
--micro-batch-size 1
--global-batch-size 1536
--rampup-batch-size 16 16 5859375
--train-iters 500000
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--fp16
--lr 6.0e-5
--lr-decay-style cosine
--min-lr 6.0e-6
--lr-warmup-fraction .001
--lr-decay-iters 430000
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 8
--pipeline-model-parallel-size 16
)
DATA_ARGS=(
--data-path $DATA_PATH
--vocab-file $VOCAB_FILE
--merge-file $MERGE_FILE
--split 949,50,1
)
EVAL_AND_LOGGING_ARGS=(
--log-interval 100
--save-interval 10000
--eval-interval 1000
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
--eval-iters 10
--tensorboard-dir $TENSORBOARD_LOGS_PATH
)
torchrun ${DISTRIBUTED_ARGS[@]} pretrain_gpt.py \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]}
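As a sanity check on the "175B" label, the dense parameter count implied by GPT_MODEL_ARGS can be estimated with the usual 12·L·h² approximation plus embeddings, and the --rampup-batch-size triple can be unpacked the same way. The sketch below is back-of-the-envelope arithmetic under a padded-vocab assumption, not the exact count Megatron reports.

# Rough parameter count for the config above (96 layers, hidden 12288) and the
# batch-size ramp-up schedule. The padded vocab size is an assumption.
layers, hidden, seq = 96, 12288, 2048
vocab = 50304  # GPT-2 BPE vocab (50257) padded to a multiple of 128 (assumption)

transformer = 12 * layers * hidden ** 2          # attention + MLP weights, ~12*h^2 per layer
embeddings = vocab * hidden + seq * hidden       # token + learned position embeddings
print(f"~{(transformer + embeddings) / 1e9:.0f}B parameters")  # ~175B

# --rampup-batch-size 16 16 5859375: global batch starts at 16 and is bumped by
# 16 until it reaches --global-batch-size 1536, over 5,859,375 ramp-up samples.
start, incr, ramp_samples, target = 16, 16, 5_859_375, 1536
bumps = (target - start) // incr                 # 95 batch-size increases
print(f"~{ramp_samples // bumps:,} samples between bumps (assuming an even spread)")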
@@ -4,18 +4,23 @@ for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
export GPU_FLUSH_ON_EXECUTION=1
export HIP_DIRECT_DISPATCH=0
fi
done
# Runs GPT 567B model
source /opt/dtk/env.sh
# Runs Mixtral 8x7B model
# default env
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH
export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
# nccl env
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
@@ -23,9 +28,10 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
#export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
export GLOG_minloglevel=3
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
@@ -96,7 +102,6 @@ TRAINING_ARGS=(
--bf16
--overlap-param-gather
--overlap-grad-reduce
#--tp-comm-overlap
)
TORCH_PROFIE_ARGS=(
@@ -104,18 +109,10 @@ TORCH_PROFIE_ARGS=(
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_gpt_1nodes
--profile-dir torch_prof_gpt_1nodes_tp2-pp1-ep8-ep_tp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 2
--pipeline-model-parallel-size 1
@@ -157,10 +154,6 @@ APP="python3 -u pretrain_gpt.py \
if [[ $profiling == "torch" ]]; then
APP+=" ${TORCH_PROFIE_ARGS[@]}"
elif [[ $profiling == "hip" ]]; then
mkdir -p hip_prof_data
APP+=" ${HIP_PROFIE_ARGS[@]}"
APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi
#for hygon cpu
@@ -205,4 +198,4 @@ case ${LOCAL_RANK} in
${APP}
#numactl --cpunodebind=7 --membind=7 ${APP}
;;
esac
\ No newline at end of file
esac
@@ -4,18 +4,23 @@ for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
export GPU_FLUSH_ON_EXECUTION=1
export HIP_DIRECT_DISPATCH=0
fi
done
# Runs GPT 567B model
source /opt/dtk/env.sh
# Runs Mixtral 8x7B model
# default env
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH
export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
# nccl env
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
@@ -23,9 +28,10 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
#export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
export GLOG_minloglevel=3
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
@@ -49,7 +55,7 @@ MODEL_ARGS=(
--disable-bias-linear
--seq-length 8192
--max-position-embeddings 32768
--num-layers 64
--num-layers 32 #64
--hidden-size 8192
--ffn-hidden-size 32768
--num-attention-heads 64
@@ -72,7 +78,7 @@ MOE_ARGS=(
--moe-token-dispatcher-type alltoall
--moe-expert-capacity-factor 0.5
--moe-pad-expert-input-to-capacity
--moe-grouped-gemm
#--moe-grouped-gemm
)
DATA_ARGS=(
@@ -84,7 +90,7 @@ DATA_ARGS=(
TRAINING_ARGS=(
--micro-batch-size 1
--global-batch-size 4096
--global-batch-size 1024
--lr 1e-4
--train-iters 10
--lr-decay-iters 320000
@@ -96,7 +102,6 @@ TRAINING_ARGS=(
--bf16
--overlap-param-gather
--overlap-grad-reduce
#--tp-comm-overlap
)
TORCH_PROFIE_ARGS=(
@@ -104,23 +109,16 @@ TORCH_PROFIE_ARGS=(
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_gpt
--profile-dir torch_prof_gpt_64nodes_tp2-pp16-ep16-ep_tp1-cp2
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 2
--pipeline-model-parallel-size 16
--expert-model-parallel-size 16
--expert-tensor-parallel-size 1
--context-parallel-size 2
--use-distributed-optimizer
--sequence-parallel
)
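With mpirun -np 512 and the parallel sizes above, the data-parallel size follows from world_size = TP × CP × PP × DP. The sketch below works through that bookkeeping; note that the expert-model-parallel and expert-tensor-parallel sizes configure a separate process-group layout for the MoE layers and are not folded into this product.

# Sketch: derive the data-parallel size implied by the multinode launch above.
world_size = 512                 # mpirun -np 512
tp, pp, cp = 2, 16, 2            # tensor / pipeline / context parallel sizes
assert world_size % (tp * pp * cp) == 0
dp = world_size // (tp * pp * cp)
print(f"data-parallel size: {dp}")                 # 8
print(f"nodes at 8 GPUs each: {world_size // 8}")  # 64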
@@ -157,10 +155,6 @@ APP="python3 -u pretrain_gpt.py \
if [[ $profiling == "torch" ]]; then
APP+=" ${TORCH_PROFIE_ARGS[@]}"
elif [[ $profiling == "hip" ]]; then
mkdir -p hip_prof_data
APP+=" ${HIP_PROFIE_ARGS[@]}"
APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi
#for hygon cpu
......
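Returning to the MOE_ARGS hunk above: --moe-expert-capacity-factor 0.5 together with --moe-pad-expert-input-to-capacity caps and pads the number of tokens each expert processes. The sketch below illustrates the usual capacity arithmetic; the expert count and router top-k are hypothetical (they are not visible in this hunk), and the exact rounding Megatron applies may differ.

# Illustrative MoE capacity arithmetic (not Megatron's implementation).
import math

tokens = 8192            # --seq-length from the script above, per micro-batch
num_experts = 16         # hypothetical expert count
topk = 2                 # hypothetical router top-k
capacity_factor = 0.5    # --moe-expert-capacity-factor

# Each token is routed to top-k experts; capacity caps how many routed tokens
# an expert keeps, and padding fills under-full experts up to this size.
capacity = math.ceil(tokens * topk / num_experts * capacity_factor)
print(f"tokens per expert after capping/padding: {capacity}")  # 512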
import os
from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import InferenceWrapperConfig
from pretrain_gpt import model_provider
import torch
import sys
from argparse import Namespace
from megatron.core.inference.engines.abstract_engine import AbstractEngine
from megatron.core.inference.engines.mcore_engine import MCoreEngine
from megatron.core.inference.sampling_params import SamplingParams
from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import GPTInferenceWrapper
from megatron.core.inference.inference_request import InferenceRequest
from megatron.core.inference.text_generation_controllers.text_generation_controller import TextGenerationController
from megatron.core.transformer.module import MegatronModule
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
os.path.pardir, os.path.pardir)))
from megatron.training import get_args
from megatron.training import get_tokenizer
from megatron.training.checkpointing import load_checkpoint
from megatron.core import mpu
from megatron.training.initialize import initialize_megatron
from megatron.training import get_model
from typing import List
def add_text_generate_args(parser):
"""Text generation arguments."""
group = parser.add_argument_group(title='text generation')
group.add_argument("--temperature", type=float, default=1.0,
help='Sampling temperature.')
group.add_argument("--top_k", type=int, default=1,
help='Top k sampling.')
group.add_argument("--top_p", type=float, default=0.0,
help='Top p sampling.')
group.add_argument("--return-log-probs", action='store_true', default=False,
help='Return the log probabilities of the final output tokens')
group.add_argument("--num-tokens-to-generate", type=int, default=30,
help='Number of tokens to generate for each prompt')
group.add_argument("--prompts", metavar='N', type=str, nargs='+',
help='Input prompts with each prompt within quotes and separated by space')
group.add_argument("--max-batch-size", type=int, default=1,
help='Max number of prompts to process at once')
return parser
def get_inference_engine(args: Namespace, model: MegatronModule) -> AbstractEngine:
"""Utility to get the relevant backend for running inference
This function will automatically choose the TRTLLMBackend when possible, and otherwise fall back to the MCore backend if the user does not specify any backend. The TRT LLM backend is not implemented yet.
Args:
args (Namespace): The user arguments parsed from command line
model (MegatronModule): The Megatron model.
Returns:
AbstractBackend: The chosen backend
"""
tokenizer = get_tokenizer()
inference_wrapper_config = InferenceWrapperConfig(
hidden_size=args.hidden_size,
inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold,
fp32_residual_connection=args.fp32_residual_connection,
params_dtype=args.params_dtype,
padded_vocab_size=args.padded_vocab_size
)
inference_wrapped_model = GPTInferenceWrapper(model, inference_wrapper_config)
text_generation_controller = TextGenerationController(inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer)
return MCoreEngine(text_generation_controller=text_generation_controller, max_batch_size=args.max_batch_size)
def main():
"""Main program."""
# Note: The default args passed here can be overwritten by using appropriate params (check arguments.py file)
# Micro batch size does not need to be set by the user. (It is calculated based on the inference-batch-times-seqlen-threshold argument)
initialize_megatron(extra_args_provider=add_text_generate_args,
args_defaults={'no_load_rng': True,
'no_load_optim': True,
'micro_batch_size': 1,
'exit_on_missing_checkpoint': True})
# Set up model and load checkpoint
model = get_model(model_provider, wrap_with_ddp=False)
load_checkpoint(model, None, None)
model = model[0]
args = get_args()
inference_engine = get_inference_engine(args, model)
sampling_params = SamplingParams(
temperature=args.temperature,
top_k=args.top_k,
top_p=args.top_p,
return_log_probs=args.return_log_probs,
num_tokens_to_generate=args.num_tokens_to_generate)
results: List[InferenceRequest] = inference_engine.generate(
prompts=args.prompts, sampling_params=sampling_params
)
if torch.distributed.get_rank() == 0:
for idx, result in enumerate(results):
print(f' \n------------- RESULT FOR PROMPT {idx} --------------- ')
result = {
'id': result.request_id,
'input_prompt': result.prompt,
'generated_text': result.generated_text,
'generated_tokens' : result.generated_tokens
}
print(result)
if __name__ == "__main__":
main()
import os
from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
InferenceWrapperConfig,
)
from pretrain_gpt import model_provider
import torch
import sys
import time
import tqdm
import warnings
from argparse import Namespace
from megatron.core.inference.engines.abstract_engine import AbstractEngine
from megatron.core.inference.engines.mcore_engine import MCoreEngine
from megatron.core.inference.sampling_params import SamplingParams
from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import (
GPTInferenceWrapper,
)
from megatron.core.inference.inference_request import InferenceRequest
from megatron.core.inference.text_generation_controllers.text_generation_controller import (
TextGenerationController,
)
from megatron.core.transformer.module import MegatronModule
sys.path.append(
os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))
)
from megatron.training import get_args
from megatron.training import get_tokenizer
from megatron.training.checkpointing import load_checkpoint
from megatron.core import mpu
from megatron.training.initialize import initialize_megatron
from megatron.training import get_model
import asyncio
from typing import AsyncIterator, List
def add_text_generate_args(parser):
"""Text generation arguments."""
group = parser.add_argument_group(title='text generation')
group.add_argument("--temperature", type=float, default=1.0, help='Sampling temperature.')
group.add_argument("--top_k", type=int, default=1, help='Top k sampling.')
group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.')
group.add_argument(
"--return-log-probs",
action='store_true',
default=False,
help='Return the log probabilities of the final output tokens',
)
group.add_argument(
"--num-tokens-to-generate",
type=int,
default=30,
help='Number of tokens to generate for each prompt',
)
group.add_argument(
"--prompts",
metavar='N',
type=str,
nargs='+',
help='Input prompts with each prompt within quotes and separated by space',
)
group.add_argument(
"--max-batch-size", type=int, default=8, dest="inference_max_requests",
help='Max number of prompts to process at once'
)
group.add_argument("--stream", action="store_true", default=False, help="Stream output tokens")
return parser
def get_inference_engine(args: Namespace, model: MegatronModule) -> AbstractEngine:
"""Utility to get the relevant backend for running inference
This function will automatically choose the TRTLLMBackend when possible, and otherwise fall back to the MCore backend if the user does not specify any backend. The TRT LLM backend is not implemented yet.
Args:
args (Namespace): The user arguments parsed from command line
model (MegatronModule): The Megatron model.
Returns:
AbstractBackend: The chosen backend
"""
tokenizer = get_tokenizer()
inference_wrapper_config = InferenceWrapperConfig(
hidden_size=args.hidden_size,
inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold,
fp32_residual_connection=args.fp32_residual_connection,
params_dtype=args.params_dtype,
padded_vocab_size=args.padded_vocab_size,
inference_max_requests=args.inference_max_requests,
inference_max_seq_length=args.inference_max_seq_length,
)
inference_wrapped_model = GPTInferenceWrapper(model, inference_wrapper_config)
text_generation_controller = TextGenerationController(inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer)
return MCoreEngine(text_generation_controller=text_generation_controller)
async def generate(
inference_engine: MCoreEngine,
sampling_params: SamplingParams,
prompts: List[str],
) -> List[InferenceRequest]:
async def collect_stream(prompt, request_id, stream_generator):
print(f"Request {request_id}: {prompt}", end="", flush=True)
prev_idx = 0
async for output in stream_generator:
print(output.generated_text[prev_idx:], end="", flush=True)
prev_idx = len(output.generated_text)
print()
request_ids: List[str] = [
inference_engine.add_request(
prompt=prompt, inference_parameters=sampling_params, streaming=True
)
for prompt in prompts
]
stream_generators = [inference_engine.get_stream_generator(request_id) for request_id in request_ids]
tasks = [
asyncio.create_task(collect_stream(prompt, request_id, stream_generator))
for (prompt, request_id, stream_generator) in zip(prompts, request_ids, stream_generators)
]
await inference_engine.run_engine_async()
await asyncio.gather(*tasks)
results: List[InferenceRequest] = [
inference_engine.scheduler.completed_request_pool[request_id] for request_id in request_ids
]
return results
def main():
"""Main program."""
# Note: The default args passed here can be overwritten by using appropriate params (check arguments.py file)
# Micro batch size does not need to be set by the user. (It is calculated based on the inference-batch-times-seqlen-threshold argument)
initialize_megatron(
extra_args_provider=add_text_generate_args,
args_defaults={
'no_load_rng': True,
'no_load_optim': True,
'micro_batch_size': 1,
'exit_on_missing_checkpoint': True,
},
)
# Set up model and load checkpoint
model = get_model(model_provider, wrap_with_ddp=False)
load_checkpoint(model, None, None)
model = model[0]
args = get_args()
inference_engine = get_inference_engine(args, model)
sampling_params = SamplingParams(
temperature=args.temperature,
top_k=args.top_k,
top_p=args.top_p,
return_log_probs=args.return_log_probs,
num_tokens_to_generate=args.num_tokens_to_generate,
)
if args.enable_cuda_graph:
print(f"Running warmup for CUDA graphs...")
inference_engine.generate(
prompts=args.prompts, sampling_params=sampling_params
)
start_time = time.perf_counter()
if args.stream:
results: List[InferenceRequest] = asyncio.run(generate(inference_engine, sampling_params, args.prompts))
else:
results: List[InferenceRequest] = inference_engine.generate(
prompts=args.prompts, sampling_params=sampling_params,
)
end_time = time.perf_counter()
latency = end_time - start_time
if torch.distributed.get_rank() == 0:
for idx, result in enumerate(results):
print(f' \n------------- RESULT FOR PROMPT {idx} --------------- ')
result = {
'id': result.request_id,
'input_prompt': result.prompt,
'generated_text': result.generated_text,
'generated_tokens': result.generated_tokens,
'latency': latency,
}
print(result)
torch.distributed.destroy_process_group()
if __name__ == "__main__":
main()
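The --stream path above fans each prompt out to an asyncio task that drains a per-request stream generator while the engine runs. A minimal, Megatron-free sketch of that consumption pattern is shown below; the dummy generator stands in for inference_engine.get_stream_generator and yields the cumulative generated_text, as the engine does.

# Stand-alone sketch of the streaming pattern used in generate() above.
import asyncio

async def dummy_stream(prompt):
    # Stands in for the engine's per-request stream generator.
    text = ""
    for tok in [" Hello", ",", " world", "!"]:
        await asyncio.sleep(0.1)  # simulate per-token decode latency
        text += tok
        yield text                # cumulative generated text so far

async def collect_stream(prompt, stream):
    print(f"{prompt}:", end="", flush=True)
    prev = 0
    async for text in stream:
        print(text[prev:], end="", flush=True)  # print only the new suffix
        prev = len(text)
    print()

async def main():
    prompts = ["prompt-0", "prompt-1"]
    tasks = [asyncio.create_task(collect_stream(p, dummy_stream(p))) for p in prompts]
    await asyncio.gather(*tasks)

asyncio.run(main())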
#!/bin/bash
# This example will start serving the Llama3.1-8B model
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_APPLY_QK_LAYER_SCALING=0
DISTRIBUTED_ARGS="--nproc_per_node 1 \
--nnodes 1 \
--node_rank 0 \
--master_addr 0.0.0.0 \
--master_port 6000"
# Ensure CHECKPOINT and TOKENIZER_MODEL are provided
if [ -z "$1" ] || [ -z "$2" ]; then
echo "Error: You must provide CHECKPOINT and TOKENIZER_MODEL as command-line arguments."
echo "Usage: $0 /path/to/checkpoint /path/to/tokenizer_model"
exit 1
fi
# Assign command-line arguments to variables
CHECKPOINT=$1
TOKENIZER_MODEL=$2
pip install flask-restful
torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
--use-checkpoint-args \
--disable-bias-linear \
--tokenizer-type HuggingFaceTokenizer \
--tokenizer-model ${TOKENIZER_MODEL} \
--transformer-impl transformer_engine \
--normalization RMSNorm \
--group-query-attention \
--num-query-groups 8 \
--no-masked-softmax-fusion \
--attention-softmax-in-fp32 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--untie-embeddings-and-output-weights \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 500000 \
--use-rope-scaling \
--use-rotary-position-embeddings \
--swiglu \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 32 \
--hidden-size 4096 \
--ffn-hidden-size 14336 \
--load ${CHECKPOINT} \
--num-attention-heads 32 \
--max-position-embeddings 131072 \
--bf16 \
--micro-batch-size 1 \
--seq-length 8192
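Once one of these servers is running, it is queried over HTTP; Megatron's tools/text_generation_cli.py does this with a PUT to the server's /api endpoint. The sketch below follows that pattern, but treat the port, route, and payload fields as assumptions and verify them against the server version actually launched.

# Sketch of a REST client for the text generation server started above.
# Port 5000, the /api route, and the payload fields mirror what
# tools/text_generation_cli.py sends; verify against your server version.
import json
import requests

url = "http://localhost:5000/api"  # assumed default Flask port
payload = {"prompts": ["The capital of France is"], "tokens_to_generate": 32}
resp = requests.put(url, data=json.dumps(payload),
                    headers={"Content-Type": "application/json"})
print(resp.json())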
#!/bin/bash
# This example will start serving the Llama3-8B model
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_APPLY_QK_LAYER_SCALING=0
DISTRIBUTED_ARGS="--nproc_per_node 1 \
--nnodes 1 \
--node_rank 0 \
--master_addr 0.0.0.0 \
--master_port 6000"
# Ensure CHECKPOINT and TOKENIZER_MODEL are provided
if [ -z "$1" ] || [ -z "$2" ]; then
echo "Error: You must provide CHECKPOINT and TOKENIZER_MODEL as command-line arguments."
echo "Usage: $0 /path/to/checkpoint /path/to/tokenizer_model"
exit 1
fi
# Assign command-line arguments to variables
CHECKPOINT=$1
TOKENIZER_MODEL=$2
pip install flask-restful
torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
--use-checkpoint-args \
--disable-bias-linear \
--tokenizer-type HuggingFaceTokenizer \
--tokenizer-model ${TOKENIZER_MODEL} \
--transformer-impl transformer_engine \
--normalization RMSNorm \
--group-query-attention \
--num-query-groups 8 \
--no-masked-softmax-fusion \
--attention-softmax-in-fp32 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--untie-embeddings-and-output-weights \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 500000 \
--use-rotary-position-embeddings \
--swiglu \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 32 \
--hidden-size 4096 \
--ffn-hidden-size 14336 \
--load ${CHECKPOINT} \
--num-attention-heads 32 \
--max-position-embeddings 8192 \
--bf16 \
--micro-batch-size 1 \
--seq-length 8192
#!/bin/bash
# This example will start serving the Mistral-7B-v0.3 model
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
DISTRIBUTED_ARGS="--nproc_per_node 1 \
--nnodes 1 \
--node_rank 0 \
--master_addr 0.0.0.0 \
--master_port 6000"
# Ensure CHECKPOINT and TOKENIZER_MODEL are provided
if [ -z "$1" ] || [ -z "$2" ]; then
echo "Error: You must provide CHECKPOINT and TOKENIZER_MODEL as command-line arguments."
echo "Usage: $0 /path/to/checkpoint /path/to/tokenizer_model"
exit 1
fi
# Assign command-line arguments to variables
CHECKPOINT=$1
TOKENIZER_MODEL=$2
pip install flask-restful
torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
--tokenizer-type HuggingFaceTokenizer \
--tokenizer-model ${TOKENIZER_MODEL} \
--use-checkpoint-args \
--apply-layernorm-1p \
--transformer-impl transformer_engine \
--normalization RMSNorm \
--group-query-attention \
--num-query-groups 8 \
--no-masked-softmax-fusion \
--use-flash-attn \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--swiglu \
--ffn-hidden-size 14336 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 32 \
--hidden-size 4096 \
--load ${CHECKPOINT} \
--num-attention-heads 32 \
--max-position-embeddings 4096 \
--bf16 \
--micro-batch-size 1 \
--seq-length 4096 \
--seed 101
#!/bin/bash
# This example will start serving the 345M model.
DISTRIBUTED_ARGS="--nproc_per_node 1 \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
CHECKPOINT=<Path to checkpoint (e.g /345m)>
VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>
export CUDA_DEVICE_MAX_CONNECTIONS=1
pip install flask-restful
torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--load ${CHECKPOINT} \
--num-attention-heads 16 \
--max-position-embeddings 1024 \
--tokenizer-type GPT2BPETokenizer \
--fp16 \
--micro-batch-size 1 \
--seq-length 1024 \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--seed 42
#!/bin/bash
# This example will start serving the 345M model that is partitioned 8 way tensor parallel
DISTRIBUTED_ARGS="--nproc_per_node 8 \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
CHECKPOINT=<Path to checkpoint (e.g /345m)>
VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>
pip install flask-restful
python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--load ${CHECKPOINT} \
--num-attention-heads 16 \
--max-position-embeddings 1024 \
--tokenizer-type GPT2BPETokenizer \
--fp16 \
--micro-batch-size 1 \
--seq-length 1024 \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--seed 42
#!/bin/bash
# Use: ./run_text_gen_server_8b.sh <checkpoint-path> <tokenizer-path>
# To launch the client: python ../../tools/text_generation_cli.py <URL-provided-by-server>
CHECKPOINT_PATH=$1
TOKENIZER_PATH=$2
DISTRIBUTED_ARGS="--nproc_per_node 1 \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_TIMEOUT=19
export NCCL_IB_QPS_PER_CONNECTION=4
export TRITON_CACHE_DIR="./triton-cache/"
export TRITON_CACHE_MANAGER="megatron.core.ssm.triton_cache_manager:ParallelFileCacheManager"
torchrun $DISTRIBUTED_ARGS ../../tools/run_mamba_text_generation_server.py \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--untie-embeddings-and-output-weights \
--num-layers 56 \
--hidden-size 4096 \
--load ${CHECKPOINT_PATH} \
--num-attention-heads 32 \
--group-query-attention \
--num-query-groups 8 \
--hybrid-attention-ratio 0.08 \
--hybrid-mlp-ratio 0.5 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--disable-bias-linear \
--normalization RMSNorm \
--seq-length 4096 \
--max-position-embeddings 4096 \
--position-embedding-type none \
--tokenizer-type GPTSentencePieceTokenizer \
--tokenizer-model ${TOKENIZER_PATH} \
--distributed-backend nccl \
--distributed-timeout-minutes 1440 \
--bf16 \
--micro-batch-size 1 \
--use-mcore-models \
--spec megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec \
--seed 42