Commit 688448db authored by silencealiang

Update code

parent a02a5490
Pipeline #2503 passed with stage
File mode changed from 100644 to 100755 (6 files)
# WARNING: Yaml configs is currently an experimental feature
language_model:
  # model architecture
  num_layers: 24
  hidden_size: 1024
  num_attention_heads: 16
  num_query_groups: null
  ffn_hidden_size: null
  kv_channels: null
  hidden_dropout: 0.0
  attention_dropout: 0.0
  fp32_residual_connection: False
  apply_residual_connection_post_layernorm: False
  layernorm_epsilon: 1.e-5
  layernorm_zero_centered_gamma: True
  add_bias_linear: False
  bias_activation_fusion: False
  add_qkv_bias: False
  gated_linear_unit: False
  activation_func: swiglu
  num_moe_experts: null
  rotary_interleaved: False
  window_size: null

  # initialization
  init_method: null
  init_method_std: 0.02
  output_layer_init_method: null

  # mixed-precision
  apply_query_key_layer_scaling: False
  attention_softmax_in_fp32: False

  # fusion
  bias_swiglu_fusion: True
  masked_softmax_fusion: True
  persist_layer_norm: False
  memory_efficient_layer_norm: False
  bias_dropout_fusion: True
  apply_rope_fusion: True

  # activation recomputation
  recompute_granularity: null
  recompute_method: null
  recompute_num_layers: null
  distribute_saved_activations: null

  # fp8 related
  fp8: null
  fp8_margin: 0
  fp8_interval: 1
  fp8_amax_history_len: 1
  fp8_amax_compute_algo: "most_recent"
  fp8_wgrad: True

  # miscellaneous
  clone_scatter_output_in_embedding: True
  normalization: "LayerNorm" # alt value supported by TE: "RMSNorm"

  # MoE related
  moe_router_load_balancing_type: "aux_loss"
  moe_router_topk: 2
-  moe_router_topk_limited_devices: null
+  moe_router_group_topk: null
+  moe_router_num_groups: null
  moe_grouped_gemm: False
  moe_aux_loss_coeff: 0 # 1e-2 would be a good start value for load balance loss.
  moe_z_loss_coeff: null # 1e-3 would be a good start value for z-loss
  moe_input_jitter_eps: null
  moe_token_dropping: False

model_parallel:
  # Model parallelism
  tensor_model_parallel_size: 1
  context_parallel_size: 1
  pipeline_model_parallel_size: 1
  virtual_pipeline_model_parallel_size: null
  sequence_parallel: True
  expert_model_parallel_size: 1

  # Initialization
  perform_initialization: True
  use_cpu_initialization: null

  # Training
  fp16: False
  bf16: True
  params_dtype: null # Set from above arguments for core
  timers: null

  # Optimizations
  gradient_accumulation_fusion: True
  async_tensor_model_parallel_allreduce: True
  tp_comm_overlap: False

  # Debug Options
  tp_comm_split_ag: True
  tp_comm_atomic_ag: True
  tp_comm_split_rs: True
  tp_comm_atomic_rs: True
  tp_comm_bulk_wgrad: True
  tp_comm_bulk_dgrad: True

  # Parallelism
  finalize_model_grads_func: null

  # Pipeline Parallel
  pipeline_dtype: null
  grad_scale_func: null
  enable_autocast: False
  autocast_dtype: null
  variable_seq_lengths: False
  num_microbatches_with_partial_activation_checkpoints: null
  overlap_p2p_comm: False
  batch_p2p_comm: True
  batch_p2p_sync: True
  use_ring_exchange_p2p: False
  deallocate_pipeline_outputs: False
  no_sync_func: null
  grad_sync_func: null
  param_sync_func: null
  pipeline_model_parallel_split_rank: null

  # CPU Offloading
  cpu_offloading: False
  cpu_offloading_num_layers: 0
  _cpu_offloading_context: null
  cpu_offloading_weights: False
  cpu_offloading_activations: True

  # Timing
  barrier_with_L1_time: True

# training:
use_legacy_models: False
spec: null
micro_batch_size: 2
global_batch_size: 128
rampup_batch_size: [32, 32, 65324160]
check_for_nan_in_loss_and_grad: True
num_layers_per_virtual_pipeline_stage: null

encoder_num_layers: null
decoder_num_layers: null
rotary_seq_len_interpolation_factor: null
add_position_embedding: False
make_vocab_size_divisible_by: 128
group_query_attention: False

exit_signal_handler: False
exit_duration_in_mins: null
exit_interval: null

untie_embeddings_and_output_weights: True
position_embedding_type: rope
rotary_percent: 0.5
openai_gelu: False
squared_relu: False
swiglu: True
onnx_safe: null
bert_binary_head: True
max_position_embeddings: 4096

transformer_impl: local
use_flash_attn: False
seed: 1234
data_parallel_random_init: False

# Optimizer
optimizer: adam
lr: 2.5e-4
lr_decay_style: cosine
lr_decay_iters: null
lr_decay_samples: 255126953
lr_warmup_fraction: null
lr_warmup_iters: 0
lr_warmup_samples: 81381
lr_warmup_init: 0.0
min_lr: 2.5e-5
weight_decay: 0.1
start_weight_decay: null
end_weight_decay: null
weight_decay_incr_style: constant
clip_grad: 1.0
adam_beta1: 0.9
adam_beta2: 0.95
adam_eps: 1.e-08
sgd_momentum: 0.9
override_opt_param_scheduler: False
use_checkpoint_opt_param_scheduler: False

# checkpointing arguments
save: null
save_interval: 20000
no_save_optim: null
no_save_rng: null
load: null
no_load_optim: null
no_load_rng: null
finetune: False
use_checkpoint_args: False
exit_on_missing_checkpoint: False

# loss arguments
loss_scale: null
initial_loss_scale: 4294967296
min_loss_scale: 1.0
loss_scale_window: 1000
hysteresis: 2
accumulate_allreduce_grads_in_fp32: False
fp16_lm_cross_entropy: False

# distributed arguments
distributed_backend: nccl
distributed_timeout_minutes: 10
overlap_grad_reduce: False
align_grad_reduce: True
overlap_param_gather: False
align_param_gather: False
scatter_gather_tensors_in_pipeline: True
local_rank: null
lazy_mpu_init: null
empty_unused_memory_level: 0
standalone_embedding_stage: False
use_distributed_optimizer: False
nccl_communicator_config_path: null

train_iters: null
eval_iters: 32
eval_interval: 2000
skip_train: False

adlr_autoresume: False
adlr_autoresume_interval: 1000

# garbage collection
manual_gc: False
manual_gc_interval: 0
manual_gc_eval: True

tp_comm_overlap_cfg: null

#data
data_path: null
split: '99,1,0'
train_data_path: null
valid_data_path: null
test_data_path: null
data_cache_path: null
mock_data: False
vocab_size: null
vocab_file: null
merge_file: null
vocab_extra_ids: 0
seq_length: 4096
encoder_seq_length: null
decoder_seq_length: null
retriever_seq_length: 256
sample_rate: 1.0
mask_prob: 0.15
short_seq_prob: 0.1
num_workers: 2
tokenizer_type: GPTSentencePieceTokenizer
tokenizer_model: null
reset_position_ids: False
reset_attention_mask: False
eod_mask_loss: False
train_samples: 268554688
dataloader_type: null

#profile:
profile: False
profile_ranks: [0]
profile_step_end: 12
profile_step_start: 10

#logging:
log_params_norm: True
log_num_zeros_in_grad: True
log_throughput: False
log_progress: False
timing_log_level: 0
timing_log_option: minmax
tensorboard_log_interval: 1
tensorboard_queue_size: 1000
log_timers_to_tensorboard: False
log_validation_ppl_to_tensorboard: False
log_memory_to_tensorboard: False
log_world_size_to_tensorboard: False
log_loss_scale_to_tensorboard: True
wandb_project: ''
wandb_exp_name: ''
wandb_save_dir: ''
enable_one_logger: True
one_logger_project: megatron-lm
one_logger_run_name: null
log_interval: 100
tensorboard_dir: null
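For a quick sanity check before wiring a config like the one above into a launch, a minimal sketch (the file name gpt_config.yaml is illustrative, and PyYAML is assumed to be installed):

# Minimal sketch: load the experimental YAML config and print a few fields.
# "gpt_config.yaml" is a hypothetical path for the config shown above.
import yaml

with open("gpt_config.yaml") as f:
    cfg = yaml.safe_load(f)

lm = cfg["language_model"]
mp = cfg["model_parallel"]
print("layers:", lm["num_layers"], "hidden:", lm["hidden_size"], "heads:", lm["num_attention_heads"])
print("tp:", mp["tensor_model_parallel_size"], "pp:", mp["pipeline_model_parallel_size"])
print("global batch:", cfg["global_batch_size"], "seq length:", cfg["seq_length"])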
@@ -7,10 +7,10 @@ do
    fi
done

mpirun -np 8 --allow-run-as-root \
-    train_GPT-MOE_567B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1
+    train_gpt_567B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1
wait

rm -rf CKPT
rm -rf mixtral_dataset/my-mixtral_text_document
\ No newline at end of file
@@ -7,13 +7,13 @@ do
    fi
done

-mpirun -np 16 --hostfile mixtralnodes \
+mpirun -np 512 --hostfile hostfile_gpt_567B \
    --allow-run-as-root \
    --bind-to none \
    --mca plm_rsh_no_tree_spawn 1 \
-    train_mixtral_8x7B_2nodes.sh node021 --profiling=$profiling > output.log 2>&1
+    train_gpt_567B_multinodes.sh node002 --profiling=$profiling > output.log 2>&1
wait

rm -rf CKPT
#rm -rf mixtral_dataset/my-mixtral_text_document
\ No newline at end of file
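The multi-node launch above relies on an OpenMPI hostfile (hostfile_gpt_567B) listing the worker nodes. A small sketch that writes one for 64 nodes with 8 slots each, matching mpirun -np 512; the node names are placeholders for the real cluster:

# Sketch: generate an OpenMPI-style hostfile for 64 nodes x 8 GPUs (= 512 ranks).
# Hostnames "node001".."node064" are hypothetical.
with open("hostfile_gpt_567B", "w") as f:
    for i in range(1, 65):
        f.write(f"node{i:03d} slots=8\n")  # slots=8: eight MPI ranks (one per GPU) per node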
#!/bin/bash
# Runs the "175B" parameter model

export CUDA_DEVICE_MAX_CONNECTIONS=1

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NUM_NODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))

CHECKPOINT_PATH=$1 #<Specify path>
TENSORBOARD_LOGS_PATH=$2 #<Specify path>
VOCAB_FILE=$3 #<Specify path to file>/gpt2-vocab.json
MERGE_FILE=$4 #<Specify path to file>/gpt2-merges.txt
DATA_PATH=$5 #<Specify path and file prefix>_text_document

DISTRIBUTED_ARGS=(
    --nproc_per_node $GPUS_PER_NODE
    --nnodes $NUM_NODES
    --master_addr $MASTER_ADDR
    --master_port $MASTER_PORT
)

GPT_MODEL_ARGS=(
    --num-layers 96
    --hidden-size 12288
    --num-attention-heads 96
    --seq-length 2048
    --max-position-embeddings 2048
    --attention-backend auto # Can use (flash/fused/unfused/local)
)

TRAINING_ARGS=(
    --micro-batch-size 1
    --global-batch-size 1536
    --rampup-batch-size 16 16 5859375
    --train-iters 500000
    --weight-decay 0.1
    --adam-beta1 0.9
    --adam-beta2 0.95
    --init-method-std 0.006
    --clip-grad 1.0
    --fp16
    --lr 6.0e-5
    --lr-decay-style cosine
    --min-lr 6.0e-6
    --lr-warmup-fraction .001
    --lr-decay-iters 430000
)

MODEL_PARALLEL_ARGS=(
    --tensor-model-parallel-size 8
    --pipeline-model-parallel-size 16
)

DATA_ARGS=(
    --data-path $DATA_PATH
    --vocab-file $VOCAB_FILE
    --merge-file $MERGE_FILE
    --split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
    --log-interval 100
    --save-interval 10000
    --eval-interval 1000
    --save $CHECKPOINT_PATH
    --load $CHECKPOINT_PATH
    --eval-iters 10
    --tensorboard-dir $TENSORBOARD_LOGS_PATH
)

torchrun ${DISTRIBUTED_ARGS[@]} pretrain_gpt.py \
    ${GPT_MODEL_ARGS[@]} \
    ${TRAINING_ARGS[@]} \
    ${MODEL_PARALLEL_ARGS[@]} \
    ${DATA_ARGS[@]} \
    ${EVAL_AND_LOGGING_ARGS[@]}
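As a quick consistency check on the parallel layout above, a short sketch of the arithmetic; the node count used here is an illustrative value, since the script ships with NUM_NODES=1 and expects it to be changed for multi-node runs:

# Tensor-parallel 8 x pipeline-parallel 16 needs at least 128 GPUs per model replica;
# whatever is left over in the world size becomes data parallelism.
TP, PP = 8, 16
GPUS_PER_NODE = 8
num_nodes = 16                      # example value, not the script default of 1
world_size = GPUS_PER_NODE * num_nodes
assert world_size % (TP * PP) == 0, "world size must be divisible by TP*PP"
dp = world_size // (TP * PP)
print(f"GPUs per model replica: {TP * PP}, data-parallel degree: {dp}")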
@@ -4,18 +4,23 @@ for para in $*
do
    if [[ $para == --profiling* ]];then
        profiling=${para#*=}
+       export GPU_FLUSH_ON_EXECUTION=1
+       export HIP_DIRECT_DISPATCH=0
    fi
done

+# Runs GPT 567B model
source /opt/dtk/env.sh
-# Runs Mixtral 8x7B model

+# default env
+CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
+MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
+export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH
+export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
+# nccl env
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
@@ -23,9 +28,10 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
-#export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
+export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
+# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
-export GLOG_minloglevel=3

RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
@@ -96,7 +102,6 @@ TRAINING_ARGS=(
    --bf16
    --overlap-param-gather
    --overlap-grad-reduce
-   #--tp-comm-overlap
)

TORCH_PROFIE_ARGS=(
@@ -104,18 +109,10 @@ TORCH_PROFIE_ARGS=(
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 3
    --profile-step-end 4
-   --profile-dir torch_prof_gpt_1nodes
+   --profile-dir torch_prof_gpt_1nodes_tp2-pp1-ep8-ep_tp1
    --use-pytorch-profiler
)

-HIP_PROFIE_ARGS=(
-   --profile
-   --profile-ranks 0 1 2 3 4 5 6 7
-   --profile-step-start 4
-   --profile-step-end 5
-   --use-hip-profiler
-)

MODEL_PARALLEL_ARGS=(
    --tensor-model-parallel-size 2
    --pipeline-model-parallel-size 1
@@ -157,10 +154,6 @@ APP="python3 -u pretrain_gpt.py \
if [[ $profiling == "torch" ]]; then
    APP+=" ${TORCH_PROFIE_ARGS[@]}"
-elif [[ $profiling == "hip" ]]; then
-    mkdir -p hip_prof_data
-    APP+=" ${HIP_PROFIE_ARGS[@]}"
-    APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi

#for hygon cpu
@@ -205,4 +198,4 @@ case ${LOCAL_RANK} in
        ${APP}
        #numactl --cpunodebind=7 --membind=7 ${APP}
        ;;
esac
\ No newline at end of file
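The --use-pytorch-profiler flags above hand the step window to Megatron's built-in profiling hooks. Purely as a standalone illustration of what that window corresponds to (not the script's actual code path), the same steps could be captured directly with torch.profiler; train_step below is a stand-in workload, not a real training iteration:

# Rough illustration of profiling the step range implied by
# --profile-step-start 3 / --profile-step-end 4 into the --profile-dir directory.
import torch
from torch.profiler import ProfilerActivity, profile, schedule, tensorboard_trace_handler

def train_step():
    # stand-in workload; in the real script this is one Megatron training iteration
    x = torch.randn(1024, 1024)
    (x @ x).sum().item()

prof = profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    schedule=schedule(wait=3, warmup=0, active=1),  # skip 3 steps, record the next one
    on_trace_ready=tensorboard_trace_handler("torch_prof_gpt_1nodes_tp2-pp1-ep8-ep_tp1"),
)
prof.start()
for step in range(10):
    train_step()
    prof.step()  # advance the profiler schedule once per training step
prof.stop()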
@@ -4,18 +4,23 @@ for para in $*
do
    if [[ $para == --profiling* ]];then
        profiling=${para#*=}
+       export GPU_FLUSH_ON_EXECUTION=1
+       export HIP_DIRECT_DISPATCH=0
    fi
done

+# Runs GPT 567B model
source /opt/dtk/env.sh
-# Runs Mixtral 8x7B model

+# default env
+CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
+MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
+export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH
+export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
+# nccl env
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
@@ -23,9 +28,10 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
-#export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
+export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
+# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
-export GLOG_minloglevel=3

RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
@@ -49,7 +55,7 @@ MODEL_ARGS=(
    --disable-bias-linear
    --seq-length 8192
    --max-position-embeddings 32768
-   --num-layers 64
+   --num-layers 32 #64
    --hidden-size 8192
    --ffn-hidden-size 32768
    --num-attention-heads 64
@@ -72,7 +78,7 @@ MOE_ARGS=(
    --moe-token-dispatcher-type alltoall
    --moe-expert-capacity-factor 0.5
    --moe-pad-expert-input-to-capacity
-   --moe-grouped-gemm
+   #--moe-grouped-gemm
)

DATA_ARGS=(
@@ -84,7 +90,7 @@ DATA_ARGS=(
TRAINING_ARGS=(
    --micro-batch-size 1
-   --global-batch-size 4096
+   --global-batch-size 1024
    --lr 1e-4
    --train-iters 10
    --lr-decay-iters 320000
@@ -96,7 +102,6 @@ TRAINING_ARGS=(
    --bf16
    --overlap-param-gather
    --overlap-grad-reduce
-   #--tp-comm-overlap
)

TORCH_PROFIE_ARGS=(
@@ -104,23 +109,16 @@ TORCH_PROFIE_ARGS=(
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 3
    --profile-step-end 4
-   --profile-dir torch_prof_gpt
+   --profile-dir torch_prof_gpt_64nodes_tp2-pp16-ep16-ep_tp1-cp2
    --use-pytorch-profiler
)

-HIP_PROFIE_ARGS=(
-   --profile
-   --profile-ranks 0 1 2 3 4 5 6 7
-   --profile-step-start 4
-   --profile-step-end 5
-   --use-hip-profiler
-)

MODEL_PARALLEL_ARGS=(
    --tensor-model-parallel-size 2
    --pipeline-model-parallel-size 16
    --expert-model-parallel-size 16
    --expert-tensor-parallel-size 1
+   --context-parallel-size 2
    --use-distributed-optimizer
    --sequence-parallel
)
@@ -157,10 +155,6 @@ APP="python3 -u pretrain_gpt.py \
if [[ $profiling == "torch" ]]; then
    APP+=" ${TORCH_PROFIE_ARGS[@]}"
-elif [[ $profiling == "hip" ]]; then
-    mkdir -p hip_prof_data
-    APP+=" ${HIP_PROFIE_ARGS[@]}"
-    APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi

#for hygon cpu
...
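A quick arithmetic check of the 512-rank layout used by this multi-node variant, with the numbers taken from the mpirun command, the profile directory name, and MODEL_PARALLEL_ARGS above:

# 64 nodes x 8 GPUs = 512 ranks; TP=2, PP=16, CP=2 leave an 8-way data-parallel dimension,
# and dp * cp = 16 lines up with the 16-way expert parallelism (expert-tensor-parallel-size 1).
world = 64 * 8
tp, pp, cp, ep = 2, 16, 2, 16
dp = world // (tp * pp * cp)
print("world size:", world)        # 512, matches mpirun -np 512
print("data-parallel size:", dp)   # 8
print("dp * cp:", dp * cp)         # 16, matches --expert-model-parallel-size 16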
import os
from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
    InferenceWrapperConfig,
)
from pretrain_gpt import model_provider
import torch
import sys
import time
import tqdm
import warnings
from argparse import Namespace
from megatron.core.inference.engines.abstract_engine import AbstractEngine
from megatron.core.inference.engines.mcore_engine import MCoreEngine
from megatron.core.inference.sampling_params import SamplingParams
from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import (
    GPTInferenceWrapper,
)
from megatron.core.inference.inference_request import InferenceRequest
from megatron.core.inference.text_generation_controllers.text_generation_controller import (
    TextGenerationController,
)
from megatron.core.transformer.module import MegatronModule

sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))
)

from megatron.training import get_args
from megatron.training import get_tokenizer
from megatron.training.checkpointing import load_checkpoint
from megatron.core import mpu
from megatron.training.initialize import initialize_megatron
from megatron.training import get_model
import asyncio
from typing import AsyncIterator, List


def add_text_generate_args(parser):
    """Text generation arguments."""
    group = parser.add_argument_group(title='text generation')

    group.add_argument("--temperature", type=float, default=1.0, help='Sampling temperature.')
    group.add_argument("--top_k", type=int, default=1, help='Top k sampling.')
    group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.')
    group.add_argument(
        "--return-log-probs",
        action='store_true',
        default=False,
        help='Return the log probabilities of the final output tokens',
    )
    group.add_argument(
        "--num-tokens-to-generate",
        type=int,
        default=30,
        help='Number of tokens to generate for each prompt',
    )
    group.add_argument(
        "--prompts",
        metavar='N',
        type=str,
        nargs='+',
        help='Input prompts with each prompt within quotes and separated by space',
    )
    group.add_argument(
        "--max-batch-size", type=int, default=8, dest="inference_max_requests",
        help='Max number of prompts to process at once'
    )
    group.add_argument("--stream", action="store_true", default=False, help="Stream output tokens")
    return parser


def get_inference_engine(args: Namespace, model: MegatronModule) -> AbstractEngine:
    """Utility to get the relevant backend for running inference

    This function will automatically choose the TRTLLMBackend when possible, and if not revert to the Mcore backend if the user does not specify any backend. The TRT LLM backend is not implemented yet.

    Args:
        args (Namespace): The user arguments parsed from command line
        model (MegatronModule): The megatron model.

    Returns:
        AbstractBackend: The chosen backend
    """
    tokenizer = get_tokenizer()

    inference_wrapper_config = InferenceWrapperConfig(
        hidden_size=args.hidden_size,
        inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold,
        fp32_residual_connection=args.fp32_residual_connection,
        params_dtype=args.params_dtype,
        padded_vocab_size=args.padded_vocab_size,
        inference_max_requests=args.inference_max_requests,
        inference_max_seq_length=args.inference_max_seq_length,
    )

    inference_wrapped_model = GPTInferenceWrapper(model, inference_wrapper_config)
    text_generation_controller = TextGenerationController(
        inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer
    )
    return MCoreEngine(text_generation_controller=text_generation_controller)


async def generate(
    inference_engine: MCoreEngine,
    sampling_params: SamplingParams,
    prompts: List[str],
) -> List[InferenceRequest]:
    async def collect_stream(prompt, request_id, stream_generator):
        print(f"Request {request_id}: {prompt}", end="", flush=True)
        prev_idx = 0
        async for output in stream_generator:
            print(output.generated_text[prev_idx:], end="", flush=True)
            prev_idx = len(output.generated_text)
        print()

    request_ids: List[str] = [
        inference_engine.add_request(
            prompt=prompt, inference_parameters=sampling_params, streaming=True
        )
        for prompt in prompts
    ]
    stream_generators = [inference_engine.get_stream_generator(request_id) for request_id in request_ids]

    tasks = [
        asyncio.create_task(collect_stream(prompt, request_id, stream_generator))
        for (prompt, request_id, stream_generator) in zip(prompts, request_ids, stream_generators)
    ]

    await inference_engine.run_engine_async()
    await asyncio.gather(*tasks)

    results: List[InferenceRequest] = [
        inference_engine.scheduler.completed_request_pool[request_id] for request_id in request_ids
    ]
    return results


def main():
    """Main program."""

    # Note: The default args passed here can be overwritten by using appropriate params (check arguments.py file)
    # Micro batch size is not needed to be set by user. (It is calculated based on inference-batch-times-seqlen-threshold argument)
    initialize_megatron(
        extra_args_provider=add_text_generate_args,
        args_defaults={
            'no_load_rng': True,
            'no_load_optim': True,
            'micro_batch_size': 1,
            'exit_on_missing_checkpoint': True,
        },
    )

    # Set up model and load checkpoint
    model = get_model(model_provider, wrap_with_ddp=False)
    load_checkpoint(model, None, None)
    model = model[0]

    args = get_args()

    inference_engine = get_inference_engine(args, model)

    sampling_params = SamplingParams(
        temperature=args.temperature,
        top_k=args.top_k,
        top_p=args.top_p,
        return_log_probs=args.return_log_probs,
        num_tokens_to_generate=args.num_tokens_to_generate,
    )

    if args.enable_cuda_graph:
        print(f"Running warmup for CUDA graphs...")
        inference_engine.generate(
            prompts=args.prompts, sampling_params=sampling_params
        )

    start_time = time.perf_counter()
    if args.stream:
        results: List[InferenceRequest] = asyncio.run(
            generate(inference_engine, sampling_params, args.prompts)
        )
    else:
        results: List[InferenceRequest] = inference_engine.generate(
            prompts=args.prompts, sampling_params=sampling_params,
        )
    end_time = time.perf_counter()
    latency = end_time - start_time

    if torch.distributed.get_rank() == 0:
        for idx, result in enumerate(results):
            print(f' \n------------- RESULT FOR PROMPT {idx} --------------- ')
            result = {
                'id': result.request_id,
                'input_prompt': result.prompt,
                'generated_text': result.generated_text,
                'generated_tokens': result.generated_tokens,
                'latency': latency,
            }
            print(result)

    torch.distributed.destroy_process_group()


if __name__ == "__main__":
    main()
#!/bin/bash
# This example will start serving the Llama3.1-8B model

export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_APPLY_QK_LAYER_SCALING=0

DISTRIBUTED_ARGS="--nproc_per_node 1 \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr 0.0.0.0 \
                  --master_port 6000"

# Ensure CHECKPOINT and TOKENIZER_MODEL are provided
if [ -z "$1" ] || [ -z "$2" ]; then
    echo "Error: You must provide CHECKPOINT and TOKENIZER_MODEL as command-line arguments."
    echo "Usage: $0 /path/to/checkpoint /path/to/tokenizer_model"
    exit 1
fi

# Assign command-line arguments to variables
CHECKPOINT=$1
TOKENIZER_MODEL=$2

pip install flask-restful

torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
    --use-checkpoint-args \
    --disable-bias-linear \
    --tokenizer-type HuggingFaceTokenizer \
    --tokenizer-model ${TOKENIZER_MODEL} \
    --transformer-impl transformer_engine \
    --normalization RMSNorm \
    --group-query-attention \
    --num-query-groups 8 \
    --no-masked-softmax-fusion \
    --attention-softmax-in-fp32 \
    --attention-dropout 0.0 \
    --hidden-dropout 0.0 \
    --untie-embeddings-and-output-weights \
    --position-embedding-type rope \
    --rotary-percent 1.0 \
    --rotary-base 500000 \
    --use-rope-scaling \
    --use-rotary-position-embeddings \
    --swiglu \
    --tensor-model-parallel-size 1 \
    --pipeline-model-parallel-size 1 \
    --num-layers 32 \
    --hidden-size 4096 \
    --ffn-hidden-size 14336 \
    --load ${CHECKPOINT} \
    --num-attention-heads 32 \
    --max-position-embeddings 131072 \
    --bf16 \
    --micro-batch-size 1 \
    --seq-length 8192
#!/bin/bash
# This example will start serving the Llama3-8B model

export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_APPLY_QK_LAYER_SCALING=0

DISTRIBUTED_ARGS="--nproc_per_node 1 \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr 0.0.0.0 \
                  --master_port 6000"

# Ensure CHECKPOINT and TOKENIZER_MODEL are provided
if [ -z "$1" ] || [ -z "$2" ]; then
    echo "Error: You must provide CHECKPOINT and TOKENIZER_MODEL as command-line arguments."
    echo "Usage: $0 /path/to/checkpoint /path/to/tokenizer_model"
    exit 1
fi

# Assign command-line arguments to variables
CHECKPOINT=$1
TOKENIZER_MODEL=$2

pip install flask-restful

torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
    --use-checkpoint-args \
    --disable-bias-linear \
    --tokenizer-type HuggingFaceTokenizer \
    --tokenizer-model ${TOKENIZER_MODEL} \
    --transformer-impl transformer_engine \
    --normalization RMSNorm \
    --group-query-attention \
    --num-query-groups 8 \
    --no-masked-softmax-fusion \
    --attention-softmax-in-fp32 \
    --attention-dropout 0.0 \
    --hidden-dropout 0.0 \
    --untie-embeddings-and-output-weights \
    --position-embedding-type rope \
    --rotary-percent 1.0 \
    --rotary-base 500000 \
    --use-rotary-position-embeddings \
    --swiglu \
    --tensor-model-parallel-size 1 \
    --pipeline-model-parallel-size 1 \
    --num-layers 32 \
    --hidden-size 4096 \
    --ffn-hidden-size 14336 \
    --load ${CHECKPOINT} \
    --num-attention-heads 32 \
    --max-position-embeddings 8192 \
    --bf16 \
    --micro-batch-size 1 \
    --seq-length 8192
#!/bin/bash
# This example will start serving the Mistral-7B-v0.3 model

export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1

DISTRIBUTED_ARGS="--nproc_per_node 1 \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr 0.0.0.0 \
                  --master_port 6000"

# Ensure CHECKPOINT and TOKENIZER_MODEL are provided
if [ -z "$1" ] || [ -z "$2" ]; then
    echo "Error: You must provide CHECKPOINT and TOKENIZER_MODEL as command-line arguments."
    echo "Usage: $0 /path/to/checkpoint /path/to/tokenizer_model"
    exit 1
fi

# Assign command-line arguments to variables
CHECKPOINT=$1
TOKENIZER_MODEL=$2

pip install flask-restful

torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
    --tokenizer-type HuggingFaceTokenizer \
    --tokenizer-model ${TOKENIZER_MODEL} \
    --use-checkpoint-args \
    --apply-layernorm-1p \
    --transformer-impl transformer_engine \
    --normalization RMSNorm \
    --group-query-attention \
    --num-query-groups 8 \
    --no-masked-softmax-fusion \
    --use-flash-attn \
    --untie-embeddings-and-output-weights \
    --disable-bias-linear \
    --position-embedding-type rope \
    --rotary-percent 1.0 \
    --rotary-base 1000000 \
    --swiglu \
    --ffn-hidden-size 14336 \
    --tensor-model-parallel-size 1 \
    --pipeline-model-parallel-size 1 \
    --num-layers 32 \
    --hidden-size 4096 \
    --load ${CHECKPOINT} \
    --num-attention-heads 32 \
    --max-position-embeddings 4096 \
    --bf16 \
    --micro-batch-size 1 \
    --seq-length 4096 \
    --seed 101
#!/bin/bash
# This example will start serving the 345M model.

DISTRIBUTED_ARGS="--nproc_per_node 1 \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"

CHECKPOINT=<Path to checkpoint (e.g /345m)>
VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>

export CUDA_DEVICE_MAX_CONNECTIONS=1

pip install flask-restful

torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
    --tensor-model-parallel-size 1 \
    --pipeline-model-parallel-size 1 \
    --num-layers 24 \
    --hidden-size 1024 \
    --load ${CHECKPOINT} \
    --num-attention-heads 16 \
    --max-position-embeddings 1024 \
    --tokenizer-type GPT2BPETokenizer \
    --fp16 \
    --micro-batch-size 1 \
    --seq-length 1024 \
    --vocab-file $VOCAB_FILE \
    --merge-file $MERGE_FILE \
    --seed 42
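Once one of these server scripts is running, it prints the URL to query. A minimal client sketch, assuming the default Flask port 5000 and the /api route used by Megatron's text generation server (adjust the address to whatever the server actually prints):

# Minimal client for the text generation server started above.
import json
import urllib.request

payload = json.dumps({"prompts": ["Hello, my name is"], "tokens_to_generate": 32}).encode()
req = urllib.request.Request(
    "http://localhost:5000/api",            # assumed default address; use the URL the server prints
    data=payload,
    headers={"Content-Type": "application/json; charset=UTF-8"},
    method="PUT",
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read()))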
#!/bin/bash
# This example will start serving the 345M model that is partitioned 8 way tensor parallel

DISTRIBUTED_ARGS="--nproc_per_node 8 \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"

CHECKPOINT=<Path to checkpoint (e.g /345m)>
VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>

pip install flask-restful

python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
    --tensor-model-parallel-size 8 \
    --pipeline-model-parallel-size 1 \
    --num-layers 24 \
    --hidden-size 1024 \
    --load ${CHECKPOINT} \
    --num-attention-heads 16 \
    --max-position-embeddings 1024 \
    --tokenizer-type GPT2BPETokenizer \
    --fp16 \
    --micro-batch-size 1 \
    --seq-length 1024 \
    --vocab-file $VOCAB_FILE \
    --merge-file $MERGE_FILE \
    --seed 42
#!/bin/bash
# Use: ./run_text_gen_server_8b.sh <checkpoint-path> <tokenizer-path>
# To launch the client: python ../../tools/text_generation_cli.py <URL-provided-by-server>

CHECKPOINT_PATH=$1
TOKENIZER_PATH=$2

DISTRIBUTED_ARGS="--nproc_per_node 1 \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"

export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_TIMEOUT=19
export NCCL_IB_QPS_PER_CONNECTION=4

export TRITON_CACHE_DIR="./triton-cache/"
export TRITON_CACHE_MANAGER="megatron.core.ssm.triton_cache_manager:ParallelFileCacheManager"

torchrun $DISTRIBUTED_ARGS ../../tools/run_mamba_text_generation_server.py \
    --tensor-model-parallel-size 1 \
    --pipeline-model-parallel-size 1 \
    --untie-embeddings-and-output-weights \
    --num-layers 56 \
    --hidden-size 4096 \
    --load ${CHECKPOINT_PATH} \
    --num-attention-heads 32 \
    --group-query-attention \
    --num-query-groups 8 \
    --hybrid-attention-ratio 0.08 \
    --hybrid-mlp-ratio 0.5 \
    --attention-dropout 0.0 \
    --hidden-dropout 0.0 \
    --disable-bias-linear \
    --normalization RMSNorm \
    --seq-length 4096 \
    --max-position-embeddings 4096 \
    --position-embedding-type none \
    --tokenizer-type GPTSentencePieceTokenizer \
    --tokenizer-model ${TOKENIZER_PATH} \
    --distributed-backend nccl \
    --distributed-timeout-minutes 1440 \
    --bf16 \
    --micro-batch-size 1 \
    --use-mcore-models \
    --spec megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec \
    --seed 42