更新代码

688448db · silencealiang · a02a5490 · 688448db · 688448db · 688448db
Commit 688448db authored Mar 14, 2025 by silencealiang
20 changed files
--- a/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama2_7b.sh
+++ b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama2_7b.sh
--- a/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh
+++ b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh
--- a/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh
+++ b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh
--- a/examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b.sh
+++ b/examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b.sh
--- a/examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh
+++ b/examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh
--- a/examples/export/ptq_and_trtllm_export/ptq_trtllm_mixtral_8x7b.sh
+++ b/examples/export/ptq_and_trtllm_export/ptq_trtllm_mixtral_8x7b.sh
--- a/examples/gpt3/gpt_config.yaml
+++ b/examples/gpt3/gpt_config.yaml
-# WARNING: Yaml configs is currently an experimental feature
-language_model:
-  # model architecture
-  num_layers: 24
-  hidden_size: 1024
-  num_attention_heads: 16
-  num_query_groups: null
-
-  ffn_hidden_size: null
-  kv_channels: null
-  hidden_dropout: 0.0
-  attention_dropout: 0.0
-  fp32_residual_connection: False
-
-  apply_residual_connection_post_layernorm: False
-  layernorm_epsilon: 1.e-5
-  layernorm_zero_centered_gamma: True
-  add_bias_linear: False
-  bias_activation_fusion: False
-  add_qkv_bias: False
-  gated_linear_unit: False
-  activation_func: swiglu
-  num_moe_experts: null
-  rotary_interleaved: False
-  window_size: null
-
-  # initialization
-  init_method: null
-  init_method_std: 0.02
-  output_layer_init_method: null
-
-  # mixed-precision
-  apply_query_key_layer_scaling: False
-  attention_softmax_in_fp32: False
-
-  # fusion
-  bias_swiglu_fusion: True
-  masked_softmax_fusion: True
-  persist_layer_norm: False
-  memory_efficient_layer_norm: False
-  bias_dropout_fusion: True
-  apply_rope_fusion: True
-
-  # activation recomputation
-  recompute_granularity: null
-  recompute_method: null
-  recompute_num_layers: null
-  distribute_saved_activations: null
-
-  # fp8 related
-  fp8: null
-  fp8_margin: 0
-  fp8_interval: 1
-  fp8_amax_history_len: 1
-  fp8_amax_compute_algo: "most_recent"
-  fp8_wgrad: True
-
-  # miscellaneous
-  clone_scatter_output_in_embedding: True
-
-  normalization: "LayerNorm"  # alt value supported by TE: "RMSNorm"
-
-  # MoE related
-  moe_router_load_balancing_type: "aux_loss"
-  moe_router_topk: 2
-  moe_router_topk_limited_devices: null
-  moe_grouped_gemm: False
-  moe_aux_loss_coeff: 0  # 1e-2 would be a good start value for load balance loss.
-  moe_z_loss_coeff: null  # 1e-3 would be a good start value for z-loss
-  moe_input_jitter_eps: null
-  moe_token_dropping: False
-
-model_parallel:
-  # Model parallelism
-  tensor_model_parallel_size: 1
-  context_parallel_size: 1
-  pipeline_model_parallel_size: 1
-  virtual_pipeline_model_parallel_size: null
-  sequence_parallel: True
-  expert_model_parallel_size: 1
-
-  # Initialization
-  perform_initialization: True
-  use_cpu_initialization: null
-
-  # Training
-  fp16: False
-  bf16: True
-  params_dtype: null # Set from above arguments for core
-  timers: null
-
-  # Optimizations
-  gradient_accumulation_fusion: True
-  async_tensor_model_parallel_allreduce: True
-  tp_comm_overlap: False
-
-  # Debug Options
-  tp_comm_split_ag: True
-  tp_comm_atomic_ag: True
-  tp_comm_split_rs: True
-  tp_comm_atomic_rs: True
-  tp_comm_bulk_wgrad: True
-  tp_comm_bulk_dgrad: True
-
-  # Parallelism
-  finalize_model_grads_func: null
-
-  # Pipeline Parallel
-  pipeline_dtype: null
-  grad_scale_func: null
-  enable_autocast: False
-  autocast_dtype: null
-  variable_seq_lengths: False
-  num_microbatches_with_partial_activation_checkpoints: null
-  overlap_p2p_comm: False
-  batch_p2p_comm: True
-  batch_p2p_sync: True
-  use_ring_exchange_p2p: False
-  deallocate_pipeline_outputs: False
-  no_sync_func: null
-  grad_sync_func: null
-  param_sync_func: null
-  pipeline_model_parallel_split_rank: null
-
-  # CPU Offloading
-  cpu_offloading: False
-  cpu_offloading_num_layers: 0
-  _cpu_offloading_context: null
-  cpu_offloading_weights: False
-  cpu_offloading_activations: True
-
-  # Timing
-  barrier_with_L1_time: True
-
-# training:
-use_legacy_models: False
-spec: null
-micro_batch_size: 2
-global_batch_size: 128
-rampup_batch_size: [32, 32, 65324160] 
-check_for_nan_in_loss_and_grad: True
-num_layers_per_virtual_pipeline_stage: null
-
-encoder_num_layers: null
-decoder_num_layers: null
-rotary_seq_len_interpolation_factor: null
-add_position_embedding: False
-make_vocab_size_divisible_by: 128
-group_query_attention: False
-
-
-exit_signal_handler: False
-exit_duration_in_mins: null
-exit_interval: null
-
-untie_embeddings_and_output_weights: True
-position_embedding_type: rope
-rotary_percent: 0.5
-openai_gelu: False
-squared_relu: False
-swiglu: True
-onnx_safe: null
-bert_binary_head: True
-max_position_embeddings: 4096
-
-transformer_impl: local
-use_flash_attn: False
-seed: 1234
-data_parallel_random_init: False
-
-# Optimizer
-optimizer: adam
-lr: 2.5e-4
-lr_decay_style: cosine
-lr_decay_iters: null
-lr_decay_samples: 255126953
-lr_warmup_fraction: null
-lr_warmup_iters: 0
-lr_warmup_samples: 81381
-lr_warmup_init: 0.0
-min_lr: 2.5e-5
-weight_decay: 0.1
-start_weight_decay: null
-end_weight_decay: null
-weight_decay_incr_style: constant
-clip_grad: 1.0
-adam_beta1: 0.9
-adam_beta2: 0.95
-adam_eps: 1.e-08
-sgd_momentum: 0.9
-override_opt_param_scheduler: False
-use_checkpoint_opt_param_scheduler: False
-
-# checkpointing arguments
-save: null
-save_interval: 20000
-no_save_optim: null
-no_save_rng: null
-load: null
-no_load_optim: null
-no_load_rng: null
-finetune: False
-use_checkpoint_args: False
-exit_on_missing_checkpoint: False
-
-# loss arguments
-loss_scale: null
-initial_loss_scale: 4294967296
-min_loss_scale: 1.0
-loss_scale_window: 1000 
-hysteresis: 2
-accumulate_allreduce_grads_in_fp32: False
-fp16_lm_cross_entropy: False
-
-# distributed arguments
-distributed_backend: nccl
-distributed_timeout_minutes: 10
-overlap_grad_reduce: False
-align_grad_reduce: True
-overlap_param_gather: False
-align_param_gather: False
-scatter_gather_tensors_in_pipeline: True
-local_rank: null
-lazy_mpu_init: null
-empty_unused_memory_level: 0
-standalone_embedding_stage: False
-use_distributed_optimizer: False
-nccl_communicator_config_path: null
-
-train_iters: null
-eval_iters: 32
-eval_interval: 2000
-skip_train: False
-
-adlr_autoresume: False
-adlr_autoresume_interval: 1000
-
-# garbage collection
-manual_gc: False
-manual_gc_interval: 0
-manual_gc_eval: True
-
-tp_comm_overlap_cfg: null
-
-#data
-data_path: null
-split: '99,1,0'
-train_data_path: null
-valid_data_path: null
-test_data_path: null
-data_cache_path: null
-mock_data: False
-vocab_size: null
-vocab_file: null
-merge_file: null
-vocab_extra_ids: 0
-seq_length: 4096
-encoder_seq_length: null
-decoder_seq_length: null
-retriever_seq_length: 256
-sample_rate: 1.0
-mask_prob: 0.15
-short_seq_prob: 0.1
-num_workers: 2
-tokenizer_type: GPTSentencePieceTokenizer
-tokenizer_model: null
-reset_position_ids: False
-reset_attention_mask: False
-eod_mask_loss: False
-train_samples: 268554688
-dataloader_type: null
-
-#profile:
-profile: False
-profile_ranks: [0]
-profile_step_end: 12
-profile_step_start: 10
-
-#logging:
-log_params_norm: True
-log_num_zeros_in_grad: True
-log_throughput: False
-log_progress: False
-timing_log_level: 0
-timing_log_option: minmax
-tensorboard_log_interval: 1
-tensorboard_queue_size: 1000
-log_timers_to_tensorboard: False
-log_validation_ppl_to_tensorboard: False
-log_memory_to_tensorboard: False
-log_world_size_to_tensorboard: False
-log_loss_scale_to_tensorboard: True
-wandb_project: ''
-wandb_exp_name: ''
-wandb_save_dir: ''
-enable_one_logger: True
-one_logger_project: megatron-lm
-one_logger_run_name: null
-log_interval: 100
-tensorboard_dir: null
+# WARNING: YAML configs are currently an experimental feature
+language_model:
+  # model architecture
+  num_layers: 24
+  hidden_size: 1024
+  num_attention_heads: 16
+  num_query_groups: null
+
+  ffn_hidden_size: null
+  kv_channels: null
+  hidden_dropout: 0.0
+  attention_dropout: 0.0
+  fp32_residual_connection: False
+
+  apply_residual_connection_post_layernorm: False
+  layernorm_epsilon: 1.e-5
+  layernorm_zero_centered_gamma: True
+  add_bias_linear: False
+  bias_activation_fusion: False
+  add_qkv_bias: False
+  gated_linear_unit: False
+  activation_func: swiglu
+  num_moe_experts: null
+  rotary_interleaved: False
+  window_size: null
+
+  # initialization
+  init_method: null
+  init_method_std: 0.02
+  output_layer_init_method: null
+
+  # mixed-precision
+  apply_query_key_layer_scaling: False
+  attention_softmax_in_fp32: False
+
+  # fusion
+  bias_swiglu_fusion: True
+  masked_softmax_fusion: True
+  persist_layer_norm: False
+  memory_efficient_layer_norm: False
+  bias_dropout_fusion: True
+  apply_rope_fusion: True
+
+  # activation recomputation
+  recompute_granularity: null
+  recompute_method: null
+  recompute_num_layers: null
+  distribute_saved_activations: null
+
+  # fp8 related
+  fp8: null
+  fp8_margin: 0
+  fp8_interval: 1
+  fp8_amax_history_len: 1
+  fp8_amax_compute_algo: "most_recent"
+  fp8_wgrad: True
+
+  # miscellaneous
+  clone_scatter_output_in_embedding: True
+
+  normalization: "LayerNorm"  # alt value supported by TE: "RMSNorm"
+
+  # MoE related
+  moe_router_load_balancing_type: "aux_loss"
+  moe_router_topk: 2
+  moe_router_group_topk: null
+  moe_router_num_groups: null
+  moe_grouped_gemm: False
+  moe_aux_loss_coeff: 0  # 1e-2 would be a good start value for load balance loss.
+  moe_z_loss_coeff: null  # 1e-3 would be a good start value for z-loss
+  moe_input_jitter_eps: null
+  moe_token_dropping: False
+
+model_parallel:
+  # Model parallelism
+  tensor_model_parallel_size: 1
+  context_parallel_size: 1
+  pipeline_model_parallel_size: 1
+  virtual_pipeline_model_parallel_size: null
+  sequence_parallel: True
+  expert_model_parallel_size: 1
+
+  # Initialization
+  perform_initialization: True
+  use_cpu_initialization: null
+
+  # Training
+  fp16: False
+  bf16: True
+  params_dtype: null # Set from above arguments for core
+  timers: null
+
+  # Optimizations
+  gradient_accumulation_fusion: True
+  async_tensor_model_parallel_allreduce: True
+  tp_comm_overlap: False
+
+  # Debug Options
+  tp_comm_split_ag: True
+  tp_comm_atomic_ag: True
+  tp_comm_split_rs: True
+  tp_comm_atomic_rs: True
+  tp_comm_bulk_wgrad: True
+  tp_comm_bulk_dgrad: True
+
+  # Parallelism
+  finalize_model_grads_func: null
+
+  # Pipeline Parallel
+  pipeline_dtype: null
+  grad_scale_func: null
+  enable_autocast: False
+  autocast_dtype: null
+  variable_seq_lengths: False
+  num_microbatches_with_partial_activation_checkpoints: null
+  overlap_p2p_comm: False
+  batch_p2p_comm: True
+  batch_p2p_sync: True
+  use_ring_exchange_p2p: False
+  deallocate_pipeline_outputs: False
+  no_sync_func: null
+  grad_sync_func: null
+  param_sync_func: null
+  pipeline_model_parallel_split_rank: null
+
+  # CPU Offloading
+  cpu_offloading: False
+  cpu_offloading_num_layers: 0
+  _cpu_offloading_context: null
+  cpu_offloading_weights: False
+  cpu_offloading_activations: True
+
+  # Timing
+  barrier_with_L1_time: True
+
+# training:
+use_legacy_models: False
+spec: null
+micro_batch_size: 2
+global_batch_size: 128
+rampup_batch_size: [32, 32, 65324160] 
+check_for_nan_in_loss_and_grad: True
+num_layers_per_virtual_pipeline_stage: null
+
+encoder_num_layers: null
+decoder_num_layers: null
+rotary_seq_len_interpolation_factor: null
+add_position_embedding: False
+make_vocab_size_divisible_by: 128
+group_query_attention: False
+
+
+exit_signal_handler: False
+exit_duration_in_mins: null
+exit_interval: null
+
+untie_embeddings_and_output_weights: True
+position_embedding_type: rope
+rotary_percent: 0.5
+openai_gelu: False
+squared_relu: False
+swiglu: True
+onnx_safe: null
+bert_binary_head: True
+max_position_embeddings: 4096
+
+transformer_impl: local
+use_flash_attn: False
+seed: 1234
+data_parallel_random_init: False
+
+# Optimizer
+optimizer: adam
+lr: 2.5e-4
+lr_decay_style: cosine
+lr_decay_iters: null
+lr_decay_samples: 255126953
+lr_warmup_fraction: null
+lr_warmup_iters: 0
+lr_warmup_samples: 81381
+lr_warmup_init: 0.0
+min_lr: 2.5e-5
+weight_decay: 0.1
+start_weight_decay: null
+end_weight_decay: null
+weight_decay_incr_style: constant
+clip_grad: 1.0
+adam_beta1: 0.9
+adam_beta2: 0.95
+adam_eps: 1.e-08
+sgd_momentum: 0.9
+override_opt_param_scheduler: False
+use_checkpoint_opt_param_scheduler: False
+
+# checkpointing arguments
+save: null
+save_interval: 20000
+no_save_optim: null
+no_save_rng: null
+load: null
+no_load_optim: null
+no_load_rng: null
+finetune: False
+use_checkpoint_args: False
+exit_on_missing_checkpoint: False
+
+# loss arguments
+loss_scale: null
+initial_loss_scale: 4294967296
+min_loss_scale: 1.0
+loss_scale_window: 1000 
+hysteresis: 2
+accumulate_allreduce_grads_in_fp32: False
+fp16_lm_cross_entropy: False
+
+# distributed arguments
+distributed_backend: nccl
+distributed_timeout_minutes: 10
+overlap_grad_reduce: False
+align_grad_reduce: True
+overlap_param_gather: False
+align_param_gather: False
+scatter_gather_tensors_in_pipeline: True
+local_rank: null
+lazy_mpu_init: null
+empty_unused_memory_level: 0
+standalone_embedding_stage: False
+use_distributed_optimizer: False
+nccl_communicator_config_path: null
+
+train_iters: null
+eval_iters: 32
+eval_interval: 2000
+skip_train: False
+
+adlr_autoresume: False
+adlr_autoresume_interval: 1000
+
+# garbage collection
+manual_gc: False
+manual_gc_interval: 0
+manual_gc_eval: True
+
+tp_comm_overlap_cfg: null
+
+#data
+data_path: null
+split: '99,1,0'
+train_data_path: null
+valid_data_path: null
+test_data_path: null
+data_cache_path: null
+mock_data: False
+vocab_size: null
+vocab_file: null
+merge_file: null
+vocab_extra_ids: 0
+seq_length: 4096
+encoder_seq_length: null
+decoder_seq_length: null
+retriever_seq_length: 256
+sample_rate: 1.0
+mask_prob: 0.15
+short_seq_prob: 0.1
+num_workers: 2
+tokenizer_type: GPTSentencePieceTokenizer
+tokenizer_model: null
+reset_position_ids: False
+reset_attention_mask: False
+eod_mask_loss: False
+train_samples: 268554688
+dataloader_type: null
+
+#profile:
+profile: False
+profile_ranks: [0]
+profile_step_end: 12
+profile_step_start: 10
+
+#logging:
+log_params_norm: True
+log_num_zeros_in_grad: True
+log_throughput: False
+log_progress: False
+timing_log_level: 0
+timing_log_option: minmax
+tensorboard_log_interval: 1
+tensorboard_queue_size: 1000
+log_timers_to_tensorboard: False
+log_validation_ppl_to_tensorboard: False
+log_memory_to_tensorboard: False
+log_world_size_to_tensorboard: False
+log_loss_scale_to_tensorboard: True
+wandb_project: ''
+wandb_exp_name: ''
+wandb_save_dir: ''
+enable_one_logger: True
+one_logger_project: megatron-lm
+one_logger_run_name: null
+log_interval: 100
+tensorboard_dir: null
--- a/examples/gpt3/hostfile_gpt_567B
+++ b/examples/gpt3/hostfile_gpt_567B
--- a/run_GPT-MOE_1nodes.sh
+++ b/run_GPT-MOE_1nodes.sh
@@ -7,10 +7,10 @@ do
    fi
 done

-mpirun -np 8 --allow-run-as-root \
-             train_GPT-MOE_567B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1
+mpirun -np 8  --allow-run-as-root \
+              train_gpt_567B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1

 wait

 rm -rf CKPT
-rm -rf mixtral_dataset/my-mixtral_text_document
\ No newline at end of file
+rm -rf mixtral_dataset/my-mixtral_text_document
--- a/run_mixtral8x7B_2nodes.sh
+++ b/run_mixtral8x7B_2nodes.sh
@@ -7,13 +7,13 @@ do
    fi
 done

-mpirun -np 16 --hostfile mixtralnodes \
+mpirun -np 512 --hostfile hostfile_gpt_567B \
              --allow-run-as-root \
              --bind-to none \
              --mca plm_rsh_no_tree_spawn 1 \
-              train_mixtral_8x7B_2nodes.sh node021 --profiling=$profiling > output.log 2>&1
+              train_gpt_567B_multinodes.sh node002 --profiling=$profiling > output.log 2>&1

 wait

 rm -rf CKPT
-#rm -rf mixtral_dataset/my-mixtral_text_document
\ No newline at end of file
+#rm -rf mixtral_dataset/my-mixtral_text_document
--- a/examples/gpt3/train_gpt3_175b_distributed.sh
+++ b/examples/gpt3/train_gpt3_175b_distributed.sh
--- a/train_GPT-MOE_567B_1nodes.sh
+++ b/train_GPT-MOE_567B_1nodes.sh
@@ -4,18 +4,23 @@ for para in $*
 do
    if [[ $para == --profiling* ]];then
        profiling=${para#*=}
-        export GPU_FLUSH_ON_EXECUTION=1
-        export HIP_DIRECT_DISPATCH=0
    fi
 done

+# Runs GPT 567B model
 source /opt/dtk/env.sh
-# Runs Mixtral 8x7B model
+
+# default env
+CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
+MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
+export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH
+export GLOG_minloglevel=3
 export CUDA_DEVICE_MAX_CONNECTIONS=1
 export HSA_FORCE_FINE_GRAIN_PCIE=1
 export OMP_NUM_THREADS=1
 export GPU_MAX_HW_QUEUES=10

+# nccl env
 export NCCL_ALGO=Ring
 export NCCL_MIN_NCHANNELS=32
 export NCCL_MAX_NCHANNELS=32
@@ -23,9 +28,10 @@ export NCCL_NET_GDR_LEVEL=7
 export NCCL_NET_GDR_READ=1
 export RCCL_SDMA_COPY_ENABLE=0
 export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
-#export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
+export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
+
+# enable BatchLinear
 export GROUPED_GEMM_BatchLinear=1
-export GLOG_minloglevel=3

 RANK=$OMPI_COMM_WORLD_RANK
 LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
@@ -96,7 +102,6 @@ TRAINING_ARGS=(
    --bf16
    --overlap-param-gather
    --overlap-grad-reduce
-    #--tp-comm-overlap
 )

 TORCH_PROFIE_ARGS=(
@@ -104,18 +109,10 @@ TORCH_PROFIE_ARGS=(
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 3
    --profile-step-end 4
-    --profile-dir torch_prof_gpt_1nodes
+    --profile-dir torch_prof_gpt_1nodes_tp2-pp1-ep8-ep_tp1
    --use-pytorch-profiler
 )

-HIP_PROFIE_ARGS=(
-    --profile
-    --profile-ranks 0 1 2 3 4 5 6 7
-    --profile-step-start 4
-    --profile-step-end 5
-    --use-hip-profiler
-)
-
 MODEL_PARALLEL_ARGS=(
    --tensor-model-parallel-size 2
    --pipeline-model-parallel-size 1
@@ -157,10 +154,6 @@ APP="python3 -u pretrain_gpt.py \

 if [[ $profiling == "torch" ]]; then
    APP+=" ${TORCH_PROFIE_ARGS[@]}"
-elif [[ $profiling == "hip" ]]; then
-    mkdir -p hip_prof_data
-    APP+=" ${HIP_PROFIE_ARGS[@]}"
-    APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
 fi

 #for hygon cpu
@@ -205,4 +198,4 @@ case ${LOCAL_RANK} in
  ${APP}
  #numactl --cpunodebind=7 --membind=7 ${APP}
  ;;
-esac
\ No newline at end of file
+esac
--- a/train_GPT-MOE_567B.sh
+++ b/train_GPT-MOE_567B.sh
--- a/examples/inference/gpt/gpt_batch_inference.py
+++ b/examples/inference/gpt/gpt_batch_inference.py
--- a/examples/inference/llama_mistral/run_text_generation_llama3.1.sh
+++ b/examples/inference/llama_mistral/run_text_generation_llama3.1.sh
--- a/examples/inference/llama_mistral/run_text_generation_llama3.sh
+++ b/examples/inference/llama_mistral/run_text_generation_llama3.sh
--- a/examples/inference/llama_mistral/run_text_generation_mistral.sh
+++ b/examples/inference/llama_mistral/run_text_generation_mistral.sh
--- a/examples/inference/run_text_generation_server_345M.sh
+++ b/examples/inference/run_text_generation_server_345M.sh
--- a/examples/inference/run_text_generation_server_345M_8_tensor_parallel.sh
+++ b/examples/inference/run_text_generation_server_345M_8_tensor_parallel.sh
--- a/examples/mamba/run_text_gen_server_8b.sh
+++ b/examples/mamba/run_text_gen_server_8b.sh