Commit 688448db authored by silencealiang

Update code

parent a02a5490
Pipeline #2503 passed with stage
File mode changed from 100644 to 100755 (6 files)
......@@ -63,7 +63,8 @@ language_model:
# MoE related
moe_router_load_balancing_type: "aux_loss"
moe_router_topk: 2
moe_router_topk_limited_devices: null
moe_router_group_topk: null
moe_router_num_groups: null
moe_grouped_gemm: False
moe_aux_loss_coeff: 0 # 1e-2 would be a good start value for load balance loss.
moe_z_loss_coeff: null # 1e-3 would be a good start value for z-loss
......
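The hunk above swaps the old `moe_router_topk_limited_devices` knob for `moe_router_group_topk` / `moe_router_num_groups`, i.e. group-limited routing: experts are split into `moe_router_num_groups` groups, each token is first restricted to its best `moe_router_group_topk` groups, and only then are the final `moe_router_topk` experts chosen. A minimal PyTorch sketch of that selection step (illustrative only, not Megatron-Core's router code; the shapes and default values here are assumptions):

```python
# Minimal sketch of group-limited top-k routing (illustrative only; not
# Megatron-Core's implementation). Experts are split into `num_groups` groups;
# each token first keeps its best `group_topk` groups, then the final top-k
# experts are picked only inside those groups.
import torch

def group_limited_topk(logits, topk=2, num_groups=8, group_topk=4):
    num_tokens, num_experts = logits.shape
    group_size = num_experts // num_groups
    # Score each group by its best expert logit for every token.
    group_scores = logits.view(num_tokens, num_groups, group_size).max(dim=-1).values
    top_groups = group_scores.topk(group_topk, dim=-1).indices
    # Mask out experts whose group was not selected.
    group_mask = torch.zeros_like(group_scores).scatter_(1, top_groups, 1.0)
    expert_mask = group_mask.unsqueeze(-1).expand(-1, -1, group_size).reshape(num_tokens, num_experts)
    masked_logits = logits.masked_fill(expert_mask == 0, float("-inf"))
    scores, indices = masked_logits.topk(topk, dim=-1)
    return torch.softmax(scores, dim=-1), indices

# 32 experts in 8 groups of 4; each token routes to 2 experts from its 4 best groups.
probs, experts = group_limited_topk(torch.randn(4, 32))
```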
......@@ -8,7 +8,7 @@ do
done
mpirun -np 8 --allow-run-as-root \
train_GPT-MOE_567B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1
train_gpt_567B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1
wait
......
......@@ -7,11 +7,11 @@ do
fi
done
mpirun -np 16 --hostfile mixtralnodes \
mpirun -np 512 --hostfile hostfile_gpt_567B \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
train_mixtral_8x7B_2nodes.sh node021 --profiling=$profiling > output.log 2>&1
train_gpt_567B_multinodes.sh node002 --profiling=$profiling > output.log 2>&1
wait
......
......@@ -4,18 +4,23 @@ for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
export GPU_FLUSH_ON_EXECUTION=1
export HIP_DIRECT_DISPATCH=0
fi
done
# Runs GPT 567B model
source /opt/dtk/env.sh
# Runs Mixtral 8x7B model
# default env
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH
export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
# nccl env
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
......@@ -23,9 +28,10 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
#export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
export GLOG_minloglevel=3
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
......@@ -96,7 +102,6 @@ TRAINING_ARGS=(
--bf16
--overlap-param-gather
--overlap-grad-reduce
#--tp-comm-overlap
)
TORCH_PROFIE_ARGS=(
......@@ -104,18 +109,10 @@ TORCH_PROFIE_ARGS=(
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_gpt_1nodes
--profile-dir torch_prof_gpt_1nodes_tp2-pp1-ep8-ep_tp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 2
--pipeline-model-parallel-size 1
......@@ -157,10 +154,6 @@ APP="python3 -u pretrain_gpt.py \
if [[ $profiling == "torch" ]]; then
APP+=" ${TORCH_PROFIE_ARGS[@]}"
elif [[ $profiling == "hip" ]]; then
mkdir -p hip_prof_data
APP+=" ${HIP_PROFIE_ARGS[@]}"
APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi
#for hygon cpu
......
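For reference, the `--profiling=torch` path above simply enables Megatron's built-in PyTorch profiler for ranks 0-7 between step 3 and step 4 and writes the trace into the `--profile-dir` directory; the separate hipprof wrapper is removed by this commit. Roughly the same effect written as a standalone loop (a sketch around a generic `step_fn`, not Megatron's actual training loop):

```python
# Rough standalone equivalent of --use-pytorch-profiler with
# --profile-step-start 3 / --profile-step-end 4 / --profile-dir ...
# (a sketch around a generic `step_fn`, not Megatron's trainer code).
from torch.profiler import ProfilerActivity, profile, schedule, tensorboard_trace_handler

def train_with_torch_profiler(step_fn, num_iters=10, start=3, end=4, out_dir="torch_prof_gpt_1nodes"):
    with profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],  # CUDA activity covers HIP on ROCm builds
        schedule=schedule(wait=start, warmup=0, active=end - start, repeat=1),
        on_trace_ready=tensorboard_trace_handler(out_dir),
    ) as prof:
        for it in range(num_iters):
            step_fn(it)      # one training iteration
            prof.step()      # advance the profiler schedule
```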
......@@ -4,18 +4,23 @@ for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
export GPU_FLUSH_ON_EXECUTION=1
export HIP_DIRECT_DISPATCH=0
fi
done
# Runs GPT 567B model
source /opt/dtk/env.sh
# Runs Mixtral 8x7B model
# default env
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH
export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
# nccl env
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
......@@ -23,9 +28,10 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
#export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
export GLOG_minloglevel=3
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
......@@ -49,7 +55,7 @@ MODEL_ARGS=(
--disable-bias-linear
--seq-length 8192
--max-position-embeddings 32768
--num-layers 64
--num-layers 32 #64
--hidden-size 8192
--ffn-hidden-size 32768
--num-attention-heads 64
......@@ -72,7 +78,7 @@ MOE_ARGS=(
--moe-token-dispatcher-type alltoall
--moe-expert-capacity-factor 0.5
--moe-pad-expert-input-to-capacity
--moe-grouped-gemm
#--moe-grouped-gemm
)
DATA_ARGS=(
......@@ -84,7 +90,7 @@ DATA_ARGS=(
TRAINING_ARGS=(
--micro-batch-size 1
--global-batch-size 4096
--global-batch-size 1024
--lr 1e-4
--train-iters 10
--lr-decay-iters 320000
......@@ -96,7 +102,6 @@ TRAINING_ARGS=(
--bf16
--overlap-param-gather
--overlap-grad-reduce
#--tp-comm-overlap
)
TORCH_PROFIE_ARGS=(
......@@ -104,23 +109,16 @@ TORCH_PROFIE_ARGS=(
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_gpt
--profile-dir torch_prof_gpt_64nodes_tp2-pp16-ep16-ep_tp1-cp2
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 2
--pipeline-model-parallel-size 16
--expert-model-parallel-size 16
--expert-tensor-parallel-size 1
--context-parallel-size 2
--use-distributed-optimizer
--sequence-parallel
)
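Quick sanity check on the new multi-node layout: with `mpirun -np 512` and the parallel sizes above (tp=2, pp=16, cp=2), the remaining data-parallel dimension is 8, so the reduced `--global-batch-size 1024` with `--micro-batch-size 1` still implies 128 microbatches of gradient accumulation per iteration. The arithmetic, assuming the usual world = tp × cp × pp × dp decomposition (not output of the script itself):

```python
# Sanity-check arithmetic only, assuming world = tp * cp * pp * dp.
world_size = 512                     # mpirun -np 512 --hostfile hostfile_gpt_567B
tp, pp, cp = 2, 16, 2                # tensor / pipeline / context parallel sizes above
micro_batch, global_batch = 1, 1024  # from TRAINING_ARGS earlier in this script

dp = world_size // (tp * pp * cp)                       # -> 8 data-parallel replicas
num_microbatches = global_batch // (micro_batch * dp)   # -> 128 microbatches per iteration

print(f"dp={dp}, microbatches per step={num_microbatches}")
```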
......@@ -157,10 +155,6 @@ APP="python3 -u pretrain_gpt.py \
if [[ $profiling == "torch" ]]; then
APP+=" ${TORCH_PROFIE_ARGS[@]}"
elif [[ $profiling == "hip" ]]; then
mkdir -p hip_prof_data
APP+=" ${HIP_PROFIE_ARGS[@]}"
APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi
#for hygon cpu
......
import os
from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import InferenceWrapperConfig
from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
InferenceWrapperConfig,
)
from pretrain_gpt import model_provider
import torch
import sys
import time
import tqdm
import warnings
from argparse import Namespace
from megatron.core.inference.engines.abstract_engine import AbstractEngine
from megatron.core.inference.engines.mcore_engine import MCoreEngine
from megatron.core.inference.sampling_params import SamplingParams
from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import GPTInferenceWrapper
from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import (
GPTInferenceWrapper,
)
from megatron.core.inference.inference_request import InferenceRequest
from megatron.core.inference.text_generation_controllers.text_generation_controller import TextGenerationController
from megatron.core.inference.text_generation_controllers.text_generation_controller import (
TextGenerationController,
)
from megatron.core.transformer.module import MegatronModule
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
os.path.pardir, os.path.pardir)))
sys.path.append(
os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))
)
from megatron.training import get_args
from megatron.training import get_tokenizer
......@@ -20,26 +31,42 @@ from megatron.training.checkpointing import load_checkpoint
from megatron.core import mpu
from megatron.training.initialize import initialize_megatron
from megatron.training import get_model
from typing import List
import asyncio
from typing import AsyncIterator, List
def add_text_generate_args(parser):
"""Text generation arguments."""
group = parser.add_argument_group(title='text generation')
group.add_argument("--temperature", type=float, default=1.0,
help='Sampling temperature.')
group.add_argument("--top_k", type=int, default=1,
help='Top k sampling.')
group.add_argument("--top_p", type=float, default=0.0,
help='Top p sampling.')
group.add_argument("--return-log-probs", action='store_true', default=False,
help='Return the log probabilities of the final output tokens')
group.add_argument("--num-tokens-to-generate", type=int, default=30,
help='Number of tokens to generate for each prompt')
group.add_argument("--prompts", metavar='N', type=str, nargs='+',
help='Input prompts with each prompt within quotes and seperated by space')
group.add_argument("--max-batch-size", type=int, default=1,
help='Max number of prompts to process at once')
group.add_argument("--temperature", type=float, default=1.0, help='Sampling temperature.')
group.add_argument("--top_k", type=int, default=1, help='Top k sampling.')
group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.')
group.add_argument(
"--return-log-probs",
action='store_true',
default=False,
help='Return the log probabilities of the final output tokens',
)
group.add_argument(
"--num-tokens-to-generate",
type=int,
default=30,
help='Number of tokens to generate for each prompt',
)
group.add_argument(
"--prompts",
metavar='N',
type=str,
nargs='+',
help='Input prompts with each prompt within quotes and separated by space',
)
group.add_argument(
"--max-batch-size", type=int, default=8, dest="inference_max_requests",
help='Max number of prompts to process at once'
)
group.add_argument("--stream", action="store_true", default=False, help="Stream output tokens")
return parser
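For readers unfamiliar with the flags registered above: `--temperature`, `--top_k` and `--top_p` shape a single sampling step roughly as in the sketch below (illustrative only, not the `TextGenerationController` implementation; the function name and values are made up to mirror the argument defaults):

```python
# Illustrative single-step sampler showing what the sampling flags mean
# (not Megatron's TextGenerationController implementation).
import torch

def sample_next_token(logits, temperature=1.0, top_k=1, top_p=0.0):
    logits = logits / max(temperature, 1e-6)
    if top_k > 0:
        # Keep only the k largest logits.
        kth = torch.topk(logits, top_k).values[..., -1, None]
        logits = logits.masked_fill(logits < kth, float("-inf"))
    if top_p > 0.0:
        # Nucleus sampling: keep the smallest prefix of tokens whose
        # cumulative probability exceeds top_p.
        sorted_logits, sorted_idx = torch.sort(logits, descending=True)
        cum_probs = torch.softmax(sorted_logits, dim=-1).cumsum(dim=-1)
        remove = cum_probs > top_p
        remove[..., 1:] = remove[..., :-1].clone()  # always keep the best token
        remove[..., 0] = False
        logits = logits.masked_fill(remove.scatter(-1, sorted_idx, remove), float("-inf"))
    probs = torch.softmax(logits, dim=-1)
    return torch.multinomial(probs, num_samples=1)

next_token = sample_next_token(torch.randn(1, 32000), temperature=1.0, top_k=1)
```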
......@@ -62,23 +89,65 @@ def get_inference_engine(args: Namespace, model: MegatronModule) -> AbstractEngi
inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold,
fp32_residual_connection=args.fp32_residual_connection,
params_dtype=args.params_dtype,
padded_vocab_size=args.padded_vocab_size
padded_vocab_size=args.padded_vocab_size,
inference_max_requests=args.inference_max_requests,
inference_max_seq_length=args.inference_max_seq_length,
)
inference_wrapped_model = GPTInferenceWrapper(model, inference_wrapper_config)
text_generation_controller = TextGenerationController(inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer)
return MCoreEngine(text_generation_controller=text_generation_controller, max_batch_size=args.max_batch_size)
return MCoreEngine(text_generation_controller=text_generation_controller)
async def generate(
inference_engine: MCoreEngine,
sampling_params: SamplingParams,
prompts: List[str],
) -> List[InferenceRequest]:
async def collect_stream(prompt, request_id, stream_generator):
print(f"Request {request_id}: {prompt}", end="", flush=True)
prev_idx = 0
async for output in stream_generator:
print(output.generated_text[prev_idx:], end="", flush=True)
prev_idx = len(output.generated_text)
print()
request_ids: List[str] = [
inference_engine.add_request(
prompt=prompt, inference_parameters=sampling_params, streaming=True
)
for prompt in prompts
]
stream_generators = [inference_engine.get_stream_generator(request_id) for request_id in request_ids]
tasks = [
asyncio.create_task(collect_stream(prompt, request_id, stream_generator))
for (prompt, request_id, stream_generator) in zip(prompts, request_ids, stream_generators)
]
await inference_engine.run_engine_async()
await asyncio.gather(*tasks)
results: List[InferenceRequest] = [
inference_engine.scheduler.completed_request_pool[request_id] for request_id in request_ids
]
return results
def main():
"""Main program."""
# Note: The default args passed here can be overwritten by using appropriate params (check arguments.py file)
# Micro batch size is not needed to be set by user. (It is calculated based on inference-batch-times-seqlen-threshold argument)
initialize_megatron(extra_args_provider=add_text_generate_args,
args_defaults={'no_load_rng': True,
initialize_megatron(
extra_args_provider=add_text_generate_args,
args_defaults={
'no_load_rng': True,
'no_load_optim': True,
'micro_batch_size': 1,
'exit_on_missing_checkpoint': True})
'exit_on_missing_checkpoint': True,
},
)
# Set up model and load checkpoint
model = get_model(model_provider, wrap_with_ddp=False)
......@@ -94,12 +163,25 @@ def main():
top_k=args.top_k,
top_p=args.top_p,
return_log_probs=args.return_log_probs,
num_tokens_to_generate=args.num_tokens_to_generate)
num_tokens_to_generate=args.num_tokens_to_generate,
)
results: List[InferenceRequest] = inference_engine.generate(
if args.enable_cuda_graph:
print(f"Running warmup for CUDA graphs...")
inference_engine.generate(
prompts=args.prompts, sampling_params=sampling_params
)
start_time = time.perf_counter()
if args.stream:
results: List[InferenceRequest] = asyncio.run(generate(inference_engine, sampling_params, args.prompts))
else:
results: List[InferenceRequest] = inference_engine.generate(
prompts=args.prompts, sampling_params=sampling_params,
)
end_time = time.perf_counter()
latency = end_time - start_time
if torch.distributed.get_rank() == 0:
for idx, result in enumerate(results):
print(f' \n------------- RESULT FOR PROMPT {idx} --------------- ')
......@@ -107,9 +189,12 @@ def main():
'id': result.request_id,
'input_prompt': result.prompt,
'generated_text': result.generated_text,
'generated_tokens' : result.generated_tokens
'generated_tokens': result.generated_tokens,
'latency': latency,
}
print(result)
torch.distributed.destroy_process_group()
if __name__ == "__main__":
main()
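The new `--stream` path in `generate()` follows a fan-out pattern: register every prompt, attach one consumer per request's stream generator, then drive the engine and `asyncio.gather` the consumers. A self-contained toy version of that pattern (queue-based stand-ins only, not the Megatron inference API):

```python
# Toy version of the fan-out pattern used by generate() above: one consumer
# per request drains its own async stream while a single driver coroutine
# produces the tokens. Queue-based stand-ins only; not the Megatron API.
import asyncio

async def fake_stream(queue):
    # Stand-in for the per-request stream generator returned by the engine.
    while (tok := await queue.get()) is not None:
        yield tok

async def collect(prompt, stream):
    async for tok in stream:
        print(f"{prompt} -> {tok}")

async def main():
    queues = {prompt: asyncio.Queue() for prompt in ("hello", "world")}

    async def run_engine():
        # Stand-in for run_engine_async(): emit a few tokens, then close each stream.
        for step in range(3):
            for q in queues.values():
                await q.put(f"tok{step}")
        for q in queues.values():
            await q.put(None)

    consumers = [asyncio.create_task(collect(p, fake_stream(q))) for p, q in queues.items()]
    await run_engine()
    await asyncio.gather(*consumers)

asyncio.run(main())
```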