Commit 688448db authored by silencealiang

Update code

parent a02a5490
Pipeline #2503 passed
@@ -5,18 +5,11 @@ import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
os.path.pardir)))
from megatron.training import get_args
from megatron.training import print_rank_0
from megatron.core import mpu
from megatron.training.checkpointing import load_checkpoint
from megatron.training.initialize import initialize_megatron
from megatron.core.models.gpt import GPTModel
from megatron.training import get_model
from megatron.training.arguments import core_transformer_config_from_args
from megatron.training.yaml_arguments import core_transformer_config_from_yaml
from megatron.inference.text_generation_server import MegatronServer
from megatron.inference.text_generation import generate_and_post_process
from megatron.inference.text_generation import beam_search_and_post_process
from megatron.core.transformer.spec_utils import import_module
from megatron.core.models.gpt.gpt_layer_specs import (
get_gpt_layer_local_spec,
@@ -24,10 +17,28 @@ from megatron.core.models.gpt.gpt_layer_specs import (
)
from contextlib import nullcontext
import torch
from typing import Union
import megatron
import os
from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import InferenceWrapperConfig
import sys
from argparse import Namespace
from megatron.core.inference.engines.abstract_engine import AbstractEngine
from megatron.core.inference.engines.mcore_engine import MCoreEngine
from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import GPTInferenceWrapper
from megatron.core.inference.text_generation_controllers.simple_text_generation_controller import SimpleTextGenerationController
from megatron.core.transformer.module import MegatronModule
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
os.path.pardir, os.path.pardir)))
from megatron.training import get_args
from megatron.training import get_tokenizer
from megatron.training.checkpointing import load_checkpoint
from megatron.core import mpu
from megatron.training.initialize import initialize_megatron
from megatron.training import get_model
def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megatron.legacy.model.GPTModel]:
"""Builds the model.
@@ -84,23 +95,69 @@ def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megat
position_embedding_type=args.position_embedding_type,
rotary_percent=args.rotary_percent,
rotary_base=args.rotary_base,
rope_scaling=args.use_rope_scaling
rope_scaling=args.use_rope_scaling,
rope_scaling_factor=args.rope_scaling_factor,
)
return model
def get_inference_engine(args: Namespace, model: MegatronModule) -> AbstractEngine:
"""Get the relevant backend for running inference
This function will automatically choose the TRTLLMBackend when possible, and default to the MCore backend if the user does not specify any backend. TRTLLMBackend is not implemented yet.
Args:
args (Namespace): The user arguments parsed from command line
model (MegatronModule): The megatron model.
Returns:
AbstractEngine: The chosen engine
"""
tokenizer = get_tokenizer()
inference_wrapper_config = InferenceWrapperConfig(
hidden_size=args.hidden_size,
inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold,
fp32_residual_connection=args.fp32_residual_connection,
params_dtype=args.params_dtype,
padded_vocab_size=args.padded_vocab_size,
inference_max_seq_length=args.inference_max_seq_length,
inference_max_requests=args.inference_max_batch_size
)
inference_wrapped_model = GPTInferenceWrapper(model, inference_wrapper_config)
text_generation_controller = SimpleTextGenerationController(
inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer)
return MCoreEngine(text_generation_controller=text_generation_controller)
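# Note on the composition above: GPTInferenceWrapper adapts the Megatron model for
# step-wise decoding, SimpleTextGenerationController pairs the wrapped model with the
# tokenizer to drive sampling, and MCoreEngine schedules requests against that
# controller. The server started in __main__ below only needs the resulting engine.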
def add_text_generate_args(parser):
group = parser.add_argument_group(title='text generation')
group.add_argument("--port", type=int, default=5000,
help='port for text generation server to run on')
group.add_argument("--temperature", type=float, default=1.0,
help='Sampling temperature.')
group.add_argument("--top_k", type=int, default=1,
help='Top k sampling.')
group.add_argument("--top_p", type=float, default=0.0,
help='Top p sampling.')
group.add_argument("--return-log-probs", action='store_true', default=True,
help='Return the log probabilities of the final output tokens')
group.add_argument("--num-tokens-to-generate", type=int, default=30,
help='Number of tokens to generate for each prompt')
group.add_argument("--prompts", metavar='N', type=str, nargs='+',
help='Input prompts with each prompt within quotes and separated by space')
group.add_argument("--max-batch-size", type=int, default=8,
help='Max number of prompts to process at once')
return parser
if __name__ == "__main__":
initialize_megatron(extra_args_provider=add_text_generate_args,
args_defaults={'tokenizer_type': 'GPT2BPETokenizer',
'no_load_rng': True,
'no_load_optim': True})
args_defaults={'no_load_rng': True,
'no_load_optim': True,
'exit_on_missing_checkpoint': True})
args = get_args()
if args.num_layers_per_virtual_pipeline_stage is not None:
@@ -125,20 +182,8 @@ if __name__ == "__main__":
model = model[0]
model.eval()
inference_engine = get_inference_engine(args, model)
if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0:
server = MegatronServer(model)
server = MegatronServer(inference_engine, args)
server.run("0.0.0.0",port=args.port)
while True:
choice = torch.tensor(1, dtype=torch.long, device='cuda')
torch.distributed.broadcast(choice, 0)
if choice.item() == 0:
try:
generate_and_post_process(model)
except ValueError as ve:
pass
elif choice.item() == 1:
try:
beam_search_and_post_process(model)
except ValueError as ve:
pass
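For reference, a minimal client sketch for querying the server started above. The endpoint path and JSON fields are assumptions based on Megatron-LM's text generation tooling (a Flask /api endpoint taking "prompts" and "tokens_to_generate"), not something introduced by this commit; adjust the host, port, and fields to match your deployment.

import json
import requests

# Hypothetical client for the MegatronServer launched above; the /api path and
# request fields are assumptions, verify them against your server version.
url = "http://localhost:5000/api"
headers = {"Content-Type": "application/json"}
payload = {"prompts": ["Hello, my name is"], "tokens_to_generate": 30}

response = requests.put(url, data=json.dumps(payload), headers=headers)
print(response.json())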
default:
interruptible: true
other:
artifacts:
paths:
- results/
when: always
image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_utility:20283570
needs:
- job: functional:configure
pipeline: $PARENT_PIPELINE_ID
rules:
- if: $CI_PIPELINE_SOURCE == "parent_pipeline"
- if: $CI_MERGE_REQUEST_ID
script:
- export PYTHONPATH=$(pwd); python tests/test_utils/python_scripts/launch_jet_workload.py
--model unit-tests --environment lts --n-repeat 1 --time-limit 1800 --test-case
other --container-tag 20283570 --cluster dgxh100_coreweave
stage: unit-tests
tags: &id001
- arch/amd64
- env/prod
- origin/jet-fleet
- owner/jet-core
- purpose/jet-client
- team/megatron
timeout: 7 days
stages:
- unit-tests
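# The per-directory jobs below are instances of the same job template: each reuses the
# tag list defined on the `other` job via the `*id001` alias and differs only in the
# --test-case argument passed to launch_jet_workload.py.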
tests/unit_tests/data/:
artifacts:
paths:
- results/
when: always
image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_utility:20283570
needs:
- job: functional:configure
pipeline: $PARENT_PIPELINE_ID
rules:
- if: $CI_PIPELINE_SOURCE == "parent_pipeline"
- if: $CI_MERGE_REQUEST_ID
script:
- export PYTHONPATH=$(pwd); python tests/test_utils/python_scripts/launch_jet_workload.py
--model unit-tests --environment lts --n-repeat 1 --time-limit 1800 --test-case
tests/unit_tests/data/ --container-tag 20283570 --cluster dgxh100_coreweave
stage: unit-tests
tags: *id001
timeout: 7 days
tests/unit_tests/dist_checkpointing/:
artifacts:
paths:
- results/
when: always
image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_utility:20283570
needs:
- job: functional:configure
pipeline: $PARENT_PIPELINE_ID
rules:
- if: $CI_PIPELINE_SOURCE == "parent_pipeline"
- if: $CI_MERGE_REQUEST_ID
script:
- export PYTHONPATH=$(pwd); python tests/test_utils/python_scripts/launch_jet_workload.py
--model unit-tests --environment lts --n-repeat 1 --time-limit 1800 --test-case
tests/unit_tests/dist_checkpointing/ --container-tag 20283570 --cluster dgxh100_coreweave
stage: unit-tests
tags: *id001
timeout: 7 days
tests/unit_tests/distributed/:
artifacts:
paths:
- results/
when: always
image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_utility:20283570
needs:
- job: functional:configure
pipeline: $PARENT_PIPELINE_ID
rules:
- if: $CI_PIPELINE_SOURCE == "parent_pipeline"
- if: $CI_MERGE_REQUEST_ID
script:
- export PYTHONPATH=$(pwd); python tests/test_utils/python_scripts/launch_jet_workload.py
--model unit-tests --environment lts --n-repeat 1 --time-limit 1800 --test-case
tests/unit_tests/distributed/ --container-tag 20283570 --cluster dgxh100_coreweave
stage: unit-tests
tags: *id001
timeout: 7 days
? tests/unit_tests/test_inference.py tests/unit_tests/test_tokenizer.py tests/unit_tests/test_utilities.py
tests/unit_tests/test_training.py
: artifacts:
paths:
- results/
when: always
image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_utility:20283570
needs:
- job: functional:configure
pipeline: $PARENT_PIPELINE_ID
rules:
- if: $CI_PIPELINE_SOURCE == "parent_pipeline"
- if: $CI_MERGE_REQUEST_ID
script:
- export PYTHONPATH=$(pwd); python tests/test_utils/python_scripts/launch_jet_workload.py
--model unit-tests --environment lts --n-repeat 1 --time-limit 1800 --test-case
tests/unit_tests/test_inference.py tests/unit_tests/test_tokenizer.py tests/unit_tests/test_utilities.py
tests/unit_tests/test_training.py --container-tag 20283570 --cluster dgxh100_coreweave
stage: unit-tests
tags: *id001
timeout: 7 days