From 99a0c39ea23936072cfd7fb1ce1fd2b8298e9b20 Mon Sep 17 00:00:00 2001
From: xingjinliang
Date: Wed, 25 Dec 2024 14:38:33 +0800
Subject: [PATCH] Sync latest code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .coveragerc | 0 .gitignore | 0 .gitlab-ci.yml | 0 CHANGELOG.md | 0 CODEOWNERS | 0 CONTRIBUTING.md | 0 Dockerfile.ci.dev | 0 Dockerfile.ci.lts | 0 Dockerfile.linting | 0 GPT_pretraining.sh | 0 LICENSE | 0 Llama_pretraining.sh | 0 MANIFEST.in | 0 README.md.origin | 0 docs/llama_mistral.md | 0 docs/source/api-guide/context_parallel.rst | 0 docs/source/api-guide/datasets.rst | 0 docs/source/api-guide/dist_checkpointing.rst | 0 .../dist_checkpointing.strategies.rst | 0 docs/source/api-guide/dist_optimizer.md | 0 docs/source/api-guide/distributed.rst | 0 .../api-guide/encoder_decoder_parallelism.rst | 0 docs/source/api-guide/fusions.rst | 0 docs/source/api-guide/index.rst | 0 docs/source/api-guide/models.bert.rst | 0 docs/source/api-guide/models.gpt.rst | 0 docs/source/api-guide/models.rst | 0 docs/source/api-guide/models.t5.rst | 0 docs/source/api-guide/moe.rst | 0 .../api-guide/num_microbatches_calculator.rst | 0 .../api-guide/optimizer_param_scheduler.rst | 0 docs/source/api-guide/pipeline_parallel.rst | 0 docs/source/api-guide/tensor_parallel.rst | 0 docs/source/api-guide/transformer.rst | 0 .../images/context_parallel/CP_overview.png | Bin .../images/context_parallel/CP_results.png | Bin .../images/distrib_optimizer/data_flow.png | Bin .../distrib_optimizer/sharding_scheme.png | Bin docs/source/images/moe/token_drop.png | Bin docs/source/index.rst | 0 docs/source/user-guide/index.rst | 0 .../detxoify_lm/README.md | 0 .../annotations/filter-selfgeneration.py | 0 .../annotations/perspective_api_annotate.py | 0 .../detxoify_lm/annotations/preprocess.sh | 0 .../detxoify_lm/finetune_gpt.py | 0 .../finetune_gpt_distributed-1.3b.sh | 0 .../detxoify_lm/generate-1.3b.sh | 0 .../detxoify_lm/generate_samples_gpt.py | 0 .../detxoify_lm/perspective_api.py | 0 .../selfgenerate-1.3b-unconditional.sh | 0 .../academic_paper_scripts/msdp/README.md | 0 .../msdp/data_processing.sh | 0 .../msdp/eval_knwl_generation.sh | 0 .../msdp/eval_resp_generation.sh | 0 .../msdp/prep_resp_gen.sh | 0 .../msdp/prompt_knwl_gen.sh | 0 .../msdp/prompt_resp_gen.sh | 0 .../academic_paper_scripts/sc21/CONFIG.sh | 0 .../academic_paper_scripts/sc21/README.md | 0 .../academic_paper_scripts/sc21/SBATCH.sh | 0 examples/academic_paper_scripts/sc21/SRUN.sh | 0 .../sc21/run_figure_11.sh | 0 .../sc21/run_figure_12.sh | 0 .../sc21/run_figure_13.sh | 0 .../sc21/run_figure_14.sh | 0 .../sc21/run_figure_15.sh | 0 .../sc21/run_figure_16.sh | 0 .../sc21/run_figure_17.sh | 0 .../sc21/run_figure_18.sh | 0 .../sc21/run_table_1.sh | 0 examples/bert/README.md | 0 examples/bert/train_bert_340m_distributed.sh | 0 examples/export/README.md | 0 .../pretrain_gpt_modelopt.py | 0 .../export/ptq_and_trtllm_export/README.md | 0 .../ptq_trtllm_llama2_7b.sh | 0 .../ptq_trtllm_llama3_1_8b.sh | 0 .../ptq_trtllm_llama3_8b.sh | 0 .../ptq_trtllm_minitron_8b.sh | 0 .../ptq_trtllm_mistral_12b.sh | 0 .../ptq_trtllm_mixtral_8x7b.sh | 0 .../text_generation_ptq.py | 0 .../trtllm_text_generation.py | 0 examples/export/trtllm_export/README.md | 0 .../gpt_distributed_gpu_export.py | 0 .../gpt_single_device_cpu_export.py | 0 examples/gpt3/README.md | 0 examples/gpt3/gpt_config.yaml | 1 + examples/gpt3/train_gpt3_175b_distributed.sh | 0 
examples/inference/README.md | 105 ++- ...ch_inference.py => gpt_batch_inference.py} | 10 +- .../llama_mistral/huggingface_reference.py | 0 .../run_text_generation_llama3.1.sh | 0 .../run_text_generation_llama3.sh | 0 .../run_text_generation_mistral.sh | 0 .../run_text_generation_server_345M.sh | 0 ...eneration_server_345M_8_tensor_parallel.sh | 0 .../inference/t5/simple_t5_batch_inference.py | 6 +- examples/mamba/.gitignore | 0 examples/mamba/Dockerfile | 0 examples/mamba/README.md | 0 examples/mamba/run_text_gen_server_8b.sh | 0 examples/mamba/run_text_gen_server_8b_gpt3.sh | 0 examples/mamba/train.sh | 0 examples/mixtral/README.md | 0 .../mixtral/train_mixtral_8x7b_distributed.sh | 0 examples/multimodal/Dockerfile | 0 examples/multimodal/README.md | 4 +- .../multimodal/assets/pretrain_curves.png | Bin .../combine_lm_vision_checkpoints.sh | 0 examples/multimodal/combine_state_dicts.py | 0 examples/multimodal/config.py | 50 +- .../convert_llava_pretrain_to_wds.py | 0 examples/multimodal/dataloader_provider.py | 0 examples/multimodal/dataset_helpers.py | 203 ++-- .../{ => evaluation}/evaluate_ai2d.py | 22 +- .../{ => evaluation}/evaluate_chartqa.py | 13 +- .../{ => evaluation}/evaluate_coco.py | 18 +- .../{ => evaluation}/evaluate_mathvista.py | 12 +- .../{ => evaluation}/evaluate_mmmu.py | 10 + .../{ => evaluation}/evaluate_ocrbench.py | 12 +- .../{ => evaluation}/evaluate_textvqa.py | 25 +- .../{ => evaluation}/evaluate_vqav2.py | 16 +- .../{ => evaluation}/evaluation_datasets.py | 84 +- examples/multimodal/image_processing.py | 0 examples/multimodal/layer_specs.py | 0 examples/multimodal/manual_prompts.json | 0 examples/multimodal/model.py | 14 + .../model_converter/clip_converter.py | 0 .../model_converter/internvit_converter.py | 0 .../model_converter/siglip_converter.py | 0 .../model_converter/vision_model_tester.py | 0 examples/multimodal/multimodal_args.py | 6 +- examples/multimodal/nvlm/README.md | 11 +- examples/multimodal/nvlm/internvit.py | 0 examples/multimodal/nvlm/nvlm_prompts.json | 0 .../nvlm/pp_checkpoint_converter.py | 0 examples/multimodal/nvlm/pretrain_blend.yaml | 0 .../nvlm/pretrain_qwen20_72b_internvit_6b.sh | 2 +- .../nvlm/pretrain_yi_34b_internvit_6b.sh | 2 +- ...text_generation_qwen20_72b_internvit_6b.sh | 2 +- .../run_text_generation_qwen25_7b_siglip.sh | 111 +++ ...run_text_generation_yi_34b_internvit_6b.sh | 4 +- examples/multimodal/nvlm/sft_34b_internvit.sh | 2 +- examples/multimodal/nvlm/sft_blend.yaml | 0 .../nvlm/sft_qwen20_72b_internvit_6b.sh | 2 +- examples/multimodal/pretrain_dataset.yaml | 0 examples/multimodal/pretrain_mistral_clip.sh | 7 +- examples/multimodal/run_text_generation.py | 28 +- examples/multimodal/sft_dataset.yaml | 0 examples/multimodal/sft_mistral_clip.sh | 7 +- .../text_generation_mistral_clip.sh | 17 +- examples/multimodal/train.py | 4 +- examples/retro/README.md | 0 examples/retro/preprocess_data.sh | 0 examples/retro/train_retro_2b_distributed.sh | 0 examples/run_simple_mcore_train_loop.py | 0 examples/t5/README.md | 0 examples/t5/t5_mcore_train_curve.png | Bin examples/t5/train_t5_220m_distributed.sh | 0 images/model_table.png | Bin images/strong_scaling.png | Bin images/weak_scaling.png | Bin megatron/core/QuickStart.md | 0 megatron/core/README.md | 0 megatron/core/README_STRAGGLER.md | 0 megatron/core/__init__.py | 0 megatron/core/config_logger.py | 0 megatron/core/datasets/Makefile | 0 megatron/core/datasets/__init__.py | 0 megatron/core/datasets/bert_dataset.py | 0 megatron/core/datasets/blended_dataset.py | 0 
.../blended_megatron_dataset_builder.py | 0 .../blended_megatron_dataset_config.py | 0 megatron/core/datasets/gpt_dataset.py | 0 megatron/core/datasets/helpers.cpp | 0 megatron/core/datasets/helpers.py | 0 megatron/core/datasets/indexed_dataset.py | 0 megatron/core/datasets/masked_dataset.py | 0 megatron/core/datasets/megatron_dataset.py | 0 megatron/core/datasets/megatron_tokenizer.py | 0 megatron/core/datasets/multimodal_dataset.py | 0 megatron/core/datasets/readme.md | 0 megatron/core/datasets/retro/__init__.py | 0 .../core/datasets/retro/config/__init__.py | 0 .../datasets/retro/config/bert_embedders.py | 0 megatron/core/datasets/retro/config/config.py | 0 .../retro/config/gpt_chunk_datasets.py | 0 .../core/datasets/retro/config/tokenizers.py | 0 megatron/core/datasets/retro/db/__init__.py | 0 megatron/core/datasets/retro/db/build.py | 0 megatron/core/datasets/retro/db/dataset.py | 0 megatron/core/datasets/retro/db/utils.py | 0 megatron/core/datasets/retro/external_libs.py | 0 .../core/datasets/retro/index/__init__.py | 0 megatron/core/datasets/retro/index/build.py | 0 megatron/core/datasets/retro/index/factory.py | 0 megatron/core/datasets/retro/index/index.py | 0 .../datasets/retro/index/indexes/__init__.py | 0 .../retro/index/indexes/faiss_base.py | 0 .../retro/index/indexes/faiss_par_add.py | 0 megatron/core/datasets/retro/index/utils.py | 0 .../core/datasets/retro/index/validate.py | 0 .../core/datasets/retro/query/__init__.py | 0 .../datasets/retro/query/gpt_chunk_dataset.py | 0 .../retro/query/multi_split_gpt_dataset.py | 0 megatron/core/datasets/retro/query/query.py | 0 .../datasets/retro/query/retro_dataset.py | 0 megatron/core/datasets/retro/query/utils.py | 0 megatron/core/datasets/retro/utils.py | 0 megatron/core/datasets/t5_dataset.py | 0 megatron/core/datasets/utils.py | 0 megatron/core/datasets/utils_s3.py | 0 megatron/core/dist_checkpointing/__init__.py | 0 megatron/core/dist_checkpointing/core.py | 0 .../core/dist_checkpointing/dict_utils.py | 0 .../core/dist_checkpointing/exchange_utils.py | 0 megatron/core/dist_checkpointing/mapping.py | 3 +- megatron/core/dist_checkpointing/optimizer.py | 0 .../core/dist_checkpointing/serialization.py | 2 - .../state_dict_transformation.py | 0 .../dist_checkpointing/strategies/__init__.py | 0 .../strategies/async_utils.py | 0 .../dist_checkpointing/strategies/base.py | 0 .../dist_checkpointing/strategies/common.py | 0 .../strategies/filesystem_async.py | 0 .../strategies/fully_parallel.py | 0 .../strategies/resharding.py | 0 .../strategies/state_dict_saver.py | 0 .../strategies/tensorstore.py | 0 .../dist_checkpointing/strategies/torch.py | 0 .../strategies/two_stage.py | 0 .../dist_checkpointing/strategies/zarr.py | 0 megatron/core/dist_checkpointing/utils.py | 0 .../core/dist_checkpointing/validation.py | 27 +- megatron/core/distributed/README.md | 0 megatron/core/distributed/__init__.py | 0 .../core/distributed/data_parallel_base.py | 0 .../distributed/distributed_data_parallel.py | 33 +- .../distributed_data_parallel_config.py | 0 .../core/distributed/finalize_model_grads.py | 0 .../core/distributed/param_and_grad_buffer.py | 34 +- .../torch_fully_sharded_data_parallel.py | 0 megatron/core/enums.py | 0 megatron/core/export/__init__.py | 0 megatron/core/export/data_type.py | 0 megatron/core/export/export_config.py | 0 megatron/core/export/model_type.py | 0 megatron/core/export/trtllm/__init__.py | 0 .../export/trtllm/engine_builder/__init__.py | 0 .../engine_builder/trtllm_engine_builder.py | 0 
.../trtllm/model_to_trllm_mapping/__init__.py | 0 .../default_conversion_dict.py | 0 .../core/export/trtllm/trt_model_config.py | 0 megatron/core/export/trtllm/trt_model_type.py | 0 megatron/core/export/trtllm/trtllm_helper.py | 0 megatron/core/export/trtllm/trtllm_layers.py | 0 .../trtllm_weights_converter/__init__.py | 0 ...tributed_trtllm_model_weights_converter.py | 0 ...e_device_trtllm_model_weights_converter.py | 0 megatron/core/extensions/__init__.py | 0 .../core/extensions/transformer_engine.py | 47 +- megatron/core/fusions/__init__.py | 0 megatron/core/fusions/fused_bias_dropout.py | 0 megatron/core/fusions/fused_bias_geglu.py | 0 megatron/core/fusions/fused_bias_gelu.py | 0 megatron/core/fusions/fused_bias_swiglu.py | 0 megatron/core/fusions/fused_cross_entropy.py | 0 megatron/core/fusions/fused_layer_norm.py | 0 megatron/core/fusions/fused_softmax.py | 0 megatron/core/inference/__init__.py | 0 .../core/inference/ammo_support/__init__.py | 0 .../inference/ammo_support/gpt/model_specs.py | 0 .../ammo_support/gpt/state_dict_hooks.py | 0 .../core/inference/common_inference_params.py | 33 +- .../core/inference/communication_utils.py | 0 megatron/core/inference/engines/__init__.py | 0 .../core/inference/engines/abstract_engine.py | 0 .../core/inference/engines/mcore_engine.py | 23 +- megatron/core/inference/inference_request.py | 4 +- .../model_inference_wrappers/__init__.py | 0 .../abstract_model_inference_wrapper.py | 0 .../model_inference_wrappers/gpt/__init__.py | 0 .../gpt/gpt_inference_wrapper.py | 0 .../inference_wrapper_config.py | 0 .../model_inference_wrappers/t5/__init__.py | 0 .../t5/t5_inference_wrapper.py | 0 .../inference/modelopt_support/__init__.py | 0 .../modelopt_support/gpt/__init__.py | 0 .../modelopt_support/gpt/model_specs.py | 0 .../modelopt_support/gpt/state_dict_hooks.py | 0 megatron/core/inference/sampling_params.py | 35 + megatron/core/inference/scheduler.py | 6 +- .../text_generation_controllers/__init__.py | 0 ...oder_decoder_text_generation_controller.py | 8 +- .../simple_text_generation_controller.py | 401 +------- .../text_generation_controller.py | 400 ++++++++ megatron/core/inference/utils.py | 0 megatron/core/inference_params.py | 0 megatron/core/jit.py | 0 megatron/core/model_parallel_config.py | 0 megatron/core/models/T5/__init__.py | 0 megatron/core/models/T5/t5_model.py | 0 megatron/core/models/T5/t5_spec.py | 0 megatron/core/models/__init__.py | 0 megatron/core/models/bert/__init__.py | 0 megatron/core/models/bert/bert_layer_specs.py | 72 +- megatron/core/models/bert/bert_lm_head.py | 0 megatron/core/models/bert/bert_model.py | 0 megatron/core/models/bert/pooler.py | 0 megatron/core/models/common/__init__.py | 0 .../core/models/common/embeddings/__init__.py | 0 .../embeddings/language_model_embedding.py | 0 .../models/common/embeddings/rope_utils.py | 29 +- .../common/embeddings/rotary_pos_embedding.py | 0 .../embeddings/yarn_rotary_pos_embedding.py | 0 .../models/common/language_module/__init__.py | 0 .../common/language_module/language_module.py | 0 .../models/common/vision_module/__init__.py | 0 .../common/vision_module/vision_module.py | 0 megatron/core/models/gpt/__init__.py | 0 megatron/core/models/gpt/gpt_layer_specs.py | 121 ++- megatron/core/models/gpt/gpt_model.py | 0 megatron/core/models/gpt/moe_module_specs.py | 81 ++ megatron/core/models/mamba/__init__.py | 0 .../core/models/mamba/mamba_layer_specs.py | 0 megatron/core/models/mamba/mamba_model.py | 0 megatron/core/models/multimodal/__init__.py | 0 
.../core/models/multimodal/llava_model.py | 3 +- megatron/core/models/multimodal/llava_spec.py | 0 megatron/core/models/retro/__init__.py | 0 megatron/core/models/retro/base_attention.py | 0 megatron/core/models/retro/config.py | 0 .../core/models/retro/decoder_attention.py | 0 megatron/core/models/retro/decoder_spec.py | 0 .../core/models/retro/encoder_attention.py | 0 megatron/core/models/retro/encoder_spec.py | 0 megatron/core/models/retro/model.py | 0 megatron/core/models/retro/utils.py | 0 megatron/core/models/vision/__init__.py | 0 megatron/core/models/vision/clip_vit_model.py | 0 .../models/vision/multimodal_projector.py | 0 .../core/models/vision/vit_layer_specs.py | 0 megatron/core/num_microbatches_calculator.py | 0 megatron/core/optimizer/__init__.py | 73 +- megatron/core/optimizer/clip_grads.py | 30 +- megatron/core/optimizer/distrib_optimizer.py | 295 ++++-- megatron/core/optimizer/grad_scaler.py | 0 megatron/core/optimizer/optimizer.py | 241 +++-- megatron/core/optimizer/optimizer_config.py | 65 ++ megatron/core/optimizer_param_scheduler.py | 0 megatron/core/package_info.py | 0 megatron/core/packed_seq_params.py | 0 megatron/core/parallel_state.py | 0 megatron/core/pipeline_parallel/__init__.py | 0 .../pipeline_parallel/p2p_communication.py | 0 megatron/core/pipeline_parallel/schedules.py | 10 + megatron/core/requirements.txt | 0 megatron/core/rerun_state_machine.py | 165 ++-- megatron/core/ssm/__init__.py | 0 megatron/core/ssm/mamba_block.py | 0 .../core/ssm/mamba_hybrid_layer_allocation.py | 0 megatron/core/ssm/mamba_layer.py | 0 megatron/core/ssm/mamba_mixer.py | 0 megatron/core/ssm/triton_cache_manager.py | 0 megatron/core/tensor_parallel/__init__.py | 0 .../core/tensor_parallel/cross_entropy.py | 0 megatron/core/tensor_parallel/data.py | 0 megatron/core/tensor_parallel/layers.py | 0 megatron/core/tensor_parallel/mappings.py | 0 megatron/core/tensor_parallel/random.py | 0 megatron/core/tensor_parallel/utils.py | 0 megatron/core/timers.py | 0 megatron/core/transformer/__init__.py | 0 megatron/core/transformer/attention.py | 0 megatron/core/transformer/cuda_graphs.py | 882 +++++++++++++----- .../transformer/custom_layers/__init__.py | 0 .../custom_layers/transformer_engine.py | 0 .../core/transformer/dot_product_attention.py | 0 megatron/core/transformer/enums.py | 0 megatron/core/transformer/identity_op.py | 0 megatron/core/transformer/mlp.py | 0 megatron/core/transformer/module.py | 0 megatron/core/transformer/moe/README.md | 5 +- megatron/core/transformer/moe/__init__.py | 0 megatron/core/transformer/moe/experts.py | 0 .../core/transformer/moe/grouped_gemm_util.py | 0 .../moe/legacy_a2a_token_dispatcher.py | 0 megatron/core/transformer/moe/moe_layer.py | 23 +- megatron/core/transformer/moe/moe_utils.py | 119 ++- megatron/core/transformer/moe/router.py | 81 +- .../core/transformer/moe/shared_experts.py | 9 +- .../core/transformer/moe/token_dispatcher.py | 0 .../core/transformer/moe/upcycling_utils.py | 0 .../transformer/multi_latent_attention.py | 0 megatron/core/transformer/spec_utils.py | 0 megatron/core/transformer/torch_layer_norm.py | 0 megatron/core/transformer/torch_norm.py | 0 .../core/transformer/transformer_block.py | 5 +- .../core/transformer/transformer_config.py | 42 +- .../core/transformer/transformer_layer.py | 0 megatron/core/transformer/utils.py | 0 megatron/core/utils.py | 38 + megatron/inference/__init__.py | 0 megatron/inference/algos/__init__.py | 0 megatron/inference/algos/distillation.py | 0 megatron/inference/arguments.py | 0 
megatron/inference/checkpointing.py | 0 megatron/inference/docs/distillation.md | 0 megatron/inference/endpoints/common.py | 0 megatron/inference/endpoints/completions.py | 0 megatron/inference/gpt/__init__.py | 0 megatron/inference/gpt/loss_func.py | 0 megatron/inference/gpt/model_provider.py | 0 megatron/inference/static/index.html | 0 .../inference/text_generation/__init__.py | 0 megatron/inference/text_generation/api.py | 0 .../inference/text_generation/beam_utils.py | 0 .../text_generation/communication.py | 0 .../inference/text_generation/forward_step.py | 9 +- .../inference/text_generation/generation.py | 0 .../inference/text_generation/sampling.py | 0 .../inference/text_generation/tokenization.py | 0 megatron/inference/text_generation_server.py | 0 megatron/legacy/data/__init__.py | 0 megatron/legacy/data/autoaugment.py | 0 .../legacy/data/biencoder_dataset_utils.py | 0 megatron/legacy/data/data_samplers.py | 0 megatron/legacy/data/dataset_utils.py | 0 megatron/legacy/data/ict_dataset.py | 0 megatron/legacy/data/image_folder.py | 0 megatron/legacy/data/multimodal_dataset.py | 0 megatron/legacy/data/orqa_wiki_dataset.py | 0 megatron/legacy/data/realm_dataset_utils.py | 0 megatron/legacy/data/realm_index.py | 0 megatron/legacy/data/vit_dataset.py | 0 .../legacy/fp16_deprecated/loss_scaler.py | 0 megatron/legacy/fused_kernels/__init__.py | 0 megatron/legacy/fused_kernels/compat.h | 0 .../legacy/fused_kernels/tests/__init__.py | 0 .../fused_kernels/tests/test_fused_kernels.py | 0 megatron/legacy/fused_kernels/type_shim.h | 0 megatron/legacy/indexer.py | 0 megatron/legacy/model/__init__.py | 0 megatron/legacy/model/bert_model.py | 0 megatron/legacy/model/biencoder_model.py | 0 megatron/legacy/model/classification.py | 0 megatron/legacy/model/enums.py | 0 megatron/legacy/model/fused_bias_gelu.py | 0 megatron/legacy/model/fused_layer_norm.py | 0 megatron/legacy/model/fused_softmax.py | 0 megatron/legacy/model/gpt_model.py | 0 megatron/legacy/model/language_model.py | 0 megatron/legacy/model/module.py | 0 megatron/legacy/model/multiple_choice.py | 0 megatron/legacy/model/realm_model.py | 0 megatron/legacy/model/rms_norm.py | 0 megatron/legacy/model/t5_model.py | 0 megatron/legacy/model/transformer.py | 0 megatron/legacy/model/utils.py | 0 .../legacy/model/vision/classification.py | 0 megatron/legacy/model/vision/dino.py | 0 .../model/vision/esvit_swin_backbone.py | 0 megatron/legacy/model/vision/inpainting.py | 0 megatron/legacy/model/vision/knn_monitor.py | 0 megatron/legacy/model/vision/mit_backbone.py | 0 megatron/legacy/model/vision/swin_backbone.py | 0 megatron/legacy/model/vision/utils.py | 0 megatron/legacy/model/vision/vit_backbone.py | 0 megatron/legacy/mpu/tests/__init__.py | 0 megatron/legacy/mpu/tests/commons.py | 0 .../legacy/mpu/tests/test_cross_entropy.py | 0 megatron/legacy/mpu/tests/test_data.py | 0 megatron/legacy/mpu/tests/test_initialize.py | 0 megatron/legacy/mpu/tests/test_layers.py | 0 megatron/legacy/mpu/tests/test_random.py | 0 megatron/training/__init__.py | 0 megatron/training/activations.py | 0 megatron/training/arguments.py | 45 +- megatron/training/async_utils.py | 0 megatron/training/checkpointing.py | 39 +- megatron/training/dist_signal_handler.py | 0 megatron/training/ft_integration.py | 0 megatron/training/global_vars.py | 0 megatron/training/initialize.py | 0 megatron/training/log_handler.py | 0 megatron/training/one_logger_utils.py | 0 megatron/training/theoretical_memory_usage.py | 0 megatron/training/tokenizer/__init__.py | 0 
.../training/tokenizer/bert_tokenization.py | 0 .../training/tokenizer/gpt2_tokenization.py | 0 .../tokenizer/multimodal_tokenizer.py | 0 megatron/training/tokenizer/tokenizer.py | 0 megatron/training/training.py | 159 ++-- megatron/training/utils.py | 99 +- megatron/training/yaml_arguments.py | 0 mypy.ini | 0 pretrain_bert.py | 0 pretrain_gpt.py | 6 +- pretrain_ict.py | 0 pretrain_mamba.py | 0 pretrain_retro.py | 0 pretrain_t5.py | 0 pretrain_vision_classify.py | 0 pretrain_vision_dino.py | 0 pretrain_vision_inpaint.py | 0 pretrain_vlm.py | 14 +- pyproject.toml | 0 pytest.ini | 0 .../requirements.txt | 0 .../requirements.txt | 0 run_1nodes.sh | 17 + setup.py | 0 tasks/data_utils.py | 0 tasks/ensemble_classifier.py | 0 tasks/eval_utils.py | 0 tasks/finetune_utils.py | 0 tasks/glue/data.py | 0 tasks/glue/finetune.py | 0 tasks/glue/mnli.py | 0 tasks/glue/qqp.py | 0 tasks/main.py | 0 tasks/msdp/README.md | 0 tasks/msdp/evaluate.py | 0 tasks/msdp/main.py | 0 tasks/msdp/metrics.py | 0 tasks/msdp/preprocessing.py | 0 tasks/msdp/prompt.py | 0 tasks/orqa/README.md | 0 tasks/orqa/evaluate_orqa.py | 0 tasks/orqa/evaluate_utils.py | 0 tasks/orqa/supervised/data.py | 0 tasks/orqa/supervised/eval_utils.py | 0 tasks/orqa/supervised/finetune.py | 0 tasks/orqa/unsupervised/nq.py | 0 tasks/orqa/unsupervised/qa_utils.py | 0 tasks/orqa/unsupervised/tokenizers.py | 0 tasks/quantize/calibrate_gpt.py | 0 tasks/race/data.py | 0 tasks/race/finetune.py | 0 tasks/vision/classification/classification.py | 0 tasks/vision/classification/eval_utils.py | 0 tasks/vision/finetune_utils.py | 0 tasks/vision/main.py | 0 tasks/vision/segmentation/cityscapes.py | 0 tasks/vision/segmentation/data.py | 0 .../vision/segmentation/finetune_segformer.py | 0 tasks/vision/segmentation/finetune_setr.py | 0 tasks/vision/segmentation/metrics.py | 0 tasks/vision/segmentation/seg_heads.py | 0 tasks/vision/segmentation/seg_models.py | 0 tasks/vision/segmentation/transforms.py | 0 tasks/vision/segmentation/utils.py | 0 tasks/zeroshot_gpt/datasets.py | 0 tasks/zeroshot_gpt/detokenizer.py | 0 tasks/zeroshot_gpt/evaluate.py | 0 tests/__init__.py | 0 tests/functional_tests/__init__.py | 0 .../python_test_utils/__init__.py | 0 .../python_test_utils/common.py | 0 .../get_test_results_from_tensorboard_logs.py | 0 .../python_test_utils/test_ci_pipeline.py | 0 .../python_test_utils/test_fp8_ci_pipeline.py | 0 .../test_resume_checkpoint_pipeline.py | 0 .../shell_test_utils/_run_training.sh | 0 .../shell_test_utils/run_ci_test.sh | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../bert_release/golden_values_0.9.0.json | 0 .../bert/bert_release/model_config.yaml | 0 .../common/ckpt_converter/__main__.py | 0 
.../common/ckpt_converter/model_config.yaml | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../golden_values_0.8.0.json | 0 .../golden_values_0.9.0.json | 0 .../gpt/gpt3_15b_8t_release/model_config.yaml | 0 .../gpt3_15b_8t_release_sm/model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 1 + .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../model_config.yaml | 1 - .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 1 - 
.../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 58 +- .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 
.../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../model_config.yaml | 1 + .../golden_values_0.9.0.json | 0 .../model_config.yaml | 0 .../golden_values_0.8.0.json | 0 .../golden_values_0.9.0.json | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../golden_values_0.9.0.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 1 + .../golden_values_lts.json | 1 + .../model_config.yaml | 57 ++ .../golden_values_dev.json | 1 + .../golden_values_lts.json | 1 + .../model_config.yaml | 58 ++ .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../model_config.yaml | 1 + .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 1 + .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 1 + .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../model_config.yaml | 0 .../golden_values_dev.json | 0 .../golden_values_lts.json | 0 .../model_config.yaml | 0 .../golden_values_lts.json | 0 .../golden_values_lts.json | 0 .../t5/t5_release/golden_values_0.9.0.json | 0 .../t5/t5_release/model_config.yaml | 0 tests/test_utils/python_scripts/common.py | 0 .../generate_jet_trigger_job.py | 0 .../python_scripts/generate_local_jobs.py | 0 .../python_scripts/launch_jet_workload.py | 0 .../test_utils/recipes/_build-mcore-dev.yaml | 0 .../test_utils/recipes/_build-mcore-lts.yaml | 0 tests/test_utils/recipes/_build-nemo.yaml | 0 tests/test_utils/recipes/bert.yaml | 0 tests/test_utils/recipes/gpt-modelopt.yaml | 0 tests/test_utils/recipes/gpt-nemo.yaml | 0 tests/test_utils/recipes/gpt.yaml | 0 .../test_utils/recipes/multimodal-llava.yaml | 2 + tests/test_utils/recipes/t5.yaml | 0 tests/test_utils/recipes/unit-tests.yaml | 0 tests/test_utils/shell_scripts/notify.sh | 0 tests/unit_tests/__init__.py | 0 tests/unit_tests/conftest.py | 0 tests/unit_tests/data/__init__.py | 0 tests/unit_tests/data/test_bin_reader.py | 0 tests/unit_tests/data/test_builder.py | 0 tests/unit_tests/data/test_gpt_dataset.py | 0 .../data/test_multimodal_dataset.py | 0 tests/unit_tests/data/test_preprocess_data.py | 0 .../unit_tests/data/test_preprocess_mmdata.py | 0 .../unit_tests/dist_checkpointing/__init__.py | 0 .../unit_tests/dist_checkpointing/conftest.py | 0 .../dist_checkpointing/models/__init__.py | 0 .../dist_checkpointing/models/common.py | 0 .../models/test_bert_model.py | 0 .../models/test_gpt_model.py | 0 .../dist_checkpointing/models/test_mamba.py | 0 .../dist_checkpointing/models/test_mlp_glu.py | 0 .../models/test_moe_experts.py | 20 +- .../models/test_retro_model.py | 0 .../models/test_t5_model.py | 0 
.../dist_checkpointing/test_async_save.py | 0 .../test_cached_metadata.py | 0 .../test_flattened_resharding.py | 68 ++ .../unit_tests/dist_checkpointing/test_fp8.py | 0 .../dist_checkpointing/test_fully_parallel.py | 0 .../dist_checkpointing/test_local.py | 0 .../dist_checkpointing/test_mapping.py | 0 .../dist_checkpointing/test_nonpersistent.py | 0 .../dist_checkpointing/test_optimizer.py | 0 .../dist_checkpointing/test_serialization.py | 33 + tests/unit_tests/dist_checkpointing/utils.py | 0 ...est_grad_reduce_for_replicated_embedder.py | 0 .../distributed/test_param_and_grad_buffer.py | 0 tests/unit_tests/export/trtllm/__init__.py | 0 .../export/trtllm/test_distributed_fp8.py | 0 .../export/trtllm/test_single_device_fp8.py | 0 .../test_trtllm_distributed_gpu_converter.py | 0 .../export/trtllm/test_trtllm_helper.py | 0 .../export/trtllm/test_trtllm_layers.py | 0 .../test_trtllm_single_device_converter.py | 0 .../unit_tests/fusions/test_torch_softmax.py | 0 tests/unit_tests/inference/__init__.py | 0 .../unit_tests/inference/engines/__init__.py | 0 .../inference/engines/test_mcore_engine.py | 14 +- .../model_inference_wrappers/__init__.py | 0 .../gpt/test_gpt_inference_wrapper.py | 0 .../t5/test_t5_inference_wrapper.py | 0 .../test_model_inference_wrapper_config.py | 0 .../inference/test_common_inference_params.py | 6 +- .../unit_tests/inference/test_flash_decode.py | 0 .../inference/test_inference_utils.py | 0 .../inference/test_modelopt_gpt_model.py | 0 tests/unit_tests/inference/test_scheduler.py | 4 +- .../text_generation_controllers/__init__.py | 0 ...oder_decoder_text_generation_controller.py | 4 +- .../test_simple_text_generation_controller.py | 26 +- tests/unit_tests/models/__init__.py | 0 .../unit_tests/models/test_base_embedding.py | 0 tests/unit_tests/models/test_bert_model.py | 0 .../unit_tests/models/test_clip_vit_model.py | 0 tests/unit_tests/models/test_gpt_model.py | 0 tests/unit_tests/models/test_llava_model.py | 0 tests/unit_tests/models/test_mamba_model.py | 0 .../models/test_multimodal_projector.py | 0 tests/unit_tests/models/test_t5_model.py | 0 .../unit_tests/pipeline_parallel/__init__.py | 0 .../pipeline_parallel/test_helpers.py | 0 .../pipeline_parallel/test_schedules.py | 0 tests/unit_tests/ssm/test_mamba_block.py | 0 .../ssm/test_mamba_hybrid_layer_allocation.py | 0 tests/unit_tests/ssm/test_mamba_layer.py | 0 tests/unit_tests/ssm/test_mamba_mixer.py | 0 tests/unit_tests/tensor_parallel/__init__.py | 0 .../tensor_parallel/test_cross_entropy.py | 0 tests/unit_tests/tensor_parallel/test_data.py | 0 .../tensor_parallel/test_initialization.py | 0 .../unit_tests/tensor_parallel/test_layers.py | 0 .../tensor_parallel/test_mappings.py | 0 .../unit_tests/tensor_parallel/test_random.py | 0 .../test_tensor_parallel_utils.py | 0 tests/unit_tests/test_basic.py | 0 tests/unit_tests/test_imports.py | 0 tests/unit_tests/test_inference.py | 0 .../unit_tests/test_local_multi_tensor_fns.py | 0 .../test_num_microbatches_calculator.py | 0 tests/unit_tests/test_optimizer.py | 47 + .../test_optimizer_param_scheduler.py | 0 tests/unit_tests/test_parallel_state.py | 0 tests/unit_tests/test_tokenizer.py | 0 tests/unit_tests/test_training.py | 0 tests/unit_tests/test_utilities.py | 0 tests/unit_tests/test_utils.py | 0 tests/unit_tests/transformer/__init__.py | 0 tests/unit_tests/transformer/moe/__init__.py | 0 tests/unit_tests/transformer/moe/conftest.py | 0 .../moe/test_a2a_token_dispatcher.py | 0 .../transformer/moe/test_aux_loss.py | 44 + .../transformer/moe/test_grouped_mlp.py | 13 +- 
.../transformer/moe/test_moe_layer.py | 12 +- .../transformer/moe/test_routers.py | 63 +- .../transformer/moe/test_sequential_mlp.py | 4 +- .../transformer/moe/test_shared_experts.py | 6 +- .../transformer/moe/test_token_dispatcher.py | 4 +- .../transformer/moe/test_upcycling.py | 10 +- .../unit_tests/transformer/test_attention.py | 0 .../transformer/test_attention_packed_seq.py | 0 .../transformer/test_core_attention.py | 0 tests/unit_tests/transformer/test_mlp.py | 0 tests/unit_tests/transformer/test_module.py | 0 .../test_multi_latent_attention.py | 0 .../transformer/test_retro_attention.py | 0 tests/unit_tests/transformer/test_rope.py | 0 .../transformer/test_spec_customization.py | 0 .../transformer/test_transformer_block.py | 0 .../transformer/test_transformer_layer.py | 0 tools/autoformat.sh | 0 tools/bert_embedding/__init__.py | 0 tools/bert_embedding/dataset.py | 0 tools/bert_embedding/embed.py | 0 tools/bert_embedding/external_libs.py | 0 tools/bert_embedding/huggingface.py | 0 tools/checkpoint/convert.py | 0 tools/checkpoint/hybrid_conversion.py | 0 tools/checkpoint/loader_llama_mistral.py | 0 tools/checkpoint/loader_mcore.py | 0 tools/checkpoint/loader_megatron.py | 0 tools/checkpoint/loader_mixtral_hf.py | 0 tools/checkpoint/saver_mcore.py | 0 tools/checkpoint/saver_megatron.py | 0 tools/checkpoint/schema_base.py | 0 tools/checkpoint/schema_mcore.py | 0 tools/checkpoint/utils.py | 0 tools/copyright.sh | 0 tools/linter.py | 0 tools/merge_datasets.py | 0 tools/openwebtext/README.md | 0 tools/openwebtext/add_id.py | 0 tools/openwebtext/blacklist_urls.py | 0 tools/openwebtext/cleanup_dataset.py | 0 tools/openwebtext/cleanup_fix_dataset.py | 0 tools/openwebtext/filter_ngrams.py | 0 tools/openwebtext/find_duplicates.py | 0 tools/openwebtext/group_duplicate_url.py | 0 tools/openwebtext/merge_jsons.py | 0 tools/openwebtext/remove_group_duplicates.py | 0 tools/preprocess_data.py | 0 tools/preprocess_data_nmt.py | 0 tools/preprocess_mmdata.py | 0 tools/report_theoretical_memory.py | 0 tools/retro/README.md | 0 tools/retro/build_db.md | 0 tools/retro/cli/__init__.py | 0 tools/retro/cli/__main__.py | 0 tools/retro/cli/cli.py | 0 tools/retro/config_utils.py | 0 tools/retro/docker/Dockerfile | 0 tools/retro/preprocess_data.py | 0 tools/retro/sft/README.md | 0 tools/retro/sft/dataset_conv.py | 0 tools/retro/sft/open_inst.sh | 0 tools/retro/sft/sft_retro.py | 0 tools/retro/sft/sft_retro_lm.sh | 0 tools/retro/text_generation/evaluate.py | 0 tools/retro/text_generation/metrics.py | 0 tools/retro/text_generation/retro_api.py | 0 tools/retro/text_generation/retro_generate.sh | 0 .../retro/text_generation/retro_generation.py | 0 .../text_generation/retro_text_generation.py | 0 tools/run_mamba_text_generation_server.py | 0 tools/run_text_generation_server.py | 0 tools/run_vlm_text_generation.py | 0 tools/text_generation_cli.py | 0 train_mixtral_8x7B_1nodes.sh | 5 +- unit-test-job-lts.yaml | 0 1102 files changed, 3538 insertions(+), 1674 deletions(-) mode change 100755 => 100644 .coveragerc mode change 100755 => 100644 .gitignore mode change 100755 => 100644 .gitlab-ci.yml mode change 100755 => 100644 CHANGELOG.md mode change 100755 => 100644 CODEOWNERS mode change 100755 => 100644 CONTRIBUTING.md mode change 100755 => 100644 Dockerfile.ci.dev mode change 100755 => 100644 Dockerfile.ci.lts mode change 100755 => 100644 Dockerfile.linting mode change 100755 => 100644 GPT_pretraining.sh mode change 100755 => 100644 LICENSE mode change 100755 => 100644 Llama_pretraining.sh mode change 100755 => 100644 
MANIFEST.in mode change 100755 => 100644 README.md.origin mode change 100755 => 100644 docs/llama_mistral.md mode change 100755 => 100644 docs/source/api-guide/context_parallel.rst mode change 100755 => 100644 docs/source/api-guide/datasets.rst mode change 100755 => 100644 docs/source/api-guide/dist_checkpointing.rst mode change 100755 => 100644 docs/source/api-guide/dist_checkpointing.strategies.rst mode change 100755 => 100644 docs/source/api-guide/dist_optimizer.md mode change 100755 => 100644 docs/source/api-guide/distributed.rst mode change 100755 => 100644 docs/source/api-guide/encoder_decoder_parallelism.rst mode change 100755 => 100644 docs/source/api-guide/fusions.rst mode change 100755 => 100644 docs/source/api-guide/index.rst mode change 100755 => 100644 docs/source/api-guide/models.bert.rst mode change 100755 => 100644 docs/source/api-guide/models.gpt.rst mode change 100755 => 100644 docs/source/api-guide/models.rst mode change 100755 => 100644 docs/source/api-guide/models.t5.rst mode change 100755 => 100644 docs/source/api-guide/moe.rst mode change 100755 => 100644 docs/source/api-guide/num_microbatches_calculator.rst mode change 100755 => 100644 docs/source/api-guide/optimizer_param_scheduler.rst mode change 100755 => 100644 docs/source/api-guide/pipeline_parallel.rst mode change 100755 => 100644 docs/source/api-guide/tensor_parallel.rst mode change 100755 => 100644 docs/source/api-guide/transformer.rst mode change 100755 => 100644 docs/source/images/context_parallel/CP_overview.png mode change 100755 => 100644 docs/source/images/context_parallel/CP_results.png mode change 100755 => 100644 docs/source/images/distrib_optimizer/data_flow.png mode change 100755 => 100644 docs/source/images/distrib_optimizer/sharding_scheme.png mode change 100755 => 100644 docs/source/images/moe/token_drop.png mode change 100755 => 100644 docs/source/index.rst mode change 100755 => 100644 docs/source/user-guide/index.rst mode change 100755 => 100644 examples/academic_paper_scripts/detxoify_lm/README.md mode change 100755 => 100644 examples/academic_paper_scripts/detxoify_lm/annotations/filter-selfgeneration.py mode change 100755 => 100644 examples/academic_paper_scripts/detxoify_lm/annotations/perspective_api_annotate.py mode change 100755 => 100644 examples/academic_paper_scripts/detxoify_lm/annotations/preprocess.sh mode change 100755 => 100644 examples/academic_paper_scripts/detxoify_lm/finetune_gpt.py mode change 100755 => 100644 examples/academic_paper_scripts/detxoify_lm/finetune_gpt_distributed-1.3b.sh mode change 100755 => 100644 examples/academic_paper_scripts/detxoify_lm/generate-1.3b.sh mode change 100755 => 100644 examples/academic_paper_scripts/detxoify_lm/generate_samples_gpt.py mode change 100755 => 100644 examples/academic_paper_scripts/detxoify_lm/perspective_api.py mode change 100755 => 100644 examples/academic_paper_scripts/detxoify_lm/self_generation/selfgenerate-1.3b-unconditional.sh mode change 100755 => 100644 examples/academic_paper_scripts/msdp/README.md mode change 100755 => 100644 examples/academic_paper_scripts/msdp/data_processing.sh mode change 100755 => 100644 examples/academic_paper_scripts/msdp/eval_knwl_generation.sh mode change 100755 => 100644 examples/academic_paper_scripts/msdp/eval_resp_generation.sh mode change 100755 => 100644 examples/academic_paper_scripts/msdp/prep_resp_gen.sh mode change 100755 => 100644 examples/academic_paper_scripts/msdp/prompt_knwl_gen.sh mode change 100755 => 100644 examples/academic_paper_scripts/msdp/prompt_resp_gen.sh mode 
change 100755 => 100644 examples/academic_paper_scripts/sc21/CONFIG.sh mode change 100755 => 100644 examples/academic_paper_scripts/sc21/README.md mode change 100755 => 100644 examples/academic_paper_scripts/sc21/SBATCH.sh mode change 100755 => 100644 examples/academic_paper_scripts/sc21/SRUN.sh mode change 100755 => 100644 examples/academic_paper_scripts/sc21/run_figure_11.sh mode change 100755 => 100644 examples/academic_paper_scripts/sc21/run_figure_12.sh mode change 100755 => 100644 examples/academic_paper_scripts/sc21/run_figure_13.sh mode change 100755 => 100644 examples/academic_paper_scripts/sc21/run_figure_14.sh mode change 100755 => 100644 examples/academic_paper_scripts/sc21/run_figure_15.sh mode change 100755 => 100644 examples/academic_paper_scripts/sc21/run_figure_16.sh mode change 100755 => 100644 examples/academic_paper_scripts/sc21/run_figure_17.sh mode change 100755 => 100644 examples/academic_paper_scripts/sc21/run_figure_18.sh mode change 100755 => 100644 examples/academic_paper_scripts/sc21/run_table_1.sh mode change 100755 => 100644 examples/bert/README.md mode change 100755 => 100644 examples/bert/train_bert_340m_distributed.sh mode change 100755 => 100644 examples/export/README.md mode change 100755 => 100644 examples/export/knowledge_distillation/pretrain_gpt_modelopt.py mode change 100755 => 100644 examples/export/ptq_and_trtllm_export/README.md mode change 100755 => 100644 examples/export/ptq_and_trtllm_export/ptq_trtllm_llama2_7b.sh mode change 100755 => 100644 examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh mode change 100755 => 100644 examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh mode change 100755 => 100644 examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b.sh mode change 100755 => 100644 examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh mode change 100755 => 100644 examples/export/ptq_and_trtllm_export/ptq_trtllm_mixtral_8x7b.sh mode change 100755 => 100644 examples/export/ptq_and_trtllm_export/text_generation_ptq.py mode change 100755 => 100644 examples/export/ptq_and_trtllm_export/trtllm_text_generation.py mode change 100755 => 100644 examples/export/trtllm_export/README.md mode change 100755 => 100644 examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py mode change 100755 => 100644 examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py mode change 100755 => 100644 examples/gpt3/README.md mode change 100755 => 100644 examples/gpt3/gpt_config.yaml mode change 100755 => 100644 examples/gpt3/train_gpt3_175b_distributed.sh mode change 100755 => 100644 examples/inference/README.md rename examples/inference/gpt/{simple_gpt_batch_inference.py => gpt_batch_inference.py} (91%) mode change 100755 => 100644 mode change 100755 => 100644 examples/inference/llama_mistral/huggingface_reference.py mode change 100755 => 100644 examples/inference/llama_mistral/run_text_generation_llama3.1.sh mode change 100755 => 100644 examples/inference/llama_mistral/run_text_generation_llama3.sh mode change 100755 => 100644 examples/inference/llama_mistral/run_text_generation_mistral.sh mode change 100755 => 100644 examples/inference/run_text_generation_server_345M.sh mode change 100755 => 100644 examples/inference/run_text_generation_server_345M_8_tensor_parallel.sh mode change 100755 => 100644 examples/inference/t5/simple_t5_batch_inference.py mode change 100755 => 100644 examples/mamba/.gitignore mode change 100755 => 100644 examples/mamba/Dockerfile mode change 100755 => 
100644 examples/mamba/README.md mode change 100755 => 100644 examples/mamba/run_text_gen_server_8b.sh mode change 100755 => 100644 examples/mamba/run_text_gen_server_8b_gpt3.sh mode change 100755 => 100644 examples/mamba/train.sh mode change 100755 => 100644 examples/mixtral/README.md mode change 100755 => 100644 examples/mixtral/train_mixtral_8x7b_distributed.sh mode change 100755 => 100644 examples/multimodal/Dockerfile mode change 100755 => 100644 examples/multimodal/README.md mode change 100755 => 100644 examples/multimodal/assets/pretrain_curves.png mode change 100755 => 100644 examples/multimodal/combine_lm_vision_checkpoints.sh mode change 100755 => 100644 examples/multimodal/combine_state_dicts.py mode change 100755 => 100644 examples/multimodal/config.py mode change 100755 => 100644 examples/multimodal/convert_llava_pretrain_to_wds.py mode change 100755 => 100644 examples/multimodal/dataloader_provider.py mode change 100755 => 100644 examples/multimodal/dataset_helpers.py rename examples/multimodal/{ => evaluation}/evaluate_ai2d.py (72%) mode change 100755 => 100644 rename examples/multimodal/{ => evaluation}/evaluate_chartqa.py (77%) mode change 100755 => 100644 rename examples/multimodal/{ => evaluation}/evaluate_coco.py (77%) mode change 100755 => 100644 rename examples/multimodal/{ => evaluation}/evaluate_mathvista.py (92%) mode change 100755 => 100644 rename examples/multimodal/{ => evaluation}/evaluate_mmmu.py (91%) mode change 100755 => 100644 rename examples/multimodal/{ => evaluation}/evaluate_ocrbench.py (95%) mode change 100755 => 100644 rename examples/multimodal/{ => evaluation}/evaluate_textvqa.py (72%) mode change 100755 => 100644 rename examples/multimodal/{ => evaluation}/evaluate_vqav2.py (88%) mode change 100755 => 100644 rename examples/multimodal/{ => evaluation}/evaluation_datasets.py (88%) mode change 100755 => 100644 mode change 100755 => 100644 examples/multimodal/image_processing.py mode change 100755 => 100644 examples/multimodal/layer_specs.py mode change 100755 => 100644 examples/multimodal/manual_prompts.json mode change 100755 => 100644 examples/multimodal/model.py mode change 100755 => 100644 examples/multimodal/model_converter/clip_converter.py mode change 100755 => 100644 examples/multimodal/model_converter/internvit_converter.py mode change 100755 => 100644 examples/multimodal/model_converter/siglip_converter.py mode change 100755 => 100644 examples/multimodal/model_converter/vision_model_tester.py mode change 100755 => 100644 examples/multimodal/multimodal_args.py mode change 100755 => 100644 examples/multimodal/nvlm/README.md mode change 100755 => 100644 examples/multimodal/nvlm/internvit.py mode change 100755 => 100644 examples/multimodal/nvlm/nvlm_prompts.json mode change 100755 => 100644 examples/multimodal/nvlm/pp_checkpoint_converter.py mode change 100755 => 100644 examples/multimodal/nvlm/pretrain_blend.yaml mode change 100755 => 100644 examples/multimodal/nvlm/pretrain_qwen20_72b_internvit_6b.sh mode change 100755 => 100644 examples/multimodal/nvlm/pretrain_yi_34b_internvit_6b.sh mode change 100755 => 100644 examples/multimodal/nvlm/run_text_generation_qwen20_72b_internvit_6b.sh create mode 100644 examples/multimodal/nvlm/run_text_generation_qwen25_7b_siglip.sh mode change 100755 => 100644 examples/multimodal/nvlm/run_text_generation_yi_34b_internvit_6b.sh mode change 100755 => 100644 examples/multimodal/nvlm/sft_34b_internvit.sh mode change 100755 => 100644 examples/multimodal/nvlm/sft_blend.yaml mode change 100755 => 100644 
examples/multimodal/nvlm/sft_qwen20_72b_internvit_6b.sh mode change 100755 => 100644 examples/multimodal/pretrain_dataset.yaml mode change 100755 => 100644 examples/multimodal/pretrain_mistral_clip.sh mode change 100755 => 100644 examples/multimodal/run_text_generation.py mode change 100755 => 100644 examples/multimodal/sft_dataset.yaml mode change 100755 => 100644 examples/multimodal/sft_mistral_clip.sh mode change 100755 => 100644 examples/multimodal/text_generation_mistral_clip.sh mode change 100755 => 100644 examples/multimodal/train.py mode change 100755 => 100644 examples/retro/README.md mode change 100755 => 100644 examples/retro/preprocess_data.sh mode change 100755 => 100644 examples/retro/train_retro_2b_distributed.sh mode change 100755 => 100644 examples/run_simple_mcore_train_loop.py mode change 100755 => 100644 examples/t5/README.md mode change 100755 => 100644 examples/t5/t5_mcore_train_curve.png mode change 100755 => 100644 examples/t5/train_t5_220m_distributed.sh mode change 100755 => 100644 images/model_table.png mode change 100755 => 100644 images/strong_scaling.png mode change 100755 => 100644 images/weak_scaling.png mode change 100755 => 100644 megatron/core/QuickStart.md mode change 100755 => 100644 megatron/core/README.md mode change 100755 => 100644 megatron/core/README_STRAGGLER.md mode change 100755 => 100644 megatron/core/__init__.py mode change 100755 => 100644 megatron/core/config_logger.py mode change 100755 => 100644 megatron/core/datasets/Makefile mode change 100755 => 100644 megatron/core/datasets/__init__.py mode change 100755 => 100644 megatron/core/datasets/bert_dataset.py mode change 100755 => 100644 megatron/core/datasets/blended_dataset.py mode change 100755 => 100644 megatron/core/datasets/blended_megatron_dataset_builder.py mode change 100755 => 100644 megatron/core/datasets/blended_megatron_dataset_config.py mode change 100755 => 100644 megatron/core/datasets/gpt_dataset.py mode change 100755 => 100644 megatron/core/datasets/helpers.cpp mode change 100755 => 100644 megatron/core/datasets/helpers.py mode change 100755 => 100644 megatron/core/datasets/indexed_dataset.py mode change 100755 => 100644 megatron/core/datasets/masked_dataset.py mode change 100755 => 100644 megatron/core/datasets/megatron_dataset.py mode change 100755 => 100644 megatron/core/datasets/megatron_tokenizer.py mode change 100755 => 100644 megatron/core/datasets/multimodal_dataset.py mode change 100755 => 100644 megatron/core/datasets/readme.md mode change 100755 => 100644 megatron/core/datasets/retro/__init__.py mode change 100755 => 100644 megatron/core/datasets/retro/config/__init__.py mode change 100755 => 100644 megatron/core/datasets/retro/config/bert_embedders.py mode change 100755 => 100644 megatron/core/datasets/retro/config/config.py mode change 100755 => 100644 megatron/core/datasets/retro/config/gpt_chunk_datasets.py mode change 100755 => 100644 megatron/core/datasets/retro/config/tokenizers.py mode change 100755 => 100644 megatron/core/datasets/retro/db/__init__.py mode change 100755 => 100644 megatron/core/datasets/retro/db/build.py mode change 100755 => 100644 megatron/core/datasets/retro/db/dataset.py mode change 100755 => 100644 megatron/core/datasets/retro/db/utils.py mode change 100755 => 100644 megatron/core/datasets/retro/external_libs.py mode change 100755 => 100644 megatron/core/datasets/retro/index/__init__.py mode change 100755 => 100644 megatron/core/datasets/retro/index/build.py mode change 100755 => 100644 megatron/core/datasets/retro/index/factory.py 
mode change 100755 => 100644 megatron/core/datasets/retro/index/index.py mode change 100755 => 100644 megatron/core/datasets/retro/index/indexes/__init__.py mode change 100755 => 100644 megatron/core/datasets/retro/index/indexes/faiss_base.py mode change 100755 => 100644 megatron/core/datasets/retro/index/indexes/faiss_par_add.py mode change 100755 => 100644 megatron/core/datasets/retro/index/utils.py mode change 100755 => 100644 megatron/core/datasets/retro/index/validate.py mode change 100755 => 100644 megatron/core/datasets/retro/query/__init__.py mode change 100755 => 100644 megatron/core/datasets/retro/query/gpt_chunk_dataset.py mode change 100755 => 100644 megatron/core/datasets/retro/query/multi_split_gpt_dataset.py mode change 100755 => 100644 megatron/core/datasets/retro/query/query.py mode change 100755 => 100644 megatron/core/datasets/retro/query/retro_dataset.py mode change 100755 => 100644 megatron/core/datasets/retro/query/utils.py mode change 100755 => 100644 megatron/core/datasets/retro/utils.py mode change 100755 => 100644 megatron/core/datasets/t5_dataset.py mode change 100755 => 100644 megatron/core/datasets/utils.py mode change 100755 => 100644 megatron/core/datasets/utils_s3.py mode change 100755 => 100644 megatron/core/dist_checkpointing/__init__.py mode change 100755 => 100644 megatron/core/dist_checkpointing/core.py mode change 100755 => 100644 megatron/core/dist_checkpointing/dict_utils.py mode change 100755 => 100644 megatron/core/dist_checkpointing/exchange_utils.py mode change 100755 => 100644 megatron/core/dist_checkpointing/mapping.py mode change 100755 => 100644 megatron/core/dist_checkpointing/optimizer.py mode change 100755 => 100644 megatron/core/dist_checkpointing/serialization.py mode change 100755 => 100644 megatron/core/dist_checkpointing/state_dict_transformation.py mode change 100755 => 100644 megatron/core/dist_checkpointing/strategies/__init__.py mode change 100755 => 100644 megatron/core/dist_checkpointing/strategies/async_utils.py mode change 100755 => 100644 megatron/core/dist_checkpointing/strategies/base.py mode change 100755 => 100644 megatron/core/dist_checkpointing/strategies/common.py mode change 100755 => 100644 megatron/core/dist_checkpointing/strategies/filesystem_async.py mode change 100755 => 100644 megatron/core/dist_checkpointing/strategies/fully_parallel.py mode change 100755 => 100644 megatron/core/dist_checkpointing/strategies/resharding.py mode change 100755 => 100644 megatron/core/dist_checkpointing/strategies/state_dict_saver.py mode change 100755 => 100644 megatron/core/dist_checkpointing/strategies/tensorstore.py mode change 100755 => 100644 megatron/core/dist_checkpointing/strategies/torch.py mode change 100755 => 100644 megatron/core/dist_checkpointing/strategies/two_stage.py mode change 100755 => 100644 megatron/core/dist_checkpointing/strategies/zarr.py mode change 100755 => 100644 megatron/core/dist_checkpointing/utils.py mode change 100755 => 100644 megatron/core/dist_checkpointing/validation.py mode change 100755 => 100644 megatron/core/distributed/README.md mode change 100755 => 100644 megatron/core/distributed/__init__.py mode change 100755 => 100644 megatron/core/distributed/data_parallel_base.py mode change 100755 => 100644 megatron/core/distributed/distributed_data_parallel.py mode change 100755 => 100644 megatron/core/distributed/distributed_data_parallel_config.py mode change 100755 => 100644 megatron/core/distributed/finalize_model_grads.py mode change 100755 => 100644 
megatron/core/distributed/param_and_grad_buffer.py mode change 100755 => 100644 megatron/core/distributed/torch_fully_sharded_data_parallel.py mode change 100755 => 100644 megatron/core/enums.py mode change 100755 => 100644 megatron/core/export/__init__.py mode change 100755 => 100644 megatron/core/export/data_type.py mode change 100755 => 100644 megatron/core/export/export_config.py mode change 100755 => 100644 megatron/core/export/model_type.py mode change 100755 => 100644 megatron/core/export/trtllm/__init__.py mode change 100755 => 100644 megatron/core/export/trtllm/engine_builder/__init__.py mode change 100755 => 100644 megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py mode change 100755 => 100644 megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py mode change 100755 => 100644 megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py mode change 100755 => 100644 megatron/core/export/trtllm/trt_model_config.py mode change 100755 => 100644 megatron/core/export/trtllm/trt_model_type.py mode change 100755 => 100644 megatron/core/export/trtllm/trtllm_helper.py mode change 100755 => 100644 megatron/core/export/trtllm/trtllm_layers.py mode change 100755 => 100644 megatron/core/export/trtllm/trtllm_weights_converter/__init__.py mode change 100755 => 100644 megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py mode change 100755 => 100644 megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py mode change 100755 => 100644 megatron/core/extensions/__init__.py mode change 100755 => 100644 megatron/core/extensions/transformer_engine.py mode change 100755 => 100644 megatron/core/fusions/__init__.py mode change 100755 => 100644 megatron/core/fusions/fused_bias_dropout.py mode change 100755 => 100644 megatron/core/fusions/fused_bias_geglu.py mode change 100755 => 100644 megatron/core/fusions/fused_bias_gelu.py mode change 100755 => 100644 megatron/core/fusions/fused_bias_swiglu.py mode change 100755 => 100644 megatron/core/fusions/fused_cross_entropy.py mode change 100755 => 100644 megatron/core/fusions/fused_layer_norm.py mode change 100755 => 100644 megatron/core/fusions/fused_softmax.py mode change 100755 => 100644 megatron/core/inference/__init__.py mode change 100755 => 100644 megatron/core/inference/ammo_support/__init__.py mode change 100755 => 100644 megatron/core/inference/ammo_support/gpt/model_specs.py mode change 100755 => 100644 megatron/core/inference/ammo_support/gpt/state_dict_hooks.py mode change 100755 => 100644 megatron/core/inference/common_inference_params.py mode change 100755 => 100644 megatron/core/inference/communication_utils.py mode change 100755 => 100644 megatron/core/inference/engines/__init__.py mode change 100755 => 100644 megatron/core/inference/engines/abstract_engine.py mode change 100755 => 100644 megatron/core/inference/engines/mcore_engine.py mode change 100755 => 100644 megatron/core/inference/inference_request.py mode change 100755 => 100644 megatron/core/inference/model_inference_wrappers/__init__.py mode change 100755 => 100644 megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py mode change 100755 => 100644 megatron/core/inference/model_inference_wrappers/gpt/__init__.py mode change 100755 => 100644 megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py mode change 100755 => 100644 megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py mode 
change 100755 => 100644 megatron/core/inference/model_inference_wrappers/t5/__init__.py mode change 100755 => 100644 megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py mode change 100755 => 100644 megatron/core/inference/modelopt_support/__init__.py mode change 100755 => 100644 megatron/core/inference/modelopt_support/gpt/__init__.py mode change 100755 => 100644 megatron/core/inference/modelopt_support/gpt/model_specs.py mode change 100755 => 100644 megatron/core/inference/modelopt_support/gpt/state_dict_hooks.py create mode 100644 megatron/core/inference/sampling_params.py mode change 100755 => 100644 megatron/core/inference/scheduler.py mode change 100755 => 100644 megatron/core/inference/text_generation_controllers/__init__.py mode change 100755 => 100644 megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py mode change 100755 => 100644 megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py create mode 100644 megatron/core/inference/text_generation_controllers/text_generation_controller.py mode change 100755 => 100644 megatron/core/inference/utils.py mode change 100755 => 100644 megatron/core/inference_params.py mode change 100755 => 100644 megatron/core/jit.py mode change 100755 => 100644 megatron/core/model_parallel_config.py mode change 100755 => 100644 megatron/core/models/T5/__init__.py mode change 100755 => 100644 megatron/core/models/T5/t5_model.py mode change 100755 => 100644 megatron/core/models/T5/t5_spec.py mode change 100755 => 100644 megatron/core/models/__init__.py mode change 100755 => 100644 megatron/core/models/bert/__init__.py mode change 100755 => 100644 megatron/core/models/bert/bert_layer_specs.py mode change 100755 => 100644 megatron/core/models/bert/bert_lm_head.py mode change 100755 => 100644 megatron/core/models/bert/bert_model.py mode change 100755 => 100644 megatron/core/models/bert/pooler.py mode change 100755 => 100644 megatron/core/models/common/__init__.py mode change 100755 => 100644 megatron/core/models/common/embeddings/__init__.py mode change 100755 => 100644 megatron/core/models/common/embeddings/language_model_embedding.py mode change 100755 => 100644 megatron/core/models/common/embeddings/rope_utils.py mode change 100755 => 100644 megatron/core/models/common/embeddings/rotary_pos_embedding.py mode change 100755 => 100644 megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py mode change 100755 => 100644 megatron/core/models/common/language_module/__init__.py mode change 100755 => 100644 megatron/core/models/common/language_module/language_module.py mode change 100755 => 100644 megatron/core/models/common/vision_module/__init__.py mode change 100755 => 100644 megatron/core/models/common/vision_module/vision_module.py mode change 100755 => 100644 megatron/core/models/gpt/__init__.py mode change 100755 => 100644 megatron/core/models/gpt/gpt_layer_specs.py mode change 100755 => 100644 megatron/core/models/gpt/gpt_model.py create mode 100644 megatron/core/models/gpt/moe_module_specs.py mode change 100755 => 100644 megatron/core/models/mamba/__init__.py mode change 100755 => 100644 megatron/core/models/mamba/mamba_layer_specs.py mode change 100755 => 100644 megatron/core/models/mamba/mamba_model.py mode change 100755 => 100644 megatron/core/models/multimodal/__init__.py mode change 100755 => 100644 megatron/core/models/multimodal/llava_model.py mode change 100755 => 100644 megatron/core/models/multimodal/llava_spec.py mode change 100755 => 
100644 megatron/core/models/retro/__init__.py mode change 100755 => 100644 megatron/core/models/retro/base_attention.py mode change 100755 => 100644 megatron/core/models/retro/config.py mode change 100755 => 100644 megatron/core/models/retro/decoder_attention.py mode change 100755 => 100644 megatron/core/models/retro/decoder_spec.py mode change 100755 => 100644 megatron/core/models/retro/encoder_attention.py mode change 100755 => 100644 megatron/core/models/retro/encoder_spec.py mode change 100755 => 100644 megatron/core/models/retro/model.py mode change 100755 => 100644 megatron/core/models/retro/utils.py mode change 100755 => 100644 megatron/core/models/vision/__init__.py mode change 100755 => 100644 megatron/core/models/vision/clip_vit_model.py mode change 100755 => 100644 megatron/core/models/vision/multimodal_projector.py mode change 100755 => 100644 megatron/core/models/vision/vit_layer_specs.py mode change 100755 => 100644 megatron/core/num_microbatches_calculator.py mode change 100755 => 100644 megatron/core/optimizer/__init__.py mode change 100755 => 100644 megatron/core/optimizer/clip_grads.py mode change 100755 => 100644 megatron/core/optimizer/distrib_optimizer.py mode change 100755 => 100644 megatron/core/optimizer/grad_scaler.py mode change 100755 => 100644 megatron/core/optimizer/optimizer.py mode change 100755 => 100644 megatron/core/optimizer/optimizer_config.py mode change 100755 => 100644 megatron/core/optimizer_param_scheduler.py mode change 100755 => 100644 megatron/core/package_info.py mode change 100755 => 100644 megatron/core/packed_seq_params.py mode change 100755 => 100644 megatron/core/parallel_state.py mode change 100755 => 100644 megatron/core/pipeline_parallel/__init__.py mode change 100755 => 100644 megatron/core/pipeline_parallel/p2p_communication.py mode change 100755 => 100644 megatron/core/pipeline_parallel/schedules.py mode change 100755 => 100644 megatron/core/requirements.txt mode change 100755 => 100644 megatron/core/rerun_state_machine.py mode change 100755 => 100644 megatron/core/ssm/__init__.py mode change 100755 => 100644 megatron/core/ssm/mamba_block.py mode change 100755 => 100644 megatron/core/ssm/mamba_hybrid_layer_allocation.py mode change 100755 => 100644 megatron/core/ssm/mamba_layer.py mode change 100755 => 100644 megatron/core/ssm/mamba_mixer.py mode change 100755 => 100644 megatron/core/ssm/triton_cache_manager.py mode change 100755 => 100644 megatron/core/tensor_parallel/__init__.py mode change 100755 => 100644 megatron/core/tensor_parallel/cross_entropy.py mode change 100755 => 100644 megatron/core/tensor_parallel/data.py mode change 100755 => 100644 megatron/core/tensor_parallel/layers.py mode change 100755 => 100644 megatron/core/tensor_parallel/mappings.py mode change 100755 => 100644 megatron/core/tensor_parallel/random.py mode change 100755 => 100644 megatron/core/tensor_parallel/utils.py mode change 100755 => 100644 megatron/core/timers.py mode change 100755 => 100644 megatron/core/transformer/__init__.py mode change 100755 => 100644 megatron/core/transformer/attention.py mode change 100755 => 100644 megatron/core/transformer/cuda_graphs.py mode change 100755 => 100644 megatron/core/transformer/custom_layers/__init__.py mode change 100755 => 100644 megatron/core/transformer/custom_layers/transformer_engine.py mode change 100755 => 100644 megatron/core/transformer/dot_product_attention.py mode change 100755 => 100644 megatron/core/transformer/enums.py mode change 100755 => 100644 megatron/core/transformer/identity_op.py mode 
change 100755 => 100644 megatron/core/transformer/mlp.py mode change 100755 => 100644 megatron/core/transformer/module.py mode change 100755 => 100644 megatron/core/transformer/moe/README.md mode change 100755 => 100644 megatron/core/transformer/moe/__init__.py mode change 100755 => 100644 megatron/core/transformer/moe/experts.py mode change 100755 => 100644 megatron/core/transformer/moe/grouped_gemm_util.py mode change 100755 => 100644 megatron/core/transformer/moe/legacy_a2a_token_dispatcher.py mode change 100755 => 100644 megatron/core/transformer/moe/moe_layer.py mode change 100755 => 100644 megatron/core/transformer/moe/moe_utils.py mode change 100755 => 100644 megatron/core/transformer/moe/router.py mode change 100755 => 100644 megatron/core/transformer/moe/shared_experts.py mode change 100755 => 100644 megatron/core/transformer/moe/token_dispatcher.py mode change 100755 => 100644 megatron/core/transformer/moe/upcycling_utils.py mode change 100755 => 100644 megatron/core/transformer/multi_latent_attention.py mode change 100755 => 100644 megatron/core/transformer/spec_utils.py mode change 100755 => 100644 megatron/core/transformer/torch_layer_norm.py mode change 100755 => 100644 megatron/core/transformer/torch_norm.py mode change 100755 => 100644 megatron/core/transformer/transformer_block.py mode change 100755 => 100644 megatron/core/transformer/transformer_config.py mode change 100755 => 100644 megatron/core/transformer/transformer_layer.py mode change 100755 => 100644 megatron/core/transformer/utils.py mode change 100755 => 100644 megatron/core/utils.py mode change 100755 => 100644 megatron/inference/__init__.py mode change 100755 => 100644 megatron/inference/algos/__init__.py mode change 100755 => 100644 megatron/inference/algos/distillation.py mode change 100755 => 100644 megatron/inference/arguments.py mode change 100755 => 100644 megatron/inference/checkpointing.py mode change 100755 => 100644 megatron/inference/docs/distillation.md mode change 100755 => 100644 megatron/inference/endpoints/common.py mode change 100755 => 100644 megatron/inference/endpoints/completions.py mode change 100755 => 100644 megatron/inference/gpt/__init__.py mode change 100755 => 100644 megatron/inference/gpt/loss_func.py mode change 100755 => 100644 megatron/inference/gpt/model_provider.py mode change 100755 => 100644 megatron/inference/static/index.html mode change 100755 => 100644 megatron/inference/text_generation/__init__.py mode change 100755 => 100644 megatron/inference/text_generation/api.py mode change 100755 => 100644 megatron/inference/text_generation/beam_utils.py mode change 100755 => 100644 megatron/inference/text_generation/communication.py mode change 100755 => 100644 megatron/inference/text_generation/forward_step.py mode change 100755 => 100644 megatron/inference/text_generation/generation.py mode change 100755 => 100644 megatron/inference/text_generation/sampling.py mode change 100755 => 100644 megatron/inference/text_generation/tokenization.py mode change 100755 => 100644 megatron/inference/text_generation_server.py mode change 100755 => 100644 megatron/legacy/data/__init__.py mode change 100755 => 100644 megatron/legacy/data/autoaugment.py mode change 100755 => 100644 megatron/legacy/data/biencoder_dataset_utils.py mode change 100755 => 100644 megatron/legacy/data/data_samplers.py mode change 100755 => 100644 megatron/legacy/data/dataset_utils.py mode change 100755 => 100644 megatron/legacy/data/ict_dataset.py mode change 100755 => 100644 megatron/legacy/data/image_folder.py mode 
change 100755 => 100644 megatron/legacy/data/multimodal_dataset.py mode change 100755 => 100644 megatron/legacy/data/orqa_wiki_dataset.py mode change 100755 => 100644 megatron/legacy/data/realm_dataset_utils.py mode change 100755 => 100644 megatron/legacy/data/realm_index.py mode change 100755 => 100644 megatron/legacy/data/vit_dataset.py mode change 100755 => 100644 megatron/legacy/fp16_deprecated/loss_scaler.py mode change 100755 => 100644 megatron/legacy/fused_kernels/__init__.py mode change 100755 => 100644 megatron/legacy/fused_kernels/compat.h mode change 100755 => 100644 megatron/legacy/fused_kernels/tests/__init__.py mode change 100755 => 100644 megatron/legacy/fused_kernels/tests/test_fused_kernels.py mode change 100755 => 100644 megatron/legacy/fused_kernels/type_shim.h mode change 100755 => 100644 megatron/legacy/indexer.py mode change 100755 => 100644 megatron/legacy/model/__init__.py mode change 100755 => 100644 megatron/legacy/model/bert_model.py mode change 100755 => 100644 megatron/legacy/model/biencoder_model.py mode change 100755 => 100644 megatron/legacy/model/classification.py mode change 100755 => 100644 megatron/legacy/model/enums.py mode change 100755 => 100644 megatron/legacy/model/fused_bias_gelu.py mode change 100755 => 100644 megatron/legacy/model/fused_layer_norm.py mode change 100755 => 100644 megatron/legacy/model/fused_softmax.py mode change 100755 => 100644 megatron/legacy/model/gpt_model.py mode change 100755 => 100644 megatron/legacy/model/language_model.py mode change 100755 => 100644 megatron/legacy/model/module.py mode change 100755 => 100644 megatron/legacy/model/multiple_choice.py mode change 100755 => 100644 megatron/legacy/model/realm_model.py mode change 100755 => 100644 megatron/legacy/model/rms_norm.py mode change 100755 => 100644 megatron/legacy/model/t5_model.py mode change 100755 => 100644 megatron/legacy/model/transformer.py mode change 100755 => 100644 megatron/legacy/model/utils.py mode change 100755 => 100644 megatron/legacy/model/vision/classification.py mode change 100755 => 100644 megatron/legacy/model/vision/dino.py mode change 100755 => 100644 megatron/legacy/model/vision/esvit_swin_backbone.py mode change 100755 => 100644 megatron/legacy/model/vision/inpainting.py mode change 100755 => 100644 megatron/legacy/model/vision/knn_monitor.py mode change 100755 => 100644 megatron/legacy/model/vision/mit_backbone.py mode change 100755 => 100644 megatron/legacy/model/vision/swin_backbone.py mode change 100755 => 100644 megatron/legacy/model/vision/utils.py mode change 100755 => 100644 megatron/legacy/model/vision/vit_backbone.py mode change 100755 => 100644 megatron/legacy/mpu/tests/__init__.py mode change 100755 => 100644 megatron/legacy/mpu/tests/commons.py mode change 100755 => 100644 megatron/legacy/mpu/tests/test_cross_entropy.py mode change 100755 => 100644 megatron/legacy/mpu/tests/test_data.py mode change 100755 => 100644 megatron/legacy/mpu/tests/test_initialize.py mode change 100755 => 100644 megatron/legacy/mpu/tests/test_layers.py mode change 100755 => 100644 megatron/legacy/mpu/tests/test_random.py mode change 100755 => 100644 megatron/training/__init__.py mode change 100755 => 100644 megatron/training/activations.py mode change 100755 => 100644 megatron/training/arguments.py mode change 100755 => 100644 megatron/training/async_utils.py mode change 100755 => 100644 megatron/training/checkpointing.py mode change 100755 => 100644 megatron/training/dist_signal_handler.py mode change 100755 => 100644 
megatron/training/ft_integration.py mode change 100755 => 100644 megatron/training/global_vars.py mode change 100755 => 100644 megatron/training/initialize.py mode change 100755 => 100644 megatron/training/log_handler.py mode change 100755 => 100644 megatron/training/one_logger_utils.py mode change 100755 => 100644 megatron/training/theoretical_memory_usage.py mode change 100755 => 100644 megatron/training/tokenizer/__init__.py mode change 100755 => 100644 megatron/training/tokenizer/bert_tokenization.py mode change 100755 => 100644 megatron/training/tokenizer/gpt2_tokenization.py mode change 100755 => 100644 megatron/training/tokenizer/multimodal_tokenizer.py mode change 100755 => 100644 megatron/training/tokenizer/tokenizer.py mode change 100755 => 100644 megatron/training/training.py mode change 100755 => 100644 megatron/training/utils.py mode change 100755 => 100644 megatron/training/yaml_arguments.py mode change 100755 => 100644 mypy.ini mode change 100755 => 100644 pretrain_bert.py mode change 100755 => 100644 pretrain_gpt.py mode change 100755 => 100644 pretrain_ict.py mode change 100755 => 100644 pretrain_mamba.py mode change 100755 => 100644 pretrain_retro.py mode change 100755 => 100644 pretrain_t5.py mode change 100755 => 100644 pretrain_vision_classify.py mode change 100755 => 100644 pretrain_vision_dino.py mode change 100755 => 100644 pretrain_vision_inpaint.py mode change 100755 => 100644 pretrain_vlm.py mode change 100755 => 100644 pyproject.toml mode change 100755 => 100644 pytest.ini rename requirements/{pytorch:24.01 => pytorch_24.01}/requirements.txt (100%) mode change 100755 => 100644 rename requirements/{pytorch:24.07 => pytorch_24.07}/requirements.txt (100%) mode change 100755 => 100644 create mode 100644 run_1nodes.sh mode change 100755 => 100644 setup.py mode change 100755 => 100644 tasks/data_utils.py mode change 100755 => 100644 tasks/ensemble_classifier.py mode change 100755 => 100644 tasks/eval_utils.py mode change 100755 => 100644 tasks/finetune_utils.py mode change 100755 => 100644 tasks/glue/data.py mode change 100755 => 100644 tasks/glue/finetune.py mode change 100755 => 100644 tasks/glue/mnli.py mode change 100755 => 100644 tasks/glue/qqp.py mode change 100755 => 100644 tasks/main.py mode change 100755 => 100644 tasks/msdp/README.md mode change 100755 => 100644 tasks/msdp/evaluate.py mode change 100755 => 100644 tasks/msdp/main.py mode change 100755 => 100644 tasks/msdp/metrics.py mode change 100755 => 100644 tasks/msdp/preprocessing.py mode change 100755 => 100644 tasks/msdp/prompt.py mode change 100755 => 100644 tasks/orqa/README.md mode change 100755 => 100644 tasks/orqa/evaluate_orqa.py mode change 100755 => 100644 tasks/orqa/evaluate_utils.py mode change 100755 => 100644 tasks/orqa/supervised/data.py mode change 100755 => 100644 tasks/orqa/supervised/eval_utils.py mode change 100755 => 100644 tasks/orqa/supervised/finetune.py mode change 100755 => 100644 tasks/orqa/unsupervised/nq.py mode change 100755 => 100644 tasks/orqa/unsupervised/qa_utils.py mode change 100755 => 100644 tasks/orqa/unsupervised/tokenizers.py mode change 100755 => 100644 tasks/quantize/calibrate_gpt.py mode change 100755 => 100644 tasks/race/data.py mode change 100755 => 100644 tasks/race/finetune.py mode change 100755 => 100644 tasks/vision/classification/classification.py mode change 100755 => 100644 tasks/vision/classification/eval_utils.py mode change 100755 => 100644 tasks/vision/finetune_utils.py mode change 100755 => 100644 tasks/vision/main.py mode change 100755 => 100644 
tasks/vision/segmentation/cityscapes.py mode change 100755 => 100644 tasks/vision/segmentation/data.py mode change 100755 => 100644 tasks/vision/segmentation/finetune_segformer.py mode change 100755 => 100644 tasks/vision/segmentation/finetune_setr.py mode change 100755 => 100644 tasks/vision/segmentation/metrics.py mode change 100755 => 100644 tasks/vision/segmentation/seg_heads.py mode change 100755 => 100644 tasks/vision/segmentation/seg_models.py mode change 100755 => 100644 tasks/vision/segmentation/transforms.py mode change 100755 => 100644 tasks/vision/segmentation/utils.py mode change 100755 => 100644 tasks/zeroshot_gpt/datasets.py mode change 100755 => 100644 tasks/zeroshot_gpt/detokenizer.py mode change 100755 => 100644 tasks/zeroshot_gpt/evaluate.py mode change 100755 => 100644 tests/__init__.py mode change 100755 => 100644 tests/functional_tests/__init__.py mode change 100755 => 100644 tests/functional_tests/python_test_utils/__init__.py mode change 100755 => 100644 tests/functional_tests/python_test_utils/common.py mode change 100755 => 100644 tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py mode change 100755 => 100644 tests/functional_tests/python_test_utils/test_ci_pipeline.py mode change 100755 => 100644 tests/functional_tests/python_test_utils/test_fp8_ci_pipeline.py mode change 100755 => 100644 tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py mode change 100755 => 100644 tests/functional_tests/shell_test_utils/_run_training.sh mode change 100755 => 100644 tests/functional_tests/shell_test_utils/run_ci_test.sh mode change 100755 => 100644 tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_resume_torch_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml 
mode change 100755 => 100644 tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp1_pp2/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp1_pp2/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp1_pp2/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp4_pp1/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp4_pp1/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp4_pp1/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/bert/bert_release/golden_values_0.9.0.json mode change 100755 => 100644 tests/functional_tests/test_cases/bert/bert_release/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/common/ckpt_converter/__main__.py mode change 100755 => 100644 tests/functional_tests/test_cases/common/ckpt_converter/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/golden_values_0.8.0.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/golden_values_0.9.0.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev.json mode change 100755 => 100644 
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_fsdp2_resume_torch_dist_te/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev.json mode change 100755 => 100644 
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch/model_config.yaml mode change 100755 => 
100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel/golden_values_lts.json mode change 100755 => 100644 
tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts.json mode change 
100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 
100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_fsdp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_multi_dist_optimizer_instances_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_resume_torch_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/gpt/gpt3_nightly_mcore_te_tp2_pp1_modelopt_distill_resume/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/golden_values_0.9.0.json mode change 100755 => 100644 tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/golden_values_0.8.0.json mode change 100755 => 100644 tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/golden_values_0.9.0.json mode change 100755 => 100644 tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release_sm/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/golden_values_0.9.0.json mode change 100755 => 100644 
tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/model_config.yaml create mode 100644 tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_freeze_vit_freeze_lm_dgx_a100_1N8G/golden_values_dev.json create mode 100644 tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_freeze_vit_freeze_lm_dgx_a100_1N8G/golden_values_lts.json create mode 100644 tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_freeze_vit_freeze_lm_dgx_a100_1N8G/model_config.yaml create mode 100644 tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_freeze_vit_freeze_lm_dist_opt_dgx_a100_1N8G/golden_values_dev.json create mode 100644 tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_freeze_vit_freeze_lm_dist_opt_dgx_a100_1N8G/golden_values_lts.json create mode 100644 tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_freeze_vit_freeze_lm_dist_opt_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_resume_torch_etp3_dgx_a100_1N7G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 
tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_dev.json mode change 100755 => 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/model_config.yaml mode change 100755 => 100644 tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_lts.json mode change 100755 => 100644 
tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_lts.json mode change 100755 => 100644 tests/functional_tests/test_cases/t5/t5_release/golden_values_0.9.0.json mode change 100755 => 100644 tests/functional_tests/test_cases/t5/t5_release/model_config.yaml mode change 100755 => 100644 tests/test_utils/python_scripts/common.py mode change 100755 => 100644 tests/test_utils/python_scripts/generate_jet_trigger_job.py mode change 100755 => 100644 tests/test_utils/python_scripts/generate_local_jobs.py mode change 100755 => 100644 tests/test_utils/python_scripts/launch_jet_workload.py mode change 100755 => 100644 tests/test_utils/recipes/_build-mcore-dev.yaml mode change 100755 => 100644 tests/test_utils/recipes/_build-mcore-lts.yaml mode change 100755 => 100644 tests/test_utils/recipes/_build-nemo.yaml mode change 100755 => 100644 tests/test_utils/recipes/bert.yaml mode change 100755 => 100644 tests/test_utils/recipes/gpt-modelopt.yaml mode change 100755 => 100644 tests/test_utils/recipes/gpt-nemo.yaml mode change 100755 => 100644 tests/test_utils/recipes/gpt.yaml mode change 100755 => 100644 tests/test_utils/recipes/multimodal-llava.yaml mode change 100755 => 100644 tests/test_utils/recipes/t5.yaml mode change 100755 => 100644 tests/test_utils/recipes/unit-tests.yaml mode change 100755 => 100644 tests/test_utils/shell_scripts/notify.sh mode change 100755 => 100644 tests/unit_tests/__init__.py mode change 100755 => 100644 tests/unit_tests/conftest.py mode change 100755 => 100644 tests/unit_tests/data/__init__.py mode change 100755 => 100644 tests/unit_tests/data/test_bin_reader.py mode change 100755 => 100644 tests/unit_tests/data/test_builder.py mode change 100755 => 100644 tests/unit_tests/data/test_gpt_dataset.py mode change 100755 => 100644 tests/unit_tests/data/test_multimodal_dataset.py mode change 100755 => 100644 tests/unit_tests/data/test_preprocess_data.py mode change 100755 => 100644 tests/unit_tests/data/test_preprocess_mmdata.py mode change 100755 => 100644 tests/unit_tests/dist_checkpointing/__init__.py mode change 100755 => 100644 tests/unit_tests/dist_checkpointing/conftest.py mode change 100755 => 100644 tests/unit_tests/dist_checkpointing/models/__init__.py mode change 100755 => 100644 tests/unit_tests/dist_checkpointing/models/common.py mode change 100755 => 100644 tests/unit_tests/dist_checkpointing/models/test_bert_model.py mode change 100755 => 100644 tests/unit_tests/dist_checkpointing/models/test_gpt_model.py mode change 100755 => 100644 tests/unit_tests/dist_checkpointing/models/test_mamba.py mode change 100755 => 100644 tests/unit_tests/dist_checkpointing/models/test_mlp_glu.py mode change 100755 => 100644 tests/unit_tests/dist_checkpointing/models/test_moe_experts.py mode change 100755 => 100644 tests/unit_tests/dist_checkpointing/models/test_retro_model.py mode change 100755 => 100644 tests/unit_tests/dist_checkpointing/models/test_t5_model.py mode change 100755 => 100644 tests/unit_tests/dist_checkpointing/test_async_save.py mode change 100755 => 100644 tests/unit_tests/dist_checkpointing/test_cached_metadata.py mode change 100755 => 100644 tests/unit_tests/dist_checkpointing/test_flattened_resharding.py mode change 100755 => 100644 tests/unit_tests/dist_checkpointing/test_fp8.py mode change 100755 => 100644 tests/unit_tests/dist_checkpointing/test_fully_parallel.py mode change 100755 => 100644 tests/unit_tests/dist_checkpointing/test_local.py mode change 100755 => 100644 
tests/unit_tests/dist_checkpointing/test_mapping.py mode change 100755 => 100644 tests/unit_tests/dist_checkpointing/test_nonpersistent.py mode change 100755 => 100644 tests/unit_tests/dist_checkpointing/test_optimizer.py mode change 100755 => 100644 tests/unit_tests/dist_checkpointing/test_serialization.py mode change 100755 => 100644 tests/unit_tests/dist_checkpointing/utils.py mode change 100755 => 100644 tests/unit_tests/distributed/test_grad_reduce_for_replicated_embedder.py mode change 100755 => 100644 tests/unit_tests/distributed/test_param_and_grad_buffer.py mode change 100755 => 100644 tests/unit_tests/export/trtllm/__init__.py mode change 100755 => 100644 tests/unit_tests/export/trtllm/test_distributed_fp8.py mode change 100755 => 100644 tests/unit_tests/export/trtllm/test_single_device_fp8.py mode change 100755 => 100644 tests/unit_tests/export/trtllm/test_trtllm_distributed_gpu_converter.py mode change 100755 => 100644 tests/unit_tests/export/trtllm/test_trtllm_helper.py mode change 100755 => 100644 tests/unit_tests/export/trtllm/test_trtllm_layers.py mode change 100755 => 100644 tests/unit_tests/export/trtllm/test_trtllm_single_device_converter.py mode change 100755 => 100644 tests/unit_tests/fusions/test_torch_softmax.py mode change 100755 => 100644 tests/unit_tests/inference/__init__.py mode change 100755 => 100644 tests/unit_tests/inference/engines/__init__.py mode change 100755 => 100644 tests/unit_tests/inference/engines/test_mcore_engine.py mode change 100755 => 100644 tests/unit_tests/inference/model_inference_wrappers/__init__.py mode change 100755 => 100644 tests/unit_tests/inference/model_inference_wrappers/gpt/test_gpt_inference_wrapper.py mode change 100755 => 100644 tests/unit_tests/inference/model_inference_wrappers/t5/test_t5_inference_wrapper.py mode change 100755 => 100644 tests/unit_tests/inference/model_inference_wrappers/test_model_inference_wrapper_config.py mode change 100755 => 100644 tests/unit_tests/inference/test_common_inference_params.py mode change 100755 => 100644 tests/unit_tests/inference/test_flash_decode.py mode change 100755 => 100644 tests/unit_tests/inference/test_inference_utils.py mode change 100755 => 100644 tests/unit_tests/inference/test_modelopt_gpt_model.py mode change 100755 => 100644 tests/unit_tests/inference/test_scheduler.py mode change 100755 => 100644 tests/unit_tests/inference/text_generation_controllers/__init__.py mode change 100755 => 100644 tests/unit_tests/inference/text_generation_controllers/test_encoder_decoder_text_generation_controller.py mode change 100755 => 100644 tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py mode change 100755 => 100644 tests/unit_tests/models/__init__.py mode change 100755 => 100644 tests/unit_tests/models/test_base_embedding.py mode change 100755 => 100644 tests/unit_tests/models/test_bert_model.py mode change 100755 => 100644 tests/unit_tests/models/test_clip_vit_model.py mode change 100755 => 100644 tests/unit_tests/models/test_gpt_model.py mode change 100755 => 100644 tests/unit_tests/models/test_llava_model.py mode change 100755 => 100644 tests/unit_tests/models/test_mamba_model.py mode change 100755 => 100644 tests/unit_tests/models/test_multimodal_projector.py mode change 100755 => 100644 tests/unit_tests/models/test_t5_model.py mode change 100755 => 100644 tests/unit_tests/pipeline_parallel/__init__.py mode change 100755 => 100644 tests/unit_tests/pipeline_parallel/test_helpers.py mode change 100755 => 100644 
tests/unit_tests/pipeline_parallel/test_schedules.py mode change 100755 => 100644 tests/unit_tests/ssm/test_mamba_block.py mode change 100755 => 100644 tests/unit_tests/ssm/test_mamba_hybrid_layer_allocation.py mode change 100755 => 100644 tests/unit_tests/ssm/test_mamba_layer.py mode change 100755 => 100644 tests/unit_tests/ssm/test_mamba_mixer.py mode change 100755 => 100644 tests/unit_tests/tensor_parallel/__init__.py mode change 100755 => 100644 tests/unit_tests/tensor_parallel/test_cross_entropy.py mode change 100755 => 100644 tests/unit_tests/tensor_parallel/test_data.py mode change 100755 => 100644 tests/unit_tests/tensor_parallel/test_initialization.py mode change 100755 => 100644 tests/unit_tests/tensor_parallel/test_layers.py mode change 100755 => 100644 tests/unit_tests/tensor_parallel/test_mappings.py mode change 100755 => 100644 tests/unit_tests/tensor_parallel/test_random.py mode change 100755 => 100644 tests/unit_tests/tensor_parallel/test_tensor_parallel_utils.py mode change 100755 => 100644 tests/unit_tests/test_basic.py mode change 100755 => 100644 tests/unit_tests/test_imports.py mode change 100755 => 100644 tests/unit_tests/test_inference.py mode change 100755 => 100644 tests/unit_tests/test_local_multi_tensor_fns.py mode change 100755 => 100644 tests/unit_tests/test_num_microbatches_calculator.py mode change 100755 => 100644 tests/unit_tests/test_optimizer.py mode change 100755 => 100644 tests/unit_tests/test_optimizer_param_scheduler.py mode change 100755 => 100644 tests/unit_tests/test_parallel_state.py mode change 100755 => 100644 tests/unit_tests/test_tokenizer.py mode change 100755 => 100644 tests/unit_tests/test_training.py mode change 100755 => 100644 tests/unit_tests/test_utilities.py mode change 100755 => 100644 tests/unit_tests/test_utils.py mode change 100755 => 100644 tests/unit_tests/transformer/__init__.py mode change 100755 => 100644 tests/unit_tests/transformer/moe/__init__.py mode change 100755 => 100644 tests/unit_tests/transformer/moe/conftest.py mode change 100755 => 100644 tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py mode change 100755 => 100644 tests/unit_tests/transformer/moe/test_aux_loss.py mode change 100755 => 100644 tests/unit_tests/transformer/moe/test_grouped_mlp.py mode change 100755 => 100644 tests/unit_tests/transformer/moe/test_moe_layer.py mode change 100755 => 100644 tests/unit_tests/transformer/moe/test_routers.py mode change 100755 => 100644 tests/unit_tests/transformer/moe/test_sequential_mlp.py mode change 100755 => 100644 tests/unit_tests/transformer/moe/test_shared_experts.py mode change 100755 => 100644 tests/unit_tests/transformer/moe/test_token_dispatcher.py mode change 100755 => 100644 tests/unit_tests/transformer/moe/test_upcycling.py mode change 100755 => 100644 tests/unit_tests/transformer/test_attention.py mode change 100755 => 100644 tests/unit_tests/transformer/test_attention_packed_seq.py mode change 100755 => 100644 tests/unit_tests/transformer/test_core_attention.py mode change 100755 => 100644 tests/unit_tests/transformer/test_mlp.py mode change 100755 => 100644 tests/unit_tests/transformer/test_module.py mode change 100755 => 100644 tests/unit_tests/transformer/test_multi_latent_attention.py mode change 100755 => 100644 tests/unit_tests/transformer/test_retro_attention.py mode change 100755 => 100644 tests/unit_tests/transformer/test_rope.py mode change 100755 => 100644 tests/unit_tests/transformer/test_spec_customization.py mode change 100755 => 100644 
tests/unit_tests/transformer/test_transformer_block.py mode change 100755 => 100644 tests/unit_tests/transformer/test_transformer_layer.py mode change 100755 => 100644 tools/autoformat.sh mode change 100755 => 100644 tools/bert_embedding/__init__.py mode change 100755 => 100644 tools/bert_embedding/dataset.py mode change 100755 => 100644 tools/bert_embedding/embed.py mode change 100755 => 100644 tools/bert_embedding/external_libs.py mode change 100755 => 100644 tools/bert_embedding/huggingface.py mode change 100755 => 100644 tools/checkpoint/convert.py mode change 100755 => 100644 tools/checkpoint/hybrid_conversion.py mode change 100755 => 100644 tools/checkpoint/loader_llama_mistral.py mode change 100755 => 100644 tools/checkpoint/loader_mcore.py mode change 100755 => 100644 tools/checkpoint/loader_megatron.py mode change 100755 => 100644 tools/checkpoint/loader_mixtral_hf.py mode change 100755 => 100644 tools/checkpoint/saver_mcore.py mode change 100755 => 100644 tools/checkpoint/saver_megatron.py mode change 100755 => 100644 tools/checkpoint/schema_base.py mode change 100755 => 100644 tools/checkpoint/schema_mcore.py mode change 100755 => 100644 tools/checkpoint/utils.py mode change 100755 => 100644 tools/copyright.sh mode change 100755 => 100644 tools/linter.py mode change 100755 => 100644 tools/merge_datasets.py mode change 100755 => 100644 tools/openwebtext/README.md mode change 100755 => 100644 tools/openwebtext/add_id.py mode change 100755 => 100644 tools/openwebtext/blacklist_urls.py mode change 100755 => 100644 tools/openwebtext/cleanup_dataset.py mode change 100755 => 100644 tools/openwebtext/cleanup_fix_dataset.py mode change 100755 => 100644 tools/openwebtext/filter_ngrams.py mode change 100755 => 100644 tools/openwebtext/find_duplicates.py mode change 100755 => 100644 tools/openwebtext/group_duplicate_url.py mode change 100755 => 100644 tools/openwebtext/merge_jsons.py mode change 100755 => 100644 tools/openwebtext/remove_group_duplicates.py mode change 100755 => 100644 tools/preprocess_data.py mode change 100755 => 100644 tools/preprocess_data_nmt.py mode change 100755 => 100644 tools/preprocess_mmdata.py mode change 100755 => 100644 tools/report_theoretical_memory.py mode change 100755 => 100644 tools/retro/README.md mode change 100755 => 100644 tools/retro/build_db.md mode change 100755 => 100644 tools/retro/cli/__init__.py mode change 100755 => 100644 tools/retro/cli/__main__.py mode change 100755 => 100644 tools/retro/cli/cli.py mode change 100755 => 100644 tools/retro/config_utils.py mode change 100755 => 100644 tools/retro/docker/Dockerfile mode change 100755 => 100644 tools/retro/preprocess_data.py mode change 100755 => 100644 tools/retro/sft/README.md mode change 100755 => 100644 tools/retro/sft/dataset_conv.py mode change 100755 => 100644 tools/retro/sft/open_inst.sh mode change 100755 => 100644 tools/retro/sft/sft_retro.py mode change 100755 => 100644 tools/retro/sft/sft_retro_lm.sh mode change 100755 => 100644 tools/retro/text_generation/evaluate.py mode change 100755 => 100644 tools/retro/text_generation/metrics.py mode change 100755 => 100644 tools/retro/text_generation/retro_api.py mode change 100755 => 100644 tools/retro/text_generation/retro_generate.sh mode change 100755 => 100644 tools/retro/text_generation/retro_generation.py mode change 100755 => 100644 tools/retro/text_generation/retro_text_generation.py mode change 100755 => 100644 tools/run_mamba_text_generation_server.py mode change 100755 => 100644 tools/run_text_generation_server.py mode change 
100755 => 100644 tools/run_vlm_text_generation.py mode change 100755 => 100644 tools/text_generation_cli.py mode change 100755 => 100644 unit-test-job-lts.yaml diff --git a/.coveragerc b/.coveragerc old mode 100755 new mode 100644 diff --git a/.gitignore b/.gitignore old mode 100755 new mode 100644 diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml old mode 100755 new mode 100644 diff --git a/CHANGELOG.md b/CHANGELOG.md old mode 100755 new mode 100644 diff --git a/CODEOWNERS b/CODEOWNERS old mode 100755 new mode 100644 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md old mode 100755 new mode 100644 diff --git a/Dockerfile.ci.dev b/Dockerfile.ci.dev old mode 100755 new mode 100644 diff --git a/Dockerfile.ci.lts b/Dockerfile.ci.lts old mode 100755 new mode 100644 diff --git a/Dockerfile.linting b/Dockerfile.linting old mode 100755 new mode 100644 diff --git a/GPT_pretraining.sh b/GPT_pretraining.sh old mode 100755 new mode 100644 diff --git a/LICENSE b/LICENSE old mode 100755 new mode 100644 diff --git a/Llama_pretraining.sh b/Llama_pretraining.sh old mode 100755 new mode 100644 diff --git a/MANIFEST.in b/MANIFEST.in old mode 100755 new mode 100644 diff --git a/README.md.origin b/README.md.origin old mode 100755 new mode 100644 diff --git a/docs/llama_mistral.md b/docs/llama_mistral.md old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/context_parallel.rst b/docs/source/api-guide/context_parallel.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/datasets.rst b/docs/source/api-guide/datasets.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/dist_checkpointing.rst b/docs/source/api-guide/dist_checkpointing.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/dist_checkpointing.strategies.rst b/docs/source/api-guide/dist_checkpointing.strategies.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/dist_optimizer.md b/docs/source/api-guide/dist_optimizer.md old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/distributed.rst b/docs/source/api-guide/distributed.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/encoder_decoder_parallelism.rst b/docs/source/api-guide/encoder_decoder_parallelism.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/fusions.rst b/docs/source/api-guide/fusions.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/index.rst b/docs/source/api-guide/index.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/models.bert.rst b/docs/source/api-guide/models.bert.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/models.gpt.rst b/docs/source/api-guide/models.gpt.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/models.rst b/docs/source/api-guide/models.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/models.t5.rst b/docs/source/api-guide/models.t5.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/moe.rst b/docs/source/api-guide/moe.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/num_microbatches_calculator.rst b/docs/source/api-guide/num_microbatches_calculator.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/optimizer_param_scheduler.rst b/docs/source/api-guide/optimizer_param_scheduler.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/pipeline_parallel.rst b/docs/source/api-guide/pipeline_parallel.rst old mode 100755 new mode 100644 diff --git 
a/docs/source/api-guide/tensor_parallel.rst b/docs/source/api-guide/tensor_parallel.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/transformer.rst b/docs/source/api-guide/transformer.rst old mode 100755 new mode 100644 diff --git a/docs/source/images/context_parallel/CP_overview.png b/docs/source/images/context_parallel/CP_overview.png old mode 100755 new mode 100644 diff --git a/docs/source/images/context_parallel/CP_results.png b/docs/source/images/context_parallel/CP_results.png old mode 100755 new mode 100644 diff --git a/docs/source/images/distrib_optimizer/data_flow.png b/docs/source/images/distrib_optimizer/data_flow.png old mode 100755 new mode 100644 diff --git a/docs/source/images/distrib_optimizer/sharding_scheme.png b/docs/source/images/distrib_optimizer/sharding_scheme.png old mode 100755 new mode 100644 diff --git a/docs/source/images/moe/token_drop.png b/docs/source/images/moe/token_drop.png old mode 100755 new mode 100644 diff --git a/docs/source/index.rst b/docs/source/index.rst old mode 100755 new mode 100644 diff --git a/docs/source/user-guide/index.rst b/docs/source/user-guide/index.rst old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/detxoify_lm/README.md b/examples/academic_paper_scripts/detxoify_lm/README.md old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/detxoify_lm/annotations/filter-selfgeneration.py b/examples/academic_paper_scripts/detxoify_lm/annotations/filter-selfgeneration.py old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/detxoify_lm/annotations/perspective_api_annotate.py b/examples/academic_paper_scripts/detxoify_lm/annotations/perspective_api_annotate.py old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/detxoify_lm/annotations/preprocess.sh b/examples/academic_paper_scripts/detxoify_lm/annotations/preprocess.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/detxoify_lm/finetune_gpt.py b/examples/academic_paper_scripts/detxoify_lm/finetune_gpt.py old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/detxoify_lm/finetune_gpt_distributed-1.3b.sh b/examples/academic_paper_scripts/detxoify_lm/finetune_gpt_distributed-1.3b.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/detxoify_lm/generate-1.3b.sh b/examples/academic_paper_scripts/detxoify_lm/generate-1.3b.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/detxoify_lm/generate_samples_gpt.py b/examples/academic_paper_scripts/detxoify_lm/generate_samples_gpt.py old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/detxoify_lm/perspective_api.py b/examples/academic_paper_scripts/detxoify_lm/perspective_api.py old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/detxoify_lm/self_generation/selfgenerate-1.3b-unconditional.sh b/examples/academic_paper_scripts/detxoify_lm/self_generation/selfgenerate-1.3b-unconditional.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/msdp/README.md b/examples/academic_paper_scripts/msdp/README.md old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/msdp/data_processing.sh b/examples/academic_paper_scripts/msdp/data_processing.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/msdp/eval_knwl_generation.sh b/examples/academic_paper_scripts/msdp/eval_knwl_generation.sh old mode 100755 new mode 100644 diff 
--git a/examples/academic_paper_scripts/msdp/eval_resp_generation.sh b/examples/academic_paper_scripts/msdp/eval_resp_generation.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/msdp/prep_resp_gen.sh b/examples/academic_paper_scripts/msdp/prep_resp_gen.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/msdp/prompt_knwl_gen.sh b/examples/academic_paper_scripts/msdp/prompt_knwl_gen.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/msdp/prompt_resp_gen.sh b/examples/academic_paper_scripts/msdp/prompt_resp_gen.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/CONFIG.sh b/examples/academic_paper_scripts/sc21/CONFIG.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/README.md b/examples/academic_paper_scripts/sc21/README.md old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/SBATCH.sh b/examples/academic_paper_scripts/sc21/SBATCH.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/SRUN.sh b/examples/academic_paper_scripts/sc21/SRUN.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/run_figure_11.sh b/examples/academic_paper_scripts/sc21/run_figure_11.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/run_figure_12.sh b/examples/academic_paper_scripts/sc21/run_figure_12.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/run_figure_13.sh b/examples/academic_paper_scripts/sc21/run_figure_13.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/run_figure_14.sh b/examples/academic_paper_scripts/sc21/run_figure_14.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/run_figure_15.sh b/examples/academic_paper_scripts/sc21/run_figure_15.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/run_figure_16.sh b/examples/academic_paper_scripts/sc21/run_figure_16.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/run_figure_17.sh b/examples/academic_paper_scripts/sc21/run_figure_17.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/run_figure_18.sh b/examples/academic_paper_scripts/sc21/run_figure_18.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/run_table_1.sh b/examples/academic_paper_scripts/sc21/run_table_1.sh old mode 100755 new mode 100644 diff --git a/examples/bert/README.md b/examples/bert/README.md old mode 100755 new mode 100644 diff --git a/examples/bert/train_bert_340m_distributed.sh b/examples/bert/train_bert_340m_distributed.sh old mode 100755 new mode 100644 diff --git a/examples/export/README.md b/examples/export/README.md old mode 100755 new mode 100644 diff --git a/examples/export/knowledge_distillation/pretrain_gpt_modelopt.py b/examples/export/knowledge_distillation/pretrain_gpt_modelopt.py old mode 100755 new mode 100644 diff --git a/examples/export/ptq_and_trtllm_export/README.md b/examples/export/ptq_and_trtllm_export/README.md old mode 100755 new mode 100644 diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama2_7b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama2_7b.sh old mode 100755 new mode 100644 diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh old 
mode 100755 new mode 100644 diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh old mode 100755 new mode 100644 diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b.sh old mode 100755 new mode 100644 diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh old mode 100755 new mode 100644 diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_mixtral_8x7b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_mixtral_8x7b.sh old mode 100755 new mode 100644 diff --git a/examples/export/ptq_and_trtllm_export/text_generation_ptq.py b/examples/export/ptq_and_trtllm_export/text_generation_ptq.py old mode 100755 new mode 100644 diff --git a/examples/export/ptq_and_trtllm_export/trtllm_text_generation.py b/examples/export/ptq_and_trtllm_export/trtllm_text_generation.py old mode 100755 new mode 100644 diff --git a/examples/export/trtllm_export/README.md b/examples/export/trtllm_export/README.md old mode 100755 new mode 100644 diff --git a/examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py b/examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py old mode 100755 new mode 100644 diff --git a/examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py b/examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py old mode 100755 new mode 100644 diff --git a/examples/gpt3/README.md b/examples/gpt3/README.md old mode 100755 new mode 100644 diff --git a/examples/gpt3/gpt_config.yaml b/examples/gpt3/gpt_config.yaml old mode 100755 new mode 100644 index 443e4b7..0625782 --- a/examples/gpt3/gpt_config.yaml +++ b/examples/gpt3/gpt_config.yaml @@ -63,6 +63,7 @@ language_model: # MoE related moe_router_load_balancing_type: "aux_loss" moe_router_topk: 2 + moe_router_topk_limited_devices: null moe_grouped_gemm: False moe_aux_loss_coeff: 0 # 1e-2 would be a good start value for load balance loss. moe_z_loss_coeff: null # 1e-3 would be a good start value for z-loss diff --git a/examples/gpt3/train_gpt3_175b_distributed.sh b/examples/gpt3/train_gpt3_175b_distributed.sh old mode 100755 new mode 100644 diff --git a/examples/inference/README.md b/examples/inference/README.md old mode 100755 new mode 100644 index bd8e738..b4b07cb --- a/examples/inference/README.md +++ b/examples/inference/README.md @@ -1,5 +1,5 @@ ### Megatron Core Inference Documentation -This guide will walk you through how you can use megatron core for inference on your models. +This guide provides an example for Megatron Core for running model inference. ### Contents - [Megatron Core Inference Documentation](#megatron-core-inference-documentation) @@ -18,21 +18,21 @@ This guide will walk you through how you can use megatron core for inference on
#### 1. Quick Start -This will walk you through the flow of running batch inference on a GPT model trained using megatron core. The file can be found at [simple_gpt_batch_inference.py](./gpt/simple_gpt_batch_inference.py) +This example runs batch inference on a GPT model trained using Megatron Core. The entrypoint is [gpt_batch_inference.py](./gpt/gpt_batch_inference.py)
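For orientation, the sketch below condenses Steps 1-4 of the walkthrough into a single snippet. It is illustrative only: `model`, `tokenizer`, `args`, and `inference_wrapper_config` are placeholders that the steps below (and [gpt_batch_inference.py](./gpt/gpt_batch_inference.py)) show how to construct.

```python
# Illustrative sketch of the batch inference flow; the objects below are
# created in Steps 1-2 of the walkthrough and in gpt_batch_inference.py.
from megatron.core.inference.engines.mcore_engine import MCoreEngine
from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import GPTInferenceWrapper
from megatron.core.inference.sampling_params import SamplingParams
from megatron.core.inference.text_generation_controllers.text_generation_controller import TextGenerationController

# model, tokenizer, args and inference_wrapper_config are assumed to exist.
inference_wrapped_model = GPTInferenceWrapper(model, inference_wrapper_config)
text_generation_controller = TextGenerationController(
    inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer
)
inference_engine = MCoreEngine(
    text_generation_controller=text_generation_controller, max_batch_size=args.max_batch_size
)
results = inference_engine.generate(
    prompts=args.prompts, sampling_params=SamplingParams(temperature=1.0)
)
```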
-##### 1.1 Understanding The Code -***STEP 1 - We initialize model parallel and other default arguments*** -We can default micro batch size to be 1, since for TP models it is not used, and for PP models it is calculated during runtime. +##### 1.1 Code Walkthrough +***STEP 1 - Initialize model parallel and other default arguments*** +The micro batch size is set to 1, since it is not used for tensor-parallel-only models and is calculated at runtime for pipeline-parallel models. ```python initialize_megatron( args_defaults={'no_load_rng': True, 'no_load_optim': True, 'micro_batch_size': 1} ) ``` -***STEP 2 - We load the model using the model_provider_function*** -NOTE: The model provider function in the script supports MCore and Legacy models. +***STEP 2 - Load the model using the model_provider_function*** +NOTE: The model provider function supports both MCore and Legacy models. ```python model = get_model(model_provider, wrap_with_ddp=False) @@ -41,10 +41,10 @@ NOTE: The model provider function in the script supports MCore and Legacy models ``` ***STEP 3 - Choose an engine*** -One of the important elements of the generate function is an inference engine. In this example we will be choosing the [megatron core engine](../../megatron/core/inference/engine/mcore_engine.py) with a [simple text generation controller](../../megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py), the default engine. Other engines that will be supported in the future are TRTLLMEngine. +Text generation requires an inference engine, which includes a scheduler. The default engine is the [Megatron Core engine](../../megatron/core/inference/engine/mcore_engine.py) with a simple [text generation controller](../../megatron/core/inference/text_generation_controllers/text_generation_controller.py). TRTLLMEngine will be supported in the future. ```python inference_wrapped_model = GPTInferenceWrapper(model, args) - text_generation_controller = SimpleTextGenerationController( + text_generation_controller = TextGenerationController( inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer ) @@ -53,12 +53,12 @@ One of the important elements of the generate function is an inference engine. I ) ``` -***STEP 4 - Run the generate function and display results*** -We use default values for the [common inference params](../../megatron/core/inference/common_inference_params.py). Customize this if you want to change top_p, top_k, number of tokens to generate etc. -*Note that the result is returned as a list of [InferenceRequests](../../megatron/core/inference/inference_request.py)* +***STEP 4 - Run text generation*** +The [SamplingParams](../../megatron/core/inference/sampling_params.py) contains suggested defaults. Customize this to change top_p, top_k, number of tokens to generate etc. +*Note: The result is returned as a list of [InferenceRequests](../../megatron/core/inference/inference_request.py)* ```python results: List[InferenceRequest] = inference_engine.generate( - prompts=args.prompts, common_inference_params=common_inference_params + prompts=args.prompts, sampling_params=sampling_params ) if torch.distributed.get_rank() == 0: @@ -76,12 +76,12 @@ We use default values for the [common inference params](../../megatron/core/infe
##### 1.2 Running The Code -An example run script is shown below. Change the tokenizer paths, inference params, and other settings for your model. +An example run script is shown below. Set the tokenizer paths, inference params, and other settings appropriately. -For a quick recap on inference params refer to [this blog](https://ivibudh.medium.com/a-guide-to-controlling-llm-model-output-exploring-top-k-top-p-and-temperature-parameters-ed6a31313910) +For a quick recap on sampling parameters, refer to [this blog](https://ivibudh.medium.com/a-guide-to-controlling-llm-model-output-exploring-top-k-top-p-and-temperature-parameters-ed6a31313910). ``` -#In a slurm cluster (You could also use docker) +# In a slurm cluster (You could also use docker) ACCOUNT= MLM_PATH=/path/to/megatron-lm GPT_CKPT=/path/to/gpt/ckpt @@ -133,8 +133,8 @@ NOTE: Other parameters which can be customized for inference are :- --top_p (top_p sampling) --num-tokens-to-generate (Number of tokens to generate for each prompt) --inference-batch-times-seqlen-threshold (During inference, if batch-size times sequence-length is smaller than this threshold then we will not use pipelining, otherwise we will.') ---use-dist-ckpt (If you are using dist checkpoint format for the model) ---use-legacy-models (If you are using legacy gpt model instead of mcore gpt model) +--use-dist-ckpt (If using dist checkpoint format for the model) +--use-legacy-models (If using legacy gpt model instead of mcore gpt model) ``` @@ -142,16 +142,17 @@ NOTE: Other parameters which can be customized for inference are :-
-#### 2. Flow of Control In MCore Backend -The following is what happens in the [simple_gpt_batch_inference.py](./gpt/simple_gpt_batch_inference.py). -* We call [mcore_engine](../../megatron/core/inference/engines/mcore_engine.py) **generate()** function with all our input prompts. -* The scheduler in the engine will add these prompts to the [active requests] pool (../../megatron/core/inference/inference_request.py) until we hit the max batch size, and then it will put the rest in the waiting requests pool. -* The engine will then run until all requests (waiting + active) are completed +#### 2. Control Flow in the MCore Backend +An example of inference with static batching is provided in [gpt_batch_inference.py](./gpt/gpt_batch_inference.py). +* The [mcore_engine](../../megatron/core/inference/engines/mcore_engine.py) **generate()** function is called with the input prompts. +* The `Scheduler` in the engine will add these prompts to the [active requests pool](../../megatron/core/inference/inference_request.py) until the max batch size is hit. Remaining requests will be added to the waiting requests pool. +* The engine will run until all requests (waiting + active) are completed. * The active requests are passed into **generate_all_output_tokens_static_batch()** of the text generation controller . - * This function uses the [model_inference_wrappers](../../megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py) **prep_model_for_inference()** , and then runs an auto regressive loop - * In the auto regressive loop, the **get_batch_for_context_window()** method of the inference wrapper is called to get the required input, passes it into the **run_one_forward_step()** method, which calls the appropriate (PP, TP) model `.forward()` methods to get the output logits - * The output logits are synchronized across all pipeline parallel ranks - * The text generation controller obtains the log probabilities and samples tokens based on the strategy defined in the common inference parameters. + * This function uses the **prep_model_for_inference()** method of the [model_inference_wrappers](../../megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py) and runs an autoregressive sampling loop + * In the autoregressive loop, the **get_batch_for_context_window()** method of the inference wrapper is called to slice out the input tokens and masks + * Input tokens and masks are passed into the **run_one_forward_step()** method, which calls the model `.forward()` method to get the output logits + * Output logits are synchronized across all pipeline parallel ranks + * The text generation controller obtains the log probabilities and samples tokens based on the strategy defined in the sampling parameters. * The sampled tokens are then appended to the input prompt tokens for the next iteration * The **update_generation_status()** method of the text generation controller checks which prompts have finished generating or hit a stop condition * After the inference loop, the result is detokenized and stored as an attribute of the InferenceRequest. These requests are marked as completed. @@ -160,16 +161,18 @@ The following is what happens in the [simple_gpt_batch_inference.py](./gpt/simpl
#### 3. Customizing The Inference Pipeline -The following guide will walk you through how you can customize different parts of the inference pipeline. There are three levels at which you can customize the pipeline. -* **Inference engine** - Highest level of customization. Currently we support the MCore Engine. Change this to add a new engine. -* **Text generation controller** - Extend this to customize tokenization, detokenization, or implement a new sampling strategy. + +The inference pipeline supports three levels of customization: + +* **Inference engine** - The MCore Engine is currently supported. Change this to add a new backend. +* **Text generation controller** - The main sampling loop. This can be customized to support alternative tokenization, detokenization, or to implement a new sampling strategy. * **Inference Wrapped Model** - Change this to support a new model. * **Modify Inference Parameters** - Change this to update top_p, top_k, number of tokens to be generated, temperature, or other sampling parameters.
##### 3.1. Create Your Own Inference Backend -This is the highest level of customization. The [abstract_engine.py](./../../megatron/core/inference/engine/abstract_engine.py) file has a generate method that can be extended to support a new backend. +The [abstract_engine.py](./../../megatron/core/inference/engine/abstract_engine.py) file contains a `generate` method that can be extended to support a new backend. ```python class AbstractEngine(ABC): @@ -177,15 +180,17 @@ class AbstractEngine(ABC): def generate(self) -> dict: """The abstract backend's generate function. - To define your own backend, make sure you implement this and return the outputs as a dictionary . - + To define a new backend, implement this method and return the outputs as a dictionary. +```
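As a purely hypothetical illustration, a new backend could subclass the abstract engine; only the `AbstractEngine` import path and the `generate()` signature come from the snippets in this guide, and the class and attribute names below are made up.

```python
from megatron.core.inference.engines.abstract_engine import AbstractEngine


class MyCustomEngine(AbstractEngine):
    """Illustrative backend that would delegate sampling to an existing controller."""

    def __init__(self, text_generation_controller):
        self.controller = text_generation_controller

    def generate(self) -> dict:
        # A real backend would schedule requests, run its own sampling loop
        # (or call out to an external serving stack), and return the outputs
        # as a dictionary keyed by request id.
        raise NotImplementedError
```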
-##### 3.2. Create Your Own Text Generation Controller -In case you want to use the megatron core backend, but would like to overwrite the tokenization, text generation or detokenization extend the [simple_text_generation_controller.py](../../megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py). The class has the following methods +##### 3.2. Implement a new Sampling Loop + +The [TextGenerationController](../../megatron/core/inference/text_generation_controllers/text_generation_controller.py) contains the main sampling loop and can be modified to support new tokenization, detokenization, or sampling strategies. + ``` python -class SimpleTextGenerationController: +class TextGenerationController: def tokenize_prompt(self, prompt: str) -> Tuple[torch.Tensor, torch.Tensor]: """Utility to tokenize the input prompts""" @@ -193,12 +198,12 @@ class SimpleTextGenerationController: def sample_from_logits( self, last_token_logits: torch.Tensor, - common_inference_params: CommonInferenceParams, + sampling_params: SamplingParams, vocab_size: int, ) -> torch.Tensor: """Samples the logits to generate outputs - Given the logits of the last token, this function samples it according to the parameters defined in common_inference_params and returns the samples + Given the logits of the last token, this function samples according to the parameters defined in sampling_params and returns the sampled tokens. """ def update_generation_status( @@ -229,12 +234,12 @@ class SimpleTextGenerationController:
##### 3.3. Support Other Models -In order to support other models please extend the [abstract_model_inference_wrapper.py](./../../megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py) file. The abstract wrapper already supports the following : -* Forward method which automatically calls the appropriate forward method (PP or TP etc) depending on model parallel settings -* Initalizes the model and puts it in eval mode -* Obtains the input parameters (batch size, max seq length) and has an instance of the input +Extend [abstract_model_inference_wrapper.py](./../../megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py) to support other models. The abstract model wrapper implements: +* Forward method which calls the model `forward` method depending on model parallel settings +* Initializes the model and puts it in `.eval()` mode +* Setup for the input parameters (max batch size, max seq length) -The main methods to change for your model might be the following: +The following methods should be implemented: ```python class AbstractModelInferenceWrapper: def prep_model_for_inference(self, prompts_tokens: torch.Tensor): @@ -247,28 +252,28 @@ class AbstractModelInferenceWrapper: def get_batch_for_context_window(self) -> List: """Returns the input data for inference - This function gets called iteratively in the inference loop . It can be used to extract relevant input from the prompt tokens, attention mask etc. required for each step in inference. + This function gets called iteratively in the inference loop. It can be used to extract relevant input from the prompt tokens, attention mask etc. required for each step in inference. ``` -Refer to [gpt_inference_wrapper.py](../../megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py) for an example of extending this for GPTModel. +Refer to [gpt_inference_wrapper.py](../../megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py) for an example of implementing this for GPTModel.
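A hypothetical skeleton is sketched below; only the base class and the two method names come from this guide, and the module path is assumed to match the file linked above. Everything else (class name, method bodies) is illustrative.

```python
from typing import List

from megatron.core.inference.model_inference_wrappers.abstract_model_inference_wrapper import (
    AbstractModelInferenceWrapper,
)


class MyModelInferenceWrapper(AbstractModelInferenceWrapper):
    """Illustrative wrapper for a custom model."""

    def prep_model_for_inference(self, prompts_tokens):
        # One-time setup per batch, e.g. store the prompt tokens and build any
        # masks or position ids the model needs.
        ...

    def get_batch_for_context_window(self) -> List:
        # Called at every step of the inference loop; return the slice of
        # tokens / masks required for the current step.
        ...
```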
##### 3.3. Modify Inference Parameters -We use [common inference params](../../megatron/core/inference/common_inference_params.py) for text generation. Customize this if you want to change top_p, top_k, number of tokens to generate etc. If you want to add other attributes that you would use in the inference loop, you can do that as shown below +We use [sampling params](../../megatron/core/inference/sampling_params.py) for text generation. Customize this if you want to change top_p, top_k, the number of tokens to generate, etc. If you want to add other attributes for use in the inference loop, you can do that as shown below ``` -from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.sampling_params import SamplingParams -c = CommonInferenceParams(temperature=0.5) +c = SamplingParams(temperature=0.5) c.add_attributes({'min_length':4, 'eod_id':153}) ```
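Illustratively, the customized object is then passed to the engine exactly like the default one; the prompt text and attribute values below are made up, and `inference_engine` is assumed to have been built as in the Quick Start.

```python
from megatron.core.inference.sampling_params import SamplingParams

sampling_params = SamplingParams(temperature=0.5, top_k=2, num_tokens_to_generate=32)
sampling_params.add_attributes({'min_length': 4, 'eod_id': 153})

results = inference_engine.generate(
    prompts=["Hello, my name is"], sampling_params=sampling_params
)
```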
#### 4. Future work -The following are planned for the future releases . +The following features are planned for the future releases. * Dynamic batching * Paged Attention * TRTLLM Engine support -* Support for Multimodal model inference \ No newline at end of file +* Support for multimodal inference \ No newline at end of file diff --git a/examples/inference/gpt/simple_gpt_batch_inference.py b/examples/inference/gpt/gpt_batch_inference.py old mode 100755 new mode 100644 similarity index 91% rename from examples/inference/gpt/simple_gpt_batch_inference.py rename to examples/inference/gpt/gpt_batch_inference.py index 5c7ae5b..050b230 --- a/examples/inference/gpt/simple_gpt_batch_inference.py +++ b/examples/inference/gpt/gpt_batch_inference.py @@ -6,10 +6,10 @@ import sys from argparse import Namespace from megatron.core.inference.engines.abstract_engine import AbstractEngine from megatron.core.inference.engines.mcore_engine import MCoreEngine -from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import GPTInferenceWrapper from megatron.core.inference.inference_request import InferenceRequest -from megatron.core.inference.text_generation_controllers.simple_text_generation_controller import SimpleTextGenerationController +from megatron.core.inference.text_generation_controllers.text_generation_controller import TextGenerationController from megatron.core.transformer.module import MegatronModule sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))) @@ -66,7 +66,7 @@ def get_inference_engine(args: Namespace, model: MegatronModule) -> AbstractEngi ) inference_wrapped_model = GPTInferenceWrapper(model, inference_wrapper_config) - text_generation_controller = SimpleTextGenerationController(inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer) + text_generation_controller = TextGenerationController(inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer) return MCoreEngine(text_generation_controller=text_generation_controller, max_batch_size=args.max_batch_size) def main(): @@ -89,7 +89,7 @@ def main(): inference_engine = get_inference_engine(args, model) - common_inference_params = CommonInferenceParams( + sampling_params = SamplingParams( temperature=args.temperature, top_k=args.top_k, top_p=args.top_p, @@ -97,7 +97,7 @@ def main(): num_tokens_to_generate=args.num_tokens_to_generate) results: List[InferenceRequest] = inference_engine.generate( - prompts=args.prompts, common_inference_params=common_inference_params + prompts=args.prompts, sampling_params=sampling_params ) if torch.distributed.get_rank() == 0: diff --git a/examples/inference/llama_mistral/huggingface_reference.py b/examples/inference/llama_mistral/huggingface_reference.py old mode 100755 new mode 100644 diff --git a/examples/inference/llama_mistral/run_text_generation_llama3.1.sh b/examples/inference/llama_mistral/run_text_generation_llama3.1.sh old mode 100755 new mode 100644 diff --git a/examples/inference/llama_mistral/run_text_generation_llama3.sh b/examples/inference/llama_mistral/run_text_generation_llama3.sh old mode 100755 new mode 100644 diff --git a/examples/inference/llama_mistral/run_text_generation_mistral.sh b/examples/inference/llama_mistral/run_text_generation_mistral.sh old mode 100755 new mode 100644 diff --git 
a/examples/inference/run_text_generation_server_345M.sh b/examples/inference/run_text_generation_server_345M.sh old mode 100755 new mode 100644 diff --git a/examples/inference/run_text_generation_server_345M_8_tensor_parallel.sh b/examples/inference/run_text_generation_server_345M_8_tensor_parallel.sh old mode 100755 new mode 100644 diff --git a/examples/inference/t5/simple_t5_batch_inference.py b/examples/inference/t5/simple_t5_batch_inference.py old mode 100755 new mode 100644 index 3f4557d..b4226d7 --- a/examples/inference/t5/simple_t5_batch_inference.py +++ b/examples/inference/t5/simple_t5_batch_inference.py @@ -5,7 +5,7 @@ from argparse import Namespace import torch import pretrain_t5 -from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.engines.abstract_engine import AbstractEngine from megatron.core.inference.engines.mcore_engine import MCoreEngine from megatron.core.inference.inference_request import InferenceRequest @@ -120,7 +120,7 @@ def main(): inference_engine = get_inference_engine(args, model) - common_inference_params = CommonInferenceParams( + sampling_params = SamplingParams( temperature=args.temperature, top_k=args.top_k, top_p=args.top_p, @@ -138,7 +138,7 @@ def main(): prompts=args.prompts, add_BOS=True, encoder_prompts=args.encoder_prompts, - common_inference_params=common_inference_params, + sampling_params=sampling_params, ) if torch.distributed.get_rank() == 0: diff --git a/examples/mamba/.gitignore b/examples/mamba/.gitignore old mode 100755 new mode 100644 diff --git a/examples/mamba/Dockerfile b/examples/mamba/Dockerfile old mode 100755 new mode 100644 diff --git a/examples/mamba/README.md b/examples/mamba/README.md old mode 100755 new mode 100644 diff --git a/examples/mamba/run_text_gen_server_8b.sh b/examples/mamba/run_text_gen_server_8b.sh old mode 100755 new mode 100644 diff --git a/examples/mamba/run_text_gen_server_8b_gpt3.sh b/examples/mamba/run_text_gen_server_8b_gpt3.sh old mode 100755 new mode 100644 diff --git a/examples/mamba/train.sh b/examples/mamba/train.sh old mode 100755 new mode 100644 diff --git a/examples/mixtral/README.md b/examples/mixtral/README.md old mode 100755 new mode 100644 diff --git a/examples/mixtral/train_mixtral_8x7b_distributed.sh b/examples/mixtral/train_mixtral_8x7b_distributed.sh old mode 100755 new mode 100644 diff --git a/examples/multimodal/Dockerfile b/examples/multimodal/Dockerfile old mode 100755 new mode 100644 diff --git a/examples/multimodal/README.md b/examples/multimodal/README.md old mode 100755 new mode 100644 index 62e4756..a65839f --- a/examples/multimodal/README.md +++ b/examples/multimodal/README.md @@ -16,7 +16,7 @@ You can build a docker container using `examples/multimodal/Dockerfile` to run t ### Language model -Follow the instructions in [Mistral](../../docs/llama_mistral.md#mistral-7b) to download weights for Mistral-7B-Instruct-v0.3 (Base or Instruct) from HuggingFace and convert to mcore format with tensor parallel size 4. +Follow the instructions in [Mistral](../../docs/llama_mistral.md#mistral-7b) to download weights for Mistral-7B-Instruct-v0.3 from HuggingFace and convert to mcore format with tensor parallel size 4. Please use the tokenizer from HuggingFace. 
### Vision model @@ -113,7 +113,7 @@ Run the following script: ``` examples/multimodal/text_generation_mistral_clip.sh --input-image-path /path/to/input/images --output-path /some/output/directory \ - --model-path /path/to/model.pt --tokenizer-path /path/to/tokenizer/ --gt-path /path/to/groundtruth/file --task generation-task-name + --model-path /path/to/model.pt --gt-path /path/to/groundtruth/file --task generation-task-name ``` where `--task generation-task-name` is the name of the evaluation benchmark such as `captioning` or `MMMU`. diff --git a/examples/multimodal/assets/pretrain_curves.png b/examples/multimodal/assets/pretrain_curves.png old mode 100755 new mode 100644 diff --git a/examples/multimodal/combine_lm_vision_checkpoints.sh b/examples/multimodal/combine_lm_vision_checkpoints.sh old mode 100755 new mode 100644 diff --git a/examples/multimodal/combine_state_dicts.py b/examples/multimodal/combine_state_dicts.py old mode 100755 new mode 100644 diff --git a/examples/multimodal/config.py b/examples/multimodal/config.py old mode 100755 new mode 100644 index 343fcd5..ee40460 --- a/examples/multimodal/config.py +++ b/examples/multimodal/config.py @@ -7,34 +7,20 @@ from megatron.training.activations import fast_gelu, quick_gelu, squared_relu def get_language_model_config(config): - if config.language_model_type == "2b": + if config.language_model_type == "llama3_8b": + config.activation_func = torch.nn.functional.silu config.add_bias_linear = False config.bias_activation_fusion = False config.gated_linear_unit = True - config.apply_query_key_layer_scaling = True - config.layernorm_zero_centered_gamma = True - config.bias_dropout_fusion = False - config.rotary_percent = 0.5 - config.apply_rope_fusion = False - config.attention_softmax_in_fp32 = True - elif config.language_model_type == "8b": - config.add_bias_linear = False - config.bias_activation_fusion = False - config.gated_linear_unit = False - config.apply_query_key_layer_scaling = True - config.layernorm_zero_centered_gamma = True + config.apply_query_key_layer_scaling = False + config.layernorm_zero_centered_gamma = ( + False # Zero centered gamma not supported for RMSNorm + ) config.bias_dropout_fusion = False - config.rotary_percent = 0.5 - config.attention_dropout = 0.0 config.apply_rope_fusion = False - config.activation_func = squared_relu - config.ffn_hidden_size = 16384 - config.masked_softmax_fusion = True config.attention_softmax_in_fp32 = True - config.num_query_groups = 32 - config.kv_channels = 128 - config.rotary_interleaved = False - elif config.language_model_type == "llama3_8b": + config.ffn_hidden_size = 14336 + elif config.language_model_type == "mistral_7b": config.activation_func = torch.nn.functional.silu config.add_bias_linear = False config.bias_activation_fusion = False @@ -47,7 +33,7 @@ def get_language_model_config(config): config.apply_rope_fusion = False config.attention_softmax_in_fp32 = True config.ffn_hidden_size = 14336 - elif config.language_model_type == "mistral_7b": + elif config.language_model_type == "yi-34b": config.activation_func = torch.nn.functional.silu config.add_bias_linear = False config.bias_activation_fusion = False @@ -59,10 +45,11 @@ def get_language_model_config(config): config.bias_dropout_fusion = False config.apply_rope_fusion = False config.attention_softmax_in_fp32 = True - config.ffn_hidden_size = 14336 - elif config.language_model_type == "yi-34b": + config.ffn_hidden_size = 20480 + elif config.language_model_type == "qwen2.5_7B": config.activation_func = 
torch.nn.functional.silu config.add_bias_linear = False + config.add_qkv_bias = True config.bias_activation_fusion = False config.gated_linear_unit = True config.apply_query_key_layer_scaling = False @@ -72,7 +59,7 @@ def get_language_model_config(config): config.bias_dropout_fusion = False config.apply_rope_fusion = False config.attention_softmax_in_fp32 = True - config.ffn_hidden_size = 20480 + config.ffn_hidden_size = 18944 elif config.language_model_type == "qwen2.0_72B": config.activation_func = torch.nn.functional.silu config.add_bias_linear = False @@ -168,13 +155,7 @@ def get_vision_projection_config(config, hidden_size): config.bias_activation_fusion = False config.add_bias_linear = False config.hidden_size = hidden_size # Used as the vision projection output size, i.e., the input to the language model. - if config.language_model_type == "2b": - config.ffn_hidden_size = 5440 - config.activation_func = torch.nn.functional.gelu - if config.language_model_type == "8b": - config.ffn_hidden_size = 16384 - config.activation_func = squared_relu - elif config.language_model_type == "llama3_8b": + if config.language_model_type == "llama3_8b": config.ffn_hidden_size = 14336 config.activation_func = torch.nn.functional.gelu elif config.language_model_type == "mistral_7b": @@ -185,6 +166,9 @@ def get_vision_projection_config(config, hidden_size): config.ffn_hidden_size = 20480 config.normalization = "LayerNorm" config.activation_func = torch.nn.functional.gelu + elif config.language_model_type == "qwen2.5_7B": + config.ffn_hidden_size = 3584 + config.activation_func = torch.nn.functional.gelu elif config.language_model_type == "qwen2.0_72B": config.ffn_hidden_size = 29568 config.normalization = "LayerNorm" diff --git a/examples/multimodal/convert_llava_pretrain_to_wds.py b/examples/multimodal/convert_llava_pretrain_to_wds.py old mode 100755 new mode 100644 diff --git a/examples/multimodal/dataloader_provider.py b/examples/multimodal/dataloader_provider.py old mode 100755 new mode 100644 diff --git a/examples/multimodal/dataset_helpers.py b/examples/multimodal/dataset_helpers.py old mode 100755 new mode 100644 index de76f8e..ecbbc50 --- a/examples/multimodal/dataset_helpers.py +++ b/examples/multimodal/dataset_helpers.py @@ -2,16 +2,19 @@ import bisect import dataclasses import json +import re import sys import traceback from dataclasses import dataclass from typing import Dict, List, Optional, Tuple, Union from image_processing import get_visual_transform +from PIL import Image +from torchvision.transforms import ToPILImage import numpy as np import torch -from megatron.core.models.multimodal.llava_model import IGNORE_INDEX, IMAGE_TOKEN +from megatron.core.models.multimodal.llava_model import IGNORE_INDEX, IMAGE_TOKEN, VIDEO_TOKEN from megatron.core.models.vision.clip_vit_model import get_num_image_embeddings from megatron.energon import ( Batch, @@ -175,6 +178,10 @@ class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatchPacked, self.img_h, self.img_w = self.args.img_h, self.args.img_w + # This map is used to reduce the number of tiles used per image if the number of tokens is + # larger than the decoder_seq_length. 
+ self.num_tiles_degradation_map = {12:8, 8:6, 6:4, 4:2, 2:1, 1:1} + def _get_total_seq_length(self, input_ids, num_tiles): """Calculate expected sequence length given text tokens length and number of tiles.""" total_num_images = len(num_tiles) @@ -237,7 +244,7 @@ class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatchPacked, prompt_idx = np.random.randint(len(prompt_list)) cur_prompt = prompt_list[prompt_idx] - cur_prompt = "\n" + cur_prompt + "\n" + cur_prompt = IMAGE_TOKEN + "\n" + cur_prompt + "\n" caption = sample.caption.strip() @@ -282,7 +289,7 @@ class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatchPacked, # LLAVA training: override text-prompt with just the image. conv = [ # Note: no system message. - {"role": "user", "content": "\n"}, + {"role": "user", "content": IMAGE_TOKEN + "\n"}, {"role": "assistant", "content": sample.answers}, ] @@ -307,66 +314,130 @@ class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatchPacked, """Encode SFT sample.""" augment = sample.__subflavors__['augmentation'] if 'augmentation' in sample.__subflavors__ else False has_video = sample.__subflavors__['has_video'] if 'has_video' in sample.__subflavors__ else False - has_image = sample.__subflavors__['has_image'] if 'has_image' in sample.__subflavors__ else False - has_image = has_image or (hasattr(sample, "images") and len(sample.images) > 0) - if has_video: - # Grab the selected frames of the video as a tensor with shape - # fhwc: (num_frames, height, width, num_channels). - video_fhwc = sample.images[0].permute(0, 2, 3, 1) - selected_frames = torch.linspace( - 0, video_fhwc.shape[0] - 1, self.args.num_frames).long() - video_frame_fhwc = video_fhwc[selected_frames] - imgs = [] - for video_frame_hwc in video_frame_fhwc: - imgs += get_visual_transform( - video_frame_hwc, self.img_h, self.img_w, - self.args.use_tiling, self.args.max_num_tiles, - self.args.use_thumbnail, augment, self.args.vision_model_type) - num_tiles = [len(imgs)] - elif has_image: - imgs = get_visual_transform( - sample.images[0], self.img_h, self.img_w, self.args.use_tiling, self.args.max_num_tiles, self.args.use_thumbnail, augment, - self.args.vision_model_type, - ) - num_tiles = [len(imgs)] - else: - imgs = num_tiles = [] - sample.__key__ = "{}-{}".format("no-image", sample.__key__) + has_image = False + if hasattr(sample, "images"): + # If this is a text-only sample and we are freezing the LM, + # then use a dummy input image. + if len(sample.images) == 0 and self.args.freeze_LM: + empty_img = Image.new('RGB', (self.args.img_w, self.args.img_h), (255, 255, 255)) + sample.images.append(empty_img) + if len(sample.images) > 0 and not has_video: + has_image = True - conversation = [] # Note: Some tokenizers may ignore the system prompt. - conversation.append({"role": "system", "content": "Answer the questions."}) - - has_image_token = False - + conversation = [{"role": "system", "content": "Answer the questions."}] + # Format the conversation as a list of "user" / "assistant" turns. for text in sample.texts: - if IMAGE_TOKEN in text["value"]: - has_image_token = True - - if text["from"] == "human": - role = "user" - elif text["from"] == "gpt": - role = "assistant" - else: - raise RuntimeError(f"unexpected role {text['from']} in {sample.texts}") - - turn = {"role": role, "content": text["value"]} - conversation.append(turn) - - # If the sample contains an image but none of the user messages has an image token, - # then add it to the first user message. 
- if len(imgs) > 0 and not has_image_token: + error_msg = f"unexpected role {text['from']} in {sample.texts}" + assert text["from"] in ["human", "gpt"], error_msg + conversation.append({ + "role": "user" if text["from"] == "human" else "assistant", + "content": text["value"]}) + + # Replace the image tags with IMAGE_TOKEN and count the number of image tags + number_image_tags = 0 + image_tag_ids_list = [] + for turn in conversation: + if turn["role"] == "user": + image_tag_ids = [int(x) - 1 for x in re.findall(r"", turn["content"])] + image_tag_ids_list.extend(image_tag_ids) + turn["content"] = re.sub(r"", IMAGE_TOKEN, turn["content"]) + number_image_tags += turn["content"].count(IMAGE_TOKEN) + # For videos, we replace the image tag with the video tag + if has_video: + turn["content"] = turn["content"].replace(IMAGE_TOKEN, VIDEO_TOKEN) + + # We re-order the images in sample.images according to how they appear in the conversation. + if len(image_tag_ids_list) > 0: + sample.images = [sample.images[idx] for idx in image_tag_ids_list] + + # If there is only one image, but several image tags, we assume all the tags refer to the + # same image and duplicate the image: + if len(sample.images) == 1 and number_image_tags > 1: + sample.images = sample.images * number_image_tags + + number_of_images = len(sample.images) + # Fail if there are more image or video tags than image or videos: + error_msg = ( + f"Found {number_image_tags} image tags for {number_of_images} images. {sample.texts}") + assert number_image_tags <= number_of_images, error_msg + + # If there are less image of video tags than image or videos, prepend the tags to the first + # user message: + if number_image_tags < number_of_images: for turn in conversation: if turn["role"] == "user": - turn["content"] = f"{IMAGE_TOKEN}\n" + turn["content"] + tag_to_add = VIDEO_TOKEN if has_video else IMAGE_TOKEN + turn["content"] = tag_to_add*(number_of_images-number_image_tags) + "\n" + turn["content"] break input_ids, target = self.tokenizer.tokenize_conversation(conversation, True, False) + if has_image: + imgs = [] + num_tiles = [] + max_num_tiles = self.args.max_num_tiles + # We keep a buffer of 4 tokens for the question, + # the rest can be used for image tokens. + max_image_token_allowed = self.args.decoder_seq_length - len(input_ids) - 4 + # We start by extracting as many tiles per image as possible, and decrease the max + # number of tiles if there are too many image tokens. + while True: + imgs = [] + num_tiles = [] + for img in sample.images: + img_tiles = get_visual_transform( + img, self.img_h, self.img_w, self.args.use_tiling, max_num_tiles, + self.args.use_thumbnail, augment, self.args.vision_model_type) + imgs += img_tiles + num_tiles += [len(img_tiles)] + if max_num_tiles == 1: + break + if sum(num_tiles) * self.token_per_img_tile > max_image_token_allowed: + if max_num_tiles in self.num_tiles_degradation_map: + max_num_tiles = self.num_tiles_degradation_map[max_num_tiles] + else: + raise RuntimeError(( + f"Tried to decrease the number of tiles {max_num_tiles} but it's not ", + f"defined in the degradation map {self.num_tiles_degradation_map}")) + else: + break + elif has_video: + # We don't use tiling for videos to limit the number of tokens. + use_tiling=False + # Grab the selected frames of the video as a tensor with shape + # fhwc: (num_frames, num_channels, height, width). 
+ video_fchw = sample.images[0].permute(0, 1, 2, 3) + selected_frames = torch.linspace( + 0, video_fchw.shape[0] - 1, self.args.num_frames).long() + video_fchw = video_fchw[selected_frames] + imgs = [] + for video_chw in video_fchw: + to_pil = ToPILImage() + video_chw = to_pil(video_chw) + imgs += get_visual_transform( + video_chw, self.img_h, self.img_w, use_tiling, self.args.max_num_tiles, + self.args.use_thumbnail, augment, self.args.vision_model_type) + num_tiles = [len(imgs)] + else: + imgs = num_tiles = [] + if self.is_packing_enabled: input_ids, target = self._truncate_for_packing(input_ids, target, num_tiles) + # Some final checks with respect to the number of image tokens and images on the tokenized + # conversation. There can still be errors, for instance if a non-video sample happens to + # have our pre-defined video token, or if the packing truncation removed a necessary image + # tag. + number_image_token = np.sum(input_ids == self.img_token_id) + error_msg = ( + f"Found {number_image_token} image tokens for len({num_tiles}) = {len(num_tiles)} image tiles in {conversation}.") + assert number_image_token == len(num_tiles), error_msg + error_msg = ( + f"Found sum({num_tiles}) = {np.sum(num_tiles)} tiles for {len(imgs)} images in {conversation}.") + assert np.sum(num_tiles) == len(imgs), error_msg + return ImageTaskSample( __key__=sample.__key__, __restore_key__=sample.__restore_key__, @@ -407,8 +478,8 @@ class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatchPacked, if isinstance(sample, MultiChoiceVQASample): cur_prompt = format_multichoice_question(sample.context, sample.choices) - if "" not in cur_prompt: - cur_prompt = "\n" + cur_prompt + if IMAGE_TOKEN not in cur_prompt: + cur_prompt = IMAGE_TOKEN + "\n" + cur_prompt cur_answer = format_multichoice_answer(sample.correct_choice_idx) elif isinstance(sample, VQASample): if 'docvqa' in sample.__key__: @@ -423,8 +494,8 @@ class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatchPacked, cur_prompt = cur_prompt.format(sample.context) - if "" not in cur_prompt: - cur_prompt = "\n" + cur_prompt + if IMAGE_TOKEN not in cur_prompt: + cur_prompt = IMAGE_TOKEN + "\n" + cur_prompt if isinstance(sample.answers, list): answer_list = sample.answers @@ -505,11 +576,11 @@ class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatchPacked, prompt_list = self.manual_prompts["DocPretraining"]["raw"] prompt_idx = np.random.randint(len(prompt_list)) cur_prompt = prompt_list[prompt_idx] - if "" not in cur_prompt: - cur_prompt = "\n" + cur_prompt + if IMAGE_TOKEN not in cur_prompt: + cur_prompt = IMAGE_TOKEN + "\n" + cur_prompt - # Make sure there is no extra tag. - sample.text = sample.text.replace("", "") + # Make sure there is no extra IMAGE_TOKEN tag. 
+ sample.text = sample.text.replace(IMAGE_TOKEN, "") caption = sample.text.strip() @@ -526,8 +597,8 @@ class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatchPacked, ref = sample.text region = sample.words_boxes - # Make sure there is no extra tag - ref = ref.replace("", "") + # Make sure there is no extra IMAGE_TOKEN tag + ref = ref.replace(IMAGE_TOKEN, "") if len(region) == 4: region = f"({region[0]},{region[1]}),({region[2]},{region[3]})" @@ -550,8 +621,8 @@ class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatchPacked, prompt_idx = np.random.randint(len(prompt_list)) cur_prompt = prompt_list[prompt_idx] cur_prompt = cur_prompt.format(prompt_content) - if "" not in cur_prompt: - cur_prompt = "\n" + cur_prompt + if IMAGE_TOKEN not in cur_prompt: + cur_prompt = IMAGE_TOKEN + "\n" + cur_prompt return sample, cur_prompt, answer @@ -559,8 +630,8 @@ class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatchPacked, """Format bbox coordinates as text.""" assert len(bbox) == 4 or len(bbox) == 8 - # Make sure there is no extra tag - text = text.replace("", "") + # Make sure there is no extra IMAGE_TOKEN tag + text = text.replace(IMAGE_TOKEN, "") if len(bbox) == 4: label_str = f"{text}({bbox[0]},{bbox[1]}),({bbox[2]},{bbox[3]})" @@ -582,8 +653,8 @@ class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatchPacked, prompt_idx = np.random.randint(len(prompt_list)) cur_prompt = prompt_list[prompt_idx] - if "" not in cur_prompt: - cur_prompt = "\n" + cur_prompt + if IMAGE_TOKEN not in cur_prompt: + cur_prompt = IMAGE_TOKEN + "\n" + cur_prompt cur_answer = answer return sample, cur_prompt, cur_answer diff --git a/examples/multimodal/evaluate_ai2d.py b/examples/multimodal/evaluation/evaluate_ai2d.py old mode 100755 new mode 100644 similarity index 72% rename from examples/multimodal/evaluate_ai2d.py rename to examples/multimodal/evaluation/evaluate_ai2d.py index 2d5db67..39b866a --- a/examples/multimodal/evaluate_ai2d.py +++ b/examples/multimodal/evaluation/evaluate_ai2d.py @@ -9,19 +9,25 @@ def merge_input_files(input_path): """Merge input files to a format compatible with the evaluator.""" input_file_paths, output_file_path = get_input_output_paths(input_path, task="AI2D") - results = [] + results = dict() for input_file_path in input_file_paths: with open(input_file_path, "r") as input_file: for line in input_file: res = json.loads(line) - results.append( - { - "question_id": res["sample_id"], - "answer": res["answer"], - "gt_answer": res["gt_answer"], - } - ) + sample_id = res["sample_id"] + + # Ignore possible duplicates. 
+ if sample_id in results: + continue + + results[sample_id] = { + "question_id": sample_id, + "answer": res["answer"], + "gt_answer": res["gt_answer"], + } + + results = list(results.values()) with open(output_file_path, "w") as output_file: json.dump(results, output_file) diff --git a/examples/multimodal/evaluate_chartqa.py b/examples/multimodal/evaluation/evaluate_chartqa.py old mode 100755 new mode 100644 similarity index 77% rename from examples/multimodal/evaluate_chartqa.py rename to examples/multimodal/evaluation/evaluate_chartqa.py index e923806..53d4944 --- a/examples/multimodal/evaluate_chartqa.py +++ b/examples/multimodal/evaluation/evaluate_chartqa.py @@ -9,15 +9,22 @@ def merge_input_files(input_path): """Merge input files to a format compatible with the evaluator.""" input_file_paths, output_file_path = get_input_output_paths(input_path, task="ChartQA") - results = [] + results = dict() for input_file_path in input_file_paths: with open(input_file_path, "r") as input_file: for line in input_file: res = json.loads(line) - res["question_id"] = res["sample_id"] + sample_id = res["sample_id"] - results.append(res) + # Ignore possible duplicates. + if sample_id in results: + continue + + res["question_id"] = sample_id + results[sample_id] = res + + results = list(results.values()) with open(output_file_path, "w") as output_file: json.dump(results, output_file) diff --git a/examples/multimodal/evaluate_coco.py b/examples/multimodal/evaluation/evaluate_coco.py old mode 100755 new mode 100644 similarity index 77% rename from examples/multimodal/evaluate_coco.py rename to examples/multimodal/evaluation/evaluate_coco.py index a717090..8eeb367 --- a/examples/multimodal/evaluate_coco.py +++ b/examples/multimodal/evaluation/evaluate_coco.py @@ -11,20 +11,28 @@ def convert_to_coco_format(input_path): """Convert input files to COCO compatible format.""" input_file_paths, output_file_path = get_input_output_paths(input_path, task="captioning") - captions = [] + results = dict() for input_file_path in input_file_paths: with open(input_file_path, "r") as input_file: for line in input_file: res = json.loads(line) + sample_id = res["sample_id"] - question_id = res['sample_id'] - caption = res['caption'].rstrip('.').lower() + # Ignore possible duplicates. 
+ if sample_id in results: + continue - captions.append({"image_id": question_id, "caption": caption}) + caption = res["caption"].rstrip(".").lower() + results[sample_id] = { + "image_id": sample_id, + "caption": caption, + } + + results = list(results.values()) with open(output_file_path, "w") as output_file: - json.dump(captions, output_file, indent=4) + json.dump(results, output_file, indent=4) return output_file_path diff --git a/examples/multimodal/evaluate_mathvista.py b/examples/multimodal/evaluation/evaluate_mathvista.py old mode 100755 new mode 100644 similarity index 92% rename from examples/multimodal/evaluate_mathvista.py rename to examples/multimodal/evaluation/evaluate_mathvista.py index 3474c5f..a55f312 --- a/examples/multimodal/evaluate_mathvista.py +++ b/examples/multimodal/evaluation/evaluate_mathvista.py @@ -11,13 +11,21 @@ def merge_input_files(input_path): """Merge input files to a format compatible with the evaluator.""" input_file_paths, output_file_path = get_input_output_paths(input_path, task="MathVista") - results = [] + results = dict() for input_file_path in input_file_paths: with open(input_file_path, "r") as input_file: for line in input_file: res = json.loads(line) - results.append(res) + sample_id = res["sample_id"] + + # Remove possible duplicates. + if sample_id in results: + continue + + results[sample_id] = res + + results = list(results.values()) with open(output_file_path, "w") as output_file: json.dump(results, output_file) diff --git a/examples/multimodal/evaluate_mmmu.py b/examples/multimodal/evaluation/evaluate_mmmu.py old mode 100755 new mode 100644 similarity index 91% rename from examples/multimodal/evaluate_mmmu.py rename to examples/multimodal/evaluation/evaluate_mmmu.py index 66118fa..798c42b --- a/examples/multimodal/evaluate_mmmu.py +++ b/examples/multimodal/evaluation/evaluate_mmmu.py @@ -2,9 +2,15 @@ import argparse import glob import json import os +import sys import re import subprocess +# Get the absolute path of the parent directory +parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) +# Add the parent directory to sys.path +sys.path.insert(0, parent_dir) + from run_text_generation import get_output_path from config import EvaluationConfig @@ -48,6 +54,10 @@ def convert_to_mmmu_format(input_path): ) # MMMU eval script expects just a sample_id to prediction mapping. + # Skip possible duplicates. + if sample_id in output: + continue + output[sample_id] = prediction with open(output_file_path, "w") as output_file: diff --git a/examples/multimodal/evaluate_ocrbench.py b/examples/multimodal/evaluation/evaluate_ocrbench.py old mode 100755 new mode 100644 similarity index 95% rename from examples/multimodal/evaluate_ocrbench.py rename to examples/multimodal/evaluation/evaluate_ocrbench.py index bc2b901..b37473a --- a/examples/multimodal/evaluate_ocrbench.py +++ b/examples/multimodal/evaluation/evaluate_ocrbench.py @@ -8,13 +8,21 @@ def merge_input_files(input_path): """Merge input files to a format compatible with the evaluator.""" input_file_paths, output_file_path = get_input_output_paths(input_path, task="OCRBench") - results = [] + results = dict() for input_file_path in input_file_paths: with open(input_file_path, "r") as input_file: for line in input_file: res = json.loads(line) - results.append(res) + sample_id = res["sample_id"] + + # Remove possible duplicates. 
+ if sample_id in results: + continue + + results[sample_id] = res + + results = list(results.values()) with open(output_file_path, "w") as output_file: json.dump(results, output_file) diff --git a/examples/multimodal/evaluate_textvqa.py b/examples/multimodal/evaluation/evaluate_textvqa.py old mode 100755 new mode 100644 similarity index 72% rename from examples/multimodal/evaluate_textvqa.py rename to examples/multimodal/evaluation/evaluate_textvqa.py index c9bba71..af782bd --- a/examples/multimodal/evaluate_textvqa.py +++ b/examples/multimodal/evaluation/evaluate_textvqa.py @@ -9,22 +9,25 @@ def merge_input_files(input_path): """Merge input files to a format compatible with the evaluator.""" input_file_paths, output_file_path = get_input_output_paths(input_path, task="TextVQA") - results = [] + results = dict() for input_file_path in input_file_paths: with open(input_file_path, "r") as input_file: for line in input_file: res = json.loads(line) - results.append( - { - "question_id": res["sample_id"], - "answer": res["answer"], - "gt_answer": res["gt_answer"], - } - ) - - # Make order deterministic. - # results = sorted(results, key=lambda d: d["question_id"]) + sample_id = res["sample_id"] + + # Remove possible duplicates. + if sample_id in results: + continue + + results[sample_id] = { + "question_id": sample_id, + "answer": res["answer"], + "gt_answer": res["gt_answer"], + } + + results = list(results.values()) with open(output_file_path, "w") as output_file: json.dump(results, output_file) diff --git a/examples/multimodal/evaluate_vqav2.py b/examples/multimodal/evaluation/evaluate_vqav2.py old mode 100755 new mode 100644 similarity index 88% rename from examples/multimodal/evaluate_vqav2.py rename to examples/multimodal/evaluation/evaluate_vqav2.py index 0b1b920..7807d80 --- a/examples/multimodal/evaluate_vqav2.py +++ b/examples/multimodal/evaluation/evaluate_vqav2.py @@ -9,15 +9,22 @@ def merge_input_files(input_path): """Merge input files to a format compatible with the evaluator.""" input_file_paths, output_file_path = get_input_output_paths(input_path, task="VQAv2") - results = [] + results = dict() for input_file_path in input_file_paths: with open(input_file_path, "r") as input_file: for line in input_file: res = json.loads(line) - res["question_id"] = res["sample_id"] + sample_id = res["sample_id"] - results.append(res) + # Skip possible duplicates. + if sample_id in results: + continue + + res["question_id"] = sample_id + results[sample_id] = res + + results = list(results.values()) with open(output_file_path, "w") as output_file: json.dump(results, output_file) @@ -57,6 +64,9 @@ def compute_vqa_accuracy(result_file, task): assert len(gt) == 1, "expected exactly one groundtruth answer." 
gt = gt[0] + pred = pred.rstrip("%") + gt = gt.rstrip("%") + if is_number(pred) and is_number(gt): pred = float(pred) gt = float(gt) diff --git a/examples/multimodal/evaluation_datasets.py b/examples/multimodal/evaluation/evaluation_datasets.py old mode 100755 new mode 100644 similarity index 88% rename from examples/multimodal/evaluation_datasets.py rename to examples/multimodal/evaluation/evaluation_datasets.py index 97f9ba9..50a50d5 --- a/examples/multimodal/evaluation_datasets.py +++ b/examples/multimodal/evaluation/evaluation_datasets.py @@ -188,7 +188,7 @@ class MMMUDataset(torch.utils.data.Dataset): use_tiling, max_num_tiles, use_thumbnail, - single_image, + prompt_style, vision_model_type, ): import datasets @@ -246,7 +246,7 @@ class MMMUDataset(torch.utils.data.Dataset): self._use_tiling = use_tiling self._max_num_tiles = max_num_tiles self._use_thumbnail = use_thumbnail - self._single_image = single_image + self._prompt_style = prompt_style self._vision_model_type = vision_model_type def __len__(self): @@ -258,7 +258,7 @@ class MMMUDataset(torch.utils.data.Dataset): sample = self._dataset[idx] # Use the single image approach from the MMMU repo. - if self._single_image: + if self._prompt_style == "single_image": sample = process_single_sample(sample) sample = construct_prompt(sample, self._config) @@ -274,7 +274,69 @@ class MMMUDataset(torch.utils.data.Dataset): vision_model_type=self._vision_model_type, ) sample_num_tiles = [len(sample_imgs)] - else: + + prompt = sample["final_input_prompt"] + for i in range(8): + prompt = prompt.replace(f"", "") + sample["final_input_prompt"] = f"\n{prompt}" + elif self._prompt_style == "vlmevalkit": + sample = construct_prompt(sample, self._config) + + if sample["question_type"] == "multiple-choice": + question = sample["question"] + + options = "" + for k, v in sample["index2ans"].items(): + options += f"{k}. {v}\n" + + final_prompt = f"{question}\n" + if "hint" in sample: + final_prompt += f"Hint: {sample['hint']}\n" + + if "task_instructions" in sample: + final_prompt += f"Task instructions: {sample['task_instructions']}\n" + + final_prompt += options + final_prompt += "Answer with the option's letter from the given choices directly." + + sample["final_input_prompt"] = final_prompt.rstrip() + else: + question = sample["question"] + final_prompt = f"{question}\n" + final_prompt += "Answer the question directly." + sample["final_input_prompt"] = final_prompt.rstrip() + + sample_imgs = [] + sample_num_tiles = [] + + img_indices = sorted(list(set(re.findall(r"" + + img = sample[img_key] + assert img is not None, f"{img_str} is in prompt but not in sample images" + + imgs = get_visual_transform( + img, + self._img_h, + self._img_w, + self._use_tiling, + adjusted_max_num_tiles, + self._use_thumbnail, + augment=False, + vision_model_type=self._vision_model_type, + ) # List of tiles. + + sample_imgs.extend(imgs) + sample_num_tiles.append(len(imgs)) + + sample["final_input_prompt"] = " ".join([f'' for i in range(len(img_indices))]) + "\n" + sample["final_input_prompt"] + elif self._prompt_style == "multi_image": sample = construct_prompt(sample, self._config) sample_imgs = [] @@ -315,6 +377,8 @@ class MMMUDataset(torch.utils.data.Dataset): assert ( f"" not in sample["final_input_prompt"] ), "prompt contains unhandled image tags" + else: + raise ValueError(f"unknown prompt style {self._prompt_style}") # MMMU specific metadata. 
metadata = {"question_type": sample["question_type"]} @@ -323,10 +387,6 @@ class MMMUDataset(torch.utils.data.Dataset): metadata["all_choices"] = sample["all_choices"] prompt = sample['final_input_prompt'] - if self._single_image: - for i in range(8): - prompt = prompt.replace(f"", "") - prompt = f"\n{prompt}" tile_count = torch.tensor(sample_num_tiles, dtype=torch.int) @@ -780,8 +840,10 @@ def get_evaluation_dataset( vision_model_type, ) elif task == 'MMMU': - # Note: single_image=True uses only one image like in the MMMU repo example. - # single_image=False uses all images in the sample. + # Note: + # - prompt_style="single_image" uses only one image like in the MMMU repo example. + # - prompt_style="multi_image" uses multiple input images. + # - prompt_style="vlmevalkit" is similar to https://github.com/open-compass/VLMEvalKit/blob/5d3cebcf18ef4bfbadc3bd3ef80bdc7aad2c6557/vlmeval/vlm/internvl_chat.py#L499 dataset = MMMUDataset( input_image_path, num_samples_per_partition, @@ -792,7 +854,7 @@ def get_evaluation_dataset( use_tiling, max_num_tiles, use_thumbnail, - single_image=True, + prompt_style="single_image", vision_model_type=vision_model_type, ) elif task == "VideoMME": diff --git a/examples/multimodal/image_processing.py b/examples/multimodal/image_processing.py old mode 100755 new mode 100644 diff --git a/examples/multimodal/layer_specs.py b/examples/multimodal/layer_specs.py old mode 100755 new mode 100644 diff --git a/examples/multimodal/manual_prompts.json b/examples/multimodal/manual_prompts.json old mode 100755 new mode 100644 diff --git a/examples/multimodal/model.py b/examples/multimodal/model.py old mode 100755 new mode 100644 index 6db834e..a28a428 --- a/examples/multimodal/model.py +++ b/examples/multimodal/model.py @@ -136,6 +136,20 @@ def model_provider( else: vision_projection_layer_spec = get_mlp_module_spec(use_te=use_te).submodules + # Toggle --recompute* for the vision and language model separately. 
+ if args.recompute_vision: + if vision_config.recompute_method is not None and vision_config.recompute_granularity is not None: + vision_config.recompute_num_layers = vision_config.num_layers + else: + vision_config.recompute_granularity = None + vision_config.recompute_method = None + vision_config.recompute_num_layers = None + + vision_projection_config.recompute_granularity = None + vision_projection_config.recompute_method = None + vision_projection_config.recompute_num_layers = None + + tokenizer = get_tokenizer() image_token_index = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN) diff --git a/examples/multimodal/model_converter/clip_converter.py b/examples/multimodal/model_converter/clip_converter.py old mode 100755 new mode 100644 diff --git a/examples/multimodal/model_converter/internvit_converter.py b/examples/multimodal/model_converter/internvit_converter.py old mode 100755 new mode 100644 diff --git a/examples/multimodal/model_converter/siglip_converter.py b/examples/multimodal/model_converter/siglip_converter.py old mode 100755 new mode 100644 diff --git a/examples/multimodal/model_converter/vision_model_tester.py b/examples/multimodal/model_converter/vision_model_tester.py old mode 100755 new mode 100644 diff --git a/examples/multimodal/multimodal_args.py b/examples/multimodal/multimodal_args.py old mode 100755 new mode 100644 index 4b2be45..eb56118 --- a/examples/multimodal/multimodal_args.py +++ b/examples/multimodal/multimodal_args.py @@ -49,7 +49,7 @@ def add_multimodal_extra_args(parser): group.add_argument( "--tokenizer-prompt-format", type=str, - choices=["mistral", "llama3", "chatml", "nvlm-yi-34b", "qwen2p0"], + choices=["mistral", "llama3", "chatml", "nvlm-yi-34b", "qwen2p0", "qwen2p5"], required=True, help="Prompt format to use with the tokenizer.", ) @@ -71,5 +71,9 @@ def add_multimodal_extra_args(parser): group.add_argument( "--packing-seq-length", type=int, default=0, help="Packing sequence length. Must be > 0 if using packing." ) + group.add_argument( + "--recompute-vision", action="store_true", default=False, help="Enable activation checkpointing in the vision model" + ) + return parser diff --git a/examples/multimodal/nvlm/README.md b/examples/multimodal/nvlm/README.md old mode 100755 new mode 100644 index 7eddbb7..bb576bb --- a/examples/multimodal/nvlm/README.md +++ b/examples/multimodal/nvlm/README.md @@ -5,6 +5,13 @@ Please refer to the [NVLM paper](https://arxiv.org/pdf/2409.11402) for details. *NOTE: VLMs in Megatron are under active development and are expected to change.* +# Checkpoints + +NVLM 1.0 model weights are publicly available in HuggingFace and Megatron format. + +- NVLM-1.0-D 72B [HuggingFace version](https://huggingface.co/nvidia/NVLM-D-72B) +- NVLM-1.0-D 72B [Megatron-Core version](https://huggingface.co/nvidia/NVLM-D-72B-mcore) + # Setup ## Docker image @@ -32,7 +39,7 @@ NVLM 1.0 34B starts from [NousResearch/Nous-Hermes-2-Yi-34B](https://huggingface Please download it and run the following command to convert it to Megatron format. 
``` python tools/checkpoint/convert.py --bf16 --model-type GPT --loader llama_mistral --saver mcore --target-tensor-parallel-size 8 --checkpoint-type hf \ - --load-dir --save-dir --tokenizer-model \ + --load-dir --save-dir --tokenizer-model \ --saver-transformer-impl transformer_engine --model-size yi-34B --make-vocab-size-divisible-by 1 ``` @@ -42,7 +49,7 @@ NVLM 1.0 72B starts from [Qwen/Qwen2-72B-Instruct](https://huggingface.co/Qwen/Q Please download it and run the following command to convert it to Megatron format. ``` python tools/checkpoint/convert.py --bf16 --model-type GPT --loader llama_mistral --saver mcore --target-tensor-parallel-size 8 --checkpoint-type hf \ - --load-dir --save-dir --tokenizer-model \ + --load-dir --save-dir --tokenizer-model \ --saver-transformer-impl transformer_engine --model-size qwen2.5-72Bf ``` diff --git a/examples/multimodal/nvlm/internvit.py b/examples/multimodal/nvlm/internvit.py old mode 100755 new mode 100644 diff --git a/examples/multimodal/nvlm/nvlm_prompts.json b/examples/multimodal/nvlm/nvlm_prompts.json old mode 100755 new mode 100644 diff --git a/examples/multimodal/nvlm/pp_checkpoint_converter.py b/examples/multimodal/nvlm/pp_checkpoint_converter.py old mode 100755 new mode 100644 diff --git a/examples/multimodal/nvlm/pretrain_blend.yaml b/examples/multimodal/nvlm/pretrain_blend.yaml old mode 100755 new mode 100644 diff --git a/examples/multimodal/nvlm/pretrain_qwen20_72b_internvit_6b.sh b/examples/multimodal/nvlm/pretrain_qwen20_72b_internvit_6b.sh old mode 100755 new mode 100644 index 320c7ad..008a17a --- a/examples/multimodal/nvlm/pretrain_qwen20_72b_internvit_6b.sh +++ b/examples/multimodal/nvlm/pretrain_qwen20_72b_internvit_6b.sh @@ -62,7 +62,7 @@ OPTIONS=" \ --exit-duration-in-mins 230 \ --disable-bias-linear \ --tokenizer-type MultimodalTokenizer \ - --tokenizer-model ${WORKSPACE}/ \ + --tokenizer-model Qwen/Qwen2-72B-Instruct \ --tokenizer-prompt-format qwen2p0 \ --transformer-impl transformer_engine \ --normalization RMSNorm \ diff --git a/examples/multimodal/nvlm/pretrain_yi_34b_internvit_6b.sh b/examples/multimodal/nvlm/pretrain_yi_34b_internvit_6b.sh old mode 100755 new mode 100644 index c36cb05..00f9435 --- a/examples/multimodal/nvlm/pretrain_yi_34b_internvit_6b.sh +++ b/examples/multimodal/nvlm/pretrain_yi_34b_internvit_6b.sh @@ -75,7 +75,7 @@ OPTIONS=" \ --decoder-seq-length ${DECODER_SEQ_LEN} \ --max-position-embeddings ${MAX_POS_EMBED} \ --tokenizer-type MultimodalTokenizer \ - --tokenizer-model ${WORKSPACE}/ \ + --tokenizer-model NousResearch/Nous-Hermes-2-Yi-34B \ --tokenizer-prompt-format nvlm-yi-34b \ --vocab-size 64000 \ --make-vocab-size-divisible-by 1 \ diff --git a/examples/multimodal/nvlm/run_text_generation_qwen20_72b_internvit_6b.sh b/examples/multimodal/nvlm/run_text_generation_qwen20_72b_internvit_6b.sh old mode 100755 new mode 100644 index 35cd904..e3b001c --- a/examples/multimodal/nvlm/run_text_generation_qwen20_72b_internvit_6b.sh +++ b/examples/multimodal/nvlm/run_text_generation_qwen20_72b_internvit_6b.sh @@ -97,7 +97,7 @@ do --decoder-seq-length ${DECODER_SEQ_LEN} \ --max-position-embeddings ${MAX_POS_EMBED} \ --tokenizer-type MultimodalTokenizer \ - --tokenizer-model \ + --tokenizer-model Qwen/Qwen2-72B-Instruct \ --tokenizer-prompt-format qwen2p0 \ --position-embedding-type rope \ --rotary-percent 1.0 \ diff --git a/examples/multimodal/nvlm/run_text_generation_qwen25_7b_siglip.sh b/examples/multimodal/nvlm/run_text_generation_qwen25_7b_siglip.sh new file mode 100644 index 0000000..3b62219 --- /dev/null 
+++ b/examples/multimodal/nvlm/run_text_generation_qwen25_7b_siglip.sh @@ -0,0 +1,111 @@ +#!/bin/bash + +export NCCL_IB_SL=1 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export NVTE_APPLY_QK_LAYER_SCALING=0 +export TOKENIZERS_PARALLELISM="false" + +INPUT_IMAGE_PATH="placeholder" +GROUNDTRUTH_PATH="placeholder" + +while [[ $# -gt 0 ]]; do + case $1 in + -i|--input-image-path) + INPUT_IMAGE_PATH="$2" + shift + shift + ;; + -o|--output-path) + OUTPUT_PATH="$2" + shift + shift + ;; + -m|--model-path) + MODEL_PATH="$2" + shift + shift + ;; + -t|--task) + TASK="$2" + shift + shift + ;; + -g|--gt-path) + GROUNDTRUTH_PATH="$2" + shift + shift + ;; + -*|--*) + echo "Invalid option $1" + exit 1 + ;; + esac +done + +# Please modify these as needed. +NUM_PARTITIONS=0 +START=0 +END=0 + + +SEQ_LEN=256 +DECODER_SEQ_LEN=8192 +EXTRA_ARGS=" --pixel-shuffle --use-tiling --max-num-tiles 12 --use-thumbnail" + +for PARTITION_ID in $( eval echo {$START..$END} ) +do + torchrun --nproc_per_node 8 examples/multimodal/run_text_generation.py \ + --attention-softmax-in-fp32 \ + --transformer-impl transformer_engine \ + --use-te \ + --use-checkpoint-args \ + --normalization RMSNorm \ + --norm-epsilon 1e-06 \ + --language-model-type=qwen2.5_7B \ + --untie-embeddings-and-output-weights \ + --disable-bias-linear \ + --position-embedding-type rope \ + --rotary-percent 1.0 \ + --rotary-base 1000000 \ + --swiglu \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --tensor-model-parallel-size 4 \ + --pipeline-model-parallel-size 1 \ + --group-query-attention \ + --num-query-groups 4 \ + --num-layers 28 \ + --hidden-size 3584 \ + --ffn-hidden-size 18944 \ + --add-qkv-bias \ + --num-attention-heads 28 \ + --max-position-embeddings 32768 \ + --no-masked-softmax-fusion \ + --load ${MODEL_PATH} \ + --tokenizer-type MultimodalTokenizer \ + --tokenizer-model Qwen/Qwen2.5-7B-Instruct \ + --tokenizer-prompt-format qwen2p5 \ + --bf16 \ + --micro-batch-size 1 \ + --seq-length ${SEQ_LEN} \ + --decoder-seq-length ${DECODER_SEQ_LEN} \ + --out-seq-length 128 \ + --temperature 1.0 \ + --img-h 448 \ + --img-w 448 \ + --patch-dim 14 \ + --seed 153 \ + --top_k 1 \ + --no-load-rng \ + --no-load-optim \ + --input-image-path ${INPUT_IMAGE_PATH} \ + --num-partitions ${NUM_PARTITIONS} \ + --partition-id ${PARTITION_ID} \ + --output-path ${OUTPUT_PATH} \ + --gt-path ${GROUNDTRUTH_PATH} \ + --task ${TASK} \ + ${EXTRA_ARGS} \ + --special-tokens "" "" "" \ + --vision-model-type siglip \ + --ckpt-format torch +done diff --git a/examples/multimodal/nvlm/run_text_generation_yi_34b_internvit_6b.sh b/examples/multimodal/nvlm/run_text_generation_yi_34b_internvit_6b.sh old mode 100755 new mode 100644 index 0437e4c..341f4e4 --- a/examples/multimodal/nvlm/run_text_generation_yi_34b_internvit_6b.sh +++ b/examples/multimodal/nvlm/run_text_generation_yi_34b_internvit_6b.sh @@ -95,7 +95,7 @@ do --decoder-seq-length ${DECODER_SEQ_LEN} \ --max-position-embeddings ${MAX_POS_EMBED} \ --tokenizer-type MultimodalTokenizer \ - --tokenizer-model \ + --tokenizer-model NousResearch/Nous-Hermes-2-Yi-34B \ --tokenizer-prompt-format nvlm-yi-34b \ --vocab-size 64000 \ --make-vocab-size-divisible-by 1 \ @@ -135,6 +135,6 @@ do --gt-path ${GROUNDTRUTH_PATH} \ ${EXTRA_ARGS} \ --task ${TASK} \ - --image-tag-type nlvm \ + --image-tag-type nvlm \ --ckpt-format torch done diff --git a/examples/multimodal/nvlm/sft_34b_internvit.sh b/examples/multimodal/nvlm/sft_34b_internvit.sh old mode 100755 new mode 100644 index 3d585d8..0dff946 --- a/examples/multimodal/nvlm/sft_34b_internvit.sh +++ 
b/examples/multimodal/nvlm/sft_34b_internvit.sh @@ -80,7 +80,7 @@ OPTIONS=" \ --decoder-seq-length ${DECODER_SEQ_LEN} \ --max-position-embeddings ${MAX_POS_EMBED} \ --tokenizer-type MultimodalTokenizer \ - --tokenizer-model ${WORKSPACE}/ \ + --tokenizer-model NousResearch/Nous-Hermes-2-Yi-34B \ --tokenizer-prompt-format nvlm-yi-34b \ --vocab-size 64000 \ --make-vocab-size-divisible-by 1 \ diff --git a/examples/multimodal/nvlm/sft_blend.yaml b/examples/multimodal/nvlm/sft_blend.yaml old mode 100755 new mode 100644 diff --git a/examples/multimodal/nvlm/sft_qwen20_72b_internvit_6b.sh b/examples/multimodal/nvlm/sft_qwen20_72b_internvit_6b.sh old mode 100755 new mode 100644 index adb1d1b..3b47225 --- a/examples/multimodal/nvlm/sft_qwen20_72b_internvit_6b.sh +++ b/examples/multimodal/nvlm/sft_qwen20_72b_internvit_6b.sh @@ -67,7 +67,7 @@ OPTIONS=" \ --exit-duration-in-mins 230 \ --disable-bias-linear \ --tokenizer-type MultimodalTokenizer \ - --tokenizer-model ${WORKSPACE}/ \ + --tokenizer-model Qwen/Qwen2-72B-Instruct \ --tokenizer-prompt-format qwen2p0 \ --transformer-impl transformer_engine \ --normalization RMSNorm \ diff --git a/examples/multimodal/pretrain_dataset.yaml b/examples/multimodal/pretrain_dataset.yaml old mode 100755 new mode 100644 diff --git a/examples/multimodal/pretrain_mistral_clip.sh b/examples/multimodal/pretrain_mistral_clip.sh old mode 100755 new mode 100644 index ea1f741..90b0053 --- a/examples/multimodal/pretrain_mistral_clip.sh +++ b/examples/multimodal/pretrain_mistral_clip.sh @@ -24,11 +24,6 @@ if [[ -z $LOAD_NAME ]]; then exit 1 fi -if [[ -z $TOKENIZER_MODEL ]]; then - echo "Please set TOKENIZER_MODEL for tokenizer model name." - exit 1 -fi - CHECKPOINT_DIR="${WORKSPACE}/${LOAD_NAME}/checkpoints" DATA_TRAIN="${SOURCE}/examples/multimodal/pretrain_dataset.yaml" @@ -93,7 +88,7 @@ OPTIONS=" \ --eval-iters 10 \ --eval-interval 1000 \ --tokenizer-type MultimodalTokenizer \ - --tokenizer-model ${WORKSPACE}/${TOKENIZER_MODEL} \ + --tokenizer-model mistralai/Mistral-7B-Instruct-v0.3 \ --tokenizer-prompt-format mistral \ --data-path ${DATA_TRAIN} \ --prompt-path ${SOURCE}/examples/multimodal/manual_prompts.json \ diff --git a/examples/multimodal/run_text_generation.py b/examples/multimodal/run_text_generation.py old mode 100755 new mode 100644 index f4bb502..cbde668 --- a/examples/multimodal/run_text_generation.py +++ b/examples/multimodal/run_text_generation.py @@ -14,11 +14,13 @@ sys.path.append( import torch import yaml from config import EvaluationConfig -from evaluation_datasets import get_evaluation_dataset +from evaluation.evaluation_datasets import get_evaluation_dataset from model import model_provider from multimodal_args import add_multimodal_extra_args from megatron.core import parallel_state +from megatron.core.enums import ModelType +from megatron.core.models.multimodal.llava_model import IMAGE_TOKEN from megatron.core.models.vision.clip_vit_model import get_num_image_embeddings from megatron.inference.text_generation.api import generate_and_post_process from megatron.inference.text_generation.forward_step import ForwardStep @@ -36,7 +38,7 @@ def add_text_generation_args(parser): group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.') group.add_argument("--top_k", type=int, default=0, help='Top k sampling.') group.add_argument( - "--out-seq-length", type=int, default=1024, help='Length of the output generated text.' + "--out-seq-length", type=int, default=128, help='Length of the output generated text.' 
) group.add_argument("--output-path", type=str, help='Output file path') group.add_argument('--input-image-path', type=str, help="Input image directory") @@ -206,8 +208,8 @@ def generate_samples(model, config: EvaluationConfig, print_output): if config.task == "VideoMME": output["questions"][0][output_name] = generated else: - output[output_name] = generated output["prompt"] = prompt + output[output_name] = generated if config.task == "captioning": output["ground_truth"] = answers @@ -354,7 +356,7 @@ class VLMForwardStep(ForwardStep): ) def __call__(self, tokens, position_ids, attention_mask): - num_image_tokens = (tokens == self.model.image_token_index).sum().item() + num_image_tokens = (tokens == self.model.module.image_token_index).sum().item() num_tokens = tokens.size(1) recv_buffer_seq_length = None if num_image_tokens > 0: @@ -406,7 +408,7 @@ def get_conversation(task, question): {"role": "system", "content": "Answer the questions."}, { "role": "user", - "content": "\nProvide a one-sentence caption for provided image.", + "content": f"{IMAGE_TOKEN}\nProvide a one-sentence caption for provided image.", }, ] elif task in ("TextVQA", "VQAv2", "ChartQA"): @@ -414,13 +416,13 @@ def get_conversation(task, question): {"role": "system", "content": "Answer the questions."}, { "role": "user", - "content": f"\n{question}\nAnswer the question using a single word or phrase.", + "content": f"{IMAGE_TOKEN}\n{question}\nAnswer the question using a single word or phrase.", }, ] elif task in ("OCRBench", "MathVista", "AI2D"): conversation = [ {"role": "system", "content": "Answer the questions."}, - {"role": "user", "content": f"\n{question}"}, + {"role": "user", "content": f"{IMAGE_TOKEN}\n{question}"}, ] elif task == "MMMU": conversation = [ @@ -441,7 +443,7 @@ def get_conversation(task, question): conversation = [ {"role": "system", "content": "Answer the questions."}, - {"role": "user", "content": f"\n{question}"}, + {"role": "user", "content": f"{IMAGE_TOKEN}\n{question}"}, ] return conversation @@ -464,11 +466,13 @@ def get_prompt_and_generated(prompt_and_generation, prompt_format): prompt = splitted[0] generated = splitted[1] generated = generated.split("<|im_end|>")[0] - elif prompt_format in ("nvlm-yi-34b", "qwen2p0"): + elif prompt_format in ("nvlm-yi-34b", "qwen2p0", "qwen2p5"): splitted = prompt_and_generation.split("<|im_start|>assistant\n") prompt = splitted[0] generated = splitted[1] generated = generated.split("<|im_end|>")[0] + else: + raise ValueError(f"Prompt format {prompt_format} is not supported.") # Remove possible garbage. generated = generated.strip() @@ -489,11 +493,11 @@ def main(): args = get_args() - def wrapped_model_provider(pre_process, post_process): - return model_provider(pre_process, post_process, parallel_output=False) + def wrapped_model_provider(pre_process, post_process, add_encoder, add_decoder): + return model_provider(pre_process, post_process, add_encoder, add_decoder, parallel_output=False) # Set up model and load checkpoint. 
- model = get_model(wrapped_model_provider, wrap_with_ddp=False) + model = get_model(wrapped_model_provider, model_type=ModelType.encoder_and_decoder, wrap_with_ddp=False) if args.load is not None: _ = load_checkpoint(model, None, None) diff --git a/examples/multimodal/sft_dataset.yaml b/examples/multimodal/sft_dataset.yaml old mode 100755 new mode 100644 diff --git a/examples/multimodal/sft_mistral_clip.sh b/examples/multimodal/sft_mistral_clip.sh old mode 100755 new mode 100644 index 8a083cc..94ff208 --- a/examples/multimodal/sft_mistral_clip.sh +++ b/examples/multimodal/sft_mistral_clip.sh @@ -29,11 +29,6 @@ if [[ -z $LOAD_ITER ]]; then exit 1 fi -if [[ -z $TOKENIZER_MODEL ]]; then - echo "Please set TOKENIZER_MODEL for tokenizer model name." - exit 1 -fi - CHECKPOINT_DIR="${WORKSPACE}/${LOAD_NAME}/checkpoints" DATA_TRAIN="${SOURCE}/examples/multimodal/sft_dataset.yaml" @@ -98,7 +93,7 @@ OPTIONS=" \ --eval-iters 10 \ --eval-interval 500 \ --tokenizer-type MultimodalTokenizer \ - --tokenizer-model ${WORKSPACE}/${TOKENIZER_MODEL} \ + --tokenizer-model mistralai/Mistral-7B-Instruct-v0.3 \ --tokenizer-prompt-format mistral \ --data-path ${DATA_TRAIN} \ --prompt-path ${SOURCE}/examples/multimodal/manual_prompts.json \ diff --git a/examples/multimodal/text_generation_mistral_clip.sh b/examples/multimodal/text_generation_mistral_clip.sh old mode 100755 new mode 100644 index ca98ff2..c1ef7bc --- a/examples/multimodal/text_generation_mistral_clip.sh +++ b/examples/multimodal/text_generation_mistral_clip.sh @@ -4,12 +4,13 @@ export NCCL_IB_SL=1 export CUDA_DEVICE_MAX_CONNECTIONS=1 export NVTE_APPLY_QK_LAYER_SCALING=0 +INPUT_IMAGE_PATH="placeholder" GROUNDTRUTH_PATH="placeholder" NUM_FRAMES=1 while [[ $# -gt 0 ]]; do case $1 in - --input-image-path) + -i|--input-image-path) INPUT_IMAGE_PATH="$2" shift shift @@ -19,11 +20,6 @@ while [[ $# -gt 0 ]]; do shift shift ;; - -g|--groundtruth-path) - GROUNDTRUTH_PATH="$2" - shift - shift - ;; -o|--output-path) OUTPUT_PATH="$2" shift @@ -34,12 +30,7 @@ while [[ $# -gt 0 ]]; do shift shift ;; - -t|--tokenizer-path) - TOKENIZER_PATH="$2" - shift - shift - ;; - --task) + -t|--task) TASK="$2" shift shift @@ -92,7 +83,7 @@ do --no-masked-softmax-fusion \ --load ${MODEL_PATH} \ --tokenizer-type MultimodalTokenizer \ - --tokenizer-model ${TOKENIZER_PATH} \ + --tokenizer-model mistralai/Mistral-7B-Instruct-v0.3 \ --tokenizer-prompt-format mistral \ --bf16 \ --micro-batch-size 1 \ diff --git a/examples/multimodal/train.py b/examples/multimodal/train.py old mode 100755 new mode 100644 index 5ff2121..1dc68d1 --- a/examples/multimodal/train.py +++ b/examples/multimodal/train.py @@ -48,7 +48,7 @@ def get_batch(data_iterator): pp_size = get_pipeline_model_parallel_world_size() if not is_first_or_last_stage(pp_size, args.encoder_pipeline_model_parallel_size): # Note these are all set to None above. - return tokens, labels, loss_mask, attention_mask, position_ids, imgs, num_tiles + return tokens, labels, loss_mask, attention_mask, position_ids, imgs, num_tiles, packed_seq_params # Broadcast data. torch.cuda.nvtx.range_push("get_data") @@ -66,7 +66,7 @@ def get_batch(data_iterator): cu_lengths = tensor_parallel.broadcast_data(["cu_lengths"], data, torch.int32)["cu_lengths"] max_lengths = tensor_parallel.broadcast_data(["max_lengths"], data, torch.int32)["max_lengths"] - # Dummy image, no image. + # No image input (text-only sample) if the dataloader produced a dummy image. 
if imgs.shape == torch.Size([1, 1]): # FIXME: text-only data can cause a hang if the vision model is own its own pipeline rank and --freeze-ViT is enabled. imgs = torch.tensor([], dtype=torch.float32, device=data_text.device) diff --git a/examples/retro/README.md b/examples/retro/README.md old mode 100755 new mode 100644 diff --git a/examples/retro/preprocess_data.sh b/examples/retro/preprocess_data.sh old mode 100755 new mode 100644 diff --git a/examples/retro/train_retro_2b_distributed.sh b/examples/retro/train_retro_2b_distributed.sh old mode 100755 new mode 100644 diff --git a/examples/run_simple_mcore_train_loop.py b/examples/run_simple_mcore_train_loop.py old mode 100755 new mode 100644 diff --git a/examples/t5/README.md b/examples/t5/README.md old mode 100755 new mode 100644 diff --git a/examples/t5/t5_mcore_train_curve.png b/examples/t5/t5_mcore_train_curve.png old mode 100755 new mode 100644 diff --git a/examples/t5/train_t5_220m_distributed.sh b/examples/t5/train_t5_220m_distributed.sh old mode 100755 new mode 100644 diff --git a/images/model_table.png b/images/model_table.png old mode 100755 new mode 100644 diff --git a/images/strong_scaling.png b/images/strong_scaling.png old mode 100755 new mode 100644 diff --git a/images/weak_scaling.png b/images/weak_scaling.png old mode 100755 new mode 100644 diff --git a/megatron/core/QuickStart.md b/megatron/core/QuickStart.md old mode 100755 new mode 100644 diff --git a/megatron/core/README.md b/megatron/core/README.md old mode 100755 new mode 100644 diff --git a/megatron/core/README_STRAGGLER.md b/megatron/core/README_STRAGGLER.md old mode 100755 new mode 100644 diff --git a/megatron/core/__init__.py b/megatron/core/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/config_logger.py b/megatron/core/config_logger.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/Makefile b/megatron/core/datasets/Makefile old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/__init__.py b/megatron/core/datasets/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/bert_dataset.py b/megatron/core/datasets/bert_dataset.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/blended_dataset.py b/megatron/core/datasets/blended_dataset.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/blended_megatron_dataset_builder.py b/megatron/core/datasets/blended_megatron_dataset_builder.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/blended_megatron_dataset_config.py b/megatron/core/datasets/blended_megatron_dataset_config.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/gpt_dataset.py b/megatron/core/datasets/gpt_dataset.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/helpers.cpp b/megatron/core/datasets/helpers.cpp old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/helpers.py b/megatron/core/datasets/helpers.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/indexed_dataset.py b/megatron/core/datasets/indexed_dataset.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/masked_dataset.py b/megatron/core/datasets/masked_dataset.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/megatron_dataset.py b/megatron/core/datasets/megatron_dataset.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/megatron_tokenizer.py b/megatron/core/datasets/megatron_tokenizer.py old mode 100755 new mode 100644 diff 
--git a/megatron/core/datasets/multimodal_dataset.py b/megatron/core/datasets/multimodal_dataset.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/readme.md b/megatron/core/datasets/readme.md old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/__init__.py b/megatron/core/datasets/retro/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/config/__init__.py b/megatron/core/datasets/retro/config/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/config/bert_embedders.py b/megatron/core/datasets/retro/config/bert_embedders.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/config/config.py b/megatron/core/datasets/retro/config/config.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/config/gpt_chunk_datasets.py b/megatron/core/datasets/retro/config/gpt_chunk_datasets.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/config/tokenizers.py b/megatron/core/datasets/retro/config/tokenizers.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/db/__init__.py b/megatron/core/datasets/retro/db/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/db/build.py b/megatron/core/datasets/retro/db/build.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/db/dataset.py b/megatron/core/datasets/retro/db/dataset.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/db/utils.py b/megatron/core/datasets/retro/db/utils.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/external_libs.py b/megatron/core/datasets/retro/external_libs.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/index/__init__.py b/megatron/core/datasets/retro/index/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/index/build.py b/megatron/core/datasets/retro/index/build.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/index/factory.py b/megatron/core/datasets/retro/index/factory.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/index/index.py b/megatron/core/datasets/retro/index/index.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/index/indexes/__init__.py b/megatron/core/datasets/retro/index/indexes/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/index/indexes/faiss_base.py b/megatron/core/datasets/retro/index/indexes/faiss_base.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/index/indexes/faiss_par_add.py b/megatron/core/datasets/retro/index/indexes/faiss_par_add.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/index/utils.py b/megatron/core/datasets/retro/index/utils.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/index/validate.py b/megatron/core/datasets/retro/index/validate.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/query/__init__.py b/megatron/core/datasets/retro/query/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/query/gpt_chunk_dataset.py b/megatron/core/datasets/retro/query/gpt_chunk_dataset.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py b/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py old mode 100755 
new mode 100644 diff --git a/megatron/core/datasets/retro/query/query.py b/megatron/core/datasets/retro/query/query.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/query/retro_dataset.py b/megatron/core/datasets/retro/query/retro_dataset.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/query/utils.py b/megatron/core/datasets/retro/query/utils.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/utils.py b/megatron/core/datasets/retro/utils.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/t5_dataset.py b/megatron/core/datasets/t5_dataset.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/utils.py b/megatron/core/datasets/utils.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/utils_s3.py b/megatron/core/datasets/utils_s3.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/__init__.py b/megatron/core/dist_checkpointing/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/core.py b/megatron/core/dist_checkpointing/core.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/dict_utils.py b/megatron/core/dist_checkpointing/dict_utils.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/exchange_utils.py b/megatron/core/dist_checkpointing/exchange_utils.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/mapping.py b/megatron/core/dist_checkpointing/mapping.py old mode 100755 new mode 100644 index 2ddfcf3..d376c63 --- a/megatron/core/dist_checkpointing/mapping.py +++ b/megatron/core/dist_checkpointing/mapping.py @@ -119,7 +119,8 @@ class ShardedTensor(ShardedBase): self.init_data(device='meta') if self.data.shape != real_data.shape: raise CheckpointingException( - f'Data shape doesnt match expected {self.data.shape} for {self}' + f'Data shape {real_data.shape} doesnt match' + f' expected {self.data.shape} for {self}' ) finally: self.data = real_data diff --git a/megatron/core/dist_checkpointing/optimizer.py b/megatron/core/dist_checkpointing/optimizer.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/serialization.py b/megatron/core/dist_checkpointing/serialization.py old mode 100755 new mode 100644 index 3be5777..600dd87 --- a/megatron/core/dist_checkpointing/serialization.py +++ b/megatron/core/dist_checkpointing/serialization.py @@ -104,8 +104,6 @@ def load( checkpoint_dir = Path(checkpoint_dir) common_state_dict = common_strategy.load_common(checkpoint_dir) - if not sharded_state_dict: - return common_state_dict sharded_state_dict, nonpersistent_state_dict, sh_ten_factories = load_preprocess( sharded_state_dict diff --git a/megatron/core/dist_checkpointing/state_dict_transformation.py b/megatron/core/dist_checkpointing/state_dict_transformation.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/strategies/__init__.py b/megatron/core/dist_checkpointing/strategies/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/strategies/async_utils.py b/megatron/core/dist_checkpointing/strategies/async_utils.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/strategies/base.py b/megatron/core/dist_checkpointing/strategies/base.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/strategies/common.py b/megatron/core/dist_checkpointing/strategies/common.py old mode 
100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/strategies/filesystem_async.py b/megatron/core/dist_checkpointing/strategies/filesystem_async.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/strategies/fully_parallel.py b/megatron/core/dist_checkpointing/strategies/fully_parallel.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/strategies/resharding.py b/megatron/core/dist_checkpointing/strategies/resharding.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/strategies/state_dict_saver.py b/megatron/core/dist_checkpointing/strategies/state_dict_saver.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/strategies/tensorstore.py b/megatron/core/dist_checkpointing/strategies/tensorstore.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/strategies/torch.py b/megatron/core/dist_checkpointing/strategies/torch.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/strategies/two_stage.py b/megatron/core/dist_checkpointing/strategies/two_stage.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/strategies/zarr.py b/megatron/core/dist_checkpointing/strategies/zarr.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/utils.py b/megatron/core/dist_checkpointing/utils.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/validation.py b/megatron/core/dist_checkpointing/validation.py old mode 100755 new mode 100644 index 48e023d..546ec35 --- a/megatron/core/dist_checkpointing/validation.py +++ b/megatron/core/dist_checkpointing/validation.py @@ -412,7 +412,7 @@ def validate_sharding_integrity( CheckpointingException for invalid access pattern """ - if common_state_dict: + if common_state_dict is not None: _validate_common_state_dict(common_state_dict) if torch.distributed.get_rank() != 0: @@ -461,10 +461,15 @@ def _validate_sharding_for_key(rank_sharding: List[Tuple[int, ShardedTensor]]): lambda x: x[1], _validate_sharding_for_key_flattened, ) - else: - if not torch.all(shard_access_cnt == 1): - logger.error(f'Invalid access pattern for {rank_sharding[0][1]}: {shard_access_cnt}') - raise CheckpointingException(f'Invalid access pattern for {rank_sharding[0][1]}') + # For each shard with at least 1 flattened tensor in it, the above + # `_validate_sharding_for_key_flattened` ensures a correct, consistent pattern. + # The only thing that can go wrong at this point is that some shards don't have + # *any* representatives, which is checked below by comparing `shard_access_cnt == 1` + shard_access_cnt = torch.minimum(shard_access_cnt, torch.tensor([1])) + if not torch.all(shard_access_cnt == 1): + raise CheckpointingException( + f'Invalid access pattern for {rank_sharding[0][1]}: {shard_access_cnt}' + ) def _compute_shards_access(rank_sharding): @@ -489,16 +494,10 @@ def _validate_sharding_for_key_flattened(tensors_by_shard): all_slices.append((sharding.flattened_range.start, sharding.flattened_range.stop)) starts, stops = map(np.asarray, zip(*sorted(all_slices))) - if ( - starts[0] != 0 - or stops[-1] != np.product(local_shape) - or not np.all(starts[1:] == stops[:-1]) - ): - logger.error( - f'Flattened ranges dont cover the whole shard {tensors_by_shard[0]}.
Ranges: {(starts, stops)}' - ) + expected_size = np.product(local_shape) + if starts[0] != 0 or stops[-1] != expected_size or not np.all(starts[1:] == stops[:-1]): raise CheckpointingException( - f'Flattened ranges dont cover the whole shard {tensors_by_shard[0]}. Ranges: {(starts, stops)}' + f'Flattened ranges dont cover the whole shard {tensors_by_shard[0]} of size {expected_size}. Ranges: {(starts, stops)}' ) diff --git a/megatron/core/distributed/README.md b/megatron/core/distributed/README.md old mode 100755 new mode 100644 diff --git a/megatron/core/distributed/__init__.py b/megatron/core/distributed/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/distributed/data_parallel_base.py b/megatron/core/distributed/data_parallel_base.py old mode 100755 new mode 100644 diff --git a/megatron/core/distributed/distributed_data_parallel.py b/megatron/core/distributed/distributed_data_parallel.py old mode 100755 new mode 100644 index 3a23426..ea08db6 --- a/megatron/core/distributed/distributed_data_parallel.py +++ b/megatron/core/distributed/distributed_data_parallel.py @@ -7,6 +7,7 @@ import torch from .. import parallel_state from ..config_logger import has_config_logger_enabled, log_config_to_disk +from ..transformer.cuda_graphs import is_graph_capturing from ..transformer.transformer_config import TransformerConfig from ..utils import is_float8tensor, log_single_rank from .data_parallel_base import _BaseDataParallel @@ -151,12 +152,20 @@ class DistributedDataParallel(_BaseDataParallel): with_context_parallel=True ) if self.ddp_config.average_in_collective: - # Collective is averaging gradients in collective with data_parallel_group. - assert ( - gradient_scaling_factor - / parallel_state.get_data_parallel_world_size(with_context_parallel=True) - == target_gradient_scaling_factor - ) + if self.ddp_config.num_distributed_optimizer_instances == 1: + # Collective is averaging gradients in collective with data_parallel_group. + assert ( + gradient_scaling_factor + / torch.distributed.get_world_size(group=data_parallel_group) + == target_gradient_scaling_factor + ) + else: + # For non-expert parameters, gradient_scaling_factor is 1. + # For expert parameters, gradient_scaling_factor is 1/ep_size. + assert (gradient_scaling_factor == 1) or ( + gradient_scaling_factor + == (1.0 / parallel_state.get_expert_model_parallel_world_size()) + ) else: assert gradient_scaling_factor == target_gradient_scaling_factor @@ -297,9 +306,10 @@ class DistributedDataParallel(_BaseDataParallel): self._make_forward_pre_hook() ) - def disable_forward_pre_hook(self): + def disable_forward_pre_hook(self, param_sync: bool = True): """ Disable forward pre-hooks needed for param all-gather overlap with forward compute. + Skip synchronous param all-gather if `param_sync` is False. """ assert self.use_forward_hook # De-register forward pre-hook for all sub-modules. @@ -310,7 +320,8 @@ class DistributedDataParallel(_BaseDataParallel): assert len(self.remove_forward_pre_hook_handles) == 0 # Force synchronize parameters. - self.start_param_sync(force_sync=True) + if param_sync: + self.start_param_sync(force_sync=True) def _make_forward_pre_hook(self): """ @@ -323,6 +334,9 @@ class DistributedDataParallel(_BaseDataParallel): self.use_forward_hook ), "Should use pre-hook only when overlap_param_gather is True" + if is_graph_capturing(): + return + # Make sure all parameters in this module have been all-gathered as necessary. 
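The `is_graph_capturing()` early returns added to the DDP hooks above keep parameter all-gathers and gradient-reduce dispatches out of a CUDA graph capture. A minimal sketch of the same guard pattern on a plain PyTorch module; the module, the capture flag, and `all_gather_param` are stand-ins, and only the guard itself mirrors the hunk:

import torch

_CAPTURING = False  # stand-in for megatron.core.transformer.cuda_graphs.is_graph_capturing()

def is_graph_capturing() -> bool:
    return _CAPTURING

def make_forward_pre_hook(all_gather_param):
    def hook(module, args):
        # During CUDA graph capture, skip the hook entirely: the capture must not
        # record the communication the hook would otherwise launch.
        if is_graph_capturing():
            return
        for param in module.parameters(recurse=False):
            all_gather_param(param)
    return hook

layer = torch.nn.Linear(4, 4)
layer.register_forward_pre_hook(make_forward_pre_hook(lambda p: None))
layer(torch.randn(2, 4))  # hook runs; with _CAPTURING = True it becomes a no-op
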
for param in module.parameters(recurse=False): # Skip parameters without an associated buffer (such parameters have a @@ -353,6 +367,9 @@ class DistributedDataParallel(_BaseDataParallel): """ def hook(*unused): + if is_graph_capturing(): + return + if param in self.param_to_bucket_group: assert param.requires_grad if self.ddp_config.overlap_grad_reduce: diff --git a/megatron/core/distributed/distributed_data_parallel_config.py b/megatron/core/distributed/distributed_data_parallel_config.py old mode 100755 new mode 100644 diff --git a/megatron/core/distributed/finalize_model_grads.py b/megatron/core/distributed/finalize_model_grads.py old mode 100755 new mode 100644 diff --git a/megatron/core/distributed/param_and_grad_buffer.py b/megatron/core/distributed/param_and_grad_buffer.py old mode 100755 new mode 100644 index 00c8fdd..5095a7c --- a/megatron/core/distributed/param_and_grad_buffer.py +++ b/megatron/core/distributed/param_and_grad_buffer.py @@ -270,13 +270,12 @@ class _ParamAndGradBucketGroup: if self.ddp_config.average_in_collective: reduce_op = torch.distributed.ReduceOp.AVG - # Stream synchronization logic of the CUDA streams that is - # implemented below for the gradient reduction within and across - # distributed optimizer instances. + # We use the following stream synchronization for the gradient reduction + # within and across DistOpt instances. - # Compute Stream - -------------Gradient Compute------------------- - # Comm. Stream - ------(wait for nccl)-----(wait for nccl)------- - # NCCL Stream - -------RS------ -------AR------ + # Compute Stream: -------------Gradient compute------------------- + # Comm. Stream: ------(wait for NCCL)-----(wait for NCCL)------- + # NCCL Stream: -------RS------ -------AR------ # Use async communications only when overlap_grad_reduce is True. async_op = ( @@ -287,13 +286,13 @@ class _ParamAndGradBucketGroup: self.ddp_config.num_distributed_optimizer_instances > 1 and self.ddp_config.overlap_grad_reduce ): - # Assign a communication stream if we use partial DP DistOpt and we - # need to overlap communication + # Assign a communication stream if we have multiple DistOpt instances and we + # need to overlap communication. stream_context = torch.cuda.stream(self.communication_stream) # The RS/AR communication stream needs to wait for the default stream # to complete its gradient computation before launching the next - # gradient reduction collective + # gradient reduction collective. self.communication_stream.wait_stream(torch.cuda.default_stream()) else: stream_context = nullcontext() @@ -314,24 +313,21 @@ class _ParamAndGradBucketGroup: local_data_view, bucket.grad_data, op=reduce_op, - group=self.intra_distributed_optimizer_instance_group, + group=communication_group, async_op=async_op, ) else: torch.distributed.all_reduce( - bucket.grad_data, - op=reduce_op, - group=self.data_parallel_group, - async_op=async_op, + bucket.grad_data, op=reduce_op, group=communication_group, async_op=async_op ) - # When enabling partial DP domain DistOpt, we need to All-Reduce across all partial domains + # With multiple DistOpt instances, we need to all-reduce across instances. if ( self.ddp_config.use_distributed_optimizer and self.ddp_config.num_distributed_optimizer_instances > 1 ): - # Create a new coalescing facility for the inter partial DP-AllReduce here + # Create a new coalescing manager for the inter-instance all-reduce. 
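The reworked comments above describe the intended overlap: the communication stream first waits for the compute (default) stream, then runs the intra-instance reduce-scatter and, with multiple DistOpt instances, an inter-instance all-reduce; the compute stream later waits on the communication stream before the optimizer runs. A minimal sketch of that wait/launch ordering, with placeholder kernels standing in for the real collectives (no process groups are created here):

import torch

def launch_grad_reduction(grad_buffer: torch.Tensor, comm_stream: torch.cuda.Stream) -> None:
    # The communication stream must see fully-computed gradients before reducing them.
    comm_stream.wait_stream(torch.cuda.default_stream())
    with torch.cuda.stream(comm_stream):
        grad_buffer.mul_(1.0)  # placeholder for the intra-instance reduce-scatter
        grad_buffer.add_(0.0)  # placeholder for the inter-instance all-reduce

def finish_grad_sync(comm_stream: torch.cuda.Stream) -> None:
    # Mirrors the hunk above: the default stream waits for the communication stream
    # so the optimizer reads fully reduced gradients.
    torch.cuda.default_stream().wait_stream(comm_stream)

if torch.cuda.is_available():
    grads = torch.ones(1 << 20, device="cuda")
    stream = torch.cuda.Stream()
    launch_grad_reduction(grads, stream)
    finish_grad_sync(stream)
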
with stream_context, _coalescing_manager( self.inter_distributed_optimizer_instance_group, async_ops=async_op ) as cm: @@ -366,13 +362,13 @@ class _ParamAndGradBucketGroup: communication call to complete. When ddp_config.overlap_grad_reduce is set to False, makes synchronous call. """ - # If overlap_grad_reduce is False, start (and finish) synchronous communication call here. self.param_gather_dispatched = False + # If overlap_grad_reduce is False, start (and finish) synchronous communication call here. if not self.ddp_config.overlap_grad_reduce: self.start_grad_sync() return - # When using partial DP DistOpt, we don't need to sync as we launch comms on a separate - # communication stream + # When using multiple DistOpt instances, we don't need to sync here as we launch + # communications on a separate communication stream. if self.ddp_config.num_distributed_optimizer_instances > 1: torch.cuda.default_stream().wait_stream(self.communication_stream) return diff --git a/megatron/core/distributed/torch_fully_sharded_data_parallel.py b/megatron/core/distributed/torch_fully_sharded_data_parallel.py old mode 100755 new mode 100644 diff --git a/megatron/core/enums.py b/megatron/core/enums.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/__init__.py b/megatron/core/export/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/data_type.py b/megatron/core/export/data_type.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/export_config.py b/megatron/core/export/export_config.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/model_type.py b/megatron/core/export/model_type.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/trtllm/__init__.py b/megatron/core/export/trtllm/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/trtllm/engine_builder/__init__.py b/megatron/core/export/trtllm/engine_builder/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py b/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py b/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py b/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/trtllm/trt_model_config.py b/megatron/core/export/trtllm/trt_model_config.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/trtllm/trt_model_type.py b/megatron/core/export/trtllm/trt_model_type.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/trtllm/trtllm_helper.py b/megatron/core/export/trtllm/trtllm_helper.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/trtllm/trtllm_layers.py b/megatron/core/export/trtllm/trtllm_layers.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py b/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py b/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py old mode 100755 new mode 100644 diff --git 
a/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py b/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py old mode 100755 new mode 100644 diff --git a/megatron/core/extensions/__init__.py b/megatron/core/extensions/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/extensions/transformer_engine.py b/megatron/core/extensions/transformer_engine.py old mode 100755 new mode 100644 index 62336cd..5884109 --- a/megatron/core/extensions/transformer_engine.py +++ b/megatron/core/extensions/transformer_engine.py @@ -13,8 +13,8 @@ from packaging.version import Version as PkgVersion from torch import Tensor from torch.nn.parameter import Parameter -from megatron.core import ModelParallelConfig from megatron.core.dist_checkpointing.utils import replace_prefix_for_sharding +from megatron.core.model_parallel_config import ModelParallelConfig from megatron.core.packed_seq_params import PackedSeqParams from megatron.core.parallel_state import ( get_context_parallel_global_ranks, @@ -654,6 +654,23 @@ class TEDotProductAttention(te.pytorch.DotProductAttention): else: kv_channels = self.config.kv_channels + self.kept_packed_seq_params = set( + field.name for field in dataclasses.fields(PackedSeqParams) + ) + if get_te_version() < PkgVersion("1.3.0"): + # TE 1.3.0 introduces precomputing max_seqlen to remove unnecessary kernels and D2H + # copies (#555) + # These two arguments did not exist prior to 1.3.0 + self.kept_packed_seq_params.discard("max_seqlen_q") + self.kept_packed_seq_params.discard("max_seqlen_kv") + + if get_te_version() < PkgVersion("1.10.0"): + # TE 1.8.0 introduces cu_seqlens_padded which is the cu_seqlens with paddings counted + # in each individual sequence in THD format dataset + # These two arguments did not exist prior to 1.8.0. Full support added in 1.10.0 (#1012) + self.kept_packed_seq_params.discard("cu_seqlens_q_padded") + self.kept_packed_seq_params.discard("cu_seqlens_kv_padded") + super().__init__( num_attention_heads=self.config.num_attention_heads, kv_channels=kv_channels, @@ -683,7 +700,9 @@ class TEDotProductAttention(te.pytorch.DotProductAttention): ): """Forward.""" packed_seq_kwargs = ( - dataclasses.asdict(packed_seq_params) if packed_seq_params is not None else {} + {key: getattr(packed_seq_params, key) for key in self.kept_packed_seq_params} + if packed_seq_params is not None + else {} ) # overwrite self.qkv_format depending on self.config.apply_rope_fusion, which can be set # after init @@ -692,24 +711,10 @@ class TEDotProductAttention(te.pytorch.DotProductAttention): qkv_format = packed_seq_kwargs.get('qkv_format', self.qkv_format) - if get_te_version() < PkgVersion("1.3.0"): - # TE 1.3.0 introduces precomputing max_seqlen to remove unnecessary kernels and D2H - # copies (#555) - # These two arguments did not exist prior to 1.3.0 - packed_seq_kwargs.pop("max_seqlen_q", None) - packed_seq_kwargs.pop("max_seqlen_kv", None) - - if get_te_version() < PkgVersion("1.10.0"): - # TE 1.8.0 introduces cu_seqlens_padded which is the cu_seqlens with paddings counted - # in each individual sequence in THD format dataset - # These two arguments did not exist prior to 1.8.0.Full support added in 1.10.0 (#1012) - packed_seq_kwargs.pop("cu_seqlens_q_padded", None) - packed_seq_kwargs.pop("cu_seqlens_kv_padded", None) - # WAR for peak memory usage. 
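The `kept_packed_seq_params` refactor above moves the Transformer Engine version checks out of `forward` and into `__init__`: the supported `PackedSeqParams` fields are computed once, and each forward call just builds a kwargs dict from that set instead of popping unsupported keys. A small sketch of the same idea using a simplified stand-in dataclass and a hard-coded version (not TE's real API surface):

import dataclasses
from packaging.version import Version as PkgVersion

@dataclasses.dataclass
class PackedSeqParams:  # simplified stand-in for megatron.core.packed_seq_params.PackedSeqParams
    qkv_format: str = "thd"
    cu_seqlens_q: object = None
    cu_seqlens_kv: object = None
    max_seqlen_q: object = None
    max_seqlen_kv: object = None
    cu_seqlens_q_padded: object = None
    cu_seqlens_kv_padded: object = None

def kept_packed_seq_fields(te_version: PkgVersion) -> set:
    kept = {f.name for f in dataclasses.fields(PackedSeqParams)}
    if te_version < PkgVersion("1.3.0"):
        # max_seqlen_{q,kv} only exist from TE 1.3.0 onwards.
        kept -= {"max_seqlen_q", "max_seqlen_kv"}
    if te_version < PkgVersion("1.10.0"):
        # cu_seqlens_{q,kv}_padded are only fully supported from TE 1.10.0.
        kept -= {"cu_seqlens_q_padded", "cu_seqlens_kv_padded"}
    return kept

kept = kept_packed_seq_fields(PkgVersion("1.2.0"))          # computed once, as in __init__ above
params = PackedSeqParams(max_seqlen_q=512, max_seqlen_kv=512)
packed_seq_kwargs = {k: getattr(params, k) for k in kept}   # per-forward, no pops needed
assert "max_seqlen_q" not in packed_seq_kwargs
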
# See https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/2388 if self.config.apply_rope_fusion and qkv_format == 'bshd': - query, key, value = [x.contiguous().transpose(0, 1) for x in (query, key, value)] + query, key, value = [x.transpose(0, 1).contiguous() for x in (query, key, value)] # In PyTorch, the following two tensors are in fact the same: # Tensor with shape (1, S, H, D) and stride (S*H*D, H*D, D, 1) # Tensor with shape (1, S, H, D) and stride (H*D, H*D, D, 1) @@ -1229,8 +1234,14 @@ try: from transformer_engine.pytorch.attention import FusedRoPEFunc - def fused_apply_rotary_pos_emb(t: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor: + def fused_apply_rotary_pos_emb( + t: torch.Tensor, freqs: torch.Tensor, transpose_output_memory: bool = False + ) -> torch.Tensor: """Apply rotary positional embedding to input tensor T in `sbhd` format.""" + if transpose_output_memory: + warnings.warn( + "transpose_output_memory is not supported by TE's fused RoPE and will be ignored." + ) return FusedRoPEFunc.apply(t, freqs, "sbhd") def fused_apply_rotary_pos_emb_thd( diff --git a/megatron/core/fusions/__init__.py b/megatron/core/fusions/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/fusions/fused_bias_dropout.py b/megatron/core/fusions/fused_bias_dropout.py old mode 100755 new mode 100644 diff --git a/megatron/core/fusions/fused_bias_geglu.py b/megatron/core/fusions/fused_bias_geglu.py old mode 100755 new mode 100644 diff --git a/megatron/core/fusions/fused_bias_gelu.py b/megatron/core/fusions/fused_bias_gelu.py old mode 100755 new mode 100644 diff --git a/megatron/core/fusions/fused_bias_swiglu.py b/megatron/core/fusions/fused_bias_swiglu.py old mode 100755 new mode 100644 diff --git a/megatron/core/fusions/fused_cross_entropy.py b/megatron/core/fusions/fused_cross_entropy.py old mode 100755 new mode 100644 diff --git a/megatron/core/fusions/fused_layer_norm.py b/megatron/core/fusions/fused_layer_norm.py old mode 100755 new mode 100644 diff --git a/megatron/core/fusions/fused_softmax.py b/megatron/core/fusions/fused_softmax.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/__init__.py b/megatron/core/inference/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/ammo_support/__init__.py b/megatron/core/inference/ammo_support/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/ammo_support/gpt/model_specs.py b/megatron/core/inference/ammo_support/gpt/model_specs.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/ammo_support/gpt/state_dict_hooks.py b/megatron/core/inference/ammo_support/gpt/state_dict_hooks.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/common_inference_params.py b/megatron/core/inference/common_inference_params.py old mode 100755 new mode 100644 index 2235308..7955bb6 --- a/megatron/core/inference/common_inference_params.py +++ b/megatron/core/inference/common_inference_params.py @@ -1,29 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
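The `fused_apply_rotary_pos_emb` change just above keeps the `transpose_output_memory` keyword for call compatibility but warns that the fused TE path ignores it. A tiny sketch of that accept-and-warn pattern, with dummy math in place of `FusedRoPEFunc.apply` (only the argument name is taken from the hunk):

import warnings
import torch

def fused_apply_rotary_pos_emb_sketch(
    t: torch.Tensor, freqs: torch.Tensor, transpose_output_memory: bool = False
) -> torch.Tensor:
    """Same signature shape as the wrapper above; the body is a placeholder."""
    if transpose_output_memory:
        # The fused kernel has no equivalent knob, so the flag is accepted but ignored.
        warnings.warn("transpose_output_memory is ignored by the fused RoPE path.")
    return t + freqs.to(t.dtype)  # placeholder for FusedRoPEFunc.apply(t, freqs, "sbhd")

out = fused_apply_rotary_pos_emb_sketch(torch.zeros(2, 4), torch.ones(2, 4), True)
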
-from dataclasses import dataclass - - -@dataclass -class CommonInferenceParams: - """Inference parameters sent along with the prompts - - For an explanation of these parameters refer to this blog https://ivibudh.medium.com/a-guide-to-controlling-llm-model-output-exploring-top-k-top-p-and-temperature-parameters-ed6a31313910 - """ - - temperature: float = 1.0 - top_k: int = 0 - top_p: float = 0.0 - return_log_probs: bool = False - num_tokens_to_generate: int = 30 - - def add_attributes(self, attribute_value_pair: dict): - """Utility to add more attributes to inference params - - Use this method to pass in a custom dictonary to add more inference parameter attributes to the instance you created. Use as follows - c = CommonInferenceParams - c.add_attributes({'min_length':4, 'eod_id':153}) - - Args: - attribute_value_pair (dict): A dictionary containing attributes as the key names and their values as the values. - """ - for key, value in attribute_value_pair.items(): - setattr(self, key, value) +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +from megatron.core.inference.sampling_params import ( # noqa: F401 # pylint: disable=unused-import + SamplingParams as CommonInferenceParams, +) diff --git a/megatron/core/inference/communication_utils.py b/megatron/core/inference/communication_utils.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/engines/__init__.py b/megatron/core/inference/engines/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/engines/abstract_engine.py b/megatron/core/inference/engines/abstract_engine.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/engines/mcore_engine.py b/megatron/core/inference/engines/mcore_engine.py old mode 100755 new mode 100644 index fe81602..28ef46b --- a/megatron/core/inference/engines/mcore_engine.py +++ b/megatron/core/inference/engines/mcore_engine.py @@ -3,12 +3,12 @@ from typing import Dict, List import torch -from megatron.core.inference.common_inference_params import CommonInferenceParams from megatron.core.inference.engines.abstract_engine import AbstractEngine from megatron.core.inference.inference_request import InferenceRequest +from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.scheduler import Scheduler -from megatron.core.inference.text_generation_controllers.simple_text_generation_controller import ( - SimpleTextGenerationController, +from megatron.core.inference.text_generation_controllers.text_generation_controller import ( + TextGenerationController, ) @@ -19,7 +19,7 @@ class MCoreEngine(AbstractEngine): Supports any model that is callable (Accepts the inputs and outputs the tensor) Args: - text_generation_controller (SimpleTextGenerationController): A text generation + text_generation_controller (TextGenerationController): A text generation controller that will be used to define how to preprocess prompts, generate outputs and detokenizer the output tokens. 
max_batch_size : The maxinum number of requests to process at once @@ -29,7 +29,7 @@ class MCoreEngine(AbstractEngine): def __init__( self, - text_generation_controller: SimpleTextGenerationController, + text_generation_controller: TextGenerationController, max_batch_size, random_seed: int = None, ): @@ -42,7 +42,8 @@ class MCoreEngine(AbstractEngine): prompts: List[str], add_BOS: bool = False, encoder_prompts: List[str] = None, - common_inference_params: CommonInferenceParams = None, + common_inference_params: SamplingParams = None, + sampling_params: SamplingParams = None, ) -> dict: """The megatron core inference backend generate function @@ -54,13 +55,19 @@ class MCoreEngine(AbstractEngine): prompts (List[str]): All the prompts as a list of strings add_BOS (bool): Whether to add BOS token to beginning of prompts encoder_prompts (List[dict]): All the encoder prompts as a list of strings - common_inference_params (CommonInferenceParams): The inference parameters + common_inference_params: Deprecated. Only used for backward compatibility with + MCore <= 0.9.0. Use `sampling_params` going forward. + sampling_params (SamplingParams): The request-level sampling parameters Returns: List[InferenceRequest]: The output is list of inference requests containing the generated tokens, texts and log probs if required """ # TODO :M core- get rng state tracker + + if common_inference_params: + sampling_params = common_inference_params + if self.random_seed: torch.random.manual_seed(self.random_seed) @@ -73,7 +80,7 @@ class MCoreEngine(AbstractEngine): prompt=prompt, prompt_tokens=prompt_tokens, encoder_prompt=encoder_prompt, - inference_parameters=common_inference_params, + inference_parameters=sampling_params, ) self.run_engine() diff --git a/megatron/core/inference/inference_request.py b/megatron/core/inference/inference_request.py old mode 100755 new mode 100644 index 4825dfd..ea0d67b --- a/megatron/core/inference/inference_request.py +++ b/megatron/core/inference/inference_request.py @@ -5,7 +5,7 @@ from typing import List import torch -from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.sampling_params import SamplingParams # class syntax @@ -28,7 +28,7 @@ class InferenceRequest: request_id: str prompt: str - inference_parameters: CommonInferenceParams + inference_parameters: SamplingParams prompt_tokens: List[int] arrival_time: float status: Status diff --git a/megatron/core/inference/model_inference_wrappers/__init__.py b/megatron/core/inference/model_inference_wrappers/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py b/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/model_inference_wrappers/gpt/__init__.py b/megatron/core/inference/model_inference_wrappers/gpt/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py b/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py b/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/model_inference_wrappers/t5/__init__.py b/megatron/core/inference/model_inference_wrappers/t5/__init__.py 
old mode 100755 new mode 100644 diff --git a/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py b/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/modelopt_support/__init__.py b/megatron/core/inference/modelopt_support/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/modelopt_support/gpt/__init__.py b/megatron/core/inference/modelopt_support/gpt/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/modelopt_support/gpt/model_specs.py b/megatron/core/inference/modelopt_support/gpt/model_specs.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/modelopt_support/gpt/state_dict_hooks.py b/megatron/core/inference/modelopt_support/gpt/state_dict_hooks.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/sampling_params.py b/megatron/core/inference/sampling_params.py new file mode 100644 index 0000000..8ffcb63 --- /dev/null +++ b/megatron/core/inference/sampling_params.py @@ -0,0 +1,35 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +from dataclasses import dataclass + + +@dataclass +class SamplingParams: + """Inference parameters sent along with the prompts. + This class contains request-level attributes that control the sampling techniques used when + generating text. This is distinct from megatron.core.InferenceParams, which sets model-level + inference attributes such as the maximum sequence length, and contains the KV cache. + + For an explanation of these parameters refer to this blog + https://ivibudh.medium.com/a-guide-to-controlling-llm-model-output-exploring-top-k-top-p-and- + temperature-parameters-ed6a31313910 + """ + + temperature: float = 1.0 + top_k: int = 0 + top_p: float = 0.0 + return_log_probs: bool = False + num_tokens_to_generate: int = 30 + + def add_attributes(self, attribute_value_pair: dict): + """Utility to add more attributes to sampling params + + Use this method to pass in a custom dictionary to add more sampling parameter attributes. + c = SamplingParams() + c.add_attributes({'min_length':4, 'eod_id':153}) + + Args: + attribute_value_pair (dict): A dictionary containing attributes as the key names and + their values as the values.
+ """ + for key, value in attribute_value_pair.items(): + setattr(self, key, value) diff --git a/megatron/core/inference/scheduler.py b/megatron/core/inference/scheduler.py old mode 100755 new mode 100644 index 00ab81b..ef17723 --- a/megatron/core/inference/scheduler.py +++ b/megatron/core/inference/scheduler.py @@ -6,8 +6,8 @@ from typing import Dict import torch -from megatron.core.inference.common_inference_params import CommonInferenceParams from megatron.core.inference.inference_request import InferenceRequest, Status +from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.utils import Counter @@ -33,7 +33,7 @@ class Scheduler: prompt: str, prompt_tokens: torch.Tensor, encoder_prompt: str = None, - inference_parameters: CommonInferenceParams = None, + inference_parameters: SamplingParams = None, arrival_time: float = None, ): """Add an incoming request @@ -45,7 +45,7 @@ class Scheduler: prompt (str): Input prompt string prompt_tokens (torch.Tensor): A torch tensor having the input prompts tokenized encoder_prompt (str): Encoder input string - inference_parameters (CommonInferenceParams): The inference parameters + inference_parameters (SamplingParams): The inference parameters arrival_time (float, optional): The incoming request time. Defaults to None. """ request_id = str(next(self.request_counter)) diff --git a/megatron/core/inference/text_generation_controllers/__init__.py b/megatron/core/inference/text_generation_controllers/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py b/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py old mode 100755 new mode 100644 index 61beff0..0c2a41b --- a/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +++ b/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py @@ -4,15 +4,15 @@ from typing import OrderedDict import torch from megatron.core.inference.inference_request import InferenceRequest -from megatron.core.inference.text_generation_controllers.simple_text_generation_controller import ( - SimpleTextGenerationController, +from megatron.core.inference.text_generation_controllers.text_generation_controller import ( + TextGenerationController, ) -class EncoderDecoderTextGenerationController(SimpleTextGenerationController): +class EncoderDecoderTextGenerationController(TextGenerationController): """The text generation controller for encoder-decoder architecture - This class ingherits from SimpleTextGenerationController, adding features + This class inherits from TextGenerationController, adding features relating to encoder input encoder_prompt """ diff --git a/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py b/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py old mode 100755 new mode 100644 index 1103089..f97df13 --- a/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +++ b/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py @@ -1,400 +1,5 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -from typing import List, OrderedDict, Tuple +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
-import torch -import torch.nn.functional as F - -from megatron.core import parallel_state -from megatron.core.inference.common_inference_params import CommonInferenceParams -from megatron.core.inference.communication_utils import broadcast_from_last_pipeline_stage -from megatron.core.inference.inference_request import InferenceRequest, Status -from megatron.core.inference.model_inference_wrappers.abstract_model_inference_wrapper import ( - AbstractModelInferenceWrapper, +from megatron.core.inference.text_generation_controllers.text_generation_controller import ( # noqa: F401 # pylint: disable=unused-import + TextGenerationController as SimpleTextGenerationController, ) - - -class SimpleTextGenerationController: - """The basic text generation controller - - This class is responsible for tokenizing the input , running the inference, sampling - and also detokenizing the output - - Args: - inference_wrapped_model (AbstractModelInferenceWrapper): A model that - is wrapped using the specs given in the abstract_model_inference_wrapper.py - tokenizer (_type_): Tokenizer used for tokenizing and detokenizing the prompts - """ - - def __init__(self, inference_wrapped_model: AbstractModelInferenceWrapper, tokenizer): - self.inference_wrapped_model = inference_wrapped_model - self.tokenizer = tokenizer - - # For models without pipeline parallelism, is_first_stage and is_last_stage returns True - self.model_is_pipeline_parallel = not ( - parallel_state.is_pipeline_first_stage() and parallel_state.is_pipeline_last_stage() - ) - - def tokenize_prompt( - self, prompt: str, add_BOS: bool = False - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Utility to tokenize the input prompts - - Args: - prompt (str): The input prompt - - Returns: - torch.Tensor: Returns the tokenized prompt - """ - prompt_tokens = self.tokenizer.tokenize(prompt) - - if add_BOS: - prompt_tokens = [self.tokenizer.bos] + prompt_tokens - - return prompt_tokens - - def detokenize_generations(self, prompt_tokens_with_generated_tokens: torch.Tensor) -> str: - """Detokenize the output generations - - Args: - prompt_tokens_with_generated_tokens (torch.Tensor): The input prompt - tokens plus the generated tokens - - Returns: - str: The detokenized output - """ - tokens = prompt_tokens_with_generated_tokens.cpu().numpy().tolist() - return self.tokenizer.detokenize(tokens) - - def sample_from_logits( - self, - last_token_logits: torch.Tensor, - common_inference_params: CommonInferenceParams, - vocab_size: int = None, - ) -> torch.Tensor: - """Samples the logits to generate outputs - - Given the logits of the last token, this function samples it - according to the parameters defined in common_inference_params - and returns the samples - - Args: - last_token_logits (torch.Tensor): The last token logits. A tensor of - size [batch_size, vocab_size] - common_inference_params (CommonInferenceParams): The paramters to use - for inference - vocab_size (int): Obtained from the tokenizer. 
Defaults to None - - Returns: - torch.Tensor: 1D tensor of the sampled logits with [batch_size] elements - """ - - top_p = common_inference_params.top_p - top_k = common_inference_params.top_k - temperature = common_inference_params.temperature - - assert not (top_k > 0 and top_p > 0), 'Cannot have top-p and top-k both greater than zero' - assert top_p <= 1.0, 'top-p should be in (0,1]' - - def modify_logits_for_top_k_filtering(logits, top_k): - """Set the logits for none top-k values to -inf.""" - filter_ = logits < torch.topk(logits, top_k)[0][..., -1, None] - logits.masked_fill_(filter_, float('-Inf')) - - def modify_logits_for_top_p_filtering(logits, top_p): - """Set the logits for none top-p values to -inf.""" - # First sort and calculate cumulative sum of probabilities. - sorted_logits, sorted_indices = torch.sort(logits, descending=True) - cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1) - - # Filteration based on the cumulative sum. - filter_ = cumulative_probs > top_p - # This shift by 1 is weird and I cannot justify it. This existed - # in the original implementation: - # https://github.com/ari-holtzman/degen/blob/master/gen.py - # and I guess it is needed so keeping it for now. - filter_[:, 1:] = filter_[:, :-1].clone() - # Make sure we at least have one token to select from. - filter_[..., 0] = 0 - - # Fill in the filtered part - filter_ = filter_.scatter(1, sorted_indices, filter_) - logits.masked_fill_(filter_, float('-Inf')) - - # Greedy sampling - if top_k == 1: - sampled_logits = torch.argmax(last_token_logits, dim=-1) - else: - last_token_logits = last_token_logits.clone() - if temperature != 1.0: - last_token_logits.div_(temperature) - - if top_k > 1: - assert top_k <= last_token_logits.size(1), 'top-k is larger than logit size.' - if vocab_size: - assert top_k < vocab_size, 'top-k is larger than vocab size.' - modify_logits_for_top_k_filtering(last_token_logits, top_k) - - elif top_p > 0.0: - modify_logits_for_top_p_filtering(last_token_logits, top_p) - - # After filtering, we need to recalculate the distribution. - probabilities = last_token_logits.softmax(dim=-1) - sampled_logits = torch.multinomial(probabilities, num_samples=1).view(-1) - - # If vocab size is provided, make sure the samples are in in the range [0, vocab-size). - if vocab_size: - sampled_logits = torch.clamp(sampled_logits, min=0, max=(vocab_size - 1)) - return sampled_logits - - def update_generation_status( - self, - updated_prompts_tokens: torch.Tensor, - generation_started: torch.Tensor, - current_context_end_position: int, - is_generation_done_tensor: torch.Tensor, - generated_sequence_lengths: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Checks which prompts have reached an end condition - - We check which prompts have reached an end condition and set the corresponding - flags of the is_generation_done_tensor to True. The generated sequence lengths - increase as we keep generating, until that prompts hits an end condition. The - generation_started tensor determines which prompts have started generating. - - Args: - updated_prompts_tokens (torch.Tensor): The prompts tokens updated with the latest - generated tokens. A tensor of shape [batch_size, max_seq_len] - (i.e max_seq_len = max_prompt_len + tokens_to_generate) - generation_started (torch.Tensor): A boolean tensor of shape [batch_size]. True - indicates the prompt at that index has started generating tokens. 
- current_context_end_position (int): An integer indicating which position to - extract from the prompts tokens to get the latest generated tokens. - is_generation_done_tensor (torch.Tensor): A boolean tensor of shape [batch_size]. - True indicates the prompt at that index has reached end condition. - generated_sequence_lengths (torch.Tensor): A int tensor of shape [batch_size]. - Each value represents the generated sequence lengths for that prompt. - - Returns: - Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Returns the boolean - is_generation_done_tensor and the generated_sequence_lengths after updating it - """ - latest_samples = updated_prompts_tokens[:, current_context_end_position] - # Make sure we are checking eod criterion only for prompts that have started generating - # (i.e) We only look at the generated tokenns and not the input tokens. - reached_eod = (latest_samples == self.tokenizer.eod) & generation_started - is_generation_done_tensor = is_generation_done_tensor | reached_eod - # We increment generated sequence lengths when that prompt has not hit the - # EOD and generation has started - generated_sequence_lengths += ~is_generation_done_tensor & generation_started - - return is_generation_done_tensor, generated_sequence_lengths - - def pad_input_prompt_tokens( - self, - batch_prompt_tokens_list: List[List[int]], - max_prompt_length_in_batch: int, - num_tokens_to_generate: int, - ) -> torch.Tensor: - """Method to pad input prompts - - Given a list of prompts, pad them all to uniform length - - Args: - batch_prompt_tokens_list (List[List[int]]): A list containing the prompt tokens - max_prompt_length_in_batch (int): Maximum of the length of the input prompt tokens - num_tokens_togenerate (int): The number of tokens to generate for each prompt - - Returns: - torch.Tensor: A torch tensor of shape [bs, max_seq_len] (i.e) - max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate, - with extra indices for each tensor padded with mask id. - """ - max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate - - for prompt_tokens in batch_prompt_tokens_list: - padding_size = max_seq_len - len(prompt_tokens) - prompt_tokens.extend([self.tokenizer.eod] * padding_size) - - return torch.tensor(batch_prompt_tokens_list).cuda() - - def generate_output_tokens_dynamic_batch( - self, active_requests: OrderedDict[int, InferenceRequest] - ) -> OrderedDict[int, InferenceRequest]: - """Utility to generate the output tokens and probabilities for the prompts - - This utility generates the output tokens for a dynamic batch. It will run one forward step - at a time, and pass control back to the engine, which will update the request pool and call - this method again. - - Args: - active_requests (OrderedDict[int, InferenceRequest]): The input active requests. - - Returns: - OrderedDict[int, InferenceRequest]: The result for each of the incoming requests - after running one forward step. - """ - raise Exception("Not implemented yet") - - def generate_all_output_tokens_static_batch( - self, active_requests: OrderedDict[int, InferenceRequest] - ) -> OrderedDict[int, InferenceRequest]: - """Utility to generate the all the output tokens and probabilities for the prompts . - - This utility generates the output tokens for a static batch. 
It runs the forward steps till - all prompts complete generation, updates the status of these requests to completed, adds - the generated result and returns these requests - - Args: - active_requests (OrderedDict[int, InferenceRequest]): The input active requests. - - Returns: - OrderedDict[int, InferenceRequest]: The result for each of the incoming requests - """ - batch_prompt_tokens_list = list( - map(lambda request: request.prompt_tokens, active_requests.values()) - ) - prompt_lengths_in_batch = torch.tensor( - [len(prompt_tokens) for prompt_tokens in batch_prompt_tokens_list] - ).cuda() - max_prompt_length_in_batch = max(prompt_lengths_in_batch) - min_prompt_length_in_batch = min(prompt_lengths_in_batch) - - # For batch inference the inference params are the same for all request - common_inference_params: CommonInferenceParams = list(active_requests.values())[ - 0 - ].inference_parameters - - # max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate - batch_prompt_tokens = self.pad_input_prompt_tokens( - batch_prompt_tokens_list, - max_prompt_length_in_batch=max_prompt_length_in_batch, - num_tokens_to_generate=common_inference_params.num_tokens_to_generate, - ) - batch_size, max_sequence_length = batch_prompt_tokens.shape - - # Pre allocate log probs tensor - output_log_probs = None - if common_inference_params.return_log_probs: - output_log_probs = torch.empty( - (batch_size, max_sequence_length - 1), dtype=torch.float32 - ).cuda() - - # An array to check which of the prompts have reached end of generation condition - is_generation_done_tensor = torch.zeros(batch_size, dtype=torch.bool).cuda() - - # An array to act as a counter to keep track of generated sequence lengths - generated_sequence_lengths = torch.zeros(batch_size).cuda() - - with torch.no_grad(): - - self.prep_model_for_inference( - prompts_tokens=batch_prompt_tokens, active_requests=active_requests - ) - - context_start_position = 0 - # Pick the context window that we need to pass through the network. - for context_end_position in range(min_prompt_length_in_batch, max_sequence_length): - - inference_input = self.inference_wrapped_model.get_batch_for_context_window( - context_start_position, context_end_position - ) - - # Returns the final logits of shape [batch_size, context_length, vocab_size] - # Note: This is returned in all TP ranks or last PP stage in PP models - logits = self.inference_wrapped_model.run_one_forward_step(inference_input) - if self.model_is_pipeline_parallel: - context_length = context_end_position - context_start_position - logits = broadcast_from_last_pipeline_stage( - [batch_size, context_length, self.tokenizer.vocab_size], - dtype=self.inference_wrapped_model.inference_wrapper_config.params_dtype, - tensor=logits, - ) - - # Indicates which of the input prompts have started generating tokens. 
- # A 1D boolean tensor with [batch_size] elements (i.e) The shortest - # prompts will start generating first and so on - generation_started = prompt_lengths_in_batch <= context_end_position - last_token_logits = logits[:, -1, :] - sampled_logits = self.sample_from_logits( - last_token_logits, common_inference_params, self.tokenizer.vocab_size - ) - - # Substitute the sampled logits only for only the prompts that - # have started generating tokens - batch_prompt_tokens[generation_started, context_end_position] = sampled_logits[ - generation_started - ] - - if common_inference_params.return_log_probs: - log_probs = F.log_softmax(logits, dim=2) - indices = torch.unsqueeze( - batch_prompt_tokens[ - :, (context_start_position + 1) : (context_end_position + 1) - ], - 2, - ) - # Get the log probabilities for only the prompt tokens - output_log_probs[:, context_start_position:context_end_position] = torch.gather( - log_probs, 2, indices - ).squeeze(2) - - context_start_position = context_end_position - - # Check end of generation status for each tensor - # and update generated sequence lengths - (is_generation_done_tensor, generated_sequence_lengths) = ( - self.update_generation_status( - updated_prompts_tokens=batch_prompt_tokens, - generation_started=generation_started, - current_context_end_position=context_end_position, - is_generation_done_tensor=is_generation_done_tensor, - generated_sequence_lengths=generated_sequence_lengths, - ) - ) - # Boolean flag indicating if all prompts are finished - all_prompts_done = torch.all(is_generation_done_tensor) - if all_prompts_done: - break - - # Include all the generated tokens - batch_prompt_tokens_with_generations = batch_prompt_tokens[:, : (context_end_position + 1)] - if common_inference_params.return_log_probs: - output_log_probs = output_log_probs[:, :context_end_position] - - generated_sequence_lengths[ - generated_sequence_lengths > common_inference_params.num_tokens_to_generate - ] = common_inference_params.num_tokens_to_generate - - for idx, request in enumerate(active_requests.values()): - input_prompt_length = int(prompt_lengths_in_batch[idx]) - # Shorter prompts might have generated more than required tokens. 
So we trim them down - required_sequence_length = int( - min(generated_sequence_lengths[idx], common_inference_params.num_tokens_to_generate) - ) - # Extract only the generated tokens - required_result_tokens = batch_prompt_tokens_with_generations[ - idx, input_prompt_length : (input_prompt_length + required_sequence_length) - ] - - request.generated_length = required_sequence_length - request.generated_tokens = required_result_tokens - request.generated_log_probs = ( - None - if output_log_probs is None - else output_log_probs[idx, input_prompt_length:required_sequence_length] - ) - request.status = Status.COMPLETED - request.generated_text = self.detokenize_generations(required_result_tokens) - - return active_requests - - def prep_model_for_inference( - self, prompts_tokens: torch.Tensor, active_requests: OrderedDict[int, InferenceRequest] - ): - """Preparing batch for inference, using respective wrapper's prep_model_for_inference method - - Args: - prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_sequence_length] - active_requests (OrderedDict[int, InferenceRequest]): The input active requests - """ - self.inference_wrapped_model.prep_model_for_inference(prompts_tokens=prompts_tokens) diff --git a/megatron/core/inference/text_generation_controllers/text_generation_controller.py b/megatron/core/inference/text_generation_controllers/text_generation_controller.py new file mode 100644 index 0000000..f15c819 --- /dev/null +++ b/megatron/core/inference/text_generation_controllers/text_generation_controller.py @@ -0,0 +1,400 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +from typing import List, OrderedDict, Tuple + +import torch +import torch.nn.functional as F + +from megatron.core import parallel_state +from megatron.core.inference.communication_utils import broadcast_from_last_pipeline_stage +from megatron.core.inference.inference_request import InferenceRequest, Status +from megatron.core.inference.model_inference_wrappers.abstract_model_inference_wrapper import ( + AbstractModelInferenceWrapper, +) +from megatron.core.inference.sampling_params import SamplingParams + + +class TextGenerationController: + """The text generation controller (the main sampling loop) + + This class tokenizes the input, runs inference, samples from logits, and detokenizes the output. 
+ + Args: + inference_wrapped_model (AbstractModelInferenceWrapper): A model that + is wrapped using the specs given in the abstract_model_inference_wrapper.py + tokenizer (_type_): Tokenizer used for tokenizing and detokenizing the prompts + """ + + def __init__(self, inference_wrapped_model: AbstractModelInferenceWrapper, tokenizer): + self.inference_wrapped_model = inference_wrapped_model + self.tokenizer = tokenizer + + # For models without pipeline parallelism, is_first_stage and is_last_stage returns True + self.model_is_pipeline_parallel = not ( + parallel_state.is_pipeline_first_stage() and parallel_state.is_pipeline_last_stage() + ) + + def tokenize_prompt( + self, prompt: str, add_BOS: bool = False + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Utility to tokenize the input prompts + + Args: + prompt (str): The input prompt + + Returns: + torch.Tensor: Returns the tokenized prompt + """ + prompt_tokens = self.tokenizer.tokenize(prompt) + + if add_BOS: + prompt_tokens = [self.tokenizer.bos] + prompt_tokens + + return prompt_tokens + + def detokenize_generations(self, prompt_tokens_with_generated_tokens: torch.Tensor) -> str: + """Detokenize the output generations + + Args: + prompt_tokens_with_generated_tokens (torch.Tensor): The input prompt + tokens plus the generated tokens + + Returns: + str: The detokenized output + """ + tokens = prompt_tokens_with_generated_tokens.cpu().numpy().tolist() + return self.tokenizer.detokenize(tokens) + + def sample_from_logits( + self, + last_token_logits: torch.Tensor, + sampling_params: SamplingParams = None, + vocab_size: int = None, + **kwargs + ) -> torch.Tensor: + """Samples the logits to generate outputs + + Given the logits of the last token, this function samples it + according to the parameters defined in sampling_params + and returns the samples + + Args: + last_token_logits (torch.Tensor): The last token logits. A tensor of + size [batch_size, vocab_size] + sampling_params (SamplingParams): The parameters to use for inference. + vocab_size (int): Obtained from the tokenizer. Defaults to None + + Returns: + torch.Tensor: 1D tensor of the sampled logits with [batch_size] elements + """ + + if kwargs.get('common_inference_params'): + sampling_params = kwargs['common_inference_params'] + + top_p = sampling_params.top_p + top_k = sampling_params.top_k + temperature = sampling_params.temperature + + assert not (top_k > 0 and top_p > 0), 'Cannot have top-p and top-k both greater than zero' + assert top_p <= 1.0, 'top-p should be in (0,1]' + + def modify_logits_for_top_k_filtering(logits, top_k): + """Set the logits for none top-k values to -inf.""" + filter_ = logits < torch.topk(logits, top_k)[0][..., -1, None] + logits.masked_fill_(filter_, float('-Inf')) + + def modify_logits_for_top_p_filtering(logits, top_p): + """Set the logits for none top-p values to -inf.""" + # First sort and calculate cumulative sum of probabilities. + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1) + + # Filteration based on the cumulative sum. + filter_ = cumulative_probs > top_p + # This shift by 1 is weird and I cannot justify it. This existed + # in the original implementation: + # https://github.com/ari-holtzman/degen/blob/master/gen.py + # and I guess it is needed so keeping it for now. + filter_[:, 1:] = filter_[:, :-1].clone() + # Make sure we at least have one token to select from. 
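# Self-contained sketch of the same top-k / top-p filtering on a toy batch, to make
# the masking behaviour concrete. It mirrors the helpers above but is not part of the
# controller; the example logits are arbitrary.
import torch

def sketch_top_k(logits: torch.Tensor, k: int) -> torch.Tensor:
    out = logits.clone()
    kth = torch.topk(out, k)[0][..., -1, None]       # k-th largest value per row
    out[out < kth] = float('-inf')                   # drop everything below it
    return out

def sketch_top_p(logits: torch.Tensor, p: float) -> torch.Tensor:
    out = logits.clone()
    sorted_logits, sorted_idx = torch.sort(out, descending=True)
    cum_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
    mask = cum_probs > p
    mask[:, 1:] = mask[:, :-1].clone()               # shift so the first token crossing p survives
    mask[..., 0] = False                             # always keep the most likely token
    out.masked_fill_(mask.scatter(1, sorted_idx, mask), float('-inf'))
    return out

logits = torch.tensor([[2.0, 1.0, 0.5, -1.0]])
print(sketch_top_k(logits, k=2))    # only the two largest logits survive
print(sketch_top_p(logits, p=0.9))  # the low-probability tail is masked out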
+ filter_[..., 0] = 0 + + # Fill in the filtered part + filter_ = filter_.scatter(1, sorted_indices, filter_) + logits.masked_fill_(filter_, float('-Inf')) + + # Greedy sampling + if top_k == 1: + sampled_logits = torch.argmax(last_token_logits, dim=-1) + else: + last_token_logits = last_token_logits.clone() + if temperature != 1.0: + last_token_logits.div_(temperature) + + if top_k > 1: + assert top_k <= last_token_logits.size(1), 'top-k is larger than logit size.' + if vocab_size: + assert top_k < vocab_size, 'top-k is larger than vocab size.' + modify_logits_for_top_k_filtering(last_token_logits, top_k) + + elif top_p > 0.0: + modify_logits_for_top_p_filtering(last_token_logits, top_p) + + # After filtering, we need to recalculate the distribution. + probabilities = last_token_logits.softmax(dim=-1) + sampled_logits = torch.multinomial(probabilities, num_samples=1).view(-1) + + # If vocab size is provided, make sure the samples are in in the range [0, vocab-size). + if vocab_size: + sampled_logits = torch.clamp(sampled_logits, min=0, max=(vocab_size - 1)) + return sampled_logits + + def update_generation_status( + self, + updated_prompts_tokens: torch.Tensor, + generation_started: torch.Tensor, + current_context_end_position: int, + is_generation_done_tensor: torch.Tensor, + generated_sequence_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Checks which prompts have reached an end condition + + We check which prompts have reached an end condition and set the corresponding + flags of the is_generation_done_tensor to True. The generated sequence lengths + increase as we keep generating, until that prompts hits an end condition. The + generation_started tensor determines which prompts have started generating. + + Args: + updated_prompts_tokens (torch.Tensor): The prompts tokens updated with the latest + generated tokens. A tensor of shape [batch_size, max_seq_len] + (i.e max_seq_len = max_prompt_len + tokens_to_generate) + generation_started (torch.Tensor): A boolean tensor of shape [batch_size]. True + indicates the prompt at that index has started generating tokens. + current_context_end_position (int): An integer indicating which position to + extract from the prompts tokens to get the latest generated tokens. + is_generation_done_tensor (torch.Tensor): A boolean tensor of shape [batch_size]. + True indicates the prompt at that index has reached end condition. + generated_sequence_lengths (torch.Tensor): A int tensor of shape [batch_size]. + Each value represents the generated sequence lengths for that prompt. + + Returns: + Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Returns the boolean + is_generation_done_tensor and the generated_sequence_lengths after updating it + """ + latest_samples = updated_prompts_tokens[:, current_context_end_position] + # Make sure we are checking eod criterion only for prompts that have started generating + # (i.e) We only look at the generated tokenns and not the input tokens. 
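# Toy illustration of the end-of-generation bookkeeping performed here: a prompt only
# counts as done once it has started generating and then emits the EOD id, and its
# length counter stops advancing at that point. The tensors and eod id are made-up
# example values.
import torch

eod = 0
latest = torch.tensor([0, 5, 0])              # newest token per prompt
started = torch.tensor([True, True, False])   # third prompt is still consuming its input
done = torch.tensor([False, False, False])
lengths = torch.tensor([3, 3, 0])

reached_eod = (latest == eod) & started       # only prompt 0 terminates
done = done | reached_eod
lengths += (~done & started).long()           # prompt 1 keeps counting; 0 and 2 do not
print(done, lengths)                          # tensor([True, False, False]) tensor([3, 4, 0])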
+ reached_eod = (latest_samples == self.tokenizer.eod) & generation_started + is_generation_done_tensor = is_generation_done_tensor | reached_eod + # We increment generated sequence lengths when that prompt has not hit the + # EOD and generation has started + generated_sequence_lengths += ~is_generation_done_tensor & generation_started + + return is_generation_done_tensor, generated_sequence_lengths + + def pad_input_prompt_tokens( + self, + batch_prompt_tokens_list: List[List[int]], + max_prompt_length_in_batch: int, + num_tokens_to_generate: int, + ) -> torch.Tensor: + """Method to pad input prompts + + Given a list of prompts, pad them all to uniform length + + Args: + batch_prompt_tokens_list (List[List[int]]): A list containing the prompt tokens + max_prompt_length_in_batch (int): Maximum of the length of the input prompt tokens + num_tokens_togenerate (int): The number of tokens to generate for each prompt + + Returns: + torch.Tensor: A torch tensor of shape [bs, max_seq_len] (i.e) + max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate, + with extra indices for each tensor padded with mask id. + """ + max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate + + for prompt_tokens in batch_prompt_tokens_list: + padding_size = max_seq_len - len(prompt_tokens) + prompt_tokens.extend([self.tokenizer.eod] * padding_size) + + return torch.tensor(batch_prompt_tokens_list).cuda() + + def generate_output_tokens_dynamic_batch( + self, active_requests: OrderedDict[int, InferenceRequest] + ) -> OrderedDict[int, InferenceRequest]: + """Utility to generate the output tokens and probabilities for the prompts + + This utility generates the output tokens for a dynamic batch. It will run one forward step + at a time, and pass control back to the engine, which will update the request pool and call + this method again. + + Args: + active_requests (OrderedDict[int, InferenceRequest]): The input active requests. + + Returns: + OrderedDict[int, InferenceRequest]: The result for each of the incoming requests + after running one forward step. + """ + raise Exception("Not implemented yet") + + def generate_all_output_tokens_static_batch( + self, active_requests: OrderedDict[int, InferenceRequest] + ) -> OrderedDict[int, InferenceRequest]: + """Utility to generate the all the output tokens and probabilities for the prompts . + + This utility generates the output tokens for a static batch. It runs the forward steps till + all prompts complete generation, updates the status of these requests to completed, adds + the generated result and returns these requests + + Args: + active_requests (OrderedDict[int, InferenceRequest]): The input active requests. 
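# Minimal sketch of the prompt-padding step described above: every prompt is
# right-padded with a pad/eod id up to max_prompt_len + num_tokens_to_generate so the
# batch can be handled as one rectangular tensor. The ids below are arbitrary examples.
import torch

pad_id = 0
prompts = [[7, 8, 9], [5, 6]]
num_tokens_to_generate = 4
max_seq_len = max(len(p) for p in prompts) + num_tokens_to_generate  # 3 + 4 = 7

padded = [p + [pad_id] * (max_seq_len - len(p)) for p in prompts]
batch = torch.tensor(padded)   # shape [2, 7], ready for the generation loop
print(batch)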
+ + Returns: + OrderedDict[int, InferenceRequest]: The result for each of the incoming requests + """ + batch_prompt_tokens_list = list( + map(lambda request: request.prompt_tokens, active_requests.values()) + ) + prompt_lengths_in_batch = torch.tensor( + [len(prompt_tokens) for prompt_tokens in batch_prompt_tokens_list] + ).cuda() + max_prompt_length_in_batch = max(prompt_lengths_in_batch) + min_prompt_length_in_batch = min(prompt_lengths_in_batch) + + # For batch inference the inference params are the same for all request + sampling_params: SamplingParams = list(active_requests.values())[0].inference_parameters + + # max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate + batch_prompt_tokens = self.pad_input_prompt_tokens( + batch_prompt_tokens_list, + max_prompt_length_in_batch=max_prompt_length_in_batch, + num_tokens_to_generate=sampling_params.num_tokens_to_generate, + ) + batch_size, max_sequence_length = batch_prompt_tokens.shape + + # Pre allocate log probs tensor + output_log_probs = None + if sampling_params.return_log_probs: + output_log_probs = torch.empty( + (batch_size, max_sequence_length - 1), dtype=torch.float32 + ).cuda() + + # An array to check which of the prompts have reached end of generation condition + is_generation_done_tensor = torch.zeros(batch_size, dtype=torch.bool).cuda() + + # An array to act as a counter to keep track of generated sequence lengths + generated_sequence_lengths = torch.zeros(batch_size).cuda() + + with torch.no_grad(): + + self.prep_model_for_inference( + prompts_tokens=batch_prompt_tokens, active_requests=active_requests + ) + + context_start_position = 0 + # Pick the context window that we need to pass through the network. + for context_end_position in range(min_prompt_length_in_batch, max_sequence_length): + + inference_input = self.inference_wrapped_model.get_batch_for_context_window( + context_start_position, context_end_position + ) + + # Returns the final logits of shape [batch_size, context_length, vocab_size] + # Note: This is returned in all TP ranks or last PP stage in PP models + logits = self.inference_wrapped_model.run_one_forward_step(inference_input) + if self.model_is_pipeline_parallel: + context_length = context_end_position - context_start_position + logits = broadcast_from_last_pipeline_stage( + [batch_size, context_length, self.tokenizer.vocab_size], + dtype=self.inference_wrapped_model.inference_wrapper_config.params_dtype, + tensor=logits, + ) + + # Indicates which of the input prompts have started generating tokens. 
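# Sketch of the sliding context window driven by this loop: the first step processes
# the shortest prompt in full (prefill), after which each step feeds only the newly
# generated position. `fake_forward` is a stand-in for the wrapped model and greedy
# argmax stands in for sample_from_logits.
import torch

def fake_forward(tokens_window: torch.Tensor, vocab_size: int = 16) -> torch.Tensor:
    # Returns random logits of shape [batch, window_len, vocab] just to drive the loop.
    return torch.randn(tokens_window.size(0), tokens_window.size(1), vocab_size)

batch_tokens = torch.zeros(2, 8, dtype=torch.long)   # [batch, max_seq_len], already padded
min_prompt_len, max_seq_len = 3, 8

start = 0
for end in range(min_prompt_len, max_seq_len):
    logits = fake_forward(batch_tokens[:, start:end])
    next_tokens = logits[:, -1, :].argmax(dim=-1)    # greedy stand-in for sampling
    batch_tokens[:, end] = next_tokens
    start = end                                      # next step only sees the new token
print(batch_tokens)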
+ # A 1D boolean tensor with [batch_size] elements (i.e) The shortest + # prompts will start generating first and so on + generation_started = prompt_lengths_in_batch <= context_end_position + last_token_logits = logits[:, -1, :] + sampled_logits = self.sample_from_logits( + last_token_logits, sampling_params, self.tokenizer.vocab_size + ) + + # Substitute the sampled logits only for only the prompts that + # have started generating tokens + batch_prompt_tokens[generation_started, context_end_position] = sampled_logits[ + generation_started + ] + + if sampling_params.return_log_probs: + log_probs = F.log_softmax(logits, dim=2) + indices = torch.unsqueeze( + batch_prompt_tokens[ + :, (context_start_position + 1) : (context_end_position + 1) + ], + 2, + ) + # Get the log probabilities for only the prompt tokens + output_log_probs[:, context_start_position:context_end_position] = torch.gather( + log_probs, 2, indices + ).squeeze(2) + + context_start_position = context_end_position + + # Check end of generation status for each tensor + # and update generated sequence lengths + (is_generation_done_tensor, generated_sequence_lengths) = ( + self.update_generation_status( + updated_prompts_tokens=batch_prompt_tokens, + generation_started=generation_started, + current_context_end_position=context_end_position, + is_generation_done_tensor=is_generation_done_tensor, + generated_sequence_lengths=generated_sequence_lengths, + ) + ) + # Boolean flag indicating if all prompts are finished + all_prompts_done = torch.all(is_generation_done_tensor) + if all_prompts_done: + break + + # Include all the generated tokens + batch_prompt_tokens_with_generations = batch_prompt_tokens[:, : (context_end_position + 1)] + if sampling_params.return_log_probs: + output_log_probs = output_log_probs[:, :context_end_position] + + generated_sequence_lengths[ + generated_sequence_lengths > sampling_params.num_tokens_to_generate + ] = sampling_params.num_tokens_to_generate + + for idx, request in enumerate(active_requests.values()): + input_prompt_length = int(prompt_lengths_in_batch[idx]) + # Shorter prompts might have generated more than required tokens. 
So we trim them down + required_sequence_length = int( + min(generated_sequence_lengths[idx], sampling_params.num_tokens_to_generate) + ) + # Extract only the generated tokens + required_result_tokens = batch_prompt_tokens_with_generations[ + idx, input_prompt_length : (input_prompt_length + required_sequence_length) + ] + + request.generated_length = required_sequence_length + request.generated_tokens = required_result_tokens + request.generated_log_probs = ( + None + if output_log_probs is None + else output_log_probs[idx, input_prompt_length:required_sequence_length] + ) + request.status = Status.COMPLETED + request.generated_text = self.detokenize_generations(required_result_tokens) + + return active_requests + + def prep_model_for_inference( + self, prompts_tokens: torch.Tensor, active_requests: OrderedDict[int, InferenceRequest] + ): + """Preparing batch for inference, using respective wrapper's prep_model_for_inference method + + Args: + prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_sequence_length] + active_requests (OrderedDict[int, InferenceRequest]): The input active requests + """ + self.inference_wrapped_model.prep_model_for_inference(prompts_tokens=prompts_tokens) diff --git a/megatron/core/inference/utils.py b/megatron/core/inference/utils.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference_params.py b/megatron/core/inference_params.py old mode 100755 new mode 100644 diff --git a/megatron/core/jit.py b/megatron/core/jit.py old mode 100755 new mode 100644 diff --git a/megatron/core/model_parallel_config.py b/megatron/core/model_parallel_config.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/T5/__init__.py b/megatron/core/models/T5/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/T5/t5_model.py b/megatron/core/models/T5/t5_model.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/T5/t5_spec.py b/megatron/core/models/T5/t5_spec.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/__init__.py b/megatron/core/models/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/bert/__init__.py b/megatron/core/models/bert/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/bert/bert_layer_specs.py b/megatron/core/models/bert/bert_layer_specs.py old mode 100755 new mode 100644 index 80893d5..4edc2ed --- a/megatron/core/models/bert/bert_layer_specs.py +++ b/megatron/core/models/bert/bert_layer_specs.py @@ -1,4 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import warnings + from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules @@ -28,38 +30,60 @@ try: HAVE_APEX = True LNImpl = FusedLayerNorm except ImportError: - import warnings from megatron.core.transformer.torch_norm import WrappedTorchNorm - warnings.warn(f'Apex is not installed. Falling back to Torch Norm') + warnings.warn('Apex is not installed. 
Falling back to Torch Norm') LNImpl = WrappedTorchNorm -# Use this spec to use lower level Transformer Engine modules (required for fp8 training) -bert_layer_with_transformer_engine_spec = ModuleSpec( - module=TransformerLayer, - submodules=TransformerLayerSubmodules( - self_attention=ModuleSpec( - module=SelfAttention, - params={"attn_mask_type": AttnMaskType.padding}, - submodules=SelfAttentionSubmodules( - linear_qkv=TELayerNormColumnParallelLinear, - core_attention=TEDotProductAttention, - linear_proj=TERowParallelLinear, - q_layernorm=IdentityOp, - k_layernorm=IdentityOp, + +def get_bert_layer_with_transformer_engine_spec(): + """Use this spec to use lower-level Transformer Engine modules (required for fp8 training). + + Returns: + ModuleSpec: Module specification with TE modules + """ + if not HAVE_TE: + raise ImportError( + "Transformer Engine is not installed. Please use local Bert layer spec instead." + ) + + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.padding}, + submodules=SelfAttentionSubmodules( + linear_qkv=TELayerNormColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + q_layernorm=IdentityOp, + k_layernorm=IdentityOp, + ), ), - ), - self_attn_bda=get_bias_dropout_add, - mlp=ModuleSpec( - module=MLP, - submodules=MLPSubmodules( - linear_fc1=TELayerNormColumnParallelLinear, linear_fc2=TERowParallelLinear + self_attn_bda=get_bias_dropout_add, + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TELayerNormColumnParallelLinear, linear_fc2=TERowParallelLinear + ), ), + mlp_bda=get_bias_dropout_add, ), - mlp_bda=get_bias_dropout_add, - ), -) + ) + + +def __getattr__(name): + if name == 'bert_layer_with_transformer_engine_spec': + warnings.warn( + """Attribute bert_layer_specs.bert_layer_with_transformer_engine_spec is on a + deprecation track and will be removed in future releases. 
Please migrate to + bert_layer_specs.get_bert_layer_with_transformer_engine_spec().""" + ) + + return get_bert_layer_with_transformer_engine_spec() + # Use this spec for an implementation using only modules in megatron core bert_layer_local_spec = ModuleSpec( diff --git a/megatron/core/models/bert/bert_lm_head.py b/megatron/core/models/bert/bert_lm_head.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/bert/bert_model.py b/megatron/core/models/bert/bert_model.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/bert/pooler.py b/megatron/core/models/bert/pooler.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/common/__init__.py b/megatron/core/models/common/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/common/embeddings/__init__.py b/megatron/core/models/common/embeddings/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/common/embeddings/language_model_embedding.py b/megatron/core/models/common/embeddings/language_model_embedding.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/common/embeddings/rope_utils.py b/megatron/core/models/common/embeddings/rope_utils.py old mode 100755 new mode 100644 index f1d7ad4..3dd5193 --- a/megatron/core/models/common/embeddings/rope_utils.py +++ b/megatron/core/models/common/embeddings/rope_utils.py @@ -17,23 +17,24 @@ from megatron.core.utils import is_te_min_version logger = logging.getLogger(__name__) +# Prefer fused RoPE from Apex as we need the `transpose_output_memory` argument for the bshd trick. +# See https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/2469. try: - from megatron.core.extensions.transformer_engine import ( - fused_apply_rotary_pos_emb, - fused_apply_rotary_pos_emb_thd, - ) - - HAVE_APPLY_ROPE_FUSION = True + from apex.transformer.functional import fused_apply_rotary_pos_emb except ImportError: try: - from apex.transformer.functional import ( - fused_apply_rotary_pos_emb, - fused_apply_rotary_pos_emb_thd, - ) + from megatron.core.extensions.transformer_engine import fused_apply_rotary_pos_emb + except: + fused_apply_rotary_pos_emb = None + - HAVE_APPLY_ROPE_FUSION = True +try: + from megatron.core.extensions.transformer_engine import fused_apply_rotary_pos_emb_thd +except ImportError: + try: + from apex.transformer.functional import fused_apply_rotary_pos_emb_thd except ImportError: - HAVE_APPLY_ROPE_FUSION = False + fused_apply_rotary_pos_emb_thd = None try: @@ -188,8 +189,10 @@ def apply_rotary_pos_emb( if config.apply_rope_fusion: if cu_seqlens is None: - return fused_apply_rotary_pos_emb(t, freqs) + assert fused_apply_rotary_pos_emb is not None, "apply_rope_fusion is not available." + return fused_apply_rotary_pos_emb(t, freqs, transpose_output_memory=True) else: + assert fused_apply_rotary_pos_emb_thd is not None, "apply_rope_fusion is not available." 
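# The rope_utils change above replaces a single HAVE_APPLY_ROPE_FUSION flag with
# per-function None sentinels that are asserted at the call site. A generic sketch of
# that optional-dependency pattern, using made-up module names:
try:
    from fast_kernels import fused_op          # hypothetical preferred implementation
except ImportError:
    try:
        from fallback_kernels import fused_op  # hypothetical secondary implementation
    except ImportError:
        fused_op = None                        # feature unavailable; checked at call time

def apply_op(x):
    assert fused_op is not None, "fused_op is not available; install one of the backends."
    return fused_op(x)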
cp_size = parallel_state.get_context_parallel_world_size() if cp_size > 1: if not is_te_min_version("1.11.0", check_equality=False): diff --git a/megatron/core/models/common/embeddings/rotary_pos_embedding.py b/megatron/core/models/common/embeddings/rotary_pos_embedding.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py b/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/common/language_module/__init__.py b/megatron/core/models/common/language_module/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/common/language_module/language_module.py b/megatron/core/models/common/language_module/language_module.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/common/vision_module/__init__.py b/megatron/core/models/common/vision_module/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/common/vision_module/vision_module.py b/megatron/core/models/common/vision_module/vision_module.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/gpt/__init__.py b/megatron/core/models/gpt/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/gpt/gpt_layer_specs.py b/megatron/core/models/gpt/gpt_layer_specs.py old mode 100755 new mode 100644 index 749be32..d0e48c1 --- a/megatron/core/models/gpt/gpt_layer_specs.py +++ b/megatron/core/models/gpt/gpt_layer_specs.py @@ -1,16 +1,16 @@ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +import warnings from typing import Optional from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.models.gpt.moe_module_specs import get_moe_module_spec from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules from megatron.core.transformer.dot_product_attention import DotProductAttention from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.identity_op import IdentityOp from megatron.core.transformer.mlp import MLP, MLPSubmodules -from megatron.core.transformer.moe.moe_layer import MoELayer, MoESubmodules -from megatron.core.transformer.moe.shared_experts import SharedExpertMLP from megatron.core.transformer.multi_latent_attention import ( MLASelfAttention, MLASelfAttentionSubmodules, @@ -26,12 +26,10 @@ from megatron.core.utils import is_te_min_version try: from megatron.core.extensions.transformer_engine import ( - TEColumnParallelGroupedLinear, TEColumnParallelLinear, TEDotProductAttention, TELayerNormColumnParallelLinear, TENorm, - TERowParallelGroupedLinear, TERowParallelLinear, ) @@ -47,8 +45,6 @@ try: HAVE_APEX = True LNImpl = FusedLayerNorm except ImportError: - import warnings - from megatron.core.transformer.torch_norm import WrappedTorchNorm warnings.warn('Apex is not installed. Falling back to Torch Norm') @@ -60,7 +56,8 @@ def get_gpt_layer_with_transformer_engine_spec( moe_grouped_gemm: Optional[bool] = False, qk_layernorm: Optional[bool] = False, multi_latent_attention: Optional[bool] = False, - fp8: Optional[str] = None, + fp8: Optional[str] = None, # pylint: disable=unused-arguments + moe_use_legacy_grouped_gemm: Optional[bool] = False, ) -> ModuleSpec: """Use this spec to use lower-level Transformer Engine modules (required for fp8 training). 
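# The signature change above keeps the old `fp8` argument only so callers can be
# warned before it is removed, while behaviour is now driven by
# moe_use_legacy_grouped_gemm. A generic sketch of that argument-deprecation pattern
# (the function and return value below are illustrative, not Megatron-Core API):
import warnings
from typing import Optional

def build_spec(num_experts: Optional[int] = None, fp8: Optional[str] = None,
               moe_use_legacy_grouped_gemm: bool = False) -> dict:
    if fp8 is not None:
        warnings.warn("The fp8 argument is deprecated and will be removed soon.",
                      DeprecationWarning)
    # fp8 is otherwise ignored; only the remaining arguments affect the result.
    return {"num_experts": num_experts, "legacy_grouped_gemm": moe_use_legacy_grouped_gemm}

build_spec(fp8="e4m3")   # emits a DeprecationWarning but still returns a spec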
@@ -69,13 +66,24 @@ def get_gpt_layer_with_transformer_engine_spec( num_experts (int, optional): Number of experts. Defaults to None. moe_grouped_gemm (bool, optional): To use Grouped GEMM. Defaults to False. qk_layernorm (bool, optional): To use layernorm for queries/keys. Defaults to False. - fp8 (str, optional): Flag to decide the linear layer spec for MoE. Defaults to None. + fp8 (str, optional): Deprecated. For temporary Nemo compatibility. + moe_use_legacy_grouped_gemm (bool, optional): Force use the legacy GroupedMLP. + Defaults to False. Returns: ModuleSpec: Module specification with TE modules """ + if fp8 is not None: + warnings.warn( + 'The fp8 argument in "get_gpt_layer_with_transformer_engine_spec" has been deprecated' + ' and will be removed soon. Please update your code accordingly.' + ) + mlp = _get_mlp_module_spec( - use_te=True, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm, fp8=fp8 + use_te=True, + num_experts=num_experts, + moe_grouped_gemm=moe_grouped_gemm, + moe_use_legacy_grouped_gemm=moe_use_legacy_grouped_gemm, ) if multi_latent_attention: @@ -138,6 +146,8 @@ def get_gpt_layer_local_spec( moe_grouped_gemm: Optional[bool] = False, qk_layernorm: Optional[bool] = False, multi_latent_attention: Optional[bool] = False, + fp8: Optional[str] = None, # pylint: disable=unused-arguments + moe_use_legacy_grouped_gemm: Optional[bool] = False, ) -> ModuleSpec: """Use this spec for an implementation using only modules in Megatron-Core. @@ -146,13 +156,24 @@ def get_gpt_layer_local_spec( num_experts (int, optional): Number of experts. Defaults to None. moe_grouped_gemm (bool, optional): To use Grouped GEMM. Defaults to False. qk_layernorm (bool, optional): To use layernorm for queries/keys. Defaults to False. + fp8 (str, optional): Deprecated. For temporary Nemo compatibility. + moe_use_legacy_grouped_gemm (bool, optional): Force use the legacy GroupedMLP. + Defaults to False. Returns: ModuleSpec: Module specification with Megatron-Core modules """ + if fp8 is not None: + warnings.warn( + 'The fp8 argument in "get_gpt_layer_local_spec" has been deprecated' + ' and will be removed soon. Please update your code accordingly.' + ) mlp = _get_mlp_module_spec( - use_te=False, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm + use_te=False, + num_experts=num_experts, + moe_grouped_gemm=moe_grouped_gemm, + moe_use_legacy_grouped_gemm=moe_use_legacy_grouped_gemm, ) if multi_latent_attention: @@ -213,63 +234,33 @@ def _get_mlp_module_spec( use_te: Optional[bool] = True, num_experts: Optional[int] = None, moe_grouped_gemm: Optional[bool] = False, - fp8: Optional[str] = None, + fp8: Optional[str] = None, # pylint: disable=unused-arguments + moe_use_legacy_grouped_gemm: Optional[bool] = False, ) -> ModuleSpec: - """Helper function to get module spec for MLP""" - if num_experts is not None: - moe_spec = _get_moe_module_spec( - use_te=True, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm, fp8=fp8 + """Helper function to get module spec for MLP/MoE""" + if fp8 is not None: + warnings.warn( + 'The fp8 argument in "_get_mlp_module_spec" has been deprecated' + ' and will be removed soon. Please update your code accordingly.' 
) - return moe_spec - - return ModuleSpec( - module=MLP, - submodules=MLPSubmodules( - linear_fc1=TELayerNormColumnParallelLinear if use_te else ColumnParallelLinear, - linear_fc2=TERowParallelLinear if use_te else RowParallelLinear, - ), - ) - -def _get_moe_module_spec( - use_te: Optional[bool] = True, - num_experts: Optional[int] = None, - moe_grouped_gemm: Optional[bool] = False, - fp8: Optional[str] = None, -) -> ModuleSpec: - """Helper function to get module spec for MoE""" if num_experts is None: - return None - if use_te and moe_grouped_gemm: - linear_fc1 = TEColumnParallelGroupedLinear - linear_fc2 = TERowParallelGroupedLinear - elif use_te and fp8: - linear_fc1 = TEColumnParallelLinear - linear_fc2 = TERowParallelLinear - else: - linear_fc1 = ColumnParallelLinear - linear_fc2 = RowParallelLinear - - use_te_grouped_gemm = use_te and TEColumnParallelGroupedLinear is not None - - return ModuleSpec( - module=MoELayer, - submodules=MoESubmodules( - experts=( - MLPSubmodules(linear_fc1=linear_fc1, linear_fc2=linear_fc2) - if not moe_grouped_gemm or use_te_grouped_gemm - else None - ), - shared_experts=ModuleSpec( - module=SharedExpertMLP, - params={"gate": False}, - submodules=MLPSubmodules( - linear_fc1=TEColumnParallelLinear if use_te else ColumnParallelLinear, - linear_fc2=TERowParallelLinear if use_te else RowParallelLinear, - ), + # Dense MLP w/ or w/o TE modules. + return ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TELayerNormColumnParallelLinear if use_te else ColumnParallelLinear, + linear_fc2=TERowParallelLinear if use_te else RowParallelLinear, ), - ), - ) + ) + else: + # Mixture of experts with modules in megatron core. + return get_moe_module_spec( + use_te=use_te, + num_experts=num_experts, + moe_grouped_gemm=moe_grouped_gemm, + moe_use_legacy_grouped_gemm=moe_use_legacy_grouped_gemm, + ) def get_gpt_decoder_block_spec( @@ -288,7 +279,7 @@ def get_gpt_decoder_block_spec( moe_grouped_gemm=False, qk_layernorm=config.qk_layernorm, multi_latent_attention=config.multi_latent_attention, - fp8=config.fp8, + moe_use_legacy_grouped_gemm=config.moe_use_legacy_grouped_gemm, ) if use_transformer_engine else get_gpt_layer_local_spec( @@ -296,6 +287,7 @@ def get_gpt_decoder_block_spec( moe_grouped_gemm=False, qk_layernorm=config.qk_layernorm, multi_latent_attention=config.multi_latent_attention, + moe_use_legacy_grouped_gemm=config.moe_use_legacy_grouped_gemm, ) ) moe_layer_spec = ( @@ -304,7 +296,7 @@ def get_gpt_decoder_block_spec( moe_grouped_gemm=config.moe_grouped_gemm, qk_layernorm=config.qk_layernorm, multi_latent_attention=config.multi_latent_attention, - fp8=config.fp8, + moe_use_legacy_grouped_gemm=config.moe_use_legacy_grouped_gemm, ) if use_transformer_engine else get_gpt_layer_local_spec( @@ -312,6 +304,7 @@ def get_gpt_decoder_block_spec( moe_grouped_gemm=config.moe_grouped_gemm, qk_layernorm=config.qk_layernorm, multi_latent_attention=config.multi_latent_attention, + moe_use_legacy_grouped_gemm=config.moe_use_legacy_grouped_gemm, ) ) diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/gpt/moe_module_specs.py b/megatron/core/models/gpt/moe_module_specs.py new file mode 100644 index 0000000..513eedd --- /dev/null +++ b/megatron/core/models/gpt/moe_module_specs.py @@ -0,0 +1,81 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+ +import warnings +from typing import Optional + +from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear +from megatron.core.transformer.mlp import MLPSubmodules +from megatron.core.transformer.moe.experts import GroupedMLP, SequentialMLP, TEGroupedMLP +from megatron.core.transformer.moe.moe_layer import MoELayer, MoESubmodules +from megatron.core.transformer.moe.shared_experts import SharedExpertMLP +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.utils import get_te_version, is_te_min_version + +try: + from megatron.core.extensions.transformer_engine import ( + TEColumnParallelGroupedLinear, + TEColumnParallelLinear, + TERowParallelGroupedLinear, + TERowParallelLinear, + ) + + HAVE_TE = True +except ImportError: + HAVE_TE = False + + +def get_moe_module_spec( + use_te: Optional[bool] = True, + num_experts: Optional[int] = None, + moe_grouped_gemm: Optional[bool] = False, + moe_use_legacy_grouped_gemm: Optional[bool] = False, +) -> ModuleSpec: + """Helper function to get module spec for MoE""" + assert num_experts is not None + + mlp = MLPSubmodules( + linear_fc1=TEColumnParallelLinear if use_te else ColumnParallelLinear, + linear_fc2=TERowParallelLinear if use_te else RowParallelLinear, + ) + + # experts spec + if moe_grouped_gemm: + ## use GroupedMLP + if use_te and TEColumnParallelGroupedLinear is not None and not moe_use_legacy_grouped_gemm: + ## use TEGroupedLinear + expert_module = TEGroupedMLP + expert_submodule = MLPSubmodules( + linear_fc1=TEColumnParallelGroupedLinear, linear_fc2=TERowParallelGroupedLinear + ) + else: + ## use legacy GroupedMLP + expert_module = GroupedMLP + expert_submodule = None + warnings.warn( + 'The legacy GroupedMLP will be deprecated in Megatron-Core v0.12.0. ' + 'Please update the TransformerEngine to version>=1.7.0 and use TEGroupedMLP.' + ) + else: + ## use SequentialMLP + expert_module = SequentialMLP + if use_te and not is_te_min_version("1.7.0.dev0"): + warnings.warn( + "Only transformer-engine>=1.7.0 supports MoE experts, " + f"but your version is {get_te_version()}. Use local linear implementation instead." + ) + expert_submodule = MLPSubmodules( + linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear + ) + else: + expert_submodule = mlp + + experts = ModuleSpec(module=expert_module, submodules=expert_submodule) + + # shared experts spec + shared_experts = ModuleSpec(module=SharedExpertMLP, params={"gate": False}, submodules=mlp) + + # MoE module spec + moe_module_spec = ModuleSpec( + module=MoELayer, submodules=MoESubmodules(experts=experts, shared_experts=shared_experts) + ) + return moe_module_spec diff --git a/megatron/core/models/mamba/__init__.py b/megatron/core/models/mamba/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/mamba/mamba_layer_specs.py b/megatron/core/models/mamba/mamba_layer_specs.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/mamba/mamba_model.py b/megatron/core/models/mamba/mamba_model.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/multimodal/__init__.py b/megatron/core/models/multimodal/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/multimodal/llava_model.py b/megatron/core/models/multimodal/llava_model.py old mode 100755 new mode 100644 index dafe377..3de68b5 --- a/megatron/core/models/multimodal/llava_model.py +++ b/megatron/core/models/multimodal/llava_model.py @@ -36,6 +36,7 @@ IGNORE_INDEX = -100 # ID for labels that should be ignored. 
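# Condensed decision table for the expert implementation chosen by get_moe_module_spec
# above. The helper below only mirrors that branching with plain strings for clarity;
# it is an illustrative aside, not part of Megatron-Core.
def pick_expert_impl(moe_grouped_gemm: bool, use_te: bool,
                     te_grouped_linear_available: bool,
                     moe_use_legacy_grouped_gemm: bool) -> str:
    if moe_grouped_gemm:
        if use_te and te_grouped_linear_available and not moe_use_legacy_grouped_gemm:
            return "TEGroupedMLP"   # preferred grouped GEMM path (TE >= 1.7.0)
        return "GroupedMLP"         # legacy grouped GEMM, slated for deprecation
    return "SequentialMLP"          # run one expert MLP at a time

assert pick_expert_impl(True, True, True, False) == "TEGroupedMLP"
assert pick_expert_impl(True, True, True, True) == "GroupedMLP"
assert pick_expert_impl(False, True, True, False) == "SequentialMLP"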
# Image token index can be tokenizer dependent so the default value does not work in all cases. DEFAULT_IMAGE_TOKEN_INDEX = -200 IMAGE_TOKEN = "" +VIDEO_TOKEN = "