"vscode:/vscode.git/clone" did not exist on "fa3c9e0668f16b9c5946dd621202f47324e71786"
Commit d520d24f authored by silencealiang's avatar silencealiang

Merge branch 'main' into 'main'

Upgrade Megatron to v0.10

See merge request !3
parents 3aca1415 481609bb
.. Lumache documentation master file, created by
   sphinx-quickstart on Tue Aug 15 13:44:10 2023.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.
Megatron Core User Guide
===================================
**Megatron Core** is a Python library that provides the core components required to build your language models.
A reference implementation of Megatron Core can be found in `NeMo <https://github.com/NVIDIA/NeMo/tree/main>`_. It offers a *simple* and
*intuitive* API.

.. toctree::
   :maxdepth: 2
   :caption: User Guide

   user-guide/index

.. toctree::
   :maxdepth: 3
   :caption: API Guide

   api-guide/index
...
User Guide
============

.. mdinclude:: ../../../megatron/core/QuickStart.md
\ No newline at end of file
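
Since this index page is the reader's entry point, a minimal orientation sketch of the API it describes may help. This is a sketch, not the QuickStart itself; it assumes megatron.core is installed and a single GPU is visible, and mirrors the initialization pattern from the included QuickStart.md:

```python
# Minimal sketch, assuming megatron.core is installed and one GPU is visible.
import os
import torch
from megatron.core import parallel_state

def initialize_distributed(tp_size: int = 1, pp_size: int = 1):
    # torch.distributed must be up before Megatron Core's parallel state.
    os.environ.setdefault('MASTER_ADDR', 'localhost')
    os.environ.setdefault('MASTER_PORT', '29500')
    torch.distributed.init_process_group(backend='nccl', world_size=1, rank=0)
    parallel_state.initialize_model_parallel(
        tensor_model_parallel_size=tp_size,
        pipeline_model_parallel_size=pp_size,
    )

if __name__ == '__main__':
    initialize_distributed()
    print(parallel_state.get_tensor_model_parallel_rank())  # 0 on this toy setup
```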
@@ -107,7 +107,7 @@ def get_score(line):
     except UnicodeDecodeError:
         try:
             decoded_text = encoded_text[:20476].decode('utf8')
-        except:
+        except Exception:
             print("Error occurred")
             data['score'] = None
     return json.dumps(data)
@@ -138,7 +138,7 @@ def get_scores(lines):
     except UnicodeDecodeError:
         try:
             decoded_text = encoded_text[:20476].decode('utf8')
-        except:
+        except Exception:
             print("Error occurred")
             data['score'] = None
     all_data.append(json.dumps(data))
...
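For context on these two hunks: the change from a bare `except` to `except Exception` matters because a bare except also traps BaseException subclasses such as SystemExit and KeyboardInterrupt, so a Ctrl-C could be swallowed inside the scoring loop. A standalone sketch of the decode-with-truncation pattern (the function name and byte limit here are illustrative, not from the patched file):

```python
# Why `except Exception` instead of a bare `except`: KeyboardInterrupt and
# SystemExit derive from BaseException and still propagate with this form.
def decode_prefix(encoded_text: bytes, limit: int = 20476):
    """Decode at most `limit` bytes of UTF-8, tolerating a split codepoint."""
    try:
        return encoded_text[:limit].decode('utf8')
    except Exception:  # e.g. UnicodeDecodeError from truncating mid-codepoint
        return None

print(decode_prefix("héllo".encode('utf8'), limit=2))  # b'h\xc3' -> None
```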
@@ -10,18 +10,20 @@ import os
 import sys
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
                                              os.path.pardir, os.path.pardir)))
-from megatron import get_args
-from megatron import get_timers
-from megatron import get_tokenizer
-from megatron import print_rank_0
+from megatron.training import get_args
+from megatron.training import get_timers
+from megatron.training import get_tokenizer
+from megatron.training import print_rank_0
 from megatron.core import mpu
-from megatron.data.blendable_dataset import BlendableDataset
-from megatron.data.gpt_dataset import build_train_valid_test_datasets
-from megatron.model import GPTModel
+from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
+from megatron.core.datasets.blended_megatron_dataset_config import GPTDatasetConfig
+from megatron.core.datasets.gpt_dataset import GPTDataset
+from megatron.core.datasets.utils import get_blend_from_list
+from megatron.legacy.model import GPTModel
 from megatron.core.enums import ModelType
 from megatron.training import pretrain
-from megatron.utils import get_ltor_masks_and_position_ids
-from megatron.utils import average_losses_across_data_parallel_group
+from megatron.training.utils import get_ltor_masks_and_position_ids
+from megatron.training.utils import average_losses_across_data_parallel_group


 def model_provider(pre_process=True, post_process=True):
     """Build the model."""
@@ -101,24 +103,34 @@ def train_valid_test_datasets_provider(train_val_test_num_samples):
     print_rank_0('> building train, validation, and test datasets '
                  'for GPT ...')
-    train_ds, valid_ds1, test_ds = build_train_valid_test_datasets(
-        data_prefix=args.data_path,
-        data_impl=args.data_impl,
-        splits_string=args.split,
-        train_valid_test_num_samples=train_val_test_num_samples,
-        seq_length=args.seq_length,
-        seed=args.seed,
-        skip_warmup=(not args.mmap_warmup))
+    train_ds, _, test_ds = BlendedMegatronDatasetBuilder(
+        GPTDataset,
+        train_val_test_num_samples,
+        lambda: True,
+        GPTDatasetConfig(
+            blend=get_blend_from_list(args.data_path),
+            split=args.split,
+            random_seed=args.seed,
+            sequence_length=args.seq_length,
+            path_to_cache=args.data_cache_path,
+            return_document_ids=False
+        )
+    ).build()
     print_rank_0("> finished creating finetuning GPT datasets ...")

-    _, valid_ds, _ = build_train_valid_test_datasets(
-        data_prefix=args.data_path2,
-        data_impl="mmap",
-        splits_string="98,2,0",
-        train_valid_test_num_samples=train_val_test_num_samples,
-        seq_length=2048,
-        seed=1234,
-        skip_warmup=(not args.mmap_warmup))
+    _, valid_ds, _ = BlendedMegatronDatasetBuilder(
+        GPTDataset,
+        train_val_test_num_samples,
+        lambda: True,
+        GPTDatasetConfig(
+            blend=get_blend_from_list(args.data_path2),
+            split="98,2,0",
+            random_seed=1234,
+            sequence_length=2048,
+            path_to_cache=args.data_cache_path,
+            return_document_ids=False
+        )
+    ).build()
     print_rank_0("> finished creating pretrained GPT datasets ...")

     return train_ds, valid_ds, test_ds
...
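The hunk above is the heart of the v0.10 migration: the removed `build_train_valid_test_datasets` helper is replaced by `BlendedMegatronDatasetBuilder` plus a `GPTDatasetConfig`. A standalone sketch of the same call shape, using a hypothetical preprocessed-data prefix and only the config fields used above (some core releases require additional fields, e.g. a tokenizer, so treat this as a shape reference rather than a drop-in script):

```python
from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
from megatron.core.datasets.blended_megatron_dataset_config import GPTDatasetConfig
from megatron.core.datasets.gpt_dataset import GPTDataset
from megatron.core.datasets.utils import get_blend_from_list

# "/data/my-corpus_text_document" is a placeholder prefix for the .bin/.idx
# files produced by the preprocessing tools; swap in a real path.
config = GPTDatasetConfig(
    blend=get_blend_from_list(["/data/my-corpus_text_document"]),
    split="98,2,0",          # train/valid/test percentages
    random_seed=1234,
    sequence_length=2048,
    path_to_cache=None,      # None: rebuild the sample index instead of caching
    return_document_ids=False,
)

# One dataset per split; `lambda: True` means "build on every rank".
train_ds, valid_ds, test_ds = BlendedMegatronDatasetBuilder(
    GPTDataset,
    [1000, 100, 0],          # requested sample counts per split
    lambda: True,
    config,
).build()
print(len(train_ds))
```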
@@ -43,7 +43,6 @@ python -m torch.distributed.run $DISTRIBUTED_ARGS \
        --data-path2 ${DATA_BLEND} \
        --vocab-file $VOCAB_FILE \
        --merge-file $MERGE_FILE \
-       --data-impl mmap \
        --split 100,0,0 \
        --distributed-backend nccl \
        --lr-decay-style constant \
...
@@ -9,23 +9,84 @@ import sys
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
                                              os.path.pardir, os.path.pardir)))
 import torch
-from megatron import get_args
-from megatron import get_tokenizer
-from megatron import print_rank_0
-from megatron.checkpointing import load_checkpoint
+from megatron.training import get_args
+from megatron.training import get_tokenizer
+from megatron.training import print_rank_0
+from megatron.training.checkpointing import load_checkpoint
 from megatron.core import mpu
-from megatron.initialize import initialize_megatron
-from megatron.model import GPTModel
+from megatron.training.initialize import initialize_megatron
 from megatron.training import get_model
-from megatron.text_generation import generate_and_post_process
+from megatron.inference.text_generation import generate_and_post_process
+from megatron.training.arguments import core_transformer_config_from_args
+from megatron.core.models.gpt import GPTModel
+from typing import Union
+import megatron.legacy.model
+from megatron.core.transformer.spec_utils import import_module
+from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec, get_gpt_layer_local_spec


-def model_provider(pre_process=True, post_process=True):
-    """Build the model."""
+def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megatron.legacy.model.GPTModel]:
+    """Builds the model.
+
+    If args.use_legacy_models is True, returns the legacy GPT model; otherwise returns the Megatron Core GPT model.
+
+    Args:
+        pre_process (bool, optional): Set to True to compute embeddings. Defaults to True.
+        post_process (bool, optional): Set to True to compute output logits/loss. Defaults to True.
+
+    Returns:
+        Union[GPTModel, megatron.legacy.model.GPTModel]: The returned model
+    """
+    args = get_args()

     print_rank_0('building GPT model ...')
-    model = GPTModel(num_tokentypes=0, parallel_output=False,
-                     pre_process=pre_process, post_process=post_process)
+    config = core_transformer_config_from_args(args)
+
+    if args.use_legacy_models:
+        model = megatron.legacy.model.GPTModel(
+            config,
+            num_tokentypes=0,
+            parallel_output=False,
+            pre_process=pre_process,
+            post_process=post_process
+        )
+    else:
+        if args.spec is None:
+            if args.transformer_impl == 'local':
+                transformer_layer_spec = get_gpt_layer_local_spec(
+                    num_experts=args.num_experts,
+                    moe_grouped_gemm=args.moe_grouped_gemm
+                )
+            elif args.transformer_impl == 'transformer_engine':
+                transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec(
+                    num_experts=args.num_experts,
+                    moe_grouped_gemm=args.moe_grouped_gemm
+                )
+            else:
+                raise ValueError(f"Invalid transformer_impl {args.transformer_impl}")
+        elif args.spec[0] == 'local':
+            transformer_layer_spec = get_gpt_layer_local_spec(
+                num_experts=args.num_experts,
+                moe_grouped_gemm=args.moe_grouped_gemm
+            )
+        else:
+            transformer_layer_spec = import_module(args.spec)
+
+        model = GPTModel(
+            config=config,
+            transformer_layer_spec=transformer_layer_spec,
+            vocab_size=args.padded_vocab_size,
+            max_sequence_length=args.max_position_embeddings,
+            pre_process=pre_process,
+            post_process=post_process,
+            fp16_lm_cross_entropy=args.fp16_lm_cross_entropy,
+            parallel_output=False,
+            share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights,
+            position_embedding_type=args.position_embedding_type,
+            rotary_percent=args.rotary_percent
+        )

     return model
...
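To see how the rewritten model_provider is consumed, here is a condensed, hypothetical driver in the spirit of the surrounding text-generation script (argument parsing, server setup, and error handling omitted; the return shape of generate_and_post_process may vary by release):

```python
from megatron.training import get_args, get_model
from megatron.training.initialize import initialize_megatron
from megatron.training.checkpointing import load_checkpoint
from megatron.inference.text_generation import generate_and_post_process

# Reads the usual Megatron CLI flags (--use-legacy-models, --transformer-impl, ...).
initialize_megatron(args_defaults={'no_load_rng': True, 'no_load_optim': True})
args = get_args()

model = get_model(model_provider, wrap_with_ddp=False)
if args.load is not None:
    _ = load_checkpoint(model, None, None)
model = model[0]  # get_model returns a list of model chunks

responses, _, _, _ = generate_and_post_process(
    model, prompts=["Hello, world"], tokens_to_generate=32)
print(responses[0])
```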