Commit 0d99ae1f authored by silencealiang

add

parent c271aaae
Pipeline #2498 canceled with stages
@@ -91,11 +91,11 @@ def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megat
     if use_te:
         transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec(
             args.num_experts, args.moe_grouped_gemm,
-            args.qk_layernorm, args.multi_latent_attention, args.fp8)
+            args.qk_layernorm, args.multi_latent_attention, args.moe_use_legacy_grouped_gemm)
     else:
         transformer_layer_spec = get_gpt_layer_local_spec(
             args.num_experts, args.moe_grouped_gemm,
-            args.qk_layernorm, args.multi_latent_attention)
+            args.qk_layernorm, args.multi_latent_attention, args.moe_use_legacy_grouped_gemm)
     build_model_context = nullcontext
     build_model_context_args = {}
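In the hunk above, the transformer-engine spec helper previously received args.fp8 as its fifth positional argument; both helpers now receive args.moe_use_legacy_grouped_gemm in that slot. Below is a minimal sketch of the same selection written with keyword arguments, which keeps this kind of positional-signature change from silently binding a value to the wrong parameter; the import path and keyword names are assumptions based on the diff and common Megatron-LM conventions, not something this commit shows.

# Hedged sketch, not the commit's code: the same layer-spec selection using
# keyword arguments, so a positional change (fp8 -> moe_use_legacy_grouped_gemm)
# cannot bind a flag to the wrong parameter. Import path and keyword names are
# assumed, not taken from this diff.
from megatron.core.models.gpt.gpt_layer_specs import (
    get_gpt_layer_local_spec,
    get_gpt_layer_with_transformer_engine_spec,
)

def build_layer_spec(args, use_te: bool):
    kwargs = dict(
        num_experts=args.num_experts,
        moe_grouped_gemm=args.moe_grouped_gemm,
        qk_layernorm=args.qk_layernorm,
        multi_latent_attention=args.multi_latent_attention,
        moe_use_legacy_grouped_gemm=args.moe_use_legacy_grouped_gemm,
    )
    if use_te:
        return get_gpt_layer_with_transformer_engine_spec(**kwargs)
    return get_gpt_layer_local_spec(**kwargs)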
@@ -129,6 +129,8 @@ def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megat
        rope_scaling=args.use_rope_scaling
    )
    print_rank_0(model)
    return model
......
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
@@ -83,12 +83,6 @@ def model_provider(
    assert args.ckpt_format == 'torch', "Only ckpt-format torch is supported for VLM training currently."
    if args.pipeline_model_parallel_size > 1:
        assert not args.freeze_LM, "Freezing a pipeline parallel language model is not currently supported"
        if args.encoder_pipeline_model_parallel_size == 1:
            assert not args.freeze_ViT, "Freezing a vision encoder on its own pipeline rank is not currently supported"
    num_image_embeddings = get_num_image_embeddings(
        args.img_h, args.img_w, args.patch_dim, vision_model_type, args.disable_vision_class_token,
        class_token_len=1, pixel_shuffle=False, use_tile_tags=False
@@ -129,7 +123,7 @@ def model_provider(
        language_transformer_layer_spec = decoder_model_with_local_default_spec(
            args.num_experts, args.moe_grouped_gemm
        )
    # Prepare mask type for any required padding to support CP/SP sequence sharding.
    if mp_padding_needed > 0:
        if language_transformer_layer_spec.submodules.self_attention.params.get('attn_mask_type', '') == AttnMaskType.causal:
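The hunk is cut off inside the if mp_padding_needed > 0: branch. A hedged sketch of what such a branch typically does follows, assuming the intent is to swap the plain causal mask for a padding-aware causal mask once pad tokens are appended for CP/SP sharding; this illustrates the idea and is not necessarily the commit's actual branch body.

# Hedged sketch: when the text sequence is padded so it splits evenly across
# context-/sequence-parallel ranks, a plain causal mask would attend to the pad
# positions, so the layer spec's mask type is switched to a padding-aware causal
# variant. The helper name is illustrative; AttnMaskType comes from megatron.core.
from megatron.core.transformer.enums import AttnMaskType

def mask_type_for_padding(language_transformer_layer_spec, mp_padding_needed: int):
    params = language_transformer_layer_spec.submodules.self_attention.params
    if mp_padding_needed > 0 and params.get('attn_mask_type') == AttnMaskType.causal:
        params['attn_mask_type'] = AttnMaskType.padding_causal
    return language_transformer_layer_spec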
@@ -351,10 +345,10 @@ def get_batch(data_iterator):
    labels = data_i["labels"].long()
    loss_mask = data_f["loss_mask"].float()
    images = data_f["image"].float()
    if cp_size > 1 or args.sequence_parallel:
        vision_model_type = "clip"
        # Calculate the number of image embedding tokens that will be added to the text tokens
        num_image_embeddings_per_tile = get_num_image_embeddings(
            args.img_h, args.img_w, args.patch_dim, vision_model_type, args.disable_vision_class_token, 1
        )
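For reference, the per-tile count returned by get_num_image_embeddings is driven by the patch grid. Below is a rough sketch of that arithmetic for a CLIP-style encoder, with illustrative numbers rather than values from this commit; the real helper also handles pixel shuffle and tile tags, which are ignored here.

# Hedged sketch of the patch-grid arithmetic behind get_num_image_embeddings for a
# CLIP-style vision tower. Ignores pixel shuffle and tile tags; numbers are
# illustrative, not taken from this commit.
def approx_num_image_embeddings(img_h, img_w, patch_dim, disable_class_token, class_token_len=1):
    num_patches = (img_h // patch_dim) * (img_w // patch_dim)
    return num_patches + (0 if disable_class_token else class_token_len)

# Example: a 336x336 image with 14x14 patches gives 24 * 24 = 576 patch embeddings,
# or 577 if the class token is kept.
assert approx_num_image_embeddings(336, 336, 14, disable_class_token=True) == 576
assert approx_num_image_embeddings(336, 336, 14, disable_class_token=False) == 577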
@@ -367,7 +361,7 @@ def get_batch(data_iterator):
        num_images_per_sample = torch.sum(image_token_mask, dim=-1)
        img_seq_len = (num_image_embeddings_per_tile * num_images_per_sample - num_images_per_sample).max()
        packed_seq_params = _get_packed_seq_params(tokens, img_seq_len, mp_padding_needed_for_text)
    # slice batch along sequence dimension for context parallelism
    batch = get_batch_on_this_cp_rank({"tokens": tokens, "position_ids": position_ids})
    attention_mask = None  # Use the attention mask type defined in layer spec. Typically no mask for the vision model and causal mask for the language model.
......
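The img_seq_len expression in the hunk above subtracts num_images_per_sample because each image token already occupies one text position; replacing it with num_image_embeddings_per_tile embeddings only adds (per_tile - 1) positions per image. A small worked example with toy values, not taken from this commit:

# Hedged toy example of the img_seq_len arithmetic: each image token is replaced
# by num_image_embeddings_per_tile embeddings, so it contributes (per_tile - 1)
# extra positions on top of the token it replaces.
import torch

num_image_embeddings_per_tile = 576
image_token_mask = torch.tensor([[0, 1, 0, 0, 1, 0],   # sample 0: two image tokens
                                 [0, 0, 1, 0, 0, 0]])  # sample 1: one image token
num_images_per_sample = torch.sum(image_token_mask, dim=-1)  # tensor([2, 1])
img_seq_len = (num_image_embeddings_per_tile * num_images_per_sample
               - num_images_per_sample).max()
print(img_seq_len)  # tensor(1150), i.e. 2 * (576 - 1)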
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
@@ -6,6 +6,7 @@ regex
 pyyaml
 sentencepiece
 pybind11
+blobfile
 # ==== test ====
 nltk
......
export TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1
export TORCHINDUCTOR_BENCHMARK_FUSION=1
export TORCHINDUCTOR_BENCHMARK_MULTI_TEMPLATES=1
# export TORCHINDUCTOR_BENCHMARK_KERNEL=1
export TORCHINDUCTOR_MAX_AUTOTUNE=1
#export FLASH_ATTENTION_PRINT_PARAM=1
export TORCHINDUCTOR_CACHE_DIR=./cache
# export USE_AOTRITON_FA=1
# export USE_BSHD=1  # use the FlashAttention BSHD layout
# for unique kernel names
# export TORCHINDUCTOR_UNIQUE_KERNEL_NAMES=1
mpirun --allow-run-as-root -np 8 ./Llama_pretraining.sh localhost
for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
export GPU_FLUSH_ON_EXECUTION=1
export HIP_DIRECT_DISPATCH=0
fi
done
mpirun -np 512 --hostfile hostfile_gptmoe \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
./train_GPT-MOE_567B.sh node002 --profiling=$profiling > output.log 2>&1
wait
rm -rf CKPT
#rm -rf mixtral_dataset/my-mixtral_text_document
\ No newline at end of file
for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
export GPU_FLUSH_ON_EXECUTION=1
export HIP_DIRECT_DISPATCH=0
fi
done
mpirun -np 8 --allow-run-as-root \
./train_GPT-MOE_567B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1
wait
rm -rf CKPT
rm -rf mixtral_dataset/my-mixtral_text_document
\ No newline at end of file
for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
export GPU_FLUSH_ON_EXECUTION=1
export HIP_DIRECT_DISPATCH=0
fi
done
mpirun -np 8 --allow-run-as-root \
./train_mixtral_8x7B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1
wait
rm -rf CKPT
rm -rf mixtral_dataset/my-mixtral_text_document
for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
export GPU_FLUSH_ON_EXECUTION=1
export HIP_DIRECT_DISPATCH=0
fi
done
mpirun -np 32 --hostfile hostfile_mixtral8x7B \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
./train_mixtral_8x7B_4nodes.sh node066 --profiling=$profiling > output.log 2>&1
wait
rm -rf CKPT
#rm -rf mixtral_dataset/my-mixtral_text_document
\ No newline at end of file
File mode changed from 100755 to 100644