wuxk1 / Megatron-LM · Commits

Commit e7da80dd, authored Dec 16, 2024 by wxj

Merge branch 'main' into 'main'

Update torchprof support. See merge request !6

Parents: 340ddce9, fef9c0d9
Showing 3 changed files with 140 additions and 82 deletions (+140 -82):

Llama_pretraining.sh             +85 -45
megatron/training/arguments.py    +4 -2
megatron/training/training.py    +51 -35
Llama_pretraining.sh
@@ -19,16 +19,34 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_HCA=mlx5_1,mlx5_2
export NCCL_NET_GDR_LEVEL=SYS
export NCCL_NET_GDR_READ=0
export GLOG_minloglevel=3   # print error-level NCCL logs only
source /opt/dtk/env.sh
# TE's GEMM calls need the hipblaslt library
# export LD_LIBRARY_PATH=/data/hipblaslt-install-0904/lib:$LD_LIBRARY_PATH
# updated rocblas
export LD_LIBRARY_PATH=/data/rocblas-install/lib:$LD_LIBRARY_PATH
# # add synchronization for profiler collection
# export GPU_FLUSH_ON_EXECUTION=1
# export HIP_DIRECT_DISPATCH=0

CHECKPOINT_PATH=./tmp_7b          #$1 #<Specify path>
TENSORBOARD_LOGS_PATH=./tmp_7b    #$2 #<Specify path>
-DATA_PATH="/datasets/oscar-1GB-llama_text_document"  #<Specify path and file prefix>_text_document
+DATA_PATH="/data/datasets/nemo_pretrain/oscar-1GB/oscar-1GB-llama_text_document"  #<Specify path and file prefix>_text_document

# GPT_MODEL_ARGS=(
#     --num-layers 32
#     --hidden-size 5120
#     --ffn-hidden-size 13824
#     --num-attention-heads 40
#     --seq-length 4096 #4096
#     --max-position-embeddings 32768 #4096
#     --num-query-groups 40
#     --group-query-attention
# )

GPT_MODEL_ARGS=(
-    --num-layers 36
+    --num-layers 6
    --hidden-size 4096
    --ffn-hidden-size 11008
    --num-attention-heads 32
@@ -36,17 +54,18 @@ GPT_MODEL_ARGS=(
    --max-position-embeddings 4096
)

-# export NVTE_FLASH_ATTN=1         # use autlass
-# export NVTE_FLASH_ATTN_TRITON=1  # use triton_fa
+# export NVTE_FLASH_ATTN=1         # use cutlass
+export NVTE_FLASH_ATTN_TRITON=1    # use triton_fa

# --transformer-impl transformer_engine
# --use-mcore-models
# --transformer-impl local
# --use-legacy-models
TRAINING_ARGS=(
-    --transformer-impl local
-    --use-legacy-models
+    --transformer-impl transformer_engine
+    --use-mcore-models
    --micro-batch-size 1
-    --global-batch-size 60  #240 #512 #64
-    --train-iters 100
+    --global-batch-size 6   #240 #60 #512 #64
+    --train-iters 10
    --weight-decay 0.1
    --adam-beta1 0.9
    --adam-beta2 0.95
@@ -54,24 +73,32 @@ TRAINING_ARGS=(
    --clip-grad 1.0
    --bf16
    --use-distributed-optimizer
-    --ckpt-format torch
    --disable-bias-linear
-    --overlap-grad-reduce
    --attention-dropout 0
    --hidden-dropout 0
-    --ddp-average-in-collective
-    --recompute-granularity full
-    --recompute-num-layers 5
-    --recompute-method block
    --no-gradient-accumulation-fusion
    --add-qkv-bias
    --swiglu
    --lr 3.0e-5
    --lr-decay-style cosine
    --min-lr 3.0e-6
    --lr-warmup-iters 1
+    --ckpt-format torch
+    --ddp-average-in-collective
+    --recompute-granularity full
+    --recompute-num-layers 5 #0 #
+    --recompute-method block
+    --overlap-grad-reduce
+    --use-flash-attn-triton
)
# --use-flash-attn-ck
# --add-qkv-bias # qwen
# --ckpt-format torch
# --ddp-average-in-collective
# --recompute-granularity full
# --recompute-num-layers 5
# --recompute-method block
# --overlap-grad-reduce
# --use-flash-attn-cutlass
# --use-flash-attn-triton

MODEL_PARALLEL_ARGS=(
@@ -88,7 +115,7 @@ DATA_ARGS=(
    --normalization RMSNorm
    --no-position-embedding
    --tokenizer-type Llama2Tokenizer
-    --tokenizer-model /path/to/llama2_7b_hf/tokenizer.model
+    --tokenizer-model /data/model_weights/llama2_7b_hf/tokenizer.model
)

EVAL_AND_LOGGING_ARGS=(
@@ -102,6 +129,15 @@ EVAL_AND_LOGGING_ARGS=(
    --tensorboard-dir $TENSORBOARD_LOGS_PATH
)

+PROFILE_ARGS=(
+    --profile
+    --profile-step-start 4
+    --profile-step-end 5
+    --use-pytorch-profiler
+    --profile-ranks 0 3
+    --profile-dir prof_data
+)

RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
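For orientation, the step indices in PROFILE_ARGS feed the torch.profiler schedule built in megatron/training/training.py further down in this commit. A minimal sketch of that arithmetic, using the values above (the variable names here are only for illustration):

```python
# How --profile-step-start 4 / --profile-step-end 5 translate into a
# torch.profiler schedule (mirrors the expressions in the training.py hunk below).
profile_step_start = 4
profile_step_end = 5

wait = max(profile_step_start - 1, 0)            # 3 iterations skipped
warmup = 1 if profile_step_start > 0 else 0      # 1 warmup iteration
active = profile_step_end - profile_step_start   # 1 iteration actively traced

print(wait, warmup, active)  # -> 3 1 1
```

With repeat=1 this means roughly one training iteration around step 4/5 ends up in the exported trace.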
@@ -122,47 +158,51 @@ APP="python -u pretrain_gpt.py \
    ${DATA_ARGS[@]} \
    ${EVAL_AND_LOGGING_ARGS[@]} \
    ${DISTRIBUTED_ARGS[@]} \
+    ${PROFILE_ARGS[@]} \
    "

export HIP_VISIBLE_DEVICES=4,5,6,7   # 0,1,2,3 # 4,5,6,7 #,
# export CUDA_VISIBLE_DEVICES=4,5,6,7 # 0,1,2,3,
# ${APP}

case ${LOCAL_RANK} in
[0])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    # ${APP}
    numactl --cpunodebind=0 --membind=0 ${APP}
    # export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    ${APP}
    # numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
[1])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    # ${APP}
    numactl --cpunodebind=0 --membind=0 ${APP}
    # export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    ${APP}
    # numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
[2])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    # ${APP}
    numactl --cpunodebind=0 --membind=0 ${APP}
    # export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    ${APP}
    # numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
[3])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    # ${APP}
    numactl --cpunodebind=0 --membind=0 ${APP}
    # export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    ${APP}
    # numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
[4])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    # [4])
    # export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    # ${APP}
    numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
[5])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    # # numactl --cpunodebind=0 --membind=0 ${APP}
    # ;;
    # [5])
    # export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    # ${APP}
    numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
[6])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    # # numactl --cpunodebind=0 --membind=0 ${APP}
    # ;;
    # [6])
    # export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    # ${APP}
    numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
[7])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    # # numactl --cpunodebind=0 --membind=0 ${APP}
    # ;;
    # [7])
    # export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    # ${APP}
    numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
# # numactl --cpunodebind=0 --membind=0 ${APP}
# ;;
esac
megatron/training/arguments.py
@@ -643,7 +643,7 @@ def validate_args(args, defaults={}):
            '--decoupled-lr and --decoupled-min-lr is not supported in legacy models.'

    # FlashAttention
-    args.use_flash_attn = args.use_flash_attn_ck or args.use_flash_attn_triton
+    args.use_flash_attn = args.use_flash_attn_cutlass or args.use_flash_attn_triton

    # Legacy RoPE arguments
    if args.use_rotary_position_embeddings:
@@ -1265,6 +1265,8 @@ def _add_training_args(parser):
                       dest='use_pytorch_profiler')
    group.add_argument('--profile-ranks', nargs='+', type=int, default=[0],
                       help='Global ranks to profile.')
+    group.add_argument('--profile-dir', type=str, default="./",
+                       help='profile dir to save.')
    group.add_argument('--record-memory-history', action="store_true", default=False,
                       help='Record memory history in last rank.')
    group.add_argument('--memory-snapshot-path', type=str, default="snapshot.pickle",
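As a quick illustration, a throwaway parser (not the project's; only the two option definitions are copied from the diff above) shows how these flags parse the values used in PROFILE_ARGS in Llama_pretraining.sh:

```python
import argparse

# Stand-in parser: only --profile-ranks and --profile-dir mirror the diff above.
parser = argparse.ArgumentParser()
parser.add_argument('--profile-ranks', nargs='+', type=int, default=[0],
                    help='Global ranks to profile.')
parser.add_argument('--profile-dir', type=str, default="./",
                    help='Directory for profiler output.')

args = parser.parse_args(['--profile-ranks', '0', '3', '--profile-dir', 'prof_data'])
print(args.profile_ranks, args.profile_dir)  # -> [0, 3] prof_data
```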
@@ -1358,7 +1360,7 @@ def _add_training_args(parser):
    group.add_argument('--cross-entropy-loss-fusion', action='store_true',
                       help='Enabled fusion of cross entropy loss calculation.',
                       dest='cross_entropy_loss_fusion')
-    group.add_argument('--use-flash-attn-ck', action='store_true',
+    group.add_argument('--use-flash-attn-cutlass', action='store_true',
                       help='use FlashAttention implementation of attention. '
                       'https://arxiv.org/abs/2205.14135')
    group.add_argument('--use-flash-attn-triton', action='store_true',
megatron/training/training.py
@@ -135,6 +135,13 @@ def num_floating_point_operations(args, batch_size):
    # - 2x: A GEMM of a m*n tensor with a n*k tensor requires 2mnk floating-point operations.
    expansion_factor = 3 * 2 * 2
+    # print(f"batch_size: {batch_size}, \
+    #     query_projection_to_hidden_size_ratio: {query_projection_to_hidden_size_ratio}, \
+    #     num_experts_routed_to: {num_experts_routed_to}, \
+    #     gated_linear_multiplier: {gated_linear_multiplier}, \
+    #     shared_expert_ffn_hidden_size: {shared_expert_ffn_hidden_size}, \
+    #     gated_linear_multiplier: {gated_linear_multiplier}, \
+    #     ")

    return (
        expansion_factor
        * batch_size
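For context on the 2mnk rule cited in the comment above, a tiny worked example; the shapes are illustrative only, borrowed from GPT_MODEL_ARGS in Llama_pretraining.sh:

```python
# A (m x n) @ (n x k) GEMM does n multiplies and n adds for each of the
# m*k outputs, i.e. 2*m*n*k floating-point operations in total.
m, n, k = 4096, 4096, 11008   # e.g. token count x hidden_size x ffn_hidden_size
flops = 2 * m * n * k
print(f"{flops:.3e}")         # -> 3.694e+11
```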
@@ -1214,8 +1221,8 @@ def post_training_step_callbacks(model, optimizer, opt_param_scheduler, iteratio
        if args.use_pytorch_profiler:
            assert prof is not None
            prof.stop()
        else:
            torch.cuda.cudart().cudaProfilerStop()
        print_rank_0(f"prof stop!")

    # Manual garbage collection.
    if args.manual_gc:
@@ -1401,15 +1408,27 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler,
    prof = None
    if args.profile and torch.distributed.get_rank() in args.profile_ranks and args.use_pytorch_profiler:
+        def trace_handler(p):
+            from pathlib import Path
+            Path(f"{args.profile_dir}").mkdir(parents=True, exist_ok=True)
+            print(p.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1))
+            p.export_chrome_trace("{path}/trace_rank{rank}_step{step}.json".format(path=args.profile_dir, rank=torch.distributed.get_rank(), step=p.step_num))
        prof = torch.profiler.profile(
+            activities=[
+                torch.profiler.ProfilerActivity.CPU,
+                torch.profiler.ProfilerActivity.CUDA,
+            ],
            schedule=torch.profiler.schedule(
                wait=max(args.profile_step_start - 1, 0),
                warmup=1 if args.profile_step_start > 0 else 0,
                active=args.profile_step_end - args.profile_step_start,
                repeat=1),
-            on_trace_ready=torch.profiler.tensorboard_trace_handler(args.tensorboard_dir),
-            record_shapes=True,
-            with_stack=True)
+            # record_shapes=True,
+            # with_stack=True,
+            on_trace_ready=trace_handler,
+        )
        prof.start()

    # Run training iterations till done.
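A self-contained sketch of the same torch.profiler pattern introduced above: a custom on_trace_ready handler that prints a summary table and exports a Chrome trace. The toy model, step count, and output directory are placeholders, not part of the commit:

```python
import os
import torch

def trace_handler(p):
    # Mirrors the handler in the diff: dump a summary table and a Chrome trace.
    os.makedirs("prof_data", exist_ok=True)
    print(p.key_averages().table(sort_by="self_cpu_time_total", row_limit=10))
    p.export_chrome_trace(f"prof_data/trace_step{p.step_num}.json")

model = torch.nn.Linear(512, 512)
opt = torch.optim.SGD(model.parameters(), lr=1e-3)

prof = torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU],  # add CUDA when a GPU is present
    schedule=torch.profiler.schedule(wait=3, warmup=1, active=1, repeat=1),
    on_trace_ready=trace_handler,
)
prof.start()
for step in range(6):          # stands in for the training loop
    loss = model(torch.randn(8, 512)).sum()
    loss.backward()
    opt.step()
    opt.zero_grad()
    prof.step()                # advances the profiler schedule each iteration
prof.stop()
```

The handler fires once the active window completes, i.e. after the single traced iteration under this schedule.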
@@ -1417,9 +1436,6 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler,
        if args.profile and torch.distributed.get_rank() in args.profile_ranks:
            if args.use_pytorch_profiler:
                prof.step()
            elif iteration == args.profile_step_start:
                torch.cuda.cudart().cudaProfilerStart()
                torch.autograd.profiler.emit_nvtx(record_shapes=True).__enter__()

        maybe_finalize_async_save(blocking=False)