Commit 2b09ea90 authored by liangjing

update

parent af4cf80e
Pipeline #1889 passed
@@ -538,8 +538,6 @@ def validate_args(args, defaults={}):
     if args.decoupled_lr is not None or args.decoupled_min_lr is not None:
         assert not args.use_legacy_models, \
             '--decoupled-lr and --decoupled-min-lr is not supported in legacy models.'
-    # FlashAttention
-    args.use_flash_attn = args.use_flash_attn_ck or args.use_flash_attn_triton
     # Legacy RoPE arguments
     if args.use_rotary_position_embeddings:
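
For readers skimming the diff, a hedged illustration of what this hunk changes (the `Namespace` objects below are stand-ins, not repository code): before this commit the generic `use_flash_attn` flag was derived from the two backend-specific flags inside `validate_args`; after it, the flag is expected to come directly from the `--use-flash-attn` switch added further down.

```python
from argparse import Namespace

# Hypothetical illustration only -- not code from this repository.
# Old behaviour: validate_args derived the generic flag from backend flags.
old_args = Namespace(use_flash_attn_ck=False, use_flash_attn_triton=True)
old_args.use_flash_attn = old_args.use_flash_attn_ck or old_args.use_flash_attn_triton
print(old_args.use_flash_attn)  # True

# New behaviour: the flag is set directly by --use-flash-attn (see the
# _add_training_args change below); no derivation happens in validate_args.
new_args = Namespace(use_flash_attn=True)
print(new_args.use_flash_attn)  # True
```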
@@ -1220,11 +1218,9 @@ def _add_training_args(parser):
     group.add_argument('--cross-entropy-loss-fusion', action='store_true',
                        help='Enabled fusion of cross entropy loss calculation.',
                        dest='cross_entropy_loss_fusion')
-    group.add_argument('--use-flash-attn-ck', action='store_true',
+    group.add_argument('--use-flash-attn', action='store_true',
                        help='use FlashAttention implementation of attention. '
                        'https://arxiv.org/abs/2205.14135')
-    group.add_argument('--use-flash-attn-triton', action='store_true',
-                       help='use FlashAttention implementation of attention using Triton.')
     group.add_argument('--disable-bias-linear', action='store_false',
                        help='Disable bias in the linear layers',
                        dest='add_bias_linear')
...
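
A minimal, self-contained sketch of the consolidated option as it reads after this change; the stand-alone parser construction here is an assumption for illustration, not the project's actual argument module:

```python
import argparse

def _add_training_args(parser):
    # After this commit a single switch enables FlashAttention
    # (https://arxiv.org/abs/2205.14135); the backend-specific
    # --use-flash-attn-ck / --use-flash-attn-triton options are removed.
    group = parser.add_argument_group(title='training')
    group.add_argument('--use-flash-attn', action='store_true',
                       help='use FlashAttention implementation of attention.')
    return parser

if __name__ == '__main__':
    args = _add_training_args(argparse.ArgumentParser()).parse_args(['--use-flash-attn'])
    print(args.use_flash_attn)  # True
```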
@@ -38,10 +38,10 @@ GPT_MODEL_ARGS=(
 TRAINING_ARGS=(
     --log-throughput
-    --transformer-impl local
-    --use-legacy-models
+    --transformer-impl transformer_engine
+    --use-mcore-models
     --micro-batch-size 1
-    --global-batch-size 12 #512
+    --global-batch-size 12
     --train-iters 100
     --weight-decay 0.1
     --adam-beta1 0.9
@@ -50,7 +50,7 @@ TRAINING_ARGS=(
     --clip-grad 1.0
     --bf16
     --use-distributed-optimizer
-    --use-flash-attn-triton
+    --use-flash-attn
     --disable-bias-linear
     --attention-dropout 0
     --hidden-dropout 0
@@ -61,7 +61,6 @@ TRAINING_ARGS=(
     --lr-decay-style cosine
     --min-lr 3.0e-6
     --lr-warmup-iters 1
-    --use-fast-cross-entropy-loss
 )
 MODEL_PARALLEL_ARGS=(
     --sequence-parallel
...
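
The script hunks above pair `--transformer-impl transformer_engine` with `--use-mcore-models` and switch to the single `--use-flash-attn` flag. Below is a toy check (illustrative only; not the project's real argument handling) showing the spellings the updated TRAINING_ARGS now relies on:

```python
import argparse

# Toy parser mirroring only the flags touched by this commit (illustrative).
parser = argparse.ArgumentParser()
parser.add_argument('--transformer-impl', choices=['local', 'transformer_engine'])
parser.add_argument('--use-mcore-models', action='store_true')
parser.add_argument('--use-flash-attn', action='store_true')
parser.add_argument('--global-batch-size', type=int)

# The spellings used by the updated script.
args = parser.parse_args(['--transformer-impl', 'transformer_engine',
                          '--use-mcore-models', '--use-flash-attn',
                          '--global-batch-size', '12'])
print(args)
# The removed spellings (--use-legacy-models, --use-flash-attn-triton,
# --use-fast-cross-entropy-loss) no longer appear in the script.
```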