evt_fugx1 / dcu_megatron — Commits

Commit a2f7bd8d, authored Apr 26, 2025 by silencealiang
Parent: 32afa92c

    add deepseek v3 examples

Showing 3 changed files, with 449 additions and 2 deletions:

  dcu_megatron/adaptor/megatron_adaptor.py         +0   -2
  examples/deepseek_v3/run_deepseek_v3_1node.sh    +15  -0
  examples/deepseek_v3/train_deepseek_v3_1node.sh  +434 -0
dcu_megatron/adaptor/megatron_adaptor.py @ a2f7bd8d

@@ -5,8 +5,6 @@ import types
 import argparse
 import torch
-from .adaptor_arguments import get_adaptor_args
-
 class MegatronAdaptation:
     """
 ...
examples/deepseek_v3/run_deepseek_v3_1node.sh (new file, mode 100644) @ a2f7bd8d
for para in $*
do
    if [[ $para == --profiling* ]]; then
        profiling=${para#*=}
        export GPU_FLUSH_ON_EXECUTION=1
        export HIP_DIRECT_DISPATCH=0
    fi
done

mpirun -np 8 --allow-run-as-root \
    train_deepseek_v3_1node.sh localhost --profiling=$profiling > output.log 2>&1
wait
rm -rf CKPT
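# Example invocation (hypothetical, not part of the commit; assumes both
# scripts sit in examples/deepseek_v3 and train_deepseek_v3_1node.sh is
# executable):
#   bash run_deepseek_v3_1node.sh --profiling=torch
# Without the flag, $profiling stays empty, the train script's --profiling=
# value parses to an empty string, and training runs with no profiler.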
examples/deepseek_v3/train_deepseek_v3_1node.sh (new file, mode 100644) @ a2f7bd8d
#!/bin/bash
for para in $*
do
    if [[ $para == --profiling* ]]; then
        profiling=${para#*=}
        # export GPU_FLUSH_ON_EXECUTION=1
        # export HIP_DIRECT_DISPATCH=0
    fi
done
CURRENT_DIR="$(cd "$(dirname "$0")" && pwd)"
echo $CURRENT_DIR
MEGATRON_PATH=$(dirname $(dirname ${CURRENT_DIR}))
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="/public/home/yuguo/check/rccl-tests-0204/topo-input.xml" #"your topo file"
export GLOG_minloglevel=3
export GROUPED_GEMM_BatchLinear=1
export LD_LIBRARY_PATH=/public/home/yuguo/data/rocblas-install-0224/lib:$LD_LIBRARY_PATH
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
RANK=$OMPI_COMM_WORLD_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
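# The OMPI_COMM_WORLD_* variables are set by Open MPI's mpirun for each spawned
# process, so this script expects to be launched through
# run_deepseek_v3_1node.sh rather than run directly.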
### BASE CONFIG ###
MODEL_SIZE=A37B
BATCH_SIZE=1
GLOBAL_BATCH_SIZE=256
LR=1e-5
MIN_LR=1e-6
SEQ_LEN=4096
PR=bf16
### BASE CONFIG ###

### PARALLEL / BOOL OPTION ###
TP=1
PP=2
CP=1
EP=4
SP=true
DO=true
FL=true
SFT=false
### PARALLEL / BOOL OPTION ###
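# With the mpirun world size of 8 and TP=1, PP=2, CP=1, the data-parallel size
# works out to 8 / (TP*PP*CP) = 4, and EP=4 shards the experts across those
# four data-parallel ranks.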
### OTHERS ###
AC=none
OPTIMIZER_OFFLOAD=false
SAVE_INTERVAL=500
DATASET_PATH=${MEGATRON_PATH}/deepseekv3_dataset/mmap_deepseekv3_datasets_text_document #"your data path"
VALID_DATASET_PATH=${MEGATRON_PATH}/deepseekv3_dataset/mmap_deepseekv3_datasets_text_document #"your data path"
PRETRAIN_CHECKPOINT_PATH=${MEGATRON_PATH}/deepseekv3_dataset #"your model path"
# the following two values will not be used when SFT is true
TRAIN_TOKENS=100000000
WARMUP_TOKENS=10000
###############################
OUTPUT_BASEPATH=./output
### OTHERS ###
if [ $FL = true ]; then
    : #exit -1
elif [ $FL = false ]; then
    export NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=1
    attn_backend_option=" \
        --attention-backend fused"
fi
if [ $MODEL_SIZE = A37B ]; then
    TRAIN_ITERS=2
    HIDDEN_SIZE=7168
    NUM_ATTENTION_HEADS=128
    NUM_LAYERS=2
    INTERMEDIATE_SIZE=18432
    MOE_INTERMEDIATE_SIZE=2048
    MAX_POSITION_EMBEDDINGS=${SEQ_LEN}
    EXTRA_VOCAB_SIZE=467
    Q_LORA_RANK=1536
    KV_LORA_RANK=512
    QK_NOPE_HEAD_DIM=128
    QK_ROPE_HEAD_DIM=64
    V_HEAD_DIM=128
    ROPE_THETA=10000
    SCALE_FACTOR=40
    NUM_EXPERTS=8 #256
    ROUTER_TOPK=8
    NUM_SHARED_EXPERTS=1
    RMS_NORM_EPS=1e-6

    moe_options=" \
        --moe-grouped-gemm \
        --moe-expert-capacity-factor 1 \
        --moe-pad-expert-input-to-capacity \
        --moe-token-dispatcher-type alltoall \
        --moe-router-topk ${ROUTER_TOPK} \
        --num-experts ${NUM_EXPERTS} \
        --expert-model-parallel-size ${EP} \
        --expert-tensor-parallel-size 1 \
        --moe-ffn-hidden-size ${MOE_INTERMEDIATE_SIZE} \
        --moe-router-load-balancing-type aux_loss \
        --moe-aux-loss-coeff 0.001 \
        --moe-layer-freq ([0]*0+[1]*2) \
        --q-lora-rank ${Q_LORA_RANK} \
        --kv-lora-rank ${KV_LORA_RANK} \
        --qk-head-dim ${QK_NOPE_HEAD_DIM} \
        --qk-pos-emb-head-dim ${QK_ROPE_HEAD_DIM} \
        --v-head-dim ${V_HEAD_DIM} \
        --moe-shared-expert-intermediate-size $((${MOE_INTERMEDIATE_SIZE} * ${NUM_SHARED_EXPERTS}))"
fi
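# Note: NUM_EXPERTS is cut from the full model's 256 (see the inline #256
# comment) to 8, and NUM_LAYERS/TRAIN_ITERS to 2, so the example fits one node.
# --moe-shared-expert-intermediate-size evaluates to 2048 * 1 = 2048: the
# per-expert MoE FFN width times the number of shared experts.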
# Here are some configs controlled by env
if [ -z ${MP_DATASET_TYPE} ]; then
    MP_DATASET_TYPE="idxmap"
fi

if [ -z ${MP_AC_LAYERS} ]; then
    MP_AC_LAYERS=1
fi

if [ -z ${MP_VP} ]; then
    vp_option=""
else
    vp_option=" \
        --num-layers-per-virtual-pipeline-stage ${MP_VP}"
fi

if [ -z ${MP_SFT_PACKING} ]; then
    MP_SFT_PACKING=false
fi
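# Example override (hypothetical): exercising the uneven pipeline split with
# one layer on the first stage, relying on Open MPI inheriting the local
# environment for a single-node launch:
#   MP_PP0_LAYERS=1 bash run_deepseek_v3_1node.sh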
TP_COMM_OVERLAP=$(( ($TP > 1) ? 1 : 0 ))
comm_overlap_option=" \
    --overlap-grad-reduce \
    --overlap-param-gather"
if [ $AC = full ]; then
    _check=$(( ($NUM_LAYERS / $PP) % ${MP_AC_LAYERS} ))
    if [ $_check != 0 ]; then
        echo "The number of layers per PP rank must be a multiple of the recompute layers."
        exit -1
    fi
    activation_checkpoint_options=" \
        --recompute-method uniform \
        --recompute-num-layers ${MP_AC_LAYERS} \
        --recompute-granularity full"
elif [ $AC = sel ]; then
    activation_checkpoint_options=" \
        --recompute-activations"
elif [ $AC = none ]; then
    activation_checkpoint_options=" \
        "
elif [ $AC = offload ]; then
    activation_checkpoint_options=" \
        --cpu-offloading \
        --cpu-offloading-num-layers ${MP_AC_LAYERS}"
    if [ $TP_COMM_OVERLAP -eq 1 ]; then
        echo "Disable --overlap-grad-reduce and --overlap-param-gather when cpu offloading is on..."
        comm_overlap_option=" \
            --tp-comm-overlap"
    else
        echo "Disable --overlap-grad-reduce and --overlap-param-gather when cpu offloading is on..."
        comm_overlap_option=""
    fi
fi
if [ $PR = fp16 ]; then
    pr_options=" \
        --fp16 \
        --apply-query-key-layer-scaling"
    export NVTE_APPLY_QK_LAYER_SCALING=1
elif [ $PR = bf16 ]; then
    pr_options=" \
        --bf16"
elif [ $PR = fp8 ]; then
    pr_options=" \
        --bf16 \
        --fp8-format hybrid \
        --fp8-amax-compute-algo max \
        --fp8-amax-history-len 1024"
fi
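# The fp8 path keeps --bf16 as the base dtype and layers Transformer Engine's
# hybrid FP8 recipe on top (max-based amax with a history window of 1024).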
if [ $OPTIMIZER_OFFLOAD != false ] && [ $DO = false ]; then
    echo "Optimizer offload is valid only if \$DO=true"
    DO=true
fi
if [ $DO = true ]; then
    do_option=" \
        --use-distributed-optimizer"
elif [ $DO = false ]; then
    do_option=" \
        "
fi
if [ $SP = true ] && [ $TP -gt 1 ]; then
    sp_option=" \
        --sequence-parallel"
elif [ $SP = false ]; then
    sp_option=" \
        "
fi
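# Note: with SP=true but TP=1, as configured above, neither branch matches and
# sp_option stays unset (later expanding to nothing); sequence parallelism only
# takes effect together with TP > 1.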
if [ -z ${MP_PP0_LAYERS} ]; then
    uneven_split_option=""
elif [ ${PP} -gt 1 ]; then
    _check=$(( ($NUM_LAYERS - ${MP_PP0_LAYERS}) % (${PP} - 1) ))
    if [ $_check != 0 ]; then
        echo "With uneven pipelining, the leftover layers must be divisible by the leftover stages."
        exit -1
    fi
    uneven_split_option=" \
        --decoder-first-pipeline-num-layers ${MP_PP0_LAYERS}"
else
    echo "The uneven pipeline split can only be used when PP > 1."
    exit -1
fi
if [ $PRETRAIN_CHECKPOINT_PATH != none ]; then
    load_option=" \
        --tokenizer-model $PRETRAIN_CHECKPOINT_PATH"
fi
if [ $OPTIMIZER_OFFLOAD != false ]; then
    offload_option=" \
        --optimizer-cpu-offload \
        --use-precision-aware-optimizer \
        --optimizer-offload-fraction ${OPTIMIZER_OFFLOAD}"
fi
if [ $SFT = true ]; then
    TRAIN_ITERS=${24}
    LR_WARMUP_ITERS=${25}
    LR_DECAY_ITERS=$(( ${TRAIN_ITERS} - ${LR_WARMUP_ITERS} ))
    PREFIX="finetune-mcore-deepseek-v3"
else
    # TRAIN_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
    LR_WARMUP_ITERS=$(( ${WARMUP_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
    LR_DECAY_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
    PREFIX="pretrain-mcore-deepseek-v3"
fi
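# For the pretrain branch with the values above, bash integer division gives
# LR_WARMUP_ITERS = 10000/256/4096 = 0 and LR_DECAY_ITERS = 100000000/256/4096 = 95.
# In the SFT branch, ${24} and ${25} read positional parameters 24 and 25,
# which this launcher never passes; they appear to be leftovers from an
# upstream script driven by positional arguments.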
if [ ${MP_DATASET_TYPE} = "raw" ]; then
    dataset_options=" \
        --train-data-path ${DATASET_PATH} \
        --valid-data-path ${VALID_DATASET_PATH} \
        --dataloader-type cyclic \
        --dataset JSON-SFT"
else
    dataset_options=" \
        --data-path ${DATASET_PATH} \
        --split 99,1,0"
fi
if [ ${MP_SFT_PACKING} = true ]; then
    echo "Currently MLA does not support THD-format attention, so sequence packing cannot be used..."
    packing_options=""
else
    packing_options=""
fi
##### Prepare logdirs #######
NAME="${PREFIX}"
mkdir -p "${OUTPUT_BASEPATH}/tensorboard/"
mkdir -p "${OUTPUT_BASEPATH}/checkpoint/"
mkdir -p "${OUTPUT_BASEPATH}/log/"
TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}"
mkdir -p ${TENSORBOARD_DIR}
SAVED_PRETRAIN_CHECKPOINT_PATH="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
mkdir -p ${SAVED_PRETRAIN_CHECKPOINT_PATH}
find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "*.json" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
megatron_options=" \
    --lr ${LR} \
    --min-lr ${MIN_LR} \
    --lr-decay-style cosine \
    --weight-decay 0.1 \
    --adam-beta1 0.9 \
    --adam-beta2 0.95 \
    --clip-grad 1.0 \
    --init-method-std 0.008 \
    --attention-dropout 0.0 \
    --hidden-dropout 0.0 \
    --lr-decay-iters ${LR_DECAY_ITERS} \
    --lr-warmup-iters ${LR_WARMUP_ITERS} \
    --train-iters ${TRAIN_ITERS} \
    --micro-batch-size ${BATCH_SIZE} \
    --global-batch-size ${GLOBAL_BATCH_SIZE} \
    --num-layers ${NUM_LAYERS} \
    --hidden-size ${HIDDEN_SIZE} \
    --num-attention-heads ${NUM_ATTENTION_HEADS} \
    --ffn-hidden-size ${INTERMEDIATE_SIZE} \
    --seq-length ${SEQ_LEN} \
    --max-position-embeddings ${MAX_POSITION_EMBEDDINGS} \
    --log-interval 1 \
    --log-throughput \
    --eval-interval 10000 \
    --eval-iters 5 \
    --save-interval ${SAVE_INTERVAL} \
    --tensorboard-queue-size 1 \
    --tensorboard-dir ${TENSORBOARD_DIR} \
    --log-timers-to-tensorboard \
    --log-validation-ppl-to-tensorboard \
    --tensor-model-parallel-size ${TP} \
    --pipeline-model-parallel-size ${PP} \
    --context-parallel-size ${CP} \
    --no-load-optim \
    --no-load-rng \
    --num-workers 8 \
    --extra-vocab-size ${EXTRA_VOCAB_SIZE} \
    --tokenizer-type DeepSeekV2Tokenizer \
    --swiglu \
    --normalization RMSNorm \
    --norm-epsilon ${RMS_NORM_EPS} \
    --use-rotary-position-embeddings \
    --no-bias-swiglu-fusion \
    --no-rope-fusion \
    --position-embedding-type rope \
    --untie-embeddings-and-output-weights \
    --disable-bias-linear \
    --rotary-base ${ROPE_THETA} \
    --rotary-scaling-factor ${SCALE_FACTOR} \
    --no-save-optim \
    --kv-channels ${V_HEAD_DIM} \
    --qk-layernorm \
    --ckpt-format torch \
    --transformer-impl transformer_engine \
    --use-rope-scaling \
    --multi-latent-attention \
    --mtp-num-layers 1 \
    --use-mcore-models \
    "
TORCH_PROFIE_ARGS=" \
    --profile \
    --profile-ranks 0 1 2 3 4 5 6 7 \
    --profile-step-start 3 \
    --profile-step-end 4 \
    --profile-dir torch_prof_data_16nodes_dcu \
    --use-pytorch-profiler \
    "

HIP_PROFIE_ARGS=" \
    --profile \
    --profile-ranks 0 1 2 3 4 5 6 7 \
    --profile-step-start 4 \
    --profile-step-end 5 \
    --use-hip-profiler \
    "
APP="python3 -u ${MEGATRON_PATH}/pretrain_gpt.py ${megatron_options} \
    ${dataset_options} \
    ${pr_options} \
    ${load_option} \
    ${activation_checkpoint_options} \
    ${do_option} \
    ${sp_option} \
    ${moe_options} \
    ${offload_option} \
    ${sft_options} \
    ${vp_option} \
    ${packing_options} \
    ${uneven_split_option} \
    ${attn_backend_option} \
    ${comm_overlap_option} \
    --rank ${RANK} \
    --world-size ${WORLD_SIZE} \
    --local-rank ${LOCAL_RANK} \
    --dist-url tcp://${1}:25900 \
    "
if [[ $profiling == "torch" ]]; then
    APP+="${TORCH_PROFIE_ARGS}"
elif [[ $profiling == "hip" ]]; then
    mkdir -p hip_prof_data
    APP+="${HIP_PROFIE_ARGS}"
    APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi
case ${LOCAL_RANK} in
    [0])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        ${APP}
        ;;
    [1])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        ${APP}
        ;;
    [2])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        ${APP}
        ;;
    [3])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        ${APP}
        ;;
    [4])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        ${APP}
        ;;
    [5])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        ${APP}
        ;;
    [6])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        ${APP}
        ;;
    [7])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        ${APP}
        ;;
esac
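# All eight case branches are identical: every local rank exports the full
# device list and runs the same command, leaving per-rank device selection to
# the training framework via --local-rank.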