evt_fugx1 / dcu_megatron / Commits

Commit c788823b, authored May 22, 2025 by wxj
Remove the TE batchgemm used on DCU; add the Llama and Qwen scripts for NV
parent 475293db
Showing 15 changed files with 1099 additions and 14 deletions (+1099 −14)
dcu_megatron/adaptor/megatron_adaptor.py     +6    −6
examples/llama/hostfile                      +2    −0
examples/llama/hostfile_llama2_7B            +0    −0
examples/llama/run_llama2_13B.sh             +31   −0
examples/llama/run_llama2_7B.sh              +6    −5
examples/llama/train_llama2_13b_1nodes.sh    +163  −0
examples/llama/train_llama2_13b_2nodes.sh    +163  −0
examples/llama/train_llama2_7b_1nodes.sh     +3    −3
examples/llama/train_llama2_7b_2nodes.sh     +163  −0
examples/qwen/hostfile                       +2    −0
examples/qwen/run_qwen1.5_14B.sh             +31   −0
examples/qwen/run_qwen1.5_32B.sh             +31   −0
examples/qwen/train_qwen1.5_14b_1nodes.sh    +165  −0
examples/qwen/train_qwen1.5_14b_2nodes.sh    +165  −0
examples/qwen/train_qwen1.5_32b_2nodes.sh    +168  −0

dcu_megatron/adaptor/megatron_adaptor.py

@@ -140,13 +140,13 @@ class CoreAdaptation(MegatronAdaptationABC):
         from ..core.extensions.transformer_engine import TEDotProductAttentionPatch
         from megatron.core.extensions.transformer_engine import TEGroupedLinear
 
-        if not is_te_min_version("1.10.0"):
-            # kv channels, te_min_version 1.10.0 -> 1.9.0
-            MegatronAdaptation.register('megatron.core.extensions.transformer_engine.TEDotProductAttention.__init__',
-                                        TEDotProductAttentionPatch.__init__)
-        if int(os.getenv("GROUPED_GEMM_BatchLinear", '0')):
-            TEGroupedLinear.__bases__ = (te.pytorch.BatchedLinear if is_te_min_version("2.3.0.dev0") else te.pytorch.BatchLinear,)
+        # if not is_te_min_version("1.10.0"):
+        #     # kv channels, te_min_version 1.10.0 -> 1.9.0
+        #     MegatronAdaptation.register('megatron.core.extensions.transformer_engine.TEDotProductAttention.__init__',
+        #                                 TEDotProductAttentionPatch.__init__)
+        # if int(os.getenv("GROUPED_GEMM_BatchLinear", '0')):
+        #     TEGroupedLinear.__bases__ = (te.pytorch.BatchedLinear if is_te_min_version("2.3.0.dev0") else te.pytorch.BatchLinear,)
 
     def patch_tensor_parallel(self):
         from ..core.tensor_parallel.cross_entropy import VocabParallelCrossEntropy
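
For reference, the removed batched-GEMM path was opt-in: the adaptor only swapped TEGroupedLinear's base class when the GROUPED_GEMM_BatchLinear environment variable was set to a non-zero integer. With the block commented out, the toggle sketched below no longer has any effect (the exact value 1 is just an illustration; the code read the variable with int(os.getenv(...))).

    # Opt-in toggle that was read by the now commented-out adaptor code via
    # os.getenv("GROUPED_GEMM_BatchLinear", '0'); any non-zero integer enabled the
    # batched-GEMM base class. After this commit, setting it is ignored.
    export GROUPED_GEMM_BatchLinear=1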

examples/llama/hostfile  (new file, 0 → 100644)

a121 slots=8
a124 slots=8
\ No newline at end of file

examples/llama/hostfile_llama2_7B  (deleted, 100644 → 0)

examples/llama/run_llama2_13B.sh  (new file, 0 → 100755)

for para in $*
do
    if [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    fi
done

# Those variables need to modify
GPUS="16"                                                        # how many gpus to use
HOST="a121"                                                      # hostname
PORT="11452"                                                     # port id
DATA_PATH="/data/datasets/oscar-1GB_head-llama2_text_document"   # path to oscar-1GB_head-llama2_text_document
TOKENIZER_MODEL_PATH="/data/models/llama2/tokenizer.model"       # path to tokenizer.model
CHECKPOINT_PATH="./ckpt"                                         # path to ckpt

# Runs Llama2 13B model
mpirun -np ${GPUS} --hostfile hostfile \
    --allow-run-as-root \
    --bind-to none \
    --mca plm_rsh_no_tree_spawn 1 \
    --mca plm_rsh_args "-p 11451" \
    bash -c "
    ./train_llama2_13b_2nodes.sh \
    ${HOST} \
    ${PORT} \
    --data_path=$DATA_PATH \
    --tokenizer_path=$TOKENIZER_MODEL_PATH \
    --checkpoint_path=$CHECKPOINT_PATH \
    --profiling=$profiling
    " > ./logs/log-$((${GPUS}/8))nodes-`date +%F-%H%M`.log 2>&1
wait
\ No newline at end of file
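
A minimal usage sketch for the launcher above. The --profiling values (torch or hip) are the ones recognized by the train script it calls; creating ./logs first is an assumption, since the launcher redirects its output there without creating the directory.

    mkdir -p ./logs                           # the launcher writes ./logs/log-2nodes-<date>.log but does not create ./logs
    bash run_llama2_13B.sh                    # plain 2-node run on a121/a124
    bash run_llama2_13B.sh --profiling=torch  # PyTorch profiler per the train script's profiler args (or --profiling=hip for hipprof)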

examples/llama/run_llama2_7B.sh

@@ -6,9 +6,9 @@ do
 done
 
 # Those variables need to modify
-GPUS="4"                                                         # how many gpus to use
-HOST="localhost"                                                 # hostname
-PORT="11451"                                                     # port id
+GPUS="16"                                                        # how many gpus to use
+HOST="a121"                                                      # hostname
+PORT="11452"                                                     # port id
 DATA_PATH="/data/datasets/oscar-1GB_head-llama2_text_document"   # path to oscar-1GB_head-llama2_text_document
 TOKENIZER_MODEL_PATH="/data/models/llama2/tokenizer.model"       # path to tokenizer.model
 CHECKPOINT_PATH="./ckpt"                                         # path to ckpt

@@ -18,13 +18,14 @@ mpirun -np ${GPUS} --hostfile hostfile \
     --allow-run-as-root \
     --bind-to none \
     --mca plm_rsh_no_tree_spawn 1 \
+    --mca plm_rsh_args "-p 11451" \
     bash -c "
-    ./train_llama2_7b_1nodes.sh \
+    ./train_llama2_7b_2nodes.sh \
     ${HOST} \
     ${PORT} \
     --data_path=$DATA_PATH \
     --tokenizer_path=$TOKENIZER_MODEL_PATH \
     --checkpoint_path=$CHECKPOINT_PATH \
     --profiling=$profiling
-    " > log-$((${GPUS}/8))nodes-`date +%F-%H%M`.log 2>&1
+    " > ./logs/log-$((${GPUS}/8))nodes-`date +%F-%H%M`.log 2>&1
 wait
\ No newline at end of file
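
The line added in the second hunk, --mca plm_rsh_args "-p 11451", passes extra arguments to Open MPI's ssh-based launcher, so remote daemons are started over SSH on port 11451 instead of the default 22. A quick manual check of that launch path, with the host name taken from the hostfile and assuming a standard OpenSSH client:

    ssh -p 11451 a124 hostname   # should print the remote hostname; if this fails, mpirun's remote launch will fail the same way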

examples/llama/train_llama2_13b_1nodes.sh  (new file, 0 → 100755)

#!/bin/bash

INITIALIZATION_ARGS=(
    --num-workers 2
)

for para in $*
do
    if [[ $para == --data_path* ]]; then
        data_path=${para#*=}
    elif [[ $para == --tokenizer_path* ]]; then
        tokenizer_path=${para#*=}
    elif [[ $para == --checkpoint_path* ]]; then
        checkpoint_path=${para#*=}
    elif [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    elif [[ $para == --reproduce* ]]; then
        INITIALIZATION_ARGS=(
            --reproduce
            --num-workers 0
        )
        export MIOPEN_DEBUG_CONVOLUTION_DETERMINISTIC=1   # enable MIOpen deterministic algorithms
        export ROCBLAS_ATOMICS_MOD=0                      # disable rocBLAS atomic operations
        # disable the MIOpen convolution algorithms that use atomics; keep only the GEMM algorithm
        export MIOPEN_DEBUG_CONV_FFT=0
        export MIOPEN_DEBUG_CONV_DIRECT=0
        export MIOPEN_DEBUG_CONV_GEMM=1
        export MIOPEN_DEBUG_CONV_WINOGRAD=0
        export MIOPEN_DEBUG_CONV_IMPLICIT_GEMM=0
    fi
done

# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}

# default env
DIST_URL=${1}
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
CURRENT_DIR="$(cd "$(dirname "$0")" && pwd)"
MEGATRON_PATH=$(dirname $(dirname ${CURRENT_DIR}))

export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH

# torch: collapse multiple streams into a single stream
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export SENDRECV_STREAM_WITH_COMPUTE=1

# increase the compile cache
export cache_size_limit=64

DISTRIBUTED_ARGS=(
    --rank ${RANK}
    --world-size ${WORLD_SIZE}
    --local-rank ${LOCAL_RANK}
    --dist-url tcp://${DIST_URL}:${DIST_PORT}
)

GPT_MODEL_ARGS=(
    --seq-length 4096
    --num-layers 40
    --hidden-size 5120
    --ffn-hidden-size 13824
    --num-attention-heads 40
    --max-position-embeddings 4096
    --normalization RMSNorm
    --position-embedding-type rope
    --untie-embeddings-and-output-weights
)

TRAINING_ARGS=(
    --transformer-impl local
    --use-legacy-models
    --micro-batch-size 1
    --global-batch-size 256
    --train-iters 50
    --weight-decay 0.1
    --adam-beta1 0.9
    --adam-beta2 0.95
    --init-method-std 0.006
    --clip-grad 1.0
    --bf16
    --disable-bias-linear
    --attention-dropout 0
    --hidden-dropout 0
    --swiglu
    --lr 3.0e-5
    --lr-decay-style cosine
    --min-lr 3.0e-6
    --lr-warmup-iters 1
    --ckpt-format torch
    --ddp-average-in-collective
    --overlap-grad-reduce
    --use-flash-attn
)

MODEL_PARALLEL_ARGS=(
    --tensor-model-parallel-size 2
    --pipeline-model-parallel-size 2
    --context-parallel-size 1
    --use-distributed-optimizer
    --sequence-parallel
)

DATA_ARGS=(
    --tokenizer-type Llama2Tokenizer
    --tokenizer-model ${TOKENIZER_MODEL_PATH}
    --data-path ${DATA_PATH}
    --split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
    --log-throughput
    --eval-iters 5
    --log-interval 1
    --save-interval 1000
    --eval-interval 1000
    --save $CHECKPOINT_PATH
    --load $CHECKPOINT_PATH
    --tensorboard-dir "${CHECKPOINT_PATH}/tensorboard"
)

TORCH_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 3
    --profile-step-end 4
    --profile-dir torch_prof_llama_1nodes_tp1-pp2-cp1
    --use-pytorch-profiler
)

HIP_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 4
    --profile-step-end 5
    --use-hip-profiler
)

APP="python -u ${MEGATRON_PATH}/pretrain_gpt.py \
    ${GPT_MODEL_ARGS[@]} \
    ${TRAINING_ARGS[@]} \
    ${MODEL_PARALLEL_ARGS[@]} \
    ${DATA_ARGS[@]} \
    ${EVAL_AND_LOGGING_ARGS[@]} \
    ${DISTRIBUTED_ARGS[@]} \
    ${INITIALIZATION_ARGS[@]} \
    "

if [[ $profiling == "torch" ]]; then
    APP+=" ${TORCH_PROFIE_ARGS[@]} "
elif [[ $profiling == "hip" ]]; then
    mkdir -p hip_prof_data
    APP+=" ${HIP_PROFIE_ARGS[@]} "
    APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi

#for hygon cpu
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
\ No newline at end of file
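
The script above expects to run under mpirun (it reads OMPI_COMM_WORLD_* for its ranks) and takes the master host and port as positional arguments, followed by the key=value flags parsed in the loop at the top. A single-node launch sketch, with paths reused from run_llama2_13B.sh; localhost and the port are placeholders chosen here:

    mpirun -np 8 --allow-run-as-root --bind-to none \
        bash -c "./train_llama2_13b_1nodes.sh localhost 11452 \
            --data_path=/data/datasets/oscar-1GB_head-llama2_text_document \
            --tokenizer_path=/data/models/llama2/tokenizer.model \
            --checkpoint_path=./ckpt \
            --reproduce"   # --reproduce switches on the deterministic MIOpen/rocBLAS settings and --num-workers 0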

examples/llama/train_llama2_13b_2nodes.sh  (new file, 0 → 100755)

#!/bin/bash

INITIALIZATION_ARGS=(
    --num-workers 2
)

for para in $*
do
    if [[ $para == --data_path* ]]; then
        data_path=${para#*=}
    elif [[ $para == --tokenizer_path* ]]; then
        tokenizer_path=${para#*=}
    elif [[ $para == --checkpoint_path* ]]; then
        checkpoint_path=${para#*=}
    elif [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    elif [[ $para == --reproduce* ]]; then
        INITIALIZATION_ARGS=(
            --reproduce
            --num-workers 0
        )
        export MIOPEN_DEBUG_CONVOLUTION_DETERMINISTIC=1   # enable MIOpen deterministic algorithms
        export ROCBLAS_ATOMICS_MOD=0                      # disable rocBLAS atomic operations
        # disable the MIOpen convolution algorithms that use atomics; keep only the GEMM algorithm
        export MIOPEN_DEBUG_CONV_FFT=0
        export MIOPEN_DEBUG_CONV_DIRECT=0
        export MIOPEN_DEBUG_CONV_GEMM=1
        export MIOPEN_DEBUG_CONV_WINOGRAD=0
        export MIOPEN_DEBUG_CONV_IMPLICIT_GEMM=0
    fi
done

# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}

# default env
DIST_URL=${1}
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
CURRENT_DIR="$(cd "$(dirname "$0")" && pwd)"
MEGATRON_PATH=$(dirname $(dirname ${CURRENT_DIR}))

export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH

# torch: collapse multiple streams into a single stream
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export SENDRECV_STREAM_WITH_COMPUTE=1

# increase the compile cache
export cache_size_limit=64

DISTRIBUTED_ARGS=(
    --rank ${RANK}
    --world-size ${WORLD_SIZE}
    --local-rank ${LOCAL_RANK}
    --dist-url tcp://${DIST_URL}:${DIST_PORT}
)

GPT_MODEL_ARGS=(
    --seq-length 4096
    --num-layers 40
    --hidden-size 5120
    --ffn-hidden-size 13824
    --num-attention-heads 40
    --max-position-embeddings 4096
    --normalization RMSNorm
    --position-embedding-type rope
    --untie-embeddings-and-output-weights
)

TRAINING_ARGS=(
    --transformer-impl local
    --use-legacy-models
    --micro-batch-size 1
    --global-batch-size 256
    --train-iters 50
    --weight-decay 0.1
    --adam-beta1 0.9
    --adam-beta2 0.95
    --init-method-std 0.006
    --clip-grad 1.0
    --bf16
    --disable-bias-linear
    --attention-dropout 0
    --hidden-dropout 0
    --swiglu
    --lr 3.0e-5
    --lr-decay-style cosine
    --min-lr 3.0e-6
    --lr-warmup-iters 1
    --ckpt-format torch
    --ddp-average-in-collective
    --overlap-grad-reduce
    --use-flash-attn
)

MODEL_PARALLEL_ARGS=(
    --tensor-model-parallel-size 2
    --pipeline-model-parallel-size 2
    --context-parallel-size 1
    --use-distributed-optimizer
    --sequence-parallel
)

DATA_ARGS=(
    --tokenizer-type Llama2Tokenizer
    --tokenizer-model ${TOKENIZER_MODEL_PATH}
    --data-path ${DATA_PATH}
    --split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
    --log-throughput
    --eval-iters 5
    --log-interval 1
    --save-interval 1000
    --eval-interval 1000
    --save $CHECKPOINT_PATH
    --load $CHECKPOINT_PATH
    --tensorboard-dir "${CHECKPOINT_PATH}/tensorboard"
)

TORCH_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 3
    --profile-step-end 4
    --profile-dir torch_prof_llama_1nodes_tp1-pp2-cp1
    --use-pytorch-profiler
)

HIP_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 4
    --profile-step-end 5
    --use-hip-profiler
)

APP="python -u ${MEGATRON_PATH}/pretrain_gpt.py \
    ${GPT_MODEL_ARGS[@]} \
    ${TRAINING_ARGS[@]} \
    ${MODEL_PARALLEL_ARGS[@]} \
    ${DATA_ARGS[@]} \
    ${EVAL_AND_LOGGING_ARGS[@]} \
    ${DISTRIBUTED_ARGS[@]} \
    ${INITIALIZATION_ARGS[@]} \
    "

if [[ $profiling == "torch" ]]; then
    APP+=" ${TORCH_PROFIE_ARGS[@]} "
elif [[ $profiling == "hip" ]]; then
    mkdir -p hip_prof_data
    APP+=" ${HIP_PROFIE_ARGS[@]} "
    APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi

#for hygon cpu
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
\ No newline at end of file

examples/llama/train_llama2_7b_1nodes.sh

@@ -72,8 +72,8 @@ GPT_MODEL_ARGS=(
 )
 
 TRAINING_ARGS=(
-    --transformer-impl local
-    --use-legacy-models
+    --transformer-impl transformer_engine
+    --use-mcore-models
     --micro-batch-size 1
     --global-batch-size 64
     --train-iters 50

@@ -159,5 +159,5 @@ elif [[ $profiling == "hip" ]]; then
 fi
 
 #for hygon cpu
-export CUDA_VISIBLE_DEVICES=4,5,6,7
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
 ${APP}
\ No newline at end of file

examples/llama/train_llama2_7b_2nodes.sh  (new file, 0 → 100755)

#!/bin/bash

INITIALIZATION_ARGS=(
    --num-workers 2
)

for para in $*
do
    if [[ $para == --data_path* ]]; then
        data_path=${para#*=}
    elif [[ $para == --tokenizer_path* ]]; then
        tokenizer_path=${para#*=}
    elif [[ $para == --checkpoint_path* ]]; then
        checkpoint_path=${para#*=}
    elif [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    elif [[ $para == --reproduce* ]]; then
        INITIALIZATION_ARGS=(
            --reproduce
            --num-workers 0
        )
        export MIOPEN_DEBUG_CONVOLUTION_DETERMINISTIC=1   # enable MIOpen deterministic algorithms
        export ROCBLAS_ATOMICS_MOD=0                      # disable rocBLAS atomic operations
        # disable the MIOpen convolution algorithms that use atomics; keep only the GEMM algorithm
        export MIOPEN_DEBUG_CONV_FFT=0
        export MIOPEN_DEBUG_CONV_DIRECT=0
        export MIOPEN_DEBUG_CONV_GEMM=1
        export MIOPEN_DEBUG_CONV_WINOGRAD=0
        export MIOPEN_DEBUG_CONV_IMPLICIT_GEMM=0
    fi
done

# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}

# default env
DIST_URL=${1}
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
CURRENT_DIR="$(cd "$(dirname "$0")" && pwd)"
MEGATRON_PATH=$(dirname $(dirname ${CURRENT_DIR}))

export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH

# torch: collapse multiple streams into a single stream
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export SENDRECV_STREAM_WITH_COMPUTE=1

# increase the compile cache
export cache_size_limit=64

DISTRIBUTED_ARGS=(
    --rank ${RANK}
    --world-size ${WORLD_SIZE}
    --local-rank ${LOCAL_RANK}
    --dist-url tcp://${DIST_URL}:${DIST_PORT}
)

GPT_MODEL_ARGS=(
    --seq-length 4096
    --num-layers 32
    --hidden-size 4096
    --ffn-hidden-size 11008
    --num-attention-heads 32
    --max-position-embeddings 4096
    --normalization RMSNorm
    --position-embedding-type rope
    --untie-embeddings-and-output-weights
)

TRAINING_ARGS=(
    --transformer-impl local
    --use-legacy-models
    --micro-batch-size 1
    --global-batch-size 256
    --train-iters 50
    --weight-decay 0.1
    --adam-beta1 0.9
    --adam-beta2 0.95
    --init-method-std 0.006
    --clip-grad 1.0
    --bf16
    --disable-bias-linear
    --attention-dropout 0
    --hidden-dropout 0
    --swiglu
    --lr 3.0e-5
    --lr-decay-style cosine
    --min-lr 3.0e-6
    --lr-warmup-iters 1
    --ckpt-format torch
    --ddp-average-in-collective
    --overlap-grad-reduce
    --use-flash-attn
)

MODEL_PARALLEL_ARGS=(
    --tensor-model-parallel-size 1
    --pipeline-model-parallel-size 2
    --context-parallel-size 1
    --use-distributed-optimizer
    --sequence-parallel
)

DATA_ARGS=(
    --tokenizer-type Llama2Tokenizer
    --tokenizer-model ${TOKENIZER_MODEL_PATH}
    --data-path ${DATA_PATH}
    --split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
    --log-throughput
    --eval-iters 5
    --log-interval 1
    --save-interval 1000
    --eval-interval 1000
    --save $CHECKPOINT_PATH
    --load $CHECKPOINT_PATH
    --tensorboard-dir "${CHECKPOINT_PATH}/tensorboard"
)

TORCH_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 3
    --profile-step-end 4
    --profile-dir torch_prof_llama_1nodes_tp1-pp2-cp1
    --use-pytorch-profiler
)

HIP_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 4
    --profile-step-end 5
    --use-hip-profiler
)

APP="python -u ${MEGATRON_PATH}/pretrain_gpt.py \
    ${GPT_MODEL_ARGS[@]} \
    ${TRAINING_ARGS[@]} \
    ${MODEL_PARALLEL_ARGS[@]} \
    ${DATA_ARGS[@]} \
    ${EVAL_AND_LOGGING_ARGS[@]} \
    ${DISTRIBUTED_ARGS[@]} \
    ${INITIALIZATION_ARGS[@]} \
    "

if [[ $profiling == "torch" ]]; then
    APP+=" ${TORCH_PROFIE_ARGS[@]} "
elif [[ $profiling == "hip" ]]; then
    mkdir -p hip_prof_data
    APP+=" ${HIP_PROFIE_ARGS[@]} "
    APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi

#for hygon cpu
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
\ No newline at end of file

examples/qwen/hostfile  (new file, 0 → 100644)

a121 slots=8
a124 slots=8
\ No newline at end of file

examples/qwen/run_qwen1.5_14B.sh  (new file, 0 → 100755)

for para in $*
do
    if [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    fi
done

# Those variables need to modify
GPUS="16"                                                      # how many gpus to use
HOST="a121"                                                    # hostname
PORT="11452"                                                   # port id
DATA_PATH="/data/datasets/oscar-1GB_head-qwen_text_document"   # path to oscar-1GB_head-qwen_text_document
TOKENIZER_MODEL_PATH="/data/models/qwen1.5"                    # path to the qwen1.5 tokenizer directory
CHECKPOINT_PATH="./ckpt"                                       # path to ckpt

# Runs Qwen1.5 14B model
mpirun -np ${GPUS} --hostfile hostfile \
    --allow-run-as-root \
    --bind-to none \
    --mca plm_rsh_no_tree_spawn 1 \
    --mca plm_rsh_args "-p 11451" \
    bash -c "
    ./train_qwen1.5_14b_2nodes.sh \
    ${HOST} \
    ${PORT} \
    --data_path=$DATA_PATH \
    --tokenizer_path=$TOKENIZER_MODEL_PATH \
    --checkpoint_path=$CHECKPOINT_PATH \
    --profiling=$profiling
    " > ./logs/log-$((${GPUS}/8))nodes-`date +%F-%H%M`.log 2>&1
wait
\ No newline at end of file
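
Unlike the Llama scripts, the Qwen train scripts below point TOKENIZER_MODEL_PATH at a directory and read merges.txt and vocab.json from it via --merge-file and --vocab-file. A quick sanity check before launching, using the paths from the script above:

    ls /data/models/qwen1.5/merges.txt /data/models/qwen1.5/vocab.json   # both files are required by QwenTokenizer
    mkdir -p ./logs && bash run_qwen1.5_14B.sh --profiling=hip           # hip profiling wraps the run in hipprof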

examples/qwen/run_qwen1.5_32B.sh  (new file, 0 → 100755)

for para in $*
do
    if [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    fi
done

# Those variables need to modify
GPUS="16"                                                      # how many gpus to use
HOST="a121"                                                    # hostname
PORT="11452"                                                   # port id
DATA_PATH="/data/datasets/oscar-1GB_head-qwen_text_document"   # path to oscar-1GB_head-qwen_text_document
TOKENIZER_MODEL_PATH="/data/models/qwen1.5"                    # path to the qwen1.5 tokenizer directory
CHECKPOINT_PATH="./ckpt"                                       # path to ckpt

# Runs Qwen1.5 32B model
mpirun -np ${GPUS} --hostfile hostfile \
    --allow-run-as-root \
    --bind-to none \
    --mca plm_rsh_no_tree_spawn 1 \
    --mca plm_rsh_args "-p 11451" \
    bash -c "
    ./train_qwen1.5_32b_2nodes.sh \
    ${HOST} \
    ${PORT} \
    --data_path=$DATA_PATH \
    --tokenizer_path=$TOKENIZER_MODEL_PATH \
    --checkpoint_path=$CHECKPOINT_PATH \
    --profiling=$profiling
    " > ./logs/log-$((${GPUS}/8))nodes-`date +%F-%H%M`.log 2>&1
wait
\ No newline at end of file

examples/qwen/train_qwen1.5_14b_1nodes.sh  (new file, 0 → 100755)

#!/bin/bash

INITIALIZATION_ARGS=(
    --num-workers 2
)

for para in $*
do
    if [[ $para == --data_path* ]]; then
        data_path=${para#*=}
    elif [[ $para == --tokenizer_path* ]]; then
        tokenizer_path=${para#*=}
    elif [[ $para == --checkpoint_path* ]]; then
        checkpoint_path=${para#*=}
    elif [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    elif [[ $para == --reproduce* ]]; then
        INITIALIZATION_ARGS=(
            --reproduce
            --num-workers 0
        )
        export MIOPEN_DEBUG_CONVOLUTION_DETERMINISTIC=1   # enable MIOpen deterministic algorithms
        export ROCBLAS_ATOMICS_MOD=0                      # disable rocBLAS atomic operations
        # disable the MIOpen convolution algorithms that use atomics; keep only the GEMM algorithm
        export MIOPEN_DEBUG_CONV_FFT=0
        export MIOPEN_DEBUG_CONV_DIRECT=0
        export MIOPEN_DEBUG_CONV_GEMM=1
        export MIOPEN_DEBUG_CONV_WINOGRAD=0
        export MIOPEN_DEBUG_CONV_IMPLICIT_GEMM=0
    fi
done

# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}

# default env
DIST_URL=${1}
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
CURRENT_DIR="$(cd "$(dirname "$0")" && pwd)"
MEGATRON_PATH=$(dirname $(dirname ${CURRENT_DIR}))

export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH

# torch: collapse multiple streams into a single stream
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export SENDRECV_STREAM_WITH_COMPUTE=1

# increase the compile cache
export cache_size_limit=64

DISTRIBUTED_ARGS=(
    --rank ${RANK}
    --world-size ${WORLD_SIZE}
    --local-rank ${LOCAL_RANK}
    --dist-url tcp://${DIST_URL}:${DIST_PORT}
)

GPT_MODEL_ARGS=(
    --seq-length 4096
    --num-layers 40
    --hidden-size 5120
    --ffn-hidden-size 13696
    --num-attention-heads 40
    --max-position-embeddings 32768
    --normalization RMSNorm
    --position-embedding-type rope
    --untie-embeddings-and-output-weights
)

TRAINING_ARGS=(
    --transformer-impl transformer_engine
    --use-mcore-models
    --micro-batch-size 1
    --global-batch-size 256
    --train-iters 50
    --weight-decay 0.1
    --adam-beta1 0.9
    --adam-beta2 0.95
    --init-method-std 0.006
    --clip-grad 1.0
    --bf16
    --disable-bias-linear
    --attention-dropout 0
    --hidden-dropout 0
    --swiglu
    --add-qkv-bias
    --lr 3.0e-5
    --lr-decay-style cosine
    --min-lr 3.0e-6
    --lr-warmup-iters 1
    --ckpt-format torch
    --ddp-average-in-collective
    --overlap-grad-reduce
    --use-flash-attn
)

MODEL_PARALLEL_ARGS=(
    --tensor-model-parallel-size 2
    --pipeline-model-parallel-size 4
    --context-parallel-size 1
    --use-distributed-optimizer
    --sequence-parallel
)

DATA_ARGS=(
    --tokenizer-type QwenTokenizer
    --merge-file ${TOKENIZER_MODEL_PATH}/merges.txt
    --vocab-file ${TOKENIZER_MODEL_PATH}/vocab.json
    --data-path ${DATA_PATH}
    --split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
    --log-throughput
    --eval-iters 5
    --log-interval 1
    --save-interval 1000
    --eval-interval 1000
    --save $CHECKPOINT_PATH
    --load $CHECKPOINT_PATH
    --tensorboard-dir "${CHECKPOINT_PATH}/tensorboard"
)

TORCH_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 3
    --profile-step-end 4
    --profile-dir torch_prof_llama_1nodes_tp1-pp2-cp1
    --use-pytorch-profiler
)

HIP_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 4
    --profile-step-end 5
    --use-hip-profiler
)

APP="python -u ${MEGATRON_PATH}/pretrain_gpt.py \
    ${GPT_MODEL_ARGS[@]} \
    ${TRAINING_ARGS[@]} \
    ${MODEL_PARALLEL_ARGS[@]} \
    ${DATA_ARGS[@]} \
    ${EVAL_AND_LOGGING_ARGS[@]} \
    ${DISTRIBUTED_ARGS[@]} \
    ${INITIALIZATION_ARGS[@]} \
    "

if [[ $profiling == "torch" ]]; then
    APP+=" ${TORCH_PROFIE_ARGS[@]} "
elif [[ $profiling == "hip" ]]; then
    mkdir -p hip_prof_data
    APP+=" ${HIP_PROFIE_ARGS[@]} "
    APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi

#for hygon cpu
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
\ No newline at end of file

examples/qwen/train_qwen1.5_14b_2nodes.sh  (new file, 0 → 100755)

#!/bin/bash

INITIALIZATION_ARGS=(
    --num-workers 2
)

for para in $*
do
    if [[ $para == --data_path* ]]; then
        data_path=${para#*=}
    elif [[ $para == --tokenizer_path* ]]; then
        tokenizer_path=${para#*=}
    elif [[ $para == --checkpoint_path* ]]; then
        checkpoint_path=${para#*=}
    elif [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    elif [[ $para == --reproduce* ]]; then
        INITIALIZATION_ARGS=(
            --reproduce
            --num-workers 0
        )
        export MIOPEN_DEBUG_CONVOLUTION_DETERMINISTIC=1   # enable MIOpen deterministic algorithms
        export ROCBLAS_ATOMICS_MOD=0                      # disable rocBLAS atomic operations
        # disable the MIOpen convolution algorithms that use atomics; keep only the GEMM algorithm
        export MIOPEN_DEBUG_CONV_FFT=0
        export MIOPEN_DEBUG_CONV_DIRECT=0
        export MIOPEN_DEBUG_CONV_GEMM=1
        export MIOPEN_DEBUG_CONV_WINOGRAD=0
        export MIOPEN_DEBUG_CONV_IMPLICIT_GEMM=0
    fi
done

# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}

# default env
DIST_URL=${1}
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
CURRENT_DIR="$(cd "$(dirname "$0")" && pwd)"
MEGATRON_PATH=$(dirname $(dirname ${CURRENT_DIR}))

export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH

# torch: collapse multiple streams into a single stream
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export SENDRECV_STREAM_WITH_COMPUTE=1

# increase the compile cache
export cache_size_limit=64

DISTRIBUTED_ARGS=(
    --rank ${RANK}
    --world-size ${WORLD_SIZE}
    --local-rank ${LOCAL_RANK}
    --dist-url tcp://${DIST_URL}:${DIST_PORT}
)

GPT_MODEL_ARGS=(
    --seq-length 4096
    --num-layers 40
    --hidden-size 5120
    --ffn-hidden-size 13696
    --num-attention-heads 40
    --max-position-embeddings 32768
    --normalization RMSNorm
    --position-embedding-type rope
    --untie-embeddings-and-output-weights
)

TRAINING_ARGS=(
    --transformer-impl transformer_engine
    --use-mcore-models
    --micro-batch-size 1
    --global-batch-size 256
    --train-iters 50
    --weight-decay 0.1
    --adam-beta1 0.9
    --adam-beta2 0.95
    --init-method-std 0.006
    --clip-grad 1.0
    --bf16
    --disable-bias-linear
    --attention-dropout 0
    --hidden-dropout 0
    --swiglu
    --add-qkv-bias
    --lr 3.0e-5
    --lr-decay-style cosine
    --min-lr 3.0e-6
    --lr-warmup-iters 1
    --ckpt-format torch
    --ddp-average-in-collective
    --overlap-grad-reduce
    --use-flash-attn
)

MODEL_PARALLEL_ARGS=(
    --tensor-model-parallel-size 2
    --pipeline-model-parallel-size 2
    --context-parallel-size 1
    --use-distributed-optimizer
    --sequence-parallel
)

DATA_ARGS=(
    --tokenizer-type QwenTokenizer
    --merge-file ${TOKENIZER_MODEL_PATH}/merges.txt
    --vocab-file ${TOKENIZER_MODEL_PATH}/vocab.json
    --data-path ${DATA_PATH}
    --split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
    --log-throughput
    --eval-iters 5
    --log-interval 1
    --save-interval 1000
    --eval-interval 1000
    --save $CHECKPOINT_PATH
    --load $CHECKPOINT_PATH
    --tensorboard-dir "${CHECKPOINT_PATH}/tensorboard"
)

TORCH_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 3
    --profile-step-end 4
    --profile-dir torch_prof_llama_1nodes_tp1-pp2-cp1
    --use-pytorch-profiler
)

HIP_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 4
    --profile-step-end 5
    --use-hip-profiler
)

APP="python -u ${MEGATRON_PATH}/pretrain_gpt.py \
    ${GPT_MODEL_ARGS[@]} \
    ${TRAINING_ARGS[@]} \
    ${MODEL_PARALLEL_ARGS[@]} \
    ${DATA_ARGS[@]} \
    ${EVAL_AND_LOGGING_ARGS[@]} \
    ${DISTRIBUTED_ARGS[@]} \
    ${INITIALIZATION_ARGS[@]} \
    "

if [[ $profiling == "torch" ]]; then
    APP+=" ${TORCH_PROFIE_ARGS[@]} "
elif [[ $profiling == "hip" ]]; then
    mkdir -p hip_prof_data
    APP+=" ${HIP_PROFIE_ARGS[@]} "
    APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi

#for hygon cpu
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
\ No newline at end of file

examples/qwen/train_qwen1.5_32b_2nodes.sh  (new file, 0 → 100755)

#!/bin/bash

INITIALIZATION_ARGS=(
    --num-workers 2
)

for para in $*
do
    if [[ $para == --data_path* ]]; then
        data_path=${para#*=}
    elif [[ $para == --tokenizer_path* ]]; then
        tokenizer_path=${para#*=}
    elif [[ $para == --checkpoint_path* ]]; then
        checkpoint_path=${para#*=}
    elif [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    elif [[ $para == --reproduce* ]]; then
        INITIALIZATION_ARGS=(
            --reproduce
            --num-workers 0
        )
        export MIOPEN_DEBUG_CONVOLUTION_DETERMINISTIC=1   # enable MIOpen deterministic algorithms
        export ROCBLAS_ATOMICS_MOD=0                      # disable rocBLAS atomic operations
        # disable the MIOpen convolution algorithms that use atomics; keep only the GEMM algorithm
        export MIOPEN_DEBUG_CONV_FFT=0
        export MIOPEN_DEBUG_CONV_DIRECT=0
        export MIOPEN_DEBUG_CONV_GEMM=1
        export MIOPEN_DEBUG_CONV_WINOGRAD=0
        export MIOPEN_DEBUG_CONV_IMPLICIT_GEMM=0
    fi
done

# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}

# default env
DIST_URL=${1}
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
CURRENT_DIR="$(cd "$(dirname "$0")" && pwd)"
MEGATRON_PATH=$(dirname $(dirname ${CURRENT_DIR}))

export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH

# torch: collapse multiple streams into a single stream
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export SENDRECV_STREAM_WITH_COMPUTE=1

# increase the compile cache
export cache_size_limit=64

DISTRIBUTED_ARGS=(
    --rank ${RANK}
    --world-size ${WORLD_SIZE}
    --local-rank ${LOCAL_RANK}
    --dist-url tcp://${DIST_URL}:${DIST_PORT}
)

GPT_MODEL_ARGS=(
    --seq-length 4096
    --num-layers 64
    --hidden-size 5120
    --ffn-hidden-size 27392
    --num-attention-heads 40
    --max-position-embeddings 32768
    --num-query-groups 8
    --group-query-attention
    --normalization RMSNorm
    --position-embedding-type rope
    --untie-embeddings-and-output-weights
)

TRAINING_ARGS=(
    --transformer-impl transformer_engine
    --use-mcore-models
    --micro-batch-size 1
    --global-batch-size 1024
    --train-iters 50
    --weight-decay 0.1
    --adam-beta1 0.9
    --adam-beta2 0.95
    --init-method-std 0.006
    --clip-grad 1.0
    --bf16
    --disable-bias-linear
    --attention-dropout 0
    --hidden-dropout 0
    --swiglu
    --add-qkv-bias
    --lr 3.0e-5
    --lr-decay-style cosine
    --min-lr 3.0e-6
    --lr-warmup-iters 1
    --ckpt-format torch
    --ddp-average-in-collective
    --overlap-grad-reduce
    --use-flash-attn
)

MODEL_PARALLEL_ARGS=(
    --tensor-model-parallel-size 4
    --pipeline-model-parallel-size 4
    --context-parallel-size 1
    --use-distributed-optimizer
    --sequence-parallel
)

DATA_ARGS=(
    --tokenizer-type QwenTokenizer
    --merge-file ${TOKENIZER_MODEL_PATH}/merges.txt
    --vocab-file ${TOKENIZER_MODEL_PATH}/vocab.json
    --data-path ${DATA_PATH}
    --split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
    --log-throughput
    --eval-iters 5
    --log-interval 1
    --save-interval 1000
    --eval-interval 1000
    --save $CHECKPOINT_PATH
    --load $CHECKPOINT_PATH
    --tensorboard-dir "${CHECKPOINT_PATH}/tensorboard"
)

TORCH_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 3
    --profile-step-end 4
    --profile-dir torch_prof_llama_1nodes_tp1-pp2-cp1
    --use-pytorch-profiler
)

HIP_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 4
    --profile-step-end 5
    --use-hip-profiler
)

APP="python -u ${MEGATRON_PATH}/pretrain_gpt.py \
    ${GPT_MODEL_ARGS[@]} \
    ${TRAINING_ARGS[@]} \
    ${MODEL_PARALLEL_ARGS[@]} \
    ${DATA_ARGS[@]} \
    ${EVAL_AND_LOGGING_ARGS[@]} \
    ${DISTRIBUTED_ARGS[@]} \
    ${INITIALIZATION_ARGS[@]} \
    "

if [[ $profiling == "torch" ]]; then
    APP+=" ${TORCH_PROFIE_ARGS[@]} "
elif [[ $profiling == "hip" ]]; then
    mkdir -p hip_prof_data
    APP+=" ${HIP_PROFIE_ARGS[@]} "
    APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi

#for hygon cpu
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
\ No newline at end of file