Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
evt_fugx1
dcu_megatron
Commits
57944e55
Commit
57944e55
authored
May 14, 2025
by
silencealiang
Browse files
update model parameters format
parent
90ae7f5c
Changes
36
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
170 additions
and
280 deletions
+170
-280
examples/deepseek_v3/run_deepseekv3_671B.sh
examples/deepseek_v3/run_deepseekv3_671B.sh
+34
-0
examples/deepseek_v3/run_deepseekv3_671B_1nodes.sh
examples/deepseek_v3/run_deepseekv3_671B_1nodes.sh
+0
-26
examples/deepseek_v3/run_deepseekv3_671B_4nodes.sh
examples/deepseek_v3/run_deepseekv3_671B_4nodes.sh
+0
-29
examples/deepseek_v3/run_deepseekv3_671B_multinodes.sh
examples/deepseek_v3/run_deepseekv3_671B_multinodes.sh
+0
-29
examples/deepseek_v3/train_deepseekv3_671B_128nodes.sh
examples/deepseek_v3/train_deepseekv3_671B_128nodes.sh
+0
-10
examples/deepseek_v3/train_deepseekv3_671B_1nodes.sh
examples/deepseek_v3/train_deepseekv3_671B_1nodes.sh
+0
-10
examples/deepseek_v3/train_deepseekv3_671B_4nodes.sh
examples/deepseek_v3/train_deepseekv3_671B_4nodes.sh
+0
-10
examples/gpt3/run_gpt_567B.sh
examples/gpt3/run_gpt_567B.sh
+34
-0
examples/gpt3/run_gpt_567B_1nodes.sh
examples/gpt3/run_gpt_567B_1nodes.sh
+0
-26
examples/gpt3/run_gpt_567B_multinodes.sh
examples/gpt3/run_gpt_567B_multinodes.sh
+0
-29
examples/gpt3/train_gpt_567B_128nodes.sh
examples/gpt3/train_gpt_567B_128nodes.sh
+0
-10
examples/gpt3/train_gpt_567B_1nodes.sh
examples/gpt3/train_gpt_567B_1nodes.sh
+0
-10
examples/llama/hostfile_llama2_7B
examples/llama/hostfile_llama2_7B
+0
-0
examples/llama/run_llama2_7B.sh
examples/llama/run_llama2_7B.sh
+34
-0
examples/llama/run_llama2_7B_1nodes.sh
examples/llama/run_llama2_7B_1nodes.sh
+0
-26
examples/llama/train_llama2_7b_1nodes.sh
examples/llama/train_llama2_7b_1nodes.sh
+0
-10
examples/mixtral/run_mixtral_8x22B.sh
examples/mixtral/run_mixtral_8x22B.sh
+34
-0
examples/mixtral/run_mixtral_8x22B_1nodes.sh
examples/mixtral/run_mixtral_8x22B_1nodes.sh
+0
-26
examples/mixtral/run_mixtral_8x22B_multinodes.sh
examples/mixtral/run_mixtral_8x22B_multinodes.sh
+0
-29
examples/mixtral/run_mixtral_8x7B.sh
examples/mixtral/run_mixtral_8x7B.sh
+34
-0
No files found.
examples/deepseek_v3/run_deepseekv3_671B.sh
0 → 100755
View file @
57944e55
#!/usr/bin/env bash
# Launch DeepseekV3 671B training on ${GPUS} GPUs (8 GPUs per node) via mpirun.
# Usage: ./run_deepseekv3_671B.sh [--profiling=<value>]

# Parse the optional --profiling=<value> flag.
# "$@" keeps each argument as one word (the original unquoted $* word-splits
# arguments containing spaces).
for para in "$@"; do
  if [[ $para == --profiling* ]]; then
    profiling=${para#*=}   # keep everything after the first '='
  fi
done

# Those variables need to modify
GPUS=""                  # how many gpus to use (must be a multiple of 8)
DTK_ENV=""               # where env.sh of dtk
NCCL_ENV=""              # where env.sh of nccl (requirements/nccl_wz/env.sh or requirements/nccl_zz/env.sh)
HOST=""                  # hostname
PORT=""                  # port id
DATA_PATH=""             # path to mmap_deepseekv3_datasets_text_document
TOKENIZER_MODEL_PATH=""  # path to deepseekv3_dataset
CHECKPOINT_PATH=""       # path to ckpt

# Fail fast with a clear message instead of handing an empty -np to mpirun.
: "${GPUS:?GPUS must be set to the total number of GPUs (multiple of 8)}"

NODES=$(( GPUS / 8 ))    # 8 GPUs per node

# Runs DeepseekV3 671B model
mpirun -np "${GPUS}" --hostfile hostfile_deepseekv3_671B \
    --allow-run-as-root \
    --bind-to none \
    --mca plm_rsh_no_tree_spawn 1 \
    bash -c "source ${DTK_ENV} && \
    source ${NCCL_ENV} && \
    ./train_deepseekv3_671B_${NODES}nodes.sh \
    ${HOST} \
    ${PORT} \
    --data_path=$DATA_PATH \
    --tokenizer_path=$TOKENIZER_MODEL_PATH \
    --checkpoint_path=$CHECKPOINT_PATH \
    --profiling=$profiling" > "log-${NODES}nodes-$(date +%F-%H%M).log" 2>&1
wait
\ No newline at end of file
examples/deepseek_v3/run_deepseekv3_671B_1nodes.sh
deleted
100755 → 0
View file @
90ae7f5c
#!/usr/bin/env bash
# Single-node (8 GPU) launcher for DeepseekV3 671B training.
# Usage: ./run_deepseekv3_671B_1nodes.sh [--profiling=<value>]

# Parse the optional --profiling=<value> flag; "$@" preserves each argument
# as one word (unquoted $* word-splits values containing spaces).
for para in "$@"; do
  if [[ $para == --profiling* ]]; then
    profiling=${para#*=}   # value after the first '='
  fi
done

# Runs DeepseekV3 671B model
source /opt/dtk/env.sh

HOST=localhost
PORT=25900
DATA_PATH="path to mmap_deepseekv3_datasets_text_document"
TOKENIZER_MODEL_PATH="path to deepseekv3_dataset"
CHECKPOINT_PATH="path to output"

# Quote the path variables: the placeholder values (and many real paths)
# contain spaces and would otherwise be split into multiple arguments.
mpirun -np 8 --allow-run-as-root \
    train_deepseekv3_671B_1nodes.sh \
    "${HOST}" \
    "${PORT}" \
    --data_path="$DATA_PATH" \
    --tokenizer_path="$TOKENIZER_MODEL_PATH" \
    --checkpoint_path="$CHECKPOINT_PATH" \
    --profiling="$profiling" > "log-1nodes-$(date +%F-%H%M).log" 2>&1
wait
\ No newline at end of file
examples/deepseek_v3/run_deepseekv3_671B_4nodes.sh
deleted
100755 → 0
View file @
90ae7f5c
#!/usr/bin/env bash
# Four-node (32 GPU) launcher for DeepseekV3 671B training.
# Usage: ./run_deepseekv3_671B_4nodes.sh [--profiling=<value>]

# Parse the optional --profiling=<value> flag; "$@" preserves arguments intact.
for para in "$@"; do
  if [[ $para == --profiling* ]]; then
    profiling=${para#*=}   # value after the first '='
  fi
done

# Runs DeepseekV3 671B model
source /opt/dtk/env.sh

HOST=""   # modify this variable
PORT=25900
DATA_PATH="path to mmap_deepseekv3_datasets_text_document"
TOKENIZER_MODEL_PATH="path to deepseekv3_dataset"
CHECKPOINT_PATH="path to output"

# Quote path variables: placeholder values contain spaces and would be split.
mpirun -np 32 --hostfile hostfile_deepseekv3_671B_4nodes \
    --allow-run-as-root \
    --bind-to none \
    --mca plm_rsh_no_tree_spawn 1 \
    train_deepseekv3_671B_4nodes.sh \
    "${HOST}" \
    "${PORT}" \
    --data_path="$DATA_PATH" \
    --tokenizer_path="$TOKENIZER_MODEL_PATH" \
    --checkpoint_path="$CHECKPOINT_PATH" \
    --profiling="$profiling" > "log-4nodes-$(date +%F-%H%M).log" 2>&1
wait
\ No newline at end of file
examples/deepseek_v3/run_deepseekv3_671B_multinodes.sh
deleted
100755 → 0
View file @
90ae7f5c
#!/usr/bin/env bash
# Multi-node (1024 GPU / 128 node) launcher for DeepseekV3 671B training.
# Usage: ./run_deepseekv3_671B_multinodes.sh [--profiling=<value>]

# Parse the optional --profiling=<value> flag; "$@" preserves arguments intact.
for para in "$@"; do
  if [[ $para == --profiling* ]]; then
    profiling=${para#*=}   # value after the first '='
  fi
done

# Runs DeepseekV3 671B model
source /opt/dtk/env.sh

HOST=""   # modify this variable
PORT=25900
DATA_PATH="path to mmap_deepseekv3_datasets_text_document"
TOKENIZER_MODEL_PATH="path to deepseekv3_dataset"
CHECKPOINT_PATH="path to output"

# Quote path variables: placeholder values contain spaces and would be split.
# Log name fixed: 1024 ranks / 8 GPUs-per-node = 128 nodes (was "log-4nodes").
mpirun -np 1024 --hostfile hostfile_deepseekv3_671B \
    --allow-run-as-root \
    --bind-to none \
    --mca plm_rsh_no_tree_spawn 1 \
    train_deepseekv3_671B_multinodes.sh \
    "${HOST}" \
    "${PORT}" \
    --data_path="$DATA_PATH" \
    --tokenizer_path="$TOKENIZER_MODEL_PATH" \
    --checkpoint_path="$CHECKPOINT_PATH" \
    --profiling="$profiling" > "log-128nodes-$(date +%F-%H%M).log" 2>&1
wait
\ No newline at end of file
examples/deepseek_v3/train_deepseekv3_671B_
multi
nodes.sh
→
examples/deepseek_v3/train_deepseekv3_671B_
128
nodes.sh
View file @
57944e55
...
...
@@ -27,16 +27,6 @@ export HSA_FORCE_FINE_GRAIN_PCIE=1
export
OMP_NUM_THREADS
=
1
export
GPU_MAX_HW_QUEUES
=
10
# nccl env
export
NCCL_ALGO
=
Ring
export
NCCL_MIN_NCHANNELS
=
32
export
NCCL_MAX_NCHANNELS
=
32
export
NCCL_NET_GDR_LEVEL
=
7
export
NCCL_NET_GDR_READ
=
1
export
RCCL_SDMA_COPY_ENABLE
=
0
export
NCCL_IB_HCA
=
mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export
NCCL_TOPO_FILE
=
"
${
MEGATRON_PATH
}
/topo-input.xml"
# enable BatchLinear
export
GROUPED_GEMM_BatchLinear
=
1
export
MP_PP0_LAYERS
=
5
# 是否使能视实际情况而定
...
...
examples/deepseek_v3/train_deepseekv3_671B_1nodes.sh
View file @
57944e55
...
...
@@ -27,16 +27,6 @@ export HSA_FORCE_FINE_GRAIN_PCIE=1
export
OMP_NUM_THREADS
=
1
export
GPU_MAX_HW_QUEUES
=
10
# nccl env
export
NCCL_ALGO
=
Ring
export
NCCL_MIN_NCHANNELS
=
32
export
NCCL_MAX_NCHANNELS
=
32
export
NCCL_NET_GDR_LEVEL
=
7
export
NCCL_NET_GDR_READ
=
1
export
RCCL_SDMA_COPY_ENABLE
=
0
export
NCCL_IB_HCA
=
mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export
NCCL_TOPO_FILE
=
"
${
MEGATRON_PATH
}
/topo-input.xml"
# enable BatchLinear
export
GROUPED_GEMM_BatchLinear
=
1
#export MP_PP0_LAYERS=2 # 是否使能视实际情况而定
...
...
examples/deepseek_v3/train_deepseekv3_671B_4nodes.sh
View file @
57944e55
...
...
@@ -27,16 +27,6 @@ export HSA_FORCE_FINE_GRAIN_PCIE=1
export
OMP_NUM_THREADS
=
1
export
GPU_MAX_HW_QUEUES
=
10
# nccl env
export
NCCL_ALGO
=
Ring
export
NCCL_MIN_NCHANNELS
=
32
export
NCCL_MAX_NCHANNELS
=
32
export
NCCL_NET_GDR_LEVEL
=
7
export
NCCL_NET_GDR_READ
=
1
export
RCCL_SDMA_COPY_ENABLE
=
0
export
NCCL_IB_HCA
=
mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export
NCCL_TOPO_FILE
=
"
${
MEGATRON_PATH
}
/topo-input.xml"
# enable BatchLinear
export
GROUPED_GEMM_BatchLinear
=
1
export
MP_PP0_LAYERS
=
2
# 是否使能视实际情况而定
...
...
examples/gpt3/run_gpt_567B.sh
0 → 100755
View file @
57944e55
#!/usr/bin/env bash
# Launch GPT 567B training on ${GPUS} GPUs (8 GPUs per node) via mpirun.
# Usage: ./run_gpt_567B.sh [--profiling=<value>]

# Parse the optional --profiling=<value> flag.
# "$@" keeps each argument as one word (unquoted $* word-splits).
for para in "$@"; do
  if [[ $para == --profiling* ]]; then
    profiling=${para#*=}   # keep everything after the first '='
  fi
done

# Those variables need to modify
GPUS=""                  # how many gpus to use (must be a multiple of 8)
DTK_ENV=""               # where env.sh of dtk
NCCL_ENV=""              # where env.sh of nccl (requirements/nccl_wz/env.sh or requirements/nccl_zz/env.sh)
HOST=""                  # hostname
PORT=""                  # port id
DATA_PATH=""             # path to redpajama_text_document
TOKENIZER_MODEL_PATH=""  # path to tokenizer.model
CHECKPOINT_PATH=""       # path to ckpt

# Fail fast with a clear message instead of handing an empty -np to mpirun.
: "${GPUS:?GPUS must be set to the total number of GPUs (multiple of 8)}"

NODES=$(( GPUS / 8 ))    # 8 GPUs per node

# Runs GPT 567B model
mpirun -np "${GPUS}" --hostfile hostfile_gpt_567B \
    --allow-run-as-root \
    --bind-to none \
    --mca plm_rsh_no_tree_spawn 1 \
    bash -c "source ${DTK_ENV} && \
    source ${NCCL_ENV} && \
    ./train_gpt_567B_${NODES}nodes.sh \
    ${HOST} \
    ${PORT} \
    --data_path=$DATA_PATH \
    --tokenizer_path=$TOKENIZER_MODEL_PATH \
    --checkpoint_path=$CHECKPOINT_PATH \
    --profiling=$profiling" > "log-${NODES}nodes-$(date +%F-%H%M).log" 2>&1
wait
\ No newline at end of file
examples/gpt3/run_gpt_567B_1nodes.sh
deleted
100755 → 0
View file @
90ae7f5c
#!/usr/bin/env bash
# Single-node (8 GPU) launcher for GPT 567B training.
# Usage: ./run_gpt_567B_1nodes.sh [--profiling=<value>]

# Parse the optional --profiling=<value> flag; "$@" preserves arguments intact.
for para in "$@"; do
  if [[ $para == --profiling* ]]; then
    profiling=${para#*=}   # value after the first '='
  fi
done

# Runs GPT 567B model
source /opt/dtk/env.sh

HOST=localhost
PORT=25900
DATA_PATH="path to redpajama_text_document"
TOKENIZER_MODEL_PATH="path to tokenizer.model"
CHECKPOINT_PATH="path to ckpt"

# Quote path variables: placeholder values contain spaces and would be split.
mpirun -np 8 --allow-run-as-root \
    train_gpt_567B_1nodes.sh \
    "${HOST}" \
    "${PORT}" \
    --data_path="$DATA_PATH" \
    --tokenizer_path="$TOKENIZER_MODEL_PATH" \
    --checkpoint_path="$CHECKPOINT_PATH" \
    --profiling="$profiling" > "log-1nodes-$(date +%F-%H%M).log" 2>&1
wait
\ No newline at end of file
examples/gpt3/run_gpt_567B_multinodes.sh
deleted
100755 → 0
View file @
90ae7f5c
#!/usr/bin/env bash
# Multi-node (1024 GPU / 128 node) launcher for GPT 567B training.
# Usage: ./run_gpt_567B_multinodes.sh [--profiling=<value>]

# Parse the optional --profiling=<value> flag; "$@" preserves arguments intact.
for para in "$@"; do
  if [[ $para == --profiling* ]]; then
    profiling=${para#*=}   # value after the first '='
  fi
done

# Runs GPT 567B model
source /opt/dtk/env.sh

HOST=""   # modify this variable
PORT=25900
DATA_PATH="path to redpajama_text_document"
TOKENIZER_MODEL_PATH="path to tokenizer.model"
CHECKPOINT_PATH="path to ckpt"

# Quote path variables: placeholder values contain spaces and would be split.
# Log name fixed: 1024 ranks / 8 GPUs-per-node = 128 nodes (was "log-1024nodes").
mpirun -np 1024 --hostfile hostfile_gpt_567B \
    --allow-run-as-root \
    --bind-to none \
    --mca plm_rsh_no_tree_spawn 1 \
    train_gpt_567B_multinodes.sh \
    "${HOST}" \
    "${PORT}" \
    --data_path="$DATA_PATH" \
    --tokenizer_path="$TOKENIZER_MODEL_PATH" \
    --checkpoint_path="$CHECKPOINT_PATH" \
    --profiling="$profiling" > "log-128nodes-$(date +%F-%H%M).log" 2>&1
wait
\ No newline at end of file
examples/gpt3/train_gpt_567B_
multi
nodes.sh
→
examples/gpt3/train_gpt_567B_
128
nodes.sh
View file @
57944e55
...
...
@@ -32,16 +32,6 @@ export HSA_FORCE_FINE_GRAIN_PCIE=1
export
OMP_NUM_THREADS
=
1
export
GPU_MAX_HW_QUEUES
=
10
# nccl env
export
NCCL_ALGO
=
Ring
export
NCCL_MIN_NCHANNELS
=
32
export
NCCL_MAX_NCHANNELS
=
32
export
NCCL_NET_GDR_LEVEL
=
7
export
NCCL_NET_GDR_READ
=
1
export
RCCL_SDMA_COPY_ENABLE
=
0
export
NCCL_IB_HCA
=
mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export
NCCL_TOPO_FILE
=
"
${
MEGATRON_PATH
}
/topo-input.xml"
# enable BatchLinear
export
GROUPED_GEMM_BatchLinear
=
1
...
...
examples/gpt3/train_gpt_567B_1nodes.sh
View file @
57944e55
...
...
@@ -32,16 +32,6 @@ export HSA_FORCE_FINE_GRAIN_PCIE=1
export
OMP_NUM_THREADS
=
1
export
GPU_MAX_HW_QUEUES
=
10
# nccl env
export
NCCL_ALGO
=
Ring
export
NCCL_MIN_NCHANNELS
=
32
export
NCCL_MAX_NCHANNELS
=
32
export
NCCL_NET_GDR_LEVEL
=
7
export
NCCL_NET_GDR_READ
=
1
export
RCCL_SDMA_COPY_ENABLE
=
0
export
NCCL_IB_HCA
=
mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export
NCCL_TOPO_FILE
=
"
${
MEGATRON_PATH
}
/topo-input.xml"
# enable BatchLinear
export
GROUPED_GEMM_BatchLinear
=
1
...
...
examples/
deepseek_v3/hostfile_deepseekv3_671B_4nodes
→
examples/
llama/hostfile_llama2_7B
View file @
57944e55
File moved
examples/llama/run_llama2_7B.sh
0 → 100755
View file @
57944e55
#!/usr/bin/env bash
# Launch Llama2 7B training on ${GPUS} GPUs (8 GPUs per node) via mpirun.
# Usage: ./run_llama2_7B.sh [--profiling=<value>]

# Parse the optional --profiling=<value> flag.
# "$@" keeps each argument as one word (unquoted $* word-splits).
for para in "$@"; do
  if [[ $para == --profiling* ]]; then
    profiling=${para#*=}   # keep everything after the first '='
  fi
done

# Those variables need to modify
GPUS=""                  # how many gpus to use (must be a multiple of 8)
DTK_ENV=""               # where env.sh of dtk
NCCL_ENV=""              # where env.sh of nccl (requirements/nccl_wz/env.sh or requirements/nccl_zz/env.sh)
HOST=""                  # hostname
PORT=""                  # port id
DATA_PATH=""             # path to oscar-1GB_head-llama2_text_document
TOKENIZER_MODEL_PATH=""  # path to tokenizer.model
CHECKPOINT_PATH=""       # path to ckpt

# Fail fast with a clear message instead of handing an empty -np to mpirun.
: "${GPUS:?GPUS must be set to the total number of GPUs (multiple of 8)}"

NODES=$(( GPUS / 8 ))    # 8 GPUs per node

# Runs Llama2 7B model
mpirun -np "${GPUS}" --hostfile hostfile_llama2_7B \
    --allow-run-as-root \
    --bind-to none \
    --mca plm_rsh_no_tree_spawn 1 \
    bash -c "source ${DTK_ENV} && \
    source ${NCCL_ENV} && \
    ./train_llama2_7b_${NODES}nodes.sh \
    ${HOST} \
    ${PORT} \
    --data_path=$DATA_PATH \
    --tokenizer_path=$TOKENIZER_MODEL_PATH \
    --checkpoint_path=$CHECKPOINT_PATH \
    --profiling=$profiling" > "log-${NODES}nodes-$(date +%F-%H%M).log" 2>&1
wait
\ No newline at end of file
examples/llama/run_llama2_7B_1nodes.sh
deleted
100755 → 0
View file @
90ae7f5c
#!/usr/bin/env bash
# Single-node (8 GPU) launcher for Llama2 7B training.
# Usage: ./run_llama2_7B_1nodes.sh [--profiling=<value>]

# Parse the optional --profiling=<value> flag; "$@" preserves arguments intact.
for para in "$@"; do
  if [[ $para == --profiling* ]]; then
    profiling=${para#*=}   # value after the first '='
  fi
done

# Runs Llama2 7B model
source /opt/dtk/env.sh

HOST=localhost
PORT=34577
DATA_PATH="path to oscar-1GB_head-llama2_text_document"
TOKENIZER_MODEL_PATH="path to tokenizer.model"
CHECKPOINT_PATH="path to ckpt"

# Quote path variables: placeholder values contain spaces and would be split.
mpirun -np 8 --allow-run-as-root \
    train_llama2_7b_1nodes.sh \
    "${HOST}" \
    "${PORT}" \
    --data_path="$DATA_PATH" \
    --tokenizer_path="$TOKENIZER_MODEL_PATH" \
    --checkpoint_path="$CHECKPOINT_PATH" \
    --profiling="$profiling" > "log-1nodes-$(date +%F-%H%M).log" 2>&1
wait
examples/llama/train_llama2_7b_1nodes.sh
View file @
57944e55
...
...
@@ -32,16 +32,6 @@ export HSA_FORCE_FINE_GRAIN_PCIE=1
export
OMP_NUM_THREADS
=
1
export
GPU_MAX_HW_QUEUES
=
10
# nccl env
export
NCCL_ALGO
=
Ring
export
NCCL_MIN_NCHANNELS
=
32
export
NCCL_MAX_NCHANNELS
=
32
export
NCCL_NET_GDR_LEVEL
=
7
export
NCCL_NET_GDR_READ
=
1
export
RCCL_SDMA_COPY_ENABLE
=
0
export
NCCL_IB_HCA
=
mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export
NCCL_TOPO_FILE
=
"
${
MEGATRON_PATH
}
/topo-input.xml"
# torch控制多流转单流
export
ALLREDUCE_STREAM_WITH_COMPUTE
=
1
export
SENDRECV_STREAM_WITH_COMPUTE
=
1
...
...
examples/mixtral/run_mixtral_8x22B.sh
0 → 100755
View file @
57944e55
#!/usr/bin/env bash
# Launch Mixtral 8x22B training on ${GPUS} GPUs (8 GPUs per node) via mpirun.
# Usage: ./run_mixtral_8x22B.sh [--profiling=<value>]

# Parse the optional --profiling=<value> flag.
# "$@" keeps each argument as one word (unquoted $* word-splits).
for para in "$@"; do
  if [[ $para == --profiling* ]]; then
    profiling=${para#*=}   # keep everything after the first '='
  fi
done

# Those variables need to modify
GPUS=""                  # how many gpus to use (must be a multiple of 8)
DTK_ENV=""               # where env.sh of dtk
NCCL_ENV=""              # where env.sh of nccl (requirements/nccl_wz/env.sh or requirements/nccl_zz/env.sh)
HOST=""                  # hostname
PORT=""                  # port id
DATA_PATH=""             # path to my-mixtral_text_document
TOKENIZER_MODEL_PATH=""  # path to tokenizer.model
CHECKPOINT_PATH=""       # path to ckpt

# Fail fast with a clear message instead of handing an empty -np to mpirun.
: "${GPUS:?GPUS must be set to the total number of GPUs (multiple of 8)}"

NODES=$(( GPUS / 8 ))    # 8 GPUs per node

# Runs Mixtral 8x22B model
mpirun -np "${GPUS}" --hostfile hostfile_mixtral_8x22B \
    --allow-run-as-root \
    --bind-to none \
    --mca plm_rsh_no_tree_spawn 1 \
    bash -c "source ${DTK_ENV} && \
    source ${NCCL_ENV} && \
    ./train_mixtral_8x22B_${NODES}nodes.sh \
    ${HOST} \
    ${PORT} \
    --data_path=$DATA_PATH \
    --tokenizer_path=$TOKENIZER_MODEL_PATH \
    --checkpoint_path=$CHECKPOINT_PATH \
    --profiling=$profiling" > "log-${NODES}nodes-$(date +%F-%H%M).log" 2>&1
wait
\ No newline at end of file
examples/mixtral/run_mixtral_8x22B_1nodes.sh
deleted
100755 → 0
View file @
90ae7f5c
#!/usr/bin/env bash
# Single-node (8 GPU) launcher for Mixtral 8x22B training.
# Usage: ./run_mixtral_8x22B_1nodes.sh [--profiling=<value>]

# Parse the optional --profiling=<value> flag; "$@" preserves arguments intact.
for para in "$@"; do
  if [[ $para == --profiling* ]]; then
    profiling=${para#*=}   # value after the first '='
  fi
done

# Runs Mixtral 8x22B model
source /opt/dtk/env.sh

HOST=localhost
PORT=25900
DATA_PATH="path to my-mixtral_text_document"
TOKENIZER_MODEL_PATH="path to tokenizer.model"
CHECKPOINT_PATH="path to ckpt"

# Quote path variables: placeholder values contain spaces and would be split.
mpirun -np 8 --allow-run-as-root \
    train_mixtral_8x22B_1nodes.sh \
    "${HOST}" \
    "${PORT}" \
    --data_path="$DATA_PATH" \
    --tokenizer_path="$TOKENIZER_MODEL_PATH" \
    --checkpoint_path="$CHECKPOINT_PATH" \
    --profiling="$profiling" > "log-1nodes-$(date +%F-%H%M).log" 2>&1
wait
\ No newline at end of file
examples/mixtral/run_mixtral_8x22B_multinodes.sh
deleted
100755 → 0
View file @
90ae7f5c
#!/usr/bin/env bash
# Multi-node (32 GPU / 4 node) launcher for Mixtral 8x22B training.
# Usage: ./run_mixtral_8x22B_multinodes.sh [--profiling=<value>]

# Parse the optional --profiling=<value> flag; "$@" preserves arguments intact.
for para in "$@"; do
  if [[ $para == --profiling* ]]; then
    profiling=${para#*=}   # value after the first '='
  fi
done

# Runs Mixtral 8x22B model
source /opt/dtk/env.sh

HOST=""   # modify this variable
PORT=25900
DATA_PATH="path to my-mixtral_text_document"
TOKENIZER_MODEL_PATH="path to tokenizer.model"
CHECKPOINT_PATH="path to ckpt"

# Quote path variables: placeholder values contain spaces and would be split.
mpirun -np 32 --hostfile hostfile_mixtral_8x22B \
    --allow-run-as-root \
    --bind-to none \
    --mca plm_rsh_no_tree_spawn 1 \
    train_mixtral_8x22B_multinodes.sh \
    "${HOST}" \
    "${PORT}" \
    --data_path="$DATA_PATH" \
    --tokenizer_path="$TOKENIZER_MODEL_PATH" \
    --checkpoint_path="$CHECKPOINT_PATH" \
    --profiling="$profiling" > "log-4nodes-$(date +%F-%H%M).log" 2>&1
wait
\ No newline at end of file
examples/mixtral/run_mixtral_8x7B.sh
0 → 100755
View file @
57944e55
#!/usr/bin/env bash
# Launch Mixtral 8x7B training on ${GPUS} GPUs (8 GPUs per node) via mpirun.
# Usage: ./run_mixtral_8x7B.sh [--profiling=<value>]

# Parse the optional --profiling=<value> flag.
# "$@" keeps each argument as one word (unquoted $* word-splits).
for para in "$@"; do
  if [[ $para == --profiling* ]]; then
    profiling=${para#*=}   # keep everything after the first '='
  fi
done

# Those variables need to modify
GPUS=""                  # how many gpus to use (must be a multiple of 8)
DTK_ENV=""               # where env.sh of dtk
NCCL_ENV=""              # where env.sh of nccl (requirements/nccl_wz/env.sh or requirements/nccl_zz/env.sh)
HOST=""                  # hostname
PORT=""                  # port id
DATA_PATH=""             # path to my-mixtral_text_document
TOKENIZER_MODEL_PATH=""  # path to tokenizer.model
CHECKPOINT_PATH=""       # path to ckpt

# Fail fast with a clear message instead of handing an empty -np to mpirun.
: "${GPUS:?GPUS must be set to the total number of GPUs (multiple of 8)}"

NODES=$(( GPUS / 8 ))    # 8 GPUs per node

# Runs Mixtral 8x7B model
mpirun -np "${GPUS}" --hostfile hostfile_mixtral_8x7B \
    --allow-run-as-root \
    --bind-to none \
    --mca plm_rsh_no_tree_spawn 1 \
    bash -c "source ${DTK_ENV} && \
    source ${NCCL_ENV} && \
    ./train_mixtral_8x7B_${NODES}nodes.sh \
    ${HOST} \
    ${PORT} \
    --data_path=$DATA_PATH \
    --tokenizer_path=$TOKENIZER_MODEL_PATH \
    --checkpoint_path=$CHECKPOINT_PATH \
    --profiling=$profiling" > "log-${NODES}nodes-$(date +%F-%H%M).log" 2>&1
wait
\ No newline at end of file
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment