OpenDAS / Megatron-LM · Commit 9dabea91
authored Mar 20, 2025 by silencealiang · parent 66d982b8 · Pipeline #2563 passed

    update
Showing 4 changed files with 31 additions and 31 deletions:
  examples/gpt3/train_gpt_567B_1nodes.sh            +7 -7
  examples/gpt3/train_gpt_567B_multinodes.sh        +7 -7
  examples/mixtral/train_mixtral_8x7B_1nodes.sh     +7 -7
  examples/mixtral/train_mixtral_8x7B_multinodes.sh +10 -10
examples/gpt3/train_gpt_567B_1nodes.sh

@@ -11,6 +11,11 @@ done
 source /opt/dtk/env.sh

 # default env
+DIST_URL=${1}
+DIST_PORT=25900
+RANK=$OMPI_COMM_WORLD_RANK
+LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
+WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
 CURRENT_DIR="$(cd "$(dirname "$0")" && pwd)"
 MEGATRON_PATH=$(dirname $(dirname ${CURRENT_DIR}))
 export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH
...
@@ -33,13 +38,8 @@ export NCCL_TOPO_FILE="./topo-input.xml"
 # enable BatchLinear
 export GROUPED_GEMM_BatchLinear=1

-RANK=$OMPI_COMM_WORLD_RANK
-LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
-WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
-DIST_URL=${1}
-DIST_PORT=25900
+# data path
 CHECKPOINT_PATH=./CKPT
 TOKENIZER_MODEL="path to tokenizer.model"
 DATA_PATH="path to my-mixtral_text_document"
...
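In both hunks the change is the same: the five launcher-derived variables (DIST_URL, DIST_PORT, RANK, LOCAL_RANK, WORLD_SIZE) move out of the data-path block and into the "default env" block at the top, so they are defined before the rest of the setup runs. The OMPI_COMM_WORLD_* variables are set per process by Open MPI, and the master address arrives as the script's first positional argument. A minimal launch sketch under those assumptions (the host name node0 and the process count are hypothetical, not part of the commit):

# Eight ranks on one node; mpirun exports OMPI_COMM_WORLD_RANK,
# OMPI_COMM_WORLD_LOCAL_RANK and OMPI_COMM_WORLD_SIZE to each process,
# and "node0" lands in ${1}, becoming DIST_URL.
mpirun -np 8 --bind-to none \
    bash examples/gpt3/train_gpt_567B_1nodes.sh node0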
examples/gpt3/train_gpt_567B_multinodes.sh

@@ -11,6 +11,11 @@ done
 source /opt/dtk/env.sh

 # default env
+DIST_URL=${1}
+DIST_PORT=25900
+RANK=$OMPI_COMM_WORLD_RANK
+LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
+WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
 CURRENT_DIR="$(cd "$(dirname "$0")" && pwd)"
 MEGATRON_PATH=$(dirname $(dirname ${CURRENT_DIR}))
 export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH
...
@@ -33,13 +38,8 @@ export NCCL_TOPO_FILE="./topo-input.xml"
 # enable BatchLinear
 export GROUPED_GEMM_BatchLinear=1

-RANK=$OMPI_COMM_WORLD_RANK
-LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
-WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
-DIST_URL=${1}
-DIST_PORT=25900
+# data path
 CHECKPOINT_PATH=./CKPT
 TOKENIZER_MODEL="path to tokenizer.model"
 DATA_PATH="path to my-mixtral_text_document"
...
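The multinode variant receives the identical reordering. For context, DIST_URL and DIST_PORT describe the rendezvous endpoint that torch.distributed needs. If the script hands them over via PyTorch's standard env:// convention (an assumption; the diff does not show this part), the glue would look like:

# Hypothetical mapping onto the environment variables that
# torch.distributed's env:// init method reads; MASTER_ADDR and
# MASTER_PORT are PyTorch's names, the right-hand sides come from
# this commit's "default env" block.
export MASTER_ADDR=${DIST_URL}
export MASTER_PORT=${DIST_PORT}
export RANK WORLD_SIZE LOCAL_RANK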
examples/mixtral/train_mixtral_8x7B_1nodes.sh

@@ -11,6 +11,11 @@ done
 source /opt/dtk/env.sh

 # default env
+DIST_URL=${1}
+DIST_PORT=25900
+RANK=$OMPI_COMM_WORLD_RANK
+LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
+WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
 CURRENT_DIR="$(cd "$(dirname "$0")" && pwd)"
 MEGATRON_PATH=$(dirname $(dirname ${CURRENT_DIR}))
 export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH
...
@@ -33,12 +38,7 @@ export NCCL_TOPO_FILE="./topo-input.xml"
 # enable BatchLinear
 export GROUPED_GEMM_BatchLinear=1

-RANK=$OMPI_COMM_WORLD_RANK
-LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
-WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
-DIST_URL=${1}
-DIST_PORT=25900
+# data path
 CHECKPOINT_PATH=./CKPT
 TOKENIZER_MODEL="path to tokenizer.model"
 DATA_PATH="path to my-mixtral_text_document"
...
@@ -112,7 +112,7 @@ TORCH_PROFIE_ARGS=(
     --profile-ranks 0 1 2 3 4 5 6 7
     --profile-step-start 3
     --profile-step-end 4
-    --profile-dir torch_prof_mixtral_1nodes_tp2-pp1-ep8-ep_tp1
+    --profile-dir torch_prof_mixtral_1nodes_tp2-pp1-ep8-ep_tp1-cp1
     --use-pytorch-profiler
 )
...
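Beyond the shared env-block reordering, this script's only other change renames the profiler output directory to append -cp1, so the name now records the context-parallel degree alongside the tp/pp/ep/ep_tp sizes. A sketch of the naming scheme if it were built from variables rather than hard-coded (TP/PP/EP/ETP/CP are assumed variable names, not from the script):

# Encode the full parallel layout in the profile directory name.
TP=2 PP=1 EP=8 ETP=1 CP=1
PROFILE_DIR="torch_prof_mixtral_1nodes_tp${TP}-pp${PP}-ep${EP}-ep_tp${ETP}-cp${CP}"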
examples/mixtral/train_mixtral_8x7B_multinodes.sh

@@ -11,6 +11,11 @@ done
 source /opt/dtk/env.sh

 # default env
+DIST_URL=${1}
+DIST_PORT=25900
+RANK=$OMPI_COMM_WORLD_RANK
+LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
+WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
 CURRENT_DIR="$(cd "$(dirname "$0")" && pwd)"
 MEGATRON_PATH=$(dirname $(dirname ${CURRENT_DIR}))
 export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH
...
@@ -33,12 +38,7 @@ export NCCL_TOPO_FILE="./topo-input.xml"
 # enable BatchLinear
 export GROUPED_GEMM_BatchLinear=1

-RANK=$OMPI_COMM_WORLD_RANK
-LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
-WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
-DIST_URL=${1}
-DIST_PORT=25900
+# data path
 CHECKPOINT_PATH=./CKPT
 TOKENIZER_MODEL="path to tokenizer.model"
 DATA_PATH="path to my-mixtral_text_document"
...
@@ -81,7 +81,7 @@ MOE_ARGS=(
     --moe-token-dispatcher-type alltoall
     --moe-expert-capacity-factor 0.5
     --moe-pad-expert-input-to-capacity
-    --moe-grouped-gemm
+    # --moe-grouped-gemm
 )

 DATA_ARGS=(
...
@@ -112,14 +112,14 @@ TORCH_PROFIE_ARGS=(
     --profile-ranks 0 1 2 3 8 9 10 11
     --profile-step-start 3
     --profile-step-end 4
-    --profile-dir torch_prof_mixtral_4nodes_tp2-pp8-ep2-ep_tp1
+    --profile-dir torch_prof_mixtral_4nodes_tp2-pp4-ep8-ep_tp1-cp1
     --use-pytorch-profiler
 )

 MODEL_PARALLEL_ARGS=(
     --tensor-model-parallel-size 2
-    --pipeline-model-parallel-size 8
-    --expert-model-parallel-size 2
+    --pipeline-model-parallel-size 4
+    --expert-model-parallel-size 8
     --expert-tensor-parallel-size 1
     --use-distributed-optimizer
     --sequence-parallel
...
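The substantive change in this file is the parallel layout: pipeline parallelism drops from 8 to 4 while expert parallelism grows from 2 to 8, --moe-grouped-gemm is disabled (commented out), and the profiler directory name is updated to match, with the -cp1 suffix added. A quick sanity-check sketch, assuming the 4-node x 8-GPU topology that the torch_prof_mixtral_4nodes_* name suggests (32 ranks total; the divisibility rule below reflects my understanding of Megatron core's grouping, not something stated in the diff):

# With TP=2 and PP=4, 32 ranks leave DP = 32/(2*4) = 4 data-parallel replicas.
# MoE layers regroup the TP*DP ranks (with ETP replacing TP), so EP*ETP must
# fit within TP*DP = 8 -- which EP=8, ETP=1 exactly fills.
WORLD_SIZE=32 TP=2 PP=4 EP=8 ETP=1
DP=$(( WORLD_SIZE / (TP * PP) ))
(( EP * ETP <= TP * DP )) && echo "EP=${EP} fits: ${TP}x${DP}=8 ranks per expert group"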