Project: tsoc/superbenchmark
Commit ce1860b9 (unverified), authored Dec 28, 2023 by Yuting Jiang, committed by GitHub Dec 27, 2023

Bug Fix - Bug fix for latest megatron-lm benchmark (#600)

**Description** Bug fix to sync latest megatron-lm code.
parent c2e7a543

Showing 12 changed files with 99 additions and 89 deletions (+99 -89)
.gitmodules                                               +6   -0
dockerfile/directx12.dockerfile                           +2   -0
dockerfile/rocm5.7.x.dockerfile                           +3   -2
dockerfile/rocm6.0.x.dockerfile                           +4   -3
superbench/benchmarks/model_benchmarks/megatron_gpt3.py   +34  -19
superbench/benchmarks/model_benchmarks/model_base.py      +1   -1
tests/benchmarks/model_benchmarks/test_megatron_gpt.py    +12  -14
third_party/Makefile                                      +6   -10
third_party/Megatron/Megatron-DeepSpeed                   +1   -0
third_party/Megatron/Megatron-LM                          +1   -0
third_party/Megatron/megatron_deepspeed_rocm6.patch       +26  -39
third_party/Megatron/requirements.txt                     +3   -1
.gitmodules

@@ -24,3 +24,9 @@
 [submodule "third_party/msccl"]
 	path = third_party/msccl
 	url = https://github.com/Azure/msccl
+[submodule "third_party/Megatron/Megatron-LM"]
+	path = third_party/Megatron/Megatron-LM
+	url = https://github.com/NVIDIA/Megatron-LM.git
+[submodule "third_party/Megatron/Megatron-DeepSpeed"]
+	path = third_party/Megatron/Megatron-DeepSpeed
+	url = https://github.com/microsoft/Megatron-DeepSpeed.git
dockerfile/directx12.dockerfile

@@ -54,6 +54,8 @@ RUN curl -s -L https://dist.nuget.org/win-x86-commandline/latest/nuget.exe -o "%
 # Run the setup script to install the visual studio components
 RUN "%SB_HOME%\\dockerfile\\directx\\install-components.bat"
+RUN powershell -Command "Set-ItemProperty -Path HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem -Name LongPathsEnabled -Value 1;"
+RUN git config --system core.longpaths true
 # Install Superbench
 RUN python -m pip install setuptools==65.0.0 && \
     python -m pip install --no-cache-dir .[amdworker] && \
...
dockerfile/rocm5.7.x.dockerfile

@@ -109,6 +109,7 @@ RUN bash -c 'echo -e "gfx90a:xnack-\ngfx90a:xnac+\ngfx940\ngfx941\ngfx942\ngfx10
 # Install OpenMPI
 ENV OPENMPI_VERSION=4.1.x
+ENV MPI_HOME=/usr/local/mpi
 # Check if Open MPI is installed
 RUN cd /tmp && \
     git clone --recursive https://github.com/open-mpi/ompi.git -b v${OPENMPI_VERSION} && \
...

@@ -145,9 +146,9 @@ RUN cd /opt/ && \
 RUN cd /opt/rocm/share/amd_smi && \
     python3 -m pip install --user .
-ENV PATH="/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
+ENV PATH="/usr/local/mpi/bin:/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
     LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \
-    LD_LIBRARY_PATH="/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
+    LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/lib/x86_64-linux-gnu/:/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
     SB_HOME=/opt/superbench \
     SB_MICRO_PATH=/opt/superbench \
     ANSIBLE_DEPRECATION_WARNINGS=FALSE \
...
dockerfile/rocm6.0.x.dockerfile

@@ -10,7 +10,7 @@ FROM ${BASE_IMAGE}
 # Lib:
 #  - torch: 2.0.1
 #  - rccl: 2.18.3+hip6.0 develop:7e1cbb4
-#  - hipblaslt: 950ca43
+#  - hipblaslt: release/rocm-rel-6.0
 #  - openmpi: 4.1.x
 #  - apex: 1.0.0
 # Intel:

@@ -115,6 +115,7 @@ RUN bash -c 'echo -e "gfx90a:xnack-\ngfx90a:xnac+\ngfx940\ngfx941\ngfx942:sramec
 # Install OpenMPI
 ENV OPENMPI_VERSION=4.1.x
+ENV MPI_HOME=/usr/local/mpi
 # Check if Open MPI is installed
 RUN cd /tmp && \
     git clone --recursive https://github.com/open-mpi/ompi.git -b v${OPENMPI_VERSION} && \
...

@@ -147,9 +148,9 @@ RUN cd /opt/ && \
     .. && \
     make -j ${NUM_MAKE_JOBS}
-ENV PATH="/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
+ENV PATH="/usr/local/mpi/bin:/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
     LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \
-    LD_LIBRARY_PATH="/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
+    LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/lib/x86_64-linux-gnu/:/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
     SB_HOME=/opt/superbench \
     SB_MICRO_PATH=/opt/superbench \
     ANSIBLE_DEPRECATION_WARNINGS=FALSE \
...
superbench/benchmarks/model_benchmarks/megatron_gpt3.py

@@ -116,6 +116,9 @@ def add_parser_arguments(self):
        self._parser.add_argument('--data_home', type=str, default='/tmp', help='Data home.')
        self._parser.add_argument('--vocab_path', type=str, default='/tmp/gpt2-vocab.json', help='Vocab path.')
        self._parser.add_argument('--merge_path', type=str, default='/tmp/gpt2-merges.txt', help='Merge path.')
        self._parser.add_argument('--split', type=str, default='949,50,1', help='Split dataset ratio for train/val/test.')
        self._parser.add_argument('--prescale_grad', action='store_true', help='Prescale grad.')
        self._parser.add_argument('--hostfile', type=str, default=None, help='Hostfile to run the mutli-node benchmark.'
...
@@ -128,6 +131,13 @@ def add_parser_arguments(self):
    def _preprocess(self):
        if not super()._preprocess():
            return False
        if not self._args.code_base:
            if self._args.deepspeed:
                self._args.code_base = os.path.join(os.getenv('SB_MICRO_PATH'), 'third_party/Megatron/Megatron-DeepSpeed/')
            else:
                self._args.code_base = os.path.join(os.getenv('SB_MICRO_PATH'), 'third_party/Megatron/Megatron-LM')
        if not os.path.exists(self._args.code_base) or \
           not os.path.exists(os.path.join(self._args.code_base, 'pretrain_gpt.py')):
...
@@ -156,35 +166,35 @@ def _preprocess(self):
    def _parse_log(self, output):
        """Parse log output and get the performance."""
-       tflops_pattern = re.compile(r'TFLOPs: (\d+\.\d+)')
+       tflops_pattern = re.compile(r'(TFLOPs|TFLOP/s/GPU\)): (\d+\.\d+)')
        elapsed_time_pattern = re.compile(r'elapsed time per iteration \(ms\): (\d+\.\d+)')
-       mem_allocated_pattern = re.compile(r'MemAllocated=([\d.]+)[KMGTPEZY]?B')
-       max_mem_allocated_pattern = re.compile(r'MaxMemAllocated=([\d.]+)[KMGTPEZY]?B')
+       mem_allocated_pattern = re.compile(r'allocated: (\d+\.\d+)')
+       max_mem_allocated_pattern = re.compile(r'max allocated: (\d+\.\d+)')
        lines = output.splitlines()
        tflops = []
        mem_allocated = []
        max_mem_allocated = []
        iteration_times = []
        for line in lines:
-           if 'TFLOPs' in line:
+           if 'elapsed time per iteration' in line:
                tflops_matches = tflops_pattern.search(line)
                elapsed_time_match = elapsed_time_pattern.search(line)
                if tflops_matches:
-                   tflops_values = float(tflops_matches.group(1))
+                   tflops_values = float(tflops_matches.group(2))
                    tflops.append(tflops_values)
                if elapsed_time_match:
                    elapsed_time_value = float(elapsed_time_match.group(1))
                    iteration_times.append(elapsed_time_value)
-           if 'MaxMemAllocated' in line:
+           if 'max allocated' in line:
                mem_allocated_match = mem_allocated_pattern.search(line)
                max_mem_allocated_match = max_mem_allocated_pattern.search(line)
                if mem_allocated_match:
-                   mem_allocated_value = float(mem_allocated_match.group(1))
+                   mem_allocated_value = float(mem_allocated_match.group(1)) / 1024
                    mem_allocated.append(mem_allocated_value)
                if max_mem_allocated_match:
-                   max_mem_allocated_value = float(max_mem_allocated_match.group(1))
+                   max_mem_allocated_value = float(max_mem_allocated_match.group(1)) / 1024
                    max_mem_allocated.append(max_mem_allocated_value)
        return iteration_times, tflops, mem_allocated, max_mem_allocated
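For reference, a minimal standalone sketch of what the updated parsing logic expects to match. The log lines below are fabricated placeholders that only illustrate the shape of current Megatron-LM style output (throughput reported as TFLOP/s/GPU, memory reported in MB); they are not output captured from this commit.

import re

# Illustrative only: fabricated lines in the style of recent Megatron-LM output.
sample_output = (
    ' iteration 10/20 | elapsed time per iteration (ms): 75239.2 | '
    'throughput per GPU (TFLOP/s/GPU): 149.1 |\n'
    '[Rank 0] (after 10 iterations) memory (MB) | allocated: 17920.0 | max allocated: 68608.0'
)

tflops_pattern = re.compile(r'(TFLOPs|TFLOP/s/GPU\)): (\d+\.\d+)')
elapsed_time_pattern = re.compile(r'elapsed time per iteration \(ms\): (\d+\.\d+)')
mem_allocated_pattern = re.compile(r'allocated: (\d+\.\d+)')
max_mem_allocated_pattern = re.compile(r'max allocated: (\d+\.\d+)')

for line in sample_output.splitlines():
    if 'elapsed time per iteration' in line:
        tflops_match = tflops_pattern.search(line)
        elapsed_match = elapsed_time_pattern.search(line)
        if tflops_match:
            print('tflops:', float(tflops_match.group(2)))                  # the number is now group(2)
        if elapsed_match:
            print('iteration time (ms):', float(elapsed_match.group(1)))
    if 'max allocated' in line:
        mem_match = mem_allocated_pattern.search(line)
        max_mem_match = max_mem_allocated_pattern.search(line)
        if mem_match:
            print('mem allocated (GB):', float(mem_match.group(1)) / 1024)  # log reports MB
        if max_mem_match:
            print('max mem allocated (GB):', float(max_mem_match.group(1)) / 1024)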
@@ -224,7 +234,9 @@ def __prepare_deespeed_config(self, precision_megatron):
            --deepspeed \
            --deepspeed_config {self._config_json_path} \
            --zero-stage {self._args.zero_stage} \
-           --pipeline-model-parallel-size {self._args.pipeline_model_parallel_size}'
+           --pipeline-model-parallel-size {self._args.pipeline_model_parallel_size} \
+           --train-tokens {self._args.train_tokens} \
+           --data-impl {self._args.data_impl}'
        if self._args.pipeline_model_parallel_size <= 1:
            deepspeed_options = f'{deepspeed_options} --no-pipeline-parallel'
@@ -255,11 +267,10 @@ def _megatron_command(self, precision): # noqa: C901
            --num-attention-heads {self._args.num_attn_heads} \
            --seq-length {self._args.seq_len} \
            --max-position-embeddings {self._args.seq_len} \
-           --train-tokens {self._args.train_tokens} \
            --train-samples {self._args.num_steps * self._args.batch_size} \
            --lr {self._args.lr} \
            --min-lr {self._args.min_lr} \
-           --split 949,50,1 \
+           --split {self._args.split} \
            --log-interval {self._args.log_interval} \
            --eval-interval {self._args.eval_interval} \
            --eval-iters {self._args.eval_iters} \
...
@@ -273,7 +284,8 @@ def _megatron_command(self, precision): # noqa: C901
            --optimizer adam \
            --use-distributed-optimizer \
            {precision_megatron} \
-           --seed {self._args.seed}'
+           --seed {self._args.seed} \
+           --log-throughput'
        if self._args.sequence_parallel:
            megatron_options = f'{megatron_options} --sequence-parallel'
@@ -298,6 +310,8 @@ def _megatron_command(self, precision): # noqa: C901
        script_path = os.path.join(self._args.code_base, 'pretrain_gpt.py')
        if self._args.deepspeed:
            deepspeed_option = self.__prepare_deespeed_config(precision_megatron.lstrip('--'))
+           # No --log-throughput in Megatron-DeepSpeed by 20231219
+           megatron_options = megatron_options.replace('--log-throughput', '').strip()
        if self._num_nodes > 1:
            command = f'torchrun {self._distributed_args} ' + \
                f'{script_path} {megatron_options} {self._data_options} {deepspeed_option}'
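As a quick illustration of the option post-processing above, here is a hedged sketch showing how the newly added --log-throughput flag is stripped again when the Megatron-DeepSpeed code base is used; the option string and the use_deepspeed flag below are hypothetical placeholders, not the full set built by _megatron_command.

# Hypothetical option string for illustration; real values come from self._args.
megatron_options = '--micro-batch-size 2 --seed 1234 --log-throughput'

use_deepspeed = True  # pretend the Megatron-DeepSpeed code base is selected
if use_deepspeed:
    # Megatron-DeepSpeed had no --log-throughput flag as of 2023-12-19,
    # so the benchmark removes it before composing the launch command.
    megatron_options = megatron_options.replace('--log-throughput', '').strip()

print(megatron_options)  # --micro-batch-size 2 --seed 1234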
@@ -379,6 +393,7 @@ def _init_distributed_setting(self):
            return False
        self._num_nodes = int(os.getenv('OMPI_COMM_WORLD_SIZE')) // int(os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE'))
+       master_addr = 'localhost'
        if self._num_nodes > 1:
            if not self._args.hostfile:
                sb_hostfile = os.path.join(os.environ.get('SB_WORKSPACE', '.'), 'hostfile')
...
@@ -395,12 +410,13 @@ def _init_distributed_setting(self):
                if self._num_nodes != len(hosts):
                    logger.error('MPI init failed since hostfile not match the MPI setting.')
                    return False
+               master_addr = hosts[0].split()[0]
-           addr = os.getenv('MASTER_ADDR', hosts[0].split()[0])
-           port = os.getenv('MASTER_PORT', '29500')
-           node_rank = int(os.environ['OMPI_COMM_WORLD_RANK']) // int(os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'])
-           self._distributed_args = f'--nproc_per_node {self._args.num_gpus} --nnodes {self._num_nodes} ' + \
-               f'--node_rank {node_rank} --master_addr {addr} --master_port {port}'
+       addr = os.getenv('MASTER_ADDR', master_addr)
+       port = os.getenv('MASTER_PORT', '29500')
+       node_rank = int(os.environ['OMPI_COMM_WORLD_RANK']) // int(os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'])
+       self._distributed_args = f'--nproc_per_node {self._args.num_gpus} --nnodes {self._num_nodes} ' + \
+           f'--node_rank {node_rank} --master_addr {addr} --master_port {port}'
        return True

    def _generate_dataset(self):
...
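To make the rank arithmetic above concrete, here is a self-contained sketch of how the torchrun arguments are derived from the Open MPI environment. The environment values and the node-0 host name below are made up for illustration; in the benchmark the master address comes from the first hostfile entry when more than one node is detected.

import os

# Fabricated environment: 16 MPI ranks spread over 2 nodes with 8 local ranks each.
env = {
    'OMPI_COMM_WORLD_SIZE': '16',
    'OMPI_COMM_WORLD_LOCAL_SIZE': '8',
    'OMPI_COMM_WORLD_RANK': '8',   # first rank on the second node
}

num_gpus = 8
num_nodes = int(env['OMPI_COMM_WORLD_SIZE']) // int(env['OMPI_COMM_WORLD_LOCAL_SIZE'])
master_addr = 'localhost'
if num_nodes > 1:
    master_addr = 'node-0'   # placeholder for the first host in the hostfile

addr = os.getenv('MASTER_ADDR', master_addr)
port = os.getenv('MASTER_PORT', '29500')
node_rank = int(env['OMPI_COMM_WORLD_RANK']) // int(env['OMPI_COMM_WORLD_LOCAL_SIZE'])

distributed_args = (
    f'--nproc_per_node {num_gpus} --nnodes {num_nodes} '
    f'--node_rank {node_rank} --master_addr {addr} --master_port {port}'
)
print(distributed_args)
# e.g. --nproc_per_node 8 --nnodes 2 --node_rank 1 --master_addr node-0 --master_port 29500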
@@ -448,8 +464,7 @@ def _generate_dataset(self):
            self._data_options = f'\
                --vocab-file {self._vocab_path} \
                --merge-file {self._merges_path} \
-               --data-path {self._data_path} \
-               --data-impl {self._args.data_impl}'
+               --data-path {self._data_path}'
        logger.info('Dataset preparation successfully.')
        return True
superbench/benchmarks/model_benchmarks/model_base.py

@@ -265,8 +265,8 @@ def __train(self, precision):
        # The unit of step time should be millisecond.
        step_times = self._train_step(precision)
        if isinstance(step_times, tuple):
-           step_times = step_times[0]
            info = step_times[1]
+           step_times = step_times[0]
            self._process_info(ModelAction.TRAIN, precision, info)
        step_times = self.__process_model_result(ModelAction.TRAIN, precision, step_times)
        if not step_times:
...
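A small sketch (with made-up numbers) of why the reordering above matters: the info dict must be read out of the tuple before step_times is rebound to its first element, otherwise the later index would be applied to the step-time list rather than to the original tuple.

# _train_step may return a (step_times, info) tuple; the values here are made up.
step_times = ([75239.24, 75238.90], {'tflops': [149.136]})

if isinstance(step_times, tuple):
    info = step_times[1]          # read the info dict while step_times is still the tuple
    step_times = step_times[0]    # then rebind step_times to the list of step times

print(step_times)   # [75239.24, 75238.9]
print(info)         # {'tflops': [149.136]}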
tests/benchmarks/model_benchmarks/test_megatron_gpt.py

@@ -177,8 +177,7 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
        benchmark._data_options = f'\
            --vocab-file {self._tmp_dir}/gpt2-vocab.json \
            --merge-file {self._tmp_dir}/gpt2-merges.txt \
-           --data-path {self._tmp_dir}/dataset_text_document \
-           --data-impl mmap'
+           --data-path {self._tmp_dir}/dataset_text_document'
        script_path = str(Path(self._tmp_dir) / 'pretrain_gpt.py')
        expected_command = 'torchrun {distributed_args} {script_path} \
...
@@ -197,7 +196,6 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
            --num-attention-heads 32 \
            --seq-length 2048 \
            --max-position-embeddings 2048 \
-           --train-tokens 300000000000 \
            --train-samples 20480 \
            --lr 0.00012 \
            --min-lr 1e-06 \
...
@@ -215,7 +213,8 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
            --optimizer adam \
            --use-distributed-optimizer \
            {precision} \
-           --seed 1234 {data_options}'
+           --seed 1234 \
+           --log-throughput {data_options}'
        precision = Precision.FLOAT32
        command = benchmark._megatron_command(precision)
@@ -262,12 +261,10 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
        benchmark._data_options = f'\
            --vocab-file {self._tmp_dir}/gpt2-vocab.json \
            --merge-file {self._tmp_dir}/gpt2-merges.txt \
-           --data-path {self._tmp_dir}/dataset_text_document \
-           --data-impl mmap'
+           --data-path {self._tmp_dir}/dataset_text_document'
        command = benchmark._megatron_command(Precision.BFLOAT16)
-       expected_command = 'deepspeed {script_path} \
-           --override-opt_param-scheduler \
+       expected_command = 'deepspeed {script_path} --override-opt_param-scheduler \
            --adam-beta1 0.9 \
            --adam-beta2 0.95 \
            --tensor-model-parallel-size 1 \
...
@@ -282,7 +279,6 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
            --num-attention-heads 32 \
            --seq-length 2048 \
            --max-position-embeddings 2048 \
-           --train-tokens 300000000000 \
            --train-samples 20480 \
            --lr 0.00012 \
            --min-lr 1e-06 \
...
@@ -306,7 +302,9 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
            --deepspeed \
            --deepspeed_config {benchmark._config_json_path} \
            --zero-stage 1 \
-           --pipeline-model-parallel-size 1 --no-pipeline-parallel'
+           --pipeline-model-parallel-size 1 \
+           --train-tokens 300000000000 \
+           --data-impl mmap --no-pipeline-parallel'
        self.assertEqual(command,
...
@@ -346,12 +344,12 @@ def test_megatron_parse_log(self, raw_output, mock_generate_dataset):
        iteration_times, tflops, mem_allocated, max_mem_allocated = benchmark._parse_log(raw_output)
        assert (statistics.mean(iteration_times) == 75239.24)
        assert (statistics.mean(tflops) == 149.136)
-       assert (statistics.mean(mem_allocated) == 17.54)
-       assert (statistics.mean(max_mem_allocated) == 66.97)
+       assert (statistics.mean(mem_allocated) == 17.535637855529785)
+       assert (statistics.mean(max_mem_allocated) == 66.9744234085083)
        info = {'tflops': tflops, 'mem_allocated': mem_allocated, 'max_mem_allocated': max_mem_allocated}
        benchmark._process_info(ModelAction.TRAIN, Precision.FLOAT16, info)
        assert (benchmark.result is not None)
        assert (benchmark.result['fp16_train_tflops'][0] == 149.136)
-       assert (benchmark.result['fp16_train_mem_allocated'][0] == 17.54)
-       assert (benchmark.result['fp16_train_max_mem_allocated'][0] == 66.97)
+       assert (benchmark.result['fp16_train_mem_allocated'][0] == 17.535637855529785)
+       assert (benchmark.result['fp16_train_max_mem_allocated'][0] == 66.9744234085083)
third_party/Makefile

@@ -177,21 +177,17 @@ directx_amf_encoding_latency:
	"C:\temp\BuildTools\MSBuild\Current\Bin\MSBuild.exe" "AMF\amf\public\samples\CPPSamples_vs2019.sln" /t:EncoderLatency /p:Platform=x64 /p:Configuration=Release /p:OutDir="%SB_MICRO_PATH%\bin" \
	)

-# Install Megatron-LM
+# Install requirements for Megatron-LM
 megatron_lm:
	if [ ! -d "Megatron/Megatron-LM" ]; then \
		git clone "https://github.com/NVIDIA/Megatron-LM.git" "Megatron/Megatron-LM"; \
	fi
	cd Megatron && \
-	python -m pip install -r requirements.txt
+	apt install -y python3-mpi4py && \
+	python -m pip install --no-cache-dir -r requirements.txt

-# Install Megatron-DeepSpeed
+# Install requirements for Megatron-DeepSpeed
 megatron_deepspeed:
	if [ ! -d "Megatron/Megatron-DeepSpeed" ]; then \
		git clone "https://github.com/microsoft/Megatron-DeepSpeed.git" "Megatron/Megatron-DeepSpeed"; \
	fi
	cd Megatron && \
-	python -m pip install -r requirements.txt && \
+	apt install -y python3-mpi4py && \
+	python -m pip install --no-cache-dir -r requirements.txt && \
	python -m pip install DeepSpeed

# Instal apex of ROCm due to dependency of Megatron
...
third_party/Megatron/Megatron-DeepSpeed @ 71e8407c
+Subproject commit 71e8407c98bacacb002823ea587c321fe58b28a6

third_party/Megatron/Megatron-LM @ 52b7a18a
+Subproject commit 52b7a18a00bced8b3670eededfd58ee0c4bd7d06
third_party/Megatron/megatron_deepspeed_rocm6.patch
diff --git a/megatron/fused_kernels/scaled_masked_softmax_cuda.cu b/megatron/fused_kernels/scaled_masked_softmax_cuda.cu
index 76086de..1533648 100644
--- a/megatron/fused_kernels/scaled_masked_softmax_cuda.cu
+++ b/megatron/fused_kernels/scaled_masked_softmax_cuda.cu
@@ -4,7 +4,7 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
-#ifndef __HIP_PLATFORM_HCC__
+#ifndef __HIP_PLATFORM_AMD__
#include <cuda_profiler_api.h>
#endif
#include <ATen/cuda/CUDAContext.h>
diff --git a/megatron/fused_kernels/scaled_softmax_cuda.cu b/megatron/fused_kernels/scaled_softmax_cuda.cu
index 90e1c9f..d217aec 100644
--- a/megatron/fused_kernels/scaled_softmax_cuda.cu
+++ b/megatron/fused_kernels/scaled_softmax_cuda.cu
@@ -4,7 +4,7 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
-#ifndef __HIP_PLATFORM_HCC__
+#ifndef __HIP_PLATFORM_AMD__
#include <cuda_profiler_api.h>
#endif
#include <ATen/cuda/CUDAContext.h>
diff --git a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu
index 74c9f3d..03b5fc8 100644
--- a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu
+++ b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu
@@ -4,7 +4,7 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
-#ifndef __HIP_PLATFORM_HCC__
+#ifndef __HIP_PLATFORM_AMD__
#include <cuda_profiler_api.h>
#endif
#include <ATen/cuda/CUDAContext.h>
third_party/Megatron/requirements.txt

@@ -10,4 +10,6 @@ tqdm
 sentencepiece
 wandb
 einops
-typing_extensions==4.5.0
+typing_extensions==4.9.0
+apex
+mpi4py