Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
TransformerEngine
Commits
e4f506a0
Unverified
Commit
e4f506a0
authored
Jan 19, 2024
by
hugo-syn
Committed by
GitHub
Jan 19, 2024
Browse files
chore: Fix multiple typos (#617)
Signed-off-by:
hugo-syn
<
hugo.vincent@synacktiv.com
>
parent
051db0d7
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
6 additions
and
6 deletions
+6
-6
examples/pytorch/fsdp/README.md
examples/pytorch/fsdp/README.md
+1
-1
tests/jax/test_fused_attn.py
tests/jax/test_fused_attn.py
+1
-1
tests/jax/test_helper.py
tests/jax/test_helper.py
+1
-1
tests/paddle/parallel_tests/attention_tp.py
tests/paddle/parallel_tests/attention_tp.py
+1
-1
tests/paddle/parallel_tests/transformer_tp.py
tests/paddle/parallel_tests/transformer_tp.py
+1
-1
tests/pytorch/fused_attn/test_fused_attn.py
tests/pytorch/fused_attn/test_fused_attn.py
+1
-1
No files found.
examples/pytorch/fsdp/README.md
View file @
e4f506a0
...
@@ -37,7 +37,7 @@ $ torchrun --standalone --nnodes=1 --nproc-per-node=$(nvidia-smi -L | wc -l) fsd
...
@@ -37,7 +37,7 @@ $ torchrun --standalone --nnodes=1 --nproc-per-node=$(nvidia-smi -L | wc -l) fsd
# [GPU-0] Peak memory use = 3000MiB
# [GPU-0] Peak memory use = 3000MiB
# FSDP with deferred initialization:
# FSDP with deferred initialization:
# Modules initialized with empty param
a
ters via `device='meta'` option. Zero load on device
# Modules initialized with empty param
e
ters via `device='meta'` option. Zero load on device
# memory until torch.distributed.fsdp.FullyShardedDataParallel mode triggers a reset on
# memory until torch.distributed.fsdp.FullyShardedDataParallel mode triggers a reset on
# already sharded model parameters.
# already sharded model parameters.
$
torchrun
--standalone
--nnodes
=
1
--nproc-per-node
=
$(
nvidia-smi
-L
|
wc
-l
)
fsdp.py
--defer-init
$
torchrun
--standalone
--nnodes
=
1
--nproc-per-node
=
$(
nvidia-smi
-L
|
wc
-l
)
fsdp.py
--defer-init
...
...
tests/jax/test_fused_attn.py
View file @
e4f506a0
...
@@ -250,7 +250,7 @@ class FusedAttnRunner:
...
@@ -250,7 +250,7 @@ class FusedAttnRunner:
self
.
_setup_inputs
()
self
.
_setup_inputs
()
def
grad_func
(
func
,
*
args
,
**
kwargs
):
def
grad_func
(
func
,
*
args
,
**
kwargs
):
# Gradient is small, use a gradient multiplier to amplify the gra
i
dent
# Gradient is small, use a gradient multiplier to amplify the grad
i
ent
gradient_multiplier
=
self
.
valid_len_q
*
self
.
num_heads_q
gradient_multiplier
=
self
.
valid_len_q
*
self
.
num_heads_q
if
is_causal_mask
(
self
.
attn_mask_type
):
if
is_causal_mask
(
self
.
attn_mask_type
):
gradient_multiplier
/=
10
gradient_multiplier
/=
10
...
...
tests/jax/test_helper.py
View file @
e4f506a0
...
@@ -204,7 +204,7 @@ class TestFP8Functions(unittest.TestCase):
...
@@ -204,7 +204,7 @@ class TestFP8Functions(unittest.TestCase):
(
MeshResource
(
None
,
'tp'
)),
(
MeshResource
(
None
,
'tp'
)),
(
MeshResource
(
'dp'
,
'tp'
)),
(
MeshResource
(
'dp'
,
'tp'
)),
)
)
# TODO (Ming Huang): Suport multi-GPUs testing. # pylint: disable=fixme
# TODO (Ming Huang): Sup
p
ort multi-GPUs testing. # pylint: disable=fixme
mesh_shape
=
(
1
,
1
)
mesh_shape
=
(
1
,
1
)
devices
=
np
.
asarray
(
jax
.
devices
()[:
1
]).
reshape
(
*
mesh_shape
)
devices
=
np
.
asarray
(
jax
.
devices
()[:
1
]).
reshape
(
*
mesh_shape
)
with
jax
.
sharding
.
Mesh
(
devices
,
(
'dp'
,
'tp'
)):
with
jax
.
sharding
.
Mesh
(
devices
,
(
'dp'
,
'tp'
)):
...
...
tests/paddle/parallel_tests/attention_tp.py
View file @
e4f506a0
...
@@ -100,7 +100,7 @@ class TestAttentionTp(unittest.TestCase):
...
@@ -100,7 +100,7 @@ class TestAttentionTp(unittest.TestCase):
paddle
.
distributed
.
all_gather
(
total_weight
,
partial_weight
,
group
=
tp_group
)
paddle
.
distributed
.
all_gather
(
total_weight
,
partial_weight
,
group
=
tp_group
)
if
interleave
:
if
interleave
:
# Due to the interleaved qkv layout, need to concat on num_head
# Due to the interleaved qkv layout, need to concat on num_head
# dimen
t
ion for column parallel linear in MultiHeadAttention layer
# dimen
s
ion for column parallel linear in MultiHeadAttention layer
assert
axis
==
0
assert
axis
==
0
assert
[
3
*
self
.
hidden_size
//
self
.
world_size
,
assert
[
3
*
self
.
hidden_size
//
self
.
world_size
,
self
.
hidden_size
]
==
partial_weight
.
shape
self
.
hidden_size
]
==
partial_weight
.
shape
...
...
tests/paddle/parallel_tests/transformer_tp.py
View file @
e4f506a0
...
@@ -101,7 +101,7 @@ class TestTransformerTp(unittest.TestCase):
...
@@ -101,7 +101,7 @@ class TestTransformerTp(unittest.TestCase):
paddle
.
distributed
.
all_gather
(
total_weight
,
partial_weight
,
group
=
tp_group
)
paddle
.
distributed
.
all_gather
(
total_weight
,
partial_weight
,
group
=
tp_group
)
if
interleave
:
if
interleave
:
# Due to the interleaved qkv layout, need to concat on num_head
# Due to the interleaved qkv layout, need to concat on num_head
# dimen
t
ion for column parallel linear in MultiHeadAttention layer
# dimen
s
ion for column parallel linear in MultiHeadAttention layer
assert
axis
==
0
assert
axis
==
0
assert
[
3
*
self
.
hidden_size
//
self
.
world_size
,
assert
[
3
*
self
.
hidden_size
//
self
.
world_size
,
self
.
hidden_size
]
==
partial_weight
.
shape
self
.
hidden_size
]
==
partial_weight
.
shape
...
...
tests/pytorch/fused_attn/test_fused_attn.py
View file @
e4f506a0
...
@@ -668,7 +668,7 @@ def test_transformer_layer(dtype, model_configs, model, ckpt_attn, qkv_format, f
...
@@ -668,7 +668,7 @@ def test_transformer_layer(dtype, model_configs, model, ckpt_attn, qkv_format, f
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"te_1_2"
,
"te_2_0"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"te_1_2"
,
"te_2_0"
])
@
pytest
.
mark
.
parametrize
(
"qkv_format"
,
[
"bshd"
,
"sbhd"
])
@
pytest
.
mark
.
parametrize
(
"qkv_format"
,
[
"bshd"
,
"sbhd"
])
def
test_te_layer_misc
(
dtype
,
model_configs
,
model
,
qkv_format
):
def
test_te_layer_misc
(
dtype
,
model_configs
,
model
,
qkv_format
):
"""Test TransformerLayer module with miscellanous settings"""
"""Test TransformerLayer module with miscellan
e
ous settings"""
ckpt_attn
=
True
ckpt_attn
=
True
fused_qkv_params
=
True
fused_qkv_params
=
True
RoPE
=
True
RoPE
=
True
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment