OpenDAS / ColossalAI · Commits

Commit d33a44e8
authored Jun 26, 2023 by Frank Lee

[shardformer] refactored layernorm (#4086)

parent c4b1b659

Showing 4 changed files with 51 additions and 77 deletions (+51 −77)
colossalai/shardformer/layer/__init__.py              +2  −2
colossalai/shardformer/layer/layernorm.py             +38 −63
colossalai/shardformer/policies/bert.py               +6  −6
tests/test_shardformer/test_layer/test_layernorm.py   +5  −6
colossalai/shardformer/layer/__init__.py

 from .dropout import Dropout1D
 from .embedding import Embedding1D, VocabParallelEmbedding1D
-from .layernorm import LayerNorm1D
+from .layernorm import FusedLayerNorm
 from .linear import Linear1D_Col, Linear1D_Row
 from .linear_conv import LinearConv1D_Col, LinearConv1D_Row
 from .loss import cross_entropy_1d

 __all__ = [
     "Embedding1D", "VocabParallelEmbedding1D", "Linear1D_Col", "Linear1D_Row", "LinearConv1D_Col", "LinearConv1D_Row",
-    "Dropout1D", "cross_entropy_1d", 'LayerNorm1D'
+    "Dropout1D", "cross_entropy_1d", 'FusedLayerNorm'
 ]
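Note that the rename is not purely cosmetic: unlike the old LayerNorm1D, the new FusedLayerNorm cannot be instantiated directly; its __init__ raises NotImplementedError, as the layernorm.py diff below shows. A minimal sketch of the contract after this commit (assuming colossalai is installed):

    from colossalai.shardformer.layer import FusedLayerNorm

    # direct construction is intentionally unsupported
    try:
        FusedLayerNorm()
    except NotImplementedError as err:
        print(err)  # the message points users to from_native_module

    # the only supported entry point is FusedLayerNorm.from_native_module(...),
    # demonstrated after the next file.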
colossalai/shardformer/layer/layernorm.py

 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-

-from typing import List, Union
-
 import torch
 import torch.nn as nn
-from torch.distributed import ProcessGroup
-
-from colossalai.kernel import LayerNorm
-from colossalai.nn import init as init
-
-from .parallel_module import ParallelModule
-
-__all__ = ['LayerNorm1D']
+__all__ = ['FusedLayerNorm']

-Fast_LN = None
-try:
-    from apex.contrib.layer_norm.layer_norm import FastLayerNorm
-    Fast_LN = FastLayerNorm
-except ImportError:
-    pass
+FAST_LAYERNORM_SUPPORTED_SIZE = [
+    1024, 1536, 2048, 2304, 3072, 3840, 4096, 5120, 6144, 8192, 10240, 12288, 12800, 15360, 16384, 18432, 20480,
+    24576, 25600, 30720, 32768, 40960, 49152, 65536
+]


-class LayerNorm1D(ParallelModule):
+class FusedLayerNorm():
     r"""
-    Layer Normalization for colossalai
-
-    Args:
-        normalized_shape (int): input shape from an expected input of size.
-            :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
-                \times \ldots \times \text{normalized_shape}[-1]]`
-            If a single integer is used, it is treated as a singleton list, and this module will
-            normalize over the last dimension which is expected to be of that specific size.
-        eps (float): a value added to the denominator for numerical stability, defaults to 1e-05.
-        bias (bool, optional): Whether to add a bias, defaults to ``True``.
-        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+    This is a wrapper around the apex fused layernorm implementation. It is meant to be used only
+    with the from_native_module interface.
     """

-    _fast_ln_supported_sizes = [
-        1024, 1536, 2048, 2304, 3072, 3840, 4096, 5120, 6144, 8192, 10240, 12288, 12800, 15360, 16384, 18432,
-        20480, 24576, 25600, 30720, 32768, 40960, 49152, 65536
-    ]
-
-    def __init__(self,
-                 normalized_shape: int,
-                 eps: int = 1e-05,
-                 bias: bool = True,
-                 dtype: torch.dtype = None,
-                 device: torch.device = None):
-        super().__init__()
-        if Fast_LN is not None and normalized_shape in self._fast_ln_supported_sizes:
-            norm = Fast_LN(normalized_shape, eps=eps).to(dtype)
-        else:
-            norm = None
-            try:
-                from apex.normalization import FusedLayerNorm
-                norm = FusedLayerNorm(normalized_shape, eps=eps).to(dtype)
-            except ImportError:
-                norm = LayerNorm(normalized_shape, eps=eps, device=device, dtype=dtype)
-        self.norm = norm
+    def __init__(self) -> None:
+        raise NotImplementedError('FusedLayerNorm is not implemented as a physical class. '
+                                  'It is meant to be used only with the from_native_module interface '
+                                  'to wrap the fused layernorm implementation provided by apex.')

     @staticmethod
-    def from_native_module(module: nn.LayerNorm, process_group: Union[ProcessGroup, List[ProcessGroup]], *args,
-                           **kwargs) -> ParallelModule:
+    def from_native_module(module: nn.LayerNorm, *args, **kwargs) -> nn.Module:
         r"""
         Convert a native pytorch layer norm module to colossalai layer norm module
         """
+        # check if apex is installed
+        try:
+            import apex
+        except ImportError:
+            raise ImportError(
+                'Please install apex from source (https://github.com/NVIDIA/apex) to use the fused layernorm kernel')
+
         # get the attributes of the module
         normalized_shape = module.normalized_shape
         eps = module.eps
-        bias = module.bias is not None
+        elementwise_affine = module.elementwise_affine
         dtype = module.weight.dtype
         device = module.weight.device

-        # ensure only one process group is passed
-        if isinstance(process_group, (list, tuple)):
-            assert len(process_group) == 1, \
-                f'Expected only one process group, got {len(process_group)}.'
-            process_group = process_group[0]
-
-        # create layer norm
-        layer_norm = LayerNorm1D(normalized_shape, eps=eps, bias=bias, device=device, dtype=dtype).norm
+        # pick the suitable layernorm implementation
+        use_fast_ln = normalized_shape in FAST_LAYERNORM_SUPPORTED_SIZE
+
+        if use_fast_ln:
+            try:
+                from apex.contrib.layer_norm.layer_norm import FastLayerNorm as ApexFusedLayerNorm
+            except ImportError:
+                # fall back to the normal fused layernorm if the fast kernel is not built
+                from apex.normalization import FusedLayerNorm as ApexFusedLayerNorm
+        else:
+            from apex.normalization import FusedLayerNorm as ApexFusedLayerNorm
+
+        layernorm = ApexFusedLayerNorm(normalized_shape, eps=eps,
+                                       elementwise_affine=elementwise_affine).to(dtype).to(device)

         with torch.no_grad():
             # copy weight and bias
-            layer_norm.weight.copy_(module.weight)
-            if bias:
-                layer_norm.bias.copy_(module.bias)
-
-        return layer_norm
+            layernorm.weight.copy_(module.weight)
+            layernorm.bias.copy_(module.bias)
+
+        return layernorm
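Net effect of this file: apex is now imported lazily inside from_native_module, the contrib FastLayerNorm kernel is chosen only when normalized_shape appears in FAST_LAYERNORM_SUPPORTED_SIZE, and the caller gets back a plain apex module with the native parameters copied in. A minimal usage sketch, assuming apex is built and a CUDA device is available (names follow this diff):

    import torch
    import torch.nn as nn

    from colossalai.shardformer.layer import FusedLayerNorm

    native = nn.LayerNorm(1024, eps=1e-5).cuda()
    fused = FusedLayerNorm.from_native_module(native)

    # parameters are copied from the native module, so they match exactly
    assert torch.equal(fused.weight, native.weight)
    assert torch.equal(fused.bias, native.bias)

    # the forward pass runs through the apex kernel; outputs agree with the
    # native implementation up to kernel-level floating point differences
    x = torch.randn(4, 1024, device='cuda')
    y = fused(x)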
colossalai/shardformer/policies/bert.py

@@ -103,17 +103,17 @@ class BertPolicy(Policy):
         base_policy[BertLayer].sub_module_replacement.append(
             SubModuleReplacementDescription(
                 suffix="attention.output.LayerNorm",
-                target_module=col_nn.LayerNorm1D,
+                target_module=col_nn.FusedLayerNorm,
             ))
         base_policy[BertLayer].sub_module_replacement.append(
             SubModuleReplacementDescription(
                 suffix="output.LayerNorm",
-                target_module=col_nn.LayerNorm1D,
+                target_module=col_nn.FusedLayerNorm,
             ))
         base_policy[BertEmbeddings].sub_module_replacement.append(
             SubModuleReplacementDescription(
                 suffix="LayerNorm",
-                target_module=col_nn.LayerNorm1D,
+                target_module=col_nn.FusedLayerNorm,
             ),)
         return base_policy

@@ -154,7 +154,7 @@ class BertForPretrainingPolicy(BertPolicy):
         addon_module[BertLMPredictionHead].sub_module_replacement.append(
             SubModuleReplacementDescription(
                 suffix="transform.LayerNorm",
-                target_module=col_nn.LayerNorm1D,
+                target_module=col_nn.FusedLayerNorm,
             ))
         module_policy.update(addon_module)
         return module_policy

@@ -191,7 +191,7 @@ class BertLMHeadModelPolicy(BertPolicy):
         addon_module[BertLMPredictionHead].sub_module_replacement.append(
             SubModuleReplacementDescription(
                 suffix="transform.LayerNorm",
-                target_module=col_nn.LayerNorm1D,
+                target_module=col_nn.FusedLayerNorm,
             ))
         module_policy.update(addon_module)
         return module_policy

@@ -228,7 +228,7 @@ class BertForMaskedLMPolicy(BertPolicy):
         addon_module[BertLMPredictionHead].sub_module_replacement.append(
             SubModuleReplacementDescription(
                 suffix="transform.LayerNorm",
-                target_module=col_nn.LayerNorm1D,
+                target_module=col_nn.FusedLayerNorm,
             ))
         module_policy.update(addon_module)
         return module_policy
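All four hunks make the same one-line substitution: every SubModuleReplacementDescription that targeted col_nn.LayerNorm1D now targets col_nn.FusedLayerNorm, covering the attention output, FFN output, embedding, and LM-prediction-head norms of BERT. Conceptually, the sharding machinery resolves each dotted suffix against the matched module and rebuilds that submodule via target_module.from_native_module. A simplified sketch of that mechanism (illustrative only, not the actual shardformer code):

    import torch.nn as nn

    def replace_submodule(root: nn.Module, suffix: str, target_cls) -> None:
        # walk the dotted path down to the parent of the submodule to replace
        *parents, leaf = suffix.split('.')
        parent = root
        for name in parents:
            parent = getattr(parent, name)
        native = getattr(parent, leaf)
        # swap in the converted module, mirroring what the description requests
        setattr(parent, leaf, target_cls.from_native_module(native))

For instance, replace_submodule(bert_layer, "attention.output.LayerNorm", FusedLayerNorm) would rebuild just that one norm and leave the rest of the layer untouched.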
tests/test_shardformer/test_layer/test_layernorm.py

 import torch
 import torch.distributed as dist
 import torch.nn as nn
 from torch.testing import assert_close

 import colossalai
-from colossalai.shardformer.layer import LayerNorm1D
+from colossalai.shardformer.layer import FusedLayerNorm
 from colossalai.testing import rerun_if_address_is_in_use, spawn


-def check_layernorm_1d():
+def check_layernorm():
     norm = nn.LayerNorm(128, 0.00001).cuda()
-    norm1d = LayerNorm1D.from_native_module(norm, process_group=None)
+    norm1d = FusedLayerNorm.from_native_module(norm, process_group=None)

     assert norm1d.weight.shape == torch.Size([128])

@@ -33,11 +32,11 @@ def check_layernorm_1d():
 def run_dist(rank, world_size, port):
     colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
-    check_layernorm_1d()
+    check_layernorm()


 @rerun_if_address_is_in_use()
-def test_layernorm_1d():
+def test_layernorm():
     spawn(run_dist, nprocs=2)
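The updated test exercises the renamed class end to end. Assuming a machine with at least two CUDA devices and apex built from source, it should be runnable directly with pytest:

    pytest tests/test_shardformer/test_layer/test_layernorm.py -v

spawn(run_dist, nprocs=2) launches two worker processes; each initializes the NCCL backend through colossalai.launch and then runs check_layernorm.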