chenpangpang / transformers / Commits / 1251072f

Unverified commit 1251072f, authored Oct 28, 2021 by Anton Lozhkov, committed by GitHub on Oct 28, 2021.

Fix SEW-D implementation differences (#14191)

* Fix SEW-D
* Update tests
* isort

Parent: 78b6a2ec
Showing 5 changed files with 22 additions and 16 deletions (+22, -16):

* src/transformers/activations.py                        +3  -2
* src/transformers/models/sew_d/configuration_sew_d.py   +10 -6
* src/transformers/models/sew_d/modeling_sew_d.py        +2  -2
* tests/test_activations.py                              +4  -3
* tests/test_modeling_sew_d.py                           +3  -3
src/transformers/activations.py

@@ -24,7 +24,7 @@ from .utils import logging
 logger = logging.get_logger(__name__)

-def _gelu_python(x):
+def gelu_python(x):
     """
     Original Implementation of the GELU activation function in Google BERT repo when initially created. For
     information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
@@ -43,7 +43,7 @@ def gelu_new(x):
 if version.parse(torch.__version__) < version.parse("1.4"):
-    gelu = _gelu_python
+    gelu = gelu_python
 else:
     gelu = nn.functional.gelu
@@ -97,6 +97,7 @@ ACT2FN = {
     "swish": silu,
     "gelu": gelu,
     "tanh": torch.tanh,
+    "gelu_python": gelu_python,
     "gelu_new": gelu_new,
     "gelu_fast": gelu_fast,
     "quick_gelu": quick_gelu,
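With the rename, the erf-based GELU becomes part of the public activation registry rather than a private helper. A minimal sketch of what that enables, assuming a transformers build that includes this commit (torch is the only other dependency):

import torch
from transformers.activations import gelu_python, get_activation

x = torch.tensor([-1.0, -0.1, 0.0, 0.1, 1.0])

# The exact erf-based implementation matches PyTorch's built-in GELU ...
assert torch.allclose(gelu_python(x), torch.nn.functional.gelu(x))

# ... and is now retrievable by name, which is what the SEW-D config relies on.
assert torch.allclose(get_activation("gelu_python")(x), gelu_python(x))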
src/transformers/models/sew_d/configuration_sew_d.py

@@ -67,9 +67,9 @@ class SEWDConfig(PretrainedConfig):
             :obj:`("p2c")`, :obj:`("p2c", "c2p")`, :obj:`("p2c", "c2p", 'p2p")`.
         norm_rel_ebd (:obj:`str`, `optional`, defaults to :obj:`"layer_norm"`):
             Whether to use layer norm in relative embedding (:obj:`"layer_norm"` if yes)
-        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu_python"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"`, :obj:`"gelu_python"` and :obj:`"gelu_new"` are supported.
         hidden_dropout (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
@@ -78,8 +78,10 @@ class SEWDConfig(PretrainedConfig):
             The dropout probability for the final projection layer of :class:`SEWDForCTC`.
         initializer_range (:obj:`float`, `optional`, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
-            The epsilon used by the layer normalization layers.
+        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-7):
+            The epsilon used by the layer normalization layers in the transformer encoder.
+        feature_layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-5):
+            The epsilon used by the layer normalization after the feature extractor.
         feat_extract_norm (:obj:`str`, `optional`, defaults to :obj:`"group"`):
             The norm to be applied to 1D convolutional layers in feature extractor. One of :obj:`"group"` for group
             normalization of only the first 1D convolutional layer or :obj:`"layer"` for layer normalization of all 1D
@@ -167,7 +169,7 @@ class SEWDConfig(PretrainedConfig):
         position_biased_input=False,
         pos_att_type=("p2c", "c2p"),
         norm_rel_ebd="layer_norm",
-        hidden_act="gelu",
+        hidden_act="gelu_python",
         hidden_dropout=0.1,
         activation_dropout=0.1,
         attention_dropout=0.1,
@@ -175,7 +177,8 @@ class SEWDConfig(PretrainedConfig):
         final_dropout=0.1,
         layerdrop=0.1,
         initializer_range=0.02,
-        layer_norm_eps=1e-5,
+        layer_norm_eps=1e-7,
+        feature_layer_norm_eps=1e-5,
         feat_extract_norm="group",
         feat_extract_activation="gelu",
         conv_dim=(64, 128, 128, 128, 128, 256, 256, 256, 256, 512, 512, 512, 512),
@@ -228,6 +231,7 @@ class SEWDConfig(PretrainedConfig):
         self.final_dropout = final_dropout
         self.layerdrop = layerdrop
         self.layer_norm_eps = layer_norm_eps
+        self.feature_layer_norm_eps = feature_layer_norm_eps
         self.initializer_range = initializer_range
         self.vocab_size = vocab_size
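The configuration change splits the single epsilon into two and switches the default activation. A quick check of the new defaults, again assuming a build that contains this commit:

from transformers import SEWDConfig

config = SEWDConfig()
print(config.hidden_act)              # "gelu_python"
print(config.layer_norm_eps)          # 1e-07, used inside the DeBERTa-style encoder
print(config.feature_layer_norm_eps)  # 1e-05, used after the convolutional feature extractor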
src/transformers/models/sew_d/modeling_sew_d.py

@@ -1310,13 +1310,13 @@ SEWD_INPUTS_DOCSTRING = r"""
     "The bare SEW-D Model transformer outputting raw hidden-states without any specific head on top.",
     SEWD_START_DOCSTRING,
 )
-# Copied from transformers.models.sew.modeling_sew.SEWModel with SEW->SEWD
+# Copied from transformers.models.sew.modeling_sew.SEWModel with SEW->SEWD, layer_norm_eps->feature_layer_norm_eps
 class SEWDModel(SEWDPreTrainedModel):
     def __init__(self, config: SEWDConfig):
         super().__init__(config)
         self.config = config
         self.feature_extractor = SEWDFeatureExtractor(config)
-        self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps)
+        self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.feature_layer_norm_eps)

         self.project_features = config.conv_dim[-1] != config.hidden_size
         if self.project_features:
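The modeling change is the consumer of that split: the LayerNorm applied to the convolutional features now reads feature_layer_norm_eps, while the encoder layers keep layer_norm_eps. An illustrative sketch of the resulting structure (TinySEWDLike is a hypothetical class for this note, not the library code):

import torch.nn as nn

class TinySEWDLike(nn.Module):
    def __init__(self, conv_dim=512, hidden_size=384,
                 feature_layer_norm_eps=1e-5, layer_norm_eps=1e-7):
        super().__init__()
        # keeps the 1e-5 eps that was previously used right after the feature extractor
        self.feature_layer_norm = nn.LayerNorm(conv_dim, eps=feature_layer_norm_eps)
        # tighter 1e-7 eps for the DeBERTa-v2-style encoder blocks
        self.encoder_layer_norm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)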
tests/test_activations.py

@@ -21,7 +21,7 @@ from transformers.testing_utils import require_torch
 if is_torch_available():
     import torch

-    from transformers.activations import _gelu_python, gelu_new, get_activation
+    from transformers.activations import gelu_new, gelu_python, get_activation


 @require_torch
@@ -29,8 +29,8 @@ class TestActivations(unittest.TestCase):
     def test_gelu_versions(self):
         x = torch.tensor([-100, -1, -0.1, 0, 0.1, 1.0, 100])
         torch_builtin = get_activation("gelu")
-        self.assertTrue(torch.allclose(_gelu_python(x), torch_builtin(x)))
-        self.assertFalse(torch.allclose(_gelu_python(x), gelu_new(x)))
+        self.assertTrue(torch.allclose(gelu_python(x), torch_builtin(x)))
+        self.assertFalse(torch.allclose(gelu_python(x), gelu_new(x)))

     def test_get_activation(self):
         get_activation("swish")
@@ -39,6 +39,7 @@ class TestActivations(unittest.TestCase):
         get_activation("tanh")
         get_activation("gelu_new")
         get_activation("gelu_fast")
+        get_activation("gelu_python")
         with self.assertRaises(KeyError):
             get_activation("bogus")
         with self.assertRaises(KeyError):
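The updated test still asserts that gelu_python matches the PyTorch built-in but differs from gelu_new. A standalone sketch of why, using the exact erf form and the tanh approximation as defined in transformers.activations:

import math
import torch

def exact_gelu(x):
    # erf-based GELU (what gelu_python computes)
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

def tanh_approx_gelu(x):
    # tanh approximation (what gelu_new computes)
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))

x = torch.tensor([-1.0, -0.1, 0.0, 0.1, 1.0])
print((exact_gelu(x) - tanh_approx_gelu(x)).abs().max())  # small but nonzero difference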
tests/test_modeling_sew_d.py

@@ -540,9 +540,9 @@ class SEWDModelIntegrationTest(unittest.TestCase):
         )
         expected_output_sum = 54201.0469

-        self.assertTrue(torch.allclose(outputs[:, :4, :4], expected_outputs_first, atol=5e-3))
-        self.assertTrue(torch.allclose(outputs[:, -4:, -4:], expected_outputs_last, atol=5e-3))
-        self.assertTrue(abs(outputs.sum() - expected_output_sum) < 5)
+        self.assertTrue(torch.allclose(outputs[:, :4, :4], expected_outputs_first, atol=1e-3))
+        self.assertTrue(torch.allclose(outputs[:, -4:, -4:], expected_outputs_last, atol=1e-3))
+        self.assertTrue(abs(outputs.sum() - expected_output_sum) < 1)

     def test_inference_ctc_batched(self):
         model = SEWDForCTC.from_pretrained("asapp/sew-d-tiny-100k-ft-ls100h").to(torch_device)
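The tightened tolerances are checked against the checkpoint named in the test. A hedged usage sketch for the same model, with a random tensor standing in for real 16 kHz audio (illustrative only, not the test fixture):

import torch
from transformers import SEWDForCTC

model = SEWDForCTC.from_pretrained("asapp/sew-d-tiny-100k-ft-ls100h")
model.eval()

input_values = torch.randn(1, 16000)  # one second of fake 16 kHz audio
with torch.no_grad():
    logits = model(input_values).logits  # shape: (batch, frames, vocab_size)
print(logits.shape)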