chenpangpang / transformers
"rust/vscode:/vscode.git/clone" did not exist on "a146d9990e148fdf2c247d639ba5d2a572175e9c"
Commit fa2ccbc0 authored Dec 21, 2019 by Aymeric Augustin

Fix E266 flake8 warning (x90).

Parent: 2ab78325
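For context: flake8's E266 check ("too many leading '#' for block comment") flags block comments that begin with more than one '#'. A minimal illustration of the pattern being cleaned up in the diffs below (hypothetical snippet, not taken from the repository):

## This block comment triggers E266: more than one leading '#'.
x = 1

# This one passes: a single leading '#' followed by a space. Trailing '#'
# characters are not checked, which is why banner comments such as
# "# UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE #" still pass after the fix.
y = 2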
Changes: 30 files. Showing 10 changed files with 47 additions and 45 deletions (+47, -45); the remaining files are on page 2.
transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py  +1 -1
transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py  +1 -1
transformers/modeling_distilbert.py  +2 -2
transformers/modeling_tf_distilbert.py  +2 -2
transformers/modeling_tf_pytorch_utils.py  +4 -2
transformers/modeling_tf_transfo_xl.py  +10 -10
transformers/modeling_tf_xlnet.py  +8 -8
transformers/modeling_transfo_xl.py  +10 -10
transformers/modeling_xlnet.py  +8 -8
transformers/optimization_tf.py  +1 -1
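The check that motivated the commit can be reproduced by asking flake8 for this code only. A minimal sketch, assuming flake8 is installed and the command is run from the repository root; the subprocess wrapper and the target path are illustrative, not part of the commit:

import subprocess

# Report only E266 ("too many leading '#' for block comment") under the
# transformers/ package; an empty report means the warnings are gone.
result = subprocess.run(
    ["flake8", "--select=E266", "transformers/"],
    capture_output=True,
    text=True,
)
print(result.stdout if result.stdout else "No E266 warnings found.")

Running the equivalent command line, flake8 --select=E266 transformers/, gives the same report.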
transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py
@@ -70,7 +70,7 @@ def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_p
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    ## Required parameters
+    # Required parameters
     parser.add_argument(
         "--xlm_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump."
     )
transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py
@@ -82,7 +82,7 @@ def convert_xlnet_checkpoint_to_pytorch(
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    ## Required parameters
+    # Required parameters
     parser.add_argument(
         "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
     )
transformers/modeling_distilbert.py
@@ -47,7 +47,7 @@ DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
 }


-### UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE ###
+# UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE #
 def gelu(x):
     return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0)))

@@ -327,7 +327,7 @@ class Transformer(nn.Module):
         return outputs  # last-layer hidden state, (all hidden states), (all attentions)


-### INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL ###
+# INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL #
 class DistilBertPreTrainedModel(PreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for downloading and loading pretrained models.
transformers/modeling_tf_distilbert.py
@@ -42,7 +42,7 @@ TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
 }


-### UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE ###
+# UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE #
 def gelu(x):
     """ Gaussian Error Linear Unit.
     Original Implementation of the gelu activation function in Google Bert repo when initially created.

@@ -463,7 +463,7 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer):
         return tfmr_output  # last-layer hidden-state, (all hidden_states), (all attentions)


-### INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL ###
+# INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL #
 class TFDistilBertPreTrainedModel(TFPreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for downloading and loading pretrained models.
transformers/modeling_tf_pytorch_utils.py
@@ -67,7 +67,8 @@ def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove="")
 #####################
-### PyTorch => TF 2.0
+# PyTorch => TF 2.0 #
+#####################


 def load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=None, allow_missing_keys=False):

@@ -197,7 +198,8 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
 #####################
-### TF 2.0 => PyTorch
+# TF 2.0 => PyTorch #
+#####################


 def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs=None, allow_missing_keys=False):
transformers/modeling_tf_transfo_xl.py
@@ -79,23 +79,23 @@ class TFPositionwiseFF(tf.keras.layers.Layer):
     def call(self, inp, training=False):
         if self.pre_lnorm:
-            ##### layer normalization + positionwise feed-forward
+            # layer normalization + positionwise feed-forward
             core_out = self.layer_norm(inp)
             core_out = self.layer_1(core_out)
             core_out = self.drop_1(core_out, training=training)
             core_out = self.layer_2(core_out)
             core_out = self.drop_2(core_out, training=training)

-            ##### residual connection
+            # residual connection
             output = core_out + inp
         else:
-            ##### positionwise feed-forward
+            # positionwise feed-forward
             core_out = self.layer_1(inp)
             core_out = self.drop_1(core_out, training=training)
             core_out = self.layer_2(core_out)
             core_out = self.drop_2(core_out, training=training)

-            ##### residual connection + layer normalization
+            # residual connection + layer normalization
             output = self.layer_norm(inp + core_out)

         return output

@@ -206,7 +206,7 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
         r_head_k = tf.reshape(r_head_k, (rlen, self.n_head, self.d_head))  # qlen x n_head x d_head

-        #### compute attention score
+        # compute attention score
         rw_head_q = w_head_q + self.r_w_bias  # qlen x bsz x n_head x d_head
         AC = tf.einsum("ibnd,jbnd->ijbn", rw_head_q, w_head_k)  # qlen x klen x bsz x n_head

@@ -218,7 +218,7 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
         attn_score = AC + BD
         attn_score = attn_score * self.scale

-        #### compute attention probability
+        # compute attention probability
         if attn_mask is not None:
             attn_mask_t = attn_mask[:, :, None, None]
             attn_score = attn_score * (1 - attn_mask_t) - 1e30 * attn_mask_t

@@ -231,22 +231,22 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
         if head_mask is not None:
             attn_prob = attn_prob * head_mask

-        #### compute attention vector
+        # compute attention vector
         attn_vec = tf.einsum("ijbn,jbnd->ibnd", attn_prob, w_head_v)

         # [qlen x bsz x n_head x d_head]
         attn_vec_sizes = shape_list(attn_vec)
         attn_vec = tf.reshape(attn_vec, (attn_vec_sizes[0], attn_vec_sizes[1], self.n_head * self.d_head))

-        ##### linear projection
+        # linear projection
         attn_out = self.o_net(attn_vec)
         attn_out = self.drop(attn_out, training=training)

         if self.pre_lnorm:
-            ##### residual connection
+            # residual connection
             outputs = [w + attn_out]
         else:
-            ##### residual connection + layer normalization
+            # residual connection + layer normalization
             outputs = [self.layer_norm(w + attn_out)]

         if self.output_attentions:
transformers/modeling_tf_xlnet.py
@@ -190,7 +190,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
         (h, g, attn_mask_h, attn_mask_g, r, seg_mat, mems, target_mapping, head_mask) = inputs

         if g is not None:
-            ###### Two-stream attention with relative positional encoding.
+            # Two-stream attention with relative positional encoding.
             # content based attention score
             if mems is not None and len(shape_list(mems)) > 1:
                 cat = tf.concat([mems, h], axis=0)

@@ -206,7 +206,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
             # position-based key head
             k_head_r = tf.einsum("ibh,hnd->ibnd", r, self.r)

-            ##### h-stream
+            # h-stream
             # content-stream query head
             q_head_h = tf.einsum("ibh,hnd->ibnd", h, self.q)

@@ -221,7 +221,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
             # post processing
             output_h = self.post_attention([h, attn_vec_h], training=training)

-            ##### g-stream
+            # g-stream
             # query-stream query head
             q_head_g = tf.einsum("ibh,hnd->ibnd", g, self.q)

@@ -251,7 +251,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
                 attn_prob = attn_prob_h, attn_prob_g
         else:
-            ###### Multi-head attention with relative positional encoding
+            # Multi-head attention with relative positional encoding
             if mems is not None and len(shape_list(mems)) > 1:
                 cat = tf.concat([mems, h], axis=0)
             else:

@@ -552,7 +552,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
         dtype_float = tf.bfloat16 if self.use_bfloat16 else tf.float32

-        ##### Attention mask
+        # Attention mask
         # causal attention mask
         if self.attn_type == "uni":
             attn_mask = self.create_mask(qlen, mlen)

@@ -597,7 +597,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
         else:
             non_tgt_mask = None

-        ##### Word embeddings and prepare h & g hidden states
+        # Word embeddings and prepare h & g hidden states
         if inputs_embeds is not None:
             word_emb_k = inputs_embeds
         else:

@@ -612,7 +612,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
         else:
             output_g = None

-        ##### Segment embedding
+        # Segment embedding
         if token_type_ids is not None:
             # Convert `token_type_ids` to one-hot `seg_mat`
             mem_pad = tf.zeros([mlen, bsz], dtype=tf.int32)

@@ -624,7 +624,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
         else:
             seg_mat = None

-        ##### Positional encoding
+        # Positional encoding
         pos_emb = self.relative_positional_encoding(qlen, klen, bsz=bsz, dtype=dtype_float)
         pos_emb = self.dropout(pos_emb, training=training)
transformers/modeling_transfo_xl.py
@@ -213,16 +213,16 @@ class PositionwiseFF(nn.Module):
     def forward(self, inp):
         if self.pre_lnorm:
-            ##### layer normalization + positionwise feed-forward
+            # layer normalization + positionwise feed-forward
             core_out = self.CoreNet(self.layer_norm(inp))

-            ##### residual connection
+            # residual connection
             output = core_out + inp
         else:
-            ##### positionwise feed-forward
+            # positionwise feed-forward
             core_out = self.CoreNet(inp)

-            ##### residual connection + layer normalization
+            # residual connection + layer normalization
             output = self.layer_norm(inp + core_out)

         return output

@@ -316,7 +316,7 @@ class RelPartialLearnableMultiHeadAttn(nn.Module):
         r_head_k = r_head_k.view(rlen, self.n_head, self.d_head)  # qlen x n_head x d_head

-        #### compute attention score
+        # compute attention score
         rw_head_q = w_head_q + self.r_w_bias  # qlen x bsz x n_head x d_head
         AC = torch.einsum("ibnd,jbnd->ijbn", (rw_head_q, w_head_k))  # qlen x klen x bsz x n_head

@@ -328,7 +328,7 @@ class RelPartialLearnableMultiHeadAttn(nn.Module):
         attn_score = AC + BD
         attn_score.mul_(self.scale)

-        #### compute attention probability
+        # compute attention probability
         if attn_mask is not None and torch.sum(attn_mask).item():
             attn_mask = attn_mask == 1  # Switch to bool
             if attn_mask.dim() == 2:

@@ -352,21 +352,21 @@ class RelPartialLearnableMultiHeadAttn(nn.Module):
         if head_mask is not None:
             attn_prob = attn_prob * head_mask

-        #### compute attention vector
+        # compute attention vector
         attn_vec = torch.einsum("ijbn,jbnd->ibnd", (attn_prob, w_head_v))

         # [qlen x bsz x n_head x d_head]
         attn_vec = attn_vec.contiguous().view(attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head)

-        ##### linear projection
+        # linear projection
         attn_out = self.o_net(attn_vec)
         attn_out = self.drop(attn_out)

         if self.pre_lnorm:
-            ##### residual connection
+            # residual connection
             outputs = [w + attn_out]
         else:
-            ##### residual connection + layer normalization
+            # residual connection + layer normalization
             outputs = [self.layer_norm(w + attn_out)]

         if self.output_attentions:
transformers/modeling_xlnet.py
@@ -330,7 +330,7 @@ class XLNetRelativeAttention(nn.Module):
     def forward(self, h, g, attn_mask_h, attn_mask_g, r, seg_mat, mems=None, target_mapping=None, head_mask=None):
         if g is not None:
-            ###### Two-stream attention with relative positional encoding.
+            # Two-stream attention with relative positional encoding.
             # content based attention score
             if mems is not None and mems.dim() > 1:
                 cat = torch.cat([mems, h], dim=0)

@@ -346,7 +346,7 @@ class XLNetRelativeAttention(nn.Module):
             # position-based key head
             k_head_r = torch.einsum("ibh,hnd->ibnd", r, self.r)

-            ##### h-stream
+            # h-stream
             # content-stream query head
             q_head_h = torch.einsum("ibh,hnd->ibnd", h, self.q)

@@ -361,7 +361,7 @@ class XLNetRelativeAttention(nn.Module):
             # post processing
             output_h = self.post_attention(h, attn_vec_h)

-            ##### g-stream
+            # g-stream
             # query-stream query head
             q_head_g = torch.einsum("ibh,hnd->ibnd", g, self.q)

@@ -391,7 +391,7 @@ class XLNetRelativeAttention(nn.Module):
                 attn_prob = attn_prob_h, attn_prob_g
         else:
-            ###### Multi-head attention with relative positional encoding
+            # Multi-head attention with relative positional encoding
             if mems is not None and mems.dim() > 1:
                 cat = torch.cat([mems, h], dim=0)
             else:

@@ -804,7 +804,7 @@ class XLNetModel(XLNetPreTrainedModel):
         dtype_float = next(self.parameters()).dtype
         device = next(self.parameters()).device

-        ##### Attention mask
+        # Attention mask
         # causal attention mask
         if self.attn_type == "uni":
             attn_mask = self.create_mask(qlen, mlen)

@@ -849,7 +849,7 @@ class XLNetModel(XLNetPreTrainedModel):
         else:
             non_tgt_mask = None

-        ##### Word embeddings and prepare h & g hidden states
+        # Word embeddings and prepare h & g hidden states
         if inputs_embeds is not None:
             word_emb_k = inputs_embeds
         else:

@@ -864,7 +864,7 @@ class XLNetModel(XLNetPreTrainedModel):
         else:
             output_g = None

-        ##### Segment embedding
+        # Segment embedding
         if token_type_ids is not None:
             # Convert `token_type_ids` to one-hot `seg_mat`
             if mlen > 0:

@@ -879,7 +879,7 @@ class XLNetModel(XLNetPreTrainedModel):
         else:
             seg_mat = None

-        ##### Positional encoding
+        # Positional encoding
         pos_emb = self.relative_positional_encoding(qlen, klen, bsz=bsz)
         pos_emb = self.dropout(pos_emb)
transformers/optimization_tf.py
@@ -178,7 +178,7 @@ class AdamWeightDecay(tf.keras.optimizers.Adam):
         return True


-## Inspired from https://github.com/OpenNMT/OpenNMT-tf/blob/master/opennmt/optimizers/utils.py
+# Inspired from https://github.com/OpenNMT/OpenNMT-tf/blob/master/opennmt/optimizers/utils.py
 class GradientAccumulator(object):
     """Distribution strategies-aware gradient accumulation utility."""