chenpangpang/transformers, Commits

Commit 7732d0fe (unverified)
Authored Feb 09, 2022 by Lysandre Debut, committed by GitHub on Feb 09, 2022
Parent: d923f762

Upgrade black to version ~=22.0 (#15565)

* Upgrade black to version ~=22.0
* Check copies
* Fix code
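For context: the user-visible formatting change in black 22 that drives this diff is its handling of the power operator, which now hugs `**` when both operands are simple (names, numeric literals, attribute chains) and keeps the surrounding spaces otherwise. A minimal sketch of the effect using black's Python API (the snippet and its inputs are illustrative, not part of this commit):

```python
# Requires: pip install "black~=22.0"
import black

src = (
    "scale = head_dim ** -0.5\n"
    "vocab = 2 ** 8\n"
    "fc_std = (2 * hidden_size) ** -0.5\n"
)

# black 22.x hugs ** for simple operands but keeps spaces when an
# operand is a parenthesized expression, exactly as in the hunks below.
print(black.format_str(src, mode=black.Mode()))
# scale = head_dim**-0.5
# vocab = 2**8
# fc_std = (2 * hidden_size) ** -0.5
```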
Changes: showing 20 of 91 changed files, with 60 additions and 60 deletions (+60 -60) on this page.
src/transformers/models/big_bird/modeling_big_bird.py                      +1 -1
src/transformers/models/big_bird/modeling_flax_big_bird.py                 +1 -1
src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py        +1 -1
src/transformers/models/blenderbot/modeling_blenderbot.py                  +1 -1
src/transformers/models/blenderbot/modeling_tf_blenderbot.py               +1 -1
src/transformers/models/blenderbot_small/modeling_blenderbot_small.py      +1 -1
src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py   +1 -1
src/transformers/models/byt5/tokenization_byt5.py                          +1 -1
src/transformers/models/clip/modeling_clip.py                              +7 -7
src/transformers/models/clip/modeling_flax_clip.py                         +1 -1
src/transformers/models/clip/modeling_tf_clip.py                           +6 -6
src/transformers/models/clip/tokenization_clip.py                          +2 -2
src/transformers/models/detr/modeling_detr.py                              +1 -1
src/transformers/models/fsmt/modeling_fsmt.py                              +1 -1
src/transformers/models/funnel/modeling_funnel.py                          +3 -3
src/transformers/models/funnel/modeling_tf_funnel.py                       +3 -3
src/transformers/models/gpt2/tokenization_gpt2.py                          +2 -2
src/transformers/models/hubert/modeling_hubert.py                          +1 -1
src/transformers/models/hubert/modeling_tf_hubert.py                       +1 -1
src/transformers/models/ibert/quant_modules.py                             +24 -24
src/transformers/models/big_bird/modeling_big_bird.py

@@ -297,7 +297,7 @@ class BigBirdEmbeddings(nn.Module):
             inputs_embeds = self.word_embeddings(input_ids)

         if self.rescale_embeddings:
-            inputs_embeds = inputs_embeds * (self.hidden_size ** 0.5)
+            inputs_embeds = inputs_embeds * (self.hidden_size**0.5)

         token_type_embeddings = self.token_type_embeddings(token_type_ids)
src/transformers/models/big_bird/modeling_flax_big_bird.py

@@ -220,7 +220,7 @@ class FlaxBigBirdEmbeddings(nn.Module):
        token_type_embeddings = self.token_type_embeddings(token_type_ids.astype("i4"))

        if self.config.rescale_embeddings:
-            inputs_embeds *= self.config.hidden_size ** 0.5
+            inputs_embeds *= self.config.hidden_size**0.5

        # Sum all embeddings
        hidden_states = inputs_embeds + token_type_embeddings + position_embeds
src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py

@@ -1219,7 +1219,7 @@ class BigBirdPegasusDecoderAttention(nn.Module):
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
-        self.scaling = self.head_dim ** -0.5
+        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
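The `head_dim**-0.5` factor that these attention modules precompute is the standard scaled dot-product temperature, 1/sqrt(d_head). A minimal standalone sketch of how the factor is applied (illustrative only, not code from this commit):

```python
# Sketch: why attention modules precompute scaling = head_dim**-0.5.
import torch

head_dim = 64
scaling = head_dim**-0.5  # 1 / sqrt(64) = 0.125

q = torch.randn(2, 8, 16, head_dim)  # (batch, heads, seq, head_dim)
k = torch.randn(2, 8, 16, head_dim)

# Scaling keeps the logits' variance roughly independent of head_dim,
# which keeps the softmax out of its saturated region.
attn_logits = (q * scaling) @ k.transpose(-2, -1)
attn_weights = attn_logits.softmax(dim=-1)
```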
src/transformers/models/blenderbot/modeling_blenderbot.py

@@ -148,7 +148,7 @@ class BlenderbotAttention(nn.Module):
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
-        self.scaling = self.head_dim ** -0.5
+        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
src/transformers/models/blenderbot/modeling_tf_blenderbot.py

@@ -155,7 +155,7 @@ class TFBlenderbotAttention(tf.keras.layers.Layer):
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
-        self.scaling = self.head_dim ** -0.5
+        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder

        self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
src/transformers/models/blenderbot_small/modeling_blenderbot_small.py

@@ -146,7 +146,7 @@ class BlenderbotSmallAttention(nn.Module):
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
-        self.scaling = self.head_dim ** -0.5
+        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py

@@ -154,7 +154,7 @@ class TFBlenderbotSmallAttention(tf.keras.layers.Layer):
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
-        self.scaling = self.head_dim ** -0.5
+        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder

        self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
src/transformers/models/byt5/tokenization_byt5.py

@@ -96,7 +96,7 @@ class ByT5Tokenizer(PreTrainedTokenizer):
        self._extra_ids = extra_ids

-        self._utf_vocab_size = 2 ** 8  # utf is 8 bits
+        self._utf_vocab_size = 2**8  # utf is 8 bits

        # define special tokens dict
        self.special_tokens_encoder: Dict[int, str] = {
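ByT5's `_utf_vocab_size = 2**8` reflects that it tokenizes raw UTF-8 bytes, so the base vocabulary has exactly 256 entries. A rough sketch of the idea (deliberately simplified; the real `ByT5Tokenizer` also offsets IDs past its special and extra tokens):

```python
# Sketch of byte-level tokenization as in ByT5: every string maps to its
# UTF-8 bytes, so the base vocabulary is exactly 2**8 = 256 symbols.
text = "héllo"
token_ids = list(text.encode("utf-8"))
print(token_ids)  # [104, 195, 169, 108, 108, 111]
assert all(0 <= t < 2**8 for t in token_ids)

# Decoding is just the inverse byte sequence:
assert bytes(token_ids).decode("utf-8") == text
```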
src/transformers/models/clip/modeling_clip.py

@@ -177,7 +177,7 @@ class CLIPAttention(nn.Module):
        assert (
            self.head_dim * self.num_heads == self.embed_dim
        ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})."
-        self.scale = self.head_dim ** -0.5
+        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)

@@ -348,13 +348,13 @@ class CLIPPreTrainedModel(PreTrainedModel):
            module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
        elif isinstance(module, CLIPVisionEmbeddings):
            factor = self.config.initializer_factor
-            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim ** -0.5 * factor)
+            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
            nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
        elif isinstance(module, CLIPAttention):
            factor = self.config.initializer_factor
-            in_proj_std = (module.embed_dim ** -0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
-            out_proj_std = (module.embed_dim ** -0.5) * factor
+            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
+            out_proj_std = (module.embed_dim**-0.5) * factor
            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
            nn.init.normal_(module.v_proj.weight, std=in_proj_std)

@@ -362,7 +362,7 @@ class CLIPPreTrainedModel(PreTrainedModel):
        elif isinstance(module, CLIPMLP):
            factor = self.config.initializer_factor
            in_proj_std = (
-                (module.config.hidden_size ** -0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
+                (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            )
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            nn.init.normal_(module.fc1.weight, std=fc_std)

@@ -370,11 +370,11 @@ class CLIPPreTrainedModel(PreTrainedModel):
        elif isinstance(module, CLIPModel):
            nn.init.normal_(
                module.text_projection.weight,
-                std=module.text_embed_dim ** -0.5 * self.config.initializer_factor,
+                std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
            )
            nn.init.normal_(
                module.visual_projection.weight,
-                std=module.vision_embed_dim ** -0.5 * self.config.initializer_factor,
+                std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
            )

        if isinstance(module, nn.LayerNorm):
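Note also why black 22 leaves `(2 * module.config.num_hidden_layers) ** -0.5` spaced while hugging `embed_dim**-0.5`: the parenthesized operand is not "simple". That term is the depth-dependent residual-branch scaling (GPT-2 style): with roughly 2L residual additions per layer stack, shrinking each branch's init by 1/sqrt(2L) keeps the residual stream's variance stable with depth. A quick numeric illustration of the stds computed above (the sizes are assumed, CLIP-base-like values):

```python
# Numeric illustration of the initializer stds in the CLIP hunks above.
embed_dim = 768
num_hidden_layers = 12
factor = 1.0  # config.initializer_factor default

in_proj_std = (embed_dim**-0.5) * ((2 * num_hidden_layers) ** -0.5) * factor
out_proj_std = (embed_dim**-0.5) * factor

print(f"{in_proj_std:.5f}")   # ~0.00737: extra 1/sqrt(2*12) shrink per residual branch
print(f"{out_proj_std:.5f}")  # ~0.03608: plain 1/sqrt(embed_dim)
```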
src/transformers/models/clip/modeling_flax_clip.py

@@ -263,7 +263,7 @@ class FlaxCLIPAttention(nn.Module):
        assert (
            self.head_dim * self.num_heads == self.embed_dim
        ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})."
-        self.scale = self.head_dim ** -0.5
+        self.scale = self.head_dim**-0.5
        self.dropout = self.config.attention_dropout

        self.k_proj = nn.Dense(self.embed_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(0.01))
src/transformers/models/clip/modeling_tf_clip.py

@@ -156,7 +156,7 @@ class TFCLIPVisionEmbeddings(tf.keras.layers.Layer):
        self.class_embedding = self.add_weight(
            shape=(self.embed_dim,),
-            initializer=get_initializer(self.embed_dim ** -0.5 * factor),
+            initializer=get_initializer(self.embed_dim**-0.5 * factor),
            trainable=True,
            name="class_embedding",
        )

@@ -270,8 +270,8 @@ class TFCLIPAttention(tf.keras.layers.Layer):
        )

        factor = config.initializer_factor
-        in_proj_std = (self.embed_dim ** -0.5) * ((2 * config.num_hidden_layers) ** -0.5) * factor
-        out_proj_std = (self.embed_dim ** -0.5) * factor
+        in_proj_std = (self.embed_dim**-0.5) * ((2 * config.num_hidden_layers) ** -0.5) * factor
+        out_proj_std = (self.embed_dim**-0.5) * factor
        self.sqrt_att_head_size = math.sqrt(self.attention_head_size)

@@ -360,7 +360,7 @@ class TFCLIPMLP(tf.keras.layers.Layer):
        self.activation_fn = get_tf_activation(config.hidden_act)

        factor = config.initializer_factor
-        in_proj_std = (config.hidden_size ** -0.5) * ((2 * config.num_hidden_layers) ** -0.5) * factor
+        in_proj_std = (config.hidden_size**-0.5) * ((2 * config.num_hidden_layers) ** -0.5) * factor
        fc_std = (2 * config.hidden_size) ** -0.5 * factor

        self.fc1 = tf.keras.layers.Dense(

@@ -753,14 +753,14 @@ class TFCLIPMainLayer(tf.keras.layers.Layer):
        self.visual_projection = tf.keras.layers.Dense(
            units=self.projection_dim,
-            kernel_initializer=get_initializer(vision_config.hidden_size ** -0.5 * self.config.initializer_factor),
+            kernel_initializer=get_initializer(vision_config.hidden_size**-0.5 * self.config.initializer_factor),
            use_bias=False,
            name="visual_projection",
        )
        self.text_projection = tf.keras.layers.Dense(
            units=self.projection_dim,
-            kernel_initializer=get_initializer(text_config.hidden_size ** -0.5 * self.config.initializer_factor),
+            kernel_initializer=get_initializer(text_config.hidden_size**-0.5 * self.config.initializer_factor),
            use_bias=False,
            name="text_projection",
        )
src/transformers/models/clip/tokenization_clip.py

@@ -68,10 +68,10 @@ def bytes_to_unicode():
    )
    cs = bs[:]
    n = 0
-    for b in range(2 ** 8):
+    for b in range(2**8):
        if b not in bs:
            bs.append(b)
-            cs.append(2 ** 8 + n)
+            cs.append(2**8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))
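`bytes_to_unicode` (shared by the CLIP and GPT-2 byte-level BPE tokenizers) builds a reversible map from all 2**8 byte values to printable unicode characters: printable bytes map to themselves, the rest are remapped to code points 256 and above. A small usage sketch (the import path is the module shown above; the round trip below is an assumption-free consequence of the map being a bijection):

```python
from transformers.models.clip.tokenization_clip import bytes_to_unicode

byte_encoder = bytes_to_unicode()
assert len(byte_encoder) == 2**8                 # every byte has an image
assert len(set(byte_encoder.values())) == 2**8   # and the map is invertible

# Round trip: bytes -> printable chars -> bytes, so BPE can treat any
# byte sequence as ordinary text.
byte_decoder = {v: k for k, v in byte_encoder.items()}
word = "".join(byte_encoder[b] for b in "héllo".encode("utf-8"))
assert bytes(byte_decoder[c] for c in word).decode("utf-8") == "héllo"
```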
src/transformers/models/detr/modeling_detr.py

@@ -488,7 +488,7 @@ class DetrAttention(nn.Module):
        assert (
            self.head_dim * num_heads == self.embed_dim
        ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})."
-        self.scaling = self.head_dim ** -0.5
+        self.scaling = self.head_dim**-0.5

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
src/transformers/models/fsmt/modeling_fsmt.py

@@ -823,7 +823,7 @@ class Attention(nn.Module):
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
-        self.scaling = self.head_dim ** -0.5
+        self.scaling = self.head_dim**-0.5
        self.encoder_decoder_attention = encoder_decoder_attention
        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
src/transformers/models/funnel/modeling_funnel.py

@@ -278,7 +278,7 @@ class FunnelAttentionStructure(nn.Module):
            # Second type
            pos = pooled_pos
-            stride = 2 ** block_index
+            stride = 2**block_index
            rel_pos = self.relative_pos(pos, stride)

            rel_pos = rel_pos[:, None] + zero_offset

@@ -297,7 +297,7 @@ class FunnelAttentionStructure(nn.Module):
            # the previous block of the 1st real block. Since the 1st real
            # block always has position 1, the position of the previous block
            # will be at `1 - 2 ** block_index`.
-            cls_pos = pos_id.new_tensor([-(2 ** block_index) + 1])
+            cls_pos = pos_id.new_tensor([-(2**block_index) + 1])
            pooled_pos_id = pos_id[1:-1] if self.config.truncate_seq else pos_id[1:]
            return torch.cat([cls_pos, pooled_pos_id[::2]], 0)
        else:

@@ -454,7 +454,7 @@ class FunnelRelMultiheadAttention(nn.Module):
        self.post_proj = nn.Linear(n_head * d_head, d_model)

        self.layer_norm = nn.LayerNorm(d_model, eps=config.layer_norm_eps)
-        self.scale = 1.0 / (d_head ** 0.5)
+        self.scale = 1.0 / (d_head**0.5)

    def relative_positional_attention(self, position_embeds, q_head, context_len, cls_mask=None):
        """Relative attention score for the positional encodings"""
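Funnel Transformer pools the sequence between blocks, so positions at `block_index` advance with stride `2**block_index`, and the `[CLS]` token keeps the synthetic position `1 - 2**block_index` noted in the comment above. A small plain-Python sketch that mirrors the pooling logic shown in these hunks (illustrative only, no tensors, `truncate_seq` simplified):

```python
def pooled_positions(pos_id, block_index, truncate_seq=False):
    cls_pos = [-(2**block_index) + 1]      # synthetic [CLS] position
    rest = pos_id[1:-1] if truncate_seq else pos_id[1:]
    return cls_pos + rest[::2]             # keep every other position

# [CLS] sits at 1 - 2**1 = -1; the kept real positions stride by 2.
print(pooled_positions(list(range(8)), block_index=1))  # [-1, 1, 3, 5, 7]
```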
src/transformers/models/funnel/modeling_tf_funnel.py

@@ -231,7 +231,7 @@ class TFFunnelAttentionStructure:
            # Second type
            pos = pooled_pos
-            stride = 2 ** block_index
+            stride = 2**block_index
            rel_pos = self.relative_pos(pos, stride)

            # rel_pos = tf.expand_dims(rel_pos,1) + zero_offset

@@ -252,7 +252,7 @@ class TFFunnelAttentionStructure:
            # the previous block of the 1st real block. Since the 1st real
            # block always has position 1, the position of the previous block
            # will be at `1 - 2 ** block_index`.
-            cls_pos = tf.constant([-(2 ** block_index) + 1], dtype=pos_id.dtype)
+            cls_pos = tf.constant([-(2**block_index) + 1], dtype=pos_id.dtype)
            pooled_pos_id = pos_id[1:-1] if self.truncate_seq else pos_id[1:]
            return tf.concat([cls_pos, pooled_pos_id[::2]], 0)
        else:

@@ -400,7 +400,7 @@ class TFFunnelRelMultiheadAttention(tf.keras.layers.Layer):
        self.post_proj = tf.keras.layers.Dense(d_model, kernel_initializer=initializer, name="post_proj")
        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
-        self.scale = 1.0 / (d_head ** 0.5)
+        self.scale = 1.0 / (d_head**0.5)

    def build(self, input_shape):
        n_head, d_head, d_model = self.n_head, self.d_head, self.d_model
src/transformers/models/gpt2/tokenization_gpt2.py

@@ -78,10 +78,10 @@ def bytes_to_unicode():
    )
    cs = bs[:]
    n = 0
-    for b in range(2 ** 8):
+    for b in range(2**8):
        if b not in bs:
            bs.append(b)
-            cs.append(2 ** 8 + n)
+            cs.append(2**8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))
src/transformers/models/hubert/modeling_hubert.py

@@ -418,7 +418,7 @@ class HubertAttention(nn.Module):
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
-        self.scaling = self.head_dim ** -0.5
+        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
src/transformers/models/hubert/modeling_tf_hubert.py

@@ -741,7 +741,7 @@ class TFHubertAttention(tf.keras.layers.Layer):
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
-        self.scaling = self.head_dim ** -0.5
+        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder

        self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
src/transformers/models/ibert/quant_modules.py

@@ -327,16 +327,16 @@ class IntGELU(nn.Module):
    def int_erf(self, x_int, scaling_factor):
        b_int = torch.floor(self.coeff[1] / scaling_factor)
-        c_int = torch.floor(self.coeff[2] / scaling_factor ** 2)
+        c_int = torch.floor(self.coeff[2] / scaling_factor**2)
        sign = torch.sign(x_int)

        abs_int = torch.min(torch.abs(x_int), -b_int)
        y_int = sign * ((abs_int + b_int) ** 2 + c_int)
-        scaling_factor = scaling_factor ** 2 * self.coeff[0]
+        scaling_factor = scaling_factor**2 * self.coeff[0]

        # avoid overflow
-        y_int = floor_ste.apply(y_int / 2 ** self.const)
-        scaling_factor = scaling_factor * 2 ** self.const
+        y_int = floor_ste.apply(y_int / 2**self.const)
+        scaling_factor = scaling_factor * 2**self.const

        return y_int, scaling_factor
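Throughout these I-BERT modules a real value is represented as `x ≈ x_int * scaling_factor` with `x_int` integer-valued; when a formula squares the value, the scale is squared too, which is why `c_int` is built from `coeff[2] / scaling_factor**2` and the output scale becomes `scaling_factor**2 * coeff[0]`. A minimal sketch of that invariant with toy numbers (not the actual I-BERT coefficients; the flooring and clamping of the real code are elided):

```python
import torch

s = 0.05                                   # scaling factor: x ≈ x_int * s
x = torch.tensor([-0.40, 0.15, 0.30])
x_int = torch.round(x / s)                 # integer representation

a, b, c = 0.5, -1.0, 0.25                  # toy polynomial a * ((x + b) * x + c)
b_int = torch.floor(torch.tensor(b) / s)   # fold constants into the int domain
c_int = torch.floor(torch.tensor(c) / s**2)

z_int = (x_int + b_int) * x_int + c_int    # pure integer arithmetic
s_out = a * s**2                           # squaring x squares its scale

print(z_int * s_out)                       # tensor([0.4050, 0.0613, 0.0200])
print(a * ((x + b) * x + c))               # same values in float arithmetic
```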
@@ -388,9 +388,9 @@ class IntSoftmax(nn.Module):
    def int_polynomial(self, x_int, scaling_factor):
        with torch.no_grad():
            b_int = torch.floor(self.coef[1] / scaling_factor)
-            c_int = torch.floor(self.coef[2] / scaling_factor ** 2)
+            c_int = torch.floor(self.coef[2] / scaling_factor**2)
        z = (x_int + b_int) * x_int + c_int
-        scaling_factor = self.coef[0] * scaling_factor ** 2
+        scaling_factor = self.coef[0] * scaling_factor**2
        return z, scaling_factor

    def int_exp(self, x_int, scaling_factor):
@@ -402,7 +402,7 @@ class IntSoftmax(nn.Module):
        r = x_int - x0_int * q
        exp_int, exp_scaling_factor = self.int_polynomial(r, scaling_factor)
        exp_int = torch.clamp(floor_ste.apply(exp_int * 2 ** (self.const - q)), min=0)
-        scaling_factor = exp_scaling_factor / 2 ** self.const
+        scaling_factor = exp_scaling_factor / 2**self.const
        return exp_int, scaling_factor

    def forward(self, x, scaling_factor):
@@ -420,9 +420,9 @@ class IntSoftmax(nn.Module):
        exp_int = exp / exp_scaling_factor
        exp_int_sum = exp_int.sum(dim=-1, keepdim=True)
-        factor = floor_ste.apply(2 ** self.max_bit / exp_int_sum)
+        factor = floor_ste.apply(2**self.max_bit / exp_int_sum)
        exp_int = floor_ste.apply(exp_int * factor / 2 ** (self.max_bit - self.output_bit))
-        scaling_factor = 1 / 2 ** self.output_bit
+        scaling_factor = 1 / 2**self.output_bit
        return exp_int * scaling_factor, scaling_factor
@@ -460,9 +460,9 @@ class IntLayerNorm(nn.Module):
    def set_shift(self, y_int):
        with torch.no_grad():
-            y_sq_int = y_int ** 2
+            y_sq_int = y_int**2
            var_int = torch.sum(y_sq_int, axis=2, keepdim=True)
-            shift = (torch.log2(torch.sqrt(var_int / 2 ** self.max_bit)).ceil()).max()
+            shift = (torch.log2(torch.sqrt(var_int / 2**self.max_bit)).ceil()).max()
            shift_old = self.shift
            self.shift = torch.max(self.shift, shift)
            logger.info(f"Dynamic shift adjustment: {int(shift_old)} -> {int(self.shift)}")
@@ -473,8 +473,8 @@ class IntLayerNorm(nn.Module):
        to avoid overflow in the subsequent runs.
        """
        self.set_shift(y_int)  # adjusts `self.shift`
-        y_int_shifted = floor_ste.apply(y_int / 2 ** self.shift)
-        y_sq_int = y_int_shifted ** 2
+        y_int_shifted = floor_ste.apply(y_int / 2**self.shift)
+        y_sq_int = y_int_shifted**2
        var_int = torch.sum(y_sq_int, axis=2, keepdim=True)
        return var_int
@@ -482,7 +482,7 @@ class IntLayerNorm(nn.Module):
        if not self.quant_mode:
            mean = x.mean(axis=2, keepdim=True)
            y = x - mean
-            var = torch.mean(y ** 2, axis=2, keepdim=True)
+            var = torch.mean(y**2, axis=2, keepdim=True)
            x = y / torch.sqrt(self.eps + var)
            x = x * self.weight + self.bias
            return x, None
@@ -496,25 +496,25 @@ class IntLayerNorm(nn.Module):
        x_int = x / scaling_factor
        mean_int = round_ste.apply(x_int.mean(axis=2, keepdim=True))
        y_int = x_int - mean_int
-        y_int_shifted = floor_ste.apply(y_int / 2 ** self.shift)
-        y_sq_int = y_int_shifted ** 2
+        y_int_shifted = floor_ste.apply(y_int / 2**self.shift)
+        y_sq_int = y_int_shifted**2
        var_int = torch.sum(y_sq_int, axis=2, keepdim=True)

        # overflow handling in training time
        if self.training:
            # if overflow is detected
-            if var_int.max() >= 2 ** self.max_bit:
+            if var_int.max() >= 2**self.max_bit:
                var_int = self.overflow_fallback(y_int)
-                assert var_int.max() < 2 ** self.max_bit + 0.1, (
+                assert var_int.max() < 2**self.max_bit + 0.1, (
                    "Error detected in overflow handling: "
                    "`var_int` exceeds `self.max_bit` (the maximum possible bit width)"
                )

        # To be replaced with integer-sqrt kernel that produces the same output
-        std_int = floor_ste.apply(torch.sqrt(var_int)) * 2 ** self.shift
-        factor = floor_ste.apply(2 ** 31 / std_int)
+        std_int = floor_ste.apply(torch.sqrt(var_int)) * 2**self.shift
+        factor = floor_ste.apply(2**31 / std_int)
        y_int = floor_ste.apply(y_int * factor / 2)
-        scaling_factor = self.dim_sqrt / 2 ** 30
+        scaling_factor = self.dim_sqrt / 2**30
        # scaling and shifting
        bias = self.bias.data.detach() / (self.weight.data.detach())
@@ -725,7 +725,7 @@ def batch_frexp(inputs, max_bit=31):
    tmp_m = []
    for m in output_m:
        int_m_shifted = int(
-            decimal.Decimal(m * (2 ** max_bit)).quantize(decimal.Decimal("1"), rounding=decimal.ROUND_HALF_UP)
+            decimal.Decimal(m * (2**max_bit)).quantize(decimal.Decimal("1"), rounding=decimal.ROUND_HALF_UP)
        )
        tmp_m.append(int_m_shifted)
    output_m = np.array(tmp_m)
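`batch_frexp` decomposes each rescaling factor into an integer mantissa and a power-of-two exponent so `FixedPointMul` can apply it as an integer multiply followed by a shift. A scalar sketch of the decomposition using NumPy's `frexp` (toy value; the real function operates on whole tensors):

```python
import numpy as np

# scale == m * 2.0**e with 0.5 <= m < 1; quantizing m to max_bit bits lets
# the rescale run as integer multiply + power-of-two division.
max_bit = 31
scale = 0.0137

m, e = np.frexp(scale)
m_int = int(round(m * 2**max_bit))     # 31-bit integer mantissa

z_int = 12345                          # some quantized accumulator value
out = round(z_int * m_int / 2.0 ** (max_bit - e))

print(out, z_int * scale)              # integer result ≈ float rescaling (169 vs 169.13)
```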
@@ -796,7 +796,7 @@ class FixedPointMul(Function):
        m, e = batch_frexp(new_scale)

        output = z_int.type(torch.double) * m.type(torch.double)
-        output = torch.round(output / (2.0 ** e))
+        output = torch.round(output / (2.0**e))

        if identity is not None:
            # needs addition of identity activation
@@ -809,7 +809,7 @@ class FixedPointMul(Function):
            m1, e1 = batch_frexp(new_scale)
            output1 = wx_int.type(torch.double) * m1.type(torch.double)
-            output1 = torch.round(output1 / (2.0 ** e1))
+            output1 = torch.round(output1 / (2.0**e1))

            output = output1 + output