Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
941b4442
Commit
941b4442
authored
Jun 23, 2021
by
Lysandre
Browse files
Temporarily revert the `fill-mask` improvements.
parent
4bdff2cd
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
30 additions
and
81 deletions
+30
-81
src/transformers/pipelines/fill_mask.py
src/transformers/pipelines/fill_mask.py
+26
-52
tests/test_pipelines_fill_mask.py
tests/test_pipelines_fill_mask.py
+4
-29
No files found.
src/transformers/pipelines/fill_mask.py
View file @
941b4442
...
@@ -98,9 +98,9 @@ class FillMaskPipeline(Pipeline):
...
@@ -98,9 +98,9 @@ class FillMaskPipeline(Pipeline):
args (:obj:`str` or :obj:`List[str]`):
args (:obj:`str` or :obj:`List[str]`):
One or several texts (or one list of prompts) with masked tokens.
One or several texts (or one list of prompts) with masked tokens.
targets (:obj:`str` or :obj:`List[str]`, `optional`):
targets (:obj:`str` or :obj:`List[str]`, `optional`):
When passed, the model will
limit
the scores
to
the passed t
argets instead of looking up i
n the
whole
When passed, the model will
return
the scores
for
the passed t
oken or tokens rather tha
n the
top k
vocab
. If the provided targets are not in the model vocab, they will be
tokenized and the first
predictions in the entire vocabulary
. If the provided targets are not in the model vocab, they will be
resulting token will be used (with a warning
, and that might be slower
).
tokenized and the first
resulting token will be used (with a warning).
top_k (:obj:`int`, `optional`):
top_k (:obj:`int`, `optional`):
When passed, overrides the number of predictions to return.
When passed, overrides the number of predictions to return.
...
@@ -115,56 +115,25 @@ class FillMaskPipeline(Pipeline):
...
@@ -115,56 +115,25 @@ class FillMaskPipeline(Pipeline):
inputs
=
self
.
_parse_and_tokenize
(
*
args
,
**
kwargs
)
inputs
=
self
.
_parse_and_tokenize
(
*
args
,
**
kwargs
)
outputs
=
self
.
_forward
(
inputs
,
return_tensors
=
True
)
outputs
=
self
.
_forward
(
inputs
,
return_tensors
=
True
)
# top_k must be defined
if
top_k
is
None
:
top_k
=
self
.
top_k
results
=
[]
results
=
[]
batch_size
=
outputs
.
shape
[
0
]
if
self
.
framework
==
"tf"
else
outputs
.
size
(
0
)
batch_size
=
outputs
.
shape
[
0
]
if
self
.
framework
==
"tf"
else
outputs
.
size
(
0
)
if
targets
is
not
None
:
if
targets
is
not
None
:
if
len
(
targets
)
==
0
or
len
(
targets
[
0
])
==
0
:
raise
ValueError
(
"At least one target must be provided when passed."
)
if
isinstance
(
targets
,
str
):
if
isinstance
(
targets
,
str
):
targets
=
[
targets
]
targets
=
[
targets
]
try
:
targets_proc
=
[]
vocab
=
self
.
tokenizer
.
get_vocab
()
except
Exception
:
vocab
=
{}
target_ids
=
[]
for
target
in
targets
:
for
target
in
targets
:
id_
=
vocab
.
get
(
target
,
None
)
target_enc
=
self
.
tokenizer
.
tokenize
(
target
)
if
id_
is
None
:
if
len
(
target_enc
)
>
1
or
target_enc
[
0
]
==
self
.
tokenizer
.
unk_token
:
input_ids
=
self
.
tokenizer
(
target
,
add_special_tokens
=
False
,
return_attention_mask
=
False
,
return_token_type_ids
=
False
,
max_length
=
1
,
truncation
=
True
,
)[
"input_ids"
]
if
len
(
input_ids
)
==
0
:
logger
.
warning
(
f
"The specified target token `
{
target
}
` does not exist in the model vocabulary. "
f
"We cannot replace it with anything meaningful, ignoring it"
)
continue
id_
=
input_ids
[
0
]
# XXX: If users encounter this pass
# it becomes pretty slow, so let's make sure
# The warning enables them to fix the input to
# get faster performance.
logger
.
warning
(
logger
.
warning
(
f
"The specified target token `
{
target
}
` does not exist in the model vocabulary. "
f
"The specified target token `
{
target
}
` does not exist in the model vocabulary. "
f
"Replacing with `
{
self
.
tokenizer
.
convert_ids_to_tokens
(
id_
)
}
`."
f
"Replacing with `
{
target_enc
[
0
]
}
`."
)
)
target_ids
.
append
(
id_
)
targets_proc
.
append
(
target_enc
[
0
])
target_ids
=
list
(
set
(
target_ids
))
target_inds
=
np
.
array
(
self
.
tokenizer
.
convert_tokens_to_ids
(
targets_proc
))
if
len
(
target_ids
)
==
0
:
raise
ValueError
(
"At least one target must be provided when passed."
)
target_ids
=
np
.
array
(
target_ids
)
# Cap top_k if there are targets
if
top_k
>
target_ids
.
shape
[
0
]:
top_k
=
target_ids
.
shape
[
0
]
for
i
in
range
(
batch_size
):
for
i
in
range
(
batch_size
):
input_ids
=
inputs
[
"input_ids"
][
i
]
input_ids
=
inputs
[
"input_ids"
][
i
]
...
@@ -178,11 +147,14 @@ class FillMaskPipeline(Pipeline):
...
@@ -178,11 +147,14 @@ class FillMaskPipeline(Pipeline):
logits
=
outputs
[
i
,
masked_index
.
item
(),
:]
logits
=
outputs
[
i
,
masked_index
.
item
(),
:]
probs
=
tf
.
nn
.
softmax
(
logits
)
probs
=
tf
.
nn
.
softmax
(
logits
)
if
targets
is
not
None
:
if
targets
is
None
:
probs
=
tf
.
gather_nd
(
probs
,
tf
.
reshape
(
target_ids
,
(
-
1
,
1
)))
topk
=
tf
.
math
.
top_k
(
probs
,
k
=
top_k
if
top_k
is
not
None
else
self
.
top_k
)
values
,
predictions
=
topk
.
values
.
numpy
(),
topk
.
indices
.
numpy
()
topk
=
tf
.
math
.
top_k
(
probs
,
k
=
top_k
)
else
:
values
,
predictions
=
topk
.
values
.
numpy
(),
topk
.
indices
.
numpy
()
values
=
tf
.
gather_nd
(
probs
,
tf
.
reshape
(
target_inds
,
(
-
1
,
1
)))
sort_inds
=
tf
.
reverse
(
tf
.
argsort
(
values
),
[
0
])
values
=
tf
.
gather_nd
(
values
,
tf
.
reshape
(
sort_inds
,
(
-
1
,
1
))).
numpy
()
predictions
=
target_inds
[
sort_inds
.
numpy
()]
else
:
else
:
masked_index
=
torch
.
nonzero
(
input_ids
==
self
.
tokenizer
.
mask_token_id
,
as_tuple
=
False
)
masked_index
=
torch
.
nonzero
(
input_ids
==
self
.
tokenizer
.
mask_token_id
,
as_tuple
=
False
)
...
@@ -191,11 +163,13 @@ class FillMaskPipeline(Pipeline):
...
@@ -191,11 +163,13 @@ class FillMaskPipeline(Pipeline):
logits
=
outputs
[
i
,
masked_index
.
item
(),
:]
logits
=
outputs
[
i
,
masked_index
.
item
(),
:]
probs
=
logits
.
softmax
(
dim
=
0
)
probs
=
logits
.
softmax
(
dim
=
0
)
if
targets
is
None
:
if
targets
is
not
None
:
values
,
predictions
=
probs
.
topk
(
top_k
if
top_k
is
not
None
else
self
.
top_k
)
probs
=
probs
[...,
target_ids
]
else
:
values
=
probs
[...,
target_inds
]
values
,
predictions
=
probs
.
topk
(
top_k
)
sort_inds
=
list
(
reversed
(
values
.
argsort
(
dim
=-
1
)))
values
=
values
[...,
sort_inds
]
predictions
=
target_inds
[
sort_inds
]
for
v
,
p
in
zip
(
values
.
tolist
(),
predictions
.
tolist
()):
for
v
,
p
in
zip
(
values
.
tolist
(),
predictions
.
tolist
()):
tokens
=
input_ids
.
numpy
()
tokens
=
input_ids
.
numpy
()
...
...
tests/test_pipelines_fill_mask.py
View file @
941b4442
...
@@ -78,8 +78,7 @@ class FillMaskPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase):
...
@@ -78,8 +78,7 @@ class FillMaskPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase):
@
require_torch
@
require_torch
def
test_torch_fill_mask_with_targets
(
self
):
def
test_torch_fill_mask_with_targets
(
self
):
valid_inputs
=
[
"My name is <mask>"
]
valid_inputs
=
[
"My name is <mask>"
]
# ' Sam' will yield a warning but work
valid_targets
=
[[
" Teven"
,
" Patrick"
,
" Clara"
],
[
" Sam"
]]
valid_targets
=
[[
" Teven"
,
"ĠPatrick"
,
"ĠClara"
],
[
"ĠSam"
],
[
" Sam"
]]
invalid_targets
=
[[],
[
""
],
""
]
invalid_targets
=
[[],
[
""
],
""
]
for
model_name
in
self
.
small_models
:
for
model_name
in
self
.
small_models
:
unmasker
=
pipeline
(
task
=
"fill-mask"
,
model
=
model_name
,
tokenizer
=
model_name
,
framework
=
"pt"
)
unmasker
=
pipeline
(
task
=
"fill-mask"
,
model
=
model_name
,
tokenizer
=
model_name
,
framework
=
"pt"
)
...
@@ -90,34 +89,10 @@ class FillMaskPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase):
...
@@ -90,34 +89,10 @@ class FillMaskPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase):
for
targets
in
invalid_targets
:
for
targets
in
invalid_targets
:
self
.
assertRaises
(
ValueError
,
unmasker
,
valid_inputs
,
targets
=
targets
)
self
.
assertRaises
(
ValueError
,
unmasker
,
valid_inputs
,
targets
=
targets
)
@
require_torch
def
test_torch_fill_mask_with_targets_and_topk
(
self
):
model_name
=
self
.
small_models
[
0
]
unmasker
=
pipeline
(
task
=
"fill-mask"
,
model
=
model_name
,
tokenizer
=
model_name
,
framework
=
"pt"
)
targets
=
[
" Teven"
,
"ĠPatrick"
,
"ĠClara"
]
top_k
=
2
outputs
=
unmasker
(
"My name is <mask>"
,
targets
=
targets
,
top_k
=
top_k
)
self
.
assertEqual
(
len
(
outputs
),
2
)
@
require_torch
def
test_torch_fill_mask_with_duplicate_targets_and_topk
(
self
):
model_name
=
self
.
small_models
[
0
]
unmasker
=
pipeline
(
task
=
"fill-mask"
,
model
=
model_name
,
tokenizer
=
model_name
,
framework
=
"pt"
)
# String duplicates + id duplicates
targets
=
[
" Teven"
,
"ĠPatrick"
,
"ĠClara"
,
"ĠClara"
,
" Clara"
]
top_k
=
10
outputs
=
unmasker
(
"My name is <mask>"
,
targets
=
targets
,
top_k
=
top_k
)
# The target list contains duplicates, so we can't output more
# than them
self
.
assertEqual
(
len
(
outputs
),
3
)
@
require_tf
@
require_tf
def
test_tf_fill_mask_with_targets
(
self
):
def
test_tf_fill_mask_with_targets
(
self
):
valid_inputs
=
[
"My name is <mask>"
]
valid_inputs
=
[
"My name is <mask>"
]
# ' Sam' will yield a warning but work
valid_targets
=
[[
" Teven"
,
" Patrick"
,
" Clara"
],
[
" Sam"
]]
valid_targets
=
[[
" Teven"
,
"ĠPatrick"
,
"ĠClara"
],
[
"ĠSam"
],
[
" Sam"
]]
invalid_targets
=
[[],
[
""
],
""
]
invalid_targets
=
[[],
[
""
],
""
]
for
model_name
in
self
.
small_models
:
for
model_name
in
self
.
small_models
:
unmasker
=
pipeline
(
task
=
"fill-mask"
,
model
=
model_name
,
tokenizer
=
model_name
,
framework
=
"tf"
)
unmasker
=
pipeline
(
task
=
"fill-mask"
,
model
=
model_name
,
tokenizer
=
model_name
,
framework
=
"tf"
)
...
@@ -136,7 +111,7 @@ class FillMaskPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase):
...
@@ -136,7 +111,7 @@ class FillMaskPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase):
"My name is <mask>"
,
"My name is <mask>"
,
"The largest city in France is <mask>"
,
"The largest city in France is <mask>"
,
]
]
valid_targets
=
[
"
Ġ
Patrick"
,
"
Ġ
Clara"
]
valid_targets
=
[
"
Patrick"
,
"
Clara"
]
for
model_name
in
self
.
large_models
:
for
model_name
in
self
.
large_models
:
unmasker
=
pipeline
(
unmasker
=
pipeline
(
task
=
"fill-mask"
,
task
=
"fill-mask"
,
...
@@ -209,7 +184,7 @@ class FillMaskPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase):
...
@@ -209,7 +184,7 @@ class FillMaskPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase):
"My name is <mask>"
,
"My name is <mask>"
,
"The largest city in France is <mask>"
,
"The largest city in France is <mask>"
,
]
]
valid_targets
=
[
"
Ġ
Patrick"
,
"
Ġ
Clara"
]
valid_targets
=
[
"
Patrick"
,
"
Clara"
]
for
model_name
in
self
.
large_models
:
for
model_name
in
self
.
large_models
:
unmasker
=
pipeline
(
task
=
"fill-mask"
,
model
=
model_name
,
tokenizer
=
model_name
,
framework
=
"tf"
,
top_k
=
2
)
unmasker
=
pipeline
(
task
=
"fill-mask"
,
model
=
model_name
,
tokenizer
=
model_name
,
framework
=
"tf"
,
top_k
=
2
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment