chenpangpang / transformers

Commit a2410110 authored Dec 20, 2019 by thomwolf

fix pipeline NER

parent e37ca8e1
Showing 1 changed file with 11 additions and 20 deletions
transformers/pipelines.py  (+11 / -20)
@@ -463,7 +463,7 @@ class NerPipeline(Pipeline):
     def __init__(self, model, tokenizer: PreTrainedTokenizer = None, modelcard: ModelCard = None,
                  framework: Optional[str] = None, args_parser: ArgumentHandler = None, device: int = -1,
-                 binary_output: bool = False):
+                 binary_output: bool = False, ignore_labels=['O']):
         super().__init__(model=model, tokenizer=tokenizer, modelcard=modelcard,
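The only signature change is the new ignore_labels argument, defaulting to ['O'] so that tokens predicted as "outside any entity" are dropped from the output. A minimal usage sketch, assuming NerPipeline and the Auto classes are importable from the package top level at this point in history, and using 'my-ner-model' as a placeholder checkpoint id, not a real one:

from transformers import AutoModelForTokenClassification, AutoTokenizer, NerPipeline

model = AutoModelForTokenClassification.from_pretrained('my-ner-model')  # placeholder id
tokenizer = AutoTokenizer.from_pretrained('my-ner-model')                # placeholder id

# Default behaviour after this commit: predictions labelled 'O' are filtered out.
ner = NerPipeline(model=model, tokenizer=tokenizer)

# Passing an empty list keeps every token, 'O' predictions included.
ner_all = NerPipeline(model=model, tokenizer=tokenizer, ignore_labels=[])

Note that ignore_labels=['O'] is a mutable default argument; it is harmless here because the pipeline never mutates it, but callers wanting different behaviour should pass their own list.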
@@ -473,17 +473,12 @@ class NerPipeline(Pipeline):
                          binary_output=binary_output)

         self._basic_tokenizer = BasicTokenizer(do_lower_case=False)
+        self.ignore_labels = ignore_labels

     def __call__(self, *texts, **kwargs):
         inputs, answers = self._args_parser(*texts, **kwargs), []
         for sentence in inputs:
-            # Ugly token to word idx mapping (for now)
-            token_to_word, words = [], self._basic_tokenizer.tokenize(sentence)
-            for i, w in enumerate(words):
-                tokens = self.tokenizer.tokenize(w)
-                token_to_word += [i] * len(tokens)

             # Manage correct placement of the tensors
             with self.device_placement():
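The deleted block pre-tokenized each sentence with BasicTokenizer and recorded, for every wordpiece, which whole word it came from, so that per-wordpiece scores could later be regrouped per word. An illustrative sketch of what that mapping computed (toy tokenizer standing in for self.tokenizer.tokenize, not library code):

def toy_wordpiece(word):
    # Stand-in for the real subword tokenizer: 'running' splits into two pieces.
    return {'running': ['run', '##ning']}.get(word, [word])

words = ['he', 'was', 'running']      # what BasicTokenizer produced in the old code
token_to_word = []
for i, w in enumerate(words):
    token_to_word += [i] * len(toy_wordpiece(w))

print(token_to_word)                  # [0, 1, 2, 2] -- one word index per wordpiece

After this commit the pipeline skips that bookkeeping entirely: it labels each wordpiece on its own and recovers the surface form by decoding input_ids, as the third hunk shows.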
@@ -500,26 +495,22 @@ class NerPipeline(Pipeline):
                 with torch.no_grad():
                     entities = self.model(**tokens)[0][0].cpu().numpy()

             # Normalize scores
-            answer, token_start = [], 1
-            for idx, word in groupby(token_to_word):
-                # Sum log prob over token, then normalize across labels
-                score = np.exp(entities[token_start]) / np.exp(entities[token_start]).sum(-1, keepdims=True)
-                label_idx = score.argmax()
+            score = np.exp(entities) / np.exp(entities).sum(-1, keepdims=True)
+            labels_idx = score.argmax(axis=-1)

-                if label_idx > 0:
+            answer = []
+            for idx, label_idx in enumerate(labels_idx):
+                if self.model.config.id2label[label_idx] not in self.ignore_labels:
                     answer += [{
-                        'word': words[idx],
-                        'score': score[label_idx].item(),
+                        'word': self.tokenizer.decode(tokens['input_ids'][0][idx].cpu().tolist()),
+                        'score': score[idx][label_idx].item(),
                         'entity': self.model.config.id2label[label_idx]
                     }]
-
-                # Update token start
-                token_start += len(list(word))

             # Append
             answers += [answer]
         if len(answers) == 1:
             return answers[0]
         return answers
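This scoring change is the core of the fix: instead of softmaxing one row of logits per basic-tokenized word while tracking a token_start cursor, and hard-coding "label index 0 means ignore", the new code softmaxes the whole (sequence_length, num_labels) logits matrix at once, takes an argmax per token, and filters by label name against ignore_labels. A self-contained NumPy sketch of that vectorized path, with made-up toy logits:

import numpy as np

# Stand-in for the logits the model returns for one sentence:
# two tokens, three labels.
entities = np.array([[2.0, 0.1, 0.3],
                     [0.2, 3.0, 0.1]])

# Softmax over the label axis for every token at once ...
score = np.exp(entities) / np.exp(entities).sum(-1, keepdims=True)

# ... then the best label per token, replacing the old per-token loop.
labels_idx = score.argmax(axis=-1)

print(labels_idx)                      # [0 1]
print(score[0][labels_idx[0]].item())  # confidence of token 0's best label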