Commit 40a39ab6 authored Dec 10, 2019 by Morgan Funtowicz

    Reuse recent SQuAD refactored data structure inside QA pipelines.

parent aae74065
Showing 3 changed files with 59 additions and 39 deletions:

    transformers/data/processors/__init__.py   +1   -1
    transformers/modeling_auto.py              +3   -9
    transformers/pipelines.py                  +55  -29
transformers/data/processors/__init__.py

-from .utils import InputExample, InputFeatures, DataProcessor, SingleSentenceClassificationProcessor, convert_examples_to_features
+from .utils import InputExample, InputFeatures, DataProcessor, SingleSentenceClassificationProcessor
 from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
 from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample, SquadV1Processor, SquadV2Processor
 from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels
\ No newline at end of file
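Usage sketch (not from the commit): the processors package re-exports the refactored SQuAD structures, so they can be built directly. This assumes the positional order SquadExample(qas_id, question_text, context_text, answer_text, start_position_character, title) that the pipeline code below relies on:

    from transformers.data.processors import SquadExample

    # A bare example with no answer annotation, exactly as the QA pipeline
    # constructs them with SquadExample(None, question, context, None, None, None).
    example = SquadExample(None, 'Where is Hugging Face based?',
                           'Hugging Face is based in New York City.', None, None, None)
    print(example.question_text, '|', example.context_text)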
transformers/modeling_auto.py

@@ -31,7 +31,7 @@ from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassi
 from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering
 from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification
 from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification
-from .modeling_camembert import CamembertModel, CamembertForQuestionAnswering, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForMultipleChoice
+from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForMultipleChoice
 from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, AlbertForQuestionAnswering
 from .modeling_utils import PreTrainedModel, SequenceSummary

@@ -294,10 +294,6 @@ class AutoModelWithLMHead(object):
             return XLMWithLMHeadModel(config)
         elif isinstance(config, CTRLConfig):
             return CTRLLMHeadModel(config)
-        elif isinstance(config, AlbertConfig):
-            return AlbertLMHeadModel(config)
-        elif isinstance(config, CamembertConfig):
-            return CamembertLMHeadModel(config)
         raise ValueError("Unrecognized configuration class {}".format(config))

     @classmethod

@@ -454,7 +450,7 @@ class AutoModelForSequenceClassification(object):
         """
         if isinstance(config, AlbertConfig):
             return AlbertForSequenceClassification(config)
-        elif isintance(config, CamembertConfig):
+        elif isinstance(config, CamembertConfig):
             return CamembertForSequenceClassification(config)
         elif isinstance(config, DistilBertConfig):
             return DistilBertForSequenceClassification(config)

@@ -606,10 +602,8 @@ class AutoModelForQuestionAnswering(object):
             config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
             model = AutoModelForSequenceClassification.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
         """
-        if isintance(config, AlbertConfig):
+        if isinstance(config, AlbertConfig):
             return AlbertForQuestionAnswering(config)
-        elif isintance(config, CamembertConfig):
-            return CamembertForQuestionAnswering(config)
         elif isinstance(config, DistilBertConfig):
             return DistilBertForQuestionAnswering(config)
         elif isinstance(config, BertConfig):
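Usage sketch (not from the commit): with the Camembert branch removed and the isintance typos fixed, AutoModelForQuestionAnswering.from_config dispatches over the remaining configs through the isinstance chain above. A minimal sketch, assuming a default (untrained) config:

    from transformers import DistilBertConfig, AutoModelForQuestionAnswering

    config = DistilBertConfig()
    model = AutoModelForQuestionAnswering.from_config(config)
    print(type(model).__name__)  # DistilBertForQuestionAnswering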
transformers/pipelines.py

@@ -20,7 +20,8 @@ from typing import Union, Optional, Tuple, List, Dict
 import numpy as np

-from transformers import is_tf_available, is_torch_available, logger, AutoTokenizer, PreTrainedTokenizer
+from transformers import is_tf_available, is_torch_available, logger, AutoTokenizer, PreTrainedTokenizer, \
+    SquadExample, squad_convert_examples_to_features

 if is_tf_available():
     from transformers import TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering
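Quick sanity sketch (not from the commit): the new import line assumes SquadExample and squad_convert_examples_to_features resolve from the package root, which the processors __init__.py change above provides:

    # Both names should resolve at top level after this commit.
    from transformers import SquadExample, squad_convert_examples_to_features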
@@ -107,13 +108,28 @@ class QuestionAnsweringPipeline(Pipeline):
         super().__init__(model, tokenizer)

     @staticmethod
-    def create_sample(question: Union[str, List[str]], context: Union[str, List[str]]) -> Union[dict, List[Dict]]:
+    def create_sample(question: Union[str, List[str]], context: Union[str, List[str]]) -> Union[SquadExample, List[SquadExample]]:
         is_list = isinstance(question, list)

         if is_list:
-            return [{'question': q, 'context': c} for q, c in zip(question, context)]
+            return [SquadExample(None, q, c, None, None, None) for q, c in zip(question, context)]
         else:
-            return {'question': question, 'context': context}
+            return SquadExample(None, question, context, None, None, None)
+
+    def inputs_for_model(self, features: Union[SquadExample, List[SquadExample]]) -> Dict:
+        args = ['input_ids', 'attention_mask']
+        model_type = type(self.model).__name__.lower()
+
+        if 'distilbert' not in model_type and 'xlm' not in model_type:
+            args += ['token_type_ids']
+
+        if 'xlnet' in model_type or 'xlm' in model_type:
+            args += ['cls_index', 'p_mask']
+
+        if isinstance(features, SquadExample):
+            return {k: features.__dict__[k] for k in args}
+        else:
+            return {k: [feature.__dict__[k] for feature in features] for k in args}

     @classmethod
     def from_config(cls, model, tokenizer: PreTrainedTokenizer, **kwargs):
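Sketch (not from the commit): create_sample now returns SquadExample objects instead of plain dicts, and the new inputs_for_model helper selects the per-model argument set from the computed features. A self-contained illustration of the selection logic, with a hypothetical model class name standing in for type(self.model).__name__.lower():

    model_type = 'distilbertforquestionanswering'

    args = ['input_ids', 'attention_mask']
    if 'distilbert' not in model_type and 'xlm' not in model_type:
        args += ['token_type_ids']       # BERT-like models also take segment ids
    if 'xlnet' in model_type or 'xlm' in model_type:
        args += ['cls_index', 'p_mask']  # XLNet/XLM heads take extra inputs

    print(args)  # ['input_ids', 'attention_mask'] for DistilBert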
@@ -121,8 +137,11 @@ class QuestionAnsweringPipeline(Pipeline):
     def __call__(self, *texts, **kwargs):
         # Set defaults values
-        kwargs.setdefault('max_answer_len', 15)
         kwargs.setdefault('topk', 1)
+        kwargs.setdefault('doc_stride', 128)
+        kwargs.setdefault('max_answer_len', 15)
+        kwargs.setdefault('max_seq_len', 384)
+        kwargs.setdefault('max_question_len', 64)

         if kwargs['topk'] < 1:
             raise ValueError('topk parameter should be >= 1 (got {})'.format(kwargs['topk']))
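Usage sketch (not from the commit): the new defaults expose the SQuAD feature-conversion knobs directly on the call. Assuming a pipeline instance qa built from a question-answering model and tokenizer (construction is unchanged by this commit):

    # qa = QuestionAnsweringPipeline(model, tokenizer)  # hypothetical instance
    answers = qa(question='What does the pipeline return?',
                 context='The pipeline returns a dict with score, start, end and answer.',
                 topk=1, doc_stride=128, max_seq_len=384,
                 max_question_len=64, max_answer_len=15)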
@@ -130,56 +149,63 @@ class QuestionAnsweringPipeline(Pipeline):
         if kwargs['max_answer_len'] < 1:
             raise ValueError('max_answer_len parameter should be >= 1 (got {})'.format(kwargs['max_answer_len']))

-        # Tabular input
-        if 'question' in kwargs and 'context' in kwargs:
-            texts = QuestionAnsweringPipeline.create_sample(kwargs['question'], kwargs['context'])
-        elif 'data' in kwargs:
-            texts = kwargs['data']
+        # Position args
+        if texts is not None and len(texts) > 1:
+            (texts,) = texts
+
         # Generic compatibility with sklearn and Keras
         elif 'X' in kwargs and not texts:
             texts = kwargs.pop('X')
-        else:
-            (texts,) = texts
-
-        if not isinstance(texts, (dict, list)):
-            raise Exception('QuestionAnsweringPipeline requires predict argument to be a tuple (context, question) or a List of dict.')
+
+        # Batched data
+        elif 'data' in kwargs:
+            texts = kwargs.pop('data')
+
+        # Tabular input
+        elif 'question' in kwargs and 'context' in kwargs:
+            texts = QuestionAnsweringPipeline.create_sample(kwargs['question'], kwargs['context'])
+        else:
+            raise ValueError('Unknown arguments {}'.format(kwargs))

         if not isinstance(texts, list):
             texts = [texts]

-        # Map to tuple (question, context)
-        texts = [(text['question'], text['context']) for text in texts]
-
-        inputs = self.tokenizer.batch_encode_plus(
-            texts, add_special_tokens=False,
-            return_tensors='tf' if is_tf_available() else 'pt',
-            return_attention_masks=True, return_input_lengths=False
-        )
-        token_type_ids = inputs.pop('token_type_ids')
+        # Convert inputs to features
+        features = squad_convert_examples_to_features(texts, self.tokenizer, kwargs['max_seq_len'], kwargs['doc_stride'], kwargs['max_question_len'], False)
+        fw_args = self.inputs_for_model(features)

         if is_tf_available():
-            # TODO trace model
-            start, end = self.model(inputs)
+            import tensorflow as tf
+            fw_args = {k: tf.constant(v) for (k, v) in fw_args.items()}
+            start, end = self.model(fw_args)
             start, end = start.numpy(), end.numpy()
         else:
             import torch
             with torch.no_grad():
                 # Retrieve the score for the context tokens only (removing question tokens)
-                start, end = self.model(**inputs)
+                fw_args = {k: torch.tensor(v) for (k, v) in fw_args.items()}
+                start, end = self.model(**fw_args)
                 start, end = start.cpu().numpy(), end.cpu().numpy()

         answers = []
-        for i in range(len(texts)):
-            context_idx = token_type_ids[i] == 1
-
-            start_, end_ = start[i, context_idx], end[i, context_idx]
+        for i, (example, feature, start_, end_) in enumerate(zip(texts, features, start, end)):
+            start_, end_ = start_ * np.abs(np.array(feature.p_mask) - 1), end_ * np.abs(np.array(feature.p_mask) - 1)

             # Normalize logits and spans to retrieve the answer
             start_ = np.exp(start_) / np.sum(np.exp(start_))
             end_ = np.exp(end_) / np.sum(np.exp(end_))

             starts, ends, scores = self.decode(start_, end_, kwargs['topk'], kwargs['max_answer_len'])
+            char_to_word = np.array(example.char_to_word_offset)

             # Convert the answer (tokens) back to the original text
             answers += [[
-                {**{'score': score}, **self.span_to_answer(texts[i][1], s, e)}
+                {
+                    'score': score,
+                    'start': np.where(char_to_word == feature.token_to_orig_map[s])[0][0],
+                    'end': np.where(char_to_word == feature.token_to_orig_map[e])[0][-1],
+                    'answer': ' '.join(example.doc_tokens[feature.token_to_orig_map[s]:feature.token_to_orig_map[e] + 1])
+                }
                 for s, e, score in zip(starts, ends, scores)
             ]]
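Worked sketch (not from the commit): the rewritten loop masks question tokens via p_mask instead of token_type_ids, softmax-normalizes the start/end logits, then maps the decoded token span back to the original text through char_to_word_offset and token_to_orig_map. Toy values stand in for model outputs and feature attributes:

    import numpy as np

    # Toy start logits for a 5-token window; p_mask is 1 on question/special tokens.
    start_ = np.array([0.1, 2.0, 0.3, 1.5, 0.2])
    p_mask = np.array([1, 0, 0, 0, 1])

    # Zero the logits of non-context positions, then normalize to probabilities.
    start_ = start_ * np.abs(p_mask - 1)
    start_ = np.exp(start_) / np.sum(np.exp(start_))

    # Map a predicted window-token index back to a word index in the original text.
    token_to_orig_map = {1: 0, 2: 1, 3: 2}          # window token -> doc token
    char_to_word = np.array([0, 0, 0, 1, 1, 2, 2])  # char offset -> doc token
    s = 1
    print(np.where(char_to_word == token_to_orig_map[s])[0][0])  # first char of answer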