Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
81a911cc
Commit
81a911cc
authored
Dec 19, 2019
by
Morgan Funtowicz
Browse files
Doc, doc, ... doc.
parent
faef6f61
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
58 additions
and
4 deletions
+58
-4
transformers/pipelines.py
transformers/pipelines.py
+58
-4
No files found.
transformers/pipelines.py
View file @
81a911cc
...
...
@@ -102,9 +102,19 @@ class PipelineDataFormat:
@
abstractmethod
def
save
(
self
,
data
:
dict
):
"""
Save the provided data object with the representation for the current `DataFormat`.
:param data: data to store
:return:
"""
raise
NotImplementedError
()
def
save_binary
(
self
,
data
:
Union
[
dict
,
List
[
dict
]])
->
str
:
"""
Save the provided data object as pickle-formatted binary data on disk.
:param data: data to store
:return: (str) Path where the data has been saved
"""
path
,
_
=
os
.
path
.
splitext
(
self
.
output
)
binary_path
=
os
.
path
.
extsep
.
join
((
path
,
'pickle'
))
...
...
@@ -222,6 +232,42 @@ class Pipeline(_ScikitCompat):
Base class implementing pipelined operations.
Pipeline workflow is defined as a sequence of the following operations:
Input -> Tokenization -> Model Inference -> Post-Processing (Task dependent) -> Output
Pipeline supports running on CPU or GPU through the device argument. Users can specify the
device argument as an integer, -1 meaning "CPU", >= 0 referring to the CUDA device ordinal.
Some pipelines, such as FeatureExtractionPipeline ('feature-extraction'), output large
tensor objects as nested lists. In order to avoid dumping such large structures as textual data, we
provide the binary_output constructor argument. If set to True, the output will be stored in the
pickle format.
Arguments:
**model**: ``(str, PreTrainedModel, TFPreTrainedModel)``:
Reference to the model to use through this pipeline.
**tokenizer**: ``(str, PreTrainedTokenizer)``:
Reference to the tokenizer to use through this pipeline.
**args_parser**: ``ArgumentHandler``:
Reference to the object in charge of parsing supplied pipeline parameters.
**device**: ``int``:
Device ordinal for CPU/GPU support. Setting this to -1 will leverage CPU, >=0 will run the model
on the associated CUDA device id.
**binary_output**: ``bool`` (default: False):
Flag indicating if the output of the pipeline should happen in a binary format (i.e. pickle) or as raw text.
Return:
Pipeline returns a list or a dictionary depending on:
- Whether the user provided multiple samples
- Whether the pipeline exposes multiple fields in the output object
Examples:
nlp = pipeline('ner')
nlp = pipeline('ner', model='...', config='...', tokenizer='...')
nlp = NerPipeline(model='...', config='...', tokenizer='...')
nlp = QuestionAnsweringPipeline(model=AutoModel.from_pretrained('...'), tokenizer='...')
"""
def
__init__
(
self
,
model
,
tokenizer
:
PreTrainedTokenizer
=
None
,
args_parser
:
ArgumentHandler
=
None
,
device
:
int
=
-
1
,
...
...
@@ -312,11 +358,11 @@ class Pipeline(_ScikitCompat):
# Encode for forward
with
self
.
device_placement
():
# TODO : Remove this 512 hard-limit
inputs
=
self
.
tokenizer
.
batch_encode_plus
(
inputs
,
add_special_tokens
=
True
,
return_tensors
=
'tf'
if
is_tf_available
()
else
'pt'
,
# max_length=self.model.config.max_position_embedding
max_length
=
511
max_length
=
512
)
# Filter out features not available on specific models
...
...
@@ -385,6 +431,8 @@ class NerPipeline(Pipeline):
# Manage correct placement of the tensors
with
self
.
device_placement
():
# TODO : Remove this 512 hard-limit
tokens
=
self
.
tokenizer
.
encode_plus
(
sentence
,
return_attention_mask
=
False
,
return_tensors
=
'tf'
if
is_tf_available
()
else
'pt'
,
...
...
@@ -488,9 +536,12 @@ class QuestionAnsweringPipeline(Pipeline):
QuestionAnsweringPipeline leverages the SquadExample/SquadFeatures internally.
This helper method encapsulates all the logic for converting question(s) and context(s) to SquadExample(s).
We currently support extractive question answering.
Args:
Arg
ument
s:
question: (str, List[str]) The question to be asked for the associated context
context: (str, List[str]) The context in which we will look for the answer.
Returns:
SquadExample initialized with the corresponding question and context.
"""
if
isinstance
(
question
,
list
):
return
[
SquadExample
(
None
,
q
,
c
,
None
,
None
,
None
)
for
q
,
c
in
zip
(
question
,
context
)]
...
...
@@ -717,7 +768,10 @@ def pipeline(task: str, model: Optional = None,
Some (optional) post processing for enhancing model's output
Examples:
pipeline('ner')
pipeline('sentiment-analysis')
pipeline('question-answering', model='distilbert-base-uncased-distilled-squad', tokenizer='bert-base-cased')
pipeline('ner', model=AutoModel.from_pretrained(...), tokenizer=AutoTokenizer.from_pretrained(...))
pipeline('ner', model='https://...pytorch-model.bin', config='https://...config.json', tokenizer='bert-base-cased')
"""
# Try to infer tokenizer from model name (if provided as str)
if
tokenizer
is
None
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment