transformers · commit bbaaec04

authored Dec 20, 2019 by thomwolf
fixing CLI pipeline

parent 1c12ee0e
Showing 2 changed files with 58 additions and 41 deletions

transformers/commands/run.py   +22 -10
transformers/pipelines.py      +36 -31
transformers/commands/run.py
@@ -9,6 +9,9 @@ logger = logging.getLogger(__name__)  # pylint: disable=invalid-name

 def try_infer_format_from_ext(path: str):
+    if not path:
+        return 'pipe'
+
     for ext in PipelineDataFormat.SUPPORTED_FORMATS:
         if path.endswith(ext):
             return ext
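The new guard makes a missing --input default to the pipe format. A standalone sketch of that behavior (SUPPORTED_FORMATS inlined here for illustration):

# Standalone sketch of the inference above; SUPPORTED_FORMATS is inlined here
# (in the real code it lives on PipelineDataFormat).
SUPPORTED_FORMATS = ['json', 'csv', 'pipe']

def try_infer_format_from_ext(path):
    if not path:
        return 'pipe'  # no --input: read from stdin
    for ext in SUPPORTED_FORMATS:
        if path.endswith(ext):
            return ext

print(try_infer_format_from_ext(None))         # pipe
print(try_infer_format_from_ext('data.csv'))   # csv
print(try_infer_format_from_ext('data.json'))  # json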
@@ -20,9 +23,16 @@ def try_infer_format_from_ext(path: str):

 def run_command_factory(args):
-    nlp = pipeline(task=args.task, model=args.model, config=args.config, tokenizer=args.tokenizer, device=args.device)
+    nlp = pipeline(task=args.task, model=args.model if args.model else None, config=args.config, tokenizer=args.tokenizer, device=args.device)
     format = try_infer_format_from_ext(args.input) if args.format == 'infer' else args.format
-    reader = PipelineDataFormat.from_str(format, args.output, args.input, args.column)
+    reader = PipelineDataFormat.from_str(format=format, output_path=args.output, input_path=args.input, column=args.column if args.column else nlp.default_input_names)
     return RunCommand(nlp, reader)
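run_command_factory now passes keyword arguments to PipelineDataFormat.from_str and, when --column is omitted, falls back to the pipeline's default_input_names. A minimal sketch of that fallback, using a stand-in object rather than a real pipeline:

# Stand-in pipeline object (hypothetical), mirroring the class attribute added
# to QuestionAnsweringPipeline in pipelines.py below.
class FakePipeline:
    default_input_names = 'question,context'

nlp = FakePipeline()
args_column = None  # user did not pass --column

column = args_column if args_column else nlp.default_input_names
print(column)  # question,context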
@@ -35,24 +45,26 @@ class RunCommand(BaseTransformersCLICommand):

     @staticmethod
     def register_subcommand(parser: ArgumentParser):
         run_parser = parser.add_parser('run', help="Run a pipeline through the CLI")
-        run_parser.add_argument('--device', type=int, default=-1, help='Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)')
         run_parser.add_argument('--task', choices=SUPPORTED_TASKS.keys(), help='Task to run')
-        run_parser.add_argument('--model', type=str, required=True, help='Name or path to the model to instantiate.')
-        run_parser.add_argument('--input', type=str, help='Path to the file to use for inference')
-        run_parser.add_argument('--output', type=str, help='Path to the file that will be used post to write results.')
+        run_parser.add_argument('--model', type=str, help='Name or path to the model to instantiate.')
         run_parser.add_argument('--config', type=str, help='Name or path to the model\'s config to instantiate.')
         run_parser.add_argument('--tokenizer', type=str, help='Name of the tokenizer to use. (default: same as the model name)')
         run_parser.add_argument('--column', type=str, help='Name of the column to use as input. (For multi columns input as QA use column1,columns2)')
         run_parser.add_argument('--format', type=str, default='infer', choices=PipelineDataFormat.SUPPORTED_FORMATS, help='Input format to read from')
+        run_parser.add_argument('--input', type=str, help='Path to the file to use for inference')
+        run_parser.add_argument('--output', type=str, help='Path to the file that will be used post to write results.')
+        run_parser.add_argument('--device', type=int, default=-1, help='Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)')
         run_parser.set_defaults(func=run_command_factory)

     def run(self):
-        nlp, output = self._nlp, []
+        nlp, outputs = self._nlp, []

         for entry in self._reader:
-            if self._reader.is_multi_columns:
-                output += nlp(**entry)
-            else:
-                output += nlp(entry)
+            output = nlp(**entry) if self._reader.is_multi_columns else nlp(entry)
+            if isinstance(output, dict):
+                outputs.append(output)
+            else:
+                outputs += output

         # Saving data
         if self._nlp.binary_output:
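Note the reordering in register_subcommand above: --model is no longer required=True, and the I/O-related flags (--input, --output, --device) now come last. A self-contained replica of this argument set (choices for --task stubbed out, since SUPPORTED_TASKS and PipelineDataFormat are not imported here) shows the resulting defaults:

# Self-contained replica of the registered arguments, to inspect defaults.
from argparse import ArgumentParser

parser = ArgumentParser('transformers-cli (sketch)')
commands = parser.add_subparsers()
run_parser = commands.add_parser('run', help="Run a pipeline through the CLI")
run_parser.add_argument('--task', help='Task to run')
run_parser.add_argument('--model', type=str, help='Name or path to the model to instantiate.')
run_parser.add_argument('--config', type=str)
run_parser.add_argument('--tokenizer', type=str)
run_parser.add_argument('--column', type=str)
run_parser.add_argument('--format', type=str, default='infer', choices=['json', 'csv', 'pipe'])
run_parser.add_argument('--input', type=str)
run_parser.add_argument('--output', type=str)
run_parser.add_argument('--device', type=int, default=-1)

args = parser.parse_args(['run', '--task', 'question-answering', '--input', 'qa.csv'])
print(args.format, args.device, args.model)  # infer -1 None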
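The accumulator fix in run() matters because a pipeline call may return either a single dict or a list of dicts; `+=` on a dict result would fail. A runnable sketch with a stub nlp:

# Stub standing in for a transformers pipeline: single-string input returns a
# dict (one result), batched input returns a list of dicts.
def nlp(entry):
    if isinstance(entry, str):
        return {'result': entry}
    return [{'result': e} for e in entry]

outputs = []
for entry in ['single entry', ['batch entry 1', 'batch entry 2']]:
    output = nlp(entry)
    if isinstance(output, dict):
        outputs.append(output)   # single dict: append as one record
    else:
        outputs += output        # list of dicts: extend
print(outputs)
# [{'result': 'single entry'}, {'result': 'batch entry 1'}, {'result': 'batch entry 2'}]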
transformers/pipelines.py
@@ -14,12 +14,14 @@
 # limitations under the License.
 from __future__ import absolute_import, division, print_function, unicode_literals

+import sys
 import csv
 import json
 import os
 import pickle
 import logging
 import six

 from abc import ABC, abstractmethod
 from contextlib import contextmanager
 from itertools import groupby
@@ -98,28 +100,29 @@ class PipelineDataFormat:
     Supported data formats currently includes:
      - JSON
      - CSV
      - stdin/stdout (pipe)

     PipelineDataFormat also includes some utilities to work with multi-columns like mapping from datasets columns
     to pipelines keyword arguments through the `dataset_kwarg_1=dataset_column_1` format.
     """
     SUPPORTED_FORMATS = ['json', 'csv', 'pipe']

-    def __init__(self, output: Optional[str], input: Optional[str], column: Optional[str]):
-        self.output = output
-        self.path = input
-        self.column = column.split(',') if column else ['']
+    def __init__(self, output_path: Optional[str], input_path: Optional[str], column: Optional[str]):
+        self.output_path = output_path
+        self.input_path = input_path
+        self.column = column.split(',') if column is not None else ['']
         self.is_multi_columns = len(self.column) > 1

         if self.is_multi_columns:
             self.column = [tuple(c.split('=')) if '=' in c else (c, c) for c in self.column]

-        if output is not None:
-            if exists(abspath(self.output)):
-                raise OSError('{} already exists on disk'.format(self.output))
+        if output_path is not None:
+            if exists(abspath(self.output_path)):
+                raise OSError('{} already exists on disk'.format(self.output_path))

-        if input is not None:
-            if not exists(abspath(self.path)):
-                raise OSError('{} doesnt exist on disk'.format(self.path))
+        if input_path is not None:
+            if not exists(abspath(self.input_path)):
+                raise OSError('{} doesnt exist on disk'.format(self.input_path))

     @abstractmethod
     def __iter__(self):
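The renamed __init__ keeps the multi-column parsing intact. Run standalone, the mapping described in the docstring ('dataset_kwarg_1=dataset_column_1') behaves like this:

# The parsing steps from __init__ above, run standalone:
column = 'question=q,context=c'

columns = column.split(',') if column is not None else ['']
is_multi_columns = len(columns) > 1
if is_multi_columns:
    columns = [tuple(c.split('=')) if '=' in c else (c, c) for c in columns]
print(columns)  # [('question', 'q'), ('context', 'c')]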
@@ -140,7 +143,7 @@ class PipelineDataFormat:
         :param data: data to store
         :return: (str) Path where the data has been saved
         """
-        path, _ = os.path.splitext(self.output)
+        path, _ = os.path.splitext(self.output_path)
         binary_path = os.path.extsep.join((path, 'pickle'))

         with open(binary_path, 'wb+') as f_output:
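For reference, how save_binary derives the pickle path from the renamed output_path (standalone sketch):

import os

output_path = 'results.json'
path, _ = os.path.splitext(output_path)              # 'results', '.json'
binary_path = os.path.extsep.join((path, 'pickle'))  # join with '.'
print(binary_path)  # results.pickle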
@@ -149,23 +152,23 @@ class PipelineDataFormat:
         return binary_path

     @staticmethod
-    def from_str(name: str, output: Optional[str], path: Optional[str], column: Optional[str]):
-        if name == 'json':
-            return JsonPipelineDataFormat(output, path, column)
-        elif name == 'csv':
-            return CsvPipelineDataFormat(output, path, column)
-        elif name == 'pipe':
-            return PipedPipelineDataFormat(output, path, column)
+    def from_str(format: str, output_path: Optional[str], input_path: Optional[str], column: Optional[str]):
+        if format == 'json':
+            return JsonPipelineDataFormat(output_path, input_path, column)
+        elif format == 'csv':
+            return CsvPipelineDataFormat(output_path, input_path, column)
+        elif format == 'pipe':
+            return PipedPipelineDataFormat(output_path, input_path, column)
         else:
-            raise KeyError('Unknown reader {} (Available reader are json/csv/pipe)'.format(name))
+            raise KeyError('Unknown reader {} (Available reader are json/csv/pipe)'.format(format))


 class CsvPipelineDataFormat(PipelineDataFormat):
-    def __init__(self, output: Optional[str], input: Optional[str], column: Optional[str]):
-        super().__init__(output, input, column)
+    def __init__(self, output_path: Optional[str], input_path: Optional[str], column: Optional[str]):
+        super().__init__(output_path, input_path, column)

     def __iter__(self):
-        with open(self.path, 'r') as f:
+        with open(self.input_path, 'r') as f:
             reader = csv.DictReader(f)
             for row in reader:
                 if self.is_multi_columns:
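With the renamed signature, the call from run.py reads naturally with keywords. A hypothetical usage against this revision (assumes this checkout of transformers is installed; per the __init__ checks above, questions.csv must exist and answers.csv must not):

# Hypothetical usage, not runnable without this revision installed.
from transformers.pipelines import PipelineDataFormat

reader = PipelineDataFormat.from_str(
    format='csv',
    output_path='answers.csv',
    input_path='questions.csv',
    column='question=q,context=c',
)
print(type(reader).__name__)  # CsvPipelineDataFormat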
@@ -174,7 +177,7 @@ class CsvPipelineDataFormat(PipelineDataFormat):
                 yield row[self.column[0]]

     def save(self, data: List[dict]):
-        with open(self.output, 'w') as f:
+        with open(self.output_path, 'w') as f:
             if len(data) > 0:
                 writer = csv.DictWriter(f, list(data[0].keys()))
                 writer.writeheader()
@@ -182,10 +185,10 @@ class CsvPipelineDataFormat(PipelineDataFormat):

 class JsonPipelineDataFormat(PipelineDataFormat):
-    def __init__(self, output: Optional[str], input: Optional[str], column: Optional[str]):
-        super().__init__(output, input, column)
+    def __init__(self, output_path: Optional[str], input_path: Optional[str], column: Optional[str]):
+        super().__init__(output_path, input_path, column)

-        with open(input, 'r') as f:
+        with open(input_path, 'r') as f:
             self._entries = json.load(f)

     def __iter__(self):
@@ -196,7 +199,7 @@ class JsonPipelineDataFormat(PipelineDataFormat):
                 yield entry[self.column[0]]

     def save(self, data: dict):
-        with open(self.output, 'w') as f:
+        with open(self.output_path, 'w') as f:
             json.dump(data, f)
@@ -208,9 +211,7 @@ class PipedPipelineDataFormat(PipelineDataFormat):
     If columns are provided, then the output will be a dictionary with {column_x: value_x}
     """
     def __iter__(self):
-        import sys
-
         for line in sys.stdin:
             # Split for multi-columns
             if '\t' in line:
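The '\t' check supports piped multi-column input. A standalone sketch of the behavior the docstring describes, mapping a tab-separated stdin line to keyword arguments (the exact implementation sits in the elided lines and may differ):

# Columns as stored by PipelineDataFormat.__init__ for '--column question,context'
columns = [('question', 'question'), ('context', 'context')]
line = 'Who wrote it?\tThe book was written by Jane.\n'

if '\t' in line:
    values = line.strip().split('\t')
    entry = {kwarg: value for (kwarg, _), value in zip(columns, values)}
    print(entry)
# {'question': 'Who wrote it?', 'context': 'The book was written by Jane.'}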
@@ -229,7 +230,7 @@ class PipedPipelineDataFormat(PipelineDataFormat):
             print(data)

     def save_binary(self, data: Union[dict, List[dict]]) -> str:
-        if self.output is None:
+        if self.output_path is None:
             raise KeyError(
                 'When using piped input on pipeline outputting large object requires an output file path. '
                 'Please provide such output path through --output argument.'
@@ -294,6 +295,9 @@ class Pipeline(_ScikitCompat):
         nlp = NerPipeline(model='...', config='...', tokenizer='...')
         nlp = QuestionAnsweringPipeline(model=AutoModel.from_pretrained('...'), tokenizer='...')
     """
+
+    default_input_names = None
+
     def __init__(self, model, tokenizer: PreTrainedTokenizer = None, modelcard: ModelCard = None,
                  framework: Optional[str] = None, args_parser: ArgumentHandler = None, device: int = -1,
@@ -582,6 +586,8 @@ class QuestionAnsweringPipeline(Pipeline):
     Question Answering pipeline using ModelForQuestionAnswering head.
     """
+
+    default_input_names = 'question,context'
+
     def __init__(self, model, tokenizer: Optional[PreTrainedTokenizer], modelcard: Optional[ModelCard],
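default_input_names is a plain class attribute: the base Pipeline declares None and task pipelines override it, which is what lets the CLI omit --column for QA. A minimal sketch of the pattern (class names mirror the real ones, bodies omitted):

# The base class ships a None default...
class Pipeline:
    default_input_names = None

# ...and task-specific pipelines override the class attribute.
class QuestionAnsweringPipeline(Pipeline):
    default_input_names = 'question,context'

print(Pipeline.default_input_names)                   # None
print(QuestionAnsweringPipeline.default_input_names)  # question,context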
@@ -684,7 +690,6 @@ class QuestionAnsweringPipeline(Pipeline):
             }
             for s, e, score in zip(starts, ends, scores)
         ]
-
         if len(answers) == 1:
             return answers[0]
         return answers