__init__.py 29.5 KB
Newer Older
1
2
3
4
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all.

5
6
import io
import json
7
import os
8

9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
24
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
25
26

from ..configuration_utils import PretrainedConfig
27
from ..feature_extraction_utils import PreTrainedFeatureExtractor
28
from ..file_utils import http_get, is_tf_available, is_torch_available
29
30
31
from ..models.auto.configuration_auto import AutoConfig
from ..models.auto.feature_extraction_auto import FEATURE_EXTRACTOR_MAPPING, AutoFeatureExtractor
from ..models.auto.tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer
32
33
from ..tokenization_utils import PreTrainedTokenizer
from ..utils import logging
34
from .audio_classification import AudioClassificationPipeline
35
from .automatic_speech_recognition import AutomaticSpeechRecognitionPipeline
36
37
38
39
40
41
42
43
44
from .base import (
    ArgumentHandler,
    CsvPipelineDataFormat,
    JsonPipelineDataFormat,
    PipedPipelineDataFormat,
    Pipeline,
    PipelineDataFormat,
    PipelineException,
    get_default_model,
45
    infer_framework_load_model,
46
47
48
49
)
from .conversational import Conversation, ConversationalPipeline
from .feature_extraction import FeatureExtractionPipeline
from .fill_mask import FillMaskPipeline
50
from .image_classification import ImageClassificationPipeline
51
from .image_segmentation import ImageSegmentationPipeline
52
from .object_detection import ObjectDetectionPipeline
53
54
55
56
57
from .question_answering import QuestionAnsweringArgumentHandler, QuestionAnsweringPipeline
from .table_question_answering import TableQuestionAnsweringArgumentHandler, TableQuestionAnsweringPipeline
from .text2text_generation import SummarizationPipeline, Text2TextGenerationPipeline, TranslationPipeline
from .text_classification import TextClassificationPipeline
from .text_generation import TextGenerationPipeline
58
59
60
61
62
63
from .token_classification import (
    AggregationStrategy,
    NerPipeline,
    TokenClassificationArgumentHandler,
    TokenClassificationPipeline,
)
64
from .zero_shot_classification import ZeroShotClassificationArgumentHandler, ZeroShotClassificationPipeline
65
from .zero_shot_image_classification import ZeroShotImageClassificationPipeline
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82


if is_tf_available():
    import tensorflow as tf

    from ..models.auto.modeling_tf_auto import (
        TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
        TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
        TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
        TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
        TF_MODEL_WITH_LM_HEAD_MAPPING,
        TFAutoModel,
        TFAutoModelForCausalLM,
        TFAutoModelForMaskedLM,
        TFAutoModelForQuestionAnswering,
        TFAutoModelForSeq2SeqLM,
        TFAutoModelForSequenceClassification,
Kamal Raj's avatar
Kamal Raj committed
83
        TFAutoModelForTableQuestionAnswering,
84
85
86
87
88
89
90
91
92
93
94
95
96
97
        TFAutoModelForTokenClassification,
    )

if is_torch_available():
    import torch

    from ..models.auto.modeling_auto import (
        MODEL_FOR_MASKED_LM_MAPPING,
        MODEL_FOR_QUESTION_ANSWERING_MAPPING,
        MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
        MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
        MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
        MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
        AutoModel,
98
        AutoModelForAudioClassification,
99
        AutoModelForCausalLM,
100
        AutoModelForCTC,
101
        AutoModelForImageClassification,
102
        AutoModelForImageSegmentation,
103
        AutoModelForMaskedLM,
104
        AutoModelForObjectDetection,
105
106
107
        AutoModelForQuestionAnswering,
        AutoModelForSeq2SeqLM,
        AutoModelForSequenceClassification,
108
        AutoModelForSpeechSeq2Seq,
109
110
111
112
113
114
115
116
117
118
119
        AutoModelForTableQuestionAnswering,
        AutoModelForTokenClassification,
    )
if TYPE_CHECKING:
    from ..modeling_tf_utils import TFPreTrainedModel
    from ..modeling_utils import PreTrainedModel

# Module-level logger, named after this module.
logger = logging.get_logger(__name__)


# Register all the supported tasks here
120
121
122
123
# Alternative task names mapped to the canonical task name used as a key of `SUPPORTED_TASKS`.
TASK_ALIASES = {
    "sentiment-analysis": "text-classification",
    "ner": "token-classification",
}
124
# Registry of every task known to `pipeline()`. Each entry holds:
#   - "impl": the pipeline class instantiated for the task,
#   - "tf" / "pt": tuples of auto-model classes for TensorFlow / PyTorch (empty when the framework is
#     not available or the task has no implementation for it),
#   - "default": the default checkpoint(s); for every task except "translation" they are nested under a
#     "model" key, "translation" is keyed by (source, target) language pairs instead,
#   - "type": one of "text", "audio", "image" or "multimodal".
SUPPORTED_TASKS = {
    "audio-classification": {
        "impl": AudioClassificationPipeline,
        "tf": (),
        "pt": (AutoModelForAudioClassification,) if is_torch_available() else (),
        "default": {"model": {"pt": "superb/wav2vec2-base-superb-ks"}},
        "type": "audio",
    },
    "automatic-speech-recognition": {
        "impl": AutomaticSpeechRecognitionPipeline,
        "tf": (),
        "pt": (AutoModelForCTC, AutoModelForSpeechSeq2Seq) if is_torch_available() else (),
        "default": {"model": {"pt": "facebook/wav2vec2-base-960h"}},
        "type": "multimodal",
    },
    "feature-extraction": {
        "impl": FeatureExtractionPipeline,
        "tf": (TFAutoModel,) if is_tf_available() else (),
        "pt": (AutoModel,) if is_torch_available() else (),
        "default": {"model": {"pt": "distilbert-base-cased", "tf": "distilbert-base-cased"}},
        "type": "multimodal",
    },
    "text-classification": {
        "impl": TextClassificationPipeline,
        "tf": (TFAutoModelForSequenceClassification,) if is_tf_available() else (),
        "pt": (AutoModelForSequenceClassification,) if is_torch_available() else (),
        "default": {
            "model": {
                "pt": "distilbert-base-uncased-finetuned-sst-2-english",
                "tf": "distilbert-base-uncased-finetuned-sst-2-english",
            },
        },
        "type": "text",
    },
    "token-classification": {
        "impl": TokenClassificationPipeline,
        "tf": (TFAutoModelForTokenClassification,) if is_tf_available() else (),
        "pt": (AutoModelForTokenClassification,) if is_torch_available() else (),
        "default": {
            "model": {
                "pt": "dbmdz/bert-large-cased-finetuned-conll03-english",
                "tf": "dbmdz/bert-large-cased-finetuned-conll03-english",
            },
        },
        "type": "text",
    },
    "question-answering": {
        "impl": QuestionAnsweringPipeline,
        "tf": (TFAutoModelForQuestionAnswering,) if is_tf_available() else (),
        "pt": (AutoModelForQuestionAnswering,) if is_torch_available() else (),
        "default": {
            "model": {"pt": "distilbert-base-cased-distilled-squad", "tf": "distilbert-base-cased-distilled-squad"},
        },
        "type": "text",
    },
    "table-question-answering": {
        "impl": TableQuestionAnsweringPipeline,
        "pt": (AutoModelForTableQuestionAnswering,) if is_torch_available() else (),
        "tf": (TFAutoModelForTableQuestionAnswering,) if is_tf_available() else (),
        "default": {
            "model": {
                "pt": "google/tapas-base-finetuned-wtq",
                "tokenizer": "google/tapas-base-finetuned-wtq",
                "tf": "google/tapas-base-finetuned-wtq",
            },
        },
        "type": "text",
    },
    "fill-mask": {
        "impl": FillMaskPipeline,
        "tf": (TFAutoModelForMaskedLM,) if is_tf_available() else (),
        "pt": (AutoModelForMaskedLM,) if is_torch_available() else (),
        "default": {"model": {"pt": "distilroberta-base", "tf": "distilroberta-base"}},
        "type": "text",
    },
    "summarization": {
        "impl": SummarizationPipeline,
        "tf": (TFAutoModelForSeq2SeqLM,) if is_tf_available() else (),
        "pt": (AutoModelForSeq2SeqLM,) if is_torch_available() else (),
        "default": {"model": {"pt": "sshleifer/distilbart-cnn-12-6", "tf": "t5-small"}},
        "type": "text",
    },
    # This task is a special case as it's parametrized by SRC, TGT languages.
    "translation": {
        "impl": TranslationPipeline,
        "tf": (TFAutoModelForSeq2SeqLM,) if is_tf_available() else (),
        "pt": (AutoModelForSeq2SeqLM,) if is_torch_available() else (),
        "default": {
            ("en", "fr"): {"model": {"pt": "t5-base", "tf": "t5-base"}},
            ("en", "de"): {"model": {"pt": "t5-base", "tf": "t5-base"}},
            ("en", "ro"): {"model": {"pt": "t5-base", "tf": "t5-base"}},
        },
        "type": "text",
    },
    "text2text-generation": {
        "impl": Text2TextGenerationPipeline,
        "tf": (TFAutoModelForSeq2SeqLM,) if is_tf_available() else (),
        "pt": (AutoModelForSeq2SeqLM,) if is_torch_available() else (),
        "default": {"model": {"pt": "t5-base", "tf": "t5-base"}},
        "type": "text",
    },
    "text-generation": {
        "impl": TextGenerationPipeline,
        "tf": (TFAutoModelForCausalLM,) if is_tf_available() else (),
        "pt": (AutoModelForCausalLM,) if is_torch_available() else (),
        "default": {"model": {"pt": "gpt2", "tf": "gpt2"}},
        "type": "text",
    },
    "zero-shot-classification": {
        "impl": ZeroShotClassificationPipeline,
        "tf": (TFAutoModelForSequenceClassification,) if is_tf_available() else (),
        "pt": (AutoModelForSequenceClassification,) if is_torch_available() else (),
        "default": {
            "model": {"pt": "facebook/bart-large-mnli", "tf": "roberta-large-mnli"},
            "config": {"pt": "facebook/bart-large-mnli", "tf": "roberta-large-mnli"},
            "tokenizer": {"pt": "facebook/bart-large-mnli", "tf": "roberta-large-mnli"},
        },
        "type": "text",
    },
    "zero-shot-image-classification": {
        "impl": ZeroShotImageClassificationPipeline,
        "tf": (TFAutoModel,) if is_tf_available() else (),
        "pt": (AutoModel,) if is_torch_available() else (),
        # Nested under "model" like every other non-parametrized task so the default checkpoint is
        # found the same way.
        "default": {"model": {"pt": "openai/clip-vit-base-patch32", "tf": "openai/clip-vit-base-patch32"}},
        "type": "multimodal",
    },
    "conversational": {
        "impl": ConversationalPipeline,
        "tf": (TFAutoModelForSeq2SeqLM, TFAutoModelForCausalLM) if is_tf_available() else (),
        "pt": (AutoModelForSeq2SeqLM, AutoModelForCausalLM) if is_torch_available() else (),
        "default": {"model": {"pt": "microsoft/DialoGPT-medium", "tf": "microsoft/DialoGPT-medium"}},
        "type": "text",
    },
    "image-classification": {
        "impl": ImageClassificationPipeline,
        "tf": (),
        "pt": (AutoModelForImageClassification,) if is_torch_available() else (),
        "default": {"model": {"pt": "google/vit-base-patch16-224"}},
        "type": "image",
    },
    "image-segmentation": {
        "impl": ImageSegmentationPipeline,
        "tf": (),
        "pt": (AutoModelForImageSegmentation,) if is_torch_available() else (),
        "default": {"model": {"pt": "facebook/detr-resnet-50-panoptic"}},
        "type": "image",
    },
    "object-detection": {
        "impl": ObjectDetectionPipeline,
        "tf": (),
        "pt": (AutoModelForObjectDetection,) if is_torch_available() else (),
        "default": {"model": {"pt": "facebook/detr-resnet-50"}},
        "type": "image",
    },
}

280
281
282
283
284
285
286
287
288
289
# Derive, from each task's declared "type", the tasks for which `pipeline()` skips loading a feature
# extractor ("text") or a tokenizer ("audio" / "image"). "multimodal" tasks appear in neither set.
NO_FEATURE_EXTRACTOR_TASKS = set()
NO_TOKENIZER_TASKS = set()
for task, values in SUPPORTED_TASKS.items():
    # Reject unknown types up front so a typo in the registry fails at import time.
    if values["type"] not in {"text", "audio", "image", "multimodal"}:
        raise ValueError(f"SUPPORTED_TASK {task} contains invalid type {values['type']}")
    if values["type"] == "text":
        NO_FEATURE_EXTRACTOR_TASKS.add(task)
    elif values["type"] != "multimodal":
        # Only "audio" and "image" remain at this point.
        NO_TOKENIZER_TASKS.add(task)

290

291
292
293
294
295
296
297
298
299
def get_supported_tasks() -> List[str]:
    """
    Return every accepted task string (canonical task names and their aliases) as a sorted list.
    """
    return sorted([*SUPPORTED_TASKS, *TASK_ALIASES])


300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
def get_task(model: str, use_auth_token: Optional[str] = None) -> str:
    """
    Infer the task of a model from the metadata returned by `https://huggingface.co/api/models/{model}`.

    Args:
        model (`str`):
            The model identifier inserted into the API URL.
        use_auth_token (`str`, *optional*):
            When truthy, sent as an `Authorization: Bearer <token>` header with the request.

    Returns:
        `str`: The value of the `pipeline_tag` field of the returned metadata.

    Raises:
        RuntimeError: If the request or the JSON decoding fails, if the metadata has no `pipeline_tag`, or if its
        `library_name` is set to something other than `"transformers"`.
    """
    tmp = io.BytesIO()
    headers = {}
    # NOTE(review): `pipeline()` annotates `use_auth_token` as `str` or `bool` and forwards it unchanged; a value of
    # `True` would be formatted as "Bearer True" here -- confirm whether that case needs resolving to a real token.
    if use_auth_token:
        headers["Authorization"] = f"Bearer {use_auth_token}"

    try:
        http_get(f"https://huggingface.co/api/models/{model}", tmp, headers=headers)
        tmp.seek(0)
        body = tmp.read()
        data = json.loads(body)
    except Exception as e:
        # Chain explicitly so the original failure is reported as the direct cause of this error.
        raise RuntimeError(f"Instantiating a pipeline without a task set raised an error: {e}") from e
    if "pipeline_tag" not in data:
        raise RuntimeError(
            f"The model {model} does not seem to have a correct `pipeline_tag` set to infer the task automatically"
        )
    # A missing `library_name` is treated as "transformers".
    if data.get("library_name", "transformers") != "transformers":
        raise RuntimeError(f"This model is meant to be used with {data['library_name']} not with transformers")
    return data["pipeline_tag"]


323
324
325
326
327
328
def check_task(task: str) -> Tuple[Dict, Any]:
    """
    Checks an incoming task string, to validate it's correct and return the default Pipeline and Model classes, and
    default models if they exist.

    Args:
        task (`str`):
            The task defining which pipeline will be returned. Currently accepted tasks are:

            - `"audio-classification"`
            - `"automatic-speech-recognition"`
            - `"conversational"`
            - `"feature-extraction"`
            - `"fill-mask"`
            - `"image-classification"`
            - `"image-segmentation"`
            - `"object-detection"`
            - `"question-answering"`
            - `"table-question-answering"`
            - `"text2text-generation"`
            - `"text-classification"` (alias `"sentiment-analysis"` available)
            - `"text-generation"`
            - `"token-classification"` (alias `"ner"` available)
            - `"translation"`
            - `"translation_xx_to_yy"`
            - `"summarization"`
            - `"zero-shot-classification"`
            - `"zero-shot-image-classification"`

    Returns:
        (task_defaults: `dict`, task_options: (`tuple`, None)) The actual dictionary required to initialize the
        pipeline and some extra task options for parametrized tasks like "translation_XX_to_YY"

    Raises:
        KeyError: If `task` starts with `"translation"` but does not follow the `translation_XX_to_YY` format, or if
        it is not a known task at all.
    """
    # Resolve aliases such as "ner" to their canonical task name first.
    if task in TASK_ALIASES:
        task = TASK_ALIASES[task]
    if task in SUPPORTED_TASKS:
        targeted_task = SUPPORTED_TASKS[task]
        return targeted_task, None

    if task.startswith("translation"):
        # Expected shape: "translation_<src>_to_<tgt>"; the language pair is returned as the task options.
        tokens = task.split("_")
        if len(tokens) == 4 and tokens[0] == "translation" and tokens[2] == "to":
            targeted_task = SUPPORTED_TASKS["translation"]
            return targeted_task, (tokens[1], tokens[3])
        raise KeyError(f"Invalid translation task {task}, use 'translation_XX_to_YY' format")

    raise KeyError(f"Unknown task {task}, available tasks are {get_supported_tasks() + ['translation_XX_to_YY']}")
369
370
371


def pipeline(
    task: Optional[str] = None,
    model: Optional = None,
    config: Optional[Union[str, PretrainedConfig]] = None,
    tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None,
    feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None,
    framework: Optional[str] = None,
    revision: Optional[str] = None,
    use_fast: bool = True,
    use_auth_token: Optional[Union[str, bool]] = None,
    model_kwargs: Optional[Dict[str, Any]] = None,
    pipeline_class: Optional[Any] = None,
    **kwargs
) -> Pipeline:
    """
    Utility factory method to build a [`Pipeline`].

    Pipelines are made of:

        - A [tokenizer](tokenizer) in charge of mapping raw textual input to token.
        - A [model](model) to make predictions from the inputs.
        - Some (optional) post processing for enhancing model's output.

    Args:
        task (`str`, *optional*):
            The task defining which pipeline will be returned. If not provided, `model` must be a string and the
            task is inferred from that model's metadata on the Hub. Currently accepted tasks are:

            - `"audio-classification"`: will return a [`AudioClassificationPipeline`].
            - `"automatic-speech-recognition"`: will return a [`AutomaticSpeechRecognitionPipeline`].
            - `"conversational"`: will return a [`ConversationalPipeline`].
            - `"feature-extraction"`: will return a [`FeatureExtractionPipeline`].
            - `"fill-mask"`: will return a [`FillMaskPipeline`].
            - `"image-classification"`: will return a [`ImageClassificationPipeline`].
            - `"image-segmentation"`: will return a [`ImageSegmentationPipeline`].
            - `"object-detection"`: will return a [`ObjectDetectionPipeline`].
            - `"question-answering"`: will return a [`QuestionAnsweringPipeline`].
            - `"table-question-answering"`: will return a [`TableQuestionAnsweringPipeline`].
            - `"text2text-generation"`: will return a [`Text2TextGenerationPipeline`].
            - `"text-classification"` (alias `"sentiment-analysis"` available): will return a
              [`TextClassificationPipeline`].
            - `"text-generation"`: will return a [`TextGenerationPipeline`].
            - `"token-classification"` (alias `"ner"` available): will return a [`TokenClassificationPipeline`].
            - `"translation"`: will return a [`TranslationPipeline`].
            - `"translation_xx_to_yy"`: will return a [`TranslationPipeline`].
            - `"summarization"`: will return a [`SummarizationPipeline`].
            - `"zero-shot-classification"`: will return a [`ZeroShotClassificationPipeline`].
            - `"zero-shot-image-classification"`: will return a [`ZeroShotImageClassificationPipeline`].

        model (`str` or [`PreTrainedModel`] or [`TFPreTrainedModel`], *optional*):
            The model that will be used by the pipeline to make predictions. This can be a model identifier or an
            actual instance of a pretrained model inheriting from [`PreTrainedModel`] (for PyTorch) or
            [`TFPreTrainedModel`] (for TensorFlow).

            If not provided, the default for the `task` will be loaded.
        config (`str` or [`PretrainedConfig`], *optional*):
            The configuration that will be used by the pipeline to instantiate the model. This can be a model
            identifier or an actual pretrained model configuration inheriting from [`PretrainedConfig`].

            If not provided, the default configuration file for the requested model will be used. That means that if
            `model` is given, its default configuration will be used. However, if `model` is not supplied, this
            `task`'s default model's config is used instead.
        tokenizer (`str` or [`PreTrainedTokenizer`], *optional*):
            The tokenizer that will be used by the pipeline to encode data for the model. This can be a model
            identifier or an actual pretrained tokenizer inheriting from [`PreTrainedTokenizer`].

            If not provided, the default tokenizer for the given `model` will be loaded (if it is a string). If `model`
            is not specified or not a string, then the default tokenizer for `config` is loaded (if it is a string).
            However, if `config` is also not given or not a string, then the default tokenizer for the given `task`
            will be loaded.
        feature_extractor (`str` or [`PreTrainedFeatureExtractor`], *optional*):
            The feature extractor that will be used by the pipeline to encode data for the model. This can be a model
            identifier or an actual pretrained feature extractor inheriting from [`PreTrainedFeatureExtractor`].

            Feature extractors are used for non-NLP models, such as Speech or Vision models as well as multi-modal
            models. Multi-modal models will also require a tokenizer to be passed.

            If not provided, the default feature extractor for the given `model` will be loaded (if it is a string). If
            `model` is not specified or not a string, then the default feature extractor for `config` is loaded (if it
            is a string). However, if `config` is also not given or not a string, then the default feature extractor
            for the given `task` will be loaded.
        framework (`str`, *optional*):
            The framework to use, either `"pt"` for PyTorch or `"tf"` for TensorFlow. The specified framework must be
            installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified and
            both frameworks are installed, will default to the framework of the `model`, or to PyTorch if no model is
            provided.
        revision (`str`, *optional*, defaults to `"main"`):
            When passing a task name or a string model identifier: The specific model version to use. It can be a
            branch name, a tag name, or a commit id, since we use a git-based system for storing models and other
            artifacts on huggingface.co, so `revision` can be any identifier allowed by git.
        use_fast (`bool`, *optional*, defaults to `True`):
            Whether or not to use a Fast tokenizer if possible (a [`PreTrainedTokenizerFast`]).
        use_auth_token (`str` or *bool*, *optional*):
            The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
            when running `transformers-cli login` (stored in `~/.huggingface`).
        model_kwargs:
            Additional dictionary of keyword arguments passed along to the model's `from_pretrained(...,
            **model_kwargs)` function.
        pipeline_class (*optional*):
            The class to instantiate and return. If not provided, the `"impl"` class registered for the task is used.
        kwargs:
            Additional keyword arguments passed along to the specific pipeline init (see the documentation for the
            corresponding pipeline class for possible values).

    Returns:
        [`Pipeline`]: A suitable pipeline for the task.

    Examples:

    ```python
    >>> from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer

    >>> # Sentiment analysis pipeline
    >>> pipeline("sentiment-analysis")

    >>> # Question answering pipeline, specifying the checkpoint identifier
    >>> pipeline("question-answering", model="distilbert-base-cased-distilled-squad", tokenizer="bert-base-cased")

    >>> # Named entity recognition pipeline, passing in a specific model and tokenizer
    >>> model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    >>> pipeline("ner", model=model, tokenizer=tokenizer)
    ```"""
    # Work on a copy: `use_auth_token` is injected below and must not leak into the caller's dictionary.
    model_kwargs = {} if model_kwargs is None else dict(model_kwargs)

    if task is None and model is None:
        raise RuntimeError(
            "Impossible to instantiate a pipeline without either a task or a model "
            "being specified. "
            "Please provide a task class or a model"
        )

    if model is None and tokenizer is not None:
        raise RuntimeError(
            "Impossible to instantiate a pipeline with tokenizer specified but not the model "
            "as the provided tokenizer may not be compatible with the default model. "
            "Please provide a PreTrainedModel class or a path/identifier to a pretrained model when providing tokenizer."
        )
    if model is None and feature_extractor is not None:
        raise RuntimeError(
            "Impossible to instantiate a pipeline with feature_extractor specified but not the model "
            "as the provided feature_extractor may not be compatible with the default model. "
            "Please provide a PreTrainedModel class or a path/identifier to a pretrained model when providing feature_extractor."
        )

    if task is None and model is not None:
        if not isinstance(model, str):
            raise RuntimeError(
                "Inferring the task automatically requires to check the hub with a model_id defined as a `str`. "
                f"{model} is not a valid model_id."
            )
        task = get_task(model, use_auth_token)

    # Retrieve the task
    targeted_task, task_options = check_task(task)
    if pipeline_class is None:
        pipeline_class = targeted_task["impl"]

    # Use default model/config/tokenizer for the task if no model is provided
    if model is None:
        # At that point framework might still be undetermined
        model = get_default_model(targeted_task, framework, task_options)
        logger.warning(f"No model was supplied, defaulted to {model} (https://huggingface.co/{model})")

    # Retrieve use_auth_token and add it to model_kwargs to be used in .from_pretrained
    # (a value already present in `model_kwargs` takes precedence over the `use_auth_token` argument)
    model_kwargs["use_auth_token"] = model_kwargs.get("use_auth_token", use_auth_token)

    # Config is the primordial information item.
    # Instantiate config if needed
    if isinstance(config, str):
        config = AutoConfig.from_pretrained(config, revision=revision, _from_pipeline=task, **model_kwargs)
    elif config is None and isinstance(model, str):
        config = AutoConfig.from_pretrained(model, revision=revision, _from_pipeline=task, **model_kwargs)

    # Remember the identifier before `model` is replaced by the loaded model object below.
    model_name = model if isinstance(model, str) else None

    # Infer the framework from the model
    # Forced if framework already defined, inferred if it's None
    # Will load the correct model if possible
    model_classes = {"tf": targeted_task["tf"], "pt": targeted_task["pt"]}
    framework, model = infer_framework_load_model(
        model,
        model_classes=model_classes,
        config=config,
        framework=framework,
        revision=revision,
        task=task,
        **model_kwargs,
    )

    model_config = model.config

    load_tokenizer = type(model_config) in TOKENIZER_MAPPING or model_config.tokenizer_class is not None
    load_feature_extractor = type(model_config) in FEATURE_EXTRACTOR_MAPPING or feature_extractor is not None

    if task in NO_TOKENIZER_TASKS:
        # These will never require a tokenizer.
        # the model on the other hand might have a tokenizer, but
        # the files could be missing from the hub, instead of failing
        # on such repos, we just force to not load it.
        load_tokenizer = False
    if task in NO_FEATURE_EXTRACTOR_TASKS:
        load_feature_extractor = False

    if load_tokenizer:
        # Try to infer tokenizer from model or config name (if provided as str)
        if tokenizer is None:
            if isinstance(model_name, str):
                tokenizer = model_name
            # NOTE(review): a string `config` has already been replaced by the result of
            # `AutoConfig.from_pretrained` above, so this branch looks unreachable -- confirm.
            elif isinstance(config, str):
                tokenizer = config
            else:
                # Impossible to guess what is the right tokenizer here
                raise Exception(
                    "Impossible to guess which tokenizer to use. "
                    "Please provide a PreTrainedTokenizer class or a path/identifier to a pretrained tokenizer."
                )

        # Instantiate tokenizer if needed
        if isinstance(tokenizer, (str, tuple)):
            if isinstance(tokenizer, tuple):
                # For tuple we have (tokenizer name, {kwargs})
                use_fast = tokenizer[1].pop("use_fast", use_fast)
                tokenizer_identifier = tokenizer[0]
                tokenizer_kwargs = tokenizer[1]
            else:
                tokenizer_identifier = tokenizer
                tokenizer_kwargs = model_kwargs

            tokenizer = AutoTokenizer.from_pretrained(
                tokenizer_identifier, revision=revision, use_fast=use_fast, _from_pipeline=task, **tokenizer_kwargs
            )

    if load_feature_extractor:
        # Try to infer feature extractor from model or config name (if provided as str)
        if feature_extractor is None:
            if isinstance(model_name, str):
                feature_extractor = model_name
            # NOTE(review): same remark as for the tokenizer, `config` is no longer a string here -- confirm.
            elif isinstance(config, str):
                feature_extractor = config
            else:
                # Impossible to guess what is the right feature_extractor here
                raise Exception(
                    "Impossible to guess which feature extractor to use. "
                    "Please provide a PreTrainedFeatureExtractor class or a path/identifier "
                    "to a pretrained feature extractor."
                )

        # Instantiate feature_extractor if needed
        if isinstance(feature_extractor, (str, tuple)):
            feature_extractor = AutoFeatureExtractor.from_pretrained(
                feature_extractor, revision=revision, _from_pipeline=task, **model_kwargs
            )

            # When the feature extractor names a processor class ending in "WithLM", try to load a
            # `pyctcdecode` decoder and hand it to the pipeline as the `decoder` keyword argument.
            if (
                feature_extractor._processor_class
                and feature_extractor._processor_class.endswith("WithLM")
                and isinstance(model_name, str)
            ):
                try:
                    import kenlm  # to trigger `ImportError` if not installed
                    from pyctcdecode import BeamSearchDecoderCTC

                    if os.path.isdir(model_name) or os.path.isfile(model_name):
                        decoder = BeamSearchDecoderCTC.load_from_dir(model_name)
                    else:
                        language_model_glob = os.path.join(
                            BeamSearchDecoderCTC._LANGUAGE_MODEL_SERIALIZED_DIRECTORY, "*"
                        )
                        alphabet_filename = BeamSearchDecoderCTC._ALPHABET_SERIALIZED_FILENAME
                        allow_regex = [language_model_glob, alphabet_filename]
                        decoder = BeamSearchDecoderCTC.load_from_hf_hub(model_name, allow_regex=allow_regex)

                    kwargs["decoder"] = decoder
                except ImportError as e:
                    # Best effort: without the optional dependencies the pipeline is built without a decoder.
                    logger.warning(
                        f"Could not load the `decoder` for {model_name}. Defaulting to raw CTC. Try to install `pyctcdecode` and `kenlm`: (`pip install pyctcdecode`, `pip install https://github.com/kpu/kenlm/archive/master.zip`): Error: {e}"
                    )

    # A bare "translation" task is replaced by the first "translation..." key found in the model's
    # task-specific parameters, with a warning telling the user which one was picked.
    if task == "translation" and model.config.task_specific_params:
        for key in model.config.task_specific_params:
            if key.startswith("translation"):
                task = key
                warnings.warn(
                    f'"translation" task was used, instead of "translation_XX_to_YY", defaulting to "{task}"',
                    UserWarning,
                )
                break

    if tokenizer is not None:
        kwargs["tokenizer"] = tokenizer

    if feature_extractor is not None:
        kwargs["feature_extractor"] = feature_extractor

    return pipeline_class(model=model, framework=framework, task=task, **kwargs)