Commit 2d855973 authored by thomwolf's avatar thomwolf Committed by Morgan Funtowicz

add pipeline - train

parent 72c36b9e
import os
from argparse import ArgumentParser, Namespace
from logging import getLogger
from transformers.commands import BaseTransformersCLICommand
@@ -14,8 +14,6 @@ else:
raise ImportError("At least one of PyTorch or TensorFlow 2.0+ should be installed to use CLI training")
# TF training parameters
BATCH_SIZE = 32
EVAL_BATCH_SIZE = BATCH_SIZE * 2
USE_XLA = False
USE_AMP = False
@@ -24,7 +22,7 @@ def train_command_factory(args: Namespace):
Factory function used to instantiate the training command from the provided command line arguments.
:return: TrainCommand
"""
return TrainCommand(args.model)
return TrainCommand(args)
class TrainCommand(BaseTransformersCLICommand):
@@ -38,50 +36,84 @@ class TrainCommand(BaseTransformersCLICommand):
"""
train_parser = parser.add_parser('train', help='CLI tool to train a model on a task.')
train_parser.add_argument('--train_data', type=str, required=True,
help='path to train (and optionally evaluation) dataset.')
help="path to train (and optionally evaluation) dataset as a csv with "
"tab separated labels and sentences.")
train_parser.add_argument('--column_label', type=int, default=0,
help='Column of the dataset csv file with example labels.')
train_parser.add_argument('--column_text', type=int, default=1,
help='Column of the dataset csv file with example texts.')
train_parser.add_argument('--column_id', type=int, default=2,
help='Column of the dataset csv file with example ids.')
train_parser.add_argument('--validation_data', type=str, default='',
help='path to validation dataset.')
train_parser.add_argument('--validation_split', type=float, default=0.1,
help="if validation dataset is not provided, fraction of train dataset "
"to use as validation dataset.")
train_parser.add_argument('--output', type=str, default='./',
help='path to save the trained model.')
train_parser.add_argument('--task', type=str, default='text_classification',
help='Task to train the model on.')
train_parser.add_argument('--model', type=str, default='bert-base-uncased',
help='Model\'s name or path to stored model.')
train_parser.add_argument('--valid_data', type=str, default='',
help='path to validation dataset.')
train_parser.add_argument('--valid_data_ratio', type=float, default=0.1,
help="if validation dataset is not provided, fraction of train dataset "
"to use as validation dataset.")
train_parser.add_argument('--train_batch_size', type=int, default=32,
help='Batch size for training.')
train_parser.add_argument('--valid_batch_size', type=int, default=64,
help='Batch size for validation.')
train_parser.add_argument('--learning_rate', type=float, default=3e-5,
help="Learning rate.")
train_parser.add_argument('--adam_epsilon', type=float, default=1e-08,
help="Epsilon for Adam optimizer.")
train_parser.set_defaults(func=train_command_factory)
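A minimal usage sketch (not part of the commit): wiring a Namespace through the factory above. The file name 'train.csv' is hypothetical; the attribute names mirror the arguments registered above.
from argparse import Namespace

args = Namespace(train_data='train.csv', column_label=0, column_text=1, column_id=2,
                 validation_data='', validation_split=0.1, output='./trained/',
                 task='text_classification', model='bert-base-uncased',
                 train_batch_size=32, valid_batch_size=64,
                 learning_rate=3e-5, adam_epsilon=1e-08)
command = train_command_factory(args)  # returns a TrainCommand
command.run()                          # dispatches to run_tf() or run_torch()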
def __init__(self, model_name: str, task: str, train_data: str,
valid_data: str, valid_data_ratio: float):
self._logger = getLogger('transformers-cli/training')
def __init__(self, args: Namespace):
self.logger = getLogger('transformers-cli/training')
self._framework = 'tf' if is_tf_available() else 'torch'
self.framework = 'tf' if is_tf_available() else 'torch'
self._logger.info('Loading model {}'.format(model_name))
self._model_name = model_name
self._tokenizer = AutoTokenizer.from_pretrained(model_name)
if task == 'text_classification':
self._model = SequenceClassifModel.from_pretrained(model_name)
elif task == 'token_classification':
os.makedirs(args.output, exist_ok=True)
self.output = args.output
self.column_label = args.column_label
self.column_text = args.column_text
self.column_id = args.column_id
self.logger.info('Loading model {}'.format(args.model))
self.model_name = args.model
self.tokenizer = AutoTokenizer.from_pretrained(args.model)
if args.task == 'text_classification':
self.model = SequenceClassifModel.from_pretrained(args.model)
elif args.task == 'token_classification':
raise NotImplementedError
elif task == 'question_answering':
elif args.task == 'question_answering':
raise NotImplementedError
dataset = SingleSentenceClassificationProcessor.create_from_csv(train_data)
num_data_samples = len(SingleSentenceClassificationProcessor)
if valid_data:
self._train_dataset = dataset
self._num_train_samples = num_data_samples
self._valid_dataset = SingleSentenceClassificationProcessor.create_from_csv(valid_data)
self._num_valid_samples = len(self._valid_dataset)
self.logger.info('Loading dataset from {}'.format(args.train_data))
dataset = SingleSentenceClassificationProcessor.create_from_csv(args.train_data)
num_data_samples = len(dataset)
if args.validation_data:
self.logger.info('Loading validation dataset from {}'.format(args.validation_data))
self.valid_dataset = SingleSentenceClassificationProcessor.create_from_csv(args.validation_data)
self.num_valid_samples = len(self.valid_dataset)
self.train_dataset = dataset
self.num_train_samples = num_data_samples
else:
assert 0.0 < valid_data_ratio < 1.0, "--valid_data_ratio should be between 0.0 and 1.0"
self._num_valid_samples = num_data_samples * valid_data_ratio
self._num_train_samples = num_data_samples - self._num_valid_samples
self._train_dataset = dataset[self._num_train_samples]
self._valid_dataset = dataset[self._num_valid_samples]
assert 0.0 < args.validation_split < 1.0, "--validation_split should be between 0.0 and 1.0"
self.num_valid_samples = int(num_data_samples * args.validation_split)
self.num_train_samples = num_data_samples - self.num_valid_samples
self.train_dataset = dataset[:self.num_train_samples]
self.valid_dataset = dataset[self.num_train_samples:]
self.train_batch_size = args.train_batch_size
self.valid_batch_size = args.valid_batch_size
self.learning_rate = args.learning_rate
self.adam_epsilon = args.adam_epsilon
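A worked example of the split above, assuming 1000 rows and the default --validation_split of 0.1:
num_valid_samples = int(1000 * 0.1)           # 100 validation samples
num_train_samples = 1000 - num_valid_samples  # 900 training samples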
def run(self):
if self._framework == 'tf':
if self.framework == 'tf':
return self.run_tf()
return self.run_torch()
@@ -95,27 +127,28 @@ class TrainCommand(BaseTransformersCLICommand):
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": USE_AMP})
# Prepare dataset as a tf.data.Dataset instance
train_dataset = convert_examples_to_features(self._train_dataset, self._tokenizer, mode='sequence_classification')
valid_dataset = convert_examples_to_features(self._valid_dataset, self._tokenizer, mode='sequence_classification')
train_dataset = train_dataset.shuffle(128).batch(BATCH_SIZE).repeat(-1)
valid_dataset = valid_dataset.batch(EVAL_BATCH_SIZE)
self.logger.info('Tokenizing and processing dataset')
train_dataset = self.train_dataset.get_features(self.tokenizer, return_tensors='tf')
valid_dataset = self.valid_dataset.get_features(self.tokenizer, return_tensors='tf')
train_dataset = train_dataset.shuffle(128).batch(self.train_batch_size).repeat(-1)
valid_dataset = valid_dataset.batch(self.valid_batch_size)
# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
opt = tf.keras.optimizers.Adam(learning_rate=self.learning_rate, epsilon=self.adam_epsilon)
if USE_AMP:
# loss scaling is currently required when using mixed precision
opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic')
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=opt, loss=loss, metrics=[metric])
self.model.compile(optimizer=opt, loss=loss, metrics=[metric])
# Train and evaluate using tf.keras.Model.fit()
train_steps = train_examples//BATCH_SIZE
valid_steps = valid_examples//EVAL_BATCH_SIZE
train_steps = self.num_train_samples//self.train_batch_size
valid_steps = self.num_valid_samples//self.valid_batch_size
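# Worked example: with 900 train / 100 valid samples and the default batch sizes,
# train_steps = 900 // 32 = 28 steps per epoch and valid_steps = 100 // 64 = 1.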
history = model.fit(train_dataset, epochs=2, steps_per_epoch=train_steps,
validation_data=valid_dataset, validation_steps=valid_steps)
self.logger.info('Training model')
history = self.model.fit(train_dataset, epochs=2, steps_per_epoch=train_steps,
validation_data=valid_dataset, validation_steps=valid_steps)
# Save TF2 model
os.makedirs('./save/', exist_ok=True)
model.save_pretrained('./save/')
# Save trained model
self.model.save_pretrained(self.output)
@@ -18,6 +18,11 @@ import csv
import sys
import copy
import json
import logging
from ...file_utils import is_tf_available, is_torch_available
logger = logging.getLogger(__name__)
class InputExample(object):
"""
@@ -64,7 +69,7 @@ class InputFeatures(object):
label: Label corresponding to the input
"""
def __init__(self, input_ids, attention_mask, token_type_ids, label):
def __init__(self, input_ids, attention_mask=None, token_type_ids=None, label=None):
self.input_ids = input_ids
self.attention_mask = attention_mask
self.token_type_ids = token_type_ids
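With the relaxed signature above, features can now be built without token_type_ids or label. A hypothetical sketch:
feature = InputFeatures(input_ids=[101, 7592, 102], attention_mask=[1, 1, 1], label=0)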
@@ -86,34 +91,6 @@ class InputFeatures(object):
class DataProcessor(object):
"""Base class for data converters for sequence classification data sets."""
def get_example_from_tensor_dict(self, tensor_dict):
"""Gets an example from a dict with tensorflow tensors
Args:
tensor_dict: Keys and values should match the corresponding Glue
tensorflow_dataset examples.
"""
raise NotImplementedError()
def get_train_examples(self, data_dir):
"""Gets a collection of `InputExample`s for the train set."""
raise NotImplementedError()
def get_dev_examples(self, data_dir):
"""Gets a collection of `InputExample`s for the dev set."""
raise NotImplementedError()
def get_labels(self):
"""Gets the list of labels for this data set."""
raise NotImplementedError()
def tfds_map(self, example):
"""Some tensorflow_datasets datasets are not formatted the same way the GLUE datasets are.
This method converts examples to the correct format."""
if len(self.get_labels()) > 1:
example.label = self.get_labels()[int(example.label)]
return example
@classmethod
def _read_tsv(cls, input_file, quotechar=None):
"""Reads a tab separated value file."""
@@ -129,15 +106,11 @@ class DataProcessor(object):
class SingleSentenceClassificationProcessor(DataProcessor):
""" Generic processor for a single sentence classification data set."""
def __init__(self, labels=None, examples=None):
def __init__(self, labels=None, examples=None, mode='classification', verbose=False):
self.labels = [] if labels is None else labels
self.examples = [] if examples is None else examples
@classmethod
def create_from_csv(cls, file_name):
processor = cls()
processor.add_examples_from_csv(file_name)
return processor
self.mode = mode
self.verbose = verbose
def __len__(self):
return len(self.examples)
@@ -148,30 +121,40 @@ class SingleSentenceClassificationProcessor(DataProcessor):
examples=self.examples[idx])
return self.examples[idx]
def get_labels(self):
"""Gets the list of labels for this data set."""
return self.labels
@classmethod
def create_from_csv(cls, file_name, **kwargs):
processor = cls(**kwargs)
processor.add_examples_from_csv(file_name)
return processor
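Hypothetical usage of the classmethod above; the constructor options (mode, verbose) are forwarded through **kwargs, and 'train.tsv' is a placeholder file name:
processor = SingleSentenceClassificationProcessor.create_from_csv('train.tsv', verbose=True)
print(len(processor))  # number of loaded examples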
def add_examples_from_csv(self, file_name):
def add_examples_from_csv(self, file_name, split_name='', column_label=0, column_text=1, column_id=None,
overwrite_labels=False, overwrite_examples=False):
lines = self._read_tsv(file_name)
return self.add_examples_from_lines(lines, split_name=split_name, column_label=column_label,
column_text=column_text, column_id=column_id,
overwrite_labels=overwrite_labels, overwrite_examples=overwrite_examples)
def add_examples_from_lines(self, lines, split_name='', column_label=0, column_text=1, column_id=None,
overwrite_labels=False, overwrite_examples=False):
"""Creates examples for the training and dev sets."""
added_labels = set()
examples = []
texts = []
labels = []
ids = []
for (i, line) in enumerate(lines):
if len(line) > 2:
guid = "%s-%s" % (split_name, line[0]) if split_name else line[0]
label = line[1]
text_a = line[2]
texts.append(line[column_text])
labels.append(line[column_label])
if column_id is not None:
ids.append(line[column_id])
else:
guid = "%s-%s" % (split_name, i) if split_name else "%s" % i
label = line[0]
text_a = line[1]
ids.append(guid)
return self.add_examples(texts, labels, ids, overwrite_labels=overwrite_labels, overwrite_examples=overwrite_examples)
def add_examples(self, texts, labels, ids=None, overwrite_labels=False, overwrite_examples=False):
if ids is None:
ids = [None] * len(texts)
assert len(texts) == len(labels)
assert len(texts) == len(ids)
examples = []
added_labels = set()
for (text, label, guid) in zip(texts, labels, ids):
added_labels.add(label)
examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
examples.append(InputExample(guid=guid, text_a=text, text_b=None, label=label))
# Update examples
if overwrite_examples:
@@ -187,123 +170,129 @@ class SingleSentenceClassificationProcessor(DataProcessor):
return self.examples
@classmethod
def create_from_examples(cls, texts, labels, **kwargs):
processor = cls(**kwargs)
processor.add_examples(texts, labels)
return processor
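Hypothetical usage: building a processor directly from parallel lists of texts and labels.
processor = SingleSentenceClassificationProcessor.create_from_examples(
    ['a great movie', 'a terrible movie'], ['pos', 'neg'])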
def convert_examples_to_features(examples, tokenizer,
mode='sequence_classification',
max_length=512,
pad_on_left=False,
pad_token=0,
pad_token_segment_id=0,
mask_padding_with_zero=True):
"""
Loads a data file into a list of ``InputFeatures``
def get_features(self,
tokenizer,
max_length=None,
pad_on_left=False,
pad_token=0,
mask_padding_with_zero=True,
return_tensors=None):
"""
Convert examples into a list of ``InputFeatures``
Args:
examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
tokenizer: Instance of a tokenizer that will tokenize the examples
max_length: Maximum example length
task: GLUE task
label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
output_mode: String indicating the output mode. Either ``regression`` or ``classification``
pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
pad_token: Padding token
pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
actual values)
Returns:
If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
containing the task-specific features. If the input is a list of ``InputExamples``, will return
a list of task-specific ``InputFeatures`` which can be fed to the model.
Args:
tokenizer: Instance of a tokenizer that will tokenize the examples
max_length: Maximum example length (defaults to the tokenizer's maximum length)
pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
pad_token: Padding token
mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
actual values)
return_tensors: If set to ``'tf'`` or ``'pt'``, returns a ``tf.data.Dataset`` or a PyTorch ``TensorDataset``
instead of a list of ``InputFeatures``
Returns:
If ``return_tensors`` is ``None``, a list of task-specific ``InputFeatures`` which can be fed to the model.
If ``return_tensors`` is ``'tf'`` or ``'pt'``, the corresponding dataset object containing the features.
"""
is_tf_dataset = False
if is_tf_available() and isinstance(examples, tf.data.Dataset):
is_tf_dataset = True
if task is not None:
processor = glue_processors[task]()
if label_list is None:
label_list = processor.get_labels()
logger.info("Using label list %s for task %s" % (label_list, task))
if output_mode is None:
output_mode = glue_output_modes[task]
logger.info("Using output mode %s for task %s" % (output_mode, task))
label_map = {label: i for i, label in enumerate(label_list)}
features = []
for (ex_index, example) in enumerate(examples):
if ex_index % 10000 == 0:
logger.info("Writing example %d" % (ex_index))
if is_tf_dataset:
example = processor.get_example_from_tensor_dict(example)
inputs = tokenizer.encode_plus(
example.text_a,
example.text_b,
add_special_tokens=True,
max_length=max_length,
)
input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
# Zero-pad up to the sequence length.
padding_length = max_length - len(input_ids)
if pad_on_left:
input_ids = ([pad_token] * padding_length) + input_ids
attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
else:
input_ids = input_ids + ([pad_token] * padding_length)
attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)
if output_mode == "classification":
label = label_map[example.label]
elif output_mode == "regression":
label = float(example.label)
"""
label_map = {label: i for i, label in enumerate(self.labels)}
if max_length is None:
max_length = tokenizer.max_len
all_input_ids = []
for (ex_index, example) in enumerate(self.examples):
if ex_index % 10000 == 0:
logger.info("Tokenizing example %d", ex_index)
input_ids = tokenizer.encode(
example.text_a,
add_special_tokens=True,
max_length=min(max_length, tokenizer.max_len),
)
all_input_ids.append(input_ids)
batch_length = max(len(input_ids) for input_ids in all_input_ids)
features = []
for (ex_index, (input_ids, example)) in enumerate(zip(all_input_ids, self.examples)):
if ex_index % 10000 == 0:
logger.info("Writing example %d", ex_index)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
# Zero-pad up to the sequence length.
padding_length = batch_length - len(input_ids)
if pad_on_left:
input_ids = ([pad_token] * padding_length) + input_ids
attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
else:
input_ids = input_ids + ([pad_token] * padding_length)
attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
assert len(input_ids) == batch_length, "Error with input length {} vs {}".format(len(input_ids), batch_length)
assert len(attention_mask) == batch_length, "Error with input length {} vs {}".format(len(attention_mask), batch_length)
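# Worked example (hypothetical values): input_ids [101, 2023, 102] padded to a
# batch_length of 5 on the right gives [101, 2023, 102, 0, 0] with
# attention_mask [1, 1, 1, 0, 0]; pad_on_left prepends the padding instead.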
if self.mode == "classification":
label = label_map[example.label]
elif self.mode == "regression":
label = float(example.label)
else:
raise ValueError(self.mode)
if ex_index < 5 and self.verbose:
logger.info("*** Example ***")
logger.info("guid: %s" % (example.guid))
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
logger.info("label: %s (id = %d)" % (example.label, label))
features.append(
InputFeatures(input_ids=input_ids,
attention_mask=attention_mask,
label=label))
if return_tensors is None:
return features
elif return_tensors == 'tf':
if not is_tf_available():
raise ImportError("return_tensors set to 'tf' but TensorFlow 2.0 can't be imported")
import tensorflow as tf
def gen():
for ex in features:
yield ({'input_ids': ex.input_ids,
'attention_mask': ex.attention_mask},
ex.label)
dataset = tf.data.Dataset.from_generator(gen,
({'input_ids': tf.int32,
'attention_mask': tf.int32},
tf.int64),
({'input_ids': tf.TensorShape([None]),
'attention_mask': tf.TensorShape([None])},
tf.TensorShape([])))
return dataset
elif return_tensors == 'pt':
if not is_torch_available():
raise ImportError("return_tensors set to 'pt' but PyTorch can't be imported")
import torch
from torch.utils.data import TensorDataset
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
if self.mode == "classification":
all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
elif self.mode == "regression":
all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
dataset = TensorDataset(all_input_ids, all_attention_mask, all_labels)
return dataset
else:
raise KeyError(output_mode)
if ex_index < 5:
logger.info("*** Example ***")
logger.info("guid: %s" % (example.guid))
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
logger.info("label: %s (id = %d)" % (example.label, label))
features.append(
InputFeatures(input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
label=label))
if is_tf_available() and is_tf_dataset:
def gen():
for ex in features:
yield ({'input_ids': ex.input_ids,
'attention_mask': ex.attention_mask,
'token_type_ids': ex.token_type_ids},
ex.label)
return tf.data.Dataset.from_generator(gen,
({'input_ids': tf.int32,
'attention_mask': tf.int32,
'token_type_ids': tf.int32},
tf.int64),
({'input_ids': tf.TensorShape([None]),
'attention_mask': tf.TensorShape([None]),
'token_type_ids': tf.TensorShape([None])},
tf.TensorShape([])))
return features
raise ValueError("return_tensors should be one of 'tf' or 'pt'")
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Pipeline class: Tokenizer + Model. """
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import logging
from .modeling_auto import (AutoModel, AutoModelForQuestionAnswering,
AutoModelForSequenceClassification,
AutoModelWithLMHead)
from .tokenization_auto import AutoTokenizer
from .file_utils import add_start_docstrings, is_tf_available, is_torch_available
from .data.processors import SingleSentenceClassificationProcessor
if is_tf_available():
import tensorflow as tf
if is_torch_available():
import torch
logger = logging.getLogger(__name__)
# TF training parameters
USE_XLA = False
USE_AMP = False
class TextClassificationPipeline(object):
r"""
:class:`~transformers.TextClassificationPipeline` is a class encapsulating a pretrained model and
its tokenizer. The underlying model is instantiated as one of the base model classes of the library
when created with the `TextClassificationPipeline.from_pretrained(pretrained_model_name_or_path)`
class method.
The `from_pretrained()` method takes care of returning the correct model class instance
using pattern matching on the `pretrained_model_name_or_path` string.
The base model class to instantiate is selected as the first pattern matching
in the `pretrained_model_name_or_path` string (in the following order):
- contains `distilbert`: DistilBertModel (DistilBERT model)
- contains `roberta`: RobertaModel (RoBERTa model)
- contains `bert`: BertModel (Bert model)
- contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
- contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
- contains `ctrl`: CTRLModel (Salesforce CTRL model)
- contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
- contains `xlnet`: XLNetModel (XLNet model)
- contains `xlm`: XLMModel (XLM model)
"""
def __init__(self, tokenizer, model):
self.tokenizer = tokenizer
self.model = model
if is_tf_available():
self.framework = 'tf'
elif is_torch_available():
self.framework = 'pt'
else:
raise ImportError("At least one of PyTorch or TensorFlow 2.0+ should be installed to use CLI training")
self.is_compiled = False
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
r""" Instantiates one of the base model classes of the library
from a pre-trained model configuration.
The model class to instantiate is selected as the first pattern matching
in the `pretrained_model_name_or_path` string (in the following order):
- contains `distilbert`: DistilBertModel (DistilBERT model)
- contains `roberta`: RobertaModel (RoBERTa model)
- contains `bert`: BertModel (Bert model)
- contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
- contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
- contains `ctrl`: CTRLModel (Salesforce CTRL model)
- contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
- contains `xlnet`: XLNetModel (XLNet model)
- contains `xlm`: XLMModel (XLM model)
The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
To train the model, you should first set it back in training mode with `model.train()`
Params:
pretrained_model_name_or_path: either:
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint to a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
- the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
- the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
state_dict: (`optional`) dict:
an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
This option can be used if you want to create a model from a pretrained configuration but load your own weights.
In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
cache_dir: (`optional`) string:
Path to a directory in which a downloaded pre-trained model
configuration should be cached if the standard cache should not be used.
force_download: (`optional`) boolean, default False:
Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
proxies: (`optional`) dict, default None:
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
The proxies are used on each request.
output_loading_info: (`optional`) boolean:
Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.
kwargs: (`optional`) Remaining dictionary of keyword arguments:
Can be used to update the configuration object (after it has been loaded) and to initialize the model (e.g. ``output_attentions=True``). Behaves differently depending on whether a `config` is provided or automatically loaded:
- If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
- If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
Examples::
pipeline = TextClassificationPipeline.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
pipeline = TextClassificationPipeline.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
pipeline = TextClassificationPipeline.from_pretrained('bert-base-uncased', model_output_attentions=True)  # Update the model configuration during loading
assert pipeline.model.config.output_attentions == True
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
pipeline = TextClassificationPipeline.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', model_from_tf=True, model_config=config)
"""
# Extract tokenizer and model arguments
tokenizer_kwargs = {}
for key in list(kwargs):
if key.startswith('tokenizer_'):
# Specific to the tokenizer
tokenizer_kwargs[key.replace('tokenizer_', '')] = kwargs.pop(key)
elif not key.startswith('model_'):
# Used for both the tokenizer and the model
tokenizer_kwargs[key] = kwargs[key]
model_kwargs = {}
for key in list(kwargs):
if key.startswith('model_'):
# Specific to the model
model_kwargs[key.replace('model_', '')] = kwargs.pop(key)
elif not key.startswith('tokenizer_'):
# Used for both the tokenizer and the model
model_kwargs[key] = kwargs[key]
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, **tokenizer_kwargs)
model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path, **model_kwargs)
return cls(tokenizer, model)
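A hypothetical call illustrating the prefix routing above: 'model_num_labels' reaches the model as num_labels, 'tokenizer_do_lower_case' reaches the tokenizer as do_lower_case, and unprefixed kwargs go to both.
pipeline = TextClassificationPipeline.from_pretrained('bert-base-uncased',
                                                      model_num_labels=2,
                                                      tokenizer_do_lower_case=True)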
def save_pretrained(self, save_directory):
if not os.path.isdir(save_directory):
logger.error("Saving directory ({}) should be a directory".format(save_directory))
return
self.model.save_pretrained(save_directory)
self.tokenizer.save_pretrained(save_directory)
def compile(self, learning_rate=3e-5, epsilon=1e-8):
if self.framework == 'tf':
logger.info('Preparing model')
# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
opt = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon)
if USE_AMP:
# loss scaling is currently required when using mixed precision
opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic')
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
self.model.compile(optimizer=opt, loss=loss, metrics=[metric])
else:
raise NotImplementedError
self.is_compiled = True
def prepare_data(self, train_samples_text, train_samples_labels,
valid_samples_text=None, valid_samples_labels=None,
validation_split=0.1):
dataset = SingleSentenceClassificationProcessor.create_from_examples(train_samples_text,
train_samples_labels)
num_data_samples = len(dataset)
if valid_samples_text is not None and valid_samples_labels is not None:
valid_dataset = SingleSentenceClassificationProcessor.create_from_examples(valid_samples_text,
valid_samples_labels)
num_valid_samples = len(valid_dataset)
train_dataset = dataset
num_train_samples = num_data_samples
else:
assert 0.0 < validation_split < 1.0, "validation_split should be between 0.0 and 1.0"
num_valid_samples = int(num_data_samples * validation_split)
num_train_samples = num_data_samples - num_valid_samples
train_dataset = dataset[:num_train_samples]
valid_dataset = dataset[num_train_samples:]
logger.info('Tokenizing and processing dataset')
train_dataset = train_dataset.get_features(self.tokenizer, return_tensors=self.framework)
valid_dataset = valid_dataset.get_features(self.tokenizer, return_tensors=self.framework)
return train_dataset, valid_dataset, num_train_samples, num_valid_samples
def fit(self, train_samples_text, train_samples_labels,
valid_samples_text=None, valid_samples_labels=None,
train_batch_size=32, valid_batch_size=64,
validation_split=0.1,
**kwargs):
if not self.is_compiled:
self.compile()
datasets = self.prepare_data(train_samples_text, train_samples_labels,
valid_samples_text, valid_samples_labels,
validation_split)
train_dataset, valid_dataset, num_train_samples, num_valid_samples = datasets
train_steps = num_train_samples//train_batch_size
valid_steps = num_valid_samples//valid_batch_size
if self.framework == 'tf':
# Prepare dataset as a tf.data.Dataset instance
train_dataset = train_dataset.shuffle(128).batch(train_batch_size).repeat(-1)
valid_dataset = valid_dataset.batch(valid_batch_size)
logger.info('Training TF 2.0 model')
history = self.model.fit(train_dataset, epochs=2, steps_per_epoch=train_steps,
validation_data=valid_dataset, validation_steps=valid_steps, **kwargs)
else:
raise NotImplementedError
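A training sketch under the TF 2.0 path, with hypothetical toy data; compile() is invoked implicitly by fit():
texts = ['a great movie', 'a terrible movie'] * 100
labels = ['pos', 'neg'] * 100
pipeline = TextClassificationPipeline.from_pretrained('bert-base-uncased', model_num_labels=2)
pipeline.fit(texts, labels, train_batch_size=32, valid_batch_size=64)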
def __call__(self, text):
inputs = self.tokenizer.encode_plus(text, add_special_tokens=True, return_tensors=self.framework)
if self.framework == 'tf':
# TODO trace model
predictions = self.model(**inputs)[0]
else:
with torch.no_grad():
predictions = self.model(**inputs)[0]
return predictions.numpy().tolist()
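Inference sketch: the pipeline is directly callable and returns the raw logits as a nested Python list.
scores = pipeline('This is a great movie!')
print(scores)  # e.g. [[0.12, -0.34]] (hypothetical values)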