Unverified Commit fa0be6d7 authored by Patrick von Platen, committed by GitHub

Benchmarks (#4912)

* finish benchmark

* fix isort

* fix setup cfg

* retab

* fix time measuring of tf graph mode

* fix tf cuda

* clean code

* better error message
parent 18a0150b
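
For orientation, a minimal usage sketch of the PyTorch benchmark API added in this commit; the model name, batch sizes, and sequence lengths are borrowed from the tests further down in this diff, and the snippet is illustrative only, not part of the commit.

from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments

# Arguments mirror the test cases below; "sshleifer/tiny-gpt2" is just a small example model.
benchmark_args = PyTorchBenchmarkArguments(
    models=["sshleifer/tiny-gpt2"],
    training=False,
    no_inference=False,
    sequence_lengths=[8],
    batch_sizes=[1],
    no_multi_process=True,  # single-process mode; the new help text recommends it only for debugging
)
benchmark = PyTorchBenchmark(args=benchmark_args)
results = benchmark.run()
print(results.time_inference_result)    # inference speed results
print(results.memory_inference_result)  # inference memory results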
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
# Copyright 2020 The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
......
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Benchmarking the library on inference and training in Tensorflow"""
from transformers import HfArgumentParser, TensorflowBenchmark, TensorflowBenchmarkArguments
def main():
parser = HfArgumentParser(TensorflowBenchmarkArguments)
benchmark_args = parser.parse_args_into_dataclasses()[0]
benchmark = TensorflowBenchmark(args=benchmark_args)
benchmark.run()
if __name__ == "__main__":
main()
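
The script above is a thin CLI wrapper; the same benchmark can also be driven from Python directly. A minimal sketch, with the model and sizes borrowed from the TensorFlow tests at the end of this diff:

from transformers import TensorflowBenchmark, TensorflowBenchmarkArguments

benchmark_args = TensorflowBenchmarkArguments(
    models=["sshleifer/tiny-gpt2"],  # small example model, as used in the tests below
    training=False,
    no_inference=False,
    sequence_lengths=[8],
    batch_sizes=[1],
    eager_mode=True,        # benchmark in eager mode; leave False (default) for graph mode
    no_multi_process=True,
)
benchmark = TensorflowBenchmark(args=benchmark_args)
results = benchmark.run()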
import faiss
import nlp
import numpy as np
import torch
from elasticsearch import Elasticsearch
import faiss
import nlp
import streamlit as st
import transformers
from elasticsearch import Elasticsearch
from eli5_utils import (
embed_questions_for_retrieval,
make_qa_s2s_model,
......
......@@ -4,17 +4,17 @@ import os # noqa: F401
from random import choice, randint
from time import time
import faiss # noqa: F401
import nlp # noqa: F401
import numpy as np
import pandas as pd
import torch
import torch.utils.checkpoint as checkpoint
from elasticsearch import Elasticsearch # noqa: F401
from elasticsearch.helpers import bulk, streaming_bulk # noqa: F401
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from tqdm import tqdm
import faiss # noqa: F401
import nlp # noqa: F401
from elasticsearch import Elasticsearch # noqa: F401
from elasticsearch.helpers import bulk, streaming_bulk # noqa: F401
from transformers import AdamW, AutoModel, AutoModelForSeq2SeqLM, AutoTokenizer, get_linear_schedule_with_warmup
......
......@@ -8,3 +8,8 @@ tensorflow_datasets
pytorch-lightning==0.7.6
matplotlib
git-python==1.0.3
faiss
streamlit
elasticsearch
pandas
nlp
......@@ -5,12 +5,15 @@ include_trailing_comma = True
known_first_party = transformers
known_third_party =
absl
elasticsearch
fairseq
faiss
fastprogress
git
h5py
matplotlib
MeCab
nlp
nltk
numpy
packaging
......
......@@ -78,6 +78,9 @@ from .file_utils import (
add_end_docstrings,
add_start_docstrings,
cached_path,
is_apex_available,
is_psutil_available,
is_py3nvml_available,
is_tf_available,
is_torch_available,
is_torch_tpu_available,
......@@ -398,7 +401,8 @@ if is_torch_available():
from .data.datasets import GlueDataset, TextDataset, LineByLineTextDataset, GlueDataTrainingArguments
# Benchmarks
from .benchmark import PyTorchBenchmark, PyTorchBenchmarkArguments
from .benchmark.benchmark import PyTorchBenchmark
from .benchmark.benchmark_args import PyTorchBenchmarkArguments
# TensorFlow
if is_tf_available():
......@@ -608,6 +612,10 @@ if is_tf_available():
# Trainer
from .trainer_tf import TFTrainer
# Benchmarks
from .benchmark.benchmark_tf import TensorflowBenchmark
from .benchmark.benchmark_args_tf import TensorflowBenchmarkArguments
if not is_tf_available() and not is_torch_available():
logger.warning(
......
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all.
from ..file_utils import is_torch_available
if is_torch_available():
from .benchmark_args import PyTorchBenchmarkArguments
from .benchmark import PyTorchBenchmark
......@@ -20,16 +20,24 @@
import logging
import timeit
from typing import Callable, Optional
from transformers import (
MODEL_MAPPING,
MODEL_WITH_LM_HEAD_MAPPING,
PretrainedConfig,
is_py3nvml_available,
is_torch_available,
is_torch_tpu_available,
)
from .benchmark_utils import Benchmark, Memory, measure_peak_memory_cpu, start_memory_tracing, stop_memory_tracing
from .benchmark_utils import (
Benchmark,
Memory,
MemorySummary,
measure_peak_memory_cpu,
start_memory_tracing,
stop_memory_tracing,
)
if is_torch_available():
......@@ -37,6 +45,10 @@ if is_torch_available():
from .benchmark_args import PyTorchBenchmarkArguments
if is_py3nvml_available():
import py3nvml.py3nvml as nvml
logger = logging.getLogger(__name__)
......@@ -50,220 +62,173 @@ class PyTorchBenchmark(Benchmark):
def framework_version(self):
return torch.__version__
def train(self, model_name, batch_size, sequence_length, trace_memory=False):
try:
config = self.config_dict[model_name]
def _inference_speed(self, model_name: str, batch_size: int, sequence_length: int) -> float:
_inference = self._prepare_inference_func(model_name, batch_size, sequence_length)
return self._measure_speed(_inference)
if self.args.torchscript:
config.torchscript = True
def _inference_memory(
self, model_name: str, batch_size: int, sequence_length: int
) -> [Memory, Optional[MemorySummary]]:
_inference = self._prepare_inference_func(model_name, batch_size, sequence_length)
return self._measure_memory(_inference)
model = MODEL_WITH_LM_HEAD_MAPPING[config.__class__](config)
model.to(self.args.device)
model.train()
def _train_speed(self, model_name: str, batch_size: int, sequence_length: int) -> float:
_train = self._prepare_train_func(model_name, batch_size, sequence_length)
return self._measure_speed(_train)
# encoder-decoder has vocab size saved differently
vocab_size = config.vocab_size if hasattr(config, "vocab_size") else config.encoder.vocab_size
input_ids = torch.randint(
vocab_size, (batch_size, sequence_length), dtype=torch.long, device=self.args.device
)
def _train_memory(
self, model_name: str, batch_size: int, sequence_length: int
) -> [Memory, Optional[MemorySummary]]:
_train = self._prepare_train_func(model_name, batch_size, sequence_length)
return self._measure_memory(_train)
if self.args.torchscript:
raise NotImplementedError("Training for torchscript is currently not implemented")
else:
train_model = model
def compute_loss_and_backprob_encoder():
loss = train_model(input_ids, labels=input_ids)[0]
loss.backward()
train_model.zero_grad()
def compute_loss_and_backprob_encoder_decoder():
loss = train_model(input_ids, decoder_input_ids=input_ids, labels=input_ids)[0]
loss.backward()
train_model.zero_grad()
_train = (
compute_loss_and_backprob_encoder_decoder
if config.is_encoder_decoder
else compute_loss_and_backprob_encoder
)
if trace_memory is True:
if self.args.trace_memory_line_by_line:
trace = start_memory_tracing("transformers")
if self.args.n_gpu > 0:
# gpu
# clear gpu cache
torch.cuda.empty_cache()
if hasattr(torch.cuda, "max_memory_reserved"):
torch.cuda.reset_peak_memory_stats()
else:
logger.info(
"Please consider updating PyTorch to version 1.4 to get more accuracy on GPU memory usage"
)
torch.cuda.reset_max_memory_cached()
# calculate loss and do backpropagation
_train()
elif not self.args.no_tpu and is_torch_tpu_available():
# tpu
raise NotImplementedError(
"Memory Benchmarking is currently not implemented for TPU. Please disable memory benchmarking with `args.no_memory=True`"
)
else:
# cpu
memory_bytes = measure_peak_memory_cpu(_train)
memory = Memory(memory_bytes) if isinstance(memory_bytes, int) else memory_bytes
def _prepare_inference_func(self, model_name: str, batch_size: int, sequence_length: int) -> Callable[[], None]:
config = self.config_dict[model_name]
if self.args.trace_memory_line_by_line:
summary = stop_memory_tracing(trace)
else:
summary = None
if self.args.n_gpu > 0:
# gpu
if hasattr(torch.cuda, "max_memory_reserved"):
memory = Memory(torch.cuda.max_memory_reserved())
else:
logger.info(
"Please consider updating PyTorch to version 1.4 to get more accuracy on GPU memory usage"
)
memory = Memory(torch.cuda.max_memory_reserved())
return memory, summary
else:
if (not self.args.no_tpu and is_torch_tpu_available()) or self.args.torchscript:
# run additional 10 times to stabilize compilation for tpu and torchscript
logger.info("Do inference on TPU or torchscript. Running model 5 times to stabilize compilation")
timeit.repeat(
_train, repeat=1, number=5,
)
if self.args.torchscript:
config.torchscript = True
if self.args.with_lm_head:
model = MODEL_WITH_LM_HEAD_MAPPING[config.__class__](config)
else:
model = MODEL_MAPPING[config.__class__](config)
model.eval()
model.to(self.args.device)
# encoder-decoder has vocab size saved differently
vocab_size = config.vocab_size if hasattr(config, "vocab_size") else config.encoder.vocab_size
input_ids = torch.randint(vocab_size, (batch_size, sequence_length), dtype=torch.long, device=self.args.device)
if self.args.fp16:
logger.info("Running training in Mixed Precision...")
assert self.args.is_gpu, "Mixed precision is possible only for GPU."
# amp seems to have memory leaks so that memory usage
# is measured using .half() for now https://github.com/NVIDIA/apex/issues/439
model.half()
if self.args.torchscript:
with torch.no_grad():
inference_model = torch.jit.trace(model, input_ids)
else:
inference_model = model
def encoder_decoder_forward():
with torch.no_grad():
outputs = inference_model(input_ids, decoder_input_ids=input_ids)
return outputs
def encoder_forward():
with torch.no_grad():
outputs = inference_model(input_ids)
return outputs
_forward = encoder_decoder_forward if config.is_encoder_decoder else encoder_forward
return _forward
def _prepare_train_func(self, model_name: str, batch_size: int, sequence_length: int) -> Callable[[], None]:
config = self.config_dict[model_name]
model = MODEL_WITH_LM_HEAD_MAPPING[config.__class__](config)
if self.args.torchscript:
raise NotImplementedError("Training for torchscript is currently not implemented")
else:
train_model = model
model.eval()
model.to(self.args.device)
# encoder-decoder has vocab size saved differently
vocab_size = config.vocab_size if hasattr(config, "vocab_size") else config.encoder.vocab_size
input_ids = torch.randint(vocab_size, (batch_size, sequence_length), dtype=torch.long, device=self.args.device)
if self.args.fp16:
logger.info("Running training in Mixed Precision...")
assert self.args.is_gpu, "Mixed precision is possible only for GPU."
# amp seems to have memory leaks so that memory usage
# is measured using .half() for now https://github.com/NVIDIA/apex/issues/439
model.half()
def compute_loss_and_backprob_encoder():
loss = train_model(input_ids, labels=input_ids)[0]
loss.backward()
train_model.zero_grad()
def compute_loss_and_backprob_encoder_decoder():
loss = train_model(input_ids, decoder_input_ids=input_ids, labels=input_ids)[0]
loss.backward()
train_model.zero_grad()
_train = (
compute_loss_and_backprob_encoder_decoder
if config.is_encoder_decoder
else compute_loss_and_backprob_encoder
)
return _train
def _measure_speed(self, func) -> float:
try:
if self.args.is_tpu or self.args.torchscript:
# run additional 10 times to stabilize compilation for tpu and torchscript
logger.info("Do inference on TPU or torchscript. Running model 5 times to stabilize compilation")
timeit.repeat(
func, repeat=1, number=5,
)
# as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average
runtimes = timeit.repeat(_train, repeat=self.args.repeat, number=10,)
# as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average
runtimes = timeit.repeat(func, repeat=self.args.repeat, number=10,)
if not self.args.no_tpu and is_torch_tpu_available() and self.args.tpu_print_metrics:
import torch_xla.debug.metrics as met
if self.args.is_tpu and self.args.torch_xla_tpu_print_metrics:
import torch_xla.debug.metrics as met
self.print_fn(met.metrics_report())
self.print_fn(met.metrics_report())
return min(runtimes) / 10.0
return min(runtimes) / 10.0
except RuntimeError as e:
self.print_fn("Doesn't fit on GPU. {}".format(e))
if trace_memory:
return "N/A", None
else:
return "N/A"
return "N/A"
def inference(self, model_name, batch_size, sequence_length, trace_memory=False):
def _measure_memory(self, func: Callable[[], None]) -> [Memory, MemorySummary]:
try:
config = self.config_dict[model_name]
model = None
if self.args.torchscript:
config.torchscript = True
if self.args.with_lm_head:
model = MODEL_WITH_LM_HEAD_MAPPING[config.__class__](config)
else:
model = MODEL_MAPPING[config.__class__](config)
model.eval()
model.to(self.args.device)
# encoder-decoder has vocab size saved differently
vocab_size = config.vocab_size if hasattr(config, "vocab_size") else config.encoder.vocab_size
input_ids = torch.randint(
vocab_size, (batch_size, sequence_length), dtype=torch.long, device=self.args.device
)
if self.args.torchscript:
with torch.no_grad():
if config.is_encoder_decoder:
raise NotImplementedError("Torchscript is currently not supported for EncoderDecoder models")
else:
inference_model = torch.jit.trace(model, input_ids)
else:
inference_model = model
def encoder_decoder_forward():
with torch.no_grad():
inference_model(input_ids, decoder_input_ids=input_ids)
def encoder_forward():
with torch.no_grad():
inference_model(input_ids)
_forward = encoder_decoder_forward if config.is_encoder_decoder else encoder_forward
if trace_memory is True:
if self.args.trace_memory_line_by_line:
trace = start_memory_tracing("transformers")
if self.args.n_gpu > 0:
# gpu
# clear gpu cache
torch.cuda.empty_cache()
if hasattr(torch.cuda, "max_memory_reserved"):
torch.cuda.reset_peak_memory_stats()
else:
logger.info(
"Please consider updating PyTorch to version 1.4 to get more accuracy on GPU memory usage"
)
torch.cuda.reset_max_memory_cached()
# run forward
_forward()
elif not self.args.no_tpu and is_torch_tpu_available():
# tpu
raise NotImplementedError(
"Memory Benchmarking is currently not implemented for TPU. Please disable memory benchmarking with `args.no_memory=True`"
if self.args.trace_memory_line_by_line:
trace = start_memory_tracing("transformers")
if self.args.is_tpu:
# tpu
raise NotImplementedError(
"Memory Benchmarking is currently not implemented for TPU. Please disable memory benchmarking with `--no_memory` or `args.no_memory=True`"
)
elif self.args.is_gpu:
if not is_py3nvml_available():
logger.warning(
"py3nvml not installed, we won't log GPU memory usage. "
"Install py3nvml (pip install py3nvml) to log information about GPU."
)
memory = "N/A"
else:
# cpu
memory_bytes = measure_peak_memory_cpu(_forward)
memory = Memory(memory_bytes) if isinstance(memory_bytes, int) else memory_bytes
if self.args.trace_memory_line_by_line:
summary = stop_memory_tracing(trace)
else:
summary = None
if self.args.n_gpu > 0:
# gpu
if hasattr(torch.cuda, "max_memory_reserved"):
memory = Memory(torch.cuda.max_memory_reserved())
else:
logger.info(
"Please consider updating PyTorch to version 1.4 to get more accuracy on GPU memory usage"
)
memory = Memory(torch.cuda.max_memory_cached())
return memory, summary
else:
if (not self.args.no_tpu and is_torch_tpu_available()) or self.args.torchscript:
# run additional 10 times to stabilize compilation for tpu and torchscript
logger.info("Do inference on TPU or torchscript. Running model 5 times to stabilize compilation")
timeit.repeat(
_forward, repeat=1, number=5,
logger.info(
"Measuring total GPU usage on GPU device. Make sure to not have additional processes running on the same GPU."
)
# init nvml
nvml.nvmlInit()
func()
handle = nvml.nvmlDeviceGetHandleByIndex(self.args.device_idx)
meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
max_bytes_in_use = meminfo.used
memory = Memory(max_bytes_in_use)
# shutdown nvml
nvml.nvmlShutdown()
else:
# cpu
memory_bytes = measure_peak_memory_cpu(func)
memory = Memory(memory_bytes) if isinstance(memory_bytes, int) else memory_bytes
# as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average
runtimes = timeit.repeat(_forward, repeat=self.args.repeat, number=10,)
if not self.args.no_tpu and is_torch_tpu_available() and self.args.tpu_print_metrics:
import torch_xla.debug.metrics as met
self.print_fn(met.metrics_report())
return min(runtimes) / 10.0
if self.args.trace_memory_line_by_line:
summary = stop_memory_tracing(trace)
else:
summary = None
return memory, summary
except RuntimeError as e:
self.print_fn("Doesn't fit on GPU. {}".format(e))
if trace_memory:
return "N/A", None
else:
return "N/A"
return "N/A", None
......@@ -34,11 +34,17 @@ logger = logging.getLogger(__name__)
@dataclass
class PyTorchBenchmarkArguments(BenchmarkArguments):
no_cuda: bool = field(default=False, metadata={"help": "Whether to run on available cuda devices"})
torchscript: bool = field(default=False, metadata={"help": "Trace the models using torchscript"})
no_tpu: bool = field(default=False, metadata={"help": "Whether to run on available tpu devices"})
fp16: bool = field(default=False, metadata={"help": "Use FP16 to accelerate inference."})
tpu_print_metrics: bool = field(default=False, metadata={"help": "Use FP16 to accelerate inference."})
torch_xla_tpu_print_metrics: bool = field(default=False, metadata={"help": "Print Xla/PyTorch tpu metrics"})
fp16_opt_level: str = field(
default="O1",
metadata={
"help": (
"For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
"See details at https://nvidia.github.io/apex/amp.html"
)
},
)
@cached_property
@torch_required
......@@ -55,9 +61,14 @@ class PyTorchBenchmarkArguments(BenchmarkArguments):
n_gpu = torch.cuda.device_count()
return device, n_gpu
@property
def is_tpu(self):
return is_torch_tpu_available() and not self.no_tpu
@property
@torch_required
def device_idx(self) -> int:
# TODO(PVP): currently only single GPU is supported
return torch.cuda.current_device()
@property
......@@ -69,3 +80,7 @@ class PyTorchBenchmarkArguments(BenchmarkArguments):
@torch_required
def n_gpu(self):
return self._setup_devices[1]
@property
def is_gpu(self):
return self.n_gpu > 0
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from dataclasses import dataclass, field
from typing import Tuple
from ..file_utils import cached_property, is_tf_available, tf_required
from .benchmark_args_utils import BenchmarkArguments
if is_tf_available():
import tensorflow as tf
logger = logging.getLogger(__name__)
@dataclass
class TensorflowBenchmarkArguments(BenchmarkArguments):
tpu_name: str = field(
default=None, metadata={"help": "Name of TPU"},
)
device_idx: int = field(
default=0, metadata={"help": "CPU / GPU device index. Defaults to 0."},
)
eager_mode: bool = field(default=False, metadata={"help": "Benchmark models in eager mode."})
use_xla: bool = field(
default=False,
metadata={
"help": "Benchmark models using XLA JIT compilation. Note that `eager_model` has to be set to `False`."
},
)
@cached_property
@tf_required
def _setup_tpu(self) -> Tuple["tf.distribute.cluster_resolver.TPUClusterResolver"]:
if not self.no_tpu:
try:
if self.tpu_name:
tpu = tf.distribute.cluster_resolver.TPUClusterResolver(self.tpu_name)
else:
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
except ValueError:
tpu = None
return tpu
@cached_property
@tf_required
def _setup_strategy(self) -> Tuple["tf.distribute.Strategy", "tf.distribute.cluster_resolver.TPUClusterResolver"]:
if self.is_tpu:
tf.config.experimental_connect_to_cluster(self._setup_tpu)
tf.tpu.experimental.initialize_tpu_system(self._setup_tpu)
strategy = tf.distribute.experimental.TPUStrategy(self._setup_tpu)
else:
# currently no multi gpu is allowed
if self.is_gpu:
# TODO: Currently only single GPU is supported
tf.config.experimental.set_visible_devices(self.gpu_list[self.device_idx], "GPU")
strategy = tf.distribute.OneDeviceStrategy(device=f"/gpu:{self.device_idx}")
else:
tf.config.experimental.set_visible_devices([], "GPU") # disable GPU
strategy = tf.distribute.OneDeviceStrategy(device=f"/cpu:{self.device_idx}")
return strategy
@property
@tf_required
def is_tpu(self) -> bool:
return self._setup_tpu is not None
@property
@tf_required
def strategy(self) -> "tf.distribute.Strategy":
return self._setup_strategy
@property
@tf_required
def gpu_list(self):
return tf.config.list_physical_devices("GPU")
@property
@tf_required
def n_gpu(self) -> int:
if not self.no_cuda:
return len(self.gpu_list)
return 0
@property
def is_gpu(self) -> bool:
return self.n_gpu > 0
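
`_setup_strategy` above pins execution to a single device via `tf.distribute.OneDeviceStrategy`. A minimal sketch of how the returned strategy is consumed (CPU device 0 assumed):

import tensorflow as tf

strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")  # "/gpu:0" when a GPU is visible
with strategy.scope():
    # Model construction and the benchmarked forward passes run inside this scope,
    # as _measure_speed / _measure_memory do in benchmark_tf.py further down.
    x = tf.constant([[1.0, 2.0], [3.0, 4.0]])
    y = tf.matmul(x, x)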
......@@ -16,11 +16,15 @@
import dataclasses
import json
import logging
from dataclasses import dataclass, field
from time import time
from typing import List
logger = logging.getLogger(__name__)
def list_field(default=None, metadata=None):
return field(default_factory=lambda: default, metadata=metadata)
......@@ -53,6 +57,9 @@ class BenchmarkArguments:
)
no_inference: bool = field(default=False, metadata={"help": "Don't benchmark inference of model"})
no_cuda: bool = field(default=False, metadata={"help": "Whether to run on available cuda devices"})
no_tpu: bool = field(default=False, metadata={"help": "Whether to run on available tpu devices"})
fp16: bool = field(default=False, metadata={"help": "Use FP16 to accelerate inference."})
training: bool = field(default=False, metadata={"help": "Benchmark training of model"})
verbose: bool = field(default=False, metadata={"help": "Verbose memory tracing"})
no_speed: bool = field(default=False, metadata={"help": "Don't perform speed measurements"})
......@@ -61,6 +68,12 @@ class BenchmarkArguments:
save_to_csv: bool = field(default=False, metadata={"help": "Save result to a CSV file"})
log_print: bool = field(default=False, metadata={"help": "Save all print statements in a log file"})
no_env_print: bool = field(default=False, metadata={"help": "Don't print environment information"})
no_multi_process: bool = field(
default=False,
metadata={
"help": "Don't use multiprocessing for memory and speed measurement. It is highly recommended to use multiprocessing for accurate CPU and GPU memory measurements. This option should only be used for debugging / testing and on TPU."
},
)
with_lm_head: bool = field(
default=False,
metadata={
......@@ -101,4 +114,17 @@ class BenchmarkArguments:
@property
def model_names(self):
assert (
len(self.models) > 0
), "Please make sure you provide at least one model name / model identifier, *e.g.* `--models bert-base-cased` or `args.models = ['bert-base-cased']."
return self.models
@property
def do_multi_processing(self):
if self.no_multi_process:
return False
elif self.is_tpu:
logger.info("Multiprocessing is currently not possible on TPU.")
return False
else:
return True
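
The `list_field` helper above exists because dataclasses reject mutable defaults such as a plain list. A small illustration (the field name here is hypothetical):

from dataclasses import dataclass, field

def list_field(default=None, metadata=None):
    # Wrap the list in a default_factory so the dataclass accepts it as a default.
    return field(default_factory=lambda: default, metadata=metadata)

@dataclass
class ExampleArgs:  # hypothetical container, for illustration only
    batch_sizes: list = list_field(default=[8], metadata={"help": "Batch sizes to benchmark"})

print(ExampleArgs().batch_sizes)  # [8]; a bare `= [8]` default would raise a ValueError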
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Benchmarking the library on inference and training in TensorFlow.
"""
import logging
import random
import timeit
from functools import wraps
from typing import Callable, Optional
from transformers import (
TF_MODEL_MAPPING,
TF_MODEL_WITH_LM_HEAD_MAPPING,
PretrainedConfig,
is_py3nvml_available,
is_tf_available,
)
from .benchmark_utils import (
Benchmark,
Memory,
MemorySummary,
measure_peak_memory_cpu,
start_memory_tracing,
stop_memory_tracing,
)
if is_tf_available():
import tensorflow as tf
from .benchmark_args_tf import TensorflowBenchmarkArguments
from tensorflow.python.framework.errors_impl import ResourceExhaustedError
if is_py3nvml_available():
import py3nvml.py3nvml as nvml
logger = logging.getLogger(__name__)
def run_with_tf_optimizations(do_eager_mode: bool, use_xla: bool):
def run_func(func):
@wraps(func)
def run_in_eager_mode(*args, **kwargs):
return func(*args, **kwargs)
@wraps(func)
@tf.function(experimental_compile=use_xla)
def run_in_graph_mode(*args, **kwargs):
return func(*args, **kwargs)
if do_eager_mode is True:
assert (
use_xla is False
), "Cannot run model in XLA, if `args.eager_mode` is set to `True`. Please set `args.eager_mode=False`."
return run_in_eager_mode
else:
return run_in_graph_mode
return run_func
def random_input_ids(batch_size: int, sequence_length: int, vocab_size: int) -> ["tf.Tensor"]:
rng = random.Random()
values = [rng.randint(0, vocab_size - 1) for i in range(batch_size * sequence_length)]
return tf.constant(values, shape=(batch_size, sequence_length), dtype=tf.int32)
class TensorflowBenchmark(Benchmark):
args: TensorflowBenchmarkArguments
configs: PretrainedConfig
framework: str = "Tensorflow"
@property
def framework_version(self):
return tf.__version__
def _inference_speed(self, model_name: str, batch_size: int, sequence_length: int) -> float:
# initialize GPU on separate process
strategy = self.args.strategy
assert strategy is not None, "A device strategy has to be initialized before using Tensorflow."
_inference = self._prepare_inference_func(model_name, batch_size, sequence_length)
return self._measure_speed(_inference)
def _train_speed(self, model_name, batch_size, sequence_length):
raise NotImplementedError(
"Training is currently not really implemented." "Wait for TFTrainer to support CLM and MLM."
)
def _inference_memory(
self, model_name: str, batch_size: int, sequence_length: int
) -> [Memory, Optional[MemorySummary]]:
# initialize GPU on separate process
if self.args.is_gpu:
tf.config.experimental.set_memory_growth(self.args.gpu_list[self.args.device_idx], True)
strategy = self.args.strategy
assert strategy is not None, "A device strategy has to be initialized before using Tensorflow."
_inference = self._prepare_inference_func(model_name, batch_size, sequence_length)
return self._measure_memory(_inference)
def _train_memory(self, model_name, batch_size, sequence_length):
raise NotImplementedError(
"Training is currently not really implemented. Wait for TFTrainer to support CLM and MLM."
)
def _prepare_inference_func(self, model_name: str, batch_size: int, sequence_length: int) -> Callable[[], None]:
config = self.config_dict[model_name]
if self.args.fp16:
raise NotImplementedError("Mixed precision is currently not supported.")
if self.args.with_lm_head:
model = TF_MODEL_WITH_LM_HEAD_MAPPING[config.__class__](config)
else:
model = TF_MODEL_MAPPING[config.__class__](config)
# encoder-decoder has vocab size saved differently
vocab_size = config.vocab_size if hasattr(config, "vocab_size") else config.encoder.vocab_size
input_ids = random_input_ids(batch_size, sequence_length, vocab_size)
@run_with_tf_optimizations(self.args.eager_mode, self.args.use_xla)
def encoder_decoder_forward():
return model(input_ids, decoder_input_ids=input_ids, training=False)
@run_with_tf_optimizations(self.args.eager_mode, self.args.use_xla)
def encoder_forward():
return model(input_ids, training=False)
_inference = encoder_decoder_forward if config.is_encoder_decoder else encoder_forward
return _inference
def _measure_speed(self, func) -> float:
with self.args.strategy.scope():
try:
if self.args.is_tpu or self.args.use_xla:
# run additional 10 times to stabilize compilation for tpu
logger.info("Do inference on TPU. Running model 5 times to stabilize compilation")
timeit.repeat(func, repeat=1, number=5)
# as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average
runtimes = timeit.repeat(func, repeat=self.args.repeat, number=10,)
return min(runtimes) / 10.0
except ResourceExhaustedError as e:
self.print_fn("Doesn't fit on GPU. {}".format(e))
def _measure_memory(self, func: Callable[[], None]) -> [Memory, MemorySummary]:
logger.info(
"Note that Tensorflow allocates more memory than"
"it might need to speed up computation."
"The memory reported here corresponds to the memory"
"reported by `nvidia-smi`, which can vary depending"
"on total available memory on the GPU that is used."
)
with self.args.strategy.scope():
try:
if self.args.trace_memory_line_by_line:
assert (
self.args.eager_mode
), "`args.eager_mode` is set to `False`. Make sure to run model in eager mode to measure memory consumption line by line."
trace = start_memory_tracing("transformers")
if self.args.is_tpu:
# tpu
raise NotImplementedError(
"Memory Benchmarking is currently not implemented for TPU. Please disable memory benchmarking with `args.no_memory=True`"
)
elif self.args.is_gpu:
# gpu
if not is_py3nvml_available():
logger.warning(
"py3nvml not installed, we won't log GPU memory usage. "
"Install py3nvml (pip install py3nvml) to log information about GPU."
)
memory = "N/A"
else:
logger.info(
"Measuring total GPU usage on GPU device. Make sure to not have additional processes running on the same GPU."
)
# init nvml
nvml.nvmlInit()
func()
handle = nvml.nvmlDeviceGetHandleByIndex(self.args.device_idx)
meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
max_bytes_in_use = meminfo.used
memory = Memory(max_bytes_in_use)
# shutdown nvml
nvml.nvmlShutdown()
else:
# cpu
if self.args.trace_memory_line_by_line:
logger.info(
"When enabling line by line tracing, the max peak memory for CPU is inaccurate in Tensorflow."
)
memory = None
else:
memory_bytes = measure_peak_memory_cpu(func)
memory = Memory(memory_bytes) if isinstance(memory_bytes, int) else memory_bytes
if self.args.trace_memory_line_by_line:
summary = stop_memory_tracing(trace)
if memory is None:
memory = summary.total
else:
summary = None
return memory, summary
except ResourceExhaustedError as e:
self.print_fn("Doesn't fit on GPU. {}".format(e))
return "N/A", None
......@@ -81,6 +81,31 @@ except ImportError:
_torch_tpu_available = False
try:
import psutil # noqa: F401
_psutil_available = True
except ImportError:
_psutil_available = False
try:
import py3nvml # noqa: F401
_py3nvml_available = True
except ImportError:
_py3nvml_available = False
try:
from apex import amp # noqa: F401
_has_apex = True
except ImportError:
_has_apex = False
default_cache_path = os.path.join(torch_cache_home, "transformers")
......@@ -115,6 +140,18 @@ def is_torch_tpu_available():
return _torch_tpu_available
def is_psutil_available():
return _psutil_available
def is_py3nvml_available():
return _py3nvml_available
def is_apex_available():
return _has_apex
def add_start_docstrings(*docstr):
def docstring_decorator(fn):
fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "")
......
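
Calling code is expected to guard optional dependencies with these new helpers; a sketch of the pattern, following how the benchmark module above uses `is_py3nvml_available`:

from transformers import is_py3nvml_available

if is_py3nvml_available():
    import py3nvml.py3nvml as nvml
    nvml.nvmlInit()
    gpu_count = nvml.nvmlDeviceGetCount()
    nvml.nvmlShutdown()
else:
    gpu_count = None  # degrade gracefully, as benchmark.py does with a warning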
......@@ -20,23 +20,16 @@ from torch.utils.data.sampler import RandomSampler, Sampler, SequentialSampler
from tqdm.auto import tqdm, trange
from .data.data_collator import DataCollator, default_data_collator
from .file_utils import is_apex_available, is_torch_tpu_available
from .modeling_utils import PreTrainedModel
from .optimization import AdamW, get_linear_schedule_with_warmup
from .trainer_utils import PREFIX_CHECKPOINT_DIR, EvalPrediction, PredictionOutput, TrainOutput, is_wandb_available
from .training_args import TrainingArguments, is_torch_tpu_available
from .training_args import TrainingArguments
try:
if is_apex_available():
from apex import amp
_has_apex = True
except ImportError:
_has_apex = False
def is_apex_available():
return _has_apex
if is_torch_tpu_available():
import torch_xla.core.xla_model as xm
......
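
The trainer change above replaces its local try/except with the shared `is_apex_available` helper. A hedged sketch of the apex AMP path this guards (assumes a CUDA device and an apex install; the `opt_level` values match the new `fp16_opt_level` help text):

import torch
from transformers import is_apex_available

model = torch.nn.Linear(4, 2).cuda()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

if is_apex_available():
    from apex import amp
    # Wrap model and optimizer for mixed precision; "O1" is the default level
    # referenced by the fp16_opt_level argument added in this commit.
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1")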
......@@ -5,7 +5,7 @@ from pathlib import Path
from transformers import AutoConfig, is_torch_available
from .utils import require_torch
from .utils import require_torch, torch_device
if is_torch_available():
......@@ -26,7 +26,12 @@ class BenchmarkTest(unittest.TestCase):
def test_inference_no_configs(self):
MODEL_ID = "sshleifer/tiny-gpt2"
benchmark_args = PyTorchBenchmarkArguments(
models=[MODEL_ID], training=False, no_inference=False, sequence_lengths=[8], batch_sizes=[1]
models=[MODEL_ID],
training=False,
no_inference=False,
sequence_lengths=[8],
batch_sizes=[1],
no_multi_process=True,
)
benchmark = PyTorchBenchmark(benchmark_args)
results = benchmark.run()
......@@ -42,6 +47,24 @@ class BenchmarkTest(unittest.TestCase):
torchscript=True,
sequence_lengths=[8],
batch_sizes=[1],
no_multi_process=True,
)
benchmark = PyTorchBenchmark(benchmark_args)
results = benchmark.run()
self.check_results_dict_not_empty(results.time_inference_result)
self.check_results_dict_not_empty(results.memory_inference_result)
@unittest.skipIf(torch_device == "cpu", "Cant do half precision")
def test_inference_fp16(self):
MODEL_ID = "sshleifer/tiny-gpt2"
benchmark_args = PyTorchBenchmarkArguments(
models=[MODEL_ID],
training=False,
no_inference=False,
fp16=True,
sequence_lengths=[8],
batch_sizes=[1],
no_multi_process=True,
)
benchmark = PyTorchBenchmark(benchmark_args)
results = benchmark.run()
......@@ -51,7 +74,29 @@ class BenchmarkTest(unittest.TestCase):
def test_train_no_configs(self):
MODEL_ID = "sshleifer/tiny-gpt2"
benchmark_args = PyTorchBenchmarkArguments(
models=[MODEL_ID], training=True, no_inference=True, sequence_lengths=[8], batch_sizes=[1]
models=[MODEL_ID],
training=True,
no_inference=True,
sequence_lengths=[8],
batch_sizes=[1],
no_multi_process=True,
)
benchmark = PyTorchBenchmark(benchmark_args)
results = benchmark.run()
self.check_results_dict_not_empty(results.time_train_result)
self.check_results_dict_not_empty(results.memory_train_result)
@unittest.skipIf(torch_device == "cpu", "Cant do half precision")
def test_train_no_configs_fp16(self):
MODEL_ID = "sshleifer/tiny-gpt2"
benchmark_args = PyTorchBenchmarkArguments(
models=[MODEL_ID],
training=True,
no_inference=True,
sequence_lengths=[8],
batch_sizes=[1],
fp16=True,
no_multi_process=True,
)
benchmark = PyTorchBenchmark(benchmark_args)
results = benchmark.run()
......@@ -62,7 +107,12 @@ class BenchmarkTest(unittest.TestCase):
MODEL_ID = "sshleifer/tiny-gpt2"
config = AutoConfig.from_pretrained(MODEL_ID)
benchmark_args = PyTorchBenchmarkArguments(
models=[MODEL_ID], training=False, no_inference=False, sequence_lengths=[8], batch_sizes=[1]
models=[MODEL_ID],
training=False,
no_inference=False,
sequence_lengths=[8],
batch_sizes=[1],
no_multi_process=True,
)
benchmark = PyTorchBenchmark(benchmark_args, configs=[config])
results = benchmark.run()
......@@ -73,7 +123,12 @@ class BenchmarkTest(unittest.TestCase):
MODEL_ID = "sshleifer/tinier_bart"
config = AutoConfig.from_pretrained(MODEL_ID)
benchmark_args = PyTorchBenchmarkArguments(
models=[MODEL_ID], training=False, no_inference=False, sequence_lengths=[8], batch_sizes=[1]
models=[MODEL_ID],
training=False,
no_inference=False,
sequence_lengths=[8],
batch_sizes=[1],
no_multi_process=True,
)
benchmark = PyTorchBenchmark(benchmark_args, configs=[config])
results = benchmark.run()
......@@ -81,26 +136,15 @@ class BenchmarkTest(unittest.TestCase):
self.check_results_dict_not_empty(results.memory_inference_result)
def test_train_with_configs(self):
MODEL_ID = "sshleifer/tiny-gpt2"
config = AutoConfig.from_pretrained(MODEL_ID)
benchmark_args = PyTorchBenchmarkArguments(
models=[MODEL_ID], training=True, no_inference=True, sequence_lengths=[8], batch_sizes=[1]
)
benchmark = PyTorchBenchmark(benchmark_args, configs=[config])
results = benchmark.run()
self.check_results_dict_not_empty(results.time_train_result)
self.check_results_dict_not_empty(results.memory_train_result)
def test_train_with_configs_torchscript(self):
MODEL_ID = "sshleifer/tiny-gpt2"
config = AutoConfig.from_pretrained(MODEL_ID)
benchmark_args = PyTorchBenchmarkArguments(
models=[MODEL_ID],
training=True,
no_inference=True,
torchscript=True,
sequence_lengths=[8],
batch_sizes=[1],
no_multi_process=True,
)
benchmark = PyTorchBenchmark(benchmark_args, configs=[config])
results = benchmark.run()
......@@ -111,7 +155,12 @@ class BenchmarkTest(unittest.TestCase):
MODEL_ID = "sshleifer/tinier_bart"
config = AutoConfig.from_pretrained(MODEL_ID)
benchmark_args = PyTorchBenchmarkArguments(
models=[MODEL_ID], training=True, no_inference=True, sequence_lengths=[8], batch_sizes=[1]
models=[MODEL_ID],
training=True,
no_inference=True,
sequence_lengths=[8],
batch_sizes=[1],
no_multi_process=True,
)
benchmark = PyTorchBenchmark(benchmark_args, configs=[config])
results = benchmark.run()
......@@ -133,6 +182,7 @@ class BenchmarkTest(unittest.TestCase):
inference_memory_csv_file=os.path.join(tmp_dir, "inf_mem.csv"),
train_time_csv_file=os.path.join(tmp_dir, "train_time.csv"),
env_info_csv_file=os.path.join(tmp_dir, "env.csv"),
no_multi_process=True,
)
benchmark = PyTorchBenchmark(benchmark_args)
benchmark.run()
......@@ -161,6 +211,7 @@ class BenchmarkTest(unittest.TestCase):
log_filename=os.path.join(tmp_dir, "log.txt"),
log_print=True,
trace_memory_line_by_line=True,
no_multi_process=True,
)
benchmark = PyTorchBenchmark(benchmark_args)
result = benchmark.run()
......
import os
import tempfile
import unittest
from pathlib import Path
from transformers import AutoConfig, is_tf_available
from .utils import require_tf
if is_tf_available():
import tensorflow as tf
from transformers import TensorflowBenchmark, TensorflowBenchmarkArguments
@require_tf
class TFBenchmarkTest(unittest.TestCase):
def check_results_dict_not_empty(self, results):
for model_result in results.values():
for batch_size, sequence_length in zip(model_result["bs"], model_result["ss"]):
result = model_result["result"][batch_size][sequence_length]
self.assertIsNotNone(result)
def test_inference_no_configs_eager(self):
MODEL_ID = "sshleifer/tiny-gpt2"
benchmark_args = TensorflowBenchmarkArguments(
models=[MODEL_ID],
training=False,
no_inference=False,
sequence_lengths=[8],
batch_sizes=[1],
eager_mode=True,
no_multi_process=True,
)
benchmark = TensorflowBenchmark(benchmark_args)
results = benchmark.run()
self.check_results_dict_not_empty(results.time_inference_result)
self.check_results_dict_not_empty(results.memory_inference_result)
def test_inference_no_configs_graph(self):
MODEL_ID = "sshleifer/tiny-gpt2"
benchmark_args = TensorflowBenchmarkArguments(
models=[MODEL_ID],
training=False,
no_inference=False,
sequence_lengths=[8],
batch_sizes=[1],
no_multi_process=True,
)
benchmark = TensorflowBenchmark(benchmark_args)
results = benchmark.run()
self.check_results_dict_not_empty(results.time_inference_result)
self.check_results_dict_not_empty(results.memory_inference_result)
def test_inference_with_configs_eager(self):
MODEL_ID = "sshleifer/tiny-gpt2"
config = AutoConfig.from_pretrained(MODEL_ID)
benchmark_args = TensorflowBenchmarkArguments(
models=[MODEL_ID],
training=False,
no_inference=False,
sequence_lengths=[8],
batch_sizes=[1],
eager_mode=True,
no_multi_process=True,
)
benchmark = TensorflowBenchmark(benchmark_args, [config])
results = benchmark.run()
self.check_results_dict_not_empty(results.time_inference_result)
self.check_results_dict_not_empty(results.memory_inference_result)
def test_inference_with_configs_graph(self):
MODEL_ID = "sshleifer/tiny-gpt2"
config = AutoConfig.from_pretrained(MODEL_ID)
benchmark_args = TensorflowBenchmarkArguments(
models=[MODEL_ID],
training=False,
no_inference=False,
sequence_lengths=[8],
batch_sizes=[1],
no_multi_process=True,
)
benchmark = TensorflowBenchmark(benchmark_args, [config])
results = benchmark.run()
self.check_results_dict_not_empty(results.time_inference_result)
self.check_results_dict_not_empty(results.memory_inference_result)
def test_inference_encoder_decoder_with_configs(self):
MODEL_ID = "patrickvonplaten/t5-tiny-random"
config = AutoConfig.from_pretrained(MODEL_ID)
benchmark_args = TensorflowBenchmarkArguments(
models=[MODEL_ID],
training=False,
no_inference=False,
sequence_lengths=[8],
batch_sizes=[1],
no_multi_process=True,
)
benchmark = TensorflowBenchmark(benchmark_args, configs=[config])
results = benchmark.run()
self.check_results_dict_not_empty(results.time_inference_result)
self.check_results_dict_not_empty(results.memory_inference_result)
@unittest.skipIf(is_tf_available() and len(tf.config.list_physical_devices("GPU")) == 0, "Cannot do xla on CPU.")
def test_inference_no_configs_xla(self):
MODEL_ID = "sshleifer/tiny-gpt2"
benchmark_args = TensorflowBenchmarkArguments(
models=[MODEL_ID],
training=False,
no_inference=False,
sequence_lengths=[8],
batch_sizes=[1],
use_xla=True,
no_multi_process=True,
)
benchmark = TensorflowBenchmark(benchmark_args)
results = benchmark.run()
self.check_results_dict_not_empty(results.time_inference_result)
self.check_results_dict_not_empty(results.memory_inference_result)
def test_save_csv_files(self):
MODEL_ID = "sshleifer/tiny-gpt2"
with tempfile.TemporaryDirectory() as tmp_dir:
benchmark_args = TensorflowBenchmarkArguments(
models=[MODEL_ID],
no_inference=False,
save_to_csv=True,
sequence_lengths=[8],
batch_sizes=[1],
inference_time_csv_file=os.path.join(tmp_dir, "inf_time.csv"),
inference_memory_csv_file=os.path.join(tmp_dir, "inf_mem.csv"),
env_info_csv_file=os.path.join(tmp_dir, "env.csv"),
no_multi_process=True,
)
benchmark = TensorflowBenchmark(benchmark_args)
benchmark.run()
self.assertTrue(Path(os.path.join(tmp_dir, "inf_time.csv")).exists())
self.assertTrue(Path(os.path.join(tmp_dir, "inf_mem.csv")).exists())
self.assertTrue(Path(os.path.join(tmp_dir, "env.csv")).exists())
def test_trace_memory(self):
MODEL_ID = "sshleifer/tiny-gpt2"
def _check_summary_is_not_empty(summary):
self.assertTrue(hasattr(summary, "sequential"))
self.assertTrue(hasattr(summary, "cumulative"))
self.assertTrue(hasattr(summary, "current"))
self.assertTrue(hasattr(summary, "total"))
with tempfile.TemporaryDirectory() as tmp_dir:
benchmark_args = TensorflowBenchmarkArguments(
models=[MODEL_ID],
no_inference=False,
sequence_lengths=[8],
batch_sizes=[1],
log_filename=os.path.join(tmp_dir, "log.txt"),
log_print=True,
trace_memory_line_by_line=True,
eager_mode=True,
no_multi_process=True,
)
benchmark = TensorflowBenchmark(benchmark_args)
result = benchmark.run()
_check_summary_is_not_empty(result.inference_summary)
self.assertTrue(Path(os.path.join(tmp_dir, "log.txt")).exists())