Unverified Commit fa0be6d7 authored by Patrick von Platen, committed by GitHub

Benchmarks (#4912)

* finish benchmark

* fix isort

* fix setup cfg

* retab

* fix time measuring of tf graph mode

* fix tf cuda

* clean code

* better error message
parent 18a0150b
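
For orientation, a minimal usage sketch of the PyTorch benchmark API added in this commit; the model name, batch sizes, and sequence lengths are borrowed from the tests further down in this diff, and the snippet is illustrative only, not part of the commit.

from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments

# Arguments mirror the test cases below; "sshleifer/tiny-gpt2" is just a small example model.
benchmark_args = PyTorchBenchmarkArguments(
    models=["sshleifer/tiny-gpt2"],
    training=False,
    no_inference=False,
    sequence_lengths=[8],
    batch_sizes=[1],
    no_multi_process=True,  # single-process mode; the new help text recommends it only for debugging
)
benchmark = PyTorchBenchmark(args=benchmark_args)
results = benchmark.run()
print(results.time_inference_result)    # inference speed results
print(results.memory_inference_result)  # inference memory results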
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
# Copyright 2020 The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
......
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Benchmarking the library on inference and training in Tensorflow"""
from transformers import HfArgumentParser, TensorflowBenchmark, TensorflowBenchmarkArguments
def main():
parser = HfArgumentParser(TensorflowBenchmarkArguments)
benchmark_args = parser.parse_args_into_dataclasses()[0]
benchmark = TensorflowBenchmark(args=benchmark_args)
benchmark.run()
if __name__ == "__main__":
main()
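
The script above is a thin CLI wrapper; the same benchmark can also be driven from Python directly. A minimal sketch, with the model and sizes borrowed from the TensorFlow tests at the end of this diff:

from transformers import TensorflowBenchmark, TensorflowBenchmarkArguments

benchmark_args = TensorflowBenchmarkArguments(
    models=["sshleifer/tiny-gpt2"],  # small example model, as used in the tests below
    training=False,
    no_inference=False,
    sequence_lengths=[8],
    batch_sizes=[1],
    eager_mode=True,        # benchmark in eager mode; leave False (default) for graph mode
    no_multi_process=True,
)
benchmark = TensorflowBenchmark(args=benchmark_args)
results = benchmark.run()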
import faiss
import nlp
import numpy as np
import torch
from elasticsearch import Elasticsearch
import faiss
import nlp
import streamlit as st
import transformers
from elasticsearch import Elasticsearch
from eli5_utils import (
embed_questions_for_retrieval,
make_qa_s2s_model,
......
......@@ -4,17 +4,17 @@ import os # noqa: F401
from random import choice, randint
from time import time
import faiss # noqa: F401
import nlp # noqa: F401
import numpy as np
import pandas as pd
import torch
import torch.utils.checkpoint as checkpoint
from elasticsearch import Elasticsearch # noqa: F401
from elasticsearch.helpers import bulk, streaming_bulk # noqa: F401
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from tqdm import tqdm
import faiss # noqa: F401
import nlp # noqa: F401
from elasticsearch import Elasticsearch # noqa: F401
from elasticsearch.helpers import bulk, streaming_bulk # noqa: F401
from transformers import AdamW, AutoModel, AutoModelForSeq2SeqLM, AutoTokenizer, get_linear_schedule_with_warmup
......
......@@ -8,3 +8,8 @@ tensorflow_datasets
pytorch-lightning==0.7.6
matplotlib
git-python==1.0.3
faiss
streamlit
elasticsearch
pandas
nlp
......@@ -5,12 +5,15 @@ include_trailing_comma = True
known_first_party = transformers
known_third_party =
absl
elasticsearch
fairseq
faiss
fastprogress
git
h5py
matplotlib
MeCab
nlp
nltk
numpy
packaging
......
......@@ -78,6 +78,9 @@ from .file_utils import (
add_end_docstrings,
add_start_docstrings,
cached_path,
is_apex_available,
is_psutil_available,
is_py3nvml_available,
is_tf_available,
is_torch_available,
is_torch_tpu_available,
......@@ -398,7 +401,8 @@ if is_torch_available():
from .data.datasets import GlueDataset, TextDataset, LineByLineTextDataset, GlueDataTrainingArguments
# Benchmarks
from .benchmark import PyTorchBenchmark, PyTorchBenchmarkArguments
from .benchmark.benchmark import PyTorchBenchmark
from .benchmark.benchmark_args import PyTorchBenchmarkArguments
# TensorFlow
if is_tf_available():
......@@ -608,6 +612,10 @@ if is_tf_available():
# Trainer
from .trainer_tf import TFTrainer
# Benchmarks
from .benchmark.benchmark_tf import TensorflowBenchmark
from .benchmark.benchmark_args_tf import TensorflowBenchmarkArguments
if not is_tf_available() and not is_torch_available():
logger.warning(
......
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all.
from ..file_utils import is_torch_available
if is_torch_available():
from .benchmark_args import PyTorchBenchmarkArguments
from .benchmark import PyTorchBenchmark
......@@ -20,16 +20,24 @@
import logging
import timeit
from typing import Callable, Optional
from transformers import (
MODEL_MAPPING,
MODEL_WITH_LM_HEAD_MAPPING,
PretrainedConfig,
is_py3nvml_available,
is_torch_available,
is_torch_tpu_available,
)
from .benchmark_utils import Benchmark, Memory, measure_peak_memory_cpu, start_memory_tracing, stop_memory_tracing
from .benchmark_utils import (
Benchmark,
Memory,
MemorySummary,
measure_peak_memory_cpu,
start_memory_tracing,
stop_memory_tracing,
)
if is_torch_available():
......@@ -37,6 +45,10 @@ if is_torch_available():
from .benchmark_args import PyTorchBenchmarkArguments
if is_py3nvml_available():
import py3nvml.py3nvml as nvml
logger = logging.getLogger(__name__)
......@@ -50,220 +62,173 @@ class PyTorchBenchmark(Benchmark):
def framework_version(self):
return torch.__version__
def train(self, model_name, batch_size, sequence_length, trace_memory=False):
try:
config = self.config_dict[model_name]
def _inference_speed(self, model_name: str, batch_size: int, sequence_length: int) -> float:
_inference = self._prepare_inference_func(model_name, batch_size, sequence_length)
return self._measure_speed(_inference)
if self.args.torchscript:
config.torchscript = True
def _inference_memory(
self, model_name: str, batch_size: int, sequence_length: int
) -> [Memory, Optional[MemorySummary]]:
_inference = self._prepare_inference_func(model_name, batch_size, sequence_length)
return self._measure_memory(_inference)
model = MODEL_WITH_LM_HEAD_MAPPING[config.__class__](config)
model.to(self.args.device)
model.train()
def _train_speed(self, model_name: str, batch_size: int, sequence_length: int) -> float:
_train = self._prepare_train_func(model_name, batch_size, sequence_length)
return self._measure_speed(_train)
# encoder-decoder has vocab size saved differently
vocab_size = config.vocab_size if hasattr(config, "vocab_size") else config.encoder.vocab_size
input_ids = torch.randint(
vocab_size, (batch_size, sequence_length), dtype=torch.long, device=self.args.device
)
def _train_memory(
self, model_name: str, batch_size: int, sequence_length: int
) -> [Memory, Optional[MemorySummary]]:
_train = self._prepare_train_func(model_name, batch_size, sequence_length)
return self._measure_memory(_train)
if self.args.torchscript:
raise NotImplementedError("Training for torchscript is currently not implemented")
else:
train_model = model
def compute_loss_and_backprob_encoder():
loss = train_model(input_ids, labels=input_ids)[0]
loss.backward()
train_model.zero_grad()
def compute_loss_and_backprob_encoder_decoder():
loss = train_model(input_ids, decoder_input_ids=input_ids, labels=input_ids)[0]
loss.backward()
train_model.zero_grad()
_train = (
compute_loss_and_backprob_encoder_decoder
if config.is_encoder_decoder
else compute_loss_and_backprob_encoder
)
if trace_memory is True:
if self.args.trace_memory_line_by_line:
trace = start_memory_tracing("transformers")
if self.args.n_gpu > 0:
# gpu
# clear gpu cache
torch.cuda.empty_cache()
if hasattr(torch.cuda, "max_memory_reserved"):
torch.cuda.reset_peak_memory_stats()
else:
logger.info(
"Please consider updating PyTorch to version 1.4 to get more accuracy on GPU memory usage"
)
torch.cuda.reset_max_memory_cached()
# calculate loss and do backpropagation
_train()
elif not self.args.no_tpu and is_torch_tpu_available():
# tpu
raise NotImplementedError(
"Memory Benchmarking is currently not implemented for TPU. Please disable memory benchmarking with `args.no_memory=True`"
)
else:
# cpu
memory_bytes = measure_peak_memory_cpu(_train)
memory = Memory(memory_bytes) if isinstance(memory_bytes, int) else memory_bytes
def _prepare_inference_func(self, model_name: str, batch_size: int, sequence_length: int) -> Callable[[], None]:
config = self.config_dict[model_name]
if self.args.trace_memory_line_by_line:
summary = stop_memory_tracing(trace)
else:
summary = None
if self.args.n_gpu > 0:
# gpu
if hasattr(torch.cuda, "max_memory_reserved"):
memory = Memory(torch.cuda.max_memory_reserved())
else:
logger.info(
"Please consider updating PyTorch to version 1.4 to get more accuracy on GPU memory usage"
)
memory = Memory(torch.cuda.max_memory_reserved())
return memory, summary
else:
if (not self.args.no_tpu and is_torch_tpu_available()) or self.args.torchscript:
# run additional 10 times to stabilize compilation for tpu and torchscript
logger.info("Do inference on TPU or torchscript. Running model 5 times to stabilize compilation")
timeit.repeat(
_train, repeat=1, number=5,
)
if self.args.torchscript:
config.torchscript = True
if self.args.with_lm_head:
model = MODEL_WITH_LM_HEAD_MAPPING[config.__class__](config)
else:
model = MODEL_MAPPING[config.__class__](config)
model.eval()
model.to(self.args.device)
# encoder-decoder has vocab size saved differently
vocab_size = config.vocab_size if hasattr(config, "vocab_size") else config.encoder.vocab_size
input_ids = torch.randint(vocab_size, (batch_size, sequence_length), dtype=torch.long, device=self.args.device)
if self.args.fp16:
logger.info("Running training in Mixed Precision...")
assert self.args.is_gpu, "Mixed precision is possible only for GPU."
# amp seems to have memory leaks so that memory usage
# is measured using .half() for now https://github.com/NVIDIA/apex/issues/439
model.half()
if self.args.torchscript:
with torch.no_grad():
inference_model = torch.jit.trace(model, input_ids)
else:
inference_model = model
def encoder_decoder_forward():
with torch.no_grad():
outputs = inference_model(input_ids, decoder_input_ids=input_ids)
return outputs
def encoder_forward():
with torch.no_grad():
outputs = inference_model(input_ids)
return outputs
_forward = encoder_decoder_forward if config.is_encoder_decoder else encoder_forward
return _forward
def _prepare_train_func(self, model_name: str, batch_size: int, sequence_length: int) -> Callable[[], None]:
config = self.config_dict[model_name]
model = MODEL_WITH_LM_HEAD_MAPPING[config.__class__](config)
if self.args.torchscript:
raise NotImplementedError("Training for torchscript is currently not implemented")
else:
train_model = model
model.eval()
model.to(self.args.device)
# encoder-decoder has vocab size saved differently
vocab_size = config.vocab_size if hasattr(config, "vocab_size") else config.encoder.vocab_size
input_ids = torch.randint(vocab_size, (batch_size, sequence_length), dtype=torch.long, device=self.args.device)
if self.args.fp16:
logger.info("Running training in Mixed Precision...")
assert self.args.is_gpu, "Mixed precision is possible only for GPU."
# amp seems to have memory leaks so that memory usage
# is measured using .half() for now https://github.com/NVIDIA/apex/issues/439
model.half()
def compute_loss_and_backprob_encoder():
loss = train_model(input_ids, labels=input_ids)[0]
loss.backward()
train_model.zero_grad()
def compute_loss_and_backprob_encoder_decoder():
loss = train_model(input_ids, decoder_input_ids=input_ids, labels=input_ids)[0]
loss.backward()
train_model.zero_grad()
_train = (
compute_loss_and_backprob_encoder_decoder
if config.is_encoder_decoder
else compute_loss_and_backprob_encoder
)
return _train
def _measure_speed(self, func) -> float:
try:
if self.args.is_tpu or self.args.torchscript:
# run additional 10 times to stabilize compilation for tpu and torchscript
logger.info("Do inference on TPU or torchscript. Running model 5 times to stabilize compilation")
timeit.repeat(
func, repeat=1, number=5,
)
# as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average
runtimes = timeit.repeat(_train, repeat=self.args.repeat, number=10,)
# as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average
runtimes = timeit.repeat(func, repeat=self.args.repeat, number=10,)
if not self.args.no_tpu and is_torch_tpu_available() and self.args.tpu_print_metrics:
import torch_xla.debug.metrics as met
if self.args.is_tpu and self.args.torch_xla_tpu_print_metrics:
import torch_xla.debug.metrics as met
self.print_fn(met.metrics_report())
self.print_fn(met.metrics_report())
return min(runtimes) / 10.0
return min(runtimes) / 10.0
except RuntimeError as e:
self.print_fn("Doesn't fit on GPU. {}".format(e))
if trace_memory:
return "N/A", None
else:
return "N/A"
return "N/A"
def inference(self, model_name, batch_size, sequence_length, trace_memory=False):
def _measure_memory(self, func: Callable[[], None]) -> [Memory, MemorySummary]:
try:
config = self.config_dict[model_name]
model = None
if self.args.torchscript:
config.torchscript = True
if self.args.with_lm_head:
model = MODEL_WITH_LM_HEAD_MAPPING[config.__class__](config)
else:
model = MODEL_MAPPING[config.__class__](config)
model.eval()
model.to(self.args.device)
# encoder-decoder has vocab size saved differently
vocab_size = config.vocab_size if hasattr(config, "vocab_size") else config.encoder.vocab_size
input_ids = torch.randint(
vocab_size, (batch_size, sequence_length), dtype=torch.long, device=self.args.device
)
if self.args.torchscript:
with torch.no_grad():
if config.is_encoder_decoder:
raise NotImplementedError("Torchscript is currently not supported for EncoderDecoder models")
else:
inference_model = torch.jit.trace(model, input_ids)
else:
inference_model = model
def encoder_decoder_forward():
with torch.no_grad():
inference_model(input_ids, decoder_input_ids=input_ids)
def encoder_forward():
with torch.no_grad():
inference_model(input_ids)
_forward = encoder_decoder_forward if config.is_encoder_decoder else encoder_forward
if trace_memory is True:
if self.args.trace_memory_line_by_line:
trace = start_memory_tracing("transformers")
if self.args.n_gpu > 0:
# gpu
# clear gpu cache
torch.cuda.empty_cache()
if hasattr(torch.cuda, "max_memory_reserved"):
torch.cuda.reset_peak_memory_stats()
else:
logger.info(
"Please consider updating PyTorch to version 1.4 to get more accuracy on GPU memory usage"
)
torch.cuda.reset_max_memory_cached()
# run forward
_forward()
elif not self.args.no_tpu and is_torch_tpu_available():
# tpu
raise NotImplementedError(
"Memory Benchmarking is currently not implemented for TPU. Please disable memory benchmarking with `args.no_memory=True`"
if self.args.trace_memory_line_by_line:
trace = start_memory_tracing("transformers")
if self.args.is_tpu:
# tpu
raise NotImplementedError(
"Memory Benchmarking is currently not implemented for TPU. Please disable memory benchmarking with `--no_memory` or `args.no_memory=True`"
)
elif self.args.is_gpu:
if not is_py3nvml_available():
logger.warning(
"py3nvml not installed, we won't log GPU memory usage. "
"Install py3nvml (pip install py3nvml) to log information about GPU."
)
memory = "N/A"
else:
# cpu
memory_bytes = measure_peak_memory_cpu(_forward)
memory = Memory(memory_bytes) if isinstance(memory_bytes, int) else memory_bytes
if self.args.trace_memory_line_by_line:
summary = stop_memory_tracing(trace)
else:
summary = None
if self.args.n_gpu > 0:
# gpu
if hasattr(torch.cuda, "max_memory_reserved"):
memory = Memory(torch.cuda.max_memory_reserved())
else:
logger.info(
"Please consider updating PyTorch to version 1.4 to get more accuracy on GPU memory usage"
)
memory = Memory(torch.cuda.max_memory_cached())
return memory, summary
else:
if (not self.args.no_tpu and is_torch_tpu_available()) or self.args.torchscript:
# run additional 10 times to stabilize compilation for tpu and torchscript
logger.info("Do inference on TPU or torchscript. Running model 5 times to stabilize compilation")
timeit.repeat(
_forward, repeat=1, number=5,
logger.info(
"Measuring total GPU usage on GPU device. Make sure to not have additional processes running on the same GPU."
)
# init nvml
nvml.nvmlInit()
func()
handle = nvml.nvmlDeviceGetHandleByIndex(self.args.device_idx)
meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
max_bytes_in_use = meminfo.used
memory = Memory(max_bytes_in_use)
# shutdown nvml
nvml.nvmlShutdown()
else:
# cpu
memory_bytes = measure_peak_memory_cpu(func)
memory = Memory(memory_bytes) if isinstance(memory_bytes, int) else memory_bytes
# as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average
runtimes = timeit.repeat(_forward, repeat=self.args.repeat, number=10,)
if not self.args.no_tpu and is_torch_tpu_available() and self.args.tpu_print_metrics:
import torch_xla.debug.metrics as met
self.print_fn(met.metrics_report())
return min(runtimes) / 10.0
if self.args.trace_memory_line_by_line:
summary = stop_memory_tracing(trace)
else:
summary = None
return memory, summary
except RuntimeError as e:
self.print_fn("Doesn't fit on GPU. {}".format(e))
if trace_memory:
return "N/A", None
else:
return "N/A"
return "N/A", None
......@@ -34,11 +34,17 @@ logger = logging.getLogger(__name__)
@dataclass
class PyTorchBenchmarkArguments(BenchmarkArguments):
no_cuda: bool = field(default=False, metadata={"help": "Whether to run on available cuda devices"})
torchscript: bool = field(default=False, metadata={"help": "Trace the models using torchscript"})
no_tpu: bool = field(default=False, metadata={"help": "Whether to run on available tpu devices"})
fp16: bool = field(default=False, metadata={"help": "Use FP16 to accelerate inference."})
tpu_print_metrics: bool = field(default=False, metadata={"help": "Use FP16 to accelerate inference."})
torch_xla_tpu_print_metrics: bool = field(default=False, metadata={"help": "Print Xla/PyTorch tpu metrics"})
fp16_opt_level: str = field(
default="O1",
metadata={
"help": (
"For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
"See details at https://nvidia.github.io/apex/amp.html"
)
},
)
@cached_property
@torch_required
......@@ -55,9 +61,14 @@ class PyTorchBenchmarkArguments(BenchmarkArguments):
n_gpu = torch.cuda.device_count()
return device, n_gpu
@property
def is_tpu(self):
return is_torch_tpu_available() and not self.no_tpu
@property
@torch_required
def device_idx(self) -> int:
# TODO(PVP): currently only single GPU is supported
return torch.cuda.current_device()
@property
......@@ -69,3 +80,7 @@ class PyTorchBenchmarkArguments(BenchmarkArguments):
@torch_required
def n_gpu(self):
return self._setup_devices[1]
@property
def is_gpu(self):
return self.n_gpu > 0
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from dataclasses import dataclass, field
from typing import Tuple
from ..file_utils import cached_property, is_tf_available, tf_required
from .benchmark_args_utils import BenchmarkArguments
if is_tf_available():
import tensorflow as tf
logger = logging.getLogger(__name__)
@dataclass
class TensorflowBenchmarkArguments(BenchmarkArguments):
tpu_name: str = field(
default=None, metadata={"help": "Name of TPU"},
)
device_idx: int = field(
default=0, metadata={"help": "CPU / GPU device index. Defaults to 0."},
)
eager_mode: bool = field(default=False, metadata={"help": "Benchmark models in eager mode."})
use_xla: bool = field(
default=False,
metadata={
"help": "Benchmark models using XLA JIT compilation. Note that `eager_model` has to be set to `False`."
},
)
@cached_property
@tf_required
def _setup_tpu(self) -> Tuple["tf.distribute.cluster_resolver.TPUClusterResolver"]:
if not self.no_tpu:
try:
if self.tpu_name:
tpu = tf.distribute.cluster_resolver.TPUClusterResolver(self.tpu_name)
else:
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
except ValueError:
tpu = None
return tpu
@cached_property
@tf_required
def _setup_strategy(self) -> Tuple["tf.distribute.Strategy", "tf.distribute.cluster_resolver.TPUClusterResolver"]:
if self.is_tpu:
tf.config.experimental_connect_to_cluster(self._setup_tpu)
tf.tpu.experimental.initialize_tpu_system(self._setup_tpu)
strategy = tf.distribute.experimental.TPUStrategy(self._setup_tpu)
else:
# currently no multi gpu is allowed
if self.is_gpu:
# TODO: Currently only single GPU is supported
tf.config.experimental.set_visible_devices(self.gpu_list[self.device_idx], "GPU")
strategy = tf.distribute.OneDeviceStrategy(device=f"/gpu:{self.device_idx}")
else:
tf.config.experimental.set_visible_devices([], "GPU") # disable GPU
strategy = tf.distribute.OneDeviceStrategy(device=f"/cpu:{self.device_idx}")
return strategy
@property
@tf_required
def is_tpu(self) -> bool:
return self._setup_tpu is not None
@property
@tf_required
def strategy(self) -> "tf.distribute.Strategy":
return self._setup_strategy
@property
@tf_required
def gpu_list(self):
return tf.config.list_physical_devices("GPU")
@property
@tf_required
def n_gpu(self) -> int:
if not self.no_cuda:
return len(self.gpu_list)
return 0
@property
def is_gpu(self) -> bool:
return self.n_gpu > 0
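
`_setup_strategy` above pins execution to a single device via `tf.distribute.OneDeviceStrategy`. A minimal sketch of how the returned strategy is consumed (CPU device 0 assumed):

import tensorflow as tf

strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")  # "/gpu:0" when a GPU is visible
with strategy.scope():
    # Model construction and the benchmarked forward passes run inside this scope,
    # as _measure_speed / _measure_memory do in benchmark_tf.py further down.
    x = tf.constant([[1.0, 2.0], [3.0, 4.0]])
    y = tf.matmul(x, x)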
......@@ -16,11 +16,15 @@
import dataclasses
import json
import logging
from dataclasses import dataclass, field
from time import time
from typing import List
logger = logging.getLogger(__name__)
def list_field(default=None, metadata=None):
return field(default_factory=lambda: default, metadata=metadata)
......@@ -53,6 +57,9 @@ class BenchmarkArguments:
)
no_inference: bool = field(default=False, metadata={"help": "Don't benchmark inference of model"})
no_cuda: bool = field(default=False, metadata={"help": "Whether to run on available cuda devices"})
no_tpu: bool = field(default=False, metadata={"help": "Whether to run on available tpu devices"})
fp16: bool = field(default=False, metadata={"help": "Use FP16 to accelerate inference."})
training: bool = field(default=False, metadata={"help": "Benchmark training of model"})
verbose: bool = field(default=False, metadata={"help": "Verbose memory tracing"})
no_speed: bool = field(default=False, metadata={"help": "Don't perform speed measurements"})
......@@ -61,6 +68,12 @@ class BenchmarkArguments:
save_to_csv: bool = field(default=False, metadata={"help": "Save result to a CSV file"})
log_print: bool = field(default=False, metadata={"help": "Save all print statements in a log file"})
no_env_print: bool = field(default=False, metadata={"help": "Don't print environment information"})
no_multi_process: bool = field(
default=False,
metadata={
"help": "Don't use multiprocessing for memory and speed measurement. It is highly recommended to use multiprocessing for accurate CPU and GPU memory measurements. This option should only be used for debugging / testing and on TPU."
},
)
with_lm_head: bool = field(
default=False,
metadata={
......@@ -101,4 +114,17 @@ class BenchmarkArguments:
@property
def model_names(self):
assert (
len(self.models) > 0
), "Please make sure you provide at least one model name / model identifier, *e.g.* `--models bert-base-cased` or `args.models = ['bert-base-cased']."
return self.models
@property
def do_multi_processing(self):
if self.no_multi_process:
return False
elif self.is_tpu:
logger.info("Multiprocessing is currently not possible on TPU.")
return False
else:
return True
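
The `list_field` helper above exists because dataclasses reject mutable defaults such as a plain list. A small illustration (the field name here is hypothetical):

from dataclasses import dataclass, field

def list_field(default=None, metadata=None):
    # Wrap the list in a default_factory so the dataclass accepts it as a default.
    return field(default_factory=lambda: default, metadata=metadata)

@dataclass
class ExampleArgs:  # hypothetical container, for illustration only
    batch_sizes: list = list_field(default=[8], metadata={"help": "Batch sizes to benchmark"})

print(ExampleArgs().batch_sizes)  # [8]; a bare `= [8]` default would raise a ValueError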
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Benchmarking the library on inference and training in TensorFlow.
"""
import logging
import random
import timeit
from functools import wraps
from typing import Callable, Optional
from transformers import (
TF_MODEL_MAPPING,
TF_MODEL_WITH_LM_HEAD_MAPPING,
PretrainedConfig,
is_py3nvml_available,
is_tf_available,
)
from .benchmark_utils import (
Benchmark,
Memory,
MemorySummary,
measure_peak_memory_cpu,
start_memory_tracing,
stop_memory_tracing,
)
if is_tf_available():
import tensorflow as tf
from .benchmark_args_tf import TensorflowBenchmarkArguments
from tensorflow.python.framework.errors_impl import ResourceExhaustedError
if is_py3nvml_available():
import py3nvml.py3nvml as nvml
logger = logging.getLogger(__name__)
def run_with_tf_optimizations(do_eager_mode: bool, use_xla: bool):
def run_func(func):
@wraps(func)
def run_in_eager_mode(*args, **kwargs):
return func(*args, **kwargs)
@wraps(func)
@tf.function(experimental_compile=use_xla)
def run_in_graph_mode(*args, **kwargs):
return func(*args, **kwargs)
if do_eager_mode is True:
assert (
use_xla is False
), "Cannot run model in XLA, if `args.eager_mode` is set to `True`. Please set `args.eager_mode=False`."
return run_in_eager_mode
else:
return run_in_graph_mode
return run_func
def random_input_ids(batch_size: int, sequence_length: int, vocab_size: int) -> ["tf.Tensor"]:
rng = random.Random()
values = [rng.randint(0, vocab_size - 1) for i in range(batch_size * sequence_length)]
return tf.constant(values, shape=(batch_size, sequence_length), dtype=tf.int32)
class TensorflowBenchmark(Benchmark):
args: TensorflowBenchmarkArguments
configs: PretrainedConfig
framework: str = "Tensorflow"
@property
def framework_version(self):
return tf.__version__
def _inference_speed(self, model_name: str, batch_size: int, sequence_length: int) -> float:
# initialize GPU on separate process
strategy = self.args.strategy
assert strategy is not None, "A device strategy has to be initialized before using Tensorflow."
_inference = self._prepare_inference_func(model_name, batch_size, sequence_length)
return self._measure_speed(_inference)
def _train_speed(self, model_name, batch_size, sequence_length):
raise NotImplementedError(
"Training is currently not really implemented." "Wait for TFTrainer to support CLM and MLM."
)
def _inference_memory(
self, model_name: str, batch_size: int, sequence_length: int
) -> [Memory, Optional[MemorySummary]]:
# initialize GPU on separate process
if self.args.is_gpu:
tf.config.experimental.set_memory_growth(self.args.gpu_list[self.args.device_idx], True)
strategy = self.args.strategy
assert strategy is not None, "A device strategy has to be initialized before using Tensorflow."
_inference = self._prepare_inference_func(model_name, batch_size, sequence_length)
return self._measure_memory(_inference)
def _train_memory(self, model_name, batch_size, sequence_length):
raise NotImplementedError(
"Training is currently not really implemented. Wait for TFTrainer to support CLM and MLM."
)
def _prepare_inference_func(self, model_name: str, batch_size: int, sequence_length: int) -> Callable[[], None]:
config = self.config_dict[model_name]
if self.args.fp16:
raise NotImplementedError("Mixed precision is currently not supported.")
if self.args.with_lm_head:
model = TF_MODEL_WITH_LM_HEAD_MAPPING[config.__class__](config)
else:
model = TF_MODEL_MAPPING[config.__class__](config)
# encoder-decoder has vocab size saved differently
vocab_size = config.vocab_size if hasattr(config, "vocab_size") else config.encoder.vocab_size
input_ids = random_input_ids(batch_size, sequence_length, vocab_size)
@run_with_tf_optimizations(self.args.eager_mode, self.args.use_xla)
def encoder_decoder_forward():
return model(input_ids, decoder_input_ids=input_ids, training=False)
@run_with_tf_optimizations(self.args.eager_mode, self.args.use_xla)
def encoder_forward():
return model(input_ids, training=False)
_inference = encoder_decoder_forward if config.is_encoder_decoder else encoder_forward
return _inference
def _measure_speed(self, func) -> float:
with self.args.strategy.scope():
try:
if self.args.is_tpu or self.args.use_xla:
# run additional 10 times to stabilize compilation for tpu
logger.info("Do inference on TPU. Running model 5 times to stabilize compilation")
timeit.repeat(func, repeat=1, number=5)
# as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average
runtimes = timeit.repeat(func, repeat=self.args.repeat, number=10,)
return min(runtimes) / 10.0
except ResourceExhaustedError as e:
self.print_fn("Doesn't fit on GPU. {}".format(e))
def _measure_memory(self, func: Callable[[], None]) -> [Memory, MemorySummary]:
logger.info(
"Note that Tensorflow allocates more memory than"
"it might need to speed up computation."
"The memory reported here corresponds to the memory"
"reported by `nvidia-smi`, which can vary depending"
"on total available memory on the GPU that is used."
)
with self.args.strategy.scope():
try:
if self.args.trace_memory_line_by_line:
assert (
self.args.eager_mode
), "`args.eager_mode` is set to `False`. Make sure to run model in eager mode to measure memory consumption line by line."
trace = start_memory_tracing("transformers")
if self.args.is_tpu:
# tpu
raise NotImplementedError(
"Memory Benchmarking is currently not implemented for TPU. Please disable memory benchmarking with `args.no_memory=True`"
)
elif self.args.is_gpu:
# gpu
if not is_py3nvml_available():
logger.warning(
"py3nvml not installed, we won't log GPU memory usage. "
"Install py3nvml (pip install py3nvml) to log information about GPU."
)
memory = "N/A"
else:
logger.info(
"Measuring total GPU usage on GPU device. Make sure to not have additional processes running on the same GPU."
)
# init nvml
nvml.nvmlInit()
func()
handle = nvml.nvmlDeviceGetHandleByIndex(self.args.device_idx)
meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
max_bytes_in_use = meminfo.used
memory = Memory(max_bytes_in_use)
# shutdown nvml
nvml.nvmlShutdown()
else:
# cpu
if self.args.trace_memory_line_by_line:
logger.info(
"When enabling line by line tracing, the max peak memory for CPU is inaccurate in Tensorflow."
)
memory = None
else:
memory_bytes = measure_peak_memory_cpu(func)
memory = Memory(memory_bytes) if isinstance(memory_bytes, int) else memory_bytes
if self.args.trace_memory_line_by_line:
summary = stop_memory_tracing(trace)
if memory is None:
memory = summary.total
else:
summary = None
return memory, summary
except ResourceExhaustedError as e:
self.print_fn("Doesn't fit on GPU. {}".format(e))
return "N/A", None
......@@ -81,6 +81,31 @@ except ImportError:
_torch_tpu_available = False
try:
import psutil # noqa: F401
_psutil_available = True
except ImportError:
_psutil_available = False
try:
import py3nvml # noqa: F401
_py3nvml_available = True
except ImportError:
_py3nvml_available = False
try:
from apex import amp # noqa: F401
_has_apex = True
except ImportError:
_has_apex = False
default_cache_path = os.path.join(torch_cache_home, "transformers")
......@@ -115,6 +140,18 @@ def is_torch_tpu_available():
return _torch_tpu_available
def is_psutil_available():
return _psutil_available
def is_py3nvml_available():
return _py3nvml_available
def is_apex_available():
return _has_apex
def add_start_docstrings(*docstr):
def docstring_decorator(fn):
fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "")
......
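
Calling code is expected to guard optional dependencies with these new helpers; a sketch of the pattern, following how the benchmark module above uses `is_py3nvml_available`:

from transformers import is_py3nvml_available

if is_py3nvml_available():
    import py3nvml.py3nvml as nvml
    nvml.nvmlInit()
    gpu_count = nvml.nvmlDeviceGetCount()
    nvml.nvmlShutdown()
else:
    gpu_count = None  # degrade gracefully, as benchmark.py does with a warning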
......@@ -20,23 +20,16 @@ from torch.utils.data.sampler import RandomSampler, Sampler, SequentialSampler
from tqdm.auto import tqdm, trange
from .data.data_collator import DataCollator, default_data_collator
from .file_utils import is_apex_available, is_torch_tpu_available
from .modeling_utils import PreTrainedModel
from .optimization import AdamW, get_linear_schedule_with_warmup
from .trainer_utils import PREFIX_CHECKPOINT_DIR, EvalPrediction, PredictionOutput, TrainOutput, is_wandb_available
from .training_args import TrainingArguments, is_torch_tpu_available
from .training_args import TrainingArguments
try:
if is_apex_available():
from apex import amp
_has_apex = True
except ImportError:
_has_apex = False
def is_apex_available():
return _has_apex
if is_torch_tpu_available():
import torch_xla.core.xla_model as xm
......
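
The trainer change above replaces its local try/except with the shared `is_apex_available` helper. A hedged sketch of the apex AMP path this guards (assumes a CUDA device and an apex install; the `opt_level` values match the new `fp16_opt_level` help text):

import torch
from transformers import is_apex_available

model = torch.nn.Linear(4, 2).cuda()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

if is_apex_available():
    from apex import amp
    # Wrap model and optimizer for mixed precision; "O1" is the default level
    # referenced by the fp16_opt_level argument added in this commit.
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1")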
......@@ -5,7 +5,7 @@ from pathlib import Path
from transformers import AutoConfig, is_torch_available
from .utils import require_torch
from .utils import require_torch, torch_device
if is_torch_available():
......@@ -26,7 +26,12 @@ class BenchmarkTest(unittest.TestCase):
def test_inference_no_configs(self):
MODEL_ID = "sshleifer/tiny-gpt2"
benchmark_args = PyTorchBenchmarkArguments(
models=[MODEL_ID], training=False, no_inference=False, sequence_lengths=[8], batch_sizes=[1]
models=[MODEL_ID],
training=False,
no_inference=False,
sequence_lengths=[8],
batch_sizes=[1],
no_multi_process=True,
)
benchmark = PyTorchBenchmark(benchmark_args)
results = benchmark.run()
......@@ -42,6 +47,24 @@ class BenchmarkTest(unittest.TestCase):
torchscript=True,
sequence_lengths=[8],
batch_sizes=[1],
no_multi_process=True,
)
benchmark = PyTorchBenchmark(benchmark_args)
results = benchmark.run()
self.check_results_dict_not_empty(results.time_inference_result)
self.check_results_dict_not_empty(results.memory_inference_result)
@unittest.skipIf(torch_device == "cpu", "Cant do half precision")
def test_inference_fp16(self):
MODEL_ID = "sshleifer/tiny-gpt2"
benchmark_args = PyTorchBenchmarkArguments(
models=[MODEL_ID],
training=False,
no_inference=False,
fp16=True,
sequence_lengths=[8],
batch_sizes=[1],
no_multi_process=True,
)
benchmark = PyTorchBenchmark(benchmark_args)
results = benchmark.run()
......@@ -51,7 +74,29 @@ class BenchmarkTest(unittest.TestCase):
def test_train_no_configs(self):
MODEL_ID = "sshleifer/tiny-gpt2"
benchmark_args = PyTorchBenchmarkArguments(
models=[MODEL_ID], training=True, no_inference=True, sequence_lengths=[8], batch_sizes=[1]
models=[MODEL_ID],
training=True,
no_inference=True,
sequence_lengths=[8],
batch_sizes=[1],
no_multi_process=True,
)
benchmark = PyTorchBenchmark(benchmark_args)
results = benchmark.run()
self.check_results_dict_not_empty(results.time_train_result)
self.check_results_dict_not_empty(results.memory_train_result)
@unittest.skipIf(torch_device == "cpu", "Cant do half precision")
def test_train_no_configs_fp16(self):
MODEL_ID = "sshleifer/tiny-gpt2"
benchmark_args = PyTorchBenchmarkArguments(
models=[MODEL_ID],
training=True,
no_inference=True,
sequence_lengths=[8],
batch_sizes=[1],
fp16=True,
no_multi_process=True,
)
benchmark = PyTorchBenchmark(benchmark_args)
results = benchmark.run()
......@@ -62,7 +107,12 @@ class BenchmarkTest(unittest.TestCase):
MODEL_ID = "sshleifer/tiny-gpt2"
config = AutoConfig.from_pretrained(MODEL_ID)
benchmark_args = PyTorchBenchmarkArguments(
models=[MODEL_ID], training=False, no_inference=False, sequence_lengths=[8], batch_sizes=[1]
models=[MODEL_ID],
training=False,
no_inference=False,
sequence_lengths=[8],
batch_sizes=[1],
no_multi_process=True,
)
benchmark = PyTorchBenchmark(benchmark_args, configs=[config])
results = benchmark.run()
......@@ -73,7 +123,12 @@ class BenchmarkTest(unittest.TestCase):
MODEL_ID = "sshleifer/tinier_bart"
config = AutoConfig.from_pretrained(MODEL_ID)
benchmark_args = PyTorchBenchmarkArguments(
models=[MODEL_ID], training=False, no_inference=False, sequence_lengths=[8], batch_sizes=[1]
models=[MODEL_ID],
training=False,
no_inference=False,
sequence_lengths=[8],
batch_sizes=[1],
no_multi_process=True,
)
benchmark = PyTorchBenchmark(benchmark_args, configs=[config])
results = benchmark.run()
......@@ -81,26 +136,15 @@ class BenchmarkTest(unittest.TestCase):
self.check_results_dict_not_empty(results.memory_inference_result)
def test_train_with_configs(self):
MODEL_ID = "sshleifer/tiny-gpt2"
config = AutoConfig.from_pretrained(MODEL_ID)
benchmark_args = PyTorchBenchmarkArguments(
models=[MODEL_ID], training=True, no_inference=True, sequence_lengths=[8], batch_sizes=[1]
)
benchmark = PyTorchBenchmark(benchmark_args, configs=[config])
results = benchmark.run()
self.check_results_dict_not_empty(results.time_train_result)
self.check_results_dict_not_empty(results.memory_train_result)
def test_train_with_configs_torchscript(self):
MODEL_ID = "sshleifer/tiny-gpt2"
config = AutoConfig.from_pretrained(MODEL_ID)
benchmark_args = PyTorchBenchmarkArguments(
models=[MODEL_ID],
training=True,
no_inference=True,
torchscript=True,
sequence_lengths=[8],
batch_sizes=[1],
no_multi_process=True,
)
benchmark = PyTorchBenchmark(benchmark_args, configs=[config])
results = benchmark.run()
......@@ -111,7 +155,12 @@ class BenchmarkTest(unittest.TestCase):
MODEL_ID = "sshleifer/tinier_bart"
config = AutoConfig.from_pretrained(MODEL_ID)
benchmark_args = PyTorchBenchmarkArguments(
models=[MODEL_ID], training=True, no_inference=True, sequence_lengths=[8], batch_sizes=[1]
models=[MODEL_ID],
training=True,
no_inference=True,
sequence_lengths=[8],
batch_sizes=[1],
no_multi_process=True,
)
benchmark = PyTorchBenchmark(benchmark_args, configs=[config])
results = benchmark.run()
......@@ -133,6 +182,7 @@ class BenchmarkTest(unittest.TestCase):
inference_memory_csv_file=os.path.join(tmp_dir, "inf_mem.csv"),
train_time_csv_file=os.path.join(tmp_dir, "train_time.csv"),
env_info_csv_file=os.path.join(tmp_dir, "env.csv"),
no_multi_process=True,
)
benchmark = PyTorchBenchmark(benchmark_args)
benchmark.run()
......@@ -161,6 +211,7 @@ class BenchmarkTest(unittest.TestCase):
log_filename=os.path.join(tmp_dir, "log.txt"),
log_print=True,
trace_memory_line_by_line=True,
no_multi_process=True,
)
benchmark = PyTorchBenchmark(benchmark_args)
result = benchmark.run()
......
import os
import tempfile
import unittest
from pathlib import Path
from transformers import AutoConfig, is_tf_available
from .utils import require_tf
if is_tf_available():
import tensorflow as tf
from transformers import TensorflowBenchmark, TensorflowBenchmarkArguments
@require_tf
class TFBenchmarkTest(unittest.TestCase):
def check_results_dict_not_empty(self, results):
for model_result in results.values():
for batch_size, sequence_length in zip(model_result["bs"], model_result["ss"]):
result = model_result["result"][batch_size][sequence_length]
self.assertIsNotNone(result)
def test_inference_no_configs_eager(self):
MODEL_ID = "sshleifer/tiny-gpt2"
benchmark_args = TensorflowBenchmarkArguments(
models=[MODEL_ID],
training=False,
no_inference=False,
sequence_lengths=[8],
batch_sizes=[1],
eager_mode=True,
no_multi_process=True,
)
benchmark = TensorflowBenchmark(benchmark_args)
results = benchmark.run()
self.check_results_dict_not_empty(results.time_inference_result)
self.check_results_dict_not_empty(results.memory_inference_result)
def test_inference_no_configs_graph(self):
MODEL_ID = "sshleifer/tiny-gpt2"
benchmark_args = TensorflowBenchmarkArguments(
models=[MODEL_ID],
training=False,
no_inference=False,
sequence_lengths=[8],
batch_sizes=[1],
no_multi_process=True,
)
benchmark = TensorflowBenchmark(benchmark_args)
results = benchmark.run()
self.check_results_dict_not_empty(results.time_inference_result)
self.check_results_dict_not_empty(results.memory_inference_result)
def test_inference_with_configs_eager(self):
MODEL_ID = "sshleifer/tiny-gpt2"
config = AutoConfig.from_pretrained(MODEL_ID)
benchmark_args = TensorflowBenchmarkArguments(
models=[MODEL_ID],
training=False,
no_inference=False,
sequence_lengths=[8],
batch_sizes=[1],
eager_mode=True,
no_multi_process=True,
)
benchmark = TensorflowBenchmark(benchmark_args, [config])
results = benchmark.run()
self.check_results_dict_not_empty(results.time_inference_result)
self.check_results_dict_not_empty(results.memory_inference_result)
def test_inference_with_configs_graph(self):
MODEL_ID = "sshleifer/tiny-gpt2"
config = AutoConfig.from_pretrained(MODEL_ID)
benchmark_args = TensorflowBenchmarkArguments(
models=[MODEL_ID],
training=False,
no_inference=False,
sequence_lengths=[8],
batch_sizes=[1],
no_multi_process=True,
)
benchmark = TensorflowBenchmark(benchmark_args, [config])
results = benchmark.run()
self.check_results_dict_not_empty(results.time_inference_result)
self.check_results_dict_not_empty(results.memory_inference_result)
def test_inference_encoder_decoder_with_configs(self):
MODEL_ID = "patrickvonplaten/t5-tiny-random"
config = AutoConfig.from_pretrained(MODEL_ID)
benchmark_args = TensorflowBenchmarkArguments(
models=[MODEL_ID],
training=False,
no_inference=False,
sequence_lengths=[8],
batch_sizes=[1],
no_multi_process=True,
)
benchmark = TensorflowBenchmark(benchmark_args, configs=[config])
results = benchmark.run()
self.check_results_dict_not_empty(results.time_inference_result)
self.check_results_dict_not_empty(results.memory_inference_result)
@unittest.skipIf(is_tf_available() and len(tf.config.list_physical_devices("GPU")) == 0, "Cannot do xla on CPU.")
def test_inference_no_configs_xla(self):
MODEL_ID = "sshleifer/tiny-gpt2"
benchmark_args = TensorflowBenchmarkArguments(
models=[MODEL_ID],
training=False,
no_inference=False,
sequence_lengths=[8],
batch_sizes=[1],
use_xla=True,
no_multi_process=True,
)
benchmark = TensorflowBenchmark(benchmark_args)
results = benchmark.run()
self.check_results_dict_not_empty(results.time_inference_result)
self.check_results_dict_not_empty(results.memory_inference_result)
def test_save_csv_files(self):
MODEL_ID = "sshleifer/tiny-gpt2"
with tempfile.TemporaryDirectory() as tmp_dir:
benchmark_args = TensorflowBenchmarkArguments(
models=[MODEL_ID],
no_inference=False,
save_to_csv=True,
sequence_lengths=[8],
batch_sizes=[1],
inference_time_csv_file=os.path.join(tmp_dir, "inf_time.csv"),
inference_memory_csv_file=os.path.join(tmp_dir, "inf_mem.csv"),
env_info_csv_file=os.path.join(tmp_dir, "env.csv"),
no_multi_process=True,
)
benchmark = TensorflowBenchmark(benchmark_args)
benchmark.run()
self.assertTrue(Path(os.path.join(tmp_dir, "inf_time.csv")).exists())
self.assertTrue(Path(os.path.join(tmp_dir, "inf_mem.csv")).exists())
self.assertTrue(Path(os.path.join(tmp_dir, "env.csv")).exists())
def test_trace_memory(self):
MODEL_ID = "sshleifer/tiny-gpt2"
def _check_summary_is_not_empty(summary):
self.assertTrue(hasattr(summary, "sequential"))
self.assertTrue(hasattr(summary, "cumulative"))
self.assertTrue(hasattr(summary, "current"))
self.assertTrue(hasattr(summary, "total"))
with tempfile.TemporaryDirectory() as tmp_dir:
benchmark_args = TensorflowBenchmarkArguments(
models=[MODEL_ID],
no_inference=False,
sequence_lengths=[8],
batch_sizes=[1],
log_filename=os.path.join(tmp_dir, "log.txt"),
log_print=True,
trace_memory_line_by_line=True,
eager_mode=True,
no_multi_process=True,
)
benchmark = TensorflowBenchmark(benchmark_args)
result = benchmark.run()
_check_summary_is_not_empty(result.inference_summary)
self.assertTrue(Path(os.path.join(tmp_dir, "log.txt")).exists())