Commit 3c15726c authored by yangzhong

git init
#!/bin/bash
# This script was taken from: https://github.com/IllinoisGraphBenchmark/IGB-Datasets/blob/main/igb/download_igbh600m.sh
# Copy of licence for this ONLY this script
# MIT License
# Copyright (c) 2022 IMPACT Research Group and IllinoisGraphBenchmark
# This license is applicable for all software codebases provided in the repository.
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
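# Usage: bash <this-script> <download-dir>
# All files are downloaded into <download-dir>/full/processed/.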
echo "IGBH600M download starting"
mkdir -p $1/full/processed
cd $1/full/processed
# paper
mkdir paper
cd paper
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/paper/node_feat.npy
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/paper/node_label_19.npy
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/paper/node_label_2K.npy
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/paper/paper_id_index_mapping.npy
cd ..
# paper__cites__paper
mkdir paper__cites__paper
cd paper__cites__paper
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/paper__cites__paper/edge_index.npy
cd ..
# author
mkdir author
cd author
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/author/author_id_index_mapping.npy
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/author/node_feat.npy
cd ..
# conference
mkdir conference
cd conference
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/conference/conference_id_index_mapping.npy
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/conference/node_feat.npy
cd ..
# institute
mkdir institute
cd institute
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/institute/institute_id_index_mapping.npy
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/institute/node_feat.npy
cd ..
# journal
mkdir journal
cd journal
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/journal/journal_id_index_mapping.npy
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/journal/node_feat.npy
cd ..
# fos
mkdir fos
cd fos
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/fos/fos_id_index_mapping.npy
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/fos/node_feat.npy
cd ..
# author__affiliated_to__institute
mkdir author__affiliated_to__institute
cd author__affiliated_to__institute
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/author__affiliated_to__institute/edge_index.npy
cd ..
# paper__published__journal
mkdir paper__published__journal
cd paper__published__journal
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/paper__published__journal/edge_index.npy
cd ..
# paper__topic__fos
mkdir paper__topic__fos
cd paper__topic__fos
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/paper__topic__fos/edge_index.npy
cd ..
# paper__venue__conference
mkdir paper__venue__conference
cd paper__venue__conference
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/paper__venue__conference/edge_index.npy
cd ..
# paper__written_by__author
mkdir paper__written_by__author
cd paper__written_by__author
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/paper__written_by__author/edge_index.npy
cd ..
echo "IGBH-IGBH download complete"
import argparse
import os
from igb import download
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--target-path",
type=str,
default="igbh/",
help="Download path for the dataset",
)
parser.add_argument(
"--dataset-size",
type=str,
default="tiny",
choices=["tiny", "small", "medium"],
help="Size of the dataset, only full for official submissions",
)
parser.add_argument(
"--dataset-type",
type=str,
default="heterogeneous",
choices=["homogeneous", "heterogeneous"],
)
return parser.parse_args()
if __name__ == "__main__":
args = get_args()
os.makedirs(args.target_path, exist_ok=True)
download.download_dataset(
path=args.target_path,
dataset_size=args.dataset_size,
dataset_type=args.dataset_type,
)
# Script taken from
# https://github.com/mlcommons/training/blob/master/graph_neural_network/split_seeds.py
import argparse
import os.path as osp
import torch
class SeedSplitter(object):
def __init__(
self,
path,
dataset_size="tiny",
use_label_2K=True,
random_seed=42,
validation_frac=0.01,
calibration=False
):
self.path = path
self.dataset_size = dataset_size
self.use_label_2K = use_label_2K
self.random_seed = random_seed
self.validation_frac = validation_frac
self.calibration = calibration
self.paper_nodes_num = {
"tiny": 100000,
"small": 1000000,
"medium": 10000000,
"large": 100000000,
"full": 269346174,
}
self.process()
def process(self):
torch.manual_seed(self.random_seed)
n_labeled_idx = self.paper_nodes_num[self.dataset_size]
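# For the "full" dataset only a subset of the paper nodes is labeled; the count
# differs between the 2K-class and 19-class label sets.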
if self.dataset_size == "full":
if self.use_label_2K:
n_labeled_idx = 157675969
else:
n_labeled_idx = 227130858
shuffled_index = torch.randperm(n_labeled_idx)
n_train = int(n_labeled_idx * 0.6)
n_val = int(n_labeled_idx * self.validation_frac)
train_idx = shuffled_index[:n_train]
val_idx = shuffled_index[n_train: n_train + n_val]
path = osp.join(self.path, self.dataset_size, "processed")
torch.save(train_idx, osp.join(path, "train_idx.pt"))
torch.save(val_idx, osp.join(path, "val_idx.pt"))
if self.calibration and self.dataset_size == "full":
n_calibration = 5000
calibration_idx = shuffled_index[:n_calibration].numpy().tolist()
with open(osp.join(path, "calibration.txt"), "w+") as f:
f.writelines([f"{idx}\n" for idx in calibration_idx])
if __name__ == "__main__":
parser = argparse.ArgumentParser()
root = osp.join(
osp.dirname(
osp.dirname(
osp.dirname(
osp.realpath(__file__)))), "data", "igbh"
)
parser.add_argument(
"--path", type=str, default=root, help="path containing the datasets"
)
parser.add_argument(
"--dataset_size",
type=str,
default="full",
choices=["tiny", "small", "medium", "large", "full"],
help="size of the datasets",
)
parser.add_argument("--random_seed", type=int, default="42")
parser.add_argument(
"--num_classes",
type=int,
default=2983,
choices=[19, 2983],
help="number of classes",
)
parser.add_argument(
"--validation_frac",
type=float,
default=0.005,
help="Fraction of labeled vertices to be used for validation.",
)
parser.add_argument(
"--calibration",
action="store_true",
help="Save calibration dataset",
)
args = parser.parse_args()
splitter = SeedSplitter(
path=args.path,
dataset_size=args.dataset_size,
use_label_2K=(args.num_classes == 2983),
random_seed=args.random_seed,
validation_frac=args.validation_frac,
calibration=args.calibration,
)
# The format of this config file is 'key = value'.
# The key has the format 'model.scenario.key'. Value is mostly int64_t.
# Model may be '*' as a wildcard. In that case the value applies to all models.
# All times are in milliseconds.
build/
eval_features.pickle
onnxruntime_profile__*.json
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
ARG BASE_IMAGE
FROM ${BASE_IMAGE}
# Install LoadGen
# Clone LoadGen so that we have a clean git repo inside the docker container.
RUN cd /tmp \
&& git clone https://github.com/mlperf/inference.git \
&& cd inference \
&& git submodule update --init third_party/pybind \
&& cd loadgen \
&& python3 setup.py install \
&& cd /tmp \
&& rm -rf inference
# Install dependencies
RUN python3 -m pip install torch==1.4.0 onnx==1.6.0 transformers==2.4.0 \
onnxruntime==1.2.0 numpy==1.18.0 tokenization==1.0.7
# Add user
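# Create the container user with the host's UID/GID so files written to mounted volumes keep host ownership.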
ARG GID
ARG UID
ARG GROUP
ARG USER
RUN echo root:root | chpasswd \
&& groupadd -f -g ${GID} ${GROUP} \
&& useradd -G sudo -g ${GID} -u ${UID} -m ${USER} \
&& echo ${USER}:${USER} | chpasswd \
&& echo -e "\nexport PS1=\"(mlperf) \\u@\\h:\\w\\$ \"" | tee -a /home/${USER}/.bashrc \
&& echo -e "\n%sudo ALL=(ALL:ALL) NOPASSWD:ALL\n" | tee -a /etc/sudoers
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
SHELL := /bin/bash
MAKEFILE_NAME := $(lastword $(MAKEFILE_LIST))
UNAME := $(shell whoami)
UID := $(shell id -u `whoami`)
GROUPNAME := $(shell id -gn `whoami`)
GROUPID := $(shell id -g `whoami`)
HOST_VOL ?= ${PWD}
CONTAINER_VOL ?= /workspace
BUILD_DIR := build
DATA_DIR := $(BUILD_DIR)/data
BERT_DIR := $(DATA_DIR)/bert_tf_v1_1_large_fp32_384_v2
RESULT_DIR := $(BUILD_DIR)/result
MLPERF_CONF := $(BUILD_DIR)/mlperf.conf
FEATURE_CACHE := eval_features.pickle
# Handle different nvidia-docker versions
ifneq ($(wildcard /usr/bin/nvidia-docker),)
DOCKER_RUN_CMD := nvidia-docker run
else
DOCKER_RUN_CMD := docker run --gpus=all
endif
.PHONY: setup
setup:
@if [ ! -e $(BUILD_DIR) ]; then \
mkdir $(BUILD_DIR); \
fi
@if [ ! -e $(MLPERF_CONF) ]; then \
cp ../../mlperf.conf $(MLPERF_CONF); \
fi
@$(MAKE) -f $(MAKEFILE_NAME) init_submodule
@$(MAKE) -f $(MAKEFILE_NAME) download_data
@$(MAKE) -f $(MAKEFILE_NAME) download_model
.PHONY: init_submodule
init_submodule:
@git submodule update --init DeepLearningExamples
.PHONY: download_data
download_data:
@if [ ! -e $(DATA_DIR) ]; then \
mkdir $(DATA_DIR); \
fi
@if [ ! -e $(DATA_DIR)/dev-v1.1.json ]; then \
wget -O $(DATA_DIR)/dev-v1.1.json https://github.com/rajpurkar/SQuAD-explorer/blob/master/dataset/dev-v1.1.json?raw=true; \
fi
@if [ ! -e $(DATA_DIR)/evaluate-v1.1.py ]; then \
wget -O $(DATA_DIR)/evaluate-v1.1.py https://github.com/allenai/bi-att-flow/raw/master/squad/evaluate-v1.1.py; \
fi
@if [ ! -e $(BERT_DIR) ]; then \
mkdir $(BERT_DIR) ; \
fi
@if [ ! -e $(RESULT_DIR) ]; then \
mkdir $(RESULT_DIR); \
fi
.PHONY: download_model
download_model:
@$(MAKE) -f $(MAKEFILE_NAME) download_tf_model
@$(MAKE) -f $(MAKEFILE_NAME) download_pytorch_model
@$(MAKE) -f $(MAKEFILE_NAME) download_onnx_model
.PHONY: download_tf_model
download_tf_model:
@if [ ! -e $(BERT_DIR)/model.ckpt-5474.data-00000-of-00001 ]; then \
wget -O $(BERT_DIR)/model.ckpt-5474.data-00000-of-00001 https://zenodo.org/record/3733868/files/model.ckpt-5474.data-00000-of-00001?download=1; \
fi
@if [ ! -e $(BERT_DIR)/model.ckpt-5474.index ]; then \
wget -O $(BERT_DIR)/model.ckpt-5474.index https://zenodo.org/record/3733868/files/model.ckpt-5474.index?download=1; \
fi
@if [ ! -e $(BERT_DIR)/model.ckpt-5474.meta ]; then \
wget -O $(BERT_DIR)/model.ckpt-5474.meta https://zenodo.org/record/3733868/files/model.ckpt-5474.meta?download=1; \
fi
@if [ ! -e $(BERT_DIR)/vocab.txt ]; then \
wget -O $(BERT_DIR)/vocab.txt https://zenodo.org/record/3733868/files/vocab.txt?download=1; \
fi
@if [ ! -e $(BERT_DIR)/model.pb ]; then \
wget -O $(BERT_DIR)/model.pb https://zenodo.org/record/3939747/files/model.pb?download=1; \
fi
.PHONY: download_pytorch_model
download_pytorch_model:
@if [ ! -e $(BERT_DIR)/model.pytorch ]; then \
wget -O $(BERT_DIR)/model.pytorch https://zenodo.org/record/3733896/files/model.pytorch?download=1; \
fi
@if [ ! -e $(BERT_DIR)/vocab.txt ]; then \
wget -O $(BERT_DIR)/vocab.txt https://zenodo.org/record/3733896/files/vocab.txt?download=1; \
fi
.PHONY: download_onnx_model
download_onnx_model:
@if [ ! -e $(BERT_DIR)/model.onnx ]; then \
wget -O $(BERT_DIR)/model.onnx https://zenodo.org/record/3733910/files/model.onnx?download=1; \
fi
@if [ ! -e $(BERT_DIR)/bert_large_v1_1_fake_quant.onnx ]; then \
wget -O $(BERT_DIR)/bert_large_v1_1_fake_quant.onnx https://zenodo.org/record/3750364/files/bert_large_v1_1_fake_quant.onnx?download=1; \
fi
@if [ ! -e $(BERT_DIR)/vocab.txt ]; then \
wget -O $(BERT_DIR)/vocab.txt https://zenodo.org/record/3733910/files/vocab.txt?download=1; \
fi
.PHONY: build_docker
build_docker:
@docker pull nvcr.io/nvidia/tensorrtserver:19.08-py3
@cd DeepLearningExamples/TensorFlow/LanguageModeling/BERT && docker build . --rm -t mlperf-inference-bert
@docker build --build-arg GID=$(GROUPID) --build-arg UID=$(UID) --build-arg GROUP=$(GROUPNAME) --build-arg USER=$(UNAME) \
--build-arg BASE_IMAGE=mlperf-inference-bert -t mlperf-inference-bert - < Dockerfile
.PHONY: launch_docker
launch_docker:
@$(DOCKER_RUN_CMD) --rm -it -w /workspace -v $(HOST_VOL):$(CONTAINER_VOL) -v ${HOME}:/mnt/${HOME} \
--shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 \
-v /etc/timezone:/etc/timezone:ro -v /etc/localtime:/etc/localtime:ro \
--security-opt apparmor=unconfined --security-opt seccomp=unconfined \
--name mlperf-inference-bert-$(UNAME) -h mlperf-inference-bert-$(UNAME) --add-host mlperf-inference-bert-$(UNAME):127.0.0.1 \
--user $(UID):$(GROUPID) --net host --device /dev/fuse --cap-add SYS_ADMIN $(DOCKER_ARGS) mlperf-inference-bert
.PHONY: run_tf_performance
run_tf_performance:
@python3 run.py --backend=tf
.PHONY: run_tf_accuracy
run_tf_accuracy:
@python3 run.py --backend=tf --accuracy
.PHONY: run_pytorch_performance
run_pytorch_performance:
@python3 run.py --backend=pytorch
.PHONY: run_pytorch_accuracy
run_pytorch_accuracy:
@python3 run.py --backend=pytorch --accuracy
.PHONY: run_onnxruntime_performance
run_onnxruntime_performance:
@python3 run.py --backend=onnxruntime
.PHONY: run_onnxruntime_accuracy
run_onnxruntime_accuracy:
@python3 run.py --backend=onnxruntime --accuracy
.PHONY: run_onnxruntime_quantized_performance
run_onnxruntime_quantized_performance:
@python3 run.py --backend=onnxruntime --quantized
.PHONY: run_onnxruntime_quantized_accuracy
run_onnxruntime_quantized_accuracy:
@python3 run.py --backend=onnxruntime --quantized --accuracy
.PHONY: evaluate
evaluate:
@python3 $(DATA_DIR)/evaluate-v1.1.py $(DATA_DIR)/dev-v1.1.json $(RESULT_DIR)/predictions.json
.PHONY: clean
clean:
@rm -rf ${BUILD_DIR}
@rm -f ${FEATURE_CACHE}
@rm -f onnxruntime_profile__*.json
# MLPerf Inference Benchmarks for Natural Language Processing
This is the reference implementation for MLPerf Inference benchmarks for Natural Language Processing.
The chosen model is BERT-Large performing the SQuAD v1.1 question answering task.
Please see the [new docs site](https://docs.mlcommons.org/inference/benchmarks/language/bert) for an automated way to run this benchmark across different available implementations and do an end-to-end submission with or without docker.
## Prerequisites
- [nvidia-docker](https://github.com/NVIDIA/nvidia-docker)
- Any NVIDIA GPU supported by TensorFlow or PyTorch
## Supported Models
| model | framework | accuracy | dataset | model link | model source | precision | notes |
| ----- | --------- | -------- | ------- | ---------- | ------------ | --------- | ----- |
| BERT-Large | TensorFlow | f1_score=90.874% | SQuAD v1.1 validation set | [from zenodo](https://zenodo.org/record/3733868) [from zenodo](https://zenodo.org/record/3939747) | [BERT-Large](https://github.com/google-research/bert), trained with [NVIDIA DeepLearningExamples](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/BERT) | fp32 | |
| BERT-Large | PyTorch | f1_score=90.874% | SQuAD v1.1 validation set | [from zenodo](https://zenodo.org/record/3733896) | [BERT-Large](https://github.com/google-research/bert), trained with [NVIDIA DeepLearningExamples](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/BERT), converted with [bert_tf_to_pytorch.py](bert_tf_to_pytorch.py) | fp32 | |
| BERT-Large | ONNX | f1_score=90.874% | SQuAD v1.1 validation set | [from zenodo](https://zenodo.org/record/3733910) | [BERT-Large](https://github.com/google-research/bert), trained with [NVIDIA DeepLearningExamples](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/BERT), converted with [bert_tf_to_pytorch.py](bert_tf_to_pytorch.py) | fp32 | |
| BERT-Large | ONNX | f1_score=90.067% | SQuAD v1.1 validation set | [from zenodo](https://zenodo.org/record/3750364) | Fine-tuned based on the PyTorch model and converted with [bert_tf_to_pytorch.py](bert_tf_to_pytorch.py) | int8, symmetrically per-tensor quantized without bias | See [MLPerf INT8 BERT Finetuning.pdf](MLPerf INT8 BERT Finetuning.pdf) for details about the fine-tuning process |
| BERT-Large | PyTorch | f1_score=90.633% | SQuAD v1.1 validation set | [from zenodo](https://zenodo.org/record/4792496) | Fine-tuned based on [Huggingface bert-large-uncased pretrained model](https://huggingface.co/bert-large-uncased) | int8, symmetrically per-tensor quantized without bias | See README.md at Zenodo link for details about the fine-tuning process |
## Disclaimer
This benchmark app is a reference implementation that is not meant to be the fastest implementation possible.
## Commands
Please run the following commands:
- `make setup`: initialize submodule, download datasets, and download models.
- `make build_docker`: build docker image.
- `make launch_docker`: launch the docker container with an interactive session.
- `python3 run.py --backend=[tf|pytorch|onnxruntime|tf_estimator] --scenario=[Offline|SingleStream|MultiStream|Server] [--accuracy] [--quantized]`: run the harness inside the docker container. Performance or accuracy results will be printed to the console.
* ENV variable `CM_MAX_NUM_THREADS` can be used to control the number of parallel threads issuing queries.
## Details
- SUT implementations are in [tf_SUT.py](tf_SUT.py), [tf_estimator_SUT.py](tf_estimator_SUT.py) and [pytorch_SUT.py](pytorch_SUT.py). QSL implementation is in [squad_QSL.py](squad_QSL.py).
- The script [accuracy-squad.py](accuracy-squad.py) parses LoadGen accuracy log, post-processes it, and computes the accuracy.
- Tokenization and detokenization (post-processing) are not included in the timed path.
- The inputs to the SUT are `input_ids`, `input_mask`, and `segment_ids`. The outputs from the SUT are `start_logits` and `end_logits`, concatenated together.
- `max_seq_length` is 384.
- The script [tf_freeze_bert.py](tf_freeze_bert.py) freezes the TensorFlow model into a .pb file.
- The script [bert_tf_to_pytorch.py](bert_tf_to_pytorch.py) converts the TensorFlow model into the PyTorch `BertForQuestionAnswering` module in [HuggingFace Transformers](https://github.com/huggingface/transformers) and also exports the model to [ONNX](https://github.com/onnx/onnx) format.
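For illustration, here is a minimal sketch (not part of the harness) of how a raw SUT output buffer can be decoded back into per-token start/end logits, mirroring the post-processing in [accuracy-squad.py](accuracy-squad.py); the random buffer and the simple span selection are assumptions made for the example:

```python
import numpy as np

# Hypothetical raw response buffer for one query: a start and an end logit per
# input token, concatenated, as the SUT backends in this benchmark produce.
seq_len = 384
buffer = np.random.rand(seq_len * 2).astype(np.float32).tobytes()

logits = np.frombuffer(buffer, dtype=np.float32).reshape(-1, 2)
start_logits, end_logits = logits[:, 0], logits[:, 1]

# Pick the highest-scoring span with start <= end; accuracy-squad.py applies
# further checks (n-best lists, max answer length, WordPiece de-tokenization).
scores = start_logits[:, None] + end_logits[None, :]
scores[np.tril_indices(seq_len, k=-1)] = -np.inf
start, end = np.unravel_index(np.argmax(scores), scores.shape)
print("best span token indices:", start, end)
```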
## Loadgen over the Network
First install the MLCommons CM automation tooling:
```
pip install cm4mlops
```
The CM command below launches the SUT server:
```
cm run script --tags=generate-run-cmds,inference --model=bert-99 --backend=pytorch \
--mode=performance --device=cuda --quiet --test_query_count=1000 --network=sut
```
Once the SUT server is launched, the command below can be run on the loadgen node to issue queries to the SUT nodes. In this command `--sut_servers` contains just the localhost address; it can be changed to a comma-separated list of any hostnames/IPs in the network.
```
cm run script --tags=generate-run-cmds,inference --model=bert-99 --backend=pytorch --rerun \
--mode=performance --device=cuda --quiet --test_query_count=1000 \
--sut_servers,=http://localhost:8000 --network=lon
```
If you are not using CM, just add `--network=sut` to your normal run command on the SUT side.
On the loadgen node, add the `--network=lon` option and `--sut_server <IP1> <IP2>` to the normal command to connect to SUT nodes at IP addresses IP1, IP2, etc.
Loadgen over the network works for `onnxruntime` and `pytorch` backends.
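For reference, a minimal sketch of the HTTP exchange performed for each query when running over the network (based on [bert_QDL.py](bert_QDL.py); the SUT address and the dummy feature values below are placeholders):

```python
import numpy as np
import requests

sut_addr = "http://localhost:8000"  # placeholder SUT address

# One query's tokenized features, padded to max_seq_length = 384 (dummy values here).
query = {
    "input_ids": [101] + [0] * 383,
    "input_mask": [1] + [0] * 383,
    "segment_ids": [0] * 384,
}

# The loadgen-side QDL posts the features to /predict/ and receives the logits back.
response = requests.post(f"{sut_addr}/predict/", json={"query": query})
logits = np.array(response.json()["result"], dtype=np.float32)

# The SUT's name can be queried from /getname/.
name = requests.post(f"{sut_addr}/getname/").json()["name"]
```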
## License
Apache License 2.0
# coding=utf-8
# Copyright 2021 Arm Limited and affiliates.
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import collections
import json
import math
import os
import subprocess
import sys
import numpy as np
import pkg_resources
import six
from transformers import BertTokenizer
# To support feature cache.
import pickle
sys.path.insert(0, os.path.dirname(__file__))
installed = {pkg.key for pkg in pkg_resources.working_set}
if "tensorflow" in installed:
import tensorflow
sys.path.insert(
0,
os.path.join(
os.path.dirname(__file__),
"DeepLearningExamples",
"TensorFlow",
"LanguageModeling",
"BERT",
),
)
elif "torch" in installed:
import torch
sys.path.insert(
0,
os.path.join(
os.path.dirname(__file__),
"DeepLearningExamples",
"PyTorch",
"LanguageModeling",
"BERT",
),
)
try:
import tokenization
from create_squad_data import convert_examples_to_features, read_squad_examples
except ImportError:
raise Exception("Error importing local modules")
max_seq_length = 384
max_query_length = 64
doc_stride = 128
RawResult = collections.namedtuple(
"RawResult", ["unique_id", "start_logits", "end_logits"]
)
dtype_map = {
"int8": np.int8,
"int16": np.int16,
"int32": np.int32,
"int64": np.int64,
"float16": np.float16,
"float32": np.float32,
"float64": np.float64}
def get_final_text(pred_text, orig_text, do_lower_case):
"""Project the tokenized prediction back to the original text."""
# When we created the data, we kept track of the alignment between original
# (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
# now `orig_text` contains the span of our original text corresponding to the
# span that we predicted.
#
# However, `orig_text` may contain extra characters that we don't want in
# our prediction.
#
# For example, let's say:
# pred_text = steve smith
# orig_text = Steve Smith's
#
# We don't want to return `orig_text` because it contains the extra "'s".
#
# We don't want to return `pred_text` because it's already been normalized
# (the SQuAD eval script also does punctuation stripping/lower casing but
# our tokenizer does additional normalization like stripping accent
# characters).
#
# What we really want to return is "Steve Smith".
#
# Therefore, we have to apply a semi-complicated alignment heuristic between
# `pred_text` and `orig_text` to get a character-to-character alignment. This
# can fail in certain cases in which case we just return `orig_text`.
def _strip_spaces(text):
ns_chars = []
ns_to_s_map = collections.OrderedDict()
for i, c in enumerate(text):
if c == " ":
continue
ns_to_s_map[len(ns_chars)] = i
ns_chars.append(c)
ns_text = "".join(ns_chars)
return (ns_text, ns_to_s_map)
# We first tokenize `orig_text`, strip whitespace from the result
# and `pred_text`, and check if they are the same length. If they are
# NOT the same length, the heuristic has failed. If they are the same
# length, we assume the characters are one-to-one aligned.
tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
tok_text = " ".join(tokenizer.tokenize(orig_text))
start_position = tok_text.find(pred_text)
if start_position == -1:
return orig_text
end_position = start_position + len(pred_text) - 1
(orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
(tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
if len(orig_ns_text) != len(tok_ns_text):
return orig_text
# We then project the characters in `pred_text` back to `orig_text` using
# the character-to-character alignment.
tok_s_to_ns_map = {}
for i, tok_index in six.iteritems(tok_ns_to_s_map):
tok_s_to_ns_map[tok_index] = i
orig_start_position = None
if start_position in tok_s_to_ns_map:
ns_start_position = tok_s_to_ns_map[start_position]
if ns_start_position in orig_ns_to_s_map:
orig_start_position = orig_ns_to_s_map[ns_start_position]
if orig_start_position is None:
return orig_text
orig_end_position = None
if end_position in tok_s_to_ns_map:
ns_end_position = tok_s_to_ns_map[end_position]
if ns_end_position in orig_ns_to_s_map:
orig_end_position = orig_ns_to_s_map[ns_end_position]
if orig_end_position is None:
return orig_text
output_text = orig_text[orig_start_position: (orig_end_position + 1)]
return output_text
def _get_best_indexes(logits, n_best_size):
"""Get the n-best logits from a list."""
index_and_score = sorted(
enumerate(logits),
key=lambda x: x[1],
reverse=True)
best_indexes = []
for i in range(len(index_and_score)):
if i >= n_best_size:
break
best_indexes.append(index_and_score[i][0])
return best_indexes
def _compute_softmax(scores):
"""Compute softmax probability over raw logits."""
if not scores:
return []
max_score = None
for score in scores:
if max_score is None or score > max_score:
max_score = score
exp_scores = []
total_sum = 0.0
for score in scores:
x = math.exp(score - max_score)
exp_scores.append(x)
total_sum += x
probs = []
for score in exp_scores:
probs.append(score / total_sum)
return probs
def write_predictions(
all_examples,
all_features,
all_results,
n_best_size,
max_answer_length,
do_lower_case,
output_prediction_file,
max_examples=None,
):
"""Write final predictions to the json file and log-odds of null if needed."""
print("Writing predictions to: %s" % (output_prediction_file))
example_index_to_features = collections.defaultdict(list)
for feature in all_features:
example_index_to_features[feature.example_index].append(feature)
unique_id_to_result = {}
for result in all_results:
unique_id_to_result[result.unique_id] = result
_PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name
"PrelimPrediction",
["feature_index", "start_index", "end_index", "start_logit", "end_logit"],
)
all_predictions = collections.OrderedDict()
all_nbest_json = collections.OrderedDict()
scores_diff_json = collections.OrderedDict()
for example_index, example in enumerate(all_examples):
if max_examples and example_index == max_examples:
break
features = example_index_to_features[example_index]
prelim_predictions = []
# keep track of the minimum score of null start+end of position 0
score_null = 1000000 # large and positive
min_null_feature_index = 0 # the paragraph slice with min null score
null_start_logit = 0 # the start logit at the slice with min null score
null_end_logit = 0 # the end logit at the slice with min null score
for feature_index, feature in enumerate(features):
# FIX: During compliance/audit runs, we only generate a small subset of
# all entries from the dataset. As a result, sometimes dict retrieval
# fails because a key is missing.
# result = unique_id_to_result[feature.unique_id]
result = unique_id_to_result.get(feature.unique_id, None)
if result is None:
continue
start_indexes = _get_best_indexes(result.start_logits, n_best_size)
end_indexes = _get_best_indexes(result.end_logits, n_best_size)
# if we could have irrelevant answers, get the min score of
# irrelevant
for start_index in start_indexes:
for end_index in end_indexes:
# We could hypothetically create invalid predictions, e.g., predict
# that the start of the span is in the question. We throw out all
# invalid predictions.
if start_index >= len(feature.tokens):
continue
if end_index >= len(feature.tokens):
continue
if start_index not in feature.token_to_orig_map:
continue
if end_index not in feature.token_to_orig_map:
continue
if not feature.token_is_max_context.get(
start_index, False):
continue
if end_index < start_index:
continue
length = end_index - start_index + 1
if length > max_answer_length:
continue
prelim_predictions.append(
_PrelimPrediction(
feature_index=feature_index,
start_index=start_index,
end_index=end_index,
start_logit=result.start_logits[start_index],
end_logit=result.end_logits[end_index],
)
)
prelim_predictions = sorted(
prelim_predictions,
key=lambda x: (x.start_logit + x.end_logit),
reverse=True,
)
_NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name
"NbestPrediction", ["text", "start_logit", "end_logit"]
)
seen_predictions = {}
nbest = []
for pred in prelim_predictions:
if len(nbest) >= n_best_size:
break
feature = features[pred.feature_index]
tok_tokens = feature.tokens[pred.start_index: (pred.end_index + 1)]
orig_doc_start = feature.token_to_orig_map[pred.start_index]
orig_doc_end = feature.token_to_orig_map[pred.end_index]
orig_tokens = example.doc_tokens[orig_doc_start: (
orig_doc_end + 1)]
tok_text = " ".join(tok_tokens)
# De-tokenize WordPieces that have been split off.
tok_text = tok_text.replace(" ##", "")
tok_text = tok_text.replace("##", "")
# Clean whitespace
tok_text = tok_text.strip()
tok_text = " ".join(tok_text.split())
orig_text = " ".join(orig_tokens)
final_text = get_final_text(tok_text, orig_text, do_lower_case)
if final_text in seen_predictions:
continue
seen_predictions[final_text] = True
nbest.append(
_NbestPrediction(
text=final_text,
start_logit=pred.start_logit,
end_logit=pred.end_logit,
)
)
# In very rare edge cases we could have no valid predictions. So we
# just create a nonce prediction in this case to avoid failure.
if not nbest:
nbest.append(
_NbestPrediction(
text="empty",
start_logit=0.0,
end_logit=0.0))
assert len(nbest) >= 1
total_scores = []
best_non_null_entry = None
for entry in nbest:
total_scores.append(entry.start_logit + entry.end_logit)
if not best_non_null_entry:
if entry.text:
best_non_null_entry = entry
probs = _compute_softmax(total_scores)
nbest_json = []
for i, entry in enumerate(nbest):
output = collections.OrderedDict()
output["text"] = entry.text
output["probability"] = probs[i]
output["start_logit"] = entry.start_logit
output["end_logit"] = entry.end_logit
nbest_json.append(output)
assert len(nbest_json) >= 1
all_predictions[example.qas_id] = nbest_json[0]["text"]
with open(output_prediction_file, "w") as writer:
writer.write(json.dumps(all_predictions, indent=4) + "\n")
def load_loadgen_log(
log_path, eval_features, dtype=np.float32, output_transposed=False
):
with open(log_path) as f:
predictions = json.load(f)
results = []
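# Each accuracy-log entry carries the sample's QSL index ("qsl_idx") and the
# SUT's raw output bytes hex-encoded under "data".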
for prediction in predictions:
qsl_idx = prediction["qsl_idx"]
if output_transposed:
logits = np.frombuffer(bytes.fromhex(prediction["data"]), dtype).reshape(
2, -1
)
logits = np.transpose(logits)
else:
logits = np.frombuffer(bytes.fromhex(prediction["data"]), dtype).reshape(
-1, 2
)
# Pad logits to max_seq_length
seq_length = logits.shape[0]
start_logits = np.ones(max_seq_length) * -10000.0
end_logits = np.ones(max_seq_length) * -10000.0
start_logits[:seq_length] = logits[:, 0]
end_logits[:seq_length] = logits[:, 1]
results.append(
RawResult(
unique_id=eval_features[qsl_idx].unique_id,
start_logits=start_logits.tolist(),
end_logits=end_logits.tolist(),
)
)
return results
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"--vocab_file",
default="build/data/bert_tf_v1_1_large_fp32_384_v2/vocab.txt",
help="Path to vocab.txt",
)
parser.add_argument(
"--val_data", default="build/data/dev-v1.1.json", help="Path to validation data"
)
parser.add_argument(
"--log_file",
default="build/logs/mlperf_log_accuracy.json",
help="Path to LoadGen accuracy log",
)
parser.add_argument(
"--out_file",
default="build/result/predictions.json",
help="Path to output predictions file",
)
parser.add_argument(
"--features_cache_file",
default="eval_features.pickle",
help="Path to features' cache file",
)
parser.add_argument(
"--output_transposed", action="store_true", help="Transpose the output"
)
parser.add_argument(
"--output_dtype",
default="float32",
choices=dtype_map.keys(),
help="Output data type",
)
parser.add_argument(
"--max_examples",
type=int,
help="Maximum number of examples to consider (not limited by default)",
)
args = parser.parse_args()
output_dtype = dtype_map[args.output_dtype]
print("Reading examples...")
eval_examples = read_squad_examples(
input_file=args.val_data, is_training=False, version_2_with_negative=False
)
eval_features = []
# Load features if cached, convert from examples otherwise.
cache_path = args.features_cache_file
if os.path.exists(cache_path):
print("Loading cached features from '%s'..." % cache_path)
with open(cache_path, "rb") as cache_file:
eval_features = pickle.load(cache_file)
else:
print(
"No cached features at '%s'... converting from examples..." %
cache_path)
print("Creating tokenizer...")
tokenizer = BertTokenizer(args.vocab_file)
print("Converting examples to features...")
def append_feature(feature):
eval_features.append(feature)
convert_examples_to_features(
examples=eval_examples,
tokenizer=tokenizer,
max_seq_length=max_seq_length,
doc_stride=doc_stride,
max_query_length=max_query_length,
is_training=False,
output_fn=append_feature,
verbose_logging=False,
)
print("Caching features at '%s'..." % cache_path)
with open(cache_path, "wb") as cache_file:
pickle.dump(eval_features, cache_file)
print("Loading LoadGen logs...")
results = load_loadgen_log(
args.log_file, eval_features, output_dtype, args.output_transposed
)
print("Post-processing predictions...")
write_predictions(
eval_examples,
eval_features,
results,
20,
30,
True,
args.out_file,
args.max_examples,
)
print("Evaluating predictions...")
cmd = "python3 {:}/evaluate_v1.1.py {:} {:} {}".format(
os.path.dirname(os.path.abspath(__file__)),
args.val_data,
args.out_file,
"--max_examples {}".format(
args.max_examples) if args.max_examples else "",
)
subprocess.check_call(cmd, shell=True)
if __name__ == "__main__":
main()
# Copyright 2023 MLCommons. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
from time import sleep
import squad_QSL
import mlperf_loadgen as lg
import numpy as np
import threading
import requests
import array
import time
import os
import sys
sys.path.insert(0, os.getcwd())
class bert_QDL:
"""QDL acting as a proxy to the SUT.
This QDL communicates with the SUT via HTTP.
It uses two endpoints to communicate with the SUT:
- /predict/ : Send a query to the SUT and get a response.
- /getname/ : Query the SUT for its name.
"""
def __init__(self, qsl: squad_QSL.SQuAD_v1_QSL, sut_server_addr: list):
"""
Constructor for the QDL.
Args:
qsl: The QSL to use.
sut_server_addr: A list of addresses of the SUT.
"""
self.qsl = qsl
self.quantized = False
# Construct QDL from the python binding
self.qdl = lg.ConstructQDL(
self.issue_query, self.flush_queries, self.client_get_name
)
self.sut_server_addr = sut_server_addr
self.num_nodes = len(sut_server_addr)
# For round robin between the SUTs:
self.next_sut_id = 0
self.lock = threading.Lock()
def issue_query(self, query_samples):
"""Process the query to send to the SUT"""
threading.Thread(
target=self.process_query_async,
args=[query_samples]).start()
def flush_queries(self):
"""Flush the queries. Dummy implementation."""
pass
def process_query_async(self, query_samples):
"""
This function is called by the Loadgen in a separate thread.
It is responsible for
1. Creating a query for the SUT, by reading the features from the QSL.
2. Sending the query to the SUT.
3. Waiting for the response from the SUT.
4. Deserializing the response.
5. Calling mlperf_loadgen.QuerySamplesComplete(query_samples, response)
Args:
query_samples: A list of QuerySample objects.
"""
max_num_threads = int(
os.environ.get(
"CM_MAX_NUM_THREADS",
os.cpu_count()))
for i in range(len(query_samples)):
eval_features = self.qsl.get_features(query_samples[i].index)
encoded_eval_features = {
"input_ids": eval_features.input_ids,
"input_mask": eval_features.input_mask,
"segment_ids": eval_features.segment_ids,
}
n = threading.active_count()
while n >= max_num_threads:
sleep(0.0001)
n = threading.active_count()
threading.Thread(
target=self.client_predict_worker,
args=[encoded_eval_features, query_samples[i].id],
).start()
def get_sut_id_round_robin(self):
"""Get the SUT id in round robin."""
with self.lock:
res = self.next_sut_id
self.next_sut_id = (self.next_sut_id + 1) % self.num_nodes
return res
def client_predict_worker(self, query, query_id):
"""Serialize the query, send it to the SUT in round robin, and return the deserialized response."""
url = "{}/predict/".format(
self.sut_server_addr[self.get_sut_id_round_robin()])
responses = []
response = requests.post(url, json={"query": query})
output = response.json()["result"]
output = np.array(output).astype(np.float32)
response_array = array.array("B", output.tobytes())
bi = response_array.buffer_info()
responses.append(lg.QuerySampleResponse(query_id, bi[0], bi[1]))
lg.QuerySamplesComplete(responses)
def client_get_name(self):
"""Get the name of the SUT from ALL the SUTS."""
if len(self.sut_server_addr) == 1:
return requests.post(
f"{self.sut_server_addr[0]}/getname/").json()["name"]
sut_names = [
requests.post(f"{addr}/getname/").json()["name"]
for addr in self.sut_server_addr
]
return "Multi-node SUT: " + ", ".join(sut_names)
def __del__(self):
lg.DestroyQDL(self.qdl)
{
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"max_position_embeddings": 512,
"num_attention_heads": 16,
"num_hidden_layers": 24,
"type_vocab_size": 2,
"vocab_size": 30522
}
# coding=utf-8
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The HuggingFace Inc. team.
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import json
import math
import os
import random
import re
import shutil
import sys
import time
sys.path.insert(
0,
os.path.join(
os.getcwd(), "DeepLearningExamples", "TensorFlow", "LanguageModeling", "BERT"
),
)
sys.path.insert(0, os.getcwd())
try:
import tensorflow as tf
from transformers import BertConfig, BertTokenizer, BertForQuestionAnswering
import torch
import numpy as np
except ImportError:
raise Exception("Error importing local modules")
def load_from_tf(config, tf_path):
model = BertForQuestionAnswering(config)
model.classifier = model.qa_outputs
# This part is copied from HuggingFace Transformers with a fix to bypass
# an error
init_vars = tf.train.list_variables(tf_path)
names = []
arrays = []
for name, shape in init_vars:
# print("Loading TF weight {} with shape {}".format(name, shape))
array = tf.train.load_variable(tf_path, name)
names.append(name)
arrays.append(array)
for name, array in zip(names, arrays):
name = name.split("/")
# adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v
# which are not required for using pretrained model
if any(n in ["adam_v", "adam_m", "global_step"] for n in name):
print("Skipping {}".format("/".join(name)))
continue
pointer = model
for m_name in name:
if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
scope_names = re.split(r"_(\d+)", m_name)
else:
scope_names = [m_name]
if scope_names[0] == "kernel" or scope_names[0] == "gamma":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
pointer = getattr(pointer, "bias")
elif scope_names[0] == "output_weights":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "squad":
pointer = getattr(
pointer, "classifier"
) # This line is causing the issue
else:
try:
pointer = getattr(pointer, scope_names[0])
except AttributeError:
print("Skipping {}".format("/".join(name)))
continue
if len(scope_names) >= 2:
num = int(scope_names[1])
pointer = pointer[num]
if m_name[-11:] == "_embeddings":
pointer = getattr(pointer, "weight")
elif m_name == "kernel":
array = np.transpose(array)
try:
assert pointer.shape == array.shape
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
print("Initialize PyTorch weight {}".format(name))
pointer.data = torch.from_numpy(array)
model.qa_outputs = model.classifier
del model.classifier
return model
def save_to_onnx(model):
tokenizer = BertTokenizer.from_pretrained(
"bert-large-uncased-whole-word-masking-finetuned-squad"
)
model.eval()
dummy_input = torch.ones((1, 384), dtype=torch.int64)
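# A (1, max_seq_length) dummy input; the batch dimension is marked dynamic via dynamic_axes below.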
torch.onnx.export(
model,
(dummy_input, dummy_input, dummy_input),
"build/data/bert_tf_v1_1_large_fp32_384_v2/model.onnx",
verbose=True,
input_names=["input_ids", "input_mask", "segment_ids"],
output_names=["output_start_logits", "output_end_logits"],
opset_version=11,
dynamic_axes=(
{
"input_ids": {0: "batch_size"},
"input_mask": {0: "batch_size"},
"segment_ids": {0: "batch_size"},
"output_start_logits": {0: "batch_size"},
"output_end_logits": {0: "batch_size"},
}
),
)
def main():
with open("build/data/bert_tf_v1_1_large_fp32_384_v2/bert_config.json") as f:
config_json = json.load(f)
config = BertConfig(
attention_probs_dropout_prob=config_json["attention_probs_dropout_prob"],
hidden_act=config_json["hidden_act"],
hidden_dropout_prob=config_json["hidden_dropout_prob"],
hidden_size=config_json["hidden_size"],
initializer_range=config_json["initializer_range"],
intermediate_size=config_json["intermediate_size"],
max_position_embeddings=config_json["max_position_embeddings"],
num_attention_heads=config_json["num_attention_heads"],
num_hidden_layers=config_json["num_hidden_layers"],
type_vocab_size=config_json["type_vocab_size"],
vocab_size=config_json["vocab_size"],
)
model = load_from_tf(
config, "build/data/bert_tf_v1_1_large_fp32_384_v2/model.ckpt-5474"
)
torch.save(
model.state_dict(), "build/data/bert_tf_v1_1_large_fp32_384_v2/model.pytorch"
)
save_to_onnx(model)
if __name__ == "__main__":
main()
# Copyright 2021 Arm Limited and affiliates.
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is identical to DeepLearningExamples/TensorFlow/LanguageModeling/BERT/utils/create_squad_data.py
# except that the dependency on horovod is removed.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import json
import tokenization
import six
class SquadExample(object):
"""A single training/test example for simple sequence classification.
For examples without an answer, the start and end position are -1.
"""
def __init__(
self,
qas_id,
question_text,
doc_tokens,
orig_answer_text=None,
start_position=None,
end_position=None,
is_impossible=False,
):
self.qas_id = qas_id
self.question_text = question_text
self.doc_tokens = doc_tokens
self.orig_answer_text = orig_answer_text
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
def __str__(self):
return self.__repr__()
def __repr__(self):
s = ""
s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
s += ", question_text: %s" % (
tokenization.printable_text(self.question_text))
s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
if self.start_position:
s += ", start_position: %d" % (self.start_position)
if self.start_position:
s += ", end_position: %d" % (self.end_position)
if self.start_position:
s += ", is_impossible: %r" % (self.is_impossible)
return s
class InputFeatures(object):
"""A single set of features of data."""
def __init__(
self,
unique_id,
example_index,
doc_span_index,
tokens,
token_to_orig_map,
token_is_max_context,
input_ids,
input_mask,
segment_ids,
start_position=None,
end_position=None,
is_impossible=None,
):
self.unique_id = unique_id
self.example_index = example_index
self.doc_span_index = doc_span_index
self.tokens = tokens
self.token_to_orig_map = token_to_orig_map
self.token_is_max_context = token_is_max_context
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
def read_squad_examples(input_file, is_training,
version_2_with_negative=False):
"""Read a SQuAD json file into a list of SquadExample."""
with open(input_file) as reader:
input_data = json.load(reader)["data"]
def is_whitespace(c):
if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
return True
return False
examples = []
for entry in input_data:
for paragraph in entry["paragraphs"]:
paragraph_text = paragraph["context"]
doc_tokens = []
char_to_word_offset = []
prev_is_whitespace = True
for c in paragraph_text:
if is_whitespace(c):
prev_is_whitespace = True
else:
if prev_is_whitespace:
doc_tokens.append(c)
else:
doc_tokens[-1] += c
prev_is_whitespace = False
char_to_word_offset.append(len(doc_tokens) - 1)
for qa in paragraph["qas"]:
qas_id = qa["id"]
question_text = qa["question"]
start_position = None
end_position = None
orig_answer_text = None
is_impossible = False
if is_training:
if version_2_with_negative:
is_impossible = qa["is_impossible"]
if (len(qa["answers"]) != 1) and (not is_impossible):
raise ValueError(
"For training, each question should have exactly 1 answer."
)
if not is_impossible:
answer = qa["answers"][0]
orig_answer_text = answer["text"]
answer_offset = answer["answer_start"]
answer_length = len(orig_answer_text)
start_position = char_to_word_offset[answer_offset]
end_position = char_to_word_offset[
answer_offset + answer_length - 1
]
# Only add answers where the text can be exactly recovered from the
# document. If this CAN'T happen it's likely due to weird Unicode
# stuff so we will just skip the example.
#
# Note that this means for training mode, every example is NOT
# guaranteed to be preserved.
actual_text = " ".join(
doc_tokens[start_position: (end_position + 1)]
)
cleaned_answer_text = " ".join(
tokenization.whitespace_tokenize(orig_answer_text)
)
if actual_text.find(cleaned_answer_text) == -1:
print(
"Could not find answer: '%s' vs. '%s'",
actual_text,
cleaned_answer_text,
)
continue
else:
start_position = -1
end_position = -1
orig_answer_text = ""
example = SquadExample(
qas_id=qas_id,
question_text=question_text,
doc_tokens=doc_tokens,
orig_answer_text=orig_answer_text,
start_position=start_position,
end_position=end_position,
is_impossible=is_impossible,
)
examples.append(example)
return examples
def _check_is_max_context(doc_spans, cur_span_index, position):
"""Check if this is the 'max context' doc span for the token."""
# Because of the sliding window approach taken to scoring documents, a single
# token can appear in multiple documents. E.g.
# Doc: the man went to the store and bought a gallon of milk
# Span A: the man went to the
# Span B: to the store and bought
# Span C: and bought a gallon of
# ...
#
# Now the word 'bought' will have two scores from spans B and C. We only
# want to consider the score with "maximum context", which we define as
# the *minimum* of its left and right context (the *sum* of left and
# right context will always be the same, of course).
#
# In the example the maximum context for 'bought' would be span C since
# it has 1 left context and 3 right context, while span B has 4 left context
# and 0 right context.
best_score = None
best_span_index = None
for span_index, doc_span in enumerate(doc_spans):
end = doc_span.start + doc_span.length - 1
if position < doc_span.start:
continue
if position > end:
continue
num_left_context = position - doc_span.start
num_right_context = end - position
score = min(num_left_context, num_right_context) + \
0.01 * doc_span.length
if best_score is None or score > best_score:
best_score = score
best_span_index = span_index
return cur_span_index == best_span_index
def _improve_answer_span(
doc_tokens, input_start, input_end, tokenizer, orig_answer_text
):
"""Returns tokenized answer spans that better match the annotated answer."""
# The SQuAD annotations are character based. We first project them to
# whitespace-tokenized words. But then after WordPiece tokenization, we can
# often find a "better match". For example:
#
# Question: What year was John Smith born?
# Context: The leader was John Smith (1895-1943).
# Answer: 1895
#
# The original whitespace-tokenized answer will be "(1895-1943).". However
# after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
# the exact answer, 1895.
#
# However, this is not always possible. Consider the following:
#
# Question: What country is the top exporter of electronics?
# Context: The Japanese electronics industry is the largest in the world.
# Answer: Japan
#
# In this case, the annotator chose "Japan" as a character sub-span of
# the word "Japanese". Since our WordPiece tokenizer does not split
# "Japanese", we just use "Japanese" as the annotation. This is fairly rare
# in SQuAD, but does happen.
tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
for new_start in range(input_start, input_end + 1):
for new_end in range(input_end, new_start - 1, -1):
text_span = " ".join(doc_tokens[new_start: (new_end + 1)])
if text_span == tok_answer_text:
return (new_start, new_end)
return (input_start, input_end)
def convert_examples_to_features(
examples,
tokenizer,
max_seq_length,
doc_stride,
max_query_length,
is_training,
output_fn,
verbose_logging=False,
):
"""Loads a data file into a list of `InputBatch`s."""
unique_id = 1000000000
for example_index, example in enumerate(examples):
query_tokens = tokenizer.tokenize(example.question_text)
if len(query_tokens) > max_query_length:
query_tokens = query_tokens[0:max_query_length]
tok_to_orig_index = []
orig_to_tok_index = []
all_doc_tokens = []
for i, token in enumerate(example.doc_tokens):
orig_to_tok_index.append(len(all_doc_tokens))
sub_tokens = tokenizer.tokenize(token)
for sub_token in sub_tokens:
tok_to_orig_index.append(i)
all_doc_tokens.append(sub_token)
tok_start_position = None
tok_end_position = None
if is_training and example.is_impossible:
tok_start_position = -1
tok_end_position = -1
if is_training and not example.is_impossible:
tok_start_position = orig_to_tok_index[example.start_position]
if example.end_position < len(example.doc_tokens) - 1:
tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
else:
tok_end_position = len(all_doc_tokens) - 1
(tok_start_position, tok_end_position) = _improve_answer_span(
all_doc_tokens,
tok_start_position,
tok_end_position,
tokenizer,
example.orig_answer_text,
)
# The -3 accounts for [CLS], [SEP] and [SEP]
max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
# We can have documents that are longer than the maximum sequence length.
# To deal with this we do a sliding window approach, where we take chunks
# of the up to our max length with a stride of `doc_stride`.
_DocSpan = collections.namedtuple( # pylint: disable=invalid-name
"DocSpan", ["start", "length"]
)
doc_spans = []
start_offset = 0
while start_offset < len(all_doc_tokens):
length = len(all_doc_tokens) - start_offset
if length > max_tokens_for_doc:
length = max_tokens_for_doc
doc_spans.append(_DocSpan(start=start_offset, length=length))
if start_offset + length == len(all_doc_tokens):
break
start_offset += min(length, doc_stride)
for doc_span_index, doc_span in enumerate(doc_spans):
tokens = []
token_to_orig_map = {}
token_is_max_context = {}
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in query_tokens:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)
for i in range(doc_span.length):
split_token_index = doc_span.start + i
token_to_orig_map[len(
tokens)] = tok_to_orig_index[split_token_index]
is_max_context = _check_is_max_context(
doc_spans, doc_span_index, split_token_index
)
token_is_max_context[len(tokens)] = is_max_context
tokens.append(all_doc_tokens[split_token_index])
segment_ids.append(1)
tokens.append("[SEP]")
segment_ids.append(1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
while len(input_ids) < max_seq_length:
input_ids.append(0)
input_mask.append(0)
segment_ids.append(0)
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
start_position = None
end_position = None
if is_training and not example.is_impossible:
# For training, if our document chunk does not contain an annotation
# we throw it out, since there is nothing to predict.
doc_start = doc_span.start
doc_end = doc_span.start + doc_span.length - 1
out_of_span = False
if not (
tok_start_position >= doc_start and tok_end_position <= doc_end
):
out_of_span = True
if out_of_span:
start_position = 0
end_position = 0
else:
doc_offset = len(query_tokens) + 2
start_position = tok_start_position - doc_start + doc_offset
end_position = tok_end_position - doc_start + doc_offset
if is_training and example.is_impossible:
start_position = 0
end_position = 0
if verbose_logging and example_index < 20:
print("*** Example ***")
print("unique_id: %s" % (unique_id))
print("example_index: %s" % (example_index))
print("doc_span_index: %s" % (doc_span_index))
print(
"tokens: %s"
% " ".join([tokenization.printable_text(x) for x in tokens])
)
print(
"token_to_orig_map: %s"
% " ".join(
[
"%d:%d" % (x, y)
for (x, y) in six.iteritems(token_to_orig_map)
]
)
)
print(
"token_is_max_context: %s"
% " ".join(
[
"%d:%s" % (x, y)
for (x, y) in six.iteritems(token_is_max_context)
]
)
)
print("input_ids: %s" % " ".join([str(x) for x in input_ids]))
print("input_mask: %s" %
" ".join([str(x) for x in input_mask]))
print("segment_ids: %s" %
" ".join([str(x) for x in segment_ids]))
if is_training and example.is_impossible:
print("impossible example")
if is_training and not example.is_impossible:
answer_text = " ".join(
tokens[start_position: (end_position + 1)])
print("start_position: %d" % (start_position))
print("end_position: %d" % (end_position))
print(
"answer: %s" %
(tokenization.printable_text(answer_text)))
feature = InputFeatures(
unique_id=unique_id,
example_index=example_index,
doc_span_index=doc_span_index,
tokens=tokens,
token_to_orig_map=token_to_orig_map,
token_is_max_context=token_is_max_context,
input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
start_position=start_position,
end_position=end_position,
is_impossible=example.is_impossible,
)
# Run callback
output_fn(feature)
unique_id += 1
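# Illustrative sketch (not part of the original file): the loop above splits an
# over-long document into overlapping spans of at most `max_tokens_for_doc`
# tokens, advancing by `doc_stride` each time. A minimal standalone version of
# that windowing logic, assuming only a token list and the two lengths:
import collections


def make_doc_spans(all_doc_tokens, max_tokens_for_doc, doc_stride):
    """Return (start, length) spans covering the tokens with overlap."""
    DocSpan = collections.namedtuple("DocSpan", ["start", "length"])
    doc_spans = []
    start_offset = 0
    while start_offset < len(all_doc_tokens):
        length = min(len(all_doc_tokens) - start_offset, max_tokens_for_doc)
        doc_spans.append(DocSpan(start=start_offset, length=length))
        if start_offset + length == len(all_doc_tokens):
            break
        start_offset += min(length, doc_stride)
    return doc_spans


# Example: 10 tokens with a window of 4 and a stride of 2 produce spans starting
# at offsets 0, 2, 4 and 6; the last span ends exactly at the final token.
# make_doc_spans(list(range(10)), 4, 2)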
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Source:
# https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py
""" Official evaluation script for v1.1 of the SQuAD dataset. """
from __future__ import print_function
from collections import Counter
import string
import re
import argparse
import json
import sys
def normalize_answer(s):
"""Lower text and remove punctuation, articles and extra whitespace."""
def remove_articles(text):
return re.sub(r"\b(a|an|the)\b", " ", text)
def white_space_fix(text):
return " ".join(text.split())
def remove_punc(text):
exclude = set(string.punctuation)
return "".join(ch for ch in text if ch not in exclude)
def lower(text):
return text.lower()
return white_space_fix(remove_articles(remove_punc(lower(s))))
def f1_score(prediction, ground_truth):
prediction_tokens = normalize_answer(prediction).split()
ground_truth_tokens = normalize_answer(ground_truth).split()
common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
num_same = sum(common.values())
if num_same == 0:
return 0
precision = 1.0 * num_same / len(prediction_tokens)
recall = 1.0 * num_same / len(ground_truth_tokens)
f1 = (2 * precision * recall) / (precision + recall)
return f1
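# Worked example (illustrative only): normalize_answer drops articles and
# punctuation, so f1_score("the cat sat", "cat sat down") compares the token
# bags {"cat", "sat"} and {"cat", "sat", "down"}: num_same = 2,
# precision = 2/2 = 1.0, recall = 2/3, and F1 = 2*1.0*(2/3)/(1.0 + 2/3) = 0.8.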
def exact_match_score(prediction, ground_truth):
return normalize_answer(prediction) == normalize_answer(ground_truth)
def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
scores_for_ground_truths = []
for ground_truth in ground_truths:
score = metric_fn(prediction, ground_truth)
scores_for_ground_truths.append(score)
return max(scores_for_ground_truths)
def evaluate(dataset, predictions, max_examples=None):
f1 = exact_match = total = 0
for article in dataset:
if max_examples and max_examples == total:
break
for paragraph in article["paragraphs"]:
if max_examples and max_examples == total:
break
for qa in paragraph["qas"]:
total += 1
if max_examples and max_examples == total:
break
if qa["id"] not in predictions:
message = (
"Unanswered question " +
qa["id"] + " will receive score 0."
)
print(message, file=sys.stderr)
continue
ground_truths = list(map(lambda x: x["text"], qa["answers"]))
prediction = predictions[qa["id"]]
exact_match += metric_max_over_ground_truths(
exact_match_score, prediction, ground_truths
)
f1 += metric_max_over_ground_truths(
f1_score, prediction, ground_truths)
exact_match = 100.0 * exact_match / total
f1 = 100.0 * f1 / total
return {"exact_match": exact_match, "f1": f1}
if __name__ == "__main__":
expected_version = "1.1"
parser = argparse.ArgumentParser(
description="Evaluation for SQuAD " + expected_version
)
parser.add_argument("dataset_file", help="Dataset file")
parser.add_argument("prediction_file", help="Prediction File")
parser.add_argument(
"--max_examples",
type=int,
help="Maximum number of examples to consider (not limited by default)",
)
args = parser.parse_args()
with open(args.dataset_file) as dataset_file:
dataset_json = json.load(dataset_file)
if dataset_json["version"] != expected_version:
print(
"Evaluation expects v-"
+ expected_version
+ ", but got dataset with v-"
+ dataset_json["version"],
file=sys.stderr,
)
dataset = dataset_json["data"]
with open(args.prediction_file) as prediction_file:
predictions = json.load(prediction_file)
print(json.dumps(evaluate(dataset, predictions, args.max_examples)))
# Copyright 2023 MLCommons. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
import bert_QDL
import mlperf_loadgen as lg
import squad_QSL
from absl import app
import sys
import os
sys.path.insert(0, os.getcwd())
def set_args(
argv,
g_settings,
g_log_settings,
g_audit_conf,
g_sut_server,
g_backend,
g_total_count_override=None,
g_perf_count_override=None,
):
global settings, log_settings, audit_conf, sut_server, total_count_override, perf_count_override, backend
sys.argv = sys.argv[0:1]
settings = g_settings
log_settings = g_log_settings
audit_conf = g_audit_conf
sut_server = g_sut_server
total_count_override = g_total_count_override
perf_count_override = g_perf_count_override
backend = g_backend
def main(argv):
qsl = squad_QSL.get_squad_QSL(total_count_override, perf_count_override)
qdl = bert_QDL.bert_QDL(qsl, sut_server_addr=sut_server)
lg.StartTestWithLogSettings(
qdl.qdl,
qsl.qsl,
settings,
log_settings,
audit_conf)
if __name__ == "__main__":
app.run(main)
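# Illustrative note (not part of the original file): this module is driven by
# the launcher, which first injects the LoadGen objects via set_args() and then
# hands control to absl, as run.py does later in this commit:
#
#     from network_LON import app, set_args, main as app_main
#     set_args(args, settings, log_settings, args.audit_conf,
#              args.sut_server, args.backend, args.max_examples)
#     app.run(app_main)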
# coding=utf-8
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from time import sleep
from squad_QSL import get_squad_QSL
from transformers import BertConfig, BertForQuestionAnswering
import onnxruntime
import numpy as np
import mlperf_loadgen as lg
import threading
import array
import json
import os
import sys
sys.path.insert(0, os.getcwd())
class BERT_ONNXRuntime_SUT:
def __init__(self, args):
self.profile = args.profile
self.network = args.network
self.options = onnxruntime.SessionOptions()
self.options.enable_profiling = args.profile
print("Loading ONNX model...")
self.quantized = args.quantized
model_path = os.environ.get("ML_MODEL_FILE_WITH_PATH")
if not model_path:
if self.quantized:
model_path = "build/data/bert_tf_v1_1_large_fp32_384_v2/bert_large_v1_1_fake_quant.onnx"
else:
model_path = "build/data/bert_tf_v1_1_large_fp32_384_v2/model.onnx"
if len(onnxruntime.get_all_providers()) > 1 and os.environ.get(
"USE_GPU", "yes"
).lower() not in ["0", "false", "off", "no"]:
preferred_execution_provider = os.environ.get(
"ONNXRUNTIME_PREFERRED_EXECUTION_PROVIDER", "CUDAExecutionProvider"
)
self.sess = onnxruntime.InferenceSession(
model_path, self.options, providers=[
preferred_execution_provider]
)
else:
self.sess = onnxruntime.InferenceSession(
model_path, self.options, providers=["CPUExecutionProvider"]
)
print("Constructing SUT...")
self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries)
print("Finished constructing SUT.")
self.qsl = get_squad_QSL(args.max_examples)
def issue_queries(self, query_samples):
        max_num_threads = int(
            os.environ.get("CM_MAX_NUM_THREADS", os.cpu_count())
        )
for i in range(len(query_samples)):
eval_features = self.qsl.get_features(query_samples[i].index)
            # Throttle thread creation: busy-wait until the number of active
            # threads drops below the limit (the optional sleep is left
            # disabled to keep latency low).
            n = threading.active_count()
            while n >= max_num_threads:
                # sleep(0.01)
                n = threading.active_count()
threading.Thread(
target=self.process_sample, args=[
eval_features, query_samples[i].id]
).start()
def process_sample(self, eval_features, query_id=None):
"""For Loadgen over the network"""
if self.network == "sut":
input_ids = eval_features["input_ids"]
input_mask = eval_features["input_mask"]
segment_ids = eval_features["segment_ids"]
else:
input_ids = eval_features.input_ids
input_mask = eval_features.input_mask
segment_ids = eval_features.segment_ids
if self.quantized:
fd = {
"input_ids": np.array(input_ids).astype(np.int64)[np.newaxis, :],
"attention_mask": np.array(input_mask).astype(np.int64)[np.newaxis, :],
"token_type_ids": np.array(segment_ids).astype(np.int64)[np.newaxis, :],
}
else:
fd = {
"input_ids": np.array(input_ids).astype(np.int64)[np.newaxis, :],
"input_mask": np.array(input_mask).astype(np.int64)[np.newaxis, :],
"segment_ids": np.array(segment_ids).astype(np.int64)[np.newaxis, :],
}
scores = self.sess.run([o.name for o in self.sess.get_outputs()], fd)
output = np.stack(scores, axis=-1)[0]
if self.network == "sut":
return output.tolist()
response_array = array.array("B", output.tobytes())
bi = response_array.buffer_info()
response = lg.QuerySampleResponse(query_id, bi[0], bi[1])
lg.QuerySamplesComplete([response])
def flush_queries(self):
pass
def __del__(self):
if self.profile:
print(
"ONNX runtime profile dumped to: '{}'".format(
self.sess.end_profiling())
)
print("Finished destroying SUT.")
def get_onnxruntime_sut(args):
return BERT_ONNXRuntime_SUT(args)
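# Illustrative usage sketch (not part of the original file): the SUT above is
# configured entirely through environment variables before construction; the
# values below are hypothetical placeholders.
import os

os.environ.setdefault("ML_MODEL_FILE_WITH_PATH", "/path/to/model.onnx")  # hypothetical path
os.environ.setdefault("USE_GPU", "no")            # skip CUDAExecutionProvider, stay on CPU
os.environ.setdefault("CM_MAX_NUM_THREADS", "8")  # cap worker threads in issue_queries
# sut = get_onnxruntime_sut(args)  # `args` as parsed by run.py's get_args() below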
# coding=utf-8
# Copyright 2021 Arm Limited and affiliates.
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import array
import json
import os
import sys
sys.path.insert(
0,
os.path.join(
os.getcwd(), "DeepLearningExamples", "PyTorch", "LanguageModeling", "BERT"
),
)
sys.path.insert(0, os.getcwd())
try:
from squad_QSL import get_squad_QSL
from transformers import BertConfig, BertForQuestionAnswering
import transformers
import torch
import numpy as np
import mlperf_loadgen as lg
except ImportError as e:
    raise RuntimeError("Error importing local modules") from e
class BERT_PyTorch_SUT:
def __init__(self, args):
print("Loading BERT configs...")
with open("bert_config.json") as f:
config_json = json.load(f)
config = BertConfig(
attention_probs_dropout_prob=config_json["attention_probs_dropout_prob"],
hidden_act=config_json["hidden_act"],
hidden_dropout_prob=config_json["hidden_dropout_prob"],
hidden_size=config_json["hidden_size"],
initializer_range=config_json["initializer_range"],
intermediate_size=config_json["intermediate_size"],
max_position_embeddings=config_json["max_position_embeddings"],
num_attention_heads=config_json["num_attention_heads"],
num_hidden_layers=config_json["num_hidden_layers"],
type_vocab_size=config_json["type_vocab_size"],
vocab_size=config_json["vocab_size"],
)
self.network = args.network
        self.dev = (
            torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
        )
self.version = transformers.__version__
print("Loading PyTorch model...")
self.model = BertForQuestionAnswering(config)
self.model.to(self.dev)
self.model.eval()
model_file = os.environ.get(
"ML_MODEL_FILE_WITH_PATH",
"build/data/bert_tf_v1_1_large_fp32_384_v2/model.pytorch",
)
self.model.load_state_dict(torch.load(model_file), strict=False)
print("Constructing SUT...")
self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries)
print("Finished constructing SUT.")
self.qsl = get_squad_QSL(args.max_examples)
def issue_queries(self, query_samples):
for i in range(len(query_samples)):
eval_features = self.qsl.get_features(query_samples[i].index)
self.process_sample(eval_features, query_samples[i].id)
def process_sample(self, sample_input, query_id=None):
if self.network == "sut":
input_ids = sample_input["input_ids"]
input_mask = sample_input["input_mask"]
segment_ids = sample_input["segment_ids"]
else:
input_ids = sample_input.input_ids
input_mask = sample_input.input_mask
segment_ids = sample_input.segment_ids
with torch.no_grad():
model_output = self.model.forward(
input_ids=torch.LongTensor(
input_ids).unsqueeze(0).to(self.dev),
attention_mask=torch.LongTensor(
input_mask).unsqueeze(0).to(self.dev),
token_type_ids=torch.LongTensor(
segment_ids).unsqueeze(0).to(self.dev),
)
            # Compare the parsed major version; a plain string comparison would
            # misorder releases like "10.x" against "4.0.0".
            if int(self.version.split(".")[0]) >= 4:
start_scores = model_output.start_logits
end_scores = model_output.end_logits
else:
start_scores, end_scores = model_output
output = (
torch.stack([start_scores, end_scores], axis=-1)
.squeeze(0)
.cpu()
.numpy()
)
if self.network == "sut":
return output.tolist()
response_array = array.array("B", output.tobytes())
bi = response_array.buffer_info()
response = lg.QuerySampleResponse(query_id, bi[0], bi[1])
lg.QuerySamplesComplete([response])
def flush_queries(self):
pass
def __del__(self):
print("Finished destroying SUT.")
def get_pytorch_sut(args):
return BERT_PyTorch_SUT(args)
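# Minimal sketch (illustrative, assuming a loaded `model`, a device `dev`, and
# one feature triple as produced by the QSL): process_sample() above boils down
# to a single forward pass whose start/end logits are stacked into a
# (seq_len, 2) array before being returned to LoadGen.
import torch


def run_one_sample(model, dev, input_ids, input_mask, segment_ids):
    with torch.no_grad():
        out = model(
            input_ids=torch.LongTensor(input_ids).unsqueeze(0).to(dev),
            attention_mask=torch.LongTensor(input_mask).unsqueeze(0).to(dev),
            token_type_ids=torch.LongTensor(segment_ids).unsqueeze(0).to(dev),
        )
    # transformers >= 4 returns a model output object with named logit fields.
    return (
        torch.stack([out.start_logits, out.end_logits], dim=-1)
        .squeeze(0)
        .cpu()
        .numpy()
    )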
# coding=utf-8
# Copyright 2021 Arm Limited and affiliates.
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import array
import json
import os
import sys
sys.path.insert(
0,
os.path.join(
os.getcwd(), "DeepLearningExamples", "PyTorch", "LanguageModeling", "BERT"
),
)
sys.path.insert(0, os.getcwd())
try:
from ray.util.actor_pool import ActorPool
import ray
from squad_QSL import get_squad_QSL
from transformers import BertConfig, BertForQuestionAnswering
import transformers
import torch_tensorrt
import torch
import numpy as np
import mlperf_loadgen as lg
except ImportError as e:
    raise RuntimeError("Error importing local modules") from e
# Adjustable Parameters
# Note: num_samples (called "test_query_count" in CM) must be a multiple
# of BATCH_SIZE below.
BATCH_SIZE = 16
@ray.remote(num_cpus=1, num_gpus=1)
class TorchPredictor:
def __init__(self, config_json, model_file, batch_size):
print("init", os.getpid(), torch.cuda.device_count())
self.pid = os.getpid()
self.dev_cnt = torch.cuda.device_count()
config = BertConfig(
attention_probs_dropout_prob=config_json["attention_probs_dropout_prob"],
hidden_act=config_json["hidden_act"],
hidden_dropout_prob=config_json["hidden_dropout_prob"],
hidden_size=config_json["hidden_size"],
initializer_range=config_json["initializer_range"],
intermediate_size=config_json["intermediate_size"],
max_position_embeddings=config_json["max_position_embeddings"],
num_attention_heads=config_json["num_attention_heads"],
num_hidden_layers=config_json["num_hidden_layers"],
type_vocab_size=config_json["type_vocab_size"],
vocab_size=config_json["vocab_size"],
)
self.dev = torch.device("cuda")
self.model = BertForQuestionAnswering(config)
self.model.to(self.dev)
self.model.eval()
self.model.load_state_dict(torch.load(model_file), strict=False)
# tensor rt
        batch_input_ids = torch.LongTensor(np.zeros((batch_size, 384))).to(self.dev)
traced_mlm_model = torch.jit.trace(
self.model,
[batch_input_ids, batch_input_ids, batch_input_ids],
strict=False,
)
        self.trt_model = torch_tensorrt.compile(
            traced_mlm_model,
            inputs=[
                torch_tensorrt.Input(shape=[batch_size, 384], dtype=torch.int32),
                torch_tensorrt.Input(shape=[batch_size, 384], dtype=torch.int32),
                torch_tensorrt.Input(shape=[batch_size, 384], dtype=torch.int32),
            ],
            enabled_precisions={torch.float32, torch.float16},
            workspace_size=2000000000,
            truncate_long_and_double=True,
        )
print("done loading")
# Logic for inference on 1 batch of data.
def forward(self, batch):
input_ids = torch.from_numpy(batch["input_ids"]).to(self.dev)
attention_mask = torch.from_numpy(batch["attention_mask"]).to(self.dev)
token_type_ids = torch.from_numpy(batch["token_type_ids"]).to(self.dev)
with torch.inference_mode():
# pytorch
# model_output = self.model.forward(input_ids=input_ids,
# attention_mask=attention_mask,
# token_type_ids=token_type_ids)
# start_scores = model_output.start_logits
# end_scores = model_output.end_logits
# tensor rt
trt_output = self.trt_model(
input_ids, attention_mask, token_type_ids)
start_scores = trt_output["start_logits"]
end_scores = trt_output["end_logits"]
batch_ret = torch.stack(
[start_scores, end_scores], axis=-1).cpu().numpy()
return {"output": batch_ret}
def ready(self):
pass
class BERT_Ray_SUT:
def __init__(self, args):
with open("bert_config.json") as f:
config_json = json.load(f)
model_file = os.environ.get(
"ML_MODEL_FILE_WITH_PATH",
"build/data/bert_tf_v1_1_large_fp32_384_v2/model.pytorch",
)
print("Constructing SUT...")
self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries)
print("Finished constructing SUT.")
self.qsl = get_squad_QSL(args.max_examples)
try:
ray.init(address="auto")
        except Exception:
            print("WARN: Cannot connect to an existing Ray cluster.")
            print("Starting a new Ray cluster instead; note that it will")
            print("contain only this single node.")
print(
"If you want to use multiple nodes, please start the cluster manually via:"
)
print("\tOn the head node, run `ray start --head`")
print("\tOn other nodes, run `ray start --address=<head node IP>:6379`")
ray.init()
self.batch_size = BATCH_SIZE
resources = ray.cluster_resources()
num_gpus = int(resources.get("GPU", 0))
print(f"The cluster has {num_gpus} GPUs.")
self.actor_list = [
TorchPredictor.remote(config_json, model_file, self.batch_size)
for _ in range(num_gpus)
]
self.pool = ActorPool(self.actor_list)
samples = []
for i in range(self.qsl.count):
sample = {}
eval_features = self.qsl.get_features(i)
sample["input_ids"] = np.array(
eval_features.input_ids).astype(
np.int32)
sample["attention_mask"] = np.array(eval_features.input_mask).astype(
np.int32
)
sample["token_type_ids"] = np.array(eval_features.segment_ids).astype(
np.int32
)
samples.append(sample)
self.samples = samples
print("Waiting Actors init")
for actor in self.actor_list:
ray.get(actor.ready.remote())
print("BERT_Ray_SUT construct complete")
def issue_queries(self, query_samples):
        if len(query_samples) % self.batch_size != 0:
            print("ERROR: the number of query samples must be a multiple of the batch size")
            sys.exit(1)
batch_samples = []
i = 0
while i < len(query_samples):
batch_sample = {
"input_ids": np.array(
[
self.samples[query_sample.index]["input_ids"]
for query_sample in query_samples[i: i + self.batch_size]
]
),
"attention_mask": np.array(
[
self.samples[query_sample.index]["attention_mask"]
for query_sample in query_samples[i: i + self.batch_size]
]
),
"token_type_ids": np.array(
[
self.samples[query_sample.index]["token_type_ids"]
for query_sample in query_samples[i: i + self.batch_size]
]
),
}
batch_samples.append(batch_sample)
i = i + self.batch_size
# print("samples len", len(batch_samples))
batch_inference_results = list(
self.pool.map_unordered(
lambda a, v: a.forward.remote(v), batch_samples)
)
cur_query_index = 0
for batch_inference_result in batch_inference_results:
batch_inference_result = batch_inference_result["output"]
for inference_result in batch_inference_result:
response_array = array.array("B", inference_result.tobytes())
bi = response_array.buffer_info()
response = lg.QuerySampleResponse(
query_samples[cur_query_index].id, bi[0], bi[1]
)
lg.QuerySamplesComplete([response])
cur_query_index += 1
def flush_queries(self):
pass
def __del__(self):
print("Finished destroying SUT.")
def get_ray_sut(args):
return BERT_Ray_SUT(args)
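# Illustrative sketch (not part of the original file): the core Ray pattern used
# above is an ActorPool that fans batches out to one actor per GPU and collects
# results as they finish. A minimal, CPU-only version with a dummy actor:
import ray
from ray.util.actor_pool import ActorPool


@ray.remote
class EchoActor:
    def forward(self, batch):
        # Stand-in for TorchPredictor.forward: return the batch unchanged.
        return {"output": batch}


if __name__ == "__main__":
    ray.init()
    pool = ActorPool([EchoActor.remote() for _ in range(2)])
    batches = [{"input_ids": [i]} for i in range(4)]
    # map_unordered yields results in completion order, not submission order.
    for result in pool.map_unordered(lambda actor, batch: actor.forward.remote(batch), batches):
        print(result["output"])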
# coding=utf-8
# Copyright 2021 Arm Limited and affiliates.
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from absl import flags
from absl import app
import subprocess
import mlperf_loadgen as lg
import argparse
import os
import sys
sys.path.insert(0, os.getcwd())
sys.path.insert(0, os.path.join(os.getcwd(), "..", "..", "lon"))
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--backend",
choices=["tf", "pytorch", "onnxruntime", "tf_estimator", "ray"],
default="tf",
help="Backend",
)
parser.add_argument(
"--scenario",
choices=["SingleStream", "Offline", "Server", "MultiStream"],
default="Offline",
help="Scenario",
)
parser.add_argument(
"--accuracy",
action="store_true",
help="enable accuracy pass")
parser.add_argument(
"--quantized",
action="store_true",
help="use quantized model (only valid for onnxruntime backend)",
)
parser.add_argument(
"--profile",
action="store_true",
help="enable profiling (only valid for onnxruntime backend)",
)
parser.add_argument(
"--user_conf",
default="user.conf",
help="user config for user LoadGen settings such as target QPS",
)
parser.add_argument(
"--audit_conf",
default="audit.conf",
help="audit config for LoadGen settings during compliance runs",
)
parser.add_argument(
"--max_examples",
type=int,
help="Maximum number of examples to consider (not limited by default)",
)
parser.add_argument(
"--network",
choices=["sut", "lon", None],
default=None,
help="Loadgen network mode",
)
parser.add_argument("--node", type=str, default="")
parser.add_argument("--port", type=int, default=8000)
parser.add_argument(
"--sut_server",
nargs="*",
default=["http://localhost:8000"],
help="Address of the server(s) under test.",
)
args = parser.parse_args()
return args
scenario_map = {
"SingleStream": lg.TestScenario.SingleStream,
"Offline": lg.TestScenario.Offline,
"Server": lg.TestScenario.Server,
"MultiStream": lg.TestScenario.MultiStream,
}
def main():
args = get_args()
sut = None
if not args.network or args.network == "sut":
if args.backend == "pytorch":
assert (
not args.quantized
), "Quantized model is only supported by onnxruntime backend!"
assert (
not args.profile
), "Profiling is only supported by onnxruntime backend!"
from pytorch_SUT import get_pytorch_sut
sut = get_pytorch_sut(args)
elif args.backend == "tf":
assert (
not args.quantized
), "Quantized model is only supported by onnxruntime backend!"
assert (
not args.profile
), "Profiling is only supported by onnxruntime backend!"
from tf_SUT import get_tf_sut
sut = get_tf_sut(args)
elif args.backend == "tf_estimator":
assert (
not args.quantized
), "Quantized model is only supported by onnxruntime backend!"
assert (
not args.profile
), "Profiling is only supported by onnxruntime backend!"
from tf_estimator_SUT import get_tf_estimator_sut
sut = get_tf_estimator_sut()
elif args.backend == "onnxruntime":
from onnxruntime_SUT import get_onnxruntime_sut
sut = get_onnxruntime_sut(args)
elif args.backend == "ray":
assert (
not args.quantized
), "Quantized model is only supported by onnxruntime backend!"
assert (
not args.profile
), "Profiling is only supported by onnxruntime backend!"
from ray_SUT import get_ray_sut
sut = get_ray_sut(args)
else:
raise ValueError("Unknown backend: {:}".format(args.backend))
settings = lg.TestSettings()
settings.scenario = scenario_map[args.scenario]
# mlperf.conf is automatically loaded by the loadgen
# settings.FromConfig(args.mlperf_conf, "bert", args.scenario)
settings.FromConfig(args.user_conf, "bert", args.scenario)
if args.accuracy:
settings.mode = lg.TestMode.AccuracyOnly
else:
settings.mode = lg.TestMode.PerformanceOnly
log_path = os.environ.get("LOG_PATH")
if not log_path:
log_path = "build/logs"
if not os.path.exists(log_path):
os.makedirs(log_path)
log_output_settings = lg.LogOutputSettings()
log_output_settings.outdir = log_path
log_output_settings.copy_summary_to_stdout = True
log_settings = lg.LogSettings()
log_settings.log_output = log_output_settings
log_settings.enable_trace = True
if args.network == "lon":
from network_LON import app, set_args, main as app_main
set_args(
args,
settings,
log_settings,
args.audit_conf,
args.sut_server,
args.backend,
args.max_examples,
)
app.run(app_main)
elif args.network == "sut":
from network_SUT import app, node, set_backend
node = args.node
set_backend(sut)
app.run(debug=False, port=args.port, host="0.0.0.0")
else:
print("Running LoadGen test...")
lg.StartTestWithLogSettings(
sut.sut, sut.qsl.qsl, settings, log_settings, args.audit_conf
)
if args.accuracy and not os.environ.get("SKIP_VERIFY_ACCURACY"):
cmd = "python3 {:}/accuracy-squad.py {}".format(
os.path.dirname(os.path.abspath(__file__)),
(
"--max_examples {}".format(args.max_examples)
if args.max_examples
else ""
),
)
subprocess.check_call(cmd, shell=True)
print("Done!")
if sut:
print("Destroying SUT...")
lg.DestroySUT(sut.sut)
print("Destroying QSL...")
lg.DestroyQSL(sut.qsl.qsl)
if __name__ == "__main__":
main()
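# Example invocations (illustrative; flags as defined in get_args() above):
#   python3 run.py --backend=onnxruntime --scenario=Offline --accuracy --max_examples=100
#   python3 run.py --backend=pytorch --scenario=SingleStream
#   # LoadGen over the network: start the SUT side first, then the LON side
#   python3 run.py --backend=onnxruntime --network=sut --port=8000
#   python3 run.py --network=lon --sut_server http://localhost:8000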