import psutil
import torch


def get_gpu_memory_info():
    """Return the free memory (in GiB) on the current CUDA device, or None if CUDA is unavailable."""
    if torch.cuda.is_available():
        device = torch.cuda.current_device()
        total_memory = torch.cuda.get_device_properties(device).total_memory
        allocated_memory = torch.cuda.memory_allocated(device)
        free_memory = total_memory - allocated_memory
        return free_memory / (1024**3)
    return None


def get_cpu_memory_info():
    """Return the total system memory in GiB."""
    mem = psutil.virtual_memory()
    return mem.total / (1024**3)
from transformers import AutoTokenizer


def get_transformer_autotokenizer(model_name: str):
    return AutoTokenizer.from_pretrained(
        model_name,
        model_max_length=2048,
        padding_side="left",
        use_fast=False,
    )
# The format of this config file is 'key = value'.
# The key has the format 'model.scenario.key'. Value is mostly int64_t.
# The model may be '*' as a wildcard, in which case the value applies to all models.
# All times are in milliseconds.
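# Illustrative (hypothetical) entries in that format; consult the actual mlperf.conf for real keys and values:
#   *.Offline.min_duration = 600000
#   llama2-70b.Offline.target_qps = 1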
import json
import os
import io


def _make_r_io_base(f, mode: str):
    if not isinstance(f, io.IOBase):
        f = open(f, mode=mode)
    return f


def jload(f, mode="r"):
    """Load a .json file into a dictionary."""
    f = _make_r_io_base(f, mode)
    jdict = json.load(f)
    f.close()
    return jdict
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04
SHELL ["/bin/bash", "-c"]
ENV LC_ALL=C.UTF-8
ENV LANG=C.UTF-8
ENV TZ=US/Pacific
ENV DEBIAN_FRONTEND=noninteractive
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
RUN rm -rf /var/lib/apt/lists/* && rm /etc/apt/sources.list.d/* \
&& apt update \
&& apt install -y --no-install-recommends build-essential autoconf \
libtool git ccache curl wget pkg-config sudo ca-certificates \
automake libssl-dev bc python3-dev python3-pip google-perftools \
gdb libglib2.0-dev clang sshfs libre2-dev libboost-dev \
libnuma-dev numactl sysstat sshpass ntpdate less iputils-ping \
&& apt -y autoremove \
&& apt remove -y cmake \
&& apt install -y --no-install-recommends pkg-config zip g++ zlib1g-dev \
unzip libarchive-dev
RUN apt install -y --no-install-recommends rsync
# Install setuptools
RUN python3 -m pip install --upgrade pip \
&& python3 -m pip install --upgrade setuptools wheel virtualenv
# Install conda
WORKDIR /tmp
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-Linux-x86_64.sh \
&& bash Miniconda3-* -b -p /opt/miniconda3
ENV PATH="$PATH:/opt/miniconda3/bin"
RUN conda create -n llama2-70b python=3.10
RUN chmod -R 777 /opt/miniconda3
# Reference Implementation for llama2-70b
**Basic implementation for llama2-70b. A few noteworthy items:**
+ The streamer used to communicate with loadgen adds considerable overhead; it is only meant to provide a functional implementation.
+ For custom/optimized implementations of this benchmark it is important to include the following:
  - For the Server scenario, it is necessary to call `lg.FirstTokenComplete(response)` for each query. This way the first token will be reported and its latency will be measured.
  - For all scenarios, when calling `lg.QuerySamplesComplete(response)`, it is necessary that each element in `response` is a `lg.QuerySampleResponse` that contains the number of tokens (it can be created this way: `lg.QuerySampleResponse(qitem.id, bi[0], bi[1], n_tokens)`). The number of tokens reported should match the number of tokens in your answer; this is checked in [TEST06](../../compliance/nvidia/TEST06/). A sketch of both calls follows.
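As a rough sketch of these two calls (hedged; variable names like `qitem`, `first_tok`, and `output_toks` are illustrative, the authoritative code lives in `SUT.py`):
```
import array
import numpy as np
import mlperf_loadgen as lg


def report_first_token(qitem, first_tok):
    # Server scenario: report the first generated token so its latency is measured.
    data = array.array("B", np.array([first_tok], np.int32).tobytes())
    bi = data.buffer_info()
    lg.FirstTokenComplete([lg.QuerySampleResponse(qitem.id, bi[0], bi[1])])


def report_completion(qitem, output_toks):
    # All scenarios: report the full output together with the token count (checked by TEST06).
    data = array.array("B", np.array(output_toks, np.int32).tobytes())
    bi = data.buffer_info()
    lg.QuerySamplesComplete(
        [lg.QuerySampleResponse(qitem.id, bi[0], bi[1], len(output_toks))])
```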
Please see the [new docs site](https://docs.mlcommons.org/inference/benchmarks/language/llama2-70b) for an automated way to run this benchmark across different available implementations and do an end-to-end submission with or without docker.
## Prepare environment
Copy the mlperf.conf file to this folder.
```
cp ../../mlperf.conf .
```
For a CPU-only run:
```
conda create -n llama2-70b python=3.9
conda activate llama2-70b
# Install packages
conda install pybind11==2.10.4 -c conda-forge -y
python -m pip install torch==2.2.0.dev20231006+cpu --index-url https://download.pytorch.org/whl/nightly/cpu
pip install transformers==4.31.0 nltk==3.8.1 evaluate==0.4.0 absl-py==1.4.0 rouge-score==0.1.2 sentencepiece==0.1.99 accelerate==0.21.0
export CUR_DIR=${PWD}
cd <inference-repo-root>/loadgen
# Need to fetch Pablo's changes
git fetch origin pull/1523/head:llm-server
git merge llm-server
python -m pip install .
```
For a GPU-based run:
A dockerfile is provided, along with scripts to help launch it. First, add any docker volume mounts you want in
`launch.sh`. There is a section at the top of the file that looks like:
```
# Add any volume mounts here with the following syntax
# /path/to/src:/path/to/dir/in/container
MOUNTS=(
$MLCOMMONS_REPO_PATH:$MLCOMMONS_REPO_PATH
)
```
For example, if you have a raid space located at `/raid/data` on your local machine, you can add it to the same path in the container like so:
```
# Add any volume mounts here with the following syntax
# /path/to/src:/path/to/dir/in/container
MOUNTS=(
$MLCOMMONS_REPO_PATH:$MLCOMMONS_REPO_PATH
/raid/data:/raid/data
)
```
Once you have added all your mounts, launch the container with `bash launch.sh`.
Inside the container, set up the environment with `bash build.sh`. This will install all the dependencies from the
CPU-only setup, as well as any GPU versions for applicable libraries like PyTorch.
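For reference, the two steps in order (the first on the host from this directory, the second inside the container):
```
# On the host: build the image and start the container
bash launch.sh
# Inside the container: install the CPU dependencies plus GPU builds of libraries such as PyTorch
bash build.sh
```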
## Get Model
### MLCommons Members Download
MLCommons hosts the model and preprocessed dataset for download **exclusively by MLCommons Members**. You must first agree to the [confidentiality notice](https://llama2.mlcommons.org) using your organizational email address, then you will receive a link to a directory containing Rclone download instructions. _If you cannot access the form but you are part of an MLCommons Member organization, submit the [MLCommons subscription form](https://mlcommons.org/community/subscribe/) with your organizational email address and [associate a Google account](https://accounts.google.com/SignUpWithoutGmail) with your organizational email address._
### External Download
+ First go to [llama2-request-link](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and make a request, then sign in to HuggingFace (if you don't have an account, you'll need to create one). **Please note your authentication credentials**, as you may be required to provide them when cloning below.
+ Requires Git Large File Storage (git-lfs)
```
export CHECKPOINT_PATH=${PWD}/Llama-2-70b-chat-hf
git lfs install
git clone https://huggingface.co/meta-llama/Llama-2-70b-chat-hf ${CHECKPOINT_PATH}
```
## Get Dataset
### Preprocessed
You can use Rclone to download the preprocessed dataset from a Cloudflare R2 bucket.
To run Rclone on Windows, you can download the executable [here](https://rclone.org/install/#windows).
To install Rclone on Linux/macOS/BSD systems, run:
```
sudo -v ; curl https://rclone.org/install.sh | sudo bash
```
Once Rclone is installed, run the following command to authenticate with the bucket:
```
rclone config create mlc-inference s3 provider=Cloudflare access_key_id=f65ba5eef400db161ea49967de89f47b secret_access_key=fbea333914c292b854f14d3fe232bad6c5407bf0ab1bebf78833c2b359bdfd2b endpoint=https://c2686074cb2caf5cbaf6d134bdba8b47.r2.cloudflarestorage.com
```
You can then navigate in the terminal to your desired download directory and run the following command to download the dataset:
```
rclone copy mlc-inference:mlcommons-inference-wg-public/open_orca ./open_orca -P
```
### Unprocessed
You can also download and process the dataset yourself using the commands below:
```
# First get the `open-orca` parquet from huggingface
export OPENORCA_DATASET=${PWD}/open-orca
git clone https://huggingface.co/datasets/Open-Orca/OpenOrca ${OPENORCA_DATASET}
export OPENORCA_PARQUET=${OPENORCA_DATASET}/1M-GPT4-Augmented.parquet
EXPORT_DIR=${PWD}/processed-openorca
export DATASET_PATH=${PWD}/processed-data.pkl
# Process the dataset according to the Taskforce's agreed criteria
python3 processorca.py --dataset_pq_path=${OPENORCA_PARQUET} --model_dir=${CHECKPOINT_PATH} --seqlen_limit=1024 --export_dir=${EXPORT_DIR} --num_total_samples=24576
mv ${EXPORT_DIR}/open_orca_gpt4_tokenized_llama.sampled_24576.pkl ${DATASET_PATH}
```
The script will perform the following steps on the original open_orca GPT4 dataset (a rough sketch follows the list):
- filter out all queries with non-ASCII characters, except for normal unicode quotes and hyphens
- filter out all queries with out-of-bound input/output sequence lengths
- filter out all queries with expected answers shorter than 2 words (known to cause issues for Llama2)
- filter out all queries with prompts that generate bad output texts using Llama2 models
- sample equally from the sub-datasets (i.e. COT, NIV, FLAN, T0) to form the final dataset
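As a rough, hypothetical sketch of what these filters amount to (assuming a pandas DataFrame `df` with `input`, `output`, `tok_input`, `tok_output`, and `origin` columns; the authoritative logic lives in `processorca.py`):
```
import pandas as pd


def filter_sketch(df: pd.DataFrame, seqlen_limit: int = 1024, n_total: int = 24576) -> pd.DataFrame:
    # 1. Keep ASCII-only inputs/outputs (the real script also allows a few unicode quotes/hyphens).
    ascii_ok = lambda s: isinstance(s, str) and s.isascii()
    df = df[df["input"].apply(ascii_ok) & df["output"].apply(ascii_ok)]
    # 2. Drop out-of-bound input/output sequence lengths.
    df = df[(df["tok_input"].apply(len) < seqlen_limit) & (df["tok_output"].apply(len) < seqlen_limit)]
    # 3. Drop very short expected answers (the real script keeps >= 3 output tokens).
    df = df[df["tok_output"].apply(len) >= 3]
    # 4. Dropping known-bad system prompts is omitted here; see processorca.py for the list.
    # 5. Sample equally from each sub-dataset origin (cot, niv, flan, t0).
    per_origin = n_total // df["origin"].nunique()
    sampled = df.groupby("origin", group_keys=False).apply(
        lambda g: g.sample(n=min(len(g), per_origin), random_state=1337)
    )
    return sampled.reset_index(drop=True)
```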
## Run Performance Benchmarks
### Offline
```
python -u main.py --scenario Offline \
--model-path ${CHECKPOINT_PATH} \
--mlperf-conf mlperf.conf \
--user-conf user.conf \
--total-sample-count 24576 \
--device cpu \
--dataset-path ${DATASET_PATH} \
--output-log-dir offline-logs
```
For a GPU-based run:
```
python3 -u main.py --scenario Offline \
--model-path ${CHECKPOINT_PATH} \
--mlperf-conf mlperf.conf \
--user-conf user.conf \
--total-sample-count 24576 \
--dataset-path ${DATASET_PATH} \
--output-log-dir offline-logs \
--dtype float32 \
--device cuda:0 2>&1 | tee offline_performance_log.log
```
### Server
```
python -u main.py --scenario Server \
--model-path ${CHECKPOINT_PATH} \
--mlperf-conf mlperf.conf \
--user-conf user.conf \
--total-sample-count 24576 \
--device cpu \
--dataset-path ${DATASET_PATH} \
--output-log-dir server-logs
```
The ServerSUT was not tested for GPU runs.
## Run Accuracy Benchmarks
### Offline
```
OUTPUT_LOG_DIR=offline-accuracy-logs
mkdir -p "run_outputs" # The script will dump all the outputs to 'run_outputs'.
python -u main.py --scenario Offline \
--model-path ${CHECKPOINT_PATH} \
--accuracy \
--mlperf-conf mlperf.conf \
--user-conf user.conf \
--total-sample-count 24576 \
--dataset-path ${DATASET_PATH} \
--output-log-dir ${OUTPUT_LOG_DIR} \
--device cpu
ACCURACY_LOG_FILE=${OUTPUT_LOG_DIR}/mlperf_log_accuracy.json
if [ -e ${ACCURACY_LOG_FILE} ]; then
python evaluate-accuracy.py --checkpoint-path ${CHECKPOINT_PATH} \
--mlperf-accuracy-file ${ACCURACY_LOG_FILE} --dataset-file ${DATASET_PATH} --dtype int32
fi
# Optional: Create a pickled pandas DataFrame that is the original dataset with extra columns with output data from the
# accuracy run. The following columns will be added:
# - "gen_output_tok_id": A list of ints representing the tokenized output sequence.
# - "gen_output_text": A str representing the untokenized output sequence.
# - "gen_output_tok_len": An int representing the number of output tokens.
# - "rouge1": The rouge1 score for this sample
# - "rouge2": The rouge2 score for this sample
# - "rougeL": The rougeL score for this sample
# This file will by default be saved to 'full_output.pkl'. You can modify this with --output-pkl-path.
python consolidate_results.py --dataset-path ${DATASET_PATH} --model-dir ${CHECKPOINT_PATH}
```
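To sanity-check the consolidated file afterwards, a minimal (hypothetical) snippet might be:
```
import pandas as pd

# Load the consolidated pickle produced by consolidate_results.py (default path).
df = pd.read_pickle("full_output.pkl")
# Inspect the added per-sample columns.
print(df[["gen_output_tok_len", "rouge1", "rouge2", "rougeL"]].describe())
```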
For the GPU run, the above steps have been automated in `run_accuracy.sh`. You can also modify this script to use
`--device cpu` to adapt it to a CPU-only run.
### Server
```
OUTPUT_LOG_DIR=server-accuracy-logs
python -u main.py --scenario Server \
--model-path ${CHECKPOINT_PATH} \
--accuracy \
--mlperf-conf mlperf.conf \
--user-conf user.conf \
--total-sample-count 24576 \
--dataset-path ${DATASET_PATH} \
--output-log-dir ${OUTPUT_LOG_DIR} \
--device cpu
ACCURACY_LOG_FILE=${OUTPUT_LOG_DIR}/mlperf_log_accuracy.json
if [ -e ${ACCURACY_LOG_FILE} ]; then
python evaluate-accuracy.py --checkpoint-path ${CHECKPOINT_PATH} \
--mlperf-accuracy-file ${ACCURACY_LOG_FILE} --dataset-file ${DATASET_PATH} --dtype int32
fi
```
The ServerSUT was not tested for GPU runs.
## Accuracy Target
Running the GPU implementation in FP16 precision resulted in the following FP16 accuracy targets (normalized to a 0-100
scale from a 0.0-1.0 scale):
- Rouge1: 44.4312
- Rouge2: 22.0352
- RougeL: 28.6162
- Tokens per sample: 294.45
This was run on a DGX-H100 node. Total runtime was ~4.5 days.
# Run llama2-70b-interactive benchmark
For official Llama2-70b submissions it is also possible to submit in the interactive category. This sets stricter latency requirements for Time to First Token (TTFT) and Time Per Output Token (TPOT). Specifically, the interactive category requires loadgen to enforce `ttft <= 450ms` and `tpot <= 40ms`.
In order to run the interactive category, it is sufficient to set the flag `--lg-model-name` to `llama2-70b-interactive` when calling `main.py`. For example, to run the server scenario in interactive mode:
```
python -u main.py --scenario Server \
--model-path ${CHECKPOINT_PATH} \
--mlperf-conf mlperf.conf \
--user-conf user.conf \
--total-sample-count 24576 \
--device cpu \
--dataset-path ${DATASET_PATH} \
--output-log-dir server-logs \
--lg-model-name llama2-70b-interactive
```
import os
import time
import numpy as np
import array
import torch
from torch.nn.functional import pad
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM
from transformers.generation.streamers import BaseStreamer
import pickle
import time
import threading
import tqdm
import queue
import logging
from typing import TYPE_CHECKING, Optional, List
from pathlib import Path
import mlperf_loadgen as lg
from dataset import Dataset
logging.basicConfig(level=logging.INFO)
log = logging.getLogger("Llama-70B-SUT")
gen_kwargs = {
"early_stopping": True,
"max_new_tokens": 1024,
"min_new_tokens": 1,
"num_beams": 1,
"do_sample": False,
}
class FirstTokenStreamer(BaseStreamer):
"""Streams first tokens to a 'holder'"""
def __init__(
self, first_token, tokens_cache=[], is_first_token=True, response_ids=[]
):
"""Response ids added to 'sign' the first token"""
self.first_token = first_token # Queue for first token
self.is_first_token = is_first_token
# Cache for subsequent generated tokens
self.tokens_cache = tokens_cache
self.response_ids = response_ids
self.is_prompt = (
True # The first tokens sent to the streamer are actually the input prompts
)
def put(self, value):
"""Caches the tokens as they're generated. Assumes bs=1"""
# Prompts are streamed first so we need to skip the first time value
# that arrives
if self.is_prompt:
self.is_prompt = False
return
value = value.item()
if self.is_first_token:
# Add generated first token together with its query response_id to
# first tokens queue
self.first_token.put((value, self.response_ids[0]))
self.is_first_token = False
return
self.tokens_cache.append(value)
def end(self):
pass
def get_out_tokens(self):
return self.tokens_cache
class SUT:
def __init__(
self,
model_path=None,
dtype="bfloat16",
device="cpu",
batch_size=None,
total_sample_count=24576,
dataset_path=None,
use_cached_outputs=False,
# Set this to True *only for test accuracy runs* in case your prior
# session was killed partway through
workers=1,
):
self.model_path = model_path or "meta-llama/Llama-2-70b-chat-hf"
self.device = device
if not batch_size:
if device == "cpu":
batch_size = 1
else:
batch_size = 32 # Reduce to 8 if using 4 GPUs, 16 for 8.
self.batch_size = batch_size
# dtype
if dtype == "bfloat16":
self.amp_enabled = True
self.amp_dtype = torch.bfloat16
elif dtype == "float16":
self.amp_enabled = True
self.amp_dtype = torch.float16
else:
self.amp_enabled = False
self.amp_dtype = torch.float32
if "cuda" in self.device:
assert torch.cuda.is_available(), "torch gpu is not available, exiting..."
self.dataset_path = dataset_path
self.data_object = Dataset(
self.model_path,
dataset_path=self.dataset_path,
total_sample_count=total_sample_count,
device=self.device,
)
self.qsl = lg.ConstructQSL(
self.data_object.total_sample_count,
self.data_object.perf_count,
self.data_object.LoadSamplesToRam,
self.data_object.UnloadSamplesFromRam,
)
self.load_model()
self.num_workers = workers
self.worker_threads = [None] * self.num_workers
self.query_queue = queue.Queue()
self.use_cached_outputs = use_cached_outputs
self.sample_counter = 0
self.sample_counter_lock = threading.Lock()
def start(self):
# Create worker threads
for j in range(self.num_workers):
worker = threading.Thread(target=self.process_queries)
worker.start()
self.worker_threads[j] = worker
def stop(self):
for _ in range(self.num_workers):
self.query_queue.put(None)
for worker in self.worker_threads:
worker.join()
def process_queries(self):
"""Processor of the queued queries. User may choose to add batching logic"""
while True:
qitem = self.query_queue.get()
if qitem is None:
break
query_ids = [q.index for q in qitem]
fname = "q" + "_".join([str(i) for i in query_ids])
fname = f"run_outputs/{fname}.pkl"
_p = Path(fname)
if self.use_cached_outputs and _p.exists():
# Read cache
with _p.open(mode="rb") as f:
d = pickle.load(f)
processed_output = d["outputs"]
tik1 = None
tik2 = None
tik3 = None
tok = None
else:
# Construct / collate batch
max_seq_len = 1024
tik1 = time.time()
input_ids_tensor = []
input_masks_tensor = []
input_len = []
for q in qitem:
input_ids_tensor.append(
pad(
self.data_object.input_ids[q.index],
(
max_seq_len -
self.data_object.input_lens[q.index],
0,
0,
0,
),
value=self.tokenizer.pad_token_id,
)
)
input_masks_tensor.append(
pad(
self.data_object.attention_masks[q.index],
(
max_seq_len -
self.data_object.input_lens[q.index],
0,
0,
0,
),
value=0,
)
)
input_len.append(self.data_object.input_lens[q.index])
input_ids_tensor = torch.cat(input_ids_tensor)
input_masks_tensor = torch.cat(input_masks_tensor)
assert input_ids_tensor.shape == input_masks_tensor.shape
assert input_ids_tensor.shape[0] <= self.batch_size
tik2 = time.time()
pred_output_tokens = self.model.generate(
input_ids=input_ids_tensor,
attention_mask=input_masks_tensor,
pad_token_id=self.tokenizer.pad_token_id,
**gen_kwargs,
)
tik3 = time.time()
processed_output = self.data_object.postProcess(
pred_output_tokens,
input_seq_lens=input_len,
query_id_list=query_ids,
)
for i in range(len(qitem)):
n_tokens = processed_output[i].shape[0]
response_array = array.array(
"B", processed_output[i].tobytes())
bi = response_array.buffer_info()
response = [
lg.QuerySampleResponse(
qitem[i].id,
bi[0],
bi[1],
n_tokens)]
lg.QuerySamplesComplete(response)
tok = time.time()
with self.sample_counter_lock:
self.sample_counter += len(qitem)
print(f"Samples run: {self.sample_counter}")
if tik1:
print(f"\tBatchMaker time: {tik2 - tik1}")
print(f"\tInference time: {tik3 - tik2}")
print(f"\tPostprocess time: {tok - tik3}")
print(f"\t==== Total time: {tok - tik1}")
else:
print(f"\tLoaded from cache: {_p}")
def load_model(self):
self.model = LlamaForCausalLM.from_pretrained(
self.model_path,
device_map="auto",
low_cpu_mem_usage=True,
torch_dtype=self.amp_dtype,
)
print("Loaded model")
self.device = torch.device(self.device)
if self.device.type == "cpu":
self.model = self.model.to(
self.device
) # Force CPU if your system has GPU and you specifically want CPU-only run
self.model.eval()
try: # for systems with low ram, the below command gives error as some part is offloaded to disk
self.model = self.model.to(memory_format=torch.channels_last)
except BaseException:
pass
self.tokenizer = AutoTokenizer.from_pretrained(
self.model_path,
model_max_length=1024,
padding_side="left",
use_fast=False,
)
self.tokenizer.pad_token = self.tokenizer.eos_token
print("Loaded tokenizer")
def get_sut(self):
self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries)
return self.sut
def get_qsl(self):
return self.qsl
def predict(self, **kwargs):
raise NotImplementedError
def issue_queries(self, query_samples):
"""Receives samples from loadgen and adds them to queue. Users may choose to batch here"""
list_prompts_tokens = []
list_prompts_attn_masks = []
print(f"IssueQuery started with {len(query_samples)} samples")
while len(query_samples) > 0:
self.query_queue.put(query_samples[: self.batch_size])
query_samples = query_samples[self.batch_size:]
print(f"IssueQuery done")
def flush_queries(self):
pass
def __del__(self):
pass
class SUTServer(SUT):
def __init__(
self,
model_path=None,
dtype="bfloat16",
device="cpu",
total_sample_count=24576,
dataset_path=None,
batch_size=None,
workers=1,
):
super().__init__(
model_path=model_path,
dtype=dtype,
device=device,
total_sample_count=total_sample_count,
dataset_path=dataset_path,
workers=workers,
)
self.first_token_queue = queue.Queue()
def start(self):
# Create worker threads
for j in range(self.num_workers):
worker = threading.Thread(target=self.process_queries)
worker.start()
self.worker_threads[j] = worker
# Create first token response thread
self.ft_response_thread = threading.Thread(
target=self.process_first_tokens)
self.ft_response_thread.start()
def process_first_tokens(self):
while True:
first_token_item = self.first_token_queue.get()
if first_token_item is None:
log.info("Exiting First token response thread")
break
first_tokens, response_id = first_token_item
response_data = array.array(
"B", np.array(
first_tokens, np.int32).tobytes())
bi = response_data.buffer_info()
response = [lg.QuerySampleResponse(response_id, bi[0], bi[1])]
lg.FirstTokenComplete(response)
def process_queries(self):
"""Processor of the queued queries. User may choose to add batching logic"""
while True:
qitem = self.query_queue.get()
if qitem is None:
break
input_ids_tensor = self.data_object.input_ids[qitem.index]
input_masks_tensor = self.data_object.attention_masks[qitem.index]
# TODO: This PoC is super slow with significant overhead. Best to
# create a patch to `generate`
tokens_cache = []
tokens_streamer = FirstTokenStreamer(
self.first_token_queue,
tokens_cache=tokens_cache,
is_first_token=True,
response_ids=[qitem.id],
)
_ = self.model.generate(
input_ids=input_ids_tensor,
attention_mask=input_masks_tensor,
pad_token_id=self.tokenizer.pad_token_id,
streamer=tokens_streamer,
**gen_kwargs,
)
output_tokens = tokens_streamer.get_out_tokens()
n_tokens = len(output_tokens)
response_array = array.array(
"B", np.array(output_tokens, np.int32).tobytes()
)
bi = response_array.buffer_info()
response = [
lg.QuerySampleResponse(
qitem.id,
bi[0],
bi[1],
n_tokens)]
lg.QuerySamplesComplete(response)
def issue_queries(self, query_samples):
self.query_queue.put(query_samples[0])
def stop(self):
for _ in range(self.num_workers):
self.query_queue.put(None)
for worker in self.worker_threads:
worker.join()
self.first_token_queue.put(None)
self.ft_response_thread.join()
import os
import sys  # needed for sys.exit() in SUTServer.async_process_query
import time
import numpy as np
import array
import torch
from torch.nn.functional import pad
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM
from transformers.generation.streamers import BaseStreamer
import json
import pickle
import time
import threading
import tqdm
import queue
import logging
from typing import TYPE_CHECKING, Optional, List
from pathlib import Path
import more_itertools as mit
from concurrent.futures.thread import ThreadPoolExecutor
import requests
from urllib3.exceptions import InsecureRequestWarning
import mlperf_loadgen as lg
from dataset import Dataset
logging.basicConfig(level=logging.INFO)
log = logging.getLogger("Llama-70B-SUT")
gen_kwargs = {
"early_stopping": True,
"max_new_tokens": 1024,
"min_new_tokens": 1,
"num_beams": 1,
"do_sample": False,
}
class FirstTokenStreamer(BaseStreamer):
"""Streams first tokens to a 'holder'"""
def __init__(
self, first_token, tokens_cache=[], is_first_token=True, response_ids=[]
):
"""Response ids added to 'sign' the first token"""
self.first_token = first_token # Queue for first token
self.is_first_token = is_first_token
# Cache for subsequent generated tokens
self.tokens_cache = tokens_cache
self.response_ids = response_ids
self.is_prompt = (
True # The first tokens sent to the streamer are actually the input prompts
)
def put(self, value):
"""Caches the tokens as they're generated. Assumes bs=1"""
# Prompts are streamed first so we need to skip the first time value
# that arrives
if self.is_prompt:
self.is_prompt = False
return
value = value.item()
if self.is_first_token:
# Add generated first token together with its query response_id to
# first tokens queue
self.first_token.put((value, self.response_ids[0]))
self.is_first_token = False
return
self.tokens_cache.append(value)
def end(self):
pass
def get_out_tokens(self):
return self.tokens_cache
class SUT:
def __init__(
self,
model_path=None,
api_server=None,
api_model_name=None,
dtype="bfloat16",
device="cpu",
batch_size=None,
total_sample_count=24576,
dataset_path=None,
use_cached_outputs=False,
# Set this to True *only for test accuracy runs* in case your prior
# session was killed partway through
workers=1,
):
self.model_path = model_path or "meta-llama/Llama-2-70b-chat-hf"
self.device = device
self.api_servers = []
if api_server:
self.api_servers.append(api_server)
self.api_model_name = api_model_name
self.device = device
batch_size = total_sample_count
self.batch_size = batch_size
# dtype
if dtype == "bfloat16":
self.amp_enabled = True
self.amp_dtype = torch.bfloat16
elif dtype == "float16":
self.amp_enabled = True
self.amp_dtype = torch.float16
else:
self.amp_enabled = False
self.amp_dtype = torch.float32
if "cuda" in self.device:
assert torch.cuda.is_available(), "torch gpu is not available, exiting..."
self.dataset_path = dataset_path
self.data_object = Dataset(
self.model_path,
dataset_path=self.dataset_path,
total_sample_count=total_sample_count,
device=self.device,
)
self.qsl = lg.ConstructQSL(
self.data_object.total_sample_count,
self.data_object.perf_count,
self.data_object.LoadSamplesToRam,
self.data_object.UnloadSamplesFromRam,
)
# self.load_model()
self.tokenizer = AutoTokenizer.from_pretrained(
self.model_path,
model_max_length=1024,
padding_side="left",
use_fast=True,
) # changed from false
self.tokenizer.pad_token = self.tokenizer.eos_token
self.num_workers = workers
self.worker_threads = [None] * self.num_workers
self.query_queue = queue.Queue()
self.use_cached_outputs = use_cached_outputs
self.sample_counter = 0
self.sample_counter_lock = threading.Lock()
def start(self):
# Create worker threads
for j in range(self.num_workers):
worker = threading.Thread(target=self.process_queries)
worker.start()
self.worker_threads[j] = worker
def stop(self):
for _ in range(self.num_workers):
self.query_queue.put(None)
for worker in self.worker_threads:
worker.join()
def query_api_vllm(self, inputs, idx):
headers = {
"Content-Type": "application/json",
}
json_data = {
"model": self.api_model_name,
"prompt": inputs,
"min_tokens": 1,
"max_tokens": 1024,
}
response_code = 0
print(f"Server path {self.api_servers[idx]}/v1/completions")
while response_code != 200:
try:
response = requests.post(
f"{self.api_servers[idx]}/v1/completions",
headers=headers,
json=json_data,
verify=False,
)
response_code = response.status_code
except Exception as e:
print(e)
print("connection failure")
break
return [resp["text"] for resp in json.loads(response.text)["choices"]]
def api_action_handler(self, chunk, server_idx):
output = self.query_api_vllm(chunk, server_idx)
return output
def process_queries(self):
"""Processor of the queued queries. User may choose to add batching logic"""
while True:
qitem = self.query_queue.get()
if qitem is None:
break
query_ids = [q.index for q in qitem]
fname = "q" + "_".join([str(i) for i in query_ids])
fname = f"run_outputs/{fname}.pkl"
_p = Path(fname)
if self.use_cached_outputs and _p.exists():
# Read cache
with _p.open(mode="rb") as f:
d = pickle.load(f)
processed_output = d["outputs"]
tik1 = None
tik2 = None
tik3 = None
tok = None
else:
# Construct / collate batch
max_seq_len = 1024
tik1 = time.time()
# OpenAI-API servers don't require padding and can take input tokens
# directly, so we build our input_ids_tensor as a jagged list
input_ids_tensor = []
for q in qitem:
# input_ids_tensor.append(self.data_object.input_ids[q.index].tolist())
input_ids_tensor += self.data_object.input_ids[q.index].tolist(
)
# NOTE(mgoin): I don't think this has to be a torch tensor
# input_ids_tensor = torch.cat(input_ids_tensor)
# print(input_ids_tensor)
assert len(input_ids_tensor) <= self.batch_size
tik2 = time.time()
# NOTE(mgoin): I don't think threading is necessary since we are submitting all queries in one request
# The API server should take care of mini-batches and
# scheduling
if self.api_servers:
"""
decoded = self.tokenizer.batch_decode(input_ids_tensor)
cleaned = [entry.replace('</s>','').replace('<s>','') for entry in decoded]
cleaned_chunks = [list(c) for c in mit.divide(len(self.api_servers), cleaned)]
"""
cleaned_chunks = [input_ids_tensor]
with ThreadPoolExecutor(
max_workers=len(self.api_servers)
) as executor:
# needs to be tested
output_chunks = list(
executor.map(
self.api_action_handler,
cleaned_chunks,
range(len(self.api_servers)),
)
)
output = []
for row in output_chunks:
output += row
else:
print(
"Error: Specify at least one API to which the request is to be sent!"
)
exit(1)
tik3 = time.time()
processed_output = self.tokenizer(output)["input_ids"]
# for i in range(len(qitem)):
for i in range(len(processed_output)):
# NOTE(mgoin): Not optimal to make numpy arrays just to
# serialize
unpadded = np.array(processed_output[i])
n_tokens = unpadded.shape[0]
response_array = array.array("B", unpadded.tobytes())
bi = response_array.buffer_info()
response = [
lg.QuerySampleResponse(
qitem[i].id,
bi[0],
bi[1],
n_tokens)]
lg.QuerySamplesComplete(response)
tok = time.time()
with self.sample_counter_lock:
self.sample_counter += len(qitem)
print(f"Samples run: {self.sample_counter}")
if tik1:
print(f"\tBatchMaker time: {tik2 - tik1}")
print(f"\tInference time: {tik3 - tik2}")
print(f"\tPostprocess time: {tok - tik3}")
print(f"\t==== Total time: {tok - tik1}")
else:
print(f"\tLoaded from cache: {_p}")
def get_sut(self):
self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries)
return self.sut
def get_qsl(self):
return self.qsl
def predict(self, **kwargs):
raise NotImplementedError
def issue_queries(self, query_samples):
"""Receives samples from loadgen and adds them to queue. Users may choose to batch here"""
list_prompts_tokens = []
list_prompts_attn_masks = []
print(f"IssueQuery started with {len(query_samples)} samples")
while len(query_samples) > 0:
self.query_queue.put(query_samples[: self.batch_size])
query_samples = query_samples[self.batch_size:]
print(f"IssueQuery done")
def flush_queries(self):
pass
def __del__(self):
pass
class SUTServer(SUT):
def __init__(
self,
model_path=None,
api_server=None,
api_model_name=None,
dtype="bfloat16",
device="cpu",
total_sample_count=24576,
dataset_path=None,
batch_size=None,
workers=1,
):
super().__init__(
model_path=model_path,
api_server=None,
api_model_name=None,
dtype=dtype,
device=device,
total_sample_count=total_sample_count,
dataset_path=dataset_path,
workers=workers,
)
with open(f"{self.model_path}/tokenizer.json", "r") as token_file:
llama_tokenizer = json.load(token_file)
self.llama_vocab = llama_tokenizer["model"]["vocab"]
self.first_token_queue = queue.Queue()
def start(self):
# Create worker threads
for j in range(self.num_workers):
worker = threading.Thread(target=self.process_queries)
worker.start()
self.worker_threads[j] = worker
# Create first token response thread
self.ft_response_thread = threading.Thread(
target=self.process_first_tokens)
self.ft_response_thread.start()
def process_first_tokens(self):
while True:
first_token_item = self.first_token_queue.get()
if first_token_item is None:
log.info("Exiting First token response thread")
break
first_tokens, response_id = first_token_item
response_data = array.array(
"B", np.array(first_tokens, np.float32).tobytes()
)
bi = response_data.buffer_info()
response = [lg.QuerySampleResponse(response_id, bi[0], bi[1])]
lg.FirstTokenComplete(response)
def stream_api_vllm(self, input, response_ids, idx):
headers = {
"Content-Type": "application/json",
}
json_data = {
"model": "/opt/app-root/share/models",
"prompt": input,
"max_tokens": 1024,
"temperature": 0,
"stream": True,
"logprobs": 1,
}
while True:
try:
token_cache = []
s = requests.Session()
first = True
with s.post(
f"{self.api_servers[idx]}/v1/completions",
headers=headers,
json=json_data,
verify=False,
stream=True,
) as resp:
for line in resp.iter_lines():
if line:
decoded = line.decode()
if decoded.startswith(
"data") and "[DONE]" not in decoded:
inter = json.loads(decoded[6:])["choices"][0][
"logprobs"
]
if "top_logprobs" in inter:
token_s = list(
inter["top_logprobs"][0].keys())[0]
token = self.llama_vocab[token_s]
if first:
self.first_token_queue.put(
(token, response_ids[0])
)
first = False
token_cache.append(token)
s.close()
if token_cache:
return token_cache
except Exception as e:
s.close()
print("Connection failure")
print(f"An exception occurred: {type(e).__name__}")
print(f"Exception details: {e}")
def async_process_query(self, input_ids_tensor, qitem_id, idx):
decoded = self.tokenizer.decode(input_ids_tensor[0])
response_ids = [qitem_id]
output_tokens = self.stream_api_vllm(decoded, response_ids, idx)
n_tokens = len(output_tokens)
if n_tokens <= 1:
print("WARNING: caught low token count")
print(input_ids_tensor)
print(output_tokens)
response_array = array.array(
"B", np.array(
output_tokens, np.int32).tobytes())
bi = response_array.buffer_info()
response = [lg.QuerySampleResponse(qitem_id, bi[0], bi[1], n_tokens)]
lg.QuerySamplesComplete(response)
sys.exit()
def process_queries(self):
"""Processor of the queued queries. User may choose to add batching logic"""
server_idx = 0
while True:
qitem = self.query_queue.get()
if qitem is None:
break
input_ids_tensor = self.data_object.input_ids[qitem.index]
input_masks_tensor = self.data_object.attention_masks[qitem.index]
if self.api_servers:
threading.Thread(
target=self.async_process_query,
args=(input_ids_tensor, qitem.id, server_idx),
).start()
server_idx = (server_idx + 1) % len(self.api_servers)
else:
# TODO: This PoC is super slow with significant overhead. Best
# to create a patch to `generate`
tokens_cache = []
tokens_streamer = FirstTokenStreamer(
self.first_token_queue,
tokens_cache=tokens_cache,
is_first_token=True,
response_ids=[qitem.id],
)
_ = self.model.generate(
input_ids=input_ids_tensor,
attention_mask=input_masks_tensor,
pad_token_id=self.tokenizer.pad_token_id,
streamer=tokens_streamer,
**gen_kwargs,
)
output_tokens = tokens_streamer.get_out_tokens()
n_tokens = len(output_tokens)
response_array = array.array(
"B", np.array(output_tokens, np.int32).tobytes()
)
bi = response_array.buffer_info()
response = [
lg.QuerySampleResponse(
qitem.id, bi[0], bi[1], n_tokens)]
lg.QuerySamplesComplete(response)
def issue_queries(self, query_samples):
self.query_queue.put(query_samples[0])
def stop(self):
for _ in range(self.num_workers):
self.query_queue.put(None)
for worker in self.worker_threads:
worker.join()
self.first_token_queue.put(None)
self.ft_response_thread.join()
set -e
conda install pybind11==2.10.4 -c conda-forge -y
conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch-nightly -c nvidia
python -m pip install transformers==4.31.0 nltk==3.8.1 evaluate==0.4.0 absl-py==1.4.0 rouge-score==0.1.2 sentencepiece==0.1.99 accelerate==0.21.0
cd ../../loadgen && python3 -m pip install .
import argparse
import evaluate
import glob
import nltk
import numpy as np
import os
import pandas as pd
import pickle
from pathlib import Path
from transformers import LlamaTokenizerFast
from tqdm import tqdm
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--dataset-path",
type=str,
default=None,
help="Path to .pkl generated by processorca.py",
)
parser.add_argument(
"--run-outputs",
type=str,
default="run_outputs",
help="Output dir generated by accuracy run.",
)
parser.add_argument(
"--model-dir",
type=str,
default=None,
help="Path to Llamav2 HuggingFace repo clone",
)
parser.add_argument(
"--output-pkl-path",
type=str,
default="full_output.pkl",
help="Path to dump output to",
)
args = parser.parse_args()
return args
def load_dataset(p: os.PathLike):
print(f"Loading from {p}...")
return pd.read_pickle(p)
def load_run_outputs(p: os.PathLike):
g = glob.glob(str(Path(p) / "q*.pkl"))
by_query_idx = dict()
for pkl_file in g:
print(f"Loading from {pkl_file}...")
with open(pkl_file, "rb") as f:
d = pickle.load(f)
assert len(d["query_ids"]) == len(d["outputs"])
for i in range(len(d["query_ids"])):
qid = d["query_ids"][i]
assert qid not in by_query_idx
by_query_idx[qid] = d["outputs"][i]
return by_query_idx
def main(args):
# Set up decode and evaluation objects
tokenizer = LlamaTokenizerFast.from_pretrained(args.model_dir)
metric = evaluate.load("rouge")
nltk.download("punkt")
# Load Data
df = load_dataset(args.dataset_path)
run_outputs = load_run_outputs(args.run_outputs)
assert len(run_outputs) == 24576
# Set up columns to add
output_tok_ids_col = [None] * 24576
output_text_col = [None] * 24576
output_lens = [None] * 24576
# Process data
no_eos_ids = []
for qid, output in tqdm(run_outputs.items()):
L = list(output)
# Prune trailing 2s (EOS token)
try:
first2 = L.index(2)
L = L[:first2]
except ValueError:
# Do nothing
no_eos_ids.append(qid)
assert L[-1] != 2
output_tok_ids_col[qid] = L
output_lens[qid] = len(L)
# Decode tokens
output_text_col[qid] = tokenizer.decode(
output_tok_ids_col[qid], skip_special_tokens=True
)
print(f"Found {len(no_eos_ids)} samples with no EOS token")
print("Calculating rouge scores...")
def _preproc(s): return "\n".join(nltk.sent_tokenize(s.strip()))
preds = list(map(_preproc, output_text_col))
targets = list(map(_preproc, list(df["output"])))
rouge_scores = metric.compute(
predictions=preds, references=targets, use_stemmer=True, use_aggregator=False
)
assert len(rouge_scores["rouge1"]) == 24576
assert len(rouge_scores["rouge2"]) == 24576
assert len(rouge_scores["rougeL"]) == 24576
agg = {k: round(np.mean(v) * 100, 4) for k, v in rouge_scores.items()}
print(agg)
print("Avg output seqlen:", np.mean(output_lens))
# Set columns
df["gen_output_tok_id"] = output_tok_ids_col
df["gen_output_text"] = output_text_col
df["gen_output_tok_len"] = output_lens
df["rouge1"] = rouge_scores["rouge1"]
df["rouge2"] = rouge_scores["rouge2"]
df["rougeL"] = rouge_scores["rougeL"]
p = Path(args.output_pkl_path)
p.parent.mkdir(exist_ok=True)
df.to_pickle(p)
print(f"Dumped to {p}")
if __name__ == "__main__":
main(get_args())
import random
import os
import time
import numpy as np
import torch
from datasets import load_dataset, load_from_disk
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.nn.functional import pad
from torch.utils.data import DataLoader
from typing import Optional, Dict, Sequence
import io
# import utils
import copy
import pickle
import logging
logging.basicConfig(level=logging.INFO)
log = logging.getLogger("Llama-70B-Dataset")
class Dataset:
def __init__(
self,
model_name=None,
total_sample_count=24576,
perf_count_override=None,
dataset_path=None,
device="cpu",
):
self.model_name = model_name or "meta-llama/Llama-2-70b-chat-hf"
self.dataset_path = dataset_path
self.max_length = 1024
self.device = device
# self.total_sample_count = total_sample_count
self.load_tokenizer()
self.load_processed_dataset()
self.total_sample_count = min(len(self.input_ids), total_sample_count)
self.perf_count = perf_count_override or self.total_sample_count
def load_tokenizer(self):
"""Returns tokenizer"""
self.tokenizer = AutoTokenizer.from_pretrained(
self.model_name,
model_max_length=1024,
padding_side="left",
use_fast=False,
)
self.tokenizer.pad_token = self.tokenizer.eos_token
def load_processed_dataset(self):
if not os.path.isfile(self.dataset_path):
log.warn(
"Processed pickle file {} not found. Please check that the path is correct".format(
self.dataset_path
)
)
print("Loading dataset...")
import pandas as pd
processed_data = pd.read_pickle(self.dataset_path)
input_tokens = processed_data["tok_input"]
self.input_ids = []
self.input_lens = []
self.attention_masks = []
for ids in input_tokens:
input_ids = torch.tensor(ids, dtype=torch.int32).view(
1, -1).to(self.device)
attn_mask = torch.ones_like(input_ids)
self.input_ids.append(input_ids)
self.attention_masks.append(attn_mask)
self.input_lens.append(input_ids.shape[-1])
print("Finished loading dataset.")
def postProcess(
self,
out_tokens,
input_seq_lens=None,
query_id_list=None,
sample_index_list=None,
):
"""Postprocesses output prediction"""
# TODO: Create response object in postProcess(?)
"""
preds = []
for i in range(out_tokens.shape[0]):
#pred = out_tokens[i].reshape(-1).cpu().numpy() # Slice up to original input length as below?
input_len = input_seq_lens[i] if input_seq_lens else 0
pred = out_tokens[i, input_len:].reshape(-1).cpu().numpy()
preds.append(pred)
"""
# Everything is padded to max_len (1024), so prune the input and parse
# to numpy
output_seq = out_tokens[:, 1024:].cpu().numpy()
assert len(query_id_list) == output_seq.shape[0]
# Save outputs
if not os.path.exists("run_outputs"):
os.makedirs("run_outputs")
fname = "q" + "_".join([str(i) for i in query_id_list])
fname = f"run_outputs/{fname}.pkl"
with open(fname, mode="wb") as f:
d = {"query_ids": query_id_list, "outputs": output_seq}
print(f"Saving outputs to {fname}")
pickle.dump(d, f)
return output_seq
def LoadSamplesToRam(self, sample_list):
pass
def UnloadSamplesFromRam(self, sample_list):
pass
def __del__(self):
pass
import argparse
from transformers import AutoTokenizer
import nltk
import evaluate
import numpy as np
import json
from multiprocessing import Pool, cpu_count
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--checkpoint-path", required=True, help="Path to Llama2-70b-hf-chat checkpoint"
)
parser.add_argument(
"--mlperf-accuracy-file", required=True, help="path to mlperf_log_accuracy.json"
)
parser.add_argument(
"--dataset-file",
required=True,
help="path to processed openorca validation set",
)
parser.add_argument(
"--verbose",
action="store_true",
help="verbose messages")
parser.add_argument(
"--dtype",
default="int64",
help="dtype of the accuracy log",
choices=["int32", "int64", "float"],
)
args = parser.parse_args()
return args
def get_groundtruth(processed_dataset_file):
import pandas as pd
data = pd.read_pickle(processed_dataset_file)
ground_truths = data["output"]
return ground_truths
def postprocess_text(preds, targets):
preds = [pred.strip() for pred in preds]
targets = [target.strip() for target in targets]
# rougeLSum expects newline after each sentence
preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
targets = ["\n".join(nltk.sent_tokenize(target)) for target in targets]
return preds, targets
def compute_rouge_chunk(chunk):
"""Compute ROUGE scores for a chunk of predictions and references."""
metric = evaluate.load("rouge")
preds, targets = chunk
result = metric.compute(
predictions=preds, references=targets, use_stemmer=True, use_aggregator=False
)
return result
def main():
args = get_args()
dataset_path = args.dataset_file
checkpoint_path = args.checkpoint_path
nltk.download("punkt")
nltk.download("punkt_tab")
tokenizer = AutoTokenizer.from_pretrained(
checkpoint_path,
model_max_length=2048,
padding_side="left",
use_fast=False,
)
targets = get_groundtruth(args.dataset_file)
target_required = []
preds_token_ids = []
eval_dtype = np.int64
if args.dtype == "int32":
eval_dtype = np.int32
elif args.dtype == "float":
eval_dtype = np.float32
with open(args.mlperf_accuracy_file, "r") as f:
results = json.load(f)
seen = set()
gen_tok_len = 0
for pred in results:
qsl_idx = pred["qsl_idx"]
if qsl_idx in seen:
continue
seen.add(qsl_idx)
target = targets[qsl_idx]
target_required.append(target)
pred = np.frombuffer(bytes.fromhex(pred["data"]), eval_dtype)
gen_tok_len += len(pred)
preds_token_ids.append(pred)
preds_decoded_text = tokenizer.batch_decode(
preds_token_ids, skip_special_tokens=True
)
preds, targets = postprocess_text(preds_decoded_text, target_required)
# Split data into chunks for parallel processing
num_chunks = cpu_count() # Number of parallel processes
chunk_size = len(preds) // num_chunks + (len(preds) % num_chunks > 0)
chunks = [
(preds[i:i + chunk_size], targets[i:i + chunk_size])
for i in range(0, len(preds), chunk_size)
]
# Use multiprocessing Pool to compute ROUGE scores in parallel
with Pool(num_chunks) as pool:
results_list = pool.map(compute_rouge_chunk, chunks)
# Aggregate results from all chunks
aggregated_results = {}
for result in results_list:
for k, v in result.items():
if k not in aggregated_results:
aggregated_results[k] = []
aggregated_results[k].extend(v)
final_result = {k: round(np.mean(v) * 100, 4)
for k, v in aggregated_results.items()}
prediction_lens = [len(pred) for pred in preds]
gen_num = len(preds)
final_result.update({
"gen_len": np.sum(prediction_lens),
"gen_num": gen_num,
"gen_tok_len": gen_tok_len,
"tokens_per_sample": round(gen_tok_len / gen_num, 1),
})
print("\nResults\n")
print(final_result)
if __name__ == "__main__":
main()
#!/bin/bash
MLCOMMONS_REPO_PATH="$(dirname "$(dirname "$PWD")")"
# Add any volume mounts here with the following syntax
# /path/to/src:/path/to/dir/in/container
MOUNTS=(
$MLCOMMONS_REPO_PATH:$MLCOMMONS_REPO_PATH
)
# Set up docker environment file for current user
rm -f .docker_env
echo "CI_BUILD_USER=`id -u -n`" >> .docker_env
echo "CI_BUILD_UID=`id -u`" >> .docker_env
echo "CI_BUILD_GROUP=`id -g -n`" >> .docker_env
echo "CI_BUILD_GID=`id -g`" >> .docker_env
cat .docker_env
# Build container
docker build . -t llm/gpubringup
# Build mount flags
declare -a MOUNT_FLAGS
for _mount in ${MOUNTS[@]}; do
_split=($(echo $_mount | tr ':' '\n'));
MOUNT_FLAGS+=("--mount type=bind,source=${_split[0]},target=${_split[1]}");
done
set -x
nvidia-docker run -it --rm --net=host --runtime=nvidia --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \
--cap-add=SYS_PTRACE --cap-add=SYS_ADMIN --cap-add=DAC_READ_SEARCH \
--security-opt seccomp=unconfined \
-w $PWD \
--env-file `pwd`/.docker_env \
${MOUNT_FLAGS[*]} \
llm/gpubringup \
bash ./with_the_same_user
import subprocess
import mlperf_loadgen as lg
import argparse
import os
import logging
import sys
import requests
import json
sys.path.insert(0, os.getcwd())
logging.basicConfig(level=logging.INFO)
log = logging.getLogger("Llama-70B-MAIN")
# function to check the model name in server matches the user specified one
def verify_model_name(user_specified_name, url):
response = requests.get(url)
if response.status_code == 200:
response_dict = response.json()
server_model_name = response_dict["data"][0]["id"]
if user_specified_name == server_model_name:
return {"matched": True, "error": False}
else:
return {
"matched": False,
"error": f"User specified {user_specified_name} and server model name {server_model_name} mismatch!",
}
else:
return {
"matched": False,
"error": f"Failed to get a valid response. Status code: {response.status_code}",
}
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--scenario",
type=str,
choices=["Offline", "Server"],
default="Offline",
help="Scenario",
)
parser.add_argument(
"--model-path",
type=str,
default="meta-llama/Llama-2-70b-chat-hf",
help="Model name",
)
parser.add_argument("--dataset-path", type=str, default=None, help="")
parser.add_argument(
"--accuracy",
action="store_true",
help="Run accuracy mode")
parser.add_argument(
"--dtype",
type=str,
default="float32",
help="data type of the model, choose from float16, bfloat16 and float32",
)
parser.add_argument(
"--device",
type=str,
choices=["cpu", "cuda:0"],
default="cpu",
help="device to use",
)
parser.add_argument(
"--audit-conf",
type=str,
default="audit.conf",
help="audit config for LoadGen settings during compliance runs",
)
parser.add_argument(
"--user-conf",
type=str,
default="user.conf",
help="user config for user LoadGen settings such as target QPS",
)
# TODO: This interpretation of 'total-sample-count' is a little
# misleading. Fix it
parser.add_argument(
"--total-sample-count",
type=int,
default=24576,
help="Number of samples to use in benchmark.",
)
parser.add_argument(
"--batch-size",
type=int,
default=1,
help="Model batch-size to use in benchmark.",
)
parser.add_argument(
"--output-log-dir", type=str, default="output-logs", help="Where logs are saved"
)
parser.add_argument(
"--enable-log-trace",
action="store_true",
help="Enable log tracing. This file can become quite large",
)
parser.add_argument(
"--num-workers",
type=int,
default=1,
help="Number of workers to process queries",
)
parser.add_argument("--vllm", action="store_true", help="vllm mode")
parser.add_argument(
"--api-model-name",
type=str,
default="meta-llama/Llama-2-70b-chat-hf",
help="Model name(specified in llm server)",
)
parser.add_argument(
"--api-server",
type=str,
default=None,
help="Specify an api endpoint call to use api mode",
)
parser.add_argument(
"--lg-model-name",
type=str,
default="llama2-70b",
choices=["llama2-70b", "llama2-70b-interactive"],
help="Model name(specified in llm server)",
)
args = parser.parse_args()
return args
scenario_map = {
"offline": lg.TestScenario.Offline,
"server": lg.TestScenario.Server,
}
def main():
args = get_args()
if args.vllm:
resp = verify_model_name(
args.api_model_name,
args.api_server + "/v1/models")
if resp["error"]:
print(f"\n\n\033[91mError:\033[0m", end=" ")
print(resp["error"])
sys.exit(1)
settings = lg.TestSettings()
settings.scenario = scenario_map[args.scenario.lower()]
# mlperf.conf is automatically loaded by the loadgen
settings.FromConfig(args.user_conf, args.lg_model_name, args.scenario)
if args.accuracy:
settings.mode = lg.TestMode.AccuracyOnly
else:
settings.mode = lg.TestMode.PerformanceOnly
os.makedirs(args.output_log_dir, exist_ok=True)
log_output_settings = lg.LogOutputSettings()
log_output_settings.outdir = args.output_log_dir
log_output_settings.copy_summary_to_stdout = True
log_settings = lg.LogSettings()
log_settings.log_output = log_output_settings
log_settings.enable_trace = args.enable_log_trace
if args.vllm:
from SUT_API import SUT, SUTServer
else:
from SUT import SUT, SUTServer
sut_map = {"offline": SUT, "server": SUTServer}
sut_cls = sut_map[args.scenario.lower()]
if args.vllm:
sut = sut_cls(
model_path=args.model_path,
dtype=args.dtype,
batch_size=args.batch_size,
dataset_path=args.dataset_path,
total_sample_count=args.total_sample_count,
device=args.device,
api_server=args.api_server,
api_model_name=args.api_model_name,
workers=args.num_workers,
)
else:
sut = sut_cls(
model_path=args.model_path,
dtype=args.dtype,
batch_size=args.batch_size,
dataset_path=args.dataset_path,
total_sample_count=args.total_sample_count,
device=args.device,
workers=args.num_workers,
)
# Start sut before loadgen starts
sut.start()
lgSUT = lg.ConstructSUT(sut.issue_queries, sut.flush_queries)
log.info("Starting Benchmark run")
lg.StartTestWithLogSettings(
lgSUT,
sut.qsl,
settings,
log_settings,
args.audit_conf)
# Stop sut after completion
sut.stop()
log.info("Run Completed!")
log.info("Destroying SUT...")
lg.DestroySUT(lgSUT)
log.info("Destroying QSL...")
lg.DestroyQSL(sut.qsl)
if __name__ == "__main__":
main()
# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import time
import argparse
import pandas as pd
import numpy as np
from dataclasses import dataclass
from functools import partial
from pathlib import Path
from transformers import LlamaTokenizerFast
from typing import Dict
__doc__ = """
This script takes the open_orca GPT4 dataset parquet and performs the following preprocessing and filtering steps:
1. filter out all queries with non-ascii characters, except for normal unicode quotes and hyphens.
2. filter out all queries with out-of-bound input/output sequence lengths
3. filter out all queries with expected answers shorter than 2 words (known to cause issues for Llama2)
4. filter out all queries with prompts that generate bad output texts using Llama2 models
5. sample equally from the sub-datasets (i.e. COT, NIV, FLAN, T0) and form the final dataset.
"""
llama_prompt_system = "<s>[INST] <<SYS>>\n{}\n<</SYS>>\n\n{} [/INST]"
llama_prompt_no_system = "<s>[INST] {} [/INST]"
def format_llama_input(row):
if row["system_prompt"]:
return llama_prompt_system.format(
row["system_prompt"], row["question"])
else:
return llama_prompt_no_system.format(row["question"])
def is_english(s):
for c in s:
allowed = c.isascii()
allowed = allowed or (
c in ["’", "–", "“", "”", "—"]
) # Taken from Habana: Unicode quotes and hyphens
if not allowed:
return False
return True
def _tokenize_helper(x, llama_tokenizer=None, append_response_init_token=True):
if not isinstance(x, str):
return []
tokens = llama_tokenizer(x)["input_ids"]
if append_response_init_token:
# Workaround to enable cheat checking for first token: Llama always outputs token 29871 first
# It is possible for submitters to just immediately output this token
# to achieve a very fast TTFT.
tokens.append(29871)
return tokens
@dataclass
class Keyphrase:
col: str
phrase: str
startswith: bool = False
case: bool = False
class OpenOrcaDatasetGenerator:
def __init__(
self,
pq_path: os.PathLike,
model_dir: os.PathLike,
io_token_limit: int,
calibration_subset_size: int = 1000,
):
self.pq_path = Path(pq_path)
self.model_dir = Path(model_dir)
self.io_token_limit = io_token_limit
self.keyphrases = []
self.calibration_subset_size = calibration_subset_size
def load_parquet(self) -> pd.DataFrame:
llama_tokenizer = LlamaTokenizerFast.from_pretrained(self.model_dir)
tik = time.time()
df = pd.read_parquet(self.pq_path)
print(f"Tokenizing input")
df.rename(columns={"response": "output"}, inplace=True)
df["input"] = df.apply(format_llama_input, axis=1)
input_tokenizer = partial(
_tokenize_helper,
llama_tokenizer=llama_tokenizer)
output_tokenizer = partial(
_tokenize_helper,
llama_tokenizer=llama_tokenizer,
append_response_init_token=False,
)
df["tok_input"] = df["input"].apply(input_tokenizer)
df["tok_output"] = df["output"].apply(output_tokenizer)
tok = time.time()
print(f"Loaded parquet and tokenized in {tok-tik} sec.")
return df
def filter_english(self, df: pd.DataFrame) -> pd.DataFrame:
df["input_english"] = df["input"].apply(is_english)
df["output_english"] = df["output"].apply(is_english)
df["all_english"] = df["input_english"] & df["output_english"]
# Filter based on english tokens
df = df[df["all_english"]].drop(
["input_english", "output_english", "all_english"], axis=1
)
return df.reset_index(drop=True)
def filter_seqlen_oob(self, df: pd.DataFrame) -> pd.DataFrame:
df["tok_input_length"] = df["tok_input"].apply(lambda x: len(x))
df["tok_output_length"] = df["tok_output"].apply(lambda x: len(x))
# Filter based on sequence length
df = df[df["tok_input_length"] < self.io_token_limit]
df = df[df["tok_output_length"] < self.io_token_limit]
return df.reset_index(drop=True)
def filter_short_expected_response(self, df: pd.DataFrame) -> pd.DataFrame:
# We have found that short expected responses (such as for yes/no and true/false questions, or multiple choice
# questions where the expected response is just the choice, i.e. (B)), disproportionately have lower Rouge1
# scores (< 0.02).
# Filter out 1 and 2 word expected responses. We've seen best results when this is filtered to >= 6, but it is
# hard to justify removing that many samples.
df = df[df["tok_output_length"] >= 3]
return df.reset_index(drop=True)
def filter_bad_prompts(
self, df: pd.DataFrame, only_niv_t0: bool = True
) -> pd.DataFrame:
# Some prompts underperform and cause very bad Rouge scores for a significant percentage of samples with these
# prompts. See Jupyter notebook for analysis.
# These generally only affect NIV and t0 and do not exist in flan or cot.
# Set 'only_niv_t0' to True to explicitly only remove these prompts
# from niv and t0 samples.
bad_prompts = [
"",
"You are an AI assistant that follows instruction extremely well. Help as much as you can.",
"You are an AI assistant. Provide a detailed answer so user don’t need to search outside to understand the answer.",
"You are an AI assistant. Provide a detailed answer so user don't need to search outside to understand the answer.",
"User will you give you a task with some instruction. Your job is follow the instructions as faithfully as you can. While answering think step-by-step and justify your answer.",
"Explain how you used the definition to come up with the answer.",
]
for prompt in bad_prompts:
criteria = df.system_prompt == prompt
if only_niv_t0:
criteria = criteria & (
(df.origin == "niv") | (
df.origin == "t0"))
df = df[~criteria]
return df.reset_index(drop=True)
def register_keyphrase(self, keyphrase: Keyphrase):
self.keyphrases.append(keyphrase)
def filter_keyphrases(self, df: pd.DataFrame) -> pd.DataFrame:
# Filter out registered keyphrases. This is unused for the final
# dataset as there are no registered keyphrases.
for kp in self.keyphrases:
if kp.startswith:
selector = df[kp.col].str.startswith(kp.phrase)
else:
selector = df[kp.col].str.contains(kp.phrase, case=kp.case)
df = df[~selector]
return df.reset_index(drop=True)
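# Example (hypothetical; 'gen' stands for an OpenOrcaDatasetGenerator instance):
#   gen.register_keyphrase(
#       Keyphrase(col="output", phrase="As an AI language model", startswith=True)
#   )
# would drop every sample whose expected output starts with that phrase.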
def set_origins(self, df: pd.DataFrame) -> pd.DataFrame:
def get_sample_origin(x): return x.split(".")[0]
df["origin"] = df["id"].apply(get_sample_origin)
return df
def _per_origin_split(self, df: pd.DataFrame) -> Dict[str, pd.DataFrame]:
print(f"Unique sample origin datasets: {df.origin.unique()}")
dfs_by_origin = dict(tuple(df.groupby("origin")))
for origin, sub_df in dfs_by_origin.items():
sub_df.reset_index(drop=True, inplace=True)  # in-place reset also updates the DataFrame stored in dfs_by_origin
return dfs_by_origin
def _get_sampling(self, df, N, rng_seed: int = 1337):
_N = min(df.shape[0], N)
if _N < N:
raise RuntimeError(f"Not enough samples. Requires {N - _N} more.")
return df.sample(n=_N, random_state=rng_seed)
def sample(
self, dfs_by_origin: Dict[str, pd.DataFrame], n_total, rng_seed: int = 1337
) -> pd.DataFrame:
nways = len(dfs_by_origin)
assert (
n_total % nways == 0
), f"Total number of samples ({n_total}) must be divisible by n_origins ({nways})"
split_size = n_total // nways
samplings = []
for origin, df in dfs_by_origin.items():
print(f"Sampling {split_size} from {origin}")
sample = self._get_sampling(df, split_size, rng_seed=rng_seed)
samplings.append(sample)
sampled_df = pd.concat(samplings)
sampled_df = sampled_df.reset_index(drop=True)
return sampled_df
def generate(
self,
export_dir: os.PathLike,
n_samples: int = 24576,
use_cached: bool = True,
calib_rng_seed: int = 12345,
):
export_dir = Path(export_dir)
if not export_dir.exists():
print(f"Creating {export_dir}")
export_dir.mkdir(parents=True)
if export_dir.is_file():
raise ValueError(
f"Cannot export to file {export_dir}. Must be a directory."
)
full_fpath = export_dir / "open_orca_gpt4_tokenized_llama.full.pkl"
if full_fpath.exists() and use_cached:
df = pd.read_pickle(full_fpath)
else:
df = self.load_parquet()
df = self.set_origins(df)
# Apply filters
df = self.filter_english(df)
df = self.filter_seqlen_oob(df)
df = self.filter_short_expected_response(df)
df = self.filter_bad_prompts(df)
df = self.filter_keyphrases(df)
df.to_pickle(full_fpath)
dfs_by_origin = self._per_origin_split(df)
# Export base files
for origin, sub_df in dfs_by_origin.items():
print(f"Subset '{origin}' has {sub_df.shape[0]} samples")
origin_fpath = export_dir / \
f"open_orca_gpt4_tokenized_llama.{origin}.pkl"
if not origin_fpath.exists() or not use_cached:
sub_df.to_pickle(origin_fpath)
# Strategy:
# After some analysis, we found that OpenOrca's dataset has a skewed "origin-dataset" distribution:
# cot and niv have significantly fewer samples (71K and 58K) compared to flan and t0 (375K and 278K).
# cot has a higher rouge score from a 100k sampling (of the whole dataset) than the rest, while niv has lower.
# Sample from each dataset equally.
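# For example, with the default n_samples=24576 and the four origins above
# (flan, t0, cot, niv), self.sample() draws 24576 // 4 = 6144 samples per origin.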
sampled_df = self.sample(dfs_by_origin, n_samples)
sampled_fpath = (
export_dir /
f"open_orca_gpt4_tokenized_llama.sampled_{n_samples}.pkl"
)
sampled_df.to_pickle(sampled_fpath)
# Calibration dataset
calib_ds = sampled_df.sample(
n=self.calibration_subset_size, random_state=calib_rng_seed
)
calib_ds = calib_ds.reset_index(drop=True)
calib_fpath = (
export_dir
/ f"open_orca_gpt4_tokenized_llama.calibration_{self.calibration_subset_size}.pkl"
)
calib_ds.to_pickle(calib_fpath)
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument(
"--dataset_pq_path",
type=str,
default="/raid/data/mlperf-llm/OpenOrca/1M-GPT4-Augmented.parquet",
help="the path to the open_orca GPT4 parquet.",
)
parser.add_argument(
"--model_dir", type=str, default="/raid/data/mlperf-llm/Llama-2-70b-chat-hf"
)
parser.add_argument(
"--seqlen_limit",
type=int,
default=1024,
help="Upper limit of the input/output sequence lengths",
)
parser.add_argument(
"--export_dir",
type=str,
default="/raid/data/mlperf-llm/OpenOrca/llama/filtered",
help="Path to the output pkl file.",
)
parser.add_argument(
"--num_total_samples",
type=int,
default=24576,
help="Number of samples to generate",
)
parser.add_argument(
"--calibration_subset_size",
type=int,
default=1000,
help="Number of samples for calibration subset",
)
return parser.parse_args()
if __name__ == "__main__":
args = parse_arguments()
ds_gen = OpenOrcaDatasetGenerator(
pq_path=args.dataset_pq_path,
model_dir=args.model_dir,
io_token_limit=args.seqlen_limit,
calibration_subset_size=args.calibration_subset_size,
)
ds_gen.generate(
export_dir=args.export_dir,
n_samples=args.num_total_samples,
)
# Sample command to run:
# python3 processorca.py
# --dataset_pq_path=/raid/data/mlperf-llm/OpenOrca/1M-GPT4-Augmented.parquet
# --model_dir=/raid/data/mlperf-llm/Llama-2-70b-chat-hf
# --seqlen_limit=1024
# --export_dir=/raid/data/mlperf-llm/OpenOrca/llama/filtered
# --num_total_samples=24576
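# Files produced in --export_dir by generate() above (per-origin names depend on the
# 'id' prefixes present in the parquet):
#   open_orca_gpt4_tokenized_llama.full.pkl              - all samples after filtering
#   open_orca_gpt4_tokenized_llama.<origin>.pkl          - per-origin subsets (e.g. flan, t0, cot, niv)
#   open_orca_gpt4_tokenized_llama.sampled_24576.pkl     - equally-sampled evaluation set
#   open_orca_gpt4_tokenized_llama.calibration_1000.pkl  - calibration subset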
CHECKPOINT_PATH="${CHECKPOINT_PATH:-meta-llama/Llama-2-70b-chat-hf}"
DATASET_PATH="${DATASET_PATH:-open-orca-val-set.pkl}"
mkdir -p "run_outputs"
python3 -u main.py --scenario Offline \
--model-path ${CHECKPOINT_PATH} \
--accuracy \
--mlperf-conf mlperf.conf \
--user-conf user.conf \
--total-sample-count 24576 \
--dataset-path ${DATASET_PATH} \
--output-log-dir offline_accuracy_loadgen_logs \
--dtype float32 \
--device cuda:0 2>&1 | tee offline_accuracy_log.log
python3 evaluate-accuracy.py --checkpoint-path ${CHECKPOINT_PATH} \
--mlperf-accuracy-file offline_accuracy_loadgen_logs/mlperf_log_accuracy.json \
--dataset-file ${DATASET_PATH} \
--dtype int32
python3 consolidate_results.py --dataset-path ${DATASET_PATH} --model-dir ${CHECKPOINT_PATH}
CHECKPOINT_PATH="${CHECKPOINT_PATH:-meta-llama/Llama-2-70b-chat-hf}"
DATASET_PATH="${DATASET_PATH:-open-orca-val-set.pkl}"
python -u main.py --scenario Offline \
--model-path ${CHECKPOINT_PATH} \
--mlperf-conf mlperf.conf \
--user-conf user.conf \
--total-sample-count 24576 \
--dataset-path ${DATASET_PATH} \
--device cpu 2>&1 | tee offline_log.log
CHECKPOINT_PATH="${CHECKPOINT_PATH:-meta-llama/Llama-2-70b-chat-hf}"
DATASET_PATH="${DATASET_PATH:-open-orca-val-set.pkl}"
python -u main.py --scenario Server \
--model-path ${CHECKPOINT_PATH} \
--mlperf-conf mlperf.conf \
--user-conf user.conf \
--total-sample-count 24576 \
--dataset-path ${DATASET_PATH} \
--device cpu 2>&1 | tee server_log.log
# The format of this config file is 'key = value'.
# The key has the format 'model.scenario.key'. Value is mostly int64_t.
# Model maybe '*' as wildcard. In that case the value applies to all models.
# All times are in milli seconds
#
*.Offline.min_duration = 600000
*.Offline.min_query_count = 2000
*.Server.target_qps = 0.5
*.Server.min_duration = 120000
*.Server.min_query_count = 100
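# Example (hypothetical): a key with an explicit model name applies only to that model,
# unlike the '*' wildcard entries above, e.g.:
# llama2-70b.Server.target_qps = 1.0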
#!/usr/bin/env bash
# wkong: manually set the user info in env first
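# For example (values are illustrative):
#   export CI_BUILD_UID=$(id -u)  CI_BUILD_GID=$(id -g)
#   export CI_BUILD_USER=$(id -un) CI_BUILD_GROUP=$(id -gn)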
set -ex
if [ -z "$@" ]; then
COMMAND=(bash)
else
COMMAND=("$@")
fi
apt-get update && apt-get install -y sudo
getent group "${CI_BUILD_GID}" || addgroup --gid "${CI_BUILD_GID}" "${CI_BUILD_GROUP}"
getent passwd "${CI_BUILD_UID}" || adduser --gid "${CI_BUILD_GID}" --uid "${CI_BUILD_UID}" --gecos "${CI_BUILD_USER} (generated by with_the_same_user script)" --disabled-password --quiet "${CI_BUILD_USER}"
usermod -a -G dip "${CI_BUILD_USER}"
usermod -a -G sudo "${CI_BUILD_USER}"
usermod -a -G root "${CI_BUILD_USER}"
echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
mkdir -p /home/"${CI_BUILD_USER}"
touch /home/"${CI_BUILD_USER}"/.bashrc
echo 'export PATH="$PATH:/opt/miniconda3/bin"' >> /home/"${CI_BUILD_USER}"/.bashrc
sudo -H -u "#${CI_BUILD_UID}" --preserve-env \
PATH="${PATH}" \
LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" \
PYTHONPATH="${PYTHONPATH}" \
bash -c "conda init bash"
echo 'conda activate llama2-70b' >> /home/"${CI_BUILD_USER}"/.bashrc
sudo -H -u "#${CI_BUILD_UID}" --preserve-env \
PATH="${PATH}" \
LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" \
PYTHONPATH="${PYTHONPATH}" \
"${COMMAND[@]}"