Commit 3c15726c authored by yangzhong

git init
import argparse
from transformers import AutoTokenizer
import nltk
import evaluate
import numpy as np
import pandas as pd
import json
import re
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--checkpoint-path", required=True, help="Path to Llama2-70b-hf-chat checkpoint"
)
parser.add_argument(
"--mlperf-accuracy-file", required=True, help="path to mlperf_log_accuracy.json"
)
parser.add_argument(
"--dataset-file", required=True, help="path to processed validation dataset"
)
parser.add_argument(
"--n_workers",
default=2,
type=int,
help="Number of workers used for the MBXP evaluation",
)
parser.add_argument(
"--verbose",
action="store_true",
help="verbose messages")
parser.add_argument(
"--dtype",
default="int64",
help="dtype of the accuracy log",
choices=["int32", "int64", "float"],
)
args = parser.parse_args()
return args
def get_groundtruth(processed_dataset_file):
data = pd.read_pickle(processed_dataset_file)
return data
# Functions for evaluating GSM8K
def find_numbers(x: str) -> list[str]:
"""Finds all numbers in a string."""
# Search for numbers, possibly negative (hyphen), with thousands separators
# (commas), and with a decimal point (period between digits).
numbers = re.compile(
r"-?[\d,]*\.?\d+",
re.MULTILINE | re.DOTALL | re.IGNORECASE,
).findall(x)
return numbers
def find_number(x: str, answer_delimiter: str = "The answer is") -> str:
"""Finds the most relevant number in a string."""
# If model uses the answer delimiter, then select the first number following
# that format.
if answer_delimiter in x:
answer = x.split(answer_delimiter)[-1]
numbers = find_numbers(answer)
if numbers:
return numbers[0]
# In general, select the last number in the string.
numbers = find_numbers(x)
if numbers:
return numbers[-1]
return ""
def maybe_remove_comma(x: str) -> str:
# Example: 5,600 -> 5600
return x.replace(",", "")
def try_float(x: str):
try:
ret = float(x)
except BaseException:
ret = None
return ret
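# Illustrative note on how the helpers above combine when main() scores GSM8K:
# for a decoded prediction such as
#   "... 7000 + 7000 = 14,000. The answer is 14,000"
# find_number() returns "14,000" (the first number after the delimiter),
# maybe_remove_comma() turns it into "14000", and try_float() converts it to
# 14000.0 for comparison against the ground-truth value.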
# Functions for evaluating OpenOrca
def postprocess_text(preds, targets):
preds = [pred.strip() for pred in preds]
targets = [target.strip() for target in targets]
# rougeLSum expects newline after each sentence
preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
targets = ["\n".join(nltk.sent_tokenize(target)) for target in targets]
return preds, targets
# Functions for MBXP
def create_mbxp_dict(row, response):
lang, entry_point = row["id"].split("_", 1)
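# e.g. an id such as "typescript_minimum_Length" splits into lang "typescript" and entry_point "minimum_Length"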
return {
"lang": lang,
"prompt": row["input"],
"test_code": row["gt_output"],
"entry_point": entry_point,
"response": response,
}
def main():
args = get_args()
dataset_path = args.dataset_file
checkpoint_path = args.checkpoint_path
metric = evaluate.load("rouge")
nltk.download("punkt")
nltk.download("punkt_tab")
tokenizer = AutoTokenizer.from_pretrained(
checkpoint_path,
model_max_length=2048,
padding_side="left",
use_fast=False,
)
data = get_groundtruth(args.dataset_file)
query_types, gt_outputs = data["dataset"], data["gt_output"]
target_required_GSM8K = []
target_required_OpenOrca = []
results_MBXP = []
preds_token_GSM8K = []
preds_token_OpenOrca = []
preds_token_MBXP = []
eval_dtype = np.int64
if args.dtype == "int32":
eval_dtype = np.int32
elif args.dtype == "float":
eval_dtype = np.float32
with open(args.mlperf_accuracy_file, "r") as f:
results = json.load(f)
seen = set()
gen_tok_len = 0
gen_num = 0
for pred in results:
gen_num += 1
qsl_idx = pred["qsl_idx"]
if qsl_idx in seen:
continue
seen.add(qsl_idx)
query_type = query_types.iloc[qsl_idx]
if query_type == "GSM8K":
target = gt_outputs.iloc[qsl_idx]
target_required_GSM8K.append(target)
pred = np.frombuffer(bytes.fromhex(pred["data"]), eval_dtype)
gen_tok_len += len(pred)
preds_token_GSM8K.append(pred)
elif query_type == "OpenOrca":
target = gt_outputs.iloc[qsl_idx]
target_required_OpenOrca.append(target)
pred = np.frombuffer(bytes.fromhex(pred["data"]), eval_dtype)
gen_tok_len += len(pred)
preds_token_OpenOrca.append(pred)
else:
target = data.iloc[qsl_idx]
pred = np.frombuffer(bytes.fromhex(pred["data"]), eval_dtype)
pred_str = tokenizer.decode(pred, skip_special_tokens=True)
results_MBXP.append(create_mbxp_dict(target, pred_str))
gen_tok_len += len(pred)
# OpenOrca metric
preds_decoded_text = tokenizer.batch_decode(
preds_token_OpenOrca, skip_special_tokens=True
)
preds, targets = postprocess_text(
preds_decoded_text, target_required_OpenOrca)
if preds:
result = metric.compute(
predictions=preds,
references=targets,
use_stemmer=True,
use_aggregator=False,
)
result = {k: float(round(np.mean(v) * 100, 4))
for k, v in result.items()}
prediction_lens = [len(pred) for pred in preds]
else:
result = {}
prediction_lens = []
# GSM8K metric
preds_decoded_text = tokenizer.batch_decode(
preds_token_GSM8K, skip_special_tokens=True
)
pred_nums = [
maybe_remove_comma(find_number(pred_text.split("\nQ:")[0]))
for pred_text in preds_decoded_text
]
gsm8k_total = len(target_required_GSM8K)
correct = 0
for idx in range(len(target_required_GSM8K)):
ref = try_float(target_required_GSM8K[idx])
tgt = try_float(pred_nums[idx])
if tgt is None:
continue
correct += ref == tgt
result["gsm8k"] = 100.0 * correct / gsm8k_total
# MBXP metric
from evaluate_mbxp import evaluate_mbxp
if results_MBXP:
result["mbxp"] = evaluate_mbxp(results_MBXP, args.n_workers)
else:
result["mbxp"] = 0
result = {
**result,
"gen_len": int(np.sum(prediction_lens)),
"gen_num": gen_num,
"gen_tok_len": gen_tok_len,
"tokens_per_sample": round(gen_tok_len / gen_num, 1),
}
print("\nResults\n")
print(result)
if __name__ == "__main__":
main()
import argparse
import json
import multiprocessing
import pickle
import queue
import re
import timeit
import pandas as pd
from tqdm import tqdm
from mxeval.execution import check_correctness as check_correctness_python
from mxeval.execution import (
check_correctness_cpp,
check_correctness_csharp,
check_correctness_go,
check_correctness_java,
check_correctness_javascript,
check_correctness_kotlin,
check_correctness_perl,
check_correctness_php,
check_correctness_ruby,
check_correctness_scala,
check_correctness_swift,
check_correctness_typescript,
)
def postprocess_golang(code: str) -> str:
multi_line_imports = re.compile(
r"^import \(\n(.+)((?:\n.+)+)\n\)", re.MULTILINE)
line_imports = re.compile(r"^import \".*\"")
func_main = re.compile(r"^func main.*^}", re.MULTILINE | re.DOTALL)
code = code.replace("package main", "") # Remove package main
code = multi_line_imports.sub("", code)
code = line_imports.sub("", code)
code = func_main.sub("", code)
return code
def postprocess_scala(code: str) -> str:
code = code.replace("object Main extends App {", "")
code = "".join(code.splitlines(True)[:-1])
return code
def postprocess_python(code: str) -> str:
return code.lstrip()
def worker(inp_queue, out_queue):
while True:
try:
problem = inp_queue.get(timeout=5)
except queue.Empty:
break
key = f"{problem['lang']}_{problem['entry_point']}"
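# Dynamically select the language-specific mxeval checker imported above, e.g. check_correctness_python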
checker = eval(f"check_correctness_{problem['lang']}")
problem["task_id"] = key
problem["test"] = problem["test_code"]
solution = problem["response"]
try:
solution = solution[: solution.index("```")]
except ValueError:
# Happens when a code block isn't closed properly
pass
if problem["lang"] == "go":
solution = postprocess_golang(solution)
elif problem["lang"] == "python":
solution = postprocess_python(solution)
elif problem["lang"] == "scala":
solution = postprocess_scala(solution)
# Mixtral likes escaping underscores for some reason, so let's remove
# these
solution = solution.replace("\\_", "_")
# The evaluation script evaluates `code = prompt + solution + tests`
# But Mixtral regenerates the prompt in its output, so we should remove
# this
problem["prompt"] = ""
try:
result = checker(problem, solution, timeout=20.0)
out_queue.put(
(
key,
problem["lang"],
result["passed"],
result["result"],
problem["response"],
)
)
except Exception as e:
print(e)
out_queue.put(
(key, problem["lang"], False, "", problem["response"]))
def evaluate_mbxp(results, n_workers):
by_lang = {}
for problem in results:
by_lang.setdefault(problem["lang"], []).append(problem)
inp_queue = multiprocessing.Queue()
out_queue = multiprocessing.Queue()
n_problems = 0
for lang, problems in by_lang.items():
if lang not in ["cpp", "python", "php",
"javascript", "ruby", "typescript"]:
continue
n_problems += len(problems)
for problem in problems:
inp_queue.put(problem)
start = timeit.default_timer()
workers = []
for _ in range(n_workers):
w = multiprocessing.Process(target=worker, args=(inp_queue, out_queue))
w.start()
workers.append(w)
passes = {}
n_passed = 0
lang_passed = {}
lang_counts = {}
for i in tqdm(range(n_problems)):
key, lang, passed, result, response = out_queue.get()
passes[key] = {
"passed": passed,
"result": result,
"response": response}
n_passed += passed
lang_passed.setdefault(lang, 0)
lang_passed[lang] += passed
lang_counts.setdefault(lang, 0)
lang_counts[lang] += 1
end = timeit.default_timer()
print(f"Processed {n_problems} in {end - start}s")
print(f"{100 * n_passed / n_problems : .02f}% pass@1")
print(lang_passed, lang_counts)
with open("evaluated_test.json", "w") as f:
json.dump(passes, f, indent=2)
return 100 * n_passed / n_problems
#!/bin/bash
MLCOMMONS_REPO_PATH="$(dirname "$(dirname "$PWD")")"
# Add any volume mounts here with the following syntax
# /path/to/src:/path/to/dir/in/container
MOUNTS=(
$MLCOMMONS_REPO_PATH:$MLCOMMONS_REPO_PATH
)
# Set up docker environment file for current user
rm -f .docker_env
echo "CI_BUILD_USER=`id -u -n`" >> .docker_env
echo "CI_BUILD_UID=`id -u`" >> .docker_env
echo "CI_BUILD_GROUP=`id -g -n`" >> .docker_env
echo "CI_BUILD_GID=`id -g`" >> .docker_env
cat .docker_env
# Build container
docker build . -t llm/gpubringup
# Build mount flags
declare -a MOUNT_FLAGS
for _mount in ${MOUNTS[@]}; do
_split=($(echo $_mount | tr ':' '\n'));
MOUNT_FLAGS+=("--mount type=bind,source=${_split[0]},target=${_split[1]}");
done
set -x
nvidia-docker run -it --rm --net=host --runtime=nvidia --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \
--cap-add=SYS_PTRACE --cap-add=SYS_ADMIN --cap-add=DAC_READ_SEARCH \
--security-opt seccomp=unconfined \
-w $PWD \
--env-file `pwd`/.docker_env \
${MOUNT_FLAGS[*]} \
llm/gpubringup \
bash ./with_the_same_user
import subprocess
import mlperf_loadgen as lg
import argparse
import os
import logging
import sys
from SUT import SUT, SUTServer
sys.path.insert(0, os.getcwd())
logging.basicConfig(level=logging.INFO)
log = logging.getLogger("Mixtral-8x7B-Instruct-v0.1-MAIN")
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--scenario",
type=str,
choices=["Offline", "Server"],
default="Offline",
help="Scenario",
)
parser.add_argument(
"--model-path",
type=str,
default="mistralai/Mixtral-8x7B-Instruct-v0.1",
help="Model name",
)
parser.add_argument(
"--dataset-path",
type=str,
default=None,
help="path to processed validation dataset",
)
parser.add_argument(
"--accuracy",
action="store_true",
help="Run accuracy mode")
parser.add_argument(
"--dtype",
type=str,
default="float32",
help="data type of the model, choose from float16, bfloat16 and float32",
)
parser.add_argument(
"--device",
type=str,
choices=["cpu", "cuda:0"],
default="cpu",
help="device to use",
)
parser.add_argument(
"--audit-conf",
type=str,
default="audit.conf",
help="audit config for LoadGen settings during compliance runs",
)
parser.add_argument(
"--user-conf",
type=str,
default="user.conf",
help="user config for user LoadGen settings such as target QPS",
)
# TODO: This interpretation of 'total-sample-count' is a little
# misleading. Fix it
parser.add_argument(
"--total-sample-count",
type=int,
default=24576,
help="Number of samples to use in benchmark.",
)
parser.add_argument(
"--batch-size",
type=int,
default=1,
help="Model batch-size to use in benchmark.",
)
parser.add_argument(
"--output-log-dir", type=str, default="output-logs", help="Where logs are saved"
)
parser.add_argument(
"--enable-log-trace",
action="store_true",
help="Enable log tracing. This file can become quite large",
)
parser.add_argument(
"--num-workers",
type=int,
default=1,
help="Number of workers to process queries",
)
args = parser.parse_args()
return args
scenario_map = {
"offline": lg.TestScenario.Offline,
"server": lg.TestScenario.Server,
}
sut_map = {"offline": SUT, "server": SUTServer}
def main():
args = get_args()
settings = lg.TestSettings()
settings.scenario = scenario_map[args.scenario.lower()]
# mlperf_conf is automatically loaded by the loadgen
# settings.FromConfig(args.mlperf_conf, "mixtral-8x7b", args.scenario)
settings.FromConfig(args.user_conf, "mixtral-8x7b", args.scenario)
if args.accuracy:
settings.mode = lg.TestMode.AccuracyOnly
else:
settings.mode = lg.TestMode.PerformanceOnly
os.makedirs(args.output_log_dir, exist_ok=True)
log_output_settings = lg.LogOutputSettings()
log_output_settings.outdir = args.output_log_dir
log_output_settings.copy_summary_to_stdout = True
log_settings = lg.LogSettings()
log_settings.log_output = log_output_settings
log_settings.enable_trace = args.enable_log_trace
sut_cls = sut_map[args.scenario.lower()]
sut = sut_cls(
model_path=args.model_path,
dtype=args.dtype,
batch_size=args.batch_size,
dataset_path=args.dataset_path,
total_sample_count=args.total_sample_count,
device=args.device,
)
# Start sut before loadgen starts
sut.start()
lgSUT = lg.ConstructSUT(sut.issue_queries, sut.flush_queries)
log.info("Starting Benchmark run")
lg.StartTestWithLogSettings(
lgSUT,
sut.qsl,
settings,
log_settings,
args.audit_conf)
# Stop sut after completion
sut.stop()
log.info("Run Completed!")
log.info("Destroying SUT...")
lg.DestroySUT(lgSUT)
log.info("Destroying QSL...")
lg.DestroyQSL(sut.qsl)
if __name__ == "__main__":
main()
transformers==4.46.2
nltk==3.8.1
evaluate==0.4.0
absl-py==1.4.0
rouge-score==0.1.2
sentencepiece==0.2.0
accelerate==1.2.1
pybind11==2.10.4
CHECKPOINT_PATH="${CHECKPOINT_PATH:-mistralai/Mixtral-8x7B-Instruct-v0.1}"
DATASET_PATH="${DATASET_PATH:-dataset/2024_06_06_mixtral_15k_v4.pkl}"
mkdir -p "run_outputs"
python3 -u main.py --scenario Offline \
--model-path ${CHECKPOINT_PATH} \
--accuracy \
--user-conf user.conf \
--total-sample-count 15000 \
--dataset-path ${DATASET_PATH} \
--output-log-dir offline_accuracy_loadgen_logs \
--dtype float32 \
--device cuda:0 2>&1 | tee offline_accuracy_log.log
python3 evaluate-accuracy.py --checkpoint-path ${CHECKPOINT_PATH} \
--mlperf-accuracy-file offline_accuracy_loadgen_logs/mlperf_log_accuracy.json \
--dataset-file ${DATASET_PATH} \
--dtype int32
python3 consolidate_results.py --dataset-path ${DATASET_PATH} --model-dir ${CHECKPOINT_PATH}
CHECKPOINT_PATH="${CHECKPOINT_PATH:-mistralai/Mixtral-8x7B-Instruct-v0.1}"
DATASET_PATH="${DATASET_PATH:-dataset/2024_06_06_mixtral_15k_v4.pkl}"
python -u main.py --scenario Offline \
--model-path ${CHECKPOINT_PATH} \
--user-conf user.conf \
--total-sample-count 15000 \
--dataset-path ${DATASET_PATH} \
--device cpu 2>&1 | tee server_log.log
CHECKPOINT_PATH="${CHECKPOINT_PATH:-mistralai/Mixtral-8x7B-Instruct-v0.1}"
DATASET_PATH="${DATASET_PATH:-dataset/2024_06_06_mixtral_15k_v4.pkl}"
python -u main.py --scenario Server \
--model-path ${CHECKPOINT_PATH} \
--user-conf user.conf \
--total-sample-count 15000 \
--dataset-path ${DATASET_PATH} \
--device cpu 2>&1 | tee server_log.log
# Mixtral reference standalone inference script
The reference output and accuracy can be checked using the standalone Hugging Face inference script by following the instructions below:
```
cd language/mixtral-8x7b
docker build -t mlc-ngc .
nvidia-docker run -it --rm --net=host --runtime=nvidia --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --cap-add=SYS_PTRACE --cap-add=SYS_ADMIN --cap-add=DAC_READ_SEARCH --security-opt seccomp=unconfined -w $PWD -v $PWD:$PWD -t mlc-ngc
pip install -r requirements.txt
cd standalone_infer
# Make sure the checkpoint and the reference pickle file are already downloaded
python3 hf_eval_all.py --input_pkl=09292024_mixtral_15k_mintoken2_v1.pkl --checkpoint_path=/raid/data/mlperf-llm/Mixtral-8x7B-Instruct-v0.1 --output_pkl=mixtral_8x7b_15000_greedy_reference_fp16_mintoken2.pkl --batch_size=64
# Exit the container and enter the evaluation container
exit
docker build . -f Dockerfile.eval -t evaluation
docker run -it --rm --net=host --runtime=nvidia --ipc=host -v $PWD:$PWD -w $PWD evaluation
cd standalone_infer
python3 run_accuracy.py --results_path=mixtral_8x7b_15000_greedy_reference_fp16_mintoken2.pkl
```
Expected output:
```
EM: 0.7366, correct: 3683 / 5000, gen_token_per_sample: 129.9604
Evaluating OpenOrca score...
OpenOrca score: {'rouge1': np.float64(45.5989), 'rouge2': np.float64(23.3526), 'rougeL': np.float64(30.4608), 'rougeLsum': np.float64(42.5396)}, gen_token_per_sample: 205.8656
Evaluating MBXP score...
100%|| 5000/5000 [02:33<00:00, 32.50it/s]
Processed 5000 in 153.89411109898356s
60.16% pass@1
{'cpp': 381, 'typescript': 438, 'ruby': 419, 'python': 492, 'php': 809, 'javascript': 469} out of {'cpp': 743, 'typescript': 868, 'ruby': 846, 'python': 863, 'php': 846, 'javascript': 834}
gen_tokens_per_sample: 98.7026
```
#!/usr/bin/env python3
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from transformers import AutoTokenizer, AutoModelForCausalLM, LogitsProcessor, LogitsProcessorList
import torch
import pandas as pd
import time
from pathlib import Path
import argparse
def run_infer(df, ckpt_path, bs):
"""
dataset GSM8K
id train.548
question Gary manages two Amazon distribution centers. ...
input <s> [INST] As an expert problem solver solve s...
ref_output The first center processes 10000 packages per ...
gt_output 14000
tok_input [1, 1, 28705, 733, 16289, 28793, 1136, 396, 75...
tok_ref_output [415, 907, 4982, 9537, 28705, 28740, 28734, 28...
stop_sequence </s>
tok_stop_sequence [2]
tok_input_len 662
tok_ref_output_len 174
Name: 0, dtype: object
"""
device = "cuda" # the device to load the model onto
# Load the model from local if possible.
model_path = Path(ckpt_path)
if not model_path.exists():
raise RuntimeError(
f"{ckpt_path} not existed. Please download the checkpoint from mlcommon")
tokenizer = AutoTokenizer.from_pretrained(
model_path, padding_side="left", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_path, device_map="auto", trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
# gen parameter. We stop at 1024. Starting from v5.0, min_token is set to
# 2 to avoid 0-output issue
gen_kwargs = {
# "min_new_tokens": 1,
"min_new_tokens": 2,
"max_new_tokens": 1024,
"do_sample": False,
"temperature": None,
"top_p": None,
}
# Start inference
BS = bs
bidx = 0
model.eval()
input_tokens = []
input_tokens_lens = []
output_tokens = []
output_tokens_lens = []
output_texts = []
tic = time.time()
for idx in range(0, len(df), BS):
tac = time.time()
print(f"Processing {idx}/{len(df)}, time: {tac - tic}s")
sidx = idx
eidx = min(sidx + BS, len(df))
# We use batch_encode_plus for batch inference.
# Note 9/29/2024: Mixtral changed its tokenizer in Jun. Using the Feb
# 29 2024 version.
batch_texts = df['input'][sidx:eidx].tolist()
batch_ids = tokenizer.batch_encode_plus(
batch_texts, return_tensors="pt", padding=True)
# tok_input_length = batch_ids['attention_mask'].sum(
# axis=1).to(torch.int32).tolist()
# input_tokens_lens += tok_input_length
tok_input_id = batch_ids['input_ids'].to(torch.int32).tolist()
# Remove eos from the input id
tok_input_id = [[element for element in sublist if element !=
tokenizer.eos_token_id] for sublist in tok_input_id]
input_tokens += tok_input_id
tok_input_length = [len(seq) for seq in tok_input_id]
input_tokens_lens += tok_input_length
batch_ids = batch_ids.to(device)
_, length = batch_ids.input_ids.shape
outputs = model.generate(**batch_ids, num_return_sequences=1,
**gen_kwargs)
output_ids = outputs[:, length:].cpu().tolist()
output_tokens += output_ids
# Filter out EOS
id_filtered = [[num for num in sublist if num !=
tokenizer.eos_token_id] for sublist in output_ids]
output_id_len = [len(out) for out in id_filtered]
output_tokens_lens += output_id_len
# Detokenizer
output_msgs = tokenizer.batch_decode(
output_ids, skip_special_tokens=True)
output_texts += output_msgs
bidx += 1
# Assemble the output
output_df = df[:len(output_tokens)].copy()
output_df["infer_tok_input"] = input_tokens
output_df["infer_tok_input_length"] = input_tokens_lens
output_df["infer_ref_output"] = output_texts
output_df["infer_tok_ref_output"] = output_tokens
output_df["infer_tok_ref_output_length"] = output_tokens_lens
# output_df.to_pickle(f"mixtral_8x7b_all15k_{len(output_tokens)}_BS{BS}_greedy_reference_fp16_mintoken1.pkl")
return output_df
def trim_twos(df):
# Remove all trailing 2s except for 1
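# (token id 2 is the Mixtral EOS token "</s>"; see tok_stop_sequence in the run_infer docstring)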
def remove_trailing_twos(lst):
count = 0
for num in reversed(lst):
if num == 2:
count += 1
else:
break
return lst[:-count] if count > 0 else lst
df['infer_tok_ref_output'] = df['infer_tok_ref_output'].apply(
remove_trailing_twos)
df['trim_lengths'] = df['infer_tok_ref_output'].apply(len)
df['tok_ref_output'] = df['tok_ref_output'].apply(remove_trailing_twos)
df['tok_ref_output_len'] = df['tok_ref_output'].apply(len)
return df
def mbxp_stop(df):
stop_tokens = [13, 13940, 28832, 13]
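# These token ids decode to the MBXP stop sequence "\n```\n" (the tok_stop_sequence of MBXP sample rows)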
def modify_list(lst):
for i in range(len(lst) - len(stop_tokens) + 1):
if lst[i:i + len(stop_tokens)] == stop_tokens:
return lst[:i + len(stop_tokens)]
return lst
df.loc[df['dataset'] == 'MBXP', 'infer_tok_ref_output'] = df[df['dataset']
== 'MBXP']['infer_tok_ref_output'].apply(modify_list)
df['trim_lengths'] = df['infer_tok_ref_output'].apply(len)
return df
def fix_name(df):
df.drop(columns=['ref_output'], inplace=True)
df.drop(columns=['tok_ref_output'], inplace=True)
df.drop(columns=['tok_ref_output_len'], inplace=True)
df.drop(columns=['infer_tok_ref_output_length'], inplace=True)
df.drop(columns=['infer_tok_input'], inplace=True)
df.drop(columns=['infer_tok_input_length'], inplace=True)
df.rename(columns={'infer_ref_output': 'ref_output'}, inplace=True)
df.rename(columns={'infer_tok_ref_output': 'tok_ref_output'}, inplace=True)
df.rename(columns={'trim_lengths': 'tok_ref_output_len'}, inplace=True)
return df
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--input_pkl", type=str, default="09292024_mixtral_15k_mintoken2_v1.pkl",
help="The path to the input pkl file")
parser.add_argument("--output_pkl", type=str, default="mixtral_8x7b_15000_greedy_reference_fp16_mintoken2.pkl",
help="The path to the output pickle.")
parser.add_argument("--checkpoint_path", type=str, default="/raid/data/mlperf-llm/Mixtral-8x7B-Instruct-v0.1",
help="The path to the mixtral checkpoint")
parser.add_argument("--batch_size", type=int, default=64,
help="Batch size of the refernece inference")
args = parser.parse_args()
df = pd.read_pickle(args.input_pkl)
df = run_infer(df, args.checkpoint_path, args.batch_size)
df = trim_twos(df)
df = mbxp_stop(df)
df = fix_name(df)
df.to_pickle(args.output_pkl)
#!/usr/bin/env python3
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pandas as pd
import re
import numpy as np
import argparse
import evaluate
import nltk
from tqdm import tqdm
import timeit
import multiprocessing
import json
import pickle
import queue
from mxeval.execution import check_correctness as check_correctness_python
from mxeval.execution import (
check_correctness_cpp,
check_correctness_csharp,
check_correctness_go,
check_correctness_java,
check_correctness_javascript,
check_correctness_kotlin,
check_correctness_perl,
check_correctness_php,
check_correctness_ruby,
check_correctness_scala,
check_correctness_swift,
check_correctness_typescript,
)
nltk.download("punkt")
nltk.download("punkt_tab")
metric = evaluate.load("rouge")
def calculate_rouge_score(model_outputs, ref_outputs):
metric = evaluate.load("rouge")
m_preds = [pred.strip() for pred in model_outputs]
m_targets = [target.strip() for target in ref_outputs]
# rougeLSum expects newline after each sentence
m_preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in m_preds]
m_targets = ["\n".join(nltk.sent_tokenize(target)) for target in m_targets]
m_result = metric.compute(
predictions=m_preds, references=m_targets, use_stemmer=True, use_aggregator=False
)
m_rouge_result = {k: round(np.mean(v) * 100, 4)
for k, v in m_result.items()}
return m_rouge_result
def find_numbers(x: str) -> list[str]:
"""Finds all numbers in a string."""
# Search for numbers, possibly negative (hyphen), with thousands separators
# (commas), and with a decimal point (period between digits).
numbers = re.compile(
r'-?[\d,]*\.?\d+',
re.MULTILINE | re.DOTALL | re.IGNORECASE,
).findall(x)
return numbers
def find_number(x: str,
answer_delimiter: str = 'The answer is') -> str:
"""Finds the most relevant number in a string."""
# If model uses the answer delimiter, then select the first number following
# that format.
if answer_delimiter in x:
answer = x.split(answer_delimiter)[-1]
numbers = find_numbers(answer)
if numbers:
return numbers[0]
# In general, select the last number in the string.
numbers = find_numbers(x)
if numbers:
return numbers[-1]
return ''
def maybe_remove_comma(x: str) -> str:
# Example: 5,600 -> 5600
return x.replace(',', '')
def try_float(x: str):
try:
ret = float(x)
except BaseException:
ret = None
return ret
def postprocess_golang(code: str) -> str:
multi_line_imports = re.compile(
r"^import \(\n(.+)((?:\n.+)+)\n\)", re.MULTILINE)
line_imports = re.compile(r"^import \".*\"")
func_main = re.compile(r"^func main.*^}", re.MULTILINE | re.DOTALL)
code = code.replace("package main", "") # Remove package main
code = multi_line_imports.sub("", code)
code = line_imports.sub("", code)
code = func_main.sub("", code)
return code
def postprocess_scala(code: str) -> str:
code = code.replace("object Main extends App {", "")
code = "".join(code.splitlines(True)[:-1])
return code
def postprocess_python(code: str) -> str:
return code.lstrip()
def worker(inp_queue, out_queue):
while True:
try:
problem = inp_queue.get(timeout=5)
except queue.Empty:
break
key = f"{problem['lang']}_{problem['entry_point']}"
checker = eval(f"check_correctness_{problem['lang']}")
problem["task_id"] = key
problem["test"] = problem["test_code"]
solution = problem["response"]
try:
solution = solution[:solution.index("```")]
except ValueError:
# Happens when a code block isn't closed properly
pass
if problem["lang"] == "go":
solution = postprocess_golang(solution)
elif problem["lang"] == "python":
solution = postprocess_python(solution)
elif problem["lang"] == "scala":
solution = postprocess_scala(solution)
# Mixtral likes escaping underscores for some reason, so let's remove
# these
solution = solution.replace("\\_", "_")
# The evaluation script evaluates `code = prompt + solution + tests`
# But Mixtral regenerates the prompt in its output, so we should remove
# this
problem["prompt"] = ""
result = checker(problem, solution, timeout=20.0)
out_queue.put(
(key,
problem["lang"],
result["passed"],
result["result"],
problem["response"]))
def convert_pickle(df: pd.DataFrame, result_keys: dict):
problems = []
for _, row in df.iterrows():
lang, entry_point = row["id"].split("_", 1)
problems.append({
"lang": lang,
"prompt": row["input"],
"test_code": row["gt_output"],
"entry_point": entry_point,
"response": row[f"{result_keys['result']}"]
})
return problems
def evaluate_mbxp(n_works: int, df: pd.DataFrame, result_keys: dict):
print(f"Evaluating MBXP score...")
# Convert pickle file into dictionary
results = convert_pickle(df, result_keys)
by_lang = {}
for problem in results:
by_lang.setdefault(problem["lang"], []).append(problem)
inp_queue = multiprocessing.Queue()
out_queue = multiprocessing.Queue()
n_problems = 0
for lang, problems in by_lang.items():
if lang not in ["cpp", "python", "php",
"javascript", "ruby", "typescript"]:
raise RuntimeError(f"{lang} not in supported list.")
n_problems += len(problems)
for problem in problems:
inp_queue.put(problem)
start = timeit.default_timer()
workers = []
for _ in range(n_works):
w = multiprocessing.Process(target=worker, args=(inp_queue, out_queue))
w.start()
workers.append(w)
passes = {}
n_passed = 0
lang_passed = {}
lang_counts = {}
for i in tqdm(range(n_problems)):
key, lang, passed, result, response = out_queue.get()
passes[key] = {
"passed": passed,
"result": result,
"response": response}
n_passed += passed
lang_passed.setdefault(lang, 0)
lang_passed[lang] += passed
lang_counts.setdefault(lang, 0)
lang_counts[lang] += 1
end = timeit.default_timer()
print(f"Processed {n_problems} in {end - start}s")
print(f"{100 * n_passed / n_problems : .02f}% pass@1")
print(lang_passed, " out of ", lang_counts)
gen_token_len = df[result_keys['length']].tolist()
gen_token_per_sample = sum(gen_token_len) / len(gen_token_len)
print(f"gen_tokens_per_sample: {gen_token_per_sample}")
# with open("evaluated_test.json", "w") as f:
# json.dump(passes, f, indent=2)
return n_passed / n_problems
def evaluate_openorca(df: pd.DataFrame, result_keys: dict):
print(f"Evaluating OpenOrca score...")
gen_output = df[f"{result_keys['result']}"].tolist()
gt_output = df.gt_output.tolist()
score = calculate_rouge_score(gen_output, gt_output)
gen_token_len = df[result_keys['length']].tolist()
gen_token_per_sample = sum(gen_token_len) / len(gen_token_len)
print(
f"OpenOrca score: {score}, gen_token_per_sample: {gen_token_per_sample}")
return score
def evaluate_gsm8k(df: pd.DataFrame, result_keys: dict):
print(f"Evaluating GSM8K score...")
gen_output = df[f"{result_keys['result']}"].tolist()
gt_numbers = df.gt_output.tolist()
gen_nums = [maybe_remove_comma(find_number(msg.split("\nQ:")[0]))
for msg in gen_output]
correct = 0
total = len(gt_numbers)
for idx in range(len(gt_numbers)):
ref = try_float(gt_numbers[idx])
tgt = try_float(gen_nums[idx])
if tgt is None:
continue
correct += (ref == tgt)
em = correct / total
gen_token_len = df[result_keys['length']].tolist()
gen_token_per_sample = sum(gen_token_len) / len(gen_token_len)
print(
f"EM: {em}, correct: {correct} / {total}, gen_token_per_sample: {gen_token_per_sample}")
return em
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--n_workers",
type=int,
default=10,
help="The number of processes to use")
parser.add_argument("--results_path", type=str, default="mixtral_8x7b_15000_greedy_reference_fp16_mintoken2.pkl",
help="The path to the results file pickle file")
parser.add_argument("--result_key", type=str, default="ref_output",
help="ref output dict key")
parser.add_argument("--length_key", type=str, default="tok_ref_output_len",
help="ref output dict key")
args = parser.parse_args()
"""
Sample command:
python3 nv_accuracy.py --results_path=trtllm_fp16_mixtral_8x7b_all15k_15000_BS128_greedy_06102024.pkl --result_key=nv_tllm_ref_output --length_key=nv_tllm_tok_ref_output_length
"""
result_keys = {
"result": args.result_key,
"length": args.length_key
}
"""
dataset MBXP (OpenOrca/GSM8K)
id typescript_minimum_Length
question /**\n * Write a typescript function to minimiz...
input <s> [INST] Complete the following code. Be con...
ref_output \nconst minimumLength = (s: string): number =>...
gt_output \nimport * as assert from 'assert'\n\nlet actu...
tok_input [1, 1, 28705, 733, 16289, 28793, 21929, 272, 2...
tok_ref_output [13, 1978, 7968, 4645, 327, 325, 28713, 28747,...
stop_sequence \n```\n
tok_stop_sequence [13, 13940, 28832, 13]
tok_input_len 139
tok_ref_output_len 123
"""
df = pd.read_pickle(args.results_path)
df_gsm8k = df[df['dataset'] == "GSM8K"].copy()
evaluate_gsm8k(df_gsm8k, result_keys)
df_openorca = df[df['dataset'] == "OpenOrca"].copy()
evaluate_openorca(df_openorca, result_keys)
df_mbxp = df[df['dataset'] == "MBXP"].copy()
evaluate_mbxp(args.n_workers, df_mbxp, result_keys)
# The format of this config file is 'key = value'.
# The key has the format 'model.scenario.key'. Value is mostly int64_t.
# Model may be '*' as a wildcard. In that case the value applies to all models.
# All times are in milliseconds
#
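# For example (illustrative value), a line such as
#   mixtral-8x7b.Offline.target_qps = 1.0
# applies only to the mixtral-8x7b model in the Offline scenario, while
#   *.Offline.target_qps = 1.0
# would apply to every model.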
BasedOnStyle: Google
Standard: Cpp11
cmake_minimum_required(VERSION 3.12)
project(mlperf_loadgen)
# Read the version file
file(READ "${CMAKE_SOURCE_DIR}/VERSION.txt" VERSION_CONTENTS)
# Extract the major, minor, and patch versions from the VERSION file (assuming "MAJOR.MINOR.PATCH" format)
string(REGEX MATCH "^([0-9]+)\\.([0-9]+)\\.([0-9]+)" VERSION_MATCH ${VERSION_CONTENTS})
# Set the variables for the major, minor, and patch versions
set(mlperf_loadgen_VERSION_MAJOR "${CMAKE_MATCH_1}")
set(mlperf_loadgen_VERSION_MINOR "${CMAKE_MATCH_2}")
set(mlperf_loadgen_VERSION_PATCH "${CMAKE_MATCH_3}")
# Check if the version format was parsed correctly
if(NOT DEFINED mlperf_loadgen_VERSION_MAJOR OR NOT DEFINED mlperf_loadgen_VERSION_MINOR OR NOT DEFINED mlperf_loadgen_VERSION_PATCH)
message(FATAL_ERROR "Version format in VERSION.txt is incorrect. Expected format: MAJOR.MINOR.PATCH")
endif()
# Print out the version
message("mlperf_loadgen v${mlperf_loadgen_VERSION_MAJOR}.${mlperf_loadgen_VERSION_MINOR}.${mlperf_loadgen_VERSION_PATCH}")
# Set build options. NB: CXX_STANDARD is supported since CMake 3.1.
if (NOT MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -W -Wall")
endif()
# Extra build options can be specified by setting the MLPERF_LOADGEN_CXX_FLAGS variable
if (MLPERF_LOADGEN_CXX_FLAGS)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${MLPERF_LOADGEN_CXX_FLAGS}")
endif()
message(STATUS "Using C++ compiler flags: ${CMAKE_CXX_FLAGS}")
set(CMAKE_CXX_STANDARD "14")
message(STATUS "Using C++ standard: ${CMAKE_CXX_STANDARD}")
message(STATUS "Using static linker flags: ${CMAKE_STATIC_LINKER_FLAGS}")
message(STATUS "Using shared linker flags: ${CMAKE_SHARED_LINKER_FLAGS}")
# Output directory for libraries.
set(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR})
message(STATUS "Using output path: ${LIBRARY_OUTPUT_PATH}")
# Detect Python to use for generating source file with version info.
# NB: PythonInterp has been deprecated since CMake 3.12
# but it works with earlier versions of CMake.
find_package(PythonInterp)
message(STATUS "Using Python interpreter: ${PYTHON_EXECUTABLE}")
# Specify the source and destination files
set(CONF_FILE "mlperf.conf")
set(HEADER_FILE "mlperf_conf.h")
# Read the content of the configuration file
file(READ ${CONF_FILE} CONF_CONTENTS)
# Escape all double quotes and backslashes
string(REPLACE "\\" "\\\\" CONF_CONTENTS "${CONF_CONTENTS}")
string(REPLACE "\"" "\\\"" CONF_CONTENTS "${CONF_CONTENTS}")
# Handle new lines
string(REPLACE "\n" "\\n\"\n\"" CONF_CONTENTS "${CONF_CONTENTS}")
# Wrap the content in a C++ string declaration
set(FORMATTED_CONTENT "const char* mlperf_conf =\n\"${CONF_CONTENTS}\";\n")
# Write the formatted content to the header file
file(WRITE ${HEADER_FILE} "${FORMATTED_CONTENT}")
message(STATUS "Output config: ${CMAKE_BINARY_DIR}/mlperf_conf.h")
# Generate source file with version info.
execute_process(COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/version_generator.py ${CMAKE_BINARY_DIR}/version_generated.cc ${CMAKE_CURRENT_SOURCE_DIR})
# Add source files.
set(SOURCE
${CMAKE_CURRENT_SOURCE_DIR}/bindings/c_api.h
${CMAKE_CURRENT_SOURCE_DIR}/bindings/c_api.cc
${CMAKE_CURRENT_SOURCE_DIR}/early_stopping.cc
${CMAKE_CURRENT_SOURCE_DIR}/issue_query_controller.cc
${CMAKE_CURRENT_SOURCE_DIR}/loadgen.cc
${CMAKE_CURRENT_SOURCE_DIR}/logging.cc
${CMAKE_CURRENT_SOURCE_DIR}/logging.h
${CMAKE_CURRENT_SOURCE_DIR}/test_settings_internal.cc
${CMAKE_CURRENT_SOURCE_DIR}/test_settings_internal.h
${CMAKE_CURRENT_SOURCE_DIR}/utils.cc
${CMAKE_CURRENT_SOURCE_DIR}/utils.h
${CMAKE_CURRENT_SOURCE_DIR}/results.h
${CMAKE_CURRENT_SOURCE_DIR}/results.cc
${CMAKE_CURRENT_SOURCE_DIR}/version.cc
${CMAKE_CURRENT_SOURCE_DIR}/version.h
${CMAKE_CURRENT_SOURCE_DIR}/mlperf_conf.h
${CMAKE_CURRENT_SOURCE_DIR}/VERSION.txt
${CMAKE_BINARY_DIR}/version_generated.cc
)
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
add_library(mlperf_loadgen STATIC ${SOURCE})
target_link_libraries(mlperf_loadgen)
if(WIN32)
set (LIBS "")
else()
set (LIBS pthread)
endif()
add_executable(benchmark benchmark/repro.cpp)
target_link_libraries(benchmark PUBLIC mlperf_loadgen ${LIBS})
# Install library and headers.
install(TARGETS mlperf_loadgen
DESTINATION ${CMAKE_INSTALL_PREFIX}/lib)
install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/
DESTINATION ${CMAKE_INSTALL_PREFIX}/include FILES_MATCHING PATTERN "*.h")
include VERSION.txt
include mlperf.conf
# Overview {#mainpage}
## Introduction
* The LoadGen is a *reusable* module that *efficiently* and *fairly* measures
the performance of inference systems.
* It generates traffic for scenarios as formulated by a diverse set of experts
in the [MLCommons working group](https://mlcommons.org/).
* The scenarios emulate the workloads seen in mobile devices,
autonomous vehicles, robotics, and cloud-based setups.
* Although the LoadGen is not model or dataset aware, its strength is in its
reusability with logic that is.
## Integration Example and Flow
The following is a diagram of how the LoadGen can be integrated into an
inference system, resembling how some of the MLPerf reference models are
implemented.
<div style="display:flex; flex-flow:row wrap; justify-content: space-evenly;">
<img src="https://raw.githubusercontent.com/mlcommons/inference/master/loadgen/loadgen_integration_diagram.svg" width="500px" style="padding: 20px">
<ol style="padding: 20px">
<li>Benchmark knows the model, dataset, and preprocessing.</li>
<li>Benchmark hands dataset sample IDs to LoadGen.</li>
<li>LoadGen starts generating queries of sample IDs.</li>
<li>Benchmark creates requests to backend.</li>
<li>Result is post processed and forwarded to LoadGen.</li>
<li>LoadGen outputs logs for analysis.</li>
</ol>
</div>
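The same flow can be sketched with the Python bindings. This is a minimal, illustrative skeleton, not a real benchmark: the callbacks, sample counts, and empty responses are placeholders.

```python
import mlperf_loadgen as lg

def issue_queries(query_samples):
    # Steps 3-5: LoadGen hands us sample ids; run inference and report back.
    responses = []
    for qs in query_samples:
        # A real SUT would run the model on the sample at qs.index and point
        # LoadGen at the output buffer; here we send an empty response.
        responses.append(lg.QuerySampleResponse(qs.id, 0, 0))
    lg.QuerySamplesComplete(responses)

def flush_queries():
    pass

def load_samples(indices):      # QSL loads samples into memory (untimed)
    pass

def unload_samples(indices):
    pass

settings = lg.TestSettings()
settings.scenario = lg.TestScenario.Offline
settings.mode = lg.TestMode.PerformanceOnly

sut = lg.ConstructSUT(issue_queries, flush_queries)
qsl = lg.ConstructQSL(1024, 1024, load_samples, unload_samples)
lg.StartTest(sut, qsl, settings)   # Step 6: logs are written when the test ends
lg.DestroyQSL(qsl)
lg.DestroySUT(sut)
```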
## Useful Links
* [FAQ](README_FAQ.md)
* [LoadGen Build Instructions](README_BUILD.md)
* [LoadGen API](loadgen.h)
* [Test Settings](test_settings.h) -
A good description of available scenarios, modes, and knobs.
* [MLPerf Inference Code](https://github.com/mlcommons/inference) -
Includes source for the LoadGen and reference models that use the LoadGen.
* [MLPerf Inference Rules](https://github.com/mlcommons/inference_policies) -
Any mismatch with this is a bug in the LoadGen.
## Scope of the LoadGen's Responsibilities
### In Scope
* **Provide a reusable** C++ library with python bindings.
* **Implement** the traffic patterns of the MLPerf Inference scenarios and
modes.
* **Record** all traffic generated and received for later analysis and
verification.
* **Summarize** the results and whether performance constraints were met.
* **Target high-performance** systems with efficient multi-thread friendly
logging utilities.
* **Generate trust** via a shared, well-tested, and community-hardened
code base.
### Out of Scope
The LoadGen is:
* **NOT** aware of the ML model it is running against.
* **NOT** aware of the data formats of the model's inputs and outputs.
* **NOT** aware of how to score the accuracy of a model's outputs.
* **NOT** aware of MLPerf rules regarding scenario-specific constraints.
Limiting the scope of the LoadGen in this way keeps it reusable across
different models and datasets without modification. Using composition and
dependency injection, the user can define their own model, datasets, and
metrics.
Additionally, not hardcoding MLPerf-specific test constraints, like test
duration and performance targets, allows users to use the LoadGen unmodified
for custom testing and continuous integration purposes.
## Submission Considerations
### Upstream all local modifications
* As a rule, no local modifications to the LoadGen's C++ library are allowed
for submission.
* Please upstream early and often to keep the playing field level.
### Choose your TestSettings carefully!
* Since the LoadGen is oblivious to the model, it can't enforce the MLPerf
submission requirements, e.g. target percentiles and latencies.
* For verification, the values in TestSettings are logged.
* To help make sure your settings are spec compliant, use
TestSettings::FromConfig in conjunction with the relevant config file provided
with the reference models.
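For instance, the Mixtral reference harness in this repository loads its settings along these lines (a sketch; the config file name and benchmark key mirror that harness):

```python
import mlperf_loadgen as lg

settings = lg.TestSettings()
settings.scenario = lg.TestScenario.Offline
# Pull spec-related knobs from the benchmark's config file.
settings.FromConfig("user.conf", "mixtral-8x7b", "Offline")
settings.mode = lg.TestMode.PerformanceOnly
```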
## Responsibilities of a LoadGen User
### Implement the Interfaces
* Implement the SystemUnderTest and QuerySampleLibrary interfaces and pass
them to the StartTest function.
* Call QuerySamplesComplete for every sample received by
SystemUnderTest::IssueQuery.
### Assess Accuracy
* Process the *mlperf_log_accuracy.json* output by the LoadGen to determine
the accuracy of your system.
* For the official models, Python scripts will be provided by the MLPerf model
owners for you to do this automatically.
For templates of how to do the above in detail, refer to code for the demos,
tests, and reference models.
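As an illustration, the accuracy script in this repository parses *mlperf_log_accuracy.json* roughly as follows: each entry carries the sample's `qsl_idx` and the generated token ids as a hex-encoded buffer, and the dtype must match the accuracy log.

```python
import json
import numpy as np

with open("mlperf_log_accuracy.json") as f:
    results = json.load(f)

for pred in results:
    qsl_idx = pred["qsl_idx"]  # index of the sample in the dataset
    tokens = np.frombuffer(bytes.fromhex(pred["data"]), np.int64)
    # ...decode `tokens` with the model tokenizer and score against the ground truth
```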
## LoadGen over the Network
For reference, at a high level a submission looks like this:
<div align="center" style="display:flex; flex-flow:row wrap; justify-content: space-evenly;">
<img src="https://raw.githubusercontent.com/mlcommons/inference/master/loadgen/diagram_submission.png" width="300px" style="padding: 20px">
</div>
The LoadGen implementation is common to all submissions, while the QSL (“Query Sample Library”) and SUT (“System Under Test”) are implemented by submitters. QSL is responsible for loading the data and includes untimed preprocessing.
A submission over the network introduces a new component “QDL” (query dispatch library) that is added to the system as presented in the following diagram:
<div align="center" style="display:flex; flex-flow:row wrap; justify-content: space-evenly;">
<img src="https://raw.githubusercontent.com/mlcommons/inference/master/loadgen/diagram_network_submission.png" width="300px" style="padding: 20px">
</div>
QDL is a proxy for a load balancer that dispatches queries to the SUT over a physical network, receives the responses, and passes them back to LoadGen. It is implemented by the submitter. The interface of the QDL is the same as the API to the SUT.
In scenarios using QDL, data may be compressed in QSL at the choice of the submitter in order to reduce network transmission time. Decompression is part of the timed processing in SUT. A set of approved standard compression schemes will be specified for each benchmark; additional compression schemes must be approved in advance by the Working Group.
All communication between LoadGen/QSL and SUT is via QDL, and all communication between QDL and SUT must pass over a physical network.
QDL implements the protocol to transmit queries over the network and receive responses. It also implements decompression of any response returned by the SUT, where compression of responses is allowed. Performing any part of the timed preprocessing or inference in QDL is specifically disallowed. Currently no batching is allowed in QDL, although this may be revisited in future.
MLPerf over the Network runs in Server mode and Offline mode. All LoadGen modes are expected to work as-is with insignificant changes, including performance mode, accuracy mode, find-peak-performance mode, and compliance mode. The same applies to power measurements.
### QDL details
The Query Dispatch Library is implemented by the submitter and interfaces with LoadGen using the same SUT API. All MLPerf Inference SUTs implement the `mlperf::SystemUnderTest` class, which is defined in system_under_test.h. The QDL implements the `mlperf::QueryDispatchLibrary` class, which inherits from `mlperf::SystemUnderTest`, has the same API, and supports all existing `mlperf::SystemUnderTest` methods; it has a separate header file, query_dispatch_library.h. Passing the QDL to LoadGen's StartTest as an `mlperf::SystemUnderTest` is therefore a natural upcast of the `mlperf::QueryDispatchLibrary` class.
#### QDL Query issue and response over the network
The QDL gets the queries from the LoadGen through
```CPP
void IssueQuery(const std::vector<QuerySample>& samples)
```
The QDL dispatches the queries to the SUT over the physical medium. The exact method and implementation are submitter specific and are not specified by MLCommons. The submitter's implementation includes all methods required to serialize the query, load balance, drive it through the operating system and network interface card, and send it to the SUT.
The QDL receives the query responses over the network from the SUT. The exact method and implementation are submitter specific and are not specified by MLCommons. The submitter's implementation includes all methods required to receive the network data from the network interface card, pass it through the operating system, deserialize the query response, and provide it back to LoadGen through query completion by:
```CPP
struct QuerySampleResponse {
ResponseId id;
uintptr_t data;
size_t size;
};
void QuerySamplesComplete(QuerySampleResponse* responses,
size_t response_count);
```
#### QDL Additional Methods
In addition, the QDL needs to implement the following methods that the SUT interface provides to the LoadGen:
```CPP
const std::string& Name();
```
The `Name` function returns a known string for over-the-network SUTs, identifying the run as an over-the-network benchmark.
```CPP
void FlushQueries();
```
It is not specified here how the QDL would query and configure the SUT to execute the above methods. The QDL responds to the LoadGen after receiving its own response from the SUT.
### Example
Refer to [LON demo](demos/lon) for a reference example illustrating usage of Loadgen over the network.
# Building the LoadGen {#ReadmeBuild}
## Prerequisites
sudo apt-get install libglib2.0-dev python-pip python3-pip
pip2 install absl-py numpy
pip3 install absl-py numpy
## Quick Start
### Installation - Python
pip install absl-py numpy
git clone --recurse-submodules https://github.com/mlcommons/inference.git mlperf_inference
cd mlperf_inference/loadgen
CFLAGS="-std=c++14 -O3" python -m pip install .
This will fetch the LoadGen source, then build and install the LoadGen as a Python module.
Alternatively, we provide wheels for several Python versions and operating systems that can be installed directly with pip.
pip install mlperf-loadgen
**NOTE:** The published wheels are only updated after an official release, so they may not include the latest changes.
### Testing your Installation
The following command will run a simple end-to-end demo:
python mlperf_inference/loadgen/demos/py_demo_single_stream.py
A summary of the test results can be found in the *"mlperf_log_summary.txt"* logfile.
For a timeline visualization of what happened during the test, open the *"mlperf_log_trace.json"* file in Chrome:
* Type “chrome://tracing” in the address bar, then drag-n-drop the json.
* This may be useful for SUT performance tuning and understanding + debugging the loadgen.
### Installation - C++
To build the loadgen as a C++ library, rather than a python module:
git clone https://github.com/mlcommons/inference.git mlperf_inference
cd mlperf_inference
mkdir loadgen/build/ && cd loadgen/build/
cmake .. && cmake --build .
cp libmlperf_loadgen.a ..
## Quick start: Loadgen Over the Network
Refer to [LON demo](demos/lon/README.md) for a basic example.
# LoadGen FAQ {#ReadmeFAQ}
## Q: The LoadGen does not match the MLPerf specification. Who is right?
**A:**
The MLPerf spec is *always* right.
Please file a LoadGen bug so it may be resolved.
## Q: How can I file a bug?
**A:**
On GitHub: https://github.com/mlcommons/inference/issues/new
## Q: Can I make local modifications to the LoadGen for submission?
**A:**
No. To keep the playing field level, please upstream any local
modifications you need to make. Ideally upstream such changes behind a runtime
flag or via an abstract interface the client can implement. This will help
with testability.
## Q: Where can I find the results of a test?
**A:**
By default, the loadgen will output an *mlperf_log_summary.txt* file
that summarizes the target metrics and constraints of the test, along with
other stats about the run.
*Note:* LogSettings also has a flag to forward the results to stdout and
there's an outstanding TODO to make this more programmable.
## Q: The reference implementation for \<*some_model*\> prints out results of its own. Are those for submission?
**A:**
They are not. The LoadGen results are the ground truth for submission
results since they will work even for systems that forgo the python bindings.
If you notice a bug in the LoadGen's results, please file a bug or submit a
patch.
## Q: I'm getting linker errors for LoadgenVersion definitions. Where is *version_generated.cc*?
**A:**
If you have a custom build setup, make sure you run the *version_generator.py*
script, which will create the cc file you are looking for. The official build
files that come with the LoadGen do this for you out of the box.
## Q: What is this *version_generator.py* script?
**A:**
The LoadGen records git stats (if available) and the SHA1 of all its
source files (always) at build time for verification purposes. This is easy
to circumvent, but try your best to run *version_generator.py* correctly;
ideally integrated with your build system if you have a custom build.
The intention is more to help with debugging efforts and detect accidental
version mismatches than to detect bad actors.
## Q: How do I view the *mlperf_log_trace.json* file?
**A:**
This file uses the [Trace Event Format](https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/edit)
to record a timeline of all the threads involved.
You can view the file by typing [chrome://tracing](chrome://tracing) into
Chrome's address bar and dragging the json file there.
This file zips well and you can drag the zip file directly into
[chrome://tracing](chrome://tracing) too.
Please include zipped traces (and the other logs) when filing bug reports.
## Q: Why is the code littered with so many lambdas? My eyes hurt.
**A:**
Lambdas are a convenient and efficient way to ship arbitrary data + deferred
logic over to the logging thread without much boilerplate.
Much of the loadgen is built on top of the logging utilities.
Thus the lambdas. (Sorry about the eyes.)
## Q: What C++ version does the LoadGen target?
**A:**
It currently targets and requires C++14. It should compile with recent
versions of clang, gcc, and msvc.
## Q: What dependencies does the LoadGen code have?
**A:**
The C++ code has no external dependencies. The loadgen itself, logging
utilities, and unit test utilities are built solely on the C++ Standard Library.
The python bindings, however, do require
[pybind11](https://github.com/pybind/pybind11).
import sys
# Aliasing mlcommons_loadgen as mlperf_loadgen
sys.modules['mlperf_loadgen'] = sys.modules[__name__]