Commit 3c15726c authored by yangzhong

git init
import argparse
from transformers import AutoTokenizer
import nltk
import evaluate
import numpy as np
import pandas as pd
import json
import re
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--checkpoint-path", required=True, help="Path to Llama2-70b-hf-chat checkpoint"
)
parser.add_argument(
"--mlperf-accuracy-file", required=True, help="path to mlperf_log_accuracy.json"
)
parser.add_argument(
"--dataset-file", required=True, help="path to processed validation dataset"
)
parser.add_argument(
"--n_workers",
default=2,
type=int,
help="Number of workers used for the MBXP evaluation",
)
parser.add_argument(
"--verbose",
action="store_true",
help="verbose messages")
parser.add_argument(
"--dtype",
default="int64",
help="dtype of the accuracy log",
choices=["int32", "int64", "float"],
)
args = parser.parse_args()
return args
def get_groundtruth(processed_dataset_file):
data = pd.read_pickle(processed_dataset_file)
return data
# Functions for evaluating GSM8K
def find_numbers(x: str) -> list[str]:
"""Finds all numbers in a string."""
# Search for numbers, possibly negative (hyphen), with thousands separators
# (commas), and with a decimal point (period between digits).
numbers = re.compile(
r"-?[\d,]*\.?\d+",
re.MULTILINE | re.DOTALL | re.IGNORECASE,
).findall(x)
return numbers
def find_number(x: str, answer_delimiter: str = "The answer is") -> str:
"""Finds the most relevant number in a string."""
# If model uses the answer delimiter, then select the first number following
# that format.
if answer_delimiter in x:
answer = x.split(answer_delimiter)[-1]
numbers = find_numbers(answer)
if numbers:
return numbers[0]
# In general, select the last number in the string.
numbers = find_numbers(x)
if numbers:
return numbers[-1]
return ""
def maybe_remove_comma(x: str) -> str:
# Example: 5,600 -> 5600
return x.replace(",", "")
def try_float(x: str):
try:
ret = float(x)
except BaseException:
ret = None
return ret
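# Illustrative note on how the helpers above combine when main() scores GSM8K:
# for a decoded prediction such as
#   "... 7000 + 7000 = 14,000. The answer is 14,000"
# find_number() returns "14,000" (the first number after the delimiter),
# maybe_remove_comma() turns it into "14000", and try_float() converts it to
# 14000.0 for comparison against the ground-truth value.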
# Functions for evaluating OpenOrca
def postprocess_text(preds, targets):
preds = [pred.strip() for pred in preds]
targets = [target.strip() for target in targets]
# rougeLSum expects newline after each sentence
preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
targets = ["\n".join(nltk.sent_tokenize(target)) for target in targets]
return preds, targets
# Functions for MBXP
def create_mbxp_dict(row, response):
lang, entry_point = row["id"].split("_", 1)
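# e.g. an id such as "typescript_minimum_Length" splits into lang "typescript" and entry_point "minimum_Length"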
return {
"lang": lang,
"prompt": row["input"],
"test_code": row["gt_output"],
"entry_point": entry_point,
"response": response,
}
def main():
args = get_args()
dataset_path = args.dataset_file
checkpoint_path = args.checkpoint_path
metric = evaluate.load("rouge")
nltk.download("punkt")
nltk.download("punkt_tab")
tokenizer = AutoTokenizer.from_pretrained(
checkpoint_path,
model_max_length=2048,
padding_side="left",
use_fast=False,
)
data = get_groundtruth(args.dataset_file)
query_types, gt_outputs = data["dataset"], data["gt_output"]
target_required_GSM8K = []
target_required_OpenOrca = []
results_MBXP = []
preds_token_GSM8K = []
preds_token_OpenOrca = []
preds_token_MBXP = []
eval_dtype = np.int64
if args.dtype == "int32":
eval_dtype = np.int32
elif args.dtype == "float":
eval_dtype = np.float32
with open(args.mlperf_accuracy_file, "r") as f:
results = json.load(f)
seen = set()
gen_tok_len = 0
gen_num = 0
for pred in results:
gen_num += 1
qsl_idx = pred["qsl_idx"]
if qsl_idx in seen:
continue
seen.add(qsl_idx)
query_type = query_types.iloc[qsl_idx]
if query_type == "GSM8K":
target = gt_outputs.iloc[qsl_idx]
target_required_GSM8K.append(target)
pred = np.frombuffer(bytes.fromhex(pred["data"]), eval_dtype)
gen_tok_len += len(pred)
preds_token_GSM8K.append(pred)
elif query_type == "OpenOrca":
target = gt_outputs.iloc[qsl_idx]
target_required_OpenOrca.append(target)
pred = np.frombuffer(bytes.fromhex(pred["data"]), eval_dtype)
gen_tok_len += len(pred)
preds_token_OpenOrca.append(pred)
else:
target = data.iloc[qsl_idx]
pred = np.frombuffer(bytes.fromhex(pred["data"]), eval_dtype)
pred_str = tokenizer.decode(pred, skip_special_tokens=True)
results_MBXP.append(create_mbxp_dict(target, pred_str))
gen_tok_len += len(pred)
# OpenOrca metric
preds_decoded_text = tokenizer.batch_decode(
preds_token_OpenOrca, skip_special_tokens=True
)
preds, targets = postprocess_text(
preds_decoded_text, target_required_OpenOrca)
if preds:
result = metric.compute(
predictions=preds,
references=targets,
use_stemmer=True,
use_aggregator=False,
)
result = {k: float(round(np.mean(v) * 100, 4))
for k, v in result.items()}
prediction_lens = [len(pred) for pred in preds]
else:
result = {}
prediction_lens = []
# GSM8K metric
preds_decoded_text = tokenizer.batch_decode(
preds_token_GSM8K, skip_special_tokens=True
)
pred_nums = [
maybe_remove_comma(find_number(pred_text.split("\nQ:")[0]))
for pred_text in preds_decoded_text
]
gsm8k_total = len(target_required_GSM8K)
correct = 0
for idx in range(len(target_required_GSM8K)):
ref = try_float(target_required_GSM8K[idx])
tgt = try_float(pred_nums[idx])
if tgt is None:
continue
correct += ref == tgt
result["gsm8k"] = 100.0 * correct / gsm8k_total
# MBXP metric
from evaluate_mbxp import evaluate_mbxp
if results_MBXP:
result["mbxp"] = evaluate_mbxp(results_MBXP, args.n_workers)
else:
result["mbxp"] = 0
result = {
**result,
"gen_len": int(np.sum(prediction_lens)),
"gen_num": gen_num,
"gen_tok_len": gen_tok_len,
"tokens_per_sample": round(gen_tok_len / gen_num, 1),
}
print("\nResults\n")
print(result)
if __name__ == "__main__":
main()
import argparse
import json
import multiprocessing
import pickle
import queue
import re
import timeit
import pandas as pd
from tqdm import tqdm
from mxeval.execution import check_correctness as check_correctness_python
from mxeval.execution import (
check_correctness_cpp,
check_correctness_csharp,
check_correctness_go,
check_correctness_java,
check_correctness_javascript,
check_correctness_kotlin,
check_correctness_perl,
check_correctness_php,
check_correctness_ruby,
check_correctness_scala,
check_correctness_swift,
check_correctness_typescript,
)
def postprocess_golang(code: str) -> str:
multi_line_imports = re.compile(
r"^import \(\n(.+)((?:\n.+)+)\n\)", re.MULTILINE)
line_imports = re.compile(r"^import \".*\"")
func_main = re.compile(r"^func main.*^}", re.MULTILINE | re.DOTALL)
code = code.replace("package main", "") # Remove package main
code = multi_line_imports.sub("", code)
code = line_imports.sub("", code)
code = func_main.sub("", code)
return code
def postprocess_scala(code: str) -> str:
code = code.replace("object Main extends App {", "")
code = "".join(code.splitlines(True)[:-1])
return code
def postprocess_python(code: str) -> str:
return code.lstrip()
def worker(inp_queue, out_queue):
while True:
try:
problem = inp_queue.get(timeout=5)
except queue.Empty:
break
key = f"{problem['lang']}_{problem['entry_point']}"
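# Dynamically select the language-specific mxeval checker imported above, e.g. check_correctness_python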
checker = eval(f"check_correctness_{problem['lang']}")
problem["task_id"] = key
problem["test"] = problem["test_code"]
solution = problem["response"]
try:
solution = solution[: solution.index("```")]
except ValueError:
# Happens when a code block isn't closed properly
pass
if problem["lang"] == "go":
solution = postprocess_golang(solution)
elif problem["lang"] == "python":
solution = postprocess_python(solution)
elif problem["lang"] == "scala":
solution = postprocess_scala(solution)
# Mixtral likes escaping underscores for some reason, so let's remove
# these
solution = solution.replace("\\_", "_")
# The evaluation script evaluates `code = prompt + solution + tests`
# But Mixtral regenerates the prompt in its output, so we should remove
# this
problem["prompt"] = ""
try:
result = checker(problem, solution, timeout=20.0)
out_queue.put(
(
key,
problem["lang"],
result["passed"],
result["result"],
problem["response"],
)
)
except Exception as e:
print(e)
out_queue.put(
(key, problem["lang"], False, "", problem["response"]))
def evaluate_mbxp(results, n_workers):
by_lang = {}
for problem in results:
by_lang.setdefault(problem["lang"], []).append(problem)
inp_queue = multiprocessing.Queue()
out_queue = multiprocessing.Queue()
n_problems = 0
for lang, problems in by_lang.items():
if lang not in ["cpp", "python", "php",
"javascript", "ruby", "typescript"]:
continue
n_problems += len(problems)
for problem in problems:
inp_queue.put(problem)
start = timeit.default_timer()
workers = []
for _ in range(n_workers):
w = multiprocessing.Process(target=worker, args=(inp_queue, out_queue))
w.start()
workers.append(w)
passes = {}
n_passed = 0
lang_passed = {}
lang_counts = {}
for i in tqdm(range(n_problems)):
key, lang, passed, result, response = out_queue.get()
passes[key] = {
"passed": passed,
"result": result,
"response": response}
n_passed += passed
lang_passed.setdefault(lang, 0)
lang_passed[lang] += passed
lang_counts.setdefault(lang, 0)
lang_counts[lang] += 1
end = timeit.default_timer()
print(f"Processed {n_problems} in {end - start}s")
print(f"{100 * n_passed / n_problems : .02f}% pass@1")
print(lang_passed, lang_counts)
with open("evaluated_test.json", "w") as f:
json.dump(passes, f, indent=2)
return 100 * n_passed / n_problems
#!/bin/bash
MLCOMMONS_REPO_PATH="$(dirname "$(dirname "$PWD")")"
# Add any volume mounts here with the following syntax
# /path/to/src:/path/to/dir/in/container
MOUNTS=(
$MLCOMMONS_REPO_PATH:$MLCOMMONS_REPO_PATH
)
# Set up docker environment file for current user
rm -f .docker_env
echo "CI_BUILD_USER=`id -u -n`" >> .docker_env
echo "CI_BUILD_UID=`id -u`" >> .docker_env
echo "CI_BUILD_GROUP=`id -g -n`" >> .docker_env
echo "CI_BUILD_GID=`id -g`" >> .docker_env
cat .docker_env
# Build container
docker build . -t llm/gpubringup
# Build mount flags
declare -a MOUNT_FLAGS
for _mount in ${MOUNTS[@]}; do
_split=($(echo $_mount | tr ':' '\n'));
MOUNT_FLAGS+=("--mount type=bind,source=${_split[0]},target=${_split[1]}");
done
set -x
nvidia-docker run -it --rm --net=host --runtime=nvidia --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \
--cap-add=SYS_PTRACE --cap-add=SYS_ADMIN --cap-add=DAC_READ_SEARCH \
--security-opt seccomp=unconfined \
-w $PWD \
--env-file `pwd`/.docker_env \
${MOUNT_FLAGS[*]} \
llm/gpubringup \
bash ./with_the_same_user
import subprocess
import mlperf_loadgen as lg
import argparse
import os
import logging
import sys
from SUT import SUT, SUTServer
sys.path.insert(0, os.getcwd())
logging.basicConfig(level=logging.INFO)
log = logging.getLogger("Mixtral-8x7B-Instruct-v0.1-MAIN")
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--scenario",
type=str,
choices=["Offline", "Server"],
default="Offline",
help="Scenario",
)
parser.add_argument(
"--model-path",
type=str,
default="mistralai/Mixtral-8x7B-Instruct-v0.1",
help="Model name",
)
parser.add_argument(
"--dataset-path",
type=str,
default=None,
help="path to processed validation dataset",
)
parser.add_argument(
"--accuracy",
action="store_true",
help="Run accuracy mode")
parser.add_argument(
"--dtype",
type=str,
default="float32",
help="data type of the model, choose from float16, bfloat16 and float32",
)
parser.add_argument(
"--device",
type=str,
choices=["cpu", "cuda:0"],
default="cpu",
help="device to use",
)
parser.add_argument(
"--audit-conf",
type=str,
default="audit.conf",
help="audit config for LoadGen settings during compliance runs",
)
parser.add_argument(
"--user-conf",
type=str,
default="user.conf",
help="user config for user LoadGen settings such as target QPS",
)
# TODO: This interpretation of 'total-sample-count' is a little
# misleading. Fix it
parser.add_argument(
"--total-sample-count",
type=int,
default=24576,
help="Number of samples to use in benchmark.",
)
parser.add_argument(
"--batch-size",
type=int,
default=1,
help="Model batch-size to use in benchmark.",
)
parser.add_argument(
"--output-log-dir", type=str, default="output-logs", help="Where logs are saved"
)
parser.add_argument(
"--enable-log-trace",
action="store_true",
help="Enable log tracing. This file can become quite large",
)
parser.add_argument(
"--num-workers",
type=int,
default=1,
help="Number of workers to process queries",
)
args = parser.parse_args()
return args
scenario_map = {
"offline": lg.TestScenario.Offline,
"server": lg.TestScenario.Server,
}
sut_map = {"offline": SUT, "server": SUTServer}
def main():
args = get_args()
settings = lg.TestSettings()
settings.scenario = scenario_map[args.scenario.lower()]
# mlperf_conf is automatically loaded by the loadgen
# settings.FromConfig(args.mlperf_conf, "mixtral-8x7b", args.scenario)
settings.FromConfig(args.user_conf, "mixtral-8x7b", args.scenario)
if args.accuracy:
settings.mode = lg.TestMode.AccuracyOnly
else:
settings.mode = lg.TestMode.PerformanceOnly
os.makedirs(args.output_log_dir, exist_ok=True)
log_output_settings = lg.LogOutputSettings()
log_output_settings.outdir = args.output_log_dir
log_output_settings.copy_summary_to_stdout = True
log_settings = lg.LogSettings()
log_settings.log_output = log_output_settings
log_settings.enable_trace = args.enable_log_trace
sut_cls = sut_map[args.scenario.lower()]
sut = sut_cls(
model_path=args.model_path,
dtype=args.dtype,
batch_size=args.batch_size,
dataset_path=args.dataset_path,
total_sample_count=args.total_sample_count,
device=args.device,
)
# Start sut before loadgen starts
sut.start()
lgSUT = lg.ConstructSUT(sut.issue_queries, sut.flush_queries)
log.info("Starting Benchmark run")
lg.StartTestWithLogSettings(
lgSUT,
sut.qsl,
settings,
log_settings,
args.audit_conf)
# Stop sut after completion
sut.stop()
log.info("Run Completed!")
log.info("Destroying SUT...")
lg.DestroySUT(lgSUT)
log.info("Destroying QSL...")
lg.DestroyQSL(sut.qsl)
if __name__ == "__main__":
main()
transformers==4.46.2
nltk==3.8.1
evaluate==0.4.0
absl-py==1.4.0
rouge-score==0.1.2
sentencepiece==0.2.0
accelerate==1.2.1
pybind11==2.10.4
CHECKPOINT_PATH="${CHECKPOINT_PATH:-mistralai/Mixtral-8x7B-Instruct-v0.1}"
DATASET_PATH="${DATASET_PATH:-dataset/2024_06_06_mixtral_15k_v4.pkl}"
mkdir -p "run_outputs"
python3 -u main.py --scenario Offline \
--model-path ${CHECKPOINT_PATH} \
--accuracy \
--user-conf user.conf \
--total-sample-count 15000 \
--dataset-path ${DATASET_PATH} \
--output-log-dir offline_accuracy_loadgen_logs \
--dtype float32 \
--device cuda:0 2>&1 | tee offline_accuracy_log.log
python3 evaluate-accuracy.py --checkpoint-path ${CHECKPOINT_PATH} \
--mlperf-accuracy-file offline_accuracy_loadgen_logs/mlperf_log_accuracy.json \
--dataset-file ${DATASET_PATH} \
--dtype int32
python3 consolidate_results.py --dataset-path ${DATASET_PATH} --model-dir ${CHECKPOINT_PATH}
CHECKPOINT_PATH="${CHECKPOINT_PATH:-mistralai/Mixtral-8x7B-Instruct-v0.1}"
DATASET_PATH="${DATASET_PATH:-dataset/2024_06_06_mixtral_15k_v4.pkl}"
python -u main.py --scenario Offline \
--model-path ${CHECKPOINT_PATH} \
--user-conf user.conf \
--total-sample-count 15000 \
--dataset-path ${DATASET_PATH} \
--device cpu 2>&1 | tee server_log.log
CHECKPOINT_PATH="${CHECKPOINT_PATH:-mistralai/Mixtral-8x7B-Instruct-v0.1}"
DATASET_PATH="${DATASET_PATH:-dataset/2024_06_06_mixtral_15k_v4.pkl}"
python -u main.py --scenario Server \
--model-path ${CHECKPOINT_PATH} \
--user-conf user.conf \
--total-sample-count 15000 \
--dataset-path ${DATASET_PATH} \
--device cpu 2>&1 | tee server_log.log
# Mixtral reference standalone inference script
The reference output and accuracy can be checked using the standalone Hugging Face inference script by following the instructions below:
```
cd language/mixtral-8x7b
docker build -t mlc-ngc .
nvidia-docker run -it --rm --net=host --runtime=nvidia --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --cap-add=SYS_PTRACE --cap-add=SYS_ADMIN --cap-add=DAC_READ_SEARCH --security-opt seccomp=unconfined -w $PWD -v $PWD:$PWD -t mlc-ngc
pip install -r requirements.txt
cd standalone_infer
# Make sure the checkpoint and the reference pickle file are already downloaded
python3 hf_eval_all.py --input_pkl=09292024_mixtral_15k_mintoken2_v1.pkl --checkpoint_path=/raid/data/mlperf-llm/Mixtral-8x7B-Instruct-v0.1 --output_pkl=mixtral_8x7b_15000_greedy_reference_fp16_mintoken2.pkl --batch_size=64
# Exit the container and enter the evaluation container
exit
docker build . -f Dockerfile.eval -t evaluation
docker run -it --rm --net=host --runtime=nvidia --ipc=host -v $PWD:$PWD -w $PWD evaluation
cd standalone_infer
python3 run_accuracy.py --results_path=mixtral_8x7b_15000_greedy_reference_fp16_mintoken2.pkl
```
Expected output:
```
EM: 0.7366, correct: 3683 / 5000, gen_token_per_sample: 129.9604
Evaluating OpenOrca score...
OpenOrca score: {'rouge1': np.float64(45.5989), 'rouge2': np.float64(23.3526), 'rougeL': np.float64(30.4608), 'rougeLsum': np.float64(42.5396)}, gen_token_per_sample: 205.8656
Evaluating MBXP score...
100%|| 5000/5000 [02:33<00:00, 32.50it/s]
Processed 5000 in 153.89411109898356s
60.16% pass@1
{'cpp': 381, 'typescript': 438, 'ruby': 419, 'python': 492, 'php': 809, 'javascript': 469} out of {'cpp': 743, 'typescript': 868, 'ruby': 846, 'python': 863, 'php': 846, 'javascript': 834}
gen_tokens_per_sample: 98.7026
```
#!/usr/bin/env python3
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from transformers import AutoTokenizer, AutoModelForCausalLM, LogitsProcessor, LogitsProcessorList
import torch
import pandas as pd
import time
from pathlib import Path
import argparse
def run_infer(df, ckpt_path, bs):
"""
dataset GSM8K
id train.548
question Gary manages two Amazon distribution centers. ...
input <s> [INST] As an expert problem solver solve s...
ref_output The first center processes 10000 packages per ...
gt_output 14000
tok_input [1, 1, 28705, 733, 16289, 28793, 1136, 396, 75...
tok_ref_output [415, 907, 4982, 9537, 28705, 28740, 28734, 28...
stop_sequence </s>
tok_stop_sequence [2]
tok_input_len 662
tok_ref_output_len 174
Name: 0, dtype: object
"""
device = "cuda" # the device to load the model onto
# Load the model from local if possible.
model_path = Path(ckpt_path)
if not model_path.exists():
raise RuntimeError(
f"{ckpt_path} not existed. Please download the checkpoint from mlcommon")
tokenizer = AutoTokenizer.from_pretrained(
model_path, padding_side="left", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_path, device_map="auto", trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
# gen parameter. We stop at 1024. Starting from v5.0, min_token is set to
# 2 to avoid 0-output issue
gen_kwargs = {
# "min_new_tokens": 1,
"min_new_tokens": 2,
"max_new_tokens": 1024,
"do_sample": False,
"temperature": None,
"top_p": None,
}
# Start inference
BS = bs
bidx = 0
model.eval()
input_tokens = []
input_tokens_lens = []
output_tokens = []
output_tokens_lens = []
output_texts = []
tic = time.time()
for idx in range(0, len(df), BS):
tac = time.time()
print(f"Processing {idx}/{len(df)}, time: {tac - tic}s")
sidx = idx
eidx = min(sidx + BS, len(df))
# We use batch_encode_plus for batch inference.
# Note 9/29/2024: Mixtral changed its tokenizer in Jun. Using the Feb
# 29 2024 version.
batch_texts = df['input'][sidx:eidx].tolist()
batch_ids = tokenizer.batch_encode_plus(
batch_texts, return_tensors="pt", padding=True)
# tok_input_length = batch_ids['attention_mask'].sum(
# axis=1).to(torch.int32).tolist()
# input_tokens_lens += tok_input_length
tok_input_id = batch_ids['input_ids'].to(torch.int32).tolist()
# Remove eos from the input id
tok_input_id = [[element for element in sublist if element !=
tokenizer.eos_token_id] for sublist in tok_input_id]
input_tokens += tok_input_id
tok_input_length = [len(seq) for seq in tok_input_id]
input_tokens_lens += tok_input_length
batch_ids = batch_ids.to(device)
_, length = batch_ids.input_ids.shape
outputs = model.generate(**batch_ids, num_return_sequences=1,
**gen_kwargs)
output_ids = outputs[:, length:].cpu().tolist()
output_tokens += output_ids
# Filter out EOS
id_filtered = [[num for num in sublist if num !=
tokenizer.eos_token_id] for sublist in output_ids]
output_id_len = [len(out) for out in id_filtered]
output_tokens_lens += output_id_len
# Detokenizer
output_msgs = tokenizer.batch_decode(
output_ids, skip_special_tokens=True)
output_texts += output_msgs
bidx += 1
# Assemble the output
output_df = df[:len(output_tokens)].copy()
output_df["infer_tok_input"] = input_tokens
output_df["infer_tok_input_length"] = input_tokens_lens
output_df["infer_ref_output"] = output_texts
output_df["infer_tok_ref_output"] = output_tokens
output_df["infer_tok_ref_output_length"] = output_tokens_lens
# output_df.to_pickle(f"mixtral_8x7b_all15k_{len(output_tokens)}_BS{BS}_greedy_reference_fp16_mintoken1.pkl")
return output_df
def trim_twos(df):
# Remove all trailing 2s except for 1
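# (token id 2 is the Mixtral EOS token "</s>"; see tok_stop_sequence in the run_infer docstring)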
def remove_trailing_twos(lst):
count = 0
for num in reversed(lst):
if num == 2:
count += 1
else:
break
return lst[:-count] if count > 0 else lst
df['infer_tok_ref_output'] = df['infer_tok_ref_output'].apply(
remove_trailing_twos)
df['trim_lengths'] = df['infer_tok_ref_output'].apply(len)
df['tok_ref_output'] = df['tok_ref_output'].apply(remove_trailing_twos)
df['tok_ref_output_len'] = df['tok_ref_output'].apply(len)
return df
def mbxp_stop(df):
stop_tokens = [13, 13940, 28832, 13]
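# These token ids decode to the MBXP stop sequence "\n```\n" (the tok_stop_sequence of MBXP sample rows)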
def modify_list(lst):
for i in range(len(lst) - len(stop_tokens) + 1):
if lst[i:i + len(stop_tokens)] == stop_tokens:
return lst[:i + len(stop_tokens)]
return lst
df.loc[df['dataset'] == 'MBXP', 'infer_tok_ref_output'] = df[df['dataset']
== 'MBXP']['infer_tok_ref_output'].apply(modify_list)
df['trim_lengths'] = df['infer_tok_ref_output'].apply(len)
return df
def fix_name(df):
df.drop(columns=['ref_output'], inplace=True)
df.drop(columns=['tok_ref_output'], inplace=True)
df.drop(columns=['tok_ref_output_len'], inplace=True)
df.drop(columns=['infer_tok_ref_output_length'], inplace=True)
df.drop(columns=['infer_tok_input'], inplace=True)
df.drop(columns=['infer_tok_input_length'], inplace=True)
df.rename(columns={'infer_ref_output': 'ref_output'}, inplace=True)
df.rename(columns={'infer_tok_ref_output': 'tok_ref_output'}, inplace=True)
df.rename(columns={'trim_lengths': 'tok_ref_output_len'}, inplace=True)
return df
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--input_pkl", type=str, default="09292024_mixtral_15k_mintoken2_v1.pkl",
help="The path to the input pkl file")
parser.add_argument("--output_pkl", type=str, default="mixtral_8x7b_15000_greedy_reference_fp16_mintoken2.pkl",
help="The path to the output pickle.")
parser.add_argument("--checkpoint_path", type=str, default="/raid/data/mlperf-llm/Mixtral-8x7B-Instruct-v0.1",
help="The path to the mixtral checkpoint")
parser.add_argument("--batch_size", type=int, default=64,
help="Batch size of the refernece inference")
args = parser.parse_args()
df = pd.read_pickle(args.input_pkl)
df = run_infer(df, args.checkpoint_path, args.batch_size)
df = trim_twos(df)
df = mbxp_stop(df)
df = fix_name(df)
df.to_pickle(args.output_pkl)
#!/usr/bin/env python3
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pandas as pd
import re
import numpy as np
import argparse
import evaluate
import nltk
from tqdm import tqdm
import timeit
import multiprocessing
import json
import pickle
import queue
from mxeval.execution import check_correctness as check_correctness_python
from mxeval.execution import (
check_correctness_cpp,
check_correctness_csharp,
check_correctness_go,
check_correctness_java,
check_correctness_javascript,
check_correctness_kotlin,
check_correctness_perl,
check_correctness_php,
check_correctness_ruby,
check_correctness_scala,
check_correctness_swift,
check_correctness_typescript,
)
nltk.download("punkt")
nltk.download("punkt_tab")
metric = evaluate.load("rouge")
def calculate_rouge_score(model_outputs, ref_outputs):
metric = evaluate.load("rouge")
m_preds = [pred.strip() for pred in model_outputs]
m_targets = [target.strip() for target in ref_outputs]
# rougeLSum expects newline after each sentence
m_preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in m_preds]
m_targets = ["\n".join(nltk.sent_tokenize(target)) for target in m_targets]
m_result = metric.compute(
predictions=m_preds, references=m_targets, use_stemmer=True, use_aggregator=False
)
m_rouge_result = {k: round(np.mean(v) * 100, 4)
for k, v in m_result.items()}
return m_rouge_result
def find_numbers(x: str) -> list[str]:
"""Finds all numbers in a string."""
# Search for numbers, possibly negative (hyphen), with thousands separators
# (commas), and with a decimal point (period between digits).
numbers = re.compile(
r'-?[\d,]*\.?\d+',
re.MULTILINE | re.DOTALL | re.IGNORECASE,
).findall(x)
return numbers
def find_number(x: str,
answer_delimiter: str = 'The answer is') -> str:
"""Finds the most relevant number in a string."""
# If model uses the answer delimiter, then select the first number following
# that format.
if answer_delimiter in x:
answer = x.split(answer_delimiter)[-1]
numbers = find_numbers(answer)
if numbers:
return numbers[0]
# In general, select the last number in the string.
numbers = find_numbers(x)
if numbers:
return numbers[-1]
return ''
def maybe_remove_comma(x: str) -> str:
# Example: 5,600 -> 5600
return x.replace(',', '')
def try_float(x: str):
try:
ret = float(x)
except BaseException:
ret = None
return ret
def postprocess_golang(code: str) -> str:
multi_line_imports = re.compile(
r"^import \(\n(.+)((?:\n.+)+)\n\)", re.MULTILINE)
line_imports = re.compile(r"^import \".*\"")
func_main = re.compile(r"^func main.*^}", re.MULTILINE | re.DOTALL)
code = code.replace("package main", "") # Remove package main
code = multi_line_imports.sub("", code)
code = line_imports.sub("", code)
code = func_main.sub("", code)
return code
def postprocess_scala(code: str) -> str:
code = code.replace("object Main extends App {", "")
code = "".join(code.splitlines(True)[:-1])
return code
def postprocess_python(code: str) -> str:
return code.lstrip()
def worker(inp_queue, out_queue):
while True:
try:
problem = inp_queue.get(timeout=5)
except queue.Empty:
break
key = f"{problem['lang']}_{problem['entry_point']}"
checker = eval(f"check_correctness_{problem['lang']}")
problem["task_id"] = key
problem["test"] = problem["test_code"]
solution = problem["response"]
try:
solution = solution[:solution.index("```")]
except ValueError:
# Happens when a code block isn't closed properly
pass
if problem["lang"] == "go":
solution = postprocess_golang(solution)
elif problem["lang"] == "python":
solution = postprocess_python(solution)
elif problem["lang"] == "scala":
solution = postprocess_scala(solution)
# Mixtral likes escaping underscores for some reason, so let's remove
# these
solution = solution.replace("\\_", "_")
# The evaluation script evaluates `code = prompt + solution + tests`
# But Mixtral regenerates the prompt in its output, so we should remove
# this
problem["prompt"] = ""
result = checker(problem, solution, timeout=20.0)
out_queue.put(
(key,
problem["lang"],
result["passed"],
result["result"],
problem["response"]))
def convert_pickle(df: pd.DataFrame, result_keys: dict):
problems = []
for _, row in df.iterrows():
lang, entry_point = row["id"].split("_", 1)
problems.append({
"lang": lang,
"prompt": row["input"],
"test_code": row["gt_output"],
"entry_point": entry_point,
"response": row[f"{result_keys['result']}"]
})
return problems
def evaluate_mbxp(n_works: int, df: pd.DataFrame, result_keys: dict):
print(f"Evaluating MBXP score...")
# Convert pickle file into dictionary
results = convert_pickle(df, result_keys)
by_lang = {}
for problem in results:
by_lang.setdefault(problem["lang"], []).append(problem)
inp_queue = multiprocessing.Queue()
out_queue = multiprocessing.Queue()
n_problems = 0
for lang, problems in by_lang.items():
if lang not in ["cpp", "python", "php",
"javascript", "ruby", "typescript"]:
raise RuntimeError(f"{lang} not in supported list.")
n_problems += len(problems)
for problem in problems:
inp_queue.put(problem)
start = timeit.default_timer()
workers = []
for _ in range(n_works):
w = multiprocessing.Process(target=worker, args=(inp_queue, out_queue))
w.start()
workers.append(w)
passes = {}
n_passed = 0
lang_passed = {}
lang_counts = {}
for i in tqdm(range(n_problems)):
key, lang, passed, result, response = out_queue.get()
passes[key] = {
"passed": passed,
"result": result,
"response": response}
n_passed += passed
lang_passed.setdefault(lang, 0)
lang_passed[lang] += passed
lang_counts.setdefault(lang, 0)
lang_counts[lang] += 1
end = timeit.default_timer()
print(f"Processed {n_problems} in {end - start}s")
print(f"{100 * n_passed / n_problems : .02f}% pass@1")
print(lang_passed, " out of ", lang_counts)
gen_token_len = df[result_keys['length']].tolist()
gen_token_per_sample = sum(gen_token_len) / len(gen_token_len)
print(f"gen_tokens_per_sample: {gen_token_per_sample}")
# with open("evaluated_test.json", "w") as f:
# json.dump(passes, f, indent=2)
return n_passed / n_problems
def evaluate_openorca(df: pd.DataFrame, result_keys: dict):
print(f"Evaluating OpenOrca score...")
gen_output = df[f"{result_keys['result']}"].tolist()
gt_output = df.gt_output.tolist()
score = calculate_rouge_score(gen_output, gt_output)
gen_token_len = df[result_keys['length']].tolist()
gen_token_per_sample = sum(gen_token_len) / len(gen_token_len)
print(
f"OpenOrca score: {score}, gen_token_per_sample: {gen_token_per_sample}")
return score
def evaluate_gsm8k(df: pd.DataFrame, result_keys: dict):
print(f"Evaluating GSM8K score...")
gen_output = df[f"{result_keys['result']}"].tolist()
gt_numbers = df.gt_output.tolist()
gen_nums = [maybe_remove_comma(find_number(msg.split("\nQ:")[0]))
for msg in gen_output]
correct = 0
total = len(gt_numbers)
for idx in range(len(gt_numbers)):
ref = try_float(gt_numbers[idx])
tgt = try_float(gen_nums[idx])
if tgt is None:
continue
correct += (ref == tgt)
em = correct / total
gen_token_len = df[result_keys['length']].tolist()
gen_token_per_sample = sum(gen_token_len) / len(gen_token_len)
print(
f"EM: {em}, correct: {correct} / {total}, gen_token_per_sample: {gen_token_per_sample}")
return em
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--n_workers",
type=int,
default=10,
help="The number of processes to use")
parser.add_argument("--results_path", type=str, default="mixtral_8x7b_15000_greedy_reference_fp16_mintoken2.pkl",
help="The path to the results file pickle file")
parser.add_argument("--result_key", type=str, default="ref_output",
help="ref output dict key")
parser.add_argument("--length_key", type=str, default="tok_ref_output_len",
help="ref output dict key")
args = parser.parse_args()
"""
Sample command:
python3 nv_accuracy.py --results_path=trtllm_fp16_mixtral_8x7b_all15k_15000_BS128_greedy_06102024.pkl --result_key=nv_tllm_ref_output --length_key=nv_tllm_tok_ref_output_length
"""
result_keys = {
"result": args.result_key,
"length": args.length_key
}
"""
dataset MBXP (OpenOrca/GSM8K)
id typescript_minimum_Length
question /**\n * Write a typescript function to minimiz...
input <s> [INST] Complete the following code. Be con...
ref_output \nconst minimumLength = (s: string): number =>...
gt_output \nimport * as assert from 'assert'\n\nlet actu...
tok_input [1, 1, 28705, 733, 16289, 28793, 21929, 272, 2...
tok_ref_output [13, 1978, 7968, 4645, 327, 325, 28713, 28747,...
stop_sequence \n```\n
tok_stop_sequence [13, 13940, 28832, 13]
tok_input_len 139
tok_ref_output_len 123
"""
df = pd.read_pickle(args.results_path)
df_gsm8k = df[df['dataset'] == "GSM8K"].copy()
evaluate_gsm8k(df_gsm8k, result_keys)
df_openorca = df[df['dataset'] == "OpenOrca"].copy()
evaluate_openorca(df_openorca, result_keys)
df_mbxp = df[df['dataset'] == "MBXP"].copy()
evaluate_mbxp(args.n_workers, df_mbxp, result_keys)
# The format of this config file is 'key = value'.
# The key has the format 'model.scenario.key'. Value is mostly int64_t.
# Model may be '*' as a wildcard. In that case the value applies to all models.
# All times are in milliseconds
#
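# For example (illustrative value), a line such as
#   mixtral-8x7b.Offline.target_qps = 1.0
# applies only to the mixtral-8x7b model in the Offline scenario, while
#   *.Offline.target_qps = 1.0
# would apply to every model.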
BasedOnStyle: Google
Standard: Cpp11
cmake_minimum_required(VERSION 3.12)
project(mlperf_loadgen)
# Read the version file
file(READ "${CMAKE_SOURCE_DIR}/VERSION.txt" VERSION_CONTENTS)
# Extract the major, minor, and patch versions from the VERSION file (assuming "MAJOR.MINOR.PATCH" format)
string(REGEX MATCH "^([0-9]+)\\.([0-9]+)\\.([0-9]+)" VERSION_MATCH ${VERSION_CONTENTS})
# Set the variables for the major, minor, and patch versions
set(mlperf_loadgen_VERSION_MAJOR "${CMAKE_MATCH_1}")
set(mlperf_loadgen_VERSION_MINOR "${CMAKE_MATCH_2}")
set(mlperf_loadgen_VERSION_PATCH "${CMAKE_MATCH_3}")
# Check if the version format was parsed correctly
if(NOT DEFINED mlperf_loadgen_VERSION_MAJOR OR NOT DEFINED mlperf_loadgen_VERSION_MINOR OR NOT DEFINED mlperf_loadgen_VERSION_PATCH)
message(FATAL_ERROR "Version format in VERSION.txt is incorrect. Expected format: MAJOR.MINOR.PATCH")
endif()
# Print out the version
message("mlperf_loadgen v${mlperf_loadgen_VERSION_MAJOR}.${mlperf_loadgen_VERSION_MINOR}.${mlperf_loadgen_VERSION_PATCH}")
# Set build options. NB: CXX_STANDARD is supported since CMake 3.1.
if (NOT MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -W -Wall")
endif()
# Extra build options can be specified by setting the MLPERF_LOADGEN_CXX_FLAGS variable
if (MLPERF_LOADGEN_CXX_FLAGS)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${MLPERF_LOADGEN_CXX_FLAGS}")
endif()
message(STATUS "Using C++ compiler flags: ${CMAKE_CXX_FLAGS}")
set(CMAKE_CXX_STANDARD "14")
message(STATUS "Using C++ standard: ${CMAKE_CXX_STANDARD}")
message(STATUS "Using static linker flags: ${CMAKE_STATIC_LINKER_FLAGS}")
message(STATUS "Using shared linker flags: ${CMAKE_SHARED_LINKER_FLAGS}")
# Output directory for libraries.
set(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR})
message(STATUS "Using output path: ${LIBRARY_OUTPUT_PATH}")
# Detect Python to use for generating source file with version info.
# NB: PythonInterp has been deprecated since CMake 3.12
# but it works with earlier versions of CMake.
find_package(PythonInterp)
message(STATUS "Using Python interpreter: ${PYTHON_EXECUTABLE}")
# Specify the source and destination files
set(CONF_FILE "mlperf.conf")
set(HEADER_FILE "mlperf_conf.h")
# Read the content of the configuration file
file(READ ${CONF_FILE} CONF_CONTENTS)
# Escape all double quotes and backslashes
string(REPLACE "\\" "\\\\" CONF_CONTENTS "${CONF_CONTENTS}")
string(REPLACE "\"" "\\\"" CONF_CONTENTS "${CONF_CONTENTS}")
# Handle new lines
string(REPLACE "\n" "\\n\"\n\"" CONF_CONTENTS "${CONF_CONTENTS}")
# Wrap the content in a C++ string declaration
set(FORMATTED_CONTENT "const char* mlperf_conf =\n\"${CONF_CONTENTS}\";\n")
# Write the formatted content to the header file
file(WRITE ${HEADER_FILE} "${FORMATTED_CONTENT}")
message(STATUS "Output config: ${CMAKE_BINARY_DIR}/mlperf_conf.h")
# Generate source file with version info.
execute_process(COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/version_generator.py ${CMAKE_BINARY_DIR}/version_generated.cc ${CMAKE_CURRENT_SOURCE_DIR})
# Add source files.
set(SOURCE
${CMAKE_CURRENT_SOURCE_DIR}/bindings/c_api.h
${CMAKE_CURRENT_SOURCE_DIR}/bindings/c_api.cc
${CMAKE_CURRENT_SOURCE_DIR}/early_stopping.cc
${CMAKE_CURRENT_SOURCE_DIR}/issue_query_controller.cc
${CMAKE_CURRENT_SOURCE_DIR}/loadgen.cc
${CMAKE_CURRENT_SOURCE_DIR}/logging.cc
${CMAKE_CURRENT_SOURCE_DIR}/logging.h
${CMAKE_CURRENT_SOURCE_DIR}/test_settings_internal.cc
${CMAKE_CURRENT_SOURCE_DIR}/test_settings_internal.h
${CMAKE_CURRENT_SOURCE_DIR}/utils.cc
${CMAKE_CURRENT_SOURCE_DIR}/utils.h
${CMAKE_CURRENT_SOURCE_DIR}/results.h
${CMAKE_CURRENT_SOURCE_DIR}/results.cc
${CMAKE_CURRENT_SOURCE_DIR}/version.cc
${CMAKE_CURRENT_SOURCE_DIR}/version.h
${CMAKE_CURRENT_SOURCE_DIR}/mlperf_conf.h
${CMAKE_CURRENT_SOURCE_DIR}/VERSION.txt
${CMAKE_BINARY_DIR}/version_generated.cc
)
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
add_library(mlperf_loadgen STATIC ${SOURCE})
target_link_libraries(mlperf_loadgen)
if(WIN32)
set (LIBS "")
else()
set (LIBS pthread)
endif()
add_executable(benchmark benchmark/repro.cpp)
target_link_libraries(benchmark PUBLIC mlperf_loadgen ${LIBS})
# Install library and headers.
install(TARGETS mlperf_loadgen
DESTINATION ${CMAKE_INSTALL_PREFIX}/lib)
install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/
DESTINATION ${CMAKE_INSTALL_PREFIX}/include FILES_MATCHING PATTERN "*.h")
include VERSION.txt
include mlperf.conf
# Overview {#mainpage}
## Introduction
* The LoadGen is a *reusable* module that *efficiently* and *fairly* measures
the performance of inference systems.
* It generates traffic for scenarios as formulated by a diverse set of experts
in the [MLCommons working group](https://mlcommons.org/).
* The scenarios emulate the workloads seen in mobile devices,
autonomous vehicles, robotics, and cloud-based setups.
* Although the LoadGen is not model or dataset aware, its strength is in its
reusability with logic that is.
## Integration Example and Flow
The following is a diagram of how the LoadGen can be integrated into an
inference system, resembling how some of the MLPerf reference models are
implemented.
<div style="display:flex; flex-flow:row wrap; justify-content: space-evenly;">
<img src="https://raw.githubusercontent.com/mlcommons/inference/master/loadgen/loadgen_integration_diagram.svg" width="500px" style="padding: 20px">
<ol style="padding: 20px">
<li>Benchmark knows the model, dataset, and preprocessing.</li>
<li>Benchmark hands dataset sample IDs to LoadGen.</li>
<li>LoadGen starts generating queries of sample IDs.</li>
<li>Benchmark creates requests to backend.</li>
<li>Result is post processed and forwarded to LoadGen.</li>
<li>LoadGen outputs logs for analysis.</li>
</ol>
</div>
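The same flow can be sketched with the Python bindings. This is a minimal, illustrative skeleton, not a real benchmark: the callbacks, sample counts, and empty responses are placeholders.

```python
import mlperf_loadgen as lg

def issue_queries(query_samples):
    # Steps 3-5: LoadGen hands us sample ids; run inference and report back.
    responses = []
    for qs in query_samples:
        # A real SUT would run the model on the sample at qs.index and point
        # LoadGen at the output buffer; here we send an empty response.
        responses.append(lg.QuerySampleResponse(qs.id, 0, 0))
    lg.QuerySamplesComplete(responses)

def flush_queries():
    pass

def load_samples(indices):      # QSL loads samples into memory (untimed)
    pass

def unload_samples(indices):
    pass

settings = lg.TestSettings()
settings.scenario = lg.TestScenario.Offline
settings.mode = lg.TestMode.PerformanceOnly

sut = lg.ConstructSUT(issue_queries, flush_queries)
qsl = lg.ConstructQSL(1024, 1024, load_samples, unload_samples)
lg.StartTest(sut, qsl, settings)   # Step 6: logs are written when the test ends
lg.DestroyQSL(qsl)
lg.DestroySUT(sut)
```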
## Useful Links
* [FAQ](README_FAQ.md)
* [LoadGen Build Instructions](README_BUILD.md)
* [LoadGen API](loadgen.h)
* [Test Settings](test_settings.h) -
A good description of available scenarios, modes, and knobs.
* [MLPerf Inference Code](https://github.com/mlcommons/inference) -
Includes source for the LoadGen and reference models that use the LoadGen.
* [MLPerf Inference Rules](https://github.com/mlcommons/inference_policies) -
Any mismatch with this is a bug in the LoadGen.
## Scope of the LoadGen's Responsibilities
### In Scope
* **Provide a reusable** C++ library with python bindings.
* **Implement** the traffic patterns of the MLPerf Inference scenarios and
modes.
* **Record** all traffic generated and received for later analysis and
verification.
* **Summarize** the results and whether performance constraints were met.
* **Target high-performance** systems with efficient multi-thread friendly
logging utilities.
* **Generate trust** via a shared, well-tested, and community-hardened
code base.
### Out of Scope
The LoadGen is:
* **NOT** aware of the ML model it is running against.
* **NOT** aware of the data formats of the model's inputs and outputs.
* **NOT** aware of how to score the accuracy of a model's outputs.
* **NOT** aware of MLPerf rules regarding scenario-specific constraints.
Limiting the scope of the LoadGen in this way keeps it reusable across
different models and datasets without modification. Using composition and
dependency injection, the user can define their own model, datasets, and
metrics.
Additionally, not hardcoding MLPerf-specific test constraints, like test
duration and performance targets, allows users to use the LoadGen unmodified
for custom testing and continuous integration purposes.
## Submission Considerations
### Upstream all local modifications
* As a rule, no local modifications to the LoadGen's C++ library are allowed
for submission.
* Please upstream early and often to keep the playing field level.
### Choose your TestSettings carefully!
* Since the LoadGen is oblivious to the model, it can't enforce the MLPerf
submission requirements, e.g. target percentiles and latencies.
* For verification, the values in TestSettings are logged.
* To help make sure your settings are spec compliant, use
TestSettings::FromConfig in conjunction with the relevant config file provided
with the reference models.
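For instance, the Mixtral reference harness in this repository loads its settings along these lines (a sketch; the config file name and benchmark key mirror that harness):

```python
import mlperf_loadgen as lg

settings = lg.TestSettings()
settings.scenario = lg.TestScenario.Offline
# Pull spec-related knobs from the benchmark's config file.
settings.FromConfig("user.conf", "mixtral-8x7b", "Offline")
settings.mode = lg.TestMode.PerformanceOnly
```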
## Responsibilities of a LoadGen User
### Implement the Interfaces
* Implement the SystemUnderTest and QuerySampleLibrary interfaces and pass
them to the StartTest function.
* Call QuerySamplesComplete for every sample received by
SystemUnderTest::IssueQuery.
### Assess Accuracy
* Process the *mlperf_log_accuracy.json* output by the LoadGen to determine
the accuracy of your system.
* For the official models, Python scripts will be provided by the MLPerf model
owners for you to do this automatically.
For templates of how to do the above in detail, refer to code for the demos,
tests, and reference models.
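As an illustration, the accuracy script in this repository parses *mlperf_log_accuracy.json* roughly as follows: each entry carries the sample's `qsl_idx` and the generated token ids as a hex-encoded buffer, and the dtype must match the accuracy log.

```python
import json
import numpy as np

with open("mlperf_log_accuracy.json") as f:
    results = json.load(f)

for pred in results:
    qsl_idx = pred["qsl_idx"]  # index of the sample in the dataset
    tokens = np.frombuffer(bytes.fromhex(pred["data"]), np.int64)
    # ...decode `tokens` with the model tokenizer and score against the ground truth
```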
## LoadGen over the Network
For reference, at a high level a submission looks like this:
<div align="center" style="display:flex; flex-flow:row wrap; justify-content: space-evenly;">
<img src="https://raw.githubusercontent.com/mlcommons/inference/master/loadgen/diagram_submission.png" width="300px" style="padding: 20px">
</div>
The LoadGen implementation is common to all submissions, while the QSL (“Query Sample Library”) and SUT (“System Under Test”) are implemented by submitters. QSL is responsible for loading the data and includes untimed preprocessing.
A submission over the network introduces a new component “QDL” (query dispatch library) that is added to the system as presented in the following diagram:
<div align="center" style="display:flex; flex-flow:row wrap; justify-content: space-evenly;">
<img src="https://raw.githubusercontent.com/mlcommons/inference/master/loadgen/diagram_network_submission.png" width="300px" style="padding: 20px">
</div>
QDL is a proxy for a load balancer that dispatches queries to the SUT over a physical network, receives the responses, and passes them back to LoadGen. It is implemented by the submitter. The interface of the QDL is the same as the API to the SUT.
In scenarios using QDL, data may be compressed in QSL at the choice of the submitter in order to reduce network transmission time. Decompression is part of the timed processing in SUT. A set of approved standard compression schemes will be specified for each benchmark; additional compression schemes must be approved in advance by the Working Group.
All communication between LoadGen/QSL and SUT is via QDL, and all communication between QDL and SUT must pass over a physical network.
QDL implements the protocol to transmit queries over the network and receive responses. It also implements decompression of any response returned by the SUT, where compression of responses is allowed. Performing any part of the timed preprocessing or inference in QDL is specifically disallowed. Currently no batching is allowed in QDL, although this may be revisited in future.
MLPerf over the Network runs in Server mode and Offline mode. All LoadGen modes are expected to work as-is with insignificant changes, including performance mode, accuracy mode, find-peak-performance mode, and compliance mode. The same applies to power measurements.
### QDL details
The Query Dispatch Library is implemented by the submitter and interfaces with LoadGen using the same SUT API. All MLPerf Inference SUTs implement the `mlperf::SystemUnderTest` class, which is defined in system_under_test.h. The QDL implements the `mlperf::QueryDispatchLibrary` class, which inherits from `mlperf::SystemUnderTest`, has the same API, and supports all existing `mlperf::SystemUnderTest` methods; it has a separate header file, query_dispatch_library.h. Passing the QDL to LoadGen's StartTest as an `mlperf::SystemUnderTest` is therefore a natural upcast of the `mlperf::QueryDispatchLibrary` class.
#### QDL Query issue and response over the network
The QDL gets the queries from the LoadGen through
```CPP
void IssueQuery(const std::vector<QuerySample>& samples)
```
The QDL dispatches the queries to the SUT over the physical medium. The exact method and implementation are submitter specific and are not specified by MLCommons. The submitter's implementation includes all methods required to serialize the query, load balance, drive it through the operating system and network interface card, and send it to the SUT.
The QDL receives the query responses over the network from the SUT. The exact method and implementation are submitter specific and are not specified by MLCommons. The submitter's implementation includes all methods required to receive the network data from the network interface card, pass it through the operating system, deserialize the query response, and provide it back to LoadGen through query completion by:
```CPP
struct QuerySampleResponse {
ResponseId id;
uintptr_t data;
size_t size;
};
void QuerySamplesComplete(QuerySampleResponse* responses,
size_t response_count);
```
#### QDL Additional Methods
In addition, the QDL needs to implement the following methods that the SUT interface provides to the LoadGen:
```CPP
const std::string& Name();
```
The `Name` function returns a known string for over-the-network SUTs, identifying the run as an over-the-network benchmark.
```CPP
void FlushQueries();
```
It is not specified here how the QDL would query and configure the SUT to execute the above methods. The QDL responds to the LoadGen after receiving its own response from the SUT.
### Example
Refer to [LON demo](demos/lon) for a reference example illustrating usage of Loadgen over the network.
# Building the LoadGen {#ReadmeBuild}
## Prerequisites
sudo apt-get install libglib2.0-dev python-pip python3-pip
pip2 install absl-py numpy
pip3 install absl-py numpy
## Quick Start
### Installation - Python
pip install absl-py numpy
git clone --recurse-submodules https://github.com/mlcommons/inference.git mlperf_inference
cd mlperf_inference/loadgen
CFLAGS="-std=c++14 -O3" python -m pip install .
This will fetch the LoadGen source, then build and install the LoadGen as a Python module.
Alternatively, we provide wheels for several Python versions and operating systems that can be installed directly with pip.
pip install mlperf-loadgen
**NOTE:** The published wheels are only updated after an official release, so they may not include the latest changes.
### Testing your Installation
The following command will run a simple end-to-end demo:
python mlperf_inference/loadgen/demos/py_demo_single_stream.py
A summary of the test results can be found in the *"mlperf_log_summary.txt"* logfile.
For a timeline visualization of what happened during the test, open the *"mlperf_log_trace.json"* file in Chrome:
* Type “chrome://tracing” in the address bar, then drag-n-drop the json.
* This may be useful for SUT performance tuning and understanding + debugging the loadgen.
### Installation - C++
To build the loadgen as a C++ library, rather than a python module:
git clone https://github.com/mlcommons/inference.git mlperf_inference
cd mlperf_inference
mkdir loadgen/build/ && cd loadgen/build/
cmake .. && cmake --build .
cp libmlperf_loadgen.a ..
## Quick start: Loadgen Over the Network
Refer to [LON demo](demos/lon/README.md) for a basic example.
# LoadGen FAQ {#ReadmeFAQ}
## Q: The LoadGen does not match the MLPerf specification. Who is right?
**A:**
The MLPerf spec is *always* right.
Please file a LoadGen bug so it may be resolved.
## Q: How can I file a bug?
**A:**
On GitHub: https://github.com/mlcommons/inference/issues/new
## Q: Can I make local modifications to the LoadGen for submission?
**A:**
No. To keep the playing field level, please upstream any local
modifications you need to make. Ideally upstream such changes behind a runtime
flag or via an abstract interface the client can implement. This will help
with testability.
## Q: Where can I find the results of a test?
**A:**
By default, the loadgen will output an *mlperf_log_summary.txt* file
that summarizes the target metrics and constraints of the test, along with
other stats about the run.
*Note:* LogSettings also has a flag to forward the results to stdout and
there's an outstanding TODO to make this more programmable.
## Q: The reference implementation for \<*some_model*\> prints out results of its own. Are those for submission?
**A:**
They are not. The LoadGen results are the ground truth for submission
results since they will work even for systems that forgo the python bindings.
If you notice a bug in the LoadGen's results, please file a bug or submit a
patch.
## Q: I'm getting linker errors for LoadgenVersion definitions. Where is *version_generated.cc*?
**A:**
If you have a custom build setup, make sure you run the *version_generator.py*
script, which will create the cc file you are looking for. The official build
files that come with the LoadGen do this for you out of the box.
## Q: What is this *version_generator.py* script?
**A:**
The LoadGen records git stats (if available) and the SHA1 of all its
source files (always) at build time for verification purposes. This is easy
to circumvent, but try your best to run *version_generator.py* correctly;
ideally integrated with your build system if you have a custom build.
The intention is more to help with debugging efforts and detect accidental
version mismatches than to detect bad actors.
## Q: How do I view the *mlperf_log_trace.json* file?
**A:**
This file uses the [Trace Event Format](https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/edit)
to record a timeline of all the threads involved.
You can view the file by typing [chrome://tracing](chrome://tracing) into
Chrome's address bar and dragging the json file there.
This file zips well and you can drag the zip file directly into
[chrome://tracing](chrome://tracing) too.
Please include zipped traces (and the other logs) when filing bug reports.
## Q: Why is the code littered with so many lambdas? My eyes hurt.
**A:**
Lambdas are a convenient and efficient way to ship arbitrary data + deferred
logic over to the logging thread without much boilerplate.
Much of the loadgen is built on top of the logging utilities.
Thus the lambdas. (Sorry about the eyes.)
## Q: What C++ version does the LoadGen target?
**A:**
It currently targets and requires C++14. It should compile with recent
versions of clang, gcc, and msvc.
## Q: What dependencies does the LoadGen code have?
**A:**
The C++ code has no external dependencies. The loadgen itself, logging
utilities, and unit test utilities are built solely on the C++ Standard Library.
The python bindings, however, do require
[pybind11](https://github.com/pybind/pybind11).
import sys
# Aliasing mlcommons_loadgen as mlperf_loadgen
sys.modules['mlperf_loadgen'] = sys.modules[__name__]