import psutil
import torch


def get_gpu_memory_info():
    """Return the free memory (in GiB) on the current CUDA device, or None if CUDA is unavailable."""
    if torch.cuda.is_available():
        device = torch.cuda.current_device()
        total_memory = torch.cuda.get_device_properties(device).total_memory
        allocated_memory = torch.cuda.memory_allocated(device)
        free_memory = total_memory - allocated_memory
        return free_memory / (1024**3)
    return None


def get_cpu_memory_info():
    """Return the total system memory in GiB."""
    mem = psutil.virtual_memory()
    return mem.total / (1024**3)
from transformers import AutoTokenizer


def get_transformer_autotokenizer(model_name: str):
    return AutoTokenizer.from_pretrained(
        model_name,
        model_max_length=2048,
        padding_side="left",
        use_fast=False,
    )
# The format of this config file is 'key = value'.
# The key has the format 'model.scenario.key'. Value is mostly int64_t.
# The model may be '*' as a wildcard, in which case the value applies to all models.
# All times are in milliseconds.
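# Illustrative (hypothetical) entries in that format; consult the actual mlperf.conf for real keys and values:
#   *.Offline.min_duration = 600000
#   llama2-70b.Offline.target_qps = 1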
import json
import os
import io


def _make_r_io_base(f, mode: str):
    if not isinstance(f, io.IOBase):
        f = open(f, mode=mode)
    return f


def jload(f, mode="r"):
    """Load a .json file into a dictionary."""
    f = _make_r_io_base(f, mode)
    jdict = json.load(f)
    f.close()
    return jdict
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04
SHELL ["/bin/bash", "-c"]
ENV LC_ALL=C.UTF-8
ENV LANG=C.UTF-8
ENV TZ=US/Pacific
ENV DEBIAN_FRONTEND=noninteractive
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
RUN rm -rf /var/lib/apt/lists/* && rm /etc/apt/sources.list.d/* \
&& apt update \
&& apt install -y --no-install-recommends build-essential autoconf \
libtool git ccache curl wget pkg-config sudo ca-certificates \
automake libssl-dev bc python3-dev python3-pip google-perftools \
gdb libglib2.0-dev clang sshfs libre2-dev libboost-dev \
libnuma-dev numactl sysstat sshpass ntpdate less iputils-ping \
&& apt -y autoremove \
&& apt remove -y cmake \
&& apt install -y --no-install-recommends pkg-config zip g++ zlib1g-dev \
unzip libarchive-dev
RUN apt install -y --no-install-recommends rsync
# Install setuptools
RUN python3 -m pip install --upgrade pip \
&& python3 -m pip install --upgrade setuptools wheel virtualenv
# Install conda
WORKDIR /tmp
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-Linux-x86_64.sh \
&& bash Miniconda3-* -b -p /opt/miniconda3
ENV PATH="$PATH:/opt/miniconda3/bin"
RUN conda create -n llama2-70b python=3.10
RUN chmod -R 777 /opt/miniconda3
# Reference Implementation for llama2-70b
**Basic implementation for llama2-70b. A few noteworthy items:**
+ The streamer used to communicate with loadgen adds considerable overhead; it is only meant to provide a functional implementation.
+ For custom/optimized implementations of this benchmark it is important to include the following:
  - For the Server scenario, it is necessary to call `lg.FirstTokenComplete(response)` for each query. This way the first token will be reported and its latency will be measured.
  - For all scenarios, when calling `lg.QuerySamplesComplete(response)`, it is necessary that each element in `response` is a `lg.QuerySampleResponse` that contains the number of tokens (it can be created this way: `lg.QuerySampleResponse(qitem.id, bi[0], bi[1], n_tokens)`). The number of tokens reported should match the number of tokens in your answer; this is checked in [TEST06](../../compliance/nvidia/TEST06/). A sketch of both calls follows.
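As a rough sketch of these two calls (hedged; variable names like `qitem`, `first_tok`, and `output_toks` are illustrative, the authoritative code lives in `SUT.py`):
```
import array
import numpy as np
import mlperf_loadgen as lg


def report_first_token(qitem, first_tok):
    # Server scenario: report the first generated token so its latency is measured.
    data = array.array("B", np.array([first_tok], np.int32).tobytes())
    bi = data.buffer_info()
    lg.FirstTokenComplete([lg.QuerySampleResponse(qitem.id, bi[0], bi[1])])


def report_completion(qitem, output_toks):
    # All scenarios: report the full output together with the token count (checked by TEST06).
    data = array.array("B", np.array(output_toks, np.int32).tobytes())
    bi = data.buffer_info()
    lg.QuerySamplesComplete(
        [lg.QuerySampleResponse(qitem.id, bi[0], bi[1], len(output_toks))])
```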
Please see the [new docs site](https://docs.mlcommons.org/inference/benchmarks/language/llama2-70b) for an automated way to run this benchmark across different available implementations and do an end-to-end submission with or without docker.
## Prepare environment
Copy the mlperf.conf file to this folder.
```
cp ../../mlperf.conf .
```
For a CPU-only run:
```
conda create -n llama2-70b python=3.9
conda activate llama2-70b
# Install packages
conda install pybind11==2.10.4 -c conda-forge -y
python -m pip install torch==2.2.0.dev20231006+cpu --index-url https://download.pytorch.org/whl/nightly/cpu
pip install transformers==4.31.0 nltk==3.8.1 evaluate==0.4.0 absl-py==1.4.0 rouge-score==0.1.2 sentencepiece==0.1.99 accelerate==0.21.0
export CUR_DIR=${PWD}
cd <inference-repo-root>/loadgen
# Need to fetch Pablo's changes
git fetch origin pull/1523/head:llm-server
git merge llm-server
python -m pip install .
```
For a GPU-based run:
A dockerfile is provided, along with scripts to help launch it. First, add any docker volume mounts you want in
`launch.sh`. There is a section at the top of the file that looks like:
```
# Add any volume mounts here with the following syntax
# /path/to/src:/path/to/dir/in/container
MOUNTS=(
$MLCOMMONS_REPO_PATH:$MLCOMMONS_REPO_PATH
)
```
For example, if you have a raid space located at `/raid/data` on your local machine, you can add it to the same path in the container like so:
```
# Add any volume mounts here with the following syntax
# /path/to/src:/path/to/dir/in/container
MOUNTS=(
$MLCOMMONS_REPO_PATH:$MLCOMMONS_REPO_PATH
/raid/data:/raid/data
)
```
Once you have added all your mounts, launch the container with `bash launch.sh`.
Inside the container, set up the environment with `bash build.sh`. This will install all the dependencies from the
CPU-only setup, as well as any GPU versions for applicable libraries like PyTorch.
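For reference, the two steps in order (the first on the host from this directory, the second inside the container):
```
# On the host: build the image and start the container
bash launch.sh
# Inside the container: install the CPU dependencies plus GPU builds of libraries such as PyTorch
bash build.sh
```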
## Get Model
### MLCommons Members Download
MLCommons hosts the model and preprocessed dataset for download **exclusively by MLCommons Members**. You must first agree to the [confidentiality notice](https://llama2.mlcommons.org) using your organizational email address, then you will receive a link to a directory containing Rclone download instructions. _If you cannot access the form but you are part of an MLCommons Member organization, submit the [MLCommons subscription form](https://mlcommons.org/community/subscribe/) with your organizational email address and [associate a Google account](https://accounts.google.com/SignUpWithoutGmail) with your organizational email address._
### External Download
+ First go to [llama2-request-link](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and make a request, then sign in to HuggingFace (if you don't have an account, you'll need to create one). **Please note your authentication credentials**, as you may be required to provide them when cloning below.
+ Requires Git Large File Storage (git-lfs)
```
export CHECKPOINT_PATH=${PWD}/Llama-2-70b-chat-hf
git lfs install
git clone https://huggingface.co/meta-llama/Llama-2-70b-chat-hf ${CHECKPOINT_PATH}
```
## Get Dataset
### Preprocessed
You can use Rclone to download the preprocessed dataset from a Cloudflare R2 bucket.
To run Rclone on Windows, you can download the executable [here](https://rclone.org/install/#windows).
To install Rclone on Linux/macOS/BSD systems, run:
```
sudo -v ; curl https://rclone.org/install.sh | sudo bash
```
Once Rclone is installed, run the following command to authenticate with the bucket:
```
rclone config create mlc-inference s3 provider=Cloudflare access_key_id=f65ba5eef400db161ea49967de89f47b secret_access_key=fbea333914c292b854f14d3fe232bad6c5407bf0ab1bebf78833c2b359bdfd2b endpoint=https://c2686074cb2caf5cbaf6d134bdba8b47.r2.cloudflarestorage.com
```
You can then navigate in the terminal to your desired download directory and run the following command to download the dataset:
```
rclone copy mlc-inference:mlcommons-inference-wg-public/open_orca ./open_orca -P
```
### Unprocessed
You can also download and process the dataset yourself using the commands below:
```
# First get the `open-orca` parquet from huggingface
export OPENORCA_DATASET=${PWD}/open-orca
git clone https://huggingface.co/datasets/Open-Orca/OpenOrca ${OPENORCA_DATASET}
export OPENORCA_PARQUET=${OPENORCA_DATASET}/1M-GPT4-Augmented.parquet
EXPORT_DIR=${PWD}/processed-openorca
export DATASET_PATH=${PWD}/processed-data.pkl
# Process the dataset according to the Taskforce's agreed criteria
python3 processorca.py --dataset_pq_path=${OPENORCA_PARQUET} --model_dir=${CHECKPOINT_PATH} --seqlen_limit=1024 --export_dir=${EXPORT_DIR} --num_total_samples=24576
mv ${EXPORT_DIR}/open_orca_gpt4_tokenized_llama.sampled_24576.pkl ${DATASET_PATH}
```
The script will perform the following steps on the original open_orca GPT4 dataset (a rough sketch follows the list):
- filter out all queries with non-ASCII characters, except for normal unicode quotes and hyphens
- filter out all queries with out-of-bound input/output sequence lengths
- filter out all queries with expected answers shorter than 2 words (known to cause issues for Llama2)
- filter out all queries with prompts that generate bad output texts using Llama2 models
- sample equally from the sub-datasets (i.e. COT, NIV, FLAN, T0) to form the final dataset
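As a rough, hypothetical sketch of what these filters amount to (assuming a pandas DataFrame `df` with `input`, `output`, `tok_input`, `tok_output`, and `origin` columns; the authoritative logic lives in `processorca.py`):
```
import pandas as pd


def filter_sketch(df: pd.DataFrame, seqlen_limit: int = 1024, n_total: int = 24576) -> pd.DataFrame:
    # 1. Keep ASCII-only inputs/outputs (the real script also allows a few unicode quotes/hyphens).
    ascii_ok = lambda s: isinstance(s, str) and s.isascii()
    df = df[df["input"].apply(ascii_ok) & df["output"].apply(ascii_ok)]
    # 2. Drop out-of-bound input/output sequence lengths.
    df = df[(df["tok_input"].apply(len) < seqlen_limit) & (df["tok_output"].apply(len) < seqlen_limit)]
    # 3. Drop very short expected answers (the real script keeps >= 3 output tokens).
    df = df[df["tok_output"].apply(len) >= 3]
    # 4. Dropping known-bad system prompts is omitted here; see processorca.py for the list.
    # 5. Sample equally from each sub-dataset origin (cot, niv, flan, t0).
    per_origin = n_total // df["origin"].nunique()
    sampled = df.groupby("origin", group_keys=False).apply(
        lambda g: g.sample(n=min(len(g), per_origin), random_state=1337)
    )
    return sampled.reset_index(drop=True)
```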
## Run Performance Benchmarks
### Offline
```
python -u main.py --scenario Offline \
--model-path ${CHECKPOINT_PATH} \
--mlperf-conf mlperf.conf \
--user-conf user.conf \
--total-sample-count 24576 \
--device cpu \
--dataset-path ${DATASET_PATH} \
--output-log-dir offline-logs
```
For a GPU-based run:
```
python3 -u main.py --scenario Offline \
--model-path ${CHECKPOINT_PATH} \
--mlperf-conf mlperf.conf \
--user-conf user.conf \
--total-sample-count 24576 \
--dataset-path ${DATASET_PATH} \
--output-log-dir offline-logs \
--dtype float32 \
--device cuda:0 2>&1 | tee offline_performance_log.log
```
### Server
```
python -u main.py --scenario Server \
--model-path ${CHECKPOINT_PATH} \
--mlperf-conf mlperf.conf \
--user-conf user.conf \
--total-sample-count 24576 \
--device cpu \
--dataset-path ${DATASET_PATH} \
--output-log-dir server-logs
```
The ServerSUT was not tested for GPU runs.
## Run Accuracy Benchmarks
### Offline
```
OUTPUT_LOG_DIR=offline-accuracy-logs
mkdir -p "run_outputs" # The script will dump all the outputs to 'run_outputs'.
python -u main.py --scenario Offline \
--model-path ${CHECKPOINT_PATH} \
--accuracy \
--mlperf-conf mlperf.conf \
--user-conf user.conf \
--total-sample-count 24576 \
--dataset-path ${DATASET_PATH} \
--output-log-dir ${OUTPUT_LOG_DIR} \
--device cpu
ACCURACY_LOG_FILE=${OUTPUT_LOG_DIR}/mlperf_log_accuracy.json
if [ -e ${ACCURACY_LOG_FILE} ]; then
python evaluate-accuracy.py --checkpoint-path ${CHECKPOINT_PATH} \
--mlperf-accuracy-file ${ACCURACY_LOG_FILE} --dataset-file ${DATASET_PATH} --dtype int32
fi
# Optional: Create a pickled pandas DataFrame that is the original dataset with extra columns with output data from the
# accuracy run. The following columns will be added:
# - "gen_output_tok_id": A list of ints representing the tokenized output sequence.
# - "gen_output_text": A str representing the untokenized output sequence.
# - "gen_output_tok_len": An int representing the number of output tokens.
# - "rouge1": The rouge1 score for this sample
# - "rouge2": The rouge2 score for this sample
# - "rougeL": The rougeL score for this sample
# This file will by default be saved to 'full_output.pkl'. You can modify this with --output-pkl-path.
python consolidate_results.py --dataset-path ${DATASET_PATH} --model-dir ${CHECKPOINT_PATH}
```
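To sanity-check the consolidated file afterwards, a minimal (hypothetical) snippet might be:
```
import pandas as pd

# Load the consolidated pickle produced by consolidate_results.py (default path).
df = pd.read_pickle("full_output.pkl")
# Inspect the added per-sample columns.
print(df[["gen_output_tok_len", "rouge1", "rouge2", "rougeL"]].describe())
```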
For the GPU run, the above steps have been automated in `run_accuracy.sh`. You can also modify this script to use
`--device cpu` to adapt it to a CPU-only run.
### Server
```
OUTPUT_LOG_DIR=server-accuracy-logs
python -u main.py --scenario Server \
--model-path ${CHECKPOINT_PATH} \
--accuracy \
--mlperf-conf mlperf.conf \
--user-conf user.conf \
--total-sample-count 24576 \
--dataset-path ${DATASET_PATH} \
--output-log-dir ${OUTPUT_LOG_DIR} \
--device cpu
ACCURACY_LOG_FILE=${OUTPUT_LOG_DIR}/mlperf_log_accuracy.json
if [ -e ${ACCURACY_LOG_FILE} ]; then
python evaluate-accuracy.py --checkpoint-path ${CHECKPOINT_PATH} \
--mlperf-accuracy-file ${ACCURACY_LOG_FILE} --dataset-file ${DATASET_PATH} --dtype int32
fi
```
The ServerSUT was not tested for GPU runs.
## Accuracy Target
Running the GPU implementation in FP16 precision resulted in the following FP16 accuracy targets (normalized to a 0-100
scale from a 0.0-1.0 scale):
- Rouge1: 44.4312
- Rouge2: 22.0352
- RougeL: 28.6162
- Tokens per sample: 294.45
This was run on a DGX-H100 node. Total runtime was ~4.5 days.
# Run llama2-70b-interactive benchmark
For official Llama2-70b submissions it is also possible to submit in the interactive category. This sets stricter latency requirements for Time to First Token (TTFT) and Time Per Output Token (TPOT). Specifically, the interactive category requires loadgen to enforce `ttft <= 450ms` and `tpot <= 40ms`.
In order to run the interactive category, it is sufficient to set the flag `--lg-model-name` to `llama2-70b-interactive` when calling `main.py`. For example, to run the server scenario in interactive mode:
```
python -u main.py --scenario Server \
--model-path ${CHECKPOINT_PATH} \
--mlperf-conf mlperf.conf \
--user-conf user.conf \
--total-sample-count 24576 \
--device cpu \
--dataset-path ${DATASET_PATH} \
--output-log-dir server-logs \
--lg-model-name llama2-70b-interactive
```
import os
import time
import numpy as np
import array
import torch
from torch.nn.functional import pad
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM
from transformers.generation.streamers import BaseStreamer
import pickle
import time
import threading
import tqdm
import queue
import logging
from typing import TYPE_CHECKING, Optional, List
from pathlib import Path
import mlperf_loadgen as lg
from dataset import Dataset
logging.basicConfig(level=logging.INFO)
log = logging.getLogger("Llama-70B-SUT")
gen_kwargs = {
"early_stopping": True,
"max_new_tokens": 1024,
"min_new_tokens": 1,
"num_beams": 1,
"do_sample": False,
}
class FirstTokenStreamer(BaseStreamer):
"""Streams first tokens to a 'holder'"""
def __init__(
self, first_token, tokens_cache=[], is_first_token=True, response_ids=[]
):
"""Response ids added to 'sign' the first token"""
self.first_token = first_token # Queue for first token
self.is_first_token = is_first_token
# Cache for subsequent generated tokens
self.tokens_cache = tokens_cache
self.response_ids = response_ids
self.is_prompt = (
True # The first tokens sent to the streamer are actually the input prompts
)
def put(self, value):
"""Caches the tokens as they're generated. Assumes bs=1"""
# Prompts are streamed first so we need to skip the first time value
# that arrives
if self.is_prompt:
self.is_prompt = False
return
value = value.item()
if self.is_first_token:
# Add generated first token together with its query response_id to
# first tokens queue
self.first_token.put((value, self.response_ids[0]))
self.is_first_token = False
return
self.tokens_cache.append(value)
def end(self):
pass
def get_out_tokens(self):
return self.tokens_cache
class SUT:
def __init__(
self,
model_path=None,
dtype="bfloat16",
device="cpu",
batch_size=None,
total_sample_count=24576,
dataset_path=None,
use_cached_outputs=False,
# Set this to True *only for test accuracy runs* in case your prior
# session was killed partway through
workers=1,
):
self.model_path = model_path or "meta-llama/Llama-2-70b-chat-hf"
self.device = device
if not batch_size:
if device == "cpu":
batch_size = 1
else:
batch_size = 32 # Reduce to 8 if using 4 GPUs, 16 for 8.
self.batch_size = batch_size
# dtype
if dtype == "bfloat16":
self.amp_enabled = True
self.amp_dtype = torch.bfloat16
elif dtype == "float16":
self.amp_enabled = True
self.amp_dtype = torch.float16
else:
self.amp_enabled = False
self.amp_dtype = torch.float32
if "cuda" in self.device:
assert torch.cuda.is_available(), "torch gpu is not available, exiting..."
self.dataset_path = dataset_path
self.data_object = Dataset(
self.model_path,
dataset_path=self.dataset_path,
total_sample_count=total_sample_count,
device=self.device,
)
self.qsl = lg.ConstructQSL(
self.data_object.total_sample_count,
self.data_object.perf_count,
self.data_object.LoadSamplesToRam,
self.data_object.UnloadSamplesFromRam,
)
self.load_model()
self.num_workers = workers
self.worker_threads = [None] * self.num_workers
self.query_queue = queue.Queue()
self.use_cached_outputs = use_cached_outputs
self.sample_counter = 0
self.sample_counter_lock = threading.Lock()
def start(self):
# Create worker threads
for j in range(self.num_workers):
worker = threading.Thread(target=self.process_queries)
worker.start()
self.worker_threads[j] = worker
def stop(self):
for _ in range(self.num_workers):
self.query_queue.put(None)
for worker in self.worker_threads:
worker.join()
def process_queries(self):
"""Processor of the queued queries. User may choose to add batching logic"""
while True:
qitem = self.query_queue.get()
if qitem is None:
break
query_ids = [q.index for q in qitem]
fname = "q" + "_".join([str(i) for i in query_ids])
fname = f"run_outputs/{fname}.pkl"
_p = Path(fname)
if self.use_cached_outputs and _p.exists():
# Read cache
with _p.open(mode="rb") as f:
d = pickle.load(f)
processed_output = d["outputs"]
tik1 = None
tik2 = None
tik3 = None
tok = None
else:
# Construct / collate batch
max_seq_len = 1024
tik1 = time.time()
input_ids_tensor = []
input_masks_tensor = []
input_len = []
for q in qitem:
input_ids_tensor.append(
pad(
self.data_object.input_ids[q.index],
(
max_seq_len -
self.data_object.input_lens[q.index],
0,
0,
0,
),
value=self.tokenizer.pad_token_id,
)
)
input_masks_tensor.append(
pad(
self.data_object.attention_masks[q.index],
(
max_seq_len -
self.data_object.input_lens[q.index],
0,
0,
0,
),
value=0,
)
)
input_len.append(self.data_object.input_lens[q.index])
input_ids_tensor = torch.cat(input_ids_tensor)
input_masks_tensor = torch.cat(input_masks_tensor)
assert input_ids_tensor.shape == input_masks_tensor.shape
assert input_ids_tensor.shape[0] <= self.batch_size
tik2 = time.time()
pred_output_tokens = self.model.generate(
input_ids=input_ids_tensor,
attention_mask=input_masks_tensor,
pad_token_id=self.tokenizer.pad_token_id,
**gen_kwargs,
)
tik3 = time.time()
processed_output = self.data_object.postProcess(
pred_output_tokens,
input_seq_lens=input_len,
query_id_list=query_ids,
)
for i in range(len(qitem)):
n_tokens = processed_output[i].shape[0]
response_array = array.array(
"B", processed_output[i].tobytes())
bi = response_array.buffer_info()
response = [
lg.QuerySampleResponse(
qitem[i].id,
bi[0],
bi[1],
n_tokens)]
lg.QuerySamplesComplete(response)
tok = time.time()
with self.sample_counter_lock:
self.sample_counter += len(qitem)
print(f"Samples run: {self.sample_counter}")
if tik1:
print(f"\tBatchMaker time: {tik2 - tik1}")
print(f"\tInference time: {tik3 - tik2}")
print(f"\tPostprocess time: {tok - tik3}")
print(f"\t==== Total time: {tok - tik1}")
else:
print(f"\tLoaded from cache: {_p}")
def load_model(self):
self.model = LlamaForCausalLM.from_pretrained(
self.model_path,
device_map="auto",
low_cpu_mem_usage=True,
torch_dtype=self.amp_dtype,
)
print("Loaded model")
self.device = torch.device(self.device)
if self.device.type == "cpu":
self.model = self.model.to(
self.device
) # Force CPU if your system has GPU and you specifically want CPU-only run
self.model.eval()
try: # for systems with low ram, the below command gives error as some part is offloaded to disk
self.model = self.model.to(memory_format=torch.channels_last)
except BaseException:
pass
self.tokenizer = AutoTokenizer.from_pretrained(
self.model_path,
model_max_length=1024,
padding_side="left",
use_fast=False,
)
self.tokenizer.pad_token = self.tokenizer.eos_token
print("Loaded tokenizer")
def get_sut(self):
self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries)
return self.sut
def get_qsl(self):
return self.qsl
def predict(self, **kwargs):
raise NotImplementedError
def issue_queries(self, query_samples):
"""Receives samples from loadgen and adds them to queue. Users may choose to batch here"""
list_prompts_tokens = []
list_prompts_attn_masks = []
print(f"IssueQuery started with {len(query_samples)} samples")
while len(query_samples) > 0:
self.query_queue.put(query_samples[: self.batch_size])
query_samples = query_samples[self.batch_size:]
print(f"IssueQuery done")
def flush_queries(self):
pass
def __del__(self):
pass
class SUTServer(SUT):
def __init__(
self,
model_path=None,
dtype="bfloat16",
device="cpu",
total_sample_count=24576,
dataset_path=None,
batch_size=None,
workers=1,
):
super().__init__(
model_path=model_path,
dtype=dtype,
device=device,
total_sample_count=total_sample_count,
dataset_path=dataset_path,
workers=workers,
)
self.first_token_queue = queue.Queue()
def start(self):
# Create worker threads
for j in range(self.num_workers):
worker = threading.Thread(target=self.process_queries)
worker.start()
self.worker_threads[j] = worker
# Create first token response thread
self.ft_response_thread = threading.Thread(
target=self.process_first_tokens)
self.ft_response_thread.start()
def process_first_tokens(self):
while True:
first_token_item = self.first_token_queue.get()
if first_token_item is None:
log.info("Exiting First token response thread")
break
first_tokens, response_id = first_token_item
response_data = array.array(
"B", np.array(
first_tokens, np.int32).tobytes())
bi = response_data.buffer_info()
response = [lg.QuerySampleResponse(response_id, bi[0], bi[1])]
lg.FirstTokenComplete(response)
def process_queries(self):
"""Processor of the queued queries. User may choose to add batching logic"""
while True:
qitem = self.query_queue.get()
if qitem is None:
break
input_ids_tensor = self.data_object.input_ids[qitem.index]
input_masks_tensor = self.data_object.attention_masks[qitem.index]
# TODO: This PoC is super slow with significant overhead. Best to
# create a patch to `generate`
tokens_cache = []
tokens_streamer = FirstTokenStreamer(
self.first_token_queue,
tokens_cache=tokens_cache,
is_first_token=True,
response_ids=[qitem.id],
)
_ = self.model.generate(
input_ids=input_ids_tensor,
attention_mask=input_masks_tensor,
pad_token_id=self.tokenizer.pad_token_id,
streamer=tokens_streamer,
**gen_kwargs,
)
output_tokens = tokens_streamer.get_out_tokens()
n_tokens = len(output_tokens)
response_array = array.array(
"B", np.array(output_tokens, np.int32).tobytes()
)
bi = response_array.buffer_info()
response = [
lg.QuerySampleResponse(
qitem.id,
bi[0],
bi[1],
n_tokens)]
lg.QuerySamplesComplete(response)
def issue_queries(self, query_samples):
self.query_queue.put(query_samples[0])
def stop(self):
for _ in range(self.num_workers):
self.query_queue.put(None)
for worker in self.worker_threads:
worker.join()
self.first_token_queue.put(None)
self.ft_response_thread.join()
import os
import sys  # needed for sys.exit() in SUTServer.async_process_query
import time
import numpy as np
import array
import torch
from torch.nn.functional import pad
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM
from transformers.generation.streamers import BaseStreamer
import json
import pickle
import time
import threading
import tqdm
import queue
import logging
from typing import TYPE_CHECKING, Optional, List
from pathlib import Path
import more_itertools as mit
from concurrent.futures.thread import ThreadPoolExecutor
import requests
from urllib3.exceptions import InsecureRequestWarning
import mlperf_loadgen as lg
from dataset import Dataset
logging.basicConfig(level=logging.INFO)
log = logging.getLogger("Llama-70B-SUT")
gen_kwargs = {
"early_stopping": True,
"max_new_tokens": 1024,
"min_new_tokens": 1,
"num_beams": 1,
"do_sample": False,
}
class FirstTokenStreamer(BaseStreamer):
"""Streams first tokens to a 'holder'"""
def __init__(
self, first_token, tokens_cache=[], is_first_token=True, response_ids=[]
):
"""Response ids added to 'sign' the first token"""
self.first_token = first_token # Queue for first token
self.is_first_token = is_first_token
# Cache for subsequent generated tokens
self.tokens_cache = tokens_cache
self.response_ids = response_ids
self.is_prompt = (
True # The first tokens sent to the streamer are actually the input prompts
)
def put(self, value):
"""Caches the tokens as they're generated. Assumes bs=1"""
# Prompts are streamed first so we need to skip the first time value
# that arrives
if self.is_prompt:
self.is_prompt = False
return
value = value.item()
if self.is_first_token:
# Add generated first token together with its query response_id to
# first tokens queue
self.first_token.put((value, self.response_ids[0]))
self.is_first_token = False
return
self.tokens_cache.append(value)
def end(self):
pass
def get_out_tokens(self):
return self.tokens_cache
class SUT:
def __init__(
self,
model_path=None,
api_server=None,
api_model_name=None,
dtype="bfloat16",
device="cpu",
batch_size=None,
total_sample_count=24576,
dataset_path=None,
use_cached_outputs=False,
# Set this to True *only for test accuracy runs* in case your prior
# session was killed partway through
workers=1,
):
self.model_path = model_path or "meta-llama/Llama-2-70b-chat-hf"
self.device = device
self.api_servers = []
if api_server:
self.api_servers.append(api_server)
self.api_model_name = api_model_name
self.device = device
batch_size = total_sample_count
self.batch_size = batch_size
# dtype
if dtype == "bfloat16":
self.amp_enabled = True
self.amp_dtype = torch.bfloat16
elif dtype == "float16":
self.amp_enabled = True
self.amp_dtype = torch.float16
else:
self.amp_enabled = False
self.amp_dtype = torch.float32
if "cuda" in self.device:
assert torch.cuda.is_available(), "torch gpu is not available, exiting..."
self.dataset_path = dataset_path
self.data_object = Dataset(
self.model_path,
dataset_path=self.dataset_path,
total_sample_count=total_sample_count,
device=self.device,
)
self.qsl = lg.ConstructQSL(
self.data_object.total_sample_count,
self.data_object.perf_count,
self.data_object.LoadSamplesToRam,
self.data_object.UnloadSamplesFromRam,
)
# self.load_model()
self.tokenizer = AutoTokenizer.from_pretrained(
self.model_path,
model_max_length=1024,
padding_side="left",
use_fast=True,
) # changed from false
self.tokenizer.pad_token = self.tokenizer.eos_token
self.num_workers = workers
self.worker_threads = [None] * self.num_workers
self.query_queue = queue.Queue()
self.use_cached_outputs = use_cached_outputs
self.sample_counter = 0
self.sample_counter_lock = threading.Lock()
def start(self):
# Create worker threads
for j in range(self.num_workers):
worker = threading.Thread(target=self.process_queries)
worker.start()
self.worker_threads[j] = worker
def stop(self):
for _ in range(self.num_workers):
self.query_queue.put(None)
for worker in self.worker_threads:
worker.join()
def query_api_vllm(self, inputs, idx):
headers = {
"Content-Type": "application/json",
}
json_data = {
"model": self.api_model_name,
"prompt": inputs,
"min_tokens": 1,
"max_tokens": 1024,
}
response_code = 0
print(f"Server path {self.api_servers[idx]}/v1/completions")
while response_code != 200:
try:
response = requests.post(
f"{self.api_servers[idx]}/v1/completions",
headers=headers,
json=json_data,
verify=False,
)
response_code = response.status_code
except Exception as e:
print(e)
print("connection failure")
break
return [resp["text"] for resp in json.loads(response.text)["choices"]]
def api_action_handler(self, chunk, server_idx):
output = self.query_api_vllm(chunk, server_idx)
return output
def process_queries(self):
"""Processor of the queued queries. User may choose to add batching logic"""
while True:
qitem = self.query_queue.get()
if qitem is None:
break
query_ids = [q.index for q in qitem]
fname = "q" + "_".join([str(i) for i in query_ids])
fname = f"run_outputs/{fname}.pkl"
_p = Path(fname)
if self.use_cached_outputs and _p.exists():
# Read cache
with _p.open(mode="rb") as f:
d = pickle.load(f)
processed_output = d["outputs"]
tik1 = None
tik2 = None
tik3 = None
tok = None
else:
# Construct / collate batch
max_seq_len = 1024
tik1 = time.time()
# OpenAI-API servers don't require padding and can take input tokens
# directly, so we build our input_ids_tensor as a jagged list
input_ids_tensor = []
for q in qitem:
# input_ids_tensor.append(self.data_object.input_ids[q.index].tolist())
input_ids_tensor += self.data_object.input_ids[q.index].tolist(
)
# NOTE(mgoin): I don't think this has to be a torch tensor
# input_ids_tensor = torch.cat(input_ids_tensor)
# print(input_ids_tensor)
assert len(input_ids_tensor) <= self.batch_size
tik2 = time.time()
# NOTE(mgoin): I don't think threading is necessary since we are submitting all queries in one request
# The API server should take care of mini-batches and
# scheduling
if self.api_servers:
"""
decoded = self.tokenizer.batch_decode(input_ids_tensor)
cleaned = [entry.replace('</s>','').replace('<s>','') for entry in decoded]
cleaned_chunks = [list(c) for c in mit.divide(len(self.api_servers), cleaned)]
"""
cleaned_chunks = [input_ids_tensor]
with ThreadPoolExecutor(
max_workers=len(self.api_servers)
) as executor:
# needs to be tested
output_chunks = list(
executor.map(
self.api_action_handler,
cleaned_chunks,
range(len(self.api_servers)),
)
)
output = []
for row in output_chunks:
output += row
else:
print(
"Error: Specify at least one API to which the request is to be sent!"
)
exit(1)
tik3 = time.time()
processed_output = self.tokenizer(output)["input_ids"]
# for i in range(len(qitem)):
for i in range(len(processed_output)):
# NOTE(mgoin): Not optimal to make numpy arrays just to
# serialize
unpadded = np.array(processed_output[i])
n_tokens = unpadded.shape[0]
response_array = array.array("B", unpadded.tobytes())
bi = response_array.buffer_info()
response = [
lg.QuerySampleResponse(
qitem[i].id,
bi[0],
bi[1],
n_tokens)]
lg.QuerySamplesComplete(response)
tok = time.time()
with self.sample_counter_lock:
self.sample_counter += len(qitem)
print(f"Samples run: {self.sample_counter}")
if tik1:
print(f"\tBatchMaker time: {tik2 - tik1}")
print(f"\tInference time: {tik3 - tik2}")
print(f"\tPostprocess time: {tok - tik3}")
print(f"\t==== Total time: {tok - tik1}")
else:
print(f"\tLoaded from cache: {_p}")
def get_sut(self):
self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries)
return self.sut
def get_qsl(self):
return self.qsl
def predict(self, **kwargs):
raise NotImplementedError
def issue_queries(self, query_samples):
"""Receives samples from loadgen and adds them to queue. Users may choose to batch here"""
list_prompts_tokens = []
list_prompts_attn_masks = []
print(f"IssueQuery started with {len(query_samples)} samples")
while len(query_samples) > 0:
self.query_queue.put(query_samples[: self.batch_size])
query_samples = query_samples[self.batch_size:]
print(f"IssueQuery done")
def flush_queries(self):
pass
def __del__(self):
pass
class SUTServer(SUT):
def __init__(
self,
model_path=None,
api_server=None,
api_model_name=None,
dtype="bfloat16",
device="cpu",
total_sample_count=24576,
dataset_path=None,
batch_size=None,
workers=1,
):
super().__init__(
model_path=model_path,
api_server=None,
api_model_name=None,
dtype=dtype,
device=device,
total_sample_count=total_sample_count,
dataset_path=dataset_path,
workers=workers,
)
with open(f"{self.model_path}/tokenizer.json", "r") as token_file:
llama_tokenizer = json.load(token_file)
self.llama_vocab = llama_tokenizer["model"]["vocab"]
self.first_token_queue = queue.Queue()
def start(self):
# Create worker threads
for j in range(self.num_workers):
worker = threading.Thread(target=self.process_queries)
worker.start()
self.worker_threads[j] = worker
# Create first token response thread
self.ft_response_thread = threading.Thread(
target=self.process_first_tokens)
self.ft_response_thread.start()
def process_first_tokens(self):
while True:
first_token_item = self.first_token_queue.get()
if first_token_item is None:
log.info("Exiting First token response thread")
break
first_tokens, response_id = first_token_item
response_data = array.array(
"B", np.array(first_tokens, np.float32).tobytes()
)
bi = response_data.buffer_info()
response = [lg.QuerySampleResponse(response_id, bi[0], bi[1])]
lg.FirstTokenComplete(response)
def stream_api_vllm(self, input, response_ids, idx):
headers = {
"Content-Type": "application/json",
}
json_data = {
"model": "/opt/app-root/share/models",
"prompt": input,
"max_tokens": 1024,
"temperature": 0,
"stream": True,
"logprobs": 1,
}
while True:
try:
token_cache = []
s = requests.Session()
first = True
with s.post(
f"{self.api_servers[idx]}/v1/completions",
headers=headers,
json=json_data,
verify=False,
stream=True,
) as resp:
for line in resp.iter_lines():
if line:
decoded = line.decode()
if decoded.startswith(
"data") and "[DONE]" not in decoded:
inter = json.loads(decoded[6:])["choices"][0][
"logprobs"
]
if "top_logprobs" in inter:
token_s = list(
inter["top_logprobs"][0].keys())[0]
token = self.llama_vocab[token_s]
if first:
self.first_token_queue.put(
(token, response_ids[0])
)
first = False
token_cache.append(token)
s.close()
if token_cache:
return token_cache
except Exception as e:
s.close()
print("Connection failure")
print(f"An exception occurred: {type(e).__name__}")
print(f"Exception details: {e}")
def async_process_query(self, input_ids_tensor, qitem_id, idx):
decoded = self.tokenizer.decode(input_ids_tensor[0])
response_ids = [qitem_id]
output_tokens = self.stream_api_vllm(decoded, response_ids, idx)
n_tokens = len(output_tokens)
if n_tokens <= 1:
print("WARNING: caught low token count")
print(input_ids_tensor)
print(output_tokens)
response_array = array.array(
"B", np.array(
output_tokens, np.int32).tobytes())
bi = response_array.buffer_info()
response = [lg.QuerySampleResponse(qitem_id, bi[0], bi[1], n_tokens)]
lg.QuerySamplesComplete(response)
sys.exit()
def process_queries(self):
"""Processor of the queued queries. User may choose to add batching logic"""
server_idx = 0
while True:
qitem = self.query_queue.get()
if qitem is None:
break
input_ids_tensor = self.data_object.input_ids[qitem.index]
input_masks_tensor = self.data_object.attention_masks[qitem.index]
if self.api_servers:
threading.Thread(
target=self.async_process_query,
args=(input_ids_tensor, qitem.id, server_idx),
).start()
server_idx = (server_idx + 1) % len(self.api_servers)
else:
# TODO: This PoC is super slow with significant overhead. Best
# to create a patch to `generate`
tokens_cache = []
tokens_streamer = FirstTokenStreamer(
self.first_token_queue,
tokens_cache=tokens_cache,
is_first_token=True,
response_ids=[qitem.id],
)
_ = self.model.generate(
input_ids=input_ids_tensor,
attention_mask=input_masks_tensor,
pad_token_id=self.tokenizer.pad_token_id,
streamer=tokens_streamer,
**gen_kwargs,
)
output_tokens = tokens_streamer.get_out_tokens()
n_tokens = len(output_tokens)
response_array = array.array(
"B", np.array(output_tokens, np.int32).tobytes()
)
bi = response_array.buffer_info()
response = [
lg.QuerySampleResponse(
qitem.id, bi[0], bi[1], n_tokens)]
lg.QuerySamplesComplete(response)
def issue_queries(self, query_samples):
self.query_queue.put(query_samples[0])
def stop(self):
for _ in range(self.num_workers):
self.query_queue.put(None)
for worker in self.worker_threads:
worker.join()
self.first_token_queue.put(None)
self.ft_response_thread.join()
set -e
conda install pybind11==2.10.4 -c conda-forge -y
conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch-nightly -c nvidia
python -m pip install transformers==4.31.0 nltk==3.8.1 evaluate==0.4.0 absl-py==1.4.0 rouge-score==0.1.2 sentencepiece==0.1.99 accelerate==0.21.0
cd ../../loadgen && python3 -m pip install .
import argparse
import evaluate
import glob
import nltk
import numpy as np
import os
import pandas as pd
import pickle
from pathlib import Path
from transformers import LlamaTokenizerFast
from tqdm import tqdm
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--dataset-path",
type=str,
default=None,
help="Path to .pkl generated by processorca.py",
)
parser.add_argument(
"--run-outputs",
type=str,
default="run_outputs",
help="Output dir generated by accuracy run.",
)
parser.add_argument(
"--model-dir",
type=str,
default=None,
help="Path to Llamav2 HuggingFace repo clone",
)
parser.add_argument(
"--output-pkl-path",
type=str,
default="full_output.pkl",
help="Path to dump output to",
)
args = parser.parse_args()
return args
def load_dataset(p: os.PathLike):
print(f"Loading from {p}...")
return pd.read_pickle(p)
def load_run_outputs(p: os.PathLike):
g = glob.glob(str(Path(p) / "q*.pkl"))
by_query_idx = dict()
for pkl_file in g:
print(f"Loading from {pkl_file}...")
with open(pkl_file, "rb") as f:
d = pickle.load(f)
assert len(d["query_ids"]) == len(d["outputs"])
for i in range(len(d["query_ids"])):
qid = d["query_ids"][i]
assert qid not in by_query_idx
by_query_idx[qid] = d["outputs"][i]
return by_query_idx
def main(args):
# Set up decode and evaluation objects
tokenizer = LlamaTokenizerFast.from_pretrained(args.model_dir)
metric = evaluate.load("rouge")
nltk.download("punkt")
# Load Data
df = load_dataset(args.dataset_path)
run_outputs = load_run_outputs(args.run_outputs)
assert len(run_outputs) == 24576
# Set up columns to add
output_tok_ids_col = [None] * 24576
output_text_col = [None] * 24576
output_lens = [None] * 24576
# Process data
no_eos_ids = []
for qid, output in tqdm(run_outputs.items()):
L = list(output)
# Prune trailing 2s (EOS token)
try:
first2 = L.index(2)
L = L[:first2]
except ValueError:
# Do nothing
no_eos_ids.append(qid)
assert L[-1] != 2
output_tok_ids_col[qid] = L
output_lens[qid] = len(L)
# Decode tokens
output_text_col[qid] = tokenizer.decode(
output_tok_ids_col[qid], skip_special_tokens=True
)
print(f"Found {len(no_eos_ids)} samples with no EOS token")
print("Calculating rouge scores...")
def _preproc(s): return "\n".join(nltk.sent_tokenize(s.strip()))
preds = list(map(_preproc, output_text_col))
targets = list(map(_preproc, list(df["output"])))
rouge_scores = metric.compute(
predictions=preds, references=targets, use_stemmer=True, use_aggregator=False
)
assert len(rouge_scores["rouge1"]) == 24576
assert len(rouge_scores["rouge2"]) == 24576
assert len(rouge_scores["rougeL"]) == 24576
agg = {k: round(np.mean(v) * 100, 4) for k, v in rouge_scores.items()}
print(agg)
print("Avg output seqlen:", np.mean(output_lens))
# Set columns
df["gen_output_tok_id"] = output_tok_ids_col
df["gen_output_text"] = output_text_col
df["gen_output_tok_len"] = output_lens
df["rouge1"] = rouge_scores["rouge1"]
df["rouge2"] = rouge_scores["rouge2"]
df["rougeL"] = rouge_scores["rougeL"]
p = Path(args.output_pkl_path)
p.parent.mkdir(exist_ok=True)
df.to_pickle(p)
print(f"Dumped to {p}")
if __name__ == "__main__":
main(get_args())
import random
import os
import time
import numpy as np
import torch
from datasets import load_dataset, load_from_disk
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.nn.functional import pad
from torch.utils.data import DataLoader
from typing import Optional, Dict, Sequence
import io
# import utils
import copy
import pickle
import logging
logging.basicConfig(level=logging.INFO)
log = logging.getLogger("Llama-70B-Dataset")
class Dataset:
def __init__(
self,
model_name=None,
total_sample_count=24576,
perf_count_override=None,
dataset_path=None,
device="cpu",
):
self.model_name = model_name or "meta-llama/Llama-2-70b-chat-hf"
self.dataset_path = dataset_path
self.max_length = 1024
self.device = device
# self.total_sample_count = total_sample_count
self.load_tokenizer()
self.load_processed_dataset()
self.total_sample_count = min(len(self.input_ids), total_sample_count)
self.perf_count = perf_count_override or self.total_sample_count
def load_tokenizer(self):
"""Returns tokenizer"""
self.tokenizer = AutoTokenizer.from_pretrained(
self.model_name,
model_max_length=1024,
padding_side="left",
use_fast=False,
)
self.tokenizer.pad_token = self.tokenizer.eos_token
def load_processed_dataset(self):
if not os.path.isfile(self.dataset_path):
log.warn(
"Processed pickle file {} not found. Please check that the path is correct".format(
self.dataset_path
)
)
print("Loading dataset...")
import pandas as pd
processed_data = pd.read_pickle(self.dataset_path)
input_tokens = processed_data["tok_input"]
self.input_ids = []
self.input_lens = []
self.attention_masks = []
for ids in input_tokens:
input_ids = torch.tensor(ids, dtype=torch.int32).view(
1, -1).to(self.device)
attn_mask = torch.ones_like(input_ids)
self.input_ids.append(input_ids)
self.attention_masks.append(attn_mask)
self.input_lens.append(input_ids.shape[-1])
print("Finished loading dataset.")
def postProcess(
self,
out_tokens,
input_seq_lens=None,
query_id_list=None,
sample_index_list=None,
):
"""Postprocesses output prediction"""
# TODO: Create response object in postProcess(?)
"""
preds = []
for i in range(out_tokens.shape[0]):
#pred = out_tokens[i].reshape(-1).cpu().numpy() # Slice up to original input length as below?
input_len = input_seq_lens[i] if input_seq_lens else 0
pred = out_tokens[i, input_len:].reshape(-1).cpu().numpy()
preds.append(pred)
"""
# Everything is padded to max_len (1024), so prune the input and parse
# to numpy
output_seq = out_tokens[:, 1024:].cpu().numpy()
assert len(query_id_list) == output_seq.shape[0]
# Save outputs
if not os.path.exists("run_outputs"):
os.makedirs("run_outputs")
fname = "q" + "_".join([str(i) for i in query_id_list])
fname = f"run_outputs/{fname}.pkl"
with open(fname, mode="wb") as f:
d = {"query_ids": query_id_list, "outputs": output_seq}
print(f"Saving outputs to {fname}")
pickle.dump(d, f)
return output_seq
def LoadSamplesToRam(self, sample_list):
pass
def UnloadSamplesFromRam(self, sample_list):
pass
def __del__(self):
pass
import argparse
from transformers import AutoTokenizer
import nltk
import evaluate
import numpy as np
import json
from multiprocessing import Pool, cpu_count
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--checkpoint-path", required=True, help="Path to Llama2-70b-hf-chat checkpoint"
)
parser.add_argument(
"--mlperf-accuracy-file", required=True, help="path to mlperf_log_accuracy.json"
)
parser.add_argument(
"--dataset-file",
required=True,
help="path to processed openorca validation set",
)
parser.add_argument(
"--verbose",
action="store_true",
help="verbose messages")
parser.add_argument(
"--dtype",
default="int64",
help="dtype of the accuracy log",
choices=["int32", "int64", "float"],
)
args = parser.parse_args()
return args
def get_groundtruth(processed_dataset_file):
import pandas as pd
data = pd.read_pickle(processed_dataset_file)
ground_truths = data["output"]
return ground_truths
def postprocess_text(preds, targets):
preds = [pred.strip() for pred in preds]
targets = [target.strip() for target in targets]
# rougeLSum expects newline after each sentence
preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
targets = ["\n".join(nltk.sent_tokenize(target)) for target in targets]
return preds, targets
def compute_rouge_chunk(chunk):
"""Compute ROUGE scores for a chunk of predictions and references."""
metric = evaluate.load("rouge")
preds, targets = chunk
result = metric.compute(
predictions=preds, references=targets, use_stemmer=True, use_aggregator=False
)
return result
def main():
args = get_args()
dataset_path = args.dataset_file
checkpoint_path = args.checkpoint_path
nltk.download("punkt")
nltk.download("punkt_tab")
tokenizer = AutoTokenizer.from_pretrained(
checkpoint_path,
model_max_length=2048,
padding_side="left",
use_fast=False,
)
targets = get_groundtruth(args.dataset_file)
target_required = []
preds_token_ids = []
eval_dtype = np.int64
if args.dtype == "int32":
eval_dtype = np.int32
elif args.dtype == "float":
eval_dtype = np.float32
with open(args.mlperf_accuracy_file, "r") as f:
results = json.load(f)
seen = set()
gen_tok_len = 0
for pred in results:
qsl_idx = pred["qsl_idx"]
if qsl_idx in seen:
continue
seen.add(qsl_idx)
target = targets[qsl_idx]
target_required.append(target)
pred = np.frombuffer(bytes.fromhex(pred["data"]), eval_dtype)
gen_tok_len += len(pred)
preds_token_ids.append(pred)
preds_decoded_text = tokenizer.batch_decode(
preds_token_ids, skip_special_tokens=True
)
preds, targets = postprocess_text(preds_decoded_text, target_required)
# Split data into chunks for parallel processing
num_chunks = cpu_count() # Number of parallel processes
chunk_size = len(preds) // num_chunks + (len(preds) % num_chunks > 0)
chunks = [
(preds[i:i + chunk_size], targets[i:i + chunk_size])
for i in range(0, len(preds), chunk_size)
]
# Use multiprocessing Pool to compute ROUGE scores in parallel
with Pool(num_chunks) as pool:
results_list = pool.map(compute_rouge_chunk, chunks)
# Aggregate results from all chunks
aggregated_results = {}
for result in results_list:
for k, v in result.items():
if k not in aggregated_results:
aggregated_results[k] = []
aggregated_results[k].extend(v)
final_result = {k: round(np.mean(v) * 100, 4)
for k, v in aggregated_results.items()}
prediction_lens = [len(pred) for pred in preds]
gen_num = len(preds)
final_result.update({
"gen_len": np.sum(prediction_lens),
"gen_num": gen_num,
"gen_tok_len": gen_tok_len,
"tokens_per_sample": round(gen_tok_len / gen_num, 1),
})
print("\nResults\n")
print(final_result)
if __name__ == "__main__":
main()
#!/bin/bash
MLCOMMONS_REPO_PATH="$(dirname "$(dirname "$PWD")")"
# Add any volume mounts here with the following syntax
# /path/to/src:/path/to/dir/in/container
MOUNTS=(
$MLCOMMONS_REPO_PATH:$MLCOMMONS_REPO_PATH
)
# Set up docker environment file for current user
rm -f .docker_env
echo "CI_BUILD_USER=`id -u -n`" >> .docker_env
echo "CI_BUILD_UID=`id -u`" >> .docker_env
echo "CI_BUILD_GROUP=`id -g -n`" >> .docker_env
echo "CI_BUILD_GID=`id -g`" >> .docker_env
cat .docker_env
# Build container
docker build . -t llm/gpubringup
# Build mount flags
declare -a MOUNT_FLAGS
for _mount in ${MOUNTS[@]}; do
_split=($(echo $_mount | tr ':' '\n'));
MOUNT_FLAGS+=("--mount type=bind,source=${_split[0]},target=${_split[1]}");
done
set -x
nvidia-docker run -it --rm --net=host --runtime=nvidia --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \
--cap-add=SYS_PTRACE --cap-add=SYS_ADMIN --cap-add=DAC_READ_SEARCH \
--security-opt seccomp=unconfined \
-w $PWD \
--env-file `pwd`/.docker_env \
${MOUNT_FLAGS[*]} \
llm/gpubringup \
bash ./with_the_same_user
import subprocess
import mlperf_loadgen as lg
import argparse
import os
import logging
import sys
import requests
import json
sys.path.insert(0, os.getcwd())
logging.basicConfig(level=logging.INFO)
log = logging.getLogger("Llama-70B-MAIN")
# function to check the model name in server matches the user specified one
def verify_model_name(user_specified_name, url):
response = requests.get(url)
if response.status_code == 200:
response_dict = response.json()
server_model_name = response_dict["data"][0]["id"]
if user_specified_name == server_model_name:
return {"matched": True, "error": False}
else:
return {
"matched": False,
"error": f"User specified {user_specified_name} and server model name {server_model_name} mismatch!",
}
else:
return {
"matched": False,
"error": f"Failed to get a valid response. Status code: {response.status_code}",
}
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--scenario",
type=str,
choices=["Offline", "Server"],
default="Offline",
help="Scenario",
)
parser.add_argument(
"--model-path",
type=str,
default="meta-llama/Llama-2-70b-chat-hf",
help="Model name",
)
parser.add_argument("--dataset-path", type=str, default=None, help="")
parser.add_argument(
"--accuracy",
action="store_true",
help="Run accuracy mode")
parser.add_argument(
"--dtype",
type=str,
default="float32",
help="data type of the model, choose from float16, bfloat16 and float32",
)
parser.add_argument(
"--device",
type=str,
choices=["cpu", "cuda:0"],
default="cpu",
help="device to use",
)
parser.add_argument(
"--audit-conf",
type=str,
default="audit.conf",
help="audit config for LoadGen settings during compliance runs",
)
parser.add_argument(
"--user-conf",
type=str,
default="user.conf",
help="user config for user LoadGen settings such as target QPS",
)
# TODO: This interpretation of 'total-sample-count' is a little
# misleading. Fix it
parser.add_argument(
"--total-sample-count",
type=int,
default=24576,
help="Number of samples to use in benchmark.",
)
parser.add_argument(
"--batch-size",
type=int,
default=1,
help="Model batch-size to use in benchmark.",
)
parser.add_argument(
"--output-log-dir", type=str, default="output-logs", help="Where logs are saved"
)
parser.add_argument(
"--enable-log-trace",
action="store_true",
help="Enable log tracing. This file can become quite large",
)
parser.add_argument(
"--num-workers",
type=int,
default=1,
help="Number of workers to process queries",
)
parser.add_argument("--vllm", action="store_true", help="vllm mode")
parser.add_argument(
"--api-model-name",
type=str,
default="meta-llama/Llama-2-70b-chat-hf",
help="Model name(specified in llm server)",
)
parser.add_argument(
"--api-server",
type=str,
default=None,
help="Specify an api endpoint call to use api mode",
)
parser.add_argument(
"--lg-model-name",
type=str,
default="llama2-70b",
choices=["llama2-70b", "llama2-70b-interactive"],
help="Model name(specified in llm server)",
)
args = parser.parse_args()
return args
scenario_map = {
"offline": lg.TestScenario.Offline,
"server": lg.TestScenario.Server,
}
def main():
args = get_args()
if args.vllm:
resp = verify_model_name(
args.api_model_name,
args.api_server + "/v1/models")
if resp["error"]:
print(f"\n\n\033[91mError:\033[0m", end=" ")
print(resp["error"])
sys.exit(1)
settings = lg.TestSettings()
settings.scenario = scenario_map[args.scenario.lower()]
# mlperf.conf is automatically loaded by the loadgen
settings.FromConfig(args.user_conf, args.lg_model_name, args.scenario)
if args.accuracy:
settings.mode = lg.TestMode.AccuracyOnly
else:
settings.mode = lg.TestMode.PerformanceOnly
os.makedirs(args.output_log_dir, exist_ok=True)
log_output_settings = lg.LogOutputSettings()
log_output_settings.outdir = args.output_log_dir
log_output_settings.copy_summary_to_stdout = True
log_settings = lg.LogSettings()
log_settings.log_output = log_output_settings
log_settings.enable_trace = args.enable_log_trace
if args.vllm:
from SUT_API import SUT, SUTServer
else:
from SUT import SUT, SUTServer
sut_map = {"offline": SUT, "server": SUTServer}
sut_cls = sut_map[args.scenario.lower()]
if args.vllm:
sut = sut_cls(
model_path=args.model_path,
dtype=args.dtype,
batch_size=args.batch_size,
dataset_path=args.dataset_path,
total_sample_count=args.total_sample_count,
device=args.device,
api_server=args.api_server,
api_model_name=args.api_model_name,
workers=args.num_workers,
)
else:
sut = sut_cls(
model_path=args.model_path,
dtype=args.dtype,
batch_size=args.batch_size,
dataset_path=args.dataset_path,
total_sample_count=args.total_sample_count,
device=args.device,
workers=args.num_workers,
)
# Start sut before loadgen starts
sut.start()
lgSUT = lg.ConstructSUT(sut.issue_queries, sut.flush_queries)
log.info("Starting Benchmark run")
lg.StartTestWithLogSettings(
lgSUT,
sut.qsl,
settings,
log_settings,
args.audit_conf)
# Stop sut after completion
sut.stop()
log.info("Run Completed!")
log.info("Destroying SUT...")
lg.DestroySUT(lgSUT)
log.info("Destroying QSL...")
lg.DestroyQSL(sut.qsl)
if __name__ == "__main__":
main()
# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import time
import argparse
import pandas as pd
import numpy as np
from dataclasses import dataclass
from functools import partial
from pathlib import Path
from transformers import LlamaTokenizerFast
from typing import Dict
__doc__ = """
This script takes the open_orca GPT4 dataset parquet and performs the following preprocessing and filtering steps:
1. filter out all queries with non-ascii characters, except for normal unicode quotes and hyphens.
2. filter out all queries with out-of-bound input/output sequence lengths
3. filter out all queries with expected answers shorter than 2 words (known to cause issues for Llama2)
4. filter out all queries with prompts that generate bad output texts using Llama2 models
5. sample equally from the sub-datasets (i.e. COT, NIV, FLAN, T0) and form the final dataset.
"""
llama_prompt_system = "<s>[INST] <<SYS>>\n{}\n<</SYS>>\n\n{} [/INST]"
llama_prompt_no_system = "<s>[INST] {} [/INST]"
def format_llama_input(row):
if row["system_prompt"]:
return llama_prompt_system.format(
row["system_prompt"], row["question"])
else:
return llama_prompt_no_system.format(row["question"])
def is_english(s):
for c in s:
allowed = c.isascii()
allowed = allowed or (
c in ["’", "–", "“", "”", "—"]
) # Taken from Habana: Unicode quotes and hyphens
if not allowed:
return False
return True
def _tokenize_helper(x, llama_tokenizer=None, append_response_init_token=True):
if not isinstance(x, str):
return []
tokens = llama_tokenizer(x)["input_ids"]
if append_response_init_token:
# Workaround to enable cheat checking for first token: Llama always outputs token 29871 first
# It is possible for submitters to just immediately output this token
# to achieve a very fast TTFT.
tokens.append(29871)
return tokens
@dataclass
class Keyphrase:
col: str
phrase: str
startswith: bool = False
case: bool = False
class OpenOrcaDatasetGenerator:
def __init__(
self,
pq_path: os.PathLike,
model_dir: os.PathLike,
io_token_limit: int,
calibration_subset_size: int = 1000,
):
self.pq_path = Path(pq_path)
self.model_dir = Path(model_dir)
self.io_token_limit = io_token_limit
self.keyphrases = []
self.calibration_subset_size = calibration_subset_size
def load_parquet(self) -> pd.DataFrame:
llama_tokenizer = LlamaTokenizerFast.from_pretrained(self.model_dir)
tik = time.time()
df = pd.read_parquet(self.pq_path)
print(f"Tokenizing input")
df.rename(columns={"response": "output"}, inplace=True)
df["input"] = df.apply(format_llama_input, axis=1)
input_tokenizer = partial(
_tokenize_helper,
llama_tokenizer=llama_tokenizer)
output_tokenizer = partial(
_tokenize_helper,
llama_tokenizer=llama_tokenizer,
append_response_init_token=False,
)
df["tok_input"] = df["input"].apply(input_tokenizer)
df["tok_output"] = df["output"].apply(output_tokenizer)
tok = time.time()
print(f"Loaded parquet and tokenized in {tok-tik} sec.")
return df
def filter_english(self, df: pd.DataFrame) -> pd.DataFrame:
df["input_english"] = df["input"].apply(is_english)
df["output_english"] = df["output"].apply(is_english)
df["all_english"] = df["input_english"] & df["output_english"]
# Filter based on english tokens
df = df[df["all_english"]].drop(
["input_english", "output_english", "all_english"], axis=1
)
return df.reset_index(drop=True)
def filter_seqlen_oob(self, df: pd.DataFrame) -> pd.DataFrame:
df["tok_input_length"] = df["tok_input"].apply(lambda x: len(x))
df["tok_output_length"] = df["tok_output"].apply(lambda x: len(x))
# Filter based on sequence length
df = df[df["tok_input_length"] < self.io_token_limit]
df = df[df["tok_output_length"] < self.io_token_limit]
return df.reset_index(drop=True)
def filter_short_expected_response(self, df: pd.DataFrame) -> pd.DataFrame:
# We have found that short expected responses (such as for yes/no and true/false questions, or multiple choice
# questions where the expected response is just the choice, i.e. (B)), disproportionately have lower Rouge1
# scores (< 0.02).
# Filter out 1 and 2 word expected responses. We've seen best results when this is filtered to >= 6, but it is
# hard to justify removing that many samples.
df = df[df["tok_output_length"] >= 3]
return df.reset_index(drop=True)
def filter_bad_prompts(
self, df: pd.DataFrame, only_niv_t0: bool = True
) -> pd.DataFrame:
# Some prompts underperform and cause very bad Rouge scores for a significant percentage of samples with these
# prompts. See Jupyter notebook for analysis.
# These generally only affect NIV and t0 and do not exist in flan or cot.
# Set 'only_niv_t0' to True to explicitly only remove these prompts
# from niv and t0 samples.
bad_prompts = [
"",
"You are an AI assistant that follows instruction extremely well. Help as much as you can.",
"You are an AI assistant. Provide a detailed answer so user don’t need to search outside to understand the answer.",
"You are an AI assistant. Provide a detailed answer so user don't need to search outside to understand the answer.",
"User will you give you a task with some instruction. Your job is follow the instructions as faithfully as you can. While answering think step-by-step and justify your answer.",
"Explain how you used the definition to come up with the answer.",
]
for prompt in bad_prompts:
criteria = df.system_prompt == prompt
if only_niv_t0:
criteria = criteria & (
(df.origin == "niv") | (
df.origin == "t0"))
df = df[~criteria]
return df.reset_index(drop=True)
def register_keyphrase(self, keyphrase: Keyphrase):
self.keyphrases.append(keyphrase)
def filter_keyphrases(self, df: pd.DataFrame) -> pd.DataFrame:
# Filter out registered keyphrases. This is unused for the final
# dataset as there are no registered keyphrases.
for kp in self.keyphrases:
if kp.startswith:
selector = df[kp.col].str.startswith(kp.phrase)
else:
selector = df[kp.col].str.contains(kp.phrase, case=kp.case)
df = df[~selector]
return df.reset_index(drop=True)
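# Example (hypothetical; 'gen' stands for an OpenOrcaDatasetGenerator instance):
#   gen.register_keyphrase(
#       Keyphrase(col="output", phrase="As an AI language model", startswith=True)
#   )
# would drop every sample whose expected output starts with that phrase.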
def set_origins(self, df: pd.DataFrame) -> pd.DataFrame:
def get_sample_origin(x): return x.split(".")[0]
df["origin"] = df["id"].apply(get_sample_origin)
return df
def _per_origin_split(self, df: pd.DataFrame) -> Dict[str, pd.DataFrame]:
print(f"Unique sample origin datasets: {df.origin.unique()}")
dfs_by_origin = dict(tuple(df.groupby("origin")))
for origin, sub_df in dfs_by_origin.items():
sub_df.reset_index(drop=True, inplace=True)  # in-place reset also updates the DataFrame stored in dfs_by_origin
return dfs_by_origin
def _get_sampling(self, df, N, rng_seed: int = 1337):
_N = min(df.shape[0], N)
if _N < N:
raise RuntimeError(f"Not enough samples. Requires {N - _N} more.")
return df.sample(n=_N, random_state=rng_seed)
def sample(
self, dfs_by_origin: Dict[str, pd.DataFrame], n_total, rng_seed: int = 1337
) -> pd.DataFrame:
nways = len(dfs_by_origin)
assert (
n_total % nways == 0
), f"Total number of samples ({n_total}) must be divisible by n_origins ({nways})"
split_size = n_total // nways
samplings = []
for origin, df in dfs_by_origin.items():
print(f"Sampling {split_size} from {origin}")
sample = self._get_sampling(df, split_size, rng_seed=rng_seed)
samplings.append(sample)
sampled_df = pd.concat(samplings)
sampled_df = sampled_df.reset_index(drop=True)
return sampled_df
def generate(
self,
export_dir: os.PathLike,
n_samples: int = 24576,
use_cached: bool = True,
calib_rng_seed: int = 12345,
):
export_dir = Path(export_dir)
if not export_dir.exists():
print(f"Creating {export_dir}")
export_dir.mkdir(parents=True)
if export_dir.is_file():
raise ValueError(
f"Cannot export to file {export_dir}. Must be a directory."
)
full_fpath = export_dir / "open_orca_gpt4_tokenized_llama.full.pkl"
if full_fpath.exists() and use_cached:
df = pd.read_pickle(full_fpath)
else:
df = self.load_parquet()
df = self.set_origins(df)
# Apply filters
df = self.filter_english(df)
df = self.filter_seqlen_oob(df)
df = self.filter_short_expected_response(df)
df = self.filter_bad_prompts(df)
df = self.filter_keyphrases(df)
df.to_pickle(full_fpath)
dfs_by_origin = self._per_origin_split(df)
# Export base files
for origin, sub_df in dfs_by_origin.items():
print(f"Subset '{origin}' has {sub_df.shape[0]} samples")
origin_fpath = export_dir / \
f"open_orca_gpt4_tokenized_llama.{origin}.pkl"
if not origin_fpath.exists() or not use_cached:
sub_df.to_pickle(origin_fpath)
# Strategy:
# After some analysis, we found that OpenOrca's dataset has a skewed "origin-dataset" distribution:
# cot and niv have significantly fewer samples (71K and 58K) compared to flan and t0 (375K and 278K).
# cot has a higher rouge score from a 100k sampling (of the whole dataset) than the rest, while niv has lower.
# Sample from each dataset equally.
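# For example, with the default n_samples=24576 and the four origins above
# (flan, t0, cot, niv), self.sample() draws 24576 // 4 = 6144 samples per origin.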
sampled_df = self.sample(dfs_by_origin, n_samples)
sampled_fpath = (
export_dir /
f"open_orca_gpt4_tokenized_llama.sampled_{n_samples}.pkl"
)
sampled_df.to_pickle(sampled_fpath)
# Calibration dataset
calib_ds = sampled_df.sample(
n=self.calibration_subset_size, random_state=calib_rng_seed
)
calib_ds = calib_ds.reset_index(drop=True)
calib_fpath = (
export_dir
/ f"open_orca_gpt4_tokenized_llama.calibration_{self.calibration_subset_size}.pkl"
)
calib_ds.to_pickle(calib_fpath)
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument(
"--dataset_pq_path",
type=str,
default="/raid/data/mlperf-llm/OpenOrca/1M-GPT4-Augmented.parquet",
help="the path to the open_orca GPT4 parquet.",
)
parser.add_argument(
"--model_dir", type=str, default="/raid/data/mlperf-llm/Llama-2-70b-chat-hf"
)
parser.add_argument(
"--seqlen_limit",
type=int,
default=1024,
help="Upper limit of the input/output sequence lengths",
)
parser.add_argument(
"--export_dir",
type=str,
default="/raid/data/mlperf-llm/OpenOrca/llama/filtered",
help="Path to the output pkl file.",
)
parser.add_argument(
"--num_total_samples",
type=int,
default=24576,
help="Number of samples to generate",
)
parser.add_argument(
"--calibration_subset_size",
type=int,
default=1000,
help="Number of samples for calibration subset",
)
return parser.parse_args()
if __name__ == "__main__":
args = parse_arguments()
ds_gen = OpenOrcaDatasetGenerator(
pq_path=args.dataset_pq_path,
model_dir=args.model_dir,
io_token_limit=args.seqlen_limit,
calibration_subset_size=args.calibration_subset_size,
)
ds_gen.generate(
export_dir=args.export_dir,
n_samples=args.num_total_samples,
)
# Sample command to run:
# python3 processorca.py
# --dataset_pq_path=/raid/data/mlperf-llm/OpenOrca/1M-GPT4-Augmented.parquet
# --model_dir=/raid/data/mlperf-llm/Llama-2-70b-chat-hf
# --seqlen_limit=1024
# --export_dir=/raid/data/mlperf-llm/OpenOrca/llama/filtered
# --num_total_samples=24576
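# Files produced in --export_dir by generate() above (per-origin names depend on the
# 'id' prefixes present in the parquet):
#   open_orca_gpt4_tokenized_llama.full.pkl              - all samples after filtering
#   open_orca_gpt4_tokenized_llama.<origin>.pkl          - per-origin subsets (e.g. flan, t0, cot, niv)
#   open_orca_gpt4_tokenized_llama.sampled_24576.pkl     - equally-sampled evaluation set
#   open_orca_gpt4_tokenized_llama.calibration_1000.pkl  - calibration subset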
CHECKPOINT_PATH="${CHECKPOINT_PATH:-meta-llama/Llama-2-70b-chat-hf}"
DATASET_PATH="${DATASET_PATH:-open-orca-val-set.pkl}"
mkdir -p "run_outputs"
python3 -u main.py --scenario Offline \
--model-path ${CHECKPOINT_PATH} \
--accuracy \
--mlperf-conf mlperf.conf \
--user-conf user.conf \
--total-sample-count 24576 \
--dataset-path ${DATASET_PATH} \
--output-log-dir offline_accuracy_loadgen_logs \
--dtype float32 \
--device cuda:0 2>&1 | tee offline_accuracy_log.log
python3 evaluate-accuracy.py --checkpoint-path ${CHECKPOINT_PATH} \
--mlperf-accuracy-file offline_accuracy_loadgen_logs/mlperf_log_accuracy.json \
--dataset-file ${DATASET_PATH} \
--dtype int32
python3 consolidate_results.py --dataset-path ${DATASET_PATH} --model-dir ${CHECKPOINT_PATH}
CHECKPOINT_PATH="${CHECKPOINT_PATH:-meta-llama/Llama-2-70b-chat-hf}"
DATASET_PATH="${DATASET_PATH:-open-orca-val-set.pkl}"
python -u main.py --scenario Offline \
--model-path ${CHECKPOINT_PATH} \
--mlperf-conf mlperf.conf \
--user-conf user.conf \
--total-sample-count 24576 \
--dataset-path ${DATASET_PATH} \
--device cpu 2>&1 | tee offline_log.log
CHECKPOINT_PATH="${CHECKPOINT_PATH:-meta-llama/Llama-2-70b-chat-hf}"
DATASET_PATH="${DATASET_PATH:-open-orca-val-set.pkl}"
python -u main.py --scenario Server \
--model-path ${CHECKPOINT_PATH} \
--mlperf-conf mlperf.conf \
--user-conf user.conf \
--total-sample-count 24576 \
--dataset-path ${DATASET_PATH} \
--device cpu 2>&1 | tee server_log.log
# The format of this config file is 'key = value'.
# The key has the format 'model.scenario.key'. Value is mostly int64_t.
# Model maybe '*' as wildcard. In that case the value applies to all models.
# All times are in milli seconds
#
*.Offline.min_duration = 600000
*.Offline.min_query_count = 2000
*.Server.target_qps = 0.5
*.Server.min_duration = 120000
*.Server.min_query_count = 100
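# Example (hypothetical): a key with an explicit model name applies only to that model,
# unlike the '*' wildcard entries above, e.g.:
# llama2-70b.Server.target_qps = 1.0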
#!/usr/bin/env bash
# wkong: manually set the user info in env first
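# For example (values are illustrative):
#   export CI_BUILD_UID=$(id -u)  CI_BUILD_GID=$(id -g)
#   export CI_BUILD_USER=$(id -un) CI_BUILD_GROUP=$(id -gn)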
set -ex
if [ -z "$@" ]; then
COMMAND=(bash)
else
COMMAND=("$@")
fi
apt-get update && apt-get install -y sudo
getent group "${CI_BUILD_GID}" || addgroup --gid "${CI_BUILD_GID}" "${CI_BUILD_GROUP}"
getent passwd "${CI_BUILD_UID}" || adduser --gid "${CI_BUILD_GID}" --uid "${CI_BUILD_UID}" --gecos "${CI_BUILD_USER} (generated by with_the_same_user script)" --disabled-password --quiet "${CI_BUILD_USER}"
usermod -a -G dip "${CI_BUILD_USER}"
usermod -a -G sudo "${CI_BUILD_USER}"
usermod -a -G root "${CI_BUILD_USER}"
echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
mkdir -p /home/"${CI_BUILD_USER}"
touch /home/"${CI_BUILD_USER}"/.bashrc
echo 'export PATH="$PATH:/opt/miniconda3/bin"' >> /home/"${CI_BUILD_USER}"/.bashrc
sudo -H -u "#${CI_BUILD_UID}" --preserve-env \
PATH="${PATH}" \
LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" \
PYTHONPATH="${PYTHONPATH}" \
bash -c "conda init bash"
echo 'conda activate llama2-70b' >> /home/"${CI_BUILD_USER}"/.bashrc
sudo -H -u "#${CI_BUILD_UID}" --preserve-env \
PATH="${PATH}" \
LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" \
PYTHONPATH="${PYTHONPATH}" \
"${COMMAND[@]}"