v1.0

24eacbc0 · chenzk · 24eacbc0 · 24eacbc0 · 24eacbc0 · 24eacbc0
Commit 24eacbc0 authored May 09, 2024 by chenzk
20 changed files
--- a/finetune/configs/ds_config_zero2.json
+++ b/finetune/configs/ds_config_zero2.json
+{
+    "fp16": {
+        "enabled": "auto",
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+    "bf16": {
+        "enabled": "auto"
+    },
+    "zero_optimization": {
+        "stage": 2,
+        "allgather_partitions": true,
+        "overlap_comm": true,
+        "reduce_scatter": true,
+        "contiguous_gradients": true
+    },
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": 1.0,
+    "wall_clock_breakdown": false,
+    "flops_profiler": {
+        "enabled": false,
+        "profile_step": 1,
+        "module_depth": -1,
+        "top_modules": 1,
+        "detailed": true,
+        "output_file": null
+    }
+}
\ No newline at end of file
--- a/finetune/configs/ds_config_zero2_offload.json
+++ b/finetune/configs/ds_config_zero2_offload.json
+{
+    "fp16": {
+        "enabled": "auto",
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+    "bf16": {
+        "enabled": "auto"
+    },
+    "zero_optimization": {
+        "stage": 2,
+        "allgather_partitions": true,
+        "overlap_comm": true,
+        "reduce_scatter": true,
+        "contiguous_gradients": true,
+        "offload_optimizer": {
+            "device": "cpu",
+            "pin_memory": true
+        }
+    },
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": 1.0,
+    "wall_clock_breakdown": false,
+    "flops_profiler": {
+        "enabled": false,
+        "profile_step": 1,
+        "module_depth": -1,
+        "top_modules": 1,
+        "detailed": true,
+        "output_file": null
+    }
+}
\ No newline at end of file
--- a/finetune/configs/ds_config_zero3.json
+++ b/finetune/configs/ds_config_zero3.json
+{
+    "fp16": {
+        "enabled": "auto",
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+    "bf16": {
+        "enabled": "auto"
+    },
+    "zero_optimization": {
+        "stage": 3,
+        "allgather_partitions": true,
+        "allgather_bucket_size": 5e8,
+        "reduce_scatter": true,
+        "contiguous_gradients": true,
+        "overlap_comm": true,
+        "reduce_bucket_size": "auto",
+        "stage3_prefetch_bucket_size": "auto",
+        "stage3_param_persistence_threshold": "auto",
+        "stage3_gather_16bit_weights_on_model_save": true
+    },
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": 1.0,
+    "wall_clock_breakdown": false,
+    "flops_profiler": {
+        "enabled": false,
+        "profile_step": 1,
+        "module_depth": -1,
+        "top_modules": 1,
+        "detailed": true,
+        "output_file": null
+    }
+}
\ No newline at end of file
--- a/finetune/configs/ds_config_zero3_offload.json
+++ b/finetune/configs/ds_config_zero3_offload.json
+{
+    "fp16": {
+        "enabled": "auto",
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+    "bf16": {
+        "enabled": "auto"
+    },
+    "zero_optimization": {
+        "stage": 3,
+        "allgather_partitions": true,
+        "allgather_bucket_size": 5e8,
+        "reduce_scatter": true,
+        "contiguous_gradients": true,
+        "overlap_comm": true,
+        "reduce_bucket_size": "auto",
+        "stage3_prefetch_bucket_size": "auto",
+        "stage3_param_persistence_threshold": "auto",
+        "stage3_gather_16bit_weights_on_model_save": true,
+        "offload_optimizer": {
+            "device": "cpu",
+            "pin_memory": true
+        },
+        "offload_param": {
+            "device": "cpu",
+            "pin_memory": true
+        }
+    },
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": 1.0,
+    "wall_clock_breakdown": false,
+    "flops_profiler": {
+        "enabled": false,
+        "profile_step": 1,
+        "module_depth": -1,
+        "top_modules": 1,
+        "detailed": true,
+        "output_file": null
+    }
+}
\ No newline at end of file
--- a/finetune/finetune.py
+++ b/finetune/finetune.py
+# -*- coding: utf-8 -*-
+import json
+from typing import Dict, Optional
+from dataclasses import dataclass, field
+import torch
+from torch.utils.data import Dataset
+import transformers
+from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification, TrainingArguments, Trainer
+@dataclass
+class ModelArguments:
+    """Command-line arguments that select the pretrained model to fine-tune."""
+    # Model identifier or local directory; the script forwards it to `from_pretrained`.
+    model_name_or_path: Optional[str] = "baichuan-inc/Baichuan2-7B-Base"
+@dataclass
+class DataArguments:
+    """Command-line arguments locating the training and evaluation data files."""
+    train_data_path: str = field(
+        default="data/AdvertiseGenChatML/train.json",
+        metadata={"help": "Path to the training data."},
+    )
+    eval_data_path: str = field(
+        default="data/AdvertiseGenChatML/dev.json",
+        # This file backs the Trainer's `eval_dataset`, so the help text says
+        # "evaluation" rather than "test".
+        metadata={"help": "Path to the evaluation data."},
+    )
+@dataclass
+class TrainingArguments(transformers.TrainingArguments):
+    """`transformers.TrainingArguments` extended with the extra options this script reads."""
+    # Declared for the command line; not read anywhere in the visible part of this script.
+    cache_dir: Optional[str] = None
+    # Redeclares the inherited `optim` field with this script's default.
+    optim: str = "adamw_torch"
+    # Length every tokenized example is padded or truncated to by SupervisedDataset.
+    model_max_length: int = field(
+        metadata={
+            "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
+        },
+        default=512,
+    )
+    # When set, the loaded model is wrapped with LoRA adapters.
+    use_lora: bool = False
+class SupervisedDataset(Dataset):
+    """Dataset that tokenizes chat-style examples for supervised fine-tuning.
+
+    The data file is a JSON list; each element has a ``"messages"`` list whose
+    entries carry a ``"role"`` and a ``"content"`` string. Every example is
+    converted on access into fixed-length ``input_ids``, ``label_ids`` and
+    ``attention_mask`` tensors of length ``model_max_length``.
+    """
+    def __init__(
+        self,
+        data_path,
+        tokenizer,
+        model_max_length=4096,
+        user_tokens=(1786, 4194, 95388),
+        assistant_tokens=(1786, 10850, 95388),
+    ):
+        """Load the JSON file and print a decoded preview of the first example.
+
+        Args:
+            data_path: Path to the JSON file holding the list of examples.
+            tokenizer: Object providing ``bos_token_id``, ``eos_token_id``,
+                ``encode(text, add_special_tokens=False)`` and ``decode(ids)``.
+            model_max_length: Length every sequence is truncated/padded to.
+            user_tokens: Token ids inserted before each user message.
+            assistant_tokens: Token ids inserted before each non-user message.
+        """
+        super(SupervisedDataset, self).__init__()
+        # Close the file deterministically instead of leaking the handle.
+        with open(data_path) as data_file:
+            self.data = json.load(data_file)
+        self.tokenizer = tokenizer
+        self.model_max_length = model_max_length
+        # Private list copies: the defaults are immutable tuples and a
+        # caller-owned list is never shared with (or mutated through) the dataset.
+        self.user_tokens = list(user_tokens)
+        self.assistant_tokens = list(assistant_tokens)
+        self.ignore_index = -100
+        # Sanity-check preview; skipped for an empty file so construction
+        # does not fail with IndexError.
+        if self.data:
+            item = self.preprocessing(self.data[0])
+            print("input:", self.tokenizer.decode(item["input_ids"]))
+            labels = [id_ for id_ in item["label_ids"] if id_ != self.ignore_index]
+            print("label:", self.tokenizer.decode(labels))
+    def __len__(self):
+        """Return the number of examples in the loaded file."""
+        return len(self.data)
+    def preprocessing(self, example):
+        """Tokenize one example into fixed-length tensors.
+
+        Returns a dict with ``input_ids`` and ``label_ids`` (LongTensors of
+        length ``model_max_length``) and a boolean ``attention_mask``.
+        """
+        input_ids = [self.tokenizer.bos_token_id]
+        # label_ids gets no entry for BOS, so before padding it is one element
+        # shorter than input_ids.
+        # NOTE(review): this looks like a deliberate one-position shift of the
+        # labels relative to the inputs - confirm against how the model computes its loss.
+        label_ids = []
+        for message in example["messages"]:
+            role = message["role"]
+            content = message["content"]
+            content_ids = self.tokenizer.encode(content, add_special_tokens=False)
+            if role == "user":
+                # User turns are input only: marker and content are ignored by the loss.
+                input_ids += self.user_tokens + content_ids
+                label_ids += [self.ignore_index] * (
+                    len(self.user_tokens) + len(content_ids)
+                )
+            else:
+                # Any other role is treated as the assistant: only its content is a target.
+                input_ids += self.assistant_tokens + content_ids
+                label_ids += (
+                    [self.ignore_index] * len(self.assistant_tokens)
+                    + content_ids
+                )
+        input_ids = input_ids[: self.model_max_length]
+        label_ids = label_ids[: self.model_max_length]
+        # Right-pad: inputs with EOS, labels with the ignore index.
+        input_ids += [self.tokenizer.eos_token_id] * (
+            self.model_max_length - len(input_ids)
+        )
+        label_ids += [self.ignore_index] * (self.model_max_length - len(label_ids))
+        input_ids = torch.LongTensor(input_ids)
+        label_ids = torch.LongTensor(label_ids)
+        # Every EOS position is masked out, which covers the padding added above
+        # (and any EOS id that occurs inside the content).
+        attention_mask = input_ids.ne(self.tokenizer.eos_token_id)
+        return {
+            "input_ids": input_ids,
+            "label_ids": label_ids,
+            "attention_mask": attention_mask,
+        }
+    def __getitem__(self, idx) -> Dict[str, torch.Tensor]:
+        """Return the tokenized tensors for the example at ``idx``."""
+        return self.preprocessing(self.data[idx])
+def load_model_and_tokenizer(
+    model_path: str,
+    max_length: int = 4096,
+    use_lora: bool = True,
+    bf16: bool = False,
+    fp16: bool = False,
+):
+    """Load the tokenizer and model from ``model_path``, optionally adding LoRA adapters.
+
+    Args:
+        model_path: Identifier or directory passed to ``from_pretrained``.
+        max_length: Accepted for interface compatibility; not used in this function.
+        use_lora: Wrap the model with a LoRA adapter on ``q_proj`` and ``v_proj``.
+        bf16: Load the weights as ``torch.bfloat16``.
+        fp16: Load the weights as ``torch.float16``.
+
+    Returns:
+        A ``(model, tokenizer)`` tuple.
+
+    Raises:
+        AssertionError: If both ``bf16`` and ``fp16`` are set.
+    """
+    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    # Reuse the EOS token as the padding token.
+    tokenizer.pad_token = tokenizer.eos_token
+    assert not (bf16 and fp16), "bf16 or fp16, not both"
+    # Neither flag set means full precision.
+    dtype = torch.bfloat16 if bf16 else (torch.float16 if fp16 else torch.float32)
+    # NOTE(review): the model is a sequence-classification model while the LoRA
+    # config below declares TaskType.CAUSAL_LM - confirm this pairing is intended.
+    model = AutoModelForSequenceClassification.from_pretrained(
+        model_path, torch_dtype=dtype, trust_remote_code=True
+    )
+    if not use_lora:
+        return model, tokenizer
+    # peft is imported lazily so it is only needed when LoRA is requested.
+    from peft import LoraConfig, TaskType, get_peft_model
+    adapter_config = LoraConfig(
+        task_type=TaskType.CAUSAL_LM,
+        target_modules=["q_proj", "v_proj"],
+        r=8,
+        lora_alpha=32,
+        lora_dropout=0.1,
+        init_lora_weights="gaussian",
+        inference_mode=False,
+    )
+    model = get_peft_model(model, adapter_config)
+    # Example of the summary printed below, as recorded by the original author:
+    # trainable params: 2,949,120 || all params: 3,010,652,928 || trainable%: 0.09795616002669305
+    model.print_trainable_parameters()
+    return model, tokenizer
+if __name__ == "__main__":
+    # Entry point: parse the three argument groups, build model/tokenizer and
+    # datasets, then run the Hugging Face Trainer.
+    parser = transformers.HfArgumentParser(
+        (ModelArguments, DataArguments, TrainingArguments)
+    )
+    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+    model, tokenizer = load_model_and_tokenizer(
+        model_path=model_args.model_name_or_path,
+        max_length=training_args.model_max_length,
+        use_lora=training_args.use_lora,
+        # Forward the precision flags; without them the loader always falls
+        # back to float32 regardless of --bf16/--fp16.
+        bf16=training_args.bf16,
+        fp16=training_args.fp16,
+    )
+    # Both splits share the tokenizer and the maximum sequence length.
+    dataset_kwargs = dict(
+        tokenizer=tokenizer,
+        model_max_length=training_args.model_max_length,
+    )
+    train_dataset = SupervisedDataset(
+        data_path=data_args.train_data_path, **dataset_kwargs
+    )
+    eval_dataset = SupervisedDataset(
+        data_path=data_args.eval_data_path, **dataset_kwargs
+    )
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+        tokenizer=tokenizer,
+    )
+    trainer.train()
+    # To save only the incremental PEFT weights, see https://huggingface.co/blog/peft
+    # model.save_pretrained("output_dir")
--- a/finetune/lora_finetune.ipynb
+++ b/finetune/lora_finetune.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# MiniCPM-2B 参数高效微调（LoRA）消费级单卡示例\n",
+    "\n",
+    "本 notebook 是一个使用 `AdvertiseGen` 数据集对 MiniCPM-2B 进行 LoRA 微调，使其具备专业的广告生成能力的代码示例。\n",
+    "\n",
+    "## 硬件需求\n",
+    "- 显存：12GB\n",
+    "- 显卡架构：安培架构（推荐）\n",
+    "- 内存：16GB"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. 准备数据集\n",
+    "\n",
+    "下载 AdvertiseGen 数据集\n",
+    "- [Google Drive](https://drive.google.com/file/d/13_vf0xRTQsyneRKdD1bZIr93vBGOczrk/view?usp=sharing)\n",
+    "- [Tsinghua Cloud](https://cloud.tsinghua.edu.cn/f/b3f119a008264b1cabd1/?dl=1)\n",
+    "\n",
+    "下载后的数据集格式为 `.tar.gz` 的压缩格式，接下来的操作中，假设该压缩包被置于 `finetune/data/`。\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 校验文件完整性\n",
+    "!md5sum data/AdvertiseGen.tar.gz "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 解压数据集\n",
+    "!tar xvf data/AdvertiseGen.tar.gz "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 转换为 ChatML 格式\n",
+    "import os\n",
+    "import shutil\n",
+    "import json\n",
+    "\n",
+    "input_dir = \"data/AdvertiseGen\"\n",
+    "output_dir = \"data/AdvertiseGenChatML\"\n",
+    "if os.path.exists(output_dir):\n",
+    "    shutil.rmtree(output_dir)\n",
+    "os.makedirs(output_dir, exist_ok=True)\n",
+    "\n",
+    "for fn in [\"train.json\", \"dev.json\"]:\n",
+    "    data_out_list = []\n",
+    "    with open(os.path.join(input_dir, fn), \"r\") as f, open(os.path.join(output_dir, fn), \"w\") as fo:\n",
+    "        for line in f:\n",
+    "            if len(line.strip()) > 0:\n",
+    "                data = json.loads(line)\n",
+    "                data_out = {\n",
+    "                    \"messages\": [\n",
+    "                        {\n",
+    "                            \"role\": \"user\",\n",
+    "                            \"content\": data[\"content\"],\n",
+    "                        },\n",
+    "                        {\n",
+    "                            \"role\": \"assistant\",\n",
+    "                            \"content\": data[\"summary\"],\n",
+    "                        },\n",
+    "                    ]\n",
+    "                }\n",
+    "                data_out_list.append(data_out)\n",
+    "        json.dump(data_out_list, fo, ensure_ascii=False, indent=4)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. 使用 LoRA 进行微调\n",
+    "\n",
+    "命令行一键运行"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!bash lora_finetune_ds.sh"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/finetune/lora_finetune.sh
+++ b/finetune/lora_finetune.sh
+# LoRA fine-tuning on a single GPU (localhost:0) through DeepSpeed.
+# The script and config paths are resolved against the current directory,
+# so run this from the directory that contains finetune/.
+# Expansions are quoted so a working directory containing spaces still works.
+formatted_time=$(date +"%Y%m%d%H%M%S")
+echo "$formatted_time"
+deepspeed --include localhost:0 "$(pwd)/finetune/finetune.py" \
+    --model_name_or_path checkpoint/miniCPM-bf16/ \
+    --output_dir "output/AdvertiseGenLoRA/$formatted_time/" \
+    --train_data_path data/AdvertiseGenChatML/train.json \
+    --eval_data_path data/AdvertiseGenChatML/dev.json \
+    --learning_rate 1e-3 --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 --fp16 --use_lora \
+    --gradient_accumulation_steps 1 --warmup_steps 100 \
+    --max_steps 3000 --weight_decay 0.01 \
+    --evaluation_strategy steps --eval_steps 500 \
+    --save_strategy steps --save_steps 500 --seed 42 \
+    --log_level info --logging_strategy steps --logging_steps 10 \
+    --deepspeed "$(pwd)/finetune/configs/ds_config_zero3_offload.json"
--- a/finetune/requirements.txt
+++ b/finetune/requirements.txt
+# for finetune
+jieba>=0.42.1
+ruamel_yaml>=0.18.5
+rouge_chinese>=1.0.3
+jupyter>=1.0.0
+datasets>=2.16.1
+peft>=0.7.1
+transformers==4.37.2
+# deepspeed>=0.13.1
+# flash_attn>=2.5.1
--- a/finetune/sft_finetune.sh
+++ b/finetune/sft_finetune.sh
+# Full-parameter supervised fine-tuning on four GPUs (localhost:0-3) through DeepSpeed.
+# The script and config paths are resolved against the current directory,
+# so run this from the directory that contains finetune/.
+# Expansions are quoted so a working directory containing spaces still works.
+formatted_time=$(date +"%Y%m%d%H%M%S")
+echo "$formatted_time"
+deepspeed --include localhost:0,1,2,3 "$(pwd)/finetune/finetune.py" \
+    --model_name_or_path checkpoint/miniCPM-bf16/ \
+    --output_dir "output/AdvertiseGenSFT/$formatted_time/" \
+    --train_data_path data/AdvertiseGenChatML/train.json \
+    --eval_data_path data/AdvertiseGenChatML/dev.json \
+    --learning_rate 1e-3 --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 4 --bf16 \
+    --gradient_accumulation_steps 8 --warmup_steps 100 \
+    --max_steps 3000 --weight_decay 0.01 \
+    --evaluation_strategy steps --eval_steps 500 \
+    --save_strategy steps --save_steps 500 --seed 42 \
+    --log_level info --logging_strategy steps --logging_steps 10 \
+    --deepspeed "$(pwd)/finetune/configs/ds_config_zero3_offload.json"
--- a/infer.py
+++ b/infer.py
+from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer
+import torch
+import torch.nn.functional as F
+import time
+# Classify one piece of text with a sequence-classification checkpoint and
+# print the predicted class index.
+torch.manual_seed(0)
+# To run a fine-tuned checkpoint instead, point `path` at e.g.
+# "output/AdvertiseGenLoRA/<timestamp>/checkpoint-3000".
+num_labels = 2  # number of classes; set to n for n-way classification
+content = "简约而不简单的牛仔外套，白色的衣身十分百搭。衣身多处有做旧破洞设计，打破单调乏味，增加一丝造型看点。衣身后背处有趣味刺绣装饰，丰富层次感，彰显别样时尚。"
+path = 'checkpoint/miniCPM-bf16'
+tokenizer = AutoTokenizer.from_pretrained(path)
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = AutoModelForSequenceClassification.from_pretrained(path, torch_dtype=torch.bfloat16, num_labels=num_labels, device_map=device, trust_remote_code=True)
+# Build the prompt the same way finetune.py does: BOS + user marker + content,
+# truncated and then right-padded with EOS to a fixed length.
+user_tokens = [1786, 4194, 95388]
+model_max_length = 4096
+content_ids = tokenizer.encode(content, add_special_tokens=False)
+token_ids = ([tokenizer.bos_token_id] + user_tokens + content_ids)[:model_max_length]
+token_ids += [tokenizer.eos_token_id] * (model_max_length - len(token_ids))
+# Shape (1, model_max_length): a batch holding the single sequence.
+input_ids = torch.LongTensor(token_ids).unsqueeze(0).to(device)
+# input_ids already carries the batch dimension, so the mask must not be
+# unsqueezed again; it has to stay (batch, seq_len) to line up with input_ids.
+attention_mask = input_ids.ne(tokenizer.eos_token_id)
+with torch.no_grad():
+    output = model(
+        input_ids,
+        attention_mask=attention_mask,
+        position_ids=None,
+        past_key_values=None,
+        inputs_embeds=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=True,
+    )
+# output[0] holds the classification logits; softmax does not change the
+# argmax but keeps the intermediate values interpretable as probabilities.
+probs = F.softmax(output[0].view(-1, num_labels), dim=1)
+predicted = probs.argmax(1)
+print(predicted)
--- a/inference/README.md
+++ b/inference/README.md
+# VLLM 推理 MiniCPM | MiniCPM inference on VLLM
+### 中文
+* 安装支持 MiniCPM 的 vLLM
+  - 因为 MiniCPM 采用 MUP 结构，在矩阵乘法中存在一定的放缩计算，与Llama类模型结构有细微差别。
+  - 我们基于版本为 0.2.2 的 vLLM 实现了 MiniCPM 的推理，代码位于仓库[inference](https://github.com/OpenBMB/MiniCPM/tree/main/inference)文件夹下，未来将会支持更新的vLLM 版本。
+* 安装支持 MiniCPM 的 vLLM 版本
+```shell
+pip install inference/vllm
+```
+* 将Huggingface Transformers仓库转为vLLM-MiniCPM支持的格式，其中`<hf_repo_path>`, `<vllmcpm_repo_path>`均为本地路径
+```shell
+python inference/convert_hf_to_vllmcpm.py --load <hf_repo_path> --save <vllmcpm_repo_path>
+```
+* 测试样例
+```shell
+cd inference/vllm/examples/infer_cpm
+python inference.py --model_path <vllmcpm_repo_path> --prompt_path prompts/prompt_demo.txt
+```
+* 期望输出
+```shell
+<用户>: Which city is the capital of China?
+<AI>:
+ The capital city of China is Beijing. Beijing is a major political, cultural, and economic center in China, and it is known for its rich history, beautiful architecture, and vibrant nightlife. It is also home to many of China's most important cultural and historical sites, including the Forbidden City, the Great Wall of China, and the Temple of Heaven. Beijing is a popular destination for tourists from around the world, and it is an important hub for international business and trade.
+```
+### English
+* Install vLLM which supports MiniCPM
+  - MiniCPM's architecture is not exactly the same as Llama's: MiniCPM uses the MUP structure, which applies scaling in its matrix multiplications.
+  - We implemented MiniCPM inference on top of vLLM 0.2.2; the code is located in the [inference](https://github.com/OpenBMB/MiniCPM/tree/main/inference) folder of the repository. Newer vLLM versions will be supported in the future.
+* Install vLLM which supports MiniCPM
+```shell
+pip install inference/vllm
+```
+* Convert the Hugging Face Transformers checkpoint to the format supported by vLLM-MiniCPM, where `<hf_repo_path>` and `<vllmcpm_repo_path>` are both local paths
+```shell
+python inference/convert_hf_to_vllmcpm.py --load <hf_repo_path> --save <vllmcpm_repo_path>
+```
+* Test cases
+```shell
+cd inference/vllm/examples/infer_cpm
+python inference.py --model_path <vllmcpm_repo_path> --prompt_path prompts/prompt_demo.txt
+```
+* Expected Output
+```shell
+<用户>: Which city is the capital of China?
+<AI>:
+ The capital city of China is Beijing. Beijing is a major political, cultural, and economic center in China, and it is known for its rich history, beautiful architecture, and vibrant nightlife. It is also home to many of China's most important cultural and historical sites, including the Forbidden City, the Great Wall of China, and the Temple of Heaven. Beijing is a popular destination for tourists from around the world, and it is an important hub for international business and trade.
+```
\ No newline at end of file
--- a/inference/convert_hf_to_vllmcpm.py
+++ b/inference/convert_hf_to_vllmcpm.py
+import argparse
+import json
+import os
+import shutil
+from tqdm import tqdm
+from collections import OrderedDict
+import torch
+def convert_model(config, ckpt):
+    """Translate a Hugging Face config/state dict into the vLLM-MiniCPM layout.
+
+    Args:
+        config: Parsed Hugging Face ``config.json`` (a dict).
+        ckpt: Mapping from Hugging Face parameter names to tensors.
+
+    Returns:
+        ``(config_bmt, model_bmt)``: the converted config and an ordered mapping
+        of renamed parameters, each made contiguous.
+    """
+    # Values not read from `config` are fixed constants of the target format.
+    config_bmt = OrderedDict(
+        {
+            "_dtype": "bf16",
+            "activate_fn": "silu",
+            "architectures": [
+                "CPMDragonflyForCausalLM"
+            ],
+            "model_type": "cpm_dragonfly",
+            "base": 10000,
+            "dim_ff": config['intermediate_size'],
+            "dim_head": config['hidden_size'] // config['num_attention_heads'],
+            "dim_model": config['hidden_size'],
+            "dim_model_base": 256,
+            "dropout_p": 0.0,
+            "eps": config['rms_norm_eps'],
+            "init_std": config['initializer_range'],
+            "num_heads": config['num_attention_heads'],
+            "num_kv_heads": config['num_key_value_heads'],
+            "num_layers": config['num_hidden_layers'],
+            "orig_max_length": 4096,
+            "pose_prob": 0.0,
+            "pose_scaling_factor": 1.0,
+            "qk_norm": False,
+            "rope_scaling_factor": 1,
+            "rope_scaling_type": "",
+            "scale": True,
+            "scale_depth": config['scale_depth'],
+            "scale_emb": config['scale_emb'],
+            "tie_lm_head": True,
+            "tp": 0,
+            "transformers_version": "4.35.0",
+            "vocab_size": config['vocab_size']
+        }
+    )
+    # (target suffix, source suffix) for every per-layer parameter, in output order.
+    layer_key_map = (
+        ("self_att.layernorm_before_attention.weight", "input_layernorm.weight"),
+        ("self_att.self_attention.project_q.weight", "self_attn.q_proj.weight"),
+        ("self_att.self_attention.project_k.weight", "self_attn.k_proj.weight"),
+        ("self_att.self_attention.project_v.weight", "self_attn.v_proj.weight"),
+        ("self_att.self_attention.attention_out.weight", "self_attn.o_proj.weight"),
+        ("ffn.layernorm_before_ffn.weight", "post_attention_layernorm.weight"),
+        ("ffn.ffn.w_in.w_0.weight", "mlp.gate_proj.weight"),
+        ("ffn.ffn.w_in.w_1.weight", "mlp.up_proj.weight"),
+        ("ffn.ffn.w_out.weight", "mlp.down_proj.weight"),
+    )
+    model_bmt = OrderedDict()
+    # Parameters that exist once per model come first.
+    model_bmt["input_embedding.weight"] = ckpt['model.embed_tokens.weight'].contiguous()
+    model_bmt["encoder.output_layernorm.weight"] = ckpt['model.norm.weight'].contiguous()
+    for layer_idx in tqdm(range(config_bmt['num_layers'])):
+        source_prefix = f"model.layers.{layer_idx}"
+        target_prefix = f"encoder.layers.{layer_idx}"
+        for target_suffix, source_suffix in layer_key_map:
+            tensor = ckpt[f"{source_prefix}.{source_suffix}"]
+            model_bmt[f"{target_prefix}.{target_suffix}"] = tensor.contiguous()
+    return config_bmt, model_bmt
+def load_model_ckpt(args):
+    """Convert the Hugging Face checkpoint in ``args.load`` and write it to ``args.save``.
+
+    Reads ``config.json`` and ``pytorch_model.bin`` from ``args.load``, writes the
+    converted ``config.json`` and ``pytorch_model.pt`` to ``args.save`` (created
+    if missing), and copies the tokenizer files across unchanged.
+
+    Args:
+        args: Namespace with ``load`` (source directory) and ``save``
+            (destination directory) attributes.
+    """
+    with open(os.path.join(args.load, "config.json"), 'r') as fin:
+        config = json.load(fin)
+    # Load onto the CPU so the conversion needs no GPU, even if the tensors
+    # were saved from one.
+    ckpt = torch.load(os.path.join(args.load, "pytorch_model.bin"), map_location="cpu")
+    os.makedirs(args.save, exist_ok=True)
+    # Converted model and config.
+    config_bmt, model_bmt = convert_model(config, ckpt)
+    with open(os.path.join(args.save, "config.json"), 'w') as fout:
+        json.dump(config_bmt, fout, indent=4)
+    torch.save(model_bmt, os.path.join(args.save, "pytorch_model.pt"))
+    # Tokenizer files. os.path.join is used throughout: a hand-built
+    # f"{dir}/name" path turns into a root-level path when dir is "".
+    tokenizer_files = (
+        "tokenizer.json",
+        "tokenizer.model",
+        "special_tokens_map.json",
+        "tokenizer_config.json",
+    )
+    for file_name in tokenizer_files:
+        shutil.copyfile(
+            os.path.join(args.load, file_name),
+            os.path.join(args.save, file_name),
+        )
+if __name__ == "__main__":
+    # Both directories are mandatory: with the former "" defaults the script
+    # started anyway and failed later with confusing path errors.
+    parser = argparse.ArgumentParser(
+        description="Convert a Hugging Face MiniCPM checkpoint to the vLLM-MiniCPM format."
+    )
+    parser.add_argument("--load", type=str, required=True,
+                        help="Directory holding the Hugging Face checkpoint to convert.")
+    parser.add_argument("--save", type=str, required=True,
+                        help="Directory the converted checkpoint is written to.")
+    args = parser.parse_args()
+    load_model_ckpt(args)
\ No newline at end of file
--- a/inference/vllm/.pylintrc
+++ b/inference/vllm/.pylintrc
+# This Pylint rcfile contains a best-effort configuration to uphold the
+# best-practices and style described in the Google Python style guide:
+#   https://google.github.io/styleguide/pyguide.html
+#
+# Its canonical open-source location is:
+#   https://google.github.io/styleguide/pylintrc
+[MASTER]
+# Files or directories to be skipped. They should be base names, not paths.
+ignore=docs
+# Files or directories matching the regex patterns are skipped. The regex
+# matches against base names, not paths.
+ignore-patterns=
+# Pickle collected data for later comparisons.
+persistent=no
+# List of plugins (as comma separated values of python modules names) to load,
+# usually to register additional checkers.
+load-plugins=
+# Use multiple processes to speed up Pylint.
+jobs=4
+# Allow loading of arbitrary C extensions. Extensions are imported into the
+# active Python interpreter and may run arbitrary code.
+unsafe-load-any-extension=no
+[MESSAGES CONTROL]
+# Only show warnings with the listed confidence levels. Leave empty to show
+# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
+confidence=
+# Enable the message, report, category or checker with the given id(s). You can
+# either give multiple identifier separated by comma (,) or put this option
+# multiple time (only on the command line, not in the configuration file where
+# it should appear only once). See also the "--disable" option for examples.
+#enable=
+# Disable the message, report, category or checker with the given id(s). You
+# can either give multiple identifiers separated by comma (,) or put this
+# option multiple times (only on the command line, not in the configuration
+# file where it should appear only once). You can also use "--disable=all" to
+# disable everything first and then reenable specific checks. For example, if
+# you want to run only the similarities checker, you can use "--disable=all
+# --enable=similarities". If you want to run only the classes checker, but have
+# no Warning level messages displayed, use "--disable=all --enable=classes
+# --disable=W"
+disable=abstract-method,
+        apply-builtin,
+        arguments-differ,
+        attribute-defined-outside-init,
+        backtick,
+        bad-option-value,
+        basestring-builtin,
+        buffer-builtin,
+        c-extension-no-member,
+        consider-using-enumerate,
+        cmp-builtin,
+        cmp-method,
+        coerce-builtin,
+        coerce-method,
+        delslice-method,
+        div-method,
+        duplicate-code,
+        eq-without-hash,
+        execfile-builtin,
+        file-builtin,
+        filter-builtin-not-iterating,
+        fixme,
+        getslice-method,
+        global-statement,
+        hex-method,
+        idiv-method,
+        implicit-str-concat-in-sequence,
+        import-error,
+        import-self,
+        import-star-module-level,
+        inconsistent-return-statements,
+        input-builtin,
+        intern-builtin,
+        invalid-str-codec,
+        locally-disabled,
+        logging-fstring-interpolation,  # added by vLLM
+        logging-not-lazy,  # added by vLLM
+        long-builtin,
+        long-suffix,
+        map-builtin-not-iterating,
+        misplaced-comparison-constant,
+        missing-class-docstring,  # TODO (vLLM): enable
+        missing-function-docstring,
+        missing-module-docstring,  # TODO (vLLM): enable
+        metaclass-assignment,
+        next-method-called,
+        next-method-defined,
+        no-absolute-import,
+        no-else-break,
+        no-else-continue,
+        no-else-raise,
+        no-else-return,
+        no-init,  # added
+        no-member,
+        no-name-in-module,
+        no-self-use,
+        nonzero-method,
+        oct-method,
+        old-division,
+        old-ne-operator,
+        old-octal-literal,
+        old-raise-syntax,
+        parameter-unpacking,
+        print-statement,
+        raising-string,
+        range-builtin-not-iterating,
+        raw_input-builtin,
+        rdiv-method,
+        reduce-builtin,
+        relative-import,
+        reload-builtin,
+        round-builtin,
+        setslice-method,
+        signature-differs,
+        standarderror-builtin,
+        suppressed-message,
+        sys-max-int,
+        too-few-public-methods,
+        too-many-ancestors,
+        too-many-arguments,
+        too-many-boolean-expressions,
+        too-many-branches,
+        too-many-instance-attributes,
+        too-many-locals,
+        too-many-nested-blocks,
+        too-many-public-methods,
+        too-many-return-statements,
+        too-many-statements,
+        trailing-newlines,
+        unichr-builtin,
+        unicode-builtin,
+        unnecessary-pass,
+        unpacking-in-except,
+        unspecified-encoding,
+        useless-else-on-loop,
+        useless-object-inheritance,
+        useless-suppression,
+        using-cmp-argument,
+        wrong-import-order,
+        xrange-builtin,
+        zip-builtin-not-iterating,
+[REPORTS]
+# Set the output format. Available formats are text, parseable, colorized, msvs
+# (visual studio) and html. You can also give a reporter class, eg
+# mypackage.mymodule.MyReporterClass.
+output-format=text
+# Tells whether to display a full report or only the messages
+reports=no
+# Python expression which should return a note less than 10 (10 is the highest
+# note). You have access to the variables errors warning, statement which
+# respectively contain the number of errors / warnings messages and the total
+# number of statements analyzed. This is used by the global evaluation report
+# (RP0004).
+evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
+# Template used to display messages. This is a python new-style format string
+# used to format the message information. See doc for all details
+#msg-template=
+[BASIC]
+# Good variable names which should always be accepted, separated by a comma
+good-names=main,_
+# Bad variable names which should always be refused, separated by a comma
+bad-names=
+# Colon-delimited sets of names that determine each other's naming style when
+# the name regexes allow several styles.
+name-group=
+# Include a hint for the correct naming format with invalid-name
+include-naming-hint=no
+# List of decorators that produce properties, such as abc.abstractproperty. Add
+# to this list to register other decorators that produce valid properties.
+property-classes=abc.abstractproperty,cached_property.cached_property,cached_property.threaded_cached_property,cached_property.cached_property_with_ttl,cached_property.threaded_cached_property_with_ttl
+# Regular expression matching correct function names
+function-rgx=^(?:(?P<exempt>setUp|tearDown|setUpModule|tearDownModule)|(?P<camel_case>_?[A-Z][a-zA-Z0-9]*)|(?P<snake_case>_?[a-z][a-z0-9_]*))$
+# Regular expression matching correct variable names
+variable-rgx=^[a-z][a-z0-9_]*$
+# Regular expression matching correct constant names
+const-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$
+# Regular expression matching correct attribute names
+attr-rgx=^_{0,2}[a-z][a-z0-9_]*$
+# Regular expression matching correct argument names
+argument-rgx=^[a-z][a-z0-9_]*$
+# Regular expression matching correct class attribute names
+class-attribute-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$
+# Regular expression matching correct inline iteration names
+inlinevar-rgx=^[a-z][a-z0-9_]*$
+# Regular expression matching correct class names
+class-rgx=^_?[A-Z][a-zA-Z0-9]*$
+# Regular expression matching correct module names
+module-rgx=^(_?[a-z][a-z0-9_]*|__init__)$
+# Regular expression matching correct method names
+method-rgx=(?x)^(?:(?P<exempt>_[a-z0-9_]+__|runTest|setUp|tearDown|setUpTestCase|tearDownTestCase|setupSelf|tearDownClass|setUpClass|(test|assert)_*[A-Z0-9][a-zA-Z0-9_]*|next)|(?P<camel_case>_{0,2}[A-Z][a-zA-Z0-9_]*)|(?P<snake_case>_{0,2}[a-z][a-z0-9_]*))$
+# Regular expression which should only match function or class names that do
+# not require a docstring.
+no-docstring-rgx=(__.*__|main|test.*|.*test|.*Test)$
+# Minimum line length for functions/classes that require docstrings, shorter
+# ones are exempt.
+docstring-min-length=10
+[TYPECHECK]
+# List of decorators that produce context managers, such as
+# contextlib.contextmanager. Add to this list to register other decorators that
+# produce valid context managers.
+contextmanager-decorators=contextlib.contextmanager,contextlib2.contextmanager
+# Tells whether missing members accessed in mixin class should be ignored. A
+# mixin class is detected if its name ends with "mixin" (case insensitive).
+ignore-mixin-members=yes
+# List of module names for which member attributes should not be checked
+# (useful for modules/projects where namespaces are manipulated during runtime
+# and thus existing member attributes cannot be deduced by static analysis). It
+# supports qualified module names, as well as Unix pattern matching.
+ignored-modules=
+# List of class names for which member attributes should not be checked (useful
+# for classes with dynamically set attributes). This supports the use of
+# qualified names.
+ignored-classes=optparse.Values,thread._local,_thread._local
+# List of members which are set dynamically and missed by pylint inference
+# system, and so shouldn't trigger E1101 when accessed. Python regular
+# expressions are accepted.
+generated-members=
+[FORMAT]
+# Maximum number of characters on a single line.
+max-line-length=80
+# TODO(https://github.com/PyCQA/pylint/issues/3352): Direct pylint to exempt
+# lines made too long by directives to pytype.
+# Regexp for a line that is allowed to be longer than the limit.
+ignore-long-lines=(?x)(
+  ^\s*(\#\ )?<?https?://\S+>?$|
+  ^\s*(from\s+\S+\s+)?import\s+.+$)
+# Allow the body of an if to be on the same line as the test if there is no
+# else.
+single-line-if-stmt=yes
+# Maximum number of lines in a module
+max-module-lines=99999
+# String used as indentation unit.  The internal Google style guide mandates 2
+# spaces.  Google's externally-published style guide says 4, consistent with
+# PEP 8.  Here, we use 4 spaces, following PEP 8 and the externally-published
+# guide.
+indent-string='    '
+# Number of spaces of indent required inside a hanging  or continued line.
+indent-after-paren=4
+# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
+expected-line-ending-format=
+[MISCELLANEOUS]
+# List of note tags to take in consideration, separated by a comma.
+notes=TODO
+[STRING]
+# This flag controls whether inconsistent-quotes generates a warning when the
+# character used as a quote delimiter is used inconsistently within a module.
+check-quote-consistency=yes
+[VARIABLES]
+# Tells whether we should check for unused import in __init__ files.
+init-import=no
+# A regular expression matching the name of dummy variables (i.e. expectedly
+# not used).
+dummy-variables-rgx=^\*{0,2}(_$|unused_|dummy_)
+# List of additional names supposed to be defined in builtins. Remember that
+# you should avoid to define new builtins when possible.
+additional-builtins=
+# List of strings which can identify a callback function by name. A callback
+# name must start or end with one of those strings.
+callbacks=cb_,_cb
+# List of qualified module names which can have objects that can redefine
+# builtins.
+redefining-builtins-modules=six,six.moves,past.builtins,future.builtins,functools
+[LOGGING]
+# Logging modules to check that the string format arguments are in logging
+# function parameter format
+logging-modules=logging,absl.logging,tensorflow.io.logging
+[SIMILARITIES]
+# Minimum lines number of a similarity.
+min-similarity-lines=4
+# Ignore comments when computing similarities.
+ignore-comments=yes
+# Ignore docstrings when computing similarities.
+ignore-docstrings=yes
+# Ignore imports when computing similarities.
+ignore-imports=no
+[SPELLING]
+# Spelling dictionary name. Available dictionaries: none. To make it work,
+# install the python-enchant package.
+spelling-dict=
+# List of comma separated words that should not be checked.
+spelling-ignore-words=
+# A path to a file that contains private dictionary; one word per line.
+spelling-private-dict-file=
+# Tells whether to store unknown words to indicated private dictionary in
+# --spelling-private-dict-file option instead of raising a message.
+spelling-store-unknown-words=no
+[IMPORTS]
+# Deprecated modules which should not be used, separated by a comma
+deprecated-modules=regsub,
+                   TERMIOS,
+                   Bastion,
+                   rexec,
+                   sets
+# Create a graph of every (i.e. internal and external) dependencies in the
+# given file (report RP0402 must not be disabled)
+import-graph=
+# Create a graph of external dependencies in the given file (report RP0402 must
+# not be disabled)
+ext-import-graph=
+# Create a graph of internal dependencies in the given file (report RP0402 must
+# not be disabled)
+int-import-graph=
+# Force import order to recognize a module as part of the standard
+# compatibility libraries.
+known-standard-library=
+# Force import order to recognize a module as part of a third party library.
+known-third-party=enchant, absl
+# Analyse import fallback blocks. This can be used to support both Python 2 and
+# 3 compatible code, which means that the block might have code that exists
+# only in one or another interpreter, leading to false positives when analysed.
+analyse-fallback-blocks=no
+[CLASSES]
+# List of method names used to declare (i.e. assign) instance attributes.
+defining-attr-methods=__init__,
+                      __new__,
+                      setUp
+# List of member names, which should be excluded from the protected access
+# warning.
+exclude-protected=_asdict,
+                  _fields,
+                  _replace,
+                  _source,
+                  _make
+# List of valid names for the first argument in a class method.
+valid-classmethod-first-arg=cls,
+                            class_
+# List of valid names for the first argument in a metaclass class method.
+valid-metaclass-classmethod-first-arg=mcs
+[EXCEPTIONS]
+# Exceptions that will emit a warning when being caught. Defaults to
+# "Exception"
+overgeneral-exceptions=StandardError,
+                       Exception,
+                       BaseException
--- a/inference/vllm/.readthedocs.yaml
+++ b/inference/vllm/.readthedocs.yaml
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+version: 2
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.8"
+sphinx:
+   configuration: docs/source/conf.py
+# If using Sphinx, optionally build your docs in additional formats such as PDF
+formats:
+   - pdf
+# Optionally declare the Python requirements required to build your docs
+python:
+   install:
+   - requirements: docs/requirements-docs.txt
--- a/inference/vllm/CONTRIBUTING.md
+++ b/inference/vllm/CONTRIBUTING.md
+# Contributing to vLLM
+Thank you for your interest in contributing to vLLM!
+Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large.
+There are several ways you can contribute to the project:
+- Identify and report any issues or bugs.
+- Request or add a new model.
+- Suggest or implement new features.
+However, remember that contributions aren't just about code.
+We believe in the power of community support; thus, answering queries, assisting others, and enhancing the documentation are highly regarded and beneficial contributions.
+Finally, one of the most impactful ways to support us is by raising awareness about vLLM.
+Talk about it in your blog posts, highlighting how it's driving your incredible projects.
+Express your support on Twitter if vLLM aids you, or simply offer your appreciation by starring our repository.
+## Setup for development
+### Build from source
+```bash
+pip install -r requirements.txt
+pip install -e .  # This may take several minutes.
+```
+### Testing
+```bash
+pip install -r requirements-dev.txt
+# Static type checking
+mypy
+# Unit tests
+pytest tests/
+```
+**Note:** Currently, the repository does not pass the mypy tests.
+## Contributing Guidelines
+### Issue Reporting
+If you encounter a bug or have a feature request, please check our issues page first to see if someone else has already reported it.
+If not, please file a new issue, providing as much relevant information as possible.
+### Coding Style Guide
+In general, we adhere to the [Google Python style guide](https://google.github.io/styleguide/pyguide.html) and the [Google C++ style guide](https://google.github.io/styleguide/cppguide.html).
+We include a formatting script [`format.sh`](./format.sh) to format the code.
+### Pull Requests
+When submitting a pull request:
+1. Make sure your code has been rebased on top of the latest commit on the main branch.
+2. Ensure code is properly formatted by running [`format.sh`](./format.sh).
+3. Include a detailed description of the changes in the pull request.
+Explain why you made the changes you did.
+If your pull request fixes an open issue, please include a reference to it in the description.
+### Code Reviews
+All submissions, including submissions by project members, require a code review.
+To make the review process as smooth as possible, please:
+1. Keep your changes as concise as possible.
+If your pull request involves multiple unrelated changes, consider splitting it into separate pull requests.
+2. Respond to all comments within a reasonable time frame.
+If a comment isn't clear or you disagree with a suggestion, feel free to ask for clarification or discuss the suggestion.
+### Thank You
+Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM.
+Your contributions make vLLM a great tool for everyone!
--- a/inference/vllm/Dockerfile
+++ b/inference/vllm/Dockerfile
+# Multi-stage build. Stages defined below:
+#   dev         - CUDA devel image with requirements.txt and requirements-dev.txt installed
+#   build       - compiles the extensions in place (setup.py build_ext --inplace)
+#   test        - runs pytest on tests/ against the compiled extensions
+#   vllm-base   - CUDA base image with requirements.txt installed
+#   vllm        - entrypoint vllm.entrypoints.api_server, exposes port 8000
+#   vllm-openai - entrypoint vllm.entrypoints.openai.api_server
+FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev
+RUN apt-get update -y \
+    && apt-get install -y python3-pip
+WORKDIR /workspace
+# install build and runtime dependencies
+COPY requirements.txt requirements.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -r requirements.txt
+# install development dependencies
+COPY requirements-dev.txt requirements-dev.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -r requirements-dev.txt
+# image to build pytorch extensions
+FROM dev AS build
+# copy input files
+COPY csrc csrc
+COPY setup.py setup.py
+COPY requirements.txt requirements.txt
+COPY pyproject.toml pyproject.toml
+COPY vllm/__init__.py vllm/__init__.py
+# max jobs used by Ninja to build extensions; set it with
+# `docker build --build-arg max_jobs=<n>`. The ARG has to be declared in this
+# stage: without it $max_jobs always expands to an empty string and the
+# build argument is ignored. No default is given, so an unparameterized build
+# behaves exactly as before (MAX_JOBS is empty).
+ARG max_jobs
+ENV MAX_JOBS=$max_jobs
+RUN python3 setup.py build_ext --inplace
+# image to run unit testing suite
+FROM dev AS test
+# copy pytorch extensions separately to avoid having to rebuild
+# when python code changes
+COPY --from=build /workspace/vllm/*.so /workspace/vllm/
+COPY tests tests
+COPY vllm vllm
+ENTRYPOINT ["python3", "-m", "pytest", "tests"]
+# use CUDA base as CUDA runtime dependencies are already installed via pip
+FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base
+# libnccl required for ray
+# NOTE(review): the step below only installs python3-pip; confirm whether
+# libnccl is expected to come from elsewhere (e.g. a pip dependency).
+RUN apt-get update -y \
+    && apt-get install -y python3-pip
+WORKDIR /workspace
+COPY requirements.txt requirements.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -r requirements.txt
+FROM vllm-base AS vllm
+COPY --from=build /workspace/vllm/*.so /workspace/vllm/
+COPY vllm vllm
+EXPOSE 8000
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.api_server"]
+# openai api server alternative
+FROM vllm-base AS vllm-openai
+# install additional dependencies for openai api server
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install accelerate fschat
+COPY --from=build /workspace/vllm/*.so /workspace/vllm/
+COPY vllm vllm
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
--- a/inference/vllm/LICENSE
+++ b/inference/vllm/LICENSE
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/inference/vllm/MANIFEST.in
+++ b/inference/vllm/MANIFEST.in
+include LICENSE
+include requirements.txt
+recursive-include csrc *
--- a/inference/vllm/README.md
+++ b/inference/vllm/README.md
+<p align="center">
+  <picture>
+    <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-dark.png">
+    <img alt="vLLM" src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-light.png" width=55%>
+  </picture>
+</p>
+<h3 align="center">
+Easy, fast, and cheap LLM serving for everyone
+</h3>
+<p align="center">
+| <a href="https://vllm.readthedocs.io/en/latest/"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> |
+</p>
+---
+*Latest News* 🔥
+- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
+- [2023/09] We created our [Discord server](https://discord.gg/jz7wjKhh6g)! Join us to discuss vLLM and LLM serving! We will also post the latest announcements and updates there.
+- [2023/09] We released our [PagedAttention paper](https://arxiv.org/abs/2309.06180) on arXiv!
+- [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
+- [2023/07] Added support for LLaMA-2! You can run and serve 7B/13B/70B LLaMA-2s on vLLM with a single command!
+- [2023/06] Serving vLLM On any Cloud with SkyPilot. Check out a 1-click [example](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm) to start the vLLM demo, and the [blog post](https://blog.skypilot.co/serving-llm-24x-faster-on-the-cloud-with-vllm-and-skypilot/) for the story behind vLLM development on the clouds.
+- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
+---
+vLLM is a fast and easy-to-use library for LLM inference and serving.
+vLLM is fast with:
+- State-of-the-art serving throughput
+- Efficient management of attention key and value memory with **PagedAttention**
+- Continuous batching of incoming requests
+- Optimized CUDA kernels
+vLLM is flexible and easy to use with:
+- Seamless integration with popular Hugging Face models
+- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
+- Tensor parallelism support for distributed inference
+- Streaming outputs
+- OpenAI-compatible API server
+vLLM seamlessly supports many Hugging Face models, including the following architectures:
+- Aquila & Aquila2 (`BAAI/AquilaChat2-7B`, `BAAI/AquilaChat2-34B`, `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc.)
+- Baichuan (`baichuan-inc/Baichuan-7B`, `baichuan-inc/Baichuan-13B-Chat`, etc.)
+- BLOOM (`bigscience/bloom`, `bigscience/bloomz`, etc.)
+- ChatGLM (`THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.)
+- Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.)
+- GPT-2 (`gpt2`, `gpt2-xl`, etc.)
+- GPT BigCode (`bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, etc.)
+- GPT-J (`EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.)
+- GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.)
+- InternLM (`internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.)
+- LLaMA & LLaMA-2 (`meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.)
+- Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.)
+- MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.)
+- OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
+- Phi-1.5 (`microsoft/phi-1_5`, etc.)
+- Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.)
+- Yi (`01-ai/Yi-6B`, `01-ai/Yi-34B`, etc.)
+Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):
+```bash
+pip install vllm
+```
+## Getting Started
+Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to get started.
+- [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html)
+- [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html)
+- [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)
+## Contributing
+We welcome and value any contributions and collaborations.
+Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved.
+## Citation
+If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180):
+```bibtex
+@inproceedings{kwon2023efficient,
+  title={Efficient Memory Management for Large Language Model Serving with PagedAttention},
+  author={Woosuk Kwon and Zhuohan Li and Siyuan Zhuang and Ying Sheng and Lianmin Zheng and Cody Hao Yu and Joseph E. Gonzalez and Hao Zhang and Ion Stoica},
+  booktitle={Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles},
+  year={2023}
+}
+```
--- a/inference/vllm/benchmarks/README.md
+++ b/inference/vllm/benchmarks/README.md
+# Benchmarking vLLM
+## Downloading the ShareGPT dataset
+You can download the dataset by running:
+```bash
+wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+```