v1.0

24eacbc0 · chenzk · 24eacbc0 · 24eacbc0 · 24eacbc0 · 24eacbc0
Commit 24eacbc0 authored May 09, 2024 by chenzk
20 changed files
--- a/inference/vllm/examples/infer_cpm/transform.py
+++ b/inference/vllm/examples/infer_cpm/transform.py
+import json
+import pandas as pd
+
+with open("prompts/cpm-8b-1210_zh_cpm_0129.jsondpo.v2-dpo.jsonl", 'r')  as fin:
+    data = json.load(fin)
+    
+# with open("prompts/cpm-8b-1210_zh_cpm_0129.jsondpo.v2-dpo.xlxs", 'w')  as fout:
+df = pd.DataFrame(data)
+df.to_excel("prompts/cpm-8b-1210_zh_cpm_0129.jsondpo.v2-dpo.xlsx")
--- a/inference/vllm/examples/infer_cpm/vllm_convert_checkpoint_to_hf.py
+++ b/inference/vllm/examples/infer_cpm/vllm_convert_checkpoint_to_hf.py
+import argparse
+import json
+import os
+import shutil
+import sys
+from collections import OrderedDict
+
+import torch
+from transformers import AutoTokenizer
+
+from vllm.transformers_utils.configs import CPMMistralConfig
+
+
+def find_ckpt(directory):
+    files = os.listdir(directory)
+    ckpt = [file for file in files if file.endswith(".pt")]
+    assert len(ckpt) == 1
+    return '/'.join((directory, ckpt[0]))
+
+def convert_model(ckpt, layernum):
+    print("Total number in orgckpt", len(ckpt))
+    model_hf = OrderedDict()
+
+    model_hf['model.embed_tokens.weight'] = ckpt["input_embedding.weight"].contiguous()
+    model_hf['model.norm.weight'] = ckpt["encoder.output_layernorm.weight"].contiguous()
+    try:
+        model_hf['lm_head.weight'] = ckpt['lm_head.weight'].contiguous()
+        print("lm_head found")
+    except:
+        model_hf['lm_head.weight'] = ckpt["input_embedding.weight"].contiguous()
+        print("lm_head not found")
+
+
+    # print(ckpt.keys())
+
+    for lnum in range(layernum):
+        hf_pfx = f"model.layers.{lnum}"
+        bmt_pfx = f"encoder.layers.{lnum}"
+        
+        model_hf[f"{hf_pfx}.input_layernorm.weight"] = ckpt[f"{bmt_pfx}.self_att.layernorm_before_attention.weight"].contiguous()
+
+        model_hf[f"{hf_pfx}.self_attn.q_proj.weight"] = ckpt[f"{bmt_pfx}.self_att.self_attention.project_q.weight"].contiguous()
+        model_hf[f"{hf_pfx}.self_attn.k_proj.weight"] = ckpt[f"{bmt_pfx}.self_att.self_attention.project_k.weight"].contiguous()
+        model_hf[f"{hf_pfx}.self_attn.v_proj.weight"] = ckpt[f"{bmt_pfx}.self_att.self_attention.project_v.weight"].contiguous()
+        model_hf[f"{hf_pfx}.self_attn.o_proj.weight"] = ckpt[f"{bmt_pfx}.self_att.self_attention.attention_out.weight"].contiguous()
+
+        model_hf[f"{hf_pfx}.post_attention_layernorm.weight"] = ckpt[f"{bmt_pfx}.ffn.layernorm_before_ffn.weight"].contiguous()
+
+        model_hf[f"{hf_pfx}.mlp.gate_proj.weight"] = ckpt[f"{bmt_pfx}.ffn.ffn.w_in.w_0.weight"].contiguous()
+        model_hf[f"{hf_pfx}.mlp.up_proj.weight"] = ckpt[f"{bmt_pfx}.ffn.ffn.w_in.w_1.weight"].contiguous()
+
+        model_hf[f"{hf_pfx}.mlp.down_proj.weight"] = ckpt[f"{bmt_pfx}.ffn.ffn.w_out.weight"].contiguous()
+
+        try:
+            model_hf[f"{hf_pfx}.self_attn.q_norm.weight"] = ckpt[f"{bmt_pfx}.self_att.self_attention.q_norm.weight"].contiguous()
+            model_hf[f"{hf_pfx}.self_attn.k_norm.weight"] = ckpt[f"{bmt_pfx}.self_att.self_attention.k_norm.weight"].contiguous()
+        except:
+            print("Error!")
+            pass
+
+    
+    print("Total number in converted", len(model_hf))
+
+    return model_hf
+
+
+
+
+
+
+        
+
+
+def load_model_ckpt(args):
+    ckpt = torch.load(find_ckpt(args.load))
+    # with open(os.path.join(os.path.dirname(args.load), "config.json"), 'r') as fin:
+    #     config = json.load(fin)
+    config = CPMMistralConfig.from_pretrained(args.load)
+
+    print(config)
+    # from IPython import embed; embed(header="222")
+    
+    if args.save is not None:
+        config_name = args.save #"cpmlive_llama7b" #os.path.dirname(args.load).split("/")[-1]
+    else:
+        config_name = args.load
+
+    hf_ckpt = convert_model(ckpt, config.num_layers)
+
+    
+
+    os.makedirs(f"{config_name}", exist_ok=True)
+
+    torch.save(hf_ckpt, f"{config_name}/pytorch_model.bin")
+    config.save_pretrained(f"{config_name}")
+
+    # try:
+    #     tokenizer = AutoTokenizer.from_pretrained(
+    #             args.load,
+    #             trust_remote_code=True)
+    #     tokenizer.save_pretrained(f"{config_name}")
+
+    #     # for fast tokenizer bug
+    #     if 'tokenizer.json' in os.listdir(f"{config_name}"):
+    #         os.remove(f"{config_name}/tokenizer.json")
+
+    #     print("Tokenizer loaded")
+    # except Exception as e:
+    #     print(e)
+    #     try:
+    #         shutil.copyfile("autoeval/120k_v2.model", f"wip.{config_name}/tokenizer.model")
+    #         shutil.copyfile("autoeval/special_tokens_map.json", f"wip.{config_name}/special_tokens_map.json")
+    #         shutil.copyfile("autoeval/tokenizer_config.json", f"wip.{config_name}/tokenizer_config.json")
+    #         print("Tokenizer copied")
+    #     except Exception as e:
+    #         print("Tokenizer not exists!")
+    #         print(e)
+       
+    
+
+    
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    # parser.add_argument("--load", type=str, default="/mnt/data/user/tc_agi/user/zhaoweilin/cpmlive-llama-2-7b/pytorch_model.pt")
+    parser.add_argument("--load", type=str, default="wip.job_469747_ckpt_4069")
+
+    parser.add_argument("--save", type=str, default=None)
+    parser.add_argument("--seed", type=int, default=42)
+    args = parser.parse_args()
+
+    load_model_ckpt(args)
--- a/inference/vllm/examples/infer_cpm_mistral/inference.py
+++ b/inference/vllm/examples/infer_cpm_mistral/inference.py
+import argparse
+
+from vllm import LLM, SamplingParams
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument("--model_path", type=str, default="")
+
+args = parser.parse_args()
+
+# Sample prompts.
+prompts = [
+    "北京烤鸭真好吃，正好有一家北京烤鸭店，我想去吃北京烤鸭",
+    "def reverse_list(list):",
+    "Beijing is the capital of",
+    "1 + 1 = ",
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+    '''0123456789'''*810 + " what is next number?",
+    "您好，三加二等于多少？",
+    "我是",
+    ""
+]
+
+params_dict = {
+    "n": 1,
+    "best_of": None,
+    "presence_penalty": 0.0,
+    "frequency_penalty": 0.0,
+    "temperature": 1,
+    "top_p": 0.9,
+    "top_k": -1,
+    "use_beam_search": False,
+    "length_penalty": 1.0,
+    "early_stopping": False,
+    "stop": None,
+    "stop_token_ids": None,
+    "ignore_eos": False,
+    "max_tokens": 100,
+    "logprobs": None,
+    "prompt_logprobs": None,
+    "skip_special_tokens": True,
+}
+
+# Create a sampling params object.
+sampling_params = SamplingParams(**params_dict)
+
+# Create an LLM.
+llm = LLM(model=args.model_path, tensor_parallel_size=1, dtype='bfloat16')
+# Generate texts from the prompts. The output is a list of RequestOutput objects
+# that contain the prompt, generated text, and other information.
+outputs = llm.generate(prompts, sampling_params)
+# Print the outputs.
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
--- a/inference/vllm/examples/infer_cpm_mistral/run_infer.sh
+++ b/inference/vllm/examples/infer_cpm_mistral/run_infer.sh
+#!/bin/bash
+
+CHECKPOINT=$1
+SAVE_TO=$2
+HF_MODEL_NAME=wip.$SAVE_TO  # huggingface上的模型名
+
+python vllm_convert_checkpoint_to_hf.py --load $CHECKPOINT --save $SAVE_TO
+python inference.py --model_path=$HF_MODEL_NAME
--- a/inference/vllm/examples/infer_cpm_mistral/vllm_convert_checkpoint_to_hf.bk.py
+++ b/inference/vllm/examples/infer_cpm_mistral/vllm_convert_checkpoint_to_hf.bk.py
+import argparse
+import json
+import os
+import shutil
+import sys
+from collections import OrderedDict
+
+import torch
+from transformers import AutoTokenizer
+
+from vllm.transformers_utils.configs import CPMMistralConfig
+
+
+def find_ckpt(directory):
+    files = os.listdir(directory)
+    ckpt = [file for file in files if file.endswith(".pt")]
+    assert len(ckpt) == 1
+    return '/'.join((directory, ckpt[0]))
+
+def convert_model(ckpt, layernum):
+    print("Total number in orgckpt", len(ckpt))
+    model_hf = OrderedDict()
+
+    model_hf['model.embed_tokens.weight'] = ckpt["input_embedding.weight"].contiguous()
+    model_hf['model.norm.weight'] = ckpt["encoder.output_layernorm.weight"].contiguous()
+    try:
+        model_hf['lm_head.weight'] = ckpt['lm_head.weight'].contiguous()
+        print("lm_head found")
+    except:
+        model_hf['lm_head.weight'] = ckpt["input_embedding.weight"].contiguous()
+        print("lm_head not found")
+
+
+    for lnum in range(layernum):
+        hf_pfx = f"model.layers.{lnum}"
+        bmt_pfx = f"encoder.layers.{lnum}"
+        
+        model_hf[f"{hf_pfx}.input_layernorm.weight"] = ckpt[f"{bmt_pfx}.self_att.layernorm_before_attention.weight"].contiguous()
+
+        model_hf[f"{hf_pfx}.self_attn.q_proj.weight"] = ckpt[f"{bmt_pfx}.self_att.self_attention.project_q.weight"].contiguous()
+        model_hf[f"{hf_pfx}.self_attn.k_proj.weight"] = ckpt[f"{bmt_pfx}.self_att.self_attention.project_k.weight"].contiguous()
+        model_hf[f"{hf_pfx}.self_attn.v_proj.weight"] = ckpt[f"{bmt_pfx}.self_att.self_attention.project_v.weight"].contiguous()
+        model_hf[f"{hf_pfx}.self_attn.o_proj.weight"] = ckpt[f"{bmt_pfx}.self_att.self_attention.attention_out.weight"].contiguous()
+
+        model_hf[f"{hf_pfx}.post_attention_layernorm.weight"] = ckpt[f"{bmt_pfx}.ffn.layernorm_before_ffn.weight"].contiguous()
+
+        model_hf[f"{hf_pfx}.mlp.gate_proj.weight"] = ckpt[f"{bmt_pfx}.ffn.ffn.w_in.w_0.weight"].contiguous()
+        model_hf[f"{hf_pfx}.mlp.up_proj.weight"] = ckpt[f"{bmt_pfx}.ffn.ffn.w_in.w_1.weight"].contiguous()
+
+        model_hf[f"{hf_pfx}.mlp.down_proj.weight"] = ckpt[f"{bmt_pfx}.ffn.ffn.w_out.weight"].contiguous()
+    
+    print("Total number in converted", len(model_hf))
+
+    return model_hf
+
+
+
+
+
+
+        
+
+
+def load_model_ckpt(args):
+    ckpt = torch.load(find_ckpt(args.load))
+    # with open(os.path.join(os.path.dirname(args.load), "config.json"), 'r') as fin:
+    #     config = json.load(fin)
+    config = CPMMistralConfig.from_pretrained(args.load)
+    print(config)
+    # from IPython import embed; embed(header="222")
+    
+    config_name = args.save #"cpmlive_llama7b" #os.path.dirname(args.load).split("/")[-1]
+    hf_ckpt = convert_model(ckpt, config.num_layers)
+
+    
+    os.makedirs(f"wip.{config_name}", exist_ok=True)
+
+    torch.save(hf_ckpt, f"wip.{config_name}/pytorch_model.bin")
+    config.save_pretrained(f"wip.{config_name}")
+
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(
+                args.load,
+                trust_remote_code=True)
+        tokenizer.save_pretrained(f"wip.{config_name}")
+
+        # for fast tokenizer bug
+        if 'tokenizer.json' in os.listdir(f"wip.{config_name}"):
+            os.remove(f"wip.{config_name}/tokenizer.json")
+
+        print("Tokenizer loaded")
+    except Exception as e:
+        print(e)
+        try:
+            shutil.copyfile("autoeval/tokenizer.model", f"wip.{config_name}/tokenizer.model")
+            shutil.copyfile("autoeval/special_tokens_map.json", f"wip.{config_name}/special_tokens_map.json")
+            shutil.copyfile("autoeval/tokenizer_config.json", f"wip.{config_name}/tokenizer_config.json")
+            print("Tokenizer copied")
+        except Exception as e:
+            print("Tokenizer not exists!")
+            print(e)
+       
+    
+
+    
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    # parser.add_argument("--load", type=str, default="/mnt/data/user/tc_agi/user/zhaoweilin/cpmlive-llama-2-7b/pytorch_model.pt")
+    parser.add_argument("--load", type=str, default="wip.job_469747_ckpt_4069D")
+
+    parser.add_argument("--save", type=str, default=None)
+    parser.add_argument("--seed", type=int, default=42)
+    args = parser.parse_args()
+
+    load_model_ckpt(args)
--- a/inference/vllm/examples/infer_cpm_mistral/vllm_convert_checkpoint_to_hf.py
+++ b/inference/vllm/examples/infer_cpm_mistral/vllm_convert_checkpoint_to_hf.py
+import argparse
+import json
+import os
+import shutil
+import sys
+from collections import OrderedDict
+
+import torch
+from transformers import AutoTokenizer, LlamaTokenizerFast
+
+from vllm.transformers_utils.configs import CPMMistralConfig
+
+
+def find_ckpt(directory):
+    files = os.listdir(directory)
+    ckpt = [file for file in files if file.endswith(".pt")]
+    assert len(ckpt) == 1
+    return "/".join((directory, ckpt[0]))
+
+
+def convert_model(ckpt, layernum):
+    print("Total number in orgckpt", len(ckpt))
+    model_hf = OrderedDict()
+
+    model_hf["model.embed_tokens.weight"] = ckpt["input_embedding.weight"].contiguous()
+    model_hf["model.norm.weight"] = ckpt["encoder.output_layernorm.weight"].contiguous()
+    try:
+        model_hf["lm_head.weight"] = ckpt["lm_head.weight"].contiguous()
+        print("lm_head found")
+    except:
+        model_hf["lm_head.weight"] = ckpt["input_embedding.weight"].contiguous()
+        print("lm_head not found")
+
+    for lnum in range(layernum):
+        hf_pfx = f"model.layers.{lnum}"
+        bmt_pfx = f"encoder.layers.{lnum}"
+
+        model_hf[f"{hf_pfx}.input_layernorm.weight"] = ckpt[
+            f"{bmt_pfx}.self_att.layernorm_before_attention.weight"
+        ].contiguous()
+
+        model_hf[f"{hf_pfx}.self_attn.q_proj.weight"] = ckpt[
+            f"{bmt_pfx}.self_att.self_attention.project_q.weight"
+        ].contiguous()
+        model_hf[f"{hf_pfx}.self_attn.k_proj.weight"] = ckpt[
+            f"{bmt_pfx}.self_att.self_attention.project_k.weight"
+        ].contiguous()
+        model_hf[f"{hf_pfx}.self_attn.v_proj.weight"] = ckpt[
+            f"{bmt_pfx}.self_att.self_attention.project_v.weight"
+        ].contiguous()
+        model_hf[f"{hf_pfx}.self_attn.o_proj.weight"] = ckpt[
+            f"{bmt_pfx}.self_att.self_attention.attention_out.weight"
+        ].contiguous()
+
+        model_hf[f"{hf_pfx}.post_attention_layernorm.weight"] = ckpt[
+            f"{bmt_pfx}.ffn.layernorm_before_ffn.weight"
+        ].contiguous()
+
+        model_hf[f"{hf_pfx}.mlp.gate_proj.weight"] = ckpt[f"{bmt_pfx}.ffn.ffn.w_in.w_0.weight"].contiguous()
+        model_hf[f"{hf_pfx}.mlp.up_proj.weight"] = ckpt[f"{bmt_pfx}.ffn.ffn.w_in.w_1.weight"].contiguous()
+
+        model_hf[f"{hf_pfx}.mlp.down_proj.weight"] = ckpt[f"{bmt_pfx}.ffn.ffn.w_out.weight"].contiguous()
+    print("Total number in converted", len(model_hf))
+
+    return model_hf
+
+
+def load_model_ckpt(args):
+    ckpt = torch.load(find_ckpt(args.load))
+    # with open(os.path.join(os.path.dirname(args.load), "config.json"), 'r') as fin:
+    #     config = json.load(fin)
+    config = CPMMistralConfig.from_pretrained(args.load)
+    print(config)
+    # from IPython import embed; embed(header="222")
+
+    config_name = args.save  # "cpmlive_llama7b" #os.path.dirname(args.load).split("/")[-1]
+    hf_ckpt = convert_model(ckpt, config.num_layers)
+
+    os.makedirs(f"wip.{config_name}", exist_ok=True)
+
+    torch.save(hf_ckpt, f"wip.{config_name}/pytorch_model.bin")
+    config.save_pretrained(f"wip.{config_name}")
+
+    try:
+        try:
+            tokenizer = LlamaTokenizerFast.from_pretrained(args.load, trust_remote_code=True)
+        except Exception as e:
+            print(e)
+            tokenizer = AutoTokenizer.from_pretrained(args.load, trust_remote_code=True)
+        tokenizer.save_pretrained(f"wip.{config_name}")
+
+        # for fast tokenizer bug
+        if "tokenizer.json" in os.listdir(f"wip.{config_name}"):
+            os.remove(f"wip.{config_name}/tokenizer.json")
+
+        print("Tokenizer loaded")
+    except Exception as e:
+        print(e)
+        try:
+            shutil.copyfile(f"{args.load}/tokenizer.model", f"wip.{config_name}/tokenizer.model")
+            shutil.copyfile(f"{args.load}/special_tokens_map.json", f"wip.{config_name}/special_tokens_map.json")
+            shutil.copyfile(f"{args.load}/tokenizer_config.json", f"wip.{config_name}/tokenizer_config.json")
+            print("Tokenizer copied")
+        except Exception as e:
+            print("Tokenizer not exists!")
+            print(e)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    # parser.add_argument("--load", type=str, default="/mnt/data/user/tc_agi/user/zhaoweilin/cpmlive-llama-2-7b/pytorch_model.pt")
+    parser.add_argument("--load", type=str, default="wip.job_469747_ckpt_4069D")
+
+    parser.add_argument("--save", type=str, default=None)
+    parser.add_argument("--seed", type=int, default=42)
+    args = parser.parse_args()
+
+    load_model_ckpt(args)
--- a/inference/vllm/examples/llm_engine_example.py
+++ b/inference/vllm/examples/llm_engine_example.py
+import argparse
+from typing import List, Tuple
+
+from vllm import EngineArgs, LLMEngine, SamplingParams, RequestOutput
+
+
+def create_test_prompts() -> List[Tuple[str, SamplingParams]]:
+    """Create a list of test prompts with their sampling parameters."""
+    return [
+        ("A robot may not injure a human being",
+         SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1)),
+        ("To be or not to be,",
+         SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2)),
+        ("What is the meaning of life?",
+         SamplingParams(n=2,
+                        best_of=5,
+                        temperature=0.8,
+                        top_p=0.95,
+                        frequency_penalty=0.1)),
+        ("It is only with the heart that one can see rightly",
+         SamplingParams(n=3, best_of=3, use_beam_search=True,
+                        temperature=0.0)),
+    ]
+
+
+def process_requests(engine: LLMEngine,
+                     test_prompts: List[Tuple[str, SamplingParams]]):
+    """Continuously process a list of prompts and handle the outputs."""
+    request_id = 0
+
+    while test_prompts or engine.has_unfinished_requests():
+        if test_prompts:
+            prompt, sampling_params = test_prompts.pop(0)
+            engine.add_request(str(request_id), prompt, sampling_params)
+            request_id += 1
+
+        request_outputs: List[RequestOutput] = engine.step()
+
+        for request_output in request_outputs:
+            if request_output.finished:
+                print(request_output)
+
+
+def initialize_engine(args: argparse.Namespace) -> LLMEngine:
+    """Initialize the LLMEngine from the command line arguments."""
+    engine_args = EngineArgs.from_cli_args(args)
+    return LLMEngine.from_engine_args(engine_args)
+
+
+def main(args: argparse.Namespace):
+    """Main function that sets up and runs the prompt processing."""
+    engine = initialize_engine(args)
+    test_prompts = create_test_prompts()
+    process_requests(engine, test_prompts)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='Demo on using the LLMEngine class directly')
+    parser = EngineArgs.add_cli_args(parser)
+    args = parser.parse_args()
+    main(args)
--- a/inference/vllm/examples/offline_inference.py
+++ b/inference/vllm/examples/offline_inference.py
+from vllm import LLM, SamplingParams
+
+# Sample prompts.
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+# Create a sampling params object.
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+# Create an LLM.
+llm = LLM(model="facebook/opt-125m")
+# Generate texts from the prompts. The output is a list of RequestOutput objects
+# that contain the prompt, generated text, and other information.
+outputs = llm.generate(prompts, sampling_params)
+# Print the outputs.
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
--- a/inference/vllm/examples/openai_chatcompletion_client.py
+++ b/inference/vllm/examples/openai_chatcompletion_client.py
+import openai
+
+# Modify OpenAI's API key and API base to use vLLM's API server.
+openai.api_key = "EMPTY"
+openai.api_base = "http://localhost:8000/v1"
+
+# List models API
+models = openai.Model.list()
+print("Models:", models)
+
+model = models["data"][0]["id"]
+
+# Chat completion API
+chat_completion = openai.ChatCompletion.create(
+    model=model,
+    messages=[{
+        "role": "system",
+        "content": "You are a helpful assistant."
+    }, {
+        "role": "user",
+        "content": "Who won the world series in 2020?"
+    }, {
+        "role":
+        "assistant",
+        "content":
+        "The Los Angeles Dodgers won the World Series in 2020."
+    }, {
+        "role": "user",
+        "content": "Where was it played?"
+    }])
+
+print("Chat completion results:")
+print(chat_completion)
--- a/inference/vllm/examples/openai_completion_client.py
+++ b/inference/vllm/examples/openai_completion_client.py
+import openai
+
+# Modify OpenAI's API key and API base to use vLLM's API server.
+openai.api_key = "EMPTY"
+openai.api_base = "http://localhost:8000/v1"
+
+# List models API
+models = openai.Model.list()
+print("Models:", models)
+
+model = models["data"][0]["id"]
+
+# Completion API
+stream = False
+completion = openai.Completion.create(
+    model=model,
+    prompt="A robot may not injure a human being",
+    echo=False,
+    n=2,
+    stream=stream,
+    logprobs=3)
+
+print("Completion results:")
+if stream:
+    for c in completion:
+        print(c)
+else:
+    print(completion)
--- a/inference/vllm/format.sh
+++ b/inference/vllm/format.sh
+#!/usr/bin/env bash
+# YAPF formatter, adapted from ray and skypilot.
+#
+# Usage:
+#    # Do work and commit your work.
+
+#    # Format files that differ from origin/main.
+#    bash format.sh
+
+#    # Commit changed files with message 'Run yapf and pylint'
+#
+#
+# YAPF + Clang formatter (if installed). This script formats all changed files from the last mergebase.
+# You are encouraged to run this locally before pushing changes for review.
+
+# Cause the script to exit if a single command fails
+set -eo pipefail
+
+# this stops git rev-parse from failing if we run this from the .git directory
+builtin cd "$(dirname "${BASH_SOURCE:-$0}")"
+ROOT="$(git rev-parse --show-toplevel)"
+builtin cd "$ROOT" || exit 1
+
+YAPF_VERSION=$(yapf --version | awk '{print $2}')
+PYLINT_VERSION=$(pylint --version | head -n 1 | awk '{print $2}')
+MYPY_VERSION=$(mypy --version | awk '{print $2}')
+
+# # params: tool name, tool version, required version
+tool_version_check() {
+    if [[ $2 != $3 ]]; then
+        echo "Wrong $1 version installed: $3 is required, not $2."
+        exit 1
+    fi
+}
+
+tool_version_check "yapf" $YAPF_VERSION "$(grep yapf requirements-dev.txt | cut -d'=' -f3)"
+tool_version_check "pylint" $PYLINT_VERSION "$(grep "pylint==" requirements-dev.txt | cut -d'=' -f3)"
+tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-dev.txt | cut -d'=' -f3)"
+
+YAPF_FLAGS=(
+    '--recursive'
+    '--parallel'
+)
+
+YAPF_EXCLUDES=(
+    '--exclude' 'build/**'
+)
+
+# Format specified files
+format() {
+    yapf --in-place "${YAPF_FLAGS[@]}" "$@"
+}
+
+# Format files that differ from main branch. Ignores dirs that are not slated
+# for autoformat yet.
+format_changed() {
+    # The `if` guard ensures that the list of filenames is not empty, which
+    # could cause yapf to receive 0 positional arguments, making it hang
+    # waiting for STDIN.
+    #
+    # `diff-filter=ACM` and $MERGEBASE is to ensure we only format files that
+    # exist on both branches.
+    MERGEBASE="$(git merge-base origin/main HEAD)"
+
+    if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then
+        git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \
+             yapf --in-place "${YAPF_EXCLUDES[@]}" "${YAPF_FLAGS[@]}"
+    fi
+
+}
+
+# Format all files
+format_all() {
+    yapf --in-place "${YAPF_FLAGS[@]}" "${YAPF_EXCLUDES[@]}" vllm tests
+}
+
+## This flag formats individual files. --files *must* be the first command line
+## arg to use this option.
+if [[ "$1" == '--files' ]]; then
+   format "${@:2}"
+   # If `--all` is passed, then any further arguments are ignored and the
+   # entire python directory is formatted.
+elif [[ "$1" == '--all' ]]; then
+   format_all
+else
+   # Format only the files that changed in last commit.
+   format_changed
+fi
+echo 'vLLM yapf: Done'
+
+# Run mypy
+# TODO(zhuohan): Enable mypy
+# echo 'vLLM mypy:'
+# mypy
+
+# Lint specified files
+lint() {
+    pylint "$@"
+}
+
+# Lint files that differ from main branch. Ignores dirs that are not slated
+# for autolint yet.
+lint_changed() {
+    # The `if` guard ensures that the list of filenames is not empty, which
+    # could cause pylint to receive 0 positional arguments, making it hang
+    # waiting for STDIN.
+    #
+    # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that
+    # exist on both branches.
+    MERGEBASE="$(git merge-base origin/main HEAD)"
+
+    if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then
+        git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \
+             pylint
+    fi
+
+}
+
+# Run Pylint
+echo 'vLLM Pylint:'
+## This flag lints individual files. --files *must* be the first command line
+## arg to use this option.
+if [[ "$1" == '--files' ]]; then
+   lint "${@:2}"
+   # If `--all` is passed, then any further arguments are ignored and the
+   # entire python directory is linted.
+elif [[ "$1" == '--all' ]]; then
+   lint vllm tests
+else
+   # Format only the files that changed in last commit.
+   lint_changed
+fi
+
+if ! git diff --quiet &>/dev/null; then
+    echo 'Reformatted files. Please review and stage the changes.'
+    echo 'Changes not staged for commit:'
+    echo
+    git --no-pager diff --name-only
+
+    exit 1
+fi
--- a/inference/vllm/mypy.ini
+++ b/inference/vllm/mypy.ini
+[mypy]
+python_version = 3.8
+
+ignore_missing_imports = True
+
+files = vllm
+# TODO(woosuk): Include the code from Megatron and HuggingFace.
+exclude = vllm/model_executor/parallel_utils/|vllm/model_executor/models/
--- a/inference/vllm/pyproject.toml
+++ b/inference/vllm/pyproject.toml
+[build-system]
+requires = [
+    "ninja",
+    "packaging",
+    "setuptools",
+    "torch >= 2.1.0",
+    "wheel",
+]
+build-backend = "setuptools.build_meta"
--- a/inference/vllm/requirements-dev.txt
+++ b/inference/vllm/requirements-dev.txt
+# formatting
+yapf==0.32.0
+pylint==2.8.2
+
+# type checking
+mypy==0.991
+types-PyYAML
+types-requests
+types-setuptools
+
+# testing
+pytest
+pytest-forked
+pytest-asyncio
+
--- a/inference/vllm/requirements.txt
+++ b/inference/vllm/requirements.txt
+ninja  # For faster builds.
+psutil
+ray >= 2.5.1
+pandas  # Required for Ray data.
+pyarrow  # Required for Ray data.
+sentencepiece  # Required for LLaMA tokenizer.
+numpy
+einops  # Required for phi-1_5
+torch >= 2.1.0
+transformers >= 4.34.0  # Required for Mistral.
+xformers >= 0.0.22.post7  # Required for CUDA 12.1.
+fastapi
+uvicorn[standard]
+pydantic == 1.10.13  # Required for OpenAI server.
--- a/inference/vllm/setup.py
+++ b/inference/vllm/setup.py
+import io
+import os
+import re
+import subprocess
+from typing import List, Set
+import warnings
+
+from packaging.version import parse, Version
+import setuptools
+import torch
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME
+
+ROOT_DIR = os.path.dirname(__file__)
+
+MAIN_CUDA_VERSION = "12.1"
+
+# Supported NVIDIA GPU architectures.
+SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"}
+
+# Compiler flags.
+CXX_FLAGS = ["-g", "-O2", "-std=c++17"]
+# TODO(woosuk): Should we use -O3?
+NVCC_FLAGS = ["-O2", "-std=c++17"]
+
+ABI = 1 if torch._C._GLIBCXX_USE_CXX11_ABI else 0
+CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]
+NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]
+
+if CUDA_HOME is None:
+    raise RuntimeError(
+        "Cannot find CUDA_HOME. CUDA must be available to build the package.")
+
+
+def get_nvcc_cuda_version(cuda_dir: str) -> Version:
+    """Get the CUDA version from nvcc.
+
+    Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py
+    """
+    nvcc_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"],
+                                          universal_newlines=True)
+    output = nvcc_output.split()
+    release_idx = output.index("release") + 1
+    nvcc_cuda_version = parse(output[release_idx].split(",")[0])
+    return nvcc_cuda_version
+
+
+def get_torch_arch_list() -> Set[str]:
+    # TORCH_CUDA_ARCH_LIST can have one or more architectures,
+    # e.g. "8.0" or "7.5,8.0,8.6+PTX". Here, the "8.6+PTX" option asks the
+    # compiler to additionally include PTX code that can be runtime-compiled
+    # and executed on the 8.6 or newer architectures. While the PTX code will
+    # not give the best performance on the newer architectures, it provides
+    # forward compatibility.
+    env_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None)
+    if env_arch_list is None:
+        return set()
+
+    # List are separated by ; or space.
+    torch_arch_list = set(env_arch_list.replace(" ", ";").split(";"))
+    if not torch_arch_list:
+        return set()
+
+    # Filter out the invalid architectures and print a warning.
+    valid_archs = SUPPORTED_ARCHS.union({s + "+PTX" for s in SUPPORTED_ARCHS})
+    arch_list = torch_arch_list.intersection(valid_archs)
+    # If none of the specified architectures are valid, raise an error.
+    if not arch_list:
+        raise RuntimeError(
+            "None of the CUDA architectures in `TORCH_CUDA_ARCH_LIST` env "
+            f"variable ({env_arch_list}) is supported. "
+            f"Supported CUDA architectures are: {valid_archs}.")
+    invalid_arch_list = torch_arch_list - valid_archs
+    if invalid_arch_list:
+        warnings.warn(
+            f"Unsupported CUDA architectures ({invalid_arch_list}) are "
+            "excluded from the `TORCH_CUDA_ARCH_LIST` env variable "
+            f"({env_arch_list}). Supported CUDA architectures are: "
+            f"{valid_archs}.")
+    return arch_list
+
+
+# First, check the TORCH_CUDA_ARCH_LIST environment variable.
+compute_capabilities = get_torch_arch_list()
+if not compute_capabilities:
+    # If TORCH_CUDA_ARCH_LIST is not defined or empty, target all available
+    # GPUs on the current machine.
+    device_count = torch.cuda.device_count()
+    for i in range(device_count):
+        major, minor = torch.cuda.get_device_capability(i)
+        if major < 7:
+            raise RuntimeError(
+                "GPUs with compute capability below 7.0 are not supported.")
+        compute_capabilities.add(f"{major}.{minor}")
+
+nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME)
+if not compute_capabilities:
+    # If no GPU is specified nor available, add all supported architectures
+    # based on the NVCC CUDA version.
+    compute_capabilities = SUPPORTED_ARCHS.copy()
+    if nvcc_cuda_version < Version("11.1"):
+        compute_capabilities.remove("8.6")
+    if nvcc_cuda_version < Version("11.8"):
+        compute_capabilities.remove("8.9")
+        compute_capabilities.remove("9.0")
+
+# Validate the NVCC CUDA version.
+if nvcc_cuda_version < Version("11.0"):
+    raise RuntimeError("CUDA 11.0 or higher is required to build the package.")
+if nvcc_cuda_version < Version("11.1"):
+    if any(cc.startswith("8.6") for cc in compute_capabilities):
+        raise RuntimeError(
+            "CUDA 11.1 or higher is required for compute capability 8.6.")
+if nvcc_cuda_version < Version("11.8"):
+    if any(cc.startswith("8.9") for cc in compute_capabilities):
+        # CUDA 11.8 is required to generate the code targeting compute capability 8.9.
+        # However, GPUs with compute capability 8.9 can also run the code generated by
+        # the previous versions of CUDA 11 and targeting compute capability 8.0.
+        # Therefore, if CUDA 11.8 is not available, we target compute capability 8.0
+        # instead of 8.9.
+        warnings.warn(
+            "CUDA 11.8 or higher is required for compute capability 8.9. "
+            "Targeting compute capability 8.0 instead.")
+        compute_capabilities = set(cc for cc in compute_capabilities
+                                   if not cc.startswith("8.9"))
+        compute_capabilities.add("8.0+PTX")
+    if any(cc.startswith("9.0") for cc in compute_capabilities):
+        raise RuntimeError(
+            "CUDA 11.8 or higher is required for compute capability 9.0.")
+
+# Add target compute capabilities to NVCC flags.
+for capability in compute_capabilities:
+    num = capability[0] + capability[2]
+    NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=sm_{num}"]
+    if capability.endswith("+PTX"):
+        NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=compute_{num}"]
+
+# Use NVCC threads to parallelize the build.
+if nvcc_cuda_version >= Version("11.2"):
+    num_threads = min(os.cpu_count(), 8)
+    NVCC_FLAGS += ["--threads", str(num_threads)]
+
+ext_modules = []
+
+# Cache operations.
+cache_extension = CUDAExtension(
+    name="vllm.cache_ops",
+    sources=["csrc/cache.cpp", "csrc/cache_kernels.cu"],
+    extra_compile_args={
+        "cxx": CXX_FLAGS,
+        "nvcc": NVCC_FLAGS,
+    },
+)
+ext_modules.append(cache_extension)
+
+# Attention kernels.
+attention_extension = CUDAExtension(
+    name="vllm.attention_ops",
+    sources=["csrc/attention.cpp", "csrc/attention/attention_kernels.cu"],
+    extra_compile_args={
+        "cxx": CXX_FLAGS,
+        "nvcc": NVCC_FLAGS,
+    },
+)
+ext_modules.append(attention_extension)
+
+# Positional encoding kernels.
+positional_encoding_extension = CUDAExtension(
+    name="vllm.pos_encoding_ops",
+    sources=["csrc/pos_encoding.cpp", "csrc/pos_encoding_kernels.cu"],
+    extra_compile_args={
+        "cxx": CXX_FLAGS,
+        "nvcc": NVCC_FLAGS,
+    },
+)
+ext_modules.append(positional_encoding_extension)
+
+# Layer normalization kernels.
+layernorm_extension = CUDAExtension(
+    name="vllm.layernorm_ops",
+    sources=["csrc/layernorm.cpp", "csrc/layernorm_kernels.cu"],
+    extra_compile_args={
+        "cxx": CXX_FLAGS,
+        "nvcc": NVCC_FLAGS,
+    },
+)
+ext_modules.append(layernorm_extension)
+
+# Activation kernels.
+activation_extension = CUDAExtension(
+    name="vllm.activation_ops",
+    sources=["csrc/activation.cpp", "csrc/activation_kernels.cu"],
+    extra_compile_args={
+        "cxx": CXX_FLAGS,
+        "nvcc": NVCC_FLAGS,
+    },
+)
+ext_modules.append(activation_extension)
+
+# Quantization kernels.
+quantization_extension = CUDAExtension(
+    name="vllm.quantization_ops",
+    sources=[
+        "csrc/quantization.cpp",
+        "csrc/quantization/awq/gemm_kernels.cu",
+        "csrc/quantization/squeezellm/quant_cuda_kernel.cu",
+    ],
+    extra_compile_args={
+        "cxx": CXX_FLAGS,
+        "nvcc": NVCC_FLAGS,
+    },
+)
+ext_modules.append(quantization_extension)
+
+# Misc. CUDA utils.
+cuda_utils_extension = CUDAExtension(
+    name="vllm.cuda_utils",
+    sources=["csrc/cuda_utils.cpp", "csrc/cuda_utils_kernels.cu"],
+    extra_compile_args={
+        "cxx": CXX_FLAGS,
+        "nvcc": NVCC_FLAGS,
+    },
+)
+ext_modules.append(cuda_utils_extension)
+
+
+def get_path(*filepath) -> str:
+    return os.path.join(ROOT_DIR, *filepath)
+
+
+def find_version(filepath: str) -> str:
+    """Extract version information from the given filepath.
+
+    Adapted from https://github.com/ray-project/ray/blob/0b190ee1160eeca9796bc091e07eaebf4c85b511/python/setup.py
+    """
+    with open(filepath) as fp:
+        version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]",
+                                  fp.read(), re.M)
+        if version_match:
+            return version_match.group(1)
+        raise RuntimeError("Unable to find version string.")
+
+
+def get_vllm_version() -> str:
+    version = find_version(get_path("vllm", "__init__.py"))
+    cuda_version = str(nvcc_cuda_version)
+    if cuda_version != MAIN_CUDA_VERSION:
+        cuda_version_str = cuda_version.replace(".", "")[:3]
+        version += f"+cu{cuda_version_str}"
+    return version
+
+
+def read_readme() -> str:
+    """Read the README file if present."""
+    p = get_path("README.md")
+    if os.path.isfile(p):
+        return io.open(get_path("README.md"), "r", encoding="utf-8").read()
+    else:
+        return ""
+
+
+def get_requirements() -> List[str]:
+    """Get Python package dependencies from requirements.txt."""
+    with open(get_path("requirements.txt")) as f:
+        requirements = f.read().strip().split("\n")
+    return requirements
+
+
+setuptools.setup(
+    name="vllm",
+    version=get_vllm_version(),
+    author="vLLM Team",
+    license="Apache 2.0",
+    description=("A high-throughput and memory-efficient inference and "
+                 "serving engine for LLMs"),
+    long_description=read_readme(),
+    long_description_content_type="text/markdown",
+    url="https://github.com/vllm-project/vllm",
+    project_urls={
+        "Homepage": "https://github.com/vllm-project/vllm",
+        "Documentation": "https://vllm.readthedocs.io/en/latest/",
+    },
+    classifiers=[
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
+        "License :: OSI Approved :: Apache Software License",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    ],
+    packages=setuptools.find_packages(exclude=("benchmarks", "csrc", "docs",
+                                               "examples", "tests")),
+    python_requires=">=3.8",
+    install_requires=get_requirements(),
+    ext_modules=ext_modules,
+    cmdclass={"build_ext": BuildExtension},
+    package_data={"vllm": ["py.typed"]},
+)
--- a/inference/vllm/tests/__init__.py
+++ b/inference/vllm/tests/__init__.py
--- a/inference/vllm/tests/async_engine/api_server_async_engine.py
+++ b/inference/vllm/tests/async_engine/api_server_async_engine.py
+"""vllm.entrypoints.api_server with some extra logging for testing."""
+import argparse
+from typing import Any, Dict
+
+import uvicorn
+from fastapi.responses import JSONResponse, Response
+
+import vllm.entrypoints.api_server
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+
+app = vllm.entrypoints.api_server.app
+
+
+class AsyncLLMEngineWithStats(AsyncLLMEngine):
+
+    # pylint: disable=redefined-outer-name
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._num_aborts = 0
+
+    async def abort(self, request_id: str) -> None:
+        await super().abort(request_id)
+        self._num_aborts += 1
+
+    def testing_stats(self) -> Dict[str, Any]:
+        return {"num_aborted_requests": self._num_aborts}
+
+
+@app.get("/stats")
+def stats() -> Response:
+    """Get the statistics of the engine."""
+    return JSONResponse(engine.testing_stats())
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", type=str, default="localhost")
+    parser.add_argument("--port", type=int, default=8000)
+    parser = AsyncEngineArgs.add_cli_args(parser)
+    args = parser.parse_args()
+
+    engine_args = AsyncEngineArgs.from_cli_args(args)
+    engine = AsyncLLMEngineWithStats.from_engine_args(engine_args)
+    vllm.entrypoints.api_server.engine = engine
+    uvicorn.run(
+        app,
+        host=args.host,
+        port=args.port,
+        log_level="debug",
+        timeout_keep_alive=vllm.entrypoints.api_server.TIMEOUT_KEEP_ALIVE)
--- a/inference/vllm/tests/async_engine/test_api_server.py
+++ b/inference/vllm/tests/async_engine/test_api_server.py
+import subprocess
+import sys
+import time
+from multiprocessing import Pool
+from pathlib import Path
+
+import pytest
+import requests
+
+
+def _query_server(prompt: str) -> dict:
+    response = requests.post("http://localhost:8000/generate",
+                             json={
+                                 "prompt": prompt,
+                                 "max_tokens": 100,
+                                 "temperature": 0,
+                                 "ignore_eos": True
+                             })
+    response.raise_for_status()
+    return response.json()
+
+
+@pytest.fixture
+def api_server():
+    script_path = Path(__file__).parent.joinpath(
+        "api_server_async_engine.py").absolute()
+    # pylint: disable=consider-using-with
+    uvicorn_process = subprocess.Popen([
+        sys.executable, "-u",
+        str(script_path), "--model", "facebook/opt-125m"
+    ])
+    yield
+    uvicorn_process.terminate()
+
+
+# pylint: disable=redefined-outer-name, unused-argument
+def test_api_server(api_server):
+    """
+    Run the API server and test it.
+
+    We run both the server and requests in separate processes.
+
+    We test that the server can handle incoming requests, including
+    multiple requests at the same time, and that it can handle requests
+    being cancelled without crashing.
+    """
+    with Pool(32) as pool:
+        # Wait until the server is ready
+        prompts = ["Hello world"] * 1
+        result = None
+        while not result:
+            # pylint: disable=bare-except
+            try:
+                for result in pool.map(_query_server, prompts):
+                    break
+            except:
+                time.sleep(1)
+
+        # Actual tests start here
+        # Try with 1 prompt
+        for result in pool.map(_query_server, prompts):
+            assert result
+
+        num_aborted_requests = requests.get(
+            "http://localhost:8000/stats").json()["num_aborted_requests"]
+        assert num_aborted_requests == 0
+
+        # Try with 100 prompts
+        prompts = ["Hello world"] * 100
+        for result in pool.map(_query_server, prompts):
+            assert result
+
+        # Cancel requests
+        pool.map_async(_query_server, prompts)
+        time.sleep(0.01)
+        pool.terminate()
+        pool.join()
+
+        # check cancellation stats
+        num_aborted_requests = requests.get(
+            "http://localhost:8000/stats").json()["num_aborted_requests"]
+        assert num_aborted_requests > 0
+
+    # check that server still runs after cancellations
+    with Pool(32) as pool:
+        # Try with 100 prompts
+        prompts = ["Hello world"] * 100
+        for result in pool.map(_query_server, prompts):
+            assert result
--- a/inference/vllm/tests/async_engine/test_async_llm_engine.py
+++ b/inference/vllm/tests/async_engine/test_async_llm_engine.py
+import asyncio
+from dataclasses import dataclass
+
+import pytest
+
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+
+
+@dataclass
+class RequestOutput:
+    request_id: int
+    finished: bool = False
+
+
+class MockEngine:
+
+    def __init__(self):
+        self.step_calls = 0
+        self.add_request_calls = 0
+        self.abort_request_calls = 0
+        self.request_id = None
+
+    async def step_async(self):
+        self.step_calls += 1
+        return [RequestOutput(
+            request_id=self.request_id)] if self.request_id else []
+
+    def generate(self, request_id):
+        self.request_id = request_id
+
+    def stop_generating(self):
+        self.request_id = None
+
+    def add_request(self, **kwargs):
+        del kwargs  # Unused
+        self.add_request_calls += 1
+
+    def abort_request(self, request_id):
+        del request_id  # Unused
+        self.abort_request_calls += 1
+
+
+class MockAsyncLLMEngine(AsyncLLMEngine):
+
+    def _init_engine(self, *args, **kwargs):
+        return MockEngine()
+
+
+@pytest.mark.asyncio
+async def test_new_requests_event():
+    engine = MockAsyncLLMEngine(worker_use_ray=False, engine_use_ray=False)
+    engine.start_background_loop()
+    await asyncio.sleep(0.01)
+    assert engine.engine.step_calls == 0
+
+    await engine.add_request("1", "", None)
+    await asyncio.sleep(0.01)
+    assert engine.engine.add_request_calls == 1
+    assert engine.engine.step_calls == 1
+
+    await engine.add_request("2", "", None)
+    engine.engine.generate("2")
+    await asyncio.sleep(0)
+    assert engine.engine.add_request_calls == 2
+    assert engine.engine.step_calls == 2
+    await asyncio.sleep(0)
+    assert engine.engine.step_calls == 3
+    engine.engine.stop_generating()
+    await asyncio.sleep(0)
+    assert engine.engine.step_calls == 4
+    await asyncio.sleep(0)
+    assert engine.engine.step_calls == 4
+
+    await engine.add_request("3", "", None)
+    await asyncio.sleep(0.01)
+    assert engine.engine.add_request_calls == 3
+    assert engine.engine.step_calls == 5
+    await asyncio.sleep(0.01)
+    assert engine.engine.add_request_calls == 3
+    assert engine.engine.step_calls == 5