Commit 24eacbc0 authored by chenzk's avatar chenzk
Browse files

v1.0

parents
import json
import pandas as pd
with open("prompts/cpm-8b-1210_zh_cpm_0129.jsondpo.v2-dpo.jsonl", 'r') as fin:
data = json.load(fin)
# with open("prompts/cpm-8b-1210_zh_cpm_0129.jsondpo.v2-dpo.xlxs", 'w') as fout:
df = pd.DataFrame(data)
df.to_excel("prompts/cpm-8b-1210_zh_cpm_0129.jsondpo.v2-dpo.xlsx")
import argparse
import json
import os
import shutil
import sys
from collections import OrderedDict
import torch
from transformers import AutoTokenizer
from vllm.transformers_utils.configs import CPMMistralConfig
def find_ckpt(directory):
files = os.listdir(directory)
ckpt = [file for file in files if file.endswith(".pt")]
assert len(ckpt) == 1
return '/'.join((directory, ckpt[0]))
def convert_model(ckpt, layernum):
print("Total number in orgckpt", len(ckpt))
model_hf = OrderedDict()
model_hf['model.embed_tokens.weight'] = ckpt["input_embedding.weight"].contiguous()
model_hf['model.norm.weight'] = ckpt["encoder.output_layernorm.weight"].contiguous()
try:
model_hf['lm_head.weight'] = ckpt['lm_head.weight'].contiguous()
print("lm_head found")
except:
model_hf['lm_head.weight'] = ckpt["input_embedding.weight"].contiguous()
print("lm_head not found")
# print(ckpt.keys())
for lnum in range(layernum):
hf_pfx = f"model.layers.{lnum}"
bmt_pfx = f"encoder.layers.{lnum}"
model_hf[f"{hf_pfx}.input_layernorm.weight"] = ckpt[f"{bmt_pfx}.self_att.layernorm_before_attention.weight"].contiguous()
model_hf[f"{hf_pfx}.self_attn.q_proj.weight"] = ckpt[f"{bmt_pfx}.self_att.self_attention.project_q.weight"].contiguous()
model_hf[f"{hf_pfx}.self_attn.k_proj.weight"] = ckpt[f"{bmt_pfx}.self_att.self_attention.project_k.weight"].contiguous()
model_hf[f"{hf_pfx}.self_attn.v_proj.weight"] = ckpt[f"{bmt_pfx}.self_att.self_attention.project_v.weight"].contiguous()
model_hf[f"{hf_pfx}.self_attn.o_proj.weight"] = ckpt[f"{bmt_pfx}.self_att.self_attention.attention_out.weight"].contiguous()
model_hf[f"{hf_pfx}.post_attention_layernorm.weight"] = ckpt[f"{bmt_pfx}.ffn.layernorm_before_ffn.weight"].contiguous()
model_hf[f"{hf_pfx}.mlp.gate_proj.weight"] = ckpt[f"{bmt_pfx}.ffn.ffn.w_in.w_0.weight"].contiguous()
model_hf[f"{hf_pfx}.mlp.up_proj.weight"] = ckpt[f"{bmt_pfx}.ffn.ffn.w_in.w_1.weight"].contiguous()
model_hf[f"{hf_pfx}.mlp.down_proj.weight"] = ckpt[f"{bmt_pfx}.ffn.ffn.w_out.weight"].contiguous()
try:
model_hf[f"{hf_pfx}.self_attn.q_norm.weight"] = ckpt[f"{bmt_pfx}.self_att.self_attention.q_norm.weight"].contiguous()
model_hf[f"{hf_pfx}.self_attn.k_norm.weight"] = ckpt[f"{bmt_pfx}.self_att.self_attention.k_norm.weight"].contiguous()
except:
print("Error!")
pass
print("Total number in converted", len(model_hf))
return model_hf
def load_model_ckpt(args):
ckpt = torch.load(find_ckpt(args.load))
# with open(os.path.join(os.path.dirname(args.load), "config.json"), 'r') as fin:
# config = json.load(fin)
config = CPMMistralConfig.from_pretrained(args.load)
print(config)
# from IPython import embed; embed(header="222")
if args.save is not None:
config_name = args.save #"cpmlive_llama7b" #os.path.dirname(args.load).split("/")[-1]
else:
config_name = args.load
hf_ckpt = convert_model(ckpt, config.num_layers)
os.makedirs(f"{config_name}", exist_ok=True)
torch.save(hf_ckpt, f"{config_name}/pytorch_model.bin")
config.save_pretrained(f"{config_name}")
# try:
# tokenizer = AutoTokenizer.from_pretrained(
# args.load,
# trust_remote_code=True)
# tokenizer.save_pretrained(f"{config_name}")
# # for fast tokenizer bug
# if 'tokenizer.json' in os.listdir(f"{config_name}"):
# os.remove(f"{config_name}/tokenizer.json")
# print("Tokenizer loaded")
# except Exception as e:
# print(e)
# try:
# shutil.copyfile("autoeval/120k_v2.model", f"wip.{config_name}/tokenizer.model")
# shutil.copyfile("autoeval/special_tokens_map.json", f"wip.{config_name}/special_tokens_map.json")
# shutil.copyfile("autoeval/tokenizer_config.json", f"wip.{config_name}/tokenizer_config.json")
# print("Tokenizer copied")
# except Exception as e:
# print("Tokenizer not exists!")
# print(e)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
# parser.add_argument("--load", type=str, default="/mnt/data/user/tc_agi/user/zhaoweilin/cpmlive-llama-2-7b/pytorch_model.pt")
parser.add_argument("--load", type=str, default="wip.job_469747_ckpt_4069")
parser.add_argument("--save", type=str, default=None)
parser.add_argument("--seed", type=int, default=42)
args = parser.parse_args()
load_model_ckpt(args)
import argparse
from vllm import LLM, SamplingParams
parser = argparse.ArgumentParser()
parser.add_argument("--model_path", type=str, default="")
args = parser.parse_args()
# Sample prompts.
prompts = [
"北京烤鸭真好吃,正好有一家北京烤鸭店,我想去吃北京烤鸭",
"def reverse_list(list):",
"Beijing is the capital of",
"1 + 1 = ",
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
'''0123456789'''*810 + " what is next number?",
"您好,三加二等于多少?",
"我是",
""
]
params_dict = {
"n": 1,
"best_of": None,
"presence_penalty": 0.0,
"frequency_penalty": 0.0,
"temperature": 1,
"top_p": 0.9,
"top_k": -1,
"use_beam_search": False,
"length_penalty": 1.0,
"early_stopping": False,
"stop": None,
"stop_token_ids": None,
"ignore_eos": False,
"max_tokens": 100,
"logprobs": None,
"prompt_logprobs": None,
"skip_special_tokens": True,
}
# Create a sampling params object.
sampling_params = SamplingParams(**params_dict)
# Create an LLM.
llm = LLM(model=args.model_path, tensor_parallel_size=1, dtype='bfloat16')
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
#!/bin/bash
CHECKPOINT=$1
SAVE_TO=$2
HF_MODEL_NAME=wip.$SAVE_TO # huggingface上的模型名
python vllm_convert_checkpoint_to_hf.py --load $CHECKPOINT --save $SAVE_TO
python inference.py --model_path=$HF_MODEL_NAME
import argparse
import json
import os
import shutil
import sys
from collections import OrderedDict
import torch
from transformers import AutoTokenizer
from vllm.transformers_utils.configs import CPMMistralConfig
def find_ckpt(directory):
files = os.listdir(directory)
ckpt = [file for file in files if file.endswith(".pt")]
assert len(ckpt) == 1
return '/'.join((directory, ckpt[0]))
def convert_model(ckpt, layernum):
print("Total number in orgckpt", len(ckpt))
model_hf = OrderedDict()
model_hf['model.embed_tokens.weight'] = ckpt["input_embedding.weight"].contiguous()
model_hf['model.norm.weight'] = ckpt["encoder.output_layernorm.weight"].contiguous()
try:
model_hf['lm_head.weight'] = ckpt['lm_head.weight'].contiguous()
print("lm_head found")
except:
model_hf['lm_head.weight'] = ckpt["input_embedding.weight"].contiguous()
print("lm_head not found")
for lnum in range(layernum):
hf_pfx = f"model.layers.{lnum}"
bmt_pfx = f"encoder.layers.{lnum}"
model_hf[f"{hf_pfx}.input_layernorm.weight"] = ckpt[f"{bmt_pfx}.self_att.layernorm_before_attention.weight"].contiguous()
model_hf[f"{hf_pfx}.self_attn.q_proj.weight"] = ckpt[f"{bmt_pfx}.self_att.self_attention.project_q.weight"].contiguous()
model_hf[f"{hf_pfx}.self_attn.k_proj.weight"] = ckpt[f"{bmt_pfx}.self_att.self_attention.project_k.weight"].contiguous()
model_hf[f"{hf_pfx}.self_attn.v_proj.weight"] = ckpt[f"{bmt_pfx}.self_att.self_attention.project_v.weight"].contiguous()
model_hf[f"{hf_pfx}.self_attn.o_proj.weight"] = ckpt[f"{bmt_pfx}.self_att.self_attention.attention_out.weight"].contiguous()
model_hf[f"{hf_pfx}.post_attention_layernorm.weight"] = ckpt[f"{bmt_pfx}.ffn.layernorm_before_ffn.weight"].contiguous()
model_hf[f"{hf_pfx}.mlp.gate_proj.weight"] = ckpt[f"{bmt_pfx}.ffn.ffn.w_in.w_0.weight"].contiguous()
model_hf[f"{hf_pfx}.mlp.up_proj.weight"] = ckpt[f"{bmt_pfx}.ffn.ffn.w_in.w_1.weight"].contiguous()
model_hf[f"{hf_pfx}.mlp.down_proj.weight"] = ckpt[f"{bmt_pfx}.ffn.ffn.w_out.weight"].contiguous()
print("Total number in converted", len(model_hf))
return model_hf
def load_model_ckpt(args):
ckpt = torch.load(find_ckpt(args.load))
# with open(os.path.join(os.path.dirname(args.load), "config.json"), 'r') as fin:
# config = json.load(fin)
config = CPMMistralConfig.from_pretrained(args.load)
print(config)
# from IPython import embed; embed(header="222")
config_name = args.save #"cpmlive_llama7b" #os.path.dirname(args.load).split("/")[-1]
hf_ckpt = convert_model(ckpt, config.num_layers)
os.makedirs(f"wip.{config_name}", exist_ok=True)
torch.save(hf_ckpt, f"wip.{config_name}/pytorch_model.bin")
config.save_pretrained(f"wip.{config_name}")
try:
tokenizer = AutoTokenizer.from_pretrained(
args.load,
trust_remote_code=True)
tokenizer.save_pretrained(f"wip.{config_name}")
# for fast tokenizer bug
if 'tokenizer.json' in os.listdir(f"wip.{config_name}"):
os.remove(f"wip.{config_name}/tokenizer.json")
print("Tokenizer loaded")
except Exception as e:
print(e)
try:
shutil.copyfile("autoeval/tokenizer.model", f"wip.{config_name}/tokenizer.model")
shutil.copyfile("autoeval/special_tokens_map.json", f"wip.{config_name}/special_tokens_map.json")
shutil.copyfile("autoeval/tokenizer_config.json", f"wip.{config_name}/tokenizer_config.json")
print("Tokenizer copied")
except Exception as e:
print("Tokenizer not exists!")
print(e)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
# parser.add_argument("--load", type=str, default="/mnt/data/user/tc_agi/user/zhaoweilin/cpmlive-llama-2-7b/pytorch_model.pt")
parser.add_argument("--load", type=str, default="wip.job_469747_ckpt_4069D")
parser.add_argument("--save", type=str, default=None)
parser.add_argument("--seed", type=int, default=42)
args = parser.parse_args()
load_model_ckpt(args)
import argparse
import json
import os
import shutil
import sys
from collections import OrderedDict
import torch
from transformers import AutoTokenizer, LlamaTokenizerFast
from vllm.transformers_utils.configs import CPMMistralConfig
def find_ckpt(directory):
files = os.listdir(directory)
ckpt = [file for file in files if file.endswith(".pt")]
assert len(ckpt) == 1
return "/".join((directory, ckpt[0]))
def convert_model(ckpt, layernum):
print("Total number in orgckpt", len(ckpt))
model_hf = OrderedDict()
model_hf["model.embed_tokens.weight"] = ckpt["input_embedding.weight"].contiguous()
model_hf["model.norm.weight"] = ckpt["encoder.output_layernorm.weight"].contiguous()
try:
model_hf["lm_head.weight"] = ckpt["lm_head.weight"].contiguous()
print("lm_head found")
except:
model_hf["lm_head.weight"] = ckpt["input_embedding.weight"].contiguous()
print("lm_head not found")
for lnum in range(layernum):
hf_pfx = f"model.layers.{lnum}"
bmt_pfx = f"encoder.layers.{lnum}"
model_hf[f"{hf_pfx}.input_layernorm.weight"] = ckpt[
f"{bmt_pfx}.self_att.layernorm_before_attention.weight"
].contiguous()
model_hf[f"{hf_pfx}.self_attn.q_proj.weight"] = ckpt[
f"{bmt_pfx}.self_att.self_attention.project_q.weight"
].contiguous()
model_hf[f"{hf_pfx}.self_attn.k_proj.weight"] = ckpt[
f"{bmt_pfx}.self_att.self_attention.project_k.weight"
].contiguous()
model_hf[f"{hf_pfx}.self_attn.v_proj.weight"] = ckpt[
f"{bmt_pfx}.self_att.self_attention.project_v.weight"
].contiguous()
model_hf[f"{hf_pfx}.self_attn.o_proj.weight"] = ckpt[
f"{bmt_pfx}.self_att.self_attention.attention_out.weight"
].contiguous()
model_hf[f"{hf_pfx}.post_attention_layernorm.weight"] = ckpt[
f"{bmt_pfx}.ffn.layernorm_before_ffn.weight"
].contiguous()
model_hf[f"{hf_pfx}.mlp.gate_proj.weight"] = ckpt[f"{bmt_pfx}.ffn.ffn.w_in.w_0.weight"].contiguous()
model_hf[f"{hf_pfx}.mlp.up_proj.weight"] = ckpt[f"{bmt_pfx}.ffn.ffn.w_in.w_1.weight"].contiguous()
model_hf[f"{hf_pfx}.mlp.down_proj.weight"] = ckpt[f"{bmt_pfx}.ffn.ffn.w_out.weight"].contiguous()
print("Total number in converted", len(model_hf))
return model_hf
def load_model_ckpt(args):
ckpt = torch.load(find_ckpt(args.load))
# with open(os.path.join(os.path.dirname(args.load), "config.json"), 'r') as fin:
# config = json.load(fin)
config = CPMMistralConfig.from_pretrained(args.load)
print(config)
# from IPython import embed; embed(header="222")
config_name = args.save # "cpmlive_llama7b" #os.path.dirname(args.load).split("/")[-1]
hf_ckpt = convert_model(ckpt, config.num_layers)
os.makedirs(f"wip.{config_name}", exist_ok=True)
torch.save(hf_ckpt, f"wip.{config_name}/pytorch_model.bin")
config.save_pretrained(f"wip.{config_name}")
try:
try:
tokenizer = LlamaTokenizerFast.from_pretrained(args.load, trust_remote_code=True)
except Exception as e:
print(e)
tokenizer = AutoTokenizer.from_pretrained(args.load, trust_remote_code=True)
tokenizer.save_pretrained(f"wip.{config_name}")
# for fast tokenizer bug
if "tokenizer.json" in os.listdir(f"wip.{config_name}"):
os.remove(f"wip.{config_name}/tokenizer.json")
print("Tokenizer loaded")
except Exception as e:
print(e)
try:
shutil.copyfile(f"{args.load}/tokenizer.model", f"wip.{config_name}/tokenizer.model")
shutil.copyfile(f"{args.load}/special_tokens_map.json", f"wip.{config_name}/special_tokens_map.json")
shutil.copyfile(f"{args.load}/tokenizer_config.json", f"wip.{config_name}/tokenizer_config.json")
print("Tokenizer copied")
except Exception as e:
print("Tokenizer not exists!")
print(e)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
# parser.add_argument("--load", type=str, default="/mnt/data/user/tc_agi/user/zhaoweilin/cpmlive-llama-2-7b/pytorch_model.pt")
parser.add_argument("--load", type=str, default="wip.job_469747_ckpt_4069D")
parser.add_argument("--save", type=str, default=None)
parser.add_argument("--seed", type=int, default=42)
args = parser.parse_args()
load_model_ckpt(args)
import argparse
from typing import List, Tuple
from vllm import EngineArgs, LLMEngine, SamplingParams, RequestOutput
def create_test_prompts() -> List[Tuple[str, SamplingParams]]:
"""Create a list of test prompts with their sampling parameters."""
return [
("A robot may not injure a human being",
SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1)),
("To be or not to be,",
SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2)),
("What is the meaning of life?",
SamplingParams(n=2,
best_of=5,
temperature=0.8,
top_p=0.95,
frequency_penalty=0.1)),
("It is only with the heart that one can see rightly",
SamplingParams(n=3, best_of=3, use_beam_search=True,
temperature=0.0)),
]
def process_requests(engine: LLMEngine,
test_prompts: List[Tuple[str, SamplingParams]]):
"""Continuously process a list of prompts and handle the outputs."""
request_id = 0
while test_prompts or engine.has_unfinished_requests():
if test_prompts:
prompt, sampling_params = test_prompts.pop(0)
engine.add_request(str(request_id), prompt, sampling_params)
request_id += 1
request_outputs: List[RequestOutput] = engine.step()
for request_output in request_outputs:
if request_output.finished:
print(request_output)
def initialize_engine(args: argparse.Namespace) -> LLMEngine:
"""Initialize the LLMEngine from the command line arguments."""
engine_args = EngineArgs.from_cli_args(args)
return LLMEngine.from_engine_args(engine_args)
def main(args: argparse.Namespace):
"""Main function that sets up and runs the prompt processing."""
engine = initialize_engine(args)
test_prompts = create_test_prompts()
process_requests(engine, test_prompts)
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='Demo on using the LLMEngine class directly')
parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args()
main(args)
from vllm import LLM, SamplingParams
# Sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM.
llm = LLM(model="facebook/opt-125m")
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
import openai
# Modify OpenAI's API key and API base to use vLLM's API server.
openai.api_key = "EMPTY"
openai.api_base = "http://localhost:8000/v1"
# List models API
models = openai.Model.list()
print("Models:", models)
model = models["data"][0]["id"]
# Chat completion API
chat_completion = openai.ChatCompletion.create(
model=model,
messages=[{
"role": "system",
"content": "You are a helpful assistant."
}, {
"role": "user",
"content": "Who won the world series in 2020?"
}, {
"role":
"assistant",
"content":
"The Los Angeles Dodgers won the World Series in 2020."
}, {
"role": "user",
"content": "Where was it played?"
}])
print("Chat completion results:")
print(chat_completion)
import openai
# Modify OpenAI's API key and API base to use vLLM's API server.
openai.api_key = "EMPTY"
openai.api_base = "http://localhost:8000/v1"
# List models API
models = openai.Model.list()
print("Models:", models)
model = models["data"][0]["id"]
# Completion API
stream = False
completion = openai.Completion.create(
model=model,
prompt="A robot may not injure a human being",
echo=False,
n=2,
stream=stream,
logprobs=3)
print("Completion results:")
if stream:
for c in completion:
print(c)
else:
print(completion)
#!/usr/bin/env bash
# YAPF formatter, adapted from ray and skypilot.
#
# Usage:
# # Do work and commit your work.
# # Format files that differ from origin/main.
# bash format.sh
# # Commit changed files with message 'Run yapf and pylint'
#
#
# YAPF + Clang formatter (if installed). This script formats all changed files from the last mergebase.
# You are encouraged to run this locally before pushing changes for review.
# Cause the script to exit if a single command fails
set -eo pipefail
# this stops git rev-parse from failing if we run this from the .git directory
builtin cd "$(dirname "${BASH_SOURCE:-$0}")"
ROOT="$(git rev-parse --show-toplevel)"
builtin cd "$ROOT" || exit 1
YAPF_VERSION=$(yapf --version | awk '{print $2}')
PYLINT_VERSION=$(pylint --version | head -n 1 | awk '{print $2}')
MYPY_VERSION=$(mypy --version | awk '{print $2}')
# # params: tool name, tool version, required version
tool_version_check() {
if [[ $2 != $3 ]]; then
echo "Wrong $1 version installed: $3 is required, not $2."
exit 1
fi
}
tool_version_check "yapf" $YAPF_VERSION "$(grep yapf requirements-dev.txt | cut -d'=' -f3)"
tool_version_check "pylint" $PYLINT_VERSION "$(grep "pylint==" requirements-dev.txt | cut -d'=' -f3)"
tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-dev.txt | cut -d'=' -f3)"
YAPF_FLAGS=(
'--recursive'
'--parallel'
)
YAPF_EXCLUDES=(
'--exclude' 'build/**'
)
# Format specified files
format() {
yapf --in-place "${YAPF_FLAGS[@]}" "$@"
}
# Format files that differ from main branch. Ignores dirs that are not slated
# for autoformat yet.
format_changed() {
# The `if` guard ensures that the list of filenames is not empty, which
# could cause yapf to receive 0 positional arguments, making it hang
# waiting for STDIN.
#
# `diff-filter=ACM` and $MERGEBASE is to ensure we only format files that
# exist on both branches.
MERGEBASE="$(git merge-base origin/main HEAD)"
if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then
git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \
yapf --in-place "${YAPF_EXCLUDES[@]}" "${YAPF_FLAGS[@]}"
fi
}
# Format all files
format_all() {
yapf --in-place "${YAPF_FLAGS[@]}" "${YAPF_EXCLUDES[@]}" vllm tests
}
## This flag formats individual files. --files *must* be the first command line
## arg to use this option.
if [[ "$1" == '--files' ]]; then
format "${@:2}"
# If `--all` is passed, then any further arguments are ignored and the
# entire python directory is formatted.
elif [[ "$1" == '--all' ]]; then
format_all
else
# Format only the files that changed in last commit.
format_changed
fi
echo 'vLLM yapf: Done'
# Run mypy
# TODO(zhuohan): Enable mypy
# echo 'vLLM mypy:'
# mypy
# Lint specified files
lint() {
pylint "$@"
}
# Lint files that differ from main branch. Ignores dirs that are not slated
# for autolint yet.
lint_changed() {
# The `if` guard ensures that the list of filenames is not empty, which
# could cause pylint to receive 0 positional arguments, making it hang
# waiting for STDIN.
#
# `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that
# exist on both branches.
MERGEBASE="$(git merge-base origin/main HEAD)"
if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then
git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \
pylint
fi
}
# Run Pylint
echo 'vLLM Pylint:'
## This flag lints individual files. --files *must* be the first command line
## arg to use this option.
if [[ "$1" == '--files' ]]; then
lint "${@:2}"
# If `--all` is passed, then any further arguments are ignored and the
# entire python directory is linted.
elif [[ "$1" == '--all' ]]; then
lint vllm tests
else
# Format only the files that changed in last commit.
lint_changed
fi
if ! git diff --quiet &>/dev/null; then
echo 'Reformatted files. Please review and stage the changes.'
echo 'Changes not staged for commit:'
echo
git --no-pager diff --name-only
exit 1
fi
[mypy]
python_version = 3.8
ignore_missing_imports = True
files = vllm
# TODO(woosuk): Include the code from Megatron and HuggingFace.
exclude = vllm/model_executor/parallel_utils/|vllm/model_executor/models/
[build-system]
requires = [
"ninja",
"packaging",
"setuptools",
"torch >= 2.1.0",
"wheel",
]
build-backend = "setuptools.build_meta"
# formatting
yapf==0.32.0
pylint==2.8.2
# type checking
mypy==0.991
types-PyYAML
types-requests
types-setuptools
# testing
pytest
pytest-forked
pytest-asyncio
ninja # For faster builds.
psutil
ray >= 2.5.1
pandas # Required for Ray data.
pyarrow # Required for Ray data.
sentencepiece # Required for LLaMA tokenizer.
numpy
einops # Required for phi-1_5
torch >= 2.1.0
transformers >= 4.34.0 # Required for Mistral.
xformers >= 0.0.22.post7 # Required for CUDA 12.1.
fastapi
uvicorn[standard]
pydantic == 1.10.13 # Required for OpenAI server.
import io
import os
import re
import subprocess
from typing import List, Set
import warnings
from packaging.version import parse, Version
import setuptools
import torch
from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME
ROOT_DIR = os.path.dirname(__file__)
MAIN_CUDA_VERSION = "12.1"
# Supported NVIDIA GPU architectures.
SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"}
# Compiler flags.
CXX_FLAGS = ["-g", "-O2", "-std=c++17"]
# TODO(woosuk): Should we use -O3?
NVCC_FLAGS = ["-O2", "-std=c++17"]
ABI = 1 if torch._C._GLIBCXX_USE_CXX11_ABI else 0
CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]
NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]
if CUDA_HOME is None:
raise RuntimeError(
"Cannot find CUDA_HOME. CUDA must be available to build the package.")
def get_nvcc_cuda_version(cuda_dir: str) -> Version:
"""Get the CUDA version from nvcc.
Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py
"""
nvcc_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"],
universal_newlines=True)
output = nvcc_output.split()
release_idx = output.index("release") + 1
nvcc_cuda_version = parse(output[release_idx].split(",")[0])
return nvcc_cuda_version
def get_torch_arch_list() -> Set[str]:
# TORCH_CUDA_ARCH_LIST can have one or more architectures,
# e.g. "8.0" or "7.5,8.0,8.6+PTX". Here, the "8.6+PTX" option asks the
# compiler to additionally include PTX code that can be runtime-compiled
# and executed on the 8.6 or newer architectures. While the PTX code will
# not give the best performance on the newer architectures, it provides
# forward compatibility.
env_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None)
if env_arch_list is None:
return set()
# List are separated by ; or space.
torch_arch_list = set(env_arch_list.replace(" ", ";").split(";"))
if not torch_arch_list:
return set()
# Filter out the invalid architectures and print a warning.
valid_archs = SUPPORTED_ARCHS.union({s + "+PTX" for s in SUPPORTED_ARCHS})
arch_list = torch_arch_list.intersection(valid_archs)
# If none of the specified architectures are valid, raise an error.
if not arch_list:
raise RuntimeError(
"None of the CUDA architectures in `TORCH_CUDA_ARCH_LIST` env "
f"variable ({env_arch_list}) is supported. "
f"Supported CUDA architectures are: {valid_archs}.")
invalid_arch_list = torch_arch_list - valid_archs
if invalid_arch_list:
warnings.warn(
f"Unsupported CUDA architectures ({invalid_arch_list}) are "
"excluded from the `TORCH_CUDA_ARCH_LIST` env variable "
f"({env_arch_list}). Supported CUDA architectures are: "
f"{valid_archs}.")
return arch_list
# First, check the TORCH_CUDA_ARCH_LIST environment variable.
compute_capabilities = get_torch_arch_list()
if not compute_capabilities:
# If TORCH_CUDA_ARCH_LIST is not defined or empty, target all available
# GPUs on the current machine.
device_count = torch.cuda.device_count()
for i in range(device_count):
major, minor = torch.cuda.get_device_capability(i)
if major < 7:
raise RuntimeError(
"GPUs with compute capability below 7.0 are not supported.")
compute_capabilities.add(f"{major}.{minor}")
nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME)
if not compute_capabilities:
# If no GPU is specified nor available, add all supported architectures
# based on the NVCC CUDA version.
compute_capabilities = SUPPORTED_ARCHS.copy()
if nvcc_cuda_version < Version("11.1"):
compute_capabilities.remove("8.6")
if nvcc_cuda_version < Version("11.8"):
compute_capabilities.remove("8.9")
compute_capabilities.remove("9.0")
# Validate the NVCC CUDA version.
if nvcc_cuda_version < Version("11.0"):
raise RuntimeError("CUDA 11.0 or higher is required to build the package.")
if nvcc_cuda_version < Version("11.1"):
if any(cc.startswith("8.6") for cc in compute_capabilities):
raise RuntimeError(
"CUDA 11.1 or higher is required for compute capability 8.6.")
if nvcc_cuda_version < Version("11.8"):
if any(cc.startswith("8.9") for cc in compute_capabilities):
# CUDA 11.8 is required to generate the code targeting compute capability 8.9.
# However, GPUs with compute capability 8.9 can also run the code generated by
# the previous versions of CUDA 11 and targeting compute capability 8.0.
# Therefore, if CUDA 11.8 is not available, we target compute capability 8.0
# instead of 8.9.
warnings.warn(
"CUDA 11.8 or higher is required for compute capability 8.9. "
"Targeting compute capability 8.0 instead.")
compute_capabilities = set(cc for cc in compute_capabilities
if not cc.startswith("8.9"))
compute_capabilities.add("8.0+PTX")
if any(cc.startswith("9.0") for cc in compute_capabilities):
raise RuntimeError(
"CUDA 11.8 or higher is required for compute capability 9.0.")
# Add target compute capabilities to NVCC flags.
for capability in compute_capabilities:
num = capability[0] + capability[2]
NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=sm_{num}"]
if capability.endswith("+PTX"):
NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=compute_{num}"]
# Use NVCC threads to parallelize the build.
if nvcc_cuda_version >= Version("11.2"):
num_threads = min(os.cpu_count(), 8)
NVCC_FLAGS += ["--threads", str(num_threads)]
ext_modules = []
# Cache operations.
cache_extension = CUDAExtension(
name="vllm.cache_ops",
sources=["csrc/cache.cpp", "csrc/cache_kernels.cu"],
extra_compile_args={
"cxx": CXX_FLAGS,
"nvcc": NVCC_FLAGS,
},
)
ext_modules.append(cache_extension)
# Attention kernels.
attention_extension = CUDAExtension(
name="vllm.attention_ops",
sources=["csrc/attention.cpp", "csrc/attention/attention_kernels.cu"],
extra_compile_args={
"cxx": CXX_FLAGS,
"nvcc": NVCC_FLAGS,
},
)
ext_modules.append(attention_extension)
# Positional encoding kernels.
positional_encoding_extension = CUDAExtension(
name="vllm.pos_encoding_ops",
sources=["csrc/pos_encoding.cpp", "csrc/pos_encoding_kernels.cu"],
extra_compile_args={
"cxx": CXX_FLAGS,
"nvcc": NVCC_FLAGS,
},
)
ext_modules.append(positional_encoding_extension)
# Layer normalization kernels.
layernorm_extension = CUDAExtension(
name="vllm.layernorm_ops",
sources=["csrc/layernorm.cpp", "csrc/layernorm_kernels.cu"],
extra_compile_args={
"cxx": CXX_FLAGS,
"nvcc": NVCC_FLAGS,
},
)
ext_modules.append(layernorm_extension)
# Activation kernels.
activation_extension = CUDAExtension(
name="vllm.activation_ops",
sources=["csrc/activation.cpp", "csrc/activation_kernels.cu"],
extra_compile_args={
"cxx": CXX_FLAGS,
"nvcc": NVCC_FLAGS,
},
)
ext_modules.append(activation_extension)
# Quantization kernels.
quantization_extension = CUDAExtension(
name="vllm.quantization_ops",
sources=[
"csrc/quantization.cpp",
"csrc/quantization/awq/gemm_kernels.cu",
"csrc/quantization/squeezellm/quant_cuda_kernel.cu",
],
extra_compile_args={
"cxx": CXX_FLAGS,
"nvcc": NVCC_FLAGS,
},
)
ext_modules.append(quantization_extension)
# Misc. CUDA utils.
cuda_utils_extension = CUDAExtension(
name="vllm.cuda_utils",
sources=["csrc/cuda_utils.cpp", "csrc/cuda_utils_kernels.cu"],
extra_compile_args={
"cxx": CXX_FLAGS,
"nvcc": NVCC_FLAGS,
},
)
ext_modules.append(cuda_utils_extension)
def get_path(*filepath) -> str:
return os.path.join(ROOT_DIR, *filepath)
def find_version(filepath: str) -> str:
"""Extract version information from the given filepath.
Adapted from https://github.com/ray-project/ray/blob/0b190ee1160eeca9796bc091e07eaebf4c85b511/python/setup.py
"""
with open(filepath) as fp:
version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]",
fp.read(), re.M)
if version_match:
return version_match.group(1)
raise RuntimeError("Unable to find version string.")
def get_vllm_version() -> str:
version = find_version(get_path("vllm", "__init__.py"))
cuda_version = str(nvcc_cuda_version)
if cuda_version != MAIN_CUDA_VERSION:
cuda_version_str = cuda_version.replace(".", "")[:3]
version += f"+cu{cuda_version_str}"
return version
def read_readme() -> str:
"""Read the README file if present."""
p = get_path("README.md")
if os.path.isfile(p):
return io.open(get_path("README.md"), "r", encoding="utf-8").read()
else:
return ""
def get_requirements() -> List[str]:
"""Get Python package dependencies from requirements.txt."""
with open(get_path("requirements.txt")) as f:
requirements = f.read().strip().split("\n")
return requirements
setuptools.setup(
name="vllm",
version=get_vllm_version(),
author="vLLM Team",
license="Apache 2.0",
description=("A high-throughput and memory-efficient inference and "
"serving engine for LLMs"),
long_description=read_readme(),
long_description_content_type="text/markdown",
url="https://github.com/vllm-project/vllm",
project_urls={
"Homepage": "https://github.com/vllm-project/vllm",
"Documentation": "https://vllm.readthedocs.io/en/latest/",
},
classifiers=[
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"License :: OSI Approved :: Apache Software License",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
],
packages=setuptools.find_packages(exclude=("benchmarks", "csrc", "docs",
"examples", "tests")),
python_requires=">=3.8",
install_requires=get_requirements(),
ext_modules=ext_modules,
cmdclass={"build_ext": BuildExtension},
package_data={"vllm": ["py.typed"]},
)
"""vllm.entrypoints.api_server with some extra logging for testing."""
import argparse
from typing import Any, Dict
import uvicorn
from fastapi.responses import JSONResponse, Response
import vllm.entrypoints.api_server
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
app = vllm.entrypoints.api_server.app
class AsyncLLMEngineWithStats(AsyncLLMEngine):
# pylint: disable=redefined-outer-name
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._num_aborts = 0
async def abort(self, request_id: str) -> None:
await super().abort(request_id)
self._num_aborts += 1
def testing_stats(self) -> Dict[str, Any]:
return {"num_aborted_requests": self._num_aborts}
@app.get("/stats")
def stats() -> Response:
"""Get the statistics of the engine."""
return JSONResponse(engine.testing_stats())
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=8000)
parser = AsyncEngineArgs.add_cli_args(parser)
args = parser.parse_args()
engine_args = AsyncEngineArgs.from_cli_args(args)
engine = AsyncLLMEngineWithStats.from_engine_args(engine_args)
vllm.entrypoints.api_server.engine = engine
uvicorn.run(
app,
host=args.host,
port=args.port,
log_level="debug",
timeout_keep_alive=vllm.entrypoints.api_server.TIMEOUT_KEEP_ALIVE)
import subprocess
import sys
import time
from multiprocessing import Pool
from pathlib import Path
import pytest
import requests
def _query_server(prompt: str) -> dict:
response = requests.post("http://localhost:8000/generate",
json={
"prompt": prompt,
"max_tokens": 100,
"temperature": 0,
"ignore_eos": True
})
response.raise_for_status()
return response.json()
@pytest.fixture
def api_server():
script_path = Path(__file__).parent.joinpath(
"api_server_async_engine.py").absolute()
# pylint: disable=consider-using-with
uvicorn_process = subprocess.Popen([
sys.executable, "-u",
str(script_path), "--model", "facebook/opt-125m"
])
yield
uvicorn_process.terminate()
# pylint: disable=redefined-outer-name, unused-argument
def test_api_server(api_server):
"""
Run the API server and test it.
We run both the server and requests in separate processes.
We test that the server can handle incoming requests, including
multiple requests at the same time, and that it can handle requests
being cancelled without crashing.
"""
with Pool(32) as pool:
# Wait until the server is ready
prompts = ["Hello world"] * 1
result = None
while not result:
# pylint: disable=bare-except
try:
for result in pool.map(_query_server, prompts):
break
except:
time.sleep(1)
# Actual tests start here
# Try with 1 prompt
for result in pool.map(_query_server, prompts):
assert result
num_aborted_requests = requests.get(
"http://localhost:8000/stats").json()["num_aborted_requests"]
assert num_aborted_requests == 0
# Try with 100 prompts
prompts = ["Hello world"] * 100
for result in pool.map(_query_server, prompts):
assert result
# Cancel requests
pool.map_async(_query_server, prompts)
time.sleep(0.01)
pool.terminate()
pool.join()
# check cancellation stats
num_aborted_requests = requests.get(
"http://localhost:8000/stats").json()["num_aborted_requests"]
assert num_aborted_requests > 0
# check that server still runs after cancellations
with Pool(32) as pool:
# Try with 100 prompts
prompts = ["Hello world"] * 100
for result in pool.map(_query_server, prompts):
assert result
import asyncio
from dataclasses import dataclass
import pytest
from vllm.engine.async_llm_engine import AsyncLLMEngine
@dataclass
class RequestOutput:
request_id: int
finished: bool = False
class MockEngine:
def __init__(self):
self.step_calls = 0
self.add_request_calls = 0
self.abort_request_calls = 0
self.request_id = None
async def step_async(self):
self.step_calls += 1
return [RequestOutput(
request_id=self.request_id)] if self.request_id else []
def generate(self, request_id):
self.request_id = request_id
def stop_generating(self):
self.request_id = None
def add_request(self, **kwargs):
del kwargs # Unused
self.add_request_calls += 1
def abort_request(self, request_id):
del request_id # Unused
self.abort_request_calls += 1
class MockAsyncLLMEngine(AsyncLLMEngine):
def _init_engine(self, *args, **kwargs):
return MockEngine()
@pytest.mark.asyncio
async def test_new_requests_event():
engine = MockAsyncLLMEngine(worker_use_ray=False, engine_use_ray=False)
engine.start_background_loop()
await asyncio.sleep(0.01)
assert engine.engine.step_calls == 0
await engine.add_request("1", "", None)
await asyncio.sleep(0.01)
assert engine.engine.add_request_calls == 1
assert engine.engine.step_calls == 1
await engine.add_request("2", "", None)
engine.engine.generate("2")
await asyncio.sleep(0)
assert engine.engine.add_request_calls == 2
assert engine.engine.step_calls == 2
await asyncio.sleep(0)
assert engine.engine.step_calls == 3
engine.engine.stop_generating()
await asyncio.sleep(0)
assert engine.engine.step_calls == 4
await asyncio.sleep(0)
assert engine.engine.step_calls == 4
await engine.add_request("3", "", None)
await asyncio.sleep(0.01)
assert engine.engine.add_request_calls == 3
assert engine.engine.step_calls == 5
await asyncio.sleep(0.01)
assert engine.engine.add_request_calls == 3
assert engine.engine.step_calls == 5
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment