Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
FGORGEOUS
minicpm_classify_pytorch
Commits
24eacbc0
Commit
24eacbc0
authored
May 09, 2024
by
chenzk
Browse files
v1.0
parents
Changes
356
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1290 additions
and
0 deletions
+1290
-0
inference/vllm/examples/infer_cpm/transform.py
inference/vllm/examples/infer_cpm/transform.py
+9
-0
inference/vllm/examples/infer_cpm/vllm_convert_checkpoint_to_hf.py
.../vllm/examples/infer_cpm/vllm_convert_checkpoint_to_hf.py
+132
-0
inference/vllm/examples/infer_cpm_mistral/inference.py
inference/vllm/examples/infer_cpm_mistral/inference.py
+59
-0
inference/vllm/examples/infer_cpm_mistral/run_infer.sh
inference/vllm/examples/infer_cpm_mistral/run_infer.sh
+8
-0
inference/vllm/examples/infer_cpm_mistral/vllm_convert_checkpoint_to_hf.bk.py
...les/infer_cpm_mistral/vllm_convert_checkpoint_to_hf.bk.py
+116
-0
inference/vllm/examples/infer_cpm_mistral/vllm_convert_checkpoint_to_hf.py
...amples/infer_cpm_mistral/vllm_convert_checkpoint_to_hf.py
+117
-0
inference/vllm/examples/llm_engine_example.py
inference/vllm/examples/llm_engine_example.py
+62
-0
inference/vllm/examples/offline_inference.py
inference/vllm/examples/offline_inference.py
+22
-0
inference/vllm/examples/openai_chatcompletion_client.py
inference/vllm/examples/openai_chatcompletion_client.py
+33
-0
inference/vllm/examples/openai_completion_client.py
inference/vllm/examples/openai_completion_client.py
+28
-0
inference/vllm/format.sh
inference/vllm/format.sh
+141
-0
inference/vllm/mypy.ini
inference/vllm/mypy.ini
+8
-0
inference/vllm/pyproject.toml
inference/vllm/pyproject.toml
+9
-0
inference/vllm/requirements-dev.txt
inference/vllm/requirements-dev.txt
+15
-0
inference/vllm/requirements.txt
inference/vllm/requirements.txt
+14
-0
inference/vllm/setup.py
inference/vllm/setup.py
+297
-0
inference/vllm/tests/__init__.py
inference/vllm/tests/__init__.py
+0
-0
inference/vllm/tests/async_engine/api_server_async_engine.py
inference/vllm/tests/async_engine/api_server_async_engine.py
+51
-0
inference/vllm/tests/async_engine/test_api_server.py
inference/vllm/tests/async_engine/test_api_server.py
+89
-0
inference/vllm/tests/async_engine/test_async_llm_engine.py
inference/vllm/tests/async_engine/test_async_llm_engine.py
+80
-0
No files found.
inference/vllm/examples/infer_cpm/transform.py
0 → 100644
View file @
24eacbc0
import
json
import
pandas
as
pd
with
open
(
"prompts/cpm-8b-1210_zh_cpm_0129.jsondpo.v2-dpo.jsonl"
,
'r'
)
as
fin
:
data
=
json
.
load
(
fin
)
# with open("prompts/cpm-8b-1210_zh_cpm_0129.jsondpo.v2-dpo.xlxs", 'w') as fout:
df
=
pd
.
DataFrame
(
data
)
df
.
to_excel
(
"prompts/cpm-8b-1210_zh_cpm_0129.jsondpo.v2-dpo.xlsx"
)
inference/vllm/examples/infer_cpm/vllm_convert_checkpoint_to_hf.py
0 → 100644
View file @
24eacbc0
import
argparse
import
json
import
os
import
shutil
import
sys
from
collections
import
OrderedDict
import
torch
from
transformers
import
AutoTokenizer
from
vllm.transformers_utils.configs
import
CPMMistralConfig
def
find_ckpt
(
directory
):
files
=
os
.
listdir
(
directory
)
ckpt
=
[
file
for
file
in
files
if
file
.
endswith
(
".pt"
)]
assert
len
(
ckpt
)
==
1
return
'/'
.
join
((
directory
,
ckpt
[
0
]))
def
convert_model
(
ckpt
,
layernum
):
print
(
"Total number in orgckpt"
,
len
(
ckpt
))
model_hf
=
OrderedDict
()
model_hf
[
'model.embed_tokens.weight'
]
=
ckpt
[
"input_embedding.weight"
].
contiguous
()
model_hf
[
'model.norm.weight'
]
=
ckpt
[
"encoder.output_layernorm.weight"
].
contiguous
()
try
:
model_hf
[
'lm_head.weight'
]
=
ckpt
[
'lm_head.weight'
].
contiguous
()
print
(
"lm_head found"
)
except
:
model_hf
[
'lm_head.weight'
]
=
ckpt
[
"input_embedding.weight"
].
contiguous
()
print
(
"lm_head not found"
)
# print(ckpt.keys())
for
lnum
in
range
(
layernum
):
hf_pfx
=
f
"model.layers.
{
lnum
}
"
bmt_pfx
=
f
"encoder.layers.
{
lnum
}
"
model_hf
[
f
"
{
hf_pfx
}
.input_layernorm.weight"
]
=
ckpt
[
f
"
{
bmt_pfx
}
.self_att.layernorm_before_attention.weight"
].
contiguous
()
model_hf
[
f
"
{
hf_pfx
}
.self_attn.q_proj.weight"
]
=
ckpt
[
f
"
{
bmt_pfx
}
.self_att.self_attention.project_q.weight"
].
contiguous
()
model_hf
[
f
"
{
hf_pfx
}
.self_attn.k_proj.weight"
]
=
ckpt
[
f
"
{
bmt_pfx
}
.self_att.self_attention.project_k.weight"
].
contiguous
()
model_hf
[
f
"
{
hf_pfx
}
.self_attn.v_proj.weight"
]
=
ckpt
[
f
"
{
bmt_pfx
}
.self_att.self_attention.project_v.weight"
].
contiguous
()
model_hf
[
f
"
{
hf_pfx
}
.self_attn.o_proj.weight"
]
=
ckpt
[
f
"
{
bmt_pfx
}
.self_att.self_attention.attention_out.weight"
].
contiguous
()
model_hf
[
f
"
{
hf_pfx
}
.post_attention_layernorm.weight"
]
=
ckpt
[
f
"
{
bmt_pfx
}
.ffn.layernorm_before_ffn.weight"
].
contiguous
()
model_hf
[
f
"
{
hf_pfx
}
.mlp.gate_proj.weight"
]
=
ckpt
[
f
"
{
bmt_pfx
}
.ffn.ffn.w_in.w_0.weight"
].
contiguous
()
model_hf
[
f
"
{
hf_pfx
}
.mlp.up_proj.weight"
]
=
ckpt
[
f
"
{
bmt_pfx
}
.ffn.ffn.w_in.w_1.weight"
].
contiguous
()
model_hf
[
f
"
{
hf_pfx
}
.mlp.down_proj.weight"
]
=
ckpt
[
f
"
{
bmt_pfx
}
.ffn.ffn.w_out.weight"
].
contiguous
()
try
:
model_hf
[
f
"
{
hf_pfx
}
.self_attn.q_norm.weight"
]
=
ckpt
[
f
"
{
bmt_pfx
}
.self_att.self_attention.q_norm.weight"
].
contiguous
()
model_hf
[
f
"
{
hf_pfx
}
.self_attn.k_norm.weight"
]
=
ckpt
[
f
"
{
bmt_pfx
}
.self_att.self_attention.k_norm.weight"
].
contiguous
()
except
:
print
(
"Error!"
)
pass
print
(
"Total number in converted"
,
len
(
model_hf
))
return
model_hf
def
load_model_ckpt
(
args
):
ckpt
=
torch
.
load
(
find_ckpt
(
args
.
load
))
# with open(os.path.join(os.path.dirname(args.load), "config.json"), 'r') as fin:
# config = json.load(fin)
config
=
CPMMistralConfig
.
from_pretrained
(
args
.
load
)
print
(
config
)
# from IPython import embed; embed(header="222")
if
args
.
save
is
not
None
:
config_name
=
args
.
save
#"cpmlive_llama7b" #os.path.dirname(args.load).split("/")[-1]
else
:
config_name
=
args
.
load
hf_ckpt
=
convert_model
(
ckpt
,
config
.
num_layers
)
os
.
makedirs
(
f
"
{
config_name
}
"
,
exist_ok
=
True
)
torch
.
save
(
hf_ckpt
,
f
"
{
config_name
}
/pytorch_model.bin"
)
config
.
save_pretrained
(
f
"
{
config_name
}
"
)
# try:
# tokenizer = AutoTokenizer.from_pretrained(
# args.load,
# trust_remote_code=True)
# tokenizer.save_pretrained(f"{config_name}")
# # for fast tokenizer bug
# if 'tokenizer.json' in os.listdir(f"{config_name}"):
# os.remove(f"{config_name}/tokenizer.json")
# print("Tokenizer loaded")
# except Exception as e:
# print(e)
# try:
# shutil.copyfile("autoeval/120k_v2.model", f"wip.{config_name}/tokenizer.model")
# shutil.copyfile("autoeval/special_tokens_map.json", f"wip.{config_name}/special_tokens_map.json")
# shutil.copyfile("autoeval/tokenizer_config.json", f"wip.{config_name}/tokenizer_config.json")
# print("Tokenizer copied")
# except Exception as e:
# print("Tokenizer not exists!")
# print(e)
if
__name__
==
"__main__"
:
parser
=
argparse
.
ArgumentParser
()
# parser.add_argument("--load", type=str, default="/mnt/data/user/tc_agi/user/zhaoweilin/cpmlive-llama-2-7b/pytorch_model.pt")
parser
.
add_argument
(
"--load"
,
type
=
str
,
default
=
"wip.job_469747_ckpt_4069"
)
parser
.
add_argument
(
"--save"
,
type
=
str
,
default
=
None
)
parser
.
add_argument
(
"--seed"
,
type
=
int
,
default
=
42
)
args
=
parser
.
parse_args
()
load_model_ckpt
(
args
)
inference/vllm/examples/infer_cpm_mistral/inference.py
0 → 100644
View file @
24eacbc0
import
argparse
from
vllm
import
LLM
,
SamplingParams
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
"--model_path"
,
type
=
str
,
default
=
""
)
args
=
parser
.
parse_args
()
# Sample prompts.
prompts
=
[
"北京烤鸭真好吃,正好有一家北京烤鸭店,我想去吃北京烤鸭"
,
"def reverse_list(list):"
,
"Beijing is the capital of"
,
"1 + 1 = "
,
"Hello, my name is"
,
"The president of the United States is"
,
"The capital of France is"
,
"The future of AI is"
,
'''0123456789'''
*
810
+
" what is next number?"
,
"您好,三加二等于多少?"
,
"我是"
,
""
]
params_dict
=
{
"n"
:
1
,
"best_of"
:
None
,
"presence_penalty"
:
0.0
,
"frequency_penalty"
:
0.0
,
"temperature"
:
1
,
"top_p"
:
0.9
,
"top_k"
:
-
1
,
"use_beam_search"
:
False
,
"length_penalty"
:
1.0
,
"early_stopping"
:
False
,
"stop"
:
None
,
"stop_token_ids"
:
None
,
"ignore_eos"
:
False
,
"max_tokens"
:
100
,
"logprobs"
:
None
,
"prompt_logprobs"
:
None
,
"skip_special_tokens"
:
True
,
}
# Create a sampling params object.
sampling_params
=
SamplingParams
(
**
params_dict
)
# Create an LLM.
llm
=
LLM
(
model
=
args
.
model_path
,
tensor_parallel_size
=
1
,
dtype
=
'bfloat16'
)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
# Print the outputs.
for
output
in
outputs
:
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
inference/vllm/examples/infer_cpm_mistral/run_infer.sh
0 → 100644
View file @
24eacbc0
#!/bin/bash
CHECKPOINT
=
$1
SAVE_TO
=
$2
HF_MODEL_NAME
=
wip.
$SAVE_TO
# huggingface上的模型名
python vllm_convert_checkpoint_to_hf.py
--load
$CHECKPOINT
--save
$SAVE_TO
python inference.py
--model_path
=
$HF_MODEL_NAME
inference/vllm/examples/infer_cpm_mistral/vllm_convert_checkpoint_to_hf.bk.py
0 → 100644
View file @
24eacbc0
import
argparse
import
json
import
os
import
shutil
import
sys
from
collections
import
OrderedDict
import
torch
from
transformers
import
AutoTokenizer
from
vllm.transformers_utils.configs
import
CPMMistralConfig
def
find_ckpt
(
directory
):
files
=
os
.
listdir
(
directory
)
ckpt
=
[
file
for
file
in
files
if
file
.
endswith
(
".pt"
)]
assert
len
(
ckpt
)
==
1
return
'/'
.
join
((
directory
,
ckpt
[
0
]))
def
convert_model
(
ckpt
,
layernum
):
print
(
"Total number in orgckpt"
,
len
(
ckpt
))
model_hf
=
OrderedDict
()
model_hf
[
'model.embed_tokens.weight'
]
=
ckpt
[
"input_embedding.weight"
].
contiguous
()
model_hf
[
'model.norm.weight'
]
=
ckpt
[
"encoder.output_layernorm.weight"
].
contiguous
()
try
:
model_hf
[
'lm_head.weight'
]
=
ckpt
[
'lm_head.weight'
].
contiguous
()
print
(
"lm_head found"
)
except
:
model_hf
[
'lm_head.weight'
]
=
ckpt
[
"input_embedding.weight"
].
contiguous
()
print
(
"lm_head not found"
)
for
lnum
in
range
(
layernum
):
hf_pfx
=
f
"model.layers.
{
lnum
}
"
bmt_pfx
=
f
"encoder.layers.
{
lnum
}
"
model_hf
[
f
"
{
hf_pfx
}
.input_layernorm.weight"
]
=
ckpt
[
f
"
{
bmt_pfx
}
.self_att.layernorm_before_attention.weight"
].
contiguous
()
model_hf
[
f
"
{
hf_pfx
}
.self_attn.q_proj.weight"
]
=
ckpt
[
f
"
{
bmt_pfx
}
.self_att.self_attention.project_q.weight"
].
contiguous
()
model_hf
[
f
"
{
hf_pfx
}
.self_attn.k_proj.weight"
]
=
ckpt
[
f
"
{
bmt_pfx
}
.self_att.self_attention.project_k.weight"
].
contiguous
()
model_hf
[
f
"
{
hf_pfx
}
.self_attn.v_proj.weight"
]
=
ckpt
[
f
"
{
bmt_pfx
}
.self_att.self_attention.project_v.weight"
].
contiguous
()
model_hf
[
f
"
{
hf_pfx
}
.self_attn.o_proj.weight"
]
=
ckpt
[
f
"
{
bmt_pfx
}
.self_att.self_attention.attention_out.weight"
].
contiguous
()
model_hf
[
f
"
{
hf_pfx
}
.post_attention_layernorm.weight"
]
=
ckpt
[
f
"
{
bmt_pfx
}
.ffn.layernorm_before_ffn.weight"
].
contiguous
()
model_hf
[
f
"
{
hf_pfx
}
.mlp.gate_proj.weight"
]
=
ckpt
[
f
"
{
bmt_pfx
}
.ffn.ffn.w_in.w_0.weight"
].
contiguous
()
model_hf
[
f
"
{
hf_pfx
}
.mlp.up_proj.weight"
]
=
ckpt
[
f
"
{
bmt_pfx
}
.ffn.ffn.w_in.w_1.weight"
].
contiguous
()
model_hf
[
f
"
{
hf_pfx
}
.mlp.down_proj.weight"
]
=
ckpt
[
f
"
{
bmt_pfx
}
.ffn.ffn.w_out.weight"
].
contiguous
()
print
(
"Total number in converted"
,
len
(
model_hf
))
return
model_hf
def
load_model_ckpt
(
args
):
ckpt
=
torch
.
load
(
find_ckpt
(
args
.
load
))
# with open(os.path.join(os.path.dirname(args.load), "config.json"), 'r') as fin:
# config = json.load(fin)
config
=
CPMMistralConfig
.
from_pretrained
(
args
.
load
)
print
(
config
)
# from IPython import embed; embed(header="222")
config_name
=
args
.
save
#"cpmlive_llama7b" #os.path.dirname(args.load).split("/")[-1]
hf_ckpt
=
convert_model
(
ckpt
,
config
.
num_layers
)
os
.
makedirs
(
f
"wip.
{
config_name
}
"
,
exist_ok
=
True
)
torch
.
save
(
hf_ckpt
,
f
"wip.
{
config_name
}
/pytorch_model.bin"
)
config
.
save_pretrained
(
f
"wip.
{
config_name
}
"
)
try
:
tokenizer
=
AutoTokenizer
.
from_pretrained
(
args
.
load
,
trust_remote_code
=
True
)
tokenizer
.
save_pretrained
(
f
"wip.
{
config_name
}
"
)
# for fast tokenizer bug
if
'tokenizer.json'
in
os
.
listdir
(
f
"wip.
{
config_name
}
"
):
os
.
remove
(
f
"wip.
{
config_name
}
/tokenizer.json"
)
print
(
"Tokenizer loaded"
)
except
Exception
as
e
:
print
(
e
)
try
:
shutil
.
copyfile
(
"autoeval/tokenizer.model"
,
f
"wip.
{
config_name
}
/tokenizer.model"
)
shutil
.
copyfile
(
"autoeval/special_tokens_map.json"
,
f
"wip.
{
config_name
}
/special_tokens_map.json"
)
shutil
.
copyfile
(
"autoeval/tokenizer_config.json"
,
f
"wip.
{
config_name
}
/tokenizer_config.json"
)
print
(
"Tokenizer copied"
)
except
Exception
as
e
:
print
(
"Tokenizer not exists!"
)
print
(
e
)
if
__name__
==
"__main__"
:
parser
=
argparse
.
ArgumentParser
()
# parser.add_argument("--load", type=str, default="/mnt/data/user/tc_agi/user/zhaoweilin/cpmlive-llama-2-7b/pytorch_model.pt")
parser
.
add_argument
(
"--load"
,
type
=
str
,
default
=
"wip.job_469747_ckpt_4069D"
)
parser
.
add_argument
(
"--save"
,
type
=
str
,
default
=
None
)
parser
.
add_argument
(
"--seed"
,
type
=
int
,
default
=
42
)
args
=
parser
.
parse_args
()
load_model_ckpt
(
args
)
inference/vllm/examples/infer_cpm_mistral/vllm_convert_checkpoint_to_hf.py
0 → 100644
View file @
24eacbc0
import
argparse
import
json
import
os
import
shutil
import
sys
from
collections
import
OrderedDict
import
torch
from
transformers
import
AutoTokenizer
,
LlamaTokenizerFast
from
vllm.transformers_utils.configs
import
CPMMistralConfig
def
find_ckpt
(
directory
):
files
=
os
.
listdir
(
directory
)
ckpt
=
[
file
for
file
in
files
if
file
.
endswith
(
".pt"
)]
assert
len
(
ckpt
)
==
1
return
"/"
.
join
((
directory
,
ckpt
[
0
]))
def
convert_model
(
ckpt
,
layernum
):
print
(
"Total number in orgckpt"
,
len
(
ckpt
))
model_hf
=
OrderedDict
()
model_hf
[
"model.embed_tokens.weight"
]
=
ckpt
[
"input_embedding.weight"
].
contiguous
()
model_hf
[
"model.norm.weight"
]
=
ckpt
[
"encoder.output_layernorm.weight"
].
contiguous
()
try
:
model_hf
[
"lm_head.weight"
]
=
ckpt
[
"lm_head.weight"
].
contiguous
()
print
(
"lm_head found"
)
except
:
model_hf
[
"lm_head.weight"
]
=
ckpt
[
"input_embedding.weight"
].
contiguous
()
print
(
"lm_head not found"
)
for
lnum
in
range
(
layernum
):
hf_pfx
=
f
"model.layers.
{
lnum
}
"
bmt_pfx
=
f
"encoder.layers.
{
lnum
}
"
model_hf
[
f
"
{
hf_pfx
}
.input_layernorm.weight"
]
=
ckpt
[
f
"
{
bmt_pfx
}
.self_att.layernorm_before_attention.weight"
].
contiguous
()
model_hf
[
f
"
{
hf_pfx
}
.self_attn.q_proj.weight"
]
=
ckpt
[
f
"
{
bmt_pfx
}
.self_att.self_attention.project_q.weight"
].
contiguous
()
model_hf
[
f
"
{
hf_pfx
}
.self_attn.k_proj.weight"
]
=
ckpt
[
f
"
{
bmt_pfx
}
.self_att.self_attention.project_k.weight"
].
contiguous
()
model_hf
[
f
"
{
hf_pfx
}
.self_attn.v_proj.weight"
]
=
ckpt
[
f
"
{
bmt_pfx
}
.self_att.self_attention.project_v.weight"
].
contiguous
()
model_hf
[
f
"
{
hf_pfx
}
.self_attn.o_proj.weight"
]
=
ckpt
[
f
"
{
bmt_pfx
}
.self_att.self_attention.attention_out.weight"
].
contiguous
()
model_hf
[
f
"
{
hf_pfx
}
.post_attention_layernorm.weight"
]
=
ckpt
[
f
"
{
bmt_pfx
}
.ffn.layernorm_before_ffn.weight"
].
contiguous
()
model_hf
[
f
"
{
hf_pfx
}
.mlp.gate_proj.weight"
]
=
ckpt
[
f
"
{
bmt_pfx
}
.ffn.ffn.w_in.w_0.weight"
].
contiguous
()
model_hf
[
f
"
{
hf_pfx
}
.mlp.up_proj.weight"
]
=
ckpt
[
f
"
{
bmt_pfx
}
.ffn.ffn.w_in.w_1.weight"
].
contiguous
()
model_hf
[
f
"
{
hf_pfx
}
.mlp.down_proj.weight"
]
=
ckpt
[
f
"
{
bmt_pfx
}
.ffn.ffn.w_out.weight"
].
contiguous
()
print
(
"Total number in converted"
,
len
(
model_hf
))
return
model_hf
def
load_model_ckpt
(
args
):
ckpt
=
torch
.
load
(
find_ckpt
(
args
.
load
))
# with open(os.path.join(os.path.dirname(args.load), "config.json"), 'r') as fin:
# config = json.load(fin)
config
=
CPMMistralConfig
.
from_pretrained
(
args
.
load
)
print
(
config
)
# from IPython import embed; embed(header="222")
config_name
=
args
.
save
# "cpmlive_llama7b" #os.path.dirname(args.load).split("/")[-1]
hf_ckpt
=
convert_model
(
ckpt
,
config
.
num_layers
)
os
.
makedirs
(
f
"wip.
{
config_name
}
"
,
exist_ok
=
True
)
torch
.
save
(
hf_ckpt
,
f
"wip.
{
config_name
}
/pytorch_model.bin"
)
config
.
save_pretrained
(
f
"wip.
{
config_name
}
"
)
try
:
try
:
tokenizer
=
LlamaTokenizerFast
.
from_pretrained
(
args
.
load
,
trust_remote_code
=
True
)
except
Exception
as
e
:
print
(
e
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
args
.
load
,
trust_remote_code
=
True
)
tokenizer
.
save_pretrained
(
f
"wip.
{
config_name
}
"
)
# for fast tokenizer bug
if
"tokenizer.json"
in
os
.
listdir
(
f
"wip.
{
config_name
}
"
):
os
.
remove
(
f
"wip.
{
config_name
}
/tokenizer.json"
)
print
(
"Tokenizer loaded"
)
except
Exception
as
e
:
print
(
e
)
try
:
shutil
.
copyfile
(
f
"
{
args
.
load
}
/tokenizer.model"
,
f
"wip.
{
config_name
}
/tokenizer.model"
)
shutil
.
copyfile
(
f
"
{
args
.
load
}
/special_tokens_map.json"
,
f
"wip.
{
config_name
}
/special_tokens_map.json"
)
shutil
.
copyfile
(
f
"
{
args
.
load
}
/tokenizer_config.json"
,
f
"wip.
{
config_name
}
/tokenizer_config.json"
)
print
(
"Tokenizer copied"
)
except
Exception
as
e
:
print
(
"Tokenizer not exists!"
)
print
(
e
)
if
__name__
==
"__main__"
:
parser
=
argparse
.
ArgumentParser
()
# parser.add_argument("--load", type=str, default="/mnt/data/user/tc_agi/user/zhaoweilin/cpmlive-llama-2-7b/pytorch_model.pt")
parser
.
add_argument
(
"--load"
,
type
=
str
,
default
=
"wip.job_469747_ckpt_4069D"
)
parser
.
add_argument
(
"--save"
,
type
=
str
,
default
=
None
)
parser
.
add_argument
(
"--seed"
,
type
=
int
,
default
=
42
)
args
=
parser
.
parse_args
()
load_model_ckpt
(
args
)
inference/vllm/examples/llm_engine_example.py
0 → 100644
View file @
24eacbc0
import
argparse
from
typing
import
List
,
Tuple
from
vllm
import
EngineArgs
,
LLMEngine
,
SamplingParams
,
RequestOutput
def
create_test_prompts
()
->
List
[
Tuple
[
str
,
SamplingParams
]]:
"""Create a list of test prompts with their sampling parameters."""
return
[
(
"A robot may not injure a human being"
,
SamplingParams
(
temperature
=
0.0
,
logprobs
=
1
,
prompt_logprobs
=
1
)),
(
"To be or not to be,"
,
SamplingParams
(
temperature
=
0.8
,
top_k
=
5
,
presence_penalty
=
0.2
)),
(
"What is the meaning of life?"
,
SamplingParams
(
n
=
2
,
best_of
=
5
,
temperature
=
0.8
,
top_p
=
0.95
,
frequency_penalty
=
0.1
)),
(
"It is only with the heart that one can see rightly"
,
SamplingParams
(
n
=
3
,
best_of
=
3
,
use_beam_search
=
True
,
temperature
=
0.0
)),
]
def
process_requests
(
engine
:
LLMEngine
,
test_prompts
:
List
[
Tuple
[
str
,
SamplingParams
]]):
"""Continuously process a list of prompts and handle the outputs."""
request_id
=
0
while
test_prompts
or
engine
.
has_unfinished_requests
():
if
test_prompts
:
prompt
,
sampling_params
=
test_prompts
.
pop
(
0
)
engine
.
add_request
(
str
(
request_id
),
prompt
,
sampling_params
)
request_id
+=
1
request_outputs
:
List
[
RequestOutput
]
=
engine
.
step
()
for
request_output
in
request_outputs
:
if
request_output
.
finished
:
print
(
request_output
)
def
initialize_engine
(
args
:
argparse
.
Namespace
)
->
LLMEngine
:
"""Initialize the LLMEngine from the command line arguments."""
engine_args
=
EngineArgs
.
from_cli_args
(
args
)
return
LLMEngine
.
from_engine_args
(
engine_args
)
def
main
(
args
:
argparse
.
Namespace
):
"""Main function that sets up and runs the prompt processing."""
engine
=
initialize_engine
(
args
)
test_prompts
=
create_test_prompts
()
process_requests
(
engine
,
test_prompts
)
if
__name__
==
'__main__'
:
parser
=
argparse
.
ArgumentParser
(
description
=
'Demo on using the LLMEngine class directly'
)
parser
=
EngineArgs
.
add_cli_args
(
parser
)
args
=
parser
.
parse_args
()
main
(
args
)
inference/vllm/examples/offline_inference.py
0 → 100644
View file @
24eacbc0
from
vllm
import
LLM
,
SamplingParams
# Sample prompts.
prompts
=
[
"Hello, my name is"
,
"The president of the United States is"
,
"The capital of France is"
,
"The future of AI is"
,
]
# Create a sampling params object.
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
)
# Create an LLM.
llm
=
LLM
(
model
=
"facebook/opt-125m"
)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
# Print the outputs.
for
output
in
outputs
:
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
inference/vllm/examples/openai_chatcompletion_client.py
0 → 100644
View file @
24eacbc0
import
openai
# Modify OpenAI's API key and API base to use vLLM's API server.
openai
.
api_key
=
"EMPTY"
openai
.
api_base
=
"http://localhost:8000/v1"
# List models API
models
=
openai
.
Model
.
list
()
print
(
"Models:"
,
models
)
model
=
models
[
"data"
][
0
][
"id"
]
# Chat completion API
chat_completion
=
openai
.
ChatCompletion
.
create
(
model
=
model
,
messages
=
[{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant."
},
{
"role"
:
"user"
,
"content"
:
"Who won the world series in 2020?"
},
{
"role"
:
"assistant"
,
"content"
:
"The Los Angeles Dodgers won the World Series in 2020."
},
{
"role"
:
"user"
,
"content"
:
"Where was it played?"
}])
print
(
"Chat completion results:"
)
print
(
chat_completion
)
inference/vllm/examples/openai_completion_client.py
0 → 100644
View file @
24eacbc0
import
openai
# Modify OpenAI's API key and API base to use vLLM's API server.
openai
.
api_key
=
"EMPTY"
openai
.
api_base
=
"http://localhost:8000/v1"
# List models API
models
=
openai
.
Model
.
list
()
print
(
"Models:"
,
models
)
model
=
models
[
"data"
][
0
][
"id"
]
# Completion API
stream
=
False
completion
=
openai
.
Completion
.
create
(
model
=
model
,
prompt
=
"A robot may not injure a human being"
,
echo
=
False
,
n
=
2
,
stream
=
stream
,
logprobs
=
3
)
print
(
"Completion results:"
)
if
stream
:
for
c
in
completion
:
print
(
c
)
else
:
print
(
completion
)
inference/vllm/format.sh
0 → 100644
View file @
24eacbc0
#!/usr/bin/env bash
# YAPF formatter, adapted from ray and skypilot.
#
# Usage:
# # Do work and commit your work.
# # Format files that differ from origin/main.
# bash format.sh
# # Commit changed files with message 'Run yapf and pylint'
#
#
# YAPF + Clang formatter (if installed). This script formats all changed files from the last mergebase.
# You are encouraged to run this locally before pushing changes for review.
# Cause the script to exit if a single command fails
set
-eo
pipefail
# this stops git rev-parse from failing if we run this from the .git directory
builtin cd
"
$(
dirname
"
${
BASH_SOURCE
:-
$0
}
"
)
"
ROOT
=
"
$(
git rev-parse
--show-toplevel
)
"
builtin cd
"
$ROOT
"
||
exit
1
YAPF_VERSION
=
$(
yapf
--version
|
awk
'{print $2}'
)
PYLINT_VERSION
=
$(
pylint
--version
|
head
-n
1 |
awk
'{print $2}'
)
MYPY_VERSION
=
$(
mypy
--version
|
awk
'{print $2}'
)
# # params: tool name, tool version, required version
tool_version_check
()
{
if
[[
$2
!=
$3
]]
;
then
echo
"Wrong
$1
version installed:
$3
is required, not
$2
."
exit
1
fi
}
tool_version_check
"yapf"
$YAPF_VERSION
"
$(
grep
yapf requirements-dev.txt |
cut
-d
'='
-f3
)
"
tool_version_check
"pylint"
$PYLINT_VERSION
"
$(
grep
"pylint=="
requirements-dev.txt |
cut
-d
'='
-f3
)
"
tool_version_check
"mypy"
"
$MYPY_VERSION
"
"
$(
grep
mypy requirements-dev.txt |
cut
-d
'='
-f3
)
"
YAPF_FLAGS
=(
'--recursive'
'--parallel'
)
YAPF_EXCLUDES
=(
'--exclude'
'build/**'
)
# Format specified files
format
()
{
yapf
--in-place
"
${
YAPF_FLAGS
[@]
}
"
"
$@
"
}
# Format files that differ from main branch. Ignores dirs that are not slated
# for autoformat yet.
format_changed
()
{
# The `if` guard ensures that the list of filenames is not empty, which
# could cause yapf to receive 0 positional arguments, making it hang
# waiting for STDIN.
#
# `diff-filter=ACM` and $MERGEBASE is to ensure we only format files that
# exist on both branches.
MERGEBASE
=
"
$(
git merge-base origin/main HEAD
)
"
if
!
git diff
--diff-filter
=
ACM
--quiet
--exit-code
"
$MERGEBASE
"
--
'*.py'
'*.pyi'
&>/dev/null
;
then
git diff
--name-only
--diff-filter
=
ACM
"
$MERGEBASE
"
--
'*.py'
'*.pyi'
| xargs
-P
5
\
yapf
--in-place
"
${
YAPF_EXCLUDES
[@]
}
"
"
${
YAPF_FLAGS
[@]
}
"
fi
}
# Format all files
format_all
()
{
yapf
--in-place
"
${
YAPF_FLAGS
[@]
}
"
"
${
YAPF_EXCLUDES
[@]
}
"
vllm tests
}
## This flag formats individual files. --files *must* be the first command line
## arg to use this option.
if
[[
"
$1
"
==
'--files'
]]
;
then
format
"
${
@
:2
}
"
# If `--all` is passed, then any further arguments are ignored and the
# entire python directory is formatted.
elif
[[
"
$1
"
==
'--all'
]]
;
then
format_all
else
# Format only the files that changed in last commit.
format_changed
fi
echo
'vLLM yapf: Done'
# Run mypy
# TODO(zhuohan): Enable mypy
# echo 'vLLM mypy:'
# mypy
# Lint specified files
lint
()
{
pylint
"
$@
"
}
# Lint files that differ from main branch. Ignores dirs that are not slated
# for autolint yet.
lint_changed
()
{
# The `if` guard ensures that the list of filenames is not empty, which
# could cause pylint to receive 0 positional arguments, making it hang
# waiting for STDIN.
#
# `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that
# exist on both branches.
MERGEBASE
=
"
$(
git merge-base origin/main HEAD
)
"
if
!
git diff
--diff-filter
=
ACM
--quiet
--exit-code
"
$MERGEBASE
"
--
'*.py'
'*.pyi'
&>/dev/null
;
then
git diff
--name-only
--diff-filter
=
ACM
"
$MERGEBASE
"
--
'*.py'
'*.pyi'
| xargs
\
pylint
fi
}
# Run Pylint
echo
'vLLM Pylint:'
## This flag lints individual files. --files *must* be the first command line
## arg to use this option.
if
[[
"
$1
"
==
'--files'
]]
;
then
lint
"
${
@
:2
}
"
# If `--all` is passed, then any further arguments are ignored and the
# entire python directory is linted.
elif
[[
"
$1
"
==
'--all'
]]
;
then
lint vllm tests
else
# Format only the files that changed in last commit.
lint_changed
fi
if
!
git diff
--quiet
&>/dev/null
;
then
echo
'Reformatted files. Please review and stage the changes.'
echo
'Changes not staged for commit:'
echo
git
--no-pager
diff
--name-only
exit
1
fi
inference/vllm/mypy.ini
0 → 100644
View file @
24eacbc0
[mypy]
python_version
=
3.8
ignore_missing_imports
=
True
files
=
vllm
# TODO(woosuk): Include the code from Megatron and HuggingFace.
exclude
=
vllm/model_executor/parallel_utils/|vllm/model_executor/models/
inference/vllm/pyproject.toml
0 → 100644
View file @
24eacbc0
[build-system]
requires
=
[
"ninja"
,
"packaging"
,
"setuptools"
,
"torch >= 2.1.0"
,
"wheel"
,
]
build-backend
=
"setuptools.build_meta"
inference/vllm/requirements-dev.txt
0 → 100644
View file @
24eacbc0
# formatting
yapf==0.32.0
pylint==2.8.2
# type checking
mypy==0.991
types-PyYAML
types-requests
types-setuptools
# testing
pytest
pytest-forked
pytest-asyncio
inference/vllm/requirements.txt
0 → 100644
View file @
24eacbc0
ninja # For faster builds.
psutil
ray >= 2.5.1
pandas # Required for Ray data.
pyarrow # Required for Ray data.
sentencepiece # Required for LLaMA tokenizer.
numpy
einops # Required for phi-1_5
torch >= 2.1.0
transformers >= 4.34.0 # Required for Mistral.
xformers >= 0.0.22.post7 # Required for CUDA 12.1.
fastapi
uvicorn[standard]
pydantic == 1.10.13 # Required for OpenAI server.
inference/vllm/setup.py
0 → 100644
View file @
24eacbc0
import
io
import
os
import
re
import
subprocess
from
typing
import
List
,
Set
import
warnings
from
packaging.version
import
parse
,
Version
import
setuptools
import
torch
from
torch.utils.cpp_extension
import
BuildExtension
,
CUDAExtension
,
CUDA_HOME
ROOT_DIR
=
os
.
path
.
dirname
(
__file__
)
MAIN_CUDA_VERSION
=
"12.1"
# Supported NVIDIA GPU architectures.
SUPPORTED_ARCHS
=
{
"7.0"
,
"7.5"
,
"8.0"
,
"8.6"
,
"8.9"
,
"9.0"
}
# Compiler flags.
CXX_FLAGS
=
[
"-g"
,
"-O2"
,
"-std=c++17"
]
# TODO(woosuk): Should we use -O3?
NVCC_FLAGS
=
[
"-O2"
,
"-std=c++17"
]
ABI
=
1
if
torch
.
_C
.
_GLIBCXX_USE_CXX11_ABI
else
0
CXX_FLAGS
+=
[
f
"-D_GLIBCXX_USE_CXX11_ABI=
{
ABI
}
"
]
NVCC_FLAGS
+=
[
f
"-D_GLIBCXX_USE_CXX11_ABI=
{
ABI
}
"
]
if
CUDA_HOME
is
None
:
raise
RuntimeError
(
"Cannot find CUDA_HOME. CUDA must be available to build the package."
)
def
get_nvcc_cuda_version
(
cuda_dir
:
str
)
->
Version
:
"""Get the CUDA version from nvcc.
Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py
"""
nvcc_output
=
subprocess
.
check_output
([
cuda_dir
+
"/bin/nvcc"
,
"-V"
],
universal_newlines
=
True
)
output
=
nvcc_output
.
split
()
release_idx
=
output
.
index
(
"release"
)
+
1
nvcc_cuda_version
=
parse
(
output
[
release_idx
].
split
(
","
)[
0
])
return
nvcc_cuda_version
def
get_torch_arch_list
()
->
Set
[
str
]:
# TORCH_CUDA_ARCH_LIST can have one or more architectures,
# e.g. "8.0" or "7.5,8.0,8.6+PTX". Here, the "8.6+PTX" option asks the
# compiler to additionally include PTX code that can be runtime-compiled
# and executed on the 8.6 or newer architectures. While the PTX code will
# not give the best performance on the newer architectures, it provides
# forward compatibility.
env_arch_list
=
os
.
environ
.
get
(
"TORCH_CUDA_ARCH_LIST"
,
None
)
if
env_arch_list
is
None
:
return
set
()
# List are separated by ; or space.
torch_arch_list
=
set
(
env_arch_list
.
replace
(
" "
,
";"
).
split
(
";"
))
if
not
torch_arch_list
:
return
set
()
# Filter out the invalid architectures and print a warning.
valid_archs
=
SUPPORTED_ARCHS
.
union
({
s
+
"+PTX"
for
s
in
SUPPORTED_ARCHS
})
arch_list
=
torch_arch_list
.
intersection
(
valid_archs
)
# If none of the specified architectures are valid, raise an error.
if
not
arch_list
:
raise
RuntimeError
(
"None of the CUDA architectures in `TORCH_CUDA_ARCH_LIST` env "
f
"variable (
{
env_arch_list
}
) is supported. "
f
"Supported CUDA architectures are:
{
valid_archs
}
."
)
invalid_arch_list
=
torch_arch_list
-
valid_archs
if
invalid_arch_list
:
warnings
.
warn
(
f
"Unsupported CUDA architectures (
{
invalid_arch_list
}
) are "
"excluded from the `TORCH_CUDA_ARCH_LIST` env variable "
f
"(
{
env_arch_list
}
). Supported CUDA architectures are: "
f
"
{
valid_archs
}
."
)
return
arch_list
# First, check the TORCH_CUDA_ARCH_LIST environment variable.
compute_capabilities
=
get_torch_arch_list
()
if
not
compute_capabilities
:
# If TORCH_CUDA_ARCH_LIST is not defined or empty, target all available
# GPUs on the current machine.
device_count
=
torch
.
cuda
.
device_count
()
for
i
in
range
(
device_count
):
major
,
minor
=
torch
.
cuda
.
get_device_capability
(
i
)
if
major
<
7
:
raise
RuntimeError
(
"GPUs with compute capability below 7.0 are not supported."
)
compute_capabilities
.
add
(
f
"
{
major
}
.
{
minor
}
"
)
nvcc_cuda_version
=
get_nvcc_cuda_version
(
CUDA_HOME
)
if
not
compute_capabilities
:
# If no GPU is specified nor available, add all supported architectures
# based on the NVCC CUDA version.
compute_capabilities
=
SUPPORTED_ARCHS
.
copy
()
if
nvcc_cuda_version
<
Version
(
"11.1"
):
compute_capabilities
.
remove
(
"8.6"
)
if
nvcc_cuda_version
<
Version
(
"11.8"
):
compute_capabilities
.
remove
(
"8.9"
)
compute_capabilities
.
remove
(
"9.0"
)
# Validate the NVCC CUDA version.
if
nvcc_cuda_version
<
Version
(
"11.0"
):
raise
RuntimeError
(
"CUDA 11.0 or higher is required to build the package."
)
if
nvcc_cuda_version
<
Version
(
"11.1"
):
if
any
(
cc
.
startswith
(
"8.6"
)
for
cc
in
compute_capabilities
):
raise
RuntimeError
(
"CUDA 11.1 or higher is required for compute capability 8.6."
)
if
nvcc_cuda_version
<
Version
(
"11.8"
):
if
any
(
cc
.
startswith
(
"8.9"
)
for
cc
in
compute_capabilities
):
# CUDA 11.8 is required to generate the code targeting compute capability 8.9.
# However, GPUs with compute capability 8.9 can also run the code generated by
# the previous versions of CUDA 11 and targeting compute capability 8.0.
# Therefore, if CUDA 11.8 is not available, we target compute capability 8.0
# instead of 8.9.
warnings
.
warn
(
"CUDA 11.8 or higher is required for compute capability 8.9. "
"Targeting compute capability 8.0 instead."
)
compute_capabilities
=
set
(
cc
for
cc
in
compute_capabilities
if
not
cc
.
startswith
(
"8.9"
))
compute_capabilities
.
add
(
"8.0+PTX"
)
if
any
(
cc
.
startswith
(
"9.0"
)
for
cc
in
compute_capabilities
):
raise
RuntimeError
(
"CUDA 11.8 or higher is required for compute capability 9.0."
)
# Add target compute capabilities to NVCC flags.
for
capability
in
compute_capabilities
:
num
=
capability
[
0
]
+
capability
[
2
]
NVCC_FLAGS
+=
[
"-gencode"
,
f
"arch=compute_
{
num
}
,code=sm_
{
num
}
"
]
if
capability
.
endswith
(
"+PTX"
):
NVCC_FLAGS
+=
[
"-gencode"
,
f
"arch=compute_
{
num
}
,code=compute_
{
num
}
"
]
# Use NVCC threads to parallelize the build.
if
nvcc_cuda_version
>=
Version
(
"11.2"
):
num_threads
=
min
(
os
.
cpu_count
(),
8
)
NVCC_FLAGS
+=
[
"--threads"
,
str
(
num_threads
)]
ext_modules
=
[]
# Cache operations.
cache_extension
=
CUDAExtension
(
name
=
"vllm.cache_ops"
,
sources
=
[
"csrc/cache.cpp"
,
"csrc/cache_kernels.cu"
],
extra_compile_args
=
{
"cxx"
:
CXX_FLAGS
,
"nvcc"
:
NVCC_FLAGS
,
},
)
ext_modules
.
append
(
cache_extension
)
# Attention kernels.
attention_extension
=
CUDAExtension
(
name
=
"vllm.attention_ops"
,
sources
=
[
"csrc/attention.cpp"
,
"csrc/attention/attention_kernels.cu"
],
extra_compile_args
=
{
"cxx"
:
CXX_FLAGS
,
"nvcc"
:
NVCC_FLAGS
,
},
)
ext_modules
.
append
(
attention_extension
)
# Positional encoding kernels.
positional_encoding_extension
=
CUDAExtension
(
name
=
"vllm.pos_encoding_ops"
,
sources
=
[
"csrc/pos_encoding.cpp"
,
"csrc/pos_encoding_kernels.cu"
],
extra_compile_args
=
{
"cxx"
:
CXX_FLAGS
,
"nvcc"
:
NVCC_FLAGS
,
},
)
ext_modules
.
append
(
positional_encoding_extension
)
# Layer normalization kernels.
layernorm_extension
=
CUDAExtension
(
name
=
"vllm.layernorm_ops"
,
sources
=
[
"csrc/layernorm.cpp"
,
"csrc/layernorm_kernels.cu"
],
extra_compile_args
=
{
"cxx"
:
CXX_FLAGS
,
"nvcc"
:
NVCC_FLAGS
,
},
)
ext_modules
.
append
(
layernorm_extension
)
# Activation kernels.
activation_extension
=
CUDAExtension
(
name
=
"vllm.activation_ops"
,
sources
=
[
"csrc/activation.cpp"
,
"csrc/activation_kernels.cu"
],
extra_compile_args
=
{
"cxx"
:
CXX_FLAGS
,
"nvcc"
:
NVCC_FLAGS
,
},
)
ext_modules
.
append
(
activation_extension
)
# Quantization kernels.
quantization_extension
=
CUDAExtension
(
name
=
"vllm.quantization_ops"
,
sources
=
[
"csrc/quantization.cpp"
,
"csrc/quantization/awq/gemm_kernels.cu"
,
"csrc/quantization/squeezellm/quant_cuda_kernel.cu"
,
],
extra_compile_args
=
{
"cxx"
:
CXX_FLAGS
,
"nvcc"
:
NVCC_FLAGS
,
},
)
ext_modules
.
append
(
quantization_extension
)
# Misc. CUDA utils.
cuda_utils_extension
=
CUDAExtension
(
name
=
"vllm.cuda_utils"
,
sources
=
[
"csrc/cuda_utils.cpp"
,
"csrc/cuda_utils_kernels.cu"
],
extra_compile_args
=
{
"cxx"
:
CXX_FLAGS
,
"nvcc"
:
NVCC_FLAGS
,
},
)
ext_modules
.
append
(
cuda_utils_extension
)
def
get_path
(
*
filepath
)
->
str
:
return
os
.
path
.
join
(
ROOT_DIR
,
*
filepath
)
def
find_version
(
filepath
:
str
)
->
str
:
"""Extract version information from the given filepath.
Adapted from https://github.com/ray-project/ray/blob/0b190ee1160eeca9796bc091e07eaebf4c85b511/python/setup.py
"""
with
open
(
filepath
)
as
fp
:
version_match
=
re
.
search
(
r
"^__version__ = ['\"]([^'\"]*)['\"]"
,
fp
.
read
(),
re
.
M
)
if
version_match
:
return
version_match
.
group
(
1
)
raise
RuntimeError
(
"Unable to find version string."
)
def
get_vllm_version
()
->
str
:
version
=
find_version
(
get_path
(
"vllm"
,
"__init__.py"
))
cuda_version
=
str
(
nvcc_cuda_version
)
if
cuda_version
!=
MAIN_CUDA_VERSION
:
cuda_version_str
=
cuda_version
.
replace
(
"."
,
""
)[:
3
]
version
+=
f
"+cu
{
cuda_version_str
}
"
return
version
def
read_readme
()
->
str
:
"""Read the README file if present."""
p
=
get_path
(
"README.md"
)
if
os
.
path
.
isfile
(
p
):
return
io
.
open
(
get_path
(
"README.md"
),
"r"
,
encoding
=
"utf-8"
).
read
()
else
:
return
""
def
get_requirements
()
->
List
[
str
]:
"""Get Python package dependencies from requirements.txt."""
with
open
(
get_path
(
"requirements.txt"
))
as
f
:
requirements
=
f
.
read
().
strip
().
split
(
"
\n
"
)
return
requirements
setuptools
.
setup
(
name
=
"vllm"
,
version
=
get_vllm_version
(),
author
=
"vLLM Team"
,
license
=
"Apache 2.0"
,
description
=
(
"A high-throughput and memory-efficient inference and "
"serving engine for LLMs"
),
long_description
=
read_readme
(),
long_description_content_type
=
"text/markdown"
,
url
=
"https://github.com/vllm-project/vllm"
,
project_urls
=
{
"Homepage"
:
"https://github.com/vllm-project/vllm"
,
"Documentation"
:
"https://vllm.readthedocs.io/en/latest/"
,
},
classifiers
=
[
"Programming Language :: Python :: 3.8"
,
"Programming Language :: Python :: 3.9"
,
"Programming Language :: Python :: 3.10"
,
"Programming Language :: Python :: 3.11"
,
"License :: OSI Approved :: Apache Software License"
,
"Topic :: Scientific/Engineering :: Artificial Intelligence"
,
],
packages
=
setuptools
.
find_packages
(
exclude
=
(
"benchmarks"
,
"csrc"
,
"docs"
,
"examples"
,
"tests"
)),
python_requires
=
">=3.8"
,
install_requires
=
get_requirements
(),
ext_modules
=
ext_modules
,
cmdclass
=
{
"build_ext"
:
BuildExtension
},
package_data
=
{
"vllm"
:
[
"py.typed"
]},
)
inference/vllm/tests/__init__.py
0 → 100644
View file @
24eacbc0
inference/vllm/tests/async_engine/api_server_async_engine.py
0 → 100644
View file @
24eacbc0
"""vllm.entrypoints.api_server with some extra logging for testing."""
import
argparse
from
typing
import
Any
,
Dict
import
uvicorn
from
fastapi.responses
import
JSONResponse
,
Response
import
vllm.entrypoints.api_server
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
app
=
vllm
.
entrypoints
.
api_server
.
app
class
AsyncLLMEngineWithStats
(
AsyncLLMEngine
):
# pylint: disable=redefined-outer-name
def
__init__
(
self
,
*
args
,
**
kwargs
):
super
().
__init__
(
*
args
,
**
kwargs
)
self
.
_num_aborts
=
0
async
def
abort
(
self
,
request_id
:
str
)
->
None
:
await
super
().
abort
(
request_id
)
self
.
_num_aborts
+=
1
def
testing_stats
(
self
)
->
Dict
[
str
,
Any
]:
return
{
"num_aborted_requests"
:
self
.
_num_aborts
}
@
app
.
get
(
"/stats"
)
def
stats
()
->
Response
:
"""Get the statistics of the engine."""
return
JSONResponse
(
engine
.
testing_stats
())
if
__name__
==
"__main__"
:
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
"--host"
,
type
=
str
,
default
=
"localhost"
)
parser
.
add_argument
(
"--port"
,
type
=
int
,
default
=
8000
)
parser
=
AsyncEngineArgs
.
add_cli_args
(
parser
)
args
=
parser
.
parse_args
()
engine_args
=
AsyncEngineArgs
.
from_cli_args
(
args
)
engine
=
AsyncLLMEngineWithStats
.
from_engine_args
(
engine_args
)
vllm
.
entrypoints
.
api_server
.
engine
=
engine
uvicorn
.
run
(
app
,
host
=
args
.
host
,
port
=
args
.
port
,
log_level
=
"debug"
,
timeout_keep_alive
=
vllm
.
entrypoints
.
api_server
.
TIMEOUT_KEEP_ALIVE
)
inference/vllm/tests/async_engine/test_api_server.py
0 → 100644
View file @
24eacbc0
import
subprocess
import
sys
import
time
from
multiprocessing
import
Pool
from
pathlib
import
Path
import
pytest
import
requests
def
_query_server
(
prompt
:
str
)
->
dict
:
response
=
requests
.
post
(
"http://localhost:8000/generate"
,
json
=
{
"prompt"
:
prompt
,
"max_tokens"
:
100
,
"temperature"
:
0
,
"ignore_eos"
:
True
})
response
.
raise_for_status
()
return
response
.
json
()
@
pytest
.
fixture
def
api_server
():
script_path
=
Path
(
__file__
).
parent
.
joinpath
(
"api_server_async_engine.py"
).
absolute
()
# pylint: disable=consider-using-with
uvicorn_process
=
subprocess
.
Popen
([
sys
.
executable
,
"-u"
,
str
(
script_path
),
"--model"
,
"facebook/opt-125m"
])
yield
uvicorn_process
.
terminate
()
# pylint: disable=redefined-outer-name, unused-argument
def
test_api_server
(
api_server
):
"""
Run the API server and test it.
We run both the server and requests in separate processes.
We test that the server can handle incoming requests, including
multiple requests at the same time, and that it can handle requests
being cancelled without crashing.
"""
with
Pool
(
32
)
as
pool
:
# Wait until the server is ready
prompts
=
[
"Hello world"
]
*
1
result
=
None
while
not
result
:
# pylint: disable=bare-except
try
:
for
result
in
pool
.
map
(
_query_server
,
prompts
):
break
except
:
time
.
sleep
(
1
)
# Actual tests start here
# Try with 1 prompt
for
result
in
pool
.
map
(
_query_server
,
prompts
):
assert
result
num_aborted_requests
=
requests
.
get
(
"http://localhost:8000/stats"
).
json
()[
"num_aborted_requests"
]
assert
num_aborted_requests
==
0
# Try with 100 prompts
prompts
=
[
"Hello world"
]
*
100
for
result
in
pool
.
map
(
_query_server
,
prompts
):
assert
result
# Cancel requests
pool
.
map_async
(
_query_server
,
prompts
)
time
.
sleep
(
0.01
)
pool
.
terminate
()
pool
.
join
()
# check cancellation stats
num_aborted_requests
=
requests
.
get
(
"http://localhost:8000/stats"
).
json
()[
"num_aborted_requests"
]
assert
num_aborted_requests
>
0
# check that server still runs after cancellations
with
Pool
(
32
)
as
pool
:
# Try with 100 prompts
prompts
=
[
"Hello world"
]
*
100
for
result
in
pool
.
map
(
_query_server
,
prompts
):
assert
result
inference/vllm/tests/async_engine/test_async_llm_engine.py
0 → 100644
View file @
24eacbc0
import
asyncio
from
dataclasses
import
dataclass
import
pytest
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
@
dataclass
class
RequestOutput
:
request_id
:
int
finished
:
bool
=
False
class
MockEngine
:
def
__init__
(
self
):
self
.
step_calls
=
0
self
.
add_request_calls
=
0
self
.
abort_request_calls
=
0
self
.
request_id
=
None
async
def
step_async
(
self
):
self
.
step_calls
+=
1
return
[
RequestOutput
(
request_id
=
self
.
request_id
)]
if
self
.
request_id
else
[]
def
generate
(
self
,
request_id
):
self
.
request_id
=
request_id
def
stop_generating
(
self
):
self
.
request_id
=
None
def
add_request
(
self
,
**
kwargs
):
del
kwargs
# Unused
self
.
add_request_calls
+=
1
def
abort_request
(
self
,
request_id
):
del
request_id
# Unused
self
.
abort_request_calls
+=
1
class
MockAsyncLLMEngine
(
AsyncLLMEngine
):
def
_init_engine
(
self
,
*
args
,
**
kwargs
):
return
MockEngine
()
@
pytest
.
mark
.
asyncio
async
def
test_new_requests_event
():
engine
=
MockAsyncLLMEngine
(
worker_use_ray
=
False
,
engine_use_ray
=
False
)
engine
.
start_background_loop
()
await
asyncio
.
sleep
(
0.01
)
assert
engine
.
engine
.
step_calls
==
0
await
engine
.
add_request
(
"1"
,
""
,
None
)
await
asyncio
.
sleep
(
0.01
)
assert
engine
.
engine
.
add_request_calls
==
1
assert
engine
.
engine
.
step_calls
==
1
await
engine
.
add_request
(
"2"
,
""
,
None
)
engine
.
engine
.
generate
(
"2"
)
await
asyncio
.
sleep
(
0
)
assert
engine
.
engine
.
add_request_calls
==
2
assert
engine
.
engine
.
step_calls
==
2
await
asyncio
.
sleep
(
0
)
assert
engine
.
engine
.
step_calls
==
3
engine
.
engine
.
stop_generating
()
await
asyncio
.
sleep
(
0
)
assert
engine
.
engine
.
step_calls
==
4
await
asyncio
.
sleep
(
0
)
assert
engine
.
engine
.
step_calls
==
4
await
engine
.
add_request
(
"3"
,
""
,
None
)
await
asyncio
.
sleep
(
0.01
)
assert
engine
.
engine
.
add_request_calls
==
3
assert
engine
.
engine
.
step_calls
==
5
await
asyncio
.
sleep
(
0.01
)
assert
engine
.
engine
.
add_request_calls
==
3
assert
engine
.
engine
.
step_calls
==
5
Prev
1
…
4
5
6
7
8
9
10
11
12
…
18
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment