Commit eedb46bf (unverified)
Rename servers and change port numbers to reduce confusion (#149)
Authored by Zhuohan Li, Jun 17, 2023; committed by GitHub, Jun 17, 2023
Parent: 311490a7

Showing 10 changed files with 41 additions and 37 deletions (+41, -37):
benchmarks/benchmark_async_llm_server.py  (+1, -1)
cacheflow/__init__.py  (+2, -2)
cacheflow/entrypoints/api_server.py  (+4, -4)
cacheflow/entrypoints/llm.py  (+3, -3)
cacheflow/entrypoints/openai/api_server.py  (+2, -2)
cacheflow/server/async_llm_server.py  (+16, -16)
cacheflow/server/llm_server.py  (+3, -3)
examples/api_client.py  (+3, -1)
examples/gradio_webserver.py  (+3, -2)
examples/llmserver_example.py  (+4, -3)
benchmarks/benchmark_async_llm_server.py

@@ -52,7 +52,7 @@ def main(args: argparse.Namespace):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--host", type=str, default="localhost")
-    parser.add_argument("--port", type=int, default=8001)
+    parser.add_argument("--port", type=int, default=8000)
     parser.add_argument("--max-tokens", type=int, default=128)
     parser.add_argument("--n-threads", type=int, default=128)
     args = parser.parse_args()
cacheflow/__init__.py

@@ -2,7 +2,7 @@ from cacheflow.entrypoints.llm import LLM
 from cacheflow.outputs import RequestOutput, CompletionOutput
 from cacheflow.sampling_params import SamplingParams
 from cacheflow.server.arg_utils import ServerArgs
-from cacheflow.server.llm_server import LLMServer
+from cacheflow.server.llm_server import LLMEngine
 from cacheflow.server.ray_utils import initialize_cluster

 __version__ = "0.1.0"
@@ -12,7 +12,7 @@ __all__ = [
     "SamplingParams",
     "RequestOutput",
     "CompletionOutput",
-    "LLMServer",
+    "LLMEngine",
     "ServerArgs",
     "initialize_cluster",
 ]
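With this export list, any code that imported the old names needs the new ones. A minimal before/after sketch, assuming the cacheflow 0.1.0 package from this tree is installed:

# Before this commit:
#   from cacheflow import LLMServer
#   from cacheflow.server.async_llm_server import AsyncLLMServer
# After this commit:
from cacheflow import LLMEngine, SamplingParams, ServerArgs
from cacheflow.server.async_llm_server import AsyncLLMEngine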
cacheflow/entrypoints/simple_fastapi_frontend.py → cacheflow/entrypoints/api_server.py

@@ -8,7 +8,7 @@ import uvicorn
 from cacheflow.sampling_params import SamplingParams
 from cacheflow.server.arg_utils import AsyncServerArgs
-from cacheflow.server.async_llm_server import AsyncLLMServer
+from cacheflow.server.async_llm_server import AsyncLLMEngine
 from cacheflow.utils import random_uuid

 TIMEOUT_KEEP_ALIVE = 5  # seconds.
@@ -18,7 +18,7 @@ app = FastAPI()
 @app.post("/generate")
 async def generate(request: Request) -> Response:
-    """Stream the results of the generation request.
+    """Generate completion for the request.

     The request should be a JSON object with the following fields:
     - prompt: the prompt to use for the generation.
@@ -74,12 +74,12 @@ async def generate(request: Request) -> Response:
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--host", type=str, default="localhost")
-    parser.add_argument("--port", type=int, default=8001)
+    parser.add_argument("--port", type=int, default=8000)
     parser = AsyncServerArgs.add_cli_args(parser)
     args = parser.parse_args()

     server_args = AsyncServerArgs.from_cli_args(args)
-    server = AsyncLLMServer.from_server_args(server_args)
+    server = AsyncLLMEngine.from_server_args(server_args)

     uvicorn.run(app, host=args.host, port=args.port, log_level="debug",
                 timeout_keep_alive=TIMEOUT_KEEP_ALIVE)
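For reference, a hedged example of calling the renamed entrypoint on its new default port 8000. The payload fields follow the docstring above (a prompt plus sampling parameters; max_tokens mirrors the gradio example later in this diff); the exact response schema is defined in api_server.py and is not shown in this hunk:

import requests

# Non-streaming request to cacheflow.entrypoints.api_server (default port is now 8000).
payload = {
    "prompt": "San Francisco is a",
    "max_tokens": 64,  # assumed to be forwarded to SamplingParams
}
response = requests.post("http://localhost:8000/generate", json=payload)
response.raise_for_status()
print(response.json())  # response shape is defined by the server, not by this hunk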
cacheflow/entrypoints/llm.py

@@ -6,7 +6,7 @@ from tqdm import tqdm
 from cacheflow.outputs import RequestOutput
 from cacheflow.sampling_params import SamplingParams
 from cacheflow.server.arg_utils import ServerArgs
-from cacheflow.server.llm_server import LLMServer
+from cacheflow.server.llm_server import LLMEngine
 from cacheflow.utils import Counter
@@ -20,7 +20,7 @@ class LLM:
     mechanism and efficient memory management.

     NOTE: This class is intended to be used for offline inference. For online
-    serving, use the `AsyncLLMServer` class instead.
+    serving, use the `AsyncLLMEngine` class instead.

     NOTE: For the comprehensive list of arguments, see `ServerArgs`.

     Args:
@@ -52,7 +52,7 @@ class LLM:
             seed=seed,
             **kwargs,
         )
-        self.llm_server = LLMServer.from_server_args(server_args)
+        self.llm_server = LLMEngine.from_server_args(server_args)
         self.request_counter = Counter()

     def get_tokenizer(
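The docstring keeps the division of labor: LLM for offline batched inference, AsyncLLMEngine for online serving. A minimal offline-inference sketch, assuming the LLM constructor accepts a Hugging Face model name and that generate takes a list of prompts plus SamplingParams (neither signature is shown in this hunk):

from cacheflow import LLM, SamplingParams

# Offline batched inference through the wrapper class; the model name and the
# exact generate() signature are assumptions, not part of this diff.
llm = LLM(model="facebook/opt-125m")
sampling_params = SamplingParams(temperature=0.8, max_tokens=64)
outputs = llm.generate(["San Francisco is a", "The capital of France is"],
                       sampling_params)
for output in outputs:
    # Each RequestOutput carries the prompt and its generated completions.
    print(output.prompt, output.outputs[0].text)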
cacheflow/entrypoints/openai/openai_frontend.py → cacheflow/entrypoints/openai/api_server.py

@@ -15,7 +15,7 @@ import uvicorn
 from cacheflow.outputs import RequestOutput
 from cacheflow.server.arg_utils import AsyncServerArgs
-from cacheflow.server.async_llm_server import AsyncLLMServer
+from cacheflow.server.async_llm_server import AsyncLLMEngine
 from cacheflow.server.tokenizer_utils import get_tokenizer
 from cacheflow.logger import init_logger
 from cacheflow.sampling_params import SamplingParams
@@ -319,7 +319,7 @@ if __name__ == "__main__":
     served_model = args.served_model_name or args.model

     server_args = AsyncServerArgs.from_cli_args(args)
-    server = AsyncLLMServer.from_server_args(server_args)
+    server = AsyncLLMEngine.from_server_args(server_args)

     # A separate tokenizer to map token IDs to strings.
     tokenizer = get_tokenizer(args.model)
cacheflow/server/async_llm_server.py

@@ -6,7 +6,7 @@ from cacheflow.logger import init_logger
 from cacheflow.outputs import RequestOutput
 from cacheflow.sampling_params import SamplingParams
 from cacheflow.server.arg_utils import AsyncServerArgs
-from cacheflow.server.llm_server import LLMServer
+from cacheflow.server.llm_server import LLMEngine
 from cacheflow.server.ray_utils import ray, initialize_cluster

 logger = init_logger(__name__)
@@ -14,26 +14,26 @@ logger = init_logger(__name__)
 TIMEOUT_TO_PREVENT_DEADLOCK = 1  # seconds

-class AsyncLLMServer:
-    """An asynchronous wrapper for LLMServer.
+class AsyncLLMEngine:
+    """An asynchronous wrapper for LLMEngine.

-    This class is used to wrap the LLMServer class to make it asynchronous. It
+    This class is used to wrap the LLMEngine class to make it asynchronous. It
     uses asyncio to create a background loop that keeps processing incoming
-    requests. The LLMServer is kicked by the generate method when there
+    requests. The LLMEngine is kicked by the generate method when there
     are requests in the waiting queue. The generate method yields the outputs
-    from the LLMServer to the caller.
+    from the LLMEngine to the caller.

-    NOTE: For the comprehensive list of arguments, see `LLMServer`.
+    NOTE: For the comprehensive list of arguments, see `LLMEngine`.

     Args:
         worker_use_ray: Whether to use Ray for model workers. Required for
             distributed execution. Should be the same as
             `parallel_config.worker_use_ray`.
-        server_use_ray: Whether to make LLMServer a Ray actor. If so, the
+        server_use_ray: Whether to make LLMEngine a Ray actor. If so, the
             async frontend will be executed in a separate process as the
             model workers.
         log_requests: Whether to log the requests.
-        *args, *kwargs: Arguments for LLMServer.
+        *args, *kwargs: Arguments for LLMEngine.
     """

     def __init__(self, worker_use_ray: bool, server_use_ray: bool,
                  log_requests: bool = True, *args, **kwargs) -> None:
@@ -41,11 +41,11 @@ class AsyncLLMServer:
         self.server_use_ray = server_use_ray
         self.log_requests = log_requests
         if not self.server_use_ray:
-            server_class = LLMServer
+            server_class = LLMEngine
         elif self.worker_use_ray:
-            server_class = ray.remote(num_cpus=0)(LLMServer).remote
+            server_class = ray.remote(num_cpus=0)(LLMEngine).remote
         else:
-            server_class = ray.remote(num_gpus=1)(LLMServer).remote
+            server_class = ray.remote(num_gpus=1)(LLMEngine).remote
         self.server = server_class(*args, **kwargs)
         # Request id -> request output.
         self.request_outputs: Dict[str, RequestOutput] = {}
@@ -85,8 +85,8 @@ class AsyncLLMServer:
         """Generate outputs for a request.

         Generate outputs for a request. This method is a coroutine. It adds the
-        request into the waiting queue of the LLMServer and streams the outputs
-        from the LLMServer to the caller.
+        request into the waiting queue of the LLMEngine and streams the outputs
+        from the LLMEngine to the caller.

         Args:
             prompt: The prompt string. Can be None if prompt_token_ids is
@@ -97,7 +97,7 @@ class AsyncLLMServer:
             use the tokenizer to convert the prompts to token IDs.

         Yields:
-            The output `RequestOutput` objects from the LLMServer for the
+            The output `RequestOutput` objects from the LLMEngine for the
             request.
         """
         # Preprocess the request.
@@ -200,7 +200,7 @@ class AsyncLLMServer:
         self.kicking_request_id = None

     @classmethod
-    def from_server_args(cls, server_args: AsyncServerArgs) -> "AsyncLLMServer":
+    def from_server_args(cls, server_args: AsyncServerArgs) -> "AsyncLLMEngine":
         """Creates an async LLM server from the server arguments."""
         # Create the server configs.
         server_configs = server_args.create_server_configs()
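The @@ -41,11 hunk swaps LLMServer for LLMEngine in all three construction paths but keeps the pattern: the same class is instantiated either in-process or as a Ray actor, with or without its own GPU. An illustrative, self-contained sketch of that pattern with a generic stand-in class (not the cacheflow one):

import ray

class Engine:
    """Stand-in for LLMEngine; the real class is constructed the same three ways."""
    def ping(self) -> str:
        return "ok"

def build_engine(worker_use_ray: bool, engine_use_ray: bool):
    if not engine_use_ray:
        # Run the engine in the current process.
        engine_class = Engine
    elif worker_use_ray:
        # The engine is a lightweight coordinator; Ray workers hold the GPUs.
        engine_class = ray.remote(num_cpus=0)(Engine).remote
    else:
        # The engine actor itself needs a GPU.
        engine_class = ray.remote(num_gpus=1)(Engine).remote
    return engine_class()  # a plain instance or a Ray actor handle

# Usage: call ray.init() first when an actor is created; methods on an actor
# handle go through .remote(), e.g. ray.get(engine.ping.remote()).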
cacheflow/server/llm_server.py

@@ -18,7 +18,7 @@ from cacheflow.worker.worker import Worker
 logger = init_logger(__name__)

-class LLMServer:
+class LLMEngine:
     """An LLM server that receives requests and generates texts.

     This is the main class for the CacheFlow LLM server. It receives requests
@@ -29,7 +29,7 @@ class LLMServer:
     serving throughput.

     The `LLM` class wraps this class for offline batched inference and the
-    `AsyncLLMServer` class wraps this class for online serving.
+    `AsyncLLMEngine` class wraps this class for online serving.

     NOTE: The config arguments are derived from the `ServerArgs` class. For the
     comprehensive list of arguments, see `ServerArgs`.
@@ -135,7 +135,7 @@ class LLMServer:
         self._run_workers("init_cache_engine", cache_config=self.cache_config)

     @classmethod
-    def from_server_args(cls, server_args: ServerArgs) -> "LLMServer":
+    def from_server_args(cls, server_args: ServerArgs) -> "LLMEngine":
         """Creates an LLM server from the server arguments."""
         # Create the server configs.
         server_configs = server_args.create_server_configs()
examples/simple_fastapi_client.py → examples/api_client.py

+"""Example Python client for cacheflow.entrypoints.api_server"""
 import argparse
 import json
 from typing import Iterable, List
@@ -45,7 +47,7 @@ def get_response(response: requests.Response) -> List[str]:
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--host", type=str, default="localhost")
-    parser.add_argument("--port", type=int, default=8001)
+    parser.add_argument("--port", type=int, default=8000)
     parser.add_argument("--n", type=int, default=4)
     parser.add_argument("--prompt", type=str, default="San Francisco is a")
     parser.add_argument("--stream", action="store_true")
examples/gradio_webserver.py

@@ -9,6 +9,7 @@ def http_bot(prompt):
     headers = {"User-Agent": "Cacheflow Client"}
     pload = {
         "prompt": prompt,
+        "stream": True,
         "max_tokens": 128,
     }
     response = requests.post(args.model_url, headers=headers, json=pload, stream=True)
@@ -34,8 +35,8 @@ def build_demo():
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--host", type=str, default="localhost")
-    parser.add_argument("--port", type=int, default=8002)
-    parser.add_argument("--model-url", type=str, default="http://localhost:8001/generate")
+    parser.add_argument("--port", type=int, default=8001)
+    parser.add_argument("--model-url", type=str, default="http://localhost:8000/generate")
     args = parser.parse_args()

     demo = build_demo()
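With "stream": True in the payload and stream=True on the requests.post call, the gradio callback can read results incrementally. A hedged sketch of consuming such a streaming /generate response; the null-byte delimiter and the {"text": [...]} shape are assumptions about api_server.py, which this diff does not show:

import json
import requests

response = requests.post("http://localhost:8000/generate",
                         json={"prompt": "San Francisco is a",
                               "stream": True,
                               "max_tokens": 128},
                         stream=True)
for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False,
                                 delimiter=b"\0"):
    if chunk:
        data = json.loads(chunk.decode("utf-8"))
        print(data["text"][0])  # assumed response field; adjust to the server's actual schema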
examples/simple_server.py → examples/llmserver_example.py

 import argparse

-from cacheflow import ServerArgs, LLMServer, SamplingParams
+from cacheflow import ServerArgs, LLMEngine, SamplingParams


 def main(args: argparse.Namespace):
     # Parse the CLI argument and initialize the server.
     server_args = ServerArgs.from_cli_args(args)
-    server = LLMServer.from_server_args(server_args)
+    server = LLMEngine.from_server_args(server_args)

     # Test the following prompts.
     test_prompts = [
@@ -38,7 +38,8 @@ def main(args: argparse.Namespace):
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='Simple CacheFlow server.')
+    parser = argparse.ArgumentParser(
+        description='Demo on using the LLMEngine class synchronously')
     parser = ServerArgs.add_cli_args(parser)
     args = parser.parse_args()
     main(args)
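The renamed example drives LLMEngine synchronously. The body of main is elided above; a hedged sketch of the usual add-requests-then-step loop, where add_request, step, and has_unfinished_requests are assumed engine methods that this diff does not show:

from cacheflow import LLMEngine, SamplingParams, ServerArgs

def run_demo(server_args: ServerArgs) -> None:
    # Method names on the engine are assumptions, not part of this diff.
    engine = LLMEngine.from_server_args(server_args)
    prompts = ["A robot may not injure a human being", "To be or not to be,"]
    for request_id, prompt in enumerate(prompts):
        engine.add_request(str(request_id), prompt, SamplingParams(temperature=0.8))
    while engine.has_unfinished_requests():
        # Each step() runs one scheduling/decoding iteration and returns
        # RequestOutput objects for the requests it touched.
        for request_output in engine.step():
            print(request_output.prompt, request_output.outputs[0].text)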