vllm · commit 0b98ba15 (Unverified)

Change the name to vLLM (#150)

Authored Jun 17, 2023 by Woosuk Kwon; committed by GitHub on Jun 17, 2023.
Parent: e5464ee4
Changes: 88 files in total; this page shows 20 changed files with 81 additions and 72 deletions (+81 / -72).
Files changed (this page):

  vllm/config.py                          +3   -3
  vllm/core/__init__.py                   +0   -0
  vllm/core/block_manager.py              +3   -3
  vllm/core/policy.py                     +1   -1
  vllm/core/scheduler.py                  +7   -7
  vllm/engine/__init__.py                 +0   -0
  vllm/engine/arg_utils.py                +5   -5
  vllm/engine/async_llm_engine.py         +9   -9
  vllm/engine/llm_engine.py               +13  -14
  vllm/engine/ray_utils.py                +1   -1
  vllm/engine/tokenizer_utils.py          +1   -1
  vllm/entrypoints/__init__.py            +0   -0
  vllm/entrypoints/api_server.py          +4   -4
  vllm/entrypoints/llm.py                 +5   -5
  vllm/entrypoints/openai/__init__.py     +0   -0
  vllm/entrypoints/openai/api_server.py   +13  -13
  vllm/entrypoints/openai/protocol.py     +3   -3
  vllm/logger.py                          +1   -1
  vllm/model_executor/__init__.py         +10  -0
  vllm/model_executor/input_metadata.py   +2   -2
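The change is mechanical: the cacheflow package moves to vllm, and every import path, docstring, and user-facing string is updated to the new name. For downstream code, only the package prefix changes, as in this minimal before/after sketch (the specific modules are taken from the hunks below):

# Before this commit (old package name):
#   from cacheflow.sampling_params import SamplingParams
#   from cacheflow.entrypoints.llm import LLM

# After this commit (same modules, new package prefix):
from vllm.sampling_params import SamplingParams
from vllm.entrypoints.llm import LLM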
cacheflow/config.py → vllm/config.py
...
@@ -3,8 +3,8 @@ from typing import Optional
 import torch
 from transformers import AutoConfig, PretrainedConfig

-from cacheflow.logger import init_logger
-from cacheflow.utils import get_cpu_memory
+from vllm.logger import init_logger
+from vllm.utils import get_cpu_memory

 logger = init_logger(__name__)
...
@@ -87,7 +87,7 @@ class CacheConfig:
     Args:
         block_size: Size of a cache block in number of tokens.
         gpu_memory_utilization: Fraction of GPU memory to use for the
-            CacheFlow execution.
+            vLLM execution.
         swap_space: Size of the CPU swap space per GPU (in GiB).
     """

     def __init__(
...
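The docstring above documents the three knobs of CacheConfig. A minimal construction might look like the following sketch; the values are illustrative, and it assumes the __init__ parameters mirror the documented Args, since the signature is truncated in this hunk:

from vllm.config import CacheConfig

# Hypothetical values: block_size is in tokens, gpu_memory_utilization is a
# fraction of total GPU memory, swap_space is the per-GPU CPU swap in GiB.
cache_config = CacheConfig(
    block_size=16,
    gpu_memory_utilization=0.90,
    swap_space=4,
)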
cacheflow/core/__init__.py → vllm/core/__init__.py
File moved
cacheflow/core/block_manager.py → vllm/core/block_manager.py

 """A block manager that manages token blocks."""
 from typing import Dict, List, Optional, Set, Tuple

-from cacheflow.block import PhysicalTokenBlock
-from cacheflow.sequence import Sequence, SequenceGroup, SequenceStatus
-from cacheflow.utils import Device
+from vllm.block import PhysicalTokenBlock
+from vllm.sequence import Sequence, SequenceGroup, SequenceStatus
+from vllm.utils import Device


 class BlockAllocator:
...
cacheflow/core/policy.py → vllm/core/policy.py

 from typing import List

-from cacheflow.sequence import SequenceGroup
+from vllm.sequence import SequenceGroup


 class Policy:
...
cacheflow/core/scheduler.py → vllm/core/scheduler.py
...
@@ -2,13 +2,13 @@ import enum
 import time
 from typing import Dict, List, Optional, Tuple

-from cacheflow.config import CacheConfig, SchedulerConfig
-from cacheflow.core.block_manager import BlockSpaceManager
-from cacheflow.core.policy import PolicyFactory
-from cacheflow.logger import init_logger
-from cacheflow.sequence import (Sequence, SequenceData, SequenceGroup,
-                                SequenceGroupMetadata, SequenceOutputs,
-                                SequenceStatus)
+from vllm.config import CacheConfig, SchedulerConfig
+from vllm.core.block_manager import BlockSpaceManager
+from vllm.core.policy import PolicyFactory
+from vllm.logger import init_logger
+from vllm.sequence import (Sequence, SequenceData, SequenceGroup,
+                           SequenceGroupMetadata, SequenceOutputs,
+                           SequenceStatus)

 logger = init_logger(__name__)
...
cacheflow/engine/__init__.py → vllm/engine/__init__.py
File moved
cacheflow/engine/arg_utils.py → vllm/engine/arg_utils.py
...
@@ -3,13 +3,13 @@ import dataclasses
 from dataclasses import dataclass
 from typing import Optional, Tuple

-from cacheflow.config import (CacheConfig, ModelConfig, ParallelConfig,
-                              SchedulerConfig)
+from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
+                         SchedulerConfig)


 @dataclass
 class EngineArgs:
-    """Arguments for CacheFlow engine."""
+    """Arguments for vLLM engine."""
     model: str
     download_dir: Optional[str] = None
     use_np_weights: bool = False
...
@@ -33,7 +33,7 @@ class EngineArgs:
     def add_cli_args(
         parser: argparse.ArgumentParser,
     ) -> argparse.ArgumentParser:
-        """Shared CLI arguments for CacheFlow engine."""
+        """Shared CLI arguments for vLLM engine."""
         # Model arguments
         parser.add_argument('--model', type=str, default='facebook/opt-125m',
                             help='name or path of the huggingface model to use')
...
@@ -118,7 +118,7 @@ class EngineArgs:
 @dataclass
 class AsyncEngineArgs(EngineArgs):
-    """Arguments for asynchronous CacheFlow engine."""
+    """Arguments for asynchronous vLLM engine."""
     engine_use_ray: bool = False
     disable_log_requests: bool = False
...
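add_cli_args is written to be shared by the different entrypoints: it registers the engine flags on any argparse parser and hands the parser back. A small driver script could reuse it roughly as follows (a sketch; it assumes nothing beyond the fields and the '--model' default shown in the hunk above):

import argparse

from vllm.engine.arg_utils import EngineArgs

parser = argparse.ArgumentParser(description="Demo of the shared vLLM CLI arguments.")
parser = EngineArgs.add_cli_args(parser)

# With no CLI input, '--model' falls back to 'facebook/opt-125m' as registered above.
args = parser.parse_args([])
engine_args = EngineArgs(model=args.model)
print(engine_args)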
cacheflow/engine/async_llm_engine.py → vllm/engine/async_llm_engine.py
...
@@ -2,12 +2,12 @@ import asyncio
 import time
 from typing import Dict, List, Optional

-from cacheflow.engine.arg_utils import AsyncEngineArgs
-from cacheflow.engine.llm_engine import LLMEngine
-from cacheflow.engine.ray_utils import initialize_cluster, ray
-from cacheflow.logger import init_logger
-from cacheflow.outputs import RequestOutput
-from cacheflow.sampling_params import SamplingParams
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.llm_engine import LLMEngine
+from vllm.engine.ray_utils import initialize_cluster, ray
+from vllm.logger import init_logger
+from vllm.outputs import RequestOutput
+from vllm.sampling_params import SamplingParams

 logger = init_logger(__name__)
...
@@ -104,7 +104,7 @@ class AsyncLLMEngine:
         arrival_time = time.time()

         # Create an event to notify us that there is new output from the
-        # cacheflow engine.
+        # vLLM engine.
         request_event = asyncio.Event()
         self.request_events[request_id] = request_event
...
@@ -114,7 +114,7 @@ class AsyncLLMEngine:
                     f"sampling params: {sampling_params}, "
                     f"prompt token ids: {prompt_token_ids}.")

-        # Add the request into the cacheflow engine's waiting queue.
+        # Add the request into the vLLM engine's waiting queue.
         if self.engine_use_ray:
             await self.engine.add_request.remote(
                 request_id, prompt, sampling_params,
...
@@ -126,7 +126,7 @@ class AsyncLLMEngine:
                 prompt_token_ids=prompt_token_ids,
                 arrival_time=arrival_time)

-        # The cacheflow engine does not have a background loop that keeps
+        # The vLLM engine does not have a background loop that keeps
         # processing incoming requests. Therefore, we need to keep kicking
         # the engine to process the requests.
         while True:
...
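The comments above describe the request flow: each new request registers an asyncio.Event, is pushed into the engine's waiting queue, and the caller then keeps stepping the engine because there is no background loop. A rough client-side sketch of driving this class follows; it assumes a from_engine_args constructor and an async generate(prompt, sampling_params, request_id) generator that yields RequestOutput objects, neither of which is shown in this excerpt:

import asyncio

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.sampling_params import SamplingParams
from vllm.utils import random_uuid


async def main() -> None:
    # Assumed constructor; the hunk above only shows the import-path changes.
    engine = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(model="facebook/opt-125m"))
    # Assumed async-generator API: each iteration "kicks" the engine and
    # yields the latest RequestOutput, as the comments above describe.
    async for output in engine.generate("Hello, my name is",
                                        SamplingParams(),
                                        request_id=random_uuid()):
        print(output)


asyncio.run(main())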
cacheflow/engine/llm_engine.py → vllm/engine/llm_engine.py

 import time
 from typing import Any, List, Optional

-from cacheflow.config import (CacheConfig, ModelConfig, ParallelConfig,
-                              SchedulerConfig)
-from cacheflow.core.scheduler import Scheduler
-from cacheflow.engine.arg_utils import EngineArgs
-from cacheflow.engine.ray_utils import DeviceID, initialize_cluster, ray
-from cacheflow.engine.tokenizer_utils import (detokenize_incrementally,
-                                              get_tokenizer)
-from cacheflow.logger import init_logger
-from cacheflow.outputs import RequestOutput
-from cacheflow.sampling_params import SamplingParams
-from cacheflow.sequence import Sequence, SequenceGroup, SequenceStatus
-from cacheflow.utils import Counter
-from cacheflow.worker.worker import Worker
+from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
+                         SchedulerConfig)
+from vllm.core.scheduler import Scheduler
+from vllm.engine.arg_utils import EngineArgs
+from vllm.engine.ray_utils import DeviceID, initialize_cluster, ray
+from vllm.engine.tokenizer_utils import detokenize_incrementally, get_tokenizer
+from vllm.logger import init_logger
+from vllm.outputs import RequestOutput
+from vllm.sampling_params import SamplingParams
+from vllm.sequence import Sequence, SequenceGroup, SequenceStatus
+from vllm.utils import Counter
+from vllm.worker.worker import Worker

 logger = init_logger(__name__)
...
@@ -21,7 +20,7 @@ logger = init_logger(__name__)
 class LLMEngine:
     """An LLM engine that receives requests and generates texts.

-    This is the main class for the CacheFlow LLM engine. It receives requests
+    This is the main class for the vLLM engine. It receives requests
     from clients and generates texts from the LLM. It includes a tokenizer, a
     language model (possibly distributed across multiple GPUs), and GPU memory
     space allocated for intermediate states (aka KV cache). This class utilizes
...
cacheflow/engine/ray_utils.py → vllm/engine/ray_utils.py
...
@@ -6,7 +6,7 @@ try:
 except ImportError:
     ray = None

-from cacheflow.config import ParallelConfig
+from vllm.config import ParallelConfig

 DeviceID = Tuple[int, Optional[str], int]  # rank, node resource (node IP), device id
...
cacheflow/engine/tokenizer_utils.py → vllm/engine/tokenizer_utils.py
...
@@ -3,7 +3,7 @@ from typing import List, Tuple, Union
 from transformers import (AutoConfig, AutoTokenizer, PreTrainedTokenizer,
                           PreTrainedTokenizerFast)

-from cacheflow.logger import init_logger
+from vllm.logger import init_logger

 logger = init_logger(__name__)
...
cacheflow/entrypoints/__init__.py → vllm/entrypoints/__init__.py
File moved
cacheflow/entrypoints/api_server.py → vllm/entrypoints/api_server.py
...
@@ -6,10 +6,10 @@ from fastapi import BackgroundTasks, FastAPI, Request
 from fastapi.responses import Response, StreamingResponse
 import uvicorn

-from cacheflow.engine.arg_utils import AsyncEngineArgs
-from cacheflow.engine.async_llm_engine import AsyncLLMEngine
-from cacheflow.sampling_params import SamplingParams
-from cacheflow.utils import random_uuid
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.sampling_params import SamplingParams
+from vllm.utils import random_uuid

 TIMEOUT_KEEP_ALIVE = 5  # seconds.
 TIMEOUT_TO_PREVENT_DEADLOCK = 1  # seconds
...
cacheflow/entrypoints/llm.py → vllm/entrypoints/llm.py
...
@@ -3,11 +3,11 @@ from typing import List, Optional, Union
 from tqdm import tqdm
 from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast

-from cacheflow.engine.arg_utils import EngineArgs
-from cacheflow.engine.llm_engine import LLMEngine
-from cacheflow.outputs import RequestOutput
-from cacheflow.sampling_params import SamplingParams
-from cacheflow.utils import Counter
+from vllm.engine.arg_utils import EngineArgs
+from vllm.engine.llm_engine import LLMEngine
+from vllm.outputs import RequestOutput
+from vllm.sampling_params import SamplingParams
+from vllm.utils import Counter


 class LLM:
...
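The LLM class defined in this file is the offline, batched-inference entrypoint. Under the new package name, basic usage looks roughly like the sketch below; only the renamed imports are confirmed by this hunk, while the constructor arguments, generate() signature, and RequestOutput fields are assumptions carried over from how the API is commonly used:

from vllm.entrypoints.llm import LLM
from vllm.sampling_params import SamplingParams

# Assumed constructor arguments and generate() signature; not shown in this hunk.
llm = LLM(model="facebook/opt-125m")
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
outputs = llm.generate(["Hello, my name is"], sampling_params)
for output in outputs:
    # RequestOutput is assumed to carry the prompt and a list of completions.
    print(output.prompt, output.outputs[0].text)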
cacheflow/entrypoints/openai/__init__.py → vllm/entrypoints/openai/__init__.py
File moved
cacheflow/entrypoints/openai/api_server.py → vllm/entrypoints/openai/api_server.py
...
@@ -13,17 +13,17 @@ from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, StreamingResponse
 import uvicorn

-from cacheflow.engine.arg_utils import AsyncEngineArgs
-from cacheflow.engine.async_llm_engine import AsyncLLMEngine
-from cacheflow.engine.tokenizer_utils import get_tokenizer
-from cacheflow.entrypoints.openai.protocol import (
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.engine.tokenizer_utils import get_tokenizer
+from vllm.entrypoints.openai.protocol import (
     CompletionRequest, CompletionResponse, CompletionResponseChoice,
     CompletionResponseStreamChoice, CompletionStreamResponse, ErrorResponse,
     LogProbs, ModelCard, ModelList, ModelPermission, UsageInfo)
-from cacheflow.logger import init_logger
-from cacheflow.outputs import RequestOutput
-from cacheflow.sampling_params import SamplingParams
-from cacheflow.utils import random_uuid
+from vllm.logger import init_logger
+from vllm.outputs import RequestOutput
+from vllm.sampling_params import SamplingParams
+from vllm.utils import random_uuid

 TIMEOUT_KEEP_ALIVE = 5  # seconds
...
@@ -93,11 +93,11 @@ async def create_completion(raw_request: Request):
     for the API specification. This API mimics the OpenAI Completion API.

     NOTE: Currently we do not support the following features:
-        - echo (since the cacheflow engine does not currently support
+        - echo (since the vLLM engine does not currently support
           getting the logprobs of prompt tokens)
         - suffix (the language models we currently support do not support
          suffix)
-        - logit_bias (to be supported in cacheflow engine)
+        - logit_bias (to be supported by vLLM engine)
     """
     request = CompletionRequest(**await raw_request.json())
     logger.info(f"Received completion request: {request}")
...
@@ -107,7 +107,7 @@ async def create_completion(raw_request: Request):
         return error_check_ret

     if request.echo:
-        # We do not support echo since the cacheflow engine does not
+        # We do not support echo since the vLLM engine does not
         # currently support getting the logprobs of prompt tokens.
         return create_error_response(HTTPStatus.BAD_REQUEST,
                                      "echo is not currently supported")
...
@@ -118,7 +118,7 @@ async def create_completion(raw_request: Request):
                                      "suffix is not currently supported")

     if request.logit_bias is not None:
-        # TODO: support logit_bias in cacheflow engine.
+        # TODO: support logit_bias in vLLM engine.
         return create_error_response(HTTPStatus.BAD_REQUEST,
                                      "logit_bias is not currently supported")
...
@@ -274,7 +274,7 @@ async def create_completion(raw_request: Request):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        description="CacheFlow OpenAI-Compatible RESTful API server.")
+        description="vLLM OpenAI-Compatible RESTful API server.")
     parser.add_argument("--host", type=str, default="localhost", help="host name")
     parser.add_argument("--port", type=int, default=8000, help="port number")
...
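Given the --host and --port defaults registered above, a client can exercise the renamed server with an OpenAI-style completion request. A minimal sketch follows; the /v1/completions route and the standard request fields are assumptions based on the OpenAI Completions API that the docstring says is mimicked, while top_k is the vLLM-specific extension defined in the protocol module below:

import json
import urllib.request

# Assumed endpoint path for the OpenAI-compatible completion API.
url = "http://localhost:8000/v1/completions"
payload = {
    "model": "facebook/opt-125m",
    "prompt": "Hello, my name is",
    "max_tokens": 16,
    # vLLM-specific extension (see vllm/entrypoints/openai/protocol.py below).
    "top_k": 10,
}
req = urllib.request.Request(url,
                             data=json.dumps(payload).encode("utf-8"),
                             headers={"Content-Type": "application/json"})
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read()))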
cacheflow/entrypoints/openai/protocol.py → vllm/entrypoints/openai/protocol.py
...
@@ -4,7 +4,7 @@ from typing import Dict, List, Literal, Optional, Union
 from pydantic import BaseModel, Field

-from cacheflow.utils import random_uuid
+from vllm.utils import random_uuid


 class ErrorResponse(BaseModel):
...
@@ -34,7 +34,7 @@ class ModelCard(BaseModel):
     id: str
     object: str = "model"
     created: int = Field(default_factory=lambda: int(time.time()))
-    owned_by: str = "cacheflow"
+    owned_by: str = "vllm"
     root: Optional[str] = None
     parent: Optional[str] = None
     permission: List[ModelPermission] = Field(default_factory=list)
...
@@ -82,7 +82,7 @@ class CompletionRequest(BaseModel):
     best_of: Optional[int] = None
     logit_bias: Optional[Dict[str, float]] = None
     user: Optional[str] = None
-    # Additional parameters supported by cacheflow
+    # Additional parameters supported by vLLM
     top_k: Optional[int] = -1
     ignore_eos: Optional[bool] = False
     use_beam_search: Optional[bool] = False
...
cacheflow/logger.py → vllm/logger.py
...
@@ -22,7 +22,7 @@ class NewLineFormatter(logging.Formatter):
         return msg


-_root_logger = logging.getLogger("cacheflow")
+_root_logger = logging.getLogger("vllm")
 _default_handler = None
...
vllm/model_executor/__init__.py (new file, 0 → 100644)

+from vllm.model_executor.input_metadata import InputMetadata
+from vllm.model_executor.model_loader import get_model
+from vllm.model_executor.utils import set_random_seed
+
+__all__ = [
+    "InputMetadata",
+    "get_model",
+    "set_random_seed",
+]
cacheflow/model_executor/input_metadata.py → vllm/model_executor/input_metadata.py
...
@@ -3,8 +3,8 @@ from typing import Dict, List, Tuple
 import torch
 from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask

-from cacheflow.sampling_params import SamplingParams
-from cacheflow.sequence import SequenceData
+from vllm.sampling_params import SamplingParams
+from vllm.sequence import SequenceData


 class InputMetadata:
...