norm / vllm · Commits · 0b98ba15

Commit 0b98ba15 (unverified)
Authored Jun 17, 2023 by Woosuk Kwon; committed via GitHub on Jun 17, 2023

Change the name to vLLM (#150)

Parent: e5464ee4
Changes: 88 files in the full commit. This page (1 of 5) shows 20 changed files with 81 additions and 72 deletions (+81 −72).
Files on this page:

- vllm/config.py (+3 −3)
- vllm/core/__init__.py (+0 −0)
- vllm/core/block_manager.py (+3 −3)
- vllm/core/policy.py (+1 −1)
- vllm/core/scheduler.py (+7 −7)
- vllm/engine/__init__.py (+0 −0)
- vllm/engine/arg_utils.py (+5 −5)
- vllm/engine/async_llm_engine.py (+9 −9)
- vllm/engine/llm_engine.py (+13 −14)
- vllm/engine/ray_utils.py (+1 −1)
- vllm/engine/tokenizer_utils.py (+1 −1)
- vllm/entrypoints/__init__.py (+0 −0)
- vllm/entrypoints/api_server.py (+4 −4)
- vllm/entrypoints/llm.py (+5 −5)
- vllm/entrypoints/openai/__init__.py (+0 −0)
- vllm/entrypoints/openai/api_server.py (+13 −13)
- vllm/entrypoints/openai/protocol.py (+3 −3)
- vllm/logger.py (+1 −1)
- vllm/model_executor/__init__.py (+10 −0)
- vllm/model_executor/input_metadata.py (+2 −2)
cacheflow/config.py → vllm/config.py

@@ -3,8 +3,8 @@ from typing import Optional
 import torch
 from transformers import AutoConfig, PretrainedConfig

-from cacheflow.logger import init_logger
-from cacheflow.utils import get_cpu_memory
+from vllm.logger import init_logger
+from vllm.utils import get_cpu_memory

 logger = init_logger(__name__)

@@ -87,7 +87,7 @@ class CacheConfig:
     Args:
         block_size: Size of a cache block in number of tokens.
         gpu_memory_utilization: Fraction of GPU memory to use for the
-            CacheFlow execution.
+            vLLM execution.
         swap_space: Size of the CPU swap space per GPU (in GiB).
     """

     def __init__(
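The CacheConfig docstring above documents three knobs: block_size (tokens per KV-cache block), gpu_memory_utilization (fraction of GPU memory reserved for vLLM execution), and swap_space (CPU swap per GPU, in GiB). A minimal construction sketch, assuming __init__ accepts exactly these documented values as keyword arguments; the full signature is cut off in this hunk:

    from vllm.config import CacheConfig

    # Assumed keyword names follow the docstring above; the real __init__
    # signature is not shown in this hunk.
    cache_config = CacheConfig(
        block_size=16,               # tokens per KV-cache block
        gpu_memory_utilization=0.9,  # fraction of GPU memory given to vLLM
        swap_space=4,                # CPU swap space per GPU, in GiB
    )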
cacheflow/core/__init__.py → vllm/core/__init__.py

File moved.
cacheflow/core/block_manager.py → vllm/core/block_manager.py

 """A block manager that manages token blocks."""
 from typing import Dict, List, Optional, Set, Tuple

-from cacheflow.block import PhysicalTokenBlock
-from cacheflow.sequence import Sequence, SequenceGroup, SequenceStatus
-from cacheflow.utils import Device
+from vllm.block import PhysicalTokenBlock
+from vllm.sequence import Sequence, SequenceGroup, SequenceStatus
+from vllm.utils import Device


 class BlockAllocator:
cacheflow/core/policy.py → vllm/core/policy.py

 from typing import List

-from cacheflow.sequence import SequenceGroup
+from vllm.sequence import SequenceGroup


 class Policy:
cacheflow/core/scheduler.py → vllm/core/scheduler.py

@@ -2,13 +2,13 @@ import enum
 import time
 from typing import Dict, List, Optional, Tuple

-from cacheflow.config import CacheConfig, SchedulerConfig
-from cacheflow.core.block_manager import BlockSpaceManager
-from cacheflow.core.policy import PolicyFactory
-from cacheflow.logger import init_logger
-from cacheflow.sequence import (Sequence, SequenceData, SequenceGroup,
-                                SequenceGroupMetadata, SequenceOutputs,
-                                SequenceStatus)
+from vllm.config import CacheConfig, SchedulerConfig
+from vllm.core.block_manager import BlockSpaceManager
+from vllm.core.policy import PolicyFactory
+from vllm.logger import init_logger
+from vllm.sequence import (Sequence, SequenceData, SequenceGroup,
+                           SequenceGroupMetadata, SequenceOutputs,
+                           SequenceStatus)

 logger = init_logger(__name__)
cacheflow/engine/__init__.py → vllm/engine/__init__.py

File moved.
cacheflow/engine/arg_utils.py → vllm/engine/arg_utils.py

@@ -3,13 +3,13 @@ import dataclasses
 from dataclasses import dataclass
 from typing import Optional, Tuple

-from cacheflow.config import (CacheConfig, ModelConfig, ParallelConfig,
-                              SchedulerConfig)
+from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
+                         SchedulerConfig)


 @dataclass
 class EngineArgs:
-    """Arguments for CacheFlow engine."""
+    """Arguments for vLLM engine."""
     model: str
     download_dir: Optional[str] = None
     use_np_weights: bool = False

@@ -33,7 +33,7 @@ class EngineArgs:
     def add_cli_args(
         parser: argparse.ArgumentParser,
     ) -> argparse.ArgumentParser:
-        """Shared CLI arguments for CacheFlow engine."""
+        """Shared CLI arguments for vLLM engine."""
         # Model arguments
         parser.add_argument('--model', type=str, default='facebook/opt-125m',
                             help='name or path of the huggingface model to use')

@@ -118,7 +118,7 @@ class EngineArgs:
 @dataclass
 class AsyncEngineArgs(EngineArgs):
-    """Arguments for asynchronous CacheFlow engine."""
+    """Arguments for asynchronous vLLM engine."""
     engine_use_ray: bool = False
     disable_log_requests: bool = False
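EngineArgs and its add_cli_args helper shown above let other scripts reuse the engine's CLI flags. A short sketch based only on what appears in this hunk (the --model flag and the model/download_dir/use_np_weights fields); parsing an empty argv simply picks up the defaults:

    import argparse

    from vllm.engine.arg_utils import EngineArgs

    # Attach the engine's shared CLI arguments to a custom parser.
    parser = argparse.ArgumentParser(description="toy vLLM launcher")
    parser = EngineArgs.add_cli_args(parser)

    # With no argv, defaults apply, e.g. --model defaults to 'facebook/opt-125m'.
    args = parser.parse_args([])
    engine_args = EngineArgs(model=args.model)
    print(engine_args)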
cacheflow/engine/async_llm_engine.py → vllm/engine/async_llm_engine.py

@@ -2,12 +2,12 @@ import asyncio
 import time
 from typing import Dict, List, Optional

-from cacheflow.engine.arg_utils import AsyncEngineArgs
-from cacheflow.engine.llm_engine import LLMEngine
-from cacheflow.engine.ray_utils import initialize_cluster, ray
-from cacheflow.logger import init_logger
-from cacheflow.outputs import RequestOutput
-from cacheflow.sampling_params import SamplingParams
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.llm_engine import LLMEngine
+from vllm.engine.ray_utils import initialize_cluster, ray
+from vllm.logger import init_logger
+from vllm.outputs import RequestOutput
+from vllm.sampling_params import SamplingParams

 logger = init_logger(__name__)

@@ -104,7 +104,7 @@ class AsyncLLMEngine:
         arrival_time = time.time()

         # Create an event to notify us that there is new output from the
-        # cacheflow engine.
+        # vLLM engine.
         request_event = asyncio.Event()
         self.request_events[request_id] = request_event

@@ -114,7 +114,7 @@ class AsyncLLMEngine:
                     f"sampling params: {sampling_params}, "
                     f"prompt token ids: {prompt_token_ids}.")

-        # Add the request into the cacheflow engine's waiting queue.
+        # Add the request into the vLLM engine's waiting queue.
         if self.engine_use_ray:
             await self.engine.add_request.remote(
                 request_id, prompt, sampling_params,

@@ -126,7 +126,7 @@ class AsyncLLMEngine:
                 prompt_token_ids=prompt_token_ids,
                 arrival_time=arrival_time)

-        # The cacheflow engine does not have a background loop that keeps
+        # The vLLM engine does not have a background loop that keeps
         # processing incoming requests. Therefore, we need to keep kicking
         # the engine to process the requests.
         while True:
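The renamed comments in this hunk spell out the AsyncLLMEngine pattern: one asyncio.Event per request, and because the engine has no background loop, callers must keep kicking it until their request finishes. A self-contained toy illustration of that pattern (ToyEngine and generate here are hypothetical stand-ins, not vLLM APIs):

    import asyncio

    # Toy stand-ins (not vLLM classes) illustrating the pattern from the
    # comments above: a per-request asyncio.Event plus repeated "kicks" of an
    # engine that has no background loop of its own.
    class ToyEngine:
        def __init__(self):
            self.waiting = []

        def add_request(self, request_id, prompt):
            self.waiting.append((request_id, prompt))

        def step(self):
            # Each kick processes at most one waiting request.
            if not self.waiting:
                return []
            request_id, prompt = self.waiting.pop(0)
            return [(request_id, prompt.upper())]

    async def generate(engine, events, results, request_id, prompt):
        event = asyncio.Event()               # one event per request
        events[request_id] = event
        engine.add_request(request_id, prompt)
        while True:
            for rid, text in engine.step():   # keep kicking the engine
                results[rid] = text
                events[rid].set()
            if event.is_set():
                return results[request_id]
            await asyncio.sleep(0)            # yield so other requests can run

    async def main():
        engine, events, results = ToyEngine(), {}, {}
        print(await asyncio.gather(
            generate(engine, events, results, "r0", "hello"),
            generate(engine, events, results, "r1", "world")))

    asyncio.run(main())   # ['HELLO', 'WORLD']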
cacheflow/engine/llm_engine.py → vllm/engine/llm_engine.py

 import time
 from typing import Any, List, Optional

-from cacheflow.config import (CacheConfig, ModelConfig, ParallelConfig,
-                              SchedulerConfig)
-from cacheflow.core.scheduler import Scheduler
-from cacheflow.engine.arg_utils import EngineArgs
-from cacheflow.engine.ray_utils import DeviceID, initialize_cluster, ray
-from cacheflow.engine.tokenizer_utils import (detokenize_incrementally,
-                                              get_tokenizer)
-from cacheflow.logger import init_logger
-from cacheflow.outputs import RequestOutput
-from cacheflow.sampling_params import SamplingParams
-from cacheflow.sequence import Sequence, SequenceGroup, SequenceStatus
-from cacheflow.utils import Counter
-from cacheflow.worker.worker import Worker
+from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
+                         SchedulerConfig)
+from vllm.core.scheduler import Scheduler
+from vllm.engine.arg_utils import EngineArgs
+from vllm.engine.ray_utils import DeviceID, initialize_cluster, ray
+from vllm.engine.tokenizer_utils import detokenize_incrementally, get_tokenizer
+from vllm.logger import init_logger
+from vllm.outputs import RequestOutput
+from vllm.sampling_params import SamplingParams
+from vllm.sequence import Sequence, SequenceGroup, SequenceStatus
+from vllm.utils import Counter
+from vllm.worker.worker import Worker

 logger = init_logger(__name__)

@@ -21,7 +20,7 @@ logger = init_logger(__name__)
 class LLMEngine:
     """An LLM engine that receives requests and generates texts.

-    This is the main class for the CacheFlow LLM engine. It receives requests
+    This is the main class for the vLLM engine. It receives requests
     from clients and generates texts from the LLM. It includes a tokenizer, a
     language model (possibly distributed across multiple GPUs), and GPU memory
     space allocated for intermediate states (aka KV cache). This class utilizes
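The LLMEngine docstring renamed here describes the synchronous engine that the LLM and AsyncLLMEngine entry points wrap. A hedged usage sketch: add_request's positional arguments follow the call visible in async_llm_engine.py above, while from_engine_args, step, and has_unfinished_requests are assumed from the released vllm package and are not shown on this page:

    from vllm.engine.arg_utils import EngineArgs
    from vllm.engine.llm_engine import LLMEngine
    from vllm.sampling_params import SamplingParams

    # from_engine_args / step / has_unfinished_requests are assumed APIs here.
    engine = LLMEngine.from_engine_args(EngineArgs(model="facebook/opt-125m"))
    engine.add_request("0", "The capital of France is",
                       SamplingParams(temperature=0.0))

    latest = {}
    while engine.has_unfinished_requests():
        for request_output in engine.step():
            # Keep the most recent RequestOutput per request id.
            latest[request_output.request_id] = request_output

    print(latest["0"].outputs[0].text)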
cacheflow/engine/ray_utils.py → vllm/engine/ray_utils.py

@@ -6,7 +6,7 @@ try:
 except ImportError:
     ray = None

-from cacheflow.config import ParallelConfig
+from vllm.config import ParallelConfig

 DeviceID = Tuple[int, Optional[str], int]  # rank, node resource (node IP), device id
cacheflow/engine/tokenizer_utils.py → vllm/engine/tokenizer_utils.py

@@ -3,7 +3,7 @@ from typing import List, Tuple, Union
 from transformers import (AutoConfig, AutoTokenizer, PreTrainedTokenizer,
                           PreTrainedTokenizerFast)

-from cacheflow.logger import init_logger
+from vllm.logger import init_logger

 logger = init_logger(__name__)
cacheflow/entrypoints/__init__.py → vllm/entrypoints/__init__.py

File moved.
cacheflow/entrypoints/api_server.py → vllm/entrypoints/api_server.py

@@ -6,10 +6,10 @@ from fastapi import BackgroundTasks, FastAPI, Request
 from fastapi.responses import Response, StreamingResponse
 import uvicorn

-from cacheflow.engine.arg_utils import AsyncEngineArgs
-from cacheflow.engine.async_llm_engine import AsyncLLMEngine
-from cacheflow.sampling_params import SamplingParams
-from cacheflow.utils import random_uuid
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.sampling_params import SamplingParams
+from vllm.utils import random_uuid

 TIMEOUT_KEEP_ALIVE = 5  # seconds.
 TIMEOUT_TO_PREVENT_DEADLOCK = 1  # seconds
cacheflow/entrypoints/llm.py → vllm/entrypoints/llm.py

@@ -3,11 +3,11 @@ from typing import List, Optional, Union
 from tqdm import tqdm
 from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast

-from cacheflow.engine.arg_utils import EngineArgs
-from cacheflow.engine.llm_engine import LLMEngine
-from cacheflow.outputs import RequestOutput
-from cacheflow.sampling_params import SamplingParams
-from cacheflow.utils import Counter
+from vllm.engine.arg_utils import EngineArgs
+from vllm.engine.llm_engine import LLMEngine
+from vllm.outputs import RequestOutput
+from vllm.sampling_params import SamplingParams
+from vllm.utils import Counter


 class LLM:
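The LLM class moved here is the offline batch-inference entry point built on LLMEngine. A minimal sketch using only module paths that appear in this diff; the generate call and the shape of its RequestOutput results are assumed from the released vllm package:

    from vllm.entrypoints.llm import LLM
    from vllm.sampling_params import SamplingParams

    llm = LLM(model="facebook/opt-125m")   # default model name from arg_utils.py above
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    # Assumed: generate() takes a list of prompts and returns one RequestOutput each.
    outputs = llm.generate(["Hello, my name is", "The future of AI is"],
                           sampling_params)
    for output in outputs:
        print(output.prompt, "->", output.outputs[0].text)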
cacheflow/entrypoints/openai/__init__.py → vllm/entrypoints/openai/__init__.py

File moved.
cacheflow/entrypoints/openai/api_server.py → vllm/entrypoints/openai/api_server.py

@@ -13,17 +13,17 @@ from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, StreamingResponse
 import uvicorn

-from cacheflow.engine.arg_utils import AsyncEngineArgs
-from cacheflow.engine.async_llm_engine import AsyncLLMEngine
-from cacheflow.engine.tokenizer_utils import get_tokenizer
-from cacheflow.entrypoints.openai.protocol import (
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.engine.tokenizer_utils import get_tokenizer
+from vllm.entrypoints.openai.protocol import (
     CompletionRequest, CompletionResponse, CompletionResponseChoice,
     CompletionResponseStreamChoice, CompletionStreamResponse, ErrorResponse,
     LogProbs, ModelCard, ModelList, ModelPermission, UsageInfo)
-from cacheflow.logger import init_logger
-from cacheflow.outputs import RequestOutput
-from cacheflow.sampling_params import SamplingParams
-from cacheflow.utils import random_uuid
+from vllm.logger import init_logger
+from vllm.outputs import RequestOutput
+from vllm.sampling_params import SamplingParams
+from vllm.utils import random_uuid

 TIMEOUT_KEEP_ALIVE = 5  # seconds

@@ -93,11 +93,11 @@ async def create_completion(raw_request: Request):
     for the API specification. This API mimics the OpenAI Completion API.

     NOTE: Currently we do not support the following features:
-        - echo (since the cacheflow engine does not currently support
+        - echo (since the vLLM engine does not currently support
           getting the logprobs of prompt tokens)
         - suffix (the language models we currently support do not support
           suffix)
-        - logit_bias (to be supported in cacheflow engine)
+        - logit_bias (to be supported by vLLM engine)
     """
     request = CompletionRequest(**await raw_request.json())
     logger.info(f"Received completion request: {request}")

@@ -107,7 +107,7 @@ async def create_completion(raw_request: Request):
         return error_check_ret

     if request.echo:
-        # We do not support echo since the cacheflow engine does not
+        # We do not support echo since the vLLM engine does not
         # currently support getting the logprobs of prompt tokens.
         return create_error_response(HTTPStatus.BAD_REQUEST,
                                      "echo is not currently supported")

@@ -118,7 +118,7 @@ async def create_completion(raw_request: Request):
                                      "suffix is not currently supported")

     if request.logit_bias is not None:
-        # TODO: support logit_bias in cacheflow engine.
+        # TODO: support logit_bias in vLLM engine.
         return create_error_response(HTTPStatus.BAD_REQUEST,
                                      "logit_bias is not currently supported")

@@ -274,7 +274,7 @@ async def create_completion(raw_request: Request):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        description="CacheFlow OpenAI-Compatible RESTful API server.")
+        description="vLLM OpenAI-Compatible RESTful API server.")
     parser.add_argument("--host", type=str, default="localhost", help="host name")
     parser.add_argument("--port", type=int, default=8000, help="port number")
cacheflow/entrypoints/openai/protocol.py → vllm/entrypoints/openai/protocol.py

@@ -4,7 +4,7 @@ from typing import Dict, List, Literal, Optional, Union
 from pydantic import BaseModel, Field

-from cacheflow.utils import random_uuid
+from vllm.utils import random_uuid


 class ErrorResponse(BaseModel):

@@ -34,7 +34,7 @@ class ModelCard(BaseModel):
     id: str
     object: str = "model"
     created: int = Field(default_factory=lambda: int(time.time()))
-    owned_by: str = "cacheflow"
+    owned_by: str = "vllm"
     root: Optional[str] = None
     parent: Optional[str] = None
     permission: List[ModelPermission] = Field(default_factory=list)

@@ -82,7 +82,7 @@ class CompletionRequest(BaseModel):
     best_of: Optional[int] = None
     logit_bias: Optional[Dict[str, float]] = None
     user: Optional[str] = None

-    # Additional parameters supported by cacheflow
+    # Additional parameters supported by vLLM
     top_k: Optional[int] = -1
     ignore_eos: Optional[bool] = False
     use_beam_search: Optional[bool] = False
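CompletionRequest above extends the OpenAI completion schema with vLLM-specific fields (top_k, ignore_eos, use_beam_search). A hedged client sketch against the server in vllm/entrypoints/openai/api_server.py, assuming it is running on its argparse defaults (localhost:8000) and exposing the usual OpenAI-style /v1/completions route with standard model/prompt/max_tokens fields, none of which are visible in this hunk:

    import requests

    # Assumed route and standard OpenAI fields; the vLLM-specific keys below
    # are the ones defined in CompletionRequest in this diff.
    response = requests.post(
        "http://localhost:8000/v1/completions",
        json={
            "model": "facebook/opt-125m",
            "prompt": "San Francisco is a",
            "max_tokens": 32,
            "top_k": 50,               # vLLM extension
            "ignore_eos": False,       # vLLM extension
            "use_beam_search": False,  # vLLM extension
        },
    )
    print(response.json())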
cacheflow/logger.py → vllm/logger.py

@@ -22,7 +22,7 @@ class NewLineFormatter(logging.Formatter):
         return msg


-_root_logger = logging.getLogger("cacheflow")
+_root_logger = logging.getLogger("vllm")
 _default_handler = None
vllm/model_executor/__init__.py (new file, mode 100644)

+from vllm.model_executor.input_metadata import InputMetadata
+from vllm.model_executor.model_loader import get_model
+from vllm.model_executor.utils import set_random_seed
+
+__all__ = [
+    "InputMetadata",
+    "get_model",
+    "set_random_seed",
+]
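The new __init__.py above re-exports InputMetadata, get_model, and set_random_seed so the rest of the engine can import them from vllm.model_executor directly. A tiny sketch; set_random_seed is assumed to take an integer seed, which this diff does not show:

    from vllm.model_executor import InputMetadata, get_model, set_random_seed

    # Assumed signature: seed the RNGs before model loading / execution.
    set_random_seed(0)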
cacheflow/model_executor/input_metadata.py → vllm/model_executor/input_metadata.py

@@ -3,8 +3,8 @@ from typing import Dict, List, Tuple
 import torch
 from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask

-from cacheflow.sampling_params import SamplingParams
-from cacheflow.sequence import SequenceData
+from vllm.sampling_params import SamplingParams
+from vllm.sequence import SequenceData


 class InputMetadata: