Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
01bfb22b
Unverified
Commit
01bfb22b
authored
Mar 25, 2024
by
SangBin Cho
Committed by
GitHub
Mar 25, 2024
Browse files
[CI] Try introducing isort. (#3495)
parent
e67c295b
Changes
144
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
79 additions
and
76 deletions
+79
-76
vllm/attention/__init__.py
vllm/attention/__init__.py
+2
-1
vllm/attention/backends/flash_attn.py
vllm/attention/backends/flash_attn.py
+3
-2
vllm/attention/backends/xformers.py
vllm/attention/backends/xformers.py
+2
-1
vllm/attention/ops/paged_attn.py
vllm/attention/ops/paged_attn.py
+1
-2
vllm/attention/selector.py
vllm/attention/selector.py
+4
-2
vllm/config.py
vllm/config.py
+7
-6
vllm/core/block_manager.py
vllm/core/block_manager.py
+3
-3
vllm/core/evictor.py
vllm/core/evictor.py
+1
-1
vllm/core/scheduler.py
vllm/core/scheduler.py
+3
-3
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+2
-3
vllm/engine/async_llm_engine.py
vllm/engine/async_llm_engine.py
+3
-3
vllm/engine/llm_engine.py
vllm/engine/llm_engine.py
+5
-5
vllm/engine/metrics.py
vllm/engine/metrics.py
+7
-6
vllm/engine/ray_utils.py
vllm/engine/ray_utils.py
+2
-3
vllm/entrypoints/api_server.py
vllm/entrypoints/api_server.py
+1
-1
vllm/entrypoints/llm.py
vllm/entrypoints/llm.py
+1
-1
vllm/entrypoints/openai/api_server.py
vllm/entrypoints/openai/api_server.py
+8
-9
vllm/entrypoints/openai/protocol.py
vllm/entrypoints/openai/protocol.py
+2
-3
vllm/entrypoints/openai/serving_chat.py
vllm/entrypoints/openai/serving_chat.py
+8
-6
vllm/entrypoints/openai/serving_completion.py
vllm/entrypoints/openai/serving_completion.py
+14
-15
No files found.
vllm/attention/__init__.py
View file @
01bfb22b
from
vllm.attention.backends.abstract
import
AttentionBackend
,
AttentionMetadata
from
vllm.attention.backends.abstract
import
(
AttentionBackend
,
AttentionMetadata
)
from
vllm.attention.layer
import
Attention
from
vllm.attention.selector
import
get_attn_backend
...
...
vllm/attention/backends/flash_attn.py
View file @
01bfb22b
...
...
@@ -7,12 +7,13 @@ flashinfer for all the attention operations.
from
dataclasses
import
dataclass
from
typing
import
Dict
,
List
,
Optional
,
Tuple
,
Type
from
flash_attn
import
flash_attn_varlen_func
import
torch
from
flash_attn
import
flash_attn_varlen_func
from
vllm.attention.backends.abstract
import
(
AttentionBackend
,
AttentionImpl
,
AttentionMetadata
)
from
vllm.attention.ops.paged_attn
import
PagedAttention
,
PagedAttentionMetadata
from
vllm.attention.ops.paged_attn
import
(
PagedAttention
,
PagedAttentionMetadata
)
class
FlashAttentionBackend
(
AttentionBackend
):
...
...
vllm/attention/backends/xformers.py
View file @
01bfb22b
...
...
@@ -11,7 +11,8 @@ from xformers.ops.fmha.attn_bias import (AttentionBias,
from
vllm.attention.backends.abstract
import
(
AttentionBackend
,
AttentionImpl
,
AttentionMetadata
)
from
vllm.attention.ops.paged_attn
import
PagedAttention
,
PagedAttentionMetadata
from
vllm.attention.ops.paged_attn
import
(
PagedAttention
,
PagedAttentionMetadata
)
from
vllm.logger
import
init_logger
from
vllm.utils
import
is_hip
...
...
vllm/attention/ops/paged_attn.py
View file @
01bfb22b
...
...
@@ -3,8 +3,7 @@ from typing import Dict, List, Optional, Tuple
import
torch
from
vllm._C
import
cache_ops
from
vllm._C
import
ops
from
vllm._C
import
cache_ops
,
ops
from
vllm.attention.ops.prefix_prefill
import
context_attention_fwd
# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`.
...
...
vllm/attention/selector.py
View file @
01bfb22b
...
...
@@ -13,11 +13,13 @@ logger = init_logger(__name__)
def
get_attn_backend
(
dtype
:
torch
.
dtype
)
->
AttentionBackend
:
if
_can_use_flash_attn
(
dtype
):
logger
.
info
(
"Using FlashAttention backend."
)
from
vllm.attention.backends.flash_attn
import
FlashAttentionBackend
# noqa: F401
from
vllm.attention.backends.flash_attn
import
(
# noqa: F401
FlashAttentionBackend
)
return
FlashAttentionBackend
else
:
logger
.
info
(
"Using XFormers backend."
)
from
vllm.attention.backends.xformers
import
XFormersBackend
# noqa: F401
from
vllm.attention.backends.xformers
import
(
# noqa: F401
XFormersBackend
)
return
XFormersBackend
...
...
vllm/config.py
View file @
01bfb22b
from
typing
import
TYPE_CHECKING
,
Optional
,
Union
,
ClassVar
from
dataclasses
import
dataclass
import
json
import
os
from
packaging.version
import
Version
from
dataclasses
import
dataclass
from
typing
import
TYPE_CHECKING
,
ClassVar
,
Optional
,
Union
import
json
import
torch
from
packaging.version
import
Version
from
transformers
import
PretrainedConfig
from
vllm.logger
import
init_logger
from
vllm.transformers_utils.config
import
get_config
from
vllm.utils
import
get_cpu_memory
,
is_hip
,
is_neuron
,
get_nvcc_cuda_version
from
vllm.utils
import
get_cpu_memory
,
get_nvcc_cuda_version
,
is_hip
,
is_neuron
if
TYPE_CHECKING
:
from
ray.util.placement_group
import
PlacementGroup
...
...
@@ -103,7 +103,8 @@ class ModelConfig:
if
os
.
environ
.
get
(
"VLLM_USE_MODELSCOPE"
,
"False"
).
lower
()
==
"true"
:
# download model from ModelScope hub,
# lazy import so that modelscope is not required for normal use.
from
modelscope.hub.snapshot_download
import
snapshot_download
# pylint: disable=C
# pylint: disable=C.
from
modelscope.hub.snapshot_download
import
snapshot_download
if
not
os
.
path
.
exists
(
model
):
model_path
=
snapshot_download
(
model_id
=
model
,
...
...
vllm/core/block_manager.py
View file @
01bfb22b
"""A block manager that manages token blocks."""
import
enum
from
abc
import
ABC
,
abstractmethod
from
itertools
import
count
,
takewhile
from
os.path
import
commonprefix
from
typing
import
Dict
,
List
,
Optional
,
Set
,
Tuple
from
abc
import
ABC
,
abstractmethod
from
vllm.block
import
BlockTable
,
PhysicalTokenBlock
from
vllm.core.evictor
import
EvictionPolicy
,
Evictor
,
make_evictor
from
vllm.logger
import
init_logger
from
vllm.sequence
import
Sequence
,
SequenceGroup
,
SequenceStatus
from
vllm.utils
import
Device
from
vllm.core.evictor
import
Evictor
,
EvictionPolicy
,
make_evictor
from
vllm.logger
import
init_logger
logger
=
init_logger
(
__name__
)
...
...
vllm/core/evictor.py
View file @
01bfb22b
import
enum
from
typing
import
OrderedDict
from
abc
import
ABC
,
abstractmethod
,
abstractproperty
from
typing
import
OrderedDict
from
vllm.block
import
PhysicalTokenBlock
...
...
vllm/core/scheduler.py
View file @
01bfb22b
from
collections
import
deque
import
enum
import
time
from
typing
import
Deque
,
Dict
,
Iterable
,
List
,
Optional
,
Tuple
,
Union
,
Set
from
collections
import
deque
from
typing
import
Deque
,
Dict
,
Iterable
,
List
,
Optional
,
Set
,
Tuple
,
Union
from
vllm.config
import
CacheConfig
,
LoRAConfig
,
SchedulerConfig
from
vllm.core.block_manager
import
AllocStatus
,
BlockSpaceManager
from
vllm.core.policy
import
PolicyFactory
from
vllm.lora.request
import
LoRARequest
from
vllm.logger
import
init_logger
from
vllm.lora.request
import
LoRARequest
from
vllm.sequence
import
(
Sequence
,
SequenceData
,
SequenceGroup
,
SequenceGroupMetadata
,
SequenceStatus
)
...
...
vllm/engine/arg_utils.py
View file @
01bfb22b
...
...
@@ -3,9 +3,8 @@ import dataclasses
from
dataclasses
import
dataclass
from
typing
import
Optional
,
Tuple
from
vllm.config
import
(
CacheConfig
,
DeviceConfig
,
ModelConfig
,
ParallelConfig
,
SchedulerConfig
,
LoRAConfig
,
TokenizerPoolConfig
)
from
vllm.config
import
(
CacheConfig
,
DeviceConfig
,
LoRAConfig
,
ModelConfig
,
ParallelConfig
,
SchedulerConfig
,
TokenizerPoolConfig
)
@
dataclass
...
...
vllm/engine/async_llm_engine.py
View file @
01bfb22b
...
...
@@ -2,17 +2,17 @@ import asyncio
import
os
import
time
from
functools
import
partial
from
typing
import
(
Callable
,
Dict
,
Iterable
,
List
,
Optional
,
Set
,
Tuple
,
Type
,
Union
,
AsyncIterator
)
from
typing
import
(
AsyncIterator
,
Callable
,
Dict
,
Iterable
,
List
,
Optional
,
Set
,
Tuple
,
Type
,
Union
)
from
transformers
import
PreTrainedTokenizer
from
vllm.lora.request
import
LoRARequest
from
vllm.config
import
ModelConfig
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.engine.llm_engine
import
LLMEngine
from
vllm.engine.ray_utils
import
initialize_ray_cluster
,
ray
from
vllm.logger
import
init_logger
from
vllm.lora.request
import
LoRARequest
from
vllm.outputs
import
RequestOutput
from
vllm.sampling_params
import
SamplingParams
...
...
vllm/engine/llm_engine.py
View file @
01bfb22b
...
...
@@ -4,22 +4,22 @@ from typing import Iterable, List, Optional, Tuple, Type, Union
from
transformers
import
PreTrainedTokenizer
import
vllm
from
vllm.lora.request
import
LoRARequest
from
vllm.config
import
(
CacheConfig
,
DeviceConfig
,
ModelConfig
,
ParallelConfig
,
SchedulerConfig
,
LoRAConfig
)
from
vllm.config
import
(
CacheConfig
,
DeviceConfig
,
LoRAConfig
,
ModelConfig
,
ParallelConfig
,
SchedulerConfig
)
from
vllm.core.scheduler
import
Scheduler
,
SchedulerOutputs
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.executor.executor_base
import
ExecutorBase
from
vllm.engine.metrics
import
StatLogger
,
Stats
from
vllm.engine.ray_utils
import
initialize_ray_cluster
from
vllm.executor.executor_base
import
ExecutorBase
from
vllm.logger
import
init_logger
from
vllm.lora.request
import
LoRARequest
from
vllm.outputs
import
RequestOutput
from
vllm.sampling_params
import
SamplingParams
from
vllm.sequence
import
(
SamplerOutput
,
Sequence
,
SequenceGroup
,
SequenceGroupOutput
,
SequenceOutput
,
SequenceStatus
)
from
vllm.transformers_utils.detokenizer
import
Detokenizer
from
vllm.transformers_utils.tokenizer_group
import
(
BaseTokenizerGroup
,
get_tokenizer_group
)
from
vllm.transformers_utils.detokenizer
import
Detokenizer
from
vllm.utils
import
Counter
logger
=
init_logger
(
__name__
)
...
...
vllm/engine/metrics.py
View file @
01bfb22b
from
vllm.logger
import
init_logger
from
prometheus_client
import
(
Counter
,
Gauge
,
Histogram
,
Info
,
REGISTRY
,
disable_created_metrics
)
import
time
import
numpy
as
np
from
typing
import
Dict
,
List
from
dataclasses
import
dataclass
from
typing
import
Dict
,
List
import
numpy
as
np
from
prometheus_client
import
(
REGISTRY
,
Counter
,
Gauge
,
Histogram
,
Info
,
disable_created_metrics
)
from
vllm.logger
import
init_logger
logger
=
init_logger
(
__name__
)
...
...
vllm/engine/ray_utils.py
View file @
01bfb22b
import
pickle
from
typing
import
Optional
,
List
,
Tuple
from
typing
import
List
,
Optional
,
Tuple
from
vllm.config
import
ParallelConfig
from
vllm.logger
import
init_logger
from
vllm.utils
import
is_hip
,
set_cuda_visible_devices
,
get_ip
from
vllm.utils
import
get_ip
,
is_hip
,
set_cuda_visible_devices
logger
=
init_logger
(
__name__
)
...
...
vllm/entrypoints/api_server.py
View file @
01bfb22b
...
...
@@ -11,9 +11,9 @@ import json
import
ssl
from
typing
import
AsyncGenerator
import
uvicorn
from
fastapi
import
FastAPI
,
Request
from
fastapi.responses
import
JSONResponse
,
Response
,
StreamingResponse
import
uvicorn
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
...
...
vllm/entrypoints/llm.py
View file @
01bfb22b
...
...
@@ -3,9 +3,9 @@ from typing import List, Optional, Union
from
tqdm
import
tqdm
from
transformers
import
PreTrainedTokenizer
,
PreTrainedTokenizerFast
from
vllm.lora.request
import
LoRARequest
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.engine.llm_engine
import
LLMEngine
from
vllm.lora.request
import
LoRARequest
from
vllm.outputs
import
RequestOutput
from
vllm.sampling_params
import
SamplingParams
from
vllm.utils
import
Counter
...
...
vllm/entrypoints/openai/api_server.py
View file @
01bfb22b
import
asyncio
from
contextlib
import
asynccontextmanager
import
os
import
importlib
import
inspect
import
os
from
contextlib
import
asynccontextmanager
from
http
import
HTTPStatus
from
prometheus_client
import
make_asgi_app
import
fastapi
import
uvicorn
from
http
import
HTTPStatus
from
fastapi
import
Request
from
fastapi.exceptions
import
RequestValidationError
from
fastapi.middleware.cors
import
CORSMiddleware
from
fastapi.responses
import
JSONResponse
,
StreamingResponse
,
Response
from
fastapi.responses
import
JSONResponse
,
Response
,
StreamingResponse
from
prometheus_client
import
make_asgi_app
import
vllm
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.entrypoints.openai.protocol
import
(
CompletionRequest
,
ChatCompletionRequest
,
ErrorResponse
)
from
vllm.logger
import
init_logger
from
vllm.entrypoints.openai.cli_args
import
make_arg_parser
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
CompletionRequest
,
ErrorResponse
)
from
vllm.entrypoints.openai.serving_chat
import
OpenAIServingChat
from
vllm.entrypoints.openai.serving_completion
import
OpenAIServingCompletion
from
vllm.logger
import
init_logger
TIMEOUT_KEEP_ALIVE
=
5
# seconds
...
...
vllm/entrypoints/openai/protocol.py
View file @
01bfb22b
...
...
@@ -3,12 +3,11 @@
import
time
from
typing
import
Dict
,
List
,
Literal
,
Optional
,
Union
import
torch
from
pydantic
import
BaseModel
,
Field
,
model_validator
from
vllm.utils
import
random_uuid
from
vllm.sampling_params
import
SamplingParams
import
torch
from
vllm.utils
import
random_uuid
class
ErrorResponse
(
BaseModel
):
...
...
vllm/entrypoints/openai/serving_chat.py
View file @
01bfb22b
import
time
import
codecs
import
time
from
typing
import
AsyncGenerator
,
AsyncIterator
,
List
,
Optional
,
Union
from
fastapi
import
Request
from
typing
import
AsyncGenerator
,
AsyncIterator
,
Optional
,
List
,
Union
from
vllm.logger
import
init_logger
from
vllm.utils
import
random_uuid
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
ChatCompletionResponse
,
ChatCompletionResponseChoice
,
ChatCompletionResponseStreamChoice
,
ChatCompletionStreamResponse
,
ChatMessage
,
DeltaMessage
,
ErrorResponse
,
UsageInfo
)
from
vllm.
outputs
import
RequestOutput
from
vllm.
entrypoints.openai.serving_engine
import
OpenAIServing
,
LoRA
from
vllm.
entrypoints.openai.serving_engine
import
LoRA
,
OpenAIServing
from
vllm.
logger
import
init_logger
from
vllm.model_executor.guided_decoding
import
(
get_guided_decoding_logits_processor
)
from
vllm.outputs
import
RequestOutput
from
vllm.utils
import
random_uuid
logger
=
init_logger
(
__name__
)
...
...
vllm/entrypoints/openai/serving_completion.py
View file @
01bfb22b
import
asyncio
import
time
from
typing
import
(
AsyncGenerator
,
AsyncIterator
,
Callable
,
Dict
,
List
,
Optional
,
Tuple
)
from
fastapi
import
Request
from
typing
import
(
AsyncGenerator
,
AsyncIterator
,
Callable
,
List
,
Optional
,
Dict
,
Tuple
)
from
vllm.logger
import
init_logger
from
vllm.utils
import
random_uuid
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.entrypoints.openai.protocol
import
(
CompletionRequest
,
CompletionResponse
,
CompletionResponseChoice
,
CompletionResponseStreamChoice
,
CompletionStreamResponse
,
LogProbs
,
UsageInfo
,
)
from
vllm.outputs
import
RequestOutput
from
vllm.entrypoints.openai.serving_engine
import
OpenAIServing
,
LoRA
from
vllm.entrypoints.openai.protocol
import
(
CompletionRequest
,
CompletionResponse
,
CompletionResponseChoice
,
CompletionResponseStreamChoice
,
CompletionStreamResponse
,
LogProbs
,
UsageInfo
)
from
vllm.entrypoints.openai.serving_engine
import
LoRA
,
OpenAIServing
from
vllm.logger
import
init_logger
from
vllm.model_executor.guided_decoding
import
(
get_guided_decoding_logits_processor
)
from
vllm.outputs
import
RequestOutput
from
vllm.utils
import
random_uuid
logger
=
init_logger
(
__name__
)
...
...
Prev
1
2
3
4
5
6
7
8
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment