Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
500b93c8
Commit
500b93c8
authored
Jul 25, 2024
by
zhuwenwen
Browse files
Merge tag 'v0.5.3.post1' into v0.5.3.post1-dtk24.04.1
parents
99426767
38c4b7e8
Changes
282
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
183 additions
and
210 deletions
+183
-210
examples/api_client.py
examples/api_client.py
+2
-3
examples/cpu_offload.py
examples/cpu_offload.py
+22
-0
examples/llava_example.py
examples/llava_example.py
+3
-30
examples/logging_configuration.md
examples/logging_configuration.md
+3
-9
examples/offline_inference_tpu.py
examples/offline_inference_tpu.py
+28
-0
examples/openai_vision_api_client.py
examples/openai_vision_api_client.py
+1
-3
examples/paligemma_example.py
examples/paligemma_example.py
+3
-30
examples/phi3v_example.py
examples/phi3v_example.py
+2
-24
examples/production_monitoring/Otel.md
examples/production_monitoring/Otel.md
+3
-3
examples/production_monitoring/README.md
examples/production_monitoring/README.md
+1
-2
examples/run_cluster.sh
examples/run_cluster.sh
+49
-0
requirements-rocm.txt
requirements-rocm.txt
+4
-0
rocm_patch/rocm_bf16.patch
rocm_patch/rocm_bf16.patch
+0
-15
tests/async_engine/test_chat_template.py
tests/async_engine/test_chat_template.py
+7
-32
tests/async_engine/test_openapi_server_ray.py
tests/async_engine/test_openapi_server_ray.py
+11
-11
tests/basic_correctness/test_cpu_offload.py
tests/basic_correctness/test_cpu_offload.py
+13
-0
tests/conftest.py
tests/conftest.py
+18
-38
tests/core/block/test_block_manager_v2.py
tests/core/block/test_block_manager_v2.py
+7
-4
tests/core/block/test_cpu_gpu_block_allocator.py
tests/core/block/test_cpu_gpu_block_allocator.py
+4
-4
tests/core/test_scheduler.py
tests/core/test_scheduler.py
+2
-2
No files found.
examples/api_client.py
View file @
500b93c8
"""Example Python client for vllm.entrypoints.api_server
"""Example Python client for
`
vllm.entrypoints.api_server
`
NOTE: The API server is used only for demonstration and simple performance
benchmarks. It is not intended for production use.
For production use, we recommend vllm.entrypoints.openai.api_server
and the OpenAI client API
For production use, we recommend `vllm serve` and the OpenAI client API.
"""
import
argparse
...
...
examples/cpu_offload.py
0 → 100644
View file @
500b93c8
from
vllm
import
LLM
,
SamplingParams
# Sample prompts.
prompts
=
[
"Hello, my name is"
,
"The president of the United States is"
,
"The capital of France is"
,
"The future of AI is"
,
]
# Create a sampling params object.
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
)
# Create an LLM.
llm
=
LLM
(
model
=
"meta-llama/Llama-2-13b-chat-hf"
,
cpu_offload_gb
=
10
)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
# Print the outputs.
for
output
in
outputs
:
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
examples/llava_example.py
View file @
500b93c8
import
os
import
subprocess
from
PIL
import
Image
from
vllm
import
LLM
# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
# You can use `.buildkite/download-images.sh` to download them
from
vllm.assets.image
import
ImageAsset
def
run_llava
():
...
...
@@ -14,7 +7,7 @@ def run_llava():
prompt
=
"USER: <image>
\n
What is the content of this image?
\n
ASSISTANT:"
image
=
Image
.
open
(
"images/stop_sign.jpg"
)
image
=
Image
Asset
(
"stop_sign"
).
pil_image
outputs
=
llm
.
generate
({
"prompt"
:
prompt
,
...
...
@@ -28,25 +21,5 @@ def run_llava():
print
(
generated_text
)
def
main
():
run_llava
()
if
__name__
==
"__main__"
:
# Download from s3
s3_bucket_path
=
"s3://air-example-data-2/vllm_opensource_llava/"
local_directory
=
"images"
# Make sure the local directory exists or create it
os
.
makedirs
(
local_directory
,
exist_ok
=
True
)
# Use AWS CLI to sync the directory, assume anonymous access
subprocess
.
check_call
([
"aws"
,
"s3"
,
"sync"
,
s3_bucket_path
,
local_directory
,
"--no-sign-request"
,
])
main
()
run_llava
()
examples/logging_configuration.md
View file @
500b93c8
...
...
@@ -95,9 +95,7 @@ to the path of the custom logging configuration JSON file:
```
bash
VLLM_LOGGING_CONFIG_PATH
=
/path/to/logging_config.json
\
python3
-m
vllm.entrypoints.openai.api_server
\
--max-model-len
2048
\
--model
mistralai/Mistral-7B-v0.1
vllm serve mistralai/Mistral-7B-v0.1
--max-model-len
2048
```
...
...
@@ -152,9 +150,7 @@ to the path of the custom logging configuration JSON file:
```
bash
VLLM_LOGGING_CONFIG_PATH
=
/path/to/logging_config.json
\
python3
-m
vllm.entrypoints.openai.api_server
\
--max-model-len
2048
\
--model
mistralai/Mistral-7B-v0.1
vllm serve mistralai/Mistral-7B-v0.1
--max-model-len
2048
```
...
...
@@ -167,9 +163,7 @@ loggers.
```
bash
VLLM_CONFIGURE_LOGGING
=
0
\
python3
-m
vllm.entrypoints.openai.api_server
\
--max-model-len
2048
\
--model
mistralai/Mistral-7B-v0.1
vllm serve mistralai/Mistral-7B-v0.1
--max-model-len
2048
```
...
...
examples/offline_inference_tpu.py
0 → 100644
View file @
500b93c8
from
vllm
import
LLM
,
SamplingParams
prompts
=
[
"A robot may not injure a human being"
,
"It is only with the heart that one can see rightly;"
,
"The greatest glory in living lies not in never falling,"
,
]
answers
=
[
" or, through inaction, allow a human being to come to harm."
,
" what is essential is invisible to the eye."
,
" but in rising every time we fall."
,
]
N
=
1
# Currently, top-p sampling is disabled. `top_p` should be 1.0.
sampling_params
=
SamplingParams
(
temperature
=
0.7
,
top_p
=
1.0
,
n
=
N
,
max_tokens
=
16
)
# Set `enforce_eager=True` to avoid ahead-of-time compilation.
# In real workloads, `enforce_eager` should be `False`.
llm
=
LLM
(
model
=
"google/gemma-2b"
,
enforce_eager
=
True
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
for
output
,
answer
in
zip
(
outputs
,
answers
):
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
assert
generated_text
.
startswith
(
answer
)
examples/openai_vision_api_client.py
View file @
500b93c8
"""An example showing how to use vLLM to serve VLMs.
Launch the vLLM server with the following command:
python -m vllm.entrypoints.openai.api_server
\
--model llava-hf/llava-1.5-7b-hf
\
--chat-template template_llava.jinja
vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
"""
import
base64
...
...
examples/paligemma_example.py
View file @
500b93c8
import
os
import
subprocess
from
PIL
import
Image
from
vllm
import
LLM
# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
# You can use `.buildkite/download-images.sh` to download them
from
vllm.assets.image
import
ImageAsset
def
run_paligemma
():
...
...
@@ -14,7 +7,7 @@ def run_paligemma():
prompt
=
"caption es"
image
=
Image
.
open
(
"images/stop_sign.jpg"
)
image
=
Image
Asset
(
"stop_sign"
).
pil_image
outputs
=
llm
.
generate
({
"prompt"
:
prompt
,
...
...
@@ -28,25 +21,5 @@ def run_paligemma():
print
(
generated_text
)
def
main
():
run_paligemma
()
if
__name__
==
"__main__"
:
# Download from s3
s3_bucket_path
=
"s3://air-example-data-2/vllm_opensource_llava/"
local_directory
=
"images"
# Make sure the local directory exists or create it
os
.
makedirs
(
local_directory
,
exist_ok
=
True
)
# Use AWS CLI to sync the directory, assume anonymous access
subprocess
.
check_call
([
"aws"
,
"s3"
,
"sync"
,
s3_bucket_path
,
local_directory
,
"--no-sign-request"
,
])
main
()
run_paligemma
()
examples/phi3v_example.py
View file @
500b93c8
import
os
import
subprocess
from
PIL
import
Image
from
vllm
import
LLM
,
SamplingParams
# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
# You can use `.buildkite/download-images.sh` to download them
from
vllm.assets.image
import
ImageAsset
def
run_phi3v
():
...
...
@@ -24,7 +17,7 @@ def run_phi3v():
max_num_seqs
=
5
,
)
image
=
Image
.
open
(
"images/
cherry_blossom
.jpg"
)
image
=
Image
Asset
(
"
cherry_blossom
"
).
pil_image
# single-image prompt
prompt
=
"<|user|>
\n
<|image_1|>
\n
What is the season?<|end|>
\n
<|assistant|>
\n
"
# noqa: E501
...
...
@@ -44,19 +37,4 @@ def run_phi3v():
if
__name__
==
"__main__"
:
s3_bucket_path
=
"s3://air-example-data-2/vllm_opensource_llava/"
local_directory
=
"images"
# Make sure the local directory exists or create it
os
.
makedirs
(
local_directory
,
exist_ok
=
True
)
# Use AWS CLI to sync the directory, assume anonymous access
subprocess
.
check_call
([
"aws"
,
"s3"
,
"sync"
,
s3_bucket_path
,
local_directory
,
"--no-sign-request"
,
])
run_phi3v
()
examples/production_monitoring/Otel.md
View file @
500b93c8
...
...
@@ -36,7 +36,7 @@
```
export OTEL_SERVICE_NAME="vllm-server"
export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
python -m vllm.entrypoints.openai.api_server --model="
facebook/opt-125m
"
--otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
vllm serve
facebook/opt-125m --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
```
1.
In a new shell, send requests with trace context from a dummy client
...
...
@@ -62,7 +62,7 @@ By default, `grpc` is used. To set `http/protobuf` as the protocol, configure th
```
export OTEL_EXPORTER_OTLP_TRACES_PROTOCOL=http/protobuf
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://$JAEGER_IP:4318/v1/traces
python -m vllm.entrypoints.openai.api_server --model="
facebook/opt-125m
"
--otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
vllm serve
facebook/opt-125m --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
```
## Instrumentation of FastAPI
...
...
@@ -74,7 +74,7 @@ OpenTelemetry allows automatic instrumentation of FastAPI.
1.
Run vLLM with
`opentelemetry-instrument`
```
opentelemetry-instrument
python -m vllm.entrypoints.openai.api_server --model="
facebook/opt-125m
"
opentelemetry-instrument
vllm serve
facebook/opt-125m
```
1.
Send a request to vLLM and find its trace in Jaeger. It should contain spans from FastAPI.
...
...
examples/production_monitoring/README.md
View file @
500b93c8
...
...
@@ -10,8 +10,7 @@ Install:
Prometheus metric logging is enabled by default in the OpenAI-compatible server. Launch via the entrypoint:
```
bash
python3
-m
vllm.entrypoints.openai.api_server
\
--model
mistralai/Mistral-7B-v0.1
\
vllm serve mistralai/Mistral-7B-v0.1
\
--max-model-len
2048
\
--disable-log-requests
```
...
...
examples/run_cluster.sh
0 → 100644
View file @
500b93c8
#!/bin/bash
# Check for minimum number of required arguments
if
[
$#
-lt
4
]
;
then
echo
"Usage:
$0
docker_image head_node_address --head|--worker path_to_hf_home [additional_args...]"
exit
1
fi
# Assign the first four arguments and shift them away
DOCKER_IMAGE
=
"
$1
"
HEAD_NODE_ADDRESS
=
"
$2
"
NODE_TYPE
=
"
$3
"
# Should be --head or --worker
PATH_TO_HF_HOME
=
"
$4
"
shift
4
# Additional arguments are passed directly to the Docker command
ADDITIONAL_ARGS
=
"
$@
"
# Validate node type
if
[
"
${
NODE_TYPE
}
"
!=
"--head"
]
&&
[
"
${
NODE_TYPE
}
"
!=
"--worker"
]
;
then
echo
"Error: Node type must be --head or --worker"
exit
1
fi
# Define a function to cleanup on EXIT signal
cleanup
()
{
docker stop node
docker
rm
node
}
trap
cleanup EXIT
# Command setup for head or worker node
RAY_START_CMD
=
"ray start --block"
if
[
"
${
NODE_TYPE
}
"
==
"--head"
]
;
then
RAY_START_CMD+
=
" --head --port=6379"
else
RAY_START_CMD+
=
" --address=
${
HEAD_NODE_ADDRESS
}
:6379"
fi
# Run the docker command with the user specified parameters and additional arguments
docker run
\
--entrypoint
/bin/bash
\
--network
host
\
--name
node
\
--shm-size
10.24g
\
--gpus
all
\
-v
"
${
PATH_TO_HF_HOME
}
:/root/.cache/huggingface"
\
${
ADDITIONAL_ARGS
}
\
"
${
DOCKER_IMAGE
}
"
-c
"
${
RAY_START_CMD
}
"
requirements-rocm.txt
View file @
500b93c8
...
...
@@ -2,5 +2,9 @@
-r requirements-common.txt
# Dependencies for AMD GPUs
awscli
boto3
botocore
ray >= 2.10.0
peft
pytest-asyncio
rocm_patch/rocm_bf16.patch
deleted
100644 → 0
View file @
99426767
--- amd_hip_bf16.h 2024-02-06 18:28:58.268699142 +0000
+++ amd_hip_bf16.h.new 2024-02-06 18:28:31.988647133 +0000
@@ -90,10 +90,10 @@
#include "math_fwd.h" // ocml device functions
#if defined(__HIPCC_RTC__)
-#define __HOST_DEVICE__ __device__
+#define __HOST_DEVICE__ __device__ static
#else
#include <climits>
-#define __HOST_DEVICE__ __host__ __device__
+#define __HOST_DEVICE__ __host__ __device__ static inline
#endif
// Since we are using unsigned short to represent data in bfloat16, it can be of different sizes on
tests/async_engine/test_chat_template.py
View file @
500b93c8
import
os
import
pathlib
from
dataclasses
import
dataclass
import
pytest
from
vllm.entrypoints.chat_utils
import
load_chat_template
from
vllm.entrypoints.openai.protocol
import
ChatCompletionRequest
from
vllm.entrypoints.openai.serving_chat
import
OpenAIServingChat
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
chatml_jinja_path
=
pathlib
.
Path
(
os
.
path
.
dirname
(
os
.
path
.
abspath
(
...
...
@@ -50,24 +49,9 @@ TEST_MESSAGES = [
]
@
dataclass
class
MockTokenizer
:
chat_template
=
None
@
dataclass
class
MockServingChat
:
tokenizer
:
MockTokenizer
def
test_load_chat_template
():
# Testing chatml template
tokenizer
=
MockTokenizer
()
mock_serving_chat
=
MockServingChat
(
tokenizer
)
OpenAIServingChat
.
_load_chat_template
(
mock_serving_chat
,
chat_template
=
chatml_jinja_path
)
template_content
=
tokenizer
.
chat_template
template_content
=
load_chat_template
(
chat_template
=
chatml_jinja_path
)
# Test assertions
assert
template_content
is
not
None
...
...
@@ -79,24 +63,16 @@ def test_load_chat_template():
def
test_no_load_chat_template_filelike
():
# Testing chatml template
template
=
"../../examples/does_not_exist"
tokenizer
=
MockTokenizer
()
mock_serving_chat
=
MockServingChat
(
tokenizer
)
with
pytest
.
raises
(
ValueError
,
match
=
"looks like a file path"
):
OpenAIServingChat
.
_load_chat_template
(
mock_serving_chat
,
chat_template
=
template
)
load_chat_template
(
chat_template
=
template
)
def
test_no_load_chat_template_literallike
():
# Testing chatml template
template
=
"{{ messages }}"
tokenizer
=
MockTokenizer
()
mock_serving_chat
=
MockServingChat
(
tokenizer
)
OpenAIServingChat
.
_load_chat_template
(
mock_serving_chat
,
chat_template
=
template
)
template_content
=
tokenizer
.
chat_template
template_content
=
load_chat_template
(
chat_template
=
template
)
assert
template_content
==
template
...
...
@@ -108,9 +84,7 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
expected_output
):
# Initialize the tokenizer
tokenizer
=
get_tokenizer
(
tokenizer_name
=
model
)
mock_serving_chat
=
MockServingChat
(
tokenizer
)
OpenAIServingChat
.
_load_chat_template
(
mock_serving_chat
,
chat_template
=
template
)
template_content
=
load_chat_template
(
chat_template
=
template
)
# Create a mock request object using keyword arguments
mock_request
=
ChatCompletionRequest
(
...
...
@@ -122,7 +96,8 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
result
=
tokenizer
.
apply_chat_template
(
conversation
=
mock_request
.
messages
,
tokenize
=
False
,
add_generation_prompt
=
mock_request
.
add_generation_prompt
)
add_generation_prompt
=
mock_request
.
add_generation_prompt
,
chat_template
=
mock_request
.
chat_template
or
template_content
)
# Test assertion
assert
result
==
expected_output
,
(
...
...
tests/async_engine/test_openapi_server_ray.py
View file @
500b93c8
...
...
@@ -9,17 +9,17 @@ MODEL_NAME = "facebook/opt-125m"
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
with
RemoteOpenAIServer
(
[
"--model"
,
MODEL_NAME
,
# use half precision for speed and memory savings in CI environment
"--dtype
"
,
"float16
"
,
"--max-model-len
"
,
"2048"
,
"--enforce-eager"
,
"--engine-use-ray"
]
)
as
remote_server
:
args
=
[
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"float16"
,
"--max-model-len
"
,
"2048
"
,
"--enforce-eager
"
,
"--engine-use-ray"
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
...
...
tests/basic_correctness/test_cpu_offload.py
0 → 100644
View file @
500b93c8
from
vllm.utils
import
is_hip
from
..utils
import
compare_two_settings
def
test_cpu_offload
():
compare_two_settings
(
"meta-llama/Llama-2-7b-hf"
,
[],
[
"--cpu-offload-gb"
,
"4"
])
if
not
is_hip
():
# compressed-tensors quantization is currently not supported in ROCm.
compare_two_settings
(
"nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"
,
[],
[
"--cpu-offload-gb"
,
"1"
])
tests/conftest.py
View file @
500b93c8
...
...
@@ -3,11 +3,7 @@ import gc
import
os
import
sys
from
collections
import
UserList
from
dataclasses
import
dataclass
from
functools
import
cached_property
from
pathlib
import
Path
from
typing
import
(
Any
,
Dict
,
List
,
Literal
,
Optional
,
Tuple
,
TypedDict
,
TypeVar
)
from
typing
import
Any
,
Dict
,
List
,
Optional
,
Tuple
,
TypedDict
,
TypeVar
import
pytest
import
torch
...
...
@@ -18,14 +14,16 @@ from transformers import (AutoModelForCausalLM, AutoModelForVision2Seq,
AutoTokenizer
,
BatchEncoding
)
from
vllm
import
LLM
,
SamplingParams
from
vllm.assets.image
import
ImageAsset
from
vllm.config
import
TokenizerPoolConfig
from
vllm.connections
import
global_http_connection
from
vllm.distributed
import
(
destroy_distributed_environment
,
destroy_model_parallel
)
from
vllm.inputs
import
TextPrompt
from
vllm.logger
import
init_logger
from
vllm.multimodal.utils
import
fetch_image
from
vllm.sequence
import
SampleLogprobs
from
vllm.utils
import
cuda_device_count_stateless
,
is_cpu
from
vllm.utils
import
(
STR_DTYPE_TO_TORCH_DTYPE
,
cuda_device_count_stateless
,
is_cpu
)
logger
=
init_logger
(
__name__
)
...
...
@@ -33,9 +31,6 @@ _TEST_DIR = os.path.dirname(__file__)
_TEST_PROMPTS
=
[
os
.
path
.
join
(
_TEST_DIR
,
"prompts"
,
"example.txt"
)]
_LONG_PROMPTS
=
[
os
.
path
.
join
(
_TEST_DIR
,
"prompts"
,
"summary.txt"
)]
_IMAGE_DIR
=
Path
(
_TEST_DIR
)
/
"images"
"""You can use `.buildkite/download-images.sh` to download the assets."""
def
_read_prompts
(
filename
:
str
)
->
List
[
str
]:
with
open
(
filename
,
"r"
)
as
f
:
...
...
@@ -43,24 +38,9 @@ def _read_prompts(filename: str) -> List[str]:
return
prompts
@
dataclass
(
frozen
=
True
)
class
ImageAsset
:
name
:
Literal
[
"stop_sign"
,
"cherry_blossom"
,
"boardwalk"
]
@
cached_property
def
pil_image
(
self
)
->
Image
.
Image
:
if
self
.
name
==
"boardwalk"
:
return
fetch_image
(
"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
)
return
Image
.
open
(
_IMAGE_DIR
/
f
"
{
self
.
name
}
.jpg"
)
class
_ImageAssetPrompts
(
TypedDict
):
stop_sign
:
str
cherry_blossom
:
str
boardwalk
:
str
if
sys
.
version_info
<
(
3
,
9
):
...
...
@@ -79,7 +59,6 @@ class _ImageAssets(_ImageAssetsBase):
super
().
__init__
([
ImageAsset
(
"stop_sign"
),
ImageAsset
(
"cherry_blossom"
),
ImageAsset
(
"boardwalk"
)
])
def
prompts
(
self
,
prompts
:
_ImageAssetPrompts
)
->
List
[
str
]:
...
...
@@ -89,16 +68,20 @@ class _ImageAssets(_ImageAssetsBase):
The order of the returned prompts matches the order of the
assets when iterating through this object.
"""
return
[
prompts
[
"stop_sign"
],
prompts
[
"cherry_blossom"
],
prompts
[
"boardwalk"
]
]
return
[
prompts
[
"stop_sign"
],
prompts
[
"cherry_blossom"
]]
IMAGE_ASSETS
=
_ImageAssets
()
"""Singleton instance of :class:`_ImageAssets`."""
@
pytest
.
fixture
(
autouse
=
True
)
def
init_test_http_connection
():
# pytest_asyncio may use a different event loop per test
# so we need to make sure the async client is created anew
global_http_connection
.
reuse_client
=
False
def
cleanup
():
destroy_model_parallel
()
destroy_distributed_environment
()
...
...
@@ -150,12 +133,6 @@ def image_assets() -> _ImageAssets:
return
IMAGE_ASSETS
_STR_DTYPE_TO_TORCH_DTYPE
=
{
"half"
:
torch
.
half
,
"bfloat16"
:
torch
.
bfloat16
,
"float"
:
torch
.
float
,
}
_T
=
TypeVar
(
"_T"
,
nn
.
Module
,
torch
.
Tensor
,
BatchEncoding
)
...
...
@@ -177,8 +154,7 @@ class HfRunner:
is_vision_model
:
bool
=
False
,
is_sparseml_model
:
bool
=
False
,
)
->
None
:
assert
dtype
in
_STR_DTYPE_TO_TORCH_DTYPE
torch_dtype
=
_STR_DTYPE_TO_TORCH_DTYPE
[
dtype
]
torch_dtype
=
STR_DTYPE_TO_TORCH_DTYPE
[
dtype
]
self
.
model_name
=
model_name
...
...
@@ -590,6 +566,10 @@ def get_tokenizer_pool_config(tokenizer_group_type):
return
TokenizerPoolConfig
(
pool_size
=
1
,
pool_type
=
"ray"
,
extra_config
=
{})
if
isinstance
(
tokenizer_group_type
,
type
):
return
TokenizerPoolConfig
(
pool_size
=
1
,
pool_type
=
tokenizer_group_type
,
extra_config
=
{})
raise
ValueError
(
f
"Unknown tokenizer_group_type:
{
tokenizer_group_type
}
"
)
...
...
tests/core/block/test_block_manager_v2.py
View file @
500b93c8
...
...
@@ -249,10 +249,13 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append,
# Expect consumed blocks to be new blocks required to support the new slots.
expected_consumed_blocks
=
len
(
chunk_list
(
list
(
range
(
prompt_len
+
num_slots_to_append
+
num_lookahead_slots
)),
block_size
))
-
len
(
chunk_list
(
list
(
range
(
prompt_len
)),
block_size
))
list
(
chunk_list
(
list
(
range
(
prompt_len
+
num_slots_to_append
+
num_lookahead_slots
)),
block_size
)))
-
len
(
list
(
chunk_list
(
list
(
range
(
prompt_len
)),
block_size
)))
assert
num_consumed_blocks
==
expected_consumed_blocks
...
...
tests/core/block/test_cpu_gpu_block_allocator.py
View file @
500b93c8
...
...
@@ -58,10 +58,10 @@ def test_allocate_immutable_block(num_cpu_blocks: int, num_gpu_blocks: int,
unique_token_ids
=
list
(
range
((
num_cpu_blocks
+
num_gpu_blocks
)
*
block_size
))
gpu_token_ids
=
chunk_list
(
unique_token_ids
[:
num_gpu_blocks
*
block_size
],
block_size
)
cpu_token_ids
=
chunk_list
(
unique_token_ids
[
num_gpu_blocks
*
block_size
:],
block_size
)
gpu_token_ids
=
list
(
chunk_list
(
unique_token_ids
[:
num_gpu_blocks
*
block_size
],
block_size
)
)
cpu_token_ids
=
list
(
chunk_list
(
unique_token_ids
[
num_gpu_blocks
*
block_size
:],
block_size
)
)
assert
allocator
.
get_num_free_blocks
(
Device
.
CPU
)
==
num_cpu_blocks
assert
allocator
.
get_num_free_blocks
(
Device
.
GPU
)
==
num_gpu_blocks
...
...
tests/core/test_scheduler.py
View file @
500b93c8
...
...
@@ -462,7 +462,7 @@ def test_prefill_schedule_max_lora():
lora_request
=
LoRARequest
(
lora_name
=
str
(
i
),
lora_int_id
=
i
+
1
,
lora_
local_
path
=
"abc"
))
lora_path
=
"abc"
))
waiting
.
append
(
seq_group
)
# Add two more requests to verify lora is prioritized.
# 0: Lora, 1: Lora, 2: regular, 3: regular
...
...
@@ -760,7 +760,7 @@ def test_schedule_swapped_max_loras():
lora_request
=
LoRARequest
(
lora_name
=
str
(
i
),
lora_int_id
=
i
+
1
,
lora_
local_
path
=
"abc"
))
lora_path
=
"abc"
))
scheduler
.
_allocate_and_set_running
(
seq_group
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
scheduler
.
_swap_out
(
seq_group
,
blocks_to_swap_out
)
...
...
Prev
1
2
3
4
5
6
7
8
…
15
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment