Commit 38d80967 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.10.2rc2' into v0.10.2rc2-ori

parents 33650733 880c741b
...@@ -683,6 +683,37 @@ def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData: ...@@ -683,6 +683,37 @@ def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData:
) )
# Keye-VL-1.5
def run_keye_vl1_5(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine arguments and prompts for Keye-VL-1.5.

    Args:
        questions: Questions to ask; one prompt is generated per question.
        modality: Either "image" or "video". Selects the vision placeholder
            token and is the single modality allowed (limit 1) per prompt.

    Returns:
        A ModelRequestData carrying the engine arguments and the prompts.

    Raises:
        ValueError: If ``modality`` is neither "image" nor "video".
    """
    placeholders = {
        "image": "<|image_pad|>",
        "video": "<|video_pad|>",
    }
    # Fail early with a clear message instead of hitting an unbound
    # ``placeholder`` variable while the prompts are being formatted.
    if modality not in placeholders:
        raise ValueError(
            f"Unsupported modality for Keye-VL-1.5: {modality!r} "
            "(expected 'image' or 'video')"
        )
    placeholder = placeholders[modality]

    # The repo id is spelled with an underscore ("1_5"), the same id that
    # load_keye_vl1_5 uses for this model.
    model_name = "Kwai-Keye/Keye-VL-1_5-8B"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        trust_remote_code=True,
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [
        (
            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
            f"{question}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
# Kimi-VL # Kimi-VL
def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData: def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
...@@ -1648,6 +1679,7 @@ model_example_map = { ...@@ -1648,6 +1679,7 @@ model_example_map = {
"interns1": run_interns1, "interns1": run_interns1,
"internvl_chat": run_internvl, "internvl_chat": run_internvl,
"keye_vl": run_keye_vl, "keye_vl": run_keye_vl,
"keye_vl1_5": run_keye_vl1_5,
"kimi_vl": run_kimi_vl, "kimi_vl": run_kimi_vl,
"llama4": run_llama4, "llama4": run_llama4,
"llava": run_llava, "llava": run_llava,
......
...@@ -542,6 +542,43 @@ def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData: ...@@ -542,6 +542,43 @@ def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData:
) )
def load_keye_vl1_5(question: str, image_urls: list[str]) -> ModelRequestData:
    """Prepare a multi-image request for Keye-VL-1.5.

    Args:
        question: The text question placed after the images.
        image_urls: URLs of the images; every URL becomes one image entry in
            the user message and one fetched image in the request data.

    Returns:
        A ModelRequestData with the engine arguments, the chat-templated
        prompt and the fetched images.
    """
    model_name = "Kwai-Keye/Keye-VL-1_5-8B"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        max_num_seqs=5,
        # Allow exactly as many images per prompt as were supplied.
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # A single user turn: all image entries first, then the question text.
    user_content = [
        {"type": "image", "image": image_url} for image_url in image_urls
    ]
    user_content.append({"type": "text", "text": question})
    conversation = [{"role": "user", "content": user_content}]

    # Render the conversation to a prompt string with the model's own
    # chat template.
    chat_processor = AutoProcessor.from_pretrained(
        model_name, trust_remote_code=True
    )
    rendered_prompt = chat_processor.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    fetched_images = [fetch_image(image_url) for image_url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=rendered_prompt,
        image_data=fetched_images,
    )
def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData: def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "moonshotai/Kimi-VL-A3B-Instruct" model_name = "moonshotai/Kimi-VL-A3B-Instruct"
...@@ -1209,6 +1246,7 @@ model_example_map = { ...@@ -1209,6 +1246,7 @@ model_example_map = {
"interns1": load_interns1, "interns1": load_interns1,
"internvl_chat": load_internvl, "internvl_chat": load_internvl,
"keye_vl": load_keye_vl, "keye_vl": load_keye_vl,
"keye_vl1_5": load_keye_vl1_5,
"kimi_vl": load_kimi_vl, "kimi_vl": load_kimi_vl,
"llama4": load_llama4, "llama4": load_llama4,
"llava": load_llava, "llava": load_llava,
......
...@@ -53,7 +53,7 @@ CUDA_VISIBLE_DEVICES=0 vllm serve $MODEL_NAME \ ...@@ -53,7 +53,7 @@ CUDA_VISIBLE_DEVICES=0 vllm serve $MODEL_NAME \
--gpu-memory-utilization 0.8 \ --gpu-memory-utilization 0.8 \
--trust-remote-code \ --trust-remote-code \
--kv-transfer-config \ --kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' & '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' &
# decoding instance, which is the KV consumer # decoding instance, which is the KV consumer
CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \ CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \
...@@ -62,7 +62,7 @@ CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \ ...@@ -62,7 +62,7 @@ CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \
--gpu-memory-utilization 0.8 \ --gpu-memory-utilization 0.8 \
--trust-remote-code \ --trust-remote-code \
--kv-transfer-config \ --kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' & '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' &
# wait until prefill and decode instances are ready # wait until prefill and decode instances are ready
wait_for_server 8100 wait_for_server 8100
......
...@@ -6,6 +6,8 @@ import msgspec ...@@ -6,6 +6,8 @@ import msgspec
import zmq import zmq
from msgspec.msgpack import Decoder from msgspec.msgpack import Decoder
from vllm.v1.core.kv_cache_utils import BlockHash
# #
# Types copied from vllm.distributed.kv_events # Types copied from vllm.distributed.kv_events
...@@ -22,8 +24,8 @@ class KVCacheEvent( ...@@ -22,8 +24,8 @@ class KVCacheEvent(
class BlockStored(KVCacheEvent): class BlockStored(KVCacheEvent):
block_hashes: list[int] block_hashes: list[BlockHash]
parent_block_hash: Optional[int] parent_block_hash: Optional[BlockHash]
token_ids: list[int] token_ids: list[int]
block_size: int block_size: int
lora_id: Optional[int] lora_id: Optional[int]
...@@ -31,7 +33,7 @@ class BlockStored(KVCacheEvent): ...@@ -31,7 +33,7 @@ class BlockStored(KVCacheEvent):
class BlockRemoved(KVCacheEvent): class BlockRemoved(KVCacheEvent):
block_hashes: list[int] block_hashes: list[BlockHash]
medium: Optional[str] medium: Optional[str]
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
# Example usage: # Example usage:
# On the head node machine, start the Ray head node process and run a vLLM server. # On the head node machine, start the Ray head node process and run a vLLM server.
# ./multi-node-serving.sh leader --ray_port=6379 --ray_cluster_size=<SIZE> [<extra ray args>] && \ # ./multi-node-serving.sh leader --ray_port=6379 --ray_cluster_size=<SIZE> [<extra ray args>] && \
# python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2 # vllm serve meta-llama/Meta-Llama-3.1-405B-Instruct --port 8080 --tensor-parallel-size 8 --pipeline_parallel_size 2
# #
# On each worker node, start the Ray worker node process. # On each worker node, start the Ray worker node process.
# ./multi-node-serving.sh worker --ray_address=<HEAD_NODE_IP> --ray_port=6379 [<extra ray args>] # ./multi-node-serving.sh worker --ray_address=<HEAD_NODE_IP> --ray_port=6379 [<extra ray args>]
......
...@@ -266,10 +266,52 @@ def run_audio(model: str) -> None: ...@@ -266,10 +266,52 @@ def run_audio(model: str) -> None:
print("Chat completion output from base64 encoded audio:", result) print("Chat completion output from base64 encoded audio:", result)
def run_multi_audio(model: str) -> None:
    """Ask the server to compare two audio clips sent in a single request.

    Both clips are downloaded, base64-encoded and attached to one user
    message as `input_audio` parts; the reply text is printed.

    Args:
        model: Name of the model to pass to the chat completions call.
    """
    from vllm.assets.audio import AudioAsset

    # Two different audio clips, attached to the same user message.
    audio_parts = []
    for asset_name in ("winning_call", "azacinto_foscolo"):
        encoded_audio = encode_base64_content_from_url(AudioAsset(asset_name).url)
        # OpenAI-compatible schema (`input_audio`)
        audio_parts.append(
            {
                "type": "input_audio",
                "input_audio": {
                    "data": encoded_audio,
                    "format": "wav",
                },
            }
        )

    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": "Are these two audios the same?"},
            *audio_parts,
        ],
    }

    response = client.chat.completions.create(
        messages=[user_message],
        model=model,
        max_completion_tokens=64,
    )

    answer = response.choices[0].message.content
    print("Chat completion output from input audio:", answer)
example_function_map = { example_function_map = {
"text-only": run_text_only, "text-only": run_text_only,
"single-image": run_single_image, "single-image": run_single_image,
"multi-image": run_multi_image, "multi-image": run_multi_image,
"multi-audio": run_multi_audio,
"video": run_video, "video": run_video,
"audio": run_audio, "audio": run_audio,
} }
......
...@@ -10,18 +10,19 @@ import requests ...@@ -10,18 +10,19 @@ import requests
# multimodal data. In this specific case this example will take a geotiff # multimodal data. In this specific case this example will take a geotiff
# image as input, process it using the multimodal data processor, and # image as input, process it using the multimodal data processor, and
# perform inference. # perform inference.
# Reuirements : # Requirements :
# - install plugin at: # - install plugin at:
# https://github.com/christian-pinto/prithvi_io_processor_plugin # https://github.com/christian-pinto/prithvi_io_processor_plugin
# - start vllm in serving mode with the below args # - start vllm in serving mode with the below args
# --model='christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM' # --model='christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM'
# --model-impl terratorch
# --task embed --trust-remote-code # --task embed --trust-remote-code
# --skip-tokenizer-init --enforce-eager # --skip-tokenizer-init --enforce-eager
# --io-processor-plugin prithvi_to_tiff_india # --io-processor-plugin prithvi_to_tiff
def main(): def main():
image_url = "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/India_900498_S2Hand.tif" # noqa: E501 image_url = "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/valencia_example_2024-10-26.tiff" # noqa: E501
server_endpoint = "http://localhost:8000/pooling" server_endpoint = "http://localhost:8000/pooling"
request_payload_url = { request_payload_url = {
...@@ -33,6 +34,7 @@ def main(): ...@@ -33,6 +34,7 @@ def main():
}, },
"priority": 0, "priority": 0,
"model": "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM", "model": "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM",
"softmax": False,
} }
ret = requests.post(server_endpoint, json=request_payload_url) ret = requests.post(server_endpoint, json=request_payload_url)
......
...@@ -402,7 +402,7 @@ ...@@ -402,7 +402,7 @@
}, },
"disableTextWrap": false, "disableTextWrap": false,
"editorMode": "builder", "editorMode": "builder",
"expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
"fullMetaSearch": false, "fullMetaSearch": false,
"includeNullMetadata": false, "includeNullMetadata": false,
"instant": false, "instant": false,
...@@ -418,7 +418,7 @@ ...@@ -418,7 +418,7 @@
}, },
"disableTextWrap": false, "disableTextWrap": false,
"editorMode": "builder", "editorMode": "builder",
"expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
"fullMetaSearch": false, "fullMetaSearch": false,
"hide": false, "hide": false,
"includeNullMetadata": false, "includeNullMetadata": false,
...@@ -435,7 +435,7 @@ ...@@ -435,7 +435,7 @@
}, },
"disableTextWrap": false, "disableTextWrap": false,
"editorMode": "builder", "editorMode": "builder",
"expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
"fullMetaSearch": false, "fullMetaSearch": false,
"hide": false, "hide": false,
"includeNullMetadata": false, "includeNullMetadata": false,
...@@ -452,7 +452,7 @@ ...@@ -452,7 +452,7 @@
}, },
"disableTextWrap": false, "disableTextWrap": false,
"editorMode": "builder", "editorMode": "builder",
"expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
"fullMetaSearch": false, "fullMetaSearch": false,
"hide": false, "hide": false,
"includeNullMetadata": false, "includeNullMetadata": false,
...@@ -468,7 +468,7 @@ ...@@ -468,7 +468,7 @@
"uid": "${DS_PROMETHEUS}" "uid": "${DS_PROMETHEUS}"
}, },
"editorMode": "code", "editorMode": "code",
"expr": "rate(vllm:time_per_output_token_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:time_per_output_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])", "expr": "rate(vllm:inter_token_latency_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:inter_token_latency_seconds_count{model_name=\"$model_name\"}[$__rate_interval])",
"hide": false, "hide": false,
"instant": false, "instant": false,
"legendFormat": "Mean", "legendFormat": "Mean",
...@@ -476,7 +476,7 @@ ...@@ -476,7 +476,7 @@
"refId": "E" "refId": "E"
} }
], ],
"title": "Time Per Output Token Latency", "title": "Inter Token Latency",
"type": "timeseries" "type": "timeseries"
}, },
{ {
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
<|system|> <|system|>
{{ system_message }} {{ system_message }}
{%- if tools %} {%- if tools %}
In addition to plain text responses, you can chose to call one or more of the provided functions. In addition to plain text responses, you can choose to call one or more of the provided functions.
Use the following rule to decide when to call a function: Use the following rule to decide when to call a function:
* if the response can be generated from your internal knowledge (e.g., as in the case of queries like "What is the capital of Poland?"), do so * if the response can be generated from your internal knowledge (e.g., as in the case of queries like "What is the capital of Poland?"), do so
...@@ -19,7 +19,7 @@ If you decide to call functions: ...@@ -19,7 +19,7 @@ If you decide to call functions:
* prefix function calls with functools marker (no closing marker required) * prefix function calls with functools marker (no closing marker required)
* all function calls should be generated in a single JSON list formatted as functools[{"name": [function name], "arguments": [function arguments as JSON]}, ...] * all function calls should be generated in a single JSON list formatted as functools[{"name": [function name], "arguments": [function arguments as JSON]}, ...]
* follow the provided JSON schema. Do not hallucinate arguments or values. Do not blindly copy values from the provided samples * follow the provided JSON schema. Do not hallucinate arguments or values. Do not blindly copy values from the provided samples
* respect the argument type formatting. E.g., if the type if number and format is float, write value 7 as 7.0 * respect the argument type formatting. E.g., if the type is number and format is float, write value 7 as 7.0
* make sure you pick the right functions that match the user intent * make sure you pick the right functions that match the user intent
......
...@@ -228,6 +228,7 @@ fo = "fo" ...@@ -228,6 +228,7 @@ fo = "fo"
ba = "ba" ba = "ba"
[tool.typos.type.py.extend-words] [tool.typos.type.py.extend-words]
ba = "ba"
[tool.typos.type.cpp] [tool.typos.type.cpp]
extend-glob = ["*.cu"] extend-glob = ["*.cu"]
...@@ -344,3 +345,6 @@ extend-ignore-re = [] ...@@ -344,3 +345,6 @@ extend-ignore-re = []
windo = "windo" windo = "windo"
[tool.typos.type.vimscript.extend-words] [tool.typos.type.vimscript.extend-words]
[tool.uv]
no-build-isolation-package = ["torch"]
...@@ -20,12 +20,11 @@ prometheus-fastapi-instrumentator >= 7.0.0 ...@@ -20,12 +20,11 @@ prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer == 0.11.3 lm-format-enforcer == 0.11.3
llguidance >= 0.7.11, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64" llguidance >= 0.7.11, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
outlines_core == 0.2.10 ; platform_machine != "s390x" outlines_core == 0.2.11
outlines == 0.1.11 ; platform_machine == "s390x"
# required for outlines backend disk cache # required for outlines backend disk cache
diskcache == 5.6.3 diskcache == 5.6.3
lark == 1.2.2 lark == 1.2.2
xgrammar == 0.1.21; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" xgrammar == 0.1.23; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
typing_extensions >= 4.10 typing_extensions >= 4.10
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
partial-json-parser # used for parsing partial JSON outputs partial-json-parser # used for parsing partial JSON outputs
......
# Common dependencies
-r common.txt
# Dependencies for Neuron devices
packaging>=24.2
setuptools>=77.0.3,<80.0.0
torch-neuronx >= 2.5.0
neuronx-cc>=2.0.0a0
torchvision # Required for Llama3.2 multimodal image preprocessing
...@@ -8,7 +8,7 @@ numba == 0.61.2; python_version > '3.9' ...@@ -8,7 +8,7 @@ numba == 0.61.2; python_version > '3.9'
boto3 boto3
botocore botocore
datasets datasets
ray>=2.10.0,<2.45.0 ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
peft peft
pytest-asyncio pytest-asyncio
tensorizer==2.10.1 tensorizer==2.10.1
......
...@@ -21,6 +21,7 @@ ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline paralleli ...@@ -21,6 +21,7 @@ ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline paralleli
sentence-transformers # required for embedding tests sentence-transformers # required for embedding tests
soundfile # required for audio tests soundfile # required for audio tests
jiwer # required for audio tests jiwer # required for audio tests
tblib # for pickling test exceptions
timm >=1.0.17 # required for internvl and gemma3n-mm test timm >=1.0.17 # required for internvl and gemma3n-mm test
torch==2.8.0 torch==2.8.0
torchaudio==2.8.0 torchaudio==2.8.0
...@@ -53,5 +54,5 @@ runai-model-streamer==0.11.0 ...@@ -53,5 +54,5 @@ runai-model-streamer==0.11.0
runai-model-streamer-s3==0.11.0 runai-model-streamer-s3==0.11.0
fastsafetensors>=0.1.10 fastsafetensors>=0.1.10
pydantic>=2.10 # 2.9 leads to error on python 3.10 pydantic>=2.10 # 2.9 leads to error on python 3.10
terratorch==1.1rc2 # required for PrithviMAE test
decord==0.6.0 decord==0.6.0
terratorch @ git+https://github.com/IBM/terratorch.git@1.1.rc3 # required for PrithviMAE test
...@@ -137,7 +137,7 @@ contourpy==1.3.0 ...@@ -137,7 +137,7 @@ contourpy==1.3.0
# via matplotlib # via matplotlib
cramjam==2.9.0 cramjam==2.9.0
# via fastparquet # via fastparquet
cupy-cuda12x==13.3.0 cupy-cuda12x==13.6.0
# via ray # via ray
cycler==0.12.1 cycler==0.12.1
# via matplotlib # via matplotlib
...@@ -1032,6 +1032,8 @@ tabledata==1.3.3 ...@@ -1032,6 +1032,8 @@ tabledata==1.3.3
# via pytablewriter # via pytablewriter
tabulate==0.9.0 tabulate==0.9.0
# via sacrebleu # via sacrebleu
tblib==3.1.0
# via -r requirements/test.in
tcolorpy==0.1.6 tcolorpy==0.1.6
# via pytablewriter # via pytablewriter
tenacity==9.0.0 tenacity==9.0.0
...@@ -1042,7 +1044,7 @@ tensorboardx==2.6.4 ...@@ -1042,7 +1044,7 @@ tensorboardx==2.6.4
# via lightning # via lightning
tensorizer==2.10.1 tensorizer==2.10.1
# via -r requirements/test.in # via -r requirements/test.in
terratorch==1.1rc2 terratorch @ git+https://github.com/IBM/terratorch.git@07184fcf91a1324f831ff521dd238d97fe350e3e
# via -r requirements/test.in # via -r requirements/test.in
threadpoolctl==3.5.0 threadpoolctl==3.5.0
# via scikit-learn # via scikit-learn
......
...@@ -10,10 +10,10 @@ wheel ...@@ -10,10 +10,10 @@ wheel
jinja2>=3.1.6 jinja2>=3.1.6
datasets # for benchmark scripts datasets # for benchmark scripts
numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
--extra-index-url=https://download.pytorch.org/whl/xpu nixl==0.3.0 # for PD disaggregation
torch==2.8.0+xpu torch==2.8.0+xpu
torchaudio torchaudio
torchvision torchvision
pytorch-triton-xpu --extra-index-url=https://download.pytorch.org/whl/xpu
--extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
intel-extension-for-pytorch==2.8.10+xpu intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.8.10.post0%2Bxpu-cp312-cp312-linux_x86_64.whl
...@@ -413,8 +413,7 @@ def _no_device() -> bool: ...@@ -413,8 +413,7 @@ def _no_device() -> bool:
def _is_cuda() -> bool: def _is_cuda() -> bool:
has_cuda = torch.version.cuda is not None has_cuda = torch.version.cuda is not None
return (VLLM_TARGET_DEVICE == "cuda" and has_cuda return (VLLM_TARGET_DEVICE == "cuda" and has_cuda and not _is_tpu())
and not (_is_neuron() or _is_tpu()))
def _is_hip() -> bool: def _is_hip() -> bool:
...@@ -422,10 +421,6 @@ def _is_hip() -> bool: ...@@ -422,10 +421,6 @@ def _is_hip() -> bool:
or VLLM_TARGET_DEVICE == "rocm") and torch.version.hip is not None or VLLM_TARGET_DEVICE == "rocm") and torch.version.hip is not None
def _is_neuron() -> bool:
return VLLM_TARGET_DEVICE == "neuron"
def _is_tpu() -> bool: def _is_tpu() -> bool:
return VLLM_TARGET_DEVICE == "tpu" return VLLM_TARGET_DEVICE == "tpu"
...@@ -470,25 +465,6 @@ def get_rocm_version(): ...@@ -470,25 +465,6 @@ def get_rocm_version():
return None return None
def get_neuronxcc_version():
import sysconfig
site_dir = sysconfig.get_paths()["purelib"]
version_file = os.path.join(site_dir, "neuronxcc", "version",
"__init__.py")
# Check if the command was executed successfully
with open(version_file) as fp:
content = fp.read()
# Extract the version using a regular expression
match = re.search(r"__version__ = '(\S+)'", content)
if match:
# Return the version string
return match.group(1)
else:
raise RuntimeError("Could not find Neuron version in the output")
def get_nvcc_cuda_version() -> Version: def get_nvcc_cuda_version() -> Version:
"""Get the CUDA version from nvcc. """Get the CUDA version from nvcc.
...@@ -541,12 +517,6 @@ def get_vllm_version() -> str: ...@@ -541,12 +517,6 @@ def get_vllm_version() -> str:
rocm_version = get_rocm_version() or torch.version.hip rocm_version = get_rocm_version() or torch.version.hip
if rocm_version and rocm_version != MAIN_CUDA_VERSION: if rocm_version and rocm_version != MAIN_CUDA_VERSION:
version += f"{sep}rocm{rocm_version.replace('.', '')[:3]}" version += f"{sep}rocm{rocm_version.replace('.', '')[:3]}"
elif _is_neuron():
# Get the Neuron version
neuron_version = str(get_neuronxcc_version())
if neuron_version != MAIN_CUDA_VERSION:
neuron_version_str = neuron_version.replace(".", "")[:3]
version += f"{sep}neuron{neuron_version_str}"
elif _is_tpu(): elif _is_tpu():
version += f"{sep}tpu" version += f"{sep}tpu"
elif _is_cpu(): elif _is_cpu():
...@@ -591,8 +561,6 @@ def get_requirements() -> list[str]: ...@@ -591,8 +561,6 @@ def get_requirements() -> list[str]:
requirements = modified_requirements requirements = modified_requirements
elif _is_hip(): elif _is_hip():
requirements = _read_requirements("rocm.txt") requirements = _read_requirements("rocm.txt")
elif _is_neuron():
requirements = _read_requirements("neuron.txt")
elif _is_tpu(): elif _is_tpu():
requirements = _read_requirements("tpu.txt") requirements = _read_requirements("tpu.txt")
elif _is_cpu(): elif _is_cpu():
...@@ -601,7 +569,7 @@ def get_requirements() -> list[str]: ...@@ -601,7 +569,7 @@ def get_requirements() -> list[str]:
requirements = _read_requirements("xpu.txt") requirements = _read_requirements("xpu.txt")
else: else:
raise ValueError( raise ValueError(
"Unsupported platform, please use CUDA, ROCm, Neuron, or CPU.") "Unsupported platform, please use CUDA, ROCm, or CPU.")
return requirements return requirements
...@@ -688,13 +656,15 @@ setup( ...@@ -688,13 +656,15 @@ setup(
"bench": ["pandas", "datasets"], "bench": ["pandas", "datasets"],
"tensorizer": ["tensorizer==2.10.1"], "tensorizer": ["tensorizer==2.10.1"],
"fastsafetensors": ["fastsafetensors >= 0.1.10"], "fastsafetensors": ["fastsafetensors >= 0.1.10"],
"runai": "runai": [
["runai-model-streamer >= 0.13.3", "runai-model-streamer-s3", "boto3"], "runai-model-streamer >= 0.14.0", "runai-model-streamer-gcs",
"google-cloud-storage", "runai-model-streamer-s3", "boto3"
],
"audio": ["librosa", "soundfile", "audio": ["librosa", "soundfile",
"mistral_common[audio]"], # Required for audio processing "mistral_common[audio]"], # Required for audio processing
"video": [], # Kept for backwards compatibility "video": [], # Kept for backwards compatibility
# FlashInfer should be updated together with the Dockerfile # FlashInfer should be updated together with the Dockerfile
"flashinfer": ["flashinfer-python==0.2.14.post1"], "flashinfer": ["flashinfer-python==0.3.0"],
# Optional deps for AMD FP4 quantization support # Optional deps for AMD FP4 quantization support
"petit-kernel": ["petit-kernel"], "petit-kernel": ["petit-kernel"],
}, },
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import copyreg
import os import os
import subprocess import subprocess
import sys import sys
...@@ -10,6 +11,30 @@ from pathlib import Path ...@@ -10,6 +11,30 @@ from pathlib import Path
import pytest import pytest
import requests import requests
import urllib3.exceptions
def _pickle_new_connection_error(obj):
    """Reduce a NewConnectionError for pickling (tblib compatibility).

    Returns the ``(callable, args)`` pair expected by ``copyreg``: the
    reconstruction function plus the message with everything up to and
    including the first ``": "`` stripped off.
    """
    raw_message = obj.args[0] if obj.args else ""
    # Drop the leading "<conn>: " part when present; otherwise keep the
    # message untouched.
    message = raw_message.split(': ', 1)[1] if ': ' in raw_message else raw_message
    return _unpickle_new_connection_error, (message, )
def _unpickle_new_connection_error(message):
    """Rebuild a NewConnectionError from its pickled message."""
    error_cls = urllib3.exceptions.NewConnectionError
    # The connection argument is filled with None; only the message is
    # passed through.
    return error_cls(None, message)
# Register the custom pickle/unpickle functions for tblib compatibility
copyreg.pickle(urllib3.exceptions.NewConnectionError,
_pickle_new_connection_error)
def _query_server(prompt: str, max_tokens: int = 5) -> dict: def _query_server(prompt: str, max_tokens: int = 5) -> dict:
...@@ -52,6 +77,7 @@ def api_server(distributed_executor_backend: str): ...@@ -52,6 +77,7 @@ def api_server(distributed_executor_backend: str):
uvicorn_process.terminate() uvicorn_process.terminate()
@pytest.mark.timeout(300)
@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"]) @pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"])
def test_api_server(api_server, distributed_executor_backend: str): def test_api_server(api_server, distributed_executor_backend: str):
""" """
...@@ -98,7 +124,7 @@ def test_api_server(api_server, distributed_executor_backend: str): ...@@ -98,7 +124,7 @@ def test_api_server(api_server, distributed_executor_backend: str):
pool.join() pool.join()
# check cancellation stats # check cancellation stats
# give it some times to update the stats # give it some time to update the stats
time.sleep(1) time.sleep(1)
num_aborted_requests = requests.get( num_aborted_requests = requests.get(
......
...@@ -45,3 +45,34 @@ def test_bench_serve(server): ...@@ -45,3 +45,34 @@ def test_bench_serve(server):
print(result.stderr) print(result.stderr)
assert result.returncode == 0, f"Benchmark failed: {result.stderr}" assert result.returncode == 0, f"Benchmark failed: {result.stderr}"
@pytest.mark.benchmark
def test_bench_serve_chat(server):
    """Run `vllm bench serve` against the chat completions endpoint.

    The benchmark is run as a subprocess with a small random dataset; the
    test passes when the process exits with return code 0.
    """
    # Flag/value pairs in the order they appear on the command line.
    cli_options = {
        "--model": MODEL_NAME,
        "--host": server.host,
        "--port": str(server.port),
        "--dataset-name": "random",
        "--random-input-len": "32",
        "--random-output-len": "4",
        "--num-prompts": "5",
        "--endpoint": "/v1/chat/completions",
        "--endpoint-type": "openai-chat",
    }
    command = ["vllm", "bench", "serve"]
    for flag, value in cli_options.items():
        command.extend((flag, value))

    completed = subprocess.run(command, capture_output=True, text=True)

    # Echo both streams so a failure can be diagnosed from the test log.
    print(completed.stdout)
    print(completed.stderr)

    assert completed.returncode == 0, f"Benchmark failed: {completed.stderr}"
...@@ -61,6 +61,16 @@ backend_configs = { ...@@ -61,6 +61,16 @@ backend_configs = {
"cudagraph_mode": "FULL_AND_PIECEWISE", "cudagraph_mode": "FULL_AND_PIECEWISE",
}, },
specific_gpu_arch=(9, 0)), specific_gpu_arch=(9, 0)),
# FlashAttention MLA on Hopper
"FlashAttentionMLA":
BackendConfig(name="FlashAttentionMLA",
env_vars={
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN_MLA",
},
comp_config={
"cudagraph_mode": "FULL_DECODE_ONLY",
},
specific_gpu_arch=(9, 0)),
# Cutlass MLA on Blackwell # Cutlass MLA on Blackwell
"CutlassMLA": "CutlassMLA":
BackendConfig( BackendConfig(
...@@ -102,7 +112,7 @@ backend_configs = { ...@@ -102,7 +112,7 @@ backend_configs = {
test_params_full_cudagraph = [] test_params_full_cudagraph = []
# deepseek-ai/DeepSeek-V2-Lite with MLA # deepseek-ai/DeepSeek-V2-Lite with MLA
MLA_backends = ["FlashMLA", "CutlassMLA"] MLA_backends = ["FlashMLA", "FlashAttentionMLA", "CutlassMLA"]
for mla_backend in MLA_backends: for mla_backend in MLA_backends:
test_params_full_cudagraph.append( test_params_full_cudagraph.append(
pytest.param( pytest.param(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment