Commit fc67613a authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.19.1' into v0.19.0

parents 31aec25b b1388b1f
...@@ -16,5 +16,5 @@ echo "--- :docker: Building Docker image" ...@@ -16,5 +16,5 @@ echo "--- :docker: Building Docker image"
docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu . docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel. # Run the image, setting --shm-size=4g for tensor parallel.
docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g "$IMAGE_NAME" \ docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 -e VLLM_CPU_ATTN_SPLIT_KV=0 --shm-size=4g "$IMAGE_NAME" \
timeout "$TIMEOUT_VAL" bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}" timeout "$TIMEOUT_VAL" bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}"
...@@ -10,7 +10,20 @@ steps: ...@@ -10,7 +10,20 @@ steps:
- tests/kernels/test_top_k_per_row.py - tests/kernels/test_top_k_per_row.py
- tests/kernels/test_concat_mla_q.py - tests/kernels/test_concat_mla_q.py
commands: commands:
- pytest -v -s kernels/core kernels/test_top_k_per_row.py kernels/test_concat_mla_q.py - pytest -v -s kernels/core --ignore=kernels/core/test_minimax_reduce_rms.py kernels/test_top_k_per_row.py kernels/test_concat_mla_q.py
# Dedicated step for the MiniMax reduce/RMS kernel test: requests 2 devices on
# h100 with a 15 minute timeout and runs only test_minimax_reduce_rms.py.
- label: Kernels MiniMax Reduce RMS Test (2 GPUs)
timeout_in_minutes: 15
num_devices: 2
device: h100
# Files listed as dependencies of this step.
source_file_dependencies:
- csrc/minimax_reduce_rms_kernel.cu
- csrc/minimax_reduce_rms_kernel.h
- vllm/model_executor/layers/mamba/linear_attn.py
- vllm/model_executor/layers/mamba/lamport_workspace.py
- tests/kernels/core/test_minimax_reduce_rms.py
commands:
- pytest -v -s kernels/core/test_minimax_reduce_rms.py
- label: Kernels Attention Test %N - label: Kernels Attention Test %N
timeout_in_minutes: 35 timeout_in_minutes: 35
......
...@@ -69,3 +69,18 @@ steps: ...@@ -69,3 +69,18 @@ steps:
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock # Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
# Optional, soft-failing step: installs transformers==4.57.5 first and then runs
# the model test suites and offline-inference examples listed below against it.
- label: Transformers Backward Compatibility Models Test
working_dir: "/vllm-workspace/"
optional: true
soft_fail: true
commands:
# Pin the transformers version before any test command runs.
- pip install transformers==4.57.5
- pytest -v -s tests/models/test_initialization.py
- pytest -v -s tests/models/test_transformers.py
- pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/offline_inference/basic/chat.py
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
...@@ -306,6 +306,8 @@ set(VLLM_EXT_SRC ...@@ -306,6 +306,8 @@ set(VLLM_EXT_SRC
"csrc/torch_bindings.cpp") "csrc/torch_bindings.cpp")
if(VLLM_GPU_LANG STREQUAL "CUDA") if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_EXT_SRC "csrc/minimax_reduce_rms_kernel.cu")
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building. # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
......
This diff is collapsed.
/*
* Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <torch/types.h>
namespace vllm {
namespace tensorrt_llm {
template <typename DType>
struct ElemsPerAccess;
template <>
struct ElemsPerAccess<half> {
static constexpr int value = 8;
using vec_type = float4;
};
template <>
struct ElemsPerAccess<nv_bfloat16> {
static constexpr int value = 8;
using vec_type = float4;
};
template <>
struct ElemsPerAccess<float> {
static constexpr int value = 4;
using vec_type = float4;
};
template <typename DType>
static constexpr int kElemsPerAccess = ElemsPerAccess<DType>::value;
struct MiniMaxReduceRMSParams {
int nranks{};
int rank{};
at::ScalarType dtype{at::ScalarType::Undefined};
int size_q{};
int hidden_dim{};
int size_k{};
int hidden_dim_k{};
int stride_q{}; // row stride for q input (elements); when > hidden_dim,
// q is part of a wider qkv tensor
int stride_k{}; // row stride for k input (elements); when > hidden_dim_k,
// k is part of a wider qkv tensor
int stride_q_out{}; // row stride for q output (elements); 0 = contiguous
int stride_k_out{}; // row stride for k output (elements); 0 = contiguous
void** workspace{};
void* allreduce_in{};
void* rms_norm_out{};
void* rms_gamma{};
void* allreduce_in_k{};
void* rms_norm_out_k{};
void* rms_gamma_k{};
float rms_eps{};
cudaStream_t stream{};
};
// Entry point that takes a MiniMaxReduceRMSParams bundle by const reference.
// Only declared in this header; the definition is not visible here.
// NOTE(review): presumably enqueues its work on params.stream - confirm in the
// implementation file.
void minimax_reduce_rms_op(MiniMaxReduceRMSParams const& params);
} // namespace tensorrt_llm
} // namespace vllm
...@@ -392,3 +392,15 @@ int64_t qr_max_size(); ...@@ -392,3 +392,15 @@ int64_t qr_max_size();
void dsv3_fused_a_gemm(torch::Tensor& output, torch::Tensor const& mat_a, void dsv3_fused_a_gemm(torch::Tensor& output, torch::Tensor const& mat_a,
torch::Tensor const& mat_b); torch::Tensor const& mat_b);
#endif #endif
// The two declarations below exist only when USE_ROCM is not defined.
#ifndef USE_ROCM
// Takes an input tensor, a norm weight, a workspace tensor, the caller's rank,
// the rank count and an epsilon; returns a Tensor.
// NOTE(review): presumably a fused all-reduce + RMS norm - the definition is
// not visible here; confirm.
torch::Tensor minimax_allreduce_rms(torch::Tensor const& input,
torch::Tensor const& norm_weight,
torch::Tensor workspace, int64_t const rank,
int64_t const nranks, double const eps);
// Variant taking a single qkv tensor, separate q and k norm weights and the
// q_size / kv_size split sizes; returns a tuple of two Tensors.
// NOTE(review): presumably the normalized q and k - confirm in the definition.
std::tuple<torch::Tensor, torch::Tensor> minimax_allreduce_rms_qk(
torch::Tensor qkv, torch::Tensor const& norm_weight_q,
torch::Tensor const& norm_weight_k, torch::Tensor workspace,
int64_t const q_size, int64_t const kv_size, int64_t const rank,
int64_t const nranks, double const eps);
#endif
\ No newline at end of file
...@@ -668,6 +668,29 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -668,6 +668,29 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"Tensor? b_qzeros, " "Tensor? b_qzeros, "
"SymInt n, SymInt group_size, SymInt sm_count, SymInt sm_version, SymInt " "SymInt n, SymInt group_size, SymInt sm_count, SymInt sm_version, SymInt "
"CUBLAS_M_THRESHOLD, bool has_zp, bool n32k16_reorder) -> Tensor"); "CUBLAS_M_THRESHOLD, bool has_zp, bool n32k16_reorder) -> Tensor");
ops.def(
"minimax_allreduce_rms("
"Tensor input,"
"Tensor norm_weight,"
"Tensor workspace,"
"int rank,"
"int nranks,"
"float eps) -> Tensor");
ops.impl("minimax_allreduce_rms", torch::kCUDA, &minimax_allreduce_rms);
ops.def(
"minimax_allreduce_rms_qk("
"Tensor qkv,"
"Tensor norm_weight_q,"
"Tensor norm_weight_k,"
"Tensor workspace,"
"int q_size,"
"int kv_size,"
"int rank,"
"int nranks,"
"float eps) -> (Tensor, Tensor)");
ops.impl("minimax_allreduce_rms_qk", torch::kCUDA, &minimax_allreduce_rms_qk);
// conditionally compiled so impl in source file // conditionally compiled so impl in source file
#endif #endif
} }
......
...@@ -649,7 +649,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -649,7 +649,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
else \ else \
BITSANDBYTES_VERSION="${BITSANDBYTES_VERSION_X86}"; \ BITSANDBYTES_VERSION="${BITSANDBYTES_VERSION_X86}"; \
fi; \ fi; \
uv pip install --system accelerate hf_transfer modelscope \ uv pip install --system accelerate modelscope \
"bitsandbytes>=${BITSANDBYTES_VERSION}" "timm${TIMM_VERSION}" "runai-model-streamer[s3,gcs,azure]${RUNAI_MODEL_STREAMER_VERSION}" "bitsandbytes>=${BITSANDBYTES_VERSION}" "timm${TIMM_VERSION}" "runai-model-streamer[s3,gcs,azure]${RUNAI_MODEL_STREAMER_VERSION}"
# ============================================================ # ============================================================
...@@ -772,9 +772,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -772,9 +772,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -e tests/vllm_test_utils uv pip install --system -e tests/vllm_test_utils
# enable fast downloads from hf (for testing) # enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \ ENV HF_XET_HIGH_PERFORMANCE 1
uv pip install --system hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER 1 # increase timeout for hf downloads (for testing)
ENV HF_HUB_DOWNLOAD_TIMEOUT 60
# Copy in the v1 package for testing (it isn't distributed yet) # Copy in the v1 package for testing (it isn't distributed yet)
COPY vllm/v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1 COPY vllm/v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1
......
...@@ -140,9 +140,11 @@ RUN \ ...@@ -140,9 +140,11 @@ RUN \
esac; \ esac; \
}; \ }; \
remove_packages_not_supported_on_aarch64 && \ remove_packages_not_supported_on_aarch64 && \
sed -i 's/^torch==.*/torch==2.10.0/g' requirements/cpu-test.in && \ sed -i 's/^torch==.*/torch==2.11.0/g' requirements/cpu-test.in && \
sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \ sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \ sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
# Related issue: https://github.com/vllm-project/vllm/pull/38800#issuecomment-4228314305
sed -i 's/^sentence-transformers.*/sentence-transformers==5.3.0/g' requirements/cpu-test.in && \
uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
...@@ -195,6 +197,12 @@ ADD ./.buildkite/ ./.buildkite/ ...@@ -195,6 +197,12 @@ ADD ./.buildkite/ ./.buildkite/
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -e tests/vllm_test_utils uv pip install -e tests/vllm_test_utils
# enable fast downloads from hf (for testing)
ENV HF_XET_HIGH_PERFORMANCE 1
# increase timeout for hf downloads (for testing)
ENV HF_HUB_DOWNLOAD_TIMEOUT 60
######################### RELEASE IMAGE ######################### ######################### RELEASE IMAGE #########################
FROM base AS vllm-openai FROM base AS vllm-openai
......
...@@ -269,9 +269,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -269,9 +269,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -e tests/vllm_test_utils uv pip install --system -e tests/vllm_test_utils
# enable fast downloads from hf (for testing) # enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \ ENV HF_XET_HIGH_PERFORMANCE 1
uv pip install --system hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER 1 # increase timeout for hf downloads (for testing)
ENV HF_HUB_DOWNLOAD_TIMEOUT 60
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/nightly_torch_test.txt uv pip install --system -r requirements/nightly_torch_test.txt
......
...@@ -364,9 +364,10 @@ RUN cd /vllm-workspace \ ...@@ -364,9 +364,10 @@ RUN cd /vllm-workspace \
&& python3 -m pip install pytest-shard && python3 -m pip install pytest-shard
# enable fast downloads from hf (for testing) # enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \ ENV HF_XET_HIGH_PERFORMANCE=1
uv pip install --system hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER=1 # increase timeout for hf downloads (for testing)
ENV HF_HUB_DOWNLOAD_TIMEOUT 60
# install audio decode package `torchcodec` from source (required due to # install audio decode package `torchcodec` from source (required due to
# ROCm and torch version mismatch) for tests with datasets package # ROCm and torch version mismatch) for tests with datasets package
......
...@@ -147,7 +147,7 @@ uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/0.15.0/rocm700 ...@@ -147,7 +147,7 @@ uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/0.15.0/rocm700
# Install dependencies # Install dependencies
pip install --upgrade numba \ pip install --upgrade numba \
scipy \ scipy \
huggingface-hub[cli,hf_transfer] \ huggingface-hub[cli] \
setuptools_scm setuptools_scm
pip install -r requirements/rocm.txt pip install -r requirements/rocm.txt
......
{#- format_parameters(properties, required): renders every entry of `properties`
    (iterated in key order via dictsort) whose key is not listed in standard_keys
    as a comma-separated `key:{ ... }` group. Inside each group it emits, when
    present: description, nullable, enum (STRING types), properties/required
    (OBJECT types), items (ARRAY types), and finally the upper-cased type.
    The `required` argument is accepted but is not referenced in this body.
    NOTE(review): the OBJECT and ARRAY branches emit ",properties:" / ",items:"
    with an unconditional leading comma and never set add_comma; for an entry
    with neither description nor nullable this looks like it produces a leading
    comma and no comma before "type:" - confirm this is the intended format. -#}
{%- macro format_parameters(properties, required) -%}
{%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
{%- set ns = namespace(found_first=false) -%}
{%- for key, value in properties | dictsort -%}
{%- set add_comma = false -%}
{%- if key not in standard_keys -%}
{%- if ns.found_first %},{% endif -%}
{%- set ns.found_first = true -%}
{{ key }}:{
{%- if value['description'] -%}
description:<|"|>{{ value['description'] }}<|"|>
{%- set add_comma = true -%}
{%- endif -%}
{%- if value['nullable'] %}
{%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
nullable:true
{%- endif -%}
{%- if value['type'] | upper == 'STRING' -%}
{%- if value['enum'] -%}
{%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
enum:{{ format_argument(value['enum']) }}
{%- endif -%}
{%- elif value['type'] | upper == 'OBJECT' -%}
,properties:{
{%- if value['properties'] is defined and value['properties'] is mapping -%}
{{- format_parameters(value['properties'], value['required'] | default([])) -}}
{%- elif value is mapping -%}
{{- format_parameters(value, value['required'] | default([])) -}}
{%- endif -%}
}
{%- if value['required'] -%}
,required:[
{%- for item in value['required'] | default([]) -%}
<|"|>{{- item -}}<|"|>
{%- if not loop.last %},{% endif -%}
{%- endfor -%}
]
{%- endif -%}
{%- elif value['type'] | upper == 'ARRAY' -%}
{%- if value['items'] is mapping and value['items'] -%}
,items:{
{%- set ns_items = namespace(found_first=false) -%}
{%- for item_key, item_value in value['items'] | dictsort -%}
{%- if item_value is not none -%}
{%- if ns_items.found_first %},{% endif -%}
{%- set ns_items.found_first = true -%}
{%- if item_key == 'properties' -%}
properties:{
{%- if item_value is mapping -%}
{{- format_parameters(item_value, value['items']['required'] | default([])) -}}
{%- endif -%}
}
{%- elif item_key == 'required' -%}
required:[
{%- for req_item in item_value -%}
<|"|>{{- req_item -}}<|"|>
{%- if not loop.last %},{% endif -%}
{%- endfor -%}
]
{%- elif item_key == 'type' -%}
{%- if item_value is string -%}
type:{{ format_argument(item_value | upper) }}
{%- else -%}
type:{{ format_argument(item_value | map('upper') | list) }}
{%- endif -%}
{%- else -%}
{{ item_key }}:{{ format_argument(item_value) }}
{%- endif -%}
{%- endif -%}
{%- endfor -%}
}
{%- endif -%}
{%- endif -%}
{%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
type:<|"|>{{ value['type'] | upper }}<|"|>}
{%- endif -%}
{%- endfor -%}
{%- endmacro -%}
{#- format_function_declaration(tool_data): renders one tool as
    "declaration:NAME" followed by a brace group holding the description, an
    optional ",parameters:" section (properties, required, type) and an optional
    ",response:" section (description, type), then a final closing brace.
    NOTE(review): the closing brace of the parameters section is emitted only
    inside the params['type'] branch, and the closing brace of the response
    section only when the response type is OBJECT - confirm callers always
    supply those values. -#}
{%- macro format_function_declaration(tool_data) -%}
declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
{%- set params = tool_data['function']['parameters'] -%}
{%- if params -%}
,parameters:{
{%- if params['properties'] -%}
properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
{%- endif -%}
{%- if params['required'] -%}
required:[
{%- for item in params['required'] -%}
<|"|>{{- item -}}<|"|>
{{- ',' if not loop.last -}}
{%- endfor -%}
],
{%- endif -%}
{%- if params['type'] -%}
type:<|"|>{{- params['type'] | upper -}}<|"|>}
{%- endif -%}
{%- endif -%}
{%- if 'response' in tool_data['function'] -%}
{%- set response_declaration = tool_data['function']['response'] -%}
,response:{
{%- if response_declaration['description'] -%}
description:<|"|>{{- response_declaration['description'] -}}<|"|>,
{%- endif -%}
{%- if response_declaration['type'] | upper == 'OBJECT' -%}
type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
{%- endif -%}
{%- endif -%}
}
{%- endmacro -%}
{#- format_argument(argument, escape_keys=True): recursively serializes a value.
    Checks are applied in this order:
      string   - wrapped in the <|"|> quote token on both sides
      boolean  - rendered as true / false
      mapping  - brace group of key:value pairs in key order (dictsort); keys are
                 wrapped in the quote token only when escape_keys is truthy
      sequence - bracket group of comma-separated items
      other    - rendered as-is
    escape_keys is passed through unchanged to nested calls. -#}
{%- macro format_argument(argument, escape_keys=True) -%}
{%- if argument is string -%}
{{- '<|"|>' + argument + '<|"|>' -}}
{%- elif argument is boolean -%}
{{- 'true' if argument else 'false' -}}
{%- elif argument is mapping -%}
{{- '{' -}}
{%- set ns = namespace(found_first=false) -%}
{%- for key, value in argument | dictsort -%}
{%- if ns.found_first %},{% endif -%}
{%- set ns.found_first = true -%}
{%- if escape_keys -%}
{{- '<|"|>' + key + '<|"|>' -}}
{%- else -%}
{{- key -}}
{%- endif -%}
:{{- format_argument(value, escape_keys=escape_keys) -}}
{%- endfor -%}
{{- '}' -}}
{%- elif argument is sequence -%}
{{- '[' -}}
{%- for item in argument -%}
{{- format_argument(item, escape_keys=escape_keys) -}}
{%- if not loop.last %},{% endif -%}
{%- endfor -%}
{{- ']' -}}
{%- else -%}
{{- argument -}}
{%- endif -%}
{%- endmacro -%}
{#- strip_thinking(text): splits text on the '<channel|>' marker; for each piece
    that contains '<|channel>' only the part before that marker is kept, other
    pieces are kept whole. The kept parts are concatenated and the result is
    trimmed. -#}
{%- macro strip_thinking(text) -%}
{%- set ns = namespace(result='') -%}
{%- for part in text.split('<channel|>') -%}
{%- if '<|channel>' in part -%}
{%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
{%- else -%}
{%- set ns.result = ns.result + part -%}
{%- endif -%}
{%- endfor -%}
{{- ns.result | trim -}}
{%- endmacro -%}
{#- format_tool_response_block(tool_name, response): emits one tool response
    wrapped between '<|tool_response>' and '<tool_response|>'. A mapping response
    is rendered as "response:NAME" plus a brace group of key:value pairs in key
    order; any other response is rendered as a single "value:" entry. Values go
    through format_argument with escape_keys=False. -#}
{%- macro format_tool_response_block(tool_name, response) -%}
{{- '<|tool_response>' -}}
{%- if response is mapping -%}
{{- 'response:' + tool_name + '{' -}}
{%- for key, value in response | dictsort -%}
{{- key -}}:{{- format_argument(value, escape_keys=False) -}}
{%- if not loop.last %},{% endif -%}
{%- endfor -%}
{{- '}' -}}
{%- else -%}
{{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
{%- endif -%}
{{- '<tool_response|>' -}}
{%- endmacro -%}
{#- Prologue: emits bos_token, then a system turn when enable_thinking is set,
    tools are supplied, or the first message has role system/developer. The
    system turn holds the optional '<|think|>' marker, the first message's
    trimmed content (that message is then dropped from loop_messages) and one
    '<|tool>' ... '<tool|>' declaration per tool.
    NOTE(review): messages[0] is indexed directly - presumably messages is never
    empty; confirm against callers. -#}
{%- set ns = namespace(prev_message_type=None) -%}
{%- set loop_messages = messages -%}
{{ bos_token }}
{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
{{- '<|turn>system\n' -}}
{%- if enable_thinking is defined and enable_thinking -%}
{{- '<|think|>' -}}
{%- set ns.prev_message_type = 'think' -%}
{%- endif -%}
{%- if messages[0]['role'] in ['system', 'developer'] -%}
{{- messages[0]['content'] | trim -}}
{%- set loop_messages = messages[1:] -%}
{%- endif -%}
{%- if tools -%}
{%- for tool in tools %}
{{- '<|tool>' -}}
{{- format_function_declaration(tool) | trim -}}
{{- '<tool|>' -}}
{%- endfor %}
{%- set ns.prev_message_type = 'tool' -%}
{%- endif -%}
{{- '<turn|>\n' -}}
{%- endif %}
{#- Records in ns_turn.last_user_idx the index of the last message in
    loop_messages whose role is 'user'; stays -1 when there is none. -#}
{%- set ns_turn = namespace(last_user_idx=-1) -%}
{%- for i in range(loop_messages | length) -%}
{%- if loop_messages[i]['role'] == 'user' -%}
{%- set ns_turn.last_user_idx = i -%}
{%- endif -%}
{%- endfor -%}
{#- Main loop: renders every message whose role is not 'tool'. Messages with role
    'tool' are not rendered on their own; they are consumed by the preceding
    message that carries tool_calls (see the range scan below). The role
    'assistant' is rendered as 'model'. -#}
{%- for message in loop_messages -%}
{%- if message['role'] != 'tool' -%}
{%- set ns.prev_message_type = None -%}
{%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
{#- OpenAI may emit multiple assistant messages in one tool loop (user → asst → tool → asst → tool).
Only the first of those should open <|turn>model; later ones continue the same model turn. -#}
{%- set prev_nt = namespace(role=None, found=false) -%}
{%- if loop.index0 > 0 -%}
{%- for j in range(loop.index0 - 1, -1, -1) -%}
{%- if not prev_nt.found -%}
{%- if loop_messages[j]['role'] != 'tool' -%}
{%- set prev_nt.role = loop_messages[j]['role'] -%}
{%- set prev_nt.found = true -%}
{%- endif -%}
{%- endif -%}
{%- endfor -%}
{%- endif -%}
{%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
{%- if not continue_same_model_turn -%}
{{- '<|turn>' + role + '\n' }}
{%- endif -%}
{#- Reasoning is emitted as a thought channel only for messages that come after
    the last user message and that also carry tool_calls. -#}
{%- if message.get('reasoning') and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
{{- '<|channel>thought\n' + message['reasoning'] + '\n<channel|>'}}
{%- endif -%}
{#- Each tool call is wrapped between '<|tool_call>' and '<tool_call|>'. Mapping
    arguments are rendered in key order through format_argument; string
    arguments are inserted verbatim. -#}
{%- if message['tool_calls'] -%}
{%- for tool_call in message['tool_calls'] -%}
{%- set function = tool_call['function'] -%}
{{- '<|tool_call>call:' + function['name'] + '{' -}}
{%- if function['arguments'] is mapping -%}
{%- set ns_args = namespace(found_first=false) -%}
{%- for key, value in function['arguments'] | dictsort -%}
{%- if ns_args.found_first %},{% endif -%}
{%- set ns_args.found_first = true -%}
{{- key -}}:{{- format_argument(value, escape_keys=False) -}}
{%- endfor -%}
{%- elif function['arguments'] is string -%}
{{- function['arguments'] -}}
{%- endif -%}
{{- '}<tool_call|>' -}}
{%- endfor -%}
{%- set ns.prev_message_type = 'tool_call' -%}
{%- endif -%}
{#- ns_tr_out.flag records whether at least one tool response was rendered for
    this message; it decides below whether the turn is closed. -#}
{%- set ns_tr_out = namespace(flag=false) -%}
{%- if message.get('tool_responses') -%}
{#- Legacy: tool_responses embedded on the assistant message -#}
{%- for tool_response in message['tool_responses'] -%}
{{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
{%- set ns_tr_out.flag = true -%}
{%- set ns.prev_message_type = 'tool_response' -%}
{%- endfor -%}
{%- elif message.get('tool_calls') -%}
{#- OpenAI Chat Completions: consecutive following messages with role "tool" (no break/continue; range scan) -#}
{%- set ns_tool_scan = namespace(stopped=false) -%}
{%- for k in range(loop.index0 + 1, loop_messages | length) -%}
{%- if ns_tool_scan.stopped -%}
{%- elif loop_messages[k]['role'] != 'tool' -%}
{%- set ns_tool_scan.stopped = true -%}
{%- else -%}
{%- set follow = loop_messages[k] -%}
{#- The response name defaults to the tool message's own name (or 'unknown')
    and is replaced by the function name of the tool call whose id equals the
    tool message's tool_call_id. -#}
{%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}
{%- for tc in message['tool_calls'] -%}
{%- if tc.get('id') == follow.get('tool_call_id') -%}
{%- set ns_tname.name = tc['function']['name'] -%}
{%- endif -%}
{%- endfor -%}
{%- set tool_body = follow.get('content') -%}
{%- if tool_body is string -%}
{{- format_tool_response_block(ns_tname.name, tool_body) -}}
{%- elif tool_body is sequence and tool_body is not string -%}
{%- set ns_txt = namespace(s='') -%}
{%- for part in tool_body -%}
{%- if part.get('type') == 'text' -%}
{%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}
{%- endif -%}
{%- endfor -%}
{{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
{%- else -%}
{{- format_tool_response_block(ns_tname.name, tool_body) -}}
{%- endif -%}
{%- set ns_tr_out.flag = true -%}
{%- set ns.prev_message_type = 'tool_response' -%}
{%- endif -%}
{%- endfor -%}
{%- endif -%}
{#- Message content: text of the model role goes through strip_thinking, text of
    other roles is trimmed; image, audio and video items are replaced by their
    placeholder tokens. -#}
{%- if message['content'] is string -%}
{%- if role == 'model' -%}
{{- strip_thinking(message['content']) -}}
{%- else -%}
{{- message['content'] | trim -}}
{%- endif -%}
{%- elif message['content'] is sequence -%}
{%- for item in message['content'] -%}
{%- if item['type'] == 'text' -%}
{%- if role == 'model' -%}
{{- strip_thinking(item['text']) -}}
{%- else -%}
{{- item['text'] | trim -}}
{%- endif -%}
{%- elif item['type'] == 'image' -%}
{{- '\n\n<|image|>\n\n' -}}
{%- set ns.prev_message_type = 'image' -%}
{%- elif item['type'] == 'audio' -%}
{{- '<|audio|>' -}}
{%- set ns.prev_message_type = 'audio' -%}
{%- elif item['type'] == 'video' -%}
{{- '\n\n<|video|>\n\n' -}}
{%- set ns.prev_message_type = 'video' -%}
{%- endif -%}
{%- endfor -%}
{%- endif -%}
{#- The turn is closed unless tool responses were rendered and the message has
    no content. -#}
{%- if not (ns_tr_out.flag and not message.get('content')) -%}
{{- '<turn|>\n' -}}
{%- endif -%}
{%- endif -%}
{%- endfor -%}
{#- Generation prompt: when add_generation_prompt is set, opens a model turn
    unless the last rendered item was a tool response, and appends an empty
    thought channel when enable_thinking is undefined or falsy. -#}
{%- if add_generation_prompt -%}
{%- if ns.prev_message_type != 'tool_response' -%}
{{- '<|turn>model\n' -}}
{%- endif -%}
{%- if not enable_thinking | default(false) -%}
{{- '<|channel>thought\n<channel|>' -}}
{%- endif -%}
{%- endif -%}
...@@ -7,7 +7,7 @@ requests >= 2.26.0 ...@@ -7,7 +7,7 @@ requests >= 2.26.0
tqdm tqdm
blake3 blake3
py-cpuinfo py-cpuinfo
transformers >= 4.56.0, < 5 transformers >= 4.56.0, != 5.0.*, != 5.1.*, != 5.2.*, != 5.3.*, != 5.4.*, != 5.5.0
tokenizers >= 0.21.1 # Required for fast incremental detokenization. tokenizers >= 0.21.1 # Required for fast incremental detokenization.
protobuf >= 5.29.6, !=6.30.*, !=6.31.*, !=6.32.*, !=6.33.0.*, !=6.33.1.*, !=6.33.2.*, !=6.33.3.*, !=6.33.4.* # Required by LlamaTokenizer, gRPC. CVE-2026-0994 protobuf >= 5.29.6, !=6.30.*, !=6.31.*, !=6.32.*, !=6.33.0.*, !=6.33.1.*, !=6.33.2.*, !=6.33.3.*, !=6.33.4.* # Required by LlamaTokenizer, gRPC. CVE-2026-0994
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint. fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
...@@ -37,7 +37,7 @@ pyyaml ...@@ -37,7 +37,7 @@ pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
setuptools>=77.0.3,<81.0.0; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 setuptools>=77.0.3,<81.0.0; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
einops # Required for Qwen2-VL. einops # Required for Qwen2-VL.
compressed-tensors == 0.14.0.1 # required for compressed-tensors compressed-tensors == 0.15.0.1 # required for compressed-tensors
depyf==0.20.0 # required for profiling and debugging with compilation config depyf==0.20.0 # required for profiling and debugging with compilation config
cloudpickle # allows pickling lambda functions in model_executor/models/registry.py cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
watchfiles # required for http server to monitor the updates of TLS files watchfiles # required for http server to monitor the updates of TLS files
......
lmcache >= 0.3.9 lmcache >= 0.3.9
nixl >= 0.7.1, < 0.10.0 # Required for disaggregated prefill nixl >= 0.7.1, < 0.10.0 # Required for disaggregated prefill
nixl-cu12 >= 0.7.1, < 0.10.0
nixl-cu13 >= 0.7.1, < 0.10.0
mooncake-transfer-engine >= 0.3.8 mooncake-transfer-engine >= 0.3.8
...@@ -29,8 +29,8 @@ opencv-python-headless >= 4.13.0 # required for video test ...@@ -29,8 +29,8 @@ opencv-python-headless >= 4.13.0 # required for video test
datamodel_code_generator # required for minicpm3 test datamodel_code_generator # required for minicpm3 test
lm-eval[api]>=0.4.11 # required for model evaluation test lm-eval[api]>=0.4.11 # required for model evaluation test
mteb[bm25s]>=2, <3 # required for mteb test mteb[bm25s]>=2, <3 # required for mteb test
transformers==4.57.5 transformers==5.5.3
tokenizers==0.22.0 tokenizers==0.22.2
schemathesis>=3.39.15 # Required for openai schema test. schemathesis>=3.39.15 # Required for openai schema test.
# quantization # quantization
bitsandbytes>=0.49.2 bitsandbytes>=0.49.2
......
...@@ -36,8 +36,8 @@ opencv-python-headless>=4.13.0 # required for video test ...@@ -36,8 +36,8 @@ opencv-python-headless>=4.13.0 # required for video test
datamodel_code_generator # required for minicpm3 test datamodel_code_generator # required for minicpm3 test
lm-eval[api]>=0.4.11 # required for model evaluation test lm-eval[api]>=0.4.11 # required for model evaluation test
mteb[bm25s]>=2, <3 # required for mteb test mteb[bm25s]>=2, <3 # required for mteb test
transformers==4.57.5 transformers==5.5.3
tokenizers==0.22.0 tokenizers==0.22.2
schemathesis>=3.39.15 # Required for openai schema test schemathesis>=3.39.15 # Required for openai schema test
# quantization # quantization
bitsandbytes==0.49.2 bitsandbytes==0.49.2
...@@ -80,4 +80,3 @@ plotly # required for perf comparison html report ...@@ -80,4 +80,3 @@ plotly # required for perf comparison html report
rapidfuzz rapidfuzz
torchgeo==0.7.0 torchgeo==0.7.0
multiprocess==0.70.16 multiprocess==0.70.16
huggingface-hub==0.36.2
...@@ -232,7 +232,6 @@ filelock==3.25.2 ...@@ -232,7 +232,6 @@ filelock==3.25.2
# python-discovery # python-discovery
# ray # ray
# torch # torch
# transformers
# virtualenv # virtualenv
fiona==1.10.1 fiona==1.10.1
# via torchgeo # via torchgeo
...@@ -318,7 +317,7 @@ h5py==3.16.0 ...@@ -318,7 +317,7 @@ h5py==3.16.0
# via terratorch # via terratorch
harfile==0.4.0 harfile==0.4.0
# via schemathesis # via schemathesis
hf-xet==1.4.2 hf-xet==1.4.3
# via huggingface-hub # via huggingface-hub
hiredis==3.3.1 hiredis==3.3.1
# via tensorizer # via tensorizer
...@@ -332,11 +331,11 @@ httpx==0.27.2 ...@@ -332,11 +331,11 @@ httpx==0.27.2
# via # via
# -r requirements/rocm-test.in # -r requirements/rocm-test.in
# diffusers # diffusers
# huggingface-hub
# perceptron # perceptron
# schemathesis # schemathesis
huggingface-hub==0.36.2 huggingface-hub==1.10.2
# via # via
# -r requirements/rocm-test.in
# accelerate # accelerate
# datasets # datasets
# diffusers # diffusers
...@@ -970,7 +969,6 @@ requests==2.32.5 ...@@ -970,7 +969,6 @@ requests==2.32.5
# google-api-core # google-api-core
# google-cloud-storage # google-cloud-storage
# gpt-oss # gpt-oss
# huggingface-hub
# lightly # lightly
# lm-eval # lm-eval
# mistral-common # mistral-common
...@@ -983,7 +981,6 @@ requests==2.32.5 ...@@ -983,7 +981,6 @@ requests==2.32.5
# starlette-testclient # starlette-testclient
# tacoreader # tacoreader
# tiktoken # tiktoken
# transformers
# wandb # wandb
resampy==0.4.3 resampy==0.4.3
# via -r requirements/rocm-test.in # via -r requirements/rocm-test.in
...@@ -1191,7 +1188,7 @@ timm==1.0.17 ...@@ -1191,7 +1188,7 @@ timm==1.0.17
# segmentation-models-pytorch # segmentation-models-pytorch
# terratorch # terratorch
# torchgeo # torchgeo
tokenizers==0.22.0 tokenizers==0.22.2
# via # via
# -c requirements/common.txt # -c requirements/common.txt
# -r requirements/rocm-test.in # -r requirements/rocm-test.in
...@@ -1230,7 +1227,7 @@ tqdm==4.67.3 ...@@ -1230,7 +1227,7 @@ tqdm==4.67.3
# tacoreader # tacoreader
# terratorch # terratorch
# transformers # transformers
transformers==4.57.5 transformers==5.5.3
# via # via
# -c requirements/common.txt # -c requirements/common.txt
# -r requirements/rocm-test.in # -r requirements/rocm-test.in
...@@ -1252,7 +1249,9 @@ typepy==1.3.4 ...@@ -1252,7 +1249,9 @@ typepy==1.3.4
typer==0.24.1 typer==0.24.1
# via # via
# fastsafetensors # fastsafetensors
# huggingface-hub
# perceptron # perceptron
# transformers
typeshed-client==2.9.0 typeshed-client==2.9.0
# via jsonargparse # via jsonargparse
typing-extensions==4.15.0 typing-extensions==4.15.0
......
...@@ -18,7 +18,7 @@ httpx ...@@ -18,7 +18,7 @@ httpx
librosa # required for audio tests librosa # required for audio tests
vector_quantize_pytorch # required for minicpmo_26 test vector_quantize_pytorch # required for minicpmo_26 test
vocos # required for minicpmo_26 test vocos # required for minicpmo_26 test
peft>=0.15.0 # required for phi-4-mm test peft>=0.18.1 # required for phi-4-mm test
pqdm pqdm
ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline parallelism tests ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline parallelism tests
resampy # required for audio tests resampy # required for audio tests
...@@ -39,8 +39,8 @@ opencv-python-headless >= 4.13.0 # required for video test ...@@ -39,8 +39,8 @@ opencv-python-headless >= 4.13.0 # required for video test
datamodel_code_generator # required for minicpm3 test datamodel_code_generator # required for minicpm3 test
lm-eval[api]>=0.4.11 # required for model evaluation test lm-eval[api]>=0.4.11 # required for model evaluation test
mteb[bm25s]>=2, <3 # required for mteb test mteb[bm25s]>=2, <3 # required for mteb test
transformers==4.57.5 transformers==5.5.3
tokenizers==0.22.0 tokenizers==0.22.2
schemathesis>=3.39.15 # Required for openai schema test. schemathesis>=3.39.15 # Required for openai schema test.
# quantization # quantization
bitsandbytes==0.49.2 bitsandbytes==0.49.2
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment