Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
7e499b5c
Unverified
Commit
7e499b5c
authored
Dec 02, 2025
by
Yan Ru Pei
Committed by
GitHub
Dec 02, 2025
Browse files
test: bring back the framework 1 gpu pre-merge tests + clean up pytest markers (#4698)
Signed-off-by:
PeaBrane
<
yanrpei@gmail.com
>
parent
3cad926e
Changes
18
Show whitespace changes
Inline
Side-by-side
Showing
18 changed files
with
78 additions
and
179 deletions
+78
-179
.github/workflows/container-validation-backends.yml
.github/workflows/container-validation-backends.yml
+9
-36
.github/workflows/container-validation-dynamo.yml
.github/workflows/container-validation-dynamo.yml
+2
-2
components/src/dynamo/common/utils/prometheus.py
components/src/dynamo/common/utils/prometheus.py
+1
-1
components/src/dynamo/sglang/tests/test_sglang_prometheus_utils.py
...s/src/dynamo/sglang/tests/test_sglang_prometheus_utils.py
+2
-1
components/src/dynamo/trtllm/tests/test_trtllm_autodeploy.py
components/src/dynamo/trtllm/tests/test_trtllm_autodeploy.py
+1
-0
components/src/dynamo/trtllm/tests/test_trtllm_prometheus_utils.py
...s/src/dynamo/trtllm/tests/test_trtllm_prometheus_utils.py
+1
-0
components/src/dynamo/trtllm/tests/test_trtllm_unit.py
components/src/dynamo/trtllm/tests/test_trtllm_unit.py
+1
-0
components/src/dynamo/vllm/tests/test_vllm_prometheus_utils.py
...nents/src/dynamo/vllm/tests/test_vllm_prometheus_utils.py
+2
-1
tests/README.md
tests/README.md
+2
-1
tests/fault_tolerance/cancellation/test_trtllm.py
tests/fault_tolerance/cancellation/test_trtllm.py
+7
-16
tests/fault_tolerance/migration/test_vllm.py
tests/fault_tolerance/migration/test_vllm.py
+8
-20
tests/frontend/test_completion_mocker_engine.py
tests/frontend/test_completion_mocker_engine.py
+7
-16
tests/frontend/test_vllm.py
tests/frontend/test_vllm.py
+7
-16
tests/router/test_router_e2e_with_mockers.py
tests/router/test_router_e2e_with_mockers.py
+6
-49
tests/router/test_router_e2e_with_vllm.py
tests/router/test_router_e2e_with_vllm.py
+7
-9
tests/serve/test_sglang.py
tests/serve/test_sglang.py
+8
-4
tests/serve/test_trtllm.py
tests/serve/test_trtllm.py
+3
-3
tests/serve/test_vllm.py
tests/serve/test_vllm.py
+4
-4
No files found.
.github/workflows/container-validation-backends.yml
View file @
7e499b5c
...
@@ -179,23 +179,14 @@ jobs:
...
@@ -179,23 +179,14 @@ jobs:
azure_acr_user
:
${{ secrets.AZURE_ACR_USER }}
azure_acr_user
:
${{ secrets.AZURE_ACR_USER }}
azure_acr_password
:
${{ secrets.AZURE_ACR_PASSWORD }}
azure_acr_password
:
${{ secrets.AZURE_ACR_PASSWORD }}
-
name
:
Run
unit
tests
-
name
:
Run tests
if
:
${{ matrix.platform.arch != 'arm64' }}
if
:
${{ matrix.platform.arch != 'arm64' }}
uses
:
./.github/actions/pytest
uses
:
./.github/actions/pytest
with
:
with
:
image_tag
:
${{ steps.build-image.outputs.image_tag }}
image_tag
:
${{ steps.build-image.outputs.image_tag }}
pytest_marks
:
"
unit
and
vllm
and
gpu_1
"
pytest_marks
:
"
pre_merge
and
vllm
"
framework
:
"
vllm"
framework
:
"
vllm"
test_type
:
"
unit"
test_type
:
"
pre_merge"
platform_arch
:
${{ matrix.platform.arch }}
-
name
:
Run e2e tests
if
:
${{ matrix.platform.arch != 'arm64' }}
uses
:
./.github/actions/pytest
with
:
image_tag
:
${{ steps.build-image.outputs.image_tag }}
pytest_marks
:
"
e2e
and
vllm
and
gpu_1
and
not
slow"
framework
:
"
vllm"
test_type
:
"
e2e,
gpu_1"
platform_arch
:
${{ matrix.platform.arch }}
platform_arch
:
${{ matrix.platform.arch }}
sglang
:
sglang
:
...
@@ -246,23 +237,14 @@ jobs:
...
@@ -246,23 +237,14 @@ jobs:
azure_acr_user
:
${{ secrets.AZURE_ACR_USER }}
azure_acr_user
:
${{ secrets.AZURE_ACR_USER }}
azure_acr_password
:
${{ secrets.AZURE_ACR_PASSWORD }}
azure_acr_password
:
${{ secrets.AZURE_ACR_PASSWORD }}
-
name
:
Run unit tests
-
name
:
Run tests
if
:
${{ matrix.platform.arch != 'arm64' }}
uses
:
./.github/actions/pytest
with
:
image_tag
:
${{ steps.build-image.outputs.image_tag }}
pytest_marks
:
"
unit
and
sglang
and
gpu_1"
framework
:
"
sglang"
test_type
:
"
unit"
platform_arch
:
${{ matrix.platform.arch }}
-
name
:
Run e2e tests
if
:
${{ matrix.platform.arch != 'arm64' }}
if
:
${{ matrix.platform.arch != 'arm64' }}
uses
:
./.github/actions/pytest
uses
:
./.github/actions/pytest
with
:
with
:
image_tag
:
${{ steps.build-image.outputs.image_tag }}
image_tag
:
${{ steps.build-image.outputs.image_tag }}
pytest_marks
:
"
e2
e
and
sglang
and
gpu_1
"
pytest_marks
:
"
pre_merg
e
and
sglang"
framework
:
"
sglang"
framework
:
"
sglang"
test_type
:
"
e2e,
gpu_1
"
test_type
:
"
pre_merge
"
platform_arch
:
${{ matrix.platform.arch }}
platform_arch
:
${{ matrix.platform.arch }}
trtllm
:
trtllm
:
...
@@ -313,23 +295,14 @@ jobs:
...
@@ -313,23 +295,14 @@ jobs:
azure_acr_user
:
${{ secrets.AZURE_ACR_USER }}
azure_acr_user
:
${{ secrets.AZURE_ACR_USER }}
azure_acr_password
:
${{ secrets.AZURE_ACR_PASSWORD }}
azure_acr_password
:
${{ secrets.AZURE_ACR_PASSWORD }}
-
name
:
Run unit tests
-
name
:
Run tests
if
:
${{ matrix.platform.arch != 'arm64' }}
uses
:
./.github/actions/pytest
with
:
image_tag
:
${{ steps.build-image.outputs.image_tag }}
pytest_marks
:
"
unit
and
trtllm
and
gpu_1"
framework
:
"
trtllm"
test_type
:
"
unit"
platform_arch
:
${{ matrix.platform.arch }}
-
name
:
Run e2e tests
if
:
${{ matrix.platform.arch != 'arm64' }}
if
:
${{ matrix.platform.arch != 'arm64' }}
uses
:
./.github/actions/pytest
uses
:
./.github/actions/pytest
with
:
with
:
image_tag
:
${{ steps.build-image.outputs.image_tag }}
image_tag
:
${{ steps.build-image.outputs.image_tag }}
pytest_marks
:
"
e2
e
and
trtllm
and
gpu_1
and
not
slow
"
pytest_marks
:
"
pre_merg
e
and
trtllm"
framework
:
"
trtllm"
framework
:
"
trtllm"
test_type
:
"
e2e,
gpu_1
"
test_type
:
"
pre_merge
"
platform_arch
:
${{ matrix.platform.arch }}
platform_arch
:
${{ matrix.platform.arch }}
deploy-test-fault-tolerance
:
deploy-test-fault-tolerance
:
...
...
.github/workflows/container-validation-dynamo.yml
View file @
7e499b5c
...
@@ -65,7 +65,7 @@ jobs:
...
@@ -65,7 +65,7 @@ jobs:
docker compose down
docker compose down
-
name
:
Run pytest (parallel tests with xdist)
-
name
:
Run pytest (parallel tests with xdist)
env
:
env
:
PYTEST_MARKS
:
"
pre_merge
and
parallel"
PYTEST_MARKS
:
"
pre_merge
and
parallel
and
not
(vllm
or
sglang
or
trtllm)
"
run
:
|
run
:
|
docker run -w /workspace \
docker run -w /workspace \
--name ${{ env.CONTAINER_ID }}_pytest_parallel \
--name ${{ env.CONTAINER_ID }}_pytest_parallel \
...
@@ -77,7 +77,7 @@ jobs:
...
@@ -77,7 +77,7 @@ jobs:
docker cp ${{ env.CONTAINER_ID }}_pytest_parallel:/workspace/${{ env.PYTEST_PARALLEL_XML_FILE }} . || echo "No parallel test report found"
docker cp ${{ env.CONTAINER_ID }}_pytest_parallel:/workspace/${{ env.PYTEST_PARALLEL_XML_FILE }} . || echo "No parallel test report found"
-
name
:
Run pytest (sequential tests)
-
name
:
Run pytest (sequential tests)
env
:
env
:
PYTEST_MARKS
:
"
(pre_merge
and
not
parallel)
or
mypy"
PYTEST_MARKS
:
"
(
(
pre_merge
and
not
parallel)
or
mypy
)
and
not
(vllm
or
sglang
or
trtllm)
"
run
:
|
run
:
|
docker run -w /workspace \
docker run -w /workspace \
--name ${{ env.CONTAINER_ID }}_pytest \
--name ${{ env.CONTAINER_ID }}_pytest \
...
...
components/src/dynamo/common/utils/prometheus.py
View file @
7e499b5c
...
@@ -55,7 +55,7 @@ def register_engine_metrics_callback(
...
@@ -55,7 +55,7 @@ def register_engine_metrics_callback(
# Include multiple metric prefixes
# Include multiple metric prefixes
register_engine_metrics_callback(
register_engine_metrics_callback(
generate_endpoint, REGISTRY, metric_prefix_filter=["vllm:", "lmcache:"]
generate_endpoint, REGISTRY, metric_prefix_filter
s
=["vllm:", "lmcache:"]
)
)
# With filtering and prefixing for TensorRT-LLM
# With filtering and prefixing for TensorRT-LLM
...
...
components/src/dynamo/sglang/tests/test_sglang_prometheus_utils.py
View file @
7e499b5c
...
@@ -13,6 +13,7 @@ pytestmark = [
...
@@ -13,6 +13,7 @@ pytestmark = [
pytest
.
mark
.
unit
,
pytest
.
mark
.
unit
,
pytest
.
mark
.
sglang
,
pytest
.
mark
.
sglang
,
pytest
.
mark
.
gpu_0
,
pytest
.
mark
.
gpu_0
,
pytest
.
mark
.
pre_merge
,
pytest
.
mark
.
post_merge
,
pytest
.
mark
.
post_merge
,
]
]
...
@@ -58,7 +59,7 @@ sglang:cache_hit_rate{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0075
...
@@ -58,7 +59,7 @@ sglang:cache_hit_rate{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0075
"""Test SGLang use case: filter to sglang: metrics and exclude python_/process_."""
"""Test SGLang use case: filter to sglang: metrics and exclude python_/process_."""
result
=
get_prometheus_expfmt
(
result
=
get_prometheus_expfmt
(
sglang_registry
,
sglang_registry
,
metric_prefix_filter
=
"sglang:"
,
metric_prefix_filter
s
=
[
"sglang:"
]
,
exclude_prefixes
=
[
"python_"
,
"process_"
],
exclude_prefixes
=
[
"python_"
,
"process_"
],
)
)
...
...
components/src/dynamo/trtllm/tests/test_trtllm_autodeploy.py
View file @
7e499b5c
...
@@ -19,6 +19,7 @@ pytestmark = [
...
@@ -19,6 +19,7 @@ pytestmark = [
# `.github/workflows/container-validation-backends.yml` does not make use of
# `.github/workflows/container-validation-backends.yml` does not make use of
# the `gpu_0` marker.
# the `gpu_0` marker.
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
pre_merge
,
]
]
_PYTORCH_LLM_CLS_NAME
=
"dynamo.trtllm.engine.LLM"
_PYTORCH_LLM_CLS_NAME
=
"dynamo.trtllm.engine.LLM"
_AUTODEPLOY_LLM_CLS_NAME
=
"tensorrt_llm._torch.auto_deploy.LLM"
_AUTODEPLOY_LLM_CLS_NAME
=
"tensorrt_llm._torch.auto_deploy.LLM"
...
...
components/src/dynamo/trtllm/tests/test_trtllm_prometheus_utils.py
View file @
7e499b5c
...
@@ -13,6 +13,7 @@ pytestmark = [
...
@@ -13,6 +13,7 @@ pytestmark = [
pytest
.
mark
.
unit
,
pytest
.
mark
.
unit
,
pytest
.
mark
.
trtllm
,
pytest
.
mark
.
trtllm
,
pytest
.
mark
.
gpu_0
,
pytest
.
mark
.
gpu_0
,
pytest
.
mark
.
pre_merge
,
pytest
.
mark
.
post_merge
,
pytest
.
mark
.
post_merge
,
]
]
...
...
components/src/dynamo/trtllm/tests/test_trtllm_unit.py
View file @
7e499b5c
...
@@ -23,6 +23,7 @@ pytestmark = [
...
@@ -23,6 +23,7 @@ pytestmark = [
pytest
.
mark
.
unit
,
pytest
.
mark
.
unit
,
pytest
.
mark
.
trtllm
,
pytest
.
mark
.
trtllm
,
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
pre_merge
,
]
]
...
...
components/src/dynamo/vllm/tests/test_vllm_prometheus_utils.py
View file @
7e499b5c
...
@@ -13,6 +13,7 @@ pytestmark = [
...
@@ -13,6 +13,7 @@ pytestmark = [
pytest
.
mark
.
unit
,
pytest
.
mark
.
unit
,
pytest
.
mark
.
vllm
,
pytest
.
mark
.
vllm
,
pytest
.
mark
.
gpu_0
,
pytest
.
mark
.
gpu_0
,
pytest
.
mark
.
pre_merge
,
pytest
.
mark
.
post_merge
,
pytest
.
mark
.
post_merge
,
]
]
...
@@ -56,7 +57,7 @@ vllm:time_to_first_token_seconds_count{model_name="meta-llama/Llama-3.1-8B"} 165
...
@@ -56,7 +57,7 @@ vllm:time_to_first_token_seconds_count{model_name="meta-llama/Llama-3.1-8B"} 165
"""Test vLLM use case: filter to vllm: metrics and exclude python_/process_."""
"""Test vLLM use case: filter to vllm: metrics and exclude python_/process_."""
result
=
get_prometheus_expfmt
(
result
=
get_prometheus_expfmt
(
vllm_registry
,
vllm_registry
,
metric_prefix_filter
=
"vllm:"
,
metric_prefix_filter
s
=
[
"vllm:"
]
,
exclude_prefixes
=
[
"python_"
,
"process_"
],
exclude_prefixes
=
[
"python_"
,
"process_"
],
)
)
...
...
tests/README.md
View file @
7e499b5c
...
@@ -71,7 +71,8 @@ Markers are required for all tests. They are used for test selection in CI and l
...
@@ -71,7 +71,8 @@ Markers are required for all tests. They are used for test selection in CI and l
| Test Type [required] | unit, integration, e2e, benchmark, stress, multimodal | Nature of the test |
| Test Type [required] | unit, integration, e2e, benchmark, stress, multimodal | Nature of the test |
| Hardware [required] | gpu_0, gpu_1, gpu_2, gpu_4, gpu_8, h100 | Number/type of GPUs required |
| Hardware [required] | gpu_0, gpu_1, gpu_2, gpu_4, gpu_8, h100 | Number/type of GPUs required |
| Component/Framework | vllm, trtllm, sglang, kvbm, planner, router | Backend or component specificity |
| Component/Framework | vllm, trtllm, sglang, kvbm, planner, router | Backend or component specificity |
| Other | slow, skip, xfail | Special handling |
| Execution | parallel | Test can run in parallel with pytest-xdist |
| Other | slow, skip, xfail, mypy, custom_build | Special handling |
### Example
### Example
```
python
```
python
...
...
tests/fault_tolerance/cancellation/test_trtllm.py
View file @
7e499b5c
...
@@ -21,6 +21,13 @@ from tests.utils.payloads import check_health_generate, check_models_api
...
@@ -21,6 +21,13 @@ from tests.utils.payloads import check_health_generate, check_models_api
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
pytestmark
=
[
pytest
.
mark
.
trtllm
,
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
e2e
,
pytest
.
mark
.
model
(
FAULT_TOLERANCE_MODEL_NAME
),
]
class
DynamoWorkerProcess
(
ManagedProcess
):
class
DynamoWorkerProcess
(
ManagedProcess
):
"""Process manager for Dynamo worker with TensorRT-LLM backend"""
"""Process manager for Dynamo worker with TensorRT-LLM backend"""
...
@@ -127,10 +134,6 @@ class DynamoWorkerProcess(ManagedProcess):
...
@@ -127,10 +134,6 @@ class DynamoWorkerProcess(ManagedProcess):
return
False
return
False
@
pytest
.
mark
.
trtllm
@
pytest
.
mark
.
gpu_1
@
pytest
.
mark
.
e2e
@
pytest
.
mark
.
model
(
FAULT_TOLERANCE_MODEL_NAME
)
@
pytest
.
mark
.
nightly
@
pytest
.
mark
.
nightly
def
test_request_cancellation_trtllm_aggregated
(
def
test_request_cancellation_trtllm_aggregated
(
request
,
runtime_services
,
predownload_models
request
,
runtime_services
,
predownload_models
...
@@ -205,10 +208,6 @@ def test_request_cancellation_trtllm_aggregated(
...
@@ -205,10 +208,6 @@ def test_request_cancellation_trtllm_aggregated(
logger
.
info
(
f
"
{
description
}
detected successfully"
)
logger
.
info
(
f
"
{
description
}
detected successfully"
)
@
pytest
.
mark
.
trtllm
@
pytest
.
mark
.
gpu_1
@
pytest
.
mark
.
e2e
@
pytest
.
mark
.
model
(
FAULT_TOLERANCE_MODEL_NAME
)
@
pytest
.
mark
.
nightly
@
pytest
.
mark
.
nightly
def
test_request_cancellation_trtllm_decode_cancel
(
def
test_request_cancellation_trtllm_decode_cancel
(
request
,
runtime_services
,
predownload_models
request
,
runtime_services
,
predownload_models
...
@@ -282,11 +281,7 @@ def test_request_cancellation_trtllm_decode_cancel(
...
@@ -282,11 +281,7 @@ def test_request_cancellation_trtllm_decode_cancel(
)
)
@
pytest
.
mark
.
trtllm
@
pytest
.
mark
.
gpu_1
@
pytest
.
mark
.
e2e
@
pytest
.
mark
.
nightly
@
pytest
.
mark
.
nightly
@
pytest
.
mark
.
model
(
FAULT_TOLERANCE_MODEL_NAME
)
def
test_request_cancellation_trtllm_prefill_cancel
(
def
test_request_cancellation_trtllm_prefill_cancel
(
request
,
runtime_services
,
predownload_models
request
,
runtime_services
,
predownload_models
):
):
...
@@ -369,10 +364,6 @@ def test_request_cancellation_trtllm_prefill_cancel(
...
@@ -369,10 +364,6 @@ def test_request_cancellation_trtllm_prefill_cancel(
)
)
@
pytest
.
mark
.
trtllm
@
pytest
.
mark
.
gpu_1
@
pytest
.
mark
.
e2e
@
pytest
.
mark
.
model
(
FAULT_TOLERANCE_MODEL_NAME
)
@
pytest
.
mark
.
xfail
(
@
pytest
.
mark
.
xfail
(
reason
=
"May fail due to unknown reason with TRT-LLM or backend implementation"
,
reason
=
"May fail due to unknown reason with TRT-LLM or backend implementation"
,
strict
=
False
,
strict
=
False
,
...
...
tests/fault_tolerance/migration/test_vllm.py
View file @
7e499b5c
...
@@ -23,6 +23,14 @@ from .utils import (
...
@@ -23,6 +23,14 @@ from .utils import (
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
pytestmark
=
[
pytest
.
mark
.
vllm
,
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
e2e
,
pytest
.
mark
.
model
(
FAULT_TOLERANCE_MODEL_NAME
),
pytest
.
mark
.
nightly
,
]
class
DynamoWorkerProcess
(
ManagedProcess
):
class
DynamoWorkerProcess
(
ManagedProcess
):
"""Process manager for Dynamo worker with vLLM backend"""
"""Process manager for Dynamo worker with vLLM backend"""
...
@@ -100,11 +108,6 @@ class DynamoWorkerProcess(ManagedProcess):
...
@@ -100,11 +108,6 @@ class DynamoWorkerProcess(ManagedProcess):
return
False
return
False
@
pytest
.
mark
.
vllm
@
pytest
.
mark
.
gpu_1
@
pytest
.
mark
.
e2e
@
pytest
.
mark
.
model
(
FAULT_TOLERANCE_MODEL_NAME
)
@
pytest
.
mark
.
nightly
def
test_request_migration_vllm_worker_failure
(
def
test_request_migration_vllm_worker_failure
(
request
,
runtime_services
,
predownload_models
,
set_ucx_tls_no_mm
request
,
runtime_services
,
predownload_models
,
set_ucx_tls_no_mm
):
):
...
@@ -151,11 +154,6 @@ def test_request_migration_vllm_worker_failure(
...
@@ -151,11 +154,6 @@ def test_request_migration_vllm_worker_failure(
verify_migration_occurred
(
frontend
)
verify_migration_occurred
(
frontend
)
@
pytest
.
mark
.
vllm
@
pytest
.
mark
.
gpu_1
@
pytest
.
mark
.
e2e
@
pytest
.
mark
.
model
(
FAULT_TOLERANCE_MODEL_NAME
)
@
pytest
.
mark
.
nightly
def
test_request_migration_vllm_graceful_shutdown
(
def
test_request_migration_vllm_graceful_shutdown
(
request
,
runtime_services
,
predownload_models
,
set_ucx_tls_no_mm
request
,
runtime_services
,
predownload_models
,
set_ucx_tls_no_mm
):
):
...
@@ -203,11 +201,6 @@ def test_request_migration_vllm_graceful_shutdown(
...
@@ -203,11 +201,6 @@ def test_request_migration_vllm_graceful_shutdown(
verify_migration_occurred
(
frontend
)
verify_migration_occurred
(
frontend
)
@
pytest
.
mark
.
vllm
@
pytest
.
mark
.
gpu_1
@
pytest
.
mark
.
e2e
@
pytest
.
mark
.
model
(
FAULT_TOLERANCE_MODEL_NAME
)
@
pytest
.
mark
.
nightly
def
test_no_request_migration_vllm_worker_failure
(
def
test_no_request_migration_vllm_worker_failure
(
request
,
runtime_services
,
predownload_models
,
set_ucx_tls_no_mm
request
,
runtime_services
,
predownload_models
,
set_ucx_tls_no_mm
):
):
...
@@ -268,11 +261,6 @@ def test_no_request_migration_vllm_worker_failure(
...
@@ -268,11 +261,6 @@ def test_no_request_migration_vllm_worker_failure(
),
f
"Unexpected migration message:
{
e
}
"
),
f
"Unexpected migration message:
{
e
}
"
@
pytest
.
mark
.
vllm
@
pytest
.
mark
.
gpu_1
@
pytest
.
mark
.
e2e
@
pytest
.
mark
.
model
(
FAULT_TOLERANCE_MODEL_NAME
)
@
pytest
.
mark
.
nightly
def
test_no_request_migration_vllm_graceful_shutdown
(
def
test_no_request_migration_vllm_graceful_shutdown
(
request
,
runtime_services
,
predownload_models
,
set_ucx_tls_no_mm
request
,
runtime_services
,
predownload_models
,
set_ucx_tls_no_mm
):
):
...
...
tests/frontend/test_completion_mocker_engine.py
View file @
7e499b5c
...
@@ -22,6 +22,13 @@ logger = logging.getLogger(__name__)
...
@@ -22,6 +22,13 @@ logger = logging.getLogger(__name__)
TEST_MODEL
=
QWEN
TEST_MODEL
=
QWEN
pytestmark
=
[
pytest
.
mark
.
e2e
,
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
post_merge
,
pytest
.
mark
.
model
(
TEST_MODEL
),
]
class
DynamoFrontendProcess
(
ManagedProcess
):
class
DynamoFrontendProcess
(
ManagedProcess
):
"""Process manager for Dynamo frontend"""
"""Process manager for Dynamo frontend"""
...
@@ -145,10 +152,6 @@ def start_services(request, runtime_services):
...
@@ -145,10 +152,6 @@ def start_services(request, runtime_services):
@
pytest
.
mark
.
usefixtures
(
"start_services"
)
@
pytest
.
mark
.
usefixtures
(
"start_services"
)
@
pytest
.
mark
.
e2e
@
pytest
.
mark
.
gpu_1
@
pytest
.
mark
.
post_merge
@
pytest
.
mark
.
model
(
TEST_MODEL
)
def
test_completion_string_prompt
()
->
None
:
def
test_completion_string_prompt
()
->
None
:
payload
:
Dict
[
str
,
Any
]
=
{
payload
:
Dict
[
str
,
Any
]
=
{
"model"
:
TEST_MODEL
,
"model"
:
TEST_MODEL
,
...
@@ -165,10 +168,6 @@ def test_completion_string_prompt() -> None:
...
@@ -165,10 +168,6 @@ def test_completion_string_prompt() -> None:
@
pytest
.
mark
.
usefixtures
(
"start_services"
)
@
pytest
.
mark
.
usefixtures
(
"start_services"
)
@
pytest
.
mark
.
e2e
@
pytest
.
mark
.
gpu_1
@
pytest
.
mark
.
post_merge
@
pytest
.
mark
.
model
(
TEST_MODEL
)
def
test_completion_empty_array_prompt
()
->
None
:
def
test_completion_empty_array_prompt
()
->
None
:
payload
:
Dict
[
str
,
Any
]
=
{
payload
:
Dict
[
str
,
Any
]
=
{
"model"
:
TEST_MODEL
,
"model"
:
TEST_MODEL
,
...
@@ -185,10 +184,6 @@ def test_completion_empty_array_prompt() -> None:
...
@@ -185,10 +184,6 @@ def test_completion_empty_array_prompt() -> None:
@
pytest
.
mark
.
usefixtures
(
"start_services"
)
@
pytest
.
mark
.
usefixtures
(
"start_services"
)
@
pytest
.
mark
.
e2e
@
pytest
.
mark
.
gpu_1
@
pytest
.
mark
.
post_merge
@
pytest
.
mark
.
model
(
TEST_MODEL
)
def
test_completion_single_element_array_prompt
()
->
None
:
def
test_completion_single_element_array_prompt
()
->
None
:
payload
:
Dict
[
str
,
Any
]
=
{
payload
:
Dict
[
str
,
Any
]
=
{
"model"
:
TEST_MODEL
,
"model"
:
TEST_MODEL
,
...
@@ -205,10 +200,6 @@ def test_completion_single_element_array_prompt() -> None:
...
@@ -205,10 +200,6 @@ def test_completion_single_element_array_prompt() -> None:
@
pytest
.
mark
.
usefixtures
(
"start_services"
)
@
pytest
.
mark
.
usefixtures
(
"start_services"
)
@
pytest
.
mark
.
e2e
@
pytest
.
mark
.
gpu_1
@
pytest
.
mark
.
post_merge
@
pytest
.
mark
.
model
(
TEST_MODEL
)
def
test_completion_multi_element_array_prompt
()
->
None
:
def
test_completion_multi_element_array_prompt
()
->
None
:
payload
:
Dict
[
str
,
Any
]
=
{
payload
:
Dict
[
str
,
Any
]
=
{
"model"
:
TEST_MODEL
,
"model"
:
TEST_MODEL
,
...
...
tests/frontend/test_vllm.py
View file @
7e499b5c
...
@@ -22,6 +22,13 @@ logger = logging.getLogger(__name__)
...
@@ -22,6 +22,13 @@ logger = logging.getLogger(__name__)
TEST_MODEL
=
GPT_OSS
TEST_MODEL
=
GPT_OSS
pytestmark
=
[
pytest
.
mark
.
vllm
,
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
e2e
,
pytest
.
mark
.
model
(
TEST_MODEL
),
]
WEATHER_TOOL
=
{
WEATHER_TOOL
=
{
"type"
:
"function"
,
"type"
:
"function"
,
"function"
:
{
"function"
:
{
...
@@ -211,11 +218,7 @@ def _validate_chat_response(response: requests.Response) -> Dict[str, Any]:
...
@@ -211,11 +218,7 @@ def _validate_chat_response(response: requests.Response) -> Dict[str, Any]:
@
pytest
.
mark
.
usefixtures
(
"start_services"
)
@
pytest
.
mark
.
usefixtures
(
"start_services"
)
@
pytest
.
mark
.
vllm
@
pytest
.
mark
.
gpu_1
@
pytest
.
mark
.
e2e
@
pytest
.
mark
.
post_merge
@
pytest
.
mark
.
post_merge
@
pytest
.
mark
.
model
(
TEST_MODEL
)
def
test_reasoning_effort
(
request
,
runtime_services
,
predownload_models
)
->
None
:
def
test_reasoning_effort
(
request
,
runtime_services
,
predownload_models
)
->
None
:
"""High reasoning effort should yield more detailed reasoning than low effort."""
"""High reasoning effort should yield more detailed reasoning than low effort."""
...
@@ -278,11 +281,7 @@ def test_reasoning_effort(request, runtime_services, predownload_models) -> None
...
@@ -278,11 +281,7 @@ def test_reasoning_effort(request, runtime_services, predownload_models) -> None
@
pytest
.
mark
.
usefixtures
(
"start_services"
)
@
pytest
.
mark
.
usefixtures
(
"start_services"
)
@
pytest
.
mark
.
vllm
@
pytest
.
mark
.
gpu_1
@
pytest
.
mark
.
e2e
@
pytest
.
mark
.
post_merge
@
pytest
.
mark
.
post_merge
@
pytest
.
mark
.
model
(
TEST_MODEL
)
def
test_tool_calling
(
request
,
runtime_services
,
predownload_models
)
->
None
:
def
test_tool_calling
(
request
,
runtime_services
,
predownload_models
)
->
None
:
"""Test tool calling functionality with weather and system health tools."""
"""Test tool calling functionality with weather and system health tools."""
...
@@ -321,11 +320,7 @@ def test_tool_calling(request, runtime_services, predownload_models) -> None:
...
@@ -321,11 +320,7 @@ def test_tool_calling(request, runtime_services, predownload_models) -> None:
@
pytest
.
mark
.
usefixtures
(
"start_services"
)
@
pytest
.
mark
.
usefixtures
(
"start_services"
)
@
pytest
.
mark
.
vllm
@
pytest
.
mark
.
gpu_1
@
pytest
.
mark
.
e2e
@
pytest
.
mark
.
nightly
@
pytest
.
mark
.
nightly
@
pytest
.
mark
.
model
(
TEST_MODEL
)
def
test_tool_calling_second_round
(
def
test_tool_calling_second_round
(
request
,
runtime_services
,
predownload_models
request
,
runtime_services
,
predownload_models
)
->
None
:
)
->
None
:
...
@@ -388,11 +383,7 @@ def test_tool_calling_second_round(
...
@@ -388,11 +383,7 @@ def test_tool_calling_second_round(
@
pytest
.
mark
.
usefixtures
(
"start_services"
)
@
pytest
.
mark
.
usefixtures
(
"start_services"
)
@
pytest
.
mark
.
vllm
@
pytest
.
mark
.
gpu_1
@
pytest
.
mark
.
e2e
@
pytest
.
mark
.
nightly
@
pytest
.
mark
.
nightly
@
pytest
.
mark
.
model
(
TEST_MODEL
)
def
test_reasoning
(
request
,
runtime_services
,
predownload_models
)
->
None
:
def
test_reasoning
(
request
,
runtime_services
,
predownload_models
)
->
None
:
"""Test reasoning functionality with a mathematical problem."""
"""Test reasoning functionality with a mathematical problem."""
...
...
tests/router/test_router_e2e_with_mockers.py
View file @
7e499b5c
...
@@ -22,17 +22,17 @@ from tests.router.common import ( # utilities
...
@@ -22,17 +22,17 @@ from tests.router.common import ( # utilities
from
tests.utils.constants
import
ROUTER_MODEL_NAME
from
tests.utils.constants
import
ROUTER_MODEL_NAME
from
tests.utils.managed_process
import
ManagedProcess
from
tests.utils.managed_process
import
ManagedProcess
logger
=
logging
.
getLogger
(
__name__
)
MODEL_NAME
=
ROUTER_MODEL_NAME
pytestmark
=
[
pytestmark
=
[
pytest
.
mark
.
pre_merge
,
pytest
.
mark
.
pre_merge
,
pytest
.
mark
.
gpu_0
,
pytest
.
mark
.
gpu_0
,
pytest
.
mark
.
integration
,
pytest
.
mark
.
integration
,
pytest
.
mark
.
parallel
,
pytest
.
mark
.
model
(
MODEL_NAME
),
]
]
logger
=
logging
.
getLogger
(
__name__
)
MODEL_NAME
=
ROUTER_MODEL_NAME
NUM_MOCKERS
=
2
NUM_MOCKERS
=
2
SPEEDUP_RATIO
=
10.0
SPEEDUP_RATIO
=
10.0
BASE_PORT
=
9100
# Base port for all tests (high port to avoid conflicts)
BASE_PORT
=
9100
# Base port for all tests (high port to avoid conflicts)
...
@@ -287,11 +287,6 @@ class DisaggMockerProcess:
...
@@ -287,11 +287,6 @@ class DisaggMockerProcess:
self
.
_process
.
__exit__
(
exc_type
,
exc_val
,
exc_tb
)
self
.
_process
.
__exit__
(
exc_type
,
exc_val
,
exc_tb
)
@
pytest
.
mark
.
pre_merge
@
pytest
.
mark
.
gpu_0
@
pytest
.
mark
.
integration
@
pytest
.
mark
.
parallel
@
pytest
.
mark
.
model
(
MODEL_NAME
)
def
test_mocker_kv_router
(
request
,
runtime_services_session
,
predownload_tokenizers
):
def
test_mocker_kv_router
(
request
,
runtime_services_session
,
predownload_tokenizers
):
"""
"""
Test KV router with multiple mocker engine instances.
Test KV router with multiple mocker engine instances.
...
@@ -331,11 +326,6 @@ def test_mocker_kv_router(request, runtime_services_session, predownload_tokeniz
...
@@ -331,11 +326,6 @@ def test_mocker_kv_router(request, runtime_services_session, predownload_tokeniz
mockers
.
__exit__
(
None
,
None
,
None
)
mockers
.
__exit__
(
None
,
None
,
None
)
@
pytest
.
mark
.
pre_merge
@
pytest
.
mark
.
gpu_0
@
pytest
.
mark
.
integration
@
pytest
.
mark
.
parallel
@
pytest
.
mark
.
model
(
MODEL_NAME
)
@
pytest
.
mark
.
parametrize
(
"store_backend"
,
[
"etcd"
,
"file"
])
@
pytest
.
mark
.
parametrize
(
"store_backend"
,
[
"etcd"
,
"file"
])
def
test_mocker_two_kv_router
(
def
test_mocker_two_kv_router
(
request
,
request
,
...
@@ -391,11 +381,6 @@ def test_mocker_two_kv_router(
...
@@ -391,11 +381,6 @@ def test_mocker_two_kv_router(
mockers
.
__exit__
(
None
,
None
,
None
)
mockers
.
__exit__
(
None
,
None
,
None
)
@
pytest
.
mark
.
pre_merge
@
pytest
.
mark
.
gpu_0
@
pytest
.
mark
.
integration
@
pytest
.
mark
.
parallel
@
pytest
.
mark
.
model
(
MODEL_NAME
)
@
pytest
.
mark
.
skip
(
reason
=
"Flaky, temporarily disabled"
)
@
pytest
.
mark
.
skip
(
reason
=
"Flaky, temporarily disabled"
)
def
test_mocker_kv_router_overload_503
(
def
test_mocker_kv_router_overload_503
(
request
,
runtime_services_session
,
predownload_tokenizers
request
,
runtime_services_session
,
predownload_tokenizers
...
@@ -434,11 +419,6 @@ def test_mocker_kv_router_overload_503(
...
@@ -434,11 +419,6 @@ def test_mocker_kv_router_overload_503(
mockers
.
__exit__
(
None
,
None
,
None
)
mockers
.
__exit__
(
None
,
None
,
None
)
@
pytest
.
mark
.
pre_merge
@
pytest
.
mark
.
gpu_0
@
pytest
.
mark
.
integration
@
pytest
.
mark
.
parallel
@
pytest
.
mark
.
model
(
MODEL_NAME
)
def
test_kv_push_router_bindings
(
def
test_kv_push_router_bindings
(
request
,
runtime_services_session
,
predownload_tokenizers
request
,
runtime_services_session
,
predownload_tokenizers
):
):
...
@@ -475,11 +455,6 @@ def test_kv_push_router_bindings(
...
@@ -475,11 +455,6 @@ def test_kv_push_router_bindings(
mockers
.
__exit__
(
None
,
None
,
None
)
mockers
.
__exit__
(
None
,
None
,
None
)
@
pytest
.
mark
.
pre_merge
@
pytest
.
mark
.
gpu_0
@
pytest
.
mark
.
integration
@
pytest
.
mark
.
parallel
@
pytest
.
mark
.
model
(
MODEL_NAME
)
@
pytest
.
mark
.
parametrize
(
"store_backend"
,
[
"etcd"
,
"file"
])
@
pytest
.
mark
.
parametrize
(
"store_backend"
,
[
"etcd"
,
"file"
])
def
test_indexers_sync
(
def
test_indexers_sync
(
request
,
request
,
...
@@ -529,11 +504,6 @@ def test_indexers_sync(
...
@@ -529,11 +504,6 @@ def test_indexers_sync(
mockers
.
__exit__
(
None
,
None
,
None
)
mockers
.
__exit__
(
None
,
None
,
None
)
@
pytest
.
mark
.
pre_merge
@
pytest
.
mark
.
gpu_0
@
pytest
.
mark
.
integration
@
pytest
.
mark
.
parallel
@
pytest
.
mark
.
model
(
MODEL_NAME
)
def
test_query_instance_id_returns_worker_and_tokens
(
def
test_query_instance_id_returns_worker_and_tokens
(
request
,
runtime_services_session
,
predownload_tokenizers
request
,
runtime_services_session
,
predownload_tokenizers
):
):
...
@@ -568,11 +538,6 @@ def test_query_instance_id_returns_worker_and_tokens(
...
@@ -568,11 +538,6 @@ def test_query_instance_id_returns_worker_and_tokens(
mockers
.
__exit__
(
None
,
None
,
None
)
mockers
.
__exit__
(
None
,
None
,
None
)
@
pytest
.
mark
.
pre_merge
@
pytest
.
mark
.
gpu_0
@
pytest
.
mark
.
integration
@
pytest
.
mark
.
parallel
@
pytest
.
mark
.
model
(
MODEL_NAME
)
def
test_router_decisions
(
request
,
runtime_services_session
,
predownload_tokenizers
):
def
test_router_decisions
(
request
,
runtime_services_session
,
predownload_tokenizers
):
"""Validate KV cache prefix reuse and dp_rank routing by sending progressive requests with overlapping prefixes."""
"""Validate KV cache prefix reuse and dp_rank routing by sending progressive requests with overlapping prefixes."""
...
@@ -612,9 +577,6 @@ def test_router_decisions(request, runtime_services_session, predownload_tokeniz
...
@@ -612,9 +577,6 @@ def test_router_decisions(request, runtime_services_session, predownload_tokeniz
mockers
.
__exit__
(
None
,
None
,
None
)
mockers
.
__exit__
(
None
,
None
,
None
)
@
pytest
.
mark
.
pre_merge
@
pytest
.
mark
.
parallel
@
pytest
.
mark
.
model
(
MODEL_NAME
)
def
test_router_disagg_decisions
(
def
test_router_disagg_decisions
(
request
,
runtime_services_session
,
predownload_tokenizers
request
,
runtime_services_session
,
predownload_tokenizers
):
):
...
@@ -680,11 +642,6 @@ def test_router_disagg_decisions(
...
@@ -680,11 +642,6 @@ def test_router_disagg_decisions(
prefill_workers
.
__exit__
(
None
,
None
,
None
)
prefill_workers
.
__exit__
(
None
,
None
,
None
)
@
pytest
.
mark
.
pre_merge
@
pytest
.
mark
.
gpu_0
@
pytest
.
mark
.
integration
@
pytest
.
mark
.
parallel
@
pytest
.
mark
.
model
(
MODEL_NAME
)
def
test_busy_threshold_endpoint
(
def
test_busy_threshold_endpoint
(
request
,
runtime_services_session
,
predownload_tokenizers
request
,
runtime_services_session
,
predownload_tokenizers
):
):
...
...
tests/router/test_router_e2e_with_vllm.py
View file @
7e499b5c
...
@@ -18,6 +18,13 @@ from tests.utils.managed_process import ManagedProcess
...
@@ -18,6 +18,13 @@ from tests.utils.managed_process import ManagedProcess
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
MODEL_NAME
=
"TinyLlama/TinyLlama-1.1B-Chat-v1.0"
MODEL_NAME
=
"TinyLlama/TinyLlama-1.1B-Chat-v1.0"
pytestmark
=
[
pytest
.
mark
.
pre_merge
,
pytest
.
mark
.
e2e
,
pytest
.
mark
.
vllm
,
pytest
.
mark
.
model
(
MODEL_NAME
),
]
SPEEDUP_RATIO
=
10.0
SPEEDUP_RATIO
=
10.0
PORTS
=
[
PORTS
=
[
8011
,
8011
,
...
@@ -269,11 +276,8 @@ class VLLMProcess:
...
@@ -269,11 +276,8 @@ class VLLMProcess:
time
.
sleep
(
2
)
time
.
sleep
(
2
)
@
pytest
.
mark
.
e2e
@
pytest
.
mark
.
gpu_1
@
pytest
.
mark
.
gpu_1
@
pytest
.
mark
.
vllm
@
pytest
.
mark
.
skip
(
reason
=
"All vLLM tests disabled for now"
)
@
pytest
.
mark
.
skip
(
reason
=
"All vLLM tests disabled for now"
)
@
pytest
.
mark
.
model
(
MODEL_NAME
)
def
test_vllm_kv_router_basic
(
request
,
runtime_services
,
predownload_tokenizers
):
def
test_vllm_kv_router_basic
(
request
,
runtime_services
,
predownload_tokenizers
):
"""
"""
Quick e2e sanity test for KV router with vLLM engine instances.
Quick e2e sanity test for KV router with vLLM engine instances.
...
@@ -319,11 +323,8 @@ def test_vllm_kv_router_basic(request, runtime_services, predownload_tokenizers)
...
@@ -319,11 +323,8 @@ def test_vllm_kv_router_basic(request, runtime_services, predownload_tokenizers)
vllm_workers
.
__exit__
(
None
,
None
,
None
)
vllm_workers
.
__exit__
(
None
,
None
,
None
)
@
pytest
.
mark
.
e2e
@
pytest
.
mark
.
vllm
@
pytest
.
mark
.
gpu_1
@
pytest
.
mark
.
gpu_1
@
pytest
.
mark
.
skip
(
reason
=
"All vLLM tests disabled for now"
)
@
pytest
.
mark
.
skip
(
reason
=
"All vLLM tests disabled for now"
)
@
pytest
.
mark
.
model
(
MODEL_NAME
)
def
test_router_decisions_vllm_multiple_workers
(
def
test_router_decisions_vllm_multiple_workers
(
request
,
runtime_services
,
predownload_tokenizers
request
,
runtime_services
,
predownload_tokenizers
):
):
...
@@ -371,11 +372,8 @@ def test_router_decisions_vllm_multiple_workers(
...
@@ -371,11 +372,8 @@ def test_router_decisions_vllm_multiple_workers(
vllm_workers
.
__exit__
(
None
,
None
,
None
)
vllm_workers
.
__exit__
(
None
,
None
,
None
)
@
pytest
.
mark
.
e2e
@
pytest
.
mark
.
vllm
@
pytest
.
mark
.
gpu_2
@
pytest
.
mark
.
gpu_2
@
pytest
.
mark
.
skip
(
reason
=
"All vLLM tests disabled for now"
)
@
pytest
.
mark
.
skip
(
reason
=
"All vLLM tests disabled for now"
)
@
pytest
.
mark
.
model
(
MODEL_NAME
)
def
test_router_decisions_vllm_dp
(
request
,
runtime_services
,
predownload_tokenizers
):
def
test_router_decisions_vllm_dp
(
request
,
runtime_services
,
predownload_tokenizers
):
"""Validate KV cache prefix reuse with vLLM by sending progressive requests with overlapping prefixes.
"""Validate KV cache prefix reuse with vLLM by sending progressive requests with overlapping prefixes.
Same flow as test_router_decisions_vllm_multiple_workers; force first request to (worker_id, dp_rank=1).
Same flow as test_router_decisions_vllm_multiple_workers; force first request to (worker_id, dp_rank=1).
...
...
tests/serve/test_sglang.py
View file @
7e499b5c
...
@@ -44,7 +44,7 @@ sglang_configs = {
...
@@ -44,7 +44,7 @@ sglang_configs = {
name
=
"aggregated"
,
name
=
"aggregated"
,
directory
=
sglang_dir
,
directory
=
sglang_dir
,
script_name
=
"agg.sh"
,
script_name
=
"agg.sh"
,
marks
=
[
pytest
.
mark
.
gpu_1
],
marks
=
[
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
pre_merge
],
model
=
"Qwen/Qwen3-0.6B"
,
model
=
"Qwen/Qwen3-0.6B"
,
env
=
{},
env
=
{},
models_port
=
8000
,
models_port
=
8000
,
...
@@ -73,7 +73,11 @@ sglang_configs = {
...
@@ -73,7 +73,11 @@ sglang_configs = {
name
=
"disaggregated_same_gpu"
,
name
=
"disaggregated_same_gpu"
,
directory
=
sglang_dir
,
directory
=
sglang_dir
,
script_name
=
"disagg_same_gpu.sh"
,
script_name
=
"disagg_same_gpu.sh"
,
marks
=
[
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
skip
(
reason
=
"unstable"
)],
marks
=
[
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
pre_merge
,
pytest
.
mark
.
skip
(
reason
=
"unstable"
),
],
model
=
"Qwen/Qwen3-0.6B"
,
model
=
"Qwen/Qwen3-0.6B"
,
env
=
{},
env
=
{},
models_port
=
8000
,
models_port
=
8000
,
...
@@ -116,7 +120,7 @@ sglang_configs = {
...
@@ -116,7 +120,7 @@ sglang_configs = {
name
=
"template_verification"
,
name
=
"template_verification"
,
directory
=
SERVE_TEST_DIR
,
# special directory for test-specific scripts
directory
=
SERVE_TEST_DIR
,
# special directory for test-specific scripts
script_name
=
"template_verifier.sh"
,
script_name
=
"template_verifier.sh"
,
marks
=
[
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
nightly
],
marks
=
[
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
pre_merge
,
pytest
.
mark
.
nightly
],
model
=
"Qwen/Qwen3-0.6B"
,
model
=
"Qwen/Qwen3-0.6B"
,
env
=
{},
env
=
{},
models_port
=
8000
,
models_port
=
8000
,
...
@@ -159,7 +163,7 @@ sglang_configs = {
...
@@ -159,7 +163,7 @@ sglang_configs = {
name
=
"embedding_agg"
,
name
=
"embedding_agg"
,
directory
=
sglang_dir
,
directory
=
sglang_dir
,
script_name
=
"agg_embed.sh"
,
script_name
=
"agg_embed.sh"
,
marks
=
[
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
nightly
],
marks
=
[
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
pre_merge
,
pytest
.
mark
.
nightly
],
model
=
"Qwen/Qwen3-Embedding-4B"
,
model
=
"Qwen/Qwen3-Embedding-4B"
,
delayed_start
=
0
,
delayed_start
=
0
,
timeout
=
180
,
timeout
=
180
,
...
...
tests/serve/test_trtllm.py
View file @
7e499b5c
...
@@ -40,7 +40,7 @@ trtllm_configs = {
...
@@ -40,7 +40,7 @@ trtllm_configs = {
name
=
"aggregated"
,
name
=
"aggregated"
,
directory
=
trtllm_dir
,
directory
=
trtllm_dir
,
script_name
=
"agg_metrics.sh"
,
script_name
=
"agg_metrics.sh"
,
marks
=
[
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
trtllm
],
marks
=
[
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
pre_merge
,
pytest
.
mark
.
trtllm
],
model
=
"Qwen/Qwen3-0.6B"
,
model
=
"Qwen/Qwen3-0.6B"
,
models_port
=
8000
,
models_port
=
8000
,
request_payloads
=
[
request_payloads
=
[
...
@@ -65,7 +65,7 @@ trtllm_configs = {
...
@@ -65,7 +65,7 @@ trtllm_configs = {
name
=
"disaggregated_same_gpu"
,
name
=
"disaggregated_same_gpu"
,
directory
=
trtllm_dir
,
directory
=
trtllm_dir
,
script_name
=
"disagg_same_gpu.sh"
,
script_name
=
"disagg_same_gpu.sh"
,
marks
=
[
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
trtllm
],
marks
=
[
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
pre_merge
,
pytest
.
mark
.
trtllm
],
model
=
"Qwen/Qwen3-0.6B"
,
model
=
"Qwen/Qwen3-0.6B"
,
models_port
=
8000
,
models_port
=
8000
,
request_payloads
=
[
request_payloads
=
[
...
@@ -79,7 +79,7 @@ trtllm_configs = {
...
@@ -79,7 +79,7 @@ trtllm_configs = {
name
=
"aggregated_router"
,
name
=
"aggregated_router"
,
directory
=
trtllm_dir
,
directory
=
trtllm_dir
,
script_name
=
"agg_router.sh"
,
script_name
=
"agg_router.sh"
,
marks
=
[
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
trtllm
,
pytest
.
mark
.
post_merge
],
marks
=
[
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
pre_merge
,
pytest
.
mark
.
trtllm
],
model
=
"Qwen/Qwen3-0.6B"
,
model
=
"Qwen/Qwen3-0.6B"
,
models_port
=
8000
,
models_port
=
8000
,
request_payloads
=
[
request_payloads
=
[
...
...
tests/serve/test_vllm.py
View file @
7e499b5c
...
@@ -43,7 +43,7 @@ vllm_configs = {
...
@@ -43,7 +43,7 @@ vllm_configs = {
name
=
"aggregated"
,
name
=
"aggregated"
,
directory
=
vllm_dir
,
directory
=
vllm_dir
,
script_name
=
"agg.sh"
,
script_name
=
"agg.sh"
,
marks
=
[
pytest
.
mark
.
gpu_1
],
marks
=
[
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
pre_merge
],
model
=
"Qwen/Qwen3-0.6B"
,
model
=
"Qwen/Qwen3-0.6B"
,
request_payloads
=
[
request_payloads
=
[
chat_payload_default
(),
chat_payload_default
(),
...
@@ -55,7 +55,7 @@ vllm_configs = {
...
@@ -55,7 +55,7 @@ vllm_configs = {
name
=
"aggregated_lmcache"
,
name
=
"aggregated_lmcache"
,
directory
=
vllm_dir
,
directory
=
vllm_dir
,
script_name
=
"agg_lmcache.sh"
,
script_name
=
"agg_lmcache.sh"
,
marks
=
[
pytest
.
mark
.
gpu_1
],
marks
=
[
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
pre_merge
],
model
=
"Qwen/Qwen3-0.6B"
,
model
=
"Qwen/Qwen3-0.6B"
,
request_payloads
=
[
request_payloads
=
[
chat_payload_default
(),
chat_payload_default
(),
...
@@ -68,7 +68,7 @@ vllm_configs = {
...
@@ -68,7 +68,7 @@ vllm_configs = {
name
=
"agg-request-plane-tcp"
,
name
=
"agg-request-plane-tcp"
,
directory
=
vllm_dir
,
directory
=
vllm_dir
,
script_name
=
"agg_request_planes.sh"
,
script_name
=
"agg_request_planes.sh"
,
marks
=
[
pytest
.
mark
.
gpu_1
],
marks
=
[
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
pre_merge
],
model
=
"Qwen/Qwen3-0.6B"
,
model
=
"Qwen/Qwen3-0.6B"
,
script_args
=
[
"--tcp"
],
script_args
=
[
"--tcp"
],
request_payloads
=
[
request_payloads
=
[
...
@@ -80,7 +80,7 @@ vllm_configs = {
...
@@ -80,7 +80,7 @@ vllm_configs = {
name
=
"agg-request-plane-http"
,
name
=
"agg-request-plane-http"
,
directory
=
vllm_dir
,
directory
=
vllm_dir
,
script_name
=
"agg_request_planes.sh"
,
script_name
=
"agg_request_planes.sh"
,
marks
=
[
pytest
.
mark
.
gpu_1
],
marks
=
[
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
pre_merge
],
model
=
"Qwen/Qwen3-0.6B"
,
model
=
"Qwen/Qwen3-0.6B"
,
script_args
=
[
"--http"
],
script_args
=
[
"--http"
],
request_payloads
=
[
request_payloads
=
[
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment