Unverified commit 7e499b5c, authored by Yan Ru Pei and committed by GitHub
Browse files

test: bring back the framework 1 gpu pre-merge tests + clean up pytest markers (#4698)


Signed-off-by: PeaBrane <yanrpei@gmail.com>
parent 3cad926e
......@@ -179,23 +179,14 @@ jobs:
azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
- name: Run unit tests
- name: Run tests
if: ${{ matrix.platform.arch != 'arm64' }}
uses: ./.github/actions/pytest
with:
image_tag: ${{ steps.build-image.outputs.image_tag }}
pytest_marks: "unit and vllm and gpu_1"
pytest_marks: "pre_merge and vllm"
framework: "vllm"
test_type: "unit"
platform_arch: ${{ matrix.platform.arch }}
- name: Run e2e tests
if: ${{ matrix.platform.arch != 'arm64' }}
uses: ./.github/actions/pytest
with:
image_tag: ${{ steps.build-image.outputs.image_tag }}
pytest_marks: "e2e and vllm and gpu_1 and not slow"
framework: "vllm"
test_type: "e2e, gpu_1"
test_type: "pre_merge"
platform_arch: ${{ matrix.platform.arch }}
sglang:
......@@ -246,23 +237,14 @@ jobs:
azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
- name: Run unit tests
if: ${{ matrix.platform.arch != 'arm64' }}
uses: ./.github/actions/pytest
with:
image_tag: ${{ steps.build-image.outputs.image_tag }}
pytest_marks: "unit and sglang and gpu_1"
framework: "sglang"
test_type: "unit"
platform_arch: ${{ matrix.platform.arch }}
- name: Run e2e tests
- name: Run tests
if: ${{ matrix.platform.arch != 'arm64' }}
uses: ./.github/actions/pytest
with:
image_tag: ${{ steps.build-image.outputs.image_tag }}
pytest_marks: "e2e and sglang and gpu_1"
pytest_marks: "pre_merge and sglang"
framework: "sglang"
test_type: "e2e, gpu_1"
test_type: "pre_merge"
platform_arch: ${{ matrix.platform.arch }}
trtllm:
......@@ -313,23 +295,14 @@ jobs:
azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
- name: Run unit tests
if: ${{ matrix.platform.arch != 'arm64' }}
uses: ./.github/actions/pytest
with:
image_tag: ${{ steps.build-image.outputs.image_tag }}
pytest_marks: "unit and trtllm and gpu_1"
framework: "trtllm"
test_type: "unit"
platform_arch: ${{ matrix.platform.arch }}
- name: Run e2e tests
- name: Run tests
if: ${{ matrix.platform.arch != 'arm64' }}
uses: ./.github/actions/pytest
with:
image_tag: ${{ steps.build-image.outputs.image_tag }}
pytest_marks: "e2e and trtllm and gpu_1 and not slow"
pytest_marks: "pre_merge and trtllm"
framework: "trtllm"
test_type: "e2e, gpu_1"
test_type: "pre_merge"
platform_arch: ${{ matrix.platform.arch }}
deploy-test-fault-tolerance:
......
......@@ -65,7 +65,7 @@ jobs:
docker compose down
- name: Run pytest (parallel tests with xdist)
env:
PYTEST_MARKS: "pre_merge and parallel"
PYTEST_MARKS: "pre_merge and parallel and not (vllm or sglang or trtllm)"
run: |
docker run -w /workspace \
--name ${{ env.CONTAINER_ID }}_pytest_parallel \
......@@ -77,7 +77,7 @@ jobs:
docker cp ${{ env.CONTAINER_ID }}_pytest_parallel:/workspace/${{ env.PYTEST_PARALLEL_XML_FILE }} . || echo "No parallel test report found"
- name: Run pytest (sequential tests)
env:
PYTEST_MARKS: "(pre_merge and not parallel) or mypy"
PYTEST_MARKS: "((pre_merge and not parallel) or mypy) and not (vllm or sglang or trtllm)"
run: |
docker run -w /workspace \
--name ${{ env.CONTAINER_ID }}_pytest \
......
......@@ -55,7 +55,7 @@ def register_engine_metrics_callback(
# Include multiple metric prefixes
register_engine_metrics_callback(
generate_endpoint, REGISTRY, metric_prefix_filter=["vllm:", "lmcache:"]
generate_endpoint, REGISTRY, metric_prefix_filters=["vllm:", "lmcache:"]
)
# With filtering and prefixing for TensorRT-LLM
......
......@@ -13,6 +13,7 @@ pytestmark = [
pytest.mark.unit,
pytest.mark.sglang,
pytest.mark.gpu_0,
pytest.mark.pre_merge,
pytest.mark.post_merge,
]
......@@ -58,7 +59,7 @@ sglang:cache_hit_rate{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0075
"""Test SGLang use case: filter to sglang: metrics and exclude python_/process_."""
result = get_prometheus_expfmt(
sglang_registry,
metric_prefix_filter="sglang:",
metric_prefix_filters=["sglang:"],
exclude_prefixes=["python_", "process_"],
)
......
......@@ -19,6 +19,7 @@ pytestmark = [
# `.github/workflows/container-validation-backends.yml` does not make use of
# the `gpu_0` marker.
pytest.mark.gpu_1,
pytest.mark.pre_merge,
]
_PYTORCH_LLM_CLS_NAME = "dynamo.trtllm.engine.LLM"
_AUTODEPLOY_LLM_CLS_NAME = "tensorrt_llm._torch.auto_deploy.LLM"
......
......@@ -13,6 +13,7 @@ pytestmark = [
pytest.mark.unit,
pytest.mark.trtllm,
pytest.mark.gpu_0,
pytest.mark.pre_merge,
pytest.mark.post_merge,
]
......
......@@ -23,6 +23,7 @@ pytestmark = [
pytest.mark.unit,
pytest.mark.trtllm,
pytest.mark.gpu_1,
pytest.mark.pre_merge,
]
......
......@@ -13,6 +13,7 @@ pytestmark = [
pytest.mark.unit,
pytest.mark.vllm,
pytest.mark.gpu_0,
pytest.mark.pre_merge,
pytest.mark.post_merge,
]
......@@ -56,7 +57,7 @@ vllm:time_to_first_token_seconds_count{model_name="meta-llama/Llama-3.1-8B"} 165
"""Test vLLM use case: filter to vllm: metrics and exclude python_/process_."""
result = get_prometheus_expfmt(
vllm_registry,
metric_prefix_filter="vllm:",
metric_prefix_filters=["vllm:"],
exclude_prefixes=["python_", "process_"],
)
......
......@@ -71,7 +71,8 @@ Markers are required for all tests. They are used for test selection in CI and l
| Test Type [required] | unit, integration, e2e, benchmark, stress, multimodal | Nature of the test |
| Hardware [required] | gpu_0, gpu_1, gpu_2, gpu_4, gpu_8, h100 | Number/type of GPUs required |
| Component/Framework | vllm, trtllm, sglang, kvbm, planner, router | Backend or component specificity |
| Other | slow, skip, xfail | Special handling |
| Execution | parallel | Test can run in parallel with pytest-xdist |
| Other | slow, skip, xfail, mypy, custom_build | Special handling |
### Example
```python
......
......@@ -21,6 +21,13 @@ from tests.utils.payloads import check_health_generate, check_models_api
logger = logging.getLogger(__name__)
pytestmark = [
pytest.mark.trtllm,
pytest.mark.gpu_1,
pytest.mark.e2e,
pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
]
class DynamoWorkerProcess(ManagedProcess):
"""Process manager for Dynamo worker with TensorRT-LLM backend"""
......@@ -127,10 +134,6 @@ class DynamoWorkerProcess(ManagedProcess):
return False
@pytest.mark.trtllm
@pytest.mark.gpu_1
@pytest.mark.e2e
@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
@pytest.mark.nightly
def test_request_cancellation_trtllm_aggregated(
request, runtime_services, predownload_models
......@@ -205,10 +208,6 @@ def test_request_cancellation_trtllm_aggregated(
logger.info(f"{description} detected successfully")
@pytest.mark.trtllm
@pytest.mark.gpu_1
@pytest.mark.e2e
@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
@pytest.mark.nightly
def test_request_cancellation_trtllm_decode_cancel(
request, runtime_services, predownload_models
......@@ -282,11 +281,7 @@ def test_request_cancellation_trtllm_decode_cancel(
)
@pytest.mark.trtllm
@pytest.mark.gpu_1
@pytest.mark.e2e
@pytest.mark.nightly
@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
def test_request_cancellation_trtllm_prefill_cancel(
request, runtime_services, predownload_models
):
......@@ -369,10 +364,6 @@ def test_request_cancellation_trtllm_prefill_cancel(
)
@pytest.mark.trtllm
@pytest.mark.gpu_1
@pytest.mark.e2e
@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
@pytest.mark.xfail(
reason="May fail due to unknown reason with TRT-LLM or backend implementation",
strict=False,
......
......@@ -23,6 +23,14 @@ from .utils import (
logger = logging.getLogger(__name__)
pytestmark = [
pytest.mark.vllm,
pytest.mark.gpu_1,
pytest.mark.e2e,
pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
pytest.mark.nightly,
]
class DynamoWorkerProcess(ManagedProcess):
"""Process manager for Dynamo worker with vLLM backend"""
......@@ -100,11 +108,6 @@ class DynamoWorkerProcess(ManagedProcess):
return False
@pytest.mark.vllm
@pytest.mark.gpu_1
@pytest.mark.e2e
@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
@pytest.mark.nightly
def test_request_migration_vllm_worker_failure(
request, runtime_services, predownload_models, set_ucx_tls_no_mm
):
......@@ -151,11 +154,6 @@ def test_request_migration_vllm_worker_failure(
verify_migration_occurred(frontend)
@pytest.mark.vllm
@pytest.mark.gpu_1
@pytest.mark.e2e
@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
@pytest.mark.nightly
def test_request_migration_vllm_graceful_shutdown(
request, runtime_services, predownload_models, set_ucx_tls_no_mm
):
......@@ -203,11 +201,6 @@ def test_request_migration_vllm_graceful_shutdown(
verify_migration_occurred(frontend)
@pytest.mark.vllm
@pytest.mark.gpu_1
@pytest.mark.e2e
@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
@pytest.mark.nightly
def test_no_request_migration_vllm_worker_failure(
request, runtime_services, predownload_models, set_ucx_tls_no_mm
):
......@@ -268,11 +261,6 @@ def test_no_request_migration_vllm_worker_failure(
), f"Unexpected migration message: {e}"
@pytest.mark.vllm
@pytest.mark.gpu_1
@pytest.mark.e2e
@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
@pytest.mark.nightly
def test_no_request_migration_vllm_graceful_shutdown(
request, runtime_services, predownload_models, set_ucx_tls_no_mm
):
......
......@@ -22,6 +22,13 @@ logger = logging.getLogger(__name__)
TEST_MODEL = QWEN
pytestmark = [
pytest.mark.e2e,
pytest.mark.gpu_1,
pytest.mark.post_merge,
pytest.mark.model(TEST_MODEL),
]
class DynamoFrontendProcess(ManagedProcess):
"""Process manager for Dynamo frontend"""
......@@ -145,10 +152,6 @@ def start_services(request, runtime_services):
@pytest.mark.usefixtures("start_services")
@pytest.mark.e2e
@pytest.mark.gpu_1
@pytest.mark.post_merge
@pytest.mark.model(TEST_MODEL)
def test_completion_string_prompt() -> None:
payload: Dict[str, Any] = {
"model": TEST_MODEL,
......@@ -165,10 +168,6 @@ def test_completion_string_prompt() -> None:
@pytest.mark.usefixtures("start_services")
@pytest.mark.e2e
@pytest.mark.gpu_1
@pytest.mark.post_merge
@pytest.mark.model(TEST_MODEL)
def test_completion_empty_array_prompt() -> None:
payload: Dict[str, Any] = {
"model": TEST_MODEL,
......@@ -185,10 +184,6 @@ def test_completion_empty_array_prompt() -> None:
@pytest.mark.usefixtures("start_services")
@pytest.mark.e2e
@pytest.mark.gpu_1
@pytest.mark.post_merge
@pytest.mark.model(TEST_MODEL)
def test_completion_single_element_array_prompt() -> None:
payload: Dict[str, Any] = {
"model": TEST_MODEL,
......@@ -205,10 +200,6 @@ def test_completion_single_element_array_prompt() -> None:
@pytest.mark.usefixtures("start_services")
@pytest.mark.e2e
@pytest.mark.gpu_1
@pytest.mark.post_merge
@pytest.mark.model(TEST_MODEL)
def test_completion_multi_element_array_prompt() -> None:
payload: Dict[str, Any] = {
"model": TEST_MODEL,
......
......@@ -22,6 +22,13 @@ logger = logging.getLogger(__name__)
TEST_MODEL = GPT_OSS
pytestmark = [
pytest.mark.vllm,
pytest.mark.gpu_1,
pytest.mark.e2e,
pytest.mark.model(TEST_MODEL),
]
WEATHER_TOOL = {
"type": "function",
"function": {
......@@ -211,11 +218,7 @@ def _validate_chat_response(response: requests.Response) -> Dict[str, Any]:
@pytest.mark.usefixtures("start_services")
@pytest.mark.vllm
@pytest.mark.gpu_1
@pytest.mark.e2e
@pytest.mark.post_merge
@pytest.mark.model(TEST_MODEL)
def test_reasoning_effort(request, runtime_services, predownload_models) -> None:
"""High reasoning effort should yield more detailed reasoning than low effort."""
......@@ -278,11 +281,7 @@ def test_reasoning_effort(request, runtime_services, predownload_models) -> None
@pytest.mark.usefixtures("start_services")
@pytest.mark.vllm
@pytest.mark.gpu_1
@pytest.mark.e2e
@pytest.mark.post_merge
@pytest.mark.model(TEST_MODEL)
def test_tool_calling(request, runtime_services, predownload_models) -> None:
"""Test tool calling functionality with weather and system health tools."""
......@@ -321,11 +320,7 @@ def test_tool_calling(request, runtime_services, predownload_models) -> None:
@pytest.mark.usefixtures("start_services")
@pytest.mark.vllm
@pytest.mark.gpu_1
@pytest.mark.e2e
@pytest.mark.nightly
@pytest.mark.model(TEST_MODEL)
def test_tool_calling_second_round(
request, runtime_services, predownload_models
) -> None:
......@@ -388,11 +383,7 @@ def test_tool_calling_second_round(
@pytest.mark.usefixtures("start_services")
@pytest.mark.vllm
@pytest.mark.gpu_1
@pytest.mark.e2e
@pytest.mark.nightly
@pytest.mark.model(TEST_MODEL)
def test_reasoning(request, runtime_services, predownload_models) -> None:
"""Test reasoning functionality with a mathematical problem."""
......
......@@ -22,17 +22,17 @@ from tests.router.common import ( # utilities
from tests.utils.constants import ROUTER_MODEL_NAME
from tests.utils.managed_process import ManagedProcess
logger = logging.getLogger(__name__)
MODEL_NAME = ROUTER_MODEL_NAME
pytestmark = [
pytest.mark.pre_merge,
pytest.mark.gpu_0,
pytest.mark.integration,
pytest.mark.parallel,
pytest.mark.model(MODEL_NAME),
]
logger = logging.getLogger(__name__)
MODEL_NAME = ROUTER_MODEL_NAME
NUM_MOCKERS = 2
SPEEDUP_RATIO = 10.0
BASE_PORT = 9100 # Base port for all tests (high port to avoid conflicts)
......@@ -287,11 +287,6 @@ class DisaggMockerProcess:
self._process.__exit__(exc_type, exc_val, exc_tb)
@pytest.mark.pre_merge
@pytest.mark.gpu_0
@pytest.mark.integration
@pytest.mark.parallel
@pytest.mark.model(MODEL_NAME)
def test_mocker_kv_router(request, runtime_services_session, predownload_tokenizers):
"""
Test KV router with multiple mocker engine instances.
......@@ -331,11 +326,6 @@ def test_mocker_kv_router(request, runtime_services_session, predownload_tokeniz
mockers.__exit__(None, None, None)
@pytest.mark.pre_merge
@pytest.mark.gpu_0
@pytest.mark.integration
@pytest.mark.parallel
@pytest.mark.model(MODEL_NAME)
@pytest.mark.parametrize("store_backend", ["etcd", "file"])
def test_mocker_two_kv_router(
request,
......@@ -391,11 +381,6 @@ def test_mocker_two_kv_router(
mockers.__exit__(None, None, None)
@pytest.mark.pre_merge
@pytest.mark.gpu_0
@pytest.mark.integration
@pytest.mark.parallel
@pytest.mark.model(MODEL_NAME)
@pytest.mark.skip(reason="Flaky, temporarily disabled")
def test_mocker_kv_router_overload_503(
request, runtime_services_session, predownload_tokenizers
......@@ -434,11 +419,6 @@ def test_mocker_kv_router_overload_503(
mockers.__exit__(None, None, None)
@pytest.mark.pre_merge
@pytest.mark.gpu_0
@pytest.mark.integration
@pytest.mark.parallel
@pytest.mark.model(MODEL_NAME)
def test_kv_push_router_bindings(
request, runtime_services_session, predownload_tokenizers
):
......@@ -475,11 +455,6 @@ def test_kv_push_router_bindings(
mockers.__exit__(None, None, None)
@pytest.mark.pre_merge
@pytest.mark.gpu_0
@pytest.mark.integration
@pytest.mark.parallel
@pytest.mark.model(MODEL_NAME)
@pytest.mark.parametrize("store_backend", ["etcd", "file"])
def test_indexers_sync(
request,
......@@ -529,11 +504,6 @@ def test_indexers_sync(
mockers.__exit__(None, None, None)
@pytest.mark.pre_merge
@pytest.mark.gpu_0
@pytest.mark.integration
@pytest.mark.parallel
@pytest.mark.model(MODEL_NAME)
def test_query_instance_id_returns_worker_and_tokens(
request, runtime_services_session, predownload_tokenizers
):
......@@ -568,11 +538,6 @@ def test_query_instance_id_returns_worker_and_tokens(
mockers.__exit__(None, None, None)
@pytest.mark.pre_merge
@pytest.mark.gpu_0
@pytest.mark.integration
@pytest.mark.parallel
@pytest.mark.model(MODEL_NAME)
def test_router_decisions(request, runtime_services_session, predownload_tokenizers):
"""Validate KV cache prefix reuse and dp_rank routing by sending progressive requests with overlapping prefixes."""
......@@ -612,9 +577,6 @@ def test_router_decisions(request, runtime_services_session, predownload_tokeniz
mockers.__exit__(None, None, None)
@pytest.mark.pre_merge
@pytest.mark.parallel
@pytest.mark.model(MODEL_NAME)
def test_router_disagg_decisions(
request, runtime_services_session, predownload_tokenizers
):
......@@ -680,11 +642,6 @@ def test_router_disagg_decisions(
prefill_workers.__exit__(None, None, None)
@pytest.mark.pre_merge
@pytest.mark.gpu_0
@pytest.mark.integration
@pytest.mark.parallel
@pytest.mark.model(MODEL_NAME)
def test_busy_threshold_endpoint(
request, runtime_services_session, predownload_tokenizers
):
......
......@@ -18,6 +18,13 @@ from tests.utils.managed_process import ManagedProcess
logger = logging.getLogger(__name__)
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
pytestmark = [
pytest.mark.pre_merge,
pytest.mark.e2e,
pytest.mark.vllm,
pytest.mark.model(MODEL_NAME),
]
SPEEDUP_RATIO = 10.0
PORTS = [
8011,
......@@ -269,11 +276,8 @@ class VLLMProcess:
time.sleep(2)
@pytest.mark.e2e
@pytest.mark.gpu_1
@pytest.mark.vllm
@pytest.mark.skip(reason="All vLLM tests disabled for now")
@pytest.mark.model(MODEL_NAME)
def test_vllm_kv_router_basic(request, runtime_services, predownload_tokenizers):
"""
Quick e2e sanity test for KV router with vLLM engine instances.
......@@ -319,11 +323,8 @@ def test_vllm_kv_router_basic(request, runtime_services, predownload_tokenizers)
vllm_workers.__exit__(None, None, None)
@pytest.mark.e2e
@pytest.mark.vllm
@pytest.mark.gpu_1
@pytest.mark.skip(reason="All vLLM tests disabled for now")
@pytest.mark.model(MODEL_NAME)
def test_router_decisions_vllm_multiple_workers(
request, runtime_services, predownload_tokenizers
):
......@@ -371,11 +372,8 @@ def test_router_decisions_vllm_multiple_workers(
vllm_workers.__exit__(None, None, None)
@pytest.mark.e2e
@pytest.mark.vllm
@pytest.mark.gpu_2
@pytest.mark.skip(reason="All vLLM tests disabled for now")
@pytest.mark.model(MODEL_NAME)
def test_router_decisions_vllm_dp(request, runtime_services, predownload_tokenizers):
"""Validate KV cache prefix reuse with vLLM by sending progressive requests with overlapping prefixes.
Same flow as test_router_decisions_vllm_multiple_workers; force first request to (worker_id, dp_rank=1).
......
......@@ -44,7 +44,7 @@ sglang_configs = {
name="aggregated",
directory=sglang_dir,
script_name="agg.sh",
marks=[pytest.mark.gpu_1],
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
model="Qwen/Qwen3-0.6B",
env={},
models_port=8000,
......@@ -73,7 +73,11 @@ sglang_configs = {
name="disaggregated_same_gpu",
directory=sglang_dir,
script_name="disagg_same_gpu.sh",
marks=[pytest.mark.gpu_1, pytest.mark.skip(reason="unstable")],
marks=[
pytest.mark.gpu_1,
pytest.mark.pre_merge,
pytest.mark.skip(reason="unstable"),
],
model="Qwen/Qwen3-0.6B",
env={},
models_port=8000,
......@@ -116,7 +120,7 @@ sglang_configs = {
name="template_verification",
directory=SERVE_TEST_DIR, # special directory for test-specific scripts
script_name="template_verifier.sh",
marks=[pytest.mark.gpu_1, pytest.mark.nightly],
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge, pytest.mark.nightly],
model="Qwen/Qwen3-0.6B",
env={},
models_port=8000,
......@@ -159,7 +163,7 @@ sglang_configs = {
name="embedding_agg",
directory=sglang_dir,
script_name="agg_embed.sh",
marks=[pytest.mark.gpu_1, pytest.mark.nightly],
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge, pytest.mark.nightly],
model="Qwen/Qwen3-Embedding-4B",
delayed_start=0,
timeout=180,
......
......@@ -40,7 +40,7 @@ trtllm_configs = {
name="aggregated",
directory=trtllm_dir,
script_name="agg_metrics.sh",
marks=[pytest.mark.gpu_1, pytest.mark.trtllm],
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge, pytest.mark.trtllm],
model="Qwen/Qwen3-0.6B",
models_port=8000,
request_payloads=[
......@@ -65,7 +65,7 @@ trtllm_configs = {
name="disaggregated_same_gpu",
directory=trtllm_dir,
script_name="disagg_same_gpu.sh",
marks=[pytest.mark.gpu_1, pytest.mark.trtllm],
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge, pytest.mark.trtllm],
model="Qwen/Qwen3-0.6B",
models_port=8000,
request_payloads=[
......@@ -79,7 +79,7 @@ trtllm_configs = {
name="aggregated_router",
directory=trtllm_dir,
script_name="agg_router.sh",
marks=[pytest.mark.gpu_1, pytest.mark.trtllm, pytest.mark.post_merge],
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge, pytest.mark.trtllm],
model="Qwen/Qwen3-0.6B",
models_port=8000,
request_payloads=[
......
......@@ -43,7 +43,7 @@ vllm_configs = {
name="aggregated",
directory=vllm_dir,
script_name="agg.sh",
marks=[pytest.mark.gpu_1],
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
model="Qwen/Qwen3-0.6B",
request_payloads=[
chat_payload_default(),
......@@ -55,7 +55,7 @@ vllm_configs = {
name="aggregated_lmcache",
directory=vllm_dir,
script_name="agg_lmcache.sh",
marks=[pytest.mark.gpu_1],
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
model="Qwen/Qwen3-0.6B",
request_payloads=[
chat_payload_default(),
......@@ -68,7 +68,7 @@ vllm_configs = {
name="agg-request-plane-tcp",
directory=vllm_dir,
script_name="agg_request_planes.sh",
marks=[pytest.mark.gpu_1],
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
model="Qwen/Qwen3-0.6B",
script_args=["--tcp"],
request_payloads=[
......@@ -80,7 +80,7 @@ vllm_configs = {
name="agg-request-plane-http",
directory=vllm_dir,
script_name="agg_request_planes.sh",
marks=[pytest.mark.gpu_1],
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
model="Qwen/Qwen3-0.6B",
script_args=["--http"],
request_payloads=[
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment