Unverified commit 7e499b5c, authored by Yan Ru Pei, committed by GitHub
Browse files

test: bring back the framework 1 gpu pre-merge tests + clean up pytest markers (#4698)


Signed-off-by: PeaBrane <yanrpei@gmail.com>
parent 3cad926e
...@@ -179,23 +179,14 @@ jobs: ...@@ -179,23 +179,14 @@ jobs:
azure_acr_user: ${{ secrets.AZURE_ACR_USER }} azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }} azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
- name: Run unit tests - name: Run tests
if: ${{ matrix.platform.arch != 'arm64' }} if: ${{ matrix.platform.arch != 'arm64' }}
uses: ./.github/actions/pytest uses: ./.github/actions/pytest
with: with:
image_tag: ${{ steps.build-image.outputs.image_tag }} image_tag: ${{ steps.build-image.outputs.image_tag }}
pytest_marks: "unit and vllm and gpu_1" pytest_marks: "pre_merge and vllm"
framework: "vllm" framework: "vllm"
test_type: "unit" test_type: "pre_merge"
platform_arch: ${{ matrix.platform.arch }}
- name: Run e2e tests
if: ${{ matrix.platform.arch != 'arm64' }}
uses: ./.github/actions/pytest
with:
image_tag: ${{ steps.build-image.outputs.image_tag }}
pytest_marks: "e2e and vllm and gpu_1 and not slow"
framework: "vllm"
test_type: "e2e, gpu_1"
platform_arch: ${{ matrix.platform.arch }} platform_arch: ${{ matrix.platform.arch }}
sglang: sglang:
...@@ -246,23 +237,14 @@ jobs: ...@@ -246,23 +237,14 @@ jobs:
azure_acr_user: ${{ secrets.AZURE_ACR_USER }} azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }} azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
- name: Run unit tests - name: Run tests
if: ${{ matrix.platform.arch != 'arm64' }}
uses: ./.github/actions/pytest
with:
image_tag: ${{ steps.build-image.outputs.image_tag }}
pytest_marks: "unit and sglang and gpu_1"
framework: "sglang"
test_type: "unit"
platform_arch: ${{ matrix.platform.arch }}
- name: Run e2e tests
if: ${{ matrix.platform.arch != 'arm64' }} if: ${{ matrix.platform.arch != 'arm64' }}
uses: ./.github/actions/pytest uses: ./.github/actions/pytest
with: with:
image_tag: ${{ steps.build-image.outputs.image_tag }} image_tag: ${{ steps.build-image.outputs.image_tag }}
pytest_marks: "e2e and sglang and gpu_1" pytest_marks: "pre_merge and sglang"
framework: "sglang" framework: "sglang"
test_type: "e2e, gpu_1" test_type: "pre_merge"
platform_arch: ${{ matrix.platform.arch }} platform_arch: ${{ matrix.platform.arch }}
trtllm: trtllm:
...@@ -313,23 +295,14 @@ jobs: ...@@ -313,23 +295,14 @@ jobs:
azure_acr_user: ${{ secrets.AZURE_ACR_USER }} azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }} azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
- name: Run unit tests - name: Run tests
if: ${{ matrix.platform.arch != 'arm64' }}
uses: ./.github/actions/pytest
with:
image_tag: ${{ steps.build-image.outputs.image_tag }}
pytest_marks: "unit and trtllm and gpu_1"
framework: "trtllm"
test_type: "unit"
platform_arch: ${{ matrix.platform.arch }}
- name: Run e2e tests
if: ${{ matrix.platform.arch != 'arm64' }} if: ${{ matrix.platform.arch != 'arm64' }}
uses: ./.github/actions/pytest uses: ./.github/actions/pytest
with: with:
image_tag: ${{ steps.build-image.outputs.image_tag }} image_tag: ${{ steps.build-image.outputs.image_tag }}
pytest_marks: "e2e and trtllm and gpu_1 and not slow" pytest_marks: "pre_merge and trtllm"
framework: "trtllm" framework: "trtllm"
test_type: "e2e, gpu_1" test_type: "pre_merge"
platform_arch: ${{ matrix.platform.arch }} platform_arch: ${{ matrix.platform.arch }}
deploy-test-fault-tolerance: deploy-test-fault-tolerance:
......
...@@ -65,7 +65,7 @@ jobs: ...@@ -65,7 +65,7 @@ jobs:
docker compose down docker compose down
- name: Run pytest (parallel tests with xdist) - name: Run pytest (parallel tests with xdist)
env: env:
PYTEST_MARKS: "pre_merge and parallel" PYTEST_MARKS: "pre_merge and parallel and not (vllm or sglang or trtllm)"
run: | run: |
docker run -w /workspace \ docker run -w /workspace \
--name ${{ env.CONTAINER_ID }}_pytest_parallel \ --name ${{ env.CONTAINER_ID }}_pytest_parallel \
...@@ -77,7 +77,7 @@ jobs: ...@@ -77,7 +77,7 @@ jobs:
docker cp ${{ env.CONTAINER_ID }}_pytest_parallel:/workspace/${{ env.PYTEST_PARALLEL_XML_FILE }} . || echo "No parallel test report found" docker cp ${{ env.CONTAINER_ID }}_pytest_parallel:/workspace/${{ env.PYTEST_PARALLEL_XML_FILE }} . || echo "No parallel test report found"
- name: Run pytest (sequential tests) - name: Run pytest (sequential tests)
env: env:
PYTEST_MARKS: "(pre_merge and not parallel) or mypy" PYTEST_MARKS: "((pre_merge and not parallel) or mypy) and not (vllm or sglang or trtllm)"
run: | run: |
docker run -w /workspace \ docker run -w /workspace \
--name ${{ env.CONTAINER_ID }}_pytest \ --name ${{ env.CONTAINER_ID }}_pytest \
......
...@@ -55,7 +55,7 @@ def register_engine_metrics_callback( ...@@ -55,7 +55,7 @@ def register_engine_metrics_callback(
# Include multiple metric prefixes # Include multiple metric prefixes
register_engine_metrics_callback( register_engine_metrics_callback(
generate_endpoint, REGISTRY, metric_prefix_filter=["vllm:", "lmcache:"] generate_endpoint, REGISTRY, metric_prefix_filters=["vllm:", "lmcache:"]
) )
# With filtering and prefixing for TensorRT-LLM # With filtering and prefixing for TensorRT-LLM
......
...@@ -13,6 +13,7 @@ pytestmark = [ ...@@ -13,6 +13,7 @@ pytestmark = [
pytest.mark.unit, pytest.mark.unit,
pytest.mark.sglang, pytest.mark.sglang,
pytest.mark.gpu_0, pytest.mark.gpu_0,
pytest.mark.pre_merge,
pytest.mark.post_merge, pytest.mark.post_merge,
] ]
...@@ -58,7 +59,7 @@ sglang:cache_hit_rate{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0075 ...@@ -58,7 +59,7 @@ sglang:cache_hit_rate{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0075
"""Test SGLang use case: filter to sglang: metrics and exclude python_/process_.""" """Test SGLang use case: filter to sglang: metrics and exclude python_/process_."""
result = get_prometheus_expfmt( result = get_prometheus_expfmt(
sglang_registry, sglang_registry,
metric_prefix_filter="sglang:", metric_prefix_filters=["sglang:"],
exclude_prefixes=["python_", "process_"], exclude_prefixes=["python_", "process_"],
) )
......
...@@ -19,6 +19,7 @@ pytestmark = [ ...@@ -19,6 +19,7 @@ pytestmark = [
# `.github/workflows/container-validation-backends.yml` does not make use of # `.github/workflows/container-validation-backends.yml` does not make use of
# the `gpu_0` marker. # the `gpu_0` marker.
pytest.mark.gpu_1, pytest.mark.gpu_1,
pytest.mark.pre_merge,
] ]
_PYTORCH_LLM_CLS_NAME = "dynamo.trtllm.engine.LLM" _PYTORCH_LLM_CLS_NAME = "dynamo.trtllm.engine.LLM"
_AUTODEPLOY_LLM_CLS_NAME = "tensorrt_llm._torch.auto_deploy.LLM" _AUTODEPLOY_LLM_CLS_NAME = "tensorrt_llm._torch.auto_deploy.LLM"
......
...@@ -13,6 +13,7 @@ pytestmark = [ ...@@ -13,6 +13,7 @@ pytestmark = [
pytest.mark.unit, pytest.mark.unit,
pytest.mark.trtllm, pytest.mark.trtllm,
pytest.mark.gpu_0, pytest.mark.gpu_0,
pytest.mark.pre_merge,
pytest.mark.post_merge, pytest.mark.post_merge,
] ]
......
...@@ -23,6 +23,7 @@ pytestmark = [ ...@@ -23,6 +23,7 @@ pytestmark = [
pytest.mark.unit, pytest.mark.unit,
pytest.mark.trtllm, pytest.mark.trtllm,
pytest.mark.gpu_1, pytest.mark.gpu_1,
pytest.mark.pre_merge,
] ]
......
...@@ -13,6 +13,7 @@ pytestmark = [ ...@@ -13,6 +13,7 @@ pytestmark = [
pytest.mark.unit, pytest.mark.unit,
pytest.mark.vllm, pytest.mark.vllm,
pytest.mark.gpu_0, pytest.mark.gpu_0,
pytest.mark.pre_merge,
pytest.mark.post_merge, pytest.mark.post_merge,
] ]
...@@ -56,7 +57,7 @@ vllm:time_to_first_token_seconds_count{model_name="meta-llama/Llama-3.1-8B"} 165 ...@@ -56,7 +57,7 @@ vllm:time_to_first_token_seconds_count{model_name="meta-llama/Llama-3.1-8B"} 165
"""Test vLLM use case: filter to vllm: metrics and exclude python_/process_.""" """Test vLLM use case: filter to vllm: metrics and exclude python_/process_."""
result = get_prometheus_expfmt( result = get_prometheus_expfmt(
vllm_registry, vllm_registry,
metric_prefix_filter="vllm:", metric_prefix_filters=["vllm:"],
exclude_prefixes=["python_", "process_"], exclude_prefixes=["python_", "process_"],
) )
......
...@@ -71,7 +71,8 @@ Markers are required for all tests. They are used for test selection in CI and l ...@@ -71,7 +71,8 @@ Markers are required for all tests. They are used for test selection in CI and l
| Test Type [required] | unit, integration, e2e, benchmark, stress, multimodal | Nature of the test | | Test Type [required] | unit, integration, e2e, benchmark, stress, multimodal | Nature of the test |
| Hardware [required] | gpu_0, gpu_1, gpu_2, gpu_4, gpu_8, h100 | Number/type of GPUs required | | Hardware [required] | gpu_0, gpu_1, gpu_2, gpu_4, gpu_8, h100 | Number/type of GPUs required |
| Component/Framework | vllm, trtllm, sglang, kvbm, planner, router | Backend or component specificity | | Component/Framework | vllm, trtllm, sglang, kvbm, planner, router | Backend or component specificity |
| Other | slow, skip, xfail | Special handling | | Execution | parallel | Test can run in parallel with pytest-xdist |
| Other | slow, skip, xfail, mypy, custom_build | Special handling |
### Example ### Example
```python ```python
......
...@@ -21,6 +21,13 @@ from tests.utils.payloads import check_health_generate, check_models_api ...@@ -21,6 +21,13 @@ from tests.utils.payloads import check_health_generate, check_models_api
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
pytestmark = [
pytest.mark.trtllm,
pytest.mark.gpu_1,
pytest.mark.e2e,
pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
]
class DynamoWorkerProcess(ManagedProcess): class DynamoWorkerProcess(ManagedProcess):
"""Process manager for Dynamo worker with TensorRT-LLM backend""" """Process manager for Dynamo worker with TensorRT-LLM backend"""
...@@ -127,10 +134,6 @@ class DynamoWorkerProcess(ManagedProcess): ...@@ -127,10 +134,6 @@ class DynamoWorkerProcess(ManagedProcess):
return False return False
@pytest.mark.trtllm
@pytest.mark.gpu_1
@pytest.mark.e2e
@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
@pytest.mark.nightly @pytest.mark.nightly
def test_request_cancellation_trtllm_aggregated( def test_request_cancellation_trtllm_aggregated(
request, runtime_services, predownload_models request, runtime_services, predownload_models
...@@ -205,10 +208,6 @@ def test_request_cancellation_trtllm_aggregated( ...@@ -205,10 +208,6 @@ def test_request_cancellation_trtllm_aggregated(
logger.info(f"{description} detected successfully") logger.info(f"{description} detected successfully")
@pytest.mark.trtllm
@pytest.mark.gpu_1
@pytest.mark.e2e
@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
@pytest.mark.nightly @pytest.mark.nightly
def test_request_cancellation_trtllm_decode_cancel( def test_request_cancellation_trtllm_decode_cancel(
request, runtime_services, predownload_models request, runtime_services, predownload_models
...@@ -282,11 +281,7 @@ def test_request_cancellation_trtllm_decode_cancel( ...@@ -282,11 +281,7 @@ def test_request_cancellation_trtllm_decode_cancel(
) )
@pytest.mark.trtllm
@pytest.mark.gpu_1
@pytest.mark.e2e
@pytest.mark.nightly @pytest.mark.nightly
@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
def test_request_cancellation_trtllm_prefill_cancel( def test_request_cancellation_trtllm_prefill_cancel(
request, runtime_services, predownload_models request, runtime_services, predownload_models
): ):
...@@ -369,10 +364,6 @@ def test_request_cancellation_trtllm_prefill_cancel( ...@@ -369,10 +364,6 @@ def test_request_cancellation_trtllm_prefill_cancel(
) )
@pytest.mark.trtllm
@pytest.mark.gpu_1
@pytest.mark.e2e
@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
@pytest.mark.xfail( @pytest.mark.xfail(
reason="May fail due to unknown reason with TRT-LLM or backend implementation", reason="May fail due to unknown reason with TRT-LLM or backend implementation",
strict=False, strict=False,
......
...@@ -23,6 +23,14 @@ from .utils import ( ...@@ -23,6 +23,14 @@ from .utils import (
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
pytestmark = [
pytest.mark.vllm,
pytest.mark.gpu_1,
pytest.mark.e2e,
pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
pytest.mark.nightly,
]
class DynamoWorkerProcess(ManagedProcess): class DynamoWorkerProcess(ManagedProcess):
"""Process manager for Dynamo worker with vLLM backend""" """Process manager for Dynamo worker with vLLM backend"""
...@@ -100,11 +108,6 @@ class DynamoWorkerProcess(ManagedProcess): ...@@ -100,11 +108,6 @@ class DynamoWorkerProcess(ManagedProcess):
return False return False
@pytest.mark.vllm
@pytest.mark.gpu_1
@pytest.mark.e2e
@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
@pytest.mark.nightly
def test_request_migration_vllm_worker_failure( def test_request_migration_vllm_worker_failure(
request, runtime_services, predownload_models, set_ucx_tls_no_mm request, runtime_services, predownload_models, set_ucx_tls_no_mm
): ):
...@@ -151,11 +154,6 @@ def test_request_migration_vllm_worker_failure( ...@@ -151,11 +154,6 @@ def test_request_migration_vllm_worker_failure(
verify_migration_occurred(frontend) verify_migration_occurred(frontend)
@pytest.mark.vllm
@pytest.mark.gpu_1
@pytest.mark.e2e
@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
@pytest.mark.nightly
def test_request_migration_vllm_graceful_shutdown( def test_request_migration_vllm_graceful_shutdown(
request, runtime_services, predownload_models, set_ucx_tls_no_mm request, runtime_services, predownload_models, set_ucx_tls_no_mm
): ):
...@@ -203,11 +201,6 @@ def test_request_migration_vllm_graceful_shutdown( ...@@ -203,11 +201,6 @@ def test_request_migration_vllm_graceful_shutdown(
verify_migration_occurred(frontend) verify_migration_occurred(frontend)
@pytest.mark.vllm
@pytest.mark.gpu_1
@pytest.mark.e2e
@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
@pytest.mark.nightly
def test_no_request_migration_vllm_worker_failure( def test_no_request_migration_vllm_worker_failure(
request, runtime_services, predownload_models, set_ucx_tls_no_mm request, runtime_services, predownload_models, set_ucx_tls_no_mm
): ):
...@@ -268,11 +261,6 @@ def test_no_request_migration_vllm_worker_failure( ...@@ -268,11 +261,6 @@ def test_no_request_migration_vllm_worker_failure(
), f"Unexpected migration message: {e}" ), f"Unexpected migration message: {e}"
@pytest.mark.vllm
@pytest.mark.gpu_1
@pytest.mark.e2e
@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
@pytest.mark.nightly
def test_no_request_migration_vllm_graceful_shutdown( def test_no_request_migration_vllm_graceful_shutdown(
request, runtime_services, predownload_models, set_ucx_tls_no_mm request, runtime_services, predownload_models, set_ucx_tls_no_mm
): ):
......
...@@ -22,6 +22,13 @@ logger = logging.getLogger(__name__) ...@@ -22,6 +22,13 @@ logger = logging.getLogger(__name__)
TEST_MODEL = QWEN TEST_MODEL = QWEN
pytestmark = [
pytest.mark.e2e,
pytest.mark.gpu_1,
pytest.mark.post_merge,
pytest.mark.model(TEST_MODEL),
]
class DynamoFrontendProcess(ManagedProcess): class DynamoFrontendProcess(ManagedProcess):
"""Process manager for Dynamo frontend""" """Process manager for Dynamo frontend"""
...@@ -145,10 +152,6 @@ def start_services(request, runtime_services): ...@@ -145,10 +152,6 @@ def start_services(request, runtime_services):
@pytest.mark.usefixtures("start_services") @pytest.mark.usefixtures("start_services")
@pytest.mark.e2e
@pytest.mark.gpu_1
@pytest.mark.post_merge
@pytest.mark.model(TEST_MODEL)
def test_completion_string_prompt() -> None: def test_completion_string_prompt() -> None:
payload: Dict[str, Any] = { payload: Dict[str, Any] = {
"model": TEST_MODEL, "model": TEST_MODEL,
...@@ -165,10 +168,6 @@ def test_completion_string_prompt() -> None: ...@@ -165,10 +168,6 @@ def test_completion_string_prompt() -> None:
@pytest.mark.usefixtures("start_services") @pytest.mark.usefixtures("start_services")
@pytest.mark.e2e
@pytest.mark.gpu_1
@pytest.mark.post_merge
@pytest.mark.model(TEST_MODEL)
def test_completion_empty_array_prompt() -> None: def test_completion_empty_array_prompt() -> None:
payload: Dict[str, Any] = { payload: Dict[str, Any] = {
"model": TEST_MODEL, "model": TEST_MODEL,
...@@ -185,10 +184,6 @@ def test_completion_empty_array_prompt() -> None: ...@@ -185,10 +184,6 @@ def test_completion_empty_array_prompt() -> None:
@pytest.mark.usefixtures("start_services") @pytest.mark.usefixtures("start_services")
@pytest.mark.e2e
@pytest.mark.gpu_1
@pytest.mark.post_merge
@pytest.mark.model(TEST_MODEL)
def test_completion_single_element_array_prompt() -> None: def test_completion_single_element_array_prompt() -> None:
payload: Dict[str, Any] = { payload: Dict[str, Any] = {
"model": TEST_MODEL, "model": TEST_MODEL,
...@@ -205,10 +200,6 @@ def test_completion_single_element_array_prompt() -> None: ...@@ -205,10 +200,6 @@ def test_completion_single_element_array_prompt() -> None:
@pytest.mark.usefixtures("start_services") @pytest.mark.usefixtures("start_services")
@pytest.mark.e2e
@pytest.mark.gpu_1
@pytest.mark.post_merge
@pytest.mark.model(TEST_MODEL)
def test_completion_multi_element_array_prompt() -> None: def test_completion_multi_element_array_prompt() -> None:
payload: Dict[str, Any] = { payload: Dict[str, Any] = {
"model": TEST_MODEL, "model": TEST_MODEL,
......
...@@ -22,6 +22,13 @@ logger = logging.getLogger(__name__) ...@@ -22,6 +22,13 @@ logger = logging.getLogger(__name__)
TEST_MODEL = GPT_OSS TEST_MODEL = GPT_OSS
pytestmark = [
pytest.mark.vllm,
pytest.mark.gpu_1,
pytest.mark.e2e,
pytest.mark.model(TEST_MODEL),
]
WEATHER_TOOL = { WEATHER_TOOL = {
"type": "function", "type": "function",
"function": { "function": {
...@@ -211,11 +218,7 @@ def _validate_chat_response(response: requests.Response) -> Dict[str, Any]: ...@@ -211,11 +218,7 @@ def _validate_chat_response(response: requests.Response) -> Dict[str, Any]:
@pytest.mark.usefixtures("start_services") @pytest.mark.usefixtures("start_services")
@pytest.mark.vllm
@pytest.mark.gpu_1
@pytest.mark.e2e
@pytest.mark.post_merge @pytest.mark.post_merge
@pytest.mark.model(TEST_MODEL)
def test_reasoning_effort(request, runtime_services, predownload_models) -> None: def test_reasoning_effort(request, runtime_services, predownload_models) -> None:
"""High reasoning effort should yield more detailed reasoning than low effort.""" """High reasoning effort should yield more detailed reasoning than low effort."""
...@@ -278,11 +281,7 @@ def test_reasoning_effort(request, runtime_services, predownload_models) -> None ...@@ -278,11 +281,7 @@ def test_reasoning_effort(request, runtime_services, predownload_models) -> None
@pytest.mark.usefixtures("start_services") @pytest.mark.usefixtures("start_services")
@pytest.mark.vllm
@pytest.mark.gpu_1
@pytest.mark.e2e
@pytest.mark.post_merge @pytest.mark.post_merge
@pytest.mark.model(TEST_MODEL)
def test_tool_calling(request, runtime_services, predownload_models) -> None: def test_tool_calling(request, runtime_services, predownload_models) -> None:
"""Test tool calling functionality with weather and system health tools.""" """Test tool calling functionality with weather and system health tools."""
...@@ -321,11 +320,7 @@ def test_tool_calling(request, runtime_services, predownload_models) -> None: ...@@ -321,11 +320,7 @@ def test_tool_calling(request, runtime_services, predownload_models) -> None:
@pytest.mark.usefixtures("start_services") @pytest.mark.usefixtures("start_services")
@pytest.mark.vllm
@pytest.mark.gpu_1
@pytest.mark.e2e
@pytest.mark.nightly @pytest.mark.nightly
@pytest.mark.model(TEST_MODEL)
def test_tool_calling_second_round( def test_tool_calling_second_round(
request, runtime_services, predownload_models request, runtime_services, predownload_models
) -> None: ) -> None:
...@@ -388,11 +383,7 @@ def test_tool_calling_second_round( ...@@ -388,11 +383,7 @@ def test_tool_calling_second_round(
@pytest.mark.usefixtures("start_services") @pytest.mark.usefixtures("start_services")
@pytest.mark.vllm
@pytest.mark.gpu_1
@pytest.mark.e2e
@pytest.mark.nightly @pytest.mark.nightly
@pytest.mark.model(TEST_MODEL)
def test_reasoning(request, runtime_services, predownload_models) -> None: def test_reasoning(request, runtime_services, predownload_models) -> None:
"""Test reasoning functionality with a mathematical problem.""" """Test reasoning functionality with a mathematical problem."""
......
...@@ -22,17 +22,17 @@ from tests.router.common import ( # utilities ...@@ -22,17 +22,17 @@ from tests.router.common import ( # utilities
from tests.utils.constants import ROUTER_MODEL_NAME from tests.utils.constants import ROUTER_MODEL_NAME
from tests.utils.managed_process import ManagedProcess from tests.utils.managed_process import ManagedProcess
logger = logging.getLogger(__name__)
MODEL_NAME = ROUTER_MODEL_NAME
pytestmark = [ pytestmark = [
pytest.mark.pre_merge, pytest.mark.pre_merge,
pytest.mark.gpu_0, pytest.mark.gpu_0,
pytest.mark.integration, pytest.mark.integration,
pytest.mark.parallel,
pytest.mark.model(MODEL_NAME),
] ]
logger = logging.getLogger(__name__)
MODEL_NAME = ROUTER_MODEL_NAME
NUM_MOCKERS = 2 NUM_MOCKERS = 2
SPEEDUP_RATIO = 10.0 SPEEDUP_RATIO = 10.0
BASE_PORT = 9100 # Base port for all tests (high port to avoid conflicts) BASE_PORT = 9100 # Base port for all tests (high port to avoid conflicts)
...@@ -287,11 +287,6 @@ class DisaggMockerProcess: ...@@ -287,11 +287,6 @@ class DisaggMockerProcess:
self._process.__exit__(exc_type, exc_val, exc_tb) self._process.__exit__(exc_type, exc_val, exc_tb)
@pytest.mark.pre_merge
@pytest.mark.gpu_0
@pytest.mark.integration
@pytest.mark.parallel
@pytest.mark.model(MODEL_NAME)
def test_mocker_kv_router(request, runtime_services_session, predownload_tokenizers): def test_mocker_kv_router(request, runtime_services_session, predownload_tokenizers):
""" """
Test KV router with multiple mocker engine instances. Test KV router with multiple mocker engine instances.
...@@ -331,11 +326,6 @@ def test_mocker_kv_router(request, runtime_services_session, predownload_tokeniz ...@@ -331,11 +326,6 @@ def test_mocker_kv_router(request, runtime_services_session, predownload_tokeniz
mockers.__exit__(None, None, None) mockers.__exit__(None, None, None)
@pytest.mark.pre_merge
@pytest.mark.gpu_0
@pytest.mark.integration
@pytest.mark.parallel
@pytest.mark.model(MODEL_NAME)
@pytest.mark.parametrize("store_backend", ["etcd", "file"]) @pytest.mark.parametrize("store_backend", ["etcd", "file"])
def test_mocker_two_kv_router( def test_mocker_two_kv_router(
request, request,
...@@ -391,11 +381,6 @@ def test_mocker_two_kv_router( ...@@ -391,11 +381,6 @@ def test_mocker_two_kv_router(
mockers.__exit__(None, None, None) mockers.__exit__(None, None, None)
@pytest.mark.pre_merge
@pytest.mark.gpu_0
@pytest.mark.integration
@pytest.mark.parallel
@pytest.mark.model(MODEL_NAME)
@pytest.mark.skip(reason="Flaky, temporarily disabled") @pytest.mark.skip(reason="Flaky, temporarily disabled")
def test_mocker_kv_router_overload_503( def test_mocker_kv_router_overload_503(
request, runtime_services_session, predownload_tokenizers request, runtime_services_session, predownload_tokenizers
...@@ -434,11 +419,6 @@ def test_mocker_kv_router_overload_503( ...@@ -434,11 +419,6 @@ def test_mocker_kv_router_overload_503(
mockers.__exit__(None, None, None) mockers.__exit__(None, None, None)
@pytest.mark.pre_merge
@pytest.mark.gpu_0
@pytest.mark.integration
@pytest.mark.parallel
@pytest.mark.model(MODEL_NAME)
def test_kv_push_router_bindings( def test_kv_push_router_bindings(
request, runtime_services_session, predownload_tokenizers request, runtime_services_session, predownload_tokenizers
): ):
...@@ -475,11 +455,6 @@ def test_kv_push_router_bindings( ...@@ -475,11 +455,6 @@ def test_kv_push_router_bindings(
mockers.__exit__(None, None, None) mockers.__exit__(None, None, None)
@pytest.mark.pre_merge
@pytest.mark.gpu_0
@pytest.mark.integration
@pytest.mark.parallel
@pytest.mark.model(MODEL_NAME)
@pytest.mark.parametrize("store_backend", ["etcd", "file"]) @pytest.mark.parametrize("store_backend", ["etcd", "file"])
def test_indexers_sync( def test_indexers_sync(
request, request,
...@@ -529,11 +504,6 @@ def test_indexers_sync( ...@@ -529,11 +504,6 @@ def test_indexers_sync(
mockers.__exit__(None, None, None) mockers.__exit__(None, None, None)
@pytest.mark.pre_merge
@pytest.mark.gpu_0
@pytest.mark.integration
@pytest.mark.parallel
@pytest.mark.model(MODEL_NAME)
def test_query_instance_id_returns_worker_and_tokens( def test_query_instance_id_returns_worker_and_tokens(
request, runtime_services_session, predownload_tokenizers request, runtime_services_session, predownload_tokenizers
): ):
...@@ -568,11 +538,6 @@ def test_query_instance_id_returns_worker_and_tokens( ...@@ -568,11 +538,6 @@ def test_query_instance_id_returns_worker_and_tokens(
mockers.__exit__(None, None, None) mockers.__exit__(None, None, None)
@pytest.mark.pre_merge
@pytest.mark.gpu_0
@pytest.mark.integration
@pytest.mark.parallel
@pytest.mark.model(MODEL_NAME)
def test_router_decisions(request, runtime_services_session, predownload_tokenizers): def test_router_decisions(request, runtime_services_session, predownload_tokenizers):
"""Validate KV cache prefix reuse and dp_rank routing by sending progressive requests with overlapping prefixes.""" """Validate KV cache prefix reuse and dp_rank routing by sending progressive requests with overlapping prefixes."""
...@@ -612,9 +577,6 @@ def test_router_decisions(request, runtime_services_session, predownload_tokeniz ...@@ -612,9 +577,6 @@ def test_router_decisions(request, runtime_services_session, predownload_tokeniz
mockers.__exit__(None, None, None) mockers.__exit__(None, None, None)
@pytest.mark.pre_merge
@pytest.mark.parallel
@pytest.mark.model(MODEL_NAME)
def test_router_disagg_decisions( def test_router_disagg_decisions(
request, runtime_services_session, predownload_tokenizers request, runtime_services_session, predownload_tokenizers
): ):
...@@ -680,11 +642,6 @@ def test_router_disagg_decisions( ...@@ -680,11 +642,6 @@ def test_router_disagg_decisions(
prefill_workers.__exit__(None, None, None) prefill_workers.__exit__(None, None, None)
@pytest.mark.pre_merge
@pytest.mark.gpu_0
@pytest.mark.integration
@pytest.mark.parallel
@pytest.mark.model(MODEL_NAME)
def test_busy_threshold_endpoint( def test_busy_threshold_endpoint(
request, runtime_services_session, predownload_tokenizers request, runtime_services_session, predownload_tokenizers
): ):
......
...@@ -18,6 +18,13 @@ from tests.utils.managed_process import ManagedProcess ...@@ -18,6 +18,13 @@ from tests.utils.managed_process import ManagedProcess
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
pytestmark = [
pytest.mark.pre_merge,
pytest.mark.e2e,
pytest.mark.vllm,
pytest.mark.model(MODEL_NAME),
]
SPEEDUP_RATIO = 10.0 SPEEDUP_RATIO = 10.0
PORTS = [ PORTS = [
8011, 8011,
...@@ -269,11 +276,8 @@ class VLLMProcess: ...@@ -269,11 +276,8 @@ class VLLMProcess:
time.sleep(2) time.sleep(2)
@pytest.mark.e2e
@pytest.mark.gpu_1 @pytest.mark.gpu_1
@pytest.mark.vllm
@pytest.mark.skip(reason="All vLLM tests disabled for now") @pytest.mark.skip(reason="All vLLM tests disabled for now")
@pytest.mark.model(MODEL_NAME)
def test_vllm_kv_router_basic(request, runtime_services, predownload_tokenizers): def test_vllm_kv_router_basic(request, runtime_services, predownload_tokenizers):
""" """
Quick e2e sanity test for KV router with vLLM engine instances. Quick e2e sanity test for KV router with vLLM engine instances.
...@@ -319,11 +323,8 @@ def test_vllm_kv_router_basic(request, runtime_services, predownload_tokenizers) ...@@ -319,11 +323,8 @@ def test_vllm_kv_router_basic(request, runtime_services, predownload_tokenizers)
vllm_workers.__exit__(None, None, None) vllm_workers.__exit__(None, None, None)
@pytest.mark.e2e
@pytest.mark.vllm
@pytest.mark.gpu_1 @pytest.mark.gpu_1
@pytest.mark.skip(reason="All vLLM tests disabled for now") @pytest.mark.skip(reason="All vLLM tests disabled for now")
@pytest.mark.model(MODEL_NAME)
def test_router_decisions_vllm_multiple_workers( def test_router_decisions_vllm_multiple_workers(
request, runtime_services, predownload_tokenizers request, runtime_services, predownload_tokenizers
): ):
...@@ -371,11 +372,8 @@ def test_router_decisions_vllm_multiple_workers( ...@@ -371,11 +372,8 @@ def test_router_decisions_vllm_multiple_workers(
vllm_workers.__exit__(None, None, None) vllm_workers.__exit__(None, None, None)
@pytest.mark.e2e
@pytest.mark.vllm
@pytest.mark.gpu_2 @pytest.mark.gpu_2
@pytest.mark.skip(reason="All vLLM tests disabled for now") @pytest.mark.skip(reason="All vLLM tests disabled for now")
@pytest.mark.model(MODEL_NAME)
def test_router_decisions_vllm_dp(request, runtime_services, predownload_tokenizers): def test_router_decisions_vllm_dp(request, runtime_services, predownload_tokenizers):
"""Validate KV cache prefix reuse with vLLM by sending progressive requests with overlapping prefixes. """Validate KV cache prefix reuse with vLLM by sending progressive requests with overlapping prefixes.
Same flow as test_router_decisions_vllm_multiple_workers; force first request to (worker_id, dp_rank=1). Same flow as test_router_decisions_vllm_multiple_workers; force first request to (worker_id, dp_rank=1).
......
...@@ -44,7 +44,7 @@ sglang_configs = { ...@@ -44,7 +44,7 @@ sglang_configs = {
name="aggregated", name="aggregated",
directory=sglang_dir, directory=sglang_dir,
script_name="agg.sh", script_name="agg.sh",
marks=[pytest.mark.gpu_1], marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
env={}, env={},
models_port=8000, models_port=8000,
...@@ -73,7 +73,11 @@ sglang_configs = { ...@@ -73,7 +73,11 @@ sglang_configs = {
name="disaggregated_same_gpu", name="disaggregated_same_gpu",
directory=sglang_dir, directory=sglang_dir,
script_name="disagg_same_gpu.sh", script_name="disagg_same_gpu.sh",
marks=[pytest.mark.gpu_1, pytest.mark.skip(reason="unstable")], marks=[
pytest.mark.gpu_1,
pytest.mark.pre_merge,
pytest.mark.skip(reason="unstable"),
],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
env={}, env={},
models_port=8000, models_port=8000,
...@@ -116,7 +120,7 @@ sglang_configs = { ...@@ -116,7 +120,7 @@ sglang_configs = {
name="template_verification", name="template_verification",
directory=SERVE_TEST_DIR, # special directory for test-specific scripts directory=SERVE_TEST_DIR, # special directory for test-specific scripts
script_name="template_verifier.sh", script_name="template_verifier.sh",
marks=[pytest.mark.gpu_1, pytest.mark.nightly], marks=[pytest.mark.gpu_1, pytest.mark.pre_merge, pytest.mark.nightly],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
env={}, env={},
models_port=8000, models_port=8000,
...@@ -159,7 +163,7 @@ sglang_configs = { ...@@ -159,7 +163,7 @@ sglang_configs = {
name="embedding_agg", name="embedding_agg",
directory=sglang_dir, directory=sglang_dir,
script_name="agg_embed.sh", script_name="agg_embed.sh",
marks=[pytest.mark.gpu_1, pytest.mark.nightly], marks=[pytest.mark.gpu_1, pytest.mark.pre_merge, pytest.mark.nightly],
model="Qwen/Qwen3-Embedding-4B", model="Qwen/Qwen3-Embedding-4B",
delayed_start=0, delayed_start=0,
timeout=180, timeout=180,
......
...@@ -40,7 +40,7 @@ trtllm_configs = { ...@@ -40,7 +40,7 @@ trtllm_configs = {
name="aggregated", name="aggregated",
directory=trtllm_dir, directory=trtllm_dir,
script_name="agg_metrics.sh", script_name="agg_metrics.sh",
marks=[pytest.mark.gpu_1, pytest.mark.trtllm], marks=[pytest.mark.gpu_1, pytest.mark.pre_merge, pytest.mark.trtllm],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
models_port=8000, models_port=8000,
request_payloads=[ request_payloads=[
...@@ -65,7 +65,7 @@ trtllm_configs = { ...@@ -65,7 +65,7 @@ trtllm_configs = {
name="disaggregated_same_gpu", name="disaggregated_same_gpu",
directory=trtllm_dir, directory=trtllm_dir,
script_name="disagg_same_gpu.sh", script_name="disagg_same_gpu.sh",
marks=[pytest.mark.gpu_1, pytest.mark.trtllm], marks=[pytest.mark.gpu_1, pytest.mark.pre_merge, pytest.mark.trtllm],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
models_port=8000, models_port=8000,
request_payloads=[ request_payloads=[
...@@ -79,7 +79,7 @@ trtllm_configs = { ...@@ -79,7 +79,7 @@ trtllm_configs = {
name="aggregated_router", name="aggregated_router",
directory=trtllm_dir, directory=trtllm_dir,
script_name="agg_router.sh", script_name="agg_router.sh",
marks=[pytest.mark.gpu_1, pytest.mark.trtllm, pytest.mark.post_merge], marks=[pytest.mark.gpu_1, pytest.mark.pre_merge, pytest.mark.trtllm],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
models_port=8000, models_port=8000,
request_payloads=[ request_payloads=[
......
...@@ -43,7 +43,7 @@ vllm_configs = { ...@@ -43,7 +43,7 @@ vllm_configs = {
name="aggregated", name="aggregated",
directory=vllm_dir, directory=vllm_dir,
script_name="agg.sh", script_name="agg.sh",
marks=[pytest.mark.gpu_1], marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
request_payloads=[ request_payloads=[
chat_payload_default(), chat_payload_default(),
...@@ -55,7 +55,7 @@ vllm_configs = { ...@@ -55,7 +55,7 @@ vllm_configs = {
name="aggregated_lmcache", name="aggregated_lmcache",
directory=vllm_dir, directory=vllm_dir,
script_name="agg_lmcache.sh", script_name="agg_lmcache.sh",
marks=[pytest.mark.gpu_1], marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
request_payloads=[ request_payloads=[
chat_payload_default(), chat_payload_default(),
...@@ -68,7 +68,7 @@ vllm_configs = { ...@@ -68,7 +68,7 @@ vllm_configs = {
name="agg-request-plane-tcp", name="agg-request-plane-tcp",
directory=vllm_dir, directory=vllm_dir,
script_name="agg_request_planes.sh", script_name="agg_request_planes.sh",
marks=[pytest.mark.gpu_1], marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
script_args=["--tcp"], script_args=["--tcp"],
request_payloads=[ request_payloads=[
...@@ -80,7 +80,7 @@ vllm_configs = { ...@@ -80,7 +80,7 @@ vllm_configs = {
name="agg-request-plane-http", name="agg-request-plane-http",
directory=vllm_dir, directory=vllm_dir,
script_name="agg_request_planes.sh", script_name="agg_request_planes.sh",
marks=[pytest.mark.gpu_1], marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
script_args=["--http"], script_args=["--http"],
request_payloads=[ request_payloads=[
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment