Unverified Commit d821a8b9 authored by Yan Ru Pei's avatar Yan Ru Pei Committed by GitHub
Browse files

chore: parallelize planner profile tests + bindings test cleanup (#4532)


Signed-off-by: default avatarPeaBrane <yanrpei@gmail.com>
parent 65f18884
...@@ -29,10 +29,7 @@ pytestmark = pytest.mark.pre_merge ...@@ -29,10 +29,7 @@ pytestmark = pytest.mark.pre_merge
@pytest.fixture @pytest.fixture
async def distributed_runtime(): async def distributed_runtime():
"""Function-scoped runtime fixture for use with @pytest.mark.forked tests. """Function-scoped runtime fixture for distributed runtime tests."""
Each test gets its own runtime in a forked process to avoid singleton conflicts.
"""
loop = asyncio.get_running_loop() loop = asyncio.get_running_loop()
runtime = DistributedRuntime(loop, "etcd", "nats") runtime = DistributedRuntime(loop, "etcd", "nats")
yield runtime yield runtime
...@@ -40,7 +37,6 @@ async def distributed_runtime(): ...@@ -40,7 +37,6 @@ async def distributed_runtime():
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.forked
async def test_radix_tree_binding(distributed_runtime): async def test_radix_tree_binding(distributed_runtime):
"""Test RadixTree binding directly with store event and find matches""" """Test RadixTree binding directly with store event and find matches"""
import json import json
...@@ -107,7 +103,6 @@ async def test_radix_tree_binding(distributed_runtime): ...@@ -107,7 +103,6 @@ async def test_radix_tree_binding(distributed_runtime):
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.forked
@pytest.mark.parametrize("num_threads", [2, 3, 5, 128]) @pytest.mark.parametrize("num_threads", [2, 3, 5, 128])
@pytest.mark.parametrize("prepopulate_worker_ids", [True, False]) @pytest.mark.parametrize("prepopulate_worker_ids", [True, False])
@pytest.mark.parametrize("expiration_duration_secs", [None]) @pytest.mark.parametrize("expiration_duration_secs", [None])
...@@ -209,15 +204,7 @@ async def test_radix_tree_thread_safety( ...@@ -209,15 +204,7 @@ async def test_radix_tree_thread_safety(
), f"Expected {expected_blocks_after_removal} block events after removal, got {len(blocks_after_removal)}" ), f"Expected {expected_blocks_after_removal} block events after removal, got {len(blocks_after_removal)}"
# TODO Figure out how to test with different kv_block_size
# Right now I get an error in EventPublisher init when I run this test
# back to back. It occurs when calling dynamo_llm_init and I think is related to the
# OnceCell initializations not being reset.
# The test works individually if I run it with 32, then 11, then 64.
# @pytest.mark.parametrize("kv_block_size", [11, 32, 64])
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.forked
@pytest.mark.skip(reason="Flakey in CI. Likely race condition going on.")
async def test_event_handler(distributed_runtime): async def test_event_handler(distributed_runtime):
kv_block_size = 32 kv_block_size = 32
namespace = "kv_test" namespace = "kv_test"
...@@ -225,7 +212,10 @@ async def test_event_handler(distributed_runtime): ...@@ -225,7 +212,10 @@ async def test_event_handler(distributed_runtime):
kv_listener = distributed_runtime.namespace(namespace).component(component) kv_listener = distributed_runtime.namespace(namespace).component(component)
# publisher # publisher
worker_id = 233 # Get actual worker_id from component (KvEventPublisher ignores the passed worker_id and uses component's connection_id)
# Create a dummy endpoint to access connection_id since Component doesn't expose it directly
dummy_endpoint = kv_listener.endpoint("dummy")
worker_id = dummy_endpoint.connection_id()
event_publisher = EventPublisher(kv_listener, worker_id, kv_block_size) event_publisher = EventPublisher(kv_listener, worker_id, kv_block_size)
# indexer # indexer
...@@ -237,44 +227,26 @@ async def test_event_handler(distributed_runtime): ...@@ -237,44 +227,26 @@ async def test_event_handler(distributed_runtime):
assert not scores.scores assert not scores.scores
event_publisher.store_event(test_token, lora_id) event_publisher.store_event(test_token, lora_id)
# wait for the event to be processed as it is sent asynchronously # Wait for the event to be processed (sent asynchronously)
# Retry loop for CI environments where processing may take longer await asyncio.sleep(0.2)
scores = await indexer.find_matches_for_request(test_token, lora_id)
worker_key = (worker_id, 0) # (worker_id, dp_rank) worker_key = (worker_id, 0) # (worker_id, dp_rank)
for retry in range(10): # Try up to 10 times assert scores.scores, "No scores found"
await asyncio.sleep(0.5) # Wait 500ms between retries assert worker_key in scores.scores, f"Worker {worker_key} not found in scores"
scores = await indexer.find_matches_for_request(test_token, lora_id) assert (
if ( scores.scores[worker_key] == 1
scores.scores ), f"Expected score 1, got {scores.scores[worker_key]}"
and worker_key in scores.scores
and scores.scores[worker_key] == 1 # Remove event and verify
):
break
if retry == 9: # Last iteration
# Provide detailed error message for debugging
assert scores.scores, f"No scores found after {(retry+1)*0.5}s"
assert (
worker_key in scores.scores
), f"Worker {worker_key} not in scores after {(retry+1)*0.5}s"
assert (
scores.scores[worker_key] == 1
), f"Expected score 1, got {scores.scores.get(worker_key)} after {(retry+1)*0.5}s"
# remove event
event_publisher.remove_event() event_publisher.remove_event()
# Retry loop for event removal verification await asyncio.sleep(0.2)
for retry in range(10): # Try up to 10 times
await asyncio.sleep(0.5) # Wait 500ms between retries scores = await indexer.find_matches_for_request(test_token, lora_id)
scores = await indexer.find_matches_for_request(test_token, lora_id) assert not scores.scores, f"Scores still present: {scores.scores}"
if not scores.scores:
break
if retry == 9: # Last iteration
assert (
not scores.scores
), f"Scores still present after {(retry+1)*0.5}s: {scores.scores}"
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.forked
async def test_approx_kv_indexer(distributed_runtime): async def test_approx_kv_indexer(distributed_runtime):
kv_block_size = 32 kv_block_size = 32
namespace = "kv_test" namespace = "kv_test"
......
...@@ -5,15 +5,16 @@ ...@@ -5,15 +5,16 @@
import os import os
import pytest
import uvloop import uvloop
from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_llm from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_llm
from dynamo.runtime import DistributedRuntime, dynamo_worker from dynamo.runtime import DistributedRuntime
TEST_END_TO_END = os.environ.get("TEST_END_TO_END", 0) TEST_END_TO_END = os.environ.get("TEST_END_TO_END", 0)
@dynamo_worker() @pytest.mark.asyncio
async def test_register(runtime: DistributedRuntime): async def test_register(runtime: DistributedRuntime):
component = runtime.namespace("test").component("tensor") component = runtime.namespace("test").component("tensor")
......
...@@ -37,15 +37,16 @@ class TestProfileSlaAiconfigurator: ...@@ -37,15 +37,16 @@ class TestProfileSlaAiconfigurator:
"""Test class for profile_sla aiconfigurator functionality.""" """Test class for profile_sla aiconfigurator functionality."""
@pytest.fixture @pytest.fixture
def trtllm_args(self): def trtllm_args(self, request):
class Args: class Args:
def __init__(self): def __init__(self):
self.model = "" self.model = ""
self.dgd_image = "" self.dgd_image = ""
self.backend = "trtllm" self.backend = "trtllm"
self.config = "examples/backends/trtllm/deploy/disagg.yaml" self.config = "examples/backends/trtllm/deploy/disagg.yaml"
self.output_dir = "/tmp/test_profiling_results" # Use unique output directory per test for parallel execution
self.namespace = "test-namespace" self.output_dir = f"/tmp/test_profiling_results_{request.node.name}"
self.namespace = f"test-namespace-{request.node.name}"
self.min_num_gpus_per_engine = 1 self.min_num_gpus_per_engine = 1
self.max_num_gpus_per_engine = 8 self.max_num_gpus_per_engine = 8
self.skip_existing_results = False self.skip_existing_results = False
...@@ -76,6 +77,7 @@ class TestProfileSlaAiconfigurator: ...@@ -76,6 +77,7 @@ class TestProfileSlaAiconfigurator:
return Args() return Args()
@pytest.mark.pre_merge @pytest.mark.pre_merge
@pytest.mark.parallel
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("missing_arg", ["aic_system", "aic_hf_id"]) @pytest.mark.parametrize("missing_arg", ["aic_system", "aic_hf_id"])
async def test_aiconfigurator_missing_args(self, trtllm_args, missing_arg): async def test_aiconfigurator_missing_args(self, trtllm_args, missing_arg):
...@@ -86,6 +88,7 @@ class TestProfileSlaAiconfigurator: ...@@ -86,6 +88,7 @@ class TestProfileSlaAiconfigurator:
await run_profile(trtllm_args) await run_profile(trtllm_args)
@pytest.mark.pre_merge @pytest.mark.pre_merge
@pytest.mark.parallel
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize( @pytest.mark.parametrize(
"arg_name, bad_value", "arg_name, bad_value",
...@@ -103,11 +106,13 @@ class TestProfileSlaAiconfigurator: ...@@ -103,11 +106,13 @@ class TestProfileSlaAiconfigurator:
await run_profile(trtllm_args) await run_profile(trtllm_args)
@pytest.mark.pre_merge @pytest.mark.pre_merge
@pytest.mark.parallel
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_trtllm_aiconfigurator_single_model(self, trtllm_args): async def test_trtllm_aiconfigurator_single_model(self, trtllm_args):
# Test that profile_sla works with the model & backend in the trtllm_args fixture. # Test that profile_sla works with the model & backend in the trtllm_args fixture.
await run_profile(trtllm_args) await run_profile(trtllm_args)
@pytest.mark.parallel
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize( @pytest.mark.parametrize(
"backend, aic_backend_version", "backend, aic_backend_version",
......
...@@ -41,15 +41,16 @@ class TestProfileSLADryRun: ...@@ -41,15 +41,16 @@ class TestProfileSLADryRun:
"""Test class for profile_sla dry-run functionality.""" """Test class for profile_sla dry-run functionality."""
@pytest.fixture @pytest.fixture
def vllm_args(self): def vllm_args(self, request):
"""Create arguments for vllm backend dry-run test.""" """Create arguments for vllm backend dry-run test."""
class Args: class Args:
def __init__(self): def __init__(self):
self.backend = "vllm" self.backend = "vllm"
self.config = "examples/backends/vllm/deploy/disagg.yaml" self.config = "examples/backends/vllm/deploy/disagg.yaml"
self.output_dir = "/tmp/test_profiling_results" # Use unique output directory per test for parallel execution
self.namespace = "test-namespace" self.output_dir = f"/tmp/test_profiling_results_{request.node.name}"
self.namespace = f"test-namespace-{request.node.name}"
self.model = "" self.model = ""
self.dgd_image = "" self.dgd_image = ""
self.min_num_gpus_per_engine = 1 self.min_num_gpus_per_engine = 1
...@@ -83,15 +84,16 @@ class TestProfileSLADryRun: ...@@ -83,15 +84,16 @@ class TestProfileSLADryRun:
return Args() return Args()
@pytest.fixture @pytest.fixture
def sglang_args(self): def sglang_args(self, request):
"""Create arguments for sglang backend dry-run test.""" """Create arguments for sglang backend dry-run test."""
class Args: class Args:
def __init__(self): def __init__(self):
self.backend = "sglang" self.backend = "sglang"
self.config = "examples/backends/sglang/deploy/disagg.yaml" self.config = "examples/backends/sglang/deploy/disagg.yaml"
self.output_dir = "/tmp/test_profiling_results" # Use unique output directory per test for parallel execution
self.namespace = "test-namespace" self.output_dir = f"/tmp/test_profiling_results_{request.node.name}"
self.namespace = f"test-namespace-{request.node.name}"
self.model = "" self.model = ""
self.dgd_image = "" self.dgd_image = ""
self.min_num_gpus_per_engine = 1 self.min_num_gpus_per_engine = 1
...@@ -124,6 +126,7 @@ class TestProfileSLADryRun: ...@@ -124,6 +126,7 @@ class TestProfileSLADryRun:
return Args() return Args()
@pytest.mark.pre_merge @pytest.mark.pre_merge
@pytest.mark.parallel
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_vllm_dryrun(self, vllm_args): async def test_vllm_dryrun(self, vllm_args):
"""Test that profile_sla dry-run works for vllm backend with disagg.yaml config.""" """Test that profile_sla dry-run works for vllm backend with disagg.yaml config."""
...@@ -131,6 +134,7 @@ class TestProfileSLADryRun: ...@@ -131,6 +134,7 @@ class TestProfileSLADryRun:
await run_profile(vllm_args) await run_profile(vllm_args)
@pytest.mark.pre_merge @pytest.mark.pre_merge
@pytest.mark.parallel
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_sglang_dryrun(self, sglang_args): async def test_sglang_dryrun(self, sglang_args):
"""Test that profile_sla dry-run works for sglang backend with disagg.yaml config.""" """Test that profile_sla dry-run works for sglang backend with disagg.yaml config."""
...@@ -138,15 +142,16 @@ class TestProfileSLADryRun: ...@@ -138,15 +142,16 @@ class TestProfileSLADryRun:
await run_profile(sglang_args) await run_profile(sglang_args)
@pytest.fixture @pytest.fixture
def trtllm_args(self): def trtllm_args(self, request):
"""Create arguments for trtllm backend dry-run test.""" """Create arguments for trtllm backend dry-run test."""
class Args: class Args:
def __init__(self): def __init__(self):
self.backend = "trtllm" self.backend = "trtllm"
self.config = "examples/backends/trtllm/deploy/disagg.yaml" self.config = "examples/backends/trtllm/deploy/disagg.yaml"
self.output_dir = "/tmp/test_profiling_results" # Use unique output directory per test for parallel execution
self.namespace = "test-namespace" self.output_dir = f"/tmp/test_profiling_results_{request.node.name}"
self.namespace = f"test-namespace-{request.node.name}"
self.model = "" self.model = ""
self.dgd_image = "" self.dgd_image = ""
self.min_num_gpus_per_engine = 1 self.min_num_gpus_per_engine = 1
...@@ -179,6 +184,7 @@ class TestProfileSLADryRun: ...@@ -179,6 +184,7 @@ class TestProfileSLADryRun:
return Args() return Args()
@pytest.mark.pre_merge @pytest.mark.pre_merge
@pytest.mark.parallel
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_trtllm_dryrun(self, trtllm_args): async def test_trtllm_dryrun(self, trtllm_args):
"""Test that profile_sla dry-run works for trtllm backend with disagg.yaml config.""" """Test that profile_sla dry-run works for trtllm backend with disagg.yaml config."""
...@@ -186,15 +192,16 @@ class TestProfileSLADryRun: ...@@ -186,15 +192,16 @@ class TestProfileSLADryRun:
await run_profile(trtllm_args) await run_profile(trtllm_args)
@pytest.fixture @pytest.fixture
def sglang_moe_args(self): def sglang_moe_args(self, request):
"""Create arguments for trtllm backend dry-run test.""" """Create arguments for trtllm backend dry-run test."""
class Args: class Args:
def __init__(self): def __init__(self):
self.backend = "sglang" self.backend = "sglang"
self.config = "recipes/deepseek-r1/sglang/disagg-16gpu/deploy.yaml" self.config = "recipes/deepseek-r1/sglang/disagg-16gpu/deploy.yaml"
self.output_dir = "/tmp/test_profiling_results" # Use unique output directory per test for parallel execution
self.namespace = "test-namespace" self.output_dir = f"/tmp/test_profiling_results_{request.node.name}"
self.namespace = f"test-namespace-{request.node.name}"
self.model = "" self.model = ""
self.dgd_image = "" self.dgd_image = ""
self.min_num_gpus_per_engine = 8 self.min_num_gpus_per_engine = 8
...@@ -228,6 +235,7 @@ class TestProfileSLADryRun: ...@@ -228,6 +235,7 @@ class TestProfileSLADryRun:
return Args() return Args()
@pytest.mark.pre_merge @pytest.mark.pre_merge
@pytest.mark.parallel
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_sglang_moe_dryrun(self, sglang_moe_args): async def test_sglang_moe_dryrun(self, sglang_moe_args):
"""Test that profile_sla dry-run works for sglang backend with MoE config.""" """Test that profile_sla dry-run works for sglang backend with MoE config."""
...@@ -255,15 +263,16 @@ class TestProfileSLADryRun: ...@@ -255,15 +263,16 @@ class TestProfileSLADryRun:
) )
@pytest.fixture @pytest.fixture
def vllm_args_with_model_autogen(self): def vllm_args_with_model_autogen(self, request):
"""Create arguments for vllm backend with model-based search space autogeneration.""" """Create arguments for vllm backend with model-based search space autogeneration."""
class Args: class Args:
def __init__(self): def __init__(self):
self.backend = "vllm" self.backend = "vllm"
self.config = "" self.config = ""
self.output_dir = "/tmp/test_profiling_results" # Use unique output directory per test for parallel execution
self.namespace = "test-namespace" self.output_dir = f"/tmp/test_profiling_results_{request.node.name}"
self.namespace = f"test-namespace-{request.node.name}"
self.model = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" # Specify model for autogen self.model = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" # Specify model for autogen
self.dgd_image = "" self.dgd_image = ""
# Set to 0 to trigger auto-generation path # Set to 0 to trigger auto-generation path
...@@ -293,6 +302,7 @@ class TestProfileSLADryRun: ...@@ -293,6 +302,7 @@ class TestProfileSLADryRun:
return Args() return Args()
@pytest.mark.pre_merge @pytest.mark.pre_merge
@pytest.mark.parallel
@pytest.mark.asyncio @pytest.mark.asyncio
@patch("benchmarks.profiler.utils.search_space_autogen.get_gpu_summary") @patch("benchmarks.profiler.utils.search_space_autogen.get_gpu_summary")
@patch("benchmarks.profiler.utils.search_space_autogen.get_model_info") @patch("benchmarks.profiler.utils.search_space_autogen.get_model_info")
...@@ -319,15 +329,16 @@ class TestProfileSLADryRun: ...@@ -319,15 +329,16 @@ class TestProfileSLADryRun:
await run_profile(vllm_args_with_model_autogen) await run_profile(vllm_args_with_model_autogen)
@pytest.fixture @pytest.fixture
def sglang_args_with_model_autogen(self): def sglang_args_with_model_autogen(self, request):
"""Create arguments for sglang backend with model-based search space autogeneration.""" """Create arguments for sglang backend with model-based search space autogeneration."""
class Args: class Args:
def __init__(self): def __init__(self):
self.backend = "sglang" self.backend = "sglang"
self.config = "" self.config = ""
self.output_dir = "/tmp/test_profiling_results" # Use unique output directory per test for parallel execution
self.namespace = "test-namespace" self.output_dir = f"/tmp/test_profiling_results_{request.node.name}"
self.namespace = f"test-namespace-{request.node.name}"
self.model = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" # Specify model for autogen self.model = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" # Specify model for autogen
self.dgd_image = "" self.dgd_image = ""
self.min_num_gpus_per_engine = 0 self.min_num_gpus_per_engine = 0
...@@ -355,6 +366,7 @@ class TestProfileSLADryRun: ...@@ -355,6 +366,7 @@ class TestProfileSLADryRun:
return Args() return Args()
@pytest.mark.pre_merge @pytest.mark.pre_merge
@pytest.mark.parallel
@pytest.mark.asyncio @pytest.mark.asyncio
@patch("benchmarks.profiler.utils.search_space_autogen.get_gpu_summary") @patch("benchmarks.profiler.utils.search_space_autogen.get_gpu_summary")
@patch("benchmarks.profiler.utils.search_space_autogen.get_model_info") @patch("benchmarks.profiler.utils.search_space_autogen.get_model_info")
...@@ -381,15 +393,16 @@ class TestProfileSLADryRun: ...@@ -381,15 +393,16 @@ class TestProfileSLADryRun:
await run_profile(sglang_args_with_model_autogen) await run_profile(sglang_args_with_model_autogen)
@pytest.fixture @pytest.fixture
def trtllm_args_with_model_autogen(self): def trtllm_args_with_model_autogen(self, request):
"""Create arguments for trtllm backend with model-based search space autogeneration.""" """Create arguments for trtllm backend with model-based search space autogeneration."""
class Args: class Args:
def __init__(self): def __init__(self):
self.backend = "trtllm" self.backend = "trtllm"
self.config = "" self.config = ""
self.output_dir = "/tmp/test_profiling_results" # Use unique output directory per test for parallel execution
self.namespace = "test-namespace" self.output_dir = f"/tmp/test_profiling_results_{request.node.name}"
self.namespace = f"test-namespace-{request.node.name}"
self.model = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" # Specify model for autogen self.model = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" # Specify model for autogen
self.dgd_image = "" self.dgd_image = ""
self.min_num_gpus_per_engine = 0 self.min_num_gpus_per_engine = 0
...@@ -417,6 +430,7 @@ class TestProfileSLADryRun: ...@@ -417,6 +430,7 @@ class TestProfileSLADryRun:
return Args() return Args()
@pytest.mark.pre_merge @pytest.mark.pre_merge
@pytest.mark.parallel
@pytest.mark.asyncio @pytest.mark.asyncio
@patch("benchmarks.profiler.utils.search_space_autogen.get_gpu_summary") @patch("benchmarks.profiler.utils.search_space_autogen.get_gpu_summary")
@patch("benchmarks.profiler.utils.search_space_autogen.get_model_info") @patch("benchmarks.profiler.utils.search_space_autogen.get_model_info")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment