]# Frontend ports: use PORTS[0] for single router, PORTS for multi-router
# Request-count constant used by the tests in this module.
NUM_REQUESTS: int = 10

# KV cache page size; SGLang calls this "page_size" rather than "block_size".
PAGE_SIZE: int = 16
# Request payload shared by all tests: a single user message, with streaming
# enabled and "max_tokens" set to 10.
TEST_PAYLOAD: Dict[str, Any] = {
    "model": MODEL_NAME,
    "messages": [
        {
            "role": "user",
            # Adjacent literals are concatenated into one prompt string; each
            # piece keeps its trailing space so the text is unchanged.
            "content": (
                "In a quiet meadow tucked between rolling hills, a plump gray rabbit nibbled on clover beneath the shade of a gnarled oak tree. "
                "Its ears twitched at the faint rustle of leaves, but it remained calm, confident in the safety of its burrow just a few hops away. "
                "The late afternoon sun warmed its fur, and tiny dust motes danced in the golden light as bees hummed lazily nearby. "
                "Though the rabbit lived a simple life, every day was an adventure of scents, shadows, and snacks—an endless search for the tastiest patch of greens and the softest spot to nap."
            ),
        }
    ],
    "stream": True,
    "max_tokens": 10,
}
# SGLang configuration shared by all tests.
# "mem_fraction_static" caps the actual VRAM allocation, which is required when
# several workers run on the same GPU.
SGLANG_ARGS: Dict[str, Any] = {
    "page_size": PAGE_SIZE,
    "model": MODEL_NAME,
    # Limit VRAM allocation per worker (equivalent to vLLM's
    # gpu_memory_utilization).
    "mem_fraction_static": 0.4,
    # Limit context length to reduce KV cache size (equivalent to vLLM's
    # max_model_len).
    "context_length": 1024,
    # Disable CUDA graphs for faster startup and lower memory (equivalent to
    # vLLM's enforce_eager).
    "disable_cuda_graph": True,
}
classSGLangProcess:
"""Manages SGLang workers using dynamo.sglang (HTTP API + KV events).
This is a drop-in replacement for MockerProcess that uses real SGLang workers.
The key difference: dynamo.sglang automatically handles:
- HTTP API serving
- KV cache event publishing (ZMQ → NATS bridge)
- Integration with dynamo.frontend router
"""
def__init__(
self,
request,
sglang_args:Optional[Dict[str,Any]]=None,
num_workers:int=2,
single_gpu:bool=False,
data_parallel_size:Optional[int]=None,
):
"""Initialize SGLang workers with dynamo integration.
Args:
request: pytest request fixture for log directory
sglang_args: Configuration dict with keys:
- page_size: KV cache page size (default: 16)
- model: Model name/path (default: TinyLlama-1.1B)
- mem_fraction_static: Fraction of GPU memory to allocate (optional)
- context_length: Maximum sequence length (optional)
- disable_cuda_graph: Disable CUDA graphs (default: False)
num_workers: Number of SGLang worker processes
single_gpu: If True, all workers share GPU 0
data_parallel_size: If set, enables data parallelism with this many ranks (num_workers must equal data_parallel_size)