Unverified Commit 36f03d40 authored by Neelay Shah's avatar Neelay Shah Committed by GitHub
Browse files

test: fault tolerance tests (#1444)


Signed-off-by: default avatarNeelay Shah <neelays@nvidia.com>
Co-authored-by: default avatarcoderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
parent fb213a2f
...@@ -86,6 +86,23 @@ def create_circus_watcher( ...@@ -86,6 +86,23 @@ def create_circus_watcher(
use_sockets: bool = True, use_sockets: bool = True,
**kwargs: Any, **kwargs: Any,
) -> Watcher: ) -> Watcher:
log_dir = os.environ.get("DYN_CIRCUS_LOG_DIR", None)
if log_dir is not None:
prefix = f"{log_dir}/{name}"
os.makedirs(prefix, exist_ok=True)
stdout_stream = {
"class": "FileStream",
"filename": f"{prefix}/output.log",
"backup_count": 10,
}
stderr_stream = {
"class": "FileStream",
"filename": f"{prefix}/error.log",
"backup_count": 10,
}
else:
stdout_stream = None
stderr_stream = None
return Watcher( return Watcher(
name=name, name=name,
cmd=shlex.quote(cmd) if psutil.POSIX else cmd, cmd=shlex.quote(cmd) if psutil.POSIX else cmd,
...@@ -94,7 +111,10 @@ def create_circus_watcher( ...@@ -94,7 +111,10 @@ def create_circus_watcher(
stop_children=True, stop_children=True,
use_sockets=use_sockets, use_sockets=use_sockets,
graceful_timeout=86400, graceful_timeout=86400,
respawn=False, # TODO respawn=os.environ.get("DYN_CIRCUS_RESPAWN", "false").lower()
in ("true", "1", "yes"),
stdout_stream=stdout_stream,
stderr_stream=stderr_stream,
**kwargs, **kwargs,
) )
......
...@@ -162,6 +162,8 @@ markers = [ ...@@ -162,6 +162,8 @@ markers = [
"weekly: marks tests to run weekly", "weekly: marks tests to run weekly",
"gpu_1: marks tests to run on GPU", "gpu_1: marks tests to run on GPU",
"gpu_2: marks tests to run on 2GPUs", "gpu_2: marks tests to run on 2GPUs",
"gpu_4: marks tests to run on 4GPUs",
"gpu_8: marks tests to run on 8GPUs",
"e2e: marks tests as end-to-end tests", "e2e: marks tests as end-to-end tests",
"integration: marks tests as integration tests", "integration: marks tests as integration tests",
"unit: marks tests as unit tests", "unit: marks tests as unit tests",
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
import logging import logging
import os import os
import shutil
import tempfile import tempfile
import pytest import pytest
...@@ -23,15 +24,31 @@ from tests.utils.managed_process import ManagedProcess ...@@ -23,15 +24,31 @@ from tests.utils.managed_process import ManagedProcess
# Custom format inspired by your example # Custom format inspired by your example
LOG_FORMAT = "[TEST] %(asctime)s %(levelname)s %(name)s: %(message)s" LOG_FORMAT = "[TEST] %(asctime)s %(levelname)s %(name)s: %(message)s"
DATE_FORMAT = "%Y-%m-%dT%H:%M:%S"
# Configure logging # Configure logging
logging.basicConfig( logging.basicConfig(
level=logging.INFO, level=logging.INFO,
format=LOG_FORMAT, format=LOG_FORMAT,
datefmt="%Y-%m-%dT%H:%M:%S", # ISO 8601 UTC format datefmt=DATE_FORMAT, # ISO 8601 UTC format
) )
@pytest.fixture(autouse=True)
def logger(request):
    """Copy root-logger output into ``<test name>/test.log.txt`` for each test.

    The directory named after the test node is deleted and recreated before
    the test runs, then a ``FileHandler`` using ``LOG_FORMAT`` and
    ``DATE_FORMAT`` is attached to the root logger for the test's duration.
    """
    test_dir = request.node.name
    log_path = os.path.join(test_dir, "test.log.txt")
    root_logger = logging.getLogger()

    # Start from an empty directory so logs from an earlier run never leak in.
    shutil.rmtree(test_dir, ignore_errors=True)
    os.makedirs(test_dir, exist_ok=True)

    handler = logging.FileHandler(log_path, mode="w")
    handler.setFormatter(logging.Formatter(LOG_FORMAT, datefmt=DATE_FORMAT))
    root_logger.addHandler(handler)
    try:
        yield
    finally:
        # Detach before closing: a record emitted by another thread between
        # the two calls must not reach an already-closed handler (which, for
        # mode="w", could reopen and truncate the log on some Python versions).
        root_logger.removeHandler(handler)
        handler.close()
def pytest_collection_modifyitems(config, items): def pytest_collection_modifyitems(config, items):
""" """
This function is called to modify the list of tests to run. This function is called to modify the list of tests to run.
...@@ -69,7 +86,7 @@ class EtcdServer(ManagedProcess): ...@@ -69,7 +86,7 @@ class EtcdServer(ManagedProcess):
timeout=timeout, timeout=timeout,
display_output=False, display_output=False,
health_check_ports=[port], health_check_ports=[port],
data_dir=tempfile.mkdtemp(prefix="etcd_"), data_dir=data_dir,
log_dir=request.node.name, log_dir=request.node.name,
) )
......
This diff is collapsed.
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import logging
import os
import random
import time
from datetime import datetime
import requests
def _get_random_prompt(length):
    """Return ``length`` randomly chosen single digits joined by single spaces."""
    digits = list(map(str, range(10)))
    sampled = random.choices(digits, k=length)
    return " ".join(sampled)
def _single_request(
    url,
    payload,
    logger,
    retry_attempts=1,
    input_token_length=100,
    output_token_length=100,
    timeout=30,
    retry_delay=1,
):
    """POST one chat request with a random prompt, retrying on failure.

    Args:
        url: Endpoint the request is posted to.
        payload: Request template; ``payload["messages"][0]["content"]`` and
            ``payload["max_tokens"]`` are overridden in a private copy, so the
            caller's object is left untouched.
        logger: Logger used to report failed attempts.
        retry_attempts: Maximum number of attempts; values <= 0 make none.
        input_token_length: Number of random tokens placed in the prompt.
        output_token_length: Value sent as ``max_tokens``.
        timeout: Per-attempt timeout passed to ``requests.post``.
        retry_delay: Seconds to sleep between two attempts.

    Returns:
        dict with ``time`` (timestamp taken when the call finishes),
        ``results`` (one entry per attempt holding ``status``, ``result`` and
        ``request_elapsed_time``) and ``total_time`` in seconds. ``status`` is
        the HTTP status code, or the exception text when the attempt raised.
    """
    import copy  # local import: only this helper needs it

    # Work on a copy so repeated calls never rewrite the caller's template.
    request_body = copy.deepcopy(payload)
    request_body["messages"][0]["content"] = _get_random_prompt(input_token_length)
    request_body["max_tokens"] = output_token_length

    results = []
    start_time = time.time()
    attempts_left = retry_attempts
    # "> 0" rather than truthiness: a negative count must not loop forever.
    while attempts_left > 0:
        attempts_left -= 1
        start_request_time = time.time()
        try:
            response = requests.post(url, json=request_body, timeout=timeout)
        except requests.RequestException as e:  # Timeout is a subclass
            results.append(
                {
                    "status": str(e),
                    "result": None,
                    "request_elapsed_time": time.time() - start_request_time,
                }
            )
            if attempts_left > 0:
                logger.warning("Retrying due to Request failed: %s", e)
            else:
                logger.warning("Request failed: %s", e)
        else:
            elapsed = time.time() - start_request_time
            try:
                content = response.json()
            except ValueError:
                # Every JSON decode error requests can raise (stdlib json or
                # simplejson backed) derives from ValueError; a non-JSON body
                # is still recorded under its real HTTP status.
                content = None
            results.append(
                {
                    "status": response.status_code,
                    "result": content,
                    "request_elapsed_time": elapsed,
                }
            )
            if response.status_code == 200:
                break
        # Only wait when another attempt will actually follow.
        if attempts_left > 0:
            time.sleep(retry_delay)

    return {
        "time": datetime.now().strftime("%Y-%m-%dT%H:%M:%S"),
        "results": results,
        "total_time": time.time() - start_time,
    }
def client(
    deployment_graph,
    server_process,
    payload,
    log_dir,
    index,
    requests_per_client,
    input_token_length,
    output_token_length,
    max_retries,
    retry_delay=1,
):
    """Send ``requests_per_client`` requests and log each result as a JSON line.

    Results are written to ``<log_dir>/client_<index>.log.txt``, one JSON
    document per request, flushed after every write. Any exception ends the
    client; it is logged with its traceback rather than propagated.

    Args:
        deployment_graph: Object whose ``endpoints[0]`` is the URL path.
        server_process: Object whose ``port`` is the local port to contact.
        payload: Object whose ``payload_chat`` is the request template.
        log_dir: Directory receiving the client log file.
        index: Client number, used in the logger name and log file name.
        requests_per_client: Number of requests to issue.
        input_token_length: Prompt length forwarded to ``_single_request``.
        output_token_length: ``max_tokens`` forwarded to ``_single_request``.
        max_retries: Attempt budget forwarded as ``retry_attempts``.
        retry_delay: Seconds between attempts.
    """
    logger = logging.getLogger(f"CLIENT: {index}")
    try:
        log_path = os.path.join(log_dir, f"client_{index}.log.txt")
        with open(log_path, "w") as log:
            url = f"http://localhost:{server_process.port}/{deployment_graph.endpoints[0]}"
            for i in range(requests_per_client):
                result = _single_request(
                    url,
                    payload.payload_chat,
                    logger,
                    max_retries,
                    input_token_length=input_token_length,
                    output_token_length=output_token_length,
                    retry_delay=retry_delay,
                )
                attempts = result["results"]
                if attempts:
                    logger.info(
                        "Request: %s Status: %s Latency: %s",
                        i,
                        attempts[-1]["status"],
                        attempts[-1]["request_elapsed_time"],
                    )
                else:
                    # No attempt was made (e.g. max_retries <= 0); indexing
                    # [-1] here used to raise and abort the whole client.
                    logger.warning(
                        "Request: %s made no attempts (max_retries=%s)",
                        i,
                        max_retries,
                    )
                log.write(json.dumps(result) + "\n")
                log.flush()
    except Exception:
        # Best effort: never propagate, but keep the traceback for debugging.
        logger.exception("Client %s aborted", index)
    logger.info("Exiting")
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size: 64
max-model-len: 16384
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions
port: 8000
Processor:
router: round-robin
router-num-threads: 4
common-configs: [model, block-size, max-model-len]
VllmWorker:
enforce-eager: true
max-num-batched-tokens: 16384
enable-prefix-caching: true
ServiceArgs:
workers: 1
resources:
gpu: '1'
common-configs: [model, block-size, max-model-len]
Planner:
environment: local
no-operation: true
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size: 64
max-model-len: 16384
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions
port: 8000
Processor:
router: round-robin
router-num-threads: 4
common-configs: [model, block-size, max-model-len]
VllmWorker:
enforce-eager: true
max-num-batched-tokens: 16384
enable-prefix-caching: true
ServiceArgs:
workers: 4
resources:
gpu: '1'
common-configs: [model, block-size, max-model-len]
Planner:
environment: local
no-operation: true
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size: 64
max-model-len: 16384
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions
port: 8000
Processor:
router: round-robin
router-num-threads: 4
common-configs: [model, block-size, max-model-len]
VllmWorker:
enforce-eager: true
max-num-batched-tokens: 16384
enable-prefix-caching: true
ServiceArgs:
workers: 8
resources:
gpu: '1'
common-configs: [model, block-size, max-model-len]
Planner:
environment: local
no-operation: true
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size: 64
max-model-len: 16384
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions
port: 8000
Processor:
router: round-robin
router-num-threads: 4
common-configs: [model, block-size, max-model-len]
VllmWorker:
enforce-eager: true
max-num-batched-tokens: 16384
enable-prefix-caching: true
tensor-parallel-size: 2
ServiceArgs:
workers: 1
resources:
gpu: '2'
common-configs: [model, block-size, max-model-len]
Planner:
environment: local
no-operation: true
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size: 64
max-model-len: 16384
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions
port: 8000
ServiceArgs:
workers: 1
resources:
cpu: "10"
memory: "20Gi"
Processor:
router: round-robin
router-num-threads: 4
common-configs: [model, block-size, max-model-len]
ServiceArgs:
workers: 2
resources:
cpu: "10"
memory: "20Gi"
VllmWorker:
enforce-eager: true
max-num-batched-tokens: 16384
enable-prefix-caching: true
tensor-parallel-size: 2
ServiceArgs:
workers: 2
resources:
gpu: '2'
common-configs: [model, block-size, max-model-len]
Planner:
environment: local
no-operation: true
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size: 64
max-model-len: 16384
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions
port: 8000
Processor:
router: round-robin
router-num-threads: 4
common-configs: [model, block-size, max-model-len]
ServiceArgs:
workers: 2
resources:
cpu: "10"
memory: "20Gi"
VllmWorker:
enforce-eager: true
max-num-batched-tokens: 16384
enable-prefix-caching: true
tensor-parallel-size: 2
ServiceArgs:
workers: 4
resources:
gpu: '2'
common-configs: [model, block-size, max-model-len]
Planner:
environment: local
no-operation: true
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size: 64
max-model-len: 16384
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions
port: 8000
Processor:
router: round-robin
common-configs: [model, block-size]
VllmWorker:
remote-prefill: true
conditional-disagg: true
max-local-prefill-length: 10
max-prefill-queue-size: 2
ServiceArgs:
workers: 1
resources:
gpu: '1'
common-configs: [model, block-size, max-model-len, kv-transfer-config]
PrefillWorker:
max-num-batched-tokens: 16384
ServiceArgs:
workers: 1
resources:
gpu: '1'
common-configs: [model, block-size, max-model-len, kv-transfer-config]
Planner:
environment: local
no-operation: true
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size: 64
max-model-len: 16384
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions
port: 8000
Processor:
router: round-robin
common-configs: [model, block-size]
VllmWorker:
remote-prefill: true
conditional-disagg: true
max-local-prefill-length: 10
max-prefill-queue-size: 2
tensor-parallel-size: 2
ServiceArgs:
workers: 1
resources:
gpu: '2'
common-configs: [model, block-size, max-model-len, kv-transfer-config]
PrefillWorker:
max-num-batched-tokens: 16384
tensor-parallel-size: 1
ServiceArgs:
workers: 1
resources:
gpu: '1'
common-configs: [model, block-size, max-model-len, kv-transfer-config]
Planner:
environment: local
no-operation: true
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size: 64
max-model-len: 16384
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions
port: 8000
Processor:
router: round-robin
common-configs: [model, block-size]
ServiceArgs:
workers: 2
resources:
cpu: "10"
memory: "20Gi"
VllmWorker:
remote-prefill: true
conditional-disagg: true
max-local-prefill-length: 10
max-prefill-queue-size: 2
tensor-parallel-size: 2
ServiceArgs:
workers: 1
resources:
gpu: '2'
common-configs: [model, block-size, max-model-len, kv-transfer-config]
PrefillWorker:
max-num-batched-tokens: 16384
tensor-parallel-size: 1
ServiceArgs:
workers: 2
resources:
gpu: '1'
common-configs: [model, block-size, max-model-len, kv-transfer-config]
Planner:
environment: local
no-operation: true
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size: 64
max-model-len: 16384
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions
port: 8000
Processor:
router: round-robin
common-configs: [model, block-size]
VllmWorker:
remote-prefill: true
conditional-disagg: true
max-local-prefill-length: 10
max-prefill-queue-size: 2
tensor-parallel-size: 4
ServiceArgs:
workers: 1
resources:
gpu: '4'
common-configs: [model, block-size, max-model-len, kv-transfer-config]
PrefillWorker:
max-num-batched-tokens: 16384
ServiceArgs:
workers: 4
resources:
gpu: '1'
common-configs: [model, block-size, max-model-len, kv-transfer-config]
Planner:
environment: local
no-operation: true
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size: 64
max-model-len: 16384
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions
port: 8000
Processor:
router: round-robin
common-configs: [model, block-size]
VllmWorker:
remote-prefill: true
conditional-disagg: true
max-local-prefill-length: 10
max-prefill-queue-size: 2
tensor-parallel-size: 4
ServiceArgs:
workers: 1
resources:
gpu: '4'
common-configs: [model, block-size, max-model-len, kv-transfer-config]
PrefillWorker:
max-num-batched-tokens: 16384
tensor-parallel-size: 2
ServiceArgs:
workers: 1
resources:
gpu: '2'
common-configs: [model, block-size, max-model-len, kv-transfer-config]
Planner:
environment: local
no-operation: true
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size: 64
max-model-len: 16384
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions
port: 8000
Processor:
router: round-robin
common-configs: [model, block-size]
ServiceArgs:
workers: 2
resources:
cpu: "10"
memory: "20Gi"
VllmWorker:
remote-prefill: true
conditional-disagg: true
max-local-prefill-length: 10
max-prefill-queue-size: 2
tensor-parallel-size: 4
ServiceArgs:
workers: 1
resources:
gpu: '4'
common-configs: [model, block-size, max-model-len, kv-transfer-config]
PrefillWorker:
max-num-batched-tokens: 16384
tensor-parallel-size: 2
ServiceArgs:
workers: 2
resources:
gpu: '2'
common-configs: [model, block-size, max-model-len, kv-transfer-config]
Planner:
environment: local
no-operation: true
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pytest
def pytest_addoption(parser):
    """Register the command-line options used by these tests.

    Integer options carry the default shown in the table; flag options are
    ``store_true`` switches that default to ``False``.
    """

    def int_option(default):
        return {"type": int, "default": default}

    def flag_option():
        return {"action": "store_true", "default": False}

    # Registration order is kept exactly as listed.
    option_table = (
        ("--requests-per-client", int_option(100)),
        ("--clients", int_option(10)),
        ("--no-respawn", flag_option()),
        ("--input-token-length", int_option(100)),
        ("--output-token-length", int_option(100)),
        ("--max-num-seqs", int_option(None)),
        ("--max-retries", int_option(1)),
        ("--display-dynamo-output", flag_option()),
        ("--combine-process-logs", flag_option()),
        ("--hf-hub-offline", flag_option()),
    )
    for option_name, settings in option_table:
        parser.addoption(option_name, **settings)
@pytest.fixture
def display_dynamo_output(request):
    """Provide the value of the ``--display-dynamo-output`` option."""
    config = request.config
    return config.getoption("--display-dynamo-output")
@pytest.fixture
def max_retries(request):
    """Provide the value of the ``--max-retries`` option."""
    config = request.config
    return config.getoption("--max-retries")
@pytest.fixture
def max_num_seqs(request):
    """Provide the value of the ``--max-num-seqs`` option."""
    config = request.config
    return config.getoption("--max-num-seqs")
@pytest.fixture
def num_clients(request):
    """Provide the value of the ``--clients`` option."""
    config = request.config
    return config.getoption("--clients")
@pytest.fixture
def input_token_length(request):
    """Provide the value of the ``--input-token-length`` option."""
    config = request.config
    return config.getoption("--input-token-length")
@pytest.fixture
def output_token_length(request):
    """Provide the value of the ``--output-token-length`` option."""
    config = request.config
    return config.getoption("--output-token-length")
@pytest.fixture
def requests_per_client(request):
    """Provide the value of the ``--requests-per-client`` option."""
    config = request.config
    return config.getoption("--requests-per-client")
@pytest.fixture
def respawn(request):
    """Provide the negation of the ``--no-respawn`` option."""
    respawn_disabled = request.config.getoption("--no-respawn")
    return not respawn_disabled
@pytest.fixture
def separate_process_logs(request):
    """Provide the negation of the ``--combine-process-logs`` option."""
    combine_logs = request.config.getoption("--combine-process-logs")
    return not combine_logs
@pytest.fixture
def hf_hub_offline(request):
    """Provide the value of the ``--hf-hub-offline`` option."""
    config = request.config
    return config.getoption("--hf-hub-offline")
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
import os
import re
from datetime import datetime
from typing import Any
import pandas as pd
from tabulate import tabulate
def _log_line_timestamp(line):
    """Return the ISO timestamp in the second space-separated field, or None."""
    try:
        # fromisoformat accepts the "T" separator directly.
        return datetime.fromisoformat(line.split(" ")[1])
    except (IndexError, ValueError):
        return None


def parse_test_log(file_path):
    """Extract startup duration, fault time and launch command from a test log.

    Lines are matched on three markers: ``Running command: dynamo serve``,
    ``Deployment Ready`` and ``Injecting failure for:``. The timestamp is
    read from the second space-separated field of the matching line. When a
    marker occurs more than once, the last occurrence wins. A matching line
    whose timestamp cannot be parsed (for example a continuation line of a
    multi-line message) is skipped instead of aborting the parse.

    Args:
        file_path: Path of the test log.

    Returns:
        ``(startup_time, fault_time, start_cmd)``: seconds between the launch
        and ready lines (``None`` unless both were found), the datetime of the
        fault-injection line, and the text following ``Running command:``.
        All three are ``None`` when the file does not exist.
    """
    if not os.path.isfile(file_path):
        return None, None, None

    start_time = None
    ready_time = None
    fault_time = None
    start_cmd = None
    with open(file_path, "r") as f:
        for line in f:
            line = line.strip()
            is_start = "Running command: dynamo serve" in line
            is_ready = "Deployment Ready" in line
            is_fault = "Injecting failure for:" in line
            if not (is_start or is_ready or is_fault):
                continue
            timestamp = _log_line_timestamp(line)
            if timestamp is None:
                continue
            # Same precedence as an if/elif chain over the three markers.
            if is_start:
                start_time = timestamp
                start_cmd = line.split("Running command:")[1]
            elif is_ready:
                ready_time = timestamp
            else:
                fault_time = timestamp

    startup_time = (
        (ready_time - start_time).total_seconds() if start_time and ready_time else None
    )
    return startup_time, fault_time, start_cmd
def _response_is_complete(body, expected_length):
    """Return True when the first choice's message content has at least ``expected_length`` characters."""
    if not isinstance(body, dict):
        return False
    choices = body.get("choices")
    if not choices:
        return False
    try:
        content = choices[0]["message"]["content"]
        return bool(content) and len(content) >= expected_length
    except (KeyError, IndexError, TypeError):
        # Response has an unexpected shape: count it as a failure.
        return False


def parse_client_logs(test_dir, expected_length=100):
    """Load every ``client_*.log.txt`` in ``test_dir`` into one DataFrame.

    Each log line is a JSON document with a ``time`` stamp and a ``results``
    list; every element of ``results`` becomes one row. Blank or truncated
    lines that are not valid JSON are skipped, but still count towards the
    zero-based ``request_number`` (the line's position in its file).

    Args:
        test_dir: Directory holding the client log files.
        expected_length: Minimum content length for a row to count as a
            success.

    Returns:
        DataFrame sorted by ``time`` with columns ``time``, ``status``,
        ``request_elapsed_time``, ``request_number``, ``client`` and
        ``success``; ``None`` when no rows were found.
    """
    all_logs = []
    for file in os.listdir(test_dir):
        if not (file.startswith("client_") and file.endswith(".log.txt")):
            continue
        client_id = file.split("_")[1].split(".")[0]
        with open(os.path.join(test_dir, file), "r") as f:
            for request_number, line in enumerate(f):
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # The timestamp is shared by every attempt on this line.
                request_time = datetime.fromisoformat(data["time"])
                for result in data["results"]:
                    all_logs.append(
                        {
                            "time": request_time,
                            "status": result["status"],
                            "request_elapsed_time": result["request_elapsed_time"],
                            "request_number": request_number,
                            "client": client_id,
                            "success": _response_is_complete(
                                result.get("result"), expected_length
                            ),
                        }
                    )
    if not all_logs:
        return None
    df = pd.DataFrame(all_logs)
    df.sort_values("time", inplace=True)
    return df
def calculate_metrics(df, fault_time, sla=2.1):
    """Summarise request outcomes and latencies before and after a fault.

    Args:
        df: DataFrame with ``time``, ``success`` and
            ``request_elapsed_time`` columns.
        fault_time: Instant splitting the rows into "before" (``<=``) and
            "after" (``>``); when falsy every row counts as "before".
        sla: Latency threshold; successful requests slower than this are
            counted as violations.

    Returns:
        Tuple ``(success, failure, avg_before, std_before, avg_after,
        std_after, violations_before, violations_after)``. Latency figures
        use successful requests only. The three "after" values are ``None``
        when there are no rows after the fault.
    """
    latency_col = "request_elapsed_time"

    n_success = df["success"].sum()
    n_failure = len(df) - n_success

    if fault_time:
        pre_rows = df[df["time"] <= fault_time]
        post_rows = df[df["time"] > fault_time]
    else:
        pre_rows, post_rows = df, None

    # Latency statistics consider successful requests only.
    pre_latency = pre_rows[pre_rows["success"]][latency_col]
    avg_before = pre_latency.mean()
    std_before = pre_latency.std()
    violations_before = (pre_latency > sla).sum()

    if post_rows is None or post_rows.empty:
        avg_after = std_after = violations_after = None
    else:
        post_latency = post_rows[post_rows["success"]][latency_col]
        avg_after = post_latency.mean()
        std_after = post_latency.std()
        violations_after = (post_latency > sla).sum()

    return (
        n_success,
        n_failure,
        avg_before,
        std_before,
        avg_after,
        std_after,
        violations_before,
        violations_after,
    )
def parse_process_log(log_dir, process_name):
    """Collect ready and shutdown events from ``<log_dir>/error.log``.

    ANSI colour codes are stripped from each line and the first
    whitespace-separated field is parsed as an ISO timestamp (a ``Z`` is
    removed first). Lines without a parsable timestamp are ignored. The rest
    of the line is searched for the ready and shutdown marker text
    registered for ``process_name``.

    Args:
        log_dir: Directory containing ``error.log``.
        process_name: Key selecting the marker texts; unknown names match
            nothing.

    Returns:
        ``(ready_events, shutdown_events)``, each a list of
        ``(timestamp, message, seconds_since_first_timestamp)`` tuples, or
        ``(None, None)`` when the log file does not exist.
    """
    ready_markers = {
        "dynamo_Frontend": "added model",
        "dynamo_VllmWorker": "Starting VllmWorker instance with all registered endpoints",
        "dynamo_Processor": "Starting Processor instance with all registered endpoints",
        "dynamo_PrefillWorker": "Starting PrefillWorker instance with all registered endpoints",
    }
    shutdown_markers = {
        "dynamo_Frontend": "SIGTERM received, starting graceful shutdown",
        "dynamo_VllmWorker": "Received shutdown signal, shutting down DistributedRuntime",
        "dynamo_Processor": "Received signal 15, initiating graceful shutdown",
        "dynamo_PrefillWorker": "Shutdown hooks completed successfully",
    }

    log_file = os.path.join(log_dir, "error.log")
    if not os.path.isfile(log_file):
        return None, None

    ready_text = ready_markers.get(process_name)
    shutdown_text = shutdown_markers.get(process_name)
    ansi_codes = re.compile(r"\x1b\[.*?m")

    ready_events = []
    shutdown_events = []
    first_timestamp = None
    with open(log_file, "r") as stream:
        for raw in stream:
            fields = ansi_codes.sub("", raw.strip()).split()
            # Needs a timestamp field and at least one message word.
            if len(fields) < 2:
                continue
            try:
                stamp = datetime.fromisoformat(fields[0].replace("Z", ""))
            except ValueError:
                continue
            if first_timestamp is None:
                first_timestamp = stamp
            message = " ".join(fields[1:])
            event = (stamp, message, (stamp - first_timestamp).total_seconds())
            if ready_text is not None and ready_text in message:
                ready_events.append(event)
            if shutdown_text is not None and shutdown_text in message:
                shutdown_events.append(event)
    return ready_events, shutdown_events
def parse_watcher_log(test_dir, fault_time):
    """Average the number of waiting requests before and after a fault.

    Reads ``watcher.log.txt`` in ``test_dir`` line by line. Each usable line
    is a JSON object with a ``"time"`` ISO timestamp and a ``"metrics"`` list
    of two-item entries whose second item is a dict of metric values. Only
    metric dicts that report ``num_requests_waiting`` and a positive
    ``request_active_slots`` are counted.

    Lines that are not valid JSON, are not JSON objects, lack ``"metrics"``,
    or have a missing/unparsable ``"time"`` are skipped instead of aborting
    the whole parse.

    Args:
        test_dir: Directory expected to contain ``watcher.log.txt``.
        fault_time: ``datetime`` of the injected fault, or ``None``. When
            ``None`` every sample is counted as "before".

    Returns:
        ``(None, None)`` when the log file does not exist; otherwise
        ``(avg_before, avg_after)`` where each average is ``None`` if no
        samples fell on that side of ``fault_time``.
    """
    watcher_log_path = os.path.join(test_dir, "watcher.log.txt")
    if not os.path.isfile(watcher_log_path):
        return None, None

    before_requests = []
    after_requests = []
    with open(watcher_log_path, "r") as f:
        for line in f:
            try:
                data = json.loads(line.strip())
            except json.JSONDecodeError:
                continue
            # Scalars, strings and arrays are valid JSON but not log records.
            if not isinstance(data, dict) or "metrics" not in data:
                continue
            try:
                entry_time = datetime.fromisoformat(data["time"].replace("T", " "))
            except (KeyError, AttributeError, TypeError, ValueError):
                # No usable timestamp: the sample cannot be placed relative
                # to the fault, so treat it like any other malformed line.
                continue
            metrics = data["metrics"]
            if not isinstance(metrics, list):
                continue
            for metric in metrics:
                if not isinstance(metric, (list, tuple)) or len(metric) != 2:
                    continue
                _, metric_data = metric
                if not isinstance(metric_data, dict):
                    continue
                if (
                    "num_requests_waiting" in metric_data
                    and "request_active_slots" in metric_data
                    and metric_data["request_active_slots"] > 0
                ):
                    if fault_time is None or entry_time <= fault_time:
                        before_requests.append(metric_data["num_requests_waiting"])
                    else:
                        after_requests.append(metric_data["num_requests_waiting"])

    avg_before = (
        sum(before_requests) / len(before_requests) if before_requests else None
    )
    avg_after = sum(after_requests) / len(after_requests) if after_requests else None
    return avg_before, avg_after
def calculate_recovery_time(test_dir, failure_type, fault_time):
    """Return how long the failed process took to become ready again.

    The recovery time is the gap between ``fault_time`` and the last
    readiness event logged by the process that matches ``failure_type``.
    For worker failures only readiness messages that mention instance ``:1``
    (``"VllmWorker:1"`` / ``"PrefillWorker:1"``) are considered.

    Args:
        test_dir: Test output directory holding one sub-directory per
            process (e.g. ``dynamo_Frontend``), each with its own log.
        failure_type: One of ``"processor"``, ``"frontend"``,
            ``"decode_worker"`` or ``"prefill_worker"``. Any other value
            yields ``None``.
        fault_time: ``datetime`` at which the fault was injected, or ``None``.

    Returns:
        Recovery time in seconds, or ``None`` when it cannot be determined:
        unknown failure type, no fault time, missing log, no matching
        readiness event, or the last readiness event precedes the fault.
    """
    # failure type -> (process whose log to read, substring the readiness
    # message must contain, or None to accept every readiness message)
    restart_sources = {
        "processor": ("dynamo_Processor", None),
        "frontend": ("dynamo_Frontend", None),
        "decode_worker": ("dynamo_VllmWorker", "VllmWorker:1"),
        "prefill_worker": ("dynamo_PrefillWorker", "PrefillWorker:1"),
    }
    source = restart_sources.get(failure_type)
    if source is None or fault_time is None:
        return None
    process, instance_marker = source

    # Only the failed process's log is needed; a missing log yields None
    # for `starts`, which the emptiness check below handles.
    starts, _ = parse_process_log(os.path.join(test_dir, process), process)
    if not starts:
        return None

    if instance_marker is not None:
        starts = [event for event in starts if instance_marker in event[1]]
        if not starts:
            return None

    # The last readiness event corresponds to the most recent (re)start.
    start_time = starts[-1][0]
    if fault_time > start_time:
        # The process never became ready again after the fault.
        return None
    return (start_time - fault_time).total_seconds()
def process_test_directory(test_dir):
    """Summarise one ``test_worker_failure[...]`` output directory.

    The directory name encodes the test parameters: the text inside the
    brackets is ``<prefix>-<failure_type>``. Test, client and watcher logs
    are parsed and combined into a single flat record.

    Args:
        test_dir: Path whose name contains ``test_worker_failure[...]``.

    Returns:
        ``None`` when no client request data is available; otherwise a dict
        with the test prefix, start command, failure type, startup time,
        request counts, latency statistics, pending-request averages, SLA
        violation counts and recovery time.
    """
    bracketed = test_dir.split("test_worker_failure[", 1)[1].rstrip("]")
    # Everything before the last '-' is the test prefix; the rest names the
    # injected failure.
    test_prefix, _, failure_type = bracketed.rpartition("-")

    test_log = os.path.join(test_dir, "test.log.txt")
    startup_time, fault_time, start_cmd = parse_test_log(test_log)

    df = parse_client_logs(test_dir)
    if df is None or df.empty:
        return None

    pending_before, pending_after = parse_watcher_log(test_dir, fault_time)
    (
        n_success,
        n_failed,
        latency_avg_before,
        latency_std_before,
        latency_avg_after,
        latency_std_after,
        sla_violations_before,
        sla_violations_after,
    ) = calculate_metrics(df, fault_time)
    recovery = calculate_recovery_time(test_dir, failure_type, fault_time)

    summary = {
        "test": test_prefix,
        "cmd": start_cmd,
        "failure": failure_type,
        "start_time": startup_time,
        "success_requests": n_success,
        "failed_requests": n_failed,
        "avg_latency_before": latency_avg_before,
        "std_latency_before": latency_std_before,
        "avg_latency_after": latency_avg_after,
        "std_latency_after": latency_std_after,
        "pending_requests_before": pending_before,
        "pending_requests_after": pending_after,
        "violations_before": sla_violations_before,
        "violations_after": sla_violations_after,
        "recovery_time": recovery,
    }
    return summary
def main(logs_dir, tablefmt, log_paths=None):
    """Print one summary table per test group.

    Args:
        logs_dir: Directory scanned for ``test_worker_failure[...]``
            sub-directories. Ignored when ``log_paths`` is non-empty.
        tablefmt: Table format name passed through to ``tabulate``.
        log_paths: Optional explicit list of test directories to process.
            Takes precedence over ``logs_dir``.

    Results are grouped by test prefix. Within a group, rows follow a fixed
    failure-type order; failure types outside that order are kept and listed
    last rather than dropped.
    """
    if log_paths:
        test_dirs = list(log_paths)
    elif logs_dir:
        # Sorted so the output does not depend on filesystem listing order.
        test_dirs = [
            os.path.join(logs_dir, entry)
            for entry in sorted(os.listdir(logs_dir))
            if entry.startswith("test_worker_failure[")
            and os.path.isdir(os.path.join(logs_dir, entry))
        ]
    else:
        test_dirs = []

    results = []
    for test_dir in test_dirs:
        result = process_test_directory(test_dir)
        if result:
            results.append(result)

    # Group results by test prefix; the first result seen supplies the
    # command shown for the group.
    grouped: dict[str, list[dict[str, Any]]] = {}
    commands = {}
    for res in results:
        test_prefix = res["test"]
        commands.setdefault(test_prefix, res["cmd"])
        grouped.setdefault(test_prefix, []).append(res)

    order = [
        "none",
        "frontend",
        "processor",
        "decode_worker",
        "prefill_worker",
        "vllm_worker",
    ]
    rank = {failure: position for position, failure in enumerate(order)}

    headers = [
        "Failure",
        "Startup Time",
        "Success",
        "Failed",
        "Latency Before",
        "Latency After",
        "Pending Before",
        "Pending After",
        "Violations Before",
        "Violations After",
        "Recovery Time",
    ]
    # Result keys in the same order as `headers`.
    columns = [
        "failure",
        "start_time",
        "success_requests",
        "failed_requests",
        "avg_latency_before",
        "avg_latency_after",
        "pending_requests_before",
        "pending_requests_after",
        "violations_before",
        "violations_after",
        "recovery_time",
    ]

    # Print grouped tables
    for test_prefix, group in grouped.items():
        # Stable sort: known failure types in `order`, unknown ones last,
        # ties keep their original relative order.
        ordered = sorted(group, key=lambda res: rank.get(res["failure"], len(order)))
        rows = [[res[column] for column in columns] for res in ordered]

        print(f"\nTest Group: {test_prefix}")
        print(f"\nTest Command: {commands[test_prefix]}")
        print(
            tabulate(
                rows,
                headers,
                tablefmt=tablefmt,
                floatfmt=".2f",
                missingval="N/A",
                numalign="right",
                stralign="center",
            )
        )
        print("\n" + "=" * 80)
if __name__ == "__main__":
    # Command-line entry point: read the options, translate the requested
    # output style into a tabulate format name, and print the report.
    parser = argparse.ArgumentParser(description="Parse test results")
    parser.add_argument("--log-dir", default=".", help="Path to the logs directory")
    parser.add_argument(
        "--format", choices=["fancy", "markdown"], default="fancy", help="Table format"
    )
    args = parser.parse_args()

    # Map format choices to tabulate formats
    if args.format == "fancy":
        tablefmt = "fancy_grid"
    else:
        # Using pipe for markdown compatibility
        tablefmt = "pipe"

    main(args.log_dir, tablefmt)
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment