Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
095ea3e7
Unverified
Commit
095ea3e7
authored
Jul 28, 2025
by
Neelay Shah
Committed by
GitHub
Jul 28, 2025
Browse files
chore: updating and removing tests (#2130)
parent
fdcf611f
Changes
22
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
2 additions
and
2362 deletions
+2
-2362
tests/README.md
tests/README.md
+2
-3
tests/fault_tolerance/README.md
tests/fault_tolerance/README.md
+0
-581
tests/fault_tolerance/configs/agg_tp_1_dp_1.yaml
tests/fault_tolerance/configs/agg_tp_1_dp_1.yaml
+0
-42
tests/fault_tolerance/configs/agg_tp_1_dp_4.yaml
tests/fault_tolerance/configs/agg_tp_1_dp_4.yaml
+0
-42
tests/fault_tolerance/configs/agg_tp_1_dp_8.yaml
tests/fault_tolerance/configs/agg_tp_1_dp_8.yaml
+0
-42
tests/fault_tolerance/configs/agg_tp_2_dp_1.yaml
tests/fault_tolerance/configs/agg_tp_2_dp_1.yaml
+0
-43
tests/fault_tolerance/configs/agg_tp_2_dp_2.yaml
tests/fault_tolerance/configs/agg_tp_2_dp_2.yaml
+0
-54
tests/fault_tolerance/configs/agg_tp_2_dp_4.yaml
tests/fault_tolerance/configs/agg_tp_2_dp_4.yaml
+0
-47
tests/fault_tolerance/configs/disagg_p_tp_1_dp_1_d_tp_1_dp_1.yaml
...ult_tolerance/configs/disagg_p_tp_1_dp_1_d_tp_1_dp_1.yaml
+0
-51
tests/fault_tolerance/configs/disagg_p_tp_1_dp_1_d_tp_2_dp_1.yaml
...ult_tolerance/configs/disagg_p_tp_1_dp_1_d_tp_2_dp_1.yaml
+0
-53
tests/fault_tolerance/configs/disagg_p_tp_1_dp_2_d_tp_2_dp_1.yaml
...ult_tolerance/configs/disagg_p_tp_1_dp_2_d_tp_2_dp_1.yaml
+0
-58
tests/fault_tolerance/configs/disagg_p_tp_1_dp_4_d_tp_4_dp_1.yaml
...ult_tolerance/configs/disagg_p_tp_1_dp_4_d_tp_4_dp_1.yaml
+0
-52
tests/fault_tolerance/configs/disagg_p_tp_2_dp_1_d_tp_4_dp_1.yaml
...ult_tolerance/configs/disagg_p_tp_2_dp_1_d_tp_4_dp_1.yaml
+0
-53
tests/fault_tolerance/configs/disagg_p_tp_2_dp_2_d_tp_4_dp_1.yaml
...ult_tolerance/configs/disagg_p_tp_2_dp_2_d_tp_4_dp_1.yaml
+0
-58
tests/fault_tolerance/conftest.py
tests/fault_tolerance/conftest.py
+0
-79
tests/fault_tolerance/parse_results.py
tests/fault_tolerance/parse_results.py
+0
-433
tests/fault_tolerance/scenarios.py
tests/fault_tolerance/scenarios.py
+0
-219
tests/fault_tolerance/test_runner.py
tests/fault_tolerance/test_runner.py
+0
-218
tests/fault_tolerance/utils/circus_controller.py
tests/fault_tolerance/utils/circus_controller.py
+0
-107
tests/fault_tolerance/utils/metrics.py
tests/fault_tolerance/utils/metrics.py
+0
-127
No files found.
tests/README.md
View file @
095ea3e7
...
...
@@ -8,11 +8,10 @@ This document outlines the testing framework for the Dynamo runtime system, incl
```
bash
tests/
├── serve/
# E2E tests using dynamo serve
│ ├── conftest.py
# test fixtures as needed for specific test area
├── run/
# E2E tests using dynamo run
├── serve/
# E2E tests
│ ├── conftest.py
# test fixtures as needed for specific test area
├── conftest.py
# Shared fixtures and configuration
├── utils
# Common utils accross tests
└── README.md
# This file
```
...
...
tests/fault_tolerance/README.md
deleted
100644 → 0
View file @
fdcf611f
This diff is collapsed.
Click to expand it.
tests/fault_tolerance/configs/agg_tp_1_dp_1.yaml
deleted
100644 → 0
View file @
fdcf611f
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common
:
model
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size
:
64
max-model-len
:
16384
Frontend
:
served_model_name
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint
:
dynamo.Processor.chat/completions
port
:
8000
Processor
:
router
:
round-robin
router-num-threads
:
4
common-configs
:
[
model
,
block-size
,
max-model-len
]
VllmWorker
:
enforce-eager
:
true
max-num-batched-tokens
:
16384
enable-prefix-caching
:
true
ServiceArgs
:
workers
:
1
resources
:
gpu
:
'
1'
common-configs
:
[
model
,
block-size
,
max-model-len
]
Planner
:
environment
:
local
no-operation
:
true
\ No newline at end of file
tests/fault_tolerance/configs/agg_tp_1_dp_4.yaml
deleted
100644 → 0
View file @
fdcf611f
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common
:
model
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size
:
64
max-model-len
:
16384
Frontend
:
served_model_name
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint
:
dynamo.Processor.chat/completions
port
:
8000
Processor
:
router
:
round-robin
router-num-threads
:
4
common-configs
:
[
model
,
block-size
,
max-model-len
]
VllmWorker
:
enforce-eager
:
true
max-num-batched-tokens
:
16384
enable-prefix-caching
:
true
ServiceArgs
:
workers
:
4
resources
:
gpu
:
'
1'
common-configs
:
[
model
,
block-size
,
max-model-len
]
Planner
:
environment
:
local
no-operation
:
true
tests/fault_tolerance/configs/agg_tp_1_dp_8.yaml
deleted
100644 → 0
View file @
fdcf611f
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common
:
model
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size
:
64
max-model-len
:
16384
Frontend
:
served_model_name
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint
:
dynamo.Processor.chat/completions
port
:
8000
Processor
:
router
:
round-robin
router-num-threads
:
4
common-configs
:
[
model
,
block-size
,
max-model-len
]
VllmWorker
:
enforce-eager
:
true
max-num-batched-tokens
:
16384
enable-prefix-caching
:
true
ServiceArgs
:
workers
:
8
resources
:
gpu
:
'
1'
common-configs
:
[
model
,
block-size
,
max-model-len
]
Planner
:
environment
:
local
no-operation
:
true
tests/fault_tolerance/configs/agg_tp_2_dp_1.yaml
deleted
100644 → 0
View file @
fdcf611f
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common
:
model
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size
:
64
max-model-len
:
16384
Frontend
:
served_model_name
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint
:
dynamo.Processor.chat/completions
port
:
8000
Processor
:
router
:
round-robin
router-num-threads
:
4
common-configs
:
[
model
,
block-size
,
max-model-len
]
VllmWorker
:
enforce-eager
:
true
max-num-batched-tokens
:
16384
enable-prefix-caching
:
true
tensor-parallel-size
:
2
ServiceArgs
:
workers
:
1
resources
:
gpu
:
'
2'
common-configs
:
[
model
,
block-size
,
max-model-len
]
Planner
:
environment
:
local
no-operation
:
true
tests/fault_tolerance/configs/agg_tp_2_dp_2.yaml
deleted
100644 → 0
View file @
fdcf611f
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common
:
model
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size
:
64
max-model-len
:
16384
Frontend
:
served_model_name
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint
:
dynamo.Processor.chat/completions
port
:
8000
ServiceArgs
:
workers
:
1
resources
:
cpu
:
"
10"
memory
:
"
20Gi"
Processor
:
router
:
round-robin
router-num-threads
:
4
common-configs
:
[
model
,
block-size
,
max-model-len
]
ServiceArgs
:
workers
:
2
resources
:
cpu
:
"
10"
memory
:
"
20Gi"
VllmWorker
:
enforce-eager
:
true
max-num-batched-tokens
:
16384
enable-prefix-caching
:
true
tensor-parallel-size
:
2
ServiceArgs
:
workers
:
2
resources
:
gpu
:
'
2'
common-configs
:
[
model
,
block-size
,
max-model-len
]
Planner
:
environment
:
local
no-operation
:
true
tests/fault_tolerance/configs/agg_tp_2_dp_4.yaml
deleted
100644 → 0
View file @
fdcf611f
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common
:
model
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size
:
64
max-model-len
:
16384
Frontend
:
served_model_name
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint
:
dynamo.Processor.chat/completions
port
:
8000
Processor
:
router
:
round-robin
router-num-threads
:
4
common-configs
:
[
model
,
block-size
,
max-model-len
]
ServiceArgs
:
workers
:
2
resources
:
cpu
:
"
10"
memory
:
"
20Gi"
VllmWorker
:
enforce-eager
:
true
max-num-batched-tokens
:
16384
enable-prefix-caching
:
true
tensor-parallel-size
:
2
ServiceArgs
:
workers
:
4
resources
:
gpu
:
'
2'
common-configs
:
[
model
,
block-size
,
max-model-len
]
Planner
:
environment
:
local
no-operation
:
true
tests/fault_tolerance/configs/disagg_p_tp_1_dp_1_d_tp_1_dp_1.yaml
deleted
100644 → 0
View file @
fdcf611f
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common
:
model
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size
:
64
max-model-len
:
16384
kv-transfer-config
:
'
{"kv_connector":"DynamoNixlConnector"}'
Frontend
:
served_model_name
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint
:
dynamo.Processor.chat/completions
port
:
8000
Processor
:
router
:
round-robin
common-configs
:
[
model
,
block-size
]
VllmWorker
:
remote-prefill
:
true
conditional-disagg
:
true
max-local-prefill-length
:
10
max-prefill-queue-size
:
2
ServiceArgs
:
workers
:
1
resources
:
gpu
:
'
1'
common-configs
:
[
model
,
block-size
,
max-model-len
,
kv-transfer-config
]
PrefillWorker
:
max-num-batched-tokens
:
16384
ServiceArgs
:
workers
:
1
resources
:
gpu
:
'
1'
common-configs
:
[
model
,
block-size
,
max-model-len
,
kv-transfer-config
]
Planner
:
environment
:
local
no-operation
:
true
tests/fault_tolerance/configs/disagg_p_tp_1_dp_1_d_tp_2_dp_1.yaml
deleted
100644 → 0
View file @
fdcf611f
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common
:
model
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size
:
64
max-model-len
:
16384
kv-transfer-config
:
'
{"kv_connector":"DynamoNixlConnector"}'
Frontend
:
served_model_name
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint
:
dynamo.Processor.chat/completions
port
:
8000
Processor
:
router
:
round-robin
common-configs
:
[
model
,
block-size
]
VllmWorker
:
remote-prefill
:
true
conditional-disagg
:
true
max-local-prefill-length
:
10
max-prefill-queue-size
:
2
tensor-parallel-size
:
2
ServiceArgs
:
workers
:
1
resources
:
gpu
:
'
2'
common-configs
:
[
model
,
block-size
,
max-model-len
,
kv-transfer-config
]
PrefillWorker
:
max-num-batched-tokens
:
16384
tensor-parallel-size
:
1
ServiceArgs
:
workers
:
1
resources
:
gpu
:
'
1'
common-configs
:
[
model
,
block-size
,
max-model-len
,
kv-transfer-config
]
Planner
:
environment
:
local
no-operation
:
true
tests/fault_tolerance/configs/disagg_p_tp_1_dp_2_d_tp_2_dp_1.yaml
deleted
100644 → 0
View file @
fdcf611f
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common
:
model
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size
:
64
max-model-len
:
16384
kv-transfer-config
:
'
{"kv_connector":"DynamoNixlConnector"}'
Frontend
:
served_model_name
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint
:
dynamo.Processor.chat/completions
port
:
8000
Processor
:
router
:
round-robin
common-configs
:
[
model
,
block-size
]
ServiceArgs
:
workers
:
2
resources
:
cpu
:
"
10"
memory
:
"
20Gi"
VllmWorker
:
remote-prefill
:
true
conditional-disagg
:
true
max-local-prefill-length
:
10
max-prefill-queue-size
:
2
tensor-parallel-size
:
2
ServiceArgs
:
workers
:
1
resources
:
gpu
:
'
2'
common-configs
:
[
model
,
block-size
,
max-model-len
,
kv-transfer-config
]
PrefillWorker
:
max-num-batched-tokens
:
16384
tensor-parallel-size
:
1
ServiceArgs
:
workers
:
2
resources
:
gpu
:
'
1'
common-configs
:
[
model
,
block-size
,
max-model-len
,
kv-transfer-config
]
Planner
:
environment
:
local
no-operation
:
true
tests/fault_tolerance/configs/disagg_p_tp_1_dp_4_d_tp_4_dp_1.yaml
deleted
100644 → 0
View file @
fdcf611f
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common
:
model
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size
:
64
max-model-len
:
16384
kv-transfer-config
:
'
{"kv_connector":"DynamoNixlConnector"}'
Frontend
:
served_model_name
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint
:
dynamo.Processor.chat/completions
port
:
8000
Processor
:
router
:
round-robin
common-configs
:
[
model
,
block-size
]
VllmWorker
:
remote-prefill
:
true
conditional-disagg
:
true
max-local-prefill-length
:
10
max-prefill-queue-size
:
2
tensor-parallel-size
:
4
ServiceArgs
:
workers
:
1
resources
:
gpu
:
'
4'
common-configs
:
[
model
,
block-size
,
max-model-len
,
kv-transfer-config
]
PrefillWorker
:
max-num-batched-tokens
:
16384
ServiceArgs
:
workers
:
4
resources
:
gpu
:
'
1'
common-configs
:
[
model
,
block-size
,
max-model-len
,
kv-transfer-config
]
Planner
:
environment
:
local
no-operation
:
true
tests/fault_tolerance/configs/disagg_p_tp_2_dp_1_d_tp_4_dp_1.yaml
deleted
100644 → 0
View file @
fdcf611f
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common
:
model
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size
:
64
max-model-len
:
16384
kv-transfer-config
:
'
{"kv_connector":"DynamoNixlConnector"}'
Frontend
:
served_model_name
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint
:
dynamo.Processor.chat/completions
port
:
8000
Processor
:
router
:
round-robin
common-configs
:
[
model
,
block-size
]
VllmWorker
:
remote-prefill
:
true
conditional-disagg
:
true
max-local-prefill-length
:
10
max-prefill-queue-size
:
2
tensor-parallel-size
:
4
ServiceArgs
:
workers
:
1
resources
:
gpu
:
'
4'
common-configs
:
[
model
,
block-size
,
max-model-len
,
kv-transfer-config
]
PrefillWorker
:
max-num-batched-tokens
:
16384
tensor-parallel-size
:
2
ServiceArgs
:
workers
:
1
resources
:
gpu
:
'
2'
common-configs
:
[
model
,
block-size
,
max-model-len
,
kv-transfer-config
]
Planner
:
environment
:
local
no-operation
:
true
tests/fault_tolerance/configs/disagg_p_tp_2_dp_2_d_tp_4_dp_1.yaml
deleted
100644 → 0
View file @
fdcf611f
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common
:
model
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size
:
64
max-model-len
:
16384
kv-transfer-config
:
'
{"kv_connector":"DynamoNixlConnector"}'
Frontend
:
served_model_name
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint
:
dynamo.Processor.chat/completions
port
:
8000
Processor
:
router
:
round-robin
common-configs
:
[
model
,
block-size
]
ServiceArgs
:
workers
:
2
resources
:
cpu
:
"
10"
memory
:
"
20Gi"
VllmWorker
:
remote-prefill
:
true
conditional-disagg
:
true
max-local-prefill-length
:
10
max-prefill-queue-size
:
2
tensor-parallel-size
:
4
ServiceArgs
:
workers
:
1
resources
:
gpu
:
'
4'
common-configs
:
[
model
,
block-size
,
max-model-len
,
kv-transfer-config
]
PrefillWorker
:
max-num-batched-tokens
:
16384
tensor-parallel-size
:
2
ServiceArgs
:
workers
:
2
resources
:
gpu
:
'
2'
common-configs
:
[
model
,
block-size
,
max-model-len
,
kv-transfer-config
]
Planner
:
environment
:
local
no-operation
:
true
tests/fault_tolerance/conftest.py
deleted
100644 → 0
View file @
fdcf611f
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
pytest
def
pytest_addoption
(
parser
):
parser
.
addoption
(
"--requests-per-client"
,
type
=
int
,
default
=
100
)
parser
.
addoption
(
"--clients"
,
type
=
int
,
default
=
10
)
parser
.
addoption
(
"--no-respawn"
,
action
=
"store_true"
,
default
=
False
)
parser
.
addoption
(
"--input-token-length"
,
type
=
int
,
default
=
100
)
parser
.
addoption
(
"--output-token-length"
,
type
=
int
,
default
=
100
)
parser
.
addoption
(
"--max-num-seqs"
,
type
=
int
,
default
=
None
)
parser
.
addoption
(
"--max-retries"
,
type
=
int
,
default
=
1
)
parser
.
addoption
(
"--display-dynamo-output"
,
action
=
"store_true"
,
default
=
False
)
parser
.
addoption
(
"--combine-process-logs"
,
action
=
"store_true"
,
default
=
False
)
parser
.
addoption
(
"--hf-hub-offline"
,
action
=
"store_true"
,
default
=
False
)
@
pytest
.
fixture
def
display_dynamo_output
(
request
):
return
request
.
config
.
getoption
(
"--display-dynamo-output"
)
@
pytest
.
fixture
def
max_retries
(
request
):
return
request
.
config
.
getoption
(
"--max-retries"
)
@
pytest
.
fixture
def
max_num_seqs
(
request
):
return
request
.
config
.
getoption
(
"--max-num-seqs"
)
@
pytest
.
fixture
def
num_clients
(
request
):
return
request
.
config
.
getoption
(
"--clients"
)
@
pytest
.
fixture
def
input_token_length
(
request
):
return
request
.
config
.
getoption
(
"--input-token-length"
)
@
pytest
.
fixture
def
output_token_length
(
request
):
return
request
.
config
.
getoption
(
"--output-token-length"
)
@
pytest
.
fixture
def
requests_per_client
(
request
):
return
request
.
config
.
getoption
(
"--requests-per-client"
)
@
pytest
.
fixture
def
respawn
(
request
):
return
not
request
.
config
.
getoption
(
"--no-respawn"
)
@
pytest
.
fixture
def
separate_process_logs
(
request
):
return
not
request
.
config
.
getoption
(
"--combine-process-logs"
)
@
pytest
.
fixture
def
hf_hub_offline
(
request
):
return
request
.
config
.
getoption
(
"--hf-hub-offline"
)
tests/fault_tolerance/parse_results.py
deleted
100644 → 0
View file @
fdcf611f
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
argparse
import
json
import
os
import
re
from
datetime
import
datetime
from
typing
import
Any
import
pandas
as
pd
from
tabulate
import
tabulate
def
parse_test_log
(
file_path
):
start_time
=
None
ready_time
=
None
fault_time
=
None
start_cmd
=
None
if
not
os
.
path
.
isfile
(
file_path
):
return
None
,
None
,
None
with
open
(
file_path
,
"r"
)
as
f
:
for
line
in
f
:
line
=
line
.
strip
()
if
"Running command: dynamo serve"
in
line
:
start_time
=
datetime
.
fromisoformat
(
line
.
split
(
" "
)[
1
].
replace
(
"T"
,
" "
)
)
start_cmd
=
line
.
split
(
"Running command:"
)[
1
]
elif
"Deployment Ready"
in
line
:
ready_time
=
datetime
.
fromisoformat
(
line
.
split
(
" "
)[
1
].
replace
(
"T"
,
" "
)
)
elif
"Injecting failure for:"
in
line
:
fault_time
=
datetime
.
fromisoformat
(
line
.
split
(
" "
)[
1
].
replace
(
"T"
,
" "
)
)
startup_time
=
(
(
ready_time
-
start_time
).
total_seconds
()
if
start_time
and
ready_time
else
None
)
return
startup_time
,
fault_time
,
start_cmd
def
parse_client_logs
(
test_dir
,
expected_length
=
100
):
all_logs
=
[]
for
file
in
os
.
listdir
(
test_dir
):
if
file
.
startswith
(
"client_"
)
and
file
.
endswith
(
".log.txt"
):
with
open
(
os
.
path
.
join
(
test_dir
,
file
),
"r"
)
as
f
:
request_number
=
0
for
line
in
f
:
request_number
+=
1
data
=
json
.
loads
(
line
.
strip
())
for
result
in
data
[
"results"
]:
log_entry
=
{
"time"
:
datetime
.
fromisoformat
(
data
[
"time"
].
replace
(
"T"
,
" "
)
),
"status"
:
result
[
"status"
],
"request_elapsed_time"
:
result
[
"request_elapsed_time"
],
"request_number"
:
request_number
-
1
,
"client"
:
file
.
split
(
"_"
)[
1
].
split
(
"."
)[
0
],
}
if
(
"result"
in
result
and
result
[
"result"
]
and
"choices"
in
result
[
"result"
]
and
result
[
"result"
][
"choices"
]
):
log_entry
[
"success"
]
=
True
content
=
result
[
"result"
][
"choices"
][
0
][
"message"
][
"content"
]
if
not
content
or
len
(
content
)
<
expected_length
:
log_entry
[
"success"
]
=
False
else
:
log_entry
[
"success"
]
=
False
all_logs
.
append
(
log_entry
)
if
len
(
all_logs
):
df
=
pd
.
DataFrame
(
all_logs
)
df
.
sort_values
(
"time"
,
inplace
=
True
)
return
df
return
None
def
calculate_metrics
(
df
,
fault_time
,
sla
=
2.1
):
success
=
df
[
"success"
].
sum
()
failure
=
len
(
df
)
-
success
if
fault_time
:
before_fault
=
df
[
df
[
"time"
]
<=
fault_time
]
after_fault
=
df
[
df
[
"time"
]
>
fault_time
]
else
:
before_fault
=
df
after_fault
=
None
# Existing latency metrics (only successful requests)
successful_before
=
before_fault
[
before_fault
[
"success"
]]
avg_before
=
successful_before
[
"request_elapsed_time"
].
mean
()
std_before
=
successful_before
[
"request_elapsed_time"
].
std
()
avg_after
,
std_after
=
None
,
None
if
after_fault
is
not
None
and
not
after_fault
.
empty
:
successful_after
=
after_fault
[
after_fault
[
"success"
]]
avg_after
=
successful_after
[
"request_elapsed_time"
].
mean
()
std_after
=
successful_after
[
"request_elapsed_time"
].
std
()
# SLA violations (only successful requests exceeding the SLA)
violations_before
=
(
successful_before
[
"request_elapsed_time"
]
>
sla
).
sum
()
violations_after
=
(
(
successful_after
[
"request_elapsed_time"
]
>
sla
).
sum
()
if
after_fault
is
not
None
and
not
after_fault
.
empty
else
None
)
return
(
success
,
failure
,
avg_before
,
std_before
,
avg_after
,
std_after
,
violations_before
,
violations_after
,
)
def
parse_process_log
(
log_dir
,
process_name
):
process_ready_line
=
{
"dynamo_Frontend"
:
"added model"
,
"dynamo_VllmWorker"
:
"Starting VllmWorker instance with all registered endpoints"
,
"dynamo_Processor"
:
"Starting Processor instance with all registered endpoints"
,
"dynamo_PrefillWorker"
:
"Starting PrefillWorker instance with all registered endpoints"
,
}
process_shutdown_line
=
{
"dynamo_Frontend"
:
"SIGTERM received, starting graceful shutdown"
,
"dynamo_VllmWorker"
:
"Received shutdown signal, shutting down DistributedRuntime"
,
"dynamo_Processor"
:
"Received signal 15, initiating graceful shutdown"
,
"dynamo_PrefillWorker"
:
"Shutdown hooks completed successfully"
,
}
process_log_path
=
os
.
path
.
join
(
log_dir
,
"error.log"
)
if
not
os
.
path
.
isfile
(
process_log_path
):
return
None
,
None
process_ready
=
[]
process_shutdown
=
[]
process_start_time
=
None
with
open
(
process_log_path
,
"r"
)
as
f
:
for
line
in
f
:
clean_line
=
re
.
sub
(
r
"\x1b\[.*?m"
,
""
,
line
.
strip
())
# Remove ANSI codes
if
not
clean_line
:
continue
parts
=
clean_line
.
split
()
if
len
(
parts
)
<
2
:
continue
try
:
# Parse timestamp (remove 'Z' for naive datetime)
timestamp
=
datetime
.
fromisoformat
(
parts
[
0
].
replace
(
"Z"
,
""
))
except
ValueError
:
continue
if
not
process_start_time
:
process_start_time
=
timestamp
log_message
=
" "
.
join
(
parts
[
1
:])
relative_time
=
(
timestamp
-
process_start_time
).
total_seconds
()
# Check for process start lines
if
process_name
in
process_ready_line
:
if
process_ready_line
[
process_name
]
in
log_message
:
process_ready
.
append
((
timestamp
,
log_message
,
relative_time
))
# Check for process end lines
if
process_name
in
process_shutdown_line
:
if
process_shutdown_line
[
process_name
]
in
log_message
:
process_shutdown
.
append
((
timestamp
,
log_message
,
relative_time
))
return
process_ready
,
process_shutdown
def
parse_watcher_log
(
test_dir
,
fault_time
):
before_requests
=
[]
after_requests
=
[]
watcher_log_path
=
os
.
path
.
join
(
test_dir
,
"watcher.log.txt"
)
if
not
os
.
path
.
isfile
(
watcher_log_path
):
return
None
,
None
with
open
(
watcher_log_path
,
"r"
)
as
f
:
for
line
in
f
:
try
:
data
=
json
.
loads
(
line
.
strip
())
except
json
.
JSONDecodeError
:
continue
if
"metrics"
not
in
data
:
continue
entry_time
=
datetime
.
fromisoformat
(
data
[
"time"
].
replace
(
"T"
,
" "
))
for
metric
in
data
[
"metrics"
]:
if
len
(
metric
)
!=
2
:
continue
_
,
metric_data
=
metric
if
(
"num_requests_waiting"
in
metric_data
and
"request_active_slots"
in
metric_data
and
metric_data
[
"request_active_slots"
]
>
0
):
if
fault_time
is
None
or
entry_time
<=
fault_time
:
before_requests
.
append
(
metric_data
[
"num_requests_waiting"
])
else
:
after_requests
.
append
(
metric_data
[
"num_requests_waiting"
])
avg_before
=
(
sum
(
before_requests
)
/
len
(
before_requests
)
if
before_requests
else
None
)
avg_after
=
sum
(
after_requests
)
/
len
(
after_requests
)
if
after_requests
else
None
return
avg_before
,
avg_after
def
calculate_recovery_time
(
test_dir
,
failure_type
,
fault_time
):
processes
=
[
"dynamo_Frontend"
,
"dynamo_Processor"
,
"dynamo_VllmWorker"
,
"dynamo_PrefillWorker"
,
]
process_start_ends
=
{}
start_time
=
None
for
process
in
processes
:
starts
,
ends
=
parse_process_log
(
os
.
path
.
join
(
test_dir
,
process
),
process
)
if
starts
:
process_start_ends
[
process
]
=
(
starts
,
ends
)
if
failure_type
==
"processor"
:
start_time
=
process_start_ends
[
"dynamo_Processor"
][
0
][
-
1
][
0
]
elif
failure_type
==
"frontend"
:
start_time
=
process_start_ends
[
"dynamo_Frontend"
][
0
][
-
1
][
0
]
elif
failure_type
==
"decode_worker"
:
start_times
=
[
x
for
x
in
process_start_ends
[
"dynamo_VllmWorker"
][
0
]
if
"VllmWorker:1"
in
x
[
1
]
]
if
not
start_times
:
return
None
start_time
=
start_times
[
-
1
][
0
]
elif
failure_type
==
"prefill_worker"
:
if
"dynamo_PrefillWorker"
not
in
process_start_ends
:
return
None
start_times
=
[
x
for
x
in
process_start_ends
[
"dynamo_PrefillWorker"
][
0
]
if
"PrefillWorker:1"
in
x
[
1
]
]
start_time
=
start_times
[
-
1
][
0
]
if
not
start_time
:
return
None
if
fault_time
>
start_time
:
return
None
return
(
start_time
-
fault_time
).
total_seconds
()
def
process_test_directory
(
test_dir
):
test_name
=
test_dir
.
split
(
"test_worker_failure["
,
1
)[
1
].
rstrip
(
"]"
)
failure_type
=
test_name
.
split
(
"-"
)[
-
1
]
test_prefix
=
"-"
.
join
(
test_name
.
split
(
"-"
)[:
-
1
])
startup_time
,
fault_time
,
start_cmd
=
parse_test_log
(
os
.
path
.
join
(
test_dir
,
"test.log.txt"
)
)
df
=
parse_client_logs
(
test_dir
)
if
df
is
None
or
df
.
empty
:
return
None
pending_requests_before
,
pending_requests_after
=
parse_watcher_log
(
test_dir
,
fault_time
)
(
success
,
failure
,
avg_before
,
std_before
,
avg_after
,
std_after
,
violations_before
,
violations_after
,
)
=
calculate_metrics
(
df
,
fault_time
)
recovery_time
=
calculate_recovery_time
(
test_dir
,
failure_type
,
fault_time
)
return
{
"test"
:
test_prefix
,
"cmd"
:
start_cmd
,
"failure"
:
failure_type
,
"start_time"
:
startup_time
,
"success_requests"
:
success
,
"failed_requests"
:
failure
,
"avg_latency_before"
:
avg_before
,
"std_latency_before"
:
std_before
,
"avg_latency_after"
:
avg_after
,
"std_latency_after"
:
std_after
,
"pending_requests_before"
:
pending_requests_before
,
"pending_requests_after"
:
pending_requests_after
,
"violations_before"
:
violations_before
,
"violations_after"
:
violations_after
,
"recovery_time"
:
recovery_time
,
}
def
main
(
logs_dir
,
tablefmt
,
log_paths
=
[]):
results
=
[]
if
log_paths
:
for
log_path
in
log_paths
:
result
=
process_test_directory
(
log_path
)
if
result
:
results
.
append
(
result
)
elif
logs_dir
:
for
entry
in
os
.
listdir
(
logs_dir
):
if
entry
.
startswith
(
"test_worker_failure["
)
and
os
.
path
.
isdir
(
os
.
path
.
join
(
logs_dir
,
entry
)
):
result
=
process_test_directory
(
os
.
path
.
join
(
logs_dir
,
entry
))
if
result
:
results
.
append
(
result
)
# Group results by test prefix
grouped
:
dict
[
str
,
list
[
dict
[
str
,
Any
]]]
=
{}
commands
=
{}
for
res
in
results
:
test_prefix
=
res
[
"test"
]
if
test_prefix
not
in
grouped
:
grouped
[
test_prefix
]
=
[]
commands
[
test_prefix
]
=
res
[
"cmd"
]
grouped
[
test_prefix
].
append
(
res
)
order
=
[
"none"
,
"frontend"
,
"processor"
,
"decode_worker"
,
"prefill_worker"
,
"vllm_worker"
,
]
# Print grouped tables
for
test_prefix
,
group
in
grouped
.
items
():
new_group
=
[]
for
failure
in
order
:
for
res
in
group
:
if
failure
==
res
[
"failure"
]:
new_group
.
append
(
res
)
group
=
new_group
headers
=
[
"Failure"
,
"Startup Time"
,
"Success"
,
"Failed"
,
"Latency Before"
,
"Latency After"
,
"Pending Before"
,
"Pending After"
,
"Violations Before"
,
"Violations After"
,
"Recovery Time"
,
]
rows
=
[]
for
res
in
group
:
row
=
[
res
[
"failure"
],
res
[
"start_time"
],
# if res["start_time"] is not None else "N/A",
res
[
"success_requests"
],
res
[
"failed_requests"
],
res
[
"avg_latency_before"
],
res
[
"avg_latency_after"
],
res
[
"pending_requests_before"
],
res
[
"pending_requests_after"
],
res
[
"violations_before"
],
res
[
"violations_after"
],
res
[
"recovery_time"
],
]
rows
.
append
(
row
)
print
(
f
"
\n
Test Group:
{
test_prefix
}
"
)
print
(
f
"
\n
Test Command:
{
commands
[
test_prefix
]
}
"
)
print
(
tabulate
(
rows
,
headers
,
tablefmt
=
tablefmt
,
floatfmt
=
".2f"
,
missingval
=
"N/A"
,
numalign
=
"right"
,
stralign
=
"center"
,
)
)
print
(
"
\n
"
+
"="
*
80
)
if
__name__
==
"__main__"
:
parser
=
argparse
.
ArgumentParser
(
description
=
"Parse test results"
)
parser
.
add_argument
(
"--log-dir"
,
default
=
"."
,
help
=
"Path to the logs directory"
)
parser
.
add_argument
(
"--format"
,
choices
=
[
"fancy"
,
"markdown"
],
default
=
"fancy"
,
help
=
"Table format"
)
args
=
parser
.
parse_args
()
# Map format choices to tabulate formats
tablefmt
=
(
"fancy_grid"
if
args
.
format
==
"fancy"
else
"pipe"
)
# Using pipe for markdown compatibility
main
(
args
.
log_dir
,
tablefmt
)
tests/fault_tolerance/scenarios.py
deleted
100644 → 0
View file @
fdcf611f
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
pytest
from
tests.utils.deployment_graph
import
(
DeploymentGraph
,
Payload
,
chat_completions_response_handler
,
)
# Initial payload used for testing
# initial deployment readiness.
text_prompt
=
"Tell me a short joke about AI."
text_payload
=
Payload
(
payload_chat
=
{
"model"
:
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
,
"messages"
:
[
{
"role"
:
"user"
,
"content"
:
text_prompt
,
# Shorter prompt
}
],
"max_tokens"
:
150
,
"temperature"
:
0.1
,
# "seed": 10,
"ignore_eos"
:
True
,
"min_tokens"
:
150
,
"stream"
:
False
,
},
expected_log
=
[],
expected_response
=
[
"AI"
],
)
# Each Deployment Graph contains
# the dynamo serve module and configuration as well
# as the endpoint for interaction
deployment_graphs
=
{
"agg-tp-1-dp-1"
:
(
DeploymentGraph
(
module
=
"graphs.agg:Frontend"
,
config
=
"/workspace/tests/fault_tolerance/configs/agg_tp_1_dp_1.yaml"
,
directory
=
"/workspace/examples/llm"
,
endpoints
=
[
"v1/chat/completions"
],
response_handlers
=
[
chat_completions_response_handler
],
marks
=
[
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
vllm
],
),
text_payload
,
),
"agg-tp-1-dp-8"
:
(
DeploymentGraph
(
module
=
"graphs.agg:Frontend"
,
config
=
"/workspace/tests/fault_tolerance/configs/agg_tp_1_dp_8.yaml"
,
directory
=
"/workspace/examples/llm"
,
endpoints
=
[
"v1/chat/completions"
],
response_handlers
=
[
chat_completions_response_handler
],
marks
=
[
pytest
.
mark
.
gpu_8
,
pytest
.
mark
.
vllm
],
),
text_payload
,
),
"agg-tp-1-dp-4"
:
(
DeploymentGraph
(
module
=
"graphs.agg:Frontend"
,
config
=
"/workspace/tests/fault_tolerance/configs/agg_tp_1_dp_4.yaml"
,
directory
=
"/workspace/examples/llm"
,
endpoints
=
[
"v1/chat/completions"
],
response_handlers
=
[
chat_completions_response_handler
],
marks
=
[
pytest
.
mark
.
gpu_4
,
pytest
.
mark
.
vllm
],
),
text_payload
,
),
"agg-tp-2-dp-1"
:
(
DeploymentGraph
(
module
=
"graphs.agg:Frontend"
,
config
=
"/workspace/tests/fault_tolerance/configs/agg_tp_2_dp_1.yaml"
,
directory
=
"/workspace/examples/llm"
,
endpoints
=
[
"v1/chat/completions"
],
response_handlers
=
[
chat_completions_response_handler
],
marks
=
[
pytest
.
mark
.
gpu_2
,
pytest
.
mark
.
vllm
],
),
text_payload
,
),
"agg-tp-2-dp-2"
:
(
DeploymentGraph
(
module
=
"graphs.agg:Frontend"
,
config
=
"/workspace/tests/fault_tolerance/configs/agg_tp_2_dp_2.yaml"
,
directory
=
"/workspace/examples/llm"
,
endpoints
=
[
"v1/chat/completions"
],
response_handlers
=
[
chat_completions_response_handler
],
marks
=
[
pytest
.
mark
.
gpu_4
,
pytest
.
mark
.
vllm
],
),
text_payload
,
),
"agg-tp-2-dp-4"
:
(
DeploymentGraph
(
module
=
"graphs.agg:Frontend"
,
config
=
"/workspace/tests/fault_tolerance/configs/agg_tp_2_dp_4.yaml"
,
directory
=
"/workspace/examples/llm"
,
endpoints
=
[
"v1/chat/completions"
],
response_handlers
=
[
chat_completions_response_handler
],
marks
=
[
pytest
.
mark
.
gpu_8
,
pytest
.
mark
.
vllm
],
),
text_payload
,
),
"disagg-p-tp-1-dp-1-d-tp-1-dp-1"
:
(
DeploymentGraph
(
module
=
"graphs.disagg:Frontend"
,
config
=
"/workspace/tests/fault_tolerance/configs/disagg_p_tp_1_dp_1_d_tp_1_dp_1.yaml"
,
directory
=
"/workspace/examples/llm"
,
endpoints
=
[
"v1/chat/completions"
],
response_handlers
=
[
chat_completions_response_handler
],
marks
=
[
pytest
.
mark
.
gpu_2
,
pytest
.
mark
.
vllm
],
),
text_payload
,
),
"disagg-p-tp-1-dp-4-d-tp-4-dp-1"
:
(
DeploymentGraph
(
module
=
"graphs.disagg:Frontend"
,
config
=
"/workspace/tests/fault_tolerance/configs/disagg_p_tp_1_dp_4_d_tp_4_dp_1.yaml"
,
directory
=
"/workspace/examples/llm"
,
endpoints
=
[
"v1/chat/completions"
],
response_handlers
=
[
chat_completions_response_handler
],
marks
=
[
pytest
.
mark
.
gpu_8
,
pytest
.
mark
.
vllm
],
),
text_payload
,
),
"disagg-p-tp-2-dp-2-d-tp-4-dp-1"
:
(
DeploymentGraph
(
module
=
"graphs.disagg:Frontend"
,
config
=
"/workspace/tests/fault_tolerance/configs/disagg_p_tp_2_dp_2_d_tp_4_dp_1.yaml"
,
directory
=
"/workspace/examples/llm"
,
endpoints
=
[
"v1/chat/completions"
],
response_handlers
=
[
chat_completions_response_handler
],
marks
=
[
pytest
.
mark
.
gpu_8
,
pytest
.
mark
.
vllm
],
),
text_payload
,
),
"disagg-p-tp-2-dp-1-d-tp-4-dp-1"
:
(
DeploymentGraph
(
module
=
"graphs.disagg:Frontend"
,
config
=
"/workspace/tests/fault_tolerance/configs/disagg_p_tp_2_dp_1_d_tp_4_dp_1.yaml"
,
directory
=
"/workspace/examples/llm"
,
endpoints
=
[
"v1/chat/completions"
],
response_handlers
=
[
chat_completions_response_handler
],
marks
=
[
pytest
.
mark
.
gpu_8
,
pytest
.
mark
.
vllm
],
),
text_payload
,
),
"disagg-p-tp-1-dp-2-d-tp-2-dp-1"
:
(
DeploymentGraph
(
module
=
"graphs.disagg:Frontend"
,
config
=
"/workspace/tests/fault_tolerance/configs/disagg_p_tp_1_dp_2_d_tp_2_dp_1.yaml"
,
directory
=
"/workspace/examples/llm"
,
endpoints
=
[
"v1/chat/completions"
],
response_handlers
=
[
chat_completions_response_handler
],
marks
=
[
pytest
.
mark
.
gpu_4
,
pytest
.
mark
.
vllm
],
),
text_payload
,
),
"disagg-p-tp-1-dp-1-d-tp-2-dp-1"
:
(
DeploymentGraph
(
module
=
"graphs.disagg:Frontend"
,
config
=
"/workspace/tests/fault_tolerance/configs/disagg_p_tp_1_dp_1_d_tp_2_dp_1.yaml"
,
directory
=
"/workspace/examples/llm"
,
endpoints
=
[
"v1/chat/completions"
],
response_handlers
=
[
chat_completions_response_handler
],
marks
=
[
pytest
.
mark
.
gpu_4
,
pytest
.
mark
.
vllm
],
),
text_payload
,
),
}
# Each failure scenaro contains a list of failure injections
# Each failure injection has a time in seconds after the pervious injection and
# a list of failures to inject including the number of failures for each type.
# Failures are currently process termination.
#
# Example:
#
# "prefill_worker": [[30, [("dynamo_prefillworker", 1)]]],
#
# terminates 1 prefill worker after 30 seconds
failure_scenarios
=
{
"decode_worker"
:
[[
30
,
[(
"dynamo_vllmworker"
,
1
)]]],
"prefill_worker"
:
[[
30
,
[(
"dynamo_prefillworker"
,
1
)]]],
"frontend"
:
[[
30
,
[(
"dynamo_frontend"
,
1
)]]],
"processor"
:
[[
30
,
[(
"dynamo_processor"
,
1
)]]],
"vllm_worker"
:
[[
30
,
[(
"vllm_worker"
,
1
)]]],
"none"
:
[],
}
@
pytest
.
fixture
(
params
=
list
(
failure_scenarios
.
keys
()))
def
failures
(
request
):
return
failure_scenarios
[
request
.
param
]
@
pytest
.
fixture
(
params
=
list
(
deployment_graphs
.
keys
()))
def
deployment_graph_test
(
request
):
"""
Fixture that provides different deployment graph test configurations.
"""
return
deployment_graphs
[
request
.
param
]
tests/fault_tolerance/test_runner.py
deleted
100644 → 0
View file @
fdcf611f
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
logging
import
os
import
time
from
contextlib
import
contextmanager
from
multiprocessing
import
Process
import
psutil
import
pytest
from
tests.fault_tolerance.client
import
client
from
tests.fault_tolerance.parse_results
import
main
as
parse_results
from
tests.fault_tolerance.scenarios
import
(
# noqa: F401
deployment_graph_test
,
failures
,
)
from
tests.fault_tolerance.utils.circus_controller
import
CircusController
from
tests.fault_tolerance.utils.metrics
import
nvidia_smi
# noqa: F401
from
tests.fault_tolerance.utils.metrics
import
worker_metrics
# noqa: F401
from
tests.serve.test_dynamo_serve
import
DynamoServeProcess
from
tests.utils.managed_process
import
terminate_process_tree
def
_set_deployment_args
(
request
,
max_num_seqs
):
decode_worker_name
=
"VllmWorker"
args
=
{}
if
max_num_seqs
is
not
None
:
args
[
f
"--
{
decode_worker_name
}
.max_num_seqs"
]
=
max_num_seqs
return
args
def
_list_vllm_worker_processes
():
processes
=
[]
for
ps_process
in
psutil
.
process_iter
([
"name"
,
"cmdline"
]):
try
:
if
"from multiprocessing.spawn import spawn_main;"
in
" "
.
join
(
ps_process
.
cmdline
()
):
processes
.
append
(
ps_process
.
pid
)
except
Exception
:
pass
return
processes
@
contextmanager
def
_clients
(
logger
,
num_clients
,
request
,
deployment_graph
,
server_process
,
payload
,
requests_per_client
,
input_token_length
,
output_token_length
,
max_retries
,
):
procs
=
[]
for
i
in
range
(
num_clients
):
procs
.
append
(
Process
(
target
=
client
,
args
=
(
deployment_graph
,
server_process
,
payload
,
request
.
node
.
name
,
i
,
requests_per_client
,
input_token_length
,
output_token_length
,
max_retries
,
),
)
)
procs
[
-
1
].
start
()
yield
procs
for
proc
in
procs
:
logger
.
debug
(
f
"
{
proc
}
waiting for join"
)
proc
.
join
()
logger
.
debug
(
f
"
{
proc
}
joined"
)
def
_inject_failures
(
failures
,
logger
):
# noqa: F811
circus_controller
=
CircusController
.
from_state_file
(
"dynamo"
)
for
failure_time
,
component
in
failures
:
time
.
sleep
(
failure_time
)
for
component_name
,
number
in
component
:
logger
.
info
(
f
"Injecting failure for:
{
component_name
}
"
)
if
"dynamo"
in
component_name
:
result
=
circus_controller
.
client
.
call
(
{
"command"
:
"list"
,
"properties"
:
{
"name"
:
f
"
{
component_name
}
"
}}
)
if
result
[
"status"
]
==
"error"
:
logger
.
warning
(
f
"component
{
component_name
}
not found
{
result
}
"
)
continue
num_processes
=
len
(
result
[
"pids"
])
if
number
is
None
:
number
=
num_processes
for
x
in
range
(
number
):
pid
=
result
[
"pids"
][
x
%
num_processes
]
logger
.
info
(
f
"Terminating
{
component_name
}
Pid
{
pid
}
"
)
terminate_process_tree
(
pid
,
logger
,
immediate_kill
=
True
)
elif
"vllm"
in
component_name
:
vllm_processes
=
_list_vllm_worker_processes
()
num_processes
=
len
(
vllm_processes
)
if
number
is
None
:
number
=
len
(
vllm_processes
)
for
x
in
range
(
number
):
pid
=
vllm_processes
[
x
%
num_processes
]
terminate_process_tree
(
pid
,
logger
,
immediate_kill
=
True
)
circus_controller
.
close
()
global_result_list
=
[]
@
pytest
.
fixture
(
autouse
=
True
)
def
results_table
(
request
):
yield
parse_results
(
logs_dir
=
None
,
log_paths
=
[
request
.
node
.
name
],
tablefmt
=
"fancy"
)
global_result_list
.
append
(
request
.
node
.
name
)
@
pytest
.
fixture
(
autouse
=
True
,
scope
=
"session"
)
def
results_summary
():
yield
parse_results
(
logs_dir
=
None
,
log_paths
=
global_result_list
,
tablefmt
=
"fancy"
)
@
pytest
.
mark
.
e2e
@
pytest
.
mark
.
slow
def
test_worker_failure
(
deployment_graph_test
,
# noqa: F811
request
,
runtime_services
,
num_clients
,
requests_per_client
,
worker_metrics
,
# noqa: F811
respawn
,
failures
,
# noqa: F811
input_token_length
,
output_token_length
,
max_num_seqs
,
max_retries
,
display_dynamo_output
,
nvidia_smi
,
# noqa: F811
separate_process_logs
,
hf_hub_offline
,
):
"""
Test dynamo serve deployments with injected failures
"""
# runtime_services is used to start nats and etcd
logger
=
logging
.
getLogger
(
request
.
node
.
name
)
logger
.
info
(
"Starting test_deployment"
)
deployment_graph
,
payload
=
deployment_graph_test
if
hf_hub_offline
:
os
.
environ
[
"HF_HUB_OFFLINE"
]
=
"1"
else
:
if
"HF_HUB_OFFLINE"
in
os
.
environ
:
del
os
.
environ
[
"HF_HUB_OFFLINE"
]
if
respawn
:
os
.
environ
[
"DYN_CIRCUS_RESPAWN"
]
=
"1"
else
:
if
"DYN_CIRCUS_RESPAWN"
in
os
.
environ
:
del
os
.
environ
[
"DYN_CIRCUS_RESPAWN"
]
if
separate_process_logs
:
os
.
environ
[
"DYN_CIRCUS_LOG_DIR"
]
=
os
.
path
.
abspath
(
request
.
node
.
name
)
deployment_args
=
_set_deployment_args
(
request
,
max_num_seqs
)
with
DynamoServeProcess
(
deployment_graph
,
request
,
display_output
=
display_dynamo_output
,
args
=
deployment_args
,
)
as
server_process
:
server_process
.
wait_for_ready
(
payload
)
with
_clients
(
logger
,
num_clients
,
request
,
deployment_graph
,
server_process
,
payload
,
requests_per_client
,
input_token_length
,
output_token_length
,
max_retries
,
):
_inject_failures
(
failures
,
logger
)
tests/fault_tolerance/utils/circus_controller.py
deleted
100644 → 0
View file @
fdcf611f
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
json
import
logging
import
os
from
pathlib
import
Path
from
typing
import
List
,
Optional
from
circus.client
import
CircusClient
from
circus.exc
import
CallError
logger
=
logging
.
getLogger
(
__name__
)
class
CircusController
:
"""A circus client implementation for Dynamo"""
def
__init__
(
self
,
endpoint
:
str
):
"""Initialize connection to arbiter.
Args:
endpoint: The circus endpoint (e.g., tcp://127.0.0.1:54927)
"""
self
.
endpoint
=
endpoint
self
.
client
=
CircusClient
(
endpoint
=
endpoint
,
timeout
=
15.0
)
@
classmethod
def
from_state_file
(
cls
,
namespace
:
str
)
->
"CircusController"
:
"""
Create a CircusController from a Dynamo state file.
Args:
namespace: The Dynamo namespace
Returns:
CircusController instance
Raises:
FileNotFoundError: If state file doesn't exist
ValueError: If no endpoint found in state file
"""
state_file
=
(
Path
(
os
.
environ
.
get
(
"DYN_LOCAL_STATE_DIR"
,
Path
.
home
()
/
".dynamo"
/
"state"
)
)
/
f
"
{
namespace
}
.json"
)
if
not
state_file
.
exists
():
raise
FileNotFoundError
(
f
"State file not found:
{
state_file
}
"
)
with
open
(
state_file
,
"r"
)
as
f
:
state
=
json
.
load
(
f
)
endpoint
=
state
.
get
(
"circus_endpoint"
)
if
not
endpoint
:
raise
ValueError
(
f
"No endpoint found in state file:
{
state_file
}
"
)
return
cls
(
endpoint
)
async
def
_get_watcher_processes
(
self
,
name
:
str
)
->
Optional
[
int
]:
"""
Get number of processes for a watcher.
Args:
name: The name of the watcher
Returns:
Number of processes for the watcher. Returns None operation fails.
"""
try
:
response
=
self
.
client
.
send_message
(
"numprocesses"
,
name
=
name
)
return
int
(
response
.
get
(
"numprocesses"
,
0
))
except
(
CallError
,
Exception
)
as
e
:
logger
.
error
(
f
"Failed to get process count for
{
name
}
:
{
e
}
"
)
return
None
async
def
_list_watchers
(
self
)
->
List
[
str
]:
"""
List all watchers managed by circus.
Returns:
List of watcher names. Returns None if the list operation fails.
"""
try
:
response
=
self
.
client
.
send_message
(
"list"
)
return
response
.
get
(
"watchers"
,
[])
except
(
CallError
,
Exception
)
as
e
:
logger
.
error
(
f
"Failed to list watchers:
{
e
}
"
)
return
[]
def
close
(
self
)
->
None
:
"""Close the connection to the arbiter."""
if
hasattr
(
self
,
"client"
):
self
.
client
.
stop
()
tests/fault_tolerance/utils/metrics.py
deleted
100644 → 0
View file @
fdcf611f
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
asyncio
import
json
import
os
from
datetime
import
datetime
from
multiprocessing
import
Process
import
psutil
import
pytest
from
dynamo.runtime
import
dynamo_worker
from
tests.fault_tolerance.utils.circus_controller
import
CircusController
from
tests.utils.managed_process
import
ManagedProcess
def
run_metrics_process
(
log_dir
):
asyncio
.
run
(
get_metrics
(
log_dir
))
@
dynamo_worker
()
async
def
get_metrics
(
runtime
,
log_dir
):
# Log # processes
# Log # metrics per vllm worker
circus_controller
=
None
pipeline
=
None
log_path
=
os
.
path
.
join
(
log_dir
,
"watcher.log.txt"
)
with
open
(
log_path
,
"w"
)
as
log
:
while
True
:
try
:
await
asyncio
.
sleep
(
0.5
)
if
not
circus_controller
:
circus_controller
=
CircusController
.
from_state_file
(
"dynamo"
)
if
not
pipeline
:
pipeline
=
(
await
runtime
.
namespace
(
"dynamo"
)
.
component
(
"VllmWorker"
)
.
endpoint
(
"load_metrics"
)
.
client
()
)
watchers
=
[]
for
x
in
await
circus_controller
.
_list_watchers
():
result
=
circus_controller
.
client
.
call
(
{
"command"
:
"list"
,
"properties"
:
{
"name"
:
f
"
{
x
}
"
}}
)
watchers
.
append
((
x
,
result
))
metrics
=
[]
for
x
in
pipeline
.
instance_ids
():
async
for
worker_metric
in
await
pipeline
.
direct
(
None
,
x
):
metrics
.
append
((
x
,
worker_metric
.
data
()))
vllm_processes
=
[]
for
ps_process
in
psutil
.
process_iter
([
"name"
,
"cmdline"
]):
try
:
if
"from multiprocessing.spawn import spawn_main;"
in
" "
.
join
(
ps_process
.
cmdline
()
):
vllm_processes
.
append
(
ps_process
.
pid
)
except
(
psutil
.
NoSuchProcess
,
psutil
.
AccessDenied
):
# Process may have terminated or become inaccessible during iteration
pass
record
=
{
"time"
:
datetime
.
now
().
strftime
(
"%Y-%m-%dT%H:%M:%S"
),
"watchers"
:
watchers
,
"metrics"
:
metrics
,
"vllm_processes"
:
vllm_processes
,
}
log
.
write
(
json
.
dumps
(
record
)
+
"
\n
"
)
log
.
flush
()
except
Exception
as
e
:
record
=
{
"time"
:
datetime
.
now
().
strftime
(
"%Y-%m-%dT%H:%M:%S"
),
"watchers"
:
[],
"metrics"
:
[],
"vllm_processes"
:
[],
"error"
:
str
(
e
),
}
log
.
write
(
json
.
dumps
(
record
)
+
"
\n
"
)
log
.
flush
()
@
pytest
.
fixture
def
worker_metrics
(
request
):
process
=
Process
(
target
=
run_metrics_process
,
args
=
(
request
.
node
.
name
,))
process
.
start
()
yield
process
.
kill
()
class
NvidiaSMI
(
ManagedProcess
):
def
__init__
(
self
,
request
):
super
().
__init__
(
command
=
[
"nvidia-smi"
,
"dmon"
,
"--select=puc"
,
],
health_check_ports
=
[],
terminate_existing
=
True
,
display_output
=
False
,
data_dir
=
None
,
log_dir
=
request
.
node
.
name
,
)
@
pytest
.
fixture
def
nvidia_smi
(
request
):
with
NvidiaSMI
(
request
)
as
nvidia_smi_process
:
yield
nvidia_smi_process
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment