Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
d3d49562
Unverified
Commit
d3d49562
authored
Mar 13, 2025
by
Liangfu Chen
Committed by
GitHub
Mar 13, 2025
Browse files
[Neuron] flatten test parameterization for neuron attention kernels (#14712)
parent
4059adc3
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
26 additions
and
22 deletions
+26
-22
.buildkite/run-neuron-test.sh
.buildkite/run-neuron-test.sh
+1
-1
tests/neuron/1_core/test_activation.py
tests/neuron/1_core/test_activation.py
+0
-0
tests/neuron/1_core/test_block_table.py
tests/neuron/1_core/test_block_table.py
+0
-0
tests/neuron/1_core/test_cache.py
tests/neuron/1_core/test_cache.py
+0
-0
tests/neuron/1_core/test_layernorm.py
tests/neuron/1_core/test_layernorm.py
+0
-0
tests/neuron/1_core/test_logits_processor.py
tests/neuron/1_core/test_logits_processor.py
+0
-0
tests/neuron/1_core/test_prefix_prefill.py
tests/neuron/1_core/test_prefix_prefill.py
+25
-21
tests/neuron/1_core/test_rotary_embedding.py
tests/neuron/1_core/test_rotary_embedding.py
+0
-0
tests/neuron/2_core/test_comm_ops.py
tests/neuron/2_core/test_comm_ops.py
+0
-0
No files found.
.buildkite/run-neuron-test.sh
View file @
d3d49562
...
...
@@ -51,4 +51,4 @@ docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
-e
"NEURON_COMPILE_CACHE_URL=
${
NEURON_COMPILE_CACHE_MOUNT
}
"
\
--name
"
${
container_name
}
"
\
${
image_name
}
\
/bin/bash
-c
"python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/ -v --capture=tee-sys"
/bin/bash
-c
"python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/
1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/
-v --capture=tee-sys"
tests/neuron/test_activation.py
→
tests/neuron/
1_core/
test_activation.py
View file @
d3d49562
File moved
tests/neuron/test_block_table.py
→
tests/neuron/
1_core/
test_block_table.py
View file @
d3d49562
File moved
tests/neuron/test_cache.py
→
tests/neuron/
1_core/
test_cache.py
View file @
d3d49562
File moved
tests/neuron/test_layernorm.py
→
tests/neuron/
1_core/
test_layernorm.py
View file @
d3d49562
File moved
tests/neuron/test_logits_processor.py
→
tests/neuron/
1_core/
test_logits_processor.py
View file @
d3d49562
File moved
tests/neuron/test_prefix_prefill.py
→
tests/neuron/
1_core/
test_prefix_prefill.py
View file @
d3d49562
...
...
@@ -292,28 +292,32 @@ def get_active_block_tables(block_tables, query_lens, seq_lens, block_size,
@
pytest
.
mark
.
parametrize
(
"prefill_batch_size,decode_batch_size,block_size,large_tile_size"
,
"prefill_batch_size,decode_batch_size,block_size,large_tile_size
,num_heads,num_queries_per_kv,head_size,mixed_precision
"
,
[
(
1
,
199
,
1
,
512
),
# 512 blocks
(
4
,
12
,
256
,
2048
),
# 128 blocks
(
4
,
12
,
16
,
2048
),
# 128 blocks
(
4
,
12
,
4
,
1024
),
# 256 blocks
(
4
,
12
,
32
,
2048
),
# 64 blocks
(
4
,
12
,
32
,
4096
),
# 128 blocks
(
4
,
12
,
32
,
8192
),
# 256 blocks
(
4
,
12
,
64
,
8192
),
# 128 blocks
],
)
@
pytest
.
mark
.
parametrize
(
"num_heads,num_queries_per_kv,head_size"
,
[
(
4
,
2
,
8
),
(
32
,
8
,
64
),
(
4
,
4
,
128
),
(
8
,
1
,
32
),
],
)
@
pytest
.
mark
.
parametrize
(
"mixed_precision"
,
[
True
,
False
])
# Test minimal configurations (small block size)
(
1
,
199
,
1
,
512
,
4
,
2
,
8
,
False
),
# minimal block size, small dimensions
(
1
,
199
,
1
,
512
,
4
,
2
,
8
,
True
),
# same with mixed precision
# Test common/medium configurations
(
4
,
12
,
32
,
2048
,
32
,
8
,
64
,
False
),
# common case, larger heads
(
4
,
12
,
32
,
2048
,
16
,
4
,
32
,
True
),
# medium size, mixed precision, grouped-query attention (GQA)
# Test large configurations
(
4
,
12
,
256
,
8192
,
8
,
1
,
128
,
False
),
# large blocks, large head size
(
4
,
12
,
256
,
8192
,
64
,
8
,
64
,
True
),
# large blocks, many heads
# Test asymmetric configurations
(
2
,
24
,
64
,
4096
,
12
,
4
,
96
,
False
),
# varied batch sizes
(
8
,
8
,
128
,
2048
,
24
,
2
,
48
,
True
),
# balanced batches
# Test edge cases
(
1
,
128
,
16
,
1024
,
4
,
2
,
16
,
False
),
# large decode batch
(
16
,
4
,
8
,
8192
,
48
,
1
,
128
,
True
),
# large prefill batch
(
4
,
12
,
32
,
2048
,
16
,
1
,
32
,
True
),
# multi-head attention (MHA)
(
4
,
12
,
32
,
2048
,
16
,
16
,
32
,
True
),
# multi-query attention (MQA)
])
@
torch
.
inference_mode
()
def
test_contexted_kv_attention
(
prefill_batch_size
:
int
,
...
...
tests/neuron/test_rotary_embedding.py
→
tests/neuron/
1_core/
test_rotary_embedding.py
View file @
d3d49562
File moved
tests/neuron/test_comm_ops.py
→
tests/neuron/
2_core/
test_comm_ops.py
View file @
d3d49562
File moved
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment