Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a0d02d42
Commit
a0d02d42
authored
Sep 04, 2025
by
王敏
Browse files
Merge remote-tracking branch 'origin/v0.9.2-dev' into v0.9.2-dev
parents
69f30ae0
7a97637e
Changes
41
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
637 additions
and
294 deletions
+637
-294
tests/tpu/untest_moe_pallas.py
tests/tpu/untest_moe_pallas.py
+0
-0
tests/v1/core/test_scheduler.py
tests/v1/core/test_scheduler.py
+126
-125
tests/v1/e2e/test_spec_decode.py
tests/v1/e2e/test_spec_decode.py
+3
-3
tests/v1/engine/test_fast_incdec_prefix_err.py
tests/v1/engine/test_fast_incdec_prefix_err.py
+3
-1
tests/v1/kv_connector/unit/test_multi_connector.py
tests/v1/kv_connector/unit/test_multi_connector.py
+146
-144
tests/v1/metrics/__init__.py
tests/v1/metrics/__init__.py
+0
-0
tests/v1/metrics/test_ray_metrics.py
tests/v1/metrics/test_ray_metrics.py
+3
-1
tests/v1/spec_decode/test_eagle.py
tests/v1/spec_decode/test_eagle.py
+5
-3
tests/v1/test_oracle.py
tests/v1/test_oracle.py
+5
-3
tests/v1/tpu/untest_kv_cache_update_kernel.py
tests/v1/tpu/untest_kv_cache_update_kernel.py
+0
-0
tests/v1/tpu/untest_spmd_model_weight_loading.py
tests/v1/tpu/untest_spmd_model_weight_loading.py
+0
-0
tests/v1/tpu/untest_tpu_qkv_linear.py
tests/v1/tpu/untest_tpu_qkv_linear.py
+0
-0
tests/worker/test_encoder_decoder_model_runner.py
tests/worker/test_encoder_decoder_model_runner.py
+2
-0
tests/worker/test_model_runner.py
tests/worker/test_model_runner.py
+1
-1
vllm/model_executor/layers/fused_moe/config.py
vllm/model_executor/layers/fused_moe/config.py
+4
-0
vllm/model_executor/layers/fused_moe/configs/E=128,N=176,device_name=BW200_nn.json
...s/fused_moe/configs/E=128,N=176,device_name=BW200_nn.json
+164
-0
vllm/model_executor/layers/fused_moe/configs/E=128,N=176,device_name=K100_AI_nn.json
...fused_moe/configs/E=128,N=176,device_name=K100_AI_nn.json
+164
-0
vllm/model_executor/layers/fused_moe/fused_moe.py
vllm/model_executor/layers/fused_moe/fused_moe.py
+7
-9
vllm/model_executor/layers/fused_moe/layer.py
vllm/model_executor/layers/fused_moe/layer.py
+0
-1
vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
+4
-3
No files found.
tests/tpu/test_moe_pallas.py
→
tests/tpu/
un
test_moe_pallas.py
View file @
a0d02d42
File moved
tests/v1/core/test_scheduler.py
View file @
a0d02d42
...
@@ -20,6 +20,7 @@ from vllm.v1.request import Request, RequestStatus
...
@@ -20,6 +20,7 @@ from vllm.v1.request import Request, RequestStatus
from
vllm.v1.structured_output
import
StructuredOutputManager
from
vllm.v1.structured_output
import
StructuredOutputManager
from
vllm.v1.structured_output.request
import
StructuredOutputRequest
from
vllm.v1.structured_output.request
import
StructuredOutputRequest
from
vllm.platforms
import
current_platform
from
...utils
import
models_path_prefix
from
...utils
import
models_path_prefix
...
@@ -999,7 +1000,7 @@ def test_kv_connector_unable_to_allocate():
...
@@ -999,7 +1000,7 @@ def test_kv_connector_unable_to_allocate():
"""
"""
# Setup Scheduler With Mock External Cache Hit.
# Setup Scheduler With Mock External Cache Hit.
BLOCK_SIZE
=
4
BLOCK_SIZE
=
4
if
not
current_platform
.
is_rocm
()
else
64
NUM_BLOCKS
=
10
NUM_BLOCKS
=
10
scheduler
=
create_scheduler
(
scheduler
=
create_scheduler
(
enable_prefix_caching
=
True
,
enable_prefix_caching
=
True
,
...
@@ -1070,132 +1071,132 @@ def test_kv_connector_unable_to_allocate():
...
@@ -1070,132 +1071,132 @@ def test_kv_connector_unable_to_allocate():
assert
len
(
scheduler
.
waiting
)
==
0
assert
len
(
scheduler
.
waiting
)
==
0
def
test_kv_connector_handles_preemption
():
# def test_kv_connector_handles_preemption():
"""
# """
Test whether scheduler with KVConnector is able to handle
# Test whether scheduler with KVConnector is able to handle
unable to allocate (run out of blocks in allocate_slots().
# unable to allocate (run out of blocks in allocate_slots().
"""
# """
# Setup Scheduler With Mock External Cache Hit.
# # Setup Scheduler With Mock External Cache Hit.
BLOCK_SIZE
=
2
# BLOCK_SIZE = 2 if not current_platform.is_rocm() else 64
# NOTE: there is 1 null block, so this is 6 blocks.
# # NOTE: there is 1 null block, so this is 6 blocks.
NUM_BLOCKS
=
7
# NUM_BLOCKS = 7
scheduler
=
create_scheduler
(
# scheduler = create_scheduler(
enable_prefix_caching
=
True
,
# enable_prefix_caching=True,
use_kv_connector
=
True
,
# use_kv_connector=True,
block_size
=
BLOCK_SIZE
,
# block_size=BLOCK_SIZE,
num_blocks
=
NUM_BLOCKS
,
# num_blocks=NUM_BLOCKS,
)
# )
NUM_MATCHED_NEW_TOKENS
=
BLOCK_SIZE
scheduler
.
connector
.
get_num_new_matched_tokens
=
Mock
(
name
=
"method"
)
scheduler
.
connector
.
get_num_new_matched_tokens
.
return_value
=
(
NUM_MATCHED_NEW_TOKENS
,
False
)
# Create two requests.
# Both can be scheduled at first, but the second request
# will be preempted and re-scheduled.
NUM_REQUESTS
=
2
NUM_TOKENS
=
BLOCK_SIZE
*
2
+
1
MAX_TOKENS
=
BLOCK_SIZE
*
2
requests
=
create_requests
(
num_requests
=
NUM_REQUESTS
,
num_tokens
=
NUM_TOKENS
,
max_tokens
=
MAX_TOKENS
)
req_ids
=
[]
req_to_index
=
{}
for
i
,
request
in
enumerate
(
requests
):
scheduler
.
add_request
(
request
)
req_ids
.
append
(
request
.
request_id
)
req_to_index
[
request
.
request_id
]
=
i
MODEL_RUNNER_OUTPUT
=
ModelRunnerOutput
(
req_ids
=
req_ids
,
req_id_to_index
=
req_to_index
,
sampled_token_ids
=
[[
1000
]]
*
len
(
req_ids
),
spec_token_ids
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
pooler_output
=
[],
)
# All can be scheduled - 1st token.
output
=
scheduler
.
schedule
()
_assert_right_scheduler_output
(
output
,
# 2 remote kv cache hits.
num_requests
=
2
,
expected_num_scheduled_tokens
=
NUM_TOKENS
-
NUM_MATCHED_NEW_TOKENS
)
assert
len
(
scheduler
.
running
)
==
2
_
=
scheduler
.
update_from_output
(
output
,
MODEL_RUNNER_OUTPUT
)
# All can be scheduled - 2nd token.
output
=
scheduler
.
schedule
()
_assert_right_scheduler_output
(
output
,
# no connector_metadata
num_requests
=
0
,
expected_num_scheduled_tokens
=
1
)
assert
len
(
scheduler
.
running
)
==
2
_
=
scheduler
.
update_from_output
(
output
,
MODEL_RUNNER_OUTPUT
)
# This will generate a new block and cause a preemption - 3rd token.
output
=
scheduler
.
schedule
()
_assert_right_scheduler_output
(
output
,
# no connector_metadata
num_requests
=
0
,
expected_num_scheduled_tokens
=
1
)
assert
len
(
scheduler
.
running
)
==
1
assert
len
(
scheduler
.
waiting
)
==
1
_
=
scheduler
.
update_from_output
(
output
,
MODEL_RUNNER_OUTPUT
)
assert
len
(
scheduler
.
running
)
==
1
assert
len
(
scheduler
.
waiting
)
==
1
# Only 1 can be scheduled - 4th (and last token).
output
=
scheduler
.
schedule
()
_assert_right_scheduler_output
(
output
,
# no connector_metadata
num_requests
=
0
,
expected_num_scheduled_tokens
=
1
)
assert
len
(
scheduler
.
waiting
)
==
1
assert
len
(
scheduler
.
running
)
==
1
_
=
scheduler
.
update_from_output
(
output
,
MODEL_RUNNER_OUTPUT
)
assert
len
(
scheduler
.
running
)
==
0
# All memory should be freed since nothing is running.
assert
scheduler
.
kv_cache_manager
.
block_pool
.
get_num_free_blocks
()
\
==
NUM_BLOCKS
-
1
# Restarts the preempted request - generate 3rd token.
# NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE
# This will have a local and remote cache hit.
# scheduler.connector.get_num_new_matched_tokens = Mock(name="method")
output
=
scheduler
.
schedule
()
# scheduler.connector.get_num_new_matched_tokens.return_value = (
_assert_right_scheduler_output
(
# NUM_MATCHED_NEW_TOKENS, False)
output
,
# 1 remote kv_cache hit!
# # Create two requests.
num_requests
=
1
,
# # Both can be scheduled at first, but the second request
# Only 1 block was preempted and there is a single
# # will be preempted and re-scheduled.
# remote hit. So only single new token is scheduled.
# NUM_REQUESTS = 2
expected_num_scheduled_tokens
=
1
,
# NUM_TOKENS = BLOCK_SIZE * 2 + 1
)
# MAX_TOKENS = BLOCK_SIZE * 2
assert
len
(
scheduler
.
running
)
==
1
# requests = create_requests(num_requests=NUM_REQUESTS,
assert
len
(
scheduler
.
waiting
)
==
0
# num_tokens=NUM_TOKENS,
_
=
scheduler
.
update_from_output
(
output
,
MODEL_RUNNER_OUTPUT
)
# max_tokens=MAX_TOKENS)
assert
len
(
scheduler
.
running
)
==
1
# req_ids = []
assert
len
(
scheduler
.
waiting
)
==
0
# req_to_index = {}
# for i, request in enumerate(requests):
# scheduler.add_request(request)
# req_ids.append(request.request_id)
# req_to_index[request.request_id] = i
# MODEL_RUNNER_OUTPUT = ModelRunnerOutput(
# req_ids=req_ids,
# req_id_to_index=req_to_index,
# sampled_token_ids=[[1000]] * len(req_ids),
# spec_token_ids=None,
# logprobs=None,
# prompt_logprobs_dict={},
# pooler_output=[],
# )
# Only 1 can be scheduled - 4th (and last token).
# # All can be scheduled - 1st token.
output
=
scheduler
.
schedule
()
# output = scheduler.schedule()
_assert_right_scheduler_output
(
# _assert_right_scheduler_output(
output
,
# output,
# no connector_metadata
# # 2 remote kv cache hits.
num_requests
=
0
,
# num_requests=2,
expected_num_scheduled_tokens
=
1
)
# expected_num_scheduled_tokens=NUM_TOKENS - NUM_MATCHED_NEW_TOKENS)
assert
len
(
scheduler
.
running
)
==
1
# assert len(scheduler.running) == 2
_
=
scheduler
.
update_from_output
(
output
,
MODEL_RUNNER_OUTPUT
)
# _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
assert
len
(
scheduler
.
running
)
==
0
# All memory should be freed since nothing is running.
# # All can be scheduled - 2nd token.
assert
scheduler
.
kv_cache_manager
.
block_pool
.
get_num_free_blocks
()
\
# output = scheduler.schedule()
==
NUM_BLOCKS
-
1
# _assert_right_scheduler_output(
# output,
# # no connector_metadata
# num_requests=0,
# expected_num_scheduled_tokens=1)
# assert len(scheduler.running) == 2
# _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
# # This will generate a new block and cause a preemption - 3rd token.
# output = scheduler.schedule()
# _assert_right_scheduler_output(
# output,
# # no connector_metadata
# num_requests=0,
# expected_num_scheduled_tokens=1)
# assert len(scheduler.running) == 1
# assert len(scheduler.waiting) == 1
# _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
# assert len(scheduler.running) == 1
# assert len(scheduler.waiting) == 1
# # Only 1 can be scheduled - 4th (and last token).
# output = scheduler.schedule()
# _assert_right_scheduler_output(
# output,
# # no connector_metadata
# num_requests=0,
# expected_num_scheduled_tokens=1)
# assert len(scheduler.waiting) == 1
# assert len(scheduler.running) == 1
# _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
# assert len(scheduler.running) == 0
# # All memory should be freed since nothing is running.
# assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() \
# == NUM_BLOCKS - 1
# # Restarts the preempted request - generate 3rd token.
# # This will have a local and remote cache hit.
# output = scheduler.schedule()
# _assert_right_scheduler_output(
# output,
# # 1 remote kv_cache hit!
# num_requests=1,
# # Only 1 block was preempted and there is a single
# # remote hit. So only single new token is scheduled.
# expected_num_scheduled_tokens=1,
# )
# assert len(scheduler.running) == 1
# assert len(scheduler.waiting) == 0
# _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
# assert len(scheduler.running) == 1
# assert len(scheduler.waiting) == 0
# # Only 1 can be scheduled - 4th (and last token).
# output = scheduler.schedule()
# _assert_right_scheduler_output(
# output,
# # no connector_metadata
# num_requests=0,
# expected_num_scheduled_tokens=1)
# assert len(scheduler.running) == 1
# _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
# assert len(scheduler.running) == 0
# # All memory should be freed since nothing is running.
# assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() \
# == NUM_BLOCKS - 1
def
make_output
(
scheduler
:
Scheduler
):
def
make_output
(
scheduler
:
Scheduler
):
...
...
tests/v1/e2e/test_spec_decode.py
View file @
a0d02d42
...
@@ -53,17 +53,17 @@ def sampling_config():
...
@@ -53,17 +53,17 @@ def sampling_config():
@
pytest
.
fixture
@
pytest
.
fixture
def
model_name
():
def
model_name
():
# return os.path.join(models_path_prefix, "meta-llama/Llama-3.1-8B-Instruct")
# return os.path.join(models_path_prefix, "meta-llama/Llama-3.1-8B-Instruct")
return
"meta-llama/Llama-3.1-8B-Instruct"
return
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.1-8B-Instruct"
)
def
eagle_model_name
():
def
eagle_model_name
():
# return os.path.join(models_path_prefix, "yuhuili/EAGLE-LLaMA3.1-Instruct-8B")
# return os.path.join(models_path_prefix, "yuhuili/EAGLE-LLaMA3.1-Instruct-8B")
return
"yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
return
os
.
path
.
join
(
models_path_prefix
,
"yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
)
def
eagle3_model_name
():
def
eagle3_model_name
():
# return os.path.join(models_path_prefix, "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B")
# return os.path.join(models_path_prefix, "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B")
return
"yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
return
os
.
path
.
join
(
models_path_prefix
,
"yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
)
def
test_ngram_correctness
(
def
test_ngram_correctness
(
...
...
tests/v1/engine/test_fast_incdec_prefix_err.py
View file @
a0d02d42
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
from
transformers
import
AutoTokenizer
from
transformers
import
AutoTokenizer
from
vllm.sampling_params
import
SamplingParams
from
vllm.sampling_params
import
SamplingParams
from
vllm.v1.engine
import
EngineCoreRequest
from
vllm.v1.engine
import
EngineCoreRequest
from
vllm.v1.engine.detokenizer
import
IncrementalDetokenizer
from
vllm.v1.engine.detokenizer
import
IncrementalDetokenizer
from
utils
import
models_path_prefix
# ruff: noqa: E501
# ruff: noqa: E501
...
@@ -20,7 +22,7 @@ def test_fast_inc_detok_invalid_utf8_err_case():
...
@@ -20,7 +22,7 @@ def test_fast_inc_detok_invalid_utf8_err_case():
Thanks to reproducer from @fpaupier:
Thanks to reproducer from @fpaupier:
https://gist.github.com/fpaupier/0ed1375bd7633c5be6c894b1c7ac1be3.
https://gist.github.com/fpaupier/0ed1375bd7633c5be6c894b1c7ac1be3.
"""
"""
tokenizer
=
AutoTokenizer
.
from_pretrained
(
"google/gemma-3-1b-it"
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
os
.
path
.
join
(
models_path_prefix
,
"google/gemma-3-1b-it"
)
)
# Create a test request
# Create a test request
prompt_token_ids
=
[
107
,
4606
,
236787
,
107
]
prompt_token_ids
=
[
107
,
4606
,
236787
,
107
]
...
...
tests/v1/kv_connector/unit/test_multi_connector.py
View file @
a0d02d42
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
filecmp
import
filecmp
import
shutil
import
shutil
import
tempfile
import
tempfile
...
@@ -13,8 +14,9 @@ from vllm.distributed.kv_transfer.kv_connector.factory import (
...
@@ -13,8 +14,9 @@ from vllm.distributed.kv_transfer.kv_connector.factory import (
from
vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector
import
(
# noqa
from
vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector
import
(
# noqa
SharedStorageConnector
)
SharedStorageConnector
)
from
vllm.v1.core.kv_cache_manager
import
KVCacheBlocks
from
vllm.v1.core.kv_cache_manager
import
KVCacheBlocks
from
utils
import
models_path_prefix
MODEL_NAME
=
"meta-llama/Llama-3.2-1B-Instruct"
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
)
PROMPT_CONTEXT
=
"Hi "
*
100
PROMPT_CONTEXT
=
"Hi "
*
100
PROMPTS
=
[
PROMPTS
=
[
...
@@ -97,149 +99,149 @@ def _compare_directories(dir1: Path, dir2: Path) -> bool:
...
@@ -97,149 +99,149 @@ def _compare_directories(dir1: Path, dir2: Path) -> bool:
return
True
return
True
def
test_multi_shared_storage_connector_consistency
():
#
def test_multi_shared_storage_connector_consistency():
"""
#
"""
Tests that MultiConnector with two SharedStorageConnectors saves
#
Tests that MultiConnector with two SharedStorageConnectors saves
identical KV cache data to separate storage locations.
#
identical KV cache data to separate storage locations.
"""
#
"""
storage_1_path
=
Path
(
"storage_1/"
)
#
storage_1_path = Path("storage_1/")
storage_2_path
=
Path
(
"storage_2/"
)
#
storage_2_path = Path("storage_2/")
shutil
.
rmtree
(
storage_1_path
,
ignore_errors
=
True
)
#
shutil.rmtree(storage_1_path, ignore_errors=True)
shutil
.
rmtree
(
storage_2_path
,
ignore_errors
=
True
)
#
shutil.rmtree(storage_2_path, ignore_errors=True)
storage_1_path
.
mkdir
()
#
storage_1_path.mkdir()
storage_2_path
.
mkdir
()
#
storage_2_path.mkdir()
# Configure MultiConnector with two SharedStorageConnectors
#
# Configure MultiConnector with two SharedStorageConnectors
kv_transfer_config
=
KVTransferConfig
(
#
kv_transfer_config = KVTransferConfig(
kv_connector
=
"MultiConnector"
,
#
kv_connector="MultiConnector",
kv_role
=
"kv_both"
,
#
kv_role="kv_both",
kv_connector_extra_config
=
{
#
kv_connector_extra_config={
"connectors"
:
[{
#
"connectors": [{
"kv_connector"
:
"TestSharedStorageConnector"
,
#
"kv_connector": "TestSharedStorageConnector",
"kv_role"
:
"kv_both"
,
#
"kv_role": "kv_both",
"kv_connector_extra_config"
:
{
#
"kv_connector_extra_config": {
"shared_storage_path"
:
str
(
storage_1_path
),
#
"shared_storage_path": str(storage_1_path),
"name"
:
"storage1"
,
#
"name": "storage1",
}
#
}
},
{
#
}, {
"kv_connector"
:
"TestSharedStorageConnector"
,
#
"kv_connector": "TestSharedStorageConnector",
"kv_role"
:
"kv_both"
,
#
"kv_role": "kv_both",
"kv_connector_extra_config"
:
{
#
"kv_connector_extra_config": {
"shared_storage_path"
:
str
(
storage_2_path
),
#
"shared_storage_path": str(storage_2_path),
"name"
:
"storage2"
,
#
"name": "storage2",
}
#
}
}]
#
}]
},
#
},
)
#
)
llm
=
LLM
(
#
llm = LLM(
model
=
MODEL_NAME
,
#
model=MODEL_NAME,
enforce_eager
=
True
,
#
enforce_eager=True,
gpu_memory_utilization
=
0.5
,
#
gpu_memory_utilization=0.5,
kv_transfer_config
=
kv_transfer_config
,
#
kv_transfer_config=kv_transfer_config,
)
#
)
# Run generation - this should trigger saving KV cache
#
# Run generation - this should trigger saving KV cache
_
=
llm
.
generate
(
PROMPTS
,
SAMPLING_PARAMS
)
#
_ = llm.generate(PROMPTS, SAMPLING_PARAMS)
# --- Verification ---
#
# --- Verification ---
# Check that both storage directories were populated
#
# Check that both storage directories were populated
local_subdirs
=
list
(
storage_1_path
.
iterdir
())
#
local_subdirs = list(storage_1_path.iterdir())
external_subdirs
=
list
(
storage_2_path
.
iterdir
())
#
external_subdirs = list(storage_2_path.iterdir())
assert
len
(
#
assert len(
local_subdirs
#
local_subdirs
)
>
0
,
f
"Local storage path
{
storage_1_path
}
is empty after generation."
#
) > 0, f"Local storage path {storage_1_path} is empty after generation."
assert
len
(
external_subdirs
)
>
0
,
(
#
assert len(external_subdirs) > 0, (
f
"External storage path
{
storage_2_path
}
is empty after generation."
)
#
f"External storage path {storage_2_path} is empty after generation.")
assert
len
(
local_subdirs
)
==
len
(
external_subdirs
),
(
#
assert len(local_subdirs) == len(external_subdirs), (
f
"Mismatch in number of cache entries: "
#
f"Mismatch in number of cache entries: "
f
"Local=
{
len
(
local_subdirs
)
}
, External=
{
len
(
external_subdirs
)
}
"
)
#
f"Local={len(local_subdirs)}, External={len(external_subdirs)}")
# The subdirectories should correspond to the prompt hashes
#
# The subdirectories should correspond to the prompt hashes
# Since prompts are the same, the hash directories should be the same name
#
# Since prompts are the same, the hash directories should be the same name
local_subdir_names
=
sorted
([
d
.
name
for
d
in
local_subdirs
])
#
local_subdir_names = sorted([d.name for d in local_subdirs])
external_subdir_names
=
sorted
([
d
.
name
for
d
in
external_subdirs
])
#
external_subdir_names = sorted([d.name for d in external_subdirs])
assert
local_subdir_names
==
external_subdir_names
,
(
#
assert local_subdir_names == external_subdir_names, (
"Cache directory names do not match between local and external storage"
#
"Cache directory names do not match between local and external storage"
)
#
)
# Compare the contents of each corresponding cache directory
#
# Compare the contents of each corresponding cache directory
for
subdir_name
in
local_subdir_names
:
#
for subdir_name in local_subdir_names:
print
(
f
"Comparing contents of cache directory:
{
subdir_name
}
"
)
#
print(f"Comparing contents of cache directory: {subdir_name}")
assert
_compare_directories
(
storage_1_path
/
subdir_name
,
#
assert _compare_directories(storage_1_path / subdir_name,
storage_2_path
/
subdir_name
),
\
#
storage_2_path / subdir_name), \
(
f
"Contents differ for cache directory '
{
subdir_name
}
' between "
#
(f"Contents differ for cache directory '{subdir_name}' between "
f
"
{
storage_1_path
}
and
{
storage_2_path
}
"
)
#
f"{storage_1_path} and {storage_2_path}")
events
=
get_connector_events
()
#
events = get_connector_events()
# get_num_new_matched_tokens and update_state_after_alloc will be called
#
# get_num_new_matched_tokens and update_state_after_alloc will be called
# on each connector in turn.
#
# on each connector in turn.
assert
events
[
"storage1-SCHEDULER"
][:
3
]
==
[
#
assert events["storage1-SCHEDULER"][:3] == [
'get_num_new_matched_tokens 0'
,
#
'get_num_new_matched_tokens 0',
'update_state_after_alloc num_blocks=[0] 0'
,
'build_connector_meta'
#
'update_state_after_alloc num_blocks=[0] 0', 'build_connector_meta'
]
#
]
assert
events
[
"storage1-WORKER"
][:
5
]
==
[
#
assert events["storage1-WORKER"][:5] == [
'register_kv_caches'
,
'bind_connector_metadata'
,
'start_load_kv'
,
#
'register_kv_caches', 'bind_connector_metadata', 'start_load_kv',
'wait_for_layer_load'
,
'save_kv_layer'
#
'wait_for_layer_load', 'save_kv_layer'
]
#
]
assert
events
[
"storage2-SCHEDULER"
][:
3
]
==
[
#
assert events["storage2-SCHEDULER"][:3] == [
'get_num_new_matched_tokens 0'
,
#
'get_num_new_matched_tokens 0',
'update_state_after_alloc num_blocks=[0] 0'
,
'build_connector_meta'
#
'update_state_after_alloc num_blocks=[0] 0', 'build_connector_meta'
]
#
]
assert
events
[
"storage2-WORKER"
][:
5
]
==
[
#
assert events["storage2-WORKER"][:5] == [
'register_kv_caches'
,
'bind_connector_metadata'
,
'start_load_kv'
,
#
'register_kv_caches', 'bind_connector_metadata', 'start_load_kv',
'wait_for_layer_load'
,
'save_kv_layer'
#
'wait_for_layer_load', 'save_kv_layer'
]
#
]
# Reset prefix cache or else we'll just get the tokens back from there.
#
# Reset prefix cache or else we'll just get the tokens back from there.
llm
.
reset_prefix_cache
()
#
llm.reset_prefix_cache()
# Run generation again - this should trigger loading from the first
#
# Run generation again - this should trigger loading from the first
# connector.
#
# connector.
_
=
llm
.
generate
(
PROMPTS
,
SAMPLING_PARAMS
)
#
_ = llm.generate(PROMPTS, SAMPLING_PARAMS)
events
=
get_connector_events
()
#
events = get_connector_events()
# get_num_new_matched_tokens will return new tokens from the first
#
# get_num_new_matched_tokens will return new tokens from the first
# connector so update_state_after_alloc will be with allocated blocks
#
# connector so update_state_after_alloc will be with allocated blocks
# on that one but with zero blocks for others (first nonzero match is
#
# on that one but with zero blocks for others (first nonzero match is
# chosen).
#
# chosen).
assert
events
[
"storage1-SCHEDULER"
][:
3
]
==
[
#
assert events["storage1-SCHEDULER"][:3] == [
'get_num_new_matched_tokens 0'
,
#
'get_num_new_matched_tokens 0',
'update_state_after_alloc num_blocks=[7] 96'
,
'build_connector_meta'
#
'update_state_after_alloc num_blocks=[7] 96', 'build_connector_meta'
]
#
]
assert
events
[
"storage2-SCHEDULER"
][:
3
]
==
[
#
assert events["storage2-SCHEDULER"][:3] == [
'get_num_new_matched_tokens 0'
,
#
'get_num_new_matched_tokens 0',
'update_state_after_alloc num_blocks=[0] 0'
,
'build_connector_meta'
#
'update_state_after_alloc num_blocks=[0] 0', 'build_connector_meta'
]
#
]
# Delete storage1 connector state
#
# Delete storage1 connector state
shutil
.
rmtree
(
storage_1_path
)
#
shutil.rmtree(storage_1_path)
# Reset prefix cache or else we'll just get the tokens back from there.
#
# Reset prefix cache or else we'll just get the tokens back from there.
llm
.
reset_prefix_cache
()
#
llm.reset_prefix_cache()
# Run generation again - this should trigger loading from the first
#
# Run generation again - this should trigger loading from the first
# connector.
#
# connector.
_
=
llm
.
generate
(
PROMPTS
,
SAMPLING_PARAMS
)
#
_ = llm.generate(PROMPTS, SAMPLING_PARAMS)
events
=
get_connector_events
()
#
events = get_connector_events()
# get_num_new_matched_tokens will be called for both connectors but will
#
# get_num_new_matched_tokens will be called for both connectors but will
# return 0 from the first connector, but the second connector should have
#
# return 0 from the first connector, but the second connector should have
# a hit, so update_state_after_alloc will only be called with allocated
#
# a hit, so update_state_after_alloc will only be called with allocated
# blocks for the second connector.
#
# blocks for the second connector.
assert
events
[
"storage1-SCHEDULER"
][:
3
]
==
[
#
assert events["storage1-SCHEDULER"][:3] == [
'get_num_new_matched_tokens 0'
,
#
'get_num_new_matched_tokens 0',
'update_state_after_alloc num_blocks=[0] 0'
,
'build_connector_meta'
#
'update_state_after_alloc num_blocks=[0] 0', 'build_connector_meta'
]
#
]
assert
events
[
"storage2-SCHEDULER"
][:
3
]
==
[
#
assert events["storage2-SCHEDULER"][:3] == [
'get_num_new_matched_tokens 0'
,
#
'get_num_new_matched_tokens 0',
'update_state_after_alloc num_blocks=[7] 96'
,
'build_connector_meta'
#
'update_state_after_alloc num_blocks=[7] 96', 'build_connector_meta'
]
#
]
# Clean up
#
# Clean up
shutil
.
rmtree
(
storage_1_path
)
#
shutil.rmtree(storage_1_path)
shutil
.
rmtree
(
storage_2_path
)
#
shutil.rmtree(storage_2_path)
def
get_connector_events
()
->
dict
[
str
,
list
[
str
]]:
def
get_connector_events
()
->
dict
[
str
,
list
[
str
]]:
...
...
tests/v1/metrics/__init__.py
0 → 100644
View file @
a0d02d42
tests/v1/metrics/test_ray_metrics.py
View file @
a0d02d42
...
@@ -2,10 +2,12 @@
...
@@ -2,10 +2,12 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
pytest
import
ray
import
ray
import
os
from
vllm.sampling_params
import
SamplingParams
from
vllm.sampling_params
import
SamplingParams
from
vllm.v1.engine.async_llm
import
AsyncEngineArgs
,
AsyncLLM
from
vllm.v1.engine.async_llm
import
AsyncEngineArgs
,
AsyncLLM
from
vllm.v1.metrics.ray_wrappers
import
RayPrometheusStatLogger
from
vllm.v1.metrics.ray_wrappers
import
RayPrometheusStatLogger
from
utils
import
models_path_prefix
@
pytest
.
fixture
(
scope
=
"function"
,
autouse
=
True
)
@
pytest
.
fixture
(
scope
=
"function"
,
autouse
=
True
)
...
@@ -17,7 +19,7 @@ def use_v1_only(monkeypatch):
...
@@ -17,7 +19,7 @@ def use_v1_only(monkeypatch):
MODELS
=
[
MODELS
=
[
"distilbert/distilgpt2"
,
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
)
,
]
]
...
...
tests/v1/spec_decode/test_eagle.py
View file @
a0d02d42
...
@@ -3,6 +3,7 @@
...
@@ -3,6 +3,7 @@
from
unittest
import
mock
from
unittest
import
mock
import
os
import
pytest
import
pytest
import
torch
import
torch
...
@@ -12,10 +13,11 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig,
...
@@ -12,10 +13,11 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig,
from
vllm.model_executor.models.llama
import
LlamaForCausalLM
from
vllm.model_executor.models.llama
import
LlamaForCausalLM
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.v1.spec_decode.eagle
import
EagleProposer
from
vllm.v1.spec_decode.eagle
import
EagleProposer
from
...utils
import
models_path_prefix
model_dir
=
"meta-llama/Llama-3.1-8B-Instruct"
model_dir
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.1-8B-Instruct"
)
eagle_dir
=
"yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
eagle_dir
=
os
.
path
.
join
(
models_path_prefix
,
"yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
)
eagle3_dir
=
"yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
eagle3_dir
=
os
.
path
.
join
(
models_path_prefix
,
"yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
)
def
_create_proposer
(
method
:
str
,
k
:
int
)
->
EagleProposer
:
def
_create_proposer
(
method
:
str
,
k
:
int
)
->
EagleProposer
:
...
...
tests/v1/test_oracle.py
View file @
a0d02d42
...
@@ -8,6 +8,7 @@ import vllm.envs as envs
...
@@ -8,6 +8,7 @@ import vllm.envs as envs
from
vllm
import
LLM
from
vllm
import
LLM
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.platforms
import
current_platform
from
..utils
import
models_path_prefix
from
..utils
import
models_path_prefix
UNSUPPORTED_MODELS_V1
=
[
UNSUPPORTED_MODELS_V1
=
[
...
@@ -121,6 +122,7 @@ def test_v1_llm_by_default(monkeypatch):
...
@@ -121,6 +122,7 @@ def test_v1_llm_by_default(monkeypatch):
def
test_v1_attn_backend
(
monkeypatch
):
def
test_v1_attn_backend
(
monkeypatch
):
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
if
not
current_platform
.
is_rocm
():
if
os
.
getenv
(
"VLLM_USE_V1"
,
None
):
if
os
.
getenv
(
"VLLM_USE_V1"
,
None
):
m
.
delenv
(
"VLLM_USE_V1"
)
m
.
delenv
(
"VLLM_USE_V1"
)
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"XFORMERS"
)
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"XFORMERS"
)
...
...
tests/v1/tpu/test_kv_cache_update_kernel.py
→
tests/v1/tpu/
un
test_kv_cache_update_kernel.py
View file @
a0d02d42
File moved
tests/v1/tpu/test_spmd_model_weight_loading.py
→
tests/v1/tpu/
un
test_spmd_model_weight_loading.py
View file @
a0d02d42
File moved
tests/v1/tpu/test_tpu_qkv_linear.py
→
tests/v1/tpu/
un
test_tpu_qkv_linear.py
View file @
a0d02d42
File moved
tests/worker/test_encoder_decoder_model_runner.py
View file @
a0d02d42
...
@@ -482,6 +482,8 @@ def test_prepare_decode(batch_size, multiple_seqs_per_seq_group):
...
@@ -482,6 +482,8 @@ def test_prepare_decode(batch_size, multiple_seqs_per_seq_group):
assert
torch
.
equal
(
actual
,
expected
)
assert
torch
.
equal
(
actual
,
expected
)
@
pytest
.
mark
.
skipif
(
current_platform
.
is_rocm
(),
reason
=
"ROCM is not supported."
)
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
list
(
range
(
1
,
257
)))
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
list
(
range
(
1
,
257
)))
@
pytest
.
mark
.
parametrize
(
"multiple_seqs_per_seq_group"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"multiple_seqs_per_seq_group"
,
[
True
,
False
])
def
test_prepare_decode_cuda_graph
(
batch_size
,
multiple_seqs_per_seq_group
):
def
test_prepare_decode_cuda_graph
(
batch_size
,
multiple_seqs_per_seq_group
):
...
...
tests/worker/test_model_runner.py
View file @
a0d02d42
...
@@ -32,7 +32,7 @@ def test_deepseek_mla_attn_backend_module():
...
@@ -32,7 +32,7 @@ def test_deepseek_mla_attn_backend_module():
trust_remote_code
=
True
,
trust_remote_code
=
True
,
enable_chunked_prefill
=
False
,
enable_chunked_prefill
=
False
,
)
)
assert
model_runner
.
attn_backend
.
__name__
==
"TritonMLABackend"
assert
model_runner
.
attn_backend
.
__name__
==
"FlashMLABackend"
#
"TritonMLABackend"
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
list
(
range
(
1
,
257
,
3
)))
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
list
(
range
(
1
,
257
,
3
)))
...
...
vllm/model_executor/layers/fused_moe/config.py
View file @
a0d02d42
...
@@ -50,6 +50,7 @@ def get_config_quant_dtype(
...
@@ -50,6 +50,7 @@ def get_config_quant_dtype(
use_int8_w8a8
:
bool
,
use_int8_w8a8
:
bool
,
use_int8_w8a16
:
bool
,
use_int8_w8a16
:
bool
,
use_int4_w4a16
:
bool
,
use_int4_w4a16
:
bool
,
use_int4_w4a8
:
bool
,
)
->
Optional
[
torch
.
dtype
]:
)
->
Optional
[
torch
.
dtype
]:
if
use_fp8_w8a8
:
if
use_fp8_w8a8
:
return
torch
.
float8_e4m3fn
return
torch
.
float8_e4m3fn
...
@@ -126,6 +127,7 @@ class FusedMoEQuantConfig:
...
@@ -126,6 +127,7 @@ class FusedMoEQuantConfig:
use_int8_w8a8
:
bool
=
False
,
use_int8_w8a8
:
bool
=
False
,
use_int8_w8a16
:
bool
=
False
,
use_int8_w8a16
:
bool
=
False
,
use_int4_w4a16
:
bool
=
False
,
use_int4_w4a16
:
bool
=
False
,
use_int4_w4a8
:
bool
=
False
,
per_act_token_quant
:
bool
=
False
,
per_act_token_quant
:
bool
=
False
,
per_out_ch_quant
:
bool
=
False
,
per_out_ch_quant
:
bool
=
False
,
block_shape
:
Optional
[
list
[
int
]]
=
None
,
block_shape
:
Optional
[
list
[
int
]]
=
None
,
...
@@ -136,6 +138,7 @@ class FusedMoEQuantConfig:
...
@@ -136,6 +138,7 @@ class FusedMoEQuantConfig:
use_int8_w8a8
,
use_int8_w8a8
,
use_int8_w8a16
,
use_int8_w8a16
,
use_int4_w4a16
,
use_int4_w4a16
,
use_int4_w4a8
,
]
]
])
<=
1
,
"Quantization flags are mutually exclusive."
])
<=
1
,
"Quantization flags are mutually exclusive."
...
@@ -144,6 +147,7 @@ class FusedMoEQuantConfig:
...
@@ -144,6 +147,7 @@ class FusedMoEQuantConfig:
use_int8_w8a8
=
use_int8_w8a8
,
use_int8_w8a8
=
use_int8_w8a8
,
use_int8_w8a16
=
use_int8_w8a16
,
use_int8_w8a16
=
use_int8_w8a16
,
use_int4_w4a16
=
use_int4_w4a16
,
use_int4_w4a16
=
use_int4_w4a16
,
use_int4_w4a8
=
use_int4_w4a8
,
)
)
return
FusedMoEQuantConfig
(
return
FusedMoEQuantConfig
(
quant_dtype
,
quant_dtype
,
...
...
vllm/model_executor/layers/fused_moe/configs/E=128,N=176,device_name=BW200_nn.json
0 → 100644
View file @
a0d02d42
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"2"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"4"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"8"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"16"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"24"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"32"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"48"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"64"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"96"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"128"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"256"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"512"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"1024"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"1536"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"2048"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"3072"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"4096"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
}
}
vllm/model_executor/layers/fused_moe/configs/E=128,N=176,device_name=K100_AI_nn.json
0 → 100644
View file @
a0d02d42
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"2"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"4"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"8"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"16"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"24"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"32"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"48"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"64"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"96"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"128"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"256"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"512"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
3
,
"num_ldmatrixes"
:
1
},
"1024"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
,
"num_ldmatrixes"
:
1
},
"1536"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
,
"num_ldmatrixes"
:
1
},
"2048"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"3072"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
,
"num_ldmatrixes"
:
1
},
"4096"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
}
}
vllm/model_executor/layers/fused_moe/fused_moe.py
View file @
a0d02d42
...
@@ -652,7 +652,6 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
...
@@ -652,7 +652,6 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
B_scale
:
Optional
[
torch
.
Tensor
],
B_scale
:
Optional
[
torch
.
Tensor
],
B_zp
:
Optional
[
torch
.
Tensor
],
B_zp
:
Optional
[
torch
.
Tensor
],
topk_weights
:
Optional
[
torch
.
Tensor
],
topk_weights
:
Optional
[
torch
.
Tensor
],
topk_ids
:
torch
.
Tensor
,
sorted_token_ids
:
torch
.
Tensor
,
sorted_token_ids
:
torch
.
Tensor
,
expert_ids
:
torch
.
Tensor
,
expert_ids
:
torch
.
Tensor
,
num_tokens_post_padded
:
torch
.
Tensor
,
num_tokens_post_padded
:
torch
.
Tensor
,
...
@@ -1603,7 +1602,8 @@ def fused_experts_impl(
...
@@ -1603,7 +1602,8 @@ def fused_experts_impl(
qtype
=
get_config_quant_dtype
(
use_fp8_w8a8
=
use_fp8_w8a8
,
qtype
=
get_config_quant_dtype
(
use_fp8_w8a8
=
use_fp8_w8a8
,
use_int8_w8a8
=
use_int8_w8a8
,
use_int8_w8a8
=
use_int8_w8a8
,
use_int8_w8a16
=
use_int8_w8a16
,
use_int8_w8a16
=
use_int8_w8a16
,
use_int4_w4a16
=
use_int4_w4a16
)
use_int4_w4a16
=
use_int4_w4a16
,
use_int4_w4a8
=
use_int4_w4a8
)
get_config_func
=
functools
.
partial
(
get_config_func
=
functools
.
partial
(
try_get_optimal_moe_config
,
try_get_optimal_moe_config
,
...
@@ -1689,7 +1689,6 @@ def fused_experts_impl(
...
@@ -1689,7 +1689,6 @@ def fused_experts_impl(
w1_scale
,
w1_scale
,
w1_zp
,
w1_zp
,
curr_topk_weights
,
curr_topk_weights
,
curr_topk_ids
,
sorted_token_ids
,
sorted_token_ids
,
expert_ids
,
expert_ids
,
num_tokens_post_padded
,
num_tokens_post_padded
,
...
@@ -1729,7 +1728,6 @@ def fused_experts_impl(
...
@@ -1729,7 +1728,6 @@ def fused_experts_impl(
w2_scale
,
w2_scale
,
w2_zp
,
w2_zp
,
curr_topk_weights
,
curr_topk_weights
,
curr_topk_ids
,
sorted_token_ids
,
sorted_token_ids
,
expert_ids
,
expert_ids
,
num_tokens_post_padded
,
num_tokens_post_padded
,
...
@@ -1877,7 +1875,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
...
@@ -1877,7 +1875,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
use_int8_w8a8
:
bool
=
False
,
use_int8_w8a8
:
bool
=
False
,
use_int8_w8a16
:
bool
=
False
,
use_int8_w8a16
:
bool
=
False
,
use_int4_w4a16
:
bool
=
False
,
use_int4_w4a16
:
bool
=
False
,
use_int4_w4a8
:
bool
=
False
,
use_int4_w4a8
:
bool
=
False
,
per_act_token_quant
:
bool
=
False
,
per_act_token_quant
:
bool
=
False
,
block_shape
:
Optional
[
List
[
int
]]
=
None
,
block_shape
:
Optional
[
List
[
int
]]
=
None
,
):
):
...
@@ -1896,7 +1894,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
...
@@ -1896,7 +1894,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
self
.
use_int4_w4a16
=
use_int4_w4a16
self
.
use_int4_w4a16
=
use_int4_w4a16
self
.
use_int8_w8a8
=
use_int8_w8a8
self
.
use_int8_w8a8
=
use_int8_w8a8
self
.
use_int8_w8a16
=
use_int8_w8a16
self
.
use_int8_w8a16
=
use_int8_w8a16
self
.
use_int4_w4a8
=
use_int4_w4a8
self
.
use_int4_w4a8
=
use_int4_w4a8
@
property
@
property
def
activation_formats
(
def
activation_formats
(
...
@@ -2027,7 +2025,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
...
@@ -2027,7 +2025,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
use_int8_w8a8
=
self
.
use_int8_w8a8
,
use_int8_w8a8
=
self
.
use_int8_w8a8
,
use_int8_w8a16
=
self
.
use_int8_w8a16
,
use_int8_w8a16
=
self
.
use_int8_w8a16
,
use_int4_w4a16
=
self
.
use_int4_w4a16
,
use_int4_w4a16
=
self
.
use_int4_w4a16
,
use_int4_w4a8
=
self
.
use_int4_w4a8
,
use_int4_w4a8
=
self
.
use_int4_w4a8
,
per_channel_quant
=
self
.
per_act_token_quant
,
per_channel_quant
=
self
.
per_act_token_quant
,
block_shape
=
self
.
block_shape
)
block_shape
=
self
.
block_shape
)
...
@@ -2068,7 +2066,7 @@ def modular_triton_fused_moe(
...
@@ -2068,7 +2066,7 @@ def modular_triton_fused_moe(
use_int8_w8a8
:
bool
,
use_int8_w8a8
:
bool
,
use_int8_w8a16
:
bool
,
use_int8_w8a16
:
bool
,
use_int4_w4a16
:
bool
,
use_int4_w4a16
:
bool
,
use_int4_w4a8
:
bool
,
use_int4_w4a8
:
bool
,
per_act_token_quant
:
bool
,
per_act_token_quant
:
bool
,
block_shape
:
Optional
[
List
[
int
]]
=
None
,
block_shape
:
Optional
[
List
[
int
]]
=
None
,
)
->
mk
.
FusedMoEModularKernel
:
)
->
mk
.
FusedMoEModularKernel
:
...
@@ -2079,7 +2077,7 @@ def modular_triton_fused_moe(
...
@@ -2079,7 +2077,7 @@ def modular_triton_fused_moe(
use_int8_w8a8
=
use_int8_w8a8
,
use_int8_w8a8
=
use_int8_w8a8
,
use_int8_w8a16
=
use_int8_w8a16
,
use_int8_w8a16
=
use_int8_w8a16
,
use_int4_w4a16
=
use_int4_w4a16
,
use_int4_w4a16
=
use_int4_w4a16
,
use_int4_w4a8
=
use_int4_w4a8
,
use_int4_w4a8
=
use_int4_w4a8
,
per_act_token_quant
=
per_act_token_quant
,
per_act_token_quant
=
per_act_token_quant
,
block_shape
=
block_shape
,
block_shape
=
block_shape
,
),
),
...
...
vllm/model_executor/layers/fused_moe/layer.py
View file @
a0d02d42
...
@@ -791,7 +791,6 @@ class FusedMoE(torch.nn.Module):
...
@@ -791,7 +791,6 @@ class FusedMoE(torch.nn.Module):
if
quant_config
is
None
:
if
quant_config
is
None
:
# Not considering quant for now, temporarily
# Not considering quant for now, temporarily
self
.
use_nn_moe
=
int
(
os
.
environ
.
get
(
'MOE_NN'
,
1
))
==
1
self
.
use_nn_moe
=
int
(
os
.
environ
.
get
(
'MOE_NN'
,
1
))
==
1
# self.use_nn_moe = os.environ.get('MOE_NN') == '1'
else
:
else
:
self
.
use_nn_moe
=
False
self
.
use_nn_moe
=
False
...
...
vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
View file @
a0d02d42
...
@@ -36,9 +36,10 @@ class ActivationMethod(IntEnum):
...
@@ -36,9 +36,10 @@ class ActivationMethod(IntEnum):
@
cache
@
cache
def
is_rocm_aiter_moe_enabled
()
->
bool
:
def
is_rocm_aiter_moe_enabled
()
->
bool
:
return
current_platform
.
is_rocm
()
\
return
False
and
envs
.
VLLM_ROCM_USE_AITER_MOE
\
# return current_platform.is_rocm() \
and
envs
.
VLLM_ROCM_USE_AITER
# and envs.VLLM_ROCM_USE_AITER_MOE \
# and envs.VLLM_ROCM_USE_AITER
def
rocm_aiter_asm_moe_tkw1_impl
(
def
rocm_aiter_asm_moe_tkw1_impl
(
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment