"...git@developer.sourcefind.cn:2222/OpenDAS/vllm_cscc.git" did not exist on "bc4eb65b5492b4f84a1b714bfc14bcff73d401f1"
Unverified commit 8213e8f8, authored by Monishver and committed by GitHub
Browse files
parent 3693f922
......@@ -196,6 +196,7 @@ steps:
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_async_new_apis.py
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s tests/v1/distributed/test_eagle_dp.py
- label: Distributed Tests (2 GPUs)(B200)
device: b200
......
......@@ -20,16 +20,14 @@ if current_platform.is_rocm():
else:
ATTN_BACKENDS = ["FLASH_ATTN"]
# On SM<90 (e.g., L4), batch invariance does not support CUDA graphs.
# See https://github.com/vllm-project/vllm/pull/30018 and
# tests/v1/determinism/utils.py for the documented limitation.
IS_DEVICE_CAPABILITY_BELOW_90 = not current_platform.has_device_capability(90)
@pytest.mark.asyncio
@pytest.mark.parametrize("attn_backend", ATTN_BACKENDS)
@pytest.mark.xfail(
not current_platform.is_rocm(),
reason="EAGLE + DP > 1 produces wrong outputs when async spec decode "
"correction is active. Root cause under investigation. "
"See: https://github.com/vllm-project/vllm/issues/31913",
strict=False,
)
@pytest.mark.xfail(
current_platform.is_rocm(),
reason="Test may fail on ROCm until batch invariance is enabled. "
......@@ -57,7 +55,7 @@ async def test_run_eagle_dp(monkeypatch: pytest.MonkeyPatch, attn_backend: str):
engine_args = AsyncEngineArgs(
model=target_model,
tokenizer_mode="auto",
enforce_eager=False,
enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
tensor_parallel_size=int(os.getenv("TP_SIZE", 1)),
data_parallel_size=DP_SIZE,
data_parallel_backend="mp", # ray takes more time
......
......@@ -8,6 +8,7 @@ import torch
import torch.nn.functional as F
from torch.nn.parameter import Parameter, UninitializedParameter
import vllm.envs as envs
from vllm.distributed import (
divide,
get_tensor_model_parallel_rank,
......@@ -15,6 +16,9 @@ from vllm.distributed import (
tensor_model_parallel_all_reduce,
)
from vllm.model_executor.custom_op import PluggableLayer
from vllm.model_executor.layers.batch_invariant import (
linear_batch_invariant,
)
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig,
QuantizeMethodBase,
......@@ -66,6 +70,8 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase):
x: torch.Tensor,
bias: torch.Tensor | None = None,
) -> torch.Tensor:
if envs.VLLM_BATCH_INVARIANT and current_platform.is_cuda_alike():
return linear_batch_invariant(x, layer.weight, bias)
return dispatch_unquantized_gemm()(layer, x, layer.weight, bias)
def embedding(self, layer: torch.nn.Module, input_: torch.Tensor) -> torch.Tensor:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment