Unverified Commit 8213e8f8 authored by Monishver's avatar Monishver Committed by GitHub
Browse files
parent 3693f922
...@@ -196,6 +196,7 @@ steps: ...@@ -196,6 +196,7 @@ steps:
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_async_new_apis.py - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_async_new_apis.py
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py - pytest -v -s tests/v1/distributed/test_dbo.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s tests/v1/distributed/test_eagle_dp.py
- label: Distributed Tests (2 GPUs)(B200) - label: Distributed Tests (2 GPUs)(B200)
device: b200 device: b200
......
...@@ -20,16 +20,14 @@ if current_platform.is_rocm(): ...@@ -20,16 +20,14 @@ if current_platform.is_rocm():
else: else:
ATTN_BACKENDS = ["FLASH_ATTN"] ATTN_BACKENDS = ["FLASH_ATTN"]
# On SM<90 (e.g., L4), batch invariance does not support CUDA graphs.
# See https://github.com/vllm-project/vllm/pull/30018 and
# tests/v1/determinism/utils.py for the documented limitation.
# This flag is True when the current platform does not report device
# capability 90; the test passes it as `enforce_eager` in the engine args.
IS_DEVICE_CAPABILITY_BELOW_90 = not current_platform.has_device_capability(90)
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("attn_backend", ATTN_BACKENDS) @pytest.mark.parametrize("attn_backend", ATTN_BACKENDS)
@pytest.mark.xfail(
not current_platform.is_rocm(),
reason="EAGLE + DP > 1 produces wrong outputs when async spec decode "
"correction is active. Root cause under investigation. "
"See: https://github.com/vllm-project/vllm/issues/31913",
strict=False,
)
@pytest.mark.xfail( @pytest.mark.xfail(
current_platform.is_rocm(), current_platform.is_rocm(),
reason="Test may fail on ROCm until batch invariance is enabled. " reason="Test may fail on ROCm until batch invariance is enabled. "
...@@ -57,7 +55,7 @@ async def test_run_eagle_dp(monkeypatch: pytest.MonkeyPatch, attn_backend: str): ...@@ -57,7 +55,7 @@ async def test_run_eagle_dp(monkeypatch: pytest.MonkeyPatch, attn_backend: str):
engine_args = AsyncEngineArgs( engine_args = AsyncEngineArgs(
model=target_model, model=target_model,
tokenizer_mode="auto", tokenizer_mode="auto",
enforce_eager=False, enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
tensor_parallel_size=int(os.getenv("TP_SIZE", 1)), tensor_parallel_size=int(os.getenv("TP_SIZE", 1)),
data_parallel_size=DP_SIZE, data_parallel_size=DP_SIZE,
data_parallel_backend="mp", # ray takes more time data_parallel_backend="mp", # ray takes more time
......
...@@ -8,6 +8,7 @@ import torch ...@@ -8,6 +8,7 @@ import torch
import torch.nn.functional as F import torch.nn.functional as F
from torch.nn.parameter import Parameter, UninitializedParameter from torch.nn.parameter import Parameter, UninitializedParameter
import vllm.envs as envs
from vllm.distributed import ( from vllm.distributed import (
divide, divide,
get_tensor_model_parallel_rank, get_tensor_model_parallel_rank,
...@@ -15,6 +16,9 @@ from vllm.distributed import ( ...@@ -15,6 +16,9 @@ from vllm.distributed import (
tensor_model_parallel_all_reduce, tensor_model_parallel_all_reduce,
) )
from vllm.model_executor.custom_op import PluggableLayer from vllm.model_executor.custom_op import PluggableLayer
from vllm.model_executor.layers.batch_invariant import (
linear_batch_invariant,
)
from vllm.model_executor.layers.quantization.base_config import ( from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig, QuantizationConfig,
QuantizeMethodBase, QuantizeMethodBase,
...@@ -66,6 +70,8 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase): ...@@ -66,6 +70,8 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase):
x: torch.Tensor, x: torch.Tensor,
bias: torch.Tensor | None = None, bias: torch.Tensor | None = None,
) -> torch.Tensor: ) -> torch.Tensor:
if envs.VLLM_BATCH_INVARIANT and current_platform.is_cuda_alike():
return linear_batch_invariant(x, layer.weight, bias)
return dispatch_unquantized_gemm()(layer, x, layer.weight, bias) return dispatch_unquantized_gemm()(layer, x, layer.weight, bias)
def embedding(self, layer: torch.nn.Module, input_: torch.Tensor) -> torch.Tensor: def embedding(self, layer: torch.nn.Module, input_: torch.Tensor) -> torch.Tensor:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment