Unverified commit 9570654c, authored by Micah Williamson, committed by GitHub
Browse files

[ROCm][CI] Run Kernels Core Operation Test On MI325 and mitigate flakiness (#38184)


Signed-off-by: Micah Williamson <micah.williamson@amd.com>
parent d56e9522
...@@ -751,6 +751,7 @@ steps: ...@@ -751,6 +751,7 @@ steps:
timeout_in_minutes: 180 timeout_in_minutes: 180
mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
agent_pool: mi250_1 agent_pool: mi250_1
optional: true
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
...@@ -2035,7 +2036,6 @@ steps: ...@@ -2035,7 +2036,6 @@ steps:
timeout_in_minutes: 38 timeout_in_minutes: 38
mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
agent_pool: mi325_1 agent_pool: mi325_1
optional: true
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
......
...@@ -7,12 +7,20 @@ import torch ...@@ -7,12 +7,20 @@ import torch
from tests.kernels.quant_utils import FP8_DTYPE from tests.kernels.quant_utils import FP8_DTYPE
from tests.kernels.utils import opcheck from tests.kernels.utils import opcheck
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed from vllm.utils.torch_utils import set_random_seed
# `on_mi250` is False everywhere except on ROCm, where it takes the value of
# on_gfx90a(). The ROCm-specific helper is imported only inside the ROCm
# branch, so non-ROCm platforms never import vllm.platforms.rocm here.
# NOTE(review): presumably gfx90a is the MI250 architecture and this flag
# exists to narrow the test matrix on that hardware -- confirm.
on_mi250 = False
if current_platform.is_rocm():
    from vllm.platforms.rocm import on_gfx90a

    on_mi250 = on_gfx90a()
DTYPES = [torch.half, torch.bfloat16, torch.float] DTYPES = [torch.half, torch.bfloat16, torch.float]
NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing
HIDDEN_SIZES = [8, 768, 769, 5120, 5125, 8192] # Arbitrary values for testing HIDDEN_SIZES = [8, 768, 769, 5120, 5125, 8192] # Arbitrary values for testing
ADD_RESIDUAL = [False, True] ADD_RESIDUAL = [False, True] if not on_mi250 else [True]
SEEDS = [0] SEEDS = [0]
CUDA_DEVICES = [ CUDA_DEVICES = [
f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2) f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2)
......
...@@ -182,6 +182,7 @@ _ON_GFX1X = any(arch in _GCN_ARCH for arch in ["gfx11", "gfx12"]) ...@@ -182,6 +182,7 @@ _ON_GFX1X = any(arch in _GCN_ARCH for arch in ["gfx11", "gfx12"])
_ON_GFX12X = any(arch in _GCN_ARCH for arch in ["gfx12"]) _ON_GFX12X = any(arch in _GCN_ARCH for arch in ["gfx12"])
_ON_MI3XX = any(arch in _GCN_ARCH for arch in ["gfx942", "gfx950"]) _ON_MI3XX = any(arch in _GCN_ARCH for arch in ["gfx942", "gfx950"])
_ON_GFX9 = any(arch in _GCN_ARCH for arch in ["gfx90a", "gfx942", "gfx950"]) _ON_GFX9 = any(arch in _GCN_ARCH for arch in ["gfx90a", "gfx942", "gfx950"])
_ON_GFX90A = "gfx90a" in _GCN_ARCH
_ON_GFX942 = "gfx942" in _GCN_ARCH _ON_GFX942 = "gfx942" in _GCN_ARCH
_ON_GFX950 = "gfx950" in _GCN_ARCH _ON_GFX950 = "gfx950" in _GCN_ARCH
...@@ -273,6 +274,10 @@ def on_gfx9() -> bool: ...@@ -273,6 +274,10 @@ def on_gfx9() -> bool:
return _ON_GFX9 return _ON_GFX9
def on_gfx90a() -> bool:
    """Return the module-level ``_ON_GFX90A`` flag.

    The flag is computed once at import time as ``"gfx90a" in _GCN_ARCH``.
    """
    return _ON_GFX90A
def on_gfx942() -> bool: def on_gfx942() -> bool:
return _ON_GFX942 return _ON_GFX942
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment