Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ec68d53b
Unverified
Commit
ec68d53b
authored
Apr 10, 2026
by
Yan Ma
Committed by
GitHub
Apr 10, 2026
Browse files
Add platform manual_seed_all API (#38468)
Signed-off-by:
Yan Ma
<
yan.ma@intel.com
>
parent
13e6b1b9
Changes
18
Show whitespace changes
Inline
Side-by-side
Showing
18 changed files
with
57 additions
and
44 deletions
+57
-44
benchmarks/kernels/benchmark_moe_align_block_size.py
benchmarks/kernels/benchmark_moe_align_block_size.py
+2
-1
benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
...hmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
+2
-3
tests/kernels/attention/test_lightning_attn.py
tests/kernels/attention/test_lightning_attn.py
+0
-6
tests/kernels/core/test_fused_quant_layernorm.py
tests/kernels/core/test_fused_quant_layernorm.py
+2
-3
tests/kernels/moe/test_marlin_vs_trtllm_mxint4.py
tests/kernels/moe/test_marlin_vs_trtllm_mxint4.py
+3
-2
tests/kernels/moe/test_moe.py
tests/kernels/moe/test_moe.py
+5
-5
tests/kernels/moe/test_shared_fused_moe_routed_transform.py
tests/kernels/moe/test_shared_fused_moe_routed_transform.py
+2
-3
tests/kernels/test_fused_quant_activation.py
tests/kernels/test_fused_quant_activation.py
+2
-3
tests/models/language/pooling/test_token_classification.py
tests/models/language/pooling/test_token_classification.py
+2
-7
tests/v1/e2e/spec_decode/test_lora_with_spec_decode.py
tests/v1/e2e/spec_decode/test_lora_with_spec_decode.py
+2
-7
tests/v1/spec_decode/test_tree_attention.py
tests/v1/spec_decode/test_tree_attention.py
+2
-2
tools/pre_commit/check_torch_cuda.py
tools/pre_commit/check_torch_cuda.py
+9
-0
vllm/platforms/cpu.py
vllm/platforms/cpu.py
+4
-0
vllm/platforms/cuda.py
vllm/platforms/cuda.py
+4
-0
vllm/platforms/interface.py
vllm/platforms/interface.py
+5
-0
vllm/platforms/rocm.py
vllm/platforms/rocm.py
+4
-0
vllm/platforms/xpu.py
vllm/platforms/xpu.py
+4
-0
vllm/utils/torch_utils.py
vllm/utils/torch_utils.py
+3
-2
No files found.
benchmarks/kernels/benchmark_moe_align_block_size.py
View file @
ec68d53b
...
...
@@ -9,6 +9,7 @@ from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
moe_align_block_size
,
)
from
vllm.triton_utils
import
triton
from
vllm.utils.torch_utils
import
set_random_seed
def
get_topk_ids
(
num_tokens
:
int
,
num_experts
:
int
,
topk
:
int
)
->
torch
.
Tensor
:
...
...
@@ -44,7 +45,7 @@ configs = list(
def
benchmark
(
num_tokens
,
num_experts
,
topk
,
ep_size
,
provider
):
"""Benchmark function for Triton."""
block_size
=
256
torch
.
cuda
.
manual
_seed
_all
(
0
)
set_random
_seed
(
0
)
topk_ids
=
get_topk_ids
(
num_tokens
,
num_experts
,
topk
)
e_map
=
None
...
...
benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
View file @
ec68d53b
...
...
@@ -16,6 +16,7 @@ from vllm.utils.deep_gemm import (
fp8_gemm_nt
,
per_block_cast_to_fp8
,
)
from
vllm.utils.torch_utils
import
set_random_seed
def
benchmark_shape
(
...
...
@@ -235,9 +236,7 @@ def run_benchmarks(verbose: bool = False):
torch
.
backends
.
cudnn
.
allow_tf32
=
True
# Set seeds for reproducibility
torch
.
manual_seed
(
42
)
torch
.
cuda
.
manual_seed
(
42
)
set_random_seed
(
42
)
# Define benchmark shapes (m, n, k)
shapes
=
[
(
8
,
4096
,
7168
),
...
...
tests/kernels/attention/test_lightning_attn.py
View file @
ec68d53b
...
...
@@ -122,8 +122,6 @@ def test_linear_decode_forward_triton(
dtype
:
torch
.
dtype
,
):
torch
.
set_default_device
(
"cuda"
)
torch
.
manual_seed
(
42
)
torch
.
cuda
.
manual_seed_all
(
42
)
set_random_seed
(
42
)
base
=
0.01
q
=
base
*
torch
.
randn
(
batch_size
,
num_heads
,
1
,
head_size
,
dtype
=
dtype
)
...
...
@@ -165,8 +163,6 @@ def test_linear_decode_forward_triton_with_padding(
dtype
:
torch
.
dtype
,
):
torch
.
set_default_device
(
"cuda"
)
torch
.
manual_seed
(
42
)
torch
.
cuda
.
manual_seed_all
(
42
)
set_random_seed
(
42
)
batch_size
=
4
...
...
@@ -229,8 +225,6 @@ def test_lightning_attention_reference(
dtype
:
torch
.
dtype
,
):
torch
.
set_default_device
(
"cuda"
)
torch
.
manual_seed
(
42
)
torch
.
cuda
.
manual_seed_all
(
42
)
set_random_seed
(
42
)
base
=
0.01
...
...
tests/kernels/core/test_fused_quant_layernorm.py
View file @
ec68d53b
...
...
@@ -17,6 +17,7 @@ from vllm.model_executor.layers.quantization.utils.int8_utils import (
per_token_group_quant_int8
,
)
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
set_random_seed
DTYPES
=
[
torch
.
bfloat16
,
torch
.
float
]
QUANT_DTYPES
=
[
torch
.
int8
,
current_platform
.
fp8_dtype
()]
...
...
@@ -180,9 +181,7 @@ def test_rms_norm(
device
:
str
,
strided_input
:
bool
,
)
->
None
:
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
set_random_seed
(
seed
)
torch
.
set_default_device
(
device
)
torch
.
accelerator
.
set_device_index
(
device
)
...
...
tests/kernels/moe/test_marlin_vs_trtllm_mxint4.py
View file @
ec68d53b
...
...
@@ -16,6 +16,7 @@ from vllm.model_executor.layers.quantization.utils.flashinfer_mxint4_moe import
)
from
vllm.platforms
import
current_platform
from
vllm.scalar_type
import
scalar_types
from
vllm.utils.torch_utils
import
set_random_seed
def
mxint4_quantize
(
...
...
@@ -134,7 +135,7 @@ def test_marlin_vs_trtllm_mxint4_moe_kimik2(monkeypatch, m, n, k, e, topk, group
pytest
.
importorskip
(
"flashinfer"
)
monkeypatch
.
setenv
(
"VLLM_USE_FLASHINFER_MOE_INT4"
,
"1"
)
torch
.
cuda
.
manual
_seed
(
0
)
set_random
_seed
(
0
)
dtype
=
torch
.
bfloat16
...
...
@@ -289,7 +290,7 @@ def test_flashinfer_trtllm_mxint4_moe_wrapper(m, n, k, e, topk):
flashinfer_trtllm_mxint4_moe
,
)
torch
.
cuda
.
manual
_seed
(
0
)
set_random
_seed
(
0
)
dtype
=
torch
.
bfloat16
a
=
torch
.
randn
((
m
,
k
),
device
=
"cuda"
,
dtype
=
dtype
)
*
0.5
...
...
tests/kernels/moe/test_moe.py
View file @
ec68d53b
...
...
@@ -1031,7 +1031,7 @@ def test_fused_marlin_moe(
act_order
:
bool
,
is_k_full
:
bool
,
):
torch
.
cuda
.
manual
_seed
(
1
)
set_random
_seed
(
1
)
group_size
=
group_blocks
if
group_blocks
<=
0
else
group_blocks
*
16
if
c_type
==
scalar_types
.
float16
:
...
...
@@ -1131,7 +1131,7 @@ def test_fused_marlin_moe(
@
pytest
.
mark
.
skipif
(
current_platform
.
is_rocm
(),
reason
=
"Skip for rocm"
)
@
pytest
.
mark
.
parametrize
(
"m"
,
[
1
,
256
])
def
test_fused_marlin_moe_with_bias
(
m
):
torch
.
cuda
.
manual
_seed
(
0
)
set_random
_seed
(
0
)
e
,
topk
=
32
,
4
n
,
k
=
2048
,
2048
...
...
@@ -1213,7 +1213,7 @@ def test_fused_marlin_moe_non_gated(
Non-gated activations like relu2 don't have the gate-up projection pattern,
so w1 has shape (e, n, k) instead of (e, 2*n, k).
"""
torch
.
cuda
.
manual
_seed
(
42
)
set_random
_seed
(
42
)
group_size
=
16
# NVFP4 group size
is_k_full
=
True
...
...
@@ -1397,7 +1397,7 @@ def test_cpu_fused_moe_basic(
from
vllm.model_executor.layers.fused_moe.cpu_fused_moe
import
CPUFusedMOE
device
=
"cpu"
torch
.
manual
_seed
(
7
)
set_random
_seed
(
7
)
a
=
torch
.
randn
((
m
,
k
),
device
=
device
,
dtype
=
dtype
)
/
10
w13
=
torch
.
randn
((
e
,
2
*
n
,
k
),
device
=
device
,
dtype
=
dtype
)
/
10
...
...
@@ -1469,7 +1469,7 @@ def test_batched_fused_marlin_moe(
f
"topk=
{
topk
}
, "
f
"max_tokens_per_batch=
{
max_tokens_per_batch
}
"
)
torch
.
cuda
.
manual
_seed
(
0
)
set_random
_seed
(
0
)
dtype
=
torch
.
bfloat16
quant_dtype
=
scalar_types
.
float4_e2m1f
...
...
tests/kernels/moe/test_shared_fused_moe_routed_transform.py
View file @
ec68d53b
...
...
@@ -15,7 +15,7 @@ from vllm.config import VllmConfig, set_current_vllm_config
from
vllm.forward_context
import
set_forward_context
from
vllm.model_executor.layers.fused_moe.shared_fused_moe
import
SharedFusedMoE
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
is_torch_equal_or_newer
from
vllm.utils.torch_utils
import
is_torch_equal_or_newer
,
set_random_seed
class
SimpleLinear
(
nn
.
Module
):
...
...
@@ -144,8 +144,7 @@ def test_routed_input_transform_inside_vs_outside(
rocm_aiter_ops
.
refresh_env_variables
()
torch
.
manual_seed
(
42
)
torch
.
cuda
.
manual_seed
(
42
)
set_random_seed
(
42
)
num_experts
=
8
top_k
=
2
...
...
tests/kernels/test_fused_quant_activation.py
View file @
ec68d53b
...
...
@@ -7,6 +7,7 @@ import vllm._custom_ops as ops
from
tests.kernels.utils
import
opcheck
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
set_random_seed
DTYPES
=
[
torch
.
bfloat16
,
torch
.
float16
]
QUANT_DTYPES
=
[
current_platform
.
fp8_dtype
()]
...
...
@@ -49,9 +50,7 @@ def test_silu_and_mul(
seed
:
int
,
device
:
str
,
)
->
None
:
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
set_random_seed
(
seed
)
torch
.
set_default_device
(
device
)
layer
=
SiluAndMul
()
...
...
tests/models/language/pooling/test_token_classification.py
View file @
ec68d53b
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
random
import
numpy
as
np
import
pytest
import
torch
from
transformers
import
AutoModelForTokenClassification
from
tests.models.utils
import
softmax
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
set_random_seed
@
pytest
.
fixture
(
autouse
=
True
)
def
seed_everything
():
"""Seed all random number generators for reproducibility."""
seed
=
0
random
.
seed
(
seed
)
np
.
random
.
seed
(
seed
)
torch
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed_all
(
seed
)
set_random_seed
(
seed
)
torch
.
backends
.
cudnn
.
deterministic
=
True
torch
.
backends
.
cudnn
.
benchmark
=
False
yield
...
...
tests/v1/e2e/spec_decode/test_lora_with_spec_decode.py
View file @
ec68d53b
...
...
@@ -5,9 +5,6 @@ This script contains:
1. test lora with speculative decoding for batch inference
"""
import
random
import
numpy
as
np
import
pytest
import
torch
...
...
@@ -15,6 +12,7 @@ from vllm import LLM, SamplingParams
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.lora.request
import
LoRARequest
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
set_random_seed
LORA_TEST_PROMPT_MAP
:
dict
[
str
,
str
]
=
{}
...
...
@@ -63,10 +61,7 @@ def test_batch_inference_correctness(
with
monkeypatch
.
context
()
as
m
:
# Disable randomness
m
.
setenv
(
"CUBLAS_WORKSPACE_CONFIG"
,
":4096:8"
)
torch
.
manual_seed
(
SEED
)
np
.
random
.
seed
(
SEED
)
random
.
seed
(
SEED
)
torch
.
cuda
.
manual_seed_all
(
SEED
)
set_random_seed
(
SEED
)
torch
.
backends
.
cudnn
.
benchmark
=
False
torch
.
backends
.
cudnn
.
deterministic
=
True
...
...
tests/v1/spec_decode/test_tree_attention.py
View file @
ec68d53b
...
...
@@ -14,6 +14,7 @@ from tests.v1.attention.utils import (
)
from
vllm.config
import
ParallelConfig
,
SpeculativeConfig
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
set_random_seed
from
vllm.v1.attention.backend
import
CommonAttentionMetadata
from
vllm.v1.attention.backends.fa_utils
import
is_flash_attn_varlen_func_available
from
vllm.v1.attention.backends.registry
import
AttentionBackendEnum
...
...
@@ -323,8 +324,7 @@ def forward_attention(
def
test_tree_attn_correctness
(
reference_backend
:
AttentionBackendEnum
,
)
->
None
:
torch
.
manual_seed
(
42
)
torch
.
cuda
.
manual_seed_all
(
42
)
set_random_seed
(
42
)
device
=
"cuda"
tree_attn_masks
=
{
...
...
tools/pre_commit/check_torch_cuda.py
View file @
ec68d53b
...
...
@@ -9,6 +9,7 @@ import regex as re
# --------------------------------------------------------------------------- #
_TORCH_CUDA_PATTERNS
=
[
r
"\btorch\.cuda\.(empty_cache|synchronize|device_count|current_device|memory_reserved|memory_allocated|max_memory_allocated|max_memory_reserved|reset_peak_memory_stats|memory_stats|set_device|device\()\b"
,
r
"\btorch\.cuda\.(manual_seed|manual_seed_all)\b"
,
r
"\bwith\storch\.cuda\.device\b"
,
# Calls torch.cuda.{_is_compiled/_device_count_amdsmi/_device_count_nvml} internally
r
"\bcuda_device_count_stateless\(\)\b"
,
...
...
@@ -24,6 +25,14 @@ def scan_file(path: str) -> int:
for
match
in
re
.
finditer
(
pattern
,
content
,
re
.
MULTILINE
):
# Calculate line number from match position
line_num
=
content
[:
match
.
start
()
+
1
].
count
(
"
\n
"
)
+
1
matched_text
=
match
.
group
(
0
)
if
"manual_seed"
in
matched_text
:
print
(
f
"
{
path
}
:
{
line_num
}
: "
"
\033
[91merror:
\033
[0m "
f
"Found
{
matched_text
}
API call. Use set_random_seed instead."
)
return
1
print
(
f
"
{
path
}
:
{
line_num
}
: "
"
\033
[91merror:
\033
[0m "
# red color
...
...
vllm/platforms/cpu.py
View file @
ec68d53b
...
...
@@ -154,6 +154,10 @@ class CpuPlatform(Platform):
"""
torch
.
cpu
.
set_device
(
device
)
@
classmethod
def
manual_seed_all
(
cls
,
seed
:
int
)
->
None
:
pass
@
classmethod
def
inference_mode
(
cls
):
return
torch
.
no_grad
()
...
...
vllm/platforms/cuda.py
View file @
ec68d53b
...
...
@@ -188,6 +188,10 @@ class CudaPlatformBase(Platform):
# for why and when it is needed
_
=
torch
.
zeros
(
1
,
device
=
device
)
@
classmethod
def
manual_seed_all
(
cls
,
seed
:
int
)
->
None
:
torch
.
cuda
.
manual_seed_all
(
seed
)
@
classmethod
def
get_device_capability
(
cls
,
device_id
:
int
=
0
)
->
DeviceCapability
|
None
:
raise
NotImplementedError
...
...
vllm/platforms/interface.py
View file @
ec68d53b
...
...
@@ -391,6 +391,11 @@ class Platform:
"""
raise
NotImplementedError
@
classmethod
def
manual_seed_all
(
cls
,
seed
:
int
)
->
None
:
"""Set RNG seed across all devices for the current platform."""
raise
NotImplementedError
@
classmethod
def
pre_register_and_update
(
cls
,
parser
:
FlexibleArgumentParser
|
None
=
None
...
...
vllm/platforms/rocm.py
View file @
ec68d53b
...
...
@@ -605,6 +605,10 @@ class RocmPlatform(Platform):
"""
torch
.
cuda
.
set_device
(
device
)
@
classmethod
def
manual_seed_all
(
cls
,
seed
:
int
)
->
None
:
torch
.
cuda
.
manual_seed_all
(
seed
)
@
classmethod
@
lru_cache
(
maxsize
=
8
)
def
get_device_capability
(
cls
,
device_id
:
int
=
0
)
->
DeviceCapability
|
None
:
...
...
vllm/platforms/xpu.py
View file @
ec68d53b
...
...
@@ -125,6 +125,10 @@ class XPUPlatform(Platform):
"""
torch
.
xpu
.
set_device
(
device
)
@
classmethod
def
manual_seed_all
(
cls
,
seed
:
int
)
->
None
:
torch
.
xpu
.
manual_seed_all
(
seed
)
@
classmethod
def
get_device_capability
(
cls
,
...
...
vllm/utils/torch_utils.py
View file @
ec68d53b
...
...
@@ -365,8 +365,9 @@ def set_random_seed(seed: int | None) -> None:
random
.
seed
(
seed
)
np
.
random
.
seed
(
seed
)
torch
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed_all
(
seed
)
from
vllm.platforms
import
current_platform
current_platform
.
manual_seed_all
(
seed
)
def
create_kv_caches_with_random_flash
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment