Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
cfabf125
Commit
cfabf125
authored
Aug 27, 2025
by
王敏
Browse files
Merge remote-tracking branch 'origin/v0.9.2-dev' into v0.9.2-dev
parents
dbd0bda6
645fcfd9
Changes
28
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
42 additions
and
39 deletions
+42
-39
tests/kernels/attention/test_attention.py
tests/kernels/attention/test_attention.py
+1
-1
tests/kernels/attention/test_mha_attn.py
tests/kernels/attention/test_mha_attn.py
+1
-1
tests/kernels/attention/test_triton_unified_attention.py
tests/kernels/attention/test_triton_unified_attention.py
+1
-1
tests/kernels/moe/test_batched_moe.py
tests/kernels/moe/test_batched_moe.py
+2
-2
tests/kernels/moe/test_moe.py
tests/kernels/moe/test_moe.py
+1
-1
tests/kernels/moe/untest_block_fp8.py
tests/kernels/moe/untest_block_fp8.py
+0
-0
tests/kernels/moe/untest_moe_permute_unpermute.py
tests/kernels/moe/untest_moe_permute_unpermute.py
+0
-0
tests/kernels/moe/untest_nvfp4_moe.py
tests/kernels/moe/untest_nvfp4_moe.py
+0
-0
tests/kernels/moe/untest_pplx_cutlass_moe.py
tests/kernels/moe/untest_pplx_cutlass_moe.py
+0
-0
tests/kernels/moe/untest_silu_mul_fp8_quant_deep_gemm.py
tests/kernels/moe/untest_silu_mul_fp8_quant_deep_gemm.py
+0
-0
tests/kernels/moe/untest_triton_moe_ptpc_fp8.py
tests/kernels/moe/untest_triton_moe_ptpc_fp8.py
+0
-0
tests/kernels/quantization/__init__.py
tests/kernels/quantization/__init__.py
+0
-0
tests/kernels/quantization/test_gguf.py
tests/kernels/quantization/test_gguf.py
+1
-1
tests/kernels/quantization/test_int8_quant.py
tests/kernels/quantization/test_int8_quant.py
+2
-2
tests/kernels/quantization/test_triton_scaled_mm.py
tests/kernels/quantization/test_triton_scaled_mm.py
+3
-1
tests/kernels/quantization/untest_rocmskinny_gemms.py
tests/kernels/quantization/untest_rocmskinny_gemms.py
+0
-0
tests/kernels/test_flex_attention.py
tests/kernels/test_flex_attention.py
+3
-1
tests/kernels/untest_fused_quant_activation.py
tests/kernels/untest_fused_quant_activation.py
+1
-1
tests/kernels/untest_triton_flash_attention.py
tests/kernels/untest_triton_flash_attention.py
+22
-23
tests/models/registry.py
tests/models/registry.py
+4
-4
No files found.
tests/kernels/attention/test_attention.py
View file @
cfabf125
...
@@ -18,7 +18,7 @@ if not current_platform.is_rocm():
...
@@ -18,7 +18,7 @@ if not current_platform.is_rocm():
from
xformers
import
ops
as
xops
from
xformers
import
ops
as
xops
from
xformers.ops.fmha.attn_bias
import
BlockDiagonalCausalMask
from
xformers.ops.fmha.attn_bias
import
BlockDiagonalCausalMask
from
vllm.attention.backends.xformers
import
_make_alibi_bias
from
vllm.attention.backends.xformers
import
_make_alibi_bias
FLOAT32_BYTES
=
torch
.
finfo
(
torch
.
float
).
bits
//
8
FLOAT32_BYTES
=
torch
.
finfo
(
torch
.
float
).
bits
//
8
# This will change depending on the compute capability.
# This will change depending on the compute capability.
...
...
tests/kernels/attention/test_mha_attn.py
View file @
cfabf125
...
@@ -25,7 +25,7 @@ def clear_cache():
...
@@ -25,7 +25,7 @@ def clear_cache():
_cached_get_attn_backend
.
cache_clear
()
_cached_get_attn_backend
.
cache_clear
()
@
pytest
.
mark
.
parametrize
(
"device"
,
[
"cpu"
,
"hip"
,
"cuda"
]
)
@
pytest
.
mark
.
parametrize
(
"device"
,
[
"cpu"
,
"hip"
,
"cuda"
]
if
not
current_platform
.
is_rocm
()
else
[
"cpu"
,
"hip"
])
def
test_mha_attn_platform
(
device
:
str
):
def
test_mha_attn_platform
(
device
:
str
):
"""
"""
Test the attention selector between different platform and device.
Test the attention selector between different platform and device.
...
...
tests/kernels/attention/test_triton_unified_attention.py
View file @
cfabf125
...
@@ -15,7 +15,7 @@ BLOCK_SIZES = [16, 32]
...
@@ -15,7 +15,7 @@ BLOCK_SIZES = [16, 32]
DTYPES
=
[
torch
.
float16
,
torch
.
bfloat16
]
DTYPES
=
[
torch
.
float16
,
torch
.
bfloat16
]
QDTYPES
=
[
None
,
torch
.
float8_e4m3fn
]
if
not
current_platform
.
is_rocm
()
else
[
QDTYPES
=
[
None
,
torch
.
float8_e4m3fn
]
if
not
current_platform
.
is_rocm
()
else
[
None
,
torch
.
float8_e4m3fnuz
None
#
, torch.float8_e4m3fnuz
]
]
# one value large enough to test overflow in index calculation.
# one value large enough to test overflow in index calculation.
# one value small enough to test the schema op check
# one value small enough to test the schema op check
...
...
tests/kernels/moe/test_batched_moe.py
View file @
cfabf125
...
@@ -96,7 +96,7 @@ class BatchedMMTensors:
...
@@ -96,7 +96,7 @@ class BatchedMMTensors:
@
pytest
.
mark
.
parametrize
(
"N"
,
[
128
,
256
,
1024
])
@
pytest
.
mark
.
parametrize
(
"N"
,
[
128
,
256
,
1024
])
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"dtype"
,
"dtype"
,
[
torch
.
float8_e4m3fn
,
torch
.
float32
,
torch
.
float16
,
torch
.
bfloat16
])
[
torch
.
float8_e4m3fn
,
torch
.
float32
,
torch
.
float16
,
torch
.
bfloat16
]
if
not
current_platform
.
is_rocm
()
else
[
torch
.
float32
,
torch
.
float16
,
torch
.
bfloat16
])
@
pytest
.
mark
.
parametrize
(
"block_shape"
,
[
None
,
[
128
,
128
]])
@
pytest
.
mark
.
parametrize
(
"block_shape"
,
[
None
,
[
128
,
128
]])
@
pytest
.
mark
.
parametrize
(
"per_act_token_quant"
,
[
False
,
True
])
@
pytest
.
mark
.
parametrize
(
"per_act_token_quant"
,
[
False
,
True
])
def
test_batched_mm
(
num_experts
:
int
,
max_tokens_per_expert
:
int
,
K
:
int
,
def
test_batched_mm
(
num_experts
:
int
,
max_tokens_per_expert
:
int
,
K
:
int
,
...
@@ -208,7 +208,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
...
@@ -208,7 +208,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
@
pytest
.
mark
.
parametrize
((
"m"
,
"n"
,
"k"
),
MNK_FACTORS
)
@
pytest
.
mark
.
parametrize
((
"m"
,
"n"
,
"k"
),
MNK_FACTORS
)
@
pytest
.
mark
.
parametrize
(
"e"
,
NUM_EXPERTS
)
@
pytest
.
mark
.
parametrize
(
"e"
,
NUM_EXPERTS
)
@
pytest
.
mark
.
parametrize
(
"topk"
,
TOP_KS
)
@
pytest
.
mark
.
parametrize
(
"topk"
,
TOP_KS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
torch
.
float8_e4m3fn
,
torch
.
bfloat16
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
torch
.
float8_e4m3fn
,
torch
.
bfloat16
]
if
not
current_platform
.
is_rocm
()
else
[
torch
.
bfloat16
]
)
@
pytest
.
mark
.
parametrize
(
"per_act_token_quant"
,
[
False
,
True
])
@
pytest
.
mark
.
parametrize
(
"per_act_token_quant"
,
[
False
,
True
])
@
pytest
.
mark
.
parametrize
(
"block_shape"
,
[
None
,
[
128
,
128
]])
@
pytest
.
mark
.
parametrize
(
"block_shape"
,
[
None
,
[
128
,
128
]])
@
pytest
.
mark
.
parametrize
(
"input_scales"
,
[
False
])
@
pytest
.
mark
.
parametrize
(
"input_scales"
,
[
False
])
...
...
tests/kernels/moe/test_moe.py
View file @
cfabf125
...
@@ -353,7 +353,7 @@ def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int,
...
@@ -353,7 +353,7 @@ def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int,
[
torch
.
float32
,
torch
.
float16
,
torch
.
bfloat16
])
[
torch
.
float32
,
torch
.
float16
,
torch
.
bfloat16
])
@
pytest
.
mark
.
parametrize
(
"padding"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"padding"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"use_rocm_aiter"
,
[
True
,
False
]
if
current_platform
.
is_rocm
()
else
[
False
])
"use_rocm_aiter"
,
[
True
,
False
]
if
not
current_platform
.
is_rocm
()
else
[
False
])
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
def
test_mixtral_moe
(
dtype
:
torch
.
dtype
,
padding
:
bool
,
use_rocm_aiter
:
bool
,
def
test_mixtral_moe
(
dtype
:
torch
.
dtype
,
padding
:
bool
,
use_rocm_aiter
:
bool
,
monkeypatch
):
monkeypatch
):
...
...
tests/kernels/moe/test_block_fp8.py
→
tests/kernels/moe/
un
test_block_fp8.py
View file @
cfabf125
File moved
tests/kernels/moe/test_moe_permute_unpermute.py
→
tests/kernels/moe/
un
test_moe_permute_unpermute.py
View file @
cfabf125
File moved
tests/kernels/moe/test_nvfp4_moe.py
→
tests/kernels/moe/
un
test_nvfp4_moe.py
View file @
cfabf125
File moved
tests/kernels/moe/test_pplx_cutlass_moe.py
→
tests/kernels/moe/
un
test_pplx_cutlass_moe.py
View file @
cfabf125
File moved
tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py
→
tests/kernels/moe/
un
test_silu_mul_fp8_quant_deep_gemm.py
View file @
cfabf125
File moved
tests/kernels/moe/test_triton_moe_ptpc_fp8.py
→
tests/kernels/moe/
un
test_triton_moe_ptpc_fp8.py
View file @
cfabf125
File moved
tests/kernels/quantization/__init__.py
0 → 100644
View file @
cfabf125
tests/kernels/quantization/test_gguf.py
View file @
cfabf125
...
@@ -13,7 +13,7 @@ import vllm._custom_ops as ops
...
@@ -13,7 +13,7 @@ import vllm._custom_ops as ops
from
vllm.model_executor.layers.fused_moe
import
fused_experts
from
vllm.model_executor.layers.fused_moe
import
fused_experts
from
vllm.model_executor.layers.quantization.gguf
import
_fused_moe_gguf
from
vllm.model_executor.layers.quantization.gguf
import
_fused_moe_gguf
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
..utils
import
models_path_prefix
from
..
.
utils
import
models_path_prefix
# GGUF_SAMPLE = snapshot_download("Isotr0py/test-gguf-sample")
# GGUF_SAMPLE = snapshot_download("Isotr0py/test-gguf-sample")
# GGUF_SAMPLE_MOE = snapshot_download("SzymonOzog/test-gguf-moe-sample")
# GGUF_SAMPLE_MOE = snapshot_download("SzymonOzog/test-gguf-moe-sample")
...
...
tests/kernels/quantization/test_int8_quant.py
View file @
cfabf125
...
@@ -42,7 +42,7 @@ def opcheck_int8_quant_dynamic(output, input, symmetric=True):
...
@@ -42,7 +42,7 @@ def opcheck_int8_quant_dynamic(output, input, symmetric=True):
(
output
,
input
,
scale
,
azp
))
(
output
,
input
,
scale
,
azp
))
@
pytest
.
mark
.
skipif
(
current_platform
(),
@
pytest
.
mark
.
skipif
(
current_platform
.
is_rocm
(),
reason
=
"Currently, there is not supported on ROCm."
)
reason
=
"Currently, there is not supported on ROCm."
)
@
pytest
.
mark
.
parametrize
(
"num_tokens"
,
NUM_TOKENS
)
@
pytest
.
mark
.
parametrize
(
"num_tokens"
,
NUM_TOKENS
)
@
pytest
.
mark
.
parametrize
(
"hidden_size"
,
HIDDEN_SIZES
)
@
pytest
.
mark
.
parametrize
(
"hidden_size"
,
HIDDEN_SIZES
)
...
@@ -67,7 +67,7 @@ def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int,
...
@@ -67,7 +67,7 @@ def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int,
opcheck_int8_quant_dynamic
(
ops_out
,
x
)
opcheck_int8_quant_dynamic
(
ops_out
,
x
)
@
pytest
.
mark
.
skipif
(
current_platform
(),
@
pytest
.
mark
.
skipif
(
current_platform
.
is_rocm
(),
reason
=
"Currently, there is not supported on ROCm."
)
reason
=
"Currently, there is not supported on ROCm."
)
@
pytest
.
mark
.
parametrize
(
"num_tokens"
,
NUM_TOKENS
)
@
pytest
.
mark
.
parametrize
(
"num_tokens"
,
NUM_TOKENS
)
@
pytest
.
mark
.
parametrize
(
"hidden_size"
,
HIDDEN_SIZES
)
@
pytest
.
mark
.
parametrize
(
"hidden_size"
,
HIDDEN_SIZES
)
...
...
tests/kernels/quantization/test_triton_scaled_mm.py
View file @
cfabf125
...
@@ -4,6 +4,7 @@
...
@@ -4,6 +4,7 @@
Run `pytest tests/kernels/test_triton_scaled_mm.py`.
Run `pytest tests/kernels/test_triton_scaled_mm.py`.
"""
"""
import
os
import
importlib
import
importlib
from
typing
import
Optional
from
typing
import
Optional
...
@@ -11,6 +12,7 @@ import pytest
...
@@ -11,6 +12,7 @@ import pytest
import
torch
import
torch
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
...utils
import
models_path_prefix
device
=
"cuda"
device
=
"cuda"
...
@@ -45,7 +47,7 @@ def get_8bit_types():
...
@@ -45,7 +47,7 @@ def get_8bit_types():
# This test is to check regressions for int8 support on ROCm.
# This test is to check regressions for int8 support on ROCm.
@
pytest
.
mark
.
parametrize
(
"model_path"
,
[
@
pytest
.
mark
.
parametrize
(
"model_path"
,
[
"neuralmagic/Llama-3.2-1B-quantized.w8a8"
,
os
.
path
.
join
(
models_path_prefix
,
"neuralmagic/Llama-3.2-1B-quantized.w8a8"
)
,
])
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
10
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
10
])
...
...
tests/kernels/quantization/test_rocm
_
skinny_gemms.py
→
tests/kernels/quantization/
un
test_rocmskinny_gemms.py
View file @
cfabf125
File moved
tests/kernels/test_flex_attention.py
View file @
cfabf125
...
@@ -2,6 +2,7 @@
...
@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Integration tests for FlexAttention backend vs default backend"""
"""Integration tests for FlexAttention backend vs default backend"""
import
os
import
random
import
random
import
numpy
as
np
import
numpy
as
np
...
@@ -10,6 +11,7 @@ import torch
...
@@ -10,6 +11,7 @@ import torch
from
packaging
import
version
from
packaging
import
version
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
from
..utils
import
models_path_prefix
TORCH_VERSION
=
version
.
parse
(
torch
.
__version__
)
TORCH_VERSION
=
version
.
parse
(
torch
.
__version__
)
MINIMUM_TORCH_VERSION
=
version
.
parse
(
"2.7.0"
)
MINIMUM_TORCH_VERSION
=
version
.
parse
(
"2.7.0"
)
...
@@ -34,7 +36,7 @@ def test_flex_attention_vs_default_backend(monkeypatch):
...
@@ -34,7 +36,7 @@ def test_flex_attention_vs_default_backend(monkeypatch):
This test compares the outputs from the FlexAttention backend with
This test compares the outputs from the FlexAttention backend with
the default backend, ensuring they are identical when using the same seed.
the default backend, ensuring they are identical when using the same seed.
"""
"""
model_name
=
"Qwen/Qwen2.5-1.5B-Instruct"
model_name
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-1.5B-Instruct"
)
seed
=
42
seed
=
42
max_tokens
=
32
max_tokens
=
32
prompts
=
[
prompts
=
[
...
...
tests/kernels/test_fused_quant_activation.py
→
tests/kernels/
un
test_fused_quant_activation.py
View file @
cfabf125
...
@@ -9,7 +9,7 @@ from vllm.model_executor.layers.activation import SiluAndMul
...
@@ -9,7 +9,7 @@ from vllm.model_executor.layers.activation import SiluAndMul
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
DTYPES
=
[
torch
.
bfloat16
,
torch
.
float16
]
DTYPES
=
[
torch
.
bfloat16
,
torch
.
float16
]
QUANT_DTYPES
=
[
current_platform
.
fp8_dtype
()]
QUANT_DTYPES
=
[
current_platform
.
fp8_dtype
()]
if
not
current_platform
.
is_rocm
()
else
[
None
]
NUM_TOKENS
=
[
1
,
17
,
86
,
1234
,
3045
]
# Arbitrary values for testing
NUM_TOKENS
=
[
1
,
17
,
86
,
1234
,
3045
]
# Arbitrary values for testing
HIDDEN_SIZES
=
[
16
,
48
,
128
,
1562
,
4096
]
# Arbitrary values for testing
HIDDEN_SIZES
=
[
16
,
48
,
128
,
1562
,
4096
]
# Arbitrary values for testing
SEEDS
=
[
0
]
SEEDS
=
[
0
]
...
...
tests/kernels/test_triton_flash_attention.py
→
tests/kernels/
un
test_triton_flash_attention.py
View file @
cfabf125
...
@@ -7,8 +7,7 @@ Run `pytest tests/kernels/test_triton_flash_attention.py`.
...
@@ -7,8 +7,7 @@ Run `pytest tests/kernels/test_triton_flash_attention.py`.
import
pytest
import
pytest
import
torch
import
torch
from
vllm.attention.ops.triton_flash_attention
import
(
SUPPORTED_LAYOUTS
,
from
vllm.attention.ops.triton_flash_attention
import
(
MetaData
,
MetaData
,
compute_alibi_tensor
,
compute_alibi_tensor
,
scale_fp8
,
scale_fp8
,
triton_attention_rocm
)
triton_attention_rocm
)
...
@@ -60,26 +59,26 @@ class ReferenceAttention:
...
@@ -60,26 +59,26 @@ class ReferenceAttention:
ref_out
=
ref_out
.
transpose
(
1
,
2
).
clone
()
ref_out
=
ref_out
.
transpose
(
1
,
2
).
clone
()
return
ref_out
return
ref_out
def
fwd_fp8
(
self
,
q_quantized
,
k_quantized
,
v_quantized
):
#
def fwd_fp8(self, q_quantized, k_quantized, v_quantized):
q
=
(
q_quantized
.
to
(
torch
.
float16
)
*
self
.
input_metadata
.
q_descale
).
to
(
#
q = (q_quantized.to(torch.float16) * self.input_metadata.q_descale).to(
self
.
dtype
)
#
self.dtype)
k
=
(
k_quantized
.
to
(
torch
.
float16
)
*
self
.
input_metadata
.
k_descale
).
to
(
#
k = (k_quantized.to(torch.float16) * self.input_metadata.k_descale).to(
self
.
dtype
)
#
self.dtype)
v
=
(
v_quantized
.
to
(
torch
.
float16
)
*
self
.
input_metadata
.
v_descale
).
to
(
#
v = (v_quantized.to(torch.float16) * self.input_metadata.v_descale).to(
self
.
dtype
)
#
self.dtype)
result
=
self
.
fwd
(
q
,
k
,
v
)
#
result = self.fwd(q, k, v)
if
self
.
input_metadata
.
o_scale
is
not
None
:
#
if self.input_metadata.o_scale is not None:
result
,
_
=
scale_fp8
(
result
,
self
.
input_metadata
.
o_scale
)
#
result, _ = scale_fp8(result, self.input_metadata.o_scale)
return
result
#
return result
def
fwd_fp8_kv
(
self
,
q
,
k_quantized
,
v_quantized
):
#
def fwd_fp8_kv(self, q, k_quantized, v_quantized):
k_descale
,
v_descale
=
(
self
.
input_metadata
.
k_descale
,
#
k_descale, v_descale = (self.input_metadata.k_descale,
self
.
input_metadata
.
v_descale
)
#
self.input_metadata.v_descale)
k_dequantized
=
(
k_quantized
.
to
(
torch
.
float32
)
*
#
k_dequantized = (k_quantized.to(torch.float32) *
k_descale
.
to
(
torch
.
float32
)).
to
(
self
.
dtype
)
#
k_descale.to(torch.float32)).to(self.dtype)
v_dequantized
=
(
v_quantized
.
to
(
torch
.
float32
)
*
#
v_dequantized = (v_quantized.to(torch.float32) *
v_descale
.
to
(
torch
.
float32
)).
to
(
self
.
dtype
)
#
v_descale.to(torch.float32)).to(self.dtype)
return
self
.
fwd
(
q
,
k_dequantized
,
v_dequantized
)
#
return self.fwd(q, k_dequantized, v_dequantized)
def
varlen_fwd
(
self
,
q
,
k
,
v
,
is_mqa
=
False
):
def
varlen_fwd
(
self
,
q
,
k
,
v
,
is_mqa
=
False
):
ref_out
=
torch
.
empty_like
(
q
)
ref_out
=
torch
.
empty_like
(
q
)
...
@@ -145,7 +144,7 @@ def input_helper(
...
@@ -145,7 +144,7 @@ def input_helper(
use_o_scale
=
False
,
use_o_scale
=
False
,
use_bias
=
False
,
use_bias
=
False
,
):
):
assert
layout
in
SUPPORTED_LAYOUTS
,
"Got unsupported layout."
#
assert layout in SUPPORTED_LAYOUTS, "Got unsupported layout."
current_platform
.
seed_everything
(
0
)
current_platform
.
seed_everything
(
0
)
...
...
tests/models/registry.py
View file @
cfabf125
...
@@ -210,7 +210,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -210,7 +210,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"JambaForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"ai21labs/AI21-Jamba-1.5-Mini"
),
"JambaForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"ai21labs/AI21-Jamba-1.5-Mini"
),
extras
=
{
"tiny"
:
os
.
path
.
join
(
models_path_prefix
,
"ai21labs/Jamba-tiny-dev"
)}),
# noqa: E501
extras
=
{
"tiny"
:
os
.
path
.
join
(
models_path_prefix
,
"ai21labs/Jamba-tiny-dev"
)}),
# noqa: E501
"LlamaForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
),
"LlamaForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
),
extras
=
{
"guard"
:
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-Guard-3-1B"
,
# noqa: E501
extras
=
{
"guard"
:
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-Guard-3-1B"
)
,
# noqa: E501
"hermes"
:
os
.
path
.
join
(
models_path_prefix
,
"NousResearch/Hermes-3-Llama-3.1-8B"
),
# noqa: E501
"hermes"
:
os
.
path
.
join
(
models_path_prefix
,
"NousResearch/Hermes-3-Llama-3.1-8B"
),
# noqa: E501
"fp8"
:
os
.
path
.
join
(
models_path_prefix
,
"RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"
)}),
# noqa: E501
"fp8"
:
os
.
path
.
join
(
models_path_prefix
,
"RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"
)}),
# noqa: E501
"LLaMAForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"decapoda-research/llama-7b-hf"
),
"LLaMAForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"decapoda-research/llama-7b-hf"
),
...
@@ -367,12 +367,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -367,12 +367,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"Idefics3ForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"HuggingFaceM4/Idefics3-8B-Llama3"
),
# noqa: E501
"Idefics3ForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"HuggingFaceM4/Idefics3-8B-Llama3"
),
# noqa: E501
{
"tiny"
:
os
.
path
.
join
(
models_path_prefix
,
"HuggingFaceTB/SmolVLM-256M-Instruct"
)}),
# noqa: E501
{
"tiny"
:
os
.
path
.
join
(
models_path_prefix
,
"HuggingFaceTB/SmolVLM-256M-Instruct"
)}),
# noqa: E501
"KeyeForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Kwai-Keye/Keye-VL-8B-Preview"
,
# noqa: E501
"KeyeForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Kwai-Keye/Keye-VL-8B-Preview"
)
,
# noqa: E501
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"KimiVLForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"moonshotai/Kimi-VL-A3B-Instruct"
),
# noqa: E501
"KimiVLForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"moonshotai/Kimi-VL-A3B-Instruct"
),
# noqa: E501
extras
=
{
"thinking"
:
os
.
path
.
join
(
models_path_prefix
,
"moonshotai/Kimi-VL-A3B-Thinking"
)},
# noqa: E501
extras
=
{
"thinking"
:
os
.
path
.
join
(
models_path_prefix
,
"moonshotai/Kimi-VL-A3B-Thinking"
)},
# noqa: E501
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"Llama4ForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
,
# noqa: E501
"Llama4ForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
)
,
# noqa: E501
max_model_len
=
10240
),
max_model_len
=
10240
),
"LlavaForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-1.5-7b-hf"
),
"LlavaForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-1.5-7b-hf"
),
extras
=
{
"mistral"
:
os
.
path
.
join
(
models_path_prefix
,
"mistral-community/pixtral-12b"
),
# noqa: E501
extras
=
{
"mistral"
:
os
.
path
.
join
(
models_path_prefix
,
"mistral-community/pixtral-12b"
),
# noqa: E501
...
@@ -407,7 +407,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -407,7 +407,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
trust_remote_code
=
True
,
trust_remote_code
=
True
,
max_transformers_version
=
"4.48"
,
max_transformers_version
=
"4.48"
,
transformers_version_reason
=
"Use of deprecated imports which have been removed."
,
# noqa: E501
transformers_version_reason
=
"Use of deprecated imports which have been removed."
,
# noqa: E501
extras
=
{
"phi3.5"
:
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3.5-vision-instruct"
}
)),
# noqa: E501
extras
=
{
"phi3.5"
:
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3.5-vision-instruct"
)
}
),
# noqa: E501
"Ovis"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"AIDC-AI/Ovis2-1B"
),
trust_remote_code
=
True
,
"Ovis"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"AIDC-AI/Ovis2-1B"
),
trust_remote_code
=
True
,
extras
=
{
"1.6-llama"
:
os
.
path
.
join
(
models_path_prefix
,
"AIDC-AI/Ovis1.6-Llama3.2-3B"
),
extras
=
{
"1.6-llama"
:
os
.
path
.
join
(
models_path_prefix
,
"AIDC-AI/Ovis1.6-Llama3.2-3B"
),
"1.6-gemma"
:
os
.
path
.
join
(
models_path_prefix
,
"AIDC-AI/Ovis1.6-Gemma2-9B"
)}),
# noqa: E501
"1.6-gemma"
:
os
.
path
.
join
(
models_path_prefix
,
"AIDC-AI/Ovis1.6-Gemma2-9B"
)}),
# noqa: E501
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment