Commit 081057de authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.8.5' into v0.8.5-ori

parents 7cf5d5c4 ba41cc90
# SPDX-License-Identifier: Apache-2.0
import pytest
import schemathesis
from schemathesis import GenerationConfig
from ...utils import RemoteOpenAIServer
schemathesis.experimental.OPEN_API_3_1.enable()
MODEL_NAME = "HuggingFaceTB/SmolVLM-256M-Instruct"
MAXIMUM_IMAGES = 2
@pytest.fixture(scope="module")
def server():
    """Yield a ``RemoteOpenAIServer`` for ``MODEL_NAME``, shared per module.

    The server is started with a fixed set of CLI flags (listed below as
    flag/value pairs) and is kept open by the ``with`` block until the
    module-scoped fixture is torn down.
    """
    image_limit = f"image={MAXIMUM_IMAGES}"
    cli_args = [
        "--task", "generate",
        "--max-model-len", "2048",
        "--max-num-seqs", "5",
        "--enforce-eager",
        "--trust-remote-code",
        "--limit-mm-per-prompt", image_limit,
    ]
    with RemoteOpenAIServer(MODEL_NAME, cli_args) as remote_server:
        yield remote_server
@pytest.fixture(scope="module")
def get_schema(server):
    """Load the OpenAPI schema published by ``server`` at ``/openapi.json``."""
    schema_url = f"{server.url_root}/openapi.json"
    # avoid generating null (\x00) bytes in strings during test case
    # generation
    generation = GenerationConfig(allow_x00=False)
    return schemathesis.openapi.from_uri(schema_url,
                                         generation_config=generation)
schema = schemathesis.from_pytest_fixture("get_schema")
@schema.parametrize()
@schema.override(headers={"Content-Type": "application/json"})
async def test_openapi_stateless(case):
    """Send each generated ``case`` to the server and validate the response.

    Every request is sent with a JSON ``Content-Type`` header via the
    ``schema.override`` decorator.
    """
    # The server runs on localhost, so SSL certificate verification is
    # switched off.
    verify_ssl = False
    await case.call_and_validate(verify=verify_ssl)
...@@ -192,3 +192,36 @@ async def test_stream_options(winning_call): ...@@ -192,3 +192,36 @@ async def test_stream_options(winning_call):
else: else:
continuous = continuous and hasattr(chunk, 'usage') continuous = continuous and hasattr(chunk, 'usage')
assert final and continuous assert final and continuous
@pytest.mark.asyncio
async def test_sampling_params(mary_had_lamb):
    """
    Transcribe the same audio twice -- once with extreme sampling parameter
    values and once greedily (temperature 0) -- and assert that the two
    transcripts are different.
    """
    model_name = "openai/whisper-small"

    # Arguments shared by both transcription requests.
    request_kwargs = {
        "model": model_name,
        "file": mary_had_lamb,
        "language": "en",
    }
    # Deliberately extreme values so the output diverges from greedy decoding.
    extreme_sampling = {
        "seed": 42,
        "repetition_penalty": 1.9,
        "top_k": 12,
        "top_p": 0.4,
        "min_p": 0.5,
        "frequency_penalty": 1.8,
        "presence_penalty": 2.0,
    }

    with RemoteOpenAIServer(model_name, ["--enforce-eager"]) as remote_server:
        client = remote_server.get_async_client()

        transcription = await client.audio.transcriptions.create(
            **request_kwargs,
            temperature=0.8,
            extra_body=extreme_sampling)

        greedy_transcription = await client.audio.transcriptions.create(
            **request_kwargs,
            temperature=0.0,
            extra_body={"seed": 42})

        assert greedy_transcription.text != transcription.text
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import json
import openai import openai
import pytest import pytest
import pytest_asyncio import pytest_asyncio
...@@ -31,7 +33,7 @@ def server(): ...@@ -31,7 +33,7 @@ def server():
"--enforce-eager", "--enforce-eager",
"--trust-remote-code", "--trust-remote-code",
"--limit-mm-per-prompt", "--limit-mm-per-prompt",
f"video={MAXIMUM_VIDEOS}", json.dumps({"video": MAXIMUM_VIDEOS}),
] ]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
...@@ -106,6 +108,35 @@ async def test_single_chat_session_video(client: openai.AsyncOpenAI, ...@@ -106,6 +108,35 @@ async def test_single_chat_session_video(client: openai.AsyncOpenAI,
assert message.content is not None and len(message.content) >= 0 assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_error_on_invalid_video_url_type(client: openai.AsyncOpenAI,
                                               model_name: str,
                                               video_url: str):
    """Expect ``openai.BadRequestError`` when ``video_url`` is a bare string."""
    # video_url should be a dict {"url": "some url"}, not directly a string
    malformed_video_part = {"type": "video_url", "video_url": video_url}
    question_part = {"type": "text", "text": "What's in this video?"}
    messages = [{
        "role": "user",
        "content": [malformed_video_part, question_part],
    }]

    with pytest.raises(openai.BadRequestError):
        await client.chat.completions.create(model=model_name,
                                             messages=messages,
                                             max_completion_tokens=10,
                                             temperature=0.0)
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import json
import openai import openai
import pytest import pytest
import pytest_asyncio import pytest_asyncio
...@@ -35,7 +37,7 @@ def server(): ...@@ -35,7 +37,7 @@ def server():
"--enforce-eager", "--enforce-eager",
"--trust-remote-code", "--trust-remote-code",
"--limit-mm-per-prompt", "--limit-mm-per-prompt",
f"image={MAXIMUM_IMAGES}", json.dumps({"image": MAXIMUM_IMAGES}),
] ]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
...@@ -135,6 +137,36 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI, ...@@ -135,6 +137,36 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI,
assert message.content is not None and len(message.content) >= 0 assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_error_on_invalid_image_url_type(client: openai.AsyncOpenAI,
                                               model_name: str,
                                               image_url: str):
    """Expect ``openai.BadRequestError`` when ``image_url`` is a bare string."""
    # image_url should be a dict {"url": "some url"}, not directly a string
    malformed_image_part = {"type": "image_url", "image_url": image_url}
    question_part = {"type": "text", "text": "What's in this image?"}
    messages = [{
        "role": "user",
        "content": [malformed_image_part, question_part],
    }]

    with pytest.raises(openai.BadRequestError):
        await client.chat.completions.create(model=model_name,
                                             messages=messages,
                                             max_completion_tokens=10,
                                             temperature=0.0)
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import json
import pytest import pytest
import requests import requests
from PIL import Image from PIL import Image
...@@ -37,7 +39,7 @@ def server(): ...@@ -37,7 +39,7 @@ def server():
"--enforce-eager", "--enforce-eager",
"--trust-remote-code", "--trust-remote-code",
"--limit-mm-per-prompt", "--limit-mm-per-prompt",
f"image={MAXIMUM_IMAGES}", json.dumps({"image": MAXIMUM_IMAGES}),
"--chat-template", "--chat-template",
str(vlm2vec_jinja_path), str(vlm2vec_jinja_path),
] ]
......
...@@ -6,13 +6,12 @@ from typing import Optional ...@@ -6,13 +6,12 @@ from typing import Optional
import pytest import pytest
import torch import torch
from tests.kernels.allclose_default import get_default_atol, get_default_rtol
from tests.kernels.utils import opcheck from tests.kernels.utils import opcheck
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import get_max_shared_memory_bytes from vllm.utils import get_max_shared_memory_bytes
from .allclose_default import get_default_atol, get_default_rtol
if not current_platform.is_rocm(): if not current_platform.is_rocm():
from xformers import ops as xops from xformers import ops as xops
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
......
...@@ -19,45 +19,161 @@ def clear_cache(): ...@@ -19,45 +19,161 @@ def clear_cache():
_cached_get_attn_backend.cache_clear() _cached_get_attn_backend.cache_clear()
@pytest.mark.parametrize( # Define MLA and non-MLA backends separately
"name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER"]) DEVICE_MLA_BACKENDS = {
"cuda": ["TRITON_MLA", "FLASHMLA"],
"hip": ["TRITON_MLA", "ROCM_AITER_MLA"],
"cpu": [],
}
# Backend names that generate_params() pairs with each device when
# use_mla is False.
DEVICE_REGULAR_ATTN_BACKENDS = {
    "cuda": ["XFORMERS", "FLASHINFER"],
    "hip": ["ROCM_FLASH"],
    "cpu": ["TORCH_SDPA"],
}
# Block sizes that generate_params() pairs with each device when use_mla is
# True; the non-MLA cases always use a block size of 16.
DEVICE_MLA_BLOCK_SIZES = {
    "cuda": [16, 64],  # CUDA supports both standard and extended block sizes
    "hip": [16, 1],  # HIP requires special handling for block_size=1
    "cpu": [16]  # CPU uses fixed block size from test cases
}
def generate_params():
    """Build the ``pytest.param`` list of device/backend/MLA/block-size cases.

    MLA cases come first, then non-MLA cases. Within each, devices are
    visited in the order cuda, hip, cpu, then the device's backends, then
    the block sizes. Non-MLA cases always use a block size of 16.

    Returns:
        A list of ``pytest.param(device, name, use_mla, block_size)`` entries,
        each with an id of the form
        ``<device>_<name>_mla_<T|F>_blks<block_size>``.
    """
    params = []
    for use_mla in (True, False):
        # Pick the backend table once per MLA setting.
        if use_mla:
            backend_table = DEVICE_MLA_BACKENDS
        else:
            backend_table = DEVICE_REGULAR_ATTN_BACKENDS

        for device in ("cuda", "hip", "cpu"):
            for name in backend_table[device]:
                if use_mla:
                    block_sizes = DEVICE_MLA_BLOCK_SIZES[device]
                else:
                    block_sizes = [16]
                params.extend(
                    pytest.param(
                        device,
                        name,
                        use_mla,
                        block_size,
                        id=
                        f"{device}_{name}_mla_{str(use_mla)[0]}_blks{block_size}"
                    ) for block_size in block_sizes)
    return params
@pytest.mark.parametrize("device, name, use_mla, block_size",
generate_params())
@pytest.mark.parametrize("use_v1", [True, False]) @pytest.mark.parametrize("use_v1", [True, False])
@pytest.mark.parametrize("device", ["cpu", "hip", "cuda"])
def test_env( def test_env(
device: str,
name: str, name: str,
use_mla: bool,
block_size: int,
use_v1: bool, use_v1: bool,
device: str,
monkeypatch: pytest.MonkeyPatch, monkeypatch: pytest.MonkeyPatch,
): ):
"""Test that the attention selector can be set via environment variable. """Test attention backend selection with valid device-backend pairs."""
Note that we do not test FlashAttn because it is the default backend.
"""
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
m.setenv(STR_BACKEND_ENV_VAR, name) m.setenv(STR_BACKEND_ENV_VAR, name)
m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0")
if device == "cpu": if device == "cpu":
with patch("vllm.attention.selector.current_platform", with patch("vllm.attention.selector.current_platform",
CpuPlatform()): CpuPlatform()):
backend = get_attn_backend(16, torch.float16, torch.float16, backend = get_attn_backend(16, torch.float16, torch.float16,
16, False) block_size, False)
assert backend.get_name() == "TORCH_SDPA" assert backend.get_name() == "TORCH_SDPA"
elif device == "hip": elif device == "hip":
with patch("vllm.attention.selector.current_platform", with patch("vllm.attention.selector.current_platform",
RocmPlatform()): RocmPlatform()):
backend = get_attn_backend(16, torch.float16, torch.float16, if use_mla:
16, False) # Validate HIP MLA backend-block_size combinations
EXPECTED = "TRITON_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH" valid_combination = (
assert backend.get_name() == EXPECTED (name == "TRITON_MLA" and block_size != 1)
else: or (name == "ROCM_AITER_MLA" and block_size == 1))
if name in ["XFORMERS", "FLASHINFER"]:
with patch("vllm.attention.selector.current_platform", if valid_combination:
CudaPlatform()): backend = get_attn_backend(16,
backend = get_attn_backend(16, torch.float16, torch.float16,
torch.float16, 16, False) torch.float16,
EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else name block_size,
assert backend.get_name() == EXPECTED False,
use_mla=use_mla)
assert backend.get_name() == name
else:
with pytest.raises(ValueError) as exc_info:
get_attn_backend(16,
torch.float16,
torch.float16,
block_size,
False,
use_mla=use_mla)
assert f"The selected backend, {name}" in str(
exc_info.value)
else:
backend = get_attn_backend(16,
torch.float16,
torch.float16,
block_size,
False,
use_mla=use_mla)
expected = "TRITON_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH"
assert backend.get_name() == expected
elif device == "cuda":
with patch("vllm.attention.selector.current_platform",
CudaPlatform()):
if use_mla:
if name == "FLASHMLA" and block_size == 64:
from vllm.attention.backends.flashmla import (
is_flashmla_supported)
# only on cuda platforms with specific capability.
is_supported, _ = is_flashmla_supported()
if not is_supported:
# if platform is not supported then skip this case.
pytest.skip()
else:
backend = get_attn_backend(16,
torch.float16,
torch.float16,
block_size,
False,
use_mla=use_mla)
expected = f"{name}_VLLM_V1" if use_v1 else name
assert backend.get_name() == expected
else:
backend = get_attn_backend(16,
torch.float16,
torch.float16,
block_size,
False,
use_mla=use_mla)
expected = ("TRITON_MLA_VLLM_V1"
if use_v1 else "TRITON_MLA")
assert backend.get_name() == expected
elif name == "FLASHINFER":
backend = get_attn_backend(16,
torch.float16,
torch.float16,
block_size,
False,
use_mla=use_mla)
expected = "FLASHINFER_VLLM_V1" if use_v1 else name
assert backend.get_name() == expected
else:
backend = get_attn_backend(16,
torch.float16,
torch.float16,
block_size,
False,
use_mla=use_mla)
expected = "FLASH_ATTN_VLLM_V1" if use_v1 else name
assert backend.get_name() == expected
def test_flash_attn(monkeypatch: pytest.MonkeyPatch): def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
......
...@@ -6,14 +6,13 @@ from typing import Optional ...@@ -6,14 +6,13 @@ from typing import Optional
import pytest import pytest
import torch import torch
from tests.kernels.allclose_default import get_default_atol, get_default_rtol
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.attention.ops.blocksparse_attention.interface import ( from vllm.attention.ops.blocksparse_attention.interface import (
LocalStridedBlockSparseAttn) LocalStridedBlockSparseAttn)
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import get_max_shared_memory_bytes from vllm.utils import get_max_shared_memory_bytes
from .allclose_default import get_default_atol, get_default_rtol
FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
# This will change depending on the compute capability. # This will change depending on the compute capability.
# - 512 as a buffer # - 512 as a buffer
......
...@@ -16,6 +16,7 @@ NUM_LAYERS = [1] # Arbitrary values for testing ...@@ -16,6 +16,7 @@ NUM_LAYERS = [1] # Arbitrary values for testing
NUM_HEADS = [8] # Arbitrary values for testing NUM_HEADS = [8] # Arbitrary values for testing
HEAD_SIZES = [64, 80, 120, 256] HEAD_SIZES = [64, 80, 120, 256]
BLOCK_SIZES = [8, 16, 32] BLOCK_SIZES = [8, 16, 32]
CACHE_LAYOUTS = ["NHD", "HND"]
# Parameters for MLA tests. # Parameters for MLA tests.
KV_LORA_RANKS = [512] KV_LORA_RANKS = [512]
...@@ -220,6 +221,7 @@ def test_reshape_and_cache( ...@@ -220,6 +221,7 @@ def test_reshape_and_cache(
@pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
@pytest.mark.parametrize("kv_cache_layout", CACHE_LAYOUTS)
@torch.inference_mode() @torch.inference_mode()
def test_reshape_and_cache_flash( def test_reshape_and_cache_flash(
kv_cache_factory_flashinfer, kv_cache_factory_flashinfer,
...@@ -232,17 +234,21 @@ def test_reshape_and_cache_flash( ...@@ -232,17 +234,21 @@ def test_reshape_and_cache_flash(
seed: int, seed: int,
device: str, device: str,
kv_cache_dtype: str, kv_cache_dtype: str,
kv_cache_layout: str,
) -> None: ) -> None:
current_platform.seed_everything(seed) current_platform.seed_everything(seed)
torch.set_default_device(device) torch.set_default_device(device)
# fp8 conversion requires contiguous memory buffer. Reduce the number of
# blocks and tokens to consume less memory.
num_tokens = num_tokens // 2
num_blocks = num_blocks // 2
# Create a random slot mapping. # Create a random slot mapping.
num_slots = block_size * num_blocks num_slots = block_size * num_blocks
slot_mapping_lst = random.sample(range(num_slots), num_tokens) slot_mapping_lst = random.sample(range(num_slots), num_tokens)
slot_mapping = torch.tensor(slot_mapping_lst, slot_mapping = torch.tensor(slot_mapping_lst,
dtype=torch.long, dtype=torch.long,
device=device) device=device)
qkv = torch.randn(num_tokens, qkv = torch.randn(num_tokens,
3, 3,
num_heads, num_heads,
...@@ -261,27 +267,35 @@ def test_reshape_and_cache_flash( ...@@ -261,27 +267,35 @@ def test_reshape_and_cache_flash(
kv_cache_dtype, kv_cache_dtype,
dtype, dtype,
device=device, device=device,
cache_layout=kv_cache_layout,
) )
key_cache, value_cache = key_caches[0].contiguous( key_cache, value_cache = key_caches[0], value_caches[0]
), value_caches[0].contiguous()
del key_caches del key_caches
del value_caches del value_caches
k_scale = (key.amax() / 64.0).to(torch.float32) k_scale = (key.amax() / 64.0).to(torch.float32)
v_scale = (value.amax() / 64.0).to(torch.float32) v_scale = (value.amax() / 64.0).to(torch.float32)
def permute_and_compact(x):
y = x if kv_cache_layout == "NHD" else x.permute(0, 2, 1, 3)
return y.contiguous()
key_cache_compact = permute_and_compact(key_cache)
value_cache_compact = permute_and_compact(value_cache)
# Clone the KV caches. # Clone the KV caches.
if kv_cache_dtype == "fp8": if kv_cache_dtype == "fp8":
cloned_key_cache = torch.empty_like(key_cache, dtype=torch.float16) cloned_key_cache = torch.empty_like(key_cache_compact,
ops.convert_fp8(cloned_key_cache, key_cache, k_scale.item(), dtype=torch.float16)
kv_cache_dtype) ops.convert_fp8(cloned_key_cache, key_cache_compact, k_scale.item(),
cloned_value_cache = torch.empty_like(value_cache, dtype=torch.float16)
ops.convert_fp8(cloned_value_cache, value_cache, v_scale.item(),
kv_cache_dtype) kv_cache_dtype)
cloned_value_cache = torch.empty_like(value_cache_compact,
dtype=torch.float16)
ops.convert_fp8(cloned_value_cache, value_cache_compact,
v_scale.item(), kv_cache_dtype)
else: else:
cloned_key_cache = key_cache.clone() cloned_key_cache = key_cache_compact.clone()
cloned_value_cache = value_cache.clone() cloned_value_cache = value_cache_compact.clone()
# Call the reshape_and_cache kernel. # Call the reshape_and_cache kernel.
opcheck(torch.ops._C_cache_ops.reshape_and_cache_flash, opcheck(torch.ops._C_cache_ops.reshape_and_cache_flash,
(key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype, (key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype,
...@@ -289,16 +303,20 @@ def test_reshape_and_cache_flash( ...@@ -289,16 +303,20 @@ def test_reshape_and_cache_flash(
cond=(head_size == HEAD_SIZES[0])) cond=(head_size == HEAD_SIZES[0]))
ops.reshape_and_cache_flash(key, value, key_cache, value_cache, ops.reshape_and_cache_flash(key, value, key_cache, value_cache,
slot_mapping, kv_cache_dtype, k_scale, v_scale) slot_mapping, kv_cache_dtype, k_scale, v_scale)
key_cache_compact = permute_and_compact(key_cache)
value_cache_compact = permute_and_compact(value_cache)
if kv_cache_dtype == "fp8": if kv_cache_dtype == "fp8":
result_key_cache = torch.empty_like(key_cache, dtype=torch.float16) result_key_cache = torch.empty_like(key_cache_compact,
dtype=torch.float16)
ops.convert_fp8(result_key_cache, ops.convert_fp8(result_key_cache,
key_cache, key_cache_compact,
k_scale.item(), k_scale.item(),
kv_dtype=kv_cache_dtype) kv_dtype=kv_cache_dtype)
result_value_cache = torch.empty_like(value_cache, dtype=torch.float16) result_value_cache = torch.empty_like(value_cache_compact,
dtype=torch.float16)
ops.convert_fp8(result_value_cache, ops.convert_fp8(result_value_cache,
value_cache, value_cache_compact,
v_scale.item(), v_scale.item(),
kv_dtype=kv_cache_dtype) kv_dtype=kv_cache_dtype)
...@@ -310,8 +328,12 @@ def test_reshape_and_cache_flash( ...@@ -310,8 +328,12 @@ def test_reshape_and_cache_flash(
for i in range(num_tokens): for i in range(num_tokens):
block_idx = block_indicies_lst[i] block_idx = block_indicies_lst[i]
block_offset = block_offsets_lst[i] block_offset = block_offsets_lst[i]
cloned_key_cache[block_idx, block_offset, :, :] = key[i] if kv_cache_layout == "NHD":
cloned_value_cache[block_idx, block_offset, :, :] = value[i] cloned_key_cache[block_idx, block_offset, :, :] = key[i]
cloned_value_cache[block_idx, block_offset, :, :] = value[i]
else:
cloned_key_cache[block_idx, :, block_offset, :] = key[i]
cloned_value_cache[block_idx, :, block_offset, :] = value[i]
if kv_cache_dtype == "fp8": if kv_cache_dtype == "fp8":
torch.testing.assert_close(result_key_cache, torch.testing.assert_close(result_key_cache,
...@@ -323,8 +345,8 @@ def test_reshape_and_cache_flash( ...@@ -323,8 +345,8 @@ def test_reshape_and_cache_flash(
atol=0.001, atol=0.001,
rtol=0.1) rtol=0.1)
else: else:
torch.testing.assert_close(key_cache, cloned_key_cache) torch.testing.assert_close(key_cache_compact, cloned_key_cache)
torch.testing.assert_close(value_cache, cloned_value_cache) torch.testing.assert_close(value_cache_compact, cloned_value_cache)
@pytest.mark.parametrize("direction", COPYING_DIRECTION) @pytest.mark.parametrize("direction", COPYING_DIRECTION)
......
...@@ -145,7 +145,7 @@ def test_flash_attn_with_paged_kv( ...@@ -145,7 +145,7 @@ def test_flash_attn_with_paged_kv(
v_descale = None v_descale = None
if q_dtype is not None: if q_dtype is not None:
# QKV are drawn from N(0, 1): no need for a fp8 scaling factor # QKV are drawn from N(0, 1): no need for a fp8 scaling factor
maybe_quantized_query = query.to(q_dtype) maybe_quantized_query = q.to(q_dtype)
maybe_quantized_key_cache = key_cache.to(q_dtype) maybe_quantized_key_cache = key_cache.to(q_dtype)
maybe_quantized_value_cache = value_cache.to(q_dtype) maybe_quantized_value_cache = value_cache.to(q_dtype)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment