Unverified Commit 7721ef17 authored by Li, Jiang's avatar Li, Jiang Committed by GitHub
Browse files

[CI/Build][CPU] Fix CPU CI and remove all CPU V0 files (#20560)


Signed-off-by: default avatarjiang1.li <jiang1.li@intel.com>
parent 8369b7c2
...@@ -48,10 +48,16 @@ function cpu_tests() { ...@@ -48,10 +48,16 @@ function cpu_tests() {
# Run basic model test # Run basic model test
docker exec cpu-test-"$NUMA_NODE" bash -c " docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e set -e
pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model # Note: disable until supports V1
pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model # pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
pytest -v -s tests/models/language/generation -m cpu_model # pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model
# Note: disable Bart until supports V1
pytest -v -s tests/models/language/generation -m cpu_model \
--ignore=tests/models/language/generation/test_bart.py
VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \
--ignore=tests/models/language/generation/test_bart.py
pytest -v -s tests/models/language/pooling -m cpu_model pytest -v -s tests/models/language/pooling -m cpu_model
pytest -v -s tests/models/multimodal/generation \ pytest -v -s tests/models/multimodal/generation \
--ignore=tests/models/multimodal/generation/test_mllama.py \ --ignore=tests/models/multimodal/generation/test_mllama.py \
...@@ -62,21 +68,15 @@ function cpu_tests() { ...@@ -62,21 +68,15 @@ function cpu_tests() {
docker exec cpu-test-"$NUMA_NODE" bash -c " docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e set -e
pytest -s -v \ pytest -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
# Note: disable it until supports V1
# Run AWQ test # Run AWQ test
# docker exec cpu-test-"$NUMA_NODE" bash -c " # docker exec cpu-test-"$NUMA_NODE" bash -c "
# set -e # set -e
# VLLM_USE_V1=0 pytest -s -v \ # VLLM_USE_V1=0 pytest -s -v \
# tests/quantization/test_ipex_quant.py" # tests/quantization/test_ipex_quant.py"
# Run chunked-prefill and prefix-cache test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -s -v -k cpu_model \
tests/basic_correctness/test_chunked_prefill.py"
# online serving # online serving
docker exec cpu-test-"$NUMA_NODE" bash -c " docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e set -e
......
...@@ -294,61 +294,3 @@ def test_with_prefix_caching( ...@@ -294,61 +294,3 @@ def test_with_prefix_caching(
name_0="w/o prefix caching", name_0="w/o prefix caching",
name_1="with prefix caching", name_1="with prefix caching",
) )
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("dtype", ["bfloat16", "half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
@pytest.mark.parametrize("enforce_eager", [False])
@pytest.mark.parametrize("attention_backend", ["TORCH_SDPA"])
@pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
def test_models_cpu(
hf_runner: HfRunner,
vllm_runner: VllmRunner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
chunked_prefill_token_size: int,
enforce_eager: bool,
attention_backend: str,
monkeypatch: pytest.MonkeyPatch,
) -> None:
test_models(
hf_runner,
vllm_runner,
example_prompts,
model,
dtype,
max_tokens,
chunked_prefill_token_size,
enforce_eager,
1,
attention_backend,
monkeypatch,
)
@pytest.mark.parametrize("max_tokens", [16])
@pytest.mark.parametrize("enforce_eager", [False])
@pytest.mark.parametrize("chunk_size", [30, 32])
@pytest.mark.parametrize("dtype", ["bfloat16", "half"])
@pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
def test_with_prefix_caching_cpu(
vllm_runner: VllmRunner,
max_tokens: int,
enforce_eager: bool,
chunk_size: int,
dtype: str,
) -> None:
test_with_prefix_caching(
vllm_runner,
max_tokens,
enforce_eager,
chunk_size,
1,
dtype,
)
...@@ -39,7 +39,7 @@ AITER_MODEL_LIST = [ ...@@ -39,7 +39,7 @@ AITER_MODEL_LIST = [
[ [
pytest.param( pytest.param(
"bigscience/bloom-560m", # bloom - testing alibi slopes "bigscience/bloom-560m", # bloom - testing alibi slopes
marks=[pytest.mark.core_model, pytest.mark.cpu_model], marks=[pytest.mark.core_model],
), ),
pytest.param( pytest.param(
"openai-community/gpt2", # gpt2 "openai-community/gpt2", # gpt2
...@@ -87,7 +87,11 @@ AITER_MODEL_LIST = [ ...@@ -87,7 +87,11 @@ AITER_MODEL_LIST = [
pytest.param("bigcode/starcoder2-3b"), # starcoder2 pytest.param("bigcode/starcoder2-3b"), # starcoder2
pytest.param( pytest.param(
"TitanML/tiny-mixtral", # mixtral "TitanML/tiny-mixtral", # mixtral
marks=[pytest.mark.core_model, pytest.mark.cpu_model], marks=[pytest.mark.core_model],
),
pytest.param(
"Qwen/Qwen1.5-MoE-A2.7B-Chat",
marks=[pytest.mark.cpu_model],
) )
]) ])
@pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("max_tokens", [32])
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from typing import Optional from typing import Optional
import pytest import pytest
...@@ -29,8 +28,10 @@ def v1(run_with_both_engines): ...@@ -29,8 +28,10 @@ def v1(run_with_both_engines):
# [Decoder-only] # [Decoder-only]
pytest.param("BAAI/bge-multilingual-gemma2", pytest.param("BAAI/bge-multilingual-gemma2",
marks=[pytest.mark.core_model]), marks=[pytest.mark.core_model]),
pytest.param("intfloat/e5-mistral-7b-instruct", pytest.param(
marks=[pytest.mark.core_model, pytest.mark.cpu_model]), "intfloat/e5-mistral-7b-instruct",
# CPU v1 doesn't support sliding window
marks=[pytest.mark.core_model]),
# the qwen models interfere with each other (see PR # the qwen models interfere with each other (see PR
# https://github.com/vllm-project/vllm/pull/18720). # https://github.com/vllm-project/vllm/pull/18720).
# To avoid this problem, for now we skip v0 since it will be # To avoid this problem, for now we skip v0 since it will be
...@@ -38,9 +39,11 @@ def v1(run_with_both_engines): ...@@ -38,9 +39,11 @@ def v1(run_with_both_engines):
pytest.param("ssmits/Qwen2-7B-Instruct-embed-base", pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]), marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]),
# [Encoder-only] # [Encoder-only]
pytest.param("BAAI/bge-base-en-v1.5", pytest.param(
"BAAI/bge-base-en-v1.5",
marks=[ marks=[
pytest.mark.core_model, pytest.mark.cpu_model, # CPU only supports V1
pytest.mark.core_model,
pytest.mark.skip_v1 pytest.mark.skip_v1
]), ]),
pytest.param("sentence-transformers/all-MiniLM-L12-v2", pytest.param("sentence-transformers/all-MiniLM-L12-v2",
...@@ -61,10 +64,6 @@ def test_models( ...@@ -61,10 +64,6 @@ def test_models(
model, model,
monkeypatch, monkeypatch,
) -> None: ) -> None:
if model == "intfloat/e5-mistral-7b-instruct" and current_platform.is_cpu(
) and os.environ.get("VLLM_USE_V1", "0") == "1":
pytest.skip("CPU V1 doesn't support sliding window")
if model == "BAAI/bge-multilingual-gemma2" and current_platform.is_rocm(): if model == "BAAI/bge-multilingual-gemma2" and current_platform.is_rocm():
# ROCm Triton FA does not currently support sliding window attention # ROCm Triton FA does not currently support sliding window attention
# switch to use ROCm CK FA backend # switch to use ROCm CK FA backend
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import pytest import pytest
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
...@@ -84,6 +86,9 @@ def test_prm_models( ...@@ -84,6 +86,9 @@ def test_prm_models(
dtype: str, dtype: str,
monkeypatch, monkeypatch,
) -> None: ) -> None:
if current_platform.is_cpu() and os.environ.get("VLLM_USE_V1", "0") == "0":
pytest.skip("CPU only supports V1")
if current_platform.is_rocm(): if current_platform.is_rocm():
# ROCm Triton FA does not currently support sliding window attention # ROCm Triton FA does not currently support sliding window attention
# switch to use ROCm CK FA backend # switch to use ROCm CK FA backend
......
...@@ -45,6 +45,7 @@ def use_v0_only(monkeypatch): ...@@ -45,6 +45,7 @@ def use_v0_only(monkeypatch):
""" """
This module relies on V0 internals, so set VLLM_USE_V1=0. This module relies on V0 internals, so set VLLM_USE_V1=0.
""" """
if not current_platform.is_cpu():
monkeypatch.setenv('VLLM_USE_V1', '0') monkeypatch.setenv('VLLM_USE_V1', '0')
......
This diff is collapsed.
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import List, Optional, Tuple
try:
import intel_extension_for_pytorch.llm.modules as ipex_modules
_use_ipex = True
# AttributeError is to handle a bug in ipex https://github.com/intel/intel-extension-for-pytorch/pull/813
except (ImportError, AttributeError):
_use_ipex = False
import torch
from vllm import _custom_ops as ops
class _PagedAttention:
@staticmethod
def get_supported_head_sizes() -> List[int]:
return [32, 64, 80, 96, 112, 128, 192, 256]
@staticmethod
def get_kv_cache_shape(
num_blocks: int,
block_size: int,
num_kv_heads: int,
head_size: int,
*args,
) -> Tuple[int, ...]:
return 2, num_blocks, block_size * num_kv_heads * head_size
@staticmethod
def split_kv_cache(
kv_cache: torch.Tensor,
num_kv_heads: int,
head_size: int,
*args,
) -> Tuple[torch.Tensor, torch.Tensor]:
x = 16 // kv_cache.element_size()
num_blocks = kv_cache.shape[1]
key_cache = kv_cache[0]
key_cache = key_cache.view(num_blocks, num_kv_heads, head_size // x,
-1, x)
value_cache = kv_cache[1]
value_cache = value_cache.view(num_blocks, num_kv_heads, head_size, -1)
return key_cache, value_cache
@staticmethod
def write_to_paged_cache(
key: torch.Tensor,
value: torch.Tensor,
key_cache: torch.Tensor,
value_cache: torch.Tensor,
slot_mapping: torch.Tensor,
kv_cache_dtype: str,
k_scale: torch.Tensor,
v_scale: torch.Tensor,
*args,
) -> None:
ops.reshape_and_cache(
key,
value,
key_cache,
value_cache,
slot_mapping.flatten(),
kv_cache_dtype,
k_scale,
v_scale,
)
@staticmethod
def forward_decode(
output: torch.Tensor,
query: torch.Tensor,
key_cache: torch.Tensor,
value_cache: torch.Tensor,
block_tables: torch.Tensor,
context_lens: torch.Tensor,
max_context_len: int,
kv_cache_dtype: str,
num_kv_heads: int,
scale: float,
alibi_slopes: Optional[torch.Tensor],
k_scale: torch.Tensor,
v_scale: torch.Tensor,
*args,
) -> None:
tp_rank: int = 0
blocksparse_local_blocks: int = 0
blocksparse_vert_stride: int = 0
blocksparse_block_size: int = 64
blocksparse_head_sliding_step: int = 0
block_size = value_cache.shape[3]
ops.paged_attention_v1(
output,
query,
key_cache,
value_cache,
num_kv_heads,
scale,
block_tables,
context_lens,
block_size,
max_context_len,
alibi_slopes,
kv_cache_dtype,
k_scale,
v_scale,
tp_rank,
blocksparse_local_blocks,
blocksparse_vert_stride,
blocksparse_block_size,
blocksparse_head_sliding_step,
)
@staticmethod
def copy_blocks(
kv_caches: List[torch.Tensor],
src_to_dists: torch.Tensor,
*args,
) -> None:
key_caches = [kv_cache[0] for kv_cache in kv_caches]
value_caches = [kv_cache[1] for kv_cache in kv_caches]
ops.copy_blocks(key_caches, value_caches, src_to_dists)
class _IPEXPagedAttention(_PagedAttention):
@staticmethod
def split_kv_cache(
kv_cache: torch.Tensor,
num_kv_heads: int,
head_size: int,
*args,
) -> Tuple[torch.Tensor, torch.Tensor]:
num_blocks = kv_cache.shape[1]
key_cache = kv_cache[0]
key_cache = key_cache.view(num_blocks, num_kv_heads, -1, head_size)
value_cache = kv_cache[1]
value_cache = value_cache.view(num_blocks, num_kv_heads, -1, head_size)
return key_cache, value_cache
@staticmethod
def write_to_paged_cache(
key: torch.Tensor,
value: torch.Tensor,
key_cache: torch.Tensor,
value_cache: torch.Tensor,
slot_mapping: torch.Tensor,
kv_cache_dtype: str,
k_scale: torch.Tensor,
v_scale: torch.Tensor,
*args,
) -> None:
ipex_modules.PagedAttention.reshape_and_cache(
key, value, key_cache, value_cache,
slot_mapping.flatten().int())
@staticmethod
def forward_decode(
output: torch.Tensor,
query: torch.Tensor,
key_cache: torch.Tensor,
value_cache: torch.Tensor,
block_tables: torch.Tensor,
context_lens: torch.Tensor,
max_context_len: int,
kv_cache_dtype: str,
num_kv_heads: int,
scale: float,
alibi_slopes: Optional[torch.Tensor],
k_scale: torch.Tensor,
v_scale: torch.Tensor,
*args,
) -> None:
block_size = value_cache.shape[2]
head_mapping = torch.arange(
0,
num_kv_heads,
device="cpu",
dtype=torch.int32,
).view(num_kv_heads,
1).repeat_interleave(query.size(1) // num_kv_heads).flatten()
ipex_modules.PagedAttention.single_query_cached_kv_attention(
output, query.contiguous(), key_cache, value_cache, head_mapping,
scale, block_tables, context_lens, block_size, max_context_len,
alibi_slopes)
PagedAttention = _IPEXPagedAttention if _use_ipex else _PagedAttention
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment