[CI/BUILD] enable intel queue for longer CPU tests (#4113)

cafb8e06 · Yuan · GitHub · cbb2f59c · cafb8e06 · cafb8e06
Unverified Commit cafb8e06 authored Jun 04, 2024 by Yuan Committed by GitHub Jun 03, 2024
11 changed files
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -10,5 +10,15 @@ remove_docker_container() { docker rm -f cpu-test || true; }
 trap remove_docker_container EXIT
 remove_docker_container
-# Run the image and launch offline inference
+# Run the image
-docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 vllm/examples/offline_inference.py
+docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test
+# Run offline inference
+docker exec cpu-test bash -c "python3 examples/offline_inference.py"
+# Run basic model test
+docker exec cpu-test bash -c "cd tests;
+  pip install pytest Pillow protobuf
+  bash ../.buildkite/download-images.sh
+  cd ../
+  pytest -v -s tests/models --ignore=tests/models/test_llava.py --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py"
--- a/.buildkite/test-template.j2
+++ b/.buildkite/test-template.j2
@@ -40,6 +40,8 @@ steps:
  - label: "Intel Test"
    depends_on: ~
+    agents:
+      queue: intel
    command: bash .buildkite/run-cpu-test.sh
  {% for step in steps %}

--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
 # This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
-FROM ubuntu:22.04
+FROM ubuntu:22.04 AS cpu-test-1
 RUN apt-get update  -y \
    && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip \
@@ -9,6 +9,8 @@ RUN apt-get update  -y \
 RUN pip install --upgrade pip \
    && pip install wheel packaging ninja setuptools>=49.4.0 numpy
+FROM cpu-test-1 AS build
 COPY ./ /workspace/vllm
 WORKDIR /workspace/vllm
@@ -19,4 +21,6 @@ RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
 WORKDIR /workspace/
+RUN ln -s /workspace/vllm/tests  && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
 CMD ["/bin/bash"]
--- a/csrc/cpu/pos_encoding.cpp
+++ b/csrc/cpu/pos_encoding.cpp
@@ -21,18 +21,13 @@ void rotary_embedding_impl(
  constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num();
  const int embed_dim = rot_dim / 2;
-  TORCH_CHECK(embed_dim % VEC_ELEM_NUM == 0);
+  bool flag = (embed_dim % VEC_ELEM_NUM == 0);
+  const int loop_upper = flag ? embed_dim : embed_dim - VEC_ELEM_NUM;
-#pragma omp parallel for
+  auto compute_loop = [&](const int64_t token_head, const scalar_t* cache_ptr,
-  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
+                          scalar_t* qk) {
-    int64_t pos = positions[token_idx];
+    int j = 0;
-    const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim;
+    for (; j < loop_upper; j += VEC_ELEM_NUM) {
-    for (int i = 0; i < num_heads; ++i) {
-      const int head_idx = i;
-      const int64_t token_head =
-          token_idx * query_stride + head_idx * head_size;
-      for (int j = 0; j < embed_dim; j += VEC_ELEM_NUM) {
      const int rot_offset = j;
      const int x_index = rot_offset;
      const int y_index = embed_dim + rot_offset;
@@ -43,8 +38,8 @@ void rotary_embedding_impl(
      const scalar_vec_t cos(cache_ptr + x_index);
      const scalar_vec_t sin(cache_ptr + y_index);
-        const scalar_vec_t q_x(query + out_x);
+      const scalar_vec_t q_x(qk + out_x);
-        const scalar_vec_t q_y(query + out_y);
+      const scalar_vec_t q_y(qk + out_y);
      vec_op::FP32Vec8 fp32_cos(cos);
      vec_op::FP32Vec8 fp32_sin(sin);
@@ -53,41 +48,47 @@ void rotary_embedding_impl(
      vec_op::FP32Vec8 fp32_q_y(q_y);
      auto out1 = fp32_q_x * fp32_cos - fp32_q_y * fp32_sin;
-        scalar_vec_t(out1).save(query + out_x);
+      scalar_vec_t(out1).save(qk + out_x);
      auto out2 = fp32_q_y * fp32_cos + fp32_q_x * fp32_sin;
-        scalar_vec_t(out2).save(query + out_y);
+      scalar_vec_t(out2).save(qk + out_y);
-      }
    }
+    if (!flag) {
-    for (int i = 0; i < num_kv_heads; ++i) {
+      for (; j < embed_dim; ++j) {
-      const int head_idx = i;
+        const int x_index = j;
-      const int64_t token_head = token_idx * key_stride + head_idx * head_size;
+        const int y_index = embed_dim + j;
-      for (int j = 0; j < embed_dim; j += VEC_ELEM_NUM) {
-        const int rot_offset = j;
-        const int x_index = rot_offset;
-        const int y_index = embed_dim + rot_offset;
        const int64_t out_x = token_head + x_index;
        const int64_t out_y = token_head + y_index;
-        const scalar_vec_t cos(cache_ptr + x_index);
+        const float fp32_cos = cache_ptr[x_index];
-        const scalar_vec_t sin(cache_ptr + y_index);
+        const float fp32_sin = cache_ptr[y_index];
-        const scalar_vec_t k_x(key + out_x);
+        const float fp32_q_x = qk[out_x];
-        const scalar_vec_t k_y(key + out_y);
+        const float fp32_q_y = qk[out_y];
-        vec_op::FP32Vec8 fp32_cos(cos);
+        qk[out_x] = fp32_q_x * fp32_cos - fp32_q_y * fp32_sin;
-        vec_op::FP32Vec8 fp32_sin(sin);
+        qk[out_y] = fp32_q_y * fp32_cos + fp32_q_x * fp32_sin;
+      }
+    }
+  };
-        vec_op::FP32Vec8 fp32_k_x(k_x);
+#pragma omp parallel for
-        vec_op::FP32Vec8 fp32_k_y(k_y);
+  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
+    int64_t pos = positions[token_idx];
+    const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim;
-        auto out1 = fp32_k_x * fp32_cos - fp32_k_y * fp32_sin;
+    for (int i = 0; i < num_heads; ++i) {
-        scalar_vec_t(out1).save(key + out_x);
+      const int head_idx = i;
-        auto out2 = fp32_k_y * fp32_cos + fp32_k_x * fp32_sin;
+      const int64_t token_head =
-        scalar_vec_t(out2).save(key + out_y);
+          token_idx * query_stride + head_idx * head_size;
+      compute_loop(token_head, cache_ptr, query);
    }
+    for (int i = 0; i < num_kv_heads; ++i) {
+      const int head_idx = i;
+      const int64_t token_head = token_idx * key_stride + head_idx * head_size;
+      compute_loop(token_head, cache_ptr, key);
    }
  }
 }

--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -18,6 +18,7 @@ from vllm.logger import init_logger
 from vllm.multimodal import MultiModalData
 from vllm.multimodal.image import ImageFeatureData, ImagePixelData
 from vllm.sequence import SampleLogprobs
+from vllm.utils import is_cpu
 logger = init_logger(__name__)
@@ -58,6 +59,7 @@ def cleanup():
    with contextlib.suppress(AssertionError):
        torch.distributed.destroy_process_group()
    gc.collect()
+    if not is_cpu():
        torch.cuda.empty_cache()
@@ -151,6 +153,12 @@ _EMBEDDING_MODELS = [
 class HfRunner:
+    def wrap_device(self, input: any):
+        if not is_cpu():
+            return input.to("cuda")
+        else:
+            return input.to("cpu")
    def __init__(
        self,
        model_name: str,
@@ -164,16 +172,18 @@ class HfRunner:
        if model_name in _EMBEDDING_MODELS:
            # Lazy init required for AMD CI
            from sentence_transformers import SentenceTransformer
-            self.model = SentenceTransformer(
+            self.model = self.wrap_device(
+                SentenceTransformer(
                    model_name,
                    device="cpu",
-            ).to(dtype=torch_dtype).cuda()
+                ).to(dtype=torch_dtype))
        else:
-            self.model = AutoModelForCausalLM.from_pretrained(
+            self.model = self.wrap_device(
+                AutoModelForCausalLM.from_pretrained(
                    model_name,
                    torch_dtype=torch_dtype,
                    trust_remote_code=True,
-            ).cuda()
+                ))
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
@@ -214,7 +224,7 @@ class HfRunner:
            inputs = self.processor(**processor_kwargs)
            output_ids = self.model.generate(
-                **inputs.to("cuda"),
+                **self.wrap_device(inputs),
                use_cache=True,
                **kwargs,
            )
@@ -271,7 +281,7 @@ class HfRunner:
        for prompt in prompts:
            input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
            output = self.model.generate(
-                input_ids.cuda(),
+                self.wrap_device(input_ids),
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
@@ -306,7 +316,7 @@ class HfRunner:
        for prompt in prompts:
            input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
            output = self.model.generate(
-                input_ids.cuda(),
+                self.wrap_device(input_ids),
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,

--- a/tests/models/test_aqlm.py
+++ b/tests/models/test_aqlm.py
@@ -8,9 +8,12 @@ import torch
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
-capability = torch.cuda.get_device_capability()
+aqlm_not_supported = True
-capability = capability[0] * 10 + capability[1]
-aqlm_not_supported = (capability <
+if torch.cuda.is_available():
+    capability = torch.cuda.get_device_capability()
+    capability = capability[0] * 10 + capability[1]
+    aqlm_not_supported = (capability <
                          QUANTIZATION_METHODS["aqlm"].get_min_capability())
 # In this test we hardcode prompts and generations for the model so we don't

--- a/tests/models/test_big_models.py
+++ b/tests/models/test_big_models.py
@@ -5,6 +5,7 @@ This tests bigger models and use half precision.
 Run `pytest tests/models/test_big_models.py`.
 """
 import pytest
+import torch
 MODELS = [
    "meta-llama/Llama-2-7b-hf",
@@ -16,9 +17,14 @@ MODELS = [
    # "Qwen/Qwen1.5-0.5B"  # Broken,
 ]
+# TODO: remove this once CPU float16 support is ready
+target_dtype = "float"
+if torch.cuda.is_available():
+    target_dtype = "half"
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("dtype", [target_dtype])
 @pytest.mark.parametrize("max_tokens", [32])
 def test_models(
    hf_runner,
@@ -46,7 +52,7 @@ def test_models(
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("dtype", [target_dtype])
 def test_model_print(
    vllm_runner,
    model: str,

--- a/tests/models/test_fp8.py
+++ b/tests/models/test_fp8.py
@@ -67,9 +67,12 @@ EXPECTED_STRS_MAP = {
    },
 }
-capability = torch.cuda.get_device_capability()
+fp8_not_supported = True
-capability = capability[0] * 10 + capability[1]
-fp8_not_supported = (capability <
+if torch.cuda.is_available():
+    capability = torch.cuda.get_device_capability()
+    capability = capability[0] * 10 + capability[1]
+    fp8_not_supported = (capability <
                         QUANTIZATION_METHODS["fp8"].get_min_capability())

--- a/tests/models/test_gptq_marlin.py
+++ b/tests/models/test_gptq_marlin.py
@@ -22,9 +22,12 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
 MAX_MODEL_LEN = 1024
-capability = torch.cuda.get_device_capability()
+gptq_marlin_not_supported = True
-capability = capability[0] * 10 + capability[1]
-gptq_marlin_not_supported = (
+if torch.cuda.is_available():
+    capability = torch.cuda.get_device_capability()
+    capability = capability[0] * 10 + capability[1]
+    gptq_marlin_not_supported = (
        capability < QUANTIZATION_METHODS["gptq_marlin"].get_min_capability())
 MODELS = [

--- a/tests/models/test_gptq_marlin_24.py
+++ b/tests/models/test_gptq_marlin_24.py
@@ -14,10 +14,13 @@ import torch
 from tests.models.utils import check_logprobs_close
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
-capability = torch.cuda.get_device_capability()
+marlin_not_supported = True
-capability = capability[0] * 10 + capability[1]
-marlin_not_supported = (capability <
+if torch.cuda.is_available():
-                        QUANTIZATION_METHODS["marlin"].get_min_capability())
+    capability = torch.cuda.get_device_capability()
+    capability = capability[0] * 10 + capability[1]
+    marlin_not_supported = (
+        capability < QUANTIZATION_METHODS["marlin"].get_min_capability())
 @dataclass

--- a/tests/models/test_marlin.py
+++ b/tests/models/test_marlin.py
@@ -19,10 +19,13 @@ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from .utils import check_logprobs_close
-capability = torch.cuda.get_device_capability()
+marlin_not_supported = True
-capability = capability[0] * 10 + capability[1]
-marlin_not_supported = (capability <
+if torch.cuda.is_available():
-                        QUANTIZATION_METHODS["marlin"].get_min_capability())
+    capability = torch.cuda.get_device_capability()
+    capability = capability[0] * 10 + capability[1]
+    marlin_not_supported = (
+        capability < QUANTIZATION_METHODS["marlin"].get_min_capability())
 @dataclass