Commit e00b0a19 authored by zhuwenwen

merge v0.3.3

parents ead94d93 3f1166ab
/*
Copied from https://github.com/turboderp/exllamav2
*/
#ifndef _qdq_2_cuh
#define _qdq_2_cuh
#include "qdq_util.cuh"
namespace vllm {
namespace gptq {
// Permutation:
//
// ffddbb99 77553311 eeccaa88 66442200
__forceinline__ __device__ void shuffle_2bit_16
(
    uint32_t* q,
    int stride
)
{
    uint32_t qa = q[0];
    uint32_t qb = 0;

    #pragma unroll
    for (int i = 0; i < 8; i++)
    {
        uint32_t qa0 = qa & 0x03;
        uint32_t qa1 = (qa & 0x0c) >> 2;
        qa >>= 4;
        qb |= (qa1 << (i * 2 + 16));
        qb |= (qa0 << (i * 2));
    }
    q[0] = qb;
}
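// After the shuffle, the eight even-indexed values sit in the low 16 bits of
// the word and the eight odd-indexed values in the high 16 bits (see the
// permutation map above). dequant_2bit_16 below can then extract a pair
// (q[2i], q[2i+1]) into the two 16-bit lanes of a half2 with a single mask,
// e.g. qa & 0x00030003 yields q[0] in the low lane and q[1] in the high lane.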
__forceinline__ __device__ void dequant_2bit_16
(
    const uint32_t q_0,
    half2 (&dq)[8],
    int stride,
    const uint32_t zero
)
{
    const uint32_t c0 = 0x64006400;
    const half y4_  = __float2half_rn(1.0f /  4.0f);
    const half y16_ = __float2half_rn(1.0f / 16.0f);
    const half y64_ = __float2half_rn(1.0f / 64.0f);
    const half2 y4  = __halves2half2(y4_,  y4_);
    const half2 y16 = __halves2half2(y16_, y16_);
    const half2 y64 = __halves2half2(y64_, y64_);
    const half_uint16 z1_(0xe400 | zero); // half(-1024.0f - zero);
    const half z4_  = __hsub(__int2half_rn(-256), __int2half_rn(zero));
    const half z16_ = __hsub(__int2half_rn( -64), __int2half_rn(zero));
    const half z64_ = __hsub(__int2half_rn( -16), __int2half_rn(zero));
    const half2 z1  = __half2half2(z1_.as_half);
    const half2 z4  = __half2half2(z4_);
    const half2 z16 = __half2half2(z16_);
    const half2 z64 = __half2half2(z64_);

    uint32_t qa = q_0;
    half2_uint32 q0((qa & 0x00030003) | c0); // half2(q[ 0], q[ 1])      + 1024
    half2_uint32 q1((qa & 0x000c000c) | c0); // half2(q[ 2], q[ 3]) *  4 + 1024
    half2_uint32 q2((qa & 0x00300030) | c0); // half2(q[ 4], q[ 5]) * 16 + 1024
    half2_uint32 q3((qa & 0x00c000c0) | c0); // half2(q[ 6], q[ 7]) * 64 + 1024
    qa >>= 8;
    half2_uint32 q4((qa & 0x00030003) | c0); // half2(q[ 8], q[ 9])      + 1024
    half2_uint32 q5((qa & 0x000c000c) | c0); // half2(q[10], q[11]) *  4 + 1024
    half2_uint32 q6((qa & 0x00300030) | c0); // half2(q[12], q[13]) * 16 + 1024
    half2_uint32 q7((qa & 0x00c000c0) | c0); // half2(q[14], q[15]) * 64 + 1024

    dq[0] = __hadd2(q0.as_half2, z1);       // half2(q[ 0], q[ 1]) - zero
    dq[1] = __hfma2(q1.as_half2, y4,  z4);  // half2(q[ 2], q[ 3]) - zero
    dq[2] = __hfma2(q2.as_half2, y16, z16); // half2(q[ 4], q[ 5]) - zero
    dq[3] = __hfma2(q3.as_half2, y64, z64); // half2(q[ 6], q[ 7]) - zero
    dq[4] = __hadd2(q4.as_half2, z1);
    dq[5] = __hfma2(q5.as_half2, y4,  z4);
    dq[6] = __hfma2(q6.as_half2, y16, z16);
    dq[7] = __hfma2(q7.as_half2, y64, z64);
}
} // namespace gptq
} // namespace vllm
#endif
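The dequantizer above never converts integers to floats: it splices each 2-bit weight into the mantissa of the fp16 constant 0x6400 (= 1024.0), so the resulting bit pattern reads directly as half(1024 + q), and subtracting the precomputed (1024 + zero) recovers q - zero. Below is a minimal host-side C++ sketch of that trick, not part of the vLLM sources; half_bits_to_float is a helper written here for illustration, and zero = 2 is an arbitrary example zero point.

#include <cmath>
#include <cstdint>
#include <cstdio>

// Decode an IEEE-754 binary16 bit pattern to float. Normal values only,
// which is all the 0x6400-bias trick ever produces.
static float half_bits_to_float(uint16_t h) {
    float sign = (h & 0x8000) ? -1.0f : 1.0f;
    int   exp  = (h >> 10) & 0x1f;
    float man  = 1.0f + (float)(h & 0x3ff) / 1024.0f;
    return sign * std::ldexp(man, exp - 15);
}

int main() {
    const int zero = 2;  // example zero point in [0, 3]
    for (unsigned q = 0; q < 4; q++) {
        // OR the 2-bit weight into the mantissa of 0x6400 (= half(1024.0)).
        uint16_t bits = (uint16_t)(0x6400 | q);
        // The kernel's z1 is -(1024 + zero), so the add yields q - zero.
        float dq = half_bits_to_float(bits) - (1024.0f + (float)zero);
        std::printf("q=%u -> dq=%.0f (expect %d)\n", q, dq, (int)q - zero);
    }
    return 0;
}

Pairs at higher bit positions carry an extra power-of-four factor (the value at bits 2-3 reads as half(1024 + 4q)), which is why the kernel folds the correction into a single __hfma2 per pair: (1024 + 4q) * 1/4 - (256 + zero) = q - zero.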
...@@ -72,7 +72,7 @@ html_theme_options = {

 # Mock out external dependencies here.
 autodoc_mock_imports = [
-    "torch", "transformers", "psutil", "aioprometheus", "sentencepiece",
+    "torch", "transformers", "psutil", "prometheus_client", "sentencepiece",
     "vllm.cuda_utils", "vllm._C"
 ]
...@@ -94,3 +94,5 @@ class MockedClassDocumenter(autodoc.ClassDocumenter):

 autodoc.ClassDocumenter = MockedClassDocumenter
+
+navigation_with_keys = False
...@@ -67,3 +67,13 @@ You can also build and install vLLM from source:

 $ # Use `--ipc=host` to make sure the shared memory is large enough.
 $ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3
+
+.. note::
+    If you are developing the C++ backend of vLLM, consider building vLLM with
+
+    .. code-block:: console
+
+        $ python setup.py develop
+
+    since it will give you incremental builds. The downside is that this method
+    is `deprecated by setuptools <https://github.com/pypa/setuptools/issues/917>`_.
.. _deploying_with_kserve:

Deploying with KServe
============================

vLLM can be deployed with `KServe <https://github.com/kserve/kserve>`_ on Kubernetes for highly scalable distributed model serving.

Please see `this guide <https://kserve.github.io/website/latest/modelserving/v1beta1/llm/vllm/>`_ for more details on using vLLM with KServe.