"examples/pytorch/TAHIN/data_loader.py" did not exist on "7e58236cc7ebbd227d523e22a672000c90238828"
Unverified Commit 3fff964d authored by pppppM, committed by GitHub

[Feature] Stats Quantization Parameters for KV Cache (#45)

* add cal qparams

* support offload inference

* add collect functions (mod, weight)

* stats kv scales

* update init

* add user guide

* fix hints

* fix comments & support turbomind format

* update user guide

* fix slice kv cache error & support pileval dataset (used in llm-awq)

* fix wrong num heads slice

* update default dataset

* fix conflict

* fix hints

* fix hints

* add gitignore
parent edb6eb86
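For context on what the new lmdeploy.lite utilities compute: the symmetric "absmax" schemes derive only scales, while the asymmetric "minmax" schemes also derive zero points. Below is a minimal sketch of the per-channel absmax case for signed n-bit integers; the helper name is hypothetical and this illustrates the idea, not the repository's exact implementation:

import torch

def per_channel_absmax_scales(w: torch.Tensor, n_bits: int) -> torch.Tensor:
    # Symmetric scheme: map each row's largest absolute value onto the
    # signed integer range; no zero point is needed.
    q_max = 2 ** (n_bits - 1) - 1               # e.g. 127 for 8 bits
    absmax = w.abs().amax(dim=1, keepdim=True)  # shape: (out_channels, 1)
    return absmax / q_max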
@@ -13,7 +13,7 @@ if(NOT pybind11_FOUND)
endif()
pybind11_add_module(${PROJECT_NAME} bind.cpp)
target_link_libraries(${PROJECT_NAME} PRIVATE TransformerTritonBackend
LlamaTritonBackend custom_ar_comm memory_utils)
target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_14)
@@ -368,4 +368,4 @@ PYBIND11_MODULE(_turbomind, m)
.def("__repr__", &AbstractTransformerModel::toString)
.def("get_tensor_para_size", &AbstractTransformerModel::getTensorParaSize)
.def("get_pipeline_para_size", &AbstractTransformerModel::getPipelineParaSize);
-}
\ No newline at end of file
+}
@@ -196,7 +196,7 @@ typedef struct {
* `byte_offset` field should be used to point to the beginning of the data.
*
* Note that as of Nov 2021, multiply libraries (CuPy, PyTorch, TensorFlow,
- * TVM, perhaps others) do not adhere to this 256 byte aligment requirement
+ * TVM, perhaps others) do not adhere to this 256 byte alignment requirement
* on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed
* (after which this note will be updated); at the moment it is recommended
* to not rely on the data pointer being correctly aligned.
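The caveat above is easy to observe from Python. A quick check, assuming PyTorch (torch.Tensor.data_ptr() returns the raw address of the tensor's first element):

import torch

t = torch.randn(8, 8)
# DLPack asks for 256-byte alignment of the data pointer, but as the
# note above says, frameworks commonly export byte_offset=0 tensors
# whose address carries no such guarantee; slices are offset views.
print(t.data_ptr() % 256 == 0)      # may be False even for fresh tensors
print(t[1:].data_ptr() % 256 == 0)  # a row slice is offset by 32 bytes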
@@ -316,4 +316,4 @@ struct DLManagedTensorVersioned {
#ifdef __cplusplus
} // DLPACK_EXTERN_C
#endif
-#endif // DLPACK_DLPACK_H_
\ No newline at end of file
+#endif // DLPACK_DLPACK_H_
-from lmdeploy.turbomind.tokenizer import Tokenizer, Preprocessor, Postprocessor
+from lmdeploy.turbomind.tokenizer import Postprocessor, Preprocessor, Tokenizer
def main():
tokenizer = Tokenizer('huggyllama/llama-7b')
@@ -12,5 +13,6 @@ def main():
decode_prompts = postprocessor(*tokens)
print(decode_prompts)
if __name__ == '__main__':
main()
import torch
from lmdeploy.lite.utils import (cal_qparams_per_channel_absmax,
cal_qparams_per_channel_minmax,
cal_qparams_per_group_absmax,
cal_qparams_per_group_minmax,
cal_qparams_per_tensor_absmax,
cal_qparams_per_tensor_minmax)
def test_cal_qparams():
"""Test function for quantization parameter calculation."""
# Create a dummy tensor
w = torch.randn(64, 64)
# Test per-channel absmax method
qparams = cal_qparams_per_channel_absmax(w, 8)
assert qparams.scales.shape == (64, 1)
assert qparams.zero_points is None
# Test per-channel minmax method
qparams = cal_qparams_per_channel_minmax(w, 8)
assert qparams.scales.shape == (64, 1)
assert qparams.zero_points.shape == (64, 1)
# Test per-group absmax method
qparams = cal_qparams_per_group_absmax(w, 8, 16)
assert qparams.scales.shape == (64, 4, 1)
assert qparams.zero_points is None
# Test per-group minmax method
qparams = cal_qparams_per_group_minmax(w, 8, 16)
assert qparams.scales.shape == (64, 4, 1)
assert qparams.zero_points.shape == (64, 4, 1)
# Test per-tensor absmax method
qparams = cal_qparams_per_tensor_absmax(w, 8)
assert qparams.scales.shape == ()
assert qparams.zero_points is None
# Test per-tensor minmax method
qparams = cal_qparams_per_tensor_minmax(w, 8)
assert qparams.scales.shape == ()
assert qparams.zero_points.shape == ()
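For context, the returned qparams feed the usual fake-quantization round trip. A sketch for the symmetric per-channel case exercised above; the round trip itself is a generic illustration, not code from this commit:

import torch
from lmdeploy.lite.utils import cal_qparams_per_channel_absmax

w = torch.randn(64, 64)
qparams = cal_qparams_per_channel_absmax(w, 8)

# Symmetric 8-bit fake quantization: scale, round, clamp, rescale.
q_max = 2 ** (8 - 1) - 1
w_int = torch.clamp(torch.round(w / qparams.scales), -q_max - 1, q_max)
w_dq = w_int * qparams.scales

# Reconstruction error is bounded by half a scale step per element.
print((w - w_dq).abs().max())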