"examples/pytorch/TAHIN/data_loader.py" did not exist on "7e58236cc7ebbd227d523e22a672000c90238828"
Unverified Commit 3fff964d authored by pppppM, committed by GitHub

[Feature] Stats Quantization Parameters for KV Cache (#45)

* add cal qparams

* support offload inference

* add collect functions (mod, weight)

* stats kv scales

* update init

* add user guide

* fix hints

* fix comments & support turbomind format

* update user guide

* fix slice kv cache error & support pileval dataset (used in llm-awq)

* fix wrong num heads slice

* update default dataset

* fix conflict

* fix hints

* fix hints

* add gitignore
parent edb6eb86
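For context on what the new lmdeploy.lite utilities compute: the symmetric "absmax" schemes derive only scales, while the asymmetric "minmax" schemes also derive zero points. Below is a minimal sketch of the per-channel absmax case for signed n-bit integers; the helper name is hypothetical and this illustrates the idea, not the repository's exact implementation:

import torch

def per_channel_absmax_scales(w: torch.Tensor, n_bits: int) -> torch.Tensor:
    # Symmetric scheme: map each row's largest absolute value onto the
    # signed integer range; no zero point is needed.
    q_max = 2 ** (n_bits - 1) - 1               # e.g. 127 for 8 bits
    absmax = w.abs().amax(dim=1, keepdim=True)  # shape: (out_channels, 1)
    return absmax / q_max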
@@ -13,7 +13,7 @@ if(NOT pybind11_FOUND)
endif()
pybind11_add_module(${PROJECT_NAME} bind.cpp)
target_link_libraries(${PROJECT_NAME} PRIVATE TransformerTritonBackend
LlamaTritonBackend custom_ar_comm memory_utils)
target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_14)
@@ -368,4 +368,4 @@ PYBIND11_MODULE(_turbomind, m)
.def("__repr__", &AbstractTransformerModel::toString)
.def("get_tensor_para_size", &AbstractTransformerModel::getTensorParaSize)
.def("get_pipeline_para_size", &AbstractTransformerModel::getPipelineParaSize);
-}
\ No newline at end of file
+}
@@ -196,7 +196,7 @@ typedef struct {
* `byte_offset` field should be used to point to the beginning of the data.
*
* Note that as of Nov 2021, multiply libraries (CuPy, PyTorch, TensorFlow,
- * TVM, perhaps others) do not adhere to this 256 byte aligment requirement
+ * TVM, perhaps others) do not adhere to this 256 byte alignment requirement
* on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed
* (after which this note will be updated); at the moment it is recommended
* to not rely on the data pointer being correctly aligned.
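The caveat above is easy to observe from Python. A quick check, assuming PyTorch (torch.Tensor.data_ptr() returns the raw address of the tensor's first element):

import torch

t = torch.randn(8, 8)
# DLPack asks for 256-byte alignment of the data pointer, but as the
# note above says, frameworks commonly export byte_offset=0 tensors
# whose address carries no such guarantee; slices are offset views.
print(t.data_ptr() % 256 == 0)      # may be False even for fresh tensors
print(t[1:].data_ptr() % 256 == 0)  # a row slice is offset by 32 bytes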
@@ -316,4 +316,4 @@ struct DLManagedTensorVersioned {
#ifdef __cplusplus
} // DLPACK_EXTERN_C
#endif
-#endif // DLPACK_DLPACK_H_
\ No newline at end of file
+#endif // DLPACK_DLPACK_H_
-from lmdeploy.turbomind.tokenizer import Tokenizer, Preprocessor, Postprocessor
+from lmdeploy.turbomind.tokenizer import Postprocessor, Preprocessor, Tokenizer
def main():
tokenizer = Tokenizer('huggyllama/llama-7b')
@@ -12,5 +13,6 @@ def main():
decode_prompts = postprocessor(*tokens)
print(decode_prompts)
if __name__ == '__main__':
main()
import torch
from lmdeploy.lite.utils import (cal_qparams_per_channel_absmax,
cal_qparams_per_channel_minmax,
cal_qparams_per_group_absmax,
cal_qparams_per_group_minmax,
cal_qparams_per_tensor_absmax,
cal_qparams_per_tensor_minmax)
def test_cal_qparams():
"""Test function for quantization parameter calculation."""
# Create a dummy tensor
w = torch.randn(64, 64)
# Test per-channel absmax method
qparams = cal_qparams_per_channel_absmax(w, 8)
assert qparams.scales.shape == (64, 1)
assert qparams.zero_points is None
# Test per-channel minmax method
qparams = cal_qparams_per_channel_minmax(w, 8)
assert qparams.scales.shape == (64, 1)
assert qparams.zero_points.shape == (64, 1)
# Test per-group absmax method
qparams = cal_qparams_per_group_absmax(w, 8, 16)
assert qparams.scales.shape == (64, 4, 1)
assert qparams.zero_points is None
# Test per-group minmax method
qparams = cal_qparams_per_group_minmax(w, 8, 16)
assert qparams.scales.shape == (64, 4, 1)
assert qparams.zero_points.shape == (64, 4, 1)
# Test per-tensor absmax method
qparams = cal_qparams_per_tensor_absmax(w, 8)
assert qparams.scales.shape == ()
assert qparams.zero_points is None
# Test per-tensor minmax method
qparams = cal_qparams_per_tensor_minmax(w, 8)
assert qparams.scales.shape == ()
assert qparams.zero_points.shape == ()
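For context, the returned qparams feed the usual fake-quantization round trip. A sketch for the symmetric per-channel case exercised above; the round trip itself is a generic illustration, not code from this commit:

import torch
from lmdeploy.lite.utils import cal_qparams_per_channel_absmax

w = torch.randn(64, 64)
qparams = cal_qparams_per_channel_absmax(w, 8)

# Symmetric 8-bit fake quantization: scale, round, clamp, rescale.
q_max = 2 ** (8 - 1) - 1
w_int = torch.clamp(torch.round(w / qparams.scales), -q_max - 1, q_max)
w_dq = w_int * qparams.scales

# Reconstruction error is bounded by half a scale step per element.
print((w - w_dq).abs().max())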