Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
0de4f1dc
Commit
0de4f1dc
authored
May 31, 2024
by
zhuwenwen
Browse files
add int8
parent
b9e12416
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
20 additions
and
20 deletions
+20
-20
CMakeLists.txt
CMakeLists.txt
+1
-1
csrc/ops.h
csrc/ops.h
+2
-2
csrc/pybind.cpp
csrc/pybind.cpp
+2
-2
vllm/_custom_ops.py
vllm/_custom_ops.py
+15
-15
No files found.
CMakeLists.txt
View file @
0de4f1dc
...
@@ -168,7 +168,7 @@ set(VLLM_EXT_SRC
...
@@ -168,7 +168,7 @@ set(VLLM_EXT_SRC
"csrc/layernorm_kernels.cu"
"csrc/layernorm_kernels.cu"
"csrc/quantization/squeezellm/quant_cuda_kernel.cu"
"csrc/quantization/squeezellm/quant_cuda_kernel.cu"
"csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/gptq/q_gemm.cu"
#
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
# "csrc/quantization/fp8/common.cu"
# "csrc/quantization/fp8/common.cu"
"csrc/cuda_utils_kernels.cu"
"csrc/cuda_utils_kernels.cu"
"csrc/moe_align_block_size_kernels.cu"
"csrc/moe_align_block_size_kernels.cu"
...
...
csrc/ops.h
View file @
0de4f1dc
...
@@ -94,8 +94,8 @@ int cutlass_scaled_mm_dq(torch::Tensor& out, torch::Tensor const& a,
...
@@ -94,8 +94,8 @@ int cutlass_scaled_mm_dq(torch::Tensor& out, torch::Tensor const& a,
#endif
#endif
//
void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor& input,
void
static_scaled_int8_quant
(
torch
::
Tensor
&
out
,
torch
::
Tensor
&
input
,
//
float scale);
float
scale
);
void
squeezellm_gemm
(
torch
::
Tensor
vec
,
torch
::
Tensor
mat
,
torch
::
Tensor
mul
,
void
squeezellm_gemm
(
torch
::
Tensor
vec
,
torch
::
Tensor
mat
,
torch
::
Tensor
mul
,
torch
::
Tensor
lookup_table
);
torch
::
Tensor
lookup_table
);
...
...
csrc/pybind.cpp
View file @
0de4f1dc
...
@@ -67,8 +67,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
...
@@ -67,8 +67,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
"Aligning the number of tokens to be processed by each expert such "
"Aligning the number of tokens to be processed by each expert such "
"that it is divisible by the block size."
);
"that it is divisible by the block size."
);
//
ops.def("static_scaled_int8_quant", &static_scaled_int8_quant,
ops
.
def
(
"static_scaled_int8_quant"
,
&
static_scaled_int8_quant
,
//
"Compute int8 quantized tensor for given scaling factor");
"Compute int8 quantized tensor for given scaling factor"
);
// Cache ops
// Cache ops
pybind11
::
module
cache_ops
=
m
.
def_submodule
(
"cache_ops"
,
"vLLM cache ops"
);
pybind11
::
module
cache_ops
=
m
.
def_submodule
(
"cache_ops"
,
"vLLM cache ops"
);
...
...
vllm/_custom_ops.py
View file @
0de4f1dc
...
@@ -264,21 +264,21 @@ def gptq_marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
...
@@ -264,21 +264,21 @@ def gptq_marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
# int8
# int8
#
def static_scaled_int8_quant(input: torch.Tensor,
def
static_scaled_int8_quant
(
input
:
torch
.
Tensor
,
#
scale: float) -> torch.Tensor:
scale
:
float
)
->
torch
.
Tensor
:
#
"""
"""
#
Quantize the input tensor to int8 and return the quantized tensor.
Quantize the input tensor to int8 and return the quantized tensor.
#
Args:
Args:
#
input: The input tensor to be quantized to int8.
input: The input tensor to be quantized to int8.
#
scale: Scaling factor for the int8 quantization.
scale: Scaling factor for the int8 quantization.
#
Returns:
Returns:
#
torch.Tensor: Output tensor in int8.
torch.Tensor: Output tensor in int8.
#
"""
"""
#
q = torch.empty_like(input, dtype=torch.int8)
q
=
torch
.
empty_like
(
input
,
dtype
=
torch
.
int8
)
#
vllm_ops.static_scaled_int8_quant(q, input, scale)
vllm_ops
.
static_scaled_int8_quant
(
q
,
input
,
scale
)
#
return q
return
q
# moe
# moe
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment