xdb4_94051 / vllm · Commits
Commit eb8e460c authored Sep 13, 2024 by nicodafagood

update mygq

parent 23fdbb68
Showing 17 changed files with 156 additions and 156 deletions (+156 / -156)
benchmarks/benchmark_latency.py                        +1 / -1
benchmarks/benchmark_throughput.py                     +1 / -1
csrc/ops.h                                             +4 / -4
csrc/pybind.cpp                                        +2 / -2
csrc/quantization/mygq/compat.cuh                      +2 / -2
csrc/quantization/mygq/matrix_view.cuh                 +2 / -2
csrc/quantization/mygq/q_gemm.cu                       +125 / -125
csrc/quantization/mygq/qdq_2.cuh                       +2 / -2
csrc/quantization/mygq/qdq_3.cuh                       +2 / -2
csrc/quantization/mygq/qdq_4.cuh                       +3 / -3
csrc/quantization/mygq/qdq_8.cuh                       +2 / -2
csrc/quantization/mygq/qdq_util.cuh                    +2 / -2
setup.py                                               +1 / -1
vllm/config.py                                         +1 / -1
vllm/engine/arg_utils.py                               +1 / -1
vllm/model_executor/layers/quantization/__init__.py    +2 / -2
vllm/model_executor/layers/quantization/mygq.py        +3 / -3
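Taken together, this commit is a mechanical rename of the quantization backend from "myq" to "mygq" across the CUDA kernels, the C++/Python bindings, and the engine configuration. A minimal sketch of how the renamed method would be selected after this commit, through vLLM's offline LLM API; the model path is hypothetical and the checkpoint must already be quantized with this backend:

# Minimal sketch, assuming a hypothetical mygq-quantized checkpoint.
# "mygq" must match the key registered in _QUANTIZATION_CONFIG_REGISTRY
# after this commit (it was "myq" before).
from vllm import LLM, SamplingParams

llm = LLM(model="/models/llama-7b-mygq",  # hypothetical local checkpoint
          quantization="mygq")
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)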
benchmarks/benchmark_latency.py
@@ -92,7 +92,7 @@ if __name__ == '__main__':
     parser.add_argument('--tokenizer', type=str, default=None)
     parser.add_argument('--quantization',
                         '-q',
-                        choices=['awq', 'gptq', 'myq', 'squeezellm', None],
+                        choices=['awq', 'gptq', 'mygq', 'squeezellm', None],
                         default=None)
     parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
     parser.add_argument('--input-len', type=int, default=32)
benchmarks/benchmark_throughput.py
@@ -258,7 +258,7 @@ if __name__ == "__main__":
     parser.add_argument("--tokenizer", type=str, default=None)
     parser.add_argument('--quantization',
                         '-q',
-                        choices=['awq', 'gptq', 'myq', 'squeezellm', None],
+                        choices=['awq', 'gptq', 'mygq', 'squeezellm', None],
                         default=None)
     parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
     parser.add_argument("--n",
csrc/ops.h
@@ -115,16 +115,16 @@ void gptq_shuffle(
   torch::Tensor q_perm,
   int bit);

-torch::Tensor myq_gemm(
+torch::Tensor mygq_gemm(
   torch::Tensor a,
   torch::Tensor b_q_weight,
-  torch::Tensor b_myq_qzeros,
-  torch::Tensor b_myq_scales,
+  torch::Tensor b_mygq_qzeros,
+  torch::Tensor b_mygq_scales,
   torch::Tensor b_g_idx,
   bool use_exllama,
   int bit);

-void myq_shuffle(
+void mygq_shuffle(
   torch::Tensor q_weight,
   torch::Tensor q_perm,
   int bit);
csrc/pybind.cpp
@@ -61,8 +61,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   ops.def("gptq_gemm", &gptq_gemm, "Quantized GEMM for GPTQ");
   ops.def("gptq_shuffle", &gptq_shuffle, "Post processing for GPTQ");
-  ops.def("myq_gemm", &myq_gemm, "Quantized GEMM for myq");
-  ops.def("myq_shuffle", &myq_shuffle, "Post processing for GPTQ");
+  ops.def("mygq_gemm", &mygq_gemm, "Quantized GEMM for mygq");
+  ops.def("mygq_shuffle", &mygq_shuffle, "Post processing for mygq");
   ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM");
   ops.def(
     "moe_align_block_size",
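Because pybind11 registers the ops by string name, any Python caller still using the old names fails at attribute lookup rather than at build time. A hedged sanity check after rebuilding the extension (the vllm._C module path is an assumption based on vLLM's layout around this commit):

# Hedged sketch: confirm the renamed bindings after rebuilding.
# The vllm._C module path is an assumption about this fork's layout.
from vllm._C import ops  # compiled from csrc/pybind.cpp

assert hasattr(ops, "mygq_gemm") and hasattr(ops, "mygq_shuffle")
assert not hasattr(ops, "myq_gemm")  # old binding names are gone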
csrc/quantization/myq/compat.cuh → csrc/quantization/mygq/compat.cuh
@@ -6,7 +6,7 @@ Copied from https://github.com/turboderp/exllamav2
 #define _compat_cuh

 namespace vllm {
-namespace myq {
+namespace mygq {

 // atomicAdd for half types, to support CC < 7.x
 __device__ __forceinline__ void atomicAdd_half(half* address, half val)
@@ -59,6 +59,6 @@ __device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd
 #endif
 #endif

-}  // namespace myq
+}  // namespace mygq
 }  // namespace vllm

 #endif
csrc/quantization/myq/matrix_view.cuh → csrc/quantization/mygq/matrix_view.cuh
@@ -11,7 +11,7 @@ Adapted from https://github.com/turboderp/exllamav2 and https://github.com/turbo
 #include "qdq_util.cuh"

 namespace vllm {
-namespace myq {
+namespace mygq {

 class MatrixView_half
 {
@@ -269,6 +269,6 @@ public:
   }
 };

-}  // namespace myq
+}  // namespace mygq
 }  // namespace vllm

 #endif
csrc/quantization/myq/q_gemm.cu → csrc/quantization/mygq/q_gemm.cu
(This diff is collapsed: +125 / -125.)
csrc/quantization/myq/qdq_2.cuh → csrc/quantization/mygq/qdq_2.cuh
@@ -8,7 +8,7 @@ Copied from https://github.com/turboderp/exllamav2
 #include "qdq_util.cuh"

 namespace vllm {
-namespace myq {
+namespace mygq {

 // Permutation:
 //
@@ -81,7 +81,7 @@ __forceinline__ __device__ void dequant_2bit_16
   dq[7] = __hfma2(q7.as_half2, y64, z64);
 }

-}  // namespace myq
+}  // namespace mygq
 }  // namespace vllm

 #endif
csrc/quantization/myq/qdq_3.cuh → csrc/quantization/mygq/qdq_3.cuh
@@ -4,7 +4,7 @@
 #include "qdq_util.cuh"

 namespace vllm {
-namespace myq {
+namespace mygq {

 // Permutation:
 //
 // v9997775 55333111 u8886664 44222000  (u, v lsb)
@@ -135,7 +135,7 @@ __forceinline__ __device__ void dequant_3bit_32
   dq[15] = __hadd2(q15.as_half2, z1);
 }

-}  // namespace myq
+}  // namespace mygq
 }  // namespace vllm

 #endif
csrc/quantization/myq/qdq_4.cuh → csrc/quantization/mygq/qdq_4.cuh
@@ -8,7 +8,7 @@ Copied from https://github.com/turboderp/exllamav2
 #include "qdq_util.cuh"

 namespace vllm {
-namespace myq {
+namespace mygq {

 // Permutation:
 //
 // 77775555 33331111 66664444 22220000
@@ -107,7 +107,7 @@ __forceinline__ __device__ void dequant_4bit_8_prep_zero
 }

-__forceinline__ __device__ void dequant_4bit_8_myq
+__forceinline__ __device__ void dequant_4bit_8_mygq
 (
   const uint32_t q_0,
   half2 (&dq)[4],
@@ -141,7 +141,7 @@ __forceinline__ __device__ void dequant_4bit_8_myq
   dq[3] = __hfma2(q3.as_half2, y1y16[1], z1z16[1]); // half2( q[6] - z, q[7] - z )
 }
 }

-}  // namespace myq
+}  // namespace mygq
 }  // namespace vllm

 #endif
csrc/quantization/myq/qdq_8.cuh → csrc/quantization/mygq/qdq_8.cuh
@@ -8,7 +8,7 @@ Copied from https://github.com/turboderp/exllamav2
 #include "qdq_util.cuh"

 namespace vllm {
-namespace myq {
+namespace mygq {

 __forceinline__ __device__ void shuffle_8bit_4
 (
@@ -34,7 +34,7 @@ __forceinline__ __device__ void dequant_8bit_8
   for (int i = 0; i < 4; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]);
 }

-}  // namespace myq
+}  // namespace mygq
 }  // namespace vllm

 #endif
csrc/quantization/myq/qdq_util.cuh → csrc/quantization/mygq/qdq_util.cuh
@@ -6,7 +6,7 @@ Copied from https://github.com/turboderp/exllamav2
 #define _qdq_util_cuh

 namespace vllm {
-namespace myq {
+namespace mygq {

 union half2_uint32
 {
@@ -55,6 +55,6 @@ __forceinline__ __device__ int exb(const uint32_t q1, const uint32_t q0, const i
   return (int)(__funnelshift_rc(q0, q1, shift) & mask);
 }

-}  // namespace myq
+}  // namespace mygq
 }  // namespace vllm

 #endif
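The exb() helper visible in the last hunk extracts a bit field that may straddle two packed 32-bit words. A hedged Python model of what it computes, assuming CUDA's documented semantics for __funnelshift_rc (right funnel shift of the concatenated 64-bit value hi:lo, shift amount clamped to 32):

# Hedged Python model of exb() above; funnelshift_rc mimics CUDA's
# __funnelshift_rc(lo, hi, shift) per the intrinsic's documentation.
def funnelshift_rc(lo: int, hi: int, shift: int) -> int:
    s = min(shift, 32)  # "c" = shift amount clamped to [0, 32]
    return (((hi << 32) | lo) >> s) & 0xFFFFFFFF

def exb(q1: int, q0: int, shift: int, mask: int) -> int:
    # Extract a field starting at `shift` in q0 (lower word) that may
    # spill into q1 (upper word).
    return funnelshift_rc(q0, q1, shift) & mask

# Example: a 3-bit field starting at bit 30 of q0 spills into q1.
q0 = 0b11 << 30                 # low two bits of the field
q1 = 0b1                        # high bit of the field
print(exb(q1, q0, 30, 0b111))   # -> 7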
setup.py
@@ -339,7 +339,7 @@ vllm_extension_sources = [
     "csrc/layernorm_kernels.cu",
     "csrc/quantization/squeezellm/quant_cuda_kernel.cu",
     "csrc/quantization/gptq/q_gemm.cu",
-    "csrc/quantization/myq/q_gemm.cu",
+    "csrc/quantization/mygq/q_gemm.cu",
     "csrc/cuda_utils_kernels.cu",
     "csrc/moe_align_block_size_kernels.cu",
     "csrc/pybind.cpp",
vllm/config.py
@@ -155,7 +155,7 @@ class ModelConfig:
         self.tokenizer_mode = tokenizer_mode

     def _verify_quantization(self) -> None:
-        supported_quantization = ["awq", "gptq", "squeezellm", "marlin", "myq"]
+        supported_quantization = ["awq", "gptq", "squeezellm", "marlin", "mygq"]
         rocm_not_supported_quantization = ["awq", "marlin"]
         if self.quantization is not None:
             self.quantization = self.quantization.lower()
vllm/engine/arg_utils.py
@@ -208,7 +208,7 @@ class EngineArgs:
         parser.add_argument('--quantization',
                             '-q',
                             type=str,
-                            choices=['awq', 'gptq', 'squeezellm', 'myq', None],
+                            choices=['awq', 'gptq', 'squeezellm', 'mygq', None],
                             default=EngineArgs.quantization,
                             help='Method used to quantize the weights. If '
                             'None, we first check the `quantization_config` '
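Since the rename lands in the argparse choices list, the old spelling is now rejected at parse time rather than deep inside the engine. A self-contained illustration of the validation behavior this hunk produces:

# Sketch of the argparse behavior after this commit: only the new
# spelling passes the choices check.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--quantization', '-q', type=str,
                    choices=['awq', 'gptq', 'squeezellm', 'mygq', None],
                    default=None)

print(parser.parse_args(['-q', 'mygq']).quantization)  # ok: 'mygq'
# parser.parse_args(['-q', 'myq'])  # exits with "invalid choice: 'myq'"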
vllm/model_executor/layers/quantization/__init__.py
@@ -3,13 +3,13 @@ from typing import Type
 from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.quantization.awq import AWQConfig
 from vllm.model_executor.layers.quantization.gptq import GPTQConfig
-from vllm.model_executor.layers.quantization.myq import MYQConfig
+from vllm.model_executor.layers.quantization.mygq import MYQConfig
 from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig
 from vllm.model_executor.layers.quantization.marlin import MarlinConfig

 _QUANTIZATION_CONFIG_REGISTRY = {
     "awq": AWQConfig,
-    "myq": MYQConfig,
+    "mygq": MYQConfig,
     "gptq": GPTQConfig,
     "squeezellm": SqueezeLLMConfig,
     "marlin": MarlinConfig,
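The registry key is what ties the --quantization string to a config class, so the rename here is what actually switches the user-visible name. A hedged sketch of the lookup; the get_quantization_config helper mirrors the one vLLM keeps in this module, but its exact form at this commit is an assumption:

# Hedged sketch of the registry lookup driven by the key renamed above.
from vllm.model_executor.layers.quantization import (
    _QUANTIZATION_CONFIG_REGISTRY)

def get_quantization_config(quantization: str):
    # Mirrors vLLM's helper in this module (exact signature assumed).
    if quantization not in _QUANTIZATION_CONFIG_REGISTRY:
        raise ValueError(f"Invalid quantization method: {quantization}")
    return _QUANTIZATION_CONFIG_REGISTRY[quantization]

print(get_quantization_config("mygq"))  # -> MYQConfig after this commit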
vllm/model_executor/layers/quantization/myq.py → vllm/model_executor/layers/quantization/mygq.py
@@ -41,7 +41,7 @@ class MYQConfig(QuantizationConfig):
     @classmethod
     def get_name(cls) -> str:
-        return "myq"
+        return "mygq"

     @classmethod
     def get_supported_act_dtypes(cls) -> List[torch.dtype]:
@@ -201,9 +201,9 @@ class MYQLinearMethod(LinearMethodBase):
             else:
                 weights["g_idx"] = torch.empty((1, 1), device="meta")
             weights["exllama_state"] = ExllamaState.READY
-            ops.myq_shuffle(weights["qweight"], weights["g_idx"],
-                            self.quant_config.weight_bits)
-        output = ops.myq_gemm(reshaped_x, weights["qweight"],
+            ops.mygq_shuffle(weights["qweight"], weights["g_idx"],
+                             self.quant_config.weight_bits)
+        output = ops.mygq_gemm(reshaped_x, weights["qweight"],
                               weights["qzeros"], weights["scales"],
                               weights["g_idx"],
                               weights["exllama_state"] == ExllamaState.READY,
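Note that only the module path, the registered string, and the op names change; the MYQConfig and MYQLinearMethod class names are untouched, so downstream imports break only on the module path. A small hedged check of the renamed pieces:

# Hedged check: class name unchanged, registered string and module renamed.
from vllm.model_executor.layers.quantization.mygq import MYQConfig

assert MYQConfig.get_name() == "mygq"  # was "myq" before this commit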