Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
xdb4_94051
vllm
Commits
6ef00b03
Unverified
Commit
6ef00b03
authored
Jan 03, 2024
by
Woosuk Kwon
Committed by
GitHub
Jan 03, 2024
Browse files
Enable CUDA graph for GPTQ & SqueezeLLM (#2318)
parent
91405610
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
15 additions
and
13 deletions
+15
-13
csrc/quantization/gptq/q_gemm.cu
csrc/quantization/gptq/q_gemm.cu
+12
-6
csrc/quantization/squeezellm/quant_cuda_kernel.cu
csrc/quantization/squeezellm/quant_cuda_kernel.cu
+3
-1
vllm/config.py
vllm/config.py
+0
-6
No files found.
csrc/quantization/gptq/q_gemm.cu
View file @
6ef00b03
...
@@ -287,7 +287,8 @@ void gemm_half_q_half_cuda_part
...
@@ -287,7 +287,8 @@ void gemm_half_q_half_cuda_part
fp_gemm_half_q_half_gptq_kernel
kernel
=
pick_gemm_half_q_half_gptq_kernel
(
true
,
m_count
);
fp_gemm_half_q_half_gptq_kernel
kernel
=
pick_gemm_half_q_half_gptq_kernel
(
true
,
m_count
);
kernel
<<<
gridDim
,
blockDim
>>>
const
cudaStream_t
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
kernel
<<<
gridDim
,
blockDim
,
0
,
stream
>>>
(
(
a
,
a
,
b_q_weight
,
b_q_weight
,
...
@@ -434,7 +435,8 @@ void reconstruct_exllama
...
@@ -434,7 +435,8 @@ void reconstruct_exllama
gridDim
.
y
=
DIVIDE
(
height
,
BLOCK_KN_SIZE
);
gridDim
.
y
=
DIVIDE
(
height
,
BLOCK_KN_SIZE
);
gridDim
.
x
=
DIVIDE
(
width
,
BLOCK_KN_SIZE
);
gridDim
.
x
=
DIVIDE
(
width
,
BLOCK_KN_SIZE
);
reconstruct_exllama_kernel
<<<
gridDim
,
blockDim
>>>
const
cudaStream_t
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
reconstruct_exllama_kernel
<<<
gridDim
,
blockDim
,
0
,
stream
>>>
(
(
b_q_weight
,
b_q_weight
,
b_q_perm
,
b_q_perm
,
...
@@ -567,7 +569,8 @@ void gemm_half_q_half_alt
...
@@ -567,7 +569,8 @@ void gemm_half_q_half_alt
gridDim
.
y
=
DIVIDE
(
size_m
,
BLOCK_M_SIZE_MAX
);
gridDim
.
y
=
DIVIDE
(
size_m
,
BLOCK_M_SIZE_MAX
);
gridDim
.
z
=
DIVIDE
(
size_k
,
BLOCK_KN_SIZE
);
gridDim
.
z
=
DIVIDE
(
size_k
,
BLOCK_KN_SIZE
);
gemm_half_q_half_alt_kernel
<<<
gridDim
,
blockDim
>>>
const
cudaStream_t
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
gemm_half_q_half_alt_kernel
<<<
gridDim
,
blockDim
,
0
,
stream
>>>
(
(
(
const
half2
*
)
a
,
(
const
half2
*
)
a
,
b_q_weight
,
b_q_weight
,
...
@@ -639,7 +642,8 @@ void reconstruct_gptq
...
@@ -639,7 +642,8 @@ void reconstruct_gptq
blockDim
.
y
=
1
;
blockDim
.
y
=
1
;
gridDim
.
y
=
DIVIDE
(
height
,
8
);
gridDim
.
y
=
DIVIDE
(
height
,
8
);
gridDim
.
x
=
DIVIDE
(
width
,
BLOCK_KN_SIZE
);
gridDim
.
x
=
DIVIDE
(
width
,
BLOCK_KN_SIZE
);
reconstruct_gptq_kernel
<<<
gridDim
,
blockDim
>>>
const
cudaStream_t
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
reconstruct_gptq_kernel
<<<
gridDim
,
blockDim
,
0
,
stream
>>>
(
(
b_q_weight
,
b_q_weight
,
b_gptq_scales
,
b_gptq_scales
,
...
@@ -794,7 +798,8 @@ void shuffle_exllama_weight
...
@@ -794,7 +798,8 @@ void shuffle_exllama_weight
gridDim
.
x
=
DIVIDE
(
width
,
THREADS_X
);
gridDim
.
x
=
DIVIDE
(
width
,
THREADS_X
);
gridDim
.
y
=
height
/
8
;
gridDim
.
y
=
height
/
8
;
make_sequential_kernel
<<<
gridDim
,
blockDim
>>>
const
cudaStream_t
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
make_sequential_kernel
<<<
gridDim
,
blockDim
,
0
,
stream
>>>
(
(
q_weight
,
q_weight
,
new_qweight
,
new_qweight
,
...
@@ -813,7 +818,8 @@ void shuffle_exllama_weight
...
@@ -813,7 +818,8 @@ void shuffle_exllama_weight
blockDim
.
y
=
1
;
blockDim
.
y
=
1
;
gridDim
.
x
=
DIVIDE
(
width
,
THREADS_X
);
gridDim
.
x
=
DIVIDE
(
width
,
THREADS_X
);
gridDim
.
y
=
1
;
gridDim
.
y
=
1
;
shuffle_kernel
<<<
gridDim
,
blockDim
>>>
(
q_weight
,
height
,
width
);
const
cudaStream_t
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
shuffle_kernel
<<<
gridDim
,
blockDim
,
0
,
stream
>>>
(
q_weight
,
height
,
width
);
}
}
}
// namespace gptq
}
// namespace gptq
...
...
csrc/quantization/squeezellm/quant_cuda_kernel.cu
View file @
6ef00b03
...
@@ -200,8 +200,10 @@ void squeezellm_gemm(
...
@@ -200,8 +200,10 @@ void squeezellm_gemm(
(
width
+
BLOCKWIDTH
-
1
)
/
BLOCKWIDTH
(
width
+
BLOCKWIDTH
-
1
)
/
BLOCKWIDTH
);
);
dim3
threads
(
BLOCKWIDTH
);
dim3
threads
(
BLOCKWIDTH
);
const
at
::
cuda
::
OptionalCUDAGuard
device_guard
(
device_of
(
vec
));
const
at
::
cuda
::
OptionalCUDAGuard
device_guard
(
device_of
(
vec
));
vllm
::
squeezellm
::
NUQ4MatMulKernel
<<<
blocks
,
threads
>>>
(
const
cudaStream_t
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
vllm
::
squeezellm
::
NUQ4MatMulKernel
<<<
blocks
,
threads
,
0
,
stream
>>>
(
#ifndef USE_ROCM
#ifndef USE_ROCM
(
half2
*
)
vec
.
data
<
at
::
Half
>
(),
(
half2
*
)
vec
.
data
<
at
::
Half
>
(),
#else
#else
...
...
vllm/config.py
View file @
6ef00b03
...
@@ -181,12 +181,6 @@ class ModelConfig:
...
@@ -181,12 +181,6 @@ class ModelConfig:
self
.
max_context_len_to_capture
=
self
.
max_model_len
self
.
max_context_len_to_capture
=
self
.
max_model_len
self
.
max_context_len_to_capture
=
min
(
self
.
max_context_len_to_capture
,
self
.
max_context_len_to_capture
=
min
(
self
.
max_context_len_to_capture
,
self
.
max_model_len
)
self
.
max_model_len
)
if
(
self
.
quantization
in
[
"gptq"
,
"squeezellm"
]
and
not
self
.
enforce_eager
):
# Related issue: https://github.com/vllm-project/vllm/issues/2147
logger
.
warning
(
f
"
{
self
.
quantization
}
does not support CUDA graph "
"yet. Disabling CUDA graph."
)
self
.
enforce_eager
=
True
def
verify_with_parallel_config
(
def
verify_with_parallel_config
(
self
,
self
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment