Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
0cdbf5e6
Unverified
Commit
0cdbf5e6
authored
Aug 20, 2025
by
Michael Goin
Committed by
GitHub
Aug 20, 2025
Browse files
[Kernel/Quant] Remove the original marlin format and qqq (#23204)
Signed-off-by:
mgoin
<
mgoin64@gmail.com
>
parent
ebe56a00
Changes
26
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
95 additions
and
2945 deletions
+95
-2945
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
+0
-12
.buildkite/lm-eval-harness/configs/models-large.txt
.buildkite/lm-eval-harness/configs/models-large.txt
+0
-1
CMakeLists.txt
CMakeLists.txt
+0
-2
benchmarks/kernels/benchmark_machete.py
benchmarks/kernels/benchmark_machete.py
+1
-22
csrc/quantization/machete/generate.py
csrc/quantization/machete/generate.py
+73
-72
csrc/quantization/marlin/dense/LICENSE
csrc/quantization/marlin/dense/LICENSE
+0
-209
csrc/quantization/marlin/dense/common/base.h
csrc/quantization/marlin/dense/common/base.h
+0
-32
csrc/quantization/marlin/dense/common/mem.h
csrc/quantization/marlin/dense/common/mem.h
+0
-89
csrc/quantization/marlin/dense/marlin_cuda_kernel.cu
csrc/quantization/marlin/dense/marlin_cuda_kernel.cu
+0
-1073
csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu
csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu
+0
-1248
csrc/torch_bindings.cpp
csrc/torch_bindings.cpp
+0
-17
tests/compile/test_full_graph.py
tests/compile/test_full_graph.py
+0
-6
tests/kernels/quantization/test_machete_mm.py
tests/kernels/quantization/test_machete_mm.py
+17
-17
tests/kernels/quantization/test_marlin_gemm.py
tests/kernels/quantization/test_marlin_gemm.py
+0
-83
tests/quantization/test_configs.py
tests/quantization/test_configs.py
+0
-10
tests/quantization/test_lm_head.py
tests/quantization/test_lm_head.py
+1
-5
tests/weight_loading/models.txt
tests/weight_loading/models.txt
+0
-4
vllm/_custom_ops.py
vllm/_custom_ops.py
+0
-36
vllm/config/__init__.py
vllm/config/__init__.py
+3
-4
vllm/lora/layers.py
vllm/lora/layers.py
+0
-3
No files found.
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
deleted
100644 → 0
View file @
ebe56a00
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
model_name
:
"
HandH1998/QQQ-Llama-3-8b-g128"
tasks
:
-
name
:
"
gsm8k"
metrics
:
-
name
:
"
exact_match,strict-match"
value
:
0.419
-
name
:
"
exact_match,flexible-extract"
value
:
0.416
limit
:
1000
num_fewshot
:
5
.buildkite/lm-eval-harness/configs/models-large.txt
View file @
0cdbf5e6
...
...
@@ -3,4 +3,3 @@ Meta-Llama-3-70B-Instruct.yaml
Mixtral-8x7B-Instruct-v0.1.yaml
Qwen2-57B-A14-Instruct.yaml
DeepSeek-V2-Lite-Chat.yaml
Meta-Llama-3-8B-QQQ.yaml
CMakeLists.txt
View file @
0cdbf5e6
...
...
@@ -357,9 +357,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list
(
APPEND VLLM_EXT_SRC
${
MARLIN_TEMPLATE_KERNEL_SRC
}
)
set
(
MARLIN_SRCS
"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
"csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu"
)
...
...
benchmarks/kernels/benchmark_machete.py
View file @
0cdbf5e6
...
...
@@ -253,28 +253,7 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
else
:
assert
bt
.
a
.
dtype
==
torch
.
int8
assert
bt
.
wtype
==
scalar_types
.
uint4b8
if
bt
.
w_ch_s
is
not
None
:
s_ch
=
bt
.
w_ch_s
.
to
(
torch
.
float32
)
else
:
s_ch
=
torch
.
ones
(
bt
.
w_ref
.
shape
[
1
],
dtype
=
torch
.
float32
,
device
=
device
)
if
bt
.
w_tok_s
is
not
None
:
s_tok
=
bt
.
w_tok_s
.
to
(
torch
.
float32
)
else
:
s_tok
=
torch
.
ones
(
bt
.
a
.
shape
[
0
],
dtype
=
torch
.
float32
,
device
=
device
)
fn
=
lambda
:
ops
.
marlin_qqq_gemm
(
a
=
bt
.
a
,
b_q_weight
=
w_q
,
s_group
=
w_s
,
s_tok
=
s_tok
,
s_ch
=
s_ch
,
workspace
=
workspace
.
scratch
,
size_m
=
bt
.
a
.
shape
[
0
],
size_n
=
bt
.
w_ref
.
shape
[
1
],
size_k
=
bt
.
w_ref
.
shape
[
0
],
)
raise
NotImplementedError
(
"QQQ is not supported anymore"
)
return
fn
...
...
csrc/quantization/machete/generate.py
View file @
0cdbf5e6
...
...
@@ -571,78 +571,79 @@ def generate():
itertools
.
repeat
(
default_heuristic
))
]
# Stored as "condition": ((tile_shape_mn), (cluster_shape_mnk))
# TODO (LucasWilkinson): Further tuning required
qqq_tile_heuristic_config
=
{
#### M = 257+
# ((128, 256), (2, 1, 1)) Broken for QQQ types
# TODO (LucasWilkinson): Investigate further
# "M > 256 && K <= 16384 && N <= 4096": ((128, 128), (2, 1, 1)),
# "M > 256": ((128, 256), (2, 1, 1)),
"M > 256"
:
((
128
,
128
),
(
2
,
1
,
1
)),
#### M = 129-256
"M > 128 && K <= 4096 && N <= 4096"
:
((
128
,
64
),
(
2
,
1
,
1
)),
"M > 128 && K <= 8192 && N <= 8192"
:
((
128
,
128
),
(
2
,
1
,
1
)),
# ((128, 256), (2, 1, 1)) Broken for QQQ types
# TODO (LucasWilkinson): Investigate further
# "M > 128": ((128, 256), (2, 1, 1)),
"M > 128"
:
((
128
,
128
),
(
2
,
1
,
1
)),
#### M = 65-128
"M > 64 && K <= 4069 && N <= 4069"
:
((
128
,
32
),
(
2
,
1
,
1
)),
"M > 64 && K <= 4069 && N <= 8192"
:
((
128
,
64
),
(
2
,
1
,
1
)),
"M > 64 && K >= 8192 && N >= 12288"
:
((
256
,
128
),
(
2
,
1
,
1
)),
"M > 64"
:
((
128
,
128
),
(
2
,
1
,
1
)),
#### M = 33-64
"M > 32 && K <= 6144 && N <= 6144"
:
((
128
,
16
),
(
1
,
1
,
1
)),
# Broken for QQQ types
# TODO (LucasWilkinson): Investigate further
#"M > 32 && K >= 16384 && N >= 12288": ((256, 64), (2, 1, 1)),
"M > 32"
:
((
128
,
64
),
(
2
,
1
,
1
)),
#### M = 17-32
"M > 16 && K <= 12288 && N <= 8192"
:
((
128
,
32
),
(
2
,
1
,
1
)),
"M > 16"
:
((
256
,
32
),
(
2
,
1
,
1
)),
#### M = 1-16
"N >= 26624"
:
((
256
,
16
),
(
1
,
1
,
1
)),
None
:
((
128
,
16
),
(
1
,
1
,
1
)),
}
# For now we use the same heuristic for all types
# Heuristic is currently tuned for H100s
qqq_heuristic
=
[
(
cond
,
ScheduleConfig
(
*
tile_config
,
**
sch_common_params
))
# type: ignore
for
cond
,
tile_config
in
qqq_tile_heuristic_config
.
items
()
]
QQQ_kernel_types
=
[
*
(
TypeConfig
(
a
=
DataType
.
s8
,
b
=
VLLMDataType
.
u4b8
,
b_group_scale
=
b_group_scale
,
b_group_zeropoint
=
DataType
.
void
,
b_channel_scale
=
DataType
.
f32
,
a_token_scale
=
DataType
.
f32
,
out
=
DataType
.
f16
,
accumulator
=
DataType
.
s32
,
)
for
b_group_scale
in
(
DataType
.
f16
,
DataType
.
void
)),
*
(
TypeConfig
(
a
=
DataType
.
e4m3
,
b
=
VLLMDataType
.
u4b8
,
b_group_scale
=
b_group_scale
,
b_group_zeropoint
=
DataType
.
void
,
b_channel_scale
=
DataType
.
f32
,
a_token_scale
=
DataType
.
f32
,
out
=
DataType
.
f16
,
accumulator
=
DataType
.
f32
,
)
for
b_group_scale
in
(
DataType
.
f16
,
DataType
.
void
)),
]
impl_configs
+=
[
ImplConfig
(
x
[
0
],
x
[
1
],
x
[
2
])
for
x
in
zip
(
QQQ_kernel_types
,
itertools
.
repeat
(
get_unique_schedules
(
qqq_heuristic
)),
itertools
.
repeat
(
qqq_heuristic
))
]
# TODO: Support W4A8 when ready
# # Stored as "condition": ((tile_shape_mn), (cluster_shape_mnk))
# # TODO (LucasWilkinson): Further tuning required
# qqq_tile_heuristic_config = {
# #### M = 257+
# # ((128, 256), (2, 1, 1)) Broken for QQQ types
# # TODO (LucasWilkinson): Investigate further
# # "M > 256 && K <= 16384 && N <= 4096": ((128, 128), (2, 1, 1)),
# # "M > 256": ((128, 256), (2, 1, 1)),
# "M > 256": ((128, 128), (2, 1, 1)),
# #### M = 129-256
# "M > 128 && K <= 4096 && N <= 4096": ((128, 64), (2, 1, 1)),
# "M > 128 && K <= 8192 && N <= 8192": ((128, 128), (2, 1, 1)),
# # ((128, 256), (2, 1, 1)) Broken for QQQ types
# # TODO (LucasWilkinson): Investigate further
# # "M > 128": ((128, 256), (2, 1, 1)),
# "M > 128": ((128, 128), (2, 1, 1)),
# #### M = 65-128
# "M > 64 && K <= 4069 && N <= 4069": ((128, 32), (2, 1, 1)),
# "M > 64 && K <= 4069 && N <= 8192": ((128, 64), (2, 1, 1)),
# "M > 64 && K >= 8192 && N >= 12288": ((256, 128), (2, 1, 1)),
# "M > 64": ((128, 128), (2, 1, 1)),
# #### M = 33-64
# "M > 32 && K <= 6144 && N <= 6144": ((128, 16), (1, 1, 1)),
# # Broken for QQQ types
# # TODO (LucasWilkinson): Investigate further
# #"M > 32 && K >= 16384 && N >= 12288": ((256, 64), (2, 1, 1)),
# "M > 32": ((128, 64), (2, 1, 1)),
# #### M = 17-32
# "M > 16 && K <= 12288 && N <= 8192": ((128, 32), (2, 1, 1)),
# "M > 16": ((256, 32), (2, 1, 1)),
# #### M = 1-16
# "N >= 26624": ((256, 16), (1, 1, 1)),
# None: ((128, 16), (1, 1, 1)),
# }
# # For now we use the same heuristic for all types
# # Heuristic is currently tuned for H100s
# qqq_heuristic = [
# (cond, ScheduleConfig(*tile_config,
# **sch_common_params)) # type: ignore
# for cond, tile_config in qqq_tile_heuristic_config.items()
# ]
# QQQ_kernel_types = [
# *(TypeConfig(
# a=DataType.s8,
# b=VLLMDataType.u4b8,
# b_group_scale=b_group_scale,
# b_group_zeropoint=DataType.void,
# b_channel_scale=DataType.f32,
# a_token_scale=DataType.f32,
# out=DataType.f16,
# accumulator=DataType.s32,
# ) for b_group_scale in (DataType.f16, DataType.void)),
# *(TypeConfig(
# a=DataType.e4m3,
# b=VLLMDataType.u4b8,
# b_group_scale=b_group_scale,
# b_group_zeropoint=DataType.void,
# b_channel_scale=DataType.f32,
# a_token_scale=DataType.f32,
# out=DataType.f16,
# accumulator=DataType.f32,
# ) for b_group_scale in (DataType.f16, DataType.void)),
# ]
# impl_configs += [
# ImplConfig(x[0], x[1], x[2])
# for x in zip(QQQ_kernel_types,
# itertools.repeat(get_unique_schedules(qqq_heuristic)),
# itertools.repeat(qqq_heuristic))
# ]
output_dir
=
os
.
path
.
join
(
SCRIPT_DIR
,
"generated"
)
...
...
csrc/quantization/marlin/dense/LICENSE
deleted
100644 → 0
View file @
ebe56a00
Contains code from https://github.com/IST-DASLab/marlin
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "{}"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright {yyyy} {name of copyright owner}
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
------------------------------------------------------------------------------------
This product bundles various third-party components under other open source licenses.
This section summarizes those components and their licenses. See licenses/
for text of these licenses.
csrc/quantization/marlin/dense/common/base.h
deleted
100644 → 0
View file @
ebe56a00
/*
* Modified by HandH1998
* Modified by Neural Magic
* Copyright (C) Marlin.2024 Elias Frantar
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
constexpr
int
ceildiv
(
int
a
,
int
b
)
{
return
(
a
+
b
-
1
)
/
b
;
}
// Instances of `Vec` are used to organize groups of >>registers<<, as needed
// for instance as inputs to tensor core operations. Consequently, all
// corresponding index accesses must be compile-time constants, which is why we
// extensively use `#pragma unroll` throughout the kernel code to guarantee
// this.
template
<
typename
T
,
int
n
>
struct
Vec
{
T
elems
[
n
];
__device__
T
&
operator
[](
int
i
)
{
return
elems
[
i
];
}
};
csrc/quantization/marlin/dense/common/mem.h
deleted
100644 → 0
View file @
ebe56a00
/*
* Modified by HandH1998
* Modified by Neural Magic
* Copyright (C) Marlin.2024 Elias Frantar
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
// Predicated asynchronous global->shared copy; used for inputs A where we apply
// predication to handle batchsizes that are not multiples of 16.
__device__
inline
void
cp_async4_pred
(
void
*
smem_ptr
,
const
void
*
glob_ptr
,
bool
pred
=
true
)
{
const
int
BYTES
=
16
;
uint32_t
smem
=
static_cast
<
uint32_t
>
(
__cvta_generic_to_shared
(
smem_ptr
));
asm
volatile
(
"{
\n
"
" .reg .pred p;
\n
"
" setp.ne.b32 p, %0, 0;
\n
"
" @p cp.async.cg.shared.global [%1], [%2], %3;
\n
"
"}
\n
"
::
"r"
((
int
)
pred
),
"r"
(
smem
),
"l"
(
glob_ptr
),
"n"
(
BYTES
));
}
// Asynchronous global->shared copy
__device__
inline
void
cp_async4
(
void
*
smem_ptr
,
const
void
*
glob_ptr
)
{
const
int
BYTES
=
16
;
uint32_t
smem
=
static_cast
<
uint32_t
>
(
__cvta_generic_to_shared
(
smem_ptr
));
asm
volatile
(
"{
\n
"
" cp.async.cg.shared.global [%0], [%1], %2;
\n
"
"}
\n
"
::
"r"
(
smem
),
"l"
(
glob_ptr
),
"n"
(
BYTES
));
}
// Async copy fence.
__device__
inline
void
cp_async_fence
()
{
asm
volatile
(
"cp.async.commit_group;
\n
"
::
);
}
// Wait until at most `n` async copy stages are still pending.
template
<
int
n
>
__device__
inline
void
cp_async_wait
()
{
asm
volatile
(
"cp.async.wait_group %0;
\n
"
::
"n"
(
n
));
}
// Wait until barrier reaches `count`, then lock for current threadblock.
__device__
inline
void
barrier_acquire
(
int
*
lock
,
int
count
)
{
if
(
threadIdx
.
x
==
0
)
{
int
state
=
-
1
;
do
// Guarantee that subsequent writes by this threadblock will be visible
// globally.
asm
volatile
(
"ld.global.acquire.gpu.b32 %0, [%1];
\n
"
:
"=r"
(
state
)
:
"l"
(
lock
));
while
(
state
!=
count
);
}
__syncthreads
();
}
// Release barrier and increment visitation count.
__device__
inline
void
barrier_release
(
int
*
lock
,
bool
reset
=
false
)
{
__syncthreads
();
if
(
threadIdx
.
x
==
0
)
{
if
(
reset
)
{
lock
[
0
]
=
0
;
return
;
}
int
val
=
1
;
// Make sure that all writes since acquiring this barrier are visible
// globally, while releasing the barrier.
asm
volatile
(
"fence.acq_rel.gpu;
\n
"
);
asm
volatile
(
"red.relaxed.gpu.global.add.s32 [%0], %1;
\n
"
:
:
"l"
(
lock
),
"r"
(
val
));
}
}
csrc/quantization/marlin/dense/marlin_cuda_kernel.cu
deleted
100644 → 0
View file @
ebe56a00
/*
* Modified by Neural Magic
* Copyright (C) Marlin.2024 Elias Frantar
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <iostream>
#include "common/base.h"
#include "core/registration.h"
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
#include "common/mem.h"
#endif
template
<
typename
T
>
inline
std
::
string
str
(
T
x
)
{
return
std
::
to_string
(
x
);
}
namespace
marlin_dense
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
using
I4
=
Vec
<
int
,
4
>
;
// Matrix fragments for tensor core instructions; their precise layout is
// documented here:
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type
using
FragA
=
Vec
<
half2
,
4
>
;
using
FragB
=
Vec
<
half2
,
2
>
;
using
FragC
=
Vec
<
float
,
4
>
;
using
FragS
=
Vec
<
half2
,
1
>
;
// quantization scales
// m16n8k16 tensor core mma instruction with fp16 inputs and fp32
// output/accumulation.
__device__
inline
void
mma
(
const
FragA
&
a_frag
,
const
FragB
&
frag_b
,
FragC
&
frag_c
)
{
const
uint32_t
*
a
=
reinterpret_cast
<
const
uint32_t
*>
(
&
a_frag
);
const
uint32_t
*
b
=
reinterpret_cast
<
const
uint32_t
*>
(
&
frag_b
);
float
*
c
=
reinterpret_cast
<
float
*>
(
&
frag_c
);
asm
volatile
(
"mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};
\n
"
:
"=f"
(
c
[
0
]),
"=f"
(
c
[
1
]),
"=f"
(
c
[
2
]),
"=f"
(
c
[
3
])
:
"r"
(
a
[
0
]),
"r"
(
a
[
1
]),
"r"
(
a
[
2
]),
"r"
(
a
[
3
]),
"r"
(
b
[
0
]),
"r"
(
b
[
1
]),
"f"
(
c
[
0
]),
"f"
(
c
[
1
]),
"f"
(
c
[
2
]),
"f"
(
c
[
3
]));
}
// Instruction for loading a full 16x16 matrix fragment of operand A from shared
// memory, directly in tensor core layout.
__device__
inline
void
ldsm4
(
FragA
&
frag_a
,
const
void
*
smem_ptr
)
{
uint32_t
*
a
=
reinterpret_cast
<
uint32_t
*>
(
&
frag_a
);
uint32_t
smem
=
static_cast
<
uint32_t
>
(
__cvta_generic_to_shared
(
smem_ptr
));
asm
volatile
(
"ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];
\n
"
:
"=r"
(
a
[
0
]),
"=r"
(
a
[
1
]),
"=r"
(
a
[
2
]),
"=r"
(
a
[
3
])
:
"r"
(
smem
));
}
// Lookup-table based 3-input logical operation; explicitly used for
// dequantization as the compiler does not seem to automatically recognize it in
// all cases.
template
<
int
lut
>
__device__
inline
int
lop3
(
int
a
,
int
b
,
int
c
)
{
int
res
;
asm
volatile
(
"lop3.b32 %0, %1, %2, %3, %4;
\n
"
:
"=r"
(
res
)
:
"r"
(
a
),
"r"
(
b
),
"r"
(
c
),
"n"
(
lut
));
return
res
;
}
// Efficiently dequantize an int32 value into a full B-fragment of 4 fp16
// values. We mostly follow the strategy in the link below, with some small
// changes:
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
__device__
inline
FragB
dequant
(
int
q
)
{
const
int
LO
=
0x000f000f
;
const
int
HI
=
0x00f000f0
;
const
int
EX
=
0x64006400
;
// Guarantee that the `(a & b) | c` operations are LOP3s.
int
lo
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
LO
,
EX
);
int
hi
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
HI
,
EX
);
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
// directly into `SUB` and `ADD`.
const
int
SUB
=
0x64086408
;
const
int
MUL
=
0x2c002c00
;
const
int
ADD
=
0xd480d480
;
FragB
frag_b
;
frag_b
[
0
]
=
__hsub2
(
*
reinterpret_cast
<
half2
*>
(
&
lo
),
*
reinterpret_cast
<
const
half2
*>
(
&
SUB
));
frag_b
[
1
]
=
__hfma2
(
*
reinterpret_cast
<
half2
*>
(
&
hi
),
*
reinterpret_cast
<
const
half2
*>
(
&
MUL
),
*
reinterpret_cast
<
const
half2
*>
(
&
ADD
));
return
frag_b
;
}
// Multiply dequantized values by the corresponding quantization scale; used
// only for grouped quantization.
__device__
inline
void
scale
(
FragB
&
frag_b
,
FragS
&
frag_s
,
int
i
)
{
half2
s
=
__half2half2
(
reinterpret_cast
<
__half
*>
(
&
frag_s
)[
i
]);
frag_b
[
0
]
=
__hmul2
(
frag_b
[
0
],
s
);
frag_b
[
1
]
=
__hmul2
(
frag_b
[
1
],
s
);
}
template
<
const
int
threads
,
// number of threads in a threadblock
const
int
thread_m_blocks
,
// number of 16x16 blocks in the m
// dimension (batchsize) of the
// threadblock
const
int
thread_n_blocks
,
// same for n dimension (output)
const
int
thread_k_blocks
,
// same for k dimension (reduction)
const
int
stages
,
// number of stages for the async global->shared
// fetch pipeline
const
int
group_blocks
=
-
1
// number of consecutive 16x16 blocks
// with a separate quantization scale
>
__global__
void
Marlin
(
const
int4
*
__restrict__
A
,
// fp16 input matrix of shape mxk
const
int4
*
__restrict__
B
,
// 4bit quantized weight matrix of shape kxn
int4
*
__restrict__
C
,
// fp16 output buffer of shape mxn
const
int4
*
__restrict__
s
,
// fp16 quantization scales of shape
// (k/groupsize)xn
int
prob_m
,
// batch dimension m
int
prob_n
,
// output dimension n
int
prob_k
,
// reduction dimension k
int
*
locks
// extra global storage for barrier synchronization
)
{
// Each threadblock processes one "stripe" of the B matrix with (roughly) the
// same size, which might involve multiple column "slices" (of width 16 *
// `thread_n_blocks`). Stripes are defined as shown in the 3x3 matrix 5 SM
// example:
// 0 1 3
// 0 2 3
// 1 2 4
// While this kind of partitioning makes things somewhat more complicated, it
// ensures good utilization of all SMs for many kinds of shape and GPU
// configurations, while requiring as few slow global cross-threadblock
// reductions as possible.
// For larger GEMMs we run multiple batchsize 64 versions in parallel for a
// better partitioning with less reductions
int
parallel
=
1
;
if
(
prob_m
>
16
*
thread_m_blocks
)
{
parallel
=
prob_m
/
(
16
*
thread_m_blocks
);
prob_m
=
16
*
thread_m_blocks
;
}
int
k_tiles
=
prob_k
/
16
/
thread_k_blocks
;
int
n_tiles
=
prob_n
/
16
/
thread_n_blocks
;
int
iters
=
ceildiv
(
k_tiles
*
n_tiles
*
parallel
,
gridDim
.
x
);
// Ensure that the number of tiles in each stripe is a multiple of the
// groupsize; this avoids an annoying special case where a stripe starts in
// the middle of group.
if
(
group_blocks
!=
-
1
)
iters
=
(
group_blocks
/
thread_k_blocks
)
*
ceildiv
(
iters
,
(
group_blocks
/
thread_k_blocks
));
int
slice_row
=
(
iters
*
blockIdx
.
x
)
%
k_tiles
;
int
slice_col_par
=
(
iters
*
blockIdx
.
x
)
/
k_tiles
;
int
slice_col
=
slice_col_par
;
int
slice_iters
;
// number of threadblock tiles in the current slice
int
slice_count
=
0
;
// total number of active threadblocks in the current slice
int
slice_idx
;
// index of threadblock in current slice; numbered bottom to
// top
// We can easily implement parallel problem execution by just remapping
// indices and advancing global pointers
if
(
slice_col_par
>=
n_tiles
)
{
A
+=
(
slice_col_par
/
n_tiles
)
*
16
*
thread_m_blocks
*
prob_k
/
8
;
C
+=
(
slice_col_par
/
n_tiles
)
*
16
*
thread_m_blocks
*
prob_n
/
8
;
locks
+=
(
slice_col_par
/
n_tiles
)
*
n_tiles
;
slice_col
=
slice_col_par
%
n_tiles
;
}
// Compute all information about the current slice which is required for
// synchronization.
auto
init_slice
=
[
&
]()
{
slice_iters
=
iters
*
(
blockIdx
.
x
+
1
)
-
(
k_tiles
*
slice_col_par
+
slice_row
);
if
(
slice_iters
<
0
||
slice_col_par
>=
n_tiles
*
parallel
)
slice_iters
=
0
;
if
(
slice_iters
==
0
)
return
;
if
(
slice_row
+
slice_iters
>
k_tiles
)
slice_iters
=
k_tiles
-
slice_row
;
slice_count
=
1
;
slice_idx
=
0
;
int
col_first
=
iters
*
ceildiv
(
k_tiles
*
slice_col_par
,
iters
);
if
(
col_first
<=
k_tiles
*
(
slice_col_par
+
1
))
{
int
col_off
=
col_first
-
k_tiles
*
slice_col_par
;
slice_count
=
ceildiv
(
k_tiles
-
col_off
,
iters
);
if
(
col_off
>
0
)
slice_count
++
;
int
delta_first
=
iters
*
blockIdx
.
x
-
col_first
;
if
(
delta_first
<
0
||
(
col_off
==
0
&&
delta_first
==
0
))
slice_idx
=
slice_count
-
1
;
else
{
slice_idx
=
slice_count
-
1
-
delta_first
/
iters
;
if
(
col_off
>
0
)
slice_idx
--
;
}
}
if
(
slice_col
==
n_tiles
)
{
A
+=
16
*
thread_m_blocks
*
prob_k
/
8
;
C
+=
16
*
thread_m_blocks
*
prob_n
/
8
;
locks
+=
n_tiles
;
slice_col
=
0
;
}
};
init_slice
();
int
a_gl_stride
=
prob_k
/
8
;
// stride of the A matrix in global memory
// We typically use `constexpr` to indicate that this value is a compile-time
// constant
constexpr
int
a_sh_stride
=
16
*
thread_k_blocks
/
8
;
// stride of an A matrix tile in shared memory
constexpr
int
a_gl_rd_delta_o
=
16
*
thread_k_blocks
/
8
;
// delta between subsequent A tiles in global memory
int
a_gl_rd_delta_i
=
a_gl_stride
*
(
threads
/
a_gl_rd_delta_o
);
// between subsequent accesses within a tile
constexpr
int
a_sh_wr_delta
=
a_sh_stride
*
(
threads
/
a_gl_rd_delta_o
);
// between shared memory writes
constexpr
int
a_sh_rd_delta_o
=
2
*
((
threads
/
32
)
/
(
thread_n_blocks
/
4
));
// between shared memory tile reads
constexpr
int
a_sh_rd_delta_i
=
a_sh_stride
*
16
;
// within a shared memory tile
constexpr
int
a_sh_stage
=
a_sh_stride
*
(
16
*
thread_m_blocks
);
// overall size of a tile
constexpr
int
a_sh_wr_iters
=
ceildiv
(
a_sh_stage
,
a_sh_wr_delta
);
// number of shared write iterations for a tile
int
b_gl_stride
=
16
*
prob_n
/
32
;
constexpr
int
b_sh_stride
=
32
*
thread_n_blocks
/
4
;
int
b_gl_rd_delta_o
=
b_gl_stride
*
thread_k_blocks
;
int
b_gl_rd_delta_i
=
b_gl_stride
*
(
threads
/
b_sh_stride
);
constexpr
int
b_sh_wr_delta
=
threads
;
constexpr
int
b_sh_rd_delta
=
threads
;
constexpr
int
b_sh_stage
=
b_sh_stride
*
thread_k_blocks
;
constexpr
int
b_sh_wr_iters
=
b_sh_stage
/
b_sh_wr_delta
;
int
s_gl_stride
=
prob_n
/
8
;
constexpr
int
s_sh_stride
=
16
*
thread_n_blocks
/
8
;
constexpr
int
s_sh_stage
=
s_sh_stride
;
int
s_gl_rd_delta
=
s_gl_stride
;
// Global A read index of current thread.
int
a_gl_rd
=
a_gl_stride
*
(
threadIdx
.
x
/
a_gl_rd_delta_o
)
+
(
threadIdx
.
x
%
a_gl_rd_delta_o
);
a_gl_rd
+=
a_gl_rd_delta_o
*
slice_row
;
// Shared write index of current thread.
int
a_sh_wr
=
a_sh_stride
*
(
threadIdx
.
x
/
a_gl_rd_delta_o
)
+
(
threadIdx
.
x
%
a_gl_rd_delta_o
);
// Shared read index.
int
a_sh_rd
=
a_sh_stride
*
((
threadIdx
.
x
%
32
)
%
16
)
+
(
threadIdx
.
x
%
32
)
/
16
;
a_sh_rd
+=
2
*
((
threadIdx
.
x
/
32
)
/
(
thread_n_blocks
/
4
));
int
b_gl_rd
=
b_gl_stride
*
(
threadIdx
.
x
/
b_sh_stride
)
+
(
threadIdx
.
x
%
b_sh_stride
);
b_gl_rd
+=
b_sh_stride
*
slice_col
;
b_gl_rd
+=
b_gl_rd_delta_o
*
slice_row
;
auto
b_sh_wr
=
threadIdx
.
x
;
auto
b_sh_rd
=
threadIdx
.
x
;
int
s_gl_rd
=
s_gl_stride
*
((
thread_k_blocks
*
slice_row
)
/
group_blocks
)
+
s_sh_stride
*
slice_col
+
threadIdx
.
x
;
auto
s_sh_wr
=
threadIdx
.
x
;
int
s_sh_rd
;
// We use a different scale layout for grouped and column-wise quantization as
// we scale a `half2` tile in column-major layout in the former and in
// row-major in the latter case.
if
(
group_blocks
!=
-
1
)
s_sh_rd
=
8
*
((
threadIdx
.
x
/
32
)
%
(
thread_n_blocks
/
4
))
+
(
threadIdx
.
x
%
32
)
/
4
;
else
s_sh_rd
=
8
*
((
threadIdx
.
x
/
32
)
%
(
thread_n_blocks
/
4
))
+
(
threadIdx
.
x
%
32
)
%
4
;
// Precompute which thread should not read memory in which iterations; this is
// needed if there are more threads than required for a certain tilesize or
// when the batchsize is not a multiple of 16.
bool
a_sh_wr_pred
[
a_sh_wr_iters
];
#pragma unroll
for
(
int
i
=
0
;
i
<
a_sh_wr_iters
;
i
++
)
a_sh_wr_pred
[
i
]
=
a_sh_wr_delta
*
i
+
a_sh_wr
<
a_sh_stride
*
prob_m
;
bool
s_sh_wr_pred
=
threadIdx
.
x
<
s_sh_stride
;
// To ensure that writing and reading A tiles to/from shared memory, the
// latter in fragment format, is fully bank conflict free, we need to use a
// rather fancy XOR-based layout. The key here is that neither reads nor
// writes of the 16-byte `int4` blocks of 8 consecutive threads involve the
// same shared memory banks. Further, it seems (based on NSight-Compute) that
// each warp must also write a consecutive memory segment?
auto
transform_a
=
[
&
](
int
i
)
{
int
row
=
i
/
a_gl_rd_delta_o
;
return
a_gl_rd_delta_o
*
row
+
(
i
%
a_gl_rd_delta_o
)
^
row
;
};
// Since the computation of this remapping is non-trivial and, due to our main
// loop unrolls, all shared memory accesses are static, we simply precompute
// both transformed reads and writes.
int
a_sh_wr_trans
[
a_sh_wr_iters
];
#pragma unroll
for
(
int
i
=
0
;
i
<
a_sh_wr_iters
;
i
++
)
a_sh_wr_trans
[
i
]
=
transform_a
(
a_sh_wr_delta
*
i
+
a_sh_wr
);
int
a_sh_rd_trans
[
b_sh_wr_iters
][
thread_m_blocks
];
#pragma unroll
for
(
int
i
=
0
;
i
<
b_sh_wr_iters
;
i
++
)
{
#pragma unroll
for
(
int
j
=
0
;
j
<
thread_m_blocks
;
j
++
)
a_sh_rd_trans
[
i
][
j
]
=
transform_a
(
a_sh_rd_delta_o
*
i
+
a_sh_rd_delta_i
*
j
+
a_sh_rd
);
}
// Since B-accesses have non-constant stride they have to be computed at
// runtime; we break dependencies between subsequent accesses with a tile by
// maintining multiple pointers (we have enough registers), a tiny
// optimization.
const
int4
*
B_ptr
[
b_sh_wr_iters
];
#pragma unroll
for
(
int
i
=
0
;
i
<
b_sh_wr_iters
;
i
++
)
B_ptr
[
i
]
=
B
+
b_gl_rd_delta_i
*
i
+
b_gl_rd
;
extern
__shared__
int4
sh
[];
// Shared memory storage for global fetch pipelines.
int4
*
sh_a
=
sh
;
int4
*
sh_b
=
sh_a
+
(
stages
*
a_sh_stage
);
int4
*
sh_s
=
sh_b
+
(
stages
*
b_sh_stage
);
// Register storage for double buffer of shared memory reads.
FragA
frag_a
[
2
][
thread_m_blocks
];
I4
frag_b_quant
[
2
];
FragC
frag_c
[
thread_m_blocks
][
4
][
2
];
FragS
frag_s
[
2
][
4
];
// Zero accumulators.
auto
zero_accums
=
[
&
]()
{
#pragma unroll
for
(
int
i
=
0
;
i
<
thread_m_blocks
*
4
*
2
*
4
;
i
++
)
reinterpret_cast
<
float
*>
(
frag_c
)[
i
]
=
0
;
};
// Asynchronously fetch the next A, B and s tile from global to the next
// shared memory pipeline location.
auto
fetch_to_shared
=
[
&
](
int
pipe
,
int
a_off
,
bool
pred
=
true
)
{
if
(
pred
)
{
int4
*
sh_a_stage
=
sh_a
+
a_sh_stage
*
pipe
;
#pragma unroll
for
(
int
i
=
0
;
i
<
a_sh_wr_iters
;
i
++
)
{
cp_async4_pred
(
&
sh_a_stage
[
a_sh_wr_trans
[
i
]],
&
A
[
a_gl_rd_delta_i
*
i
+
a_gl_rd
+
a_gl_rd_delta_o
*
a_off
],
a_sh_wr_pred
[
i
]);
}
int4
*
sh_b_stage
=
sh_b
+
b_sh_stage
*
pipe
;
#pragma unroll
for
(
int
i
=
0
;
i
<
b_sh_wr_iters
;
i
++
)
{
cp_async4
(
&
sh_b_stage
[
b_sh_wr_delta
*
i
+
b_sh_wr
],
B_ptr
[
i
]);
B_ptr
[
i
]
+=
b_gl_rd_delta_o
;
}
// Only fetch scales if this tile starts a new group
if
constexpr
(
group_blocks
!=
-
1
)
{
// This assumes group_blocks >= thread_k_blocks
// and would need to be modified to support smaller groups.
static_assert
(
group_blocks
>=
thread_k_blocks
);
if
(
pipe
%
(
group_blocks
/
thread_k_blocks
)
==
0
)
{
int4
*
sh_s_stage
=
sh_s
+
s_sh_stage
*
pipe
;
if
(
s_sh_wr_pred
)
cp_async4
(
&
sh_s_stage
[
s_sh_wr
],
&
s
[
s_gl_rd
]);
s_gl_rd
+=
s_gl_rd_delta
;
}
}
}
// Insert a fence even when we are winding down the pipeline to ensure that
// waiting is also correct at this point.
cp_async_fence
();
};
// Wait until the next thread tile has been loaded to shared memory.
auto
wait_for_stage
=
[
&
]()
{
// We only have `stages - 2` active fetches since we are double buffering
// and can only issue the next fetch when it is guaranteed that the previous
// shared memory load is fully complete (as it may otherwise be
// overwritten).
cp_async_wait
<
stages
-
2
>
();
__syncthreads
();
};
// Load the next sub-tile from the current location in the shared memory pipe
// into the current register buffer.
auto
fetch_to_registers
=
[
&
](
int
k
,
int
pipe
)
{
// It may seem inefficient that we reload the groups for every sub-tile;
// however, this does not seem to be a significant bottleneck, while some
// theoretically better attempts have lead to bad instruction ordering by
// the compiler and correspondingly a noticeable drop in performance.
if
constexpr
(
group_blocks
!=
-
1
)
{
// This assumes group_blocks >= thread_k_blocks
// and would need to be modified to support smaller groups.
static_assert
(
group_blocks
>=
thread_k_blocks
);
int4
*
sh_s_stage
=
sh_s
+
s_sh_stage
*
((
group_blocks
/
thread_k_blocks
)
*
(
pipe
/
(
group_blocks
/
thread_k_blocks
)));
reinterpret_cast
<
int4
*>
(
&
frag_s
[
k
%
2
])[
0
]
=
sh_s_stage
[
s_sh_rd
];
}
int4
*
sh_a_stage
=
sh_a
+
a_sh_stage
*
pipe
;
#pragma unroll
for
(
int
i
=
0
;
i
<
thread_m_blocks
;
i
++
)
ldsm4
(
frag_a
[
k
%
2
][
i
],
&
sh_a_stage
[
a_sh_rd_trans
[
k
%
b_sh_wr_iters
][
i
]]);
int4
*
sh_b_stage
=
sh_b
+
b_sh_stage
*
pipe
;
frag_b_quant
[
k
%
2
]
=
*
reinterpret_cast
<
I4
*>
(
&
sh_b_stage
[
b_sh_rd_delta
*
(
k
%
b_sh_wr_iters
)
+
b_sh_rd
]);
};
// Execute the actual tensor core matmul of a sub-tile.
auto
matmul
=
[
&
](
int
k
)
{
// We have the m dimension as the inner loop in order to encourage overlapping
// dequantization and matmul operations.
#pragma unroll
for
(
int
j
=
0
;
j
<
4
;
j
++
)
{
int
b_quant
=
frag_b_quant
[
k
%
2
][
j
];
int
b_quant_shift
=
b_quant
>>
8
;
FragB
frag_b0
=
dequant
(
b_quant
);
// If there are no groups, we can just scale the final output once and can
// avoid doing so for each weight.
if
(
group_blocks
!=
-
1
)
scale
(
frag_b0
,
frag_s
[
k
%
2
][
j
],
0
);
FragB
frag_b1
=
dequant
(
b_quant_shift
);
if
(
group_blocks
!=
-
1
)
scale
(
frag_b1
,
frag_s
[
k
%
2
][
j
],
1
);
#pragma unroll
for
(
int
i
=
0
;
i
<
thread_m_blocks
;
i
++
)
{
mma
(
frag_a
[
k
%
2
][
i
],
frag_b0
,
frag_c
[
i
][
j
][
0
]);
mma
(
frag_a
[
k
%
2
][
i
],
frag_b1
,
frag_c
[
i
][
j
][
1
]);
}
}
};
// Since we slice across the k dimension of a tile in order to increase the
// number of warps while keeping the n dimension of a tile reasonable, we have
// multiple warps that accumulate their partial sums of the same output
// location; which we have to reduce over in the end. We do in shared memory.
auto
thread_block_reduce
=
[
&
]()
{
constexpr
int
red_off
=
threads
/
b_sh_stride
/
2
;
if
(
red_off
>=
1
)
{
auto
red_idx
=
threadIdx
.
x
/
b_sh_stride
;
constexpr
int
red_sh_stride
=
b_sh_stride
*
4
*
2
;
constexpr
int
red_sh_delta
=
b_sh_stride
;
int
red_sh_rd
=
red_sh_stride
*
(
threadIdx
.
x
/
b_sh_stride
)
+
(
threadIdx
.
x
%
b_sh_stride
);
// Parallel logarithmic shared memory reduction. We make sure to avoid any
// unnecessary read or write iterations, e.g., for two warps we write only
// once by warp 1 and read only once by warp 0.
#pragma unroll
for
(
int
m_block
=
0
;
m_block
<
thread_m_blocks
;
m_block
++
)
{
#pragma unroll
for
(
int
i
=
red_off
;
i
>
0
;
i
/=
2
)
{
if
(
i
<=
red_idx
&&
red_idx
<
2
*
i
)
{
#pragma unroll
for
(
int
j
=
0
;
j
<
4
*
2
;
j
++
)
{
int
red_sh_wr
=
red_sh_delta
*
j
+
(
red_sh_rd
-
red_sh_stride
*
i
);
if
(
i
<
red_off
)
{
float
*
c_rd
=
reinterpret_cast
<
float
*>
(
&
sh
[
red_sh_delta
*
j
+
red_sh_rd
]);
float
*
c_wr
=
reinterpret_cast
<
float
*>
(
&
sh
[
red_sh_wr
]);
#pragma unroll
for
(
int
k
=
0
;
k
<
4
;
k
++
)
reinterpret_cast
<
FragC
*>
(
frag_c
)[
4
*
2
*
m_block
+
j
][
k
]
+=
c_rd
[
k
]
+
c_wr
[
k
];
}
sh
[
red_sh_wr
]
=
reinterpret_cast
<
int4
*>
(
&
frag_c
)[
4
*
2
*
m_block
+
j
];
}
}
__syncthreads
();
}
if
(
red_idx
==
0
)
{
#pragma unroll
for
(
int
i
=
0
;
i
<
4
*
2
;
i
++
)
{
float
*
c_rd
=
reinterpret_cast
<
float
*>
(
&
sh
[
red_sh_delta
*
i
+
red_sh_rd
]);
#pragma unroll
for
(
int
j
=
0
;
j
<
4
;
j
++
)
reinterpret_cast
<
FragC
*>
(
frag_c
)[
4
*
2
*
m_block
+
i
][
j
]
+=
c_rd
[
j
];
}
}
__syncthreads
();
}
}
};
// Since multiple threadblocks may process parts of the same column slice, we
// finally have to globally reduce over the results. As the striped
// partitioning minimizes the number of such reductions and our outputs are
// usually rather small, we perform this reduction serially in L2 cache.
auto
global_reduce
=
[
&
](
bool
first
=
false
,
bool
last
=
false
)
{
// We are very careful here to reduce directly in the output buffer to
// maximize L2 cache utilization in this step. To do this, we write out
// results in FP16 (but still reduce with FP32 compute).
constexpr
int
active_threads
=
32
*
thread_n_blocks
/
4
;
if
(
threadIdx
.
x
<
active_threads
)
{
int
c_gl_stride
=
prob_n
/
8
;
int
c_gl_wr_delta_o
=
8
*
c_gl_stride
;
int
c_gl_wr_delta_i
=
4
*
(
active_threads
/
32
);
int
c_gl_wr
=
c_gl_stride
*
((
threadIdx
.
x
%
32
)
/
4
)
+
4
*
(
threadIdx
.
x
/
32
)
+
threadIdx
.
x
%
4
;
c_gl_wr
+=
(
2
*
thread_n_blocks
)
*
slice_col
;
constexpr
int
c_sh_wr_delta
=
active_threads
;
auto
c_sh_wr
=
threadIdx
.
x
;
int
row
=
(
threadIdx
.
x
%
32
)
/
4
;
if
(
!
first
)
{
// Interestingly, doing direct global accesses here really seems to mess up
// the compiler and lead to slowdowns, hence we also use async-copies even
// though these fetches are not actually asynchronous.
#pragma unroll
for
(
int
i
=
0
;
i
<
thread_m_blocks
*
4
;
i
++
)
{
cp_async4_pred
(
&
sh
[
c_sh_wr
+
c_sh_wr_delta
*
i
],
&
C
[
c_gl_wr
+
c_gl_wr_delta_o
*
(
i
/
2
)
+
c_gl_wr_delta_i
*
(
i
%
2
)],
i
<
(
thread_m_blocks
-
1
)
*
4
||
8
*
(
i
/
2
)
+
row
<
prob_m
);
}
cp_async_fence
();
cp_async_wait
<
0
>
();
}
#pragma unroll
for
(
int
i
=
0
;
i
<
thread_m_blocks
*
4
;
i
++
)
{
if
(
i
<
(
thread_m_blocks
-
1
)
*
4
||
8
*
(
i
/
2
)
+
row
<
prob_m
)
{
if
(
!
first
)
{
int4
c_red
=
sh
[
c_sh_wr
+
i
*
c_sh_wr_delta
];
#pragma unroll
for
(
int
j
=
0
;
j
<
2
*
4
;
j
++
)
{
reinterpret_cast
<
float
*>
(
&
frag_c
)[
4
*
2
*
4
*
(
i
/
4
)
+
4
*
j
+
(
i
%
4
)]
+=
__half2float
(
reinterpret_cast
<
__half
*>
(
&
c_red
)[
j
]);
}
}
if
(
!
last
)
{
int4
c
;
#pragma unroll
for
(
int
j
=
0
;
j
<
2
*
4
;
j
++
)
{
reinterpret_cast
<
__half
*>
(
&
c
)[
j
]
=
__float2half
(
reinterpret_cast
<
float
*>
(
&
frag_c
)[
4
*
2
*
4
*
(
i
/
4
)
+
4
*
j
+
(
i
%
4
)]);
}
C
[
c_gl_wr
+
c_gl_wr_delta_o
*
(
i
/
2
)
+
c_gl_wr_delta_i
*
(
i
%
2
)]
=
c
;
}
}
}
}
};
// Write out the reduce final result in the correct layout. We only actually
// reshuffle matrix fragments in this step, the reduction above is performed
// in fragment layout.
auto
write_result
=
[
&
]()
{
int
c_gl_stride
=
prob_n
/
8
;
constexpr
int
c_sh_stride
=
2
*
thread_n_blocks
+
1
;
int
c_gl_wr_delta
=
c_gl_stride
*
(
threads
/
(
2
*
thread_n_blocks
));
constexpr
int
c_sh_rd_delta
=
c_sh_stride
*
(
threads
/
(
2
*
thread_n_blocks
));
int
c_gl_wr
=
c_gl_stride
*
(
threadIdx
.
x
/
(
2
*
thread_n_blocks
))
+
(
threadIdx
.
x
%
(
2
*
thread_n_blocks
));
c_gl_wr
+=
(
2
*
thread_n_blocks
)
*
slice_col
;
int
c_sh_wr
=
(
4
*
c_sh_stride
)
*
((
threadIdx
.
x
%
32
)
/
4
)
+
(
threadIdx
.
x
%
32
)
%
4
;
c_sh_wr
+=
32
*
(
threadIdx
.
x
/
32
);
int
c_sh_rd
=
c_sh_stride
*
(
threadIdx
.
x
/
(
2
*
thread_n_blocks
))
+
(
threadIdx
.
x
%
(
2
*
thread_n_blocks
));
int
c_gl_wr_end
=
c_gl_stride
*
prob_m
;
// We first reorder in shared memory to guarantee the most efficient final
// global write patterns
auto
write
=
[
&
](
int
idx
,
float
c0
,
float
c1
,
FragS
&
s
)
{
half2
res
=
__halves2half2
(
__float2half
(
c0
),
__float2half
(
c1
));
if
(
group_blocks
==
-
1
)
// for per-column quantization we finally apply the scale here
res
=
__hmul2
(
res
,
s
[
0
]);
((
half2
*
)
sh
)[
idx
]
=
res
;
};
if
(
threadIdx
.
x
/
32
<
thread_n_blocks
/
4
)
{
#pragma unroll
for
(
int
i
=
0
;
i
<
thread_m_blocks
;
i
++
)
{
#pragma unroll
for
(
int
j
=
0
;
j
<
4
;
j
++
)
{
int
wr
=
c_sh_wr
+
8
*
j
;
write
(
wr
+
(
4
*
c_sh_stride
)
*
0
+
0
,
frag_c
[
i
][
j
][
0
][
0
],
frag_c
[
i
][
j
][
0
][
1
],
frag_s
[
j
/
2
][
2
*
(
j
%
2
)
+
0
]);
write
(
wr
+
(
4
*
c_sh_stride
)
*
8
+
0
,
frag_c
[
i
][
j
][
0
][
2
],
frag_c
[
i
][
j
][
0
][
3
],
frag_s
[
j
/
2
][
2
*
(
j
%
2
)
+
0
]);
write
(
wr
+
(
4
*
c_sh_stride
)
*
0
+
4
,
frag_c
[
i
][
j
][
1
][
0
],
frag_c
[
i
][
j
][
1
][
1
],
frag_s
[
j
/
2
][
2
*
(
j
%
2
)
+
1
]);
write
(
wr
+
(
4
*
c_sh_stride
)
*
8
+
4
,
frag_c
[
i
][
j
][
1
][
2
],
frag_c
[
i
][
j
][
1
][
3
],
frag_s
[
j
/
2
][
2
*
(
j
%
2
)
+
1
]);
}
c_sh_wr
+=
16
*
(
4
*
c_sh_stride
);
}
}
__syncthreads
();
#pragma unroll
for
(
int
i
=
0
;
i
<
ceildiv
(
16
*
thread_m_blocks
,
threads
/
(
2
*
thread_n_blocks
));
i
++
)
{
if
(
c_gl_wr
<
c_gl_wr_end
)
{
C
[
c_gl_wr
]
=
sh
[
c_sh_rd
];
c_gl_wr
+=
c_gl_wr_delta
;
c_sh_rd
+=
c_sh_rd_delta
;
}
}
};
// Start global fetch and register load pipelines.
auto
start_pipes
=
[
&
]()
{
#pragma unroll
for
(
int
i
=
0
;
i
<
stages
-
1
;
i
++
)
fetch_to_shared
(
i
,
i
,
i
<
slice_iters
);
zero_accums
();
wait_for_stage
();
fetch_to_registers
(
0
,
0
);
a_gl_rd
+=
a_gl_rd_delta_o
*
(
stages
-
1
);
};
start_pipes
();
// Main loop.
while
(
slice_iters
)
{
// We unroll over both the global fetch and the register load pipeline to
// ensure all shared memory accesses are static. Note that both pipelines have
// even length meaning that the next iteration will always start at index 0.
#pragma unroll
for
(
int
pipe
=
0
;
pipe
<
stages
;)
{
#pragma unroll
for
(
int
k
=
0
;
k
<
b_sh_wr_iters
;
k
++
)
{
fetch_to_registers
(
k
+
1
,
pipe
%
stages
);
if
(
k
==
b_sh_wr_iters
-
2
)
{
fetch_to_shared
((
pipe
+
stages
-
1
)
%
stages
,
pipe
,
slice_iters
>=
stages
);
pipe
++
;
wait_for_stage
();
}
matmul
(
k
);
}
slice_iters
--
;
if
(
slice_iters
==
0
)
break
;
}
a_gl_rd
+=
a_gl_rd_delta_o
*
stages
;
// Process results and, if necessary, proceed to the next column slice.
// While this pattern may not be the most readable, other ways of writing
// the loop seemed to noticeably worse performance after compilation.
if
(
slice_iters
==
0
)
{
cp_async_wait
<
0
>
();
bool
last
=
slice_idx
==
slice_count
-
1
;
// For per-column scales, we only fetch them here in the final step before
// write-out
if
(
group_blocks
==
-
1
&&
last
)
{
if
(
s_sh_wr_pred
)
cp_async4
(
&
sh_s
[
s_sh_wr
],
&
s
[
s_gl_rd
]);
cp_async_fence
();
}
thread_block_reduce
();
if
(
group_blocks
==
-
1
&&
last
)
{
cp_async_wait
<
0
>
();
__syncthreads
();
if
(
threadIdx
.
x
/
32
<
thread_n_blocks
/
4
)
{
reinterpret_cast
<
int4
*>
(
&
frag_s
)[
0
]
=
sh_s
[
s_sh_rd
+
0
];
reinterpret_cast
<
int4
*>
(
&
frag_s
)[
1
]
=
sh_s
[
s_sh_rd
+
4
];
}
}
if
(
slice_count
>
1
)
{
// only globally reduce if there is more than one
// block in a slice
barrier_acquire
(
&
locks
[
slice_col
],
slice_idx
);
global_reduce
(
slice_idx
==
0
,
last
);
barrier_release
(
&
locks
[
slice_col
],
last
);
}
if
(
last
)
// only the last block in a slice actually writes the result
write_result
();
slice_row
=
0
;
slice_col_par
++
;
slice_col
++
;
init_slice
();
if
(
slice_iters
)
{
a_gl_rd
=
a_gl_stride
*
(
threadIdx
.
x
/
a_gl_rd_delta_o
)
+
(
threadIdx
.
x
%
a_gl_rd_delta_o
);
#pragma unroll
for
(
int
i
=
0
;
i
<
b_sh_wr_iters
;
i
++
)
B_ptr
[
i
]
+=
b_sh_stride
-
b_gl_rd_delta_o
*
k_tiles
;
if
(
slice_col
==
0
)
{
#pragma unroll
for
(
int
i
=
0
;
i
<
b_sh_wr_iters
;
i
++
)
B_ptr
[
i
]
-=
b_gl_stride
;
}
s_gl_rd
=
s_sh_stride
*
slice_col
+
threadIdx
.
x
;
start_pipes
();
}
}
}
}
#else
template
<
const
int
threads
,
// number of threads in a threadblock
const
int
thread_m_blocks
,
// number of 16x16 blocks in the m
// dimension (batchsize) of the
// threadblock
const
int
thread_n_blocks
,
// same for n dimension (output)
const
int
thread_k_blocks
,
// same for k dimension (reduction)
const
int
stages
,
// number of stages for the async global->shared
// fetch pipeline
const
int
group_blocks
=
-
1
// number of consecutive 16x16 blocks
// with a separate quantization scale
>
__global__
void
Marlin
(
const
int4
*
__restrict__
A
,
// fp16 input matrix of shape mxk
const
int4
*
__restrict__
B
,
// 4bit quantized weight matrix of shape kxn
int4
*
__restrict__
C
,
// fp16 output buffer of shape mxn
const
int4
*
__restrict__
s
,
// fp16 quantization scales of shape
// (k/groupsize)xn
int
prob_m
,
// batch dimension m
int
prob_n
,
// output dimension n
int
prob_k
,
// reduction dimension k
int
*
locks
// extra global storage for barrier synchronization
)
{
// Marlin is not implemented yet for SM < 8.0
assert
(
false
);
return
;
}
#endif
// 8 warps are a good choice since every SM has 4 schedulers and having more
// than 1 warp per schedule allows some more latency hiding. At the same time,
// we want relatively few warps to have many registers per warp and small tiles.
const
int
USER_THREADS
=
256
;
// Note: This is only used with user-provided thread_k/n
const
int
STAGES
=
4
;
// 4 pipeline stages fit into shared memory
const
int
SHARED_MEM
=
96
*
1024
;
// max shared memory on compute capability 8.6 (< 8.0)
static
constexpr
int
min_thread_n
=
64
;
static
constexpr
int
min_thread_k
=
64
;
static
constexpr
int
tile_size
=
16
;
static
constexpr
int
max_par
=
16
;
static
constexpr
int
pack_factor_4bit
=
8
;
// We have 8 4-bit vals inside a 32 bit
#define __CALL_IF(THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \
GROUP_BLOCKS, NUM_THREADS) \
else if (thread_m_blocks == THREAD_M_BLOCKS && \
thread_n_blocks == THREAD_N_BLOCKS && \
thread_k_blocks == THREAD_K_BLOCKS && \
group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS) { \
cudaFuncSetAttribute(Marlin<NUM_THREADS, THREAD_M_BLOCKS, THREAD_N_BLOCKS, \
THREAD_K_BLOCKS, STAGES, GROUP_BLOCKS>, \
cudaFuncAttributeMaxDynamicSharedMemorySize, \
SHARED_MEM); \
Marlin<NUM_THREADS, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \
STAGES, GROUP_BLOCKS><<<blocks, NUM_THREADS, SHARED_MEM, stream>>>( \
A_ptr, B_ptr, C_ptr, s_ptr, prob_m, prob_n, prob_k, locks); \
}
typedef
struct
{
int
thread_k
;
int
thread_n
;
int
num_threads
;
}
thread_config_t
;
thread_config_t
small_batch_thread_configs
[]
=
{
// Ordered by priority
// thread_k, thread_n, num_threads
{
128
,
128
,
256
},
// Default
{
128
,
64
,
128
},
// Reduce N 2X, same K
{
64
,
256
,
256
},
// Reduce K 2X, increase N 2X
{
64
,
128
,
128
},
// Reduce K 2X, same N
};
thread_config_t
large_batch_thread_configs
[]
=
{
// Ordered by priority
// thread_k, thread_n, num_threads
{
64
,
256
,
256
},
// Default
{
128
,
128
,
256
},
// Reduce N 2X, increase K 2X
{
64
,
128
,
128
},
// Reduce N 2X, same K
{
128
,
64
,
128
},
// Reduce N 4X, increase K 2X
};
bool
is_valid_config
(
thread_config_t
const
&
th_config
,
int
prob_m
,
int
prob_n
,
int
prob_k
)
{
// Sanity
if
(
th_config
.
thread_k
==
-
1
||
th_config
.
thread_n
==
-
1
||
th_config
.
num_threads
==
-
1
)
{
return
false
;
}
// Verify K/N are divisible by thread K/N
if
(
prob_k
%
th_config
.
thread_k
!=
0
||
prob_n
%
th_config
.
thread_n
!=
0
)
{
return
false
;
}
// thread_k can be only 128 or 64 (because it must be less than groupsize
// which is 128)
if
(
th_config
.
thread_k
!=
128
&&
th_config
.
thread_k
!=
64
)
{
return
false
;
}
// Verify min for thread K/N
if
(
th_config
.
thread_n
<
min_thread_n
||
th_config
.
thread_k
<
min_thread_k
)
{
return
false
;
}
// num_threads must be at least 128 (= 4 warps)
if
(
th_config
.
num_threads
<
128
)
{
return
false
;
}
return
true
;
}
thread_config_t
determine_thread_config
(
int
prob_m
,
int
prob_n
,
int
prob_k
)
{
if
(
prob_m
<=
16
)
{
for
(
auto
th_config
:
small_batch_thread_configs
)
{
if
(
is_valid_config
(
th_config
,
prob_m
,
prob_n
,
prob_k
))
{
return
th_config
;
}
}
}
else
{
for
(
auto
th_config
:
large_batch_thread_configs
)
{
if
(
is_valid_config
(
th_config
,
prob_m
,
prob_n
,
prob_k
))
{
return
th_config
;
}
}
}
return
thread_config_t
{
-
1
,
-
1
,
-
1
};
}
#define CALL_IF(N_BLOCKS, K_BLOCKS, NUM_THREADS) \
__CALL_IF(1, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \
__CALL_IF(1, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \
__CALL_IF(1, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \
__CALL_IF(1, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \
__CALL_IF(2, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \
__CALL_IF(2, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \
__CALL_IF(3, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \
__CALL_IF(3, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \
__CALL_IF(4, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \
__CALL_IF(4, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS)
void
marlin_cuda
(
const
void
*
A
,
const
void
*
B
,
void
*
C
,
void
*
s
,
int
prob_m
,
int
prob_n
,
int
prob_k
,
void
*
workspace
,
int
groupsize
=
-
1
,
int
dev
=
0
,
cudaStream_t
stream
=
0
,
int
thread_k
=
-
1
,
int
thread_n
=
-
1
,
int
sms
=
-
1
,
int
max_par
=
16
)
{
int
tot_m
=
prob_m
;
int
tot_m_blocks
=
ceildiv
(
tot_m
,
16
);
int
pad
=
16
*
tot_m_blocks
-
tot_m
;
if
(
sms
==
-
1
)
cudaDeviceGetAttribute
(
&
sms
,
cudaDevAttrMultiProcessorCount
,
dev
);
// Set thread config
thread_config_t
th_config
;
if
(
thread_k
!=
-
1
&&
thread_n
!=
-
1
)
{
// User-defined config
th_config
=
thread_config_t
{
thread_k
,
thread_n
,
USER_THREADS
};
}
else
{
// Auto config
th_config
=
determine_thread_config
(
prob_m
,
prob_n
,
prob_k
);
}
if
(
!
is_valid_config
(
th_config
,
prob_m
,
prob_n
,
prob_k
))
{
throw
std
::
runtime_error
(
"Invalid thread config: thread_k = "
+
str
(
th_config
.
thread_k
)
+
", thread_n = "
+
str
(
th_config
.
thread_n
)
+
", num_threads = "
+
str
(
th_config
.
num_threads
)
+
" for MKN = ["
+
str
(
prob_m
)
+
", "
+
str
(
prob_k
)
+
", "
+
str
(
prob_n
)
+
"]"
);
}
// Uncomment for debug
// std::cout << "Using thread_config: thread_k = " + str(th_config.thread_k) +
// ", thread_n = " + str(th_config.thread_n) +
// ", num_threads = " + str(th_config.num_threads) + " for
// MKN = [" + str(prob_m) +
// ", " + str(prob_k) + ", " + str(prob_n) + "]\n";
int
num_threads
=
th_config
.
num_threads
;
thread_k
=
th_config
.
thread_k
;
thread_n
=
th_config
.
thread_n
;
int
thread_k_blocks
=
thread_k
/
16
;
int
thread_n_blocks
=
thread_n
/
16
;
int
group_blocks
=
(
groupsize
==
-
1
)
?
-
1
:
groupsize
/
16
;
int
blocks
=
sms
;
if
(
prob_m
==
0
||
prob_n
==
0
||
prob_k
==
0
)
{
return
;
}
TORCH_CHECK
(
prob_n
%
thread_n
==
0
,
"prob_n = "
,
prob_n
,
" is not divisible by thread_n = "
,
thread_n
);
TORCH_CHECK
(
prob_k
%
thread_k
==
0
,
"prob_k = "
,
prob_k
,
" is not divisible by thread_k = "
,
thread_k
);
if
(
group_blocks
!=
-
1
)
{
TORCH_CHECK
(
prob_k
%
group_blocks
==
0
,
"prob_k = "
,
prob_k
,
" is not divisible by group_blocks = "
,
group_blocks
);
}
const
int4
*
A_ptr
=
(
const
int4
*
)
A
;
const
int4
*
B_ptr
=
(
const
int4
*
)
B
;
int4
*
C_ptr
=
(
int4
*
)
C
;
const
int4
*
s_ptr
=
(
const
int4
*
)
s
;
int
*
locks
=
(
int
*
)
workspace
;
for
(
int
i
=
0
;
i
<
tot_m_blocks
;
i
+=
4
)
{
int
thread_m_blocks
=
tot_m_blocks
-
i
;
prob_m
=
tot_m
-
16
*
i
;
int
par
=
1
;
if
(
thread_m_blocks
>
4
)
{
// Note that parallel > 1 currently only works for inputs without any
// padding
par
=
(
16
*
thread_m_blocks
-
pad
)
/
64
;
if
(
par
>
max_par
)
par
=
max_par
;
prob_m
=
64
*
par
;
i
+=
4
*
(
par
-
1
);
thread_m_blocks
=
4
;
}
// For compilation speed, we only define the kernel configurations that have
// seemed useful (in terms of performance) in our testing, however many more
// are, in principle, possible.
if
(
false
)
{
}
CALL_IF
(
8
,
8
,
256
)
CALL_IF
(
16
,
4
,
256
)
CALL_IF
(
8
,
4
,
128
)
CALL_IF
(
4
,
8
,
128
)
else
{
throw
std
::
runtime_error
(
"Unsupported shapes: MKN = ["
+
str
(
prob_m
)
+
", "
+
str
(
prob_k
)
+
", "
+
str
(
prob_n
)
+
"]"
+
", groupsize = "
+
str
(
groupsize
)
+
", thread_m_blocks = "
+
str
(
thread_m_blocks
)
+
", thread_n_blocks = "
+
str
(
thread_n_blocks
)
+
", thread_k_blocks = "
+
str
(
thread_k_blocks
));
}
A_ptr
+=
16
*
thread_m_blocks
*
(
prob_k
/
8
)
*
par
;
C_ptr
+=
16
*
thread_m_blocks
*
(
prob_n
/
8
)
*
par
;
}
}
}
// namespace marlin_dense
torch
::
Tensor
marlin_gemm
(
torch
::
Tensor
&
a
,
torch
::
Tensor
&
b_q_weight
,
torch
::
Tensor
&
b_scales
,
torch
::
Tensor
&
workspace
,
int64_t
size_m
,
int64_t
size_n
,
int64_t
size_k
)
{
// Verify M
TORCH_CHECK
(
size_m
==
a
.
size
(
0
),
"Shape mismatch: a.size(0) = "
+
str
(
a
.
size
(
0
))
+
", size_m = "
+
str
(
size_m
));
// Verify K
TORCH_CHECK
(
size_k
==
a
.
size
(
1
),
"Shape mismatch: a.size(1) = "
+
str
(
a
.
size
(
1
))
+
", size_k = "
+
str
(
size_k
));
TORCH_CHECK
(
size_k
%
marlin_dense
::
tile_size
==
0
,
"size_k = "
+
str
(
size_k
)
+
" is not divisible by tile_size = "
+
str
(
marlin_dense
::
tile_size
));
TORCH_CHECK
((
size_k
/
marlin_dense
::
tile_size
)
==
b_q_weight
.
size
(
0
),
"Shape mismatch: b_q_weight.size(0) = "
+
str
(
b_q_weight
.
size
(
0
))
+
", size_k = "
+
str
(
size_k
)
+
", tile_size = "
+
str
(
marlin_dense
::
tile_size
));
// Verify N
TORCH_CHECK
(
b_scales
.
size
(
1
)
==
size_n
,
"b_scales.size(1) = "
+
str
(
b_scales
.
size
(
1
))
+
", size_n = "
+
str
(
size_n
));
TORCH_CHECK
(
b_q_weight
.
size
(
1
)
%
marlin_dense
::
tile_size
==
0
,
"b_q_weight.size(1) = "
+
str
(
b_q_weight
.
size
(
1
))
+
" is not divisible by tile_size = "
+
str
(
marlin_dense
::
tile_size
));
int
actual_size_n
=
(
b_q_weight
.
size
(
1
)
/
marlin_dense
::
tile_size
)
*
marlin_dense
::
pack_factor_4bit
;
TORCH_CHECK
(
size_n
==
actual_size_n
,
"size_n = "
+
str
(
size_n
)
+
", actual_size_n = "
+
str
(
actual_size_n
));
// Verify A device and strides
TORCH_CHECK
(
a
.
device
().
is_cuda
(),
"A is not on GPU"
);
TORCH_CHECK
(
a
.
is_contiguous
(),
"A is not contiguous"
);
// Verify B device and strides
TORCH_CHECK
(
b_q_weight
.
device
().
is_cuda
(),
"b_q_weight is not on GPU"
);
TORCH_CHECK
(
b_q_weight
.
is_contiguous
(),
"b_q_weight is not contiguous"
);
// Verify scales device and strides
TORCH_CHECK
(
b_scales
.
device
().
is_cuda
(),
"b_scales is not on GPU"
);
TORCH_CHECK
(
b_scales
.
is_contiguous
(),
"b_scales is not contiguous"
);
// Alloc C matrix
const
at
::
cuda
::
OptionalCUDAGuard
device_guard
(
device_of
(
a
));
auto
options
=
torch
::
TensorOptions
().
dtype
(
a
.
dtype
()).
device
(
a
.
device
());
torch
::
Tensor
c
=
torch
::
empty
({
size_m
,
size_n
},
options
);
// thread_k: `k` size of a thread_tile in `weights` (can usually be left as
// auto -1)
int
thread_k
=
-
1
;
// thread_n: `n` size of a thread_tile in `weights` (can usually be left as
// auto -1)
int
thread_n
=
-
1
;
// sms: number of SMs to use for the kernel (can usually be left as auto -1)
int
sms
=
-
1
;
// Detect groupsize
if
(
b_scales
.
size
(
0
)
!=
1
)
{
TORCH_CHECK
(
size_k
%
b_scales
.
size
(
0
)
==
0
,
"size_k = "
+
str
(
size_k
)
+
", is not divisible by b_scales.size(0) = "
+
str
(
b_scales
.
size
(
0
)));
}
int
groupsize
=
b_scales
.
size
(
0
)
==
1
?
-
1
:
size_k
/
b_scales
.
size
(
0
);
// Verify groupsize
TORCH_CHECK
(
groupsize
==
-
1
||
groupsize
==
128
,
"Unexpected groupsize = "
+
str
(
groupsize
));
// Verify workspace size
TORCH_CHECK
(
size_n
%
marlin_dense
::
min_thread_n
==
0
,
"size_n = "
+
str
(
size_n
)
+
", is not divisible by min_thread_n = "
+
str
(
marlin_dense
::
min_thread_n
));
int
min_workspace_size
=
(
size_n
/
marlin_dense
::
min_thread_n
)
*
marlin_dense
::
max_par
;
TORCH_CHECK
(
workspace
.
numel
()
>=
min_workspace_size
,
"workspace.numel = "
+
str
(
workspace
.
numel
())
+
" is below min_workspace_size = "
+
str
(
min_workspace_size
));
int
dev
=
a
.
get_device
();
marlin_dense
::
marlin_cuda
(
a
.
data_ptr
(),
b_q_weight
.
data_ptr
(),
c
.
data_ptr
(),
b_scales
.
data_ptr
(),
size_m
,
size_n
,
size_k
,
workspace
.
data_ptr
(),
groupsize
,
dev
,
at
::
cuda
::
getCurrentCUDAStream
(
dev
),
thread_k
,
thread_n
,
sms
,
marlin_dense
::
max_par
);
return
c
;
}
TORCH_LIBRARY_IMPL_EXPAND
(
TORCH_EXTENSION_NAME
,
CUDA
,
m
)
{
m
.
impl
(
"marlin_gemm"
,
&
marlin_gemm
);
}
csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu
deleted
100644 → 0
View file @
ebe56a00
/*
* Adapted from
* https://github.com/IST-DASLab/marlin/blob/master/marlin/marlin_cuda_kernel.cu
* https://github.com/IST-DASLab/marlin/blob/master/marlin/marlin_cuda.cpp
* Modified by HandH1998
* Copyright (C) 2024 HandH1998
* Copyright (C) Marlin.2024 Elias Frantar
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <iostream>
#include "../dense/common/base.h"
#include "core/registration.h"
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
#include "../dense/common/mem.h"
#endif
template
<
typename
T
>
inline
std
::
string
str
(
T
x
)
{
return
std
::
to_string
(
x
);
}
namespace
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
using
I4
=
Vec
<
int
,
4
>
;
// Matrix fragments for tensor core instructions; their precise layout is
// documented here:
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-integer-type
using
FragA
=
Vec
<
uint32_t
,
2
>
;
using
FragB
=
Vec
<
uint32_t
,
1
>
;
using
FragC
=
Vec
<
int
,
4
>
;
using
FragS_GROUP
=
Vec
<
half2
,
1
>
;
// weight per-group quantization scales
using
FragS_CHANNEL
=
Vec
<
float
,
2
>
;
// weight per-channel quantization scales or activaton
// per-token quantization scales
// NOTE(HandH1998): cp.async.cg only support BYTES = 16, however,
// cp.async.ca can support BYTES = 4, 8, 16;
// as s_tok's shape is equal to prob_m, we need set s_tok to float type,
// and cp_size = 1 float, i.e., 4 BYTES
// Asynchronous global->shared copy for activation quantizaton scales s_tok
__device__
inline
void
cp_async1
(
void
*
smem_ptr
,
const
void
*
glob_ptr
)
{
const
int
BYTES
=
4
;
uint32_t
smem
=
static_cast
<
uint32_t
>
(
__cvta_generic_to_shared
(
smem_ptr
));
asm
volatile
(
"{
\n
"
" cp.async.ca.shared.global [%0], [%1], %2;
\n
"
"}
\n
"
::
"r"
(
smem
),
"l"
(
glob_ptr
),
"n"
(
BYTES
));
}
// m16n8k16 tensor core mma instruction with int8 inputs and int32
// output/accumulation.
__device__
inline
void
mma
(
const
FragA
&
a_frag
,
const
FragB
&
frag_b
,
FragC
&
frag_c
)
{
const
uint32_t
*
a
=
reinterpret_cast
<
const
uint32_t
*>
(
&
a_frag
);
const
uint32_t
*
b
=
reinterpret_cast
<
const
uint32_t
*>
(
&
frag_b
);
int
*
c
=
reinterpret_cast
<
int
*>
(
&
frag_c
);
asm
volatile
(
"mma.sync.aligned.m16n8k16.row.col.satfinite.s32.s8.s8.s32 "
"{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};
\n
"
:
"=r"
(
c
[
0
]),
"=r"
(
c
[
1
]),
"=r"
(
c
[
2
]),
"=r"
(
c
[
3
])
:
"r"
(
a
[
0
]),
"r"
(
a
[
1
]),
"r"
(
b
[
0
]),
"r"
(
c
[
0
]),
"r"
(
c
[
1
]),
"r"
(
c
[
2
]),
"r"
(
c
[
3
]));
}
// Instruction for loading a full 16x16 matrix fragment of operand A from shared
// memory, directly in int8 tensor core layout.
__device__
inline
void
ldsm4
(
FragA
&
frag_a
,
const
void
*
smem_ptr
)
{
uint32_t
*
a
=
reinterpret_cast
<
uint32_t
*>
(
&
frag_a
);
uint32_t
smem
=
static_cast
<
uint32_t
>
(
__cvta_generic_to_shared
(
smem_ptr
));
asm
volatile
(
"ldmatrix.sync.aligned.m8n8.x2.shared.b16 {%0,%1}, [%2];
\n
"
:
"=r"
(
a
[
0
]),
"=r"
(
a
[
1
])
:
"r"
(
smem
));
}
inline
__device__
half2
float2_to_half2
(
float2
f
)
{
uint32_t
res
;
// NOTE(HandH1998): h0,h1 should be uint16_t, not half
uint16_t
h0
,
h1
;
asm
volatile
(
"cvt.rn.f16.f32 %0, %1;
\n
"
:
"=h"
(
h0
)
:
"f"
(
f
.
x
));
asm
volatile
(
"cvt.rn.f16.f32 %0, %1;
\n
"
:
"=h"
(
h1
)
:
"f"
(
f
.
y
));
asm
volatile
(
"mov.b32 %0, {%1, %2};
\n
"
:
"=r"
(
res
)
:
"h"
(
h0
),
"h"
(
h1
));
return
reinterpret_cast
<
half2
&>
(
res
);
}
inline
__device__
float
int32_to_float
(
int
h
)
{
float
res
;
asm
volatile
(
"cvt.rn.f32.s32 %0, %1;
\n
"
:
"=f"
(
res
)
:
"r"
(
h
));
return
res
;
}
// Lookup-table based 3-input logical operation; explicitly used for
// dequantization as the compiler does not seem to automatically recognize it in
// all cases.
template
<
int
lut
>
__device__
inline
int
lop3
(
int
a
,
int
b
,
int
c
)
{
int
res
;
asm
volatile
(
"lop3.b32 %0, %1, %2, %3, %4;
\n
"
:
"=r"
(
res
)
:
"r"
(
a
),
"r"
(
b
),
"r"
(
c
),
"n"
(
lut
));
return
res
;
}
// Efficiently dequantize an int32 value into a full B-fragment of 4 int8 values
// for weight per channel dequant.
__device__
inline
FragB
dequant_per_channel
(
int
q
)
{
static
constexpr
int
MASK
=
0xf0f0f0f0
;
FragB
frag_b
;
frag_b
[
0
]
=
(
q
&
MASK
);
return
frag_b
;
}
// Efficiently dequantize an int32 value into a full B-fragment of 4 int8 values
// for weight per group dequant.
__device__
inline
FragB
dequant_per_group
(
int
q
,
FragS_GROUP
&
frag_s
,
int
i
)
{
static
constexpr
uint32_t
LO
=
0x000f000f
;
static
constexpr
uint32_t
HI
=
0x00f000f0
;
static
constexpr
uint32_t
EX
=
0x64006400
;
// Guarantee that the `(a & b) | c` operations are LOP3s.
uint32_t
t0
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
LO
,
EX
);
uint32_t
t1
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
HI
,
EX
);
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
// directly into `SUB` and `ADD`.
static
constexpr
uint32_t
SUB
=
0x64086408
;
static
constexpr
uint32_t
MUL
=
0x2c002c00
;
static
constexpr
uint32_t
ADD
=
0xd480d480
;
*
reinterpret_cast
<
half2
*>
(
&
t0
)
=
__hsub2
(
*
reinterpret_cast
<
half2
*>
(
&
t0
),
*
reinterpret_cast
<
const
half2
*>
(
&
SUB
));
*
reinterpret_cast
<
half2
*>
(
&
t1
)
=
__hfma2
(
*
reinterpret_cast
<
half2
*>
(
&
t1
),
*
reinterpret_cast
<
const
half2
*>
(
&
MUL
),
*
reinterpret_cast
<
const
half2
*>
(
&
ADD
));
uint16_t
s
=
reinterpret_cast
<
uint16_t
*>
(
&
frag_s
)[
i
];
uint32_t
double_s
;
// pack 2xfp16 to half2
asm
volatile
(
"mov.b32 %0, {%1, %2};
\n
"
:
"=r"
(
double_s
)
:
"h"
(
s
),
"h"
(
s
));
// dequant and convert 4 half to 4 uint8 (be placed at the low 8 bits of 4
// half, respectively)
static
constexpr
uint32_t
MAGIC_NUM
=
0x64806480
;
*
reinterpret_cast
<
half2
*>
(
&
t0
)
=
__hfma2
(
*
reinterpret_cast
<
half2
*>
(
&
t0
),
*
reinterpret_cast
<
half2
*>
(
&
double_s
),
*
reinterpret_cast
<
const
half2
*>
(
&
MAGIC_NUM
));
*
reinterpret_cast
<
half2
*>
(
&
t1
)
=
__hfma2
(
*
reinterpret_cast
<
half2
*>
(
&
t1
),
*
reinterpret_cast
<
half2
*>
(
&
double_s
),
*
reinterpret_cast
<
const
half2
*>
(
&
MAGIC_NUM
));
// take out the 4 uint8 from 4 half, then convert them to 4 int8 and pack 4
// int8 into 1 uint32
FragB
frag_b
;
uint32_t
uint8s
;
static
constexpr
uint32_t
MASK_0246
=
0x6420
;
static
constexpr
uint32_t
UINT8s_TO_INT8s_MASK
=
0x80808080
;
asm
volatile
(
"prmt.b32 %0,%1,%2,%3;
\n
"
:
"=r"
(
uint8s
)
:
"r"
(
t0
),
"r"
(
t1
),
"n"
(
MASK_0246
));
frag_b
[
0
]
=
(
uint8s
^
UINT8s_TO_INT8s_MASK
);
return
frag_b
;
}
template
<
const
int
threads
,
// number of threads in a threadblock
const
int
thread_m_blocks
,
// number of 16x16 blocks in the m
// dimension (batchsize) of the
// threadblock
const
int
thread_n_blocks
,
// same for n dimension (output)
const
int
thread_k_blocks
,
// same for k dimension (reduction)
const
int
stages
,
// number of stages for the async global->shared
// fetch pipeline
const
int
group_blocks
=
-
1
// number of consecutive 16x16 blocks
// with a separate quantization scale
>
__global__
void
Marlin
(
const
int4
*
__restrict__
A
,
// int8 input matrix of shape mxk
const
int4
*
__restrict__
B
,
// 4bit quantized weight matrix of shape kxn
int4
*
__restrict__
C
,
// int32 global_reduce buffer of shape
// (max_par*16*4)xn, as int8 tensor core's output is
// int32 dtype
int4
*
__restrict__
D
,
// fp16 output buffer of shape mxn
const
float
*
__restrict__
s_tok
,
// fp32 activation per-token quantization
// scales of shape mx1
const
int4
*
__restrict__
s_ch
,
// fp32 weight per-channel quantization
// scales of shape 1xn
const
int4
*
__restrict__
s_group
,
// fp16 weight per-group quantization
// scales of shape (k/groupsize)xn, when
// group_blocks=-1, it should be nullptr
int
prob_m
,
// batch dimension m
int
prob_n
,
// output dimension n
int
prob_k
,
// reduction dimension k
int
*
locks
// extra global storage for barrier synchronization
)
{
// Each threadblock processes one "stripe" of the B matrix with (roughly) the
// same size, which might involve multiple column "slices" (of width 16 *
// `thread_n_blocks`). Stripes are defined as shown in the 3x3 matrix 5 SM
// example:
// 0 1 3
// 0 2 3
// 1 2 4
// While this kind of partitioning makes things somewhat more complicated, it
// ensures good utilization of all SMs for many kinds of shape and GPU
// configurations, while requiring as few slow global cross-threadblock
// reductions as possible.
// For larger GEMMs we run multiple batchsize 64 versions in parallel for a
// better partitioning with less reductions
int
parallel
=
1
;
if
(
prob_m
>
16
*
thread_m_blocks
)
{
parallel
=
prob_m
/
(
16
*
thread_m_blocks
);
prob_m
=
16
*
thread_m_blocks
;
}
int
k_tiles
=
prob_k
/
16
/
thread_k_blocks
;
int
n_tiles
=
prob_n
/
16
/
thread_n_blocks
;
int
iters
=
ceildiv
(
k_tiles
*
n_tiles
*
parallel
,
gridDim
.
x
);
// Ensure that the number of tiles in each stripe is a multiple of the
// groupsize; this avoids an annoying special case where a stripe starts in
// the middle of group.
if
constexpr
(
group_blocks
!=
-
1
)
iters
=
(
group_blocks
/
thread_k_blocks
)
*
ceildiv
(
iters
,
(
group_blocks
/
thread_k_blocks
));
int
slice_row
=
(
iters
*
blockIdx
.
x
)
%
k_tiles
;
int
slice_col_par
=
(
iters
*
blockIdx
.
x
)
/
k_tiles
;
int
slice_col
=
slice_col_par
;
int
slice_iters
;
// number of threadblock tiles in the current slice
int
slice_count
=
0
;
// total number of active threadblocks in the current slice
int
slice_idx
;
// index of threadblock in current slice; numbered bottom to
// top
// We can easily implement parallel problem execution by just remapping
// indices and advancing global pointers
if
(
slice_col_par
>=
n_tiles
)
{
A
+=
(
slice_col_par
/
n_tiles
)
*
16
*
thread_m_blocks
*
prob_k
/
16
;
C
+=
(
slice_col_par
/
n_tiles
)
*
16
*
thread_m_blocks
*
prob_n
/
4
;
D
+=
(
slice_col_par
/
n_tiles
)
*
16
*
thread_m_blocks
*
prob_n
/
8
;
s_tok
+=
(
slice_col_par
/
n_tiles
)
*
16
*
thread_m_blocks
;
locks
+=
(
slice_col_par
/
n_tiles
)
*
n_tiles
;
slice_col
=
slice_col_par
%
n_tiles
;
}
// Compute all information about the current slice which is required for
// synchronization.
auto
init_slice
=
[
&
]()
{
slice_iters
=
iters
*
(
blockIdx
.
x
+
1
)
-
(
k_tiles
*
slice_col_par
+
slice_row
);
if
(
slice_iters
<
0
||
slice_col_par
>=
n_tiles
*
parallel
)
slice_iters
=
0
;
if
(
slice_iters
==
0
)
return
;
if
(
slice_row
+
slice_iters
>
k_tiles
)
slice_iters
=
k_tiles
-
slice_row
;
slice_count
=
1
;
slice_idx
=
0
;
int
col_first
=
iters
*
ceildiv
(
k_tiles
*
slice_col_par
,
iters
);
if
(
col_first
<=
k_tiles
*
(
slice_col_par
+
1
))
{
int
col_off
=
col_first
-
k_tiles
*
slice_col_par
;
slice_count
=
ceildiv
(
k_tiles
-
col_off
,
iters
);
if
(
col_off
>
0
)
slice_count
++
;
int
delta_first
=
iters
*
blockIdx
.
x
-
col_first
;
if
(
delta_first
<
0
||
(
col_off
==
0
&&
delta_first
==
0
))
slice_idx
=
slice_count
-
1
;
else
{
slice_idx
=
slice_count
-
1
-
delta_first
/
iters
;
if
(
col_off
>
0
)
slice_idx
--
;
}
}
if
(
slice_col
==
n_tiles
)
{
A
+=
16
*
thread_m_blocks
*
prob_k
/
16
;
C
+=
16
*
thread_m_blocks
*
prob_n
/
4
;
D
+=
16
*
thread_m_blocks
*
prob_n
/
8
;
s_tok
+=
16
*
thread_m_blocks
;
locks
+=
n_tiles
;
slice_col
=
0
;
}
};
init_slice
();
int
a_gl_stride
=
prob_k
/
16
;
// stride of the A matrix in global memory
// We typically use `constexpr` to indicate that this value is a compile-time
// constant
constexpr
int
a_sh_stride
=
16
*
thread_k_blocks
/
16
;
// stride of an A matrix tile in shared memory
constexpr
int
a_gl_rd_delta_o
=
16
*
thread_k_blocks
/
16
;
// delta between subsequent A tiles in global memory
int
a_gl_rd_delta_i
=
a_gl_stride
*
(
threads
/
a_gl_rd_delta_o
);
// between subsequent accesses within a tile
constexpr
int
a_sh_wr_delta
=
a_sh_stride
*
(
threads
/
a_gl_rd_delta_o
);
// between shared memory writes
constexpr
int
a_sh_rd_delta_o
=
1
*
((
threads
/
32
)
/
(
thread_n_blocks
/
4
));
// between shared memory tile reads
constexpr
int
a_sh_rd_delta_i
=
a_sh_stride
*
16
;
// within a shared memory tile
constexpr
int
a_sh_stage
=
a_sh_stride
*
(
16
*
thread_m_blocks
);
// overall size of a tile
constexpr
int
a_sh_wr_iters
=
ceildiv
(
a_sh_stage
,
a_sh_wr_delta
);
// number of shared write iterations for a tile
int
b_gl_stride
=
16
*
prob_n
/
32
;
constexpr
int
b_sh_stride
=
32
*
thread_n_blocks
/
4
;
int
b_gl_rd_delta_o
=
b_gl_stride
*
thread_k_blocks
;
int
b_gl_rd_delta_i
=
b_gl_stride
*
(
threads
/
b_sh_stride
);
constexpr
int
b_sh_wr_delta
=
threads
;
constexpr
int
b_sh_rd_delta
=
threads
;
constexpr
int
b_sh_stage
=
b_sh_stride
*
thread_k_blocks
;
constexpr
int
b_sh_wr_iters
=
b_sh_stage
/
b_sh_wr_delta
;
constexpr
int
s_tok_sh_stride
=
16
*
thread_m_blocks
;
constexpr
int
s_ch_sh_stride
=
16
*
thread_n_blocks
/
4
;
int
s_group_gl_stride
=
prob_n
/
8
;
constexpr
int
s_group_sh_stride
=
16
*
thread_n_blocks
/
8
;
constexpr
int
s_group_sh_stage
=
s_group_sh_stride
;
int
s_group_gl_rd_delta
=
s_group_gl_stride
;
// Global A read index of current thread.
int
a_gl_rd
=
a_gl_stride
*
(
threadIdx
.
x
/
a_gl_rd_delta_o
)
+
(
threadIdx
.
x
%
a_gl_rd_delta_o
);
a_gl_rd
+=
a_gl_rd_delta_o
*
slice_row
;
// Shared write index of current thread.
int
a_sh_wr
=
a_sh_stride
*
(
threadIdx
.
x
/
a_gl_rd_delta_o
)
+
(
threadIdx
.
x
%
a_gl_rd_delta_o
);
// Shared read index.
// NOTE(HandH1998): int8 input a only need 16 threads to load 16x16 matrix
int
a_sh_rd
=
a_sh_stride
*
((
threadIdx
.
x
%
32
)
%
16
);
a_sh_rd
+=
1
*
((
threadIdx
.
x
/
32
)
/
(
thread_n_blocks
/
4
));
int
b_gl_rd
=
b_gl_stride
*
(
threadIdx
.
x
/
b_sh_stride
)
+
(
threadIdx
.
x
%
b_sh_stride
);
b_gl_rd
+=
b_sh_stride
*
slice_col
;
b_gl_rd
+=
b_gl_rd_delta_o
*
slice_row
;
auto
b_sh_wr
=
threadIdx
.
x
;
auto
b_sh_rd
=
threadIdx
.
x
;
auto
s_tok_gl_rd
=
threadIdx
.
x
;
// NOTE(HandH1998): activation scale s_tok need shuffle to [0, 8, 1, 9, 2, 10,
// 3, 11, 4, 12, 5, 13, 6, 14, 7, 15] for example, 0, 8 row scales serve for
// thread 0, 1, 2, 3. For more details, refer to mma operand A layout as
// s_tok's size is not fixed, we can not shuffle before inference we shuffle
// it when fetching s_tok from global memory to shared memory, that's why
// s_tok_sh_wr is like this
int
s_tok_sh_wr
=
(
threadIdx
.
x
/
16
)
*
16
+
(
threadIdx
.
x
%
8
)
*
2
+
(
threadIdx
.
x
%
16
)
/
8
;
int
s_tok_sh_rd
=
(
threadIdx
.
x
%
32
)
/
4
;
bool
s_tok_sh_wr_pred
=
threadIdx
.
x
<
prob_m
;
auto
s_ch_gl_rd
=
s_ch_sh_stride
*
slice_col
+
threadIdx
.
x
;
auto
s_ch_sh_wr
=
threadIdx
.
x
;
int
s_ch_sh_rd
=
16
*
((
threadIdx
.
x
/
32
)
%
(
thread_n_blocks
/
4
))
+
2
*
((
threadIdx
.
x
%
32
)
%
4
);
bool
s_ch_sh_wr_pred
=
threadIdx
.
x
<
s_ch_sh_stride
;
int
s_group_gl_rd
,
s_group_sh_wr
,
s_group_sh_rd
;
bool
s_group_sh_wr_pred
;
if
constexpr
(
group_blocks
!=
-
1
)
{
s_group_gl_rd
=
s_group_gl_stride
*
((
thread_k_blocks
*
slice_row
)
/
group_blocks
)
+
s_group_sh_stride
*
slice_col
+
threadIdx
.
x
;
s_group_sh_wr
=
threadIdx
.
x
;
// NOTE(HandH1998): s_group_sh_rd is related to mma output C
s_group_sh_rd
=
8
*
((
threadIdx
.
x
/
32
)
%
(
thread_n_blocks
/
4
))
+
(
threadIdx
.
x
%
32
)
/
4
;
s_group_sh_wr_pred
=
threadIdx
.
x
<
s_group_sh_stride
;
}
// Precompute which thread should not read memory in which iterations; this is
// needed if there are more threads than required for a certain tilesize or
// when the batchsize is not a multiple of 16.
bool
a_sh_wr_pred
[
a_sh_wr_iters
];
#pragma unroll
for
(
int
i
=
0
;
i
<
a_sh_wr_iters
;
i
++
)
a_sh_wr_pred
[
i
]
=
a_sh_wr_delta
*
i
+
a_sh_wr
<
a_sh_stride
*
prob_m
;
// To ensure that writing and reading A tiles to/from shared memory, the
// latter in fragment format, is fully bank conflict free, we need to use a
// rather fancy XOR-based layout. The key here is that neither reads nor
// writes of the 16-byte `int4` blocks of 8 consecutive threads involve the
// same shared memory banks. Further, it seems (based on NSight-Compute) that
// each warp must also write a consecutive memory segment?
auto
transform_a
=
[
&
](
int
i
)
{
int
row
=
i
/
a_gl_rd_delta_o
;
return
a_gl_rd_delta_o
*
row
+
(
i
%
a_gl_rd_delta_o
)
^
row
;
};
// Since the computation of this remapping is non-trivial and, due to our main
// loop unrolls, all shared memory accesses are static, we simply precompute
// both transformed reads and writes.
int
a_sh_wr_trans
[
a_sh_wr_iters
];
#pragma unroll
for
(
int
i
=
0
;
i
<
a_sh_wr_iters
;
i
++
)
a_sh_wr_trans
[
i
]
=
transform_a
(
a_sh_wr_delta
*
i
+
a_sh_wr
);
int
a_sh_rd_trans
[
b_sh_wr_iters
][
thread_m_blocks
];
#pragma unroll
for
(
int
i
=
0
;
i
<
b_sh_wr_iters
;
i
++
)
{
#pragma unroll
for
(
int
j
=
0
;
j
<
thread_m_blocks
;
j
++
)
a_sh_rd_trans
[
i
][
j
]
=
transform_a
(
a_sh_rd_delta_o
*
i
+
a_sh_rd_delta_i
*
j
+
a_sh_rd
);
}
// Since B-accesses have non-constant stride they have to be computed at
// runtime; we break dependencies between subsequent accesses with a tile by
// maintining multiple pointers (we have enough registers), a tiny
// optimization.
const
int4
*
B_ptr
[
b_sh_wr_iters
];
#pragma unroll
for
(
int
i
=
0
;
i
<
b_sh_wr_iters
;
i
++
)
B_ptr
[
i
]
=
B
+
b_gl_rd_delta_i
*
i
+
b_gl_rd
;
extern
__shared__
int4
sh
[];
// Shared memory storage for global fetch pipelines.
// NOTE(HandH1998): stages need >= 4, otherwise, sh_s_tok = sh + max(stages *
// a_sh_stage + stages * b_sh_stage, 4 * stages * a_sh_stage)
int4
*
sh_a
=
sh
;
int4
*
sh_b
=
sh_a
+
(
stages
*
a_sh_stage
);
int4
*
sh_s_tok
=
sh_b
+
(
stages
*
b_sh_stage
);
int4
*
sh_s_ch
=
sh_s_tok
+
s_tok_sh_stride
;
int4
*
sh_s_group
=
sh_s_ch
+
s_ch_sh_stride
;
// Register storage for double buffer of shared memory reads.
FragA
frag_a
[
2
][
thread_m_blocks
];
I4
frag_b_quant
[
2
];
FragC
frag_c
[
thread_m_blocks
][
4
][
2
];
FragS_GROUP
frag_s_group
[
2
][
4
];
FragS_CHANNEL
frag_s_tok
[
thread_m_blocks
];
FragS_CHANNEL
frag_s_ch
[
2
][
4
];
// Zero accumulators.
auto
zero_accums
=
[
&
]()
{
#pragma unroll
for
(
int
i
=
0
;
i
<
thread_m_blocks
*
4
*
2
*
4
;
i
++
)
reinterpret_cast
<
int
*>
(
frag_c
)[
i
]
=
0
;
};
// Asynchronously fetch the next A, B and s tile from global to the next
// shared memory pipeline location.
auto
fetch_to_shared
=
[
&
](
int
pipe
,
int
a_off
,
bool
pred
=
true
)
{
if
(
pred
)
{
int4
*
sh_a_stage
=
sh_a
+
a_sh_stage
*
pipe
;
#pragma unroll
for
(
int
i
=
0
;
i
<
a_sh_wr_iters
;
i
++
)
{
cp_async4_pred
(
&
sh_a_stage
[
a_sh_wr_trans
[
i
]],
&
A
[
a_gl_rd_delta_i
*
i
+
a_gl_rd
+
a_gl_rd_delta_o
*
a_off
],
a_sh_wr_pred
[
i
]);
}
int4
*
sh_b_stage
=
sh_b
+
b_sh_stage
*
pipe
;
#pragma unroll
for
(
int
i
=
0
;
i
<
b_sh_wr_iters
;
i
++
)
{
cp_async4
(
&
sh_b_stage
[
b_sh_wr_delta
*
i
+
b_sh_wr
],
B_ptr
[
i
]);
B_ptr
[
i
]
+=
b_gl_rd_delta_o
;
}
// Only fetch scales if this tile starts a new group
if
constexpr
(
group_blocks
!=
-
1
)
{
if
(
pipe
%
(
group_blocks
/
thread_k_blocks
)
==
0
)
{
int4
*
sh_s_group_stage
=
sh_s_group
+
s_group_sh_stage
*
pipe
;
if
(
s_group_sh_wr_pred
)
cp_async4
(
&
sh_s_group_stage
[
s_group_sh_wr
],
&
s_group
[
s_group_gl_rd
]);
s_group_gl_rd
+=
s_group_gl_rd_delta
;
}
}
}
// Insert a fence even when we are winding down the pipeline to ensure that
// waiting is also correct at this point.
cp_async_fence
();
};
// Wait until the next thread tile has been loaded to shared memory.
auto
wait_for_stage
=
[
&
]()
{
// We only have `stages - 2` active fetches since we are double buffering
// and can only issue the next fetch when it is guaranteed that the previous
// shared memory load is fully complete (as it may otherwise be
// overwritten).
cp_async_wait
<
stages
-
2
>
();
__syncthreads
();
};
// Load the next sub-tile from the current location in the shared memory pipe
// into the current register buffer.
auto
fetch_to_registers
=
[
&
](
int
k
,
int
pipe
)
{
// It may seem inefficient that we reload the groups for every sub-tile;
// however, this does not seem to be a significant bottleneck, while some
// theoretically better attempts have lead to bad instruction ordering by
// the compiler and correspondingly a noticeable drop in performance.
if
constexpr
(
group_blocks
!=
-
1
)
{
int4
*
sh_s_group_stage
=
sh_s_group
+
s_group_sh_stage
*
((
group_blocks
/
thread_k_blocks
)
*
(
pipe
/
(
group_blocks
/
thread_k_blocks
)));
reinterpret_cast
<
int4
*>
(
&
frag_s_group
[
k
%
2
])[
0
]
=
sh_s_group_stage
[
s_group_sh_rd
];
}
int4
*
sh_a_stage
=
sh_a
+
a_sh_stage
*
pipe
;
#pragma unroll
for
(
int
i
=
0
;
i
<
thread_m_blocks
;
i
++
)
ldsm4
(
frag_a
[
k
%
2
][
i
],
&
sh_a_stage
[
a_sh_rd_trans
[
k
%
b_sh_wr_iters
][
i
]]);
int4
*
sh_b_stage
=
sh_b
+
b_sh_stage
*
pipe
;
frag_b_quant
[
k
%
2
]
=
*
reinterpret_cast
<
I4
*>
(
&
sh_b_stage
[
b_sh_rd_delta
*
(
k
%
b_sh_wr_iters
)
+
b_sh_rd
]);
};
// Execute the actual tensor core matmul of a sub-tile.
auto
matmul
=
[
&
](
int
k
)
{
// We have the m dimension as the inner loop in order to encourage overlapping
// dequantization and matmul operations.
#pragma unroll
for
(
int
j
=
0
;
j
<
4
;
j
++
)
{
int
b_quant
=
frag_b_quant
[
k
%
2
][
j
];
// int b_quant_shift = b_quant << 4;
FragB
frag_b0
,
frag_b1
;
// If there are no groups, we can just scale the final output once and can
// avoid doing so for each weight.
if
constexpr
(
group_blocks
!=
-
1
)
{
int
b_quant_shift
=
b_quant
>>
8
;
frag_b0
=
dequant_per_group
(
b_quant
,
frag_s_group
[
k
%
2
][
j
],
0
);
frag_b1
=
dequant_per_group
(
b_quant_shift
,
frag_s_group
[
k
%
2
][
j
],
1
);
}
else
{
int
b_quant_shift
=
b_quant
<<
4
;
frag_b0
=
dequant_per_channel
(
b_quant
);
frag_b1
=
dequant_per_channel
(
b_quant_shift
);
}
#pragma unroll
for
(
int
i
=
0
;
i
<
thread_m_blocks
;
i
++
)
{
mma
(
frag_a
[
k
%
2
][
i
],
frag_b0
,
frag_c
[
i
][
j
][
0
]);
mma
(
frag_a
[
k
%
2
][
i
],
frag_b1
,
frag_c
[
i
][
j
][
1
]);
}
}
};
// Since we slice across the k dimension of a tile in order to increase the
// number of warps while keeping the n dimension of a tile reasonable, we have
// multiple warps that accumulate their partial sums of the same output
// location; which we have to reduce over in the end. We do in shared memory.
auto
thread_block_reduce
=
[
&
]()
{
constexpr
int
red_off
=
threads
/
b_sh_stride
/
2
;
if
(
red_off
>=
1
)
{
auto
red_idx
=
threadIdx
.
x
/
b_sh_stride
;
constexpr
int
red_sh_stride
=
b_sh_stride
*
4
*
2
;
constexpr
int
red_sh_delta
=
b_sh_stride
;
int
red_sh_rd
=
red_sh_stride
*
(
threadIdx
.
x
/
b_sh_stride
)
+
(
threadIdx
.
x
%
b_sh_stride
);
// Parallel logarithmic shared memory reduction. We make sure to avoid any
// unnecessary read or write iterations, e.g., for two warps we write only
// once by warp 1 and read only once by warp 0.
#pragma unroll
for
(
int
m_block
=
0
;
m_block
<
thread_m_blocks
;
m_block
++
)
{
#pragma unroll
for
(
int
i
=
red_off
;
i
>
0
;
i
/=
2
)
{
if
(
i
<=
red_idx
&&
red_idx
<
2
*
i
)
{
#pragma unroll
for
(
int
j
=
0
;
j
<
4
*
2
;
j
++
)
{
int
red_sh_wr
=
red_sh_delta
*
j
+
(
red_sh_rd
-
red_sh_stride
*
i
);
if
(
i
<
red_off
)
{
int
*
c_rd
=
reinterpret_cast
<
int
*>
(
&
sh
[
red_sh_delta
*
j
+
red_sh_rd
]);
int
*
c_wr
=
reinterpret_cast
<
int
*>
(
&
sh
[
red_sh_wr
]);
#pragma unroll
for
(
int
k
=
0
;
k
<
4
;
k
++
)
reinterpret_cast
<
FragC
*>
(
frag_c
)[
4
*
2
*
m_block
+
j
][
k
]
+=
c_rd
[
k
]
+
c_wr
[
k
];
}
sh
[
red_sh_wr
]
=
reinterpret_cast
<
int4
*>
(
&
frag_c
)[
4
*
2
*
m_block
+
j
];
}
}
__syncthreads
();
}
if
(
red_idx
==
0
)
{
#pragma unroll
for
(
int
i
=
0
;
i
<
4
*
2
;
i
++
)
{
int
*
c_rd
=
reinterpret_cast
<
int
*>
(
&
sh
[
red_sh_delta
*
i
+
red_sh_rd
]);
#pragma unroll
for
(
int
j
=
0
;
j
<
4
;
j
++
)
reinterpret_cast
<
FragC
*>
(
frag_c
)[
4
*
2
*
m_block
+
i
][
j
]
+=
c_rd
[
j
];
}
}
__syncthreads
();
}
}
};
// Since multiple threadblocks may process parts of the same column slice, we
// finally have to globally reduce over the results. As the striped
// partitioning minimizes the number of such reductions and our outputs are
// usually rather small, we perform this reduction serially in L2 cache.
// global_reduce works on INT32 elements, which are the results of INT8 GEMM.
// This is why we need another INT32 maxtrix `C` to reduce instead of the
// original half matrix `D`.
auto
global_reduce
=
[
&
](
bool
first
=
false
,
bool
last
=
false
)
{
// We are very careful here to reduce directly in the output buffer to
// maximize L2 cache utilization in this step. To do this, we write out
// results in FP16 (but still reduce with FP32 compute).
constexpr
int
active_threads
=
32
*
thread_n_blocks
/
4
;
if
(
threadIdx
.
x
<
active_threads
)
{
int
c_gl_stride
=
prob_n
/
4
;
int
c_gl_wr_delta_o
=
8
*
c_gl_stride
;
int
c_gl_wr_delta_i
=
8
*
(
active_threads
/
32
);
int
c_gl_wr
=
c_gl_stride
*
((
threadIdx
.
x
%
32
)
/
4
)
+
8
*
(
threadIdx
.
x
/
32
)
+
(
threadIdx
.
x
%
4
)
*
2
;
c_gl_wr
+=
(
4
*
thread_n_blocks
)
*
slice_col
;
constexpr
int
c_sh_wr_delta
=
active_threads
*
2
;
auto
c_sh_wr
=
2
*
threadIdx
.
x
;
int
row
=
(
threadIdx
.
x
%
32
)
/
4
;
if
(
!
first
)
{
// Interestingly, doing direct global accesses here really seems to mess up
// the compiler and lead to slowdowns, hence we also use async-copies even
// though these fetches are not actually asynchronous.
#pragma unroll
for
(
int
i
=
0
;
i
<
thread_m_blocks
*
4
;
i
++
)
{
cp_async4_pred
(
&
sh
[
c_sh_wr
+
c_sh_wr_delta
*
i
],
&
C
[
c_gl_wr
+
c_gl_wr_delta_o
*
(
i
/
2
)
+
c_gl_wr_delta_i
*
(
i
%
2
)],
i
<
(
thread_m_blocks
-
1
)
*
4
||
8
*
(
i
/
2
)
+
row
<
prob_m
);
cp_async4_pred
(
&
sh
[
c_sh_wr
+
c_sh_wr_delta
*
i
+
1
],
&
C
[
c_gl_wr
+
c_gl_wr_delta_o
*
(
i
/
2
)
+
c_gl_wr_delta_i
*
(
i
%
2
)
+
1
],
i
<
(
thread_m_blocks
-
1
)
*
4
||
8
*
(
i
/
2
)
+
row
<
prob_m
);
}
cp_async_fence
();
cp_async_wait
<
0
>
();
}
#pragma unroll
for
(
int
i
=
0
;
i
<
thread_m_blocks
*
4
;
i
++
)
{
if
(
i
<
(
thread_m_blocks
-
1
)
*
4
||
8
*
(
i
/
2
)
+
row
<
prob_m
)
{
if
(
!
first
)
{
int4
d_red1
=
sh
[
c_sh_wr
+
i
*
c_sh_wr_delta
];
int4
d_red2
=
sh
[
c_sh_wr
+
i
*
c_sh_wr_delta
+
1
];
#pragma unroll
for
(
int
j
=
0
;
j
<
4
;
j
++
)
{
reinterpret_cast
<
int
*>
(
&
frag_c
)[
4
*
2
*
4
*
(
i
/
4
)
+
4
*
j
+
(
i
%
4
)]
+=
reinterpret_cast
<
int
*>
(
&
d_red1
)[
j
];
}
#pragma unroll
for
(
int
j
=
0
;
j
<
4
;
j
++
)
{
reinterpret_cast
<
int
*>
(
&
frag_c
)[
4
*
2
*
4
*
(
i
/
4
)
+
4
*
(
j
+
4
)
+
(
i
%
4
)]
+=
reinterpret_cast
<
int
*>
(
&
d_red2
)[
j
];
}
}
if
(
!
last
)
{
int4
d1
,
d2
;
#pragma unroll
for
(
int
j
=
0
;
j
<
4
;
j
++
)
{
reinterpret_cast
<
int
*>
(
&
d1
)[
j
]
=
reinterpret_cast
<
int
*>
(
&
frag_c
)[
4
*
2
*
4
*
(
i
/
4
)
+
4
*
j
+
(
i
%
4
)];
}
#pragma unroll
for
(
int
j
=
0
;
j
<
4
;
j
++
)
{
reinterpret_cast
<
int
*>
(
&
d2
)[
j
]
=
reinterpret_cast
<
int
*>
(
&
frag_c
)[
4
*
2
*
4
*
(
i
/
4
)
+
4
*
(
j
+
4
)
+
(
i
%
4
)];
}
C
[
c_gl_wr
+
c_gl_wr_delta_o
*
(
i
/
2
)
+
c_gl_wr_delta_i
*
(
i
%
2
)]
=
d1
;
C
[
c_gl_wr
+
c_gl_wr_delta_o
*
(
i
/
2
)
+
c_gl_wr_delta_i
*
(
i
%
2
)
+
1
]
=
d2
;
}
}
}
}
};
// Write out the reduce final result in the correct layout. We only actually
// reshuffle matrix fragments in this step, the reduction above is performed
// in fragment layout.
auto
write_result
=
[
&
]()
{
int
d_gl_stride
=
prob_n
/
8
;
constexpr
int
d_sh_stride
=
2
*
thread_n_blocks
+
1
;
int
d_gl_wr_delta
=
d_gl_stride
*
(
threads
/
(
2
*
thread_n_blocks
));
constexpr
int
d_sh_rd_delta
=
d_sh_stride
*
(
threads
/
(
2
*
thread_n_blocks
));
int
d_gl_wr
=
d_gl_stride
*
(
threadIdx
.
x
/
(
2
*
thread_n_blocks
))
+
(
threadIdx
.
x
%
(
2
*
thread_n_blocks
));
d_gl_wr
+=
(
2
*
thread_n_blocks
)
*
slice_col
;
int
d_sh_wr
=
(
4
*
d_sh_stride
)
*
((
threadIdx
.
x
%
32
)
/
4
)
+
(
threadIdx
.
x
%
32
)
%
4
;
d_sh_wr
+=
32
*
(
threadIdx
.
x
/
32
);
int
d_sh_rd
=
d_sh_stride
*
(
threadIdx
.
x
/
(
2
*
thread_n_blocks
))
+
(
threadIdx
.
x
%
(
2
*
thread_n_blocks
));
int
d_gl_wr_end
=
d_gl_stride
*
prob_m
;
// We first reorder in shared memory to guarantee the most efficient final
// global write patterns
auto
write
=
[
&
](
int
idx
,
int
c0
,
int
c1
,
float
a_s
,
FragS_CHANNEL
&
w_s
)
{
float2
deq_res
;
deq_res
.
x
=
int32_to_float
(
c0
)
*
w_s
[
0
]
*
a_s
;
deq_res
.
y
=
int32_to_float
(
c1
)
*
w_s
[
1
]
*
a_s
;
((
half2
*
)
sh
)[
idx
]
=
float2_to_half2
(
deq_res
);
};
if
(
threadIdx
.
x
/
32
<
thread_n_blocks
/
4
)
{
#pragma unroll
for
(
int
i
=
0
;
i
<
thread_m_blocks
;
i
++
)
{
#pragma unroll
for
(
int
j
=
0
;
j
<
4
;
j
++
)
{
int
wr
=
d_sh_wr
+
8
*
j
;
write
(
wr
+
(
4
*
d_sh_stride
)
*
0
+
0
,
frag_c
[
i
][
j
][
0
][
0
],
frag_c
[
i
][
j
][
0
][
1
],
frag_s_tok
[
i
][
0
],
frag_s_ch
[
j
/
2
][
2
*
(
j
%
2
)
+
0
]);
write
(
wr
+
(
4
*
d_sh_stride
)
*
8
+
0
,
frag_c
[
i
][
j
][
0
][
2
],
frag_c
[
i
][
j
][
0
][
3
],
frag_s_tok
[
i
][
1
],
frag_s_ch
[
j
/
2
][
2
*
(
j
%
2
)
+
0
]);
write
(
wr
+
(
4
*
d_sh_stride
)
*
0
+
4
,
frag_c
[
i
][
j
][
1
][
0
],
frag_c
[
i
][
j
][
1
][
1
],
frag_s_tok
[
i
][
0
],
frag_s_ch
[
j
/
2
][
2
*
(
j
%
2
)
+
1
]);
write
(
wr
+
(
4
*
d_sh_stride
)
*
8
+
4
,
frag_c
[
i
][
j
][
1
][
2
],
frag_c
[
i
][
j
][
1
][
3
],
frag_s_tok
[
i
][
1
],
frag_s_ch
[
j
/
2
][
2
*
(
j
%
2
)
+
1
]);
}
d_sh_wr
+=
16
*
(
4
*
d_sh_stride
);
}
}
__syncthreads
();
#pragma unroll
for
(
int
i
=
0
;
i
<
ceildiv
(
16
*
thread_m_blocks
,
threads
/
(
2
*
thread_n_blocks
));
i
++
)
{
if
(
d_gl_wr
<
d_gl_wr_end
)
{
D
[
d_gl_wr
]
=
sh
[
d_sh_rd
];
d_gl_wr
+=
d_gl_wr_delta
;
d_sh_rd
+=
d_sh_rd_delta
;
}
}
};
// Start global fetch and register load pipelines.
auto
start_pipes
=
[
&
]()
{
#pragma unroll
for
(
int
i
=
0
;
i
<
stages
-
1
;
i
++
)
fetch_to_shared
(
i
,
i
,
i
<
slice_iters
);
zero_accums
();
wait_for_stage
();
fetch_to_registers
(
0
,
0
);
a_gl_rd
+=
a_gl_rd_delta_o
*
(
stages
-
1
);
};
start_pipes
();
// Main loop.
while
(
slice_iters
)
{
// We unroll over both the global fetch and the register load pipeline to
// ensure all shared memory accesses are static. Note that both pipelines have
// even length meaning that the next iteration will always start at index 0.
#pragma unroll
for
(
int
pipe
=
0
;
pipe
<
stages
;)
{
#pragma unroll
for
(
int
k
=
0
;
k
<
b_sh_wr_iters
;
k
++
)
{
fetch_to_registers
(
k
+
1
,
pipe
%
stages
);
if
(
k
==
b_sh_wr_iters
-
2
)
{
fetch_to_shared
((
pipe
+
stages
-
1
)
%
stages
,
pipe
,
slice_iters
>=
stages
);
pipe
++
;
wait_for_stage
();
}
matmul
(
k
);
}
slice_iters
--
;
if
(
slice_iters
==
0
)
break
;
}
a_gl_rd
+=
a_gl_rd_delta_o
*
stages
;
// Process results and, if necessary, proceed to the next column slice.
// While this pattern may not be the most readable, other ways of writing
// the loop seemed to noticeably worse performance after compilation.
if
(
slice_iters
==
0
)
{
cp_async_wait
<
0
>
();
bool
last
=
slice_idx
==
slice_count
-
1
;
// For per-column scales, we only fetch them here in the final step before
// write-out
if
(
last
)
{
if
(
s_tok_sh_wr_pred
)
{
cp_async1
(
&
sh_s_tok
[
s_tok_sh_wr
],
&
s_tok
[
s_tok_gl_rd
]);
}
if
(
s_ch_sh_wr_pred
)
{
cp_async4
(
&
sh_s_ch
[
s_ch_sh_wr
],
&
s_ch
[
s_ch_gl_rd
]);
}
cp_async_fence
();
}
thread_block_reduce
();
if
(
last
)
{
cp_async_wait
<
0
>
();
__syncthreads
();
if
(
threadIdx
.
x
/
32
<
thread_n_blocks
/
4
)
{
#pragma unroll
for
(
int
i
=
0
;
i
<
thread_m_blocks
;
i
++
)
{
frag_s_tok
[
i
][
0
]
=
*
reinterpret_cast
<
float
*>
(
&
sh_s_tok
[
16
*
i
+
2
*
s_tok_sh_rd
]);
frag_s_tok
[
i
][
1
]
=
*
reinterpret_cast
<
float
*>
(
&
sh_s_tok
[
16
*
i
+
2
*
s_tok_sh_rd
+
1
]);
}
reinterpret_cast
<
int4
*>
(
&
frag_s_ch
)[
0
]
=
sh_s_ch
[
s_ch_sh_rd
+
0
];
reinterpret_cast
<
int4
*>
(
&
frag_s_ch
)[
1
]
=
sh_s_ch
[
s_ch_sh_rd
+
1
];
reinterpret_cast
<
int4
*>
(
&
frag_s_ch
)[
2
]
=
sh_s_ch
[
s_ch_sh_rd
+
8
];
reinterpret_cast
<
int4
*>
(
&
frag_s_ch
)[
3
]
=
sh_s_ch
[
s_ch_sh_rd
+
9
];
}
}
if
(
slice_count
>
1
)
{
// only globally reduce if there is more than one
// block in a slice
barrier_acquire
(
&
locks
[
slice_col
],
slice_idx
);
global_reduce
(
slice_idx
==
0
,
last
);
barrier_release
(
&
locks
[
slice_col
],
last
);
}
if
(
last
)
// only the last block in a slice actually writes the result
write_result
();
slice_row
=
0
;
slice_col_par
++
;
slice_col
++
;
init_slice
();
if
(
slice_iters
)
{
a_gl_rd
=
a_gl_stride
*
(
threadIdx
.
x
/
a_gl_rd_delta_o
)
+
(
threadIdx
.
x
%
a_gl_rd_delta_o
);
#pragma unroll
for
(
int
i
=
0
;
i
<
b_sh_wr_iters
;
i
++
)
B_ptr
[
i
]
+=
b_sh_stride
-
b_gl_rd_delta_o
*
k_tiles
;
if
(
slice_col
==
0
)
{
#pragma unroll
for
(
int
i
=
0
;
i
<
b_sh_wr_iters
;
i
++
)
B_ptr
[
i
]
-=
b_gl_stride
;
}
s_group_gl_rd
=
s_group_sh_stride
*
slice_col
+
threadIdx
.
x
;
s_ch_gl_rd
=
s_ch_sh_stride
*
slice_col
+
threadIdx
.
x
;
start_pipes
();
}
}
}
}
#else
template
<
const
int
threads
,
// number of threads in a threadblock
const
int
thread_m_blocks
,
// number of 16x16 blocks in the m
// dimension (batchsize) of the
// threadblock
const
int
thread_n_blocks
,
// same for n dimension (output)
const
int
thread_k_blocks
,
// same for k dimension (reduction)
const
int
stages
,
// number of stages for the async global->shared
// fetch pipeline
const
int
group_blocks
=
-
1
// number of consecutive 16x16 blocks
// with a separate quantization scale
>
__global__
void
Marlin
(
const
int4
*
__restrict__
A
,
// int8 input matrix of shape mxk
const
int4
*
__restrict__
B
,
// 4bit quantized weight matrix of shape kxn
int4
*
__restrict__
C
,
// int32 global_reduce buffer of shape
// (max_par*16*4)xn, as int8 tensor core's output is
// int32 dtype
int4
*
__restrict__
D
,
// fp16 output buffer of shape mxn
const
float
*
__restrict__
s_tok
,
// fp32 activation per-token quantization
// scales of shape mx1
const
int4
*
__restrict__
s_ch
,
// fp32 weight per-channel quantization
// scales of shape 1xn
const
int4
*
__restrict__
s_group
,
// fp16 weight per-group quantization
// scales of shape (k/groupsize)xn, when
// group_blocks=-1, it should be nullptr
int
prob_m
,
// batch dimension m
int
prob_n
,
// output dimension n
int
prob_k
,
// reduction dimension k
int
*
locks
// extra global storage for barrier synchronization
)
{
// Marlin is not implemented yet for SM < 8.0
assert
(
false
);
return
;
}
#endif
// 8 warps are a good choice since every SM has 4 schedulers and having more
// than 1 warp per schedule allows some more latency hiding. At the same time,
// we want relatively few warps to have many registers per warp and small tiles.
const
int
USER_THREADS
=
256
;
// Note: This is only used with user-provided thread_k/n
const
int
STAGES
=
4
;
// 4 pipeline stages fit into shared memory
static
constexpr
int
min_thread_n
=
64
;
static
constexpr
int
min_thread_k
=
64
;
static
constexpr
int
tile_size
=
16
;
static
constexpr
int
max_par
=
16
;
static
constexpr
int
pack_factor_4bit
=
8
;
// We have 8 4-bit vals inside a 32 bit
#define __CALL_IF(THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \
GROUP_BLOCKS, NUM_THREADS) \
else if (thread_m_blocks == THREAD_M_BLOCKS && \
thread_n_blocks == THREAD_N_BLOCKS && \
thread_k_blocks == THREAD_K_BLOCKS && \
group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS) { \
cudaFuncSetAttribute(Marlin<NUM_THREADS, THREAD_M_BLOCKS, THREAD_N_BLOCKS, \
THREAD_K_BLOCKS, STAGES, GROUP_BLOCKS>, \
cudaFuncAttributeMaxDynamicSharedMemorySize, \
max_shared_mem); \
Marlin<NUM_THREADS, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \
STAGES, GROUP_BLOCKS> \
<<<blocks, NUM_THREADS, max_shared_mem, stream>>>( \
A_ptr, B_ptr, C_ptr, D_ptr, s_tok_ptr, s_ch_ptr, s_group_ptr, \
prob_m, prob_n, prob_k, locks); \
}
typedef
struct
{
int
thread_k
;
int
thread_n
;
int
num_threads
;
}
thread_config_t
;
thread_config_t
small_batch_thread_configs
[]
=
{
// Ordered by priority
// thread_k, thread_n, num_threads
{
128
,
128
,
256
},
// Default
{
128
,
64
,
128
},
// Reduce N 2X, same K
{
64
,
256
,
256
},
// Reduce K 2X, increase N 2X
{
64
,
128
,
128
},
// Reduce K 2X, same N
};
thread_config_t
large_batch_thread_configs
[]
=
{
// Ordered by priority
// thread_k, thread_n, num_threads
{
64
,
256
,
256
},
// Default
{
128
,
128
,
256
},
// Reduce N 2X, increase K 2X
{
64
,
128
,
128
},
// Reduce N 2X, same K
{
128
,
64
,
128
},
// Reduce N 4X, increase K 2X
};
bool
is_valid_config
(
thread_config_t
const
&
th_config
,
int
prob_m
,
int
prob_n
,
int
prob_k
)
{
// Sanity
if
(
th_config
.
thread_k
==
-
1
||
th_config
.
thread_n
==
-
1
||
th_config
.
num_threads
==
-
1
)
{
return
false
;
}
// Verify K/N are divisible by thread K/N
if
(
prob_k
%
th_config
.
thread_k
!=
0
||
prob_n
%
th_config
.
thread_n
!=
0
)
{
return
false
;
}
// thread_k can be only 128 or 64 (because it must be less than groupsize
// which is 128)
if
(
th_config
.
thread_k
!=
128
&&
th_config
.
thread_k
!=
64
)
{
return
false
;
}
// Verify min for thread K/N
if
(
th_config
.
thread_n
<
min_thread_n
||
th_config
.
thread_k
<
min_thread_k
)
{
return
false
;
}
// num_threads must be at least 128 (= 4 warps)
if
(
th_config
.
num_threads
<
128
)
{
return
false
;
}
return
true
;
}
thread_config_t
determine_thread_config
(
int
prob_m
,
int
prob_n
,
int
prob_k
)
{
if
(
prob_m
<=
16
)
{
for
(
auto
th_config
:
small_batch_thread_configs
)
{
if
(
is_valid_config
(
th_config
,
prob_m
,
prob_n
,
prob_k
))
{
return
th_config
;
}
}
}
else
{
for
(
auto
th_config
:
large_batch_thread_configs
)
{
if
(
is_valid_config
(
th_config
,
prob_m
,
prob_n
,
prob_k
))
{
return
th_config
;
}
}
}
return
thread_config_t
{
-
1
,
-
1
,
-
1
};
}
#define CALL_IF(N_BLOCKS, K_BLOCKS, NUM_THREADS) \
__CALL_IF(1, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \
__CALL_IF(1, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \
__CALL_IF(1, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \
__CALL_IF(1, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \
__CALL_IF(2, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \
__CALL_IF(2, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \
__CALL_IF(3, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \
__CALL_IF(3, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \
__CALL_IF(4, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \
__CALL_IF(4, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS)
void
marlin_qqq_cuda
(
const
void
*
A
,
const
void
*
B
,
void
*
C
,
void
*
D
,
void
*
s_tok
,
void
*
s_ch
,
void
*
s_group
,
int
prob_m
,
int
prob_n
,
int
prob_k
,
void
*
workspace
,
int
groupsize
=
-
1
,
int
dev
=
0
,
cudaStream_t
stream
=
0
,
int
thread_k
=
-
1
,
int
thread_n
=
-
1
,
int
sms
=
-
1
,
int
max_par
=
16
)
{
int
tot_m
=
prob_m
;
int
tot_m_blocks
=
ceildiv
(
tot_m
,
16
);
int
pad
=
16
*
tot_m_blocks
-
tot_m
;
if
(
sms
==
-
1
)
cudaDeviceGetAttribute
(
&
sms
,
cudaDevAttrMultiProcessorCount
,
dev
);
int
max_shared_mem
=
0
;
cudaDeviceGetAttribute
(
&
max_shared_mem
,
cudaDevAttrMaxSharedMemoryPerBlockOptin
,
dev
);
TORCH_CHECK
(
max_shared_mem
>
0
);
// Set thread config
thread_config_t
th_config
;
if
(
thread_k
!=
-
1
&&
thread_n
!=
-
1
)
{
// User-defined config
th_config
=
thread_config_t
{
thread_k
,
thread_n
,
USER_THREADS
};
}
else
{
// Auto config
th_config
=
determine_thread_config
(
prob_m
,
prob_n
,
prob_k
);
}
if
(
!
is_valid_config
(
th_config
,
prob_m
,
prob_n
,
prob_k
))
{
throw
std
::
runtime_error
(
"Invalid thread config: thread_k = "
+
str
(
th_config
.
thread_k
)
+
", thread_n = "
+
str
(
th_config
.
thread_n
)
+
", num_threads = "
+
str
(
th_config
.
num_threads
)
+
" for MKN = ["
+
str
(
prob_m
)
+
", "
+
str
(
prob_k
)
+
", "
+
str
(
prob_n
)
+
"]"
);
}
int
num_threads
=
th_config
.
num_threads
;
thread_k
=
th_config
.
thread_k
;
thread_n
=
th_config
.
thread_n
;
int
thread_k_blocks
=
thread_k
/
16
;
int
thread_n_blocks
=
thread_n
/
16
;
int
group_blocks
=
(
groupsize
==
-
1
)
?
-
1
:
groupsize
/
16
;
int
blocks
=
sms
;
if
(
prob_m
==
0
||
prob_n
==
0
||
prob_k
==
0
)
{
return
;
}
TORCH_CHECK
(
prob_n
%
thread_n
==
0
,
"prob_n = "
,
prob_n
,
" is not divisible by thread_n = "
,
thread_n
);
TORCH_CHECK
(
prob_k
%
thread_k
==
0
,
"prob_k = "
,
prob_k
,
" is not divisible by thread_k = "
,
thread_k
);
if
(
group_blocks
!=
-
1
)
{
TORCH_CHECK
(
prob_k
%
group_blocks
==
0
,
"prob_k = "
,
prob_k
,
" is not divisible by group_blocks = "
,
group_blocks
);
}
const
int4
*
A_ptr
=
(
const
int4
*
)
A
;
const
int4
*
B_ptr
=
(
const
int4
*
)
B
;
int4
*
C_ptr
=
(
int4
*
)
C
;
int4
*
D_ptr
=
(
int4
*
)
D
;
const
float
*
s_tok_ptr
=
(
const
float
*
)
s_tok
;
const
int4
*
s_ch_ptr
=
(
const
int4
*
)
s_ch
;
const
int4
*
s_group_ptr
=
(
const
int4
*
)
s_group
;
int
*
locks
=
(
int
*
)
workspace
;
for
(
int
i
=
0
;
i
<
tot_m_blocks
;
i
+=
4
)
{
int
thread_m_blocks
=
tot_m_blocks
-
i
;
prob_m
=
tot_m
-
16
*
i
;
int
par
=
1
;
if
(
thread_m_blocks
>
4
)
{
// Note that parallel > 1 currently only works for inputs without any
// padding
par
=
(
16
*
thread_m_blocks
-
pad
)
/
64
;
if
(
par
>
max_par
)
par
=
max_par
;
prob_m
=
64
*
par
;
i
+=
4
*
(
par
-
1
);
thread_m_blocks
=
4
;
}
// For compilation speed, we only define the kernel configurations that have
// seemed useful (in terms of performance) in our testing, however many more
// are, in principle, possible.
if
(
false
)
{
}
CALL_IF
(
8
,
8
,
256
)
CALL_IF
(
16
,
4
,
256
)
CALL_IF
(
8
,
4
,
128
)
CALL_IF
(
4
,
8
,
128
)
else
{
throw
std
::
runtime_error
(
"Unsupported shapes: MKN = ["
+
str
(
prob_m
)
+
", "
+
str
(
prob_k
)
+
", "
+
str
(
prob_n
)
+
"]"
+
", groupsize = "
+
str
(
groupsize
)
+
", thread_m_blocks = "
+
str
(
thread_m_blocks
)
+
", thread_n_blocks = "
+
str
(
thread_n_blocks
)
+
", thread_k_blocks = "
+
str
(
thread_k_blocks
));
}
A_ptr
+=
16
*
thread_m_blocks
*
(
prob_k
/
16
)
*
par
;
D_ptr
+=
16
*
thread_m_blocks
*
(
prob_n
/
8
)
*
par
;
s_tok_ptr
+=
16
*
thread_m_blocks
*
par
;
}
}
}
// anonymous namespace
torch
::
Tensor
marlin_qqq_gemm
(
torch
::
Tensor
const
&
a
,
torch
::
Tensor
const
&
b_q_weight
,
torch
::
Tensor
const
&
s_tok
,
torch
::
Tensor
const
&
s_ch
,
torch
::
Tensor
const
&
s_group
,
torch
::
Tensor
&
workspace
,
int64_t
size_m
,
int64_t
size_n
,
int64_t
size_k
)
{
// Verify M
TORCH_CHECK
(
size_m
==
a
.
size
(
0
),
"Shape mismatch: a.size(0) = "
+
str
(
a
.
size
(
0
))
+
", size_m = "
+
str
(
size_m
));
TORCH_CHECK
(
size_m
==
s_tok
.
numel
(),
"Shape mismatch: s_tok.numel() = "
+
str
(
s_tok
.
numel
())
+
", size_m = "
+
str
(
size_m
));
// Verify K
TORCH_CHECK
(
size_k
==
a
.
size
(
1
),
"Shape mismatch: a.size(1) = "
+
str
(
a
.
size
(
1
))
+
", size_k = "
+
str
(
size_k
));
TORCH_CHECK
(
size_k
%
tile_size
==
0
,
"size_k = "
+
str
(
size_k
)
+
" is not divisible by tile_size = "
+
str
(
tile_size
));
TORCH_CHECK
(
(
size_k
/
tile_size
)
==
b_q_weight
.
size
(
0
),
"Shape mismatch: b_q_weight.size(0) = "
+
str
(
b_q_weight
.
size
(
0
))
+
", size_k = "
+
str
(
size_k
)
+
", tile_size = "
+
str
(
tile_size
));
int
groupsize
=
(
s_group
.
numel
()
==
0
)
?
-
1
:
size_k
/
s_group
.
size
(
0
);
// Verify groupsize
TORCH_CHECK
(
groupsize
==
-
1
||
groupsize
==
128
,
"Unexpected groupsize = "
+
str
(
groupsize
));
// Verify N
TORCH_CHECK
(
s_ch
.
numel
()
==
size_n
,
"Shape mismatch: s_ch.numel() = "
+
str
(
s_ch
.
numel
())
+
", size_n = "
+
str
(
size_n
));
TORCH_CHECK
(
b_q_weight
.
size
(
1
)
%
tile_size
==
0
,
"b_q_weight.size(1) = "
+
str
(
b_q_weight
.
size
(
1
))
+
" is not divisible by tile_size = "
+
str
(
tile_size
));
if
(
groupsize
!=
-
1
)
{
TORCH_CHECK
(
s_group
.
size
(
1
)
==
size_n
,
"Shape mismatch: s_group.size(1) = "
+
str
(
s_group
.
size
(
1
))
+
", size_n = "
+
str
(
size_n
));
TORCH_CHECK
(
size_k
%
s_group
.
size
(
0
)
==
0
,
"size_k = "
+
str
(
size_k
)
+
", is not divisible by s_group.size(0) = "
+
str
(
s_group
.
size
(
0
)));
}
int
actual_size_n
=
(
b_q_weight
.
size
(
1
)
/
tile_size
)
*
pack_factor_4bit
;
TORCH_CHECK
(
size_n
==
actual_size_n
,
"Shape mismatch: size_n = "
+
str
(
size_n
)
+
", actual_size_n = "
+
str
(
actual_size_n
));
// Verify A device and strides
TORCH_CHECK
(
a
.
device
().
is_cuda
(),
"A is not on GPU"
);
TORCH_CHECK
(
a
.
is_contiguous
(),
"A is not contiguous"
);
// Verify B device and strides
TORCH_CHECK
(
b_q_weight
.
device
().
is_cuda
(),
"b_q_weight is not on GPU"
);
TORCH_CHECK
(
b_q_weight
.
is_contiguous
(),
"b_q_weight is not contiguous"
);
// Verify s_tok device, strides and dtype
TORCH_CHECK
(
s_tok
.
device
().
is_cuda
(),
"s_tok is not on GPU"
);
TORCH_CHECK
(
s_tok
.
is_contiguous
(),
"s_tok is not contiguous"
);
TORCH_CHECK
(
s_tok
.
dtype
()
==
torch
::
kFloat32
,
"s_tok's dtype is not float32"
);
// Verify s_ch device, strides and dtype
TORCH_CHECK
(
s_ch
.
device
().
is_cuda
(),
"s_ch is not on GPU"
);
TORCH_CHECK
(
s_ch
.
is_contiguous
(),
"s_ch is not contiguous"
);
TORCH_CHECK
(
s_ch
.
dtype
()
==
torch
::
kFloat32
,
"s_ch's dtype is not float32"
);
// Verify s_group device, strides and dtype
TORCH_CHECK
(
s_group
.
device
().
is_cuda
(),
"s_group is not on GPU"
);
TORCH_CHECK
(
s_group
.
is_contiguous
(),
"s_group is not contiguous"
);
TORCH_CHECK
(
s_group
.
dtype
()
==
torch
::
kFloat16
,
"s_group's dtype is not float16"
);
// Verify workspace size
TORCH_CHECK
(
size_n
%
min_thread_n
==
0
,
"size_n = "
+
str
(
size_n
)
+
", is not divisible by min_thread_n = "
+
str
(
min_thread_n
));
int
min_workspace_size
=
(
size_n
/
min_thread_n
)
*
max_par
;
TORCH_CHECK
(
workspace
.
numel
()
>=
min_workspace_size
,
"workspace.numel = "
+
str
(
workspace
.
numel
())
+
" is below min_workspace_size = "
+
str
(
min_workspace_size
));
// Alloc C matrix
const
at
::
cuda
::
OptionalCUDAGuard
device_guard
(
device_of
(
a
));
auto
options_c
=
torch
::
TensorOptions
().
dtype
(
torch
::
kInt
).
device
(
a
.
device
());
torch
::
Tensor
c
=
torch
::
empty
({
max_par
*
64
,
size_n
},
options_c
);
// Alloc D matrix
auto
options_d
=
torch
::
TensorOptions
().
dtype
(
torch
::
kFloat16
).
device
(
a
.
device
());
torch
::
Tensor
d
=
torch
::
empty
({
size_m
,
size_n
},
options_d
);
// thread_k: `k` size of a thread_tile in `weights` (can usually be left as
// auto -1)
int
thread_k
=
-
1
;
// thread_n: `n` size of a thread_tile in `weights` (can usually be left as
// auto -1)
int
thread_n
=
-
1
;
// sms: number of SMs to use for the kernel (can usually be left as auto -1)
int
sms
=
-
1
;
int
dev
=
a
.
get_device
();
marlin_qqq_cuda
(
a
.
data_ptr
(),
b_q_weight
.
data_ptr
(),
c
.
data_ptr
(),
d
.
data_ptr
(),
s_tok
.
data_ptr
(),
s_ch
.
data_ptr
(),
s_group
.
data_ptr
(),
size_m
,
size_n
,
size_k
,
workspace
.
data_ptr
(),
groupsize
,
dev
,
at
::
cuda
::
getCurrentCUDAStream
(
dev
),
thread_k
,
thread_n
,
sms
,
max_par
);
return
d
;
}
TORCH_LIBRARY_IMPL_EXPAND
(
TORCH_EXTENSION_NAME
,
CUDA
,
m
)
{
m
.
impl
(
"marlin_qqq_gemm"
,
&
marlin_qqq_gemm
);
}
csrc/torch_bindings.cpp
View file @
0cdbf5e6
...
...
@@ -241,14 +241,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// custom types:
// https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA
// Marlin (Dense) Optimized Quantized GEMM for GPTQ.
ops
.
def
(
"marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
"Tensor! workspace, SymInt size_m, SymInt size_n, SymInt size_k) -> "
"Tensor"
,
{
stride_tag
});
// conditionally compiled so impl in source file
// Marlin_24 (Sparse) Optimized Quantized GEMM for GPTQ.
ops
.
def
(
"gptq_marlin_24_gemm(Tensor a, Tensor b_q_weight, Tensor b_meta, "
...
...
@@ -353,15 +345,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops
.
def
(
"ggml_moe_get_block_size"
,
&
ggml_moe_get_block_size
);
#ifndef USE_ROCM
// marlin_qqq_gemm for QQQ.
ops
.
def
(
"marlin_qqq_gemm(Tensor a, Tensor b_q_weight, "
"Tensor s_tok, Tensor s_ch, Tensor s_group, "
"Tensor! workspace, SymInt size_m, SymInt size_n, "
"SymInt size_k) -> Tensor"
,
{
stride_tag
});
// conditionally compiled so impl registration is in source file
// CUTLASS nvfp4 block scaled GEMM
ops
.
def
(
"cutlass_scaled_fp4_mm(Tensor! out, Tensor a, Tensor b,"
...
...
tests/compile/test_full_graph.py
View file @
0cdbf5e6
...
...
@@ -53,12 +53,6 @@ def models_list(*, all: bool = True, keywords: Optional[list[str]] = None):
"quantization"
:
"gptq_marlin_24"
}))
if
is_quant_method_supported
(
"marlin"
):
TEST_MODELS
.
append
(
(
"robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin"
,
{
"quantization"
:
"marlin"
}))
if
not
current_platform
.
is_rocm
()
and
is_quant_method_supported
(
"awq"
):
TEST_MODELS
.
append
((
"TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ"
,
{
"quantization"
:
"AWQ"
...
...
tests/kernels/quantization/test_machete_mm.py
View file @
0cdbf5e6
...
...
@@ -95,23 +95,23 @@ TEST_TYPES = [
token_scale_type
=
None
)
for
w_type
in
[
scalar_types
.
uint4
,
scalar_types
.
uint8
]
for
a_type
in
[
torch
.
float16
,
torch
.
bfloat16
]),
# QQQ style
*
(
TypeConfig
(
act_type
=
torch
.
int8
,
weight_type
=
scalar_types
.
uint4b8
,
output_type
=
torch
.
float16
,
group_scale_type
=
group_scale_type
,
group_zero_type
=
None
,
channel_scale_type
=
torch
.
float
,
token_scale_type
=
torch
.
float
)
for
group_scale_type
in
[
None
,
torch
.
float16
]),
*
(
TypeConfig
(
act_type
=
torch
.
float8_e4m3fn
,
weight_type
=
scalar_types
.
uint4b8
,
output_type
=
torch
.
float16
,
group_scale_type
=
group_scale_type
,
group_zero_type
=
None
,
channel_scale_type
=
torch
.
float
,
token_scale_type
=
torch
.
float
)
for
group_scale_type
in
[
None
,
torch
.
float16
]),
#
#
QQQ style
#
*(TypeConfig(act_type=torch.int8,
#
weight_type=scalar_types.uint4b8,
#
output_type=torch.float16,
#
group_scale_type=group_scale_type,
#
group_zero_type=None,
#
channel_scale_type=torch.float,
#
token_scale_type=torch.float)
#
for group_scale_type in [None, torch.float16]),
#
*(TypeConfig(act_type=torch.float8_e4m3fn,
#
weight_type=scalar_types.uint4b8,
#
output_type=torch.float16,
#
group_scale_type=group_scale_type,
#
group_zero_type=None,
#
channel_scale_type=torch.float,
#
token_scale_type=torch.float)
#
for group_scale_type in [None, torch.float16]),
]
# TODO: in future PR refactor this and `is_quant_method_supported` in the kernel
...
...
tests/kernels/quantization/test_marlin_gemm.py
View file @
0cdbf5e6
...
...
@@ -13,11 +13,7 @@ from vllm import _custom_ops as ops
from
vllm.model_executor.layers.quantization.gptq_marlin_24
import
(
GPTQ_MARLIN_24_MAX_PARALLEL
,
GPTQ_MARLIN_24_MIN_THREAD_N
,
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES
,
GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
)
from
vllm.model_executor.layers.quantization.qqq
import
(
MARLIN_QQQ_MAX_PARALLEL
,
MARLIN_QQQ_MIN_THREAD_N
,
MARLIN_QQQ_SUPPORTED_GROUP_SIZES
,
MARLIN_QQQ_SUPPORTED_NUM_BITS
)
from
vllm.model_executor.layers.quantization.utils.marlin_utils
import
(
GPTQ_MARLIN_MAX_PARALLEL
,
GPTQ_MARLIN_MIN_THREAD_N
,
MARLIN_SUPPORTED_GROUP_SIZES
,
marlin_make_empty_g_idx
,
marlin_make_workspace_new
,
marlin_permute_bias
,
marlin_permute_scales
,
query_marlin_supported_quant_types
)
...
...
@@ -31,8 +27,6 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
marlin_weights
)
from
vllm.model_executor.layers.quantization.utils.marlin_utils_test_24
import
(
marlin_24_quantize
)
from
vllm.model_executor.layers.quantization.utils.marlin_utils_test_qqq
import
(
# noqa: E501
marlin_qqq_quantize
)
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
awq_pack
,
gptq_pack
,
gptq_quantize_weights
,
quantize_weights
,
sort_weights
)
from
vllm.scalar_type
import
scalar_types
...
...
@@ -449,68 +443,6 @@ def test_hqq_marlin_gemm(
assert
max_diff
<
0.04
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"qqq"
),
reason
=
"Marlin is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"k_chunk"
,
MARLIN_K_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"n_chunk"
,
MARLIN_N_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"num_bits"
,
MARLIN_QQQ_SUPPORTED_NUM_BITS
)
@
pytest
.
mark
.
parametrize
(
"group_size"
,
MARLIN_QQQ_SUPPORTED_GROUP_SIZES
)
@
pytest
.
mark
.
parametrize
(
"mnk_factors"
,
MNK_FACTORS
)
def
test_marlin_qqq_gemm
(
k_chunk
,
n_chunk
,
num_bits
,
group_size
,
mnk_factors
,
):
int8_traits
=
torch
.
iinfo
(
torch
.
int8
)
m_factor
,
n_factor
,
k_factor
=
mnk_factors
size_m
=
m_factor
size_k
=
k_chunk
*
k_factor
size_n
=
n_chunk
*
n_factor
a_input
=
rand_data
((
size_m
,
size_k
))
b_weight
=
rand_data
((
size_k
,
size_n
))
# Quantize activations
s_a
=
a_input
.
abs
().
max
(
dim
=-
1
,
keepdim
=
True
)[
0
].
div
(
int8_traits
.
max
).
to
(
torch
.
float
)
q_a
=
(
a_input
/
s_a
).
round
().
clamp
(
int8_traits
.
min
,
int8_traits
.
max
).
to
(
torch
.
int8
)
# Quantize weights
w_ref
,
marlin_qqq_q_w
,
marlin_qqq_s_group
,
marlin_qqq_s_channel
=
\
marlin_qqq_quantize
(
b_weight
,
num_bits
,
group_size
)
workspace
=
MarlinWorkspace
(
size_n
,
MARLIN_QQQ_MIN_THREAD_N
,
MARLIN_QQQ_MAX_PARALLEL
)
opcheck
(
torch
.
ops
.
_C
.
marlin_qqq_gemm
,
(
q_a
,
marlin_qqq_q_w
,
s_a
,
marlin_qqq_s_channel
,
marlin_qqq_s_group
,
workspace
.
scratch
,
a_input
.
shape
[
0
],
b_weight
.
shape
[
1
],
a_input
.
shape
[
1
]))
output
=
ops
.
marlin_qqq_gemm
(
q_a
,
marlin_qqq_q_w
,
s_a
,
marlin_qqq_s_channel
,
marlin_qqq_s_group
,
workspace
.
scratch
,
a_input
.
shape
[
0
],
b_weight
.
shape
[
1
],
a_input
.
shape
[
1
],
)
output_ref
=
torch
.
matmul
(
q_a
.
half
()
*
s_a
.
half
(),
w_ref
)
torch
.
cuda
.
synchronize
()
max_diff
=
compute_max_diff
(
output
,
output_ref
)
assert
max_diff
<
0.04
def
test_marlin_gemm_subset_input
():
quant_type
=
scalar_types
.
uint4b8
group_size
=
128
...
...
@@ -602,18 +534,3 @@ def test_marlin_gemm_with_bias(size_m):
max_diff
=
compute_max_diff
(
output
,
output_ref
)
assert
max_diff
<
0.04
def
test_marlin_gemm_opcheck
():
size_m
=
2048
size_n
=
4096
size_k
=
4096
a
=
torch
.
rand
((
size_m
,
size_n
),
device
=
'cuda'
,
dtype
=
torch
.
float16
)
w
=
torch
.
randint
(
-
5
,
5
,
(
256
,
8192
),
device
=
'cuda'
,
dtype
=
torch
.
int32
)
s
=
torch
.
full
((
32
,
size_k
),
0.125
,
device
=
'cuda'
,
dtype
=
torch
.
float16
)
wk
=
MarlinWorkspace
(
size_n
,
GPTQ_MARLIN_MIN_THREAD_N
,
GPTQ_MARLIN_MAX_PARALLEL
).
scratch
x
=
torch
.
ops
.
_C
.
marlin_gemm
(
a
,
w
,
s
,
wk
,
size_m
,
size_n
,
size_k
)
y
=
torch
.
ops
.
_C
.
marlin_gemm
(
a
,
w
,
s
,
wk
,
size_m
,
size_n
,
size_k
)
torch
.
testing
.
assert_close
(
x
,
y
)
opcheck
(
torch
.
ops
.
_C
.
marlin_gemm
,
(
a
,
w
,
s
,
wk
,
size_m
,
size_n
,
size_k
))
tests/quantization/test_configs.py
View file @
0cdbf5e6
...
...
@@ -22,22 +22,12 @@ class ModelPair:
MODEL_ARG_EXPTYPES
=
[
# AUTOGPTQ
# compat: autogptq <=0.7.1 is_marlin_format: bool
# Model Serialized in Marlin Format should always use Marlin kernel.
(
"neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin"
,
None
,
"marlin"
),
(
"neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin"
,
"marlin"
,
"marlin"
),
(
"neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin"
,
"gptq"
,
"marlin"
),
(
"neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin"
,
"awq"
,
"ERROR"
),
# Model Serialized in Exllama Format.
(
"TheBloke/Llama-2-7B-Chat-GPTQ"
,
None
,
"gptq_marlin"
),
(
"TheBloke/Llama-2-7B-Chat-GPTQ"
,
"marlin"
,
"gptq_marlin"
),
(
"TheBloke/Llama-2-7B-Chat-GPTQ"
,
"gptq"
,
"gptq"
),
(
"TheBloke/Llama-2-7B-Chat-GPTQ"
,
"awq"
,
"ERROR"
),
# compat: autogptq >=0.8.0 use checkpoint_format: str
# Model Serialized in Marlin Format should always use Marlin kernel.
(
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit"
,
None
,
"marlin"
),
(
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit"
,
"marlin"
,
"marlin"
),
(
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit"
,
"gptq"
,
"marlin"
),
(
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit"
,
"awq"
,
"ERROR"
),
# Model Serialized in Exllama Format.
(
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
,
None
,
"gptq_marlin"
),
(
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
,
"marlin"
,
"gptq_marlin"
),
...
...
tests/quantization/test_lm_head.py
View file @
0cdbf5e6
...
...
@@ -11,7 +11,6 @@ import torch
from
vllm.model_executor.layers.quantization.gptq
import
GPTQLinearMethod
from
vllm.model_executor.layers.quantization.gptq_marlin
import
(
GPTQMarlinLinearMethod
)
from
vllm.model_executor.layers.quantization.marlin
import
MarlinLinearMethod
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
UnquantizedEmbeddingMethod
)
...
...
@@ -19,9 +18,7 @@ PROMPT = "On the surface of Mars, we found"
MODELS_QUANT
=
[
(
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head"
,
True
),
(
"ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024"
,
False
),
(
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"
,
False
),
(
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
,
False
)
]
...
...
@@ -41,8 +38,7 @@ def test_lm_head(
lm_head_layer
=
model
.
lm_head
if
lm_head_quantized
:
assert
isinstance
(
lm_head_layer
.
quant_method
,
(
GPTQLinearMethod
,
GPTQMarlinLinearMethod
,
MarlinLinearMethod
))
(
GPTQLinearMethod
,
GPTQMarlinLinearMethod
))
else
:
assert
isinstance
(
lm_head_layer
.
quant_method
,
UnquantizedEmbeddingMethod
)
...
...
tests/weight_loading/models.txt
View file @
0cdbf5e6
...
...
@@ -26,9 +26,5 @@ compressed-tensors, nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-W8A8-testing
awq, casperhansen/mixtral-instruct-awq, main
awq_marlin, casperhansen/mixtral-instruct-awq, main
fp8, neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV, main
marlin, nm-testing/zephyr-beta-7b-marlin-g128, main
marlin, robertgshaw2/zephyr-7b-beta-channelwise-marlin, main
qqq, HandH1998/QQQ-Llama-3-8b-g128, main
qqq, HandH1998/QQQ-Llama-3-8b, main
hqq, nm-testing/Llama-3.2-1B-Instruct-HQQ, main
None, mgleize/fairseq2-dummy-Llama-3.2-1B, main
\ No newline at end of file
vllm/_custom_ops.py
View file @
0cdbf5e6
...
...
@@ -387,14 +387,6 @@ def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor,
torch
.
ops
.
_C
.
gptq_shuffle
(
q_weight
,
q_perm
,
bit
)
# marlin
def
marlin_gemm
(
a
:
torch
.
Tensor
,
b_q_weight
:
torch
.
Tensor
,
b_scales
:
torch
.
Tensor
,
workspace
:
torch
.
Tensor
,
size_m
:
int
,
size_n
:
int
,
size_k
:
int
)
->
torch
.
Tensor
:
return
torch
.
ops
.
_C
.
marlin_gemm
(
a
,
b_q_weight
,
b_scales
,
workspace
,
size_m
,
size_n
,
size_k
)
# marlin_24
def
gptq_marlin_24_gemm
(
a
:
torch
.
Tensor
,
b_q_weight
:
torch
.
Tensor
,
b_meta
:
torch
.
Tensor
,
b_scales
:
torch
.
Tensor
,
...
...
@@ -437,25 +429,6 @@ if hasattr(torch.ops._C, "gptq_marlin_24_gemm"):
is_zp_float
:
bool
=
False
)
->
torch
.
Tensor
:
return
torch
.
empty
((
size_m
,
size_n
),
device
=
a
.
device
,
dtype
=
a
.
dtype
)
@
register_fake
(
"_C::marlin_qqq_gemm"
)
def
_marlin_qqq_gemm_fake
(
a
:
torch
.
Tensor
,
b_q_weight
:
torch
.
Tensor
,
s_tok
:
torch
.
Tensor
,
s_ch
:
torch
.
Tensor
,
s_group
:
torch
.
Tensor
,
workspace
:
torch
.
Tensor
,
size_m
:
torch
.
SymInt
,
size_n
:
torch
.
SymInt
,
size_k
:
torch
.
SymInt
)
->
torch
.
Tensor
:
return
torch
.
empty
((
size_m
,
size_n
),
dtype
=
torch
.
float16
,
device
=
a
.
device
)
@
register_fake
(
"_C::marlin_gemm"
)
def
_marlin_gemm_fake
(
a
:
torch
.
Tensor
,
b_q_weight
:
torch
.
Tensor
,
b_scales
:
torch
.
Tensor
,
workspace
:
torch
.
Tensor
,
size_m
:
torch
.
SymInt
,
size_n
:
torch
.
SymInt
,
size_k
:
torch
.
SymInt
)
->
torch
.
Tensor
:
return
torch
.
empty
((
size_m
,
size_n
),
dtype
=
torch
.
float16
,
device
=
a
.
device
)
@
register_fake
(
"_C::awq_dequantize"
)
def
_awq_dequantize_fake
(
qweight
:
torch
.
Tensor
,
scales
:
torch
.
Tensor
,
zeros
:
torch
.
Tensor
,
split_k_iters
:
torch
.
SymInt
,
...
...
@@ -1348,15 +1321,6 @@ def scaled_int8_quant(
return
output
,
input_scales
,
input_azp
# qqq ops
def
marlin_qqq_gemm
(
a
:
torch
.
Tensor
,
b_q_weight
:
torch
.
Tensor
,
s_tok
:
torch
.
Tensor
,
s_ch
:
torch
.
Tensor
,
s_group
:
torch
.
Tensor
,
workspace
:
torch
.
Tensor
,
size_m
:
int
,
size_n
:
int
,
size_k
:
int
)
->
torch
.
Tensor
:
return
torch
.
ops
.
_C
.
marlin_qqq_gemm
(
a
,
b_q_weight
,
s_tok
,
s_ch
,
s_group
,
workspace
,
size_m
,
size_n
,
size_k
)
# gguf
def
ggml_dequantize
(
W
:
torch
.
Tensor
,
quant_type
:
int
,
m
:
int
,
n
:
int
,
dtype
:
Optional
[
torch
.
dtype
])
->
torch
.
Tensor
:
...
...
vllm/config/__init__.py
View file @
0cdbf5e6
...
...
@@ -1112,9 +1112,9 @@ class ModelConfig:
def
_verify_quantization
(
self
)
->
None
:
supported_quantization
=
me_quant
.
QUANTIZATION_METHODS
optimized_quantization_methods
=
[
"fp8"
,
"marlin"
,
"modelopt"
,
"gptq_marlin_24"
,
"gptq_marlin"
,
"awq_marlin"
,
"fbgemm_fp8"
,
"compressed-tensors"
,
"experts_int8"
,
"quark"
,
"modelopt_fp4"
,
"bitblas"
,
"gptq_bitblas"
,
"inc"
"fp8"
,
"modelopt"
,
"gptq_marlin_24"
,
"gptq_marlin"
,
"awq_marlin"
,
"fbgemm_fp8"
,
"compressed-tensors"
,
"experts_int8"
,
"quark"
,
"modelopt_fp4"
,
"bitblas"
,
"gptq_bitblas"
,
"inc"
]
if
self
.
quantization
is
not
None
:
self
.
quantization
=
cast
(
me_quant
.
QuantizationMethods
,
...
...
@@ -1137,7 +1137,6 @@ class ModelConfig:
# `override_quantization_method` method) must be checked in order
# of preference (this is particularly important for GPTQ).
overrides
=
[
"marlin"
,
"bitblas"
,
"gptq_marlin_24"
,
"gptq_marlin"
,
...
...
vllm/lora/layers.py
View file @
0cdbf5e6
...
...
@@ -48,9 +48,6 @@ def _get_lora_device(base_layer: nn.Module) -> torch.device:
# GPTQ/AWQ
elif
hasattr
(
base_layer
,
"qweight"
):
return
base_layer
.
qweight
.
device
# marlin
elif
hasattr
(
base_layer
,
"B"
):
return
base_layer
.
B
.
device
# HQQ marlin
elif
hasattr
(
base_layer
,
"W_q"
):
return
base_layer
.
W_q
.
device
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment