Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
06c4873d
Unverified
Commit
06c4873d
authored
Nov 13, 2025
by
Jane (Yuan) Xu
Committed by
GitHub
Nov 14, 2025
Browse files
Rewrite C++ meta funcs to Python (#28595)
Signed-off-by:
Jane Xu
<
janeyx@meta.com
>
parent
d3387750
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
38 additions
and
33 deletions
+38
-33
csrc/quantization/gptq_marlin/awq_marlin_repack.cu
csrc/quantization/gptq_marlin/awq_marlin_repack.cu
+0
-16
csrc/quantization/gptq_marlin/gptq_marlin_repack.cu
csrc/quantization/gptq_marlin/gptq_marlin_repack.cu
+0
-16
vllm/_custom_ops.py
vllm/_custom_ops.py
+38
-1
No files found.
csrc/quantization/gptq_marlin/awq_marlin_repack.cu
View file @
06c4873d
...
@@ -247,22 +247,6 @@ torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,
...
@@ -247,22 +247,6 @@ torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,
return
out
;
return
out
;
}
}
torch
::
Tensor
awq_marlin_repack_meta
(
torch
::
Tensor
&
b_q_weight
,
c10
::
SymInt
size_k
,
c10
::
SymInt
size_n
,
int64_t
num_bits
)
{
int
const
pack_factor
=
32
/
num_bits
;
auto
options
=
torch
::
TensorOptions
()
.
dtype
(
b_q_weight
.
dtype
())
.
device
(
b_q_weight
.
device
());
return
torch
::
empty_symint
(
{
size_k
/
marlin
::
tile_size
,
size_n
*
marlin
::
tile_size
/
pack_factor
},
options
);
}
TORCH_LIBRARY_IMPL_EXPAND
(
TORCH_EXTENSION_NAME
,
CUDA
,
m
)
{
TORCH_LIBRARY_IMPL_EXPAND
(
TORCH_EXTENSION_NAME
,
CUDA
,
m
)
{
m
.
impl
(
"awq_marlin_repack"
,
&
awq_marlin_repack
);
m
.
impl
(
"awq_marlin_repack"
,
&
awq_marlin_repack
);
}
}
TORCH_LIBRARY_IMPL_EXPAND
(
TORCH_EXTENSION_NAME
,
Meta
,
m
)
{
m
.
impl
(
"awq_marlin_repack"
,
&
awq_marlin_repack_meta
);
}
csrc/quantization/gptq_marlin/gptq_marlin_repack.cu
View file @
06c4873d
...
@@ -321,22 +321,6 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
...
@@ -321,22 +321,6 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
return
out
;
return
out
;
}
}
torch
::
Tensor
gptq_marlin_repack_meta
(
torch
::
Tensor
&
b_q_weight
,
torch
::
Tensor
&
perm
,
c10
::
SymInt
size_k
,
c10
::
SymInt
size_n
,
int64_t
num_bits
)
{
int
const
pack_factor
=
32
/
num_bits
;
auto
options
=
torch
::
TensorOptions
()
.
dtype
(
b_q_weight
.
dtype
())
.
device
(
b_q_weight
.
device
());
return
torch
::
empty_symint
(
{
size_k
/
marlin
::
tile_size
,
size_n
*
marlin
::
tile_size
/
pack_factor
},
options
);
}
TORCH_LIBRARY_IMPL_EXPAND
(
TORCH_EXTENSION_NAME
,
CUDA
,
m
)
{
TORCH_LIBRARY_IMPL_EXPAND
(
TORCH_EXTENSION_NAME
,
CUDA
,
m
)
{
m
.
impl
(
"gptq_marlin_repack"
,
&
gptq_marlin_repack
);
m
.
impl
(
"gptq_marlin_repack"
,
&
gptq_marlin_repack
);
}
}
TORCH_LIBRARY_IMPL_EXPAND
(
TORCH_EXTENSION_NAME
,
Meta
,
m
)
{
m
.
impl
(
"gptq_marlin_repack"
,
&
gptq_marlin_repack_meta
);
}
vllm/_custom_ops.py
View file @
06c4873d
...
@@ -1174,13 +1174,50 @@ def gptq_marlin_repack(
...
@@ -1174,13 +1174,50 @@ def gptq_marlin_repack(
return
torch
.
ops
.
_C
.
gptq_marlin_repack
(
b_q_weight
,
perm
,
size_k
,
size_n
,
num_bits
)
return
torch
.
ops
.
_C
.
gptq_marlin_repack
(
b_q_weight
,
perm
,
size_k
,
size_n
,
num_bits
)
# gptq_marlin
if
hasattr
(
torch
.
ops
.
_C
,
"gptq_marlin_repack"
):
@
register_fake
(
"_C::gptq_marlin_repack"
)
def
_gptq_marlin_repack_fake
(
b_q_weight
:
torch
.
Tensor
,
perm
:
torch
.
Tensor
,
size_k
:
torch
.
SymInt
,
size_n
:
torch
.
SymInt
,
num_bits
:
int
,
)
->
torch
.
Tensor
:
pack_factor
=
32
//
num_bits
marlin_tile_size
=
16
return
torch
.
empty
(
(
size_k
//
marlin_tile_size
,
size_n
*
marlin_tile_size
//
pack_factor
),
dtype
=
b_q_weight
.
dtype
,
device
=
b_q_weight
.
device
,
)
# awq_marlin
def
awq_marlin_repack
(
def
awq_marlin_repack
(
b_q_weight
:
torch
.
Tensor
,
size_k
:
int
,
size_n
:
int
,
num_bits
:
int
b_q_weight
:
torch
.
Tensor
,
size_k
:
int
,
size_n
:
int
,
num_bits
:
int
)
->
torch
.
Tensor
:
)
->
torch
.
Tensor
:
return
torch
.
ops
.
_C
.
awq_marlin_repack
(
b_q_weight
,
size_k
,
size_n
,
num_bits
)
return
torch
.
ops
.
_C
.
awq_marlin_repack
(
b_q_weight
,
size_k
,
size_n
,
num_bits
)
if
hasattr
(
torch
.
ops
.
_C
,
"awq_marlin_repack"
):
@
register_fake
(
"_C::awq_marlin_repack"
)
def
_awq_marlin_repack_fake
(
b_q_weight
:
torch
.
Tensor
,
size_k
:
torch
.
SymInt
,
size_n
:
torch
.
SymInt
,
num_bits
:
int
,
)
->
torch
.
Tensor
:
pack_factor
=
32
//
num_bits
marlin_tile_size
=
16
return
torch
.
empty
(
(
size_k
//
marlin_tile_size
,
size_n
*
marlin_tile_size
//
pack_factor
),
dtype
=
b_q_weight
.
dtype
,
device
=
b_q_weight
.
device
,
)
def
gptq_marlin_moe_repack
(
def
gptq_marlin_moe_repack
(
b_q_weight
:
torch
.
Tensor
,
b_q_weight
:
torch
.
Tensor
,
perm
:
torch
.
Tensor
,
perm
:
torch
.
Tensor
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment