Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f09d77ac
Commit
f09d77ac
authored
May 25, 2024
by
zhuwenwen
Browse files
skip fp8
parent
145787ae
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
24 additions
and
25 deletions
+24
-25
csrc/ops.h
csrc/ops.h
+9
-9
csrc/pybind.cpp
csrc/pybind.cpp
+2
-2
vllm/_custom_ops.py
vllm/_custom_ops.py
+11
-11
vllm/model_executor/model_loader/loader.py
vllm/model_executor/model_loader/loader.py
+2
-3
No files found.
csrc/ops.h
View file @
f09d77ac
...
...
@@ -166,15 +166,15 @@ void gptq_shuffle(
torch
::
Tensor
q_perm
,
int
bit
);
void
static_scaled_fp8_quant
(
torch
::
Tensor
&
out
,
torch
::
Tensor
&
input
,
torch
::
Tensor
&
scale
);
void
dynamic_scaled_fp8_quant
(
torch
::
Tensor
&
out
,
torch
::
Tensor
&
input
,
torch
::
Tensor
&
scale
);
//
void static_scaled_fp8_quant(
//
torch::Tensor& out,
//
torch::Tensor& input,
//
torch::Tensor& scale);
//
void dynamic_scaled_fp8_quant(
//
torch::Tensor& out,
//
torch::Tensor& input,
//
torch::Tensor& scale);
void
moe_align_block_size
(
torch
::
Tensor
topk_ids
,
...
...
csrc/pybind.cpp
View file @
f09d77ac
...
...
@@ -75,8 +75,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
ops
.
def
(
"gptq_gemm"
,
&
gptq_gemm
,
"Quantized GEMM for GPTQ"
);
ops
.
def
(
"gptq_shuffle"
,
&
gptq_shuffle
,
"Post processing for GPTQ"
);
ops
.
def
(
"squeezellm_gemm"
,
&
squeezellm_gemm
,
"Quantized GEMM for SqueezeLLM"
);
ops
.
def
(
"static_scaled_fp8_quant"
,
&
static_scaled_fp8_quant
,
"Compute FP8 quantized tensor for given scaling factor"
);
ops
.
def
(
"dynamic_scaled_fp8_quant"
,
&
dynamic_scaled_fp8_quant
,
"Compute FP8 quantized tensor and scaling factor"
);
//
ops.def("static_scaled_fp8_quant", &static_scaled_fp8_quant, "Compute FP8 quantized tensor for given scaling factor");
//
ops.def("dynamic_scaled_fp8_quant", &dynamic_scaled_fp8_quant, "Compute FP8 quantized tensor and scaling factor");
ops
.
def
(
"moe_align_block_size"
,
&
moe_align_block_size
,
...
...
vllm/_custom_ops.py
View file @
f09d77ac
...
...
@@ -186,17 +186,17 @@ def gptq_marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
# fp8
def
scaled_fp8_quant
(
input
:
torch
.
Tensor
,
scale
:
Optional
[
torch
.
Tensor
]
=
None
,
)
->
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
output
=
torch
.
empty_like
(
input
,
dtype
=
torch
.
float8_e4m3fn
)
if
scale
is
None
:
scale
=
torch
.
zeros
(
1
,
device
=
input
.
device
,
dtype
=
torch
.
float32
)
vllm_ops
.
dynamic_scaled_fp8_quant
(
output
,
input
,
scale
)
else
:
vllm_ops
.
static_scaled_fp8_quant
(
output
,
input
,
scale
)
return
output
,
scale
#
def scaled_fp8_quant(
#
input: torch.Tensor,
#
scale: Optional[torch.Tensor] = None,
#
) -> Tuple[torch.Tensor, torch.Tensor]:
#
output = torch.empty_like(input, dtype=torch.float8_e4m3fn)
#
if scale is None:
#
scale = torch.zeros(1, device=input.device, dtype=torch.float32)
#
vllm_ops.dynamic_scaled_fp8_quant(output, input, scale)
#
else:
#
vllm_ops.static_scaled_fp8_quant(output, input, scale)
#
return output, scale
# moe
...
...
vllm/model_executor/model_loader/loader.py
View file @
f09d77ac
...
...
@@ -54,10 +54,9 @@ def _get_quantization_config(
f
"
{
model_config
.
dtype
}
is not supported for quantization "
f
"method
{
model_config
.
quantization
}
. Supported dtypes: "
f
"
{
supported_dtypes
}
"
)
if
quant_config
!=
None
:
os
.
environ
[
'LLAMA_NN'
]
=
'0'
return
quant_config
if
quant_config
!=
None
:
os
.
environ
[
'LLAMA_NN'
]
=
'0'
return
None
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment