"examples/offline_inference/vision_language_pooling.py" did not exist on "d9fc8cd9da4a69cb4171efb7cb5a46308680c83c"
Commit f09d77ac authored by zhuwenwen's avatar zhuwenwen
Browse files

skip fp8

parent 145787ae
......@@ -166,15 +166,15 @@ void gptq_shuffle(
torch::Tensor q_perm,
int bit);
// Compute the FP8-quantized tensor for a given (caller-supplied) scaling factor.
// NOTE(review): presumably `out` receives the quantized values and `scale` is
// only read here -- confirm against the kernel definition.
void static_scaled_fp8_quant(
torch::Tensor& out,
torch::Tensor& input,
torch::Tensor& scale);
// Compute the FP8-quantized tensor and its scaling factor.
// NOTE(review): presumably both `out` and `scale` are written by the kernel
// (the scale being derived from `input`) -- confirm against the kernel definition.
void dynamic_scaled_fp8_quant(
torch::Tensor& out,
torch::Tensor& input,
torch::Tensor& scale);
// void static_scaled_fp8_quant(
// torch::Tensor& out,
// torch::Tensor& input,
// torch::Tensor& scale);
// void dynamic_scaled_fp8_quant(
// torch::Tensor& out,
// torch::Tensor& input,
// torch::Tensor& scale);
void moe_align_block_size(
torch::Tensor topk_ids,
......
......@@ -75,8 +75,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
ops.def("gptq_gemm", &gptq_gemm, "Quantized GEMM for GPTQ");
ops.def("gptq_shuffle", &gptq_shuffle, "Post processing for GPTQ");
ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM");
ops.def("static_scaled_fp8_quant", &static_scaled_fp8_quant, "Compute FP8 quantized tensor for given scaling factor");
ops.def("dynamic_scaled_fp8_quant", &dynamic_scaled_fp8_quant, "Compute FP8 quantized tensor and scaling factor");
// ops.def("static_scaled_fp8_quant", &static_scaled_fp8_quant, "Compute FP8 quantized tensor for given scaling factor");
// ops.def("dynamic_scaled_fp8_quant", &dynamic_scaled_fp8_quant, "Compute FP8 quantized tensor and scaling factor");
ops.def(
"moe_align_block_size",
&moe_align_block_size,
......
......@@ -186,17 +186,17 @@ def gptq_marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
# fp8
def scaled_fp8_quant(
    input: torch.Tensor,
    scale: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Quantize ``input`` to FP8 (``torch.float8_e4m3fn``) via the custom ops.

    Args:
        input: Tensor to quantize. The result buffer is allocated with the
            same shape and device via ``torch.empty_like``.
        scale: Optional scaling factor. When provided, the static op is
            called with it. When omitted, a one-element float32 zero tensor
            is created on ``input``'s device and passed to the dynamic op.

    Returns:
        A ``(quantized, scale)`` tuple: the FP8 buffer handed to the op and
        the scale tensor that was used (the caller's tensor in the static
        case, the freshly created one in the dynamic case).
    """
    quantized = torch.empty_like(input, dtype=torch.float8_e4m3fn)

    # Static path: the caller already knows the scaling factor.
    if scale is not None:
        vllm_ops.static_scaled_fp8_quant(quantized, input, scale)
        return quantized, scale

    # Dynamic path: hand the op a scratch scale tensor.
    # NOTE(review): presumably the kernel fills this tensor in place with the
    # computed scale -- confirm against the op implementation.
    dynamic_scale = torch.zeros(1, device=input.device, dtype=torch.float32)
    vllm_ops.dynamic_scaled_fp8_quant(quantized, input, dynamic_scale)
    return quantized, dynamic_scale
# def scaled_fp8_quant(
# input: torch.Tensor,
# scale: Optional[torch.Tensor] = None,
# ) -> Tuple[torch.Tensor, torch.Tensor]:
# output = torch.empty_like(input, dtype=torch.float8_e4m3fn)
# if scale is None:
# scale = torch.zeros(1, device=input.device, dtype=torch.float32)
# vllm_ops.dynamic_scaled_fp8_quant(output, input, scale)
# else:
# vllm_ops.static_scaled_fp8_quant(output, input, scale)
# return output, scale
# moe
......
......@@ -54,10 +54,9 @@ def _get_quantization_config(
f"{model_config.dtype} is not supported for quantization "
f"method {model_config.quantization}. Supported dtypes: "
f"{supported_dtypes}")
return quant_config
if quant_config != None:
os.environ['LLAMA_NN'] = '0'
return quant_config
return None
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment