Commit 7462218e authored by zhuwenwen
Browse files

Merge branch 'v0.5.0-dtk24.04.1'

parents 6ccd3f47 1cec5e62
...@@ -11,6 +11,7 @@ from vllm.distributed import (divide, get_tensor_model_parallel_rank, ...@@ -11,6 +11,7 @@ from vllm.distributed import (divide, get_tensor_model_parallel_rank,
from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.custom_op import CustomOp
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.utils import set_weight_attrs from vllm.model_executor.utils import set_weight_attrs
import vllm.envs as envs
class SiluAndMul(CustomOp): class SiluAndMul(CustomOp):
...@@ -34,6 +35,9 @@ class SiluAndMul(CustomOp): ...@@ -34,6 +35,9 @@ class SiluAndMul(CustomOp):
d = x.shape[-1] // 2 d = x.shape[-1] // 2
output_shape = (x.shape[:-1] + (d, )) output_shape = (x.shape[:-1] + (d, ))
out = torch.empty(output_shape, dtype=x.dtype, device=x.device) out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
if envs.VLLM_USE_OPT_OP:
ops.silu_and_mul_opt(out, x)
else:
ops.silu_and_mul(out, x) ops.silu_and_mul(out, x)
return out return out
...@@ -66,8 +70,14 @@ class GeluAndMul(CustomOp): ...@@ -66,8 +70,14 @@ class GeluAndMul(CustomOp):
output_shape = (x.shape[:-1] + (d, )) output_shape = (x.shape[:-1] + (d, ))
out = torch.empty(output_shape, dtype=x.dtype, device=x.device) out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
if self.approximate == "none": if self.approximate == "none":
if envs.VLLM_USE_OPT_OP:
ops.gelu_and_mul_opt(out, x)
else:
ops.gelu_and_mul(out, x) ops.gelu_and_mul(out, x)
elif self.approximate == "tanh": elif self.approximate == "tanh":
if envs.VLLM_USE_OPT_OP:
ops.gelu_tanh_and_mul_opt(out, x)
else:
ops.gelu_tanh_and_mul(out, x) ops.gelu_tanh_and_mul(out, x)
return out return out
......
from vllm.model_executor.layers.fused_moe.fused_moe import ( from vllm.model_executor.layers.fused_moe.fused_moe import (
fused_experts, fused_moe, fused_topk, get_config_file_name) fused_experts, fused_moe, fused_topk, get_config_file_name, grouped_topk)
__all__ = [ __all__ = [
"fused_moe", "fused_moe",
"fused_topk", "fused_topk",
"fused_experts", "fused_experts",
"get_config_file_name", "get_config_file_name",
"grouped_topk",
] ]
...@@ -5,6 +5,7 @@ import torch ...@@ -5,6 +5,7 @@ import torch
import torch.nn as nn import torch.nn as nn
from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.custom_op import CustomOp
import vllm.envs as envs
class RMSNorm(CustomOp): class RMSNorm(CustomOp):
...@@ -51,6 +52,14 @@ class RMSNorm(CustomOp): ...@@ -51,6 +52,14 @@ class RMSNorm(CustomOp):
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
if residual is not None: if residual is not None:
if envs.VLLM_USE_OPT_OP:
ops.fused_add_rms_norm_opt(
x,
residual,
self.weight.data,
self.variance_epsilon,
)
else:
ops.fused_add_rms_norm( ops.fused_add_rms_norm(
x, x,
residual, residual,
...@@ -59,6 +68,14 @@ class RMSNorm(CustomOp): ...@@ -59,6 +68,14 @@ class RMSNorm(CustomOp):
) )
return x, residual return x, residual
out = torch.empty_like(x) out = torch.empty_like(x)
if envs.VLLM_USE_OPT_OP:
ops.rms_norm_opt(
out,
x,
self.weight.data,
self.variance_epsilon,
)
else:
ops.rms_norm( ops.rms_norm(
out, out,
x, x,
......
This diff is collapsed.
...@@ -3,6 +3,7 @@ from typing import Optional, Union ...@@ -3,6 +3,7 @@ from typing import Optional, Union
import torch import torch
import triton import triton
import triton.language as tl import triton.language as tl
from vllm.utils import is_hip
def seeded_uniform( def seeded_uniform(
...@@ -69,8 +70,14 @@ def seeded_uniform( ...@@ -69,8 +70,14 @@ def seeded_uniform(
# Manual tuning. This seems to give best performance on A100 for # Manual tuning. This seems to give best performance on A100 for
# simple kernels like this. # simple kernels like this.
if philox_block_size >= 8192: if philox_block_size >= 8192:
if is_hip():
num_warps = 16
else:
num_warps = 32 num_warps = 32
elif philox_block_size >= 4096: elif philox_block_size >= 4096:
if is_hip():
num_warps = 8
else:
num_warps = 16 num_warps = 16
elif philox_block_size >= 2048: elif philox_block_size >= 2048:
num_warps = 8 num_warps = 8
......
This diff is collapsed.
...@@ -274,7 +274,7 @@ class DefaultModelLoader(BaseModelLoader): ...@@ -274,7 +274,7 @@ class DefaultModelLoader(BaseModelLoader):
for _, module in model.named_modules(): for _, module in model.named_modules():
quant_method = getattr(module, "quant_method", None) quant_method = getattr(module, "quant_method", None)
if quant_method is not None: if quant_method is not None and quant_method!="awq" and quant_method!="gptq":
quant_method.process_weights_after_loading(module) quant_method.process_weights_after_loading(module)
# FIXME: Remove this after Mixtral is updated # FIXME: Remove this after Mixtral is updated
# to use quant_method. # to use quant_method.
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment