Merge branch 'v0.5.0-dtk24.04.1'

7462218e · zhuwenwen · 6ccd3f47 · 1cec5e62 · 7462218e · 7462218e
Commit 7462218e authored Sep 05, 2024 by zhuwenwen
20 changed files
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -11,6 +11,7 @@ from vllm.distributed import (divide, get_tensor_model_parallel_rank,
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.utils import set_weight_attrs
+import vllm.envs as envs
 class SiluAndMul(CustomOp):
@@ -34,6 +35,9 @@ class SiluAndMul(CustomOp):
        d = x.shape[-1] // 2
        output_shape = (x.shape[:-1] + (d, ))
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        if envs.VLLM_USE_OPT_OP:
+            ops.silu_and_mul_opt(out, x)
+        else:
            ops.silu_and_mul(out, x)
        return out
@@ -66,8 +70,14 @@ class GeluAndMul(CustomOp):
        output_shape = (x.shape[:-1] + (d, ))
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
        if self.approximate == "none":
+            if envs.VLLM_USE_OPT_OP:
+                ops.gelu_and_mul_opt(out, x)
+            else:
                ops.gelu_and_mul(out, x)
        elif self.approximate == "tanh":
+            if envs.VLLM_USE_OPT_OP:
+                ops.gelu_tanh_and_mul_opt(out, x)
+            else:
                ops.gelu_tanh_and_mul(out, x)
        return out

--- a/vllm/model_executor/layers/fused_moe/__init__.py
+++ b/vllm/model_executor/layers/fused_moe/__init__.py
 from vllm.model_executor.layers.fused_moe.fused_moe import (
-    fused_experts, fused_moe, fused_topk, get_config_file_name)
+    fused_experts, fused_moe, fused_topk, get_config_file_name, grouped_topk)
 __all__ = [
    "fused_moe",
    "fused_topk",
    "fused_experts",
    "get_config_file_name",
+    "grouped_topk",
 ]
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
--- a/vllm/model_executor/layers/layernorm.py
+++ b/vllm/model_executor/layers/layernorm.py
@@ -5,6 +5,7 @@ import torch
 import torch.nn as nn
 from vllm.model_executor.custom_op import CustomOp
+import vllm.envs as envs
 class RMSNorm(CustomOp):
@@ -51,6 +52,14 @@ class RMSNorm(CustomOp):
        from vllm import _custom_ops as ops
        if residual is not None:
+            if envs.VLLM_USE_OPT_OP:
+                ops.fused_add_rms_norm_opt(
+                    x,
+                    residual,
+                    self.weight.data,
+                    self.variance_epsilon,
+                )
+            else:
                ops.fused_add_rms_norm(
                    x,
                    residual,
@@ -59,6 +68,14 @@ class RMSNorm(CustomOp):
                )
            return x, residual
        out = torch.empty_like(x)
+        if envs.VLLM_USE_OPT_OP:
+            ops.rms_norm_opt(
+                out,
+                x,
+                self.weight.data,
+                self.variance_epsilon,
+            )
+        else:
            ops.rms_norm(
                out,
                x,

--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
--- a/vllm/model_executor/layers/ops/rand.py
+++ b/vllm/model_executor/layers/ops/rand.py
@@ -3,6 +3,7 @@ from typing import Optional, Union
 import torch
 import triton
 import triton.language as tl
+from vllm.utils import is_hip
 def seeded_uniform(
@@ -69,8 +70,14 @@ def seeded_uniform(
    # Manual tuning. This seems to give best performance on A100 for
    # simple kernels like this.
    if philox_block_size >= 8192:
+        if is_hip():
+            num_warps = 16
+        else:
            num_warps = 32
    elif philox_block_size >= 4096:
+        if is_hip():
+            num_warps = 8
+        else:
            num_warps = 16
    elif philox_block_size >= 2048:
        num_warps = 8

--- a/vllm/model_executor/layers/ops/sample.py
+++ b/vllm/model_executor/layers/ops/sample.py
--- a/vllm/model_executor/layers/quantization/awq.py
+++ b/vllm/model_executor/layers/quantization/awq.py
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -274,7 +274,7 @@ class DefaultModelLoader(BaseModelLoader):
            for _, module in model.named_modules():
                quant_method = getattr(module, "quant_method", None)
-                if quant_method is not None:
+                if quant_method is not None and quant_method!="awq" and quant_method!="gptq":
                    quant_method.process_weights_after_loading(module)
                # FIXME: Remove this after Mixtral is updated
                # to use quant_method.

--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
--- a/vllm/model_executor/models/baichuan.py
+++ b/vllm/model_executor/models/baichuan.py
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
--- a/vllm/model_executor/utils.py
+++ b/vllm/model_executor/utils.py
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py