merge v0.5.0

f48954a4 · zhuwenwen · 1dba29d3 · 8f89d720 · f48954a4 · f48954a4
Commit f48954a4 authored Jun 12, 2024 by zhuwenwen
20 changed files
--- a/vllm/model_executor/guided_decoding/outlines_decoding.py
+++ b/vllm/model_executor/guided_decoding/outlines_decoding.py
 import asyncio
 import concurrent.futures
-from copy import copy
 from enum import Enum
-from functools import lru_cache
 from json import dumps as json_dumps
 from re import escape as regex_escape
 from typing import Tuple, Union
@@ -54,8 +52,10 @@ global_thread_pool = None  # used for generating logits processor fsm
 async def get_outlines_guided_decoding_logits_processor(
-        request: Union[CompletionRequest, ChatCompletionRequest],
+    request: Union[CompletionRequest,
-        tokenizer) -> Union[JSONLogitsProcessor, RegexLogitsProcessor, None]:
+                   ChatCompletionRequest], tokenizer: PreTrainedTokenizerBase
+) -> Union[JSONLogitsProcessor, RegexLogitsProcessor, CFGLogitsProcessor,
+           None]:
    """
    Given an OpenAI-compatible request, check for guided decoding parameters
    and get the necessary logits processor for the given guide.
@@ -64,7 +64,7 @@ async def get_outlines_guided_decoding_logits_processor(
    """
    global global_thread_pool
    guide, mode = _get_guide_and_mode(request)
-    if not guide:
+    if not guide or not mode:
        return None
    if global_thread_pool is None:
@@ -72,15 +72,9 @@ async def get_outlines_guided_decoding_logits_processor(
            max_workers=2)
    loop = asyncio.get_running_loop()
-    result = await loop.run_in_executor(global_thread_pool,
+    return await loop.run_in_executor(global_thread_pool,
-                                        _get_cached_logits_processor, guide,
+                                      _get_logits_processor, guide, tokenizer,
-                                        tokenizer, mode,
+                                      mode, request.guided_whitespace_pattern)
-                                        request.guided_whitespace_pattern)
-    logits_processor = copy(result)
-    # reset logits processor's internal state
-    logits_processor.init_state()
-    return logits_processor
 def _get_guide_and_mode(
@@ -115,11 +109,10 @@ def _get_guide_and_mode(
        return None, None
-@lru_cache(maxsize=32)
+def _get_logits_processor(
-def _get_cached_logits_processor(guide: str,
+    guide: str, tokenizer: PreTrainedTokenizerBase, mode: GuidedDecodingMode,
-                                 tokenizer: PreTrainedTokenizerBase,
+    whitespace_pattern: Union[str, None]
-                                 mode: GuidedDecodingMode,
+) -> Union[JSONLogitsProcessor, RegexLogitsProcessor, CFGLogitsProcessor]:
-                                 whitespace_pattern: Union[str, None]):
    if mode == GuidedDecodingMode.JSON:
        return JSONLogitsProcessor(guide, tokenizer, whitespace_pattern)
    elif mode == GuidedDecodingMode.REGEX or mode == GuidedDecodingMode.CHOICE:

--- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py
+++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
@@ -21,7 +21,7 @@ from functools import lru_cache
 from typing import Callable, DefaultDict, Dict, List, Union
 import torch
-from outlines.fsm.fsm import CFGFSM, FSM, RegexFSM
+from outlines.fsm.guide import CFGGuide, Generate, Guide, RegexGuide, Write
 from outlines.fsm.json_schema import build_regex_from_schema
 from pydantic import BaseModel
 from transformers import PreTrainedTokenizerBase
@@ -29,28 +29,32 @@ from transformers import PreTrainedTokenizerBase
 class BaseLogitsProcessor:
-    def __init__(self):
+    def __init__(self, guide: Guide):
-        # Child class should use initialize in their init.
+        self._guide: Guide = guide
-        self.fsm: FSM
+        self._fsm_state: DefaultDict[int, int] = defaultdict(int)
-    def init_state(self):
-        """Initialize the FSM states."""
-        self.fsm_state: DefaultDict[int, int] = defaultdict(int)
    def __call__(self, input_ids: List[int],
                 scores: torch.Tensor) -> torch.Tensor:
        """Use the FSM to bias the logits before sampling the next token."""
        seq_id = hash(tuple(input_ids))
-        if len(input_ids) == 0:
+        if len(input_ids) > 0:
-            self.init_state()
-        else:
            last_token = input_ids[-1]
            last_seq_id = hash(tuple(input_ids[:-1]))
-            self.fsm_state[seq_id] = self.fsm.next_state(
+            self._fsm_state[seq_id] = self._guide.get_next_state(
-                self.fsm_state[last_seq_id], last_token)
+                state=self._fsm_state[last_seq_id], token_id=last_token)
+        instruction = self._guide.get_next_instruction(
+            state=self._fsm_state[seq_id])
-        allowed_tokens = self.fsm.allowed_token_ids(self.fsm_state[seq_id])
+        if type(instruction) == Generate:
+            allowed_tokens = instruction.tokens
+        elif type(instruction) == Write:
+            # TODO: support fast forward tokens
+            allowed_tokens = [instruction.tokens[0]]
+        else:
+            raise TypeError(
+                f"Unsupported instruction type {type(instruction)}")
        mask = torch.full((scores.shape[-1], ),
                          -math.inf,
@@ -62,6 +66,13 @@ class BaseLogitsProcessor:
 class RegexLogitsProcessor(BaseLogitsProcessor):
+    @classmethod
+    @lru_cache(maxsize=32)
+    def _get_guide(cls, regex_string: str,
+                   tokenizer: PreTrainedTokenizerBase) -> Guide:
+        tokenizer = _adapt_tokenizer(tokenizer)
+        return RegexGuide(regex_string, tokenizer)
    def __init__(self, regex_string: str, tokenizer: PreTrainedTokenizerBase):
        """Compile the FSM that drives the regex-structured generation.
@@ -73,9 +84,8 @@ class RegexLogitsProcessor(BaseLogitsProcessor):
            The model's tokenizer
        """
-        tokenizer = _adapt_tokenizer(tokenizer)
+        super().__init__(
-        fsm = RegexFSM(regex_string, tokenizer)
+            RegexLogitsProcessor._get_guide(regex_string, tokenizer))
-        self.fsm = fsm
 class JSONLogitsProcessor(RegexLogitsProcessor):
@@ -115,6 +125,12 @@ class JSONLogitsProcessor(RegexLogitsProcessor):
 class CFGLogitsProcessor(BaseLogitsProcessor):
+    @classmethod
+    @lru_cache(maxsize=32)
+    def _get_guide(cls, cfg: str, tokenizer: PreTrainedTokenizerBase) -> Guide:
+        tokenizer = _adapt_tokenizer(tokenizer)
+        return CFGGuide(cfg, tokenizer)
    def __init__(self, cfg: str, tokenizer: PreTrainedTokenizerBase):
        """Compile the FSM that drives the context free grammar generation.
@@ -126,17 +142,11 @@ class CFGLogitsProcessor(BaseLogitsProcessor):
            The model's tokenizer
        """
-        tokenizer = _adapt_tokenizer(tokenizer)
+        super().__init__(CFGLogitsProcessor._get_guide(cfg, tokenizer))
-        fsm = CFGFSM(cfg, tokenizer)
+        self._guide = self._guide.copy()
-        self.fsm = fsm
-    def init_state(self):
-        """Initialize state with a CFGFSM copy."""
-        super().init_state()
-        self.fsm = self.fsm.copy()
-@lru_cache
+@lru_cache(maxsize=32)
 def _adapt_tokenizer(tokenizer: PreTrainedTokenizerBase):
    """Adapt vLLM's tokenizer to use to compile the FSM.

--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -6,14 +6,14 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from vllm import _custom_ops as ops
 from vllm.distributed import (divide, get_tensor_model_parallel_rank,
                              get_tensor_model_parallel_world_size)
+from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.utils import set_weight_attrs
-class SiluAndMul(nn.Module):
+class SiluAndMul(CustomOp):
    """An activation function for SwiGLU.
    The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2.
@@ -23,12 +23,14 @@ class SiluAndMul(nn.Module):
        return: (num_tokens, d) or (batch_size, seq_len, d)
    """
-    def _forward(self, x: torch.Tensor) -> torch.Tensor:
+    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        d = x.shape[-1] // 2
        return F.silu(x[..., :d]) * x[..., d:]
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
+    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
+        from vllm import _custom_ops as ops
        d = x.shape[-1] // 2
        output_shape = (x.shape[:-1] + (d, ))
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
@@ -36,7 +38,7 @@ class SiluAndMul(nn.Module):
        return out
-class GeluAndMul(nn.Module):
+class GeluAndMul(CustomOp):
    """An activation function for GeGLU.
    The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2.
@@ -52,12 +54,14 @@ class GeluAndMul(nn.Module):
        if approximate not in ("none", "tanh"):
            raise ValueError(f"Unknown approximate mode: {approximate}")
-    def _forward(self, x: torch.Tensor) -> torch.Tensor:
+    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        d = x.shape[-1] // 2
        return F.gelu(x[..., :d], approximate=self.approximate) * x[..., d:]
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
+    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
+        from vllm import _custom_ops as ops
        d = x.shape[-1] // 2
        output_shape = (x.shape[:-1] + (d, ))
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
@@ -71,28 +75,32 @@ class GeluAndMul(nn.Module):
        return f'approximate={repr(self.approximate)}'
-class NewGELU(nn.Module):
+class NewGELU(CustomOp):
-    def _forward(self, x: torch.Tensor) -> torch.Tensor:
+    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        c = math.sqrt(2.0 / math.pi)
        return 0.5 * x * (1.0 + torch.tanh(c *
                                           (x + 0.044715 * torch.pow(x, 3.0))))
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
+    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
+        from vllm import _custom_ops as ops
        out = torch.empty_like(x)
        ops.gelu_new(out, x)
        return out
-class FastGELU(nn.Module):
+class FastGELU(CustomOp):
-    def _forward(self, x: torch.Tensor) -> torch.Tensor:
+    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 *
                                           (1.0 + 0.044715 * x * x)))
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
+    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
+        from vllm import _custom_ops as ops
        out = torch.empty_like(x)
        ops.gelu_fast(out, x)
        return out

--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
 {
    "1": {
-        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
+        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
    },
    "2": {
-        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
+        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
    },
    "4": {
-        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
+        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
    },
    "8": {
        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 256,
+        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 32,
-        "num_warps": 8,
+        "num_warps": 4,
-        "num_stages": 5
+        "num_stages": 4
    },
    "16": {
        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 64,
+        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 16,
+        "GROUP_SIZE_M": 1,
        "num_warps": 4,
-        "num_stages": 5
+        "num_stages": 3
    },
    "24": {
-        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_M": 64,
        "BLOCK_SIZE_N": 64,
        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 64,
-        "num_warps": 8,
+        "num_warps": 4,
-        "num_stages": 5
+        "num_stages": 3
    },
    "32": {
        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_N": 64,
        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 16,
+        "GROUP_SIZE_M": 64,
-        "num_warps": 8,
+        "num_warps": 4,
-        "num_stages": 4
+        "num_stages": 3
    },
    "48": {
        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_N": 64,
        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 64,
+        "GROUP_SIZE_M": 16,
-        "num_warps": 8,
+        "num_warps": 4,
        "num_stages": 3
    },
    "64": {
        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 64,
+        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 64,
+        "GROUP_SIZE_M": 1,
        "num_warps": 4,
-        "num_stages": 4
+        "num_stages": 3
    },
    "96": {
-        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_M": 64,
        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 256,
+        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 32,
+        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
+        "num_warps": 4,
-        "num_stages": 2
+        "num_stages": 3
    },
    "128": {
-        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_M": 64,
        "BLOCK_SIZE_N": 64,
        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 32,
+        "GROUP_SIZE_M": 1,
        "num_warps": 4,
        "num_stages": 3
    },
    "256": {
-        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_N": 128,
        "BLOCK_SIZE_K": 128,
        "GROUP_SIZE_M": 16,
        "num_warps": 4,
-        "num_stages": 5
+        "num_stages": 3
    },
    "512": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 256,
+        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 64,
+        "GROUP_SIZE_M": 16,
-        "num_warps": 4,
+        "num_warps": 8,
-        "num_stages": 2
+        "num_stages": 4
    },
    "1024": {
        "BLOCK_SIZE_M": 128,
        "BLOCK_SIZE_N": 256,
        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 64,
+        "GROUP_SIZE_M": 32,
        "num_warps": 8,
        "num_stages": 4
    },
@@ -109,7 +115,7 @@
        "BLOCK_SIZE_M": 128,
        "BLOCK_SIZE_N": 256,
        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 32,
+        "GROUP_SIZE_M": 16,
        "num_warps": 8,
        "num_stages": 4
    },
@@ -125,7 +131,7 @@
        "BLOCK_SIZE_M": 128,
        "BLOCK_SIZE_N": 256,
        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 32,
+        "GROUP_SIZE_M": 16,
        "num_warps": 8,
        "num_stages": 4
    },

--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
 {
    "1": {
-        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
+        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 64,
        "num_warps": 4,
        "num_stages": 4
    },
    "2": {
-        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_M": 64,
        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
+        "BLOCK_SIZE_K": 256,
        "GROUP_SIZE_M": 1,
        "num_warps": 4,
-        "num_stages": 4
+        "num_stages": 5
    },
    "4": {
        "BLOCK_SIZE_M": 64,
        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 64,
+        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 64,
+        "GROUP_SIZE_M": 1,
        "num_warps": 4,
-        "num_stages": 4
+        "num_stages": 5
    },
    "8": {
        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 256,
+        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 64,
+        "GROUP_SIZE_M": 32,
-        "num_warps": 8,
+        "num_warps": 4,
        "num_stages": 4
    },
    "16": {
        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_N": 128,
        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 32,
-        "num_warps": 8,
+        "num_warps": 4,
-        "num_stages": 4
+        "num_stages": 5
    },
    "24": {
        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
+        "BLOCK_SIZE_K": 256,
        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
+        "num_warps": 4,
-        "num_stages": 4
+        "num_stages": 3
    },
    "32": {
        "BLOCK_SIZE_M": 64,
        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
+        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 16,
+        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
+        "num_warps": 4,
        "num_stages": 4
    },
    "48": {
        "BLOCK_SIZE_M": 64,
        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
+        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 32,
+        "GROUP_SIZE_M": 1,
        "num_warps": 4,
        "num_stages": 4
    },
    "64": {
        "BLOCK_SIZE_M": 64,
        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
+        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 16,
+        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
+        "num_warps": 4,
        "num_stages": 4
    },
    "96": {
        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_N": 128,
        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 32,
+        "GROUP_SIZE_M": 1,
        "num_warps": 4,
        "num_stages": 4
    },
    "128": {
        "BLOCK_SIZE_M": 64,
        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
+        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 16,
+        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
+        "num_warps": 4,
        "num_stages": 4
    },
    "256": {
-        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_N": 128,
        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 64,
-        "num_warps": 8,
+        "num_warps": 4,
-        "num_stages": 4
+        "num_stages": 3
    },
    "512": {
        "BLOCK_SIZE_M": 128,
        "BLOCK_SIZE_N": 256,
        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 16,
+        "GROUP_SIZE_M": 64,
        "num_warps": 8,
        "num_stages": 4
    },
@@ -107,7 +107,7 @@
        "BLOCK_SIZE_M": 128,
        "BLOCK_SIZE_N": 256,
        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 64,
+        "GROUP_SIZE_M": 32,
        "num_warps": 8,
        "num_stages": 4
    },
@@ -115,7 +115,7 @@
        "BLOCK_SIZE_M": 128,
        "BLOCK_SIZE_N": 256,
        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 32,
+        "GROUP_SIZE_M": 64,
        "num_warps": 8,
        "num_stages": 4
    },
@@ -139,7 +139,7 @@
        "BLOCK_SIZE_M": 128,
        "BLOCK_SIZE_N": 256,
        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 64,
+        "GROUP_SIZE_M": 16,
        "num_warps": 8,
        "num_stages": 4
    }

--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -10,7 +10,6 @@ import triton.language as tl
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
-from vllm.utils import is_hip
 logger = init_logger(__name__)
@@ -308,6 +307,30 @@ def get_moe_configs(E: int, N: int,
    return None
+def get_default_config(
+    M: int,
+    E: int,
+    N: int,
+    K: int,
+    topk: int,
+    dtype: Optional[str],
+) -> Dict[str, int]:
+    """Return the fallback fused-MoE kernel tile configuration.
+    Only ``M`` and ``E`` influence the result; ``N``, ``K``, ``topk`` and
+    ``dtype`` are accepted but not consulted by this implementation.
+    Args:
+        M: Compared against ``E`` to pick the small or the regular tiling.
+        E: Threshold for ``M`` (small tiling is chosen when ``M <= E``).
+        N: Unused.
+        K: Unused.
+        topk: Unused.
+        dtype: Unused.
+    Returns:
+        A freshly built dict with the ``BLOCK_SIZE_M``, ``BLOCK_SIZE_N``,
+        ``BLOCK_SIZE_K`` and ``GROUP_SIZE_M`` entries.
+    """
+    # Small-M case: use the narrow tiles with no grouping along M.
+    if M <= E:
+        return {
+            'BLOCK_SIZE_M': 16,
+            'BLOCK_SIZE_N': 32,
+            'BLOCK_SIZE_K': 64,
+            'GROUP_SIZE_M': 1
+        }
+    # General case.
+    return {
+        'BLOCK_SIZE_M': 64,
+        'BLOCK_SIZE_N': 64,
+        'BLOCK_SIZE_K': 32,
+        'GROUP_SIZE_M': 8
+    }
 def fused_topk(
    hidden_states: torch.Tensor,
    gating_output: torch.Tensor,
@@ -319,34 +342,26 @@ def fused_topk(
    M, _ = hidden_states.shape
-    if is_hip():
+    topk_weights = torch.empty(M,
-        # The MoE kernels are not yet supported on ROCm.
-        routing_weights = torch.softmax(gating_output,
-                                        dim=-1,
-                                        dtype=torch.float32)
-        topk_weights, topk_ids = torch.topk(routing_weights, topk, dim=-1)
-    else:
-        import vllm._moe_C as moe_kernels
-        topk_weights = torch.empty(M,
-                                   topk,
-                                   dtype=torch.float32,
-                                   device=hidden_states.device)
-        topk_ids = torch.empty(M,
                               topk,
-                               dtype=torch.int32,
+                               dtype=torch.float32,
                               device=hidden_states.device)
-        token_expert_indicies = torch.empty(M,
+    topk_ids = torch.empty(M,
-                                            topk,
+                           topk,
-                                            dtype=torch.int32,
+                           dtype=torch.int32,
-                                            device=hidden_states.device)
+                           device=hidden_states.device)
-        moe_kernels.topk_softmax(
+    token_expert_indicies = torch.empty(M,
-            topk_weights,
+                                        topk,
-            topk_ids,
+                                        dtype=torch.int32,
-            token_expert_indicies,
+                                        device=hidden_states.device)
-            gating_output.float(),  # TODO(woosuk): Optimize this.
+    ops.topk_softmax(
-        )
+        topk_weights,
-        del token_expert_indicies  # Not used. Will be used in the future.
+        topk_ids,
+        token_expert_indicies,
+        gating_output.float(),  # TODO(woosuk): Optimize this.
+    )
+    del token_expert_indicies  # Not used. Will be used in the future.
    if renormalize:
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
    return topk_weights, topk_ids
@@ -390,20 +405,9 @@ def fused_experts(hidden_states: torch.Tensor,
            config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
        else:
            # Else use the default config
-            config = {
+            config = get_default_config(M, E, N, w1.shape[2],
-                'BLOCK_SIZE_M': 64,
+                                        topk_ids.shape[1],
-                'BLOCK_SIZE_N': 64,
+                                        "float8" if use_fp8 else None)
-                'BLOCK_SIZE_K': 32,
-                'GROUP_SIZE_M': 8
-            }
-            if M <= E:
-                config = {
-                    'BLOCK_SIZE_M': 16,
-                    'BLOCK_SIZE_N': 32,
-                    'BLOCK_SIZE_K': 64,
-                    'GROUP_SIZE_M': 1
-                }
    intermediate_cache1 = torch.empty((M, topk_ids.shape[1], N),
                                      device=hidden_states.device,

--- a/vllm/model_executor/layers/layernorm.py
+++ b/vllm/model_executor/layers/layernorm.py
@@ -4,10 +4,10 @@ from typing import Optional, Tuple, Union
 import torch
 import torch.nn as nn
-from vllm import _custom_ops as ops
+from vllm.model_executor.custom_op import CustomOp
-class RMSNorm(nn.Module):
+class RMSNorm(CustomOp):
    """Root mean square normalization.
    Computes x -> w * x / sqrt(E[x^2] + eps) where w is the learned weight.
@@ -23,7 +23,7 @@ class RMSNorm(nn.Module):
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps
-    def _forward(
+    def forward_native(
        self,
        x: torch.Tensor,
        residual: Optional[torch.Tensor] = None,
@@ -43,11 +43,13 @@ class RMSNorm(nn.Module):
        else:
            return x, residual
-    def forward(
+    def forward_cuda(
        self,
        x: torch.Tensor,
        residual: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        from vllm import _custom_ops as ops
        if residual is not None:
            ops.fused_add_rms_norm(
                x,

--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
 from abc import abstractmethod
-from typing import List, Optional
+from typing import Dict, List, Optional, Tuple
 import torch
 import torch.nn.functional as F
@@ -29,6 +29,21 @@ def adjust_marlin_shard(param, shard_size, shard_offset):
    return shard_size * marlin_tile_size, shard_offset * marlin_tile_size
+def adjust_bitsandbytes_shard(param: Parameter,
+                              qkv_offsets: Dict[str, Tuple[int, int]],
+                              loaded_shard_id: str) -> Tuple[int, int]:
+    """Rescale a shard's (offset, size) into the quantized parameter's rows.
+    Args:
+        param: Quantized parameter; only ``param.data.shape[0]`` is read.
+        qkv_offsets: Maps each shard id to an ``(offset, size)`` pair in the
+            original layout, plus a ``"total"`` entry whose first element is
+            the original total size.
+        loaded_shard_id: Key into ``qkv_offsets`` for the shard being loaded.
+    Returns:
+        ``(quantized_size, quantized_offset)`` -- note the size comes first.
+    """
+    unquantized_total, _unused = qkv_offsets["total"]
+    unquantized_offset, unquantized_size = qkv_offsets[loaded_shard_id]
+    packed_rows = param.data.shape[0]
+    # Multiply before the floor division, exactly as the offsets are scaled
+    # proportionally from the original extent to the packed extent.
+    scaled_offset = unquantized_offset * packed_rows // unquantized_total
+    scaled_size = unquantized_size * packed_rows // unquantized_total
+    return scaled_size, scaled_offset
 class LinearMethodBase(QuantizeMethodBase):
    """Base class for different (maybe quantized) linear methods."""
@@ -40,7 +55,7 @@ class LinearMethodBase(QuantizeMethodBase):
                       **extra_weight_attrs):
        """Create weights for a linear layer. 
           The weights will be set as attributes of the layer.
        Args:
            layer: The layer that is using the LinearMethodBase factory.
            input_size_per_partition: Size of the weight input dim on rank X.
@@ -433,6 +448,12 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
                # Special case for Marlin.
                shard_size, shard_offset = adjust_marlin_shard(
                    param, shard_size, shard_offset)
+            use_bitsandbytes = getattr(param, "use_bitsandbytes", False)
+            if use_bitsandbytes:
+                shard_size = loaded_weight.shape[output_dim]
+                shard_offset = loaded_weight.shape[output_dim] * \
+                    loaded_shard_id
            if self.use_llama_nn:
                param_data_ = param_data.narrow(output_dim, shard_offset,
@@ -440,6 +461,8 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
            else:
                param_data = param_data.narrow(output_dim, shard_offset,
                                            shard_size)
            start_idx = tp_rank * shard_size
            loaded_weight = loaded_weight.narrow(output_dim, start_idx,
                                                 shard_size)
@@ -645,12 +668,29 @@ class QKVParallelLinear(ColumnParallelLinear):
                shard_size, shard_offset = adjust_marlin_shard(
                    param, shard_size, shard_offset)
+            use_bitsandbytes = getattr(param, "use_bitsandbytes", False)
+            if use_bitsandbytes:
+                orig_qkv_offsets = {
+                    "q": (0, self.num_heads * self.head_size),
+                    "k": (self.num_heads * self.head_size,
+                          self.num_kv_heads * self.head_size),
+                    "v":
+                    ((self.num_heads + self.num_kv_heads) * self.head_size,
+                     self.num_kv_heads * self.head_size),
+                    "total":
+                    ((self.num_heads + 2 * self.num_kv_heads) * self.head_size,
+                     0)
+                }
+                shard_size, shard_offset = adjust_bitsandbytes_shard(
+                    param, orig_qkv_offsets, loaded_shard_id)
            if self.use_llama_nn:
                param_data_ = param_data.narrow(output_dim, shard_offset,
                                           shard_size)
            else:
                param_data = param_data.narrow(output_dim, shard_offset,
-                                           shard_size)
+                                            shard_size)
            if loaded_shard_id == "q":
                shard_id = tp_rank
            else:

--- a/vllm/model_executor/layers/logits_processor.py
+++ b/vllm/model_executor/layers/logits_processor.py
@@ -21,7 +21,7 @@ class LogitsProcessor(nn.Module):
    def __init__(self,
                 vocab_size: int,
                 org_vocab_size: Optional[int] = None,
-                 scale: Optional[float] = 1.0,
+                 scale: float = 1.0,
                 logits_as_input: bool = False) -> None:
        """
        Args:
@@ -52,7 +52,8 @@ class LogitsProcessor(nn.Module):
            logits = self._get_logits(hidden_states, embedding, embedding_bias)
        if logits is not None:
-            logits *= self.scale
+            if self.scale != 1.0:
+                logits *= self.scale
            # Apply logits processors (if any).
            logits = _apply_logits_processors(logits, sampling_metadata)

--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -4,6 +4,8 @@ from vllm.model_executor.layers.quantization.aqlm import AQLMConfig
 from vllm.model_executor.layers.quantization.awq import AWQConfig
 from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig)
+from vllm.model_executor.layers.quantization.bitsandbytes import (
+    BitsAndBytesConfig)
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
    CompressedTensorsConfig)
 from vllm.model_executor.layers.quantization.deepspeedfp import (
@@ -29,7 +31,8 @@ QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
    "gptq_marlin": GPTQMarlinConfig,
    "gptq": GPTQConfig,
    "squeezellm": SqueezeLLMConfig,
-    "sparseml": CompressedTensorsConfig,
+    "compressed-tensors": CompressedTensorsConfig,
+    "bitsandbytes": BitsAndBytesConfig,
 }

--- a/vllm/model_executor/layers/quantization/bitsandbytes.py
+++ b/vllm/model_executor/layers/quantization/bitsandbytes.py
+from typing import Any, Dict, List, Optional
+import torch
+from torch.nn.parameter import Parameter
+from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
+                                               set_weight_attrs)
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig)
+class BitsAndBytesConfig(QuantizationConfig):
+    """Config class for BitsAndBytes Quantization.
+    Reference: https://arxiv.org/abs/2305.14314
+    Stores the adapter location and the list of module names read from the
+    adapter configuration (``adapter_config.json``).
+    """
+    def __init__(
+        self,
+        adapter_name_or_path: str,
+        target_modules: List[str],
+    ) -> None:
+        self.adapter_name_or_path = adapter_name_or_path
+        self.target_modules = target_modules
+    def __repr__(self) -> str:
+        # Show every field and close the parenthesis so the repr is balanced.
+        return (f"BitsAndBytesConfig("
+                f"adapter_name_or_path={self.adapter_name_or_path}, "
+                f"target_modules={self.target_modules})")
+    @classmethod
+    def get_name(cls) -> str:
+        """Return the registry key of this quantization method."""
+        return "bitsandbytes"
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        """Return the activation dtypes this method accepts."""
+        return [torch.float32, torch.float16, torch.bfloat16]
+    @classmethod
+    def get_min_capability(cls) -> int:
+        """Return the minimum capability value required (70)."""
+        return 70
+    @staticmethod
+    def get_config_filenames() -> List[str]:
+        """Return the config file names searched for this method."""
+        return [
+            "adapter_config.json",
+        ]
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> "BitsAndBytesConfig":
+        """Build the config from a parsed adapter configuration dict.
+        When ``adapter_name_or_path`` is the empty string, a built-in list of
+        projection module names is used; otherwise ``target_modules`` is read
+        from ``config``.
+        """
+        adapter_name = cls.get_from_keys(config, ["adapter_name_or_path"])
+        if adapter_name == "":
+            # No adapter given: fall back to the default projection modules.
+            target_modules = [
+                "gate_proj", "down_proj", "up_proj", "q_proj", "k_proj",
+                "v_proj", "o_proj"
+            ]
+        else:
+            target_modules = cls.get_from_keys(config, ["target_modules"])
+        return cls(adapter_name, target_modules)
+    def get_quant_method(
+            self,
+            layer: torch.nn.Module) -> Optional["BitsAndBytesLinearMethod"]:
+        """Return a linear method for ``LinearBase`` layers, else ``None``."""
+        if isinstance(layer, LinearBase):
+            return BitsAndBytesLinearMethod(self)
+        return None
+    def get_scaled_act_names(self) -> List[str]:
+        """Return the activation names reported as scaled by this method."""
+        return ["gelu", "gelu_fast", "gelu_new", "gelu_pytorch_tanh"]
+class BitsAndBytesLinearMethod(LinearMethodBase):
+    """Linear method for BitsAndBytes.
+    Args:
+       quant_config: The BitsAndBytes quantization config.
+    """
+    # Oldest bitsandbytes release accepted by this linear method.
+    _MIN_BNB_VERSION = (0, 42, 0)
+    def __init__(self, quant_config: BitsAndBytesConfig):
+        try:
+            import bitsandbytes
+            # Compare numerically: a plain string comparison would order
+            # "0.100.0" before "0.42.0" and "0.5.0" after it.
+            if (self._version_tuple(bitsandbytes.__version__) <
+                    self._MIN_BNB_VERSION):
+                raise ImportError("bitsandbytes version is wrong. Please "
+                                  "install bitsandbytes>=0.42.0.")
+        except ImportError as err:
+            raise ImportError("Please install bitsandbytes>=0.42.0 via "
+                              "`pip install bitsandbytes>=0.42.0` to use "
+                              "bitsandbytes quantizer.") from err
+        self.quant_config = quant_config
+    @staticmethod
+    def _version_tuple(version: str) -> tuple:
+        """Parse the first three numeric components of a version string.
+        Non-digit suffixes (e.g. ``"0.dev1"``) are ignored and missing
+        components count as zero, so ``"0.42"`` becomes ``(0, 42, 0)``.
+        """
+        numbers = []
+        for piece in version.split(".")[:3]:
+            digits = ""
+            for ch in piece:
+                if ch not in "0123456789":
+                    break
+                digits += ch
+            numbers.append(int(digits) if digits else 0)
+        numbers += [0] * (3 - len(numbers))
+        return tuple(numbers)
+    def create_weights(self, layer: torch.nn.Module,
+                       input_size_per_partition: int,
+                       output_partition_sizes: List[int], input_size: int,
+                       output_size: int, params_dtype: torch.dtype,
+                       **extra_weight_attrs):
+        """Register the packed ``qweight`` parameter on ``layer``.
+        The parameter is a ``uint8`` tensor of shape
+        ``[input_size_per_partition * sum(output_partition_sizes) //
+        quant_ratio, 1]`` where ``quant_ratio`` is the bit width of
+        ``params_dtype`` divided by 8.
+        Raises:
+            ValueError: If the element count is not divisible by
+                ``quant_ratio``.
+        """
+        # Number of uint8 bytes per element of the unquantized dtype.
+        dtype_info = (torch.finfo(params_dtype) if
+                      params_dtype.is_floating_point else
+                      torch.iinfo(params_dtype))
+        quant_ratio = dtype_info.bits // torch.iinfo(torch.uint8).bits
+        total_elements = input_size_per_partition * sum(
+            output_partition_sizes)
+        if total_elements % quant_ratio != 0:
+            raise ValueError(
+                "The input size is not aligned with the quantized "
+                "weight shape. ")
+        qweight = Parameter(
+            torch.empty(
+                total_elements // quant_ratio,
+                1,
+                dtype=torch.uint8,
+            ),
+            requires_grad=False,
+        )
+        set_weight_attrs(
+            qweight,
+            {
+                "input_dim": 0,
+                # In bitsandbytes, a tensor of shape [n, m] is quantized to
+                # [n * m / pack_ratio, 1], so the output_dim is 0.
+                "output_dim": 0,
+                "pack_factor": quant_ratio,
+                "use_bitsandbytes": True,
+            })
+        layer.register_parameter("qweight", qweight)
+        set_weight_attrs(qweight, extra_weight_attrs)
+    def apply(self,
+              layer: torch.nn.Module,
+              x: torch.Tensor,
+              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """Multiply ``x`` by each quantized shard and concatenate the results.
+        The computation runs in ``bfloat16`` and the result is cast back to
+        the dtype of ``x`` before ``bias`` (if any) is added.
+        """
+        # only load the bitsandbytes module when needed
+        from bitsandbytes import matmul_4bit
+        original_type = x.dtype
+        bf_x = x.to(torch.bfloat16)
+        qweight = layer.qweight
+        # NOTE(review): bnb_quant_state / bnb_shard_offsets are not set in
+        # this file; presumably the weight loader attaches them, with the
+        # quant states keyed by shard index 0..n-1 -- confirm.
+        quant_states = qweight.bnb_quant_state
+        offsets = qweight.bnb_shard_offsets
+        out_dim_0 = x.shape[0]
+        out_dim_1 = sum(state.shape[0] for state in quant_states.values())
+        out = torch.empty(out_dim_0,
+                          out_dim_1,
+                          dtype=torch.bfloat16,
+                          device=x.device)
+        current_index = 0
+        for i in range(len(quant_states)):
+            output_size = quant_states[i].shape[0]
+            # It is more efficient to use out kwarg like
+            # matmul_4bit(..., out = ...).  Infeasible now due to the bug
+            # https://github.com/TimDettmers/bitsandbytes/issues/1235.
+            # Need to change  after the bug is fixed.
+            out[:, current_index:current_index + output_size] = matmul_4bit(
+                bf_x, qweight[offsets[i]:offsets[i + 1]].t(), quant_states[i])
+            current_index += output_size
+        out = out.to(original_type)
+        if bias is not None:
+            out += bias
+        return out
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
 from typing import Any, Dict, List, Optional
 import torch
+from pydantic import BaseModel
 from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
 from vllm.model_executor.layers.quantization.base_config import (  # noqa: E501
    QuantizationConfig)
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
-    CompressedTensorsScheme, CompressedTensorsW8A8StaticTensor)
+    CompressedTensorsScheme, CompressedTensorsW8A8DynamicToken,
+    CompressedTensorsW8A8StaticTensor)
+from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
+    QuantizationArgs, QuantizationStrategy, find_first_name_or_class_match)
 class CompressedTensorsConfig(QuantizationConfig):
@@ -47,10 +51,12 @@ class CompressedTensorsConfig(QuantizationConfig):
            targets = quant_config.get("targets")
            for target in targets:
                layer_quant_details[target] = {}
-                layer_quant_details[target]["weight"] = quant_config.get(
+                layer_quant_details[target][
-                    "weights")
+                    "weight"] = QuantizationArgs.parse_obj(
-                layer_quant_details[target]["input"] = quant_config.get(
+                        quant_config.get("weights"))
-                    "input_activations")
+                layer_quant_details[target][
+                    "input"] = QuantizationArgs.parse_obj(
+                        quant_config.get("input_activations"))
        return cls(layer_quant_details=layer_quant_details, ignore=ignore)
@@ -58,40 +64,46 @@ class CompressedTensorsConfig(QuantizationConfig):
    def get_config_filenames(cls) -> List[str]:
        return []
-    def _get_schema(self, weight_quant: Dict, input_quant: Dict):
+    def _is_static_tensor_w8a8(self, weight_quant: BaseModel,
-        # TODO: Refactor as additional cases are supported
+                               input_quant: BaseModel) -> bool:
+        is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
-        weight_bit = weight_quant.get("num_bits")
+        is_tensor = (weight_quant.strategy == input_quant.strategy ==
-        input_bit = input_quant.get("num_bits")
+                     QuantizationStrategy.TENSOR.value)
+        is_symmetric = weight_quant.symmetric and input_quant.symmetric
-        weight_strategy = weight_quant.get("strategy")
+        is_static = not weight_quant.dynamic and not input_quant.dynamic
-        input_strategy = input_quant.get("strategy")
+        return is_8_bits and is_tensor and is_symmetric and is_static
-        weight_symmetric = weight_quant.get("symmetric")
-        input_symmetric = input_quant.get("symmetric")
+    def _is_dynamic_token_w8a8(self, weight_quant: BaseModel,
+                               input_quant: BaseModel) -> bool:
+        """Check for 8-bit symmetric static per-tensor weights combined with
+        8-bit symmetric dynamic per-token input activations."""
+        both_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
+        # Weights must use the per-tensor strategy, inputs the per-token one.
+        tensor_weight_token_input = (
+            weight_quant.strategy == QuantizationStrategy.TENSOR.value
+        ) and (input_quant.strategy == QuantizationStrategy.TOKEN.value)
+        both_symmetric = weight_quant.symmetric and input_quant.symmetric
+        # Only the input side is quantized dynamically.
+        dynamic_input_only = not weight_quant.dynamic and input_quant.dynamic
+        return (both_8_bits and tensor_weight_token_input and both_symmetric
+                and dynamic_input_only)
+    def _get_schema(self, weight_quant: BaseModel,
+                    input_quant: BaseModel) -> "CompressedTensorsScheme":
+        if self._is_static_tensor_w8a8(weight_quant, input_quant):
+            return CompressedTensorsW8A8StaticTensor()
-        is_8_bits = weight_bit == input_bit == 8
+        if self._is_dynamic_token_w8a8(weight_quant, input_quant):
-        is_tensor = weight_strategy == input_strategy == "tensor"
+            return CompressedTensorsW8A8DynamicToken()
-        is_symmetric = weight_symmetric and input_symmetric
-        if is_8_bits and is_tensor and is_symmetric and \
+        raise NotImplementedError("Scheme not supported.")
-                torch.cuda.is_available():
-            # CompressedTensorsW8A8StaticTensor only supports CUDA path for
-            # now.
-            return CompressedTensorsW8A8StaticTensor()
-        raise NotImplementedError(
-            "Scheme not supported. Only CUDA, 8-bit static symmtetric "
-            "per tensor quantization is currently supported")
    def get_scheme(self, layer: torch.nn.Module) -> "CompressedTensorsScheme":
-        # TODO: update with matching function from `compressed_tensors`
+        layer_type_name = find_first_name_or_class_match(
-        layer_type_name = None
+            name="",
-        layer_name_class = type(layer).__name__.lower()
+            module=layer,
-        for target in self.layer_quant_details:
+            targets=self.layer_quant_details.keys(),
-            if target.lower() in layer_name_class:
+            check_contains=True)
-                layer_type_name = target
-                break
        if layer_type_name is None:
            raise ValueError(f"Could not matching target for layer {layer}")
@@ -117,7 +129,9 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
                       **extra_weight_attrs):
        """
        Use the CompressedTensorsScheme associated with each layer to create 
-        the necessary parameters for the layer.
+        the necessary parameters for the layer. See LinearMethodBase for param
+        details
        """
        weight_loader = extra_weight_attrs.get("weight_loader")
@@ -139,7 +153,8 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
        """
        Use the output of create_weights and the CompressedTensorsScheme 
        associated with the layer to apply the forward pass with the 
-        layer input.
+        layer input.  See LinearMethodBase for param details
        """
        if bias is not None:

--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
 from .compressed_tensors_scheme import CompressedTensorsScheme  # noqa: F401
 from .compressed_tensors_unquantized import (  # noqa: F401
    CompressedTensorsUnquantized)
+from .compressed_tensors_w8a8_dynamictoken import (  # noqa: F401, E501
+    CompressedTensorsW8A8DynamicToken)
 from .compressed_tensors_w8a8_statictensor import (  # noqa: F401, E501
    CompressedTensorsW8A8StaticTensor)
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
+from typing import Callable, List, Tuple, Union
+import torch
+from torch.nn import Parameter
+from vllm import _custom_ops as custom_ops
+from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
+    CompressedTensorsScheme)
+from vllm.model_executor.utils import set_weight_attrs
+__all__ = ["CompressedTensorsW8A8DynamicToken"]
+class CompressedTensorsW8A8DynamicToken(CompressedTensorsScheme):
+    """W8A8 scheme: int8 weights with stored scales, inputs quantized by
+    ``scaled_int8_quant`` at call time."""
+    def _shard_id_as_int(self, shard_id: Union[str, int]) -> int:
+        """Map a ``"q"``/``"k"``/``"v"`` shard id to 0/1/2; pass ints through."""
+        if not isinstance(shard_id, int):
+            assert isinstance(shard_id, str)
+            qkv_order = {"q": 0, "k": 1, "v": 2}
+            assert shard_id in qkv_order
+            shard_id = qkv_order[shard_id]
+        return shard_id
+    def scales_shard_splitter(
+            self, param: torch.Tensor, loaded_weight: torch.Tensor,
+            shard_id: Union[str, int],
+            logical_widths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Select the slice of ``param`` owned by ``shard_id`` and expand
+        ``loaded_weight`` to match it.
+        Returns:
+            ``(param[start:start + width], loaded_weight.repeat(width))``
+            where ``width`` is the shard's logical width and ``start`` is the
+            sum of the widths of all preceding shards.
+        """
+        index = self._shard_id_as_int(shard_id)
+        start = sum(logical_widths[:index])
+        width = logical_widths[index]
+        # Repeat the loaded scale so it fills every slot of the shard.
+        broadcast_weight = loaded_weight.repeat(width)
+        return param[start:start + width], broadcast_weight
+    def create_weights(self, layer: torch.nn.Module,
+                       output_partition_sizes: List[int],
+                       input_size_per_partition: int,
+                       params_dtype: torch.dtype, weight_loader: Callable,
+                       **kwargs):
+        """Register ``weight``, ``weight_scale`` and ``weight_zero_point`` on
+        ``layer`` and attach the loader metadata to each of them."""
+        total_output_size = sum(output_partition_sizes)
+        # A layer fused from several logical shards keeps one scale slot per
+        # output row; a single-shard layer keeps a single scale value.
+        has_multiple_shards = len(output_partition_sizes) != 1
+        scale_length = total_output_size if has_multiple_shards else 1
+        weight = Parameter(torch.empty(total_output_size,
+                                       input_size_per_partition,
+                                       dtype=torch.int8),
+                           requires_grad=False)
+        weight_scale = Parameter(torch.empty(scale_length,
+                                             dtype=torch.float32),
+                                 requires_grad=False)
+        weight_zero_point = Parameter(torch.empty(1, dtype=torch.int8),
+                                      requires_grad=False)
+        layer.register_parameter("weight", weight)
+        set_weight_attrs(
+            weight, {
+                "input_dim": 1,
+                "output_dim": 0,
+                "weight_loader": weight_loader,
+                "logical_widths": output_partition_sizes
+            })
+        layer.register_parameter("weight_scale", weight_scale)
+        set_weight_attrs(
+            weight_scale, {
+                "weight_loader": weight_loader,
+                "shard_splitter": self.scales_shard_splitter,
+                "logical_widths": output_partition_sizes
+            })
+        layer.register_parameter("weight_zero_point", weight_zero_point)
+        set_weight_attrs(weight_zero_point, {"weight_loader": weight_loader})
+    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor):
+        """Quantize ``x`` to int8 and run the scaled matmul against the
+        transposed layer weight, producing output in ``x.dtype``."""
+        # NOTE(review): calling scaled_int8_quant without a scale presumably
+        # selects dynamic per-token quantization -- confirm in _custom_ops.
+        quantized_x, activation_scales = custom_ops.scaled_int8_quant(x)
+        return custom_ops.cutlass_scaled_mm_dq(quantized_x, layer.weight.t(),
+                                               activation_scales,
+                                               layer.weight_scale, x.dtype)
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py
@@ -41,46 +41,19 @@ class CompressedTensorsW8A8StaticTensor(CompressedTensorsScheme):
        # TODO: remove zero_point parameters once the configs given remove them
-        # Note on input/weight scales and zero_points
-        #
-        # When the scales have a single value, it is required that they be
-        # on the CPU for 2 reasons,
-        # 1. Performance:
-        #   When the scales (input_scale/weight_scales) have only a single
-        #   value, we perform a scalar broadcast of that value during the
-        #   quant/dequant operations. The "quant" and the "gemm+dequant"
-        #   kernels accept the Scalar by-value. These tensors are allocated
-        #   on the CPU in order to avoid the GPU-to-CPU copy when passing
-        #   by-value.
-        #
-        # 2. CUDA Graphs:
-        #   CUDA Graphs don't support GPU-to-CPU copy operations during
-        #   stream capture.
-        #
-        # TODO: zero-points are not supported yet. But we expect a similar
-        # pattern.
        is_tensor_partitioned = len(output_partition_sizes) != 1
        weight_scale_dim = sum(
            output_partition_sizes) if is_tensor_partitioned else 1
-        weight_scale_device = "cpu" if weight_scale_dim == 1 else "cuda"
-        input_scale = Parameter(torch.empty(1,
+        input_scale = Parameter(torch.empty(1, dtype=torch.float32),
-                                            device="cpu",
-                                            dtype=torch.float32),
                                requires_grad=False)
-        input_zero_point = Parameter(torch.empty(1,
+        input_zero_point = Parameter(torch.empty(1, dtype=torch.int8),
-                                                 device="cpu",
-                                                 dtype=torch.int8),
                                     requires_grad=False)
        weight_scale = Parameter(torch.empty(weight_scale_dim,
-                                             device=weight_scale_device,
                                             dtype=torch.float32),
                                 requires_grad=False)
-        weight_zero_point = Parameter(torch.empty(1,
+        weight_zero_point = Parameter(torch.empty(1, dtype=torch.int8),
-                                                  device="cpu",
-                                                  dtype=torch.int8),
                                      requires_grad=False)
        weight = Parameter(torch.empty(sum(output_partition_sizes),
@@ -124,7 +97,7 @@ class CompressedTensorsW8A8StaticTensor(CompressedTensorsScheme):
        act_scale = layer.input_scale
        # Input quantize
-        x_q = custom_ops.static_scaled_int8_quant(x, act_scale[0].item())
+        x_q, _ = custom_ops.scaled_int8_quant(x, act_scale)
        return custom_ops.cutlass_scaled_mm_dq(x_q, weight.t(), act_scale,
                                               weight_scale, x.dtype)
--- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
+import re
+from enum import Enum
+from typing import Any, Dict, Iterable, Optional
+from pydantic import BaseModel, Field
+from torch.nn import Module
+class QuantizationType(str, Enum):
+    """
+    Enum storing quantization type options.
+    The class also derives from `str`, so every member compares equal to
+    its plain string value (for example "int").
+    """
+    # Used as the default for QuantizationArgs.type.
+    INT = "int"
+    FLOAT = "float"
+class QuantizationStrategy(str, Enum):
+    """
+    Enum storing quantization strategy options.
+    Per the QuantizationArgs docstring, the strategy determines the scope
+    of the scale/zero-point to apply. The class also derives from `str`,
+    so every member compares equal to its plain string value.
+    """
+    TENSOR = "tensor"
+    CHANNEL = "channel"
+    # QuantizationArgs.group_size supplies the group length for this one.
+    GROUP = "group"
+    # QuantizationArgs.block_structure (e.g. "2x4") describes this one.
+    BLOCK = "block"
+    TOKEN = "token"
+class QuantizationArgs(BaseModel):
+    """
+    User facing arguments used to define a quantization config 
+    for weights or activations
+    :param num_bits: quantization bit depth
+    :param type: dtype to quantized to, either int or float
+    :param symmetric: whether or not quantization scale is symmetric
+    :param strategy: string determining the scope of scale/zero-point to apply
+    :param group_size: group length to use for the group strategy
+    :param block_structure: 2d block structure to use for the block 
+    strategy, must be of the format "2x4", "8x16", etc.
+    :param dynamic: set True to perform dynamic quantization -
+        values will not be calibrated during calibration phase, 
+        instead during inference new quantization ranges will be 
+        observed with every sample. Defaults to False for static
+        quantization. Note that enabling dynamic quantization 
+        will change the default observer to a memoryless one
+    """
+    # Bit depth of the quantized values; 8 unless the config says otherwise.
+    num_bits: int = 8
+    # Integer vs. float quantization. The field name shadows the builtin
+    # `type` inside this class body only.
+    type: QuantizationType = QuantizationType.INT
+    symmetric: bool = True
+    # Only meaningful together with the group strategy (see docstring).
+    group_size: Optional[int] = None
+    # None means the config did not specify a strategy.
+    strategy: Optional[QuantizationStrategy] = None
+    # String such as "2x4"; only meaningful together with the block strategy.
+    block_structure: Optional[str] = None
+    # False selects static quantization, True dynamic (see docstring).
+    dynamic: bool = False
+    # Not covered by the docstring above: names the class used to compute
+    # the scale and zero-point, defaulting to "minmax".
+    # NOTE(review): the description text ends with a stray "'" character;
+    # it is runtime-visible text, so it is left untouched here.
+    observer: str = Field(
+        default="minmax",
+        description=("The class to use to compute the quantization param - "
+                     "scale and zero-point'"),
+    )
+    # Not covered by the docstring above: extra keyword arguments for the
+    # observer. `default_factory=dict` gives every instance its own dict
+    # instead of one shared mutable default.
+    observer_kwargs: Dict[str, Any] = Field(
+        default_factory=dict,
+        description=
+        ("optional dict of kwargs to be passed directly to torch quantization "
+         "Observers constructor excluding quantization range or symmetry"),
+    )
+def find_first_name_or_class_match(
+        name: str,
+        module: Module,
+        targets: Iterable[str],
+        check_contains: bool = False) -> Optional[str]:
+    """
+    Pick the configured target that applies to a given model layer.
+    The layer name is tried first and only ever matches exactly or through
+    a 're:' regex target. When the name yields nothing, the layer's class
+    name is tried, and only that second attempt honours `check_contains`.
+    `targets` may be walked twice, so pass a re-iterable collection rather
+    than a one-shot iterator.
+    :param name: layer name
+    :param module: torch.nn.Module whose class name is the fallback key
+    :param targets: list of targets to match the layer against
+    :param check_contains: whether or not to do a substring match on the
+        class name
+    :return: the first matching target, or None when nothing matches
+    """
+    matched_by_name = _find_first_match(name, targets)
+    if matched_by_name:
+        return matched_by_name
+    # Fall back to the class name; substring matching applies only here.
+    class_name = module.__class__.__name__
+    return _find_first_match(class_name, targets, check_contains)
+def _find_first_match(value: str,
+                      targets: Iterable[str],
+                      check_contains: bool = False) -> Optional[str]:
+    """
+    Return the first entry of `targets` that matches `value`, else None.
+    A target prefixed with 're:' is treated as a regex (the prefix is
+    stripped) and tested with `re.match`, i.e. anchored at the start of
+    `value`. Any other target must equal `value`, unless `check_contains`
+    is True, in which case a case-insensitive substring test is used
+    instead of equality.
+    :param value: string to compare the list of targets against
+    :param targets: list of targets to match the layer against
+    :param check_contains: whether or not to do a substring match
+    """
+    lowered_value = value.lower()
+    def _is_hit(candidate: str) -> bool:
+        # Regex targets are decided by the regex alone; they never fall
+        # through to the substring or equality tests.
+        if candidate.startswith("re:"):
+            return re.match(candidate[3:], value) is not None
+        if check_contains:
+            return candidate.lower() in lowered_value
+        return candidate == value
+    return next((candidate for candidate in targets if _is_hit(candidate)),
+                None)