Commit 25cee581 authored by Atream

add balance-serve, support concurrency

parent 8d0292aa
@@ -211,11 +211,11 @@ class KTransformersInterface(TransformersInterface):
        chunk_start = 0
        while chunk_start < input_ids_length:
-            chunk_end = min(chunk_start + self.args.chunk_prefill_size, input_ids_length)
+            chunk_end = min(chunk_start + self.args.chunk_size, input_ids_length)
            if self.cache != None:
                self.cache.cur_idx=cache_position[chunk_start:chunk_end]
            logits = chunk_prefill(input_ids[:, chunk_start:chunk_end], cache_position[chunk_start:chunk_end])
-            chunk_start += self.args.chunk_prefill_size
+            chunk_start += self.args.chunk_size
            if flashinfer_enabled:
                MLAWrapperSingleton.reset_buffer()
...
'''
Date: 2024-11-07 07:30:16
LastEditors: djw
LastEditTime: 2024-11-15 14:23:26
'''
import math
from dataclasses import dataclass
from typing import Optional
import torch
import torch.nn as nn
from torch import Tensor
from torch.nn import functional as F
import yaml
import json
class ModelConfig:
vocab_size: int = 32000
n_layer: int = 1
n_head: int = 32
dim: int = 4096
intermediate_size: int = 18944
n_local_heads: int = 8
head_dim: int = 128
rope_base: float = 1000000.0
norm_eps: float = 1e-06
rope_scaling: Optional[dict] = None
rms_norm_eps: float = 1e-6
hidden_act: str = "silu"
model_path: str
gguf_path: str
optimize_rule_path: str
speculative_rule_path: str
# quantize config
quant_algorithm: Optional[str] = None
quant_group_size: Optional[int] = None
quant_num_bits: Optional[int] = None
json_key_map = {
"vocab_size": "vocab_size",
"n_layer": "num_hidden_layers",
"n_head": "num_attention_heads",
"dim": "hidden_size",
"intermediate_size": "intermediate_size",
"n_local_heads": "num_key_value_heads",
"rope_base": "rope_theta",
"norm_eps": "norm_eps",
"rms_norm_eps": "rms_norm_eps",
"hidden_act": "hidden_act",
}
def __init__(self, config):
self.model_path = config["model"]["model_path"]
self.gguf_path = config["model"]["gguf_path"]
self.optimize_rule_path = config["model"]["optimize_rule_path"]
if "speculative_rule_path" in config["model"]:
self.speculative_rule_path = config["model"]["speculative_rule_path"]
self.speculative_gguf_path = config["model"]["speculative_gguf_path"]
self.speculative_model_path = config["model"]["speculative_model_path"]
self.quant_algorithm = config["model"]["quant"]["algorithm"]
self.quant_group_size = config["model"]["quant"]["group_size"]
self.quant_num_bits = config["model"]["quant"]["num_bits"]
self.load_config()
self.n_layer = config["model"]["n_layers"]
def load_config(self):
config_file = f"{self.model_path}/config.json"
try:
with open(config_file, "r") as f:
config_data = json.load(f)
except FileNotFoundError:
raise FileNotFoundError(f"Configuration file not found at {config_file}")
for attr, json_key in self.json_key_map.items():
if json_key in config_data:
setattr(self, attr, config_data[json_key])
else:
setattr(self, attr, getattr(self, attr))
class ParallelConfig:
def __init__(
self,
config,
) -> None:
self.pipeline_parallel_size = config["parallel"]["pp"]
self.tensor_parallel_size = config["parallel"]["tp"]
self.disable_custom_all_reduce = config["parallel"]["disable_custom_all_reduce"]
self.world_size = self.pipeline_parallel_size * self.tensor_parallel_size
class AttnConfig:
page_size: int = 256
block_num: int = 32
max_batch_token : int = 256
max_batch_size: int = 32
def __init__(self, config):
self.page_size = config["attn"]["page_size"]
self.block_num = config["attn"]["block_num"]
self.max_batch_token = config["attn"]["max_batch_token"]
self.max_batch_size = config["attn"]["max_batch_size"]
class SamplerConfig():
# Batched sampling params
temperatures: float
is_all_greedy: bool
def __init__(self, config):
self.temperatures = config["sample"]["temperature"]
self.is_all_greedy = True
def load_yaml_config(file_path):
with open(file_path, "r") as f:
return yaml.safe_load(f)
class LLMConfig:
model_config: ModelConfig
parallel_config: ParallelConfig
attn_config: AttnConfig
sample_config: SamplerConfig
config_file: str
def __init__(self, config_file):
self.config_file = config_file
config = load_yaml_config(config_file)
self.model_config = ModelConfig(config)
self.parallel_config = ParallelConfig(config)
self.attn_config = AttnConfig(config)
self.sample_config = SamplerConfig(config)
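# Usage sketch (added for illustration; not part of the original file). It shows
# the YAML layout implied by the attribute accesses above; the paths and values
# are hypothetical placeholders. ModelConfig is skipped here because it also
# reads {model_path}/config.json from disk.
def _example_build_configs():
    config = {
        "model": {
            "model_path": "/path/to/model",             # hypothetical
            "gguf_path": "/path/to/model.gguf",         # hypothetical
            "optimize_rule_path": "/path/to/rule.yaml", # hypothetical
            "n_layers": 1,
            "quant": {"algorithm": None, "group_size": None, "num_bits": None},
        },
        "parallel": {"pp": 1, "tp": 2, "disable_custom_all_reduce": False},
        "attn": {"page_size": 256, "block_num": 32,
                 "max_batch_token": 256, "max_batch_size": 32},
        "sample": {"temperature": 0.0},
    }
    # These three classes only need the in-memory dict, so they run as-is.
    return ParallelConfig(config), AttnConfig(config), SamplerConfig(config)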
from .communication_op import *
from .parallel_state import *
from .utils import *
"""
Date: 2024-12-11 06:02:42
LastEditors: djw
LastEditTime: 2024-12-12 09:52:06
"""
from typing import Any, Dict, Optional, Union
import torch
import torch.distributed
from .parallel_state import get_tp_group
def tensor_model_parallel_all_reduce(input_: torch.Tensor, bsz_tensor: torch.Tensor, is_compute_bound=False, overlap=False) -> torch.Tensor:
"""All-reduce the input tensor across model parallel group."""
return get_tp_group().all_reduce(input_, bsz_tensor, is_compute_bound=is_compute_bound, overlap=overlap)
def tensor_model_parallel_all_gather(
input_: torch.Tensor, dim: int = -1
) -> torch.Tensor:
"""All-gather the input tensor across model parallel group."""
return get_tp_group().all_gather(input_, dim)
def tensor_model_parallel_gather(
input_: torch.Tensor, dst: int = 0, dim: int = -1
) -> Optional[torch.Tensor]:
"""Gather the input tensor across model parallel group."""
return get_tp_group().gather(input_, dst, dim)
def broadcast_tensor_dict(
tensor_dict: Optional[Dict[Any, Union[torch.Tensor, Any]]] = None, src: int = 0
):
if not torch.distributed.is_initialized():
return tensor_dict
return get_tp_group().broadcast_tensor_dict(tensor_dict, src)
"""This file is a pure Python wrapper for the cudart library.
It avoids the need to compile a separate shared library, and is
convenient for use when we just need to call a few functions.
"""
import ctypes
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
# this line makes it possible to directly load `libcudart.so` using `ctypes`
import torch # noqa
# === export types and functions from cudart to Python ===
# for the original cudart definition, please check
# https://docs.nvidia.com/cuda/cuda-runtime-api/index.html
cudaError_t = ctypes.c_int
cudaMemcpyKind = ctypes.c_int
class cudaIpcMemHandle_t(ctypes.Structure):
_fields_ = [("internal", ctypes.c_byte * 128)]
@dataclass
class Function:
name: str
restype: Any
argtypes: List[Any]
def find_loaded_library(lib_name) -> Optional[str]:
"""
    According to https://man7.org/linux/man-pages/man5/proc_pid_maps.5.html,
    the file `/proc/self/maps` contains the memory maps of the process, which includes the
    shared libraries loaded by the process. We can use this file to find the path of
    a loaded library.
""" # noqa
found = False
with open("/proc/self/maps") as f:
for line in f:
if lib_name in line:
found = True
break
if not found:
# the library is not loaded in the current process
return None
# if lib_name is libcudart, we need to match a line with:
# address /path/to/libcudart-hash.so.11.0
start = line.index("/")
path = line[start:].strip()
filename = path.split("/")[-1]
assert filename.rpartition(".so")[0].startswith(lib_name), \
f"Unexpected filename: {filename} for library {lib_name}"
return path
class CudaRTLibrary:
exported_functions = [
# ​cudaError_t cudaSetDevice ( int device )
Function("cudaSetDevice", cudaError_t, [ctypes.c_int]),
# cudaError_t cudaDeviceSynchronize ( void )
Function("cudaDeviceSynchronize", cudaError_t, []),
# ​cudaError_t cudaDeviceReset ( void )
Function("cudaDeviceReset", cudaError_t, []),
# const char* cudaGetErrorString ( cudaError_t error )
Function("cudaGetErrorString", ctypes.c_char_p, [cudaError_t]),
# ​cudaError_t cudaMalloc ( void** devPtr, size_t size )
Function("cudaMalloc", cudaError_t,
[ctypes.POINTER(ctypes.c_void_p), ctypes.c_size_t]),
# ​cudaError_t cudaFree ( void* devPtr )
Function("cudaFree", cudaError_t, [ctypes.c_void_p]),
# ​cudaError_t cudaMemset ( void* devPtr, int value, size_t count )
Function("cudaMemset", cudaError_t,
[ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t]),
# ​cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind ) # noqa
Function("cudaMemcpy", cudaError_t, [
ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, cudaMemcpyKind
]),
# cudaError_t cudaIpcGetMemHandle ( cudaIpcMemHandle_t* handle, void* devPtr ) # noqa
Function("cudaIpcGetMemHandle", cudaError_t,
[ctypes.POINTER(cudaIpcMemHandle_t), ctypes.c_void_p]),
# ​cudaError_t cudaIpcOpenMemHandle ( void** devPtr, cudaIpcMemHandle_t handle, unsigned int flags ) # noqa
Function("cudaIpcOpenMemHandle", cudaError_t, [
ctypes.POINTER(ctypes.c_void_p), cudaIpcMemHandle_t, ctypes.c_uint
]),
]
# class attribute to store the mapping from the path to the library
# to avoid loading the same library multiple times
path_to_library_cache: Dict[str, Any] = {}
# class attribute to store the mapping from library path
# to the corresponding dictionary
path_to_dict_mapping: Dict[str, Dict[str, Any]] = {}
def __init__(self, so_file: Optional[str] = None):
if so_file is None:
so_file = find_loaded_library("libcudart")
assert so_file is not None, \
"libcudart is not loaded in the current process"
if so_file not in CudaRTLibrary.path_to_library_cache:
lib = ctypes.CDLL(so_file)
CudaRTLibrary.path_to_library_cache[so_file] = lib
self.lib = CudaRTLibrary.path_to_library_cache[so_file]
if so_file not in CudaRTLibrary.path_to_dict_mapping:
_funcs = {}
for func in CudaRTLibrary.exported_functions:
f = getattr(self.lib, func.name)
f.restype = func.restype
f.argtypes = func.argtypes
_funcs[func.name] = f
CudaRTLibrary.path_to_dict_mapping[so_file] = _funcs
self.funcs = CudaRTLibrary.path_to_dict_mapping[so_file]
def CUDART_CHECK(self, result: cudaError_t) -> None:
if result != 0:
error_str = self.cudaGetErrorString(result)
raise RuntimeError(f"CUDART error: {error_str}")
def cudaGetErrorString(self, error: cudaError_t) -> str:
return self.funcs["cudaGetErrorString"](error).decode("utf-8")
def cudaSetDevice(self, device: int) -> None:
self.CUDART_CHECK(self.funcs["cudaSetDevice"](device))
def cudaDeviceSynchronize(self) -> None:
self.CUDART_CHECK(self.funcs["cudaDeviceSynchronize"]())
def cudaDeviceReset(self) -> None:
self.CUDART_CHECK(self.funcs["cudaDeviceReset"]())
def cudaMalloc(self, size: int) -> ctypes.c_void_p:
devPtr = ctypes.c_void_p()
self.CUDART_CHECK(self.funcs["cudaMalloc"](ctypes.byref(devPtr), size))
return devPtr
def cudaFree(self, devPtr: ctypes.c_void_p) -> None:
self.CUDART_CHECK(self.funcs["cudaFree"](devPtr))
def cudaMemset(self, devPtr: ctypes.c_void_p, value: int,
count: int) -> None:
self.CUDART_CHECK(self.funcs["cudaMemset"](devPtr, value, count))
def cudaMemcpy(self, dst: ctypes.c_void_p, src: ctypes.c_void_p,
count: int) -> None:
cudaMemcpyDefault = 4
kind = cudaMemcpyDefault
self.CUDART_CHECK(self.funcs["cudaMemcpy"](dst, src, count, kind))
def cudaIpcGetMemHandle(self,
devPtr: ctypes.c_void_p) -> cudaIpcMemHandle_t:
handle = cudaIpcMemHandle_t()
self.CUDART_CHECK(self.funcs["cudaIpcGetMemHandle"](
ctypes.byref(handle), devPtr))
return handle
def cudaIpcOpenMemHandle(self,
handle: cudaIpcMemHandle_t) -> ctypes.c_void_p:
cudaIpcMemLazyEnablePeerAccess = 1
devPtr = ctypes.c_void_p()
self.CUDART_CHECK(self.funcs["cudaIpcOpenMemHandle"](
ctypes.byref(devPtr), handle, cudaIpcMemLazyEnablePeerAccess))
return devPtr
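# Usage sketch (added for illustration; not part of the original file). It mirrors
# what the p2p producer/consumer helpers elsewhere in this package do with the
# wrapper: allocate, fill, export an IPC handle, copy back, and free. Assumes a
# CUDA-capable machine; libcudart is already loaded via `import torch` above.
def _example_cudart_roundtrip(device: int = 0) -> bool:
    lib = CudaRTLibrary()
    lib.cudaSetDevice(device)
    dev_ptr = lib.cudaMalloc(1024)              # 1 KiB device allocation
    lib.cudaMemset(dev_ptr, 1, 1024)            # fill with 0x01
    lib.cudaDeviceSynchronize()
    _handle = lib.cudaIpcGetMemHandle(dev_ptr)  # shareable with another process
    host = (ctypes.c_char * 1024)()
    lib.cudaMemcpy(host, dev_ptr, 1024)         # device -> host (cudaMemcpyDefault)
    lib.cudaFree(dev_ptr)
    return all(b == b"\x01" for b in host)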
import ctypes
from contextlib import contextmanager
from typing import List, Optional, Union
import torch
import torch.distributed as dist
from torch.distributed import ProcessGroup
import server.envs as envs
from server.inference.distributed.cuda_wrapper import CudaRTLibrary
from server.inference.distributed.custom_all_reduce_utils import gpu_p2p_access_check
from server.inference.distributed.parallel_state import in_the_same_node_as
from server.inference.platforms import current_platform
from server.utils import cuda_device_count_stateless
import vLLMCustomAllreduce
try:
vLLMCustomAllreduce.meta_size()
custom_ar = True
except Exception:
# For AMD GPUs and CPUs
custom_ar = False
def _can_p2p(rank: int, world_size: int) -> bool:
for i in range(world_size):
if i == rank:
continue
if envs.VLLM_SKIP_P2P_CHECK:
print("Skipping P2P check and trusting the driver's P2P report.")
return torch.cuda.can_device_access_peer(rank, i)
if not gpu_p2p_access_check(rank, i):
return False
return True
def is_weak_contiguous(inp: torch.Tensor):
return inp.is_contiguous() or (
inp.storage().nbytes() - inp.storage_offset() * inp.element_size()
== inp.numel() * inp.element_size()
)
class CustomAllreduce:
_SUPPORTED_WORLD_SIZES = [2, 4, 6, 8]
# max_size: max supported allreduce size
def __init__(
self,
group: ProcessGroup,
device: Union[int, str, torch.device],
max_size=8192 * 1024,
) -> None:
"""
Args:
group: the process group to work on. If None, it will use the
default process group.
            device: the device to bind the CustomAllreduce to. If None,
                it will be bound to f"cuda:{local_rank}".
        It is the caller's responsibility to make sure each communicator
        is bound to a unique device, and all communicators in this group
        are in the same node.
"""
self._IS_CAPTURING = False
self.disabled = True
if not custom_ar:
# disable because of missing custom allreduce library
# e.g. in a non-cuda environment
return
self.group = group
assert (
dist.get_backend(group) != dist.Backend.NCCL
), "CustomAllreduce should be attached to a non-NCCL group."
if not all(in_the_same_node_as(group, source_rank=0)):
# No need to initialize custom allreduce for multi-node case.
print(
"Custom allreduce is disabled because this process group"
" spans across nodes."
)
return
rank = dist.get_rank(group=self.group)
world_size = dist.get_world_size(group=self.group)
if world_size == 1:
# No need to initialize custom allreduce for single GPU case.
return
if world_size not in CustomAllreduce._SUPPORTED_WORLD_SIZES:
            print(
                "Custom allreduce is disabled due to an unsupported world"
                f" size: {world_size}. Supported world sizes:"
                f" {CustomAllreduce._SUPPORTED_WORLD_SIZES}. To silence this"
                " warning, specify disable_custom_all_reduce=True explicitly."
            )
return
if isinstance(device, int):
device = torch.device(f"cuda:{device}")
elif isinstance(device, str):
device = torch.device(device)
# now `device` is a `torch.device` object
assert isinstance(device, torch.device)
self.device = device
cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
if cuda_visible_devices:
device_ids = list(map(int, cuda_visible_devices.split(",")))
else:
device_ids = list(range(cuda_device_count_stateless()))
physical_device_id = device_ids[device.index]
tensor = torch.tensor([physical_device_id], dtype=torch.int, device="cpu")
gather_list = [
torch.tensor([0], dtype=torch.int, device="cpu") for _ in range(world_size)
]
dist.all_gather(gather_list, tensor, group=self.group)
physical_device_ids = [t.item() for t in gather_list]
# test nvlink first, this will filter out most of the cases
# where custom allreduce is not supported
# this checks hardware and driver support for NVLink
assert current_platform.is_cuda()
from server.inference.platforms.cuda import CudaPlatform
cuda_platform: CudaPlatform = current_platform
full_nvlink = cuda_platform.is_full_nvlink(physical_device_ids)
if world_size > 2 and not full_nvlink:
print(
"Custom allreduce is disabled because it's not supported on"
" more than two PCIe-only GPUs. To silence this warning, "
"specify disable_custom_all_reduce=True explicitly."
)
return
# test P2P capability, this checks software/cudaruntime support
        # this is expensive to compute the first time,
        # so we cache the result
if not _can_p2p(rank, world_size):
print(
"Custom allreduce is disabled because your platform lacks "
"GPU P2P capability or P2P test failed. To silence this "
"warning, specify disable_custom_all_reduce=True explicitly."
)
return
self.disabled = False
        # Buffer memory is owned by this Python class and passed to C++.
        # Metadata consists of two parts: metadata for synchronization and a
        # temporary buffer for storing intermediate allreduce results.
self.meta_ptrs = self.create_shared_buffer(
vLLMCustomAllreduce.meta_size() + max_size, group=group
)
# This is a pre-registered IPC buffer. In eager mode, input tensors
# are first copied into this buffer before allreduce is performed
self.buffer_ptrs = self.create_shared_buffer(max_size, group=group)
        # This is a buffer for storing the tuples of pointers pointing to
        # IPC buffers from all ranks. Each registered tuple has a size of
        # 8*world_size bytes, where world_size is at most 8. Allocating 8MB
        # is enough for 131072 such tuples. The largest model I've seen needs
        # fewer than 10000 registered tuples.
self.rank_data = torch.empty(
8 * 1024 * 1024, dtype=torch.uint8, device=self.device
)
self.max_size = max_size
self.rank = rank
self.world_size = world_size
self.full_nvlink = full_nvlink
self._ptr = vLLMCustomAllreduce.init_custom_ar(
self.meta_ptrs, self.rank_data, rank, self.full_nvlink
)
vLLMCustomAllreduce.register_buffer(self._ptr, self.buffer_ptrs)
@staticmethod
def create_shared_buffer(
size_in_bytes: int, group: Optional[ProcessGroup] = None
) -> List[int]:
"""
Creates a shared buffer and returns a list of pointers
representing the buffer on all processes in the group.
"""
lib = CudaRTLibrary()
pointer = lib.cudaMalloc(size_in_bytes)
handle = lib.cudaIpcGetMemHandle(pointer)
world_size = dist.get_world_size(group=group)
rank = dist.get_rank(group=group)
handles = [None] * world_size
dist.all_gather_object(handles, handle, group=group)
pointers: List[int] = []
for i, h in enumerate(handles):
if i == rank:
pointers.append(pointer.value) # type: ignore
else:
pointers.append(lib.cudaIpcOpenMemHandle(h).value) # type: ignore
return pointers
@staticmethod
def free_shared_buffer(
pointers: List[int], group: Optional[ProcessGroup] = None
) -> None:
rank = dist.get_rank(group=group)
lib = CudaRTLibrary()
lib.cudaFree(ctypes.c_void_p(pointers[rank]))
@contextmanager
def capture(self):
"""
The main responsibility of this context manager is the
`register_graph_buffers` call at the end of the context.
It records all the buffer addresses used in the CUDA graph.
"""
try:
self._IS_CAPTURING = True
yield
finally:
self._IS_CAPTURING = False
if not self.disabled:
self.register_graph_buffers()
def register_graph_buffers(self):
handle, offset = vLLMCustomAllreduce.get_graph_buffer_ipc_meta(self._ptr)
print("Registering %d cuda graph addresses", len(offset))
# We cannot directly use `dist.all_gather_object` here
# because it is incompatible with `gloo` backend under inference mode.
# see https://github.com/pytorch/pytorch/issues/126032 for details.
all_data = [[None, None] for _ in range(dist.get_world_size(group=self.group))]
all_data[self.rank] = [handle, offset]
ranks = sorted(dist.get_process_group_ranks(group=self.group))
for i, rank in enumerate(ranks):
dist.broadcast_object_list(
all_data[i], src=rank, group=self.group, device="cpu"
)
# Unpack list of tuples to tuple of lists.
handles = [d[0] for d in all_data] # type: ignore
offsets = [d[1] for d in all_data] # type: ignore
vLLMCustomAllreduce.register_graph_buffers(self._ptr, handles, offsets)
def should_custom_ar(self, inp: torch.Tensor):
if self.disabled:
return False
inp_size = inp.numel() * inp.element_size()
# custom allreduce requires input byte size to be multiples of 16
if inp_size % 16 != 0:
return False
if not is_weak_contiguous(inp):
return False
# for 4 or more non NVLink-capable GPUs, custom allreduce provides
# little performance improvement over NCCL.
if self.world_size == 2 or self.full_nvlink:
return inp_size < self.max_size
return False
def all_reduce(
self, inp: torch.Tensor, *, out: torch.Tensor = None, bsz_tensor: torch.Tensor = None, registered: bool = False,
is_compute_bound=False, overlap=False
):
"""Performs an out-of-place all reduce.
If registered is True, this assumes inp's pointer is already
IPC-registered. Otherwise, inp is first copied into a pre-registered
buffer.
"""
if is_compute_bound:
sms = 2 if overlap else 36
else:
sms = 20 if overlap else 36
#print("all reduce sms", sms)
if out is None:
out = torch.empty_like(inp)
if registered:
vLLMCustomAllreduce.all_reduce(self._ptr, inp, out, 0, 0, bsz_tensor, block_limit=sms)
else:
vLLMCustomAllreduce.all_reduce(
self._ptr, inp, out, self.buffer_ptrs[self.rank], self.max_size, bsz_tensor, block_limit=sms
)
return out
def custom_all_reduce(self, input: torch.Tensor, bsz_tensor: torch.Tensor, is_compute_bound=False, overlap=False) -> Optional[torch.Tensor]:
"""The main allreduce API that provides support for cuda graph."""
# When custom allreduce is disabled, this will be None.
if self.disabled or not self.should_custom_ar(input):
return None
if self._IS_CAPTURING:
if torch.cuda.is_current_stream_capturing():
return self.all_reduce(input, bsz_tensor=bsz_tensor, registered=True, is_compute_bound=is_compute_bound, overlap=overlap)
else:
                # If warming up, mimic the allocation pattern since custom
# allreduce is out-of-place.
return torch.empty_like(input)
else:
# Note: outside of cuda graph context, custom allreduce incurs a
# cost of cudaMemcpy, which should be small (<=1% of overall
# latency) compared to the performance gain of using custom kernels
return self.all_reduce(input, bsz_tensor=bsz_tensor, registered=False, is_compute_bound=is_compute_bound, overlap=overlap)
def close(self):
if not self.disabled and self._ptr:
vLLMCustomAllreduce.dispose(self._ptr)
self._ptr = 0
self.free_shared_buffer(self.meta_ptrs)
self.free_shared_buffer(self.buffer_ptrs)
def __del__(self):
self.close()
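# Usage sketch (added for illustration; not part of the original file). It shows
# the eager-mode call path a GroupCoordinator takes. Assumptions: torch.distributed
# is initialized, `cpu_group` is a gloo group covering one node of NVLink-connected
# GPUs, the vLLMCustomAllreduce extension is built, and `local_rank`/`bsz` are
# hypothetical placeholders (the exact dtype expected for bsz_tensor depends on
# the kernel).
def _example_custom_allreduce(cpu_group: ProcessGroup, local_rank: int) -> torch.Tensor:
    ca = CustomAllreduce(group=cpu_group, device=local_rank)
    x = torch.randn(128, 4096, dtype=torch.float16, device=f"cuda:{local_rank}")
    bsz = torch.tensor([x.shape[0]], dtype=torch.int32, device=x.device)
    out = ca.custom_all_reduce(x, bsz_tensor=bsz)  # None if custom AR is not used
    if out is None:
        dist.all_reduce(x, group=cpu_group)        # fall back to the stock backend
        out = x
    ca.close()
    return out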
import ctypes
import json
import os
import pickle
import subprocess
import sys
import tempfile
from itertools import product
from typing import Dict, List, Optional, Sequence
import torch.distributed as dist
import torch.multiprocessing as mp
import server.envs as envs
from server.inference.distributed.cuda_wrapper import CudaRTLibrary
from server.utils import cuda_device_count_stateless, update_environment_variables
def producer(
batch_src: Sequence[int],
producer_queue,
consumer_queue,
result_queue,
cuda_visible_devices: Optional[str] = None,
):
if cuda_visible_devices is not None:
update_environment_variables({"CUDA_VISIBLE_DEVICES": cuda_visible_devices})
lib = CudaRTLibrary()
for i in batch_src:
lib.cudaSetDevice(i)
pointer = lib.cudaMalloc(1024)
lib.cudaMemset(pointer, 1, 1024)
lib.cudaDeviceSynchronize()
handle = lib.cudaIpcGetMemHandle(pointer)
producer_queue.put(handle)
open_success = consumer_queue.get()
if open_success:
# use two queues to simulate barrier
producer_queue.put(0)
consumer_queue.get()
# check if the memory is modified
host_data = (ctypes.c_char * 1024)()
lib.cudaMemcpy(host_data, pointer, 1024) # type: ignore
for i in range(1024):
if ord(host_data[i]) != 2:
open_success = False
break
result_queue.put(open_success)
lib.cudaDeviceReset()
def consumer(
batch_tgt: Sequence[int],
producer_queue,
consumer_queue,
result_queue,
cuda_visible_devices: Optional[str] = None,
):
if cuda_visible_devices is not None:
update_environment_variables({"CUDA_VISIBLE_DEVICES": cuda_visible_devices})
lib = CudaRTLibrary()
for j in batch_tgt:
lib.cudaSetDevice(j)
handle = producer_queue.get()
open_success = False
try:
pointer = lib.cudaIpcOpenMemHandle(handle) # type: ignore
open_success = True
except RuntimeError:
# cannot error out here, because the producer process
# is still waiting for the response.
pass
consumer_queue.put(open_success)
if open_success:
# modify the memory
lib.cudaMemset(pointer, 2, 1024)
lib.cudaDeviceSynchronize()
# use two queues to simulate barrier
producer_queue.get()
consumer_queue.put(0)
# check if the memory is modified
host_data = (ctypes.c_char * 1024)()
lib.cudaMemcpy(host_data, pointer, 1024) # type: ignore
for i in range(1024):
if ord(host_data[i]) != 2:
open_success = False
break
result_queue.put(open_success)
lib.cudaDeviceReset()
def can_actually_p2p(
batch_src: Sequence[int],
batch_tgt: Sequence[int],
) -> Sequence[bool]:
"""
Usually, checking if P2P access is enabled can be done by
`torch.cuda.can_device_access_peer(src, tgt)`. However, sometimes
the driver might be broken, and `torch.cuda.can_device_access_peer(src, tgt)`
returns `True` even if P2P access is not actually possible.
See https://github.com/vllm-project/vllm/issues/2728 and
https://forums.developer.nvidia.com/t/direct-gpu-gpu-communication-does-not-seem-to-work-properly/283264/10
Therefore, we have to perform a real P2P access to check if it is actually
possible.
Note on p2p and cuda IPC:
Usually, one process uses one GPU:
GPU src --> cuda context src --> tensor src --> process src
We need to combine p2p and cuda IPC, so that:
GPU src --> cuda context src --> tensor src --> process src
|shared|
GPU tgt --> cuda context tgt --> tensor tgt --> process tgt
That is to say, process src creates a tensor in GPU src, passes IPC handle to
process tgt, and process tgt accesses the tensor in GPU tgt. Any operation on the
tensor in process tgt will be reflected in the tensor in process src, because
they are the same memory segment.
It is important to note that process tgt accesses the tensor in GPU tgt, not
GPU src. That's why we need p2p access.
The most time-consuming part is the process creation. To avoid creating
processes for every pair of GPUs, we use batched testing. We create two
processes for testing all pairs of GPUs in batch. The trick is to reset
the device after each test (which is not available in PyTorch).
""" # noqa
cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
# pass the CUDA_VISIBLE_DEVICES to the child process
# to make sure they see the same set of GPUs
# make sure the processes are spawned
smp = mp.get_context("spawn")
producer_queue = smp.Queue()
consumer_queue = smp.Queue()
result_queue = smp.Queue()
p_src = smp.Process(
target=producer,
args=(
batch_src,
producer_queue,
consumer_queue,
result_queue,
cuda_visible_devices,
),
)
p_tgt = smp.Process(
target=consumer,
args=(
batch_tgt,
producer_queue,
consumer_queue,
result_queue,
cuda_visible_devices,
),
)
p_src.start()
p_tgt.start()
p_src.join()
p_tgt.join()
assert p_src.exitcode == 0 and p_tgt.exitcode == 0
result: List[bool] = []
for src, tgt in zip(batch_src, batch_tgt):
a = result_queue.get()
b = result_queue.get()
if a != b:
            print(
                "Two processes do not agree on the P2P access"
                f" status on {src} -> {tgt}, treat as disabled."
            )
result.append(False)
else:
result.append(a)
return result
# why do we need this cache?
# we are testing peer-to-peer (p2p) access between GPUs, across processes.
# if we test it every time, it will be very slow, because we need to create
# N * N * 2 processes, where N is the world size.
# to reduce the time, we use a cache file to store the p2p access status.
# the cache file is generated by the master process if it does not exist.
# then all the processes can read the cache file to check the p2p access status.
# Note that the cache file is suffixed by the CUDA_VISIBLE_DEVICES, so that we
# can have different cache files for different CUDA_VISIBLE_DEVICES settings,
# e.g. used by different vllm engines. The device id in the cache file is a
# **local** device id, i.e. from 0 to num_dev-1, where num_dev is the number
# of visible devices in the vllm engine.
_gpu_p2p_access_cache: Optional[Dict[str, bool]] = None
def gpu_p2p_access_check(src: int, tgt: int) -> bool:
"""Check if GPU src can access GPU tgt."""
# if the cache variable is already calculated,
# read from the cache instead of checking it again
global _gpu_p2p_access_cache
if _gpu_p2p_access_cache is not None:
return _gpu_p2p_access_cache[f"{src}->{tgt}"]
is_distributed = dist.is_initialized()
num_dev = cuda_device_count_stateless()
cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
if cuda_visible_devices is None:
cuda_visible_devices = ",".join(str(i) for i in range(num_dev))
path = os.path.join(
envs.VLLM_CACHE_ROOT, f"gpu_p2p_access_cache_for_{cuda_visible_devices}.json"
)
os.makedirs(os.path.dirname(path), exist_ok=True)
from server.inference.distributed.parallel_state import get_world_group
if (not is_distributed or get_world_group().local_rank == 0) and (
not os.path.exists(path)
):
# only the local master process (with local_rank == 0) can
# enter this block to calculate the cache
print("generating GPU P2P access cache in %s", path)
cache: Dict[str, bool] = {}
ids = list(range(num_dev))
# batch of all pairs of GPUs
batch_src, batch_tgt = zip(*list(product(ids, ids)))
# NOTE: we use `subprocess` rather than `multiprocessing` here
# because the caller might not have `if __name__ == "__main__":`,
# in that case we cannot use spawn method in multiprocessing.
# However, `can_actually_p2p` requires spawn method.
# The fix is, we use `subprocess` to call the function,
# where we have `if __name__ == "__main__":` in this file.
# use a temporary file to store the result
# we don't use the output of the subprocess directly,
# because the subprocess might produce logging output
with tempfile.NamedTemporaryFile() as output_file:
input_bytes = pickle.dumps((batch_src, batch_tgt, output_file.name))
returned = subprocess.run(
[sys.executable, __file__], input=input_bytes, capture_output=True
)
# check if the subprocess is successful
try:
returned.check_returncode()
except Exception as e:
# wrap raised exception to provide more information
raise RuntimeError(
f"Error happened when batch testing "
f"peer-to-peer access from {batch_src} to {batch_tgt}:\n"
f"{returned.stderr.decode()}"
) from e
with open(output_file.name, "rb") as f:
result = pickle.load(f)
for _i, _j, r in zip(batch_src, batch_tgt, result):
cache[f"{_i}->{_j}"] = r
with open(path, "w") as f:
json.dump(cache, f, indent=4)
if is_distributed:
get_world_group().barrier()
print("reading GPU P2P access cache from %s", path)
with open(path) as f:
cache = json.load(f)
_gpu_p2p_access_cache = cache
return _gpu_p2p_access_cache[f"{src}->{tgt}"]
__all__ = ["gpu_p2p_access_check"]
if __name__ == "__main__":
batch_src, batch_tgt, output_file = pickle.loads(sys.stdin.buffer.read())
result = can_actually_p2p(batch_src, batch_tgt)
with open(output_file, "wb") as f:
f.write(pickle.dumps(result))
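# Usage sketch (added for illustration; not part of the original file). Assumes at
# least two visible CUDA GPUs and a writable envs.VLLM_CACHE_ROOT. The first call
# to gpu_p2p_access_check spawns a subprocess to build the cache file; later calls
# just read it back. Note that can_actually_p2p spawns helper processes, so it must
# be invoked from a script guarded by `if __name__ == "__main__":`.
def _example_p2p_check() -> bool:
    direct = can_actually_p2p([0], [1])[0]   # real batched P2P test (two helpers)
    cached = gpu_p2p_access_check(0, 1)      # cached check used by CustomAllreduce
    return direct and cached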
# Copyright 2023 The vLLM team.
# Adapted from
# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""vLLM distributed state.
It takes over the control of the distributed environment from PyTorch.
The typical workflow is:
- call `init_distributed_environment` to initialize the distributed environment.
- call `initialize_model_parallel` or `ensure_model_parallel_initialized` to
initialize the model parallel groups.
- any code dealing with the distributed stuff
- call `destroy_model_parallel` to destroy the model parallel groups.
- call `destroy_distributed_environment` to destroy the distributed environment.
If you only need to use the distributed environment without model/pipeline
parallelism, you can skip the model parallel initialization and destruction
steps.
"""
import contextlib
import gc
import pickle
import weakref
from collections import namedtuple
from contextlib import contextmanager, nullcontext
from dataclasses import dataclass
from multiprocessing import shared_memory
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from unittest.mock import patch
import torch
import torch.distributed
from torch.distributed import Backend, ProcessGroup
import server.envs as envs
from server.inference.platforms import current_platform
from server.utils import direct_register_custom_op, supports_custom_op
@dataclass
class GraphCaptureContext:
stream: torch.cuda.Stream
TensorMetadata = namedtuple("TensorMetadata", ["device", "dtype", "size"])
def _split_tensor_dict(
tensor_dict: Dict[str, Union[torch.Tensor, Any]]
) -> Tuple[List[Tuple[str, Any]], List[torch.Tensor]]:
"""Split the tensor dictionary into two parts:
1. A list of (key, value) pairs. If the value is a tensor, it is replaced
by its metadata.
2. A list of tensors.
"""
metadata_list: List[Tuple[str, Any]] = []
tensor_list: List[torch.Tensor] = []
for key, value in tensor_dict.items():
if isinstance(value, torch.Tensor):
# Note: we cannot use `value.device` here,
# because it contains not only the device type but also the device
# index (e.g. "cuda:0"). We only need the device type.
            # the receiving side will set the device index.
device = value.device.type
metadata_list.append(
(key, TensorMetadata(device, value.dtype, value.size()))
)
tensor_list.append(value)
else:
metadata_list.append((key, value))
return metadata_list, tensor_list
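# Small illustration (added; not in the original file): only tensor values are
# replaced by TensorMetadata (device type, dtype, size); everything else passes
# through unchanged, and the tensors themselves are collected separately.
def _example_split_tensor_dict() -> None:
    d = {"hidden": torch.zeros(2, 4), "step": 7}
    metadata, tensors = _split_tensor_dict(d)
    assert metadata[0] == ("hidden", TensorMetadata("cpu", torch.float32, torch.Size([2, 4])))
    assert metadata[1] == ("step", 7)
    assert tensors[0] is d["hidden"]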
_group_name_counter: Dict[str, int] = {}
def _get_unique_name(name: str) -> str:
"""Get a unique name for the group.
Example:
_get_unique_name("tp") -> "tp:0"
_get_unique_name("tp") -> "tp:1"
"""
if name not in _group_name_counter:
_group_name_counter[name] = 0
newname = f"{name}:{_group_name_counter[name]}"
_group_name_counter[name] += 1
return newname
_groups: Dict[str, Callable[[], Optional["GroupCoordinator"]]] = {}
def _register_group(group: "GroupCoordinator") -> None:
_groups[group.unique_name] = weakref.ref(group)
if supports_custom_op():
def inplace_all_reduce(tensor: torch.Tensor, group_name: str) -> None:
assert group_name in _groups, f"Group {group_name} is not found."
group = _groups[group_name]()
if group is None:
raise ValueError(f"Group {group_name} is destroyed.")
group._all_reduce_in_place(tensor)
def inplace_all_reduce_fake(tensor: torch.Tensor, group_name: str) -> None:
return
direct_register_custom_op(
op_name="inplace_all_reduce",
op_func=inplace_all_reduce,
mutates_args=["tensor"],
fake_impl=inplace_all_reduce_fake,
)
def outplace_all_reduce(tensor: torch.Tensor, group_name: str, bsz_tensor: torch.Tensor, is_compute_bound: bool = False, overlap: bool = False) -> torch.Tensor:
assert group_name in _groups, f"Group {group_name} is not found."
group = _groups[group_name]()
if group is None:
raise ValueError(f"Group {group_name} is destroyed.")
return group._all_reduce_out_place(tensor, bsz_tensor, is_compute_bound=is_compute_bound, overlap=overlap)
def outplace_all_reduce_fake(tensor: torch.Tensor, group_name: str, bsz_tensor: torch.Tensor, is_compute_bound: bool = False, overlap: bool = False) -> torch.Tensor:
return torch.empty_like(tensor)
direct_register_custom_op(
op_name="outplace_all_reduce",
op_func=outplace_all_reduce,
mutates_args=[],
fake_impl=outplace_all_reduce_fake,
)
class GroupCoordinator:
"""
PyTorch ProcessGroup wrapper for a group of processes.
PyTorch ProcessGroup is bound to one specific communication backend,
e.g. NCCL, Gloo, MPI, etc.
GroupCoordinator takes charge of all the communication operations among
the processes in the group. It can route the communication to
a specific implementation (e.g. switch allreduce implementation
based on the tensor size and cuda graph mode).
"""
# available attributes:
rank: int # global rank
ranks: List[int] # global ranks in the group
world_size: int # size of the group
# difference between `local_rank` and `rank_in_group`:
# if we have a group of size 4 across two nodes:
# Process | Node | Rank | Local Rank | Rank in Group
# 0 | 0 | 0 | 0 | 0
# 1 | 0 | 1 | 1 | 1
# 2 | 1 | 2 | 0 | 2
# 3 | 1 | 3 | 1 | 3
local_rank: int # local rank used to assign devices
rank_in_group: int # rank inside the group
cpu_group: ProcessGroup # group for CPU communication
device_group: ProcessGroup # group for device communication
use_pynccl: bool # a hint of whether to use PyNccl
use_custom_allreduce: bool # a hint of whether to use CustomAllreduce
# communicators are only created for world size > 1
pynccl_comm: Optional[Any] # PyNccl communicator
ca_comm: Optional[Any] # Custom allreduce communicator
mq_broadcaster: Optional[Any] # shared memory broadcaster
def __init__(
self,
group_ranks: List[List[int]],
local_rank: int,
torch_distributed_backend: Union[str, Backend],
use_pynccl: bool,
use_custom_allreduce: bool,
use_tpu_communicator: bool,
use_hpu_communicator: bool,
use_xpu_communicator: bool,
use_message_queue_broadcaster: bool = False,
group_name: Optional[str] = None,
):
group_name = group_name or "anonymous"
self.unique_name = _get_unique_name(group_name)
_register_group(self)
self.rank = torch.distributed.get_rank()
self.local_rank = local_rank
self.device_group = None
self.cpu_group = None
for ranks in group_ranks:
device_group = torch.distributed.new_group(
ranks, backend=torch_distributed_backend
)
# a group with `gloo` backend, to allow direct coordination between
# processes through the CPU.
cpu_group = torch.distributed.new_group(ranks, backend="gloo")
if self.rank in ranks:
self.ranks = ranks
self.world_size = len(ranks)
self.rank_in_group = ranks.index(self.rank)
self.device_group = device_group
self.cpu_group = cpu_group
assert self.cpu_group is not None
assert self.device_group is not None
assert current_platform.is_cuda_alike()
if current_platform.is_cuda_alike():
self.device = torch.device(f"cuda:{local_rank}")
else:
self.device = torch.device("cpu")
self.use_pynccl = use_pynccl
self.use_custom_allreduce = use_custom_allreduce
self.use_tpu_communicator = use_tpu_communicator
self.use_hpu_communicator = use_hpu_communicator
self.use_xpu_communicator = use_xpu_communicator
# lazy import to avoid documentation build error
from server.inference.distributed.custom_all_reduce import CustomAllreduce
from server.inference.distributed.pynccl import PyNcclCommunicator
self.pynccl_comm: Optional[PyNcclCommunicator] = None
# if use_pynccl and self.world_size > 1:
# self.pynccl_comm = PyNcclCommunicator(
# group=self.cpu_group,
# device=self.device,
# )
self.ca_comm: Optional[CustomAllreduce] = None
if use_custom_allreduce and self.world_size > 1:
# Initialize a custom fast all-reduce implementation.
self.ca_comm = CustomAllreduce(
group=self.cpu_group,
device=self.device,
)
#### we assume we won't use tpu or hpu or xpu or messagequeue broadcast
# from vllm.distributed.device_communicators.tpu_communicator import (
# TpuCommunicator)
# self.tpu_communicator: Optional[TpuCommunicator] = None
# if use_tpu_communicator and self.world_size > 1:
# self.tpu_communicator = TpuCommunicator(group=self.cpu_group)
self.tpu_communicator = None
# from vllm.distributed.device_communicators.hpu_communicator import (
# HpuCommunicator)
# self.hpu_communicator: Optional[HpuCommunicator]
# if use_hpu_communicator and self.world_size > 1:
# self.hpu_communicator = HpuCommunicator(group=self.device_group)
self.hpu_communicator = None
# from vllm.distributed.device_communicators.xpu_communicator import (
# XpuCommunicator)
# self.xpu_communicator: Optional[XpuCommunicator]
# if use_xpu_communicator and self.world_size > 1:
# self.xpu_communicator = XpuCommunicator(group=self.device_group)
self.xpu_communicator = None
# from vllm.distributed.device_communicators.shm_broadcast import (
# MessageQueue)
# self.mq_broadcaster: Optional[MessageQueue] = None
# if use_message_queue_broadcaster and self.world_size > 1:
# self.mq_broadcaster = MessageQueue.create_from_process_group(
# self.cpu_group, 1 << 22, 6)
self.mq_broadcaster = None
@property
def first_rank(self):
"""Return the global rank of the first process in the group"""
return self.ranks[0]
@property
def last_rank(self):
"""Return the global rank of the last process in the group"""
return self.ranks[-1]
@property
def is_first_rank(self):
"""Return whether the caller is the first process in the group"""
return self.rank == self.first_rank
@property
def is_last_rank(self):
"""Return whether the caller is the last process in the group"""
return self.rank == self.last_rank
@property
def next_rank(self):
"""Return the global rank of the process that follows the caller"""
rank_in_group = self.rank_in_group
world_size = self.world_size
return self.ranks[(rank_in_group + 1) % world_size]
@property
def prev_rank(self):
"""Return the global rank of the process that precedes the caller"""
rank_in_group = self.rank_in_group
world_size = self.world_size
return self.ranks[(rank_in_group - 1) % world_size]
@contextmanager
def graph_capture(
self, graph_capture_context: Optional[GraphCaptureContext] = None
):
if graph_capture_context is None:
stream = torch.cuda.Stream()
graph_capture_context = GraphCaptureContext(stream)
else:
stream = graph_capture_context.stream
ca_comm = self.ca_comm
maybe_ca_context = nullcontext() if ca_comm is None else ca_comm.capture()
# ensure all initialization operations complete before attempting to
# capture the graph on another stream
curr_stream = torch.cuda.current_stream()
if curr_stream != stream:
stream.wait_stream(curr_stream)
with torch.cuda.stream(stream), maybe_ca_context:
# In graph mode, we have to be very careful about the collective
# operations. The current status is:
# allreduce \ Mode | Eager | Graph |
# --------------------------------------------
# custom allreduce | enabled | enabled |
# PyNccl | disabled| enabled |
# torch.distributed | enabled | disabled|
#
            # Note that custom allreduce will have a runtime check; if the
            # tensor size is too large, it will fall back to the next
            # available option.
# In summary: When using CUDA graph, we use
# either custom all-reduce kernel or pynccl. When not using
# CUDA graph, we use either custom all-reduce kernel or
# PyTorch NCCL. We always prioritize using custom all-reduce
# kernel but fall back to PyTorch or pynccl if it is
# disabled or not supported.
pynccl_comm = self.pynccl_comm
maybe_pynccl_context: Any
if not pynccl_comm:
maybe_pynccl_context = nullcontext()
else:
maybe_pynccl_context = pynccl_comm.change_state(
enable=True, stream=torch.cuda.current_stream()
)
with maybe_pynccl_context:
yield graph_capture_context
def all_reduce(self, input_: torch.Tensor, bsz_tensor: torch.Tensor, is_compute_bound=False, overlap=False) -> torch.Tensor:
"""
User-facing all-reduce function before we actually call the
all-reduce operation.
We need this because Dynamo does not support passing an arbitrary
        object (`self` in this case) to a custom op. We need to pass the
        group name as a string, look up the group coordinator from
        the group name, and dispatch the all-reduce operation to the group
        coordinator.
In addition, PyTorch custom ops do not support mutation or returning
a new tensor in the same op. So we need to figure out if the op is
in-place or out-of-place ahead of time.
"""
# Bypass the function if we are using only 1 GPU.
if self.world_size == 1:
return input_
if input_.is_cpu:
import intel_extension_for_pytorch as ipex
ipex.distributed.all_reduce(input_, group=self.device_group)
return input_
if not supports_custom_op():
self._all_reduce_in_place(input_)
return input_
if self.tpu_communicator is not None and not self.tpu_communicator.disabled:
# TPU handles Dynamo with its own logic.
return self.tpu_communicator.all_reduce(input_)
if self.hpu_communicator is not None and not self.hpu_communicator.disabled:
return self.hpu_communicator.all_reduce(input_)
if self.xpu_communicator is not None and not self.xpu_communicator.disabled:
return self.xpu_communicator.all_reduce(input_)
if (
self.ca_comm is not None
and not self.ca_comm.disabled
and self.ca_comm.should_custom_ar(input_)
):
return torch.ops.vllm.outplace_all_reduce(
input_, group_name=self.unique_name, bsz_tensor=bsz_tensor, is_compute_bound=is_compute_bound, overlap=overlap
)
else:
#assert self.ca_comm is not None
#assert not self.ca_comm.disabled
#assert self.ca_comm.should_custom_ar(input_)
torch.ops.vllm.inplace_all_reduce(input_, group_name=self.unique_name)
return input_
def _all_reduce_out_place(self, input_: torch.Tensor, bsz_tensor: torch.Tensor, is_compute_bound=False, overlap=False) -> torch.Tensor:
ca_comm = self.ca_comm
assert ca_comm is not None
assert not ca_comm.disabled
out = ca_comm.custom_all_reduce(input_, bsz_tensor, is_compute_bound=is_compute_bound, overlap=overlap)
assert out is not None
return out
def _all_reduce_in_place(self, input_: torch.Tensor) -> None:
pynccl_comm = self.pynccl_comm
if pynccl_comm is not None and not pynccl_comm.disabled:
pynccl_comm.all_reduce(input_)
else:
torch.distributed.all_reduce(input_, group=self.device_group)
def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
world_size = self.world_size
# Bypass the function if we are using only 1 GPU.
if world_size == 1:
return input_
assert (
-input_.dim() <= dim < input_.dim()
), f"Invalid dim ({dim}) for input tensor with shape {input_.size()}"
# For TPUs, use TPU communicator.
tpu_comm = self.tpu_communicator
if tpu_comm is not None and not tpu_comm.disabled:
return tpu_comm.all_gather(input_, dim)
# For HPUs, use HPU communicator.
hpu_comm = self.hpu_communicator
if hpu_comm is not None and not hpu_comm.disabled:
return hpu_comm.all_gather(input_, dim)
if dim < 0:
# Convert negative dim to positive.
dim += input_.dim()
input_size = input_.size()
        # NOTE: we have to use concat-style all-gather here;
        # stack-style all-gather has compatibility issues with
        # torch.compile. See https://github.com/pytorch/pytorch/issues/138795
output_size = (input_size[0] * world_size,) + input_size[1:]
# Allocate output tensor.
output_tensor = torch.empty(
output_size, dtype=input_.dtype, device=input_.device
)
# All-gather.
torch.distributed.all_gather_into_tensor(
output_tensor, input_, group=self.device_group
)
# Reshape
output_tensor = output_tensor.reshape((world_size,) + input_size)
output_tensor = output_tensor.movedim(0, dim)
output_tensor = output_tensor.reshape(
input_size[:dim] + (world_size * input_size[dim],) + input_size[dim + 1 :]
)
return output_tensor
def gather(
self, input_: torch.Tensor, dst: int = 0, dim: int = -1
) -> Optional[torch.Tensor]:
"""
NOTE: We assume that the input tensor is on the same device across
all the ranks.
NOTE: `dst` is the local rank of the destination rank.
"""
world_size = self.world_size
# Bypass the function if we are using only 1 GPU.
if world_size == 1:
return input_
assert (
-input_.dim() <= dim < input_.dim()
), f"Invalid dim ({dim}) for input tensor with shape {input_.size()}"
if dim < 0:
# Convert negative dim to positive.
dim += input_.dim()
if self.xpu_communicator is not None and not self.xpu_communicator.disabled:
return self.xpu_communicator.gather(input_, self.rank_in_group, dst, dim)
# Allocate output tensor.
if self.rank_in_group == dst:
gather_list = [torch.empty_like(input_) for _ in range(world_size)]
else:
gather_list = None
# Gather.
torch.distributed.gather(
input_, gather_list, dst=self.ranks[dst], group=self.device_group
)
if self.rank_in_group == dst:
output_tensor = torch.cat(gather_list, dim=dim)
else:
output_tensor = None
return output_tensor
def broadcast(self, input_: torch.Tensor, src: int = 0):
"""Broadcast the input tensor.
NOTE: `src` is the local rank of the source rank.
"""
assert src < self.world_size, f"Invalid src rank ({src})"
# Bypass the function if we are using only 1 GPU.
if self.world_size == 1:
return input_
# Broadcast.
torch.distributed.broadcast(
input_, src=self.ranks[src], group=self.device_group
)
return input_
def broadcast_object(self, obj: Optional[Any] = None, src: int = 0):
"""Broadcast the input object.
NOTE: `src` is the local rank of the source rank.
"""
assert src < self.world_size, f"Invalid src rank ({src})"
# Bypass the function if we are using only 1 GPU.
if self.world_size == 1:
return obj
if self.mq_broadcaster is not None:
assert src == 0, "Message queue broadcaster only supports src=0"
return self.mq_broadcaster.broadcast_object(obj)
if self.rank_in_group == src:
torch.distributed.broadcast_object_list(
[obj], src=self.ranks[src], group=self.cpu_group
)
return obj
else:
recv = [None]
torch.distributed.broadcast_object_list(
recv, src=self.ranks[src], group=self.cpu_group
)
return recv[0]
def broadcast_object_list(
self, obj_list: List[Any], src: int = 0, group: Optional[ProcessGroup] = None
):
"""Broadcast the input object list.
NOTE: `src` is the local rank of the source rank.
"""
assert src < self.world_size, f"Invalid src rank ({src})"
# Bypass the function if we are using only 1 GPU.
if self.world_size == 1:
return obj_list
# Broadcast.
torch.distributed.broadcast_object_list(
obj_list, src=self.ranks[src], group=self.device_group
)
return obj_list
def send_object(self, obj: Any, dst: int) -> None:
"""Send the input object list to the destination rank."""
"""NOTE: `dst` is the local rank of the destination rank."""
assert dst < self.world_size, f"Invalid dst rank ({dst})"
assert dst != self.rank_in_group, (
"Invalid destination rank. Destination rank is the same "
"as the current rank."
)
# Serialize object to tensor and get the size as well
object_tensor = torch.frombuffer(pickle.dumps(obj), dtype=torch.uint8)
size_tensor = torch.tensor(
[object_tensor.numel()], dtype=torch.long, device="cpu"
)
# Send object size
torch.distributed.send(size_tensor, dst=self.ranks[dst], group=self.cpu_group)
# Send object
torch.distributed.send(object_tensor, dst=self.ranks[dst], group=self.cpu_group)
return None
def recv_object(self, src: int) -> Any:
"""Receive the input object list from the source rank."""
"""NOTE: `src` is the local rank of the source rank."""
assert src < self.world_size, f"Invalid src rank ({src})"
assert (
src != self.rank_in_group
), "Invalid source rank. Source rank is the same as the current rank."
size_tensor = torch.empty(1, dtype=torch.long, device="cpu")
# Receive object size
rank_size = torch.distributed.recv(
size_tensor, src=self.ranks[src], group=self.cpu_group
)
# Tensor to receive serialized objects into.
object_tensor = torch.empty( # type: ignore[call-overload]
size_tensor.item(), # type: ignore[arg-type]
dtype=torch.uint8,
device="cpu",
)
rank_object = torch.distributed.recv(
object_tensor, src=self.ranks[src], group=self.cpu_group
)
assert (
rank_object == rank_size
), "Received object sender rank does not match the size sender rank."
obj = pickle.loads(object_tensor.numpy().tobytes())
return obj
def broadcast_tensor_dict(
self,
tensor_dict: Optional[Dict[str, Union[torch.Tensor, Any]]] = None,
src: int = 0,
group: Optional[ProcessGroup] = None,
metadata_group: Optional[ProcessGroup] = None,
) -> Optional[Dict[str, Union[torch.Tensor, Any]]]:
"""Broadcast the input tensor dictionary.
NOTE: `src` is the local rank of the source rank.
"""
# Bypass the function if we are using only 1 GPU.
if not torch.distributed.is_initialized() or self.world_size == 1:
return tensor_dict
group = self.device_group
metadata_group = self.cpu_group
assert src < self.world_size, f"Invalid src rank ({src})"
rank_in_group = self.rank_in_group
if rank_in_group == src:
metadata_list: List[Tuple[Any, Any]] = []
assert isinstance(
tensor_dict, dict
), f"Expecting a dictionary, got {type(tensor_dict)}"
metadata_list, tensor_list = _split_tensor_dict(tensor_dict)
# `metadata_list` lives in CPU memory.
# `broadcast_object_list` has serialization & deserialization,
# all happening on CPU. Therefore, we can use the CPU group.
self.broadcast_object(metadata_list, src=src)
async_handles = []
for tensor in tensor_list:
if tensor.numel() == 0:
# Skip broadcasting empty tensors.
continue
if tensor.is_cpu:
# use metadata_group for CPU tensors
handle = torch.distributed.broadcast(
tensor, src=self.ranks[src], group=metadata_group, async_op=True
)
else:
# use group for GPU tensors
handle = torch.distributed.broadcast(
tensor, src=self.ranks[src], group=group, async_op=True
)
async_handles.append(handle)
for async_handle in async_handles:
async_handle.wait()
else:
metadata_list = self.broadcast_object(None, src=src)
tensor_dict = {}
async_handles = []
for key, value in metadata_list:
if isinstance(value, TensorMetadata):
tensor = torch.empty(
value.size, dtype=value.dtype, device=value.device
)
if tensor.numel() == 0:
# Skip broadcasting empty tensors.
tensor_dict[key] = tensor
continue
if tensor.is_cpu:
# use metadata_group for CPU tensors
handle = torch.distributed.broadcast(
tensor,
src=self.ranks[src],
group=metadata_group,
async_op=True,
)
else:
# use group for GPU tensors
handle = torch.distributed.broadcast(
tensor, src=self.ranks[src], group=group, async_op=True
)
async_handles.append(handle)
tensor_dict[key] = tensor
else:
tensor_dict[key] = value
for async_handle in async_handles:
async_handle.wait()
return tensor_dict
def send_tensor_dict(
self,
tensor_dict: Dict[str, Union[torch.Tensor, Any]],
dst: Optional[int] = None,
all_gather_group: Optional["GroupCoordinator"] = None,
) -> Optional[Dict[str, Union[torch.Tensor, Any]]]:
"""Send the input tensor dictionary.
        NOTE: `dst` is the local rank of the destination rank.
"""
# Bypass the function if we are using only 1 GPU.
if not torch.distributed.is_initialized() or self.world_size == 1:
return tensor_dict
all_gather_size = 1 if all_gather_group is None else all_gather_group.world_size
all_gather_rank = (
0 if all_gather_group is None else all_gather_group.rank_in_group
)
group = self.device_group
metadata_group = self.cpu_group
if dst is None:
dst = (self.rank_in_group + 1) % self.world_size
assert dst < self.world_size, f"Invalid dst rank ({dst})"
metadata_list: List[Tuple[Any, Any]] = []
assert isinstance(
tensor_dict, dict
), f"Expecting a dictionary, got {type(tensor_dict)}"
metadata_list, tensor_list = _split_tensor_dict(tensor_dict)
# `metadata_list` lives in CPU memory.
# `send_object_list` has serialization & deserialization,
# all happening on CPU. Therefore, we can use the CPU group.
self.send_object(metadata_list, dst=dst)
for tensor in tensor_list:
if tensor.numel() == 0:
# Skip sending empty tensors.
continue
# send-allgather: send only a slice, then do allgather.
if all_gather_group is not None and tensor.numel() % all_gather_size == 0:
tensor = tensor.reshape(all_gather_size, -1)[all_gather_rank]
if tensor.is_cpu:
# use metadata_group for CPU tensors
torch.distributed.send(
tensor, dst=self.ranks[dst], group=metadata_group
)
else:
# use group for GPU tensors
torch.distributed.send(tensor, dst=self.ranks[dst], group=group)
return None
def recv_tensor_dict(
self,
src: Optional[int] = None,
all_gather_group: Optional["GroupCoordinator"] = None,
) -> Optional[Dict[str, Union[torch.Tensor, Any]]]:
"""Recv the input tensor dictionary.
NOTE: `src` is the local rank of the source rank.
"""
# Bypass the function if we are using only 1 GPU.
if not torch.distributed.is_initialized() or self.world_size == 1:
return None
all_gather_size = 1 if all_gather_group is None else all_gather_group.world_size
all_gather_rank = (
0 if all_gather_group is None else all_gather_group.rank_in_group
)
group = self.device_group
metadata_group = self.cpu_group
if src is None:
src = (self.rank_in_group - 1) % self.world_size
assert src < self.world_size, f"Invalid src rank ({src})"
recv_metadata_list = self.recv_object(src=src)
tensor_dict: Dict[str, Any] = {}
for key, value in recv_metadata_list:
if isinstance(value, TensorMetadata):
tensor = torch.empty(value.size, dtype=value.dtype, device=value.device)
if tensor.numel() == 0:
# Skip broadcasting empty tensors.
tensor_dict[key] = tensor
continue
# send-allgather: send only a slice, then do allgather.
use_all_gather = (
all_gather_group is not None
and tensor.numel() % all_gather_size == 0
)
if use_all_gather:
orig_shape = tensor.shape
tensor = tensor.reshape(all_gather_size, -1)[all_gather_rank]
if tensor.is_cpu:
# use metadata_group for CPU tensors
torch.distributed.recv(
tensor, src=self.ranks[src], group=metadata_group
)
else:
# use group for GPU tensors
torch.distributed.recv(tensor, src=self.ranks[src], group=group)
if use_all_gather:
# do the allgather
tensor = all_gather_group.all_gather(tensor, dim=0) # type: ignore
tensor = tensor.reshape(orig_shape)
tensor_dict[key] = tensor
else:
tensor_dict[key] = value
return tensor_dict
def barrier(self):
"""Barrier synchronization among the group.
NOTE: don't use `device_group` here! `barrier` in NCCL is
terrible because it is internally a broadcast operation with
secretly created GPU tensors. It is easy to mess up the current
device. Use the CPU group instead.
"""
torch.distributed.barrier(group=self.cpu_group)
def send(self, tensor: torch.Tensor, dst: Optional[int] = None) -> None:
"""Sends a tensor to the destination rank in a non-blocking way"""
"""NOTE: `dst` is the local rank of the destination rank."""
if dst is None:
dst = (self.rank_in_group + 1) % self.world_size
pynccl_comm = self.pynccl_comm
if pynccl_comm is not None and not pynccl_comm.disabled:
pynccl_comm.send(tensor, dst)
else:
torch.distributed.send(tensor, self.ranks[dst], self.device_group)
def recv(
self, size: torch.Size, dtype: torch.dtype, src: Optional[int] = None
) -> torch.Tensor:
"""Receives a tensor from the source rank."""
"""NOTE: `src` is the local rank of the source rank."""
if src is None:
src = (self.rank_in_group - 1) % self.world_size
tensor = torch.empty(size, dtype=dtype, device=self.device)
pynccl_comm = self.pynccl_comm
if pynccl_comm is not None and not pynccl_comm.disabled:
pynccl_comm.recv(tensor, src)
else:
torch.distributed.recv(tensor, self.ranks[src], self.device_group)
return tensor
def destroy(self):
if self.device_group is not None:
torch.distributed.destroy_process_group(self.device_group)
self.device_group = None
if self.cpu_group is not None:
torch.distributed.destroy_process_group(self.cpu_group)
self.cpu_group = None
if self.pynccl_comm is not None:
self.pynccl_comm = None
if self.ca_comm is not None:
self.ca_comm = None
if self.mq_broadcaster is not None:
self.mq_broadcaster = None
_WORLD: Optional[GroupCoordinator] = None
def get_world_group() -> GroupCoordinator:
assert _WORLD is not None, "world group is not initialized"
return _WORLD
def init_world_group(
ranks: List[int], local_rank: int, backend: str
) -> GroupCoordinator:
return GroupCoordinator(
group_ranks=[ranks],
local_rank=local_rank,
torch_distributed_backend=backend,
use_pynccl=False,
use_custom_allreduce=False,
use_tpu_communicator=False,
use_hpu_communicator=False,
use_xpu_communicator=False,
group_name="world",
)
def init_model_parallel_group(
group_ranks: List[List[int]],
local_rank: int,
backend: str,
use_custom_allreduce: Optional[bool] = None,
use_message_queue_broadcaster: bool = False,
group_name: Optional[str] = None,
) -> GroupCoordinator:
if use_custom_allreduce is None:
use_custom_allreduce = _ENABLE_CUSTOM_ALL_REDUCE
return GroupCoordinator(
group_ranks=group_ranks,
local_rank=local_rank,
torch_distributed_backend=backend,
use_pynccl=True,
use_custom_allreduce=use_custom_allreduce,
use_tpu_communicator=True,
use_hpu_communicator=True,
use_xpu_communicator=True,
use_message_queue_broadcaster=use_message_queue_broadcaster,
group_name=group_name,
)
_TP: Optional[GroupCoordinator] = None
def get_tp_group() -> GroupCoordinator:
assert _TP is not None, "tensor model parallel group is not initialized"
return _TP
# kept for backward compatibility
get_tensor_model_parallel_group = get_tp_group
_PP: Optional[GroupCoordinator] = None
def get_pp_group() -> GroupCoordinator:
assert _PP is not None, "pipeline model parallel group is not initialized"
return _PP
# kept for backward compatibility
get_pipeline_model_parallel_group = get_pp_group
@contextmanager
def graph_capture():
"""
`graph_capture` is a context manager which should surround the code that
is capturing the CUDA graph. Its main purpose is to ensure that the
some operations will be run after the graph is captured, before the graph
is replayed. It returns a `GraphCaptureContext` object which contains the
necessary data for the graph capture. Currently, it only contains the
stream that the graph capture is running on. This stream is set to the
current CUDA stream when the context manager is entered and reset to the
default stream when the context manager is exited. This is to ensure that
the graph capture is running on a separate stream from the default stream,
in order to explicitly distinguish the kernels to capture
from other kernels possibly launched on background in the default stream.
"""
with get_tp_group().graph_capture() as context, get_pp_group().graph_capture(
context
):
yield context
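# Minimal usage sketch for `graph_capture` above (illustrative only; assumes
# the TP/PP groups have already been initialized, and `my_model`/`static_in`
# are placeholder names, not part of this module):
#
#     graph = torch.cuda.CUDAGraph()
#     with graph_capture() as capture_context:
#         with torch.cuda.graph(graph, stream=capture_context.stream):
#             static_out = my_model(static_in)
#     # later: refresh `static_in` in place, then
#     graph.replay()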
_ENABLE_CUSTOM_ALL_REDUCE = True
def set_custom_all_reduce(enable: bool):
global _ENABLE_CUSTOM_ALL_REDUCE
_ENABLE_CUSTOM_ALL_REDUCE = enable
def init_distributed_environment(
world_size: int = -1,
rank: int = -1,
distributed_init_method: str = "env://",
local_rank: int = -1,
backend: str = "nccl",
):
    print(
        f"world_size={world_size} rank={rank} local_rank={local_rank} "
        f"distributed_init_method={distributed_init_method} backend={backend}"
    )
if not torch.distributed.is_initialized():
assert distributed_init_method is not None, (
"distributed_init_method must be provided when initializing "
"distributed environment"
)
# this backend is used for WORLD
torch.distributed.init_process_group(
backend=backend,
init_method=distributed_init_method,
world_size=world_size,
rank=rank,
)
# set the local rank
# local_rank is not available in torch ProcessGroup,
# see https://github.com/pytorch/pytorch/issues/122816
if local_rank == -1:
# local rank not set, this usually happens in single-node
# setting, where we can use rank as local rank
if distributed_init_method == "env://":
local_rank = envs.LOCAL_RANK
else:
local_rank = rank
global _WORLD
if _WORLD is None:
ranks = list(range(torch.distributed.get_world_size()))
_WORLD = init_world_group(ranks, local_rank, backend)
else:
assert (
_WORLD.world_size == torch.distributed.get_world_size()
), "world group already initialized with a different world size"
def initialize_model_parallel(
tensor_model_parallel_size: int = 1,
pipeline_model_parallel_size: int = 1,
backend: Optional[str] = None,
) -> None:
"""
Initialize model parallel groups.
Arguments:
tensor_model_parallel_size: number of GPUs used for tensor model
parallelism.
pipeline_model_parallel_size: number of GPUs used for pipeline model
parallelism.
Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we
use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize
the model pipeline. The present function will
create 4 tensor model-parallel groups and 2 pipeline model-parallel groups:
4 tensor model-parallel groups:
[g0, g1], [g2, g3], [g4, g5], [g6, g7]
2 pipeline model-parallel groups:
[g0, g2, g4, g6], [g1, g3, g5, g7]
Note that for efficiency, the caller should make sure adjacent ranks
are on the same DGX box. For example if we are using 2 DGX-1 boxes
with a total of 16 GPUs, rank 0 to 7 belong to the first box and
ranks 8 to 15 belong to the second box.
"""
# Get world size and rank. Ensure some consistencies.
assert torch.distributed.is_initialized()
world_size: int = torch.distributed.get_world_size()
backend = backend or torch.distributed.get_backend(get_world_group().device_group)
if world_size != tensor_model_parallel_size * pipeline_model_parallel_size:
raise RuntimeError(
f"world_size ({world_size}) is not equal to "
f"tensor_model_parallel_size ({tensor_model_parallel_size}) x "
f"pipeline_model_parallel_size ({pipeline_model_parallel_size})"
)
# Build the tensor model-parallel groups.
num_tensor_model_parallel_groups: int = world_size // tensor_model_parallel_size
global _TP
assert _TP is None, "tensor model parallel group is already initialized"
group_ranks = []
for i in range(num_tensor_model_parallel_groups):
ranks = list(
range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size)
)
group_ranks.append(ranks)
# message queue broadcaster is only used in tensor model parallel group
_TP = init_model_parallel_group(
group_ranks,
get_world_group().local_rank,
backend,
use_message_queue_broadcaster=True,
group_name="tp",
)
# Build the pipeline model-parallel groups.
num_pipeline_model_parallel_groups: int = world_size // pipeline_model_parallel_size
global _PP
assert _PP is None, "pipeline model parallel group is already initialized"
group_ranks = []
for i in range(num_pipeline_model_parallel_groups):
ranks = list(range(i, world_size, num_pipeline_model_parallel_groups))
group_ranks.append(ranks)
# pipeline parallel does not need custom allreduce
_PP = init_model_parallel_group(
group_ranks,
get_world_group().local_rank,
backend,
use_custom_allreduce=False,
group_name="pp",
)
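# Worked example of the grouping arithmetic used above, for the 8-GPU case in
# the docstring (tp=2, pp=4). This is a self-contained sketch that does not
# touch torch.distributed; the function name is illustrative.
def _example_parallel_group_ranks(world_size: int = 8, tp: int = 2, pp: int = 4):
    assert world_size == tp * pp
    tp_groups = [list(range(i * tp, (i + 1) * tp)) for i in range(world_size // tp)]
    pp_groups = [list(range(i, world_size, world_size // pp)) for i in range(world_size // pp)]
    # tp_groups == [[0, 1], [2, 3], [4, 5], [6, 7]]
    # pp_groups == [[0, 2, 4, 6], [1, 3, 5, 7]]
    return tp_groups, pp_groups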
def ensure_model_parallel_initialized(
tensor_model_parallel_size: int,
pipeline_model_parallel_size: int,
backend: Optional[str] = None,
) -> None:
"""Helper to initialize model parallel groups if they are not initialized,
or ensure tensor-parallel and pipeline-parallel sizes are equal to expected
values if the model parallel groups are initialized.
"""
backend = backend or torch.distributed.get_backend(get_world_group().device_group)
if not model_parallel_is_initialized():
initialize_model_parallel(
tensor_model_parallel_size, pipeline_model_parallel_size, backend
)
return
assert get_tensor_model_parallel_world_size() == tensor_model_parallel_size, (
"tensor parallel group already initialized, but of unexpected size: "
f"{get_tensor_model_parallel_world_size()=} vs. "
f"{tensor_model_parallel_size=}"
)
pp_world_size = get_pp_group().world_size
assert pp_world_size == pipeline_model_parallel_size, (
"pipeline parallel group already initialized, but of unexpected size: "
f"{pp_world_size=} vs. "
f"{pipeline_model_parallel_size=}"
)
def model_parallel_is_initialized():
"""Check if tensor and pipeline parallel groups are initialized."""
return _TP is not None and _PP is not None
_TP_STATE_PATCHED = False
@contextmanager
def patch_tensor_parallel_group(tp_group: GroupCoordinator):
"""Patch the tp group temporarily until this function ends.
This method is for draft workers of speculative decoding to run draft model
with different tp degree from that of target model workers.
Args:
tp_group (GroupCoordinator): the tp group coordinator
"""
global _TP_STATE_PATCHED
assert not _TP_STATE_PATCHED, "Should not call when it's already patched"
_TP_STATE_PATCHED = True
old_tp_group = get_tp_group()
global _TP
_TP = tp_group
try:
yield
finally:
# restore the original state
_TP_STATE_PATCHED = False
_TP = old_tp_group
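# Usage sketch (illustrative; `draft_tp_group`, `draft_model` and `inputs` are
# placeholders): temporarily swap in a smaller tp group for a draft model and
# restore the target model's group on exit:
#   with patch_tensor_parallel_group(draft_tp_group):
#       draft_logits = draft_model(inputs)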
def get_tensor_model_parallel_world_size():
"""Return world size for the tensor model parallel group."""
return get_tp_group().world_size
def get_tensor_model_parallel_rank():
"""Return my rank for the tensor model parallel group."""
return get_tp_group().rank_in_group
def destroy_model_parallel():
"""Set the groups to none and destroy them."""
global _TP
if _TP:
_TP.destroy()
_TP = None
global _PP
if _PP:
_PP.destroy()
_PP = None
def destroy_distributed_environment():
global _WORLD
if _WORLD:
_WORLD.destroy()
_WORLD = None
if torch.distributed.is_initialized():
torch.distributed.destroy_process_group()
def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
destroy_model_parallel()
destroy_distributed_environment()
with contextlib.suppress(AssertionError):
torch.distributed.destroy_process_group()
if shutdown_ray:
import ray # Lazy import Ray
ray.shutdown()
gc.collect()
if not current_platform.is_cpu():
torch.cuda.empty_cache()
def in_the_same_node_as(pg: ProcessGroup, source_rank: int = 0) -> List[bool]:
"""
This is a collective operation that returns if each rank is in the same node
as the source rank. It tests if processes are attached to the same
memory system (shared access to shared memory).
"""
assert (
torch.distributed.get_backend(pg) != torch.distributed.Backend.NCCL
), "in_the_same_node_as should be tested with a non-NCCL group."
# local rank inside the group
rank = torch.distributed.get_rank(group=pg)
world_size = torch.distributed.get_world_size(group=pg)
# local tensor in each process to store the result
is_in_the_same_node = torch.tensor([0] * world_size, dtype=torch.int32)
# global ranks of the processes in the group
ranks = torch.distributed.get_process_group_ranks(pg)
magic_message = b"magic_message"
shm = None
try:
with contextlib.suppress(OSError):
if rank == source_rank:
# create a shared memory segment
shm = shared_memory.SharedMemory(create=True, size=128)
shm.buf[: len(magic_message)] = magic_message
torch.distributed.broadcast_object_list(
[shm.name], src=ranks[source_rank], group=pg
)
is_in_the_same_node[rank] = 1
else:
# try to open the shared memory segment
recv = [None]
torch.distributed.broadcast_object_list(
recv, src=ranks[source_rank], group=pg
)
name = recv[0]
# fix to https://stackoverflow.com/q/62748654/9191338
# Python incorrectly tracks shared memory even if it is not
# created by the process. The following patch is a workaround.
with patch(
"multiprocessing.resource_tracker.register",
lambda *args, **kwargs: None,
):
shm = shared_memory.SharedMemory(name=name)
if shm.buf[: len(magic_message)] == magic_message:
is_in_the_same_node[rank] = 1
except Exception as e:
print("Error ignored in is_in_the_same_node: %s", e)
finally:
if shm:
shm.close()
torch.distributed.barrier(group=pg)
# clean up the shared memory segment
with contextlib.suppress(OSError):
if rank == source_rank and shm:
shm.unlink()
torch.distributed.all_reduce(is_in_the_same_node, group=pg)
return [x == 1 for x in is_in_the_same_node.tolist()]
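# Usage sketch (illustrative): this helper must be called with a non-NCCL
# group, e.g. the gloo/CPU group of the world coordinator:
#   same_node = in_the_same_node_as(get_world_group().cpu_group, source_rank=0)
#   # same_node[i] is True iff rank i shares shared memory with rank 0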
from contextlib import contextmanager
from typing import Optional, Union
# ===================== import region =====================
import torch
import torch.distributed as dist
from torch.distributed import ProcessGroup, ReduceOp
from server.inference.distributed.pynccl_wrapper import (
NCCLLibrary,
buffer_type,
cudaStream_t,
ncclComm_t,
ncclDataTypeEnum,
ncclRedOpTypeEnum,
ncclUniqueId,
)
from server.inference.distributed.utils import StatelessProcessGroup
class PyNcclCommunicator:
def __init__(
self,
group: Union[ProcessGroup, StatelessProcessGroup],
device: Union[int, str, torch.device],
library_path: Optional[str] = None,
):
"""
Args:
group: the process group to work on. If None, it will use the
default process group.
device: the device to bind the PyNcclCommunicator to. If None,
                it will be bound to f"cuda:{local_rank}".
library_path: the path to the NCCL library. If None, it will
use the default library path.
It is the caller's responsibility to make sure each communicator
            is bound to a unique device.
"""
if not isinstance(group, StatelessProcessGroup):
assert dist.is_initialized()
assert (
dist.get_backend(group) != dist.Backend.NCCL
), "PyNcclCommunicator should be attached to a non-NCCL group."
# note: this rank is the rank in the group
self.rank = dist.get_rank(group)
self.world_size = dist.get_world_size(group)
else:
self.rank = group.rank
self.world_size = group.world_size
self.group = group
# if world_size == 1, no need to create communicator
if self.world_size == 1:
self.available = False
self.disabled = True
self.stream = None
return
try:
self.nccl = NCCLLibrary(library_path)
except Exception:
# disable because of missing NCCL library
# e.g. in a non-GPU environment
self.available = False
self.disabled = True
self.stream = None
return
self.available = True
self.disabled = False
print("vLLM is using nccl==%s", self.nccl.ncclGetVersion())
if self.rank == 0:
# get the unique id from NCCL
self.unique_id = self.nccl.ncclGetUniqueId()
else:
# construct an empty unique id
self.unique_id = ncclUniqueId()
if not isinstance(group, StatelessProcessGroup):
tensor = torch.ByteTensor(list(self.unique_id.internal))
ranks = dist.get_process_group_ranks(group)
# arg `src` in `broadcast` is the global rank
dist.broadcast(tensor, src=ranks[0], group=group)
byte_list = tensor.tolist()
for i, byte in enumerate(byte_list):
self.unique_id.internal[i] = byte
else:
self.unique_id = group.broadcast_obj(self.unique_id, src=0)
if isinstance(device, int):
device = torch.device(f"cuda:{device}")
elif isinstance(device, str):
device = torch.device(device)
# now `device` is a `torch.device` object
assert isinstance(device, torch.device)
self.device = device
# nccl communicator and stream will use this device
# `torch.cuda.device` is a context manager that changes the
# current cuda device to the specified one
with torch.cuda.device(device):
self.comm: ncclComm_t = self.nccl.ncclCommInitRank(
self.world_size, self.unique_id, self.rank
)
self.stream = torch.cuda.Stream()
# A small all_reduce for warmup.
data = torch.zeros(1, device=device)
self.all_reduce(data)
self.stream.synchronize()
del data
# by default it is disabled, e.g. in profiling models and prefill phase.
# to use it, use under `with obj.change_state(enable=True)`, usually
# when we are using CUDA graph.
self.disabled = True
def all_reduce(
self, tensor: torch.Tensor, op: ReduceOp = ReduceOp.SUM, stream=None
):
if self.disabled:
return
# nccl communicator created on a specific device
# will only work on tensors on the same device
# otherwise it will cause "illegal memory access"
assert tensor.device == self.device, (
f"this nccl communicator is created to work on {self.device}, "
f"but the input tensor is on {tensor.device}"
)
if stream is None:
stream = self.stream
self.nccl.ncclAllReduce(
buffer_type(tensor.data_ptr()),
buffer_type(tensor.data_ptr()),
tensor.numel(),
ncclDataTypeEnum.from_torch(tensor.dtype),
ncclRedOpTypeEnum.from_torch(op),
self.comm,
cudaStream_t(stream.cuda_stream),
)
def send(self, tensor: torch.Tensor, dst: int, stream=None):
if self.disabled:
return
assert tensor.device == self.device, (
f"this nccl communicator is created to work on {self.device}, "
f"but the input tensor is on {tensor.device}"
)
if stream is None:
stream = self.stream
self.nccl.ncclSend(
buffer_type(tensor.data_ptr()),
tensor.numel(),
ncclDataTypeEnum.from_torch(tensor.dtype),
dst,
self.comm,
cudaStream_t(stream.cuda_stream),
)
def recv(self, tensor: torch.Tensor, src: int, stream=None):
if self.disabled:
return
assert tensor.device == self.device, (
f"this nccl communicator is created to work on {self.device}, "
f"but the input tensor is on {tensor.device}"
)
if stream is None:
stream = self.stream
self.nccl.ncclRecv(
buffer_type(tensor.data_ptr()),
tensor.numel(),
ncclDataTypeEnum.from_torch(tensor.dtype),
src,
self.comm,
cudaStream_t(stream.cuda_stream),
)
@contextmanager
def change_state(
self, enable: Optional[bool] = None, stream: Optional[torch.cuda.Stream] = None
):
"""
A context manager to change the state of the communicator.
"""
if enable is None:
# guess a default value when not specified
enable = self.available
if stream is None:
stream = self.stream
old_disable = self.disabled
old_stream = self.stream
self.stream = stream
self.disabled = not enable
yield
self.disabled = old_disable
self.stream = old_stream
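# Usage sketch for PyNcclCommunicator (illustrative; assumes a CUDA-capable
# node and an already-created non-NCCL group `cpu_group`):
#   comm = PyNcclCommunicator(cpu_group, device=torch.cuda.current_device())
#   t = torch.ones(16, device=comm.device)
#   with comm.change_state(enable=True):   # communication is disabled by default
#       comm.all_reduce(t)                  # in-place sum across all ranks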
# This file is a pure Python wrapper for the NCCL library.
# The main purpose is to use NCCL combined with CUDA graph.
# Before writing this script, we tried the following approach:
# 1. We tried to use `cupy`, it calls NCCL correctly, but `cupy` itself
# often gets stuck when initializing the NCCL communicator.
# 2. We tried to use `torch.distributed`, but `torch.distributed.all_reduce`
# contains many other potential cuda APIs, that are not allowed during
# capturing the CUDA graph. For further details, please check
# https://discuss.pytorch.org/t/pytorch-cudagraph-with-nccl-operation-failed/ .
#
# Another rejected idea is to write a C/C++ binding for NCCL. It is usually
# doable, but we often encounter issues related with nccl versions, and need
# to switch between different versions of NCCL. See
# https://github.com/NVIDIA/nccl/issues/1234 for more details.
# A C/C++ binding is not flexible enough to handle this. It requires
# recompilation of the code every time we want to switch between different
# versions. This current implementation, with a **pure** Python wrapper, is
# more flexible. We can easily switch between different versions of NCCL by
# changing the environment variable `VLLM_NCCL_SO_PATH`, or the `so_file`
# variable in the code.
import ctypes
import platform
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
import torch
from torch.distributed import ReduceOp
from server.utils import find_nccl_library
# === export types and functions from nccl to Python ===
# for the original nccl definition, please check
# https://github.com/NVIDIA/nccl/blob/master/src/nccl.h.in
ncclResult_t = ctypes.c_int
ncclComm_t = ctypes.c_void_p
class ncclUniqueId(ctypes.Structure):
_fields_ = [("internal", ctypes.c_byte * 128)]
cudaStream_t = ctypes.c_void_p
buffer_type = ctypes.c_void_p
ncclDataType_t = ctypes.c_int
class ncclDataTypeEnum:
ncclInt8 = 0
ncclChar = 0
ncclUint8 = 1
ncclInt32 = 2
ncclInt = 2
ncclUint32 = 3
ncclInt64 = 4
ncclUint64 = 5
ncclFloat16 = 6
ncclHalf = 6
ncclFloat32 = 7
ncclFloat = 7
ncclFloat64 = 8
ncclDouble = 8
ncclBfloat16 = 9
ncclNumTypes = 10
@classmethod
def from_torch(cls, dtype: torch.dtype) -> int:
if dtype == torch.int8:
return cls.ncclInt8
if dtype == torch.uint8:
return cls.ncclUint8
if dtype == torch.int32:
return cls.ncclInt32
if dtype == torch.int64:
return cls.ncclInt64
if dtype == torch.float16:
return cls.ncclFloat16
if dtype == torch.float32:
return cls.ncclFloat32
if dtype == torch.float64:
return cls.ncclFloat64
if dtype == torch.bfloat16:
return cls.ncclBfloat16
raise ValueError(f"Unsupported dtype: {dtype}")
ncclRedOp_t = ctypes.c_int
class ncclRedOpTypeEnum:
ncclSum = 0
ncclProd = 1
ncclMax = 2
ncclMin = 3
ncclAvg = 4
ncclNumOps = 5
@classmethod
def from_torch(cls, op: ReduceOp) -> int:
if op == ReduceOp.SUM:
return cls.ncclSum
if op == ReduceOp.PRODUCT:
return cls.ncclProd
if op == ReduceOp.MAX:
return cls.ncclMax
if op == ReduceOp.MIN:
return cls.ncclMin
if op == ReduceOp.AVG:
return cls.ncclAvg
raise ValueError(f"Unsupported op: {op}")
@dataclass
class Function:
name: str
restype: Any
argtypes: List[Any]
class NCCLLibrary:
exported_functions = [
# const char* ncclGetErrorString(ncclResult_t result)
Function("ncclGetErrorString", ctypes.c_char_p, [ncclResult_t]),
# ncclResult_t ncclGetVersion(int *version);
Function("ncclGetVersion", ncclResult_t,
[ctypes.POINTER(ctypes.c_int)]),
# ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
Function("ncclGetUniqueId", ncclResult_t,
[ctypes.POINTER(ncclUniqueId)]),
# ncclResult_t ncclCommInitRank(
# ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
# note that ncclComm_t is a pointer type, so the first argument
# is a pointer to a pointer
Function("ncclCommInitRank", ncclResult_t, [
ctypes.POINTER(ncclComm_t), ctypes.c_int, ncclUniqueId,
ctypes.c_int
]),
# ncclResult_t ncclAllReduce(
# const void* sendbuff, void* recvbuff, size_t count,
# ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
# cudaStream_t stream);
# note that cudaStream_t is a pointer type, so the last argument
# is a pointer
Function("ncclAllReduce", ncclResult_t, [
buffer_type, buffer_type, ctypes.c_size_t, ncclDataType_t,
ncclRedOp_t, ncclComm_t, cudaStream_t
]),
# ncclResult_t ncclSend(
# const void* sendbuff, size_t count, ncclDataType_t datatype,
# int dest, ncclComm_t comm, cudaStream_t stream);
Function("ncclSend", ncclResult_t, [
buffer_type, ctypes.c_size_t, ncclDataType_t, ctypes.c_int,
ncclComm_t, cudaStream_t
]),
# ncclResult_t ncclRecv(
# void* recvbuff, size_t count, ncclDataType_t datatype,
# int src, ncclComm_t comm, cudaStream_t stream);
Function("ncclRecv", ncclResult_t, [
buffer_type, ctypes.c_size_t, ncclDataType_t, ctypes.c_int,
ncclComm_t, cudaStream_t
]),
# be cautious! this is a collective call, it will block until all
# processes in the communicator have called this function.
# because Python object destruction can happen in random order,
# it is better not to call it at all.
# ncclResult_t ncclCommDestroy(ncclComm_t comm);
Function("ncclCommDestroy", ncclResult_t, [ncclComm_t]),
]
# class attribute to store the mapping from the path to the library
# to avoid loading the same library multiple times
path_to_library_cache: Dict[str, Any] = {}
# class attribute to store the mapping from library path
# to the corresponding dictionary
path_to_dict_mapping: Dict[str, Dict[str, Any]] = {}
def __init__(self, so_file: Optional[str] = None):
so_file = so_file or find_nccl_library()
try:
if so_file not in NCCLLibrary.path_to_dict_mapping:
lib = ctypes.CDLL(so_file)
NCCLLibrary.path_to_library_cache[so_file] = lib
self.lib = NCCLLibrary.path_to_library_cache[so_file]
except Exception as e:
            print(
                f"Failed to load NCCL library from {so_file}. "
                "It is expected if you are not running on NVIDIA/AMD GPUs. "
                "Otherwise, the nccl library might not exist, be corrupted, "
                f"or it does not support the current platform {platform.platform()}. "
                "If you already have the library, please set the "
                "environment variable VLLM_NCCL_SO_PATH "
                "to point to the correct nccl library path."
            )
raise e
if so_file not in NCCLLibrary.path_to_dict_mapping:
_funcs: Dict[str, Any] = {}
for func in NCCLLibrary.exported_functions:
f = getattr(self.lib, func.name)
f.restype = func.restype
f.argtypes = func.argtypes
_funcs[func.name] = f
NCCLLibrary.path_to_dict_mapping[so_file] = _funcs
self._funcs = NCCLLibrary.path_to_dict_mapping[so_file]
def ncclGetErrorString(self, result: ncclResult_t) -> str:
return self._funcs["ncclGetErrorString"](result).decode("utf-8")
def NCCL_CHECK(self, result: ncclResult_t) -> None:
if result != 0:
error_str = self.ncclGetErrorString(result)
raise RuntimeError(f"NCCL error: {error_str}")
def ncclGetVersion(self) -> str:
version = ctypes.c_int()
self.NCCL_CHECK(self._funcs["ncclGetVersion"](ctypes.byref(version)))
version_str = str(version.value)
# something like 21903 --> "2.19.3"
major = version_str[0].lstrip("0")
minor = version_str[1:3].lstrip("0")
patch = version_str[3:].lstrip("0")
return f"{major}.{minor}.{patch}"
def ncclGetUniqueId(self) -> ncclUniqueId:
unique_id = ncclUniqueId()
self.NCCL_CHECK(self._funcs["ncclGetUniqueId"](
ctypes.byref(unique_id)))
return unique_id
def ncclCommInitRank(self, world_size: int, unique_id: ncclUniqueId,
rank: int) -> ncclComm_t:
comm = ncclComm_t()
self.NCCL_CHECK(self._funcs["ncclCommInitRank"](ctypes.byref(comm),
world_size, unique_id,
rank))
return comm
def ncclAllReduce(self, sendbuff: buffer_type, recvbuff: buffer_type,
count: int, datatype: int, op: int, comm: ncclComm_t,
stream: cudaStream_t) -> None:
# `datatype` actually should be `ncclDataType_t`
# and `op` should be `ncclRedOp_t`
# both are aliases of `ctypes.c_int`
# when we pass int to a function, it will be converted to `ctypes.c_int`
# by ctypes automatically
self.NCCL_CHECK(self._funcs["ncclAllReduce"](sendbuff, recvbuff, count,
datatype, op, comm,
stream))
def ncclSend(self, sendbuff: buffer_type, count: int, datatype: int,
dest: int, comm: ncclComm_t, stream: cudaStream_t) -> None:
self.NCCL_CHECK(self._funcs["ncclSend"](sendbuff, count, datatype,
dest, comm, stream))
def ncclRecv(self, recvbuff: buffer_type, count: int, datatype: int,
src: int, comm: ncclComm_t, stream: cudaStream_t) -> None:
self.NCCL_CHECK(self._funcs["ncclRecv"](recvbuff, count, datatype, src,
comm, stream))
def ncclCommDestroy(self, comm: ncclComm_t) -> None:
self.NCCL_CHECK(self._funcs["ncclCommDestroy"](comm))
__all__ = [
"NCCLLibrary", "ncclDataTypeEnum", "ncclRedOpTypeEnum", "ncclUniqueId",
"ncclComm_t", "cudaStream_t", "buffer_type"
]
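# Minimal sketch of how this wrapper is typically exercised (illustrative; the
# function name is not part of the module, and an NCCL shared library must be
# discoverable via find_nccl_library() / VLLM_NCCL_SO_PATH):
def _example_query_nccl_version() -> str:
    lib = NCCLLibrary()           # loads libnccl through ctypes
    return lib.ncclGetVersion()   # e.g. "2.19.3"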
# Copyright 2023 The vLLM team.
# Adapted from
# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
import dataclasses
import pickle
import time
from collections import deque
from typing import Any, Deque, Dict, Optional, Sequence, Tuple
import torch
from torch.distributed import TCPStore
import server.envs as envs
def ensure_divisibility(numerator, denominator):
"""Ensure that numerator is divisible by the denominator."""
assert numerator % denominator == 0, "{} is not divisible by {}".format(
numerator, denominator
)
def divide(numerator, denominator):
"""Ensure that numerator is divisible by the denominator and return
the division value."""
ensure_divisibility(numerator, denominator)
return numerator // denominator
def split_tensor_along_last_dim(
tensor: torch.Tensor,
num_partitions: int,
contiguous_split_chunks: bool = False,
) -> Sequence[torch.Tensor]:
"""Split a tensor along its last dimension.
Arguments:
tensor: input tensor.
num_partitions: number of partitions to split the tensor
contiguous_split_chunks: If True, make each chunk contiguous
in memory.
Returns:
A list of Tensors
"""
# Get the size and dimension.
last_dim = tensor.dim() - 1
last_dim_size = divide(tensor.size()[last_dim], num_partitions)
# Split.
tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
# NOTE: torch.split does not create contiguous tensors by default.
if contiguous_split_chunks:
return tuple(chunk.contiguous() for chunk in tensor_list)
return tensor_list
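# Example (illustrative): a (4, 6) tensor split into 3 partitions along the
# last dimension yields three (4, 2) views; pass contiguous_split_chunks=True
# when the chunks must be contiguous in memory:
#   chunks = split_tensor_along_last_dim(torch.randn(4, 6), num_partitions=3)
#   assert all(c.shape == (4, 2) for c in chunks)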
def get_pp_indices(
num_hidden_layers: int, pp_rank: int, pp_size: int
) -> Tuple[int, int]:
"""Try to evenly distribute layers across partitions.
If the number of layers is not divisible by the number of partitions,
the last partition will have the remaining layers.
"""
partition_list_str = envs.VLLM_PP_LAYER_PARTITION
if partition_list_str is not None:
try:
partitions = [int(layer) for layer in partition_list_str.split(",")]
except ValueError as err:
raise ValueError(
"Invalid partition string: {}".format(partition_list_str)
) from err
if len(partitions) != pp_size:
raise ValueError(f"{len(partitions)=} does not match {pp_size=}.")
if sum(partitions) != num_hidden_layers:
raise ValueError(f"{sum(partitions)=} does not match {num_hidden_layers=}.")
start_layer = sum(partitions[:pp_rank])
end_layer = start_layer + partitions[pp_rank]
else:
layers_per_partition = num_hidden_layers // pp_size
start_layer = pp_rank * layers_per_partition
end_layer = start_layer + layers_per_partition
if pp_rank == pp_size - 1:
end_layer = num_hidden_layers
return (start_layer, end_layer)
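# Worked example of the default (even) split above: 30 hidden layers over
# pp_size=4 give 7 layers per stage, with the remainder folded into the last
# stage, i.e. ranks 0..3 own the ranges [0, 7), [7, 14), [14, 21), [21, 30).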
@dataclasses.dataclass
class StatelessProcessGroup:
"""A dataclass to hold a metadata store, and the rank, world_size of the
group. Only use it to communicate metadata between processes.
For data-plane communication, create NCCL-related objects.
"""
rank: int
world_size: int
store: torch._C._distributed_c10d.Store
data_expiration_seconds: int = 3600 # 1 hour
# dst rank -> counter
send_dst_counter: Dict[int, int] = dataclasses.field(default_factory=dict)
# src rank -> counter
recv_src_counter: Dict[int, int] = dataclasses.field(default_factory=dict)
broadcast_send_counter: int = 0
broadcast_recv_src_counter: Dict[int, int] = dataclasses.field(default_factory=dict)
# A deque to store the data entries, with key and timestamp.
entries: Deque[Tuple[str, float]] = dataclasses.field(default_factory=deque)
def __post_init__(self):
assert self.rank < self.world_size
self.send_dst_counter = {i: 0 for i in range(self.world_size)}
self.recv_src_counter = {i: 0 for i in range(self.world_size)}
self.broadcast_recv_src_counter = {i: 0 for i in range(self.world_size)}
def send_obj(self, obj: Any, dst: int):
"""Send an object to a destination rank."""
self.expire_data()
key = f"send_to/{dst}/{self.send_dst_counter[dst]}"
self.store.set(key, pickle.dumps(obj))
self.send_dst_counter[dst] += 1
self.entries.append((key, time.time()))
def expire_data(self):
"""Expire data that is older than `data_expiration_seconds` seconds."""
while self.entries:
# check the oldest entry
key, timestamp = self.entries[0]
if time.time() - timestamp > self.data_expiration_seconds:
self.store.delete_key(key)
self.entries.popleft()
else:
break
def recv_obj(self, src: int) -> Any:
"""Receive an object from a source rank."""
obj = pickle.loads(
self.store.get(f"send_to/{self.rank}/{self.recv_src_counter[src]}")
)
self.recv_src_counter[src] += 1
return obj
def broadcast_obj(self, obj: Optional[Any], src: int) -> Any:
"""Broadcast an object from a source rank to all other ranks.
It does not clean up after all ranks have received the object.
Use it for limited times, e.g., for initialization.
"""
if self.rank == src:
self.expire_data()
key = f"broadcast_from/{src}/" f"{self.broadcast_send_counter}"
self.store.set(key, pickle.dumps(obj))
self.broadcast_send_counter += 1
self.entries.append((key, time.time()))
return obj
else:
key = f"broadcast_from/{src}/" f"{self.broadcast_recv_src_counter[src]}"
recv_obj = pickle.loads(self.store.get(key))
self.broadcast_recv_src_counter[src] += 1
return recv_obj
def all_gather_obj(self, obj: Any) -> list[Any]:
"""All gather an object from all ranks."""
gathered_objs = []
for i in range(self.world_size):
if i == self.rank:
gathered_objs.append(obj)
self.broadcast_obj(obj, src=self.rank)
else:
recv_obj = self.broadcast_obj(None, src=i)
gathered_objs.append(recv_obj)
return gathered_objs
def barrier(self):
"""A barrier to synchronize all ranks."""
for i in range(self.world_size):
if i == self.rank:
self.broadcast_obj(None, src=self.rank)
else:
self.broadcast_obj(None, src=i)
@staticmethod
def create(
host: str,
port: int,
rank: int,
world_size: int,
data_expiration_seconds: int = 3600,
) -> "StatelessProcessGroup":
"""A replacement for `torch.distributed.init_process_group` that does not
pollute the global state.
If we have process A and process B called `torch.distributed.init_process_group`
to form a group, and then we want to form another group with process A, B, C,
D, it is not possible in PyTorch, because process A and process B have already
formed a group, and process C and process D cannot join that group. This
function is a workaround for this issue.
`torch.distributed.init_process_group` is a global call, while this function
is a stateless call. It will return a `StatelessProcessGroup` object that can be
used for exchanging metadata. With this function, process A and process B
can call `StatelessProcessGroup.create` to form a group, and then process A, B,
C, and D can call `StatelessProcessGroup.create` to form another group.
""" # noqa
store = TCPStore(
host_name=host,
port=port,
world_size=world_size,
is_master=(rank == 0),
)
return StatelessProcessGroup(
rank=rank,
world_size=world_size,
store=store,
data_expiration_seconds=data_expiration_seconds,
)
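# Usage sketch (illustrative; host/port values are placeholders): every rank
# builds its own group object against the same TCPStore endpoint and then
# exchanges metadata only:
#   pg = StatelessProcessGroup.create(host="127.0.0.1", port=29555,
#                                     rank=rank, world_size=2)
#   if pg.rank == 0:
#       pg.send_obj({"ready": True}, dst=1)
#   else:
#       msg = pg.recv_obj(src=0)
#   pg.barrier()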
'''
Date: 2024-11-12 14:15:16
LastEditors: Xie Weiyu ervinxie@qq.com
LastEditTime: 2024-11-26 08:12:49
'''
import torch
from ktransformers.server.balance_serve.settings import sched_ext
from ktransformers.server.balance_serve.inference.query_manager import QueryManager, QueryInfo
import time
from ktransformers.server.config.config import Config
class ForwardBatchInput:
class ForwardMiniBatch:
q_indptr: torch.Tensor
kv_indptr: torch.Tensor
kv_indices: torch.Tensor
kv_last_page_len: torch.Tensor
kv_len: torch.Tensor
position_ids: torch.Tensor
tokens: torch.Tensor
batch_indices: torch.Tensor
positions: torch.Tensor
chunk_size: int
decode_batch: int
is_last_prefill_chunk: bool
logits_start: list
temperatures: torch.Tensor
top_ps: torch.Tensor
def __init__(self, prefill_querys_info: list[QueryInfo], decode_querys_info: list[QueryInfo], prefill_s: list[int] = None, prefill_l: list[int] = None, device = torch.device('cuda'), page_size = 256):
batch_decode = len(decode_querys_info)
batch_prefill = len(prefill_querys_info)
self.q_indptr = torch.tensor([0], device=device, dtype=torch.int32)
self.kv_indptr = torch.tensor([0], device=device, dtype=torch.int32)
self.kv_indices = torch.tensor([], device=device, dtype=torch.int32)
self.kv_len = torch.tensor([], device=device, dtype=torch.int32)
self.kv_last_page_len = torch.tensor([], device=device, dtype=torch.int32)
self.position_ids = torch.tensor([], device=device, dtype=torch.int32)
self.tokens = torch.tensor([], device=device, dtype=torch.int32)
self.temperatures = torch.tensor([], device=device, dtype=torch.float32)
self.top_ps = torch.tensor([], device=device, dtype=torch.float32)
self.logits_start = []
self.decode_batch = batch_decode
self.num_tokens = batch_decode + sum(prefill_l)
self.batch_size = batch_decode + batch_prefill
for i, prefill_query_info in enumerate(prefill_querys_info):
                if prefill_query_info is not None:
                    prefill_kv_block_len = (prefill_query_info.active_position + prefill_l[i] + page_size - 1) // page_size
# print(f"block_len: {prefill_kv_block_len}, page_size: {page_size}")
self.q_indptr = torch.concat((self.q_indptr, torch.tensor([prefill_l[i] + self.q_indptr[-1]], device=device, dtype=torch.int32)), dim=0)
self.kv_indptr = torch.concat((self.kv_indptr, torch.tensor([prefill_kv_block_len + self.kv_indptr[-1]], device=device, dtype=torch.int32)), dim=0)
self.kv_indices = torch.concat((self.kv_indices, prefill_query_info.block_index[:prefill_kv_block_len]), dim=0)
self.kv_last_page_len = torch.concat((self.kv_last_page_len, torch.tensor([(prefill_query_info.active_position + prefill_l[i]) % page_size if (prefill_query_info.active_position + prefill_l[i]) % page_size != 0 else page_size], device=device, dtype=torch.int32)), dim=0)
self.kv_len = torch.concat((self.kv_len, torch.tensor([(prefill_query_info.active_position + prefill_l[i])], device=device, dtype=torch.int32)), dim=0)
self.position_ids = torch.concat((self.position_ids, torch.arange(prefill_s[i], prefill_l[i] + prefill_s[i], device=device, dtype=torch.int32)), dim=0)
self.tokens = torch.concat((self.tokens, prefill_query_info.query_tokens[prefill_s[i]:prefill_s[i] + prefill_l[i]]), dim=0)
self.logits_start.append(prefill_l[i] - 1 if len(self.logits_start) == 0 else sum(prefill_l[:i+1])-1)
self.temperatures = torch.concat((self.temperatures, torch.tensor([prefill_query_info.temperature], device=device, dtype=torch.float32)), dim=0)
self.top_ps = torch.concat((self.top_ps, torch.tensor([prefill_query_info.top_p], device=device, dtype=torch.float32)), dim=0)
for decode_query_info in decode_querys_info:
decode_kv_block_len = (decode_query_info.active_position + 1 + page_size - 1) // page_size
self.q_indptr = torch.concat((self.q_indptr, torch.tensor([1 + self.q_indptr[-1]], device=device, dtype=torch.int32)), dim=0)
self.kv_indptr = torch.concat((self.kv_indptr, torch.tensor([decode_kv_block_len+self.kv_indptr[-1]], device=device, dtype=torch.int32)), dim=0)
self.kv_indices = torch.concat((self.kv_indices, decode_query_info.block_index[:decode_kv_block_len]), dim=0)
self.kv_last_page_len = torch.concat((self.kv_last_page_len, torch.tensor([(decode_query_info.active_position + 1) % page_size if (decode_query_info.active_position + 1) % page_size != 0 else page_size], device=device, dtype=torch.int32)), dim=0)
self.kv_len = torch.concat((self.kv_len, torch.tensor([(decode_query_info.active_position + 1)], device=device, dtype=torch.int32)), dim=0)
self.position_ids = torch.concat((self.position_ids, torch.arange(decode_query_info.active_position, decode_query_info.active_position + 1, device=device, dtype=torch.int32)), dim=0)
if decode_query_info.active_position > 0:
self.tokens = torch.concat((self.tokens, decode_query_info.query_tokens[decode_query_info.active_position:decode_query_info.active_position+1]), dim=0)
else:
self.tokens = torch.concat((self.tokens, torch.tensor([0], device=device, dtype=torch.int32)), dim=0)
self.logits_start.append(0 if len(self.logits_start) == 0 else self.logits_start[-1]+1)
self.temperatures = torch.concat((self.temperatures, torch.tensor([decode_query_info.temperature], device=device, dtype=torch.float32)), dim=0)
self.top_ps = torch.concat((self.top_ps, torch.tensor([decode_query_info.top_p], device=device, dtype=torch.float32)), dim=0)
self.q_indptr = self.q_indptr.contiguous()
self.kv_indptr = self.kv_indptr.contiguous()
self.kv_indices = self.kv_indices.contiguous()
self.kv_len = self.kv_len.contiguous()
self.kv_last_page_len = self.kv_last_page_len.contiguous()
self.position_ids = self.position_ids.contiguous()
self.tokens = self.tokens.contiguous()
self.bsz_tensor = torch.tensor([self.batch_size], device=device, dtype=torch.int32)
def fill(self, prefill_querys_info: list[QueryInfo], decode_querys_info: list[QueryInfo], prefill_s: list[int] = None, prefill_l: list[int] = None, device = torch.device('cuda'), page_size = 256):
batch_decode = len(decode_querys_info)
batch_prefill = len(prefill_querys_info)
self.q_indptr = torch.tensor([0], device=device, dtype=torch.int32)
self.kv_indptr = torch.tensor([0], device=device, dtype=torch.int32)
self.kv_indices = torch.tensor([], device=device, dtype=torch.int32)
self.kv_len = torch.tensor([], device=device, dtype=torch.int32)
self.kv_last_page_len = torch.tensor([], device=device, dtype=torch.int32)
new_position_ids = torch.tensor([], device=device, dtype=torch.int32)
new_tokens = torch.tensor([], device=device, dtype=torch.int32)
self.temperatures = torch.tensor([], device=device, dtype=torch.float32)
self.top_ps = torch.tensor([], device=device, dtype=torch.float32)
self.logits_start = []
self.decode_batch = batch_decode
self.num_tokens = batch_decode + sum(prefill_l)
self.batch_size = batch_decode + batch_prefill
for i, prefill_query_info in enumerate(prefill_querys_info):
prefill_kv_block_len = (prefill_query_info.active_position + prefill_l[i] + page_size - 1) // page_size if prefill_query_info is not None else 0
# print(f"block_len: {prefill_kv_block_len}, page_size: {page_size}")
self.q_indptr = torch.concat((self.q_indptr, torch.tensor([prefill_l[i] + self.q_indptr[-1]], device=device, dtype=torch.int32)), dim=0)
self.kv_indptr = torch.concat((self.kv_indptr, torch.tensor([prefill_kv_block_len + self.kv_indptr[-1]], device=device, dtype=torch.int32)), dim=0)
self.kv_indices = torch.concat((self.kv_indices, prefill_query_info.block_index[:prefill_kv_block_len]), dim=0)
self.kv_last_page_len = torch.concat((self.kv_last_page_len, torch.tensor([(prefill_query_info.active_position + prefill_l[i]) % page_size if (prefill_query_info.active_position + prefill_l[i]) % page_size != 0 else page_size], device=device, dtype=torch.int32)), dim=0)
self.kv_len = torch.concat((self.kv_len, torch.tensor([(prefill_query_info.active_position + prefill_l[i])], device=device, dtype=torch.int32)), dim=0)
new_position_ids = torch.concat((new_position_ids, torch.arange(prefill_s[i], prefill_l[i] + prefill_s[i], device=device, dtype=torch.int32)), dim=0)
new_tokens = torch.concat((new_tokens, prefill_query_info.query_tokens[prefill_s[i]:prefill_s[i] + prefill_l[i]]), dim=0)
self.logits_start.append(prefill_l[i] - 1 if len(self.logits_start) == 0 else sum(prefill_l[:i+1])-1)
self.temperatures = torch.concat((self.temperatures, torch.tensor([prefill_query_info.temperature], device=device, dtype=torch.float32)), dim=0)
self.top_ps = torch.concat((self.top_ps, torch.tensor([prefill_query_info.top_p], device=device, dtype=torch.float32)), dim=0)
for decode_query_info in decode_querys_info:
decode_kv_block_len = (decode_query_info.active_position + 1 + page_size - 1) // page_size
self.q_indptr = torch.concat((self.q_indptr, torch.tensor([1 + self.q_indptr[-1]], device=device, dtype=torch.int32)), dim=0)
self.kv_indptr = torch.concat((self.kv_indptr, torch.tensor([decode_kv_block_len+self.kv_indptr[-1]], device=device, dtype=torch.int32)), dim=0)
self.kv_indices = torch.concat((self.kv_indices, decode_query_info.block_index[:decode_kv_block_len]), dim=0)
self.kv_last_page_len = torch.concat((self.kv_last_page_len, torch.tensor([(decode_query_info.active_position + 1) % page_size if (decode_query_info.active_position + 1) % page_size != 0 else page_size], device=device, dtype=torch.int32)), dim=0)
self.kv_len = torch.concat((self.kv_len, torch.tensor([(decode_query_info.active_position + 1)], device=device, dtype=torch.int32)), dim=0)
new_position_ids = torch.concat((new_position_ids, torch.arange(decode_query_info.active_position, decode_query_info.active_position + 1, device=device, dtype=torch.int32)), dim=0)
if decode_query_info.active_position > 0:
new_tokens = torch.concat((new_tokens, decode_query_info.query_tokens[decode_query_info.active_position:decode_query_info.active_position+1]), dim=0)
else:
new_tokens = torch.concat((new_tokens, torch.tensor([0], device=device, dtype=torch.int32)), dim=0)
self.logits_start.append(0 if len(self.logits_start) == 0 else self.logits_start[-1]+1)
self.temperatures = torch.concat((self.temperatures, torch.tensor([decode_query_info.temperature], device=device, dtype=torch.float32)), dim=0)
self.top_ps = torch.concat((self.top_ps, torch.tensor([decode_query_info.top_p], device=device, dtype=torch.float32)), dim=0)
self.q_indptr = self.q_indptr.contiguous()
self.kv_indptr = self.kv_indptr.contiguous()
self.kv_indices = self.kv_indices.contiguous()
self.kv_len = self.kv_len.contiguous()
self.kv_last_page_len = self.kv_last_page_len.contiguous()
self.bsz_tensor = torch.tensor([self.batch_size], device=device, dtype=torch.int32)
# copy new_position_ids and new_tokens to self.position_ids and self.tokens
# print("new_position_ids: ", new_position_ids)
# self.print()
self.position_ids[:new_position_ids.size(0)].copy_(new_position_ids)
self.position_ids[new_position_ids.size(0):].zero_()
self.tokens[:new_tokens.size(0)].copy_(new_tokens)
forward_minibatchs: list[ForwardMiniBatch]
batch_size: int
minibatch: ForwardMiniBatch
def __init__(self, batch : sched_ext.BatchQueryTodo = None, query_manager: QueryManager = None, device=None, tokens: torch.Tensor = None):
if batch is None:
return
prefill_minibatches = batch.prefill_mini_batches
decode_mini_batches = [item for sublist in batch.decode_mini_batches for item in sublist]
prefill_querys_info = []
prefill_s = []
prefill_l = []
decode_querys_info = []
self.batch_size = 1
for (id, s, l) in prefill_minibatches:
prefill_querys_info.append(query_manager.query_map[id])
prefill_s.append(s)
prefill_l.append(l)
for decode_batch_idx in decode_mini_batches:
if query_manager.query_map[decode_batch_idx].decode_start_time is None:
                query_manager.query_map[decode_batch_idx].decode_start_time = time.time()
decode_querys_info.append(query_manager.query_map[decode_batch_idx])
minibatch = ForwardBatchInput.ForwardMiniBatch(prefill_querys_info, decode_querys_info, prefill_s, prefill_l, device = query_manager.device, page_size = query_manager.page_size)
self.minibatch = minibatch
@classmethod
def gen_max_forward_batch(
cls,
device=None,
tokens: torch.Tensor = None,
num_mini_batches: int = 1,
max_seq_length: int = 1024, # TODO: add to yaml
prefill_query_length: int = (Config().chunk_size - Config().max_decode_batch_size) // Config().max_prefill_batch_size, # TODO: use config
prefill_active_length: int = (Config().chunk_size - Config().max_decode_batch_size) // Config().max_prefill_batch_size,
gen_prefill: bool = True,
decode_batch_size: int = Config().max_decode_batch_size,
decode_active_position: torch.Tensor = None,
page_size = 256,
cuda_lens = 1
):
instance = cls()
instance.batch_size = num_mini_batches
page_size = page_size
prefill_query_info = []
offset = 0
if gen_prefill and prefill_query_length != 0:
for i in range(Config().max_prefill_batch_size):
prefill_query_info.append(QueryInfo(i, prefill_query_length, max_seq_length, page_size, device, offset=offset))
offset += max_seq_length // page_size
decode_querys_info = []
for i in range(min(decode_batch_size, cuda_lens)):
query_info = QueryInfo(i+Config().max_prefill_batch_size, prefill_query_length, max_seq_length, page_size, device, is_prefill=False, offset=offset)
offset += max_seq_length // page_size
if tokens is not None:
query_info.query_tokens[prefill_active_length:prefill_active_length + 1].copy_(tokens)
if decode_active_position is None:
query_info.active_position = prefill_active_length
else:
query_info.active_position = decode_active_position[i]
decode_querys_info.append(query_info)
if prefill_query_length*Config().max_prefill_batch_size + len(decode_querys_info) < cuda_lens:
decode_querys_info.append(query_info)
instance.minibatch = ForwardBatchInput.ForwardMiniBatch(prefill_query_info, decode_querys_info, [0, 0], [prefill_active_length for _ in range(Config().max_prefill_batch_size)], device, page_size)
return instance
def fill(self, batch : sched_ext.BatchQueryTodo = None, query_manager: QueryManager = None, page_size = 256):
if batch is None:
return
prefill_minibatches = batch.prefill_mini_batches
decode_mini_batches = [item for sublist in batch.decode_mini_batches for item in sublist]
prefill_querys_info = []
prefill_s = []
prefill_l = []
decode_querys_info = []
self.batch_size = 1
for (id, s, l) in prefill_minibatches:
prefill_querys_info.append(query_manager.query_map[id])
prefill_s.append(s)
prefill_l.append(l)
for decode_batch_idx in decode_mini_batches:
if query_manager.query_map[decode_batch_idx].decode_start_time is None:
                query_manager.query_map[decode_batch_idx].decode_start_time = time.time()
decode_querys_info.append(query_manager.query_map[decode_batch_idx])
self.minibatch.fill(prefill_querys_info, decode_querys_info, prefill_s, prefill_l, device=query_manager.device, page_size=page_size)
class ForwardBatchOutput:
logits: list[torch.Tensor]
num_batchs: int
batch_sizes: list[int]
generated_tokens_num: list[int]
lm_start: list[int]
temperatures: list[torch.Tensor]
top_ps: list[torch.Tensor]
def __init__(self):
self.logits = []
self.batch_sizes = []
self.generated_tokens_num = []
self.top_ps = []
self.temperatures = []
pass
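# Sketch of the paged-KV bookkeeping performed in ForwardMiniBatch above
# (illustrative helper, not used by the classes): for a query whose KV cache
# already holds `active_position` tokens and which appends `new_len` tokens in
# this step, the number of KV pages touched and the fill of the last page are:
def _example_paged_kv_counts(active_position: int, new_len: int, page_size: int = 256):
    total = active_position + new_len
    num_pages = (total + page_size - 1) // page_size          # ceil(total / page_size)
    last_page_len = total % page_size if total % page_size != 0 else page_size
    return num_pages, last_page_len
# e.g. _example_paged_kv_counts(300, 1) == (2, 45)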
"""
Date: 2024-11-07 07:02:20
LastEditors: djw
LastEditTime: 2024-12-10 08:48:32
"""
import torch
from torch import nn
import queue
import signal
import queue
from typing import AsyncIterable
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse
from contextlib import asynccontextmanager
from pydantic import BaseModel, Field
import asyncio
import multiprocessing
import time
import torch.multiprocessing as mp
import random
import torch.distributed as dist
import zmq
import tempfile
from ktransformers.server.balance_serve.inference.forward_batch import ForwardBatchInput, ForwardBatchOutput
from ktransformers.server.config.config import Config
from ktransformers.models.custom_modeling_deepseek_v3 import KDeepseekV3ForCausalLM
from ktransformers.models.custom_modeling_deepseek_v2 import KDeepseekV2ForCausalLM
from ktransformers.server.balance_serve.inference.query_manager import QueryManager
from ktransformers.server.balance_serve.settings import sched_ext
def pad_num_tokens(num_tokens):
return (num_tokens + 63) // 64 * 64
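# (rounds a token count up to the next multiple of 64;
#  e.g. pad_num_tokens(64) == 64, pad_num_tokens(65) == 128)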
def deduplicate_and_sort(lst):
return sorted(set(lst))
class ModelRunner:
"""A CudaGraphRunner runs the forward pass of a model with CUDA graph and torch.compile."""
model: KDeepseekV3ForCausalLM
input: ForwardBatchInput | list[ForwardBatchInput]
output: ForwardBatchOutput
def __init__(self, model = None, device = None, use_cuda_graph = False, max_decode_batch_size = 1, max_chunk_size = 4096, num_mini_batches: int = 1, page_size = 256):
self.stream = torch.cuda.Stream(device=device)
        # commented out for now
self.model = model # Compile and move model to the specified device
self.device = device
self.input = None
self.features_buf = None
self.output = None
self.graph_memory_pool = None
self.cuda_graphs = deduplicate_and_sort([1, 2, 3, Config().max_batch_size, 64, Config().chunk_size])
self.use_cuda_graph = use_cuda_graph
self.model_time = 0
self.page_size = page_size
# GPU timing for model execution
self.start_model_event = torch.cuda.Event(enable_timing=True)
self.end_model_event = torch.cuda.Event(enable_timing=True)
if isinstance(self.cuda_graphs, list):
self.graphs = [torch.cuda.CUDAGraph() for _ in range(len(self.cuda_graphs))]
self.page_idx_buf = [torch.zeros([self.cuda_graphs[i]], dtype=torch.int32, device = self.device) for i in range(len(self.cuda_graphs))]
self.page_offset_buf = [torch.zeros([self.cuda_graphs[i]], dtype=torch.int32, device = self.device) for i in range(len(self.cuda_graphs))]
else:
self.graphs = torch.cuda.CUDAGraph()
self.page_idx_buf = torch.zeros([self.cuda_graphs], dtype=torch.int32, device = self.device)
self.page_offset_buf = torch.zeros([self.cuda_graphs], dtype=torch.int32, device = self.device)
self.num_mini_batches = num_mini_batches
self.max_chunk_size = max_chunk_size
self.bsz_tensor_buf = torch.empty((1, ),dtype=torch.int32, device=device)
self.num_tokens_tensor_buf = torch.empty((1, ),dtype=torch.int32, device=device)
def warmup(self):
def capture_graphs(cuda_graph_idx=-1):
if cuda_graph_idx != -1:
with torch.cuda.graph(self.graphs[cuda_graph_idx], pool=self.graph_memory_pool, stream=self.stream):
self.outputs_buf[cuda_graph_idx] = self.model(self.input[cuda_graph_idx], self.features_buf[cuda_graph_idx], self.bsz_tensor_buf, self.num_tokens_tensor_buf, self.page_idx_buf[cuda_graph_idx], self.page_offset_buf[cuda_graph_idx], cuda_graph_idx=cuda_graph_idx)
self.graph_memory_pool = self.graphs[cuda_graph_idx].pool()
else:
with torch.cuda.graph(self.graphs, pool=self.graph_memory_pool, stream=self.stream):
self.outputs_buf = self.model(self.input, self.features_buf, self.bsz_tensor_buf, self.num_tokens_tensor_buf, self.page_idx_buf, self.page_offset_buf)
self.graph_memory_pool = self.graphs.pool()
if isinstance(self.cuda_graphs, list):
self.input = []
self.features_buf = []
self.outputs_buf = []
self.bsz_tensor_buf = torch.tensor([0], dtype=torch.int32, device=self.device)
self.num_tokens_tensor_buf = torch.tensor([0], dtype=torch.int32, device=self.device)
for i in range(len(self.cuda_graphs)):
                prefill_query_length = (self.cuda_graphs[i] - Config().max_decode_batch_size) // Config().max_prefill_batch_size if self.cuda_graphs[i] > Config().max_decode_batch_size else 0  # @TODO: only supports 2 prefill batches
self.input.append(ForwardBatchInput.gen_max_forward_batch(device=self.device, num_mini_batches = self.num_mini_batches, prefill_query_length=prefill_query_length, prefill_active_length=prefill_query_length, page_size=self.page_size, cuda_lens = self.cuda_graphs[i]))
self.features_buf.append(self.model.batch_embeddings(self.input[i]))
batch_size = self.input[i].minibatch.q_indptr.size(0)-1
num_tokens = self.features_buf[i][0].size(0)
print("capturing cuda graph", batch_size, num_tokens)
self.bsz_tensor_buf[0] = batch_size
self.num_tokens_tensor_buf[0] = num_tokens
self.model.flash_infer_attn_plan(self.input[i], self.bsz_tensor_buf, self.num_tokens_tensor_buf,
num_heads=self.model.config.num_attention_heads, head_dim_ckv=self.model.config.kv_lora_rank,
head_dim_kpe=self.model.config.qk_rope_head_dim, page_size=self.model.cache.page_size, causal=True,
sm_scale=self.model.model.layers[0].self_attn.softmax_scale, q_data_type=torch.bfloat16, kv_data_type=torch.bfloat16)
page_idx, page_offset = self.model.cache.get_page_table(self.input[i].minibatch.position_ids, self.input[i].minibatch.q_indptr, self.input[i].minibatch.kv_indptr, self.input[i].minibatch.kv_indices, self.num_tokens_tensor_buf)
self.page_idx_buf[i][:num_tokens].copy_(page_idx[:num_tokens])
self.page_offset_buf[i][:num_tokens].copy_(page_offset[:num_tokens])
self.page_idx_buf[i][num_tokens:].fill_(self.model.cache.max_cache_len // self.model.cache.page_size -1)
self.outputs_buf.append(None)
torch.cuda.synchronize()
for warm_up_iters in range(11):
with torch.cuda.stream(self.stream):
self.outputs_buf[i] = self.model(self.input[i], self.features_buf[i], self.bsz_tensor_buf, self.num_tokens_tensor_buf, self.page_idx_buf[i], self.page_offset_buf[i])
torch.cuda.synchronize()
capture_graphs(i)
with torch.cuda.stream(self.stream):
self.graphs[i].replay()
self.sync(calc_time=False)
print(f"cuda_graph: {i+1}/{len(self.cuda_graphs)}, warmup finished.")
else:
self.input = ForwardBatchInput.gen_max_forward_batch(device=self.device, num_mini_batches = self.num_mini_batches)
self.features_buf = self.model.batch_embeddings(self.input)
batch_size = self.input.minibatch.q_indptr.size(0)-1
num_tokens = self.features_buf[0].size(0)
self.bsz_tensor_buf = torch.tensor([batch_size], dtype=torch.int32, device=self.device)
self.num_tokens_tensor_buf = torch.tensor([num_tokens], dtype=torch.int32, device=self.device)
self.model.flash_infer_attn_plan(self.input, self.bsz_tensor_buf, self.num_tokens_tensor_buf,
num_heads=self.model.config.num_attention_heads, head_dim_ckv=self.model.config.kv_lora_rank,
head_dim_kpe=self.model.config.qk_rope_head_dim, page_size=self.model.cache.page_size, causal=True,
sm_scale=self.model.model.layers[0].self_attn.softmax_scale, q_data_type=torch.bfloat16, kv_data_type=torch.bfloat16)
page_idx, page_offset = self.model.cache.get_page_table(self.input.minibatch.position_ids, self.input.minibatch.q_indptr, self.input.minibatch.kv_indptr, self.input.minibatch.kv_indices, self.num_tokens_tensor_buf)
self.page_idx_buf[:num_tokens].copy_(page_idx[:num_tokens])
self.page_offset_buf[:num_tokens].copy_(page_offset[:num_tokens])
self.page_idx_buf[num_tokens:].fill_(self.model.cache.max_cache_len // self.model.cache.page_size - 1)
torch.cuda.synchronize()
for warm_up_iters in range(11):
with torch.cuda.stream(self.stream):
self.outputs_buf = self.model(self.input, self.features_buf, self.bsz_tensor_buf, self.num_tokens_tensor_buf, self.page_idx_buf, self.page_offset_buf)
torch.cuda.synchronize()
def capture_graphs():
with torch.cuda.graph(self.graphs, stream=self.stream):
self.outputs_buf = self.model(self.input, self.features_buf, self.bsz_tensor_buf, self.num_tokens_tensor_buf, self.page_idx_buf, self.page_offset_buf)
# self.graph_memory_pool = self.graphs.pool()
capture_graphs()
with torch.cuda.stream(self.stream):
self.graphs.replay()
self.sync(calc_time=False)
print("warmup finished.")
def run(self, batch: sched_ext.BatchQueryTodo = None, query_manager: QueryManager = None):
with torch.cuda.stream(self.stream):
batch_size = len(batch.prefill_mini_batches) # TODO: calc this
num_tokens = 0
for i in range(len(batch.decode_mini_batches)):
batch_size += len(batch.decode_mini_batches[i])
num_tokens += len(batch.decode_mini_batches[i])
print(f'decode_batch_i: {len(batch.decode_mini_batches[i])},')
for i in range(len(batch.prefill_mini_batches)):
num_tokens += batch.prefill_mini_batches[i][2]
print(f'prefill_batch_i: {batch.prefill_mini_batches[i][2]},')
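# num_tokens = one token per decode query plus each prefill mini-batch's scheduled chunk length; batch_size counts decode and prefill queries together.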
if isinstance(self.cuda_graphs, list):
# cuda_graph_idx is the smallest index i such that self.cuda_graphs[i] >= num_tokens
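# e.g. (illustrative sizes): with self.cuda_graphs = [64, 128, 256] and num_tokens = 100, bucket index 1 (128) is chosen; if num_tokens exceeded 256 the assertion below would fire.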
cuda_graph_idx = next((i for i, token in enumerate(self.cuda_graphs) if token >= num_tokens), len(self.cuda_graphs))
if cuda_graph_idx == len(self.cuda_graphs):
assert False, "num_tokens is too large"
else:
cuda_graph_idx = -1
if self.use_cuda_graph:
if cuda_graph_idx != -1:
self.input[cuda_graph_idx].fill(batch, query_manager, self.page_size)
else:
self.input.fill(batch, query_manager, self.page_size)
else:
self.input = ForwardBatchInput(batch=batch, query_manager=query_manager, device=self.device)
if cuda_graph_idx != -1 and self.use_cuda_graph:
self.features = self.model.batch_embeddings(self.input[cuda_graph_idx], device=self.device)
else:
self.features = self.model.batch_embeddings(self.input, device=self.device)
self.bsz_tensor_buf.copy_(batch_size)
self.num_tokens_tensor_buf.copy_(torch.tensor([num_tokens], dtype=torch.int32, device=self.device))
if self.use_cuda_graph:
if cuda_graph_idx != -1:
self.features_buf[cuda_graph_idx][0].copy_(self.features[0], non_blocking=True)
else:
self.features_buf[0].copy_(self.features[0], non_blocking=True)
"""
if num_tokens_0 > 64:
padded_num_tokens_0 = pad_num_tokens(num_tokens_0)
self.features_buf[0][num_tokens_0:padded_num_tokens_0] = 0
"""
#self.input.forward_minibatchs[0].print()
# print([[hash(k[i].float().cpu().numpy().tobytes()) for i in self.input.forward_minibatchs[0].kv_indices] for k in self.model.cache.k_caches])
# print(f"overlap: {overlap}, is_compute_bound: {is_compute_bound}")
# self.model.flash_infer_attn_plan(self.input, self.bsz_tensors, self.num_tokens_tensors)
"""
if self.use_cuda_graph:
print("before replay features_buf", self.features_buf[0])
print("features_buf addr", self.features_buf[0].data_ptr())
else:
print("before run features", self.features[0])
"""
if cuda_graph_idx != -1 and self.use_cuda_graph:
self.model.flash_infer_attn_plan(self.input[cuda_graph_idx], self.bsz_tensor_buf, self.num_tokens_tensor_buf,
num_heads=self.model.config.num_attention_heads, head_dim_ckv=self.model.config.kv_lora_rank,
head_dim_kpe=self.model.config.qk_rope_head_dim, page_size=self.model.cache.page_size, causal=True,
sm_scale=self.model.model.layers[0].self_attn.softmax_scale, q_data_type=torch.bfloat16, kv_data_type=torch.bfloat16)
self.start_model_event.record(self.stream)
page_idx, page_offset = self.model.cache.get_page_table(self.input[cuda_graph_idx].minibatch.position_ids, self.input[cuda_graph_idx].minibatch.q_indptr, self.input[cuda_graph_idx].minibatch.kv_indptr, self.input[cuda_graph_idx].minibatch.kv_indices, self.num_tokens_tensor_buf)
if self.use_cuda_graph:
self.page_idx_buf[cuda_graph_idx][:num_tokens].copy_(page_idx[:num_tokens])
self.page_offset_buf[cuda_graph_idx][:num_tokens].copy_(page_offset[:num_tokens])
self.page_idx_buf[cuda_graph_idx][num_tokens:].fill_(self.model.cache.max_cache_len // self.model.cache.page_size - 1)
self.replay(cuda_graph_idx)
self.output = ForwardBatchOutput()
self.output.top_ps.append(self.input[cuda_graph_idx].minibatch.top_ps)
self.output.temperatures.append(self.input[cuda_graph_idx].minibatch.temperatures)
self.output.logits.append(self.outputs_buf[cuda_graph_idx].logits[0][self.input[cuda_graph_idx].minibatch.logits_start].clone())
else:
self.output = self.model(self.input[cuda_graph_idx], self.features, self.bsz_tensor_buf, self.num_tokens_tensor_buf, page_idx, page_offset)
self.output.logits[0] = self.output.logits[0][self.input[cuda_graph_idx].minibatch.logits_start]
self.end_model_event.record(self.stream)
else:
self.model.flash_infer_attn_plan(self.input, self.bsz_tensor_buf, self.num_tokens_tensor_buf,
num_heads=self.model.config.num_attention_heads, head_dim_ckv=self.model.config.kv_lora_rank,
head_dim_kpe=self.model.config.qk_rope_head_dim, page_size=self.model.cache.page_size, causal=True,
sm_scale=self.model.model.layers[0].self_attn.softmax_scale, q_data_type=torch.bfloat16, kv_data_type=torch.bfloat16)
self.start_model_event.record(self.stream)
page_idx, page_offset = self.model.cache.get_page_table(self.input.minibatch.position_ids, self.input.minibatch.q_indptr, self.input.minibatch.kv_indptr, self.input.minibatch.kv_indices, self.num_tokens_tensor_buf)
if self.use_cuda_graph:
self.page_idx_buf[:num_tokens].copy_(page_idx[:num_tokens])
self.page_offset_buf[:num_tokens].copy_(page_offset[:num_tokens])
self.page_idx_buf[num_tokens:].fill_(self.model.cache.max_cache_len // self.model.cache.page_size - 1)
self.replay(cuda_graph_idx)
self.output = ForwardBatchOutput()
self.output.top_ps.append(self.input.minibatch.top_ps)
self.output.temperatures.append(self.input.minibatch.temperatures)
self.output.logits.append(self.outputs_buf.logits[0][self.input.minibatch.logits_start].clone())
else:
self.output = self.model(self.input, self.features, self.bsz_tensor_buf, self.num_tokens_tensor_buf, page_idx, page_offset)
self.output.logits[0] = self.output.logits[0][self.input.minibatch.logits_start]
self.output.top_ps.append(self.input.minibatch.top_ps)
self.output.temperatures.append(self.input.minibatch.temperatures)
self.end_model_event.record(self.stream)
if not self.use_cuda_graph:
self.output.num_batchs = self.input.batch_size
else:
self.output.num_batchs = self.input[cuda_graph_idx].batch_size
def replay(self, cuda_graph_idx=-1):
with torch.cuda.stream(self.stream):
if cuda_graph_idx != -1:
self.graphs[cuda_graph_idx].replay()
else:
self.graphs.replay()
def sync(self, calc_time = True):
self.stream.synchronize()
if calc_time:
self.model_time = self.start_model_event.elapsed_time(self.end_model_event) # In ms
'''
Date: 2024-11-14 12:23:45
LastEditors: djw
LastEditTime: 2024-11-20 04:06:23
'''
import torch
from ktransformers.server.balance_serve.settings import sched_ext
import random
import time
class QueryInfo:
id: int
active_position: int
query_length: int
is_prefill: int
block_index: torch.Tensor
query_tokens: torch.Tensor
stop_criteria: list[torch.Tensor]
temperature: float
top_p: float
max_length: int
def __init__(self, id, query_length: int, max_length: int, page_size: int, device: torch.device, is_prefill: bool = True, offset: int = 0, active_position: int = 0, temperature: float = 0.01, top_p: float = 1.0):
self.id = id
self.is_prefill = is_prefill
self.active_position = active_position
self.max_length = max_length - 1
self.query_tokens = torch.zeros((max_length,), dtype=torch.int, device = device)
self.stop_criteria = []
self.block_index = torch.arange(offset, offset + (max_length + active_position + page_size - 1) // page_size, dtype=torch.int, device = device)
self.query_length = query_length
self.enqueue_time = time.time()
self.decode_start_time = None
self.speculative_token = {} # {position: (accept, token)}
self.temperature = temperature
self.top_p = top_p
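# block_index reserves ceil((max_length + active_position) / page_size) consecutive page ids starting at `offset`; e.g. (illustrative) max_length=1024, active_position=0, page_size=256 -> 4 pages.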
def check_stop(self):
if self.active_position >= self.max_length - 2:
return True
# iterate over every stop criterion
for stop_tensor in self.stop_criteria:
stop_len = len(stop_tensor)
# skip if the stop sequence is longer than the tokens generated so far
if stop_len >= self.active_position:
continue
#print(f"stop_tensor: {stop_tensor}, stop_len: {stop_len}, active_position: {self.active_position}, query_token: {self.query_tokens[self.active_position - stop_len - 1:self.active_position - 1]}")
if (torch.equal(self.query_tokens[self.active_position - stop_len - 1:self.active_position - 1], stop_tensor) and self.active_position) or self.max_length <= self.active_position + 3:
self.life_time = time.time() - self.enqueue_time
self.decode_duration_time = time.time() - self.decode_start_time
self.decode_tps = (self.active_position - self.query_length) / self.decode_duration_time
print(f"prefill length: {self.query_length}, prefill time: {self.prefill_duration_time}, prefill tps {self.prefill_tps}, decode length: {self.active_position - self.query_length}, decode time: {self.decode_duration_time}, decode tps {self.decode_tps}")
return True # a matching stop criterion was found
return False # no stop criterion matched
def print(self):
print(f"active_position: {self.active_position}, query_length: {self.query_length}, is_prefill: {self.is_prefill}")
print(f"block_index_shape: {self.block_index.shape}, query_tokens_shape: {self.query_tokens.shape}")
class QueryManager:
max_length: int = 65536
page_size: int = 256
device: torch.device
query_map : dict[int, QueryInfo]
def __init__(self, max_length = 65536, page_size = 256, device = torch.device('cuda')):
self.max_length = max_length
self.page_size = page_size
self.device = device
self.query_map = {}
def add_query(self, batch: sched_ext.BatchQueryTodo):
for i in range(len(batch.query_ids)):
id = batch.query_ids[i]
if id not in self.query_map:
print(f"add query id: {id}, batch.query_lengths: {batch.query_lengths[i]}, batch_query_tokens: {batch.query_tokens[i].shape}, batch.block_indexes: {batch.block_indexes[i]}")
assert batch.query_tokens[i].size(0) < self.max_length, "query max length in batchquerytodo exceeds internal max_length"
query_info = QueryInfo(id=id, query_length=batch.query_lengths[i], max_length=batch.query_tokens[i].size(0) + 1, page_size=self.page_size, device=self.device, temperature=batch.sample_options[i].temperature, top_p=batch.sample_options[i].top_p)
query_info.query_tokens[:query_info.query_length].copy_(batch.query_tokens[i][:query_info.query_length].to(self.device))
for stop_token_list in batch.stop_criteria[i]:
query_info.stop_criteria.append(torch.tensor(stop_token_list, dtype=torch.int, device = self.device))
block_num = batch.block_indexes[i].size(0)
query_info.block_index[:block_num].copy_(batch.block_indexes[i].to(self.device))
self.query_map[id] = query_info
prefill_mini_batches = batch.prefill_mini_batches
for (prefill_id, s, l) in prefill_mini_batches:
if prefill_id == id:
self.query_map[prefill_id].active_position = s
def update(self, batch: sched_ext.BatchQueryTodo) -> list[sched_ext.QueryUpdate]:
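# Apply the scheduler's batch result: prefill queries advance active_position by the chunk length just processed (flipping to decode once the whole prompt is consumed), decode queries advance by one token and are checked against their stop criteria; a QueryUpdate is reported back for every touched query.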
query_updates = []
prefill_mini_batches = batch.prefill_mini_batches
for (id, s, l) in prefill_mini_batches:
if id not in self.query_map:
assert False, f"query id {id} not found in query_map"
# update query_info
query_info = self.query_map[id]
query_info.active_position += l
if query_info.active_position >= query_info.query_length and query_info.is_prefill:
query_info.is_prefill = False
query_info.prefill_duration_time = time.time() - query_info.enqueue_time
query_info.prefill_tps = query_info.query_length / query_info.prefill_duration_time
# generate schedule query_update
query_update = sched_ext.QueryUpdate()
query_update.id = id
query_update.ok = True
query_update.is_prefill = query_info.is_prefill
query_update.active_position = query_info.active_position
# if(not query_info.is_prefill):
query_updates.append(query_update)
decode_mini_batches = batch.decode_mini_batches
for ids in decode_mini_batches:
for id in ids:
if id not in self.query_map:
assert False, f"query id {id} not found in query_map"
query_info = self.query_map[id]
query_info.active_position += 1
query_update = sched_ext.QueryUpdate()
query_update.id = id
query_update.ok = True
query_update.is_prefill = query_info.is_prefill
query_update.decode_done = query_info.check_stop()
query_update.active_position = query_info.active_position
query_updates.append(query_update)
return query_updates
from .orchestrator import BatchedPenalizerOrchestrator
from .penalizers.frequency_penalty import BatchedFrequencyPenalizer
from .penalizers.min_new_tokens import BatchedMinNewTokensPenalizer
from .penalizers.presence_penalty import BatchedPresencePenalizer
from .penalizers.repetition_penalty import BatchedRepetitionPenalizer
__all__ = [
"BatchedFrequencyPenalizer",
"BatchedMinNewTokensPenalizer",
"BatchedPresencePenalizer",
"BatchedRepetitionPenalizer",
"BatchedPenalizerOrchestrator",
]
import abc
import dataclasses
import typing
import torch
@dataclasses.dataclass
class _ReqLike:
origin_input_ids: typing.Union[torch.Tensor, typing.List[int]]
@dataclasses.dataclass
class _BatchLike:
reqs: typing.List[_ReqLike]
def batch_size(self):
return len(self.reqs)
class BatchedPenalizerOrchestrator:
batch: _BatchLike
device: str
vocab_size: int
penalizers: typing.Dict[typing.Type["_BatchedPenalizer"], "_BatchedPenalizer"]
def __init__(
self,
vocab_size: int,
batch: _BatchLike,
device: str,
Penalizers: typing.Set[typing.Type["_BatchedPenalizer"]],
):
self.vocab_size = vocab_size
self.batch = batch
self.device = device
self.penalizers = {Penalizer: Penalizer(self) for Penalizer in Penalizers}
is_required = False
for penalizer in self.penalizers.values():
pen_is_required = penalizer.prepare_if_required()
is_required |= pen_is_required
self.is_required = is_required
if self.is_required:
self.cumulate_input_tokens(
input_ids=[req.origin_input_ids for req in self.reqs()]
)
def reqs(self):
return self.batch.reqs
def batch_size(self):
return self.batch.batch_size()
def cumulate_input_tokens(
self,
input_ids: typing.Union[
typing.List[torch.Tensor], typing.List[typing.List[int]]
],
):
"""
Feed the input tokens to the penalizers.
Args:
input_ids (typing.Union[typing.List[torch.Tensor], typing.List[typing.List[int]]]): The input tokens.
"""
token_ids = _TokenIDs(orchestrator=self, token_ids=input_ids)
for penalizer in self.penalizers.values():
penalizer.cumulate_input_tokens(input_ids=token_ids)
def cumulate_output_tokens(
self,
output_ids: typing.Union[
typing.List[torch.Tensor], typing.List[typing.List[int]]
],
):
"""
Feed the output tokens to the penalizers.
Args:
output_ids (typing.Union[typing.List[torch.Tensor], typing.List[typing.List[int]]]): The output tokens.
"""
if not self.is_required:
return
token_ids = _TokenIDs(orchestrator=self, token_ids=output_ids)
for penalizer in self.penalizers.values():
penalizer.cumulate_output_tokens(output_ids=token_ids)
def apply(self, logits: torch.Tensor) -> torch.Tensor:
"""
Apply the penalizers to the logits.
Note that it may apply the penalizers in-place.
Args:
logits (torch.Tensor): The logits to apply the penalizers to.
Returns:
torch.Tensor: The logits after applying the penalizers.
"""
if not self.is_required:
return
for penalizer in self.penalizers.values():
logits = penalizer.apply(logits)
return logits
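# Typical call order (sketch, inferred from the docstrings): build the orchestrator for a batch, call cumulate_output_tokens(...) after each generation step, then run the next step's logits through apply(...) before sampling.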
def filter(
self,
indices_to_keep: typing.List[int],
indices_tensor_to_keep: torch.Tensor = None,
):
"""
Filter the penalizers based on the indices to keep in the batch.
Args:
indices_to_keep (typing.List[int]): List of indices to keep in the batch.
indices_tensor_to_keep (torch.Tensor = None): Tensor of indices to keep in the batch. If not None, it will be used instead of converting indices_to_keep to a tensor.
"""
if not self.is_required:
return
empty_indices = len(indices_to_keep) == 0
is_required = False
for penalizer in self.penalizers.values():
tmp_is_required = penalizer.is_required()
is_required = is_required or tmp_is_required
if not tmp_is_required or empty_indices:
penalizer.teardown()
else:
# create tensor index only when it's needed
if indices_tensor_to_keep is None:
indices_tensor_to_keep = torch.tensor(
indices_to_keep, dtype=torch.int32, device=self.device
)
penalizer.filter(
indices_to_keep=indices_to_keep,
indices_tensor_to_keep=indices_tensor_to_keep,
)
self.is_required = is_required
def merge(self, their: "BatchedPenalizerOrchestrator"):
"""
Merge the penalizers of another orchestrator into this one.
Note that this function **must** be called _before_ self.batch.reqs is updated (filtered).
Each unprepared penalizer has to be prepared (creating tensors, etc.) before merging.
This step requires the original batch.reqs, before it gets merged with other batch.reqs.
Args:
their (BatchedPenalizerOrchestrator): The orchestrator to merge into this one.
"""
if not self.is_required and not their.is_required:
return
self.is_required |= their.is_required
for Penalizer, their_penalizer in their.penalizers.items():
if Penalizer not in self.penalizers:
raise ValueError(f"Penalizer {Penalizer} not found in self.penalizers")
self.penalizers[Penalizer].merge(their_penalizer)
class _TokenIDs:
"""
A class that wraps token IDs to provide additional utility functions to penalizers.
Attributes:
orchestrator (BatchedPenalizerOrchestrator): The orchestrator that these token IDs belong to.
token_ids (typing.Union[torch.Tensor, typing.List[torch.Tensor]]): The token IDs.
cached_counts (torch.Tensor): The cached occurrence count tensor.
"""
orchestrator: BatchedPenalizerOrchestrator
token_ids: typing.Union[torch.Tensor, typing.List[torch.Tensor]]
cached_counts: torch.Tensor = None
def __init__(
self,
orchestrator: BatchedPenalizerOrchestrator,
token_ids: typing.Union[
typing.List[torch.Tensor], typing.List[typing.List[int]]
],
):
self.orchestrator = orchestrator
if not isinstance(token_ids[0], torch.Tensor):
token_ids = [
torch.tensor(
data=ids, dtype=torch.int64, device=self.orchestrator.device
)
for ids in token_ids
]
self.token_ids = token_ids
def occurrence_count(self) -> torch.Tensor:
"""
Returns a tensor of shape (batch_size, vocab_size) where each element is the number of times the corresponding token appears in the batch.
Returns:
torch.Tensor: The occurrence count tensor.
"""
if self.cached_counts is not None:
return self.cached_counts
token_ids = self.token_ids
if isinstance(token_ids, torch.Tensor):
token_ids = token_ids.unsqueeze(1)
# needs to be long to be used as index in scatter_add
if token_ids.dtype != torch.int64:
token_ids = token_ids.to(torch.int64)
padded_token_ids = torch.nn.utils.rnn.pad_sequence(
sequences=token_ids,
batch_first=True,
padding_value=self.orchestrator.vocab_size,
)
self.cached_counts = torch.zeros(
size=(self.orchestrator.batch_size(), self.orchestrator.vocab_size + 1),
dtype=torch.int64,
device=self.orchestrator.device,
).scatter_add_(
dim=1,
index=padded_token_ids,
src=torch.ones_like(padded_token_ids),
)[
:, : self.orchestrator.vocab_size
]
return self.cached_counts
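# Worked example (illustrative): vocab_size=8, token_ids=[[3, 3, 5], [1]] -> the shorter row is padded with 8 (== vocab_size), and scatter_add_ yields [[0,0,0,2,0,1,0,0],[0,1,0,0,0,0,0,0]] once the padding column is sliced off.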
class _BatchedPenalizer(abc.ABC):
"""
An abstract class for a batched penalizer.
"""
orchestrator: BatchedPenalizerOrchestrator
_is_prepared: bool = False
def __init__(self, orchestrator: BatchedPenalizerOrchestrator):
self.orchestrator = orchestrator
def is_prepared(self) -> bool:
return self._is_prepared
def is_required(self) -> bool:
return self._is_required()
def prepare(self):
if not self.is_prepared():
self._prepare()
self._is_prepared = True
def prepare_if_required(self):
if self.is_required():
self.prepare()
return True
else:
return False
def teardown(self):
if self.is_prepared():
self._teardown()
self._is_prepared = False
def cumulate_input_tokens(self, input_ids: _TokenIDs):
if not self.is_prepared():
return
self._cumulate_input_tokens(input_ids=input_ids)
def cumulate_output_tokens(self, output_ids: _TokenIDs):
if not self.is_prepared():
return
self._cumulate_output_tokens(output_ids=output_ids)
def apply(self, logits: torch.Tensor) -> torch.Tensor:
if not self.is_prepared():
return logits
return self._apply(logits=logits)
def filter(
self, indices_to_keep: typing.List[int], indices_tensor_to_keep: torch.Tensor
):
if not self.is_prepared():
return
self._filter(
indices_to_keep=indices_to_keep,
indices_tensor_to_keep=indices_tensor_to_keep,
)
def merge(self, their: "_BatchedPenalizer"):
if not self.is_prepared() and not their.is_prepared():
return
self.prepare()
their.prepare()
self._merge(their)
@abc.abstractmethod
def _is_required(self) -> bool:
"""
Check if the penalizer is required to be prepared.
"""
pass
@abc.abstractmethod
def _prepare(self):
"""
Prepare the penalizer.
Usually, this is where the penalizer initializes its tensors.
"""
pass
@abc.abstractmethod
def _teardown(self):
"""
Tear down the penalizer.
Usually, this is where the penalizer frees its tensors.
"""
pass
@abc.abstractmethod
def _cumulate_input_tokens(self, input_ids: _TokenIDs):
"""
Cumulate the input tokens.
Orchestrator will call this function to feed the input tokens to the penalizer.
"""
pass
@abc.abstractmethod
def _cumulate_output_tokens(self, output_ids: _TokenIDs):
"""
Cumulate the output tokens.
Orchestrator will call this function to feed the output tokens to the penalizer.
"""
pass
@abc.abstractmethod
def _apply(self, logits: torch.Tensor) -> torch.Tensor:
"""
Apply the penalizer to the logits.
Penalizers can modify the logits in-place if needed.
"""
pass
@abc.abstractmethod
def _filter(
self, indices_to_keep: typing.List[int], indices_tensor_to_keep: torch.Tensor
):
"""
Filter the penalizer (tensors or underlying data) based on the indices to keep in the batch.
"""
pass
@abc.abstractmethod
def _merge(self, their: "_BatchedPenalizer"):
"""
Merge the penalizer with another penalizer.
"""
pass
import typing
import torch
from ..orchestrator import _BatchedPenalizer, _TokenIDs
class BatchedFrequencyPenalizer(_BatchedPenalizer):
"""
Frequency penalizer penalizes tokens based on their frequency in the output.
"""
frequency_penalties: torch.Tensor = None
cumulated_frequency_penalties: torch.Tensor = None
def _is_required(self) -> bool:
return any(
req.sampling_params.frequency_penalty != 0.0
for req in self.orchestrator.reqs()
)
def _prepare(self):
self.cumulated_frequency_penalties = (
torch.tensor(
data=[0.0 for _ in self.orchestrator.reqs()],
dtype=torch.float32,
device=self.orchestrator.device,
)
.unsqueeze_(1)
.repeat(1, self.orchestrator.vocab_size)
)
self.frequency_penalties = (
torch.tensor(
data=[
req.sampling_params.frequency_penalty
for req in self.orchestrator.reqs()
],
dtype=torch.float32,
device=self.orchestrator.device,
)
.unsqueeze_(1)
.expand_as(self.cumulated_frequency_penalties)
)
def _teardown(self):
del self.frequency_penalties
del self.cumulated_frequency_penalties
self.frequency_penalties = None
self.cumulated_frequency_penalties = None
def _cumulate_input_tokens(self, input_ids: _TokenIDs):
pass
def _cumulate_output_tokens(self, output_ids: _TokenIDs):
self.cumulated_frequency_penalties += (
self.frequency_penalties * output_ids.occurrence_count()
)
def _apply(self, logits: torch.Tensor) -> torch.Tensor:
logits -= self.cumulated_frequency_penalties
return logits
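# Net effect: each token's logit is lowered by frequency_penalty * (times the token has appeared in the output so far), since the per-step occurrence counts are accumulated above.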
def _filter(
self, indices_to_keep: typing.List[int], indices_tensor_to_keep: torch.Tensor
):
self.frequency_penalties = self.frequency_penalties[indices_tensor_to_keep]
self.cumulated_frequency_penalties = self.cumulated_frequency_penalties[
indices_tensor_to_keep
]
def _merge(self, their: "BatchedFrequencyPenalizer"):
self.frequency_penalties = torch.cat(
[self.frequency_penalties, their.frequency_penalties], dim=0
)
self.cumulated_frequency_penalties = torch.cat(
[self.cumulated_frequency_penalties, their.cumulated_frequency_penalties],
dim=0,
)
import typing
import torch
from ..orchestrator import _BatchedPenalizer, _TokenIDs
class BatchedMinNewTokensPenalizer(_BatchedPenalizer):
"""
Min new tokens penalizer penalizes tokens based on the length of the output.
"""
min_new_tokens: torch.Tensor = None
stop_token_penalties: torch.Tensor = None
len_output_tokens: torch.Tensor = None
def _is_required(self) -> bool:
return any(
req.sampling_params.min_new_tokens > 0 for req in self.orchestrator.reqs()
)
def _prepare(self):
self.min_new_tokens = torch.tensor(
data=[
req.sampling_params.min_new_tokens for req in self.orchestrator.reqs()
],
dtype=torch.int32,
device=self.orchestrator.device,
).unsqueeze_(1)
padded_stop_token_ids = torch.nn.utils.rnn.pad_sequence(
sequences=[
torch.tensor(
data=(
list(
(req.sampling_params.stop_token_ids or set())
| (req.tokenizer.additional_stop_token_ids or set())
| {req.tokenizer.eos_token_id}
)
),
dtype=torch.int64,
device=self.orchestrator.device,
)
for req in self.orchestrator.reqs()
],
batch_first=True,
padding_value=self.orchestrator.vocab_size,
)
self.stop_token_penalties = torch.zeros(
size=(self.orchestrator.batch_size(), self.orchestrator.vocab_size + 1),
dtype=torch.float32,
device=self.orchestrator.device,
).scatter_add_(
dim=1,
index=padded_stop_token_ids,
src=torch.full_like(
input=padded_stop_token_ids,
dtype=torch.float32,
fill_value=float("-inf"),
device=self.orchestrator.device,
),
)[
:, : self.orchestrator.vocab_size
]
self.len_output_tokens = torch.zeros(
size=(self.orchestrator.batch_size(), 1),
dtype=torch.int32,
device=self.orchestrator.device,
)
def _teardown(self):
del self.min_new_tokens
del self.stop_token_penalties
del self.len_output_tokens
self.min_new_tokens = None
self.stop_token_penalties = None
self.len_output_tokens = None
def _cumulate_input_tokens(self, input_ids: _TokenIDs):
pass
def _cumulate_output_tokens(self, output_ids: _TokenIDs):
self.len_output_tokens += 1
def _apply(self, logits: torch.Tensor) -> torch.Tensor:
mask = (self.len_output_tokens < self.min_new_tokens).expand_as(logits)
logits[mask] += self.stop_token_penalties[mask]
return logits
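# While a request has generated fewer than min_new_tokens outputs, -inf is added to its stop/EOS token logits, making them unsamplable; once the threshold is reached the mask is empty and logits pass through unchanged.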
def _filter(
self, indices_to_keep: typing.List[int], indices_tensor_to_keep: torch.Tensor
):
self.min_new_tokens = self.min_new_tokens[indices_tensor_to_keep]
self.stop_token_penalties = self.stop_token_penalties[indices_tensor_to_keep]
self.len_output_tokens = self.len_output_tokens[indices_tensor_to_keep]
def _merge(self, their: "BatchedMinNewTokensPenalizer"):
self.min_new_tokens = torch.cat(
[self.min_new_tokens, their.min_new_tokens], dim=0
)
self.stop_token_penalties = torch.cat(
[self.stop_token_penalties, their.stop_token_penalties], dim=0
)
self.len_output_tokens = torch.cat(
[self.len_output_tokens, their.len_output_tokens], dim=0
)
import typing
import torch
from ..orchestrator import _BatchedPenalizer, _TokenIDs
class BatchedPresencePenalizer(_BatchedPenalizer):
"""
Presence penalizer penalizes tokens based on their presence in the output.
"""
presence_penalties: torch.Tensor = None
cumulated_presence_penalties: torch.Tensor = None
def _is_required(self) -> bool:
return any(
req.sampling_params.presence_penalty != 0.0
for req in self.orchestrator.reqs()
)
def _prepare(self):
self.cumulated_presence_penalties = (
torch.tensor(
data=[0.0 for _ in self.orchestrator.reqs()],
dtype=torch.float32,
device=self.orchestrator.device,
)
.unsqueeze_(1)
.repeat(1, self.orchestrator.vocab_size)
)
self.presence_penalties = (
torch.tensor(
data=[
req.sampling_params.presence_penalty
for req in self.orchestrator.reqs()
],
dtype=torch.float32,
device=self.orchestrator.device,
)
.unsqueeze_(1)
.expand_as(self.cumulated_presence_penalties)
)
def _teardown(self):
del self.presence_penalties
del self.cumulated_presence_penalties
self.presence_penalties = None
self.cumulated_presence_penalties = None
def _cumulate_input_tokens(self, input_ids: _TokenIDs):
pass
def _cumulate_output_tokens(self, output_ids: _TokenIDs):
mask = output_ids.occurrence_count() > 0
self.cumulated_presence_penalties[mask] = self.presence_penalties[mask]
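# Unlike the frequency penalizer, the presence penalty is assigned rather than accumulated: any token that has appeared at least once is penalized by the same fixed amount regardless of how often it occurs.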
def _apply(self, logits: torch.Tensor) -> torch.Tensor:
logits -= self.cumulated_presence_penalties
return logits
def _filter(
self, indices_to_keep: typing.List[int], indices_tensor_to_keep: torch.Tensor
):
self.presence_penalties = self.presence_penalties[indices_tensor_to_keep]
self.cumulated_presence_penalties = self.cumulated_presence_penalties[
indices_tensor_to_keep
]
def _merge(self, their: "BatchedPresencePenalizer"):
self.presence_penalties = torch.cat(
[self.presence_penalties, their.presence_penalties], dim=0
)
self.cumulated_presence_penalties = torch.cat(
[self.cumulated_presence_penalties, their.cumulated_presence_penalties],
dim=0,
)