Commit a1175a4e authored by maxiao1

Merge remote-tracking branch 'origin/v0.5.4_dev' into sglang_v0.5.5

parents 0c006b88 31653dd9
...@@ -9,6 +9,58 @@ def is_hip() -> bool:
_is_hip = is_hip()
# The dcu_* helpers below are thin Python wrappers that dispatch to the DCU
# custom ops registered under torch.ops.sgl_kernel by the compiled extension.
def dcu_create_extend_after_decode_spec_info(
    verified_id: torch.Tensor,
    seq_lens: torch.Tensor,
    accept_lens: torch.Tensor,
    positions: torch.Tensor,
    new_verified_id: torch.Tensor,
    bs: int,
):
    torch.ops.sgl_kernel.dcu_create_extend_after_decode_spec_info(
        verified_id,
        seq_lens,
        accept_lens,
        positions,
        new_verified_id,
        bs,
    )


def dcu_alloc_extend_kernel(
    pre_lens_ptr: torch.Tensor,
    seq_lens_ptr: torch.Tensor,
    last_loc_ptr: torch.Tensor,
    free_page_ptr: torch.Tensor,
    out_indices: torch.Tensor,
    bs: int,
    page_size: int,
):
    torch.ops.sgl_kernel.dcu_alloc_extend_kernel(
        pre_lens_ptr,
        seq_lens_ptr,
        last_loc_ptr,
        free_page_ptr,
        out_indices,
        bs,
        page_size,
    )


def dcu_alloc_decode_kernel(
    seq_lens_ptr: torch.Tensor,
    last_loc_ptr: torch.Tensor,
    free_page_ptr: torch.Tensor,
    out_indices: torch.Tensor,
    bs: int,
    page_size: int,
):
    torch.ops.sgl_kernel.dcu_alloc_decode_kernel(
        seq_lens_ptr,
        last_loc_ptr,
        free_page_ptr,
        out_indices,
        bs,
        page_size,
    )
def transfer_kv_per_layer(
    src_k: torch.Tensor,
...@@ -305,3 +357,76 @@ def transfer_kv_all_layer_mla_lf_pf(
        block_quota,
        num_warps_per_block,
    )
def dcu_assign_req_to_token_pool(
    req_pool_indices: torch.Tensor,
    req_to_token: torch.Tensor,
    allocate_lens: torch.Tensor,
    new_allocate_lens: torch.Tensor,
    out_cache_loc: torch.Tensor,
    shape: int,
    bs: int,
):
    torch.ops.sgl_kernel.dcu_assign_req_to_token_pool(
        req_pool_indices,
        req_to_token,
        allocate_lens,
        new_allocate_lens,
        out_cache_loc,
        shape,
        bs,
    )


def dcu_get_last_loc(
    req_to_token: torch.Tensor,
    req_pool_indices: torch.Tensor,
    prefix_lens: torch.Tensor,
):
    result = torch.ops.sgl_kernel.dcu_get_last_loc(
        req_to_token,
        req_pool_indices,
        prefix_lens,
    )
    return result


def dcu_assign_extend_cache_locs(
    req_pool_indices: torch.Tensor,
    req_to_token: torch.Tensor,
    start_offset: torch.Tensor,
    end_offset: torch.Tensor,
    out_cache_loc: torch.Tensor,
    pool_len: int,
    bs: int,
):
    torch.ops.sgl_kernel.dcu_assign_extend_cache_locs(
        req_pool_indices,
        req_to_token,
        start_offset,
        end_offset,
        out_cache_loc,
        pool_len,
        bs,
    )


def dcu_create_chunked_prefix_cache_kv_indices(
    req_to_token: torch.Tensor,
    req_pool_indices: torch.Tensor,
    chunk_starts: torch.Tensor,
    chunk_seq_lens: torch.Tensor,
    chunk_cu_seq_lens: torch.Tensor,
    chunk_kv_indices: torch.Tensor,
    col_num: int,
    bs: int,
):
    torch.ops.sgl_kernel.dcu_create_chunked_prefix_cache_kv_indices(
        req_to_token,
        req_pool_indices,
        chunk_starts,
        chunk_seq_lens,
        chunk_cu_seq_lens,
        chunk_kv_indices,
        col_num,
        bs,
    )
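
A minimal, hypothetical usage sketch (not part of this commit): it assumes the compiled sgl_kernel.common_ops extension built by the setup.py below is installed so the torch.ops.sgl_kernel ops are registered, and that the kernels operate on device-resident integer tensors; shapes and dtypes here are illustrative only.

# Hypothetical example; assumes the DCU ops are registered and expect
# device integer tensors. Shapes/dtypes are illustrative, not authoritative.
import torch

req_to_token = torch.zeros((4, 128), dtype=torch.int64, device="cuda")
req_pool_indices = torch.arange(4, dtype=torch.int64, device="cuda")
prefix_lens = torch.tensor([0, 3, 7, 1], dtype=torch.int64, device="cuda")

# Each wrapper simply forwards its arguments to the registered custom op;
# dcu_get_last_loc is the only wrapper above that returns a value.
last_loc = dcu_get_last_loc(req_to_token, req_pool_indices, prefix_lens)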
# Copyright 2025 SGLang Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import os
import platform
import sys
from pathlib import Path
import torch
from setuptools import find_packages, setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
root = Path(__file__).parent.resolve()
arch = platform.machine().lower()
def _get_version():
    with open(root / "pyproject.toml") as f:
        for line in f:
            if line.startswith("version"):
                return line.split("=")[1].strip().strip('"')


operator_namespace = "sgl_kernel"
include_dirs = [
    root / "include",
    root / "include" / "impl",
    root / "csrc",
]

sources = [
    "csrc/allreduce/custom_all_reduce.hip",
    "csrc/allreduce/quick_all_reduce.cu",
    "csrc/common_extension_rocm.cc",
    "csrc/elementwise/activation.cu",
    "csrc/grammar/apply_token_bitmask_inplace_cuda.cu",
    "csrc/moe/moe_align_kernel.cu",
    "csrc/moe/moe_topk_softmax_kernels.cu",
    "csrc/speculative/eagle_utils.cu",
    "csrc/kvcacheio/transfer.cu",
    "csrc/attention/merge_attn_states.cu",
]

cxx_flags = ["-O3"]
libraries = ["hiprtc", "amdhip64", "c10", "torch", "torch_python"]
extra_link_args = ["-Wl,-rpath,$ORIGIN/../../torch/lib", f"-L/usr/lib/{arch}-linux-gnu"]

hipcc_flags = [
    "-DNDEBUG",
    f"-DOPERATOR_NAMESPACE={operator_namespace}",
    "-O3",
    "-Xcompiler",
    "-fPIC",
    "-std=c++17",
    "-DENABLE_BF16",
    "-DENABLE_FP8",
]

ext_modules = [
    CUDAExtension(
        name="sgl_kernel.common_ops",
        sources=sources,
        include_dirs=include_dirs,
        extra_compile_args={
            "nvcc": hipcc_flags,
            "cxx": cxx_flags,
        },
        libraries=libraries,
        extra_link_args=extra_link_args,
        py_limited_api=False,
    ),
]

setup(
    name="sgl-kernel",
    version=_get_version(),
    packages=find_packages(where="python"),
    package_dir={"": "python"},
    ext_modules=ext_modules,
    cmdclass={"build_ext": BuildExtension.with_options(use_ninja=True)},
    options={"bdist_wheel": {"py_limited_api": "cp39"}},
)
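
A hedged illustration (not part of this commit) of how the extension built by this setup.py connects to the wrappers above: once the wheel is installed, importing the compiled module registers the operators that csrc/common_extension_rocm.cc declares under the sgl_kernel namespace, after which torch.ops.sgl_kernel.* calls resolve. The import path used here is an assumption based on the extension name sgl_kernel.common_ops.

# Hypothetical loading sketch; assumes the installed package exposes the
# compiled module as sgl_kernel.common_ops and that the C++ side registers
# its ops under the "sgl_kernel" namespace.
import torch
import sgl_kernel.common_ops  # noqa: F401  # importing registers the custom ops

op = torch.ops.sgl_kernel.dcu_get_last_loc  # resolves once registration has run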