# Copyright 2025 SGLang Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
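# Build script for the sgl-kernel CUDA extension library.
#
# Typical usage (assumes a CUDA-enabled PyTorch install):
#   pip install .                  # compile and install
#   python setup.py bdist_wheel    # build a manylinux2014 wheel
#
# Environment overrides, mainly for build machines without a visible GPU:
#   SGL_KERNEL_ENABLE_BF16=1   enable bfloat16 kernels
#   SGL_KERNEL_ENABLE_FP8=1    enable FP8 kernels
#   SGL_KERNEL_ENABLE_SM90A=1  emit sm_90a code
#   CUSTOM_CUTLASS_SRC_DIR=... use an external CUTLASS checkout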

import multiprocessing
import os
import sys
from pathlib import Path

import torch
from setuptools import find_packages, setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

root = Path(__file__).parent.resolve()

# Default the wheel's platform tag to manylinux2014 when none is given explicitly.
if "bdist_wheel" in sys.argv and "--plat-name" not in sys.argv:
    sys.argv.extend(["--plat-name", "manylinux2014_x86_64"])


def _get_cuda_version():
    """Return the CUDA version torch was built against as a (major, minor) tuple."""
    if torch.version.cuda:
        return tuple(map(int, torch.version.cuda.split(".")))
    return (0, 0)


def _get_device_sm():
    """Return the current GPU's compute capability as an int (e.g. 90), or 0 without a GPU."""
    if torch.cuda.is_available():
        major, minor = torch.cuda.get_device_capability()
        return major * 10 + minor
    return 0


def _get_version():
    """Read the package version from the version line of pyproject.toml."""
    with open(root / "pyproject.toml") as f:
        for line in f:
            if line.startswith("version"):
                return line.split("=")[1].strip().strip('"')

operator_namespace = "sgl_kernels"
cutlass_default = root / "3rdparty" / "cutlass"
cutlass = Path(os.environ.get("CUSTOM_CUTLASS_SRC_DIR", default=cutlass_default))
flashinfer = root / "3rdparty" / "flashinfer"
turbomind = root / "3rdparty" / "turbomind"

include_dirs = [
    cutlass.resolve() / "include",
    cutlass.resolve() / "tools" / "util" / "include",
    root / "src" / "sgl-kernel" / "include",
    root / "src" / "sgl-kernel" / "csrc",
    flashinfer.resolve() / "include",
    flashinfer.resolve() / "include" / "gemm",
    flashinfer.resolve() / "csrc",
    "cublas",
    turbomind.resolve(),
    turbomind.resolve() / "src",
]

nvcc_flags = [
    "-DNDEBUG",
    f"-DOPERATOR_NAMESPACE={operator_namespace}",
    "-O3",
    "-Xcompiler",
    "-fPIC",
    "-gencode=arch=compute_75,code=sm_75",
    "-gencode=arch=compute_80,code=sm_80",
    "-gencode=arch=compute_89,code=sm_89",
    "-gencode=arch=compute_90,code=sm_90",
    "-std=c++17",
    "-use_fast_math",
    "-DFLASHINFER_ENABLE_F16",
    "-DCUTLASS_VERSIONS_GENERATED",
    "-DCUTE_USE_PACKED_TUPLE=1",
    "-DCUTLASS_TEST_LEVEL=0",
    "-DCUTLASS_TEST_ENABLE_CACHED_RESULTS=1",
    "-DCUTLASS_DEBUG_TRACE_LEVEL=0",
    "--ptxas-options=-v",
    "-Xcompiler=-Wconversion",
    "-Xcompiler=-fno-strict-aliasing",
]

# FP8 defines, appended to nvcc_flags below when an SM90+ GPU is detected or
# when SGL_KERNEL_ENABLE_FP8=1 is set.
nvcc_flags_fp8 = [
    "-DFLASHINFER_ENABLE_FP8",
    "-DFLASHINFER_ENABLE_FP8_E4M3",
    "-DFLASHINFER_ENABLE_FP8_E5M2",
]

sources = [
    "src/sgl-kernel/torch_extension.cc",
    "src/sgl-kernel/csrc/activation/fused_add_rms_norm_kernel.cu",
    "src/sgl-kernel/csrc/allreduce/trt_reduce_internal.cu",
    "src/sgl-kernel/csrc/allreduce/trt_reduce_kernel.cu",
    "src/sgl-kernel/csrc/attention/lightning_attention_decode_kernel.cu",
    "src/sgl-kernel/csrc/gemm/cublas_grouped_gemm.cu",
    "src/sgl-kernel/csrc/gemm/fp8_gemm_kernel.cu",
    "src/sgl-kernel/csrc/gemm/fp8_blockwise_gemm_kernel.cu",
    "src/sgl-kernel/csrc/gemm/int8_gemm_kernel.cu",
    "src/sgl-kernel/csrc/gemm/per_token_group_quant_fp8.cu",
    "src/sgl-kernel/csrc/moe/moe_align_kernel.cu",
    "src/sgl-kernel/csrc/speculative/eagle_utils.cu",
    "src/sgl-kernel/csrc/speculative/speculative_sampling.cu",
    "3rdparty/flashinfer/csrc/activation.cu",
    "3rdparty/flashinfer/csrc/bmm_fp8.cu",
    "3rdparty/flashinfer/csrc/norm.cu",
    "3rdparty/flashinfer/csrc/sampling.cu",
    "3rdparty/flashinfer/csrc/renorm.cu",
    "3rdparty/flashinfer/csrc/rope.cu",
]

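# Feature toggles for build hosts without a visible GPU; when a GPU is present,
# the capabilities detected below take precedence over these environment variables.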
enable_bf16 = os.getenv("SGL_KERNEL_ENABLE_BF16", "0") == "1"
enable_fp8 = os.getenv("SGL_KERNEL_ENABLE_FP8", "0") == "1"
enable_sm90a = os.getenv("SGL_KERNEL_ENABLE_SM90A", "0") == "1"
cuda_version = _get_cuda_version()
sm_version = _get_device_sm()

if torch.cuda.is_available():
    if cuda_version >= (12, 0) and sm_version >= 90:
        nvcc_flags.append("-gencode=arch=compute_90a,code=sm_90a")
    if sm_version >= 90:
        nvcc_flags.extend(nvcc_flags_fp8)
    if sm_version >= 80:
        nvcc_flags.append("-DFLASHINFER_ENABLE_BF16")
else:
    # Compilation environment without a GPU: fall back to the
    # SGL_KERNEL_ENABLE_* toggles above.
    if enable_sm90a:
        nvcc_flags.append("-gencode=arch=compute_90a,code=sm_90a")
    if enable_fp8:
        nvcc_flags.extend(nvcc_flags_fp8)
    if enable_bf16:
        nvcc_flags.append("-DFLASHINFER_ENABLE_BF16")

# Strip torch's default guards so CUDA half/bfloat16 operators stay usable in kernels.
for flag in [
    "-D__CUDA_NO_HALF_OPERATORS__",
    "-D__CUDA_NO_HALF_CONVERSIONS__",
    "-D__CUDA_NO_BFLOAT16_CONVERSIONS__",
    "-D__CUDA_NO_HALF2_OPERATORS__",
]:
    try:
        torch.utils.cpp_extension.COMMON_NVCC_FLAGS.remove(flag)
    except ValueError:
        pass

cxx_flags = ["-O3"]
libraries = ["c10", "torch", "torch_python", "cuda", "cublas"]
extra_link_args = ["-Wl,-rpath,$ORIGIN/../../torch/lib", "-L/usr/lib/x86_64-linux-gnu"]

ext_modules = [
    CUDAExtension(
        name="sgl_kernel.ops._kernels",
        sources=sources,
        include_dirs=include_dirs,
        extra_compile_args={
            "nvcc": nvcc_flags,
            "cxx": cxx_flags,
        },
        libraries=libraries,
        extra_link_args=extra_link_args,
        py_limited_api=True,
    ),
]

setup(
    name="sgl-kernel",
    version=_get_version(),
    packages=find_packages(where="src"),
    package_dir={"": "src"},
    ext_modules=ext_modules,
    cmdclass={
        "build_ext": BuildExtension.with_options(
            use_ninja=True, max_jobs=multiprocessing.cpu_count()
        )
    },
    options={"bdist_wheel": {"py_limited_api": "cp39"}},
)
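
# A quick post-install smoke test; the module path matches the CUDAExtension
# name declared above:
#   python -c "import sgl_kernel.ops._kernels"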