__init__.py 1.52 KB
Newer Older
1
2
3
import ctypes
import os

4
5
import torch

6
7
8
9
10
11
# Preload the CUDA 12 runtime into the process with RTLD_GLOBAL when it is
# present at the default toolkit location. This must stay ahead of the
# `sgl_kernel` extension imports that follow; presumably the compiled ops
# resolve libcudart symbols through the global symbol table -- TODO confirm.
# When the file is absent the preload is skipped entirely.
_CUDART_PATH = "/usr/local/cuda/targets/x86_64-linux/lib/libcudart.so.12"
if os.path.exists(_CUDART_PATH):
    # ctypes.CDLL raises OSError if the library cannot be loaded; that error
    # is not caught here and will abort the package import.
    ctypes.CDLL(_CUDART_PATH, mode=ctypes.RTLD_GLOBAL)

12
13
from sgl_kernel import common_ops
from sgl_kernel.allreduce import *
14
15
16
17
from sgl_kernel.attention import (
    cutlass_mla_decode,
    cutlass_mla_get_workspace_size,
    lightning_attention_decode,
Yineng Zhang's avatar
Yineng Zhang committed
18
    merge_state,
19
)
20
from sgl_kernel.elementwise import (
21
22
23
24
25
26
27
28
29
    apply_rope_with_cos_sin_cache_inplace,
    fused_add_rmsnorm,
    gelu_and_mul,
    gelu_tanh_and_mul,
    gemma_fused_add_rmsnorm,
    gemma_rmsnorm,
    rmsnorm,
    silu_and_mul,
)
30
from sgl_kernel.gemm import (
31
    awq_dequantize,
32
    bmm_fp8,
Trevor Morris's avatar
Trevor Morris committed
33
    cutlass_scaled_fp4_mm,
34
35
36
    fp8_blockwise_scaled_mm,
    fp8_scaled_mm,
    int8_scaled_mm,
Trevor Morris's avatar
Trevor Morris committed
37
    scaled_fp4_quant,
38
    sgl_per_tensor_quant_fp8,
39
    sgl_per_token_group_quant_fp8,
40
    sgl_per_token_group_quant_int8,
41
    sgl_per_token_quant_fp8,
42
)
43
from sgl_kernel.moe import moe_align_block_size, moe_fused_gate, topk_softmax
44
from sgl_kernel.sampling import (
45
46
47
48
49
50
    min_p_sampling_from_probs,
    top_k_renorm_prob,
    top_k_top_p_sampling_from_probs,
    top_p_renorm_prob,
    top_p_sampling_from_probs,
)
51
from sgl_kernel.speculative import (
52
    build_tree_kernel_efficient,
53
    segment_packbits,
54
    tree_speculative_sampling_target_only,
55
    verify_tree_greedy,
56
)
Lianmin Zheng's avatar
Lianmin Zheng committed
57
from sgl_kernel.version import __version__
58
59
60
61

# Placeholder binding: the name exists at package level but is set to None.
# Presumably still referenced by the sglang Python code -- verify before
# deleting.
# TODO(ying): remove this after updating the sglang python code.
build_tree_kernel = None