activation_kernels.hip 4.63 KB
Newer Older
fengzch-das's avatar
fengzch-das committed
1
#include "hip/hip_runtime.h"
Zhekai Zhang's avatar
Zhekai Zhang committed
2
3
4
5
6
#include "activation_kernels_impl.cuh"
#include "activation_kernels.h"
#include "dispatch_utils.h"

// Launch element-wise activation kernel.
Muyang Li's avatar
Muyang Li committed
7
8
9
10
11
#define LAUNCH_ACTIVATION_KERNEL(KERNEL)                                                                               \
    int d          = input.size(-1);                                                                                   \
    int num_tokens = input.numel() / d;                                                                                \
    dim3 grid(num_tokens);                                                                                             \
    dim3 block(std::min(d, 1024));                                                                                     \
fengzch-das's avatar
fengzch-das committed
12
    const hipStream_t stream = getCurrentHIPStreamMasqueradingAsCUDA();                                                                \
Muyang Li's avatar
Muyang Li committed
13
    VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "activation_kernel", [&] {                                       \
fengzch-das's avatar
fengzch-das committed
14
15
       hipLaunchKernelGGL(( vllm::activation_kernel<scalar_t, KERNEL<scalar_t>>)                                                            \
            , dim3(grid), dim3(block), 0, stream, out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d);                     \
Zhekai Zhang's avatar
Zhekai Zhang committed
16
17
    });

Muyang Li's avatar
Muyang Li committed
18
19
void silu_and_mul(Tensor &out,   // [..., d]
                  Tensor &input) // [..., 2 * d]
Zhekai Zhang's avatar
Zhekai Zhang committed
20
{
Muyang Li's avatar
Muyang Li committed
21
22
23
24
    int64_t num_tokens = input.numel() / input.size(-1);
    int d              = input.size(-1) / 2;
    dim3 grid(num_tokens);
    dim3 block(std::min(d, 1024));
fengzch-das's avatar
fengzch-das committed
25
    const hipStream_t stream = getCurrentHIPStreamMasqueradingAsCUDA();
Muyang Li's avatar
Muyang Li committed
26
27
28
29
30
    //   dispatchFloat(input.scalar_type(), [&]<typename scalar_t>() {
    //     vllm::silu_and_mul_kernel<scalar_t><<<grid, block, 0, stream>>>(
    //         out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d);
    //   });
    VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "silu_and_mul_kernel", [&] {
fengzch-das's avatar
fengzch-das committed
31
32
       hipLaunchKernelGGL(( vllm::silu_and_mul_kernel<scalar_t>)
            , dim3(grid), dim3(block), 0, stream, out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d);
Muyang Li's avatar
Muyang Li committed
33
    });
Zhekai Zhang's avatar
Zhekai Zhang committed
34
35
}

Muyang Li's avatar
Muyang Li committed
36
37
38
39
40
41
42
43
44
void invoke_dequant_silu_and_mul_quant(Tensor &out,   // [..., d]
                                       Tensor &input, // [..., 2 * d]
                                       const float scale_gate,
                                       const float scale_up,
                                       const float scale_out) {
    int64_t num_tokens = input.numel() / input.size(-1);
    int d              = input.size(-1) / 2;
    dim3 grid(num_tokens);
    dim3 block(std::min(d, 1024));
fengzch-das's avatar
fengzch-das committed
45
46
    const hipStream_t stream = getCurrentHIPStreamMasqueradingAsCUDA();
   hipLaunchKernelGGL(( vllm::dequant_silu_and_mul_quant_kernel<float, false>), dim3(grid), dim3(block), 0, stream, 
Muyang Li's avatar
Muyang Li committed
47
        out.data_ptr<int8_t>(), input.data_ptr<int32_t>(), d, scale_gate, scale_up, scale_out);
Zhekai Zhang's avatar
Zhekai Zhang committed
48
49
}

Muyang Li's avatar
Muyang Li committed
50
51
52
53
54
55
void invoke_dequant_silu_and_mul_quant(Tensor &out,   // [..., d]
                                       Tensor &input, // [..., 2 * d]
                                       const float scale_gate,
                                       const float scale_up,
                                       Tensor &scale_out, // [num_tokens]
                                       Tensor &tmp        // [..., d]
Zhekai Zhang's avatar
Zhekai Zhang committed
56
) {
Muyang Li's avatar
Muyang Li committed
57
58
59
60
    int64_t num_tokens = input.numel() / input.size(-1);
    int d              = input.size(-1) / 2;
    dim3 grid(num_tokens);
    dim3 block(std::min(d, 1024));
fengzch-das's avatar
fengzch-das committed
61
62
    const hipStream_t stream = getCurrentHIPStreamMasqueradingAsCUDA();
   hipLaunchKernelGGL(( vllm::dequant_silu_and_mul_quant_kernel<float *, true>), dim3(grid), dim3(block), 0, stream, out.data_ptr<int8_t>(),
Muyang Li's avatar
Muyang Li committed
63
64
65
66
67
68
                                                                                       input.data_ptr<int32_t>(),
                                                                                       d,
                                                                                       scale_gate,
                                                                                       scale_up,
                                                                                       scale_out.data_ptr<float>(),
                                                                                       tmp.data_ptr<float>());
Zhekai Zhang's avatar
Zhekai Zhang committed
69
70
}

Muyang Li's avatar
Muyang Li committed
71
72
void silu(Tensor &out,   // [..., d]
          Tensor &input) // [..., d]
Zhekai Zhang's avatar
Zhekai Zhang committed
73
{
Muyang Li's avatar
Muyang Li committed
74
    LAUNCH_ACTIVATION_KERNEL(vllm::silu);
Zhekai Zhang's avatar
Zhekai Zhang committed
75
76
}

Muyang Li's avatar
Muyang Li committed
77
78
void gelu_new(Tensor &out,   // [..., d]
              Tensor &input) // [..., d]
Zhekai Zhang's avatar
Zhekai Zhang committed
79
{
Muyang Li's avatar
Muyang Li committed
80
    LAUNCH_ACTIVATION_KERNEL(vllm::gelu_new_kernel);
Zhekai Zhang's avatar
Zhekai Zhang committed
81
82
}

Muyang Li's avatar
Muyang Li committed
83
84
void gelu_fast(Tensor &out,   // [..., d]
               Tensor &input) // [..., d]
Zhekai Zhang's avatar
Zhekai Zhang committed
85
{
Muyang Li's avatar
Muyang Li committed
86
87
    LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel);
}