// softmax_fast.h
// Fused warp-level softmax forward/backward CUDA kernels with optional
// attention mask, bias, and dropout, plus their host-side dispatchers.
#pragma once
#include <iostream>
#include <type_traits>
#include <limits>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_bf16.h>
#include <curand_kernel.h>
#include "util.h"

// IntegerBits<N>: the narrowest unsigned integer type with at least N bits
// (uint8_t, uint16_t, uint32_t, or uint64_t); yields `void` when N > 64.
// Used below as the per-thread dropout-mask word type (one bit per warp
// iteration).
template <int N>
using IntegerBits = typename std::conditional<N <= 8, uint8_t,
                                              typename std::conditional<N <= 16, uint16_t,
                                                                        typename std::conditional<N <= 32, uint32_t,
                                                                                                  typename std::conditional<N <= 64, uint64_t, void>::type>::type>::type>::type;

// Compile-time launch geometry for a softmax over 2^LogElements columns.
template <int LogElements>
struct SoftmaxParameters
{
    // Kernels are only instantiated up to 2^11 == 2048 elements per row.
    static_assert(LogElements <= 11, "");
    static constexpr int Elements = 1 << LogElements;
    // Rows ("batches") processed per warp: two for small rows, otherwise one.
    static constexpr int WarpBatch = Elements <= 128 ? 2 : 1;
    // Strided elements each thread owns within one row.
    static constexpr int WarpIterations = Elements <= 32 ? 1 : Elements / 32;
    // Dropout-mask word: one keep bit per warp iteration.
    using MaskType = IntegerBits<WarpIterations>;
    // Lanes participating per row (a full row fits in a warp when <= 32).
    static constexpr int WarpSize = Elements <= 32 ? Elements : 32;
    // Mask words stored per row: one per participating lane.
    static constexpr int MaskStride = WarpSize;
};

// Smallest k such that (1 << k) >= value; returns 0 for value <= 1.
inline int log2_ceil(int value)
{
    int k = 0;
    for (; (1 << k) < value; ++k)
    {
    }
    return k;
}

// Torch dtype of one dropout-mask word for a row of `elements` columns.
// Mirrors SoftmaxParameters::MaskType: the padded row needs one keep bit per
// warp iteration, so wider rows need wider integer words.
inline at::ScalarType softmax_mask_dtype(int elements)
{
    if (elements <= 256)
    {
        return torch::kInt8;
    }
    if (elements <= 512)
    {
        return torch::kInt16;
    }
    if (elements <= 1024)
    {
        return torch::kInt32;
    }
    return torch::kInt64;
}

// Number of mask words required for `batch_size` rows of `elements` columns:
// one word per lane of the (power-of-two-padded, capped at 32) warp slice
// that serves each row — matches SoftmaxParameters::MaskStride.
inline int softmax_mask_size(int batch_size, int elements)
{
    const int padded = 1 << log2_ceil(elements);
    const int lanes_per_row = (padded < 32) ? padded : 32;
    return batch_size * lanes_per_row;
}

// Philox offset advance per forward launch: WarpIterations * WarpBatch for
// the power-of-two-padded element count (the per-thread draw budget implied
// by SoftmaxParameters).
inline int softmax_rng_delta_offset(int elements)
{
    const int padded = 1 << log2_ceil(elements);
    const int iterations_per_thread = (padded <= 32) ? 1 : padded / 32;
    const int rows_per_warp = (padded <= 128) ? 2 : 1;
    return iterations_per_thread * rows_per_warp;
}

// Fused softmax forward kernel: optionally adds an attention mask
// (NeedAttnMask) and/or a broadcast bias (NeedBias) to the input before the
// softmax, and optionally fuses dropout (NeedMask).
//
// Launch geometry (set up by dispatch_softmax_forward): blockDim.x ==
// Parameters::WarpSize, blockDim.y == warps per block. Each warp slot handles
// Parameters::WarpBatch consecutive rows of `element_count` elements; each
// thread owns Parameters::WarpIterations elements per row, strided by
// WarpSize.
//
// With NeedMask, `p` is the keep probability: `dst` receives the
// dropped-and-rescaled softmax (kept values multiplied by 1/p), `dst_orig`
// the plain softmax, and the keep bits go to `mask` (bit `it` of a thread's
// word covers row element local_idx + it * WarpSize).
//
// NOTE(review): `dst`/`src` are typed input_t/output_t here while
// dispatch_softmax_forward passes output_t* dst / const input_t* src —
// harmless when input_t == output_t; confirm.
template <
    typename input_t, typename output_t, typename acc_t,
    typename Parameters, bool NeedMask, bool NeedBias, bool NeedAttnMask>
__global__ void softmax_warp_forward(input_t *dst, input_t *dst_orig, const output_t *src, const input_t *attn_mask, const input_t *bias,
                                     typename Parameters::MaskType *mask, acc_t p, int64_t batch_size, int64_t attn_inner_skip_batch, int64_t bias_batch_size, int element_count, uint64_t seed, uint64_t rand_offset)
{
    using MaskType = typename Parameters::MaskType;
    curandStatePhilox4_32_10_t state;
    // First row this warp slot is responsible for.
    int64_t first_batch = (static_cast<int64_t>(blockDim.y) * static_cast<int64_t>(blockIdx.x) + threadIdx.y) * Parameters::WarpBatch;
    // there might be multiple batches per warp. compute the index within the batch
    int64_t local_idx = threadIdx.x;
    const int64_t thread_offset = first_batch * element_count + local_idx;
    if IF_CONSTEXPR (NeedMask)
    {
        // Philox stream keyed by the global element offset so every thread
        // draws independently; rand_offset advances the stream between calls.
        curand_init(seed, thread_offset, rand_offset, &state);
    }

    // batch_size might not be a multiple of Parameters::WarpBatch. Check how
    // many batches have to computed within this WARP.
    int local_batches = batch_size - first_batch;
    if (local_batches > Parameters::WarpBatch)
        local_batches = Parameters::WarpBatch;

    src += thread_offset;
    dst += thread_offset;
    if IF_CONSTEXPR (NeedMask)
    {
        dst_orig += thread_offset;
        mask += first_batch * Parameters::MaskStride;
    }

    // Bias is broadcast over the batch: indexed modulo this period.
    int64_t bias_mod_size = bias_batch_size * element_count;

    // Attention mask is shared by groups of attn_inner_skip_batch rows.
    int64_t attn_mask_div_size = element_count;
    if IF_CONSTEXPR (NeedAttnMask)
    {
        attn_mask_div_size = attn_inner_skip_batch * element_count;
    }

    // load data from global memory
    input_t elements_input[Parameters::WarpBatch][Parameters::WarpIterations];
#pragma unroll
    for (int i = 0; i < Parameters::WarpBatch; ++i)
    {
        int batch_element_count = (i >= local_batches) ? 0 : element_count;
#pragma unroll
        for (int it = 0; it < Parameters::WarpIterations; ++it)
        {
            int element_index = local_idx + it * Parameters::WarpSize;
            // Pad out-of-range slots with -inf so they vanish under max/exp.
            elements_input[i][it] = -std::numeric_limits<float>::infinity();

            if (element_index < batch_element_count)
            {
                elements_input[i][it] = src[i * element_count + it * Parameters::WarpSize];
            }
        }
    }

    // convert input_t to acc_t
    acc_t elements[Parameters::WarpBatch][Parameters::WarpIterations];
#pragma unroll
    for (int i = 0; i < Parameters::WarpBatch; ++i)
    {
        int batch_element_count = (i >= local_batches) ? 0 : element_count;
#pragma unroll
        for (int it = 0; it < Parameters::WarpIterations; ++it)
        {
            elements[i][it] = elements_input[i][it];
            int element_index = local_idx + it * Parameters::WarpSize;
            if (element_index < batch_element_count)
            {
                int64_t global_idx = thread_offset + i * element_count + it * Parameters::WarpSize;
                if IF_CONSTEXPR (NeedAttnMask)
                {
                    auto attn_mask_idx = static_cast<int64_t>(global_idx / attn_mask_div_size) * element_count + (global_idx % element_count);
                    elements[i][it] += attn_mask[attn_mask_idx];
                }
                if IF_CONSTEXPR (NeedBias)
                {
                    elements[i][it] += bias[global_idx % bias_mod_size];
                }
            }
        }
    }

    // compute local max_value

    // take the max_value of the first element to avoid one max call
    acc_t max_value[Parameters::WarpBatch];
#pragma unroll
    for (int i = 0; i < Parameters::WarpBatch; ++i)
    {
        max_value[i] = elements[i][0];
    }

#pragma unroll
    for (int it = 1; it < Parameters::WarpIterations; ++it)
    {
#pragma unroll
        for (int i = 0; i < Parameters::WarpBatch; ++i)
        {
            max_value[i] = (max_value[i] > elements[i][it]) ? max_value[i] : elements[i][it];
        }
    }

// reduction max_value
#pragma unroll
    for (int offset = Parameters::WarpSize / 2; offset > 0; offset /= 2)
    {
        // NOTE(review): val is float even though the operands are acc_t —
        // lossless only when acc_t is float; confirm.
        float val[Parameters::WarpBatch];
#pragma unroll
        for (int i = 0; i < Parameters::WarpBatch; ++i)
        {
            val[i] = SHFL_XOR(max_value[i], offset, Parameters::WarpSize);
        }
#pragma unroll
        for (int i = 0; i < Parameters::WarpBatch; ++i)
        {
            max_value[i] = max_value[i] > val[i] ? max_value[i] : val[i];
        }
    }

    // compute local sum
    acc_t sum[Parameters::WarpBatch]{0.0f};

#pragma unroll
    for (int i = 0; i < Parameters::WarpBatch; ++i)
    {
#pragma unroll
        for (int it = 0; it < Parameters::WarpIterations; ++it)
        {
            // Shift by the row max for numerical stability before exp.
            elements[i][it] = std::exp(elements[i][it] - max_value[i]);
            sum[i] += elements[i][it];
        }
    }

// reduction sum
#pragma unroll
    for (int offset = Parameters::WarpSize / 2; offset > 0; offset /= 2)
    {
#pragma unroll
        for (int i = 0; i < Parameters::WarpBatch; ++i)
        {
            sum[i] += SHFL_XOR(sum[i], offset, Parameters::WarpSize);
        }
    }

    // store result
    if IF_CONSTEXPR (NeedMask)
    {
        const acc_t pinv = 1.0 / p;
#pragma unroll
        for (int i = 0; i < Parameters::WarpBatch; ++i)
        {
            if (i >= local_batches)
                break;
            // Draw all keep bits for this row up front; specialized paths for
            // 1 and 2 iterations avoid the 4-wide generator.
            MaskType m = 0;
            if IF_CONSTEXPR (Parameters::WarpIterations == 1)
            {
                float rand = curand_uniform(&state);
                m = rand < p;
            }
            else if IF_CONSTEXPR (Parameters::WarpIterations == 2)
            {
                m = curand_uniform(&state) < p;
                m |= (curand_uniform(&state) < p) << 1;
            }
            else
            {
#pragma unroll
                for (int j = 0; j < DIV_CELL(Parameters::WarpIterations, 4); ++j)
                {
                    float4 rand4 = curand_uniform4(&state);
                    m |= (((MaskType)(rand4.x < p)) << (j * 4)) | (((MaskType)(rand4.y < p)) << (j * 4 + 1)) | (((MaskType)(rand4.z < p)) << (j * 4 + 2)) | (((MaskType)(rand4.w < p)) << (j * 4 + 3));
                }
            }
            mask[i * Parameters::MaskStride + local_idx] = m;
#pragma unroll
            for (int it = 0; it < Parameters::WarpIterations; ++it)
            {
                int element_index = local_idx + it * Parameters::WarpSize;
                if (element_index < element_count)
                {
                    // Plain softmax to dst_orig; dropped-and-rescaled to dst.
                    const output_t d = elements[i][it] / sum[i];
                    dst[i * element_count + it * Parameters::WarpSize] = (acc_t)d * ((acc_t)((m >> it) & 1) * pinv);
                    dst_orig[i * element_count + it * Parameters::WarpSize] = d;
                }
                else
                {
                    break;
                }
            }
        }
    }
    else
    {
#pragma unroll
        for (int i = 0; i < Parameters::WarpBatch; ++i)
        {
            if (i >= local_batches)
                break;
#pragma unroll
            for (int it = 0; it < Parameters::WarpIterations; ++it)
            {
                int element_index = local_idx + it * Parameters::WarpSize;
                if (element_index < element_count)
                {
                    dst[i * element_count + it * Parameters::WarpSize] = elements[i][it] / sum[i];
                }
                else
                {
                    break;
                }
            }
        }
    }
}

// Instantiates and launches softmax_warp_forward for LogElements == l on the
// current CUDA stream, then returns true from the enclosing function. Only
// valid inside dispatch_softmax_forward, which provides `blocks`, `threads`,
// and the kernel arguments.
#define LAUNCH_FORWARD_KERNEL(l)                                                                           \
    softmax_warp_forward<input_t, output_t, acc_t, SoftmaxParameters<l>, NeedMask, NeedBias, NeedAttnMask> \
        <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(                                        \
            dst, dst_orig, src, attn_mask, bias, (typename SoftmaxParameters<l>::MaskType *)mask, p,       \
            batch_count, attn_inner_skip_batch, bias_batch_count, softmax_elements, seed, offset);         \
    return true;

// Host-side launcher for softmax_warp_forward: picks the kernel
// instantiation matching the next power of two of softmax_elements (<= 2048)
// and launches it on the current CUDA stream. Returns true when a kernel was
// launched, false when softmax_elements == 0 (or no instantiation matches).
template <typename input_t, typename output_t, typename acc_t, bool NeedMask, bool NeedBias, bool NeedAttnMask>
bool dispatch_softmax_forward(output_t *dst, output_t *dst_orig, const input_t *src, const input_t *attn_mask, const input_t *bias, void *mask, acc_t p,
                              int softmax_elements, int64_t batch_count, int64_t attn_inner_skip_batch, int64_t bias_batch_count, uint64_t seed, uint64_t offset)
{
    TORCH_INTERNAL_ASSERT(softmax_elements >= 0 && softmax_elements <= 2048);
    if (softmax_elements == 0)
    {
        return false;
    }
    else
    {
        int log2_elements = log2_ceil(softmax_elements);
        const int next_power_of_two = 1 << log2_elements;

        // This value must match the Parameters::WarpSize constexpr value computed inside softmax_warp_forward.
        int warp_size = (next_power_of_two < 32) ? next_power_of_two : 32;

        // This value must match the Parameters::WarpBatch constexpr value computed inside softmax_warp_forward.
        int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;

        // use 128 threads per block to maximize gpu utilization
        constexpr int threads_per_block = 128;

        int warps_per_block = (threads_per_block / warp_size);
        int batches_per_block = warps_per_block * batches_per_warp;
        int blocks = (batch_count + batches_per_block - 1) / batches_per_block;
        dim3 threads(warp_size, warps_per_block, 1);
        // Launch code would be more elegant if C++ supported FOR CONSTEXPR
        switch (log2_elements)
        {
        case 0:
            LAUNCH_FORWARD_KERNEL(0)
        case 1:
            LAUNCH_FORWARD_KERNEL(1)
        case 2:
            LAUNCH_FORWARD_KERNEL(2)
        case 3:
            LAUNCH_FORWARD_KERNEL(3)
        case 4:
            LAUNCH_FORWARD_KERNEL(4)
        case 5:
            LAUNCH_FORWARD_KERNEL(5)
        case 6:
            LAUNCH_FORWARD_KERNEL(6)
        case 7:
            LAUNCH_FORWARD_KERNEL(7)
        case 8:
            LAUNCH_FORWARD_KERNEL(8)
        case 9:
            LAUNCH_FORWARD_KERNEL(9)
        case 10:
            LAUNCH_FORWARD_KERNEL(10)
        case 11:
            LAUNCH_FORWARD_KERNEL(11)
        default:
            return false;
        }
    }
    return false;
}

// Fused softmax / log-softmax backward kernel, with optional fused dropout
// backward (NeedMask).
//
// Same launch geometry as softmax_warp_forward (one warp slot handles
// Parameters::WarpBatch rows; blockDim.x == Parameters::WarpSize).
// Per element it computes
//     grad_reg  = grad * output        (NeedMask: the stored keep bits and
//                                       the 1/p rescale are re-applied to
//                                       `grad` first)
//     sum       = warp-reduced row sum of grad_reg
//     gradInput = grad_reg - output * sum            (softmax)
//     gradInput = grad_reg - exp(output) * sum       (log-softmax)
// NOTE(review): grad_reg includes a factor of `output` even on the
// IsLogSoftmax path — confirm callers pass what this expects.
template <
    typename input_t, typename output_t, typename acc_t, typename Parameters,
    bool IsLogSoftmax, bool NeedMask>
__global__ void softmax_warp_backward(output_t *gradInput, const input_t *grad, const input_t *output,
                                      const typename Parameters::MaskType *mask, acc_t p, int64_t batch_size, int element_count)
{
    using MaskType = typename Parameters::MaskType;
    int64_t first_batch = (static_cast<int64_t>(blockDim.y) * static_cast<int64_t>(blockIdx.x) + threadIdx.y) * Parameters::WarpBatch;

    // batch_size might not be a multiple of Parameters::WarpBatch. Check how
    // many batches have to computed within this WARP.
    int local_batches = batch_size - first_batch;
    if (local_batches > Parameters::WarpBatch)
        local_batches = Parameters::WarpBatch;

    // there might be multiple batches per warp. compute the index within the batch
    int64_t local_idx = threadIdx.x;

    // the first element to process by the current thread
    int64_t thread_offset = first_batch * element_count + local_idx;
    grad += thread_offset;
    output += thread_offset;
    gradInput += thread_offset;
    if IF_CONSTEXPR (NeedMask)
    {
        mask += first_batch * Parameters::MaskStride;
    }

    // The nested loops over Parameters::WarpBatch and then Parameters::WarpIterations can be simplified to one loop,
    // but I think doing so would obfuscate the logic of the algorithm, thus I chose to keep
    // the nested loops.
    // This should have no impact on performance because the loops are unrolled anyway.

    // load data from global memory
    acc_t grad_reg[Parameters::WarpBatch][Parameters::WarpIterations];
    acc_t output_reg[Parameters::WarpBatch][Parameters::WarpIterations];
    if IF_CONSTEXPR (NeedMask)
    {
        // One mask word per row for this lane, written by the forward pass.
        MaskType mask_reg[Parameters::WarpBatch];
#pragma unroll
        for (int i = 0; i < Parameters::WarpBatch; ++i)
        {
            if (i >= local_batches)
                break;
            mask_reg[i] = mask[i * Parameters::MaskStride + local_idx];
        }

        const acc_t pinv = 1.0 / p;

#pragma unroll
        for (int i = 0; i < Parameters::WarpBatch; ++i)
        {
            int batch_element_count = (i >= local_batches) ? 0 : element_count;
            MaskType m = mask_reg[i];
#pragma unroll
            for (int it = 0; it < Parameters::WarpIterations; ++it)
            {
                int element_index = local_idx + it * Parameters::WarpSize;
                if (element_index < batch_element_count)
                {
                    // Bit `it` is the keep bit for this element (forward
                    // layout); dropped elements contribute zero gradient.
                    grad_reg[i][it] =
                        (input_t)((acc_t)((m >> it) & 1) *
                                  (acc_t)grad[i * element_count + it * Parameters::WarpSize] *
                                  pinv) *
                        output[i * element_count + it * Parameters::WarpSize];
                    output_reg[i][it] = output[i * element_count + it * Parameters::WarpSize];
                }
                else
                {
                    grad_reg[i][it] = acc_t(0);
                    output_reg[i][it] = acc_t(0);
                }
            }
        }
    }
    else
    {
#pragma unroll
        for (int i = 0; i < Parameters::WarpBatch; ++i)
        {
            int batch_element_count = (i >= local_batches) ? 0 : element_count;
#pragma unroll
            for (int it = 0; it < Parameters::WarpIterations; ++it)
            {
                int element_index = local_idx + it * Parameters::WarpSize;
                if (element_index < batch_element_count)
                {
                    grad_reg[i][it] = grad[i * element_count + it * Parameters::WarpSize] *
                                      output[i * element_count + it * Parameters::WarpSize];
                    output_reg[i][it] = output[i * element_count + it * Parameters::WarpSize];
                }
                else
                {
                    grad_reg[i][it] = acc_t(0);
                    output_reg[i][it] = acc_t(0);
                }
            }
        }
    }

    // Per-thread partial row sums of grad_reg.
    acc_t sum[Parameters::WarpBatch];
#pragma unroll
    for (int i = 0; i < Parameters::WarpBatch; ++i)
    {
        sum[i] = grad_reg[i][0];
#pragma unroll
        for (int it = 1; it < Parameters::WarpIterations; ++it)
        {
            sum[i] += grad_reg[i][it];
        }
    }

// Butterfly warp reduction of the row sums.
#pragma unroll
    for (int offset = Parameters::WarpSize / 2; offset > 0; offset /= 2)
    {
#pragma unroll
        for (int i = 0; i < Parameters::WarpBatch; ++i)
        {
            sum[i] += SHFL_XOR(sum[i], offset, Parameters::WarpSize);
        }
    }

// store result
#pragma unroll
    for (int i = 0; i < Parameters::WarpBatch; ++i)
    {
        if (i >= local_batches)
            break;
#pragma unroll
        for (int it = 0; it < Parameters::WarpIterations; ++it)
        {
            int element_index = local_idx + it * Parameters::WarpSize;
            if (element_index < element_count)
            {
                // compute gradients
                if IF_CONSTEXPR (IsLogSoftmax)
                {
                    gradInput[i * element_count + it * Parameters::WarpSize] =
                        (grad_reg[i][it] - std::exp(output_reg[i][it]) * sum[i]);
                }
                else
                {
                    gradInput[i * element_count + it * Parameters::WarpSize] =
                        (grad_reg[i][it] - output_reg[i][it] * sum[i]);
                }
            }
        }
    }
}

// Instantiates and launches softmax_warp_backward for LogElements == l on the
// current CUDA stream, then breaks out of the enclosing switch. Only valid
// inside the dispatch switch in dispatch_softmax_backward, which provides
// `blocks`, `threads`, and the kernel arguments.
#define LAUNCH_BACKWARD_KERNEL(l)                                                                 \
    softmax_warp_backward<input_t, output_t, acc_t, SoftmaxParameters<l>, IsLogSoftmax, NeedMask> \
        <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(                               \
            grad_input, grad, output, (const typename SoftmaxParameters<l>::MaskType *)mask, p,   \
            batch_count, softmax_elements);                                                       \
    break;

// Host-side launcher for softmax_warp_backward: picks the kernel
// instantiation matching the next power of two of softmax_elements (<= 2048)
// and launches it on the current CUDA stream. No-op when
// softmax_elements == 0.
template <typename input_t, typename output_t, typename acc_t, bool IsLogSoftmax, bool NeedMask>
void dispatch_softmax_backward(output_t *grad_input, const input_t *grad, const input_t *output,
                               const void *mask, acc_t p, int softmax_elements, int64_t batch_count)
{
    TORCH_INTERNAL_ASSERT(softmax_elements >= 0 && softmax_elements <= 2048);
    if (softmax_elements == 0)
    {
        return;
    }

    const int log2_elements = log2_ceil(softmax_elements);
    const int pow2_elements = 1 << log2_elements;

    // Must agree with the Parameters::WarpSize constexpr inside softmax_warp_backward.
    const int warp_width = (pow2_elements < 32) ? pow2_elements : 32;

    // Must agree with the Parameters::WarpBatch constexpr inside softmax_warp_backward.
    const int rows_per_warp = (pow2_elements <= 128) ? 2 : 1;

    // 128 threads per block to keep the gpu well utilized.
    constexpr int threads_per_block = 128;

    const int warps_per_block = threads_per_block / warp_width;
    const int rows_per_block = warps_per_block * rows_per_warp;
    // `blocks` and `threads` are referenced by LAUNCH_BACKWARD_KERNEL.
    int blocks = (batch_count + rows_per_block - 1) / rows_per_block;
    dim3 threads(warp_width, warps_per_block, 1);
    // Launch code would be more elegant if C++ supported FOR CONSTEXPR.
    switch (log2_elements)
    {
    case 0:
        LAUNCH_BACKWARD_KERNEL(0)
    case 1:
        LAUNCH_BACKWARD_KERNEL(1)
    case 2:
        LAUNCH_BACKWARD_KERNEL(2)
    case 3:
        LAUNCH_BACKWARD_KERNEL(3)
    case 4:
        LAUNCH_BACKWARD_KERNEL(4)
    case 5:
        LAUNCH_BACKWARD_KERNEL(5)
    case 6:
        LAUNCH_BACKWARD_KERNEL(6)
    case 7:
        LAUNCH_BACKWARD_KERNEL(7)
    case 8:
        LAUNCH_BACKWARD_KERNEL(8)
    case 9:
        LAUNCH_BACKWARD_KERNEL(9)
    case 10:
        LAUNCH_BACKWARD_KERNEL(10)
    case 11:
        LAUNCH_BACKWARD_KERNEL(11)
    default:
        break;
    }
}