// helpers.h
#pragma once

#include <cute/tensor.hpp>
#include <cuda_bf16.h>
#include <cuda_fp8.h>

#include "defines.h"

namespace sm100 {

using namespace cute;

// Returns the largest of the four int components (x, y, z, w) of `t`.
CUTE_DEVICE
int int4_max(int4 t) {
    // Reduce each pair of components first, then combine the two partial results.
    int const larger_xy = max(t.x, t.y);
    int const larger_zw = max(t.z, t.w);
    return max(larger_xy, larger_zw);
}

// Returns the smallest of the four int components (x, y, z, w) of `t`.
CUTE_DEVICE
int int4_min(int4 t) {
    // Reduce each pair of components first, then combine the two partial results.
    int const smaller_xy = min(t.x, t.y);
    int const smaller_zw = min(t.z, t.w);
    return min(smaller_xy, smaller_zw);
}

// Convert 2x fp8_e4m3 to 2x bf16 with scaling.
//
//   data  - two packed fp8 (e4m3) values.
//   scale - a single bf16 factor applied to both values.
//
// Returns the two values widened to float, rounded to bf16 (the `_rn` conversion),
// and then multiplied by `scale` in bf16 arithmetic.
CUTE_DEVICE
nv_bfloat162 fp8x2_to_bf16x2_with_scale(__nv_fp8x2_e4m3 data, nv_bfloat16 scale) {
    // TODO Use native conversion for CUDA >= 13.1
    // fp8x2 -> float2 goes through the type's explicit conversion operator.
    float2 const data_float2 = static_cast<float2>(data);
    nv_bfloat162 const data_bf16x2 = __float22bfloat162_rn(data_float2);
    // Broadcast the scale into both lanes and multiply with the packed intrinsic:
    // a single bf16x2 multiply instead of two scalar ones, and no reliance on the
    // bf16 operator overloads (which are compiled out when
    // __CUDA_NO_BFLOAT16_OPERATORS__ is defined).
    return __hmul2(data_bf16x2, __bfloat162bfloat162(scale));
}

}