// test_utils.h (1.3 KB) -- repository blame view; committed by Li Zhang.
// Copyright (c) OpenMMLab. All rights reserved.

#pragma once

#include "decoder_multihead_attention.h"
#include "src/turbomind/macro.h"
#include <cuda_fp16.h>
#include <memory>

namespace turbomind {

// Compares a buffer under test (`src`) against a reference buffer (`ref`).
// Declaration only; the definition lives in another translation unit.
//
// NOTE(review): the parameter meanings below are inferred from their names and
// must be confirmed against the definition.
//   src, ref  - buffers to compare; whether they are host or device pointers is
//               not visible from this header -- TODO confirm.
//   stride    - presumably the distance between consecutive rows, in elements.
//   m, n      - presumably the number of rows and columns that are compared.
//   show      - defaults to false; presumably enables per-element output.
//   rtol      - defaults to 1e-2; presumably the relative tolerance.
//   atol      - defaults to 1e-4; presumably the absolute tolerance.
template<typename T>
void Compare(
    const T* src, const T* ref, size_t stride, int m, int n, bool show = false, float rtol = 1e-2, float atol = 1e-4);

// Loads binary data identified by `path` into the caller-provided buffer `dst`.
// Declaration only; the definition lives in another translation unit.
//
// NOTE(review): presumably `size` is the number of bytes expected/copied and
// `dst` must be able to hold that many bytes; the behaviour on a missing file
// or a size mismatch is not visible from this header -- confirm against the
// definition.
void LoadBinary(const std::string& path, size_t size, void* dst);

// Random number generator used by the tests.
//
// All state is hidden behind the forward-declared `Impl` struct, which is owned
// through `std::unique_ptr`. Only declarations are visible here; the backend
// that produces the numbers is defined elsewhere.
//
// NOTE(review): whether the `out` pointers are expected to be host or device
// memory, and whether generation is asynchronous, cannot be determined from
// this header -- confirm against the implementation.
class RNG {
public:
    RNG();
    // Declared explicitly (not defaulted inline) because `Impl` is an incomplete
    // type here; the destructor has to be defined where `Impl` is complete.
    ~RNG();
    // Writes `count` unsigned integers to `out`.
    void GenerateUInt(uint* out, size_t count);

    // Writes `count` values of type T to `out`.
    // `scale` defaults to 1 and `shift` to 0; presumably each sample is
    // transformed as `sample * scale + shift` -- TODO confirm.
    template<typename T>
    void GenerateUniform(T* out, size_t count, float scale = 1.f, float shift = 0.f);

    // Writes `count` values of type T to `out`.
    // `scale` defaults to 1 and `shift` to 0; presumably they act as standard
    // deviation and mean of the generated samples -- TODO confirm.
    template<typename T>
    void GenerateNormal(T* out, size_t count, float scale = 1.f, float shift = 0.f);

private:
    // Opaque implementation type, defined in the source file.
    struct Impl;
    // Sole owner of the implementation object.
    std::unique_ptr<Impl> impl_;
};

template<typename T>
36
37
38
39
40
41
void mmha_ft_reference(const DecoderMultiHeadAttentionParams<T>& params,
                       T**                                       per_sample_k_cache,
                       T**                                       per_sample_v_cache,
                       const int*                                sequence_length,
                       int                                       max_memory_len,
                       cudaStream_t                              st);
Li Zhang's avatar
Li Zhang committed
42
43

}  // namespace turbomind