// Copyright (c) OpenMMLab. All rights reserved. #pragma once #include "decoder_multihead_attention.h" #include "src/turbomind/macro.h" #include #include namespace turbomind { template void Compare( const T* src, const T* ref, size_t stride, int m, int n, bool show = false, float rtol = 1e-2, float atol = 1e-4); void LoadBinary(const std::string& path, size_t size, void* dst); class RNG { public: RNG(); ~RNG(); void GenerateUInt(uint* out, size_t count); template void GenerateUniform(T* out, size_t count, float scale = 1.f, float shift = 0.f); template void GenerateNormal(T* out, size_t count, float scale = 1.f, float shift = 0.f); private: struct Impl; std::unique_ptr impl_; }; template void mmha_ft_reference(const DecoderMultiHeadAttentionParams& params, cudaStream_t st); } // namespace turbomind