Commit 8b59f4fe authored by Catheriany's avatar Catheriany
Browse files

Merge remote-tracking branch 'origin/main' into issue/204

parents 16506fc0 df1c6b5d
#ifndef __INFINIOP_REDUCE_KUNLUN_H__ #ifndef __INFINIOP_REDUCE_KUNLUN_H__
#define __INFINIOP_REDUCE_KUNLUN_H__ #define __INFINIOP_REDUCE_KUNLUN_H__
#include "../../devices/kunlun/kunlun_common.h" #include "../../devices/kunlun/kunlun_kernel_common.h"
namespace op::common_kunlun::reduce_op { namespace op::common_kunlun::reduce_op {
using namespace device::kunlun::kernel;
// Use 16 floats instruction to calculate reduce // Use 16 floats instruction to calculate reduce
// data_ptr is the pointer of LM // data_ptr is the pointer of LM
static inline __device__ float sumSquaredF32(float *data_ptr, int count) { static inline __device__ float sumSquaredF32(float *data_ptr, int count) {
......
...@@ -2,9 +2,19 @@ ...@@ -2,9 +2,19 @@
#define __INFINIOP_TENSOR_H__ #define __INFINIOP_TENSOR_H__
#include "infiniop/tensor_descriptor.h" #include "infiniop/tensor_descriptor.h"
#include "../utils.h"
#include <string> #include <string>
#include <vector> #include <vector>
#define TRANSFORM_TENSOR_DESC(__TENSOR_DESC__, __OP__) \
do { \
auto __RESULT__ = __TENSOR_DESC__->__OP__; \
CHECK_RESULT(__RESULT__); \
__TENSOR_DESC__ = __RESULT__.take(); \
} while (0)
struct InfiniopTensorDescriptor { struct InfiniopTensorDescriptor {
private: private:
// Datatype // Datatype
...@@ -32,9 +42,9 @@ public: ...@@ -32,9 +42,9 @@ public:
bool hasBroadcastDim() const; bool hasBroadcastDim() const;
std::vector<size_t> getBroadcastDim() const; std::vector<size_t> getBroadcastDim() const;
infiniopTensorDescriptor_t dimMerge(size_t dim_start, size_t dim_end) const; utils::Result<infiniopTensorDescriptor_t> dimMerge(size_t dim_start, size_t dim_end) const;
infiniopTensorDescriptor_t dimSplit(size_t axis, const std::vector<size_t> &dims) const; utils::Result<infiniopTensorDescriptor_t> dimSplit(size_t axis, const std::vector<size_t> &dims) const;
infiniopTensorDescriptor_t dimPermute(const std::vector<size_t> &order) const; utils::Result<infiniopTensorDescriptor_t> dimPermute(const std::vector<size_t> &order) const;
std::string toString() const; std::string toString() const;
}; };
......
...@@ -12,7 +12,7 @@ __C __export infiniStatus_t infiniopCreateTensorDescriptor(infiniopTensorDescrip ...@@ -12,7 +12,7 @@ __C __export infiniStatus_t infiniopCreateTensorDescriptor(infiniopTensorDescrip
std::vector<ptrdiff_t> strides(ndim); std::vector<ptrdiff_t> strides(ndim);
ptrdiff_t dsize = 1; ptrdiff_t dsize = 1;
if (ndim > 0) { if (ndim > 0) {
for (size_t i = ndim - 1; i >= 0; i--) { for (int i = (int)ndim - 1; i >= 0; i--) {
strides[i] = dsize; strides[i] = dsize;
dsize *= shape_[i]; dsize *= shape_[i];
} }
...@@ -104,10 +104,8 @@ std::vector<size_t> InfiniopTensorDescriptor::getBroadcastDim() const { ...@@ -104,10 +104,8 @@ std::vector<size_t> InfiniopTensorDescriptor::getBroadcastDim() const {
return res; return res;
} }
infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimMerge(size_t dim_start, size_t dim_end) const { utils::Result<infiniopTensorDescriptor_t> InfiniopTensorDescriptor::dimMerge(size_t dim_start, size_t dim_end) const {
if (dim_start > dim_end || dim_end >= ndim()) { CHECK_OR_RETURN(dim_start <= dim_end && dim_end < ndim(), INFINI_STATUS_BAD_PARAM);
return nullptr;
}
size_t new_ndim = ndim() - (dim_end - dim_start); size_t new_ndim = ndim() - (dim_end - dim_start);
std::vector<size_t> new_shape(new_ndim); std::vector<size_t> new_shape(new_ndim);
...@@ -120,9 +118,7 @@ infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimMerge(size_t dim_start, ...@@ -120,9 +118,7 @@ infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimMerge(size_t dim_start,
index++; index++;
} }
if (!isContiguous(dim_start, dim_end)) { CHECK_OR_RETURN(isContiguous(dim_start, dim_end), INFINI_STATUS_BAD_PARAM);
return nullptr;
}
new_shape[index] = 1; new_shape[index] = 1;
for (size_t i = dim_start; i <= dim_end; i++) { for (size_t i = dim_start; i <= dim_end; i++) {
...@@ -138,15 +134,15 @@ infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimMerge(size_t dim_start, ...@@ -138,15 +134,15 @@ infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimMerge(size_t dim_start,
index++; index++;
} }
return new InfiniopTensorDescriptor(_dtype, new_ndim, new_shape.data(), new_strides.data()); return utils::Result<infiniopTensorDescriptor_t>(
new InfiniopTensorDescriptor(_dtype, new_ndim, new_shape.data(), new_strides.data()));
} }
infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimSplit(size_t axis, const std::vector<size_t> &dims) const { utils::Result<infiniopTensorDescriptor_t> InfiniopTensorDescriptor::dimSplit(size_t axis, const std::vector<size_t> &dims) const {
size_t ndim_ = ndim(); size_t ndim_ = ndim();
if (dim(axis) != std::accumulate(dims.begin(), dims.end(), (size_t)1, std::multiplies<size_t>())) { CHECK_OR_RETURN(dim(axis) == std::accumulate(dims.begin(), dims.end(), (size_t)1, std::multiplies<size_t>()),
return nullptr; INFINI_STATUS_BAD_PARAM);
}
size_t new_ndim = ndim_ + dims.size() - 1; size_t new_ndim = ndim_ + dims.size() - 1;
std::vector<size_t> new_shape(new_ndim); std::vector<size_t> new_shape(new_ndim);
...@@ -168,24 +164,22 @@ infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimSplit(size_t axis, const ...@@ -168,24 +164,22 @@ infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimSplit(size_t axis, const
index++; index++;
} }
return new InfiniopTensorDescriptor(_dtype, new_ndim, new_shape.data(), new_strides.data()); return utils::Result<infiniopTensorDescriptor_t>(
new InfiniopTensorDescriptor(_dtype, new_ndim, new_shape.data(), new_strides.data()));
} }
infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimPermute(const std::vector<size_t> &order) const { utils::Result<infiniopTensorDescriptor_t> InfiniopTensorDescriptor::dimPermute(const std::vector<size_t> &order) const {
auto ndim_ = ndim(); auto ndim_ = ndim();
if (order.size() != ndim_) { CHECK_OR_RETURN(order.size() == ndim_, INFINI_STATUS_BAD_PARAM);
return nullptr;
}
std::vector<size_t> new_shape(ndim_); std::vector<size_t> new_shape(ndim_);
std::vector<ptrdiff_t> new_strides(ndim_); std::vector<ptrdiff_t> new_strides(ndim_);
for (size_t i = 0; i < ndim_; i++) { for (size_t i = 0; i < ndim_; i++) {
if (std::find(order.begin(), order.end(), i) == order.end()) { CHECK_OR_RETURN(std::find(order.begin(), order.end(), i) != order.end(), INFINI_STATUS_BAD_PARAM);
return nullptr;
}
new_shape[i] = dim(order[i]); new_shape[i] = dim(order[i]);
new_strides[i] = stride(order[i]); new_strides[i] = stride(order[i]);
} }
return new InfiniopTensorDescriptor(_dtype, ndim_, new_shape.data(), new_strides.data()); return utils::Result<infiniopTensorDescriptor_t>(
new InfiniopTensorDescriptor(_dtype, ndim_, new_shape.data(), new_strides.data()));
} }
std::string InfiniopTensorDescriptor::toString() const { std::string InfiniopTensorDescriptor::toString() const {
......
...@@ -6,7 +6,8 @@ ...@@ -6,7 +6,8 @@
namespace infinirt::bang { namespace infinirt::bang {
infiniStatus_t getDeviceCount(int *count) { infiniStatus_t getDeviceCount(int *count) {
CHECK_BANGRT(cnrtGetDeviceCount(count)); unsigned int device_count = static_cast<unsigned int>(*count);
CHECK_BANGRT(cnrtGetDeviceCount(&device_count));
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
...@@ -22,7 +23,7 @@ infiniStatus_t deviceSynchronize() { ...@@ -22,7 +23,7 @@ infiniStatus_t deviceSynchronize() {
infiniStatus_t streamCreate(infinirtStream_t *stream_ptr) { infiniStatus_t streamCreate(infinirtStream_t *stream_ptr) {
cnrtQueue_t queue; cnrtQueue_t queue;
CHECK_BANGRT(cnrtQueueCreate(&stream)); CHECK_BANGRT(cnrtQueueCreate(&queue));
*stream_ptr = queue; *stream_ptr = queue;
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
...@@ -55,7 +56,7 @@ infiniStatus_t eventRecord(infinirtEvent_t event, infinirtStream_t stream) { ...@@ -55,7 +56,7 @@ infiniStatus_t eventRecord(infinirtEvent_t event, infinirtStream_t stream) {
} }
infiniStatus_t eventQuery(infinirtEvent_t event, infinirtEventStatus_t *status_ptr) { infiniStatus_t eventQuery(infinirtEvent_t event, infinirtEventStatus_t *status_ptr) {
auto status = cnrtQueryNotifier((cnrtQueue_t)stream); auto status = cnrtQueryNotifier((cnrtNotifier_t)event);
if (status == cnrtSuccess) { if (status == cnrtSuccess) {
*status_ptr = INFINIRT_EVENT_COMPLETE; *status_ptr = INFINIRT_EVENT_COMPLETE;
} else if (status == cnrtErrorBusy) { } else if (status == cnrtErrorBusy) {
...@@ -112,12 +113,12 @@ cnrtMemTransDir_t toBangMemcpyKind(infinirtMemcpyKind_t kind) { ...@@ -112,12 +113,12 @@ cnrtMemTransDir_t toBangMemcpyKind(infinirtMemcpyKind_t kind) {
} }
infiniStatus_t memcpy(void *dst, const void *src, size_t size, infinirtMemcpyKind_t kind) { infiniStatus_t memcpy(void *dst, const void *src, size_t size, infinirtMemcpyKind_t kind) {
CHECK_BANGRT(cnrtMemcpy(dst, src, size, toBangMemcpyKind(kind))); CHECK_BANGRT(cnrtMemcpy(dst, (void *)src, size, toBangMemcpyKind(kind)));
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
infiniStatus_t memcpyAsync(void *dst, const void *src, size_t size, infinirtMemcpyKind_t kind, infinirtStream_t stream) { infiniStatus_t memcpyAsync(void *dst, const void *src, size_t size, infinirtMemcpyKind_t kind, infinirtStream_t stream) {
CHECK_BANGRT(cnrtMemcpyAsync_V2(dst, src, size, (cnrtQueue_t)stream, toBangMemcpyKind(kind))); CHECK_BANGRT(cnrtMemcpyAsync_V2(dst, (void *)src, size, (cnrtQueue_t)stream, toBangMemcpyKind(kind)));
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
#include "../infinirt_impl.h" #include "../infinirt_impl.h"
namespace infinirt::bang { namespace infinirt::bang {
#ifdef ENABLE_BANG_API #ifdef ENABLE_CAMBRICON_API
INFINIRT_DEVICE_API_IMPL INFINIRT_DEVICE_API_IMPL
#else #else
INFINIRT_DEVICE_API_NOOP INFINIRT_DEVICE_API_NOOP
......
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
#include "bang/infinirt_bang.h" #include "bang/infinirt_bang.h"
#include "cpu/infinirt_cpu.h" #include "cpu/infinirt_cpu.h"
#include "cuda/infinirt_cuda.cuh" #include "cuda/infinirt_cuda.cuh"
#include "kunlun/infinirt_kunlun.h"
#include "maca/infinirt_maca.h" #include "maca/infinirt_maca.h"
#include "musa/infinirt_musa.h" #include "musa/infinirt_musa.h"
...@@ -66,8 +67,11 @@ __C infiniStatus_t infinirtGetDevice(infiniDevice_t *device_ptr, int *device_id_ ...@@ -66,8 +67,11 @@ __C infiniStatus_t infinirtGetDevice(infiniDevice_t *device_ptr, int *device_id_
case INFINI_DEVICE_MOORE: \ case INFINI_DEVICE_MOORE: \
_status = infinirt::musa::API PARAMS; \ _status = infinirt::musa::API PARAMS; \
break; \ break; \
case INFINI_DEVICE_KUNLUN: \
_status = infinirt::kunlun::API PARAMS; \
break; \
default: \ default: \
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ _status = INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \
} \ } \
{ ACTION; } \ { ACTION; } \
return _status; \ return _status; \
......
import numpy as np
import gguf
from typing import List
from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides
def swiglu(
    a: np.ndarray,
    b: np.ndarray,
):
    """Reference SwiGLU: a * SiLU(b), where SiLU(b) = b * sigmoid(b)."""
    sigmoid_b = 1.0 / (1.0 + np.exp(-b))
    return a * (b * sigmoid_b)
class SwiGLUTestCase(InfiniopTestCase):
    """One swiglu test entry: inputs a/b, output c, optional per-tensor strides."""

    def __init__(
        self,
        a: np.ndarray,
        stride_a: List[int] | None,
        b: np.ndarray,
        stride_b: List[int] | None,
        c: np.ndarray,
        stride_c: List[int] | None,
    ):
        super().__init__("swiglu")
        self.a, self.stride_a = a, stride_a
        self.b, self.stride_b = b, stride_b
        self.c, self.stride_c = c, stride_c

    def write_test(self, test_writer: "InfiniopTestWriter"):
        """Serialize strides (when given), the three tensors, and a float64 reference answer."""
        super().write_test(test_writer)
        # Stride arrays are optional; only emit the ones that were provided.
        stride_entries = (
            ("a.strides", self.stride_a),
            ("b.strides", self.stride_b),
            ("c.strides", self.stride_c),
        )
        for key, strides in stride_entries:
            if strides is not None:
                test_writer.add_array(test_writer.gguf_key(key), strides)
        for key, tensor in (("a", self.a), ("b", self.b), ("c", self.c)):
            test_writer.add_tensor(
                test_writer.gguf_key(key), tensor, raw_dtype=np_dtype_to_ggml(tensor.dtype)
            )
        # Reference answer is computed in float64 so low-precision inputs
        # can be validated against a high-precision ground truth.
        ans = swiglu(self.a.astype(np.float64), self.b.astype(np.float64))
        test_writer.add_tensor(
            test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64
        )
if __name__ == "__main__":
    test_writer = InfiniopTestWriter("swiglu.gguf")

    # Each spec is (shape, dtype, a_strides, b_strides, c_strides).
    # A stride entry of None means the tensor keeps its natural contiguous layout;
    # otherwise the tuple is passed through gguf_strides().
    case_specs = [
        ((64, 128), np.float32, None, None, None),
        ((64, 121), np.float32, None, None, None),
        ((15, 512), np.float32, None, None, None),
        ((13, 4), np.float32, None, None, None),
        ((13, 4), np.float16, None, None, None),
        ((13, 4), np.float32, (10, 1), (10, 1), (10, 1)),
        ((13, 4), np.float16, (10, 1), (10, 1), (10, 1)),
        ((13, 4, 4), np.float32, None, None, None),
        ((13, 4, 4), np.float16, None, None, None),
        ((13, 4, 4), np.float32, (20, 4, 1), (20, 4, 1), (20, 4, 1)),
        ((13, 4, 4), np.float16, (20, 4, 1), (20, 4, 1), (20, 4, 1)),
        ((16, 5632), np.float32, None, None, None),
        ((16, 5632), np.float16, None, None, None),
        ((16, 5632), np.float32, (13312, 1), (13312, 1), (13312, 1)),
        ((16, 5632), np.float16, (13312, 1), (13312, 1), (13312, 1)),
        # Output written with a transposed (column-major-like) layout.
        ((16, 5632), np.float32, (5632, 1), (5632, 1), (1, 16)),
        ((16, 5632), np.float16, (5632, 1), (5632, 1), (1, 16)),
        ((2, 3, 400), np.float32, (1200, 400, 1), (1200, 400, 1), (1, 2, 6)),
        ((2, 3, 400), np.float16, (1200, 400, 1), (1200, 400, 1), (1, 2, 6)),
        ((4, 4, 5632), np.float32, None, None, None),
        ((4, 4, 5632), np.float16, None, None, None),
        ((4, 4, 5632), np.float32, (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
        ((4, 4, 5632), np.float16, (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
    ]

    def _encode_strides(strides):
        # Translate a spec stride tuple into the gguf encoding; pass None through.
        return None if strides is None else gguf_strides(*strides)

    test_cases = [
        SwiGLUTestCase(
            np.random.rand(*shape).astype(dtype),
            _encode_strides(sa),
            np.random.rand(*shape).astype(dtype),
            _encode_strides(sb),
            np.random.rand(*shape).astype(dtype),
            _encode_strides(sc),
        )
        for shape, dtype, sa, sb, sc in case_specs
    ]

    test_writer.add_tests(test_cases)
    test_writer.save()
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float, c_bool from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p
import ctypes import ctypes
import sys import sys
import os import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import ( from libinfiniop import (
open_lib, open_lib,
to_tensor, to_tensor,
CTensor,
DeviceEnum,
infiniopHandle_t, infiniopHandle_t,
infiniopTensorDescriptor_t, infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error, check_error,
rearrange_tensor, rearrange_tensor,
create_workspace, create_workspace,
get_args,
get_test_devices,
test_operator,
debug,
get_tolerance,
profile_operation,
) )
from operatorspy.tests.test_utils import get_args
import torch import torch
import torch.nn.functional as F
class AttentionDescriptor(Structure): class AttentionDescriptor(Structure):
...@@ -95,12 +95,13 @@ def test( ...@@ -95,12 +95,13 @@ def test(
pos, pos,
k_cache_buf_len, k_cache_buf_len,
v_cache_buf_len, v_cache_buf_len,
dtype=torch.float16,
q_stride=None, q_stride=None,
k_stride=None, k_stride=None,
v_stride=None, v_stride=None,
k_cache_stride=None, k_cache_stride=None,
v_cache_stride=None, v_cache_stride=None,
dtype=torch.float16,
sync=None,
): ):
print( print(
f"Testing Attention on {torch_device} with n_q_head:{n_q_head} n_kv_head:{n_kv_head} seq_len:{seq_len} head_dim:{head_dim} pos:{pos} " f"Testing Attention on {torch_device} with n_q_head:{n_q_head} n_kv_head:{n_kv_head} seq_len:{seq_len} head_dim:{head_dim} pos:{pos} "
...@@ -140,6 +141,9 @@ def test( ...@@ -140,6 +141,9 @@ def test(
k_cache_tensor = to_tensor(k_cache, lib) k_cache_tensor = to_tensor(k_cache, lib)
v_cache_tensor = to_tensor(v_cache, lib) v_cache_tensor = to_tensor(v_cache, lib)
if sync is not None:
sync()
descriptor = infiniopAttentionDescriptor_t() descriptor = infiniopAttentionDescriptor_t()
check_error( check_error(
lib.infiniopCreateAttentionDescriptor( lib.infiniopCreateAttentionDescriptor(
...@@ -156,12 +160,15 @@ def test( ...@@ -156,12 +160,15 @@ def test(
) )
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
out_tensor.descriptor.contents.invalidate() for tensor in [
q_tensor.descriptor.contents.invalidate() out_tensor,
k_tensor.descriptor.contents.invalidate() q_tensor,
v_tensor.descriptor.contents.invalidate() k_tensor,
k_cache_tensor.descriptor.contents.invalidate() v_tensor,
v_cache_tensor.descriptor.contents.invalidate() k_cache_tensor,
v_cache_tensor,
]:
tensor.destroyDesc(lib)
workspace_size = c_uint64(0) workspace_size = c_uint64(0)
check_error( check_error(
...@@ -169,152 +176,52 @@ def test( ...@@ -169,152 +176,52 @@ def test(
) )
workspace = create_workspace(workspace_size.value, out.device) workspace = create_workspace(workspace_size.value, out.device)
check_error( def lib_attention():
lib.infiniopAttention( check_error(
descriptor, lib.infiniopAttention(
workspace.data_ptr() if workspace is not None else None, descriptor,
workspace_size.value, workspace.data_ptr() if workspace is not None else None,
out_tensor.data, workspace_size.value,
q_tensor.data, out_tensor.data,
k_tensor.data, q_tensor.data,
v_tensor.data, k_tensor.data,
k_cache_tensor.data, v_tensor.data,
v_cache_tensor.data, k_cache_tensor.data,
None, v_cache_tensor.data,
None,
)
) )
)
assert torch.allclose(out, ans, atol=1e-4, rtol=1e-2) lib_attention()
check_error(lib.infiniopDestroyAttentionDescriptor(descriptor)) # Validate results
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
def test_cpu(lib, test_cases): debug(out, ans, atol=atol, rtol=rtol)
device = DeviceEnum.DEVICE_CPU assert torch.allclose(out, ans, atol=atol, rtol=rtol)
handle = create_handle(lib, device)
for (
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
) in test_cases:
test(
lib,
handle,
"cpu",
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
)
destroy_handle(lib, handle) # Profiling workflow
if PROFILE:
# fmt: off
def test_cuda(lib, test_cases): profile_operation("PyTorch", lambda: attention(q, k, v, k_cache, v_cache, pos), torch_device, NUM_PRERUN, NUM_ITERATIONS)
device = DeviceEnum.DEVICE_CUDA profile_operation(" lib", lambda: lib_attention(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
handle = create_handle(lib, device) # fmt: on
check_error(lib.infiniopDestroyAttentionDescriptor(descriptor))
for (
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
) in test_cases:
test(
lib,
handle,
"cuda",
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
)
destroy_handle(lib, handle)
def test_bang(lib, test_cases):
import torch_mlu
device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device)
for (
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
) in test_cases:
test(
lib,
handle,
"mlu",
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
)
destroy_handle(lib, handle)
if __name__ == "__main__": if __name__ == "__main__":
_TENSOR_DTYPES = [torch.float16, torch.float32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-4, "rtol": 1e-2},
torch.float32: {"atol": 1e-6, "rtol": 1e-4},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
test_cases = [ test_cases = [
# prefill # prefill
( (
...@@ -325,7 +232,6 @@ if __name__ == "__main__": ...@@ -325,7 +232,6 @@ if __name__ == "__main__":
0, # pos 0, # pos
2048, # k_cache_buf_len 2048, # k_cache_buf_len
2048, # v_cache_buf_len 2048, # v_cache_buf_len
torch.float16, # dtype
[64, 2560, 1], # q_stride [64, 2560, 1], # q_stride
[64, 2560, 1], # k_stride [64, 2560, 1], # k_stride
[64, 2560, 1], # v_stride [64, 2560, 1], # v_stride
...@@ -341,7 +247,6 @@ if __name__ == "__main__": ...@@ -341,7 +247,6 @@ if __name__ == "__main__":
3, # pos 3, # pos
2048, # k_cache_buf_len 2048, # k_cache_buf_len
2048, # v_cache_buf_len 2048, # v_cache_buf_len
torch.float16, # dtype
[64, 2560, 1], # q_stride [64, 2560, 1], # q_stride
[64, 2560, 1], # k_stride [64, 2560, 1], # k_stride
[64, 2560, 1], # v_stride [64, 2560, 1], # v_stride
...@@ -357,7 +262,6 @@ if __name__ == "__main__": ...@@ -357,7 +262,6 @@ if __name__ == "__main__":
1, # pos 1, # pos
8, # k_cache_buf_len 8, # k_cache_buf_len
8, # v_cache_buf_len 8, # v_cache_buf_len
torch.float16, # dtype
None, # q_stride None, # q_stride
None, # k_stride None, # k_stride
None, # v_stride None, # v_stride
...@@ -406,12 +310,13 @@ if __name__ == "__main__": ...@@ -406,12 +310,13 @@ if __name__ == "__main__":
infiniopAttentionDescriptor_t, infiniopAttentionDescriptor_t,
] ]
if args.cpu: # Configure testing options
test_cpu(lib, test_cases) DEBUG = args.debug
if args.cuda: PROFILE = args.profile
test_cuda(lib, test_cases) NUM_PRERUN = args.num_prerun
if args.bang: NUM_ITERATIONS = args.num_iterations
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang): # Execute tests
test_cpu(lib, test_cases) for device in get_test_devices(args):
test_operator(lib, device, test, test_cases, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m") print("\033[92mTest passed!\033[0m")
...@@ -88,6 +88,7 @@ def test( ...@@ -88,6 +88,7 @@ def test(
padding, padding,
strides, strides,
tensor_dtype=torch.float16, tensor_dtype=torch.float16,
sync=None
): ):
print( print(
f"Testing AvgPool on {torch_device} with x_shape:{x_shape} kernel_shape:{k_shape} padding:{padding} strides:{strides} dtype:{tensor_dtype}" f"Testing AvgPool on {torch_device} with x_shape:{x_shape} kernel_shape:{k_shape} padding:{padding} strides:{strides} dtype:{tensor_dtype}"
...@@ -109,6 +110,10 @@ def test( ...@@ -109,6 +110,10 @@ def test(
x_tensor = to_tensor(x, lib) x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib) y_tensor = to_tensor(y, lib)
if sync is not None:
sync()
descriptor = infiniopAvgPoolDescriptor_t() descriptor = infiniopAvgPoolDescriptor_t()
check_error( check_error(
......
...@@ -37,7 +37,7 @@ _TENSOR_DTYPES = [torch.float16] ...@@ -37,7 +37,7 @@ _TENSOR_DTYPES = [torch.float16]
# Tolerance map for different data types # Tolerance map for different data types
_TOLERANCE_MAP = { _TOLERANCE_MAP = {
torch.float16: {"atol": 0, "rtol": 1e-2}, torch.float16: {"atol": 1e-3, "rtol": 1e-2},
} }
...@@ -87,6 +87,7 @@ def test( ...@@ -87,6 +87,7 @@ def test(
y_stride=None, y_stride=None,
inplace=Inplace.OUT_OF_PLACE, inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float16, dtype=torch.float16,
sync=None
): ):
print( print(
f"Testing CausalSoftmax on {torch_device} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{dtype} inplace:{inplace}" f"Testing CausalSoftmax on {torch_device} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{dtype} inplace:{inplace}"
...@@ -107,6 +108,9 @@ def test( ...@@ -107,6 +108,9 @@ def test(
y = torch.zeros(shape, dtype=dtype).to(torch_device) y = torch.zeros(shape, dtype=dtype).to(torch_device)
y = rearrange_if_needed(y, y_stride) y = rearrange_if_needed(y, y_stride)
y_tensor = to_tensor(y, lib) y_tensor = to_tensor(y, lib)
if sync is not None:
sync()
descriptor = infiniopCausalSoftmaxDescriptor_t() descriptor = infiniopCausalSoftmaxDescriptor_t()
check_error( check_error(
...@@ -139,6 +143,9 @@ def test( ...@@ -139,6 +143,9 @@ def test(
) )
lib_causal_softmax() lib_causal_softmax()
if sync is not None:
sync()
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG: if DEBUG:
......
...@@ -95,6 +95,7 @@ def test( ...@@ -95,6 +95,7 @@ def test(
dilations, dilations,
tensor_stride=None, tensor_stride=None,
tensor_dtype=torch.float16, tensor_dtype=torch.float16,
sync=None
): ):
assert len(pads) == len(strides) == len(dilations) assert len(pads) == len(strides) == len(dilations)
print( print(
...@@ -118,8 +119,11 @@ def test( ...@@ -118,8 +119,11 @@ def test(
x_tensor = to_tensor(x, lib) x_tensor = to_tensor(x, lib)
w_tensor = to_tensor(w, lib) w_tensor = to_tensor(w, lib)
y_tensor = to_tensor(y, lib) y_tensor = to_tensor(y, lib)
descriptor = infiniopConvDescriptor_t()
if sync is not None:
sync()
descriptor = infiniopConvDescriptor_t()
check_error( check_error(
lib.infiniopCreateConvDescriptor( lib.infiniopCreateConvDescriptor(
handle, handle,
......
...@@ -52,6 +52,7 @@ def test( ...@@ -52,6 +52,7 @@ def test(
y_stride=None, y_stride=None,
x_stride=None, x_stride=None,
tensor_dtype=torch.float16, tensor_dtype=torch.float16,
sync=None
): ):
print( print(
f"Testing Expand on {torch_device} with x_shape:{x_shape} y_shape:{y_shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{tensor_dtype}" f"Testing Expand on {torch_device} with x_shape:{x_shape} y_shape:{y_shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{tensor_dtype}"
...@@ -76,8 +77,11 @@ def test( ...@@ -76,8 +77,11 @@ def test(
x_tensor = to_tensor(x, lib) x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib) y_tensor = to_tensor(y, lib)
descriptor = infiniopExpandDescriptor_t()
if sync is not None:
sync()
descriptor = infiniopExpandDescriptor_t()
check_error( check_error(
lib.infiniopCreateExpandDescriptor( lib.infiniopCreateExpandDescriptor(
handle, handle,
......
...@@ -83,6 +83,7 @@ def test( ...@@ -83,6 +83,7 @@ def test(
b_stride=None, b_stride=None,
c_stride=None, c_stride=None,
dtype=torch.float16, dtype=torch.float16,
sync=None
): ):
print( print(
f"Testing Gemm on {torch_device} with alpha:{alpha}, beta:{beta}," f"Testing Gemm on {torch_device} with alpha:{alpha}, beta:{beta},"
...@@ -104,6 +105,9 @@ def test( ...@@ -104,6 +105,9 @@ def test(
] ]
a_tensor, b_tensor, c_tensor = [to_tensor(tensor, lib) for tensor in [a, b, c]] a_tensor, b_tensor, c_tensor = [to_tensor(tensor, lib) for tensor in [a, b, c]]
if sync is not None:
sync()
descriptor = infiniopGemmDescriptor_t() descriptor = infiniopGemmDescriptor_t()
check_error( check_error(
lib.infiniopCreateGemmDescriptor( lib.infiniopCreateGemmDescriptor(
......
...@@ -51,6 +51,7 @@ def test( ...@@ -51,6 +51,7 @@ def test(
torch_device, torch_device,
x_shape, x_shape,
tensor_dtype=torch.float16, tensor_dtype=torch.float16,
sync=None
): ):
print( print(
f"Testing GlobalAvgPool on {torch_device} with input tensor_shape: {x_shape} dtype: {tensor_dtype}" f"Testing GlobalAvgPool on {torch_device} with input tensor_shape: {x_shape} dtype: {tensor_dtype}"
...@@ -70,8 +71,11 @@ def test( ...@@ -70,8 +71,11 @@ def test(
x_tensor = to_tensor(x, lib) x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib) y_tensor = to_tensor(y, lib)
descriptor = infiniopGlobalAvgPoolDescriptor_t()
if sync is not None:
sync()
descriptor = infiniopGlobalAvgPoolDescriptor_t()
check_error( check_error(
lib.infiniopCreateGlobalAvgPoolDescriptor( lib.infiniopCreateGlobalAvgPoolDescriptor(
handle, handle,
......
...@@ -423,6 +423,7 @@ def test_operator(lib, device, test_func, test_cases, tensor_dtypes): ...@@ -423,6 +423,7 @@ def test_operator(lib, device, test_func, test_cases, tensor_dtypes):
infiniDeviceEnum_str_map[device], infiniDeviceEnum_str_map[device],
*test_case, *test_case,
tensor_dtype, tensor_dtype,
get_sync_func(device)
) )
finally: finally:
destroy_handle(lib, handle) destroy_handle(lib, handle)
...@@ -471,3 +472,15 @@ def get_test_devices(args): ...@@ -471,3 +472,15 @@ def get_test_devices(args):
devices_to_test = [InfiniDeviceEnum.CPU] devices_to_test = [InfiniDeviceEnum.CPU]
return devices_to_test return devices_to_test
def get_sync_func(device):
    """Return the torch synchronize callable for `device`.

    For CPU there is nothing to synchronize, so None is returned;
    otherwise the backend module named by the device string (e.g.
    torch.cuda) provides its `synchronize` function.
    """
    import torch

    backend_name = infiniDeviceEnum_str_map[device]
    if device == InfiniDeviceEnum.CPU:
        return None
    return getattr(torch, backend_name).synchronize
...@@ -83,6 +83,7 @@ def test( ...@@ -83,6 +83,7 @@ def test(
padding, padding,
strides, strides,
tensor_dtype=torch.float16, tensor_dtype=torch.float16,
sync=None
): ):
print( print(
f"Testing MaxPool on {torch_device} with x_shape:{x_shape} kernel_shape:{k_shape} padding:{padding} strides:{strides} dtype:{tensor_dtype}" f"Testing MaxPool on {torch_device} with x_shape:{x_shape} kernel_shape:{k_shape} padding:{padding} strides:{strides} dtype:{tensor_dtype}"
...@@ -104,8 +105,11 @@ def test( ...@@ -104,8 +105,11 @@ def test(
x_tensor = to_tensor(x, lib) x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib) y_tensor = to_tensor(y, lib)
descriptor = infiniopMaxPoolDescriptor_t()
if sync is not None:
sync()
descriptor = infiniopMaxPoolDescriptor_t()
check_error( check_error(
lib.infiniopCreateMaxPoolDescriptor( lib.infiniopCreateMaxPoolDescriptor(
handle, handle,
......
...@@ -65,6 +65,7 @@ def test( ...@@ -65,6 +65,7 @@ def test(
y_stride=None, y_stride=None,
w12_stride=None, w12_stride=None,
w3_stride=None, w3_stride=None,
sync=None
): ):
print( print(
f"Testing MLP on {torch_device} with num_tokens:{num_tokens} hidden_size:{hidden_size} intermediate_size:{intermediate_size}" f"Testing MLP on {torch_device} with num_tokens:{num_tokens} hidden_size:{hidden_size} intermediate_size:{intermediate_size}"
...@@ -97,6 +98,10 @@ def test( ...@@ -97,6 +98,10 @@ def test(
x_tensor = to_tensor(x, lib) x_tensor = to_tensor(x, lib)
w12_tensor = to_tensor(w12, lib) w12_tensor = to_tensor(w12, lib)
w3_tensor = to_tensor(w3, lib) w3_tensor = to_tensor(w3, lib)
if sync is not None:
sync()
descriptor = infiniopMLPDescriptor_t() descriptor = infiniopMLPDescriptor_t()
check_error( check_error(
lib.infiniopCreateMLPDescriptor( lib.infiniopCreateMLPDescriptor(
......
...@@ -103,6 +103,7 @@ def test( ...@@ -103,6 +103,7 @@ def test(
topk, topk,
temperature, temperature,
dtype=torch.float16, dtype=torch.float16,
sync=None
): ):
print( print(
f"Testing RandomSample on {torch_device} with voc:{voc} random_val:{random_val} topp:{topp} topk:{topk} temperature:{temperature} dtype:{dtype}" f"Testing RandomSample on {torch_device} with voc:{voc} random_val:{random_val} topp:{topp} topk:{topk} temperature:{temperature} dtype:{dtype}"
...@@ -122,6 +123,9 @@ def test( ...@@ -122,6 +123,9 @@ def test(
indices_tensor.descriptor.contents.dt = InfiniDtype.U64 # treat int64 as uint64 indices_tensor.descriptor.contents.dt = InfiniDtype.U64 # treat int64 as uint64
if sync is not None:
sync()
descriptor = infiniopRandomSampleDescriptor_t() descriptor = infiniopRandomSampleDescriptor_t()
check_error( check_error(
lib.infiniopCreateRandomSampleDescriptor( lib.infiniopCreateRandomSampleDescriptor(
......
...@@ -131,6 +131,7 @@ def test( ...@@ -131,6 +131,7 @@ def test(
x_stride, x_stride,
y_stride, y_stride,
dtype=torch.float16, dtype=torch.float16,
sync=None
): ):
print( print(
f"Testing Rerrange on {torch_device} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{dtype}" f"Testing Rerrange on {torch_device} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{dtype}"
...@@ -145,6 +146,9 @@ def test( ...@@ -145,6 +146,9 @@ def test(
] ]
x_tensor, y_tensor = [to_tensor(tensor, lib) for tensor in [x, y]] x_tensor, y_tensor = [to_tensor(tensor, lib) for tensor in [x, y]]
if sync is not None:
sync()
descriptor = infiniopRearrangeDescriptor_t() descriptor = infiniopRearrangeDescriptor_t()
check_error( check_error(
......
...@@ -55,6 +55,7 @@ def test( ...@@ -55,6 +55,7 @@ def test(
tensor_shape, tensor_shape,
tensor_dtype=torch.float16, tensor_dtype=torch.float16,
inplace=Inplace.OUT_OF_PLACE, inplace=Inplace.OUT_OF_PLACE,
sync=None
): ):
print( print(
f"Testing Relu on {torch_device} with tensor_shape:{tensor_shape} dtype:{tensor_dtype} inplace: {inplace.name}" f"Testing Relu on {torch_device} with tensor_shape:{tensor_shape} dtype:{tensor_dtype} inplace: {inplace.name}"
...@@ -78,8 +79,11 @@ def test( ...@@ -78,8 +79,11 @@ def test(
x_tensor = to_tensor(x, lib) x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib) if inplace == Inplace.OUT_OF_PLACE else x_tensor y_tensor = to_tensor(y, lib) if inplace == Inplace.OUT_OF_PLACE else x_tensor
descriptor = infiniopReluDescriptor_t()
if sync is not None:
sync()
descriptor = infiniopReluDescriptor_t()
check_error( check_error(
lib.infiniopCreateReluDescriptor( lib.infiniopCreateReluDescriptor(
handle, handle,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment