Commit 2f2a74b6 authored by Zimin Li's avatar Zimin Li
Browse files

Merge remote-tracking branch 'upstream/main'

parents 1d95ddf3 70806eed
#ifndef __RANDOM_SAMPLE_H__
#define __RANDOM_SAMPLE_H__

#include "../../../utils.h"
#include "../../operator.h"

// Declares the random_sample Descriptor class inside
// namespace op::random_sample::<NAMESPACE>.  Each device backend expands
// DESCRIPTOR(<backend>) in its own header and implements the declared
// members (create / calculate / minWorkspaceSize / destructor) in its
// backend source file.  Opaque holds backend-private state.
// NOTE(review): __RANDOM_SAMPLE_H__ is a reserved identifier (leading double
// underscore); a rename such as RANDOM_SAMPLE_H_ would be more conformant.
#define DESCRIPTOR(NAMESPACE) \
    \
    namespace op::random_sample::NAMESPACE { \
    class Descriptor final : public InfiniopDescriptor { \
        struct Opaque; \
        Opaque *_opaque; \
        \
        infiniDtype_t _dt_i, _dt_p; \
        size_t _n, _min_workspace_size; \
        \
        Descriptor( \
            infiniDtype_t dt_i, \
            infiniDtype_t dt_p, \
            size_t n, \
            size_t min_workspace_size, \
            Opaque *opaque, \
            infiniDevice_t device_type, \
            int device_id) \
            : InfiniopDescriptor{device_type, device_id}, \
              _opaque(opaque), \
              _dt_i(dt_i), \
              _dt_p(dt_p), \
              _n(n), \
              _min_workspace_size(min_workspace_size) {} \
        \
    public: \
        ~Descriptor(); \
        \
        static infiniStatus_t create( \
            infiniopHandle_t handle, \
            Descriptor **desc_ptr, \
            infiniopTensorDescriptor_t result_desc, \
            infiniopTensorDescriptor_t probs_desc); \
        \
        size_t minWorkspaceSize() const; \
        \
        infiniStatus_t calculate( \
            void *workspace, \
            size_t workspace_size, \
            void *result, \
            const void *probs, \
            float random_val, \
            float topp, \
            int topk, \
            float temperature, \
            void *stream) const; \
    }; \
    }

#endif // __RANDOM_SAMPLE_H__
#include "../../../devices/cuda/cuda_common.cuh"
#include "rms_norm_cuda.cuh"
#include "rms_norm_kernel.cuh"
#include <memory>
#include <stdint.h>
namespace op::rms_norm::cuda {
// Backend-private state for the CUDA rms_norm descriptor: a shared handle to
// the runtime's per-device internals (queried for device limits at launch).
struct Descriptor::Opaque {
    std::shared_ptr<device::cuda::Handle::Internal> internal;
};

// Releases the backend-private state allocated in create().
Descriptor::~Descriptor() {
    delete _opaque;
}
// Validates the tensor descriptors and allocates a CUDA rms_norm Descriptor.
// On success *desc_ptr owns a new Descriptor; status codes report failures.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    infiniopTensorDescriptor_t w_desc,
    float epsilon) {
    RMSNormInfo info;
    CHECK_STATUS(createRMSNormInfo(&info, y_desc, x_desc, w_desc, epsilon));
    // only support contiguous last dimension
    // NOTE(review): indexing strides[1] assumes rank-2 x/y — presumably
    // enforced inside createRMSNormInfo; confirm.
    if (info.x_strides[1] != 1 || info.y_strides[1] != 1) {
        return INFINI_STATUS_BAD_TENSOR_STRIDES;
    }
    // Workspace size is 0: the kernel needs no scratch memory.
    *desc_ptr = new Descriptor(
        new Opaque{reinterpret_cast<device::cuda::Handle *>(handle)->internal()},
        info, 0, handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
// launch kernel with different data types
// Dispatches rmsnormBlock over the supported (activation, weight) dtype
// pairs: one CUDA block per row, BLOCK_SIZE threads per block, on the given
// stream.  Returns INFINI_STATUS_BAD_TENSOR_DTYPE for unsupported pairs.
// NOTE(review): no cudaGetLastError() after the launch — launch-configuration
// errors are silently dropped here; consider checking at a coarser boundary.
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(
    uint32_t batch_size, size_t dim,
    void *y, infiniDtype_t atype, ptrdiff_t stride_y,
    const void *x, ptrdiff_t stride_x,
    const void *w, infiniDtype_t wtype,
    float epsilon,
    cudaStream_t cuda_stream) {
// Instantiates and launches the kernel for concrete data/weight/compute types.
#define LAUNCH_KERNEL(Tdata, Tweight, Tcompute) \
    rmsnormBlock<BLOCK_SIZE, Tdata, Tweight, Tcompute><<<batch_size, BLOCK_SIZE, 0, cuda_stream>>>( \
        reinterpret_cast<Tdata *>(y), \
        stride_y, \
        reinterpret_cast<const Tdata *>(x), \
        stride_x, \
        reinterpret_cast<const Tweight *>(w), \
        dim, \
        epsilon)
    // The compute type is always float so f16 inputs accumulate in f32.
    if (atype == INFINI_DTYPE_F16 && wtype == INFINI_DTYPE_F16) {
        LAUNCH_KERNEL(half, half, float);
    } else if (atype == INFINI_DTYPE_F16 && wtype == INFINI_DTYPE_F32) {
        LAUNCH_KERNEL(half, float, float);
    } else if (atype == INFINI_DTYPE_F32 && wtype == INFINI_DTYPE_F32) {
        LAUNCH_KERNEL(float, float, float);
    } else {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
#undef LAUNCH_KERNEL
    return INFINI_STATUS_SUCCESS;
}
// Runs RMS normalization on the given stream: one CUDA block per row
// (batch element), dispatched at the largest block size the device supports.
// Returns INFINI_STATUS_INSUFFICIENT_WORKSPACE if the caller-provided
// workspace is too small, and DEVICE_ARCHITECTURE_NOT_SUPPORTED when the
// device cannot run even a 512-thread block.
infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
                                     void *y, const void *x, const void *w, void *stream) {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    auto stride_x = _info.x_strides[0];
    auto stride_y = _info.y_strides[0];
    auto dim = _info.dim();
    uint32_t batch_size = static_cast<uint32_t>(_info.shape[0]);
    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
    // Query the device limit once (previously queried per branch) and pick
    // the largest supported block size.  `>=` instead of `==` also accepts
    // devices whose per-block thread limit falls between the two supported
    // sizes (e.g. 768), which the exact-match dispatch rejected.
    const auto max_threads_per_block = _opaque->internal->maxThreadsPerBlock();
    if (max_threads_per_block >= CUDA_BLOCK_SIZE_1024) {
        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_1024>(batch_size, dim, y, _info.atype, stride_y, x, stride_x, w, _info.wtype, _info.epsilon, cuda_stream));
    } else if (max_threads_per_block >= CUDA_BLOCK_SIZE_512) {
        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_512>(batch_size, dim, y, _info.atype, stride_y, x, stride_x, w, _info.wtype, _info.epsilon, cuda_stream));
    } else {
        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
    }
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::rms_norm::cuda
#ifndef __RMS_NORM_CUDA_H__
#define __RMS_NORM_CUDA_H__
#include "../rms_norm.h"
DESCRIPTOR(cuda)
#endif
#ifndef __RMS_NORM_CUDA_KERNEL_H__
#define __RMS_NORM_CUDA_KERNEL_H__
#include "../../../devices/cuda/cuda_common.cuh"
#include <cub/block/block_reduce.cuh>
// RMS-norm kernel: y[i] = x[i] * w[i] / sqrt(mean(x^2) + epsilon).
// Launch layout: one block per row (gridDim.x = batch), BLOCK_SIZE threads
// per block; each thread strides through the row by BLOCK_SIZE.
// stride_y / stride_x are the row strides; the last dimension must be
// contiguous (enforced by Descriptor::create).
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tweight, typename Tcompute>
INFINIOP_CUDA_KERNEL rmsnormBlock(
    Tdata *__restrict__ y,
    ptrdiff_t stride_y,
    const Tdata *__restrict__ x,
    ptrdiff_t stride_x,
    const Tweight *__restrict__ w,
    size_t dim,
    float epsilon) {
    // Each block takes care of a row of continuous data of length dim
    // Each thread deals with every block_size element in the row
    auto y_ptr = y + blockIdx.x * stride_y;
    auto x_ptr = x + blockIdx.x * stride_x;
    auto w_ptr = w;

    // Block-reduce sum of x^2 (CUB returns the full aggregate on thread 0,
    // which is the only thread that reads it below).
    Tcompute ss = op::common_cuda::reduce_op::sumSquared<BLOCK_SIZE, Tdata, Tcompute>(x_ptr, dim);

    // Thread 0 computes rms = 1/sqrt(ss/dim + epsilon) and publishes it.
    // BUGFIX: keep the value in Tcompute.  The previous Tdata(...) cast
    // truncated the scale to half precision before it was stored in the
    // Tcompute shared slot, needlessly degrading accuracy for f16 inputs.
    __shared__ Tcompute rms;
    if (threadIdx.x == 0) {
        rms = Tcompute(rsqrtf(ss / Tcompute(dim) + epsilon));
    }
    __syncthreads();

    for (size_t i = threadIdx.x; i < dim; i += BLOCK_SIZE) {
        y_ptr[i] = Tdata(Tcompute(x_ptr[i]) * Tcompute(w_ptr[i]) * rms);
    }
}
#endif
......@@ -5,6 +5,9 @@
#ifdef ENABLE_CPU_API
#include "cpu/rms_norm_cpu.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/rms_norm_cuda.cuh"
#endif
__C infiniStatus_t infiniopCreateRMSNormDescriptor(
infiniopHandle_t handle,
......@@ -28,10 +31,8 @@ __C infiniStatus_t infiniopCreateRMSNormDescriptor(
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu: {
return cudaCreateRMSNormDescriptor((CudaHandle_t)handle, (RMSNormCudaDescriptor_t *)desc_ptr, y_desc, x_desc, w_desc, epsilon);
}
#ifdef ENABLE_CUDA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda)
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
......@@ -76,11 +77,8 @@ __C infiniStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t d
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu: {
return cudaGetRMSNormWorkspaceSize((RMSNormCudaDescriptor_t)desc, size);
}
#ifdef ENABLE_CUDA_API
GET(INFINI_DEVICE_NVIDIA, cuda)
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
......@@ -122,11 +120,8 @@ __C infiniStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *works
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu: {
return cudaRMSNorm((RMSNormCudaDescriptor_t)desc, workspace, workspace_size, y, x, w, stream);
}
#ifdef ENABLE_CUDA_API
CALCULATE(INFINI_DEVICE_NVIDIA, cuda)
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
......@@ -172,11 +167,8 @@ __C infiniStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_t
#ifdef ENABLE_CPU_API
DESTROY(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu: {
return cudaDestroyRMSNormDescriptor((RMSNormCudaDescriptor_t)desc);
}
#ifdef ENABLE_CUDA_API
DESTROY(INFINI_DEVICE_NVIDIA, cuda)
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
......
......@@ -51,6 +51,10 @@ inline infiniStatus_t createRMSNormInfo(RMSNormInfo *info, infiniopTensorDescrip
return INFINI_STATUS_BAD_TENSOR_SHAPE;
}
if (w_desc->stride(0) != 1) {
return INFINI_STATUS_BAD_TENSOR_STRIDES;
}
info->shape = std::move(y_desc->shape());
info->y_strides = std::move(y_desc->strides());
info->x_strides = std::move(x_desc->strides());
......
#ifndef __INFINIOP_REDUCE_CUDA_H__
#define __INFINIOP_REDUCE_CUDA_H__
#include <cub/block/block_reduce.cuh>
namespace op::common_cuda::reduce_op {
// Block-wide sum of squares over count elements.
// Per CUB BlockReduce semantics, only thread 0 receives the full aggregate;
// other threads' return values are unspecified.  All BLOCK_SIZE threads must
// call this (it contains a block-level reduction barrier).
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
__device__ __forceinline__ Tcompute sumSquared(const Tdata *data_ptr, size_t count) {
    // Each thread accumulates a partial sum over its strided slice.
    Tcompute ss = 0;
    for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) {
        // BUGFIX: widen to Tcompute BEFORE squaring.  Squaring in Tdata
        // (e.g. half) overflows to inf for |x| > ~255 and loses precision
        // even when it does not overflow.
        Tcompute v = Tcompute(data_ptr[i]);
        ss += v * v;
    }
    // Use CUB block-level reduction to combine the partials.
    using BlockReduce = cub::BlockReduce<Tcompute, BLOCK_SIZE>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    return BlockReduce(temp_storage).Sum(ss);
}
} // namespace op::common_cuda::reduce_op
#endif
......@@ -45,28 +45,26 @@ size_t check_equal(
return fails;
}
int test_transpose_2d() {
std::vector<size_t> shape = {3, 5};
std::vector<ptrdiff_t> strides_a = {5, 1};
std::vector<ptrdiff_t> strides_b = {1, 3};
int test_transpose_any(size_t index, std::vector<size_t> shape, std::vector<ptrdiff_t> strides_a, std::vector<ptrdiff_t> strides_b) {
auto numel = std::accumulate(shape.begin(), shape.end(), (size_t)1, std::multiplies<size_t>());
std::vector<float> a(numel);
std::vector<float> b(numel);
for (size_t i = 0; i < numel; i++) {
a[i] = i / numel;
a[i] = (float)i / numel;
}
utils::rearrange(b.data(), a.data(), shape.data(), strides_b.data(), strides_a.data(), 2, sizeof(float));
if (check_equal<float>(a.data(), b.data(), shape, strides_a, strides_b)) {
utils::rearrange(b.data(), a.data(), shape.data(), strides_b.data(), strides_a.data(), shape.size(), sizeof(float));
auto fails = check_equal<float>(a.data(), b.data(), shape, strides_a, strides_b);
if (fails > 0) {
std::cout << "test_transpose " << index << " failed" << std::endl;
return 1;
} else {
std::cout << "test_transpose_2d passed" << std::endl;
std::cout << "test_transpose " << index << " passed" << std::endl;
return 0;
}
return 0;
}
int test_rearrange() {
return test_transpose_2d();
return test_transpose_any(1, {3, 5}, {5, 1}, {1, 3})
+ test_transpose_any(2, {1, 2048}, {2048, 1}, {2048, 1});
}
......@@ -17,4 +17,17 @@
#define CHECK_STATUS(API) CHECK_API_OR(API, INFINI_STATUS_SUCCESS, return api_result_)
// Verifies that dtype DT is one of the listed supported dtypes; otherwise
// returns INFINI_STATUS_BAD_TENSOR_DTYPE from the enclosing function.
// Usage: CHECK_DTYPE(desc->dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
#define CHECK_DTYPE(DT, ...) \
    do { \
        auto found_supported_dtype = false; \
        for (auto dt : {__VA_ARGS__}) { \
            if (dt == DT) { \
                found_supported_dtype = true; \
                break; \
            } \
        } \
        CHECK_API_OR(found_supported_dtype, true, \
                     return INFINI_STATUS_BAD_TENSOR_DTYPE); \
    } while (0)
#endif // INFINIUTILS_CHECK_H
......@@ -46,6 +46,7 @@ std::optional<RearrangeMeta> RearrangeMeta::create(
}
return std::abs(a.dst) > std::abs(b.dst);
});
ndim = dims.size();
// # 合并连续维度
// ## 合并末尾连续维度到 unit
for (auto it = dims.rbegin(); it != dims.rend(); ++it) {
......
# InfiniOP 测例生成
## 介绍
使用 python 脚本生成包含测例的 `.gguf` 文件,并使用 `infiniop-test` 程序进行测试。
## 运行方式
- 编译 `infiniop-test` 程序
```bash
xmake build infiniop-test
```
- 生成测例
在 `/test/infiniop-test/` 目录执行矩阵乘测例生成脚本,执行结束以后会在 `/test/infiniop-test/` 目录生成 `gemm.gguf` 测例文件。
```bash
cd /test/infiniop-test/
python -m test_generate.testcases.gemm
```
- 测试测例
打印测试程序用法
```bash
infiniop-test --help
```
示例:在CPU上测试`gemm.gguf`测例文件,预热20次,测试1000次。
```bash
infiniop-test gemm.gguf --cpu --warmup 20 --run 1000
```
## 自定义测例
### GGUF文件格式
```text
GGUF File Contents:
Version: 3
Number of Meta KVs: 8
Number of Tensors: 4
Meta KVs:
Key: general.architecture, Type: GGUF_TYPE_STRING, Value: infiniop-test
Key: test_count, Type: GGUF_TYPE_UINT64, Value: 1
Key: test.0.op_name, Type: GGUF_TYPE_STRING, Value: matmul
Key: test.0.a.strides, Type: GGUF_TYPE_INT32, Value: [1, 5]
Key: test.0.b.strides, Type: GGUF_TYPE_INT32, Value: [1, 6]
Key: test.0.c.strides, Type: GGUF_TYPE_INT32, Value: [1, 6]
Key: test.0.alpha, Type: GGUF_TYPE_FLOAT32, Value: 1.000000
Key: test.0.beta, Type: GGUF_TYPE_FLOAT32, Value: 0.000000
Tensor INFOs:
Name: test.0.a, NDims: 2, Shape: [5, 4], DataType: F32, DataOffset: 0
Name: test.0.b, NDims: 2, Shape: [6, 5], DataType: F32, DataOffset: 96
Name: test.0.c, NDims: 2, Shape: [6, 4], DataType: F32, DataOffset: 224
Name: test.0.ans, NDims: 2, Shape: [6, 4], DataType: F64, DataOffset: 320
```
- `Meta` 中必须包含 `test_count` ,表示测例数量。
- 每个测例的 `Meta` 和 `Tensor` 名字以 `test.[id].` 开头,后接具体信息名称。数字 `[id]` 表示测例编号。编号必须为 0 到 `test_count - 1`。
- `Tensor` 名字接 `.strides` 表示步长,若没有则默认为连续。
from .infiniop_test import InfiniopTestCase, InfiniopTestWriter, np_dtype_to_ggml, gguf_strides
import gguf
from typing import List
import numpy as np
from gguf import GGMLQuantizationType
def np_dtype_to_ggml(tensor_dtype: np.dtype):
    """Map a numpy dtype to the corresponding GGML quantization type.

    Raises:
        ValueError: if the dtype is outside the supported set.
    """
    supported = (
        (np.float16, GGMLQuantizationType.F16),
        (np.float32, GGMLQuantizationType.F32),
        (np.float64, GGMLQuantizationType.F64),
        (np.int8, GGMLQuantizationType.I8),
        (np.int16, GGMLQuantizationType.I16),
        (np.int32, GGMLQuantizationType.I32),
        (np.int64, GGMLQuantizationType.I64),
    )
    for np_type, ggml_type in supported:
        if tensor_dtype == np_type:
            return ggml_type
    raise ValueError(
        "Only F16, F32, F64, I8, I16, I32, I64 tensors are supported for now"
    )
def gguf_strides(*args: int) -> list[int] | None:
return list(args)[::-1] if args else None
class InfiniopTestCase:
    """Base class for a single serialized test case.

    Subclasses override write_test() to append their tensors and attributes
    after calling the base implementation.
    """

    # Operator name written under the "test.<id>.op_name" key.
    op_name: str

    def __init__(self, op_name: str):
        self.op_name = op_name

    def write_test(self, test_writer: "InfiniopTestWriter"):
        """Serialize this case's metadata through the given writer."""
        key = test_writer.gguf_key("op_name")
        test_writer.add_string(key, self.op_name)
class InfiniopTestWriter(gguf.GGUFWriter):
    """GGUF writer that queues InfiniopTestCase objects and serializes them.

    Keys produced during a case's write_test() are namespaced via gguf_key(),
    which uses the index of the case currently being written.
    """

    _test_cases: List["InfiniopTestCase"]
    _written_tests = 0  # index of the case currently being serialized

    def __init__(self, filepath):
        super().__init__(filepath, "infiniop-test")
        self._test_cases = []
        self._written_tests = 0

    def add_test(self, test_case: "InfiniopTestCase"):
        """Queue one test case for serialization."""
        self._test_cases.append(test_case)

    def add_tests(self, test_cases: List["InfiniopTestCase"]):
        """Queue several test cases at once."""
        self._test_cases.extend(test_cases)

    def gguf_key(self, name: str) -> str:
        """Namespace a key under the test currently being written."""
        return f"test.{self._written_tests}.{name}"

    def save(self):
        """Write test_count, serialize every queued case, and flush the file."""
        super().add_uint64("test_count", len(self._test_cases))
        for case in self._test_cases:
            case.write_test(self)
            self._written_tests += 1
        super().write_header_to_file()
        super().write_kv_data_to_file()
        super().write_tensors_to_file()
        super().close()
from ast import List
import numpy as np
import gguf
from typing import List
from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides
def gemm(
    a: np.ndarray,
    b: np.ndarray,
    alpha: float = 1.0,
    c: np.ndarray = None,
    beta: float = 0.0,
):
    """Reference GEMM: alpha * (a @ b), plus beta * c when c is provided."""
    product = alpha * np.matmul(a, b)
    if c is None:
        return product
    return product + beta * c
def random_tensor(shape, dtype):
    """Uniform random tensor of the given shape/dtype with values in [-5e-4, 5e-4)."""
    rate = 1e-3
    offset = 0.5 * rate  # recenters the [0, rate) samples around zero
    samples = np.random.rand(*shape).astype(dtype)
    return rate * samples - offset
class GemmTestCase(InfiniopTestCase):
    """One gemm test case: inputs a/b/c, optional strides, scalars alpha/beta."""

    def __init__(
        self,
        a: np.ndarray,
        stride_a: List[int] | None,
        b: np.ndarray,
        stride_b: List[int] | None,
        c: np.ndarray,
        stride_c: List[int] | None,
        alpha: float,
        beta: float,
    ):
        super().__init__("gemm")
        self.a = a
        self.stride_a = stride_a
        self.b = b
        self.stride_b = stride_b
        self.c = c
        self.stride_c = stride_c
        self.alpha = alpha
        self.beta = beta

    def write_test(self, test_writer: "InfiniopTestWriter"):
        """Serialize strides, scalars, input tensors, and the f64 reference answer."""
        super().write_test(test_writer)
        # Strides are optional; an omitted stride means "contiguous".
        for label, strides in (
            ("a.strides", self.stride_a),
            ("b.strides", self.stride_b),
            ("c.strides", self.stride_c),
        ):
            if strides is not None:
                test_writer.add_array(test_writer.gguf_key(label), strides)
        test_writer.add_float32(test_writer.gguf_key("alpha"), self.alpha)
        test_writer.add_float32(test_writer.gguf_key("beta"), self.beta)
        for label, tensor in (("a", self.a), ("b", self.b), ("c", self.c)):
            test_writer.add_tensor(
                test_writer.gguf_key(label),
                tensor,
                raw_dtype=np_dtype_to_ggml(tensor.dtype),
            )
        # The reference answer is computed in float64 for maximum accuracy.
        ans = gemm(
            self.a.astype(np.float64),
            self.b.astype(np.float64),
            self.alpha,
            self.c.astype(np.float64),
            self.beta,
        )
        test_writer.add_tensor(
            test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64
        )
if __name__ == "__main__":
    test_writer = InfiniopTestWriter("gemm.gguf")
    # a, stride_a, b, stride_b, c, stride_c, alpha, beta
    # Cases cover: contiguous vs. column-major (gguf_strides(1, n)) layouts,
    # float16 vs. float32, plain 2D and batched 3D shapes, and beta != 0.
    test_cases = [
        GemmTestCase(
            random_tensor((4, 5), np.float32),
            None,
            random_tensor((5, 6), np.float32),
            None,
            random_tensor((4, 6), np.float32),
            None,
            1.0,
            0.0,
        ),
        GemmTestCase(
            random_tensor((4, 5), np.float32),
            gguf_strides(1, 4),
            random_tensor((5, 6), np.float32),
            gguf_strides(1, 5),
            random_tensor((4, 6), np.float32),
            gguf_strides(1, 4),
            1.0,
            1.0,
        ),
        GemmTestCase(
            random_tensor((4, 5), np.float16),
            None,
            random_tensor((5, 6), np.float16),
            None,
            random_tensor((4, 6), np.float16),
            None,
            1.0,
            0.0,
        ),
        GemmTestCase(
            random_tensor((4, 5), np.float16),
            gguf_strides(1, 4),
            random_tensor((5, 6), np.float16),
            gguf_strides(1, 5),
            random_tensor((4, 6), np.float16),
            gguf_strides(1, 4),
            1.0,
            1.0,
        ),
        GemmTestCase(
            random_tensor((1, 2048), np.float16),
            gguf_strides(1, 2048),
            random_tensor((2048, 2048), np.float16),
            gguf_strides(1, 2048),
            random_tensor((1, 2048), np.float16),
            gguf_strides(1, 2048),
            1.0,
            0.0,
        ),
        GemmTestCase(
            random_tensor((1, 2048), np.float32),
            None,
            random_tensor((2048, 2048), np.float32),
            None,
            random_tensor((1, 2048), np.float32),
            None,
            1.0,
            0.0,
        ),
        # Batched (3D) cases: leading dimension is the batch.
        GemmTestCase(
            random_tensor((2, 4, 2048), np.float16),
            None,
            random_tensor((2, 2048, 2048), np.float16),
            None,
            random_tensor((2, 4, 2048), np.float16),
            None,
            1.0,
            0.0,
        ),
        GemmTestCase(
            random_tensor((2, 4, 2048), np.float32),
            None,
            random_tensor((2, 2048, 2048), np.float32),
            None,
            random_tensor((2, 4, 2048), np.float32),
            None,
            1.0,
            0.0,
        ),
        GemmTestCase(
            random_tensor((6, 2048), np.float32),
            gguf_strides(1, 2048),
            random_tensor((2048, 2560), np.float32),
            gguf_strides(1, 2560),
            random_tensor((6, 2560), np.float32),
            gguf_strides(1, 2560),
            1.0,
            1.0,
        ),
        # Attention-like scaled cases (alpha = 1/8).
        GemmTestCase(
            random_tensor((4, 48, 64), np.float16),
            None,
            random_tensor((4, 64, 6), np.float16),
            None,
            random_tensor((4, 48, 6), np.float16),
            None,
            1.0 / 8,
            1.0,
        ),
        GemmTestCase(
            random_tensor((4, 48, 64), np.float32),
            None,
            random_tensor((4, 64, 6), np.float32),
            None,
            random_tensor((4, 48, 6), np.float32),
            None,
            1.0 / 8,
            1.0,
        ),
    ]
    test_writer.add_tests(test_cases)
    test_writer.save()
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float, c_bool
import torch
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
open_lib,
to_tensor,
get_test_devices,
check_error,
rearrange_tensor,
rearrange_if_needed,
create_workspace,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
)
from operatorspy.tests.test_utils import get_args
import torch
# constant for control whether profile the pytorch and lib functions
# NOTE: need to manually add synchronization function to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES = [
# alpha, beta, a_shape, b_shape, c_shape, a_stride, b_stride, c_stride
(1.0, 0.0, (1, 2048), (2048, 2048), (1, 2048), None, None, None),
(1.0, 0.0, (1, 2048), (2048, 2048), (1, 2048), None, None, None),
(1.0, 0.0, (2, 4, 2048), (2, 2048, 2048), (2, 4, 2048), None, None, None),
(1.0, 0.0, (2, 4, 2048), (2, 2048, 2048), (2, 4, 2048), None, None, None),
(1.0, 0.0, (1, 2048), (2048, 2048), (1, 2048), (4096, 1), (4096, 1), (4096, 1)),
(1.0, 0.0, (1, 2048), (2048, 2048), (1, 2048), (4096, 1), (4096, 1), (4096, 1)),
(1.0, 1.0, (6, 2048), (2048, 2560), (6, 2560), (2048, 1), (1, 2048), (2560, 1)),
(1.0, 1.0, (6, 2048), (2048, 2560), (6, 2560), (2048, 1), (1, 2048), (2560, 1)),
(1.0 / 8.0, 0.0, (4, 8 * 6, 64), (4, 64, 6), (4, 8 * 6, 6), None, None, None),
(1.0 / 8.0, 0.0, (4, 8 * 6, 64), (4, 64, 6), (4, 8 * 6, 6), None, None, None),
]
# Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.float32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 0, "rtol": 1e-2},
torch.float32: {"atol": 0, "rtol": 1e-3},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class GEMMDescriptor(Structure):
# ==============================================================================
# Definitions
# ==============================================================================
class GemmDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopGEMMDescriptor_t = POINTER(GEMMDescriptor)
infiniopGemmDescriptor_t = POINTER(GemmDescriptor)
def gemm(
A, B, C=None, transA=False, transB=False, alpha=1.0, beta=0.0, dtype=torch.float32
):
A = A.T if transA else A
B = B.T if transB else B
result = alpha * torch.matmul(
A if dtype != torch.float16 else A.to(torch.float32),
B if dtype != torch.float16 else B.to(torch.float32),
).to(dtype)
if C is not None:
result += beta * C if dtype != torch.float16 else C.to(torch.float32)
if PROFILE:
torch.cuda.synchronize()
return result
# PyTorch implementation for matrix multiplication
def gemm(_c, beta, _a, _b, alpha):
a, b, c = _a.clone(), _b.clone(), _c.clone()
result_dtype = c.dtype
fp32_result = torch.matmul(a.to(torch.float32), b.to(torch.float32))
return alpha * fp32_result.to(result_dtype) + beta * c
# The argument list should be (lib, handle, torch_device, <param list>, dtype)
# The <param list> should keep the same order as the one specified in _TEST_CASES
def test(
lib,
handle,
torch_device,
alpha,
beta,
transA,
transB,
a_shape,
b_shape,
c_shape,
y_shape,
a_stride=None,
b_stride=None,
c_stride=None,
y_stride=None,
dtype=torch.float16,
):
print(
f"Testing GEMM on {torch_device} with transA: {transA} transB: {transB} "
f"a_shape:{a_shape} b_shape:{b_shape} c_shape:{c_shape} y_shape:{y_shape} "
f"a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} y_stride:{y_stride} dtype:{dtype}"
f"Testing Gemm on {torch_device} with alpha:{alpha}, beta:{beta},"
f" a_shape:{a_shape}, b_shape:{b_shape}, c_shape:{c_shape},"
f" a_stride:{a_stride}, b_stride:{b_stride}, c_stride:{c_stride}, dtype:{dtype}"
)
# Initialize tensors
a = torch.rand(a_shape, dtype=dtype).to(torch_device)
b = torch.rand(b_shape, dtype=dtype).to(torch_device)
c = torch.rand(c_shape, dtype=dtype).to(torch_device) if c_shape else None
y = torch.rand(y_shape, dtype=dtype).to(torch_device)
c = torch.ones(c_shape, dtype=dtype).to(torch_device)
if a_stride is not None:
a = rearrange_tensor(a, a_stride)
if b_stride is not None:
b = rearrange_tensor(b, b_stride)
if c_stride is not None and c is not None:
c = rearrange_tensor(c, c_stride)
if y_stride is not None:
y = rearrange_tensor(y, y_stride)
# Compute the PyTorch reference result
ans = gemm(c, beta, a, b, alpha)
for i in range(NUM_PRERUN if PROFILE else 1):
ans = gemm(a, b, c, transA, transB, alpha, beta, dtype)
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
_ = gemm(a, b, c, transA, transB, alpha, beta, dtype)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f"pytorch time: {elapsed :6f}")
a, b, c = [
rearrange_if_needed(tensor, stride)
for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_stride])
]
a_tensor, b_tensor, c_tensor = [to_tensor(tensor, lib) for tensor in [a, b, c]]
a_tensor = to_tensor(a, lib)
b_tensor = to_tensor(b, lib)
c_tensor = to_tensor(c, lib) if c is not None else None
y_tensor = to_tensor(y, lib)
descriptor = infiniopGEMMDescriptor_t()
descriptor = infiniopGemmDescriptor_t()
check_error(
lib.infiniopCreateGEMMDescriptor(
lib.infiniopCreateGemmDescriptor(
handle,
ctypes.byref(descriptor),
y_tensor.descriptor,
c_tensor.descriptor,
a_tensor.descriptor,
b_tensor.descriptor,
c_tensor.descriptor if c_tensor else None,
alpha,
beta,
transA,
transB,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
a_tensor.descriptor.contents.invalidate()
b_tensor.descriptor.contents.invalidate()
if c_tensor is not None:
c_tensor.descriptor.contents.invalidate()
y_tensor.descriptor.contents.invalidate()
for tensor in [a_tensor, b_tensor, c_tensor]:
tensor.destroyDesc(lib)
workspace_size = ctypes.c_uint64(0)
# Get workspace size and create workspace
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetGEMMWorkspaceSize(descriptor, ctypes.byref(workspace_size))
lib.infiniopGetGemmWorkspaceSize(descriptor, ctypes.byref(workspace_size))
)
workspace = torch.zeros(int(workspace_size.value), dtype=torch.uint8).to(
torch_device
)
workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
workspace = create_workspace(workspace_size.value, a.device)
for i in range(NUM_PRERUN if PROFILE else 1):
# Execute infiniop gemm operator
def lib_gemm():
check_error(
lib.infiniopGEMM(
lib.infiniopGemm(
descriptor,
workspace_ptr,
workspace_size,
y_tensor.data,
workspace.data_ptr() if workspace is not None else None,
workspace_size.value,
c_tensor.data,
a_tensor.data,
b_tensor.data,
c_tensor.data if c_tensor else None,
alpha,
beta,
None,
)
)
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
check_error(
lib.infiniopGEMM(
descriptor,
workspace_ptr,
workspace_size,
y_tensor.data,
a_tensor.data,
b_tensor.data,
c_tensor.data if c_tensor else None,
None,
)
)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f" lib time: {elapsed :6f}")
assert torch.allclose(y, ans, atol=0, rtol=1e-2)
check_error(lib.infiniopDestroyGEMMDescriptor(descriptor))
lib_gemm()
def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for (
alpha,
beta,
transA,
transB,
a_shape,
b_shape,
c_shape,
y_shape,
a_stride,
b_stride,
c_stride,
y_stride,
) in test_cases:
# fmt: off
test(lib, handle, "cpu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16)
test(lib, handle, "cpu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle)
# Validate results
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(c, ans, atol=atol, rtol=rtol)
assert torch.allclose(c, ans, atol=atol, rtol=rtol)
def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for (
alpha,
beta,
transA,
transB,
a_shape,
b_shape,
c_shape,
y_shape,
a_stride,
b_stride,
c_stride,
y_stride,
) in test_cases:
# fmt: off
test(lib, handle, "cuda", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16)
test(lib, handle, "cuda", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle)
def test_bang(lib, test_cases):
import torch_mlu
device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device)
for (
alpha,
beta,
transA,
transB,
a_shape,
b_shape,
c_shape,
y_shape,
a_stride,
b_stride,
c_stride,
y_stride,
) in test_cases:
# Profiling workflow
if PROFILE:
# fmt: off
test(lib, handle, "mlu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16)
test(lib, handle, "mlu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32)
profile_operation("PyTorch", lambda: gemm(c, beta, a, b, alpha), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_gemm(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
destroy_handle(lib, handle)
check_error(lib.infiniopDestroyGemmDescriptor(descriptor))
# ==============================================================================
# Main Execution
# ==============================================================================
if __name__ == "__main__":
test_cases = [
# alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride
(
1.0,
1.0,
False,
False,
(1, 2048),
(2048, 2048),
(1, 2048),
(1, 2048),
None,
None,
None,
None,
),
(
1.0,
1.0,
True,
True,
(2048, 4),
(2048, 2048),
(4, 2048),
(4, 2048),
None,
None,
None,
None,
),
(
1.0,
1.0,
False,
True,
(1, 2048),
(1000, 2048),
(1000),
(1, 1000),
None,
None,
None,
None,
),
(
1.0,
1.0,
True,
False,
(2048, 4),
(2048, 2048),
(2048),
(4, 2048),
(4096, 1),
(4096, 1),
(2,),
(4096, 1),
),
(
1.0,
1.0,
False,
False,
(3, 1, 2048),
(3, 2048, 2048),
(1,),
(3, 1, 2048),
None,
None,
None,
None,
),
(
1.0,
1.0,
True,
False,
(2048, 4),
(2048, 2048),
None,
(4, 2048),
(4096, 1),
(4096, 1),
(2,),
(4096, 1),
),
]
args = get_args()
lib = open_lib()
lib.infiniopCreateGEMMDescriptor.restype = c_int32
lib.infiniopCreateGEMMDescriptor.argtypes = [
lib.infiniopCreateGemmDescriptor.restype = c_int32
lib.infiniopCreateGemmDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopGEMMDescriptor_t),
POINTER(infiniopGemmDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
c_float,
c_float,
c_bool,
c_bool,
]
lib.infiniopGetGEMMWorkspaceSize.restype = c_int32
lib.infiniopGetGEMMWorkspaceSize.argtypes = [
infiniopGEMMDescriptor_t,
POINTER(c_uint64),
lib.infiniopGetGemmWorkspaceSize.restype = c_int32
lib.infiniopGetGemmWorkspaceSize.argtypes = [
infiniopGemmDescriptor_t,
POINTER(c_size_t),
]
lib.infiniopGEMM.restype = c_int32
lib.infiniopGEMM.argtypes = [
infiniopGEMMDescriptor_t,
lib.infiniopGemm.restype = c_int32
lib.infiniopGemm.argtypes = [
infiniopGemmDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
c_float,
c_float,
c_void_p,
]
lib.infiniopDestroyGEMMDescriptor.restype = c_int32
lib.infiniopDestroyGEMMDescriptor.argtypes = [
infiniopGEMMDescriptor_t,
lib.infiniopDestroyGemmDescriptor.restype = c_int32
lib.infiniopDestroyGemmDescriptor.argtypes = [
infiniopGemmDescriptor_t,
]
if args.cpu:
test_cpu(lib, test_cases)
if args.cuda:
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang):
test_cpu(lib, test_cases)
# Configure testing options
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
# Execute tests
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
to_tensor,
get_test_devices,
check_error,
rearrange_if_needed,
create_workspace,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
)
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES = [
    # alpha, beta, a_shape, b_shape, c_shape, a_stride, b_stride, c_stride
    # NOTE(review): every case is listed twice — presumably intentional
    # (each case runs once per pass); confirm before deduplicating.
    (1.0, 0.0, (1, 2048), (2048, 2048), (1, 2048), None, None, None),
    (1.0, 0.0, (1, 2048), (2048, 2048), (1, 2048), None, None, None),
    (1.0, 0.0, (2, 4, 2048), (2, 2048, 2048), (2, 4, 2048), None, None, None),
    (1.0, 0.0, (2, 4, 2048), (2, 2048, 2048), (2, 4, 2048), None, None, None),
    (1.0, 0.0, (1, 2048), (2048, 2048), (1, 2048), (4096, 1), (4096, 1), (4096, 1)),
    (1.0, 0.0, (1, 2048), (2048, 2048), (1, 2048), (4096, 1), (4096, 1), (4096, 1)),
    (1.0, 1.0, (6, 2048), (2048, 2560), (6, 2560), (2048, 1), (1, 2048), (2560, 1)),
    (1.0, 1.0, (6, 2048), (2048, 2560), (6, 2560), (2048, 1), (1, 2048), (2560, 1)),
    (1.0 / 8.0, 0.0, (4, 8 * 6, 64), (4, 64, 6), (4, 8 * 6, 6), None, None, None),
    (1.0 / 8.0, 0.0, (4, 8 * 6, 64), (4, 64, 6), (4, 8 * 6, 6), None, None, None),
]
# Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.float32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
    torch.float16: {"atol": 0, "rtol": 1e-2},
    torch.float32: {"atol": 0, "rtol": 1e-3},
}
# Runtime knobs; the real values are assigned from CLI args in __main__.
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
# ==============================================================================
# Definitions
# ==============================================================================
class MatmulDescriptor(Structure):
    """ctypes view of the library's opaque matmul descriptor.

    Only the leading ``device`` field is declared; the rest of the C struct
    is treated as opaque and the handle is always passed by pointer.
    """

    _fields_ = [("device", c_int32)]
infiniopMatmulDescriptor_t = POINTER(MatmulDescriptor)
# PyTorch implementation for matrix multiplication
def matmul(_c, beta, _a, _b, alpha):
    """PyTorch reference GEMM: ``alpha * (A @ B) + beta * C``.

    The product is accumulated in float32 for accuracy, cast back to C's
    dtype, then combined with the scaled ``C``. No input is mutated — the
    matmul and the scaled addition both allocate fresh tensors, so the
    defensive ``.clone()`` copies the original version made were redundant
    and have been dropped.

    Args:
        _c: accumulator operand C; its dtype defines the result dtype.
        beta: scale applied to C.
        _a: left operand A.
        _b: right operand B.
        alpha: scale applied to the A @ B product.

    Returns:
        A new tensor of C's dtype holding ``alpha * (A @ B) + beta * C``.
    """
    result_dtype = _c.dtype
    fp32_result = torch.matmul(_a.to(torch.float32), _b.to(torch.float32))
    return alpha * fp32_result.to(result_dtype) + beta * _c
# The argument list should be (lib, handle, torch_device, <param list>, dtype)
# The <param list> should keep the same order as the one specified in _TEST_CASES
def test(
    lib,
    handle,
    torch_device,
    alpha,
    beta,
    a_shape,
    b_shape,
    c_shape,
    a_stride=None,
    b_stride=None,
    c_stride=None,
    dtype=torch.float16,
):
    """Run one Matmul case through the infiniop library and check it.

    Computes ``alpha * (a @ b) + beta * c`` with the library operator and
    compares the in-place result in ``c`` against a float32 PyTorch
    reference. Raises AssertionError when the results differ beyond the
    dtype-specific tolerance from _TOLERANCE_MAP.

    Args:
        lib: ctypes handle to the opened infiniop shared library.
        handle: infiniop handle created by the harness for this device.
        torch_device: torch device string the tensors are placed on.
        alpha, beta: GEMM scaling factors (passed to C as c_float).
        a_shape, b_shape, c_shape: operand shapes (2-D or batched 3-D).
        a_stride, b_stride, c_stride: optional explicit strides; None keeps
            the default contiguous layout.
        dtype: torch dtype shared by all three operands.
    """
    print(
        f"Testing Matmul on {torch_device} with alpha:{alpha}, beta:{beta},"
        f" a_shape:{a_shape}, b_shape:{b_shape}, c_shape:{c_shape},"
        f" a_stride:{a_stride}, b_stride:{b_stride}, c_stride:{c_stride}, dtype:{dtype}"
    )
    # Initialize tensors
    a = torch.rand(a_shape, dtype=dtype).to(torch_device)
    b = torch.rand(b_shape, dtype=dtype).to(torch_device)
    c = torch.ones(c_shape, dtype=dtype).to(torch_device)
    # Compute the PyTorch reference result (fp32 accumulation) BEFORE the
    # operands are re-laid-out, so `ans` is independent of stride handling.
    ans = matmul(c, beta, a, b, alpha)
    # Re-materialize operands with the requested explicit strides, if any.
    a, b, c = [
        rearrange_if_needed(tensor, stride)
        for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_stride])
    ]
    # Wrap the torch tensors into infiniop tensor handles/descriptors.
    a_tensor, b_tensor, c_tensor = [to_tensor(tensor, lib) for tensor in [a, b, c]]
    descriptor = infiniopMatmulDescriptor_t()
    check_error(
        lib.infiniopCreateMatmulDescriptor(
            handle,
            ctypes.byref(descriptor),
            c_tensor.descriptor,
            a_tensor.descriptor,
            b_tensor.descriptor,
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    for tensor in [a_tensor, b_tensor, c_tensor]:
        tensor.destroyDesc(lib)
    # Get workspace size and create workspace
    workspace_size = c_uint64(0)
    check_error(
        lib.infiniopGetMatmulWorkspaceSize(descriptor, ctypes.byref(workspace_size))
    )
    workspace = create_workspace(workspace_size.value, a.device)
    # Execute infiniop matmul operator
    def lib_matmul():
        # Writes the result into c's buffer (c_tensor.data) in place.
        check_error(
            lib.infiniopMatmul(
                descriptor,
                workspace.data_ptr() if workspace is not None else None,
                workspace_size.value,
                c_tensor.data,
                a_tensor.data,
                b_tensor.data,
                alpha,
                beta,
                None,
            )
        )
    lib_matmul()
    # Validate results
    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
    if DEBUG:
        debug(c, ans, atol=atol, rtol=rtol)
    assert torch.allclose(c, ans, atol=atol, rtol=rtol)
    # Profiling workflow
    if PROFILE:
        # fmt: off
        profile_operation("PyTorch", lambda: matmul(c, beta, a, b, alpha), torch_device, NUM_PRERUN, NUM_ITERATIONS)
        profile_operation(" lib", lambda: lib_matmul(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
        # fmt: on
    check_error(lib.infiniopDestroyMatmulDescriptor(descriptor))
# ==============================================================================
# Main Execution
# ==============================================================================
if __name__ == "__main__":
    args = get_args()
    lib = open_lib()
    # Declare the C ABI of each entry point so ctypes marshals arguments
    # and return codes correctly (all APIs return an int32 status).
    lib.infiniopCreateMatmulDescriptor.restype = c_int32
    lib.infiniopCreateMatmulDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopMatmulDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    # NOTE(review): declared as POINTER(c_size_t) but test() passes
    # byref(c_uint64). Identical on 64-bit platforms, mismatched on a
    # 32-bit build — confirm the C prototype and align the two.
    lib.infiniopGetMatmulWorkspaceSize.restype = c_int32
    lib.infiniopGetMatmulWorkspaceSize.argtypes = [
        infiniopMatmulDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopMatmul.restype = c_int32
    lib.infiniopMatmul.argtypes = [
        infiniopMatmulDescriptor_t,
        c_void_p,  # workspace pointer (may be NULL when size is 0)
        c_uint64,  # workspace size in bytes
        c_void_p,  # c: output buffer, written in place
        c_void_p,  # a
        c_void_p,  # b
        c_float,   # alpha
        c_float,   # beta
        c_void_p,  # stream handle; test() passes None
    ]
    lib.infiniopDestroyMatmulDescriptor.restype = c_int32
    lib.infiniopDestroyMatmulDescriptor.argtypes = [
        infiniopMatmulDescriptor_t,
    ]
    # Configure testing options
    DEBUG = args.debug
    PROFILE = args.profile
    NUM_PRERUN = args.num_prerun
    NUM_ITERATIONS = args.num_iterations
    # Execute tests
    for device in get_test_devices(args):
        test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
    print("\033[92mTest passed!\033[0m")
......@@ -82,25 +82,14 @@ def random_sample(data, random_val, topp, topk, voc, temperature):
globalM = dataNp[0]
dataNp = (dataNp - globalM) / temperature
dataNp = torch.softmax(dataNp.float(), dim=0)
sum_s = 0
for end in range(topk):
sum_s += dataNp[end]
if sum_s >= topp:
break
if end < topk - 1:
end += 1
else:
end = topk
sum_s = 0
for i in range(end):
sum_s += dataNp[i]
random_val *= sum_s
sum_s = 0
for i in range(end):
sum_s += dataNp[i]
if random_val < sum_s:
for i in range(1, voc):
dataNp[i] += dataNp[i - 1]
limit_k = dataNp[min(topk, voc) - 1]
limit_p = dataNp[voc - 1] * topp
limit = min(limit_k, limit_p) * random_val
for i in range(voc):
if limit < dataNp[i]:
return indices[i]
else:
return torch.argmax(data)
......@@ -129,7 +118,7 @@ def test(
data, random_val, topp, topk, voc, temperature
) # 这个函数在device速度可能会很慢,可以通过data.to("cpu")方式加快计算过程
indices = torch.zeros([1], dtype=torch.int64).to(torch_device)
indices = torch.zeros([], dtype=torch.int64).to(torch_device)
x_tensor, indices_tensor = [to_tensor(tensor, lib) for tensor in [data, indices]]
......@@ -147,7 +136,7 @@ def test(
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [x_tensor, indices_tensor]:
tensor.descriptor.contents.invalidate()
tensor.destroyDesc(lib)
workspace_size = c_uint64(0)
check_error(
......@@ -181,13 +170,13 @@ def test(
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug_all(
(indices[0].type(ans.dtype), data[indices[0]]),
(indices.type(ans.dtype), data[indices]),
(ans, data[ans]),
"or",
atol=atol,
rtol=rtol,
)
assert indices[0].type(ans.dtype) == ans or data[ans] == data[indices[0]]
assert indices.type(ans.dtype) == ans or data[ans] == data[indices]
# Profiling workflow
if PROFILE:
......
......@@ -145,7 +145,7 @@ target("infinirt")
set_languages("cxx17")
set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini"))
add_files("src/infinirt/*.cc")
add_installfiles("include/infinirt.h")
add_installfiles("include/infinirt.h", {prefixdir = "include"})
target_end()
target("infiniop")
......@@ -197,8 +197,12 @@ target("infiniop")
add_installfiles("include/infinicore.h", {prefixdir = "include"})
target_end()
target("all")
set_kind("phony")
add_deps("infiniop", "infinirt")
after_build(function (target) print(YELLOW .. "[Congratulations!] Now you can install the libraries with \"xmake install\"" .. NC) end)
target_end()
-- Tests
includes("xmake/test.lua")
......@@ -8,4 +8,27 @@ target("infiniutils-test")
add_files(os.projectdir().."/src/utils-test/*.cc")
target("infiniop-test")
set_kind("binary")
add_deps("infini-utils")
on_install(function (target) end)
set_default(false)
local INFINI_ROOT = os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini")
set_languages("cxx17")
set_warnings("all", "error")
add_includedirs(INFINI_ROOT.."/include")
add_linkdirs(INFINI_ROOT.."/lib")
add_links("infiniop", "infinirt")
if has_config("omp") then
add_cxflags("-fopenmp")
add_ldflags("-fopenmp")
end
add_includedirs(os.projectdir().."/src/infiniop-test/include")
add_files(os.projectdir().."/src/infiniop-test/src/*.cpp")
add_files(os.projectdir().."/src/infiniop-test/src/ops/*.cpp")
target_end()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment