Commit f0c5a569 authored by YdrMaster's avatar YdrMaster
Browse files

issue/291/build: nvidia 和 metax 共用 softmax kernel 实现


Signed-off-by: YdrMaster <ydrml@hotmail.com>
parent c6a3e4c7
#ifndef __CAUSAL_SOFTMAX_KERNEL_CUH__ #ifndef __CAUSAL_SOFTMAX_KERNEL_CUH__
#define __CAUSAL_SOFTMAX_KERNEL_CUH__ #define __CAUSAL_SOFTMAX_KERNEL_CUH__
#include "../../../devices/cuda/cuda_kernel_common.cuh"
#include "../../../reduce/cuda/reduce.cuh"
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute> template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
INFINIOP_CUDA_KERNEL causalSoftmax( __device__ void causalSoftmaxKernel(
Tdata *y_, const Tdata *x_, Tdata *y_, const Tdata *x_,
size_t batch, size_t height, size_t width, size_t batch, size_t height, size_t width,
ptrdiff_t y_stride_b, ptrdiff_t y_stride_h, ptrdiff_t y_stride_b, ptrdiff_t y_stride_h,
...@@ -32,11 +29,11 @@ INFINIOP_CUDA_KERNEL causalSoftmax( ...@@ -32,11 +29,11 @@ INFINIOP_CUDA_KERNEL causalSoftmax(
// 2 | * * * ... * * * | // 2 | * * * ... * * * |
// height: 3 col_id-> // height: 3 col_id->
if (width + blockIdx.x >= threadIdx.x + height) { if (width + blockIdx.x >= threadIdx.x + height) {
#ifdef ENABLE_NVIDIA_API if constexpr (std::is_same_v<Tdata, half>) {
y[col] = exp_(x[col] - max_); y[col] = hexp(x[col] - max_);
#else } else {
y[col] = exp(x[col] - max_); y[col] = exp(x[col] - max_);
#endif }
} else { } else {
y[col] = Tdata(0); y[col] = Tdata(0);
} }
......
#ifndef __CAUSAL_SOFTMAX_KERNEL_H__
#define __CAUSAL_SOFTMAX_KERNEL_H__

#include "../../../devices/maca/maca_kernel_common.h"
#include "../../../reduce/maca/reduce.h"

// Row-wise causal softmax.
//
// Launch layout (inferred from the indexing below):
//   gridDim.y  = batch   (batch_id)
//   gridDim.x  = height  (row_id)
//   blockDim.x = BLOCK_SIZE threads striding over the `width` columns
//
// For row r, columns 0 .. (width - height + r) are live; the remaining
// upper-triangle columns are forced to zero (causal mask).
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
INFINIOP_MACA_KERNEL causalSoftmax(
    Tdata *y_, const Tdata *x_,
    size_t batch, size_t height, size_t width,
    ptrdiff_t y_stride_b, ptrdiff_t y_stride_h,
    ptrdiff_t x_stride_b, ptrdiff_t x_stride_h) {
    Tdata *y = y_                       // threadIdx.x for col_id
             + blockIdx.y * y_stride_b  // gridDim.y for batch_id
             + blockIdx.x * y_stride_h; // gridDim.x for row_id
    const Tdata *x = x_ + blockIdx.y * x_stride_b + blockIdx.x * x_stride_h;

    // [Reduce] Find the max over the live prefix of this row and broadcast
    // it via shared memory (only thread 0's reduction result is guaranteed
    // valid — see reduce.h).
    __shared__ Tdata max_;
    Tdata max_0 = op::common_maca::reduce_op::max<BLOCK_SIZE, Tdata>(x, width - height + 1 + blockIdx.x);
    if (threadIdx.x == 0) {
        max_ = max_0;
    }
    __syncthreads();

    // [Elementwise] Subtract max value from each element and apply causal mask
    for (size_t col = threadIdx.x; col < width; col += BLOCK_SIZE) {
        // row_id ↓ |<- width ->|
        //   0      | * * * ... *     |
        //   1      | * * * ... * *   |
        //   2      | * * * ... * * * |
        // height: 3                col_id->
        // BUGFIX: the mask must test the current column `col`, not
        // `threadIdx.x` — they only coincide on the first iteration of this
        // strided loop (i.e. only when width <= BLOCK_SIZE).
        if (width + blockIdx.x >= col + height) {
#ifdef ENABLE_MACA_API
            y[col] = exp_(x[col] - max_);
#else
            y[col] = exp(x[col] - max_);
#endif
        } else {
            y[col] = Tdata(0);
        }
    }
    __syncthreads();

    // [Reduce] Sum the masked, exponentiated row (masked entries are 0),
    // again broadcasting thread 0's result via shared memory.
    __shared__ Tcompute sum_;
    Tcompute sum_0 = op::common_maca::reduce_op::sum<BLOCK_SIZE, Tdata, Tcompute>(y, width);
    if (threadIdx.x == 0) {
        sum_ = sum_0;
    }
    __syncthreads();

    // [Elementwise] Normalize each element by the row sum.
    for (size_t col = threadIdx.x; col < width; col += BLOCK_SIZE) {
        y[col] /= Tdata(sum_);
    }
}

#endif // __CAUSAL_SOFTMAX_KERNEL_H__
#ifndef __CAUSAL_SOFTMAX_MACA_H__ #ifndef __CAUSAL_SOFTMAX_METAX_H__
#define __CAUSAL_SOFTMAX_MACA_H__ #define __CAUSAL_SOFTMAX_METAX_H__
#include "../causal_softmax.h" #include "../causal_softmax.h"
DESCRIPTOR(maca) DESCRIPTOR(metax)
#endif #endif
#include "../../../devices/maca/common_maca.h" #include "../../../devices/maca/common_maca.h"
#include "causal_softmax_kernel.h" #include "../../../devices/maca/maca_kernel_common.h"
#include "causal_softmax_maca.h" #include "causal_softmax_metax.h"
namespace op::causal_softmax::maca { #include <hccub/block/block_reduce.cuh>
#include "../../../reduce/cuda/reduce.cuh"
#include "../cuda/kernel.cuh"
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
INFINIOP_MACA_KERNEL causalSoftmax(
Tdata *y, const Tdata *x,
size_t batch, size_t height, size_t width,
ptrdiff_t y_stride_b, ptrdiff_t y_stride_h,
ptrdiff_t x_stride_b, ptrdiff_t x_stride_h) {
causalSoftmaxKernel<BLOCK_SIZE, Tdata, Tcompute>(y, x, batch, height, width, y_stride_b, y_stride_h, x_stride_b, x_stride_h);
}
namespace op::causal_softmax::metax {
struct Descriptor::Opaque { struct Descriptor::Opaque {
std::shared_ptr<device::maca::Handle::Internal> internal; std::shared_ptr<device::maca::Handle::Internal> internal;
...@@ -75,4 +90,4 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, ...@@ -75,4 +90,4 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
} // namespace op::causal_softmax::maca } // namespace op::causal_softmax::metax
#include "../../../devices/cuda/cuda_common.cuh" #include "../../../devices/cuda/cuda_common.cuh"
#include "causal_softmax_cuda.cuh" #include "../../../devices/cuda/cuda_kernel_common.cuh"
#include "causal_softmax_kernel.cuh" #include "causal_softmax_nvidia.cuh"
namespace op::causal_softmax::cuda { #include <cub/block/block_reduce.cuh>
#include "../../../reduce/cuda/reduce.cuh"
#include "../cuda/kernel.cuh"
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
INFINIOP_CUDA_KERNEL causalSoftmax(
Tdata *y, const Tdata *x,
size_t batch, size_t height, size_t width,
ptrdiff_t y_stride_b, ptrdiff_t y_stride_h,
ptrdiff_t x_stride_b, ptrdiff_t x_stride_h) {
causalSoftmaxKernel<BLOCK_SIZE, Tdata, Tcompute>(y, x, batch, height, width, y_stride_b, y_stride_h, x_stride_b, x_stride_h);
}
namespace op::causal_softmax::nvidia {
struct Descriptor::Opaque { struct Descriptor::Opaque {
std::shared_ptr<device::cuda::Handle::Internal> internal; std::shared_ptr<device::cuda::Handle::Internal> internal;
...@@ -79,4 +94,4 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, ...@@ -79,4 +94,4 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
} // namespace op::causal_softmax::cuda } // namespace op::causal_softmax::nvidia
#ifndef __CAUSAL_SOFTMAX_CUDA_H__ #ifndef __CAUSAL_SOFTMAX_NVIDIA_H__
#define __CAUSAL_SOFTMAX_CUDA_H__ #define __CAUSAL_SOFTMAX_NVIDIA_H__
#include "../causal_softmax.h" #include "../causal_softmax.h"
DESCRIPTOR(cuda) DESCRIPTOR(nvidia)
#endif #endif
...@@ -6,10 +6,10 @@ ...@@ -6,10 +6,10 @@
#include "cpu/causal_softmax_cpu.h" #include "cpu/causal_softmax_cpu.h"
#endif #endif
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
#include "cuda/causal_softmax_cuda.cuh" #include "nvidia/causal_softmax_nvidia.cuh"
#endif #endif
#ifdef ENABLE_METAX_API #ifdef ENABLE_METAX_API
#include "maca/causal_softmax_maca.h" #include "metax/causal_softmax_metax.h"
#endif #endif
#ifdef ENABLE_ASCEND_API #ifdef ENABLE_ASCEND_API
#include "ascend/causal_softmax_ascend.h" #include "ascend/causal_softmax_ascend.h"
...@@ -34,10 +34,13 @@ __C infiniStatus_t infiniopCreateCausalSoftmaxDescriptor( ...@@ -34,10 +34,13 @@ __C infiniStatus_t infiniopCreateCausalSoftmaxDescriptor(
CREATE(INFINI_DEVICE_CPU, cpu) CREATE(INFINI_DEVICE_CPU, cpu)
#endif #endif
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda) CREATE(INFINI_DEVICE_NVIDIA, nvidia)
#endif #endif
#ifdef ENABLE_METAX_API #ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, maca) CREATE(INFINI_DEVICE_METAX, metax)
#endif
#ifdef ENABLE_ASCEND_API
CREATE(INFINI_DEVICE_ASCEND, ascend)
#endif #endif
#ifdef ENABLE_CAMBRICON_MLU #ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: { case DevCambriconMlu: {
...@@ -45,14 +48,6 @@ __C infiniStatus_t infiniopCreateCausalSoftmaxDescriptor( ...@@ -45,14 +48,6 @@ __C infiniStatus_t infiniopCreateCausalSoftmaxDescriptor(
// return cnnlCreateCausalSoftmaxDescriptor((BangHandle_t) handle, (CausalSoftmaxCnnlDescriptor_t *) desc_ptr, y_desc); // return cnnlCreateCausalSoftmaxDescriptor((BangHandle_t) handle, (CausalSoftmaxCnnlDescriptor_t *) desc_ptr, y_desc);
} }
#endif #endif
#ifdef ENABLE_ASCEND_API
CREATE(INFINI_DEVICE_ASCEND, ascend)
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaCreateCausalSoftmaxDescriptor((MacaHandle_t)handle, (CausalSoftmaxMacaDescriptor_t *)desc_ptr, y_desc);
}
#endif
#ifdef ENABLE_MTHREADS_GPU #ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: { case DevMthreadsGpu: {
return musaCreateCausalSoftmaxDescriptor((MusaHandle_t)handle, (CausalSoftmaxMusaDescriptor_t *)desc_ptr, y_desc); return musaCreateCausalSoftmaxDescriptor((MusaHandle_t)handle, (CausalSoftmaxMusaDescriptor_t *)desc_ptr, y_desc);
...@@ -74,7 +69,13 @@ __C infiniStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDe ...@@ -74,7 +69,13 @@ __C infiniStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDe
GET(INFINI_DEVICE_CPU, cpu) GET(INFINI_DEVICE_CPU, cpu)
#endif #endif
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, cuda) GET(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, metax)
#endif
#ifdef ENABLE_ASCEND_API
GET(INFINI_DEVICE_ASCEND, ascend)
#endif #endif
#ifdef ENABLE_CAMBRICON_MLU #ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: { case DevCambriconMlu: {
...@@ -83,17 +84,6 @@ __C infiniStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDe ...@@ -83,17 +84,6 @@ __C infiniStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDe
} }
#endif #endif
#ifdef ENABLE_ASCEND_API
GET(INFINI_DEVICE_ASCEND, ascend)
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, maca)
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxMacaDescriptor_t)desc, size);
}
#endif
#ifdef ENABLE_MTHREADS_GPU #ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: { case DevMthreadsGpu: {
return musaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxMusaDescriptor_t)desc, size); return musaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxMusaDescriptor_t)desc, size);
...@@ -120,10 +110,13 @@ __C infiniStatus_t infiniopCausalSoftmax( ...@@ -120,10 +110,13 @@ __C infiniStatus_t infiniopCausalSoftmax(
CALCULATE(INFINI_DEVICE_CPU, cpu) CALCULATE(INFINI_DEVICE_CPU, cpu)
#endif #endif
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, cuda) CALCULATE(INFINI_DEVICE_NVIDIA, nvidia)
#endif #endif
#ifdef ENABLE_METAX_API #ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, maca) CALCULATE(INFINI_DEVICE_METAX, metax)
#endif
#ifdef ENABLE_ASCEND_API
CALCULATE(INFINI_DEVICE_ASCEND, ascend)
#endif #endif
#ifdef ENABLE_CAMBRICON_MLU #ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: { case DevCambriconMlu: {
...@@ -131,14 +124,6 @@ __C infiniStatus_t infiniopCausalSoftmax( ...@@ -131,14 +124,6 @@ __C infiniStatus_t infiniopCausalSoftmax(
// return cnnlCausalSoftmax((CausalSoftmaxCnnlDescriptor_t) desc, workspace, workspace_size, data, stream); // return cnnlCausalSoftmax((CausalSoftmaxCnnlDescriptor_t) desc, workspace, workspace_size, data, stream);
} }
#endif #endif
#ifdef ENABLE_ASCEND_API
CALCULATE(INFINI_DEVICE_ASCEND, ascend)
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaCausalSoftmax((CausalSoftmaxMacaDescriptor_t)desc, workspace, workspace_size, data, stream);
}
#endif
#ifdef ENABLE_MTHREADS_GPU #ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: { case DevMthreadsGpu: {
return musaCausalSoftmax((CausalSoftmaxMusaDescriptor_t)desc, workspace, workspace_size, data, stream); return musaCausalSoftmax((CausalSoftmaxMusaDescriptor_t)desc, workspace, workspace_size, data, stream);
...@@ -160,10 +145,13 @@ __C infiniStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxD ...@@ -160,10 +145,13 @@ __C infiniStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxD
DESTROY(INFINI_DEVICE_CPU, cpu) DESTROY(INFINI_DEVICE_CPU, cpu)
#endif #endif
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
DESTROY(INFINI_DEVICE_NVIDIA, cuda) DESTROY(INFINI_DEVICE_NVIDIA, nvidia)
#endif #endif
#ifdef ENABLE_METAX_API #ifdef ENABLE_METAX_API
DESTROY(INFINI_DEVICE_METAX, maca) DESTROY(INFINI_DEVICE_METAX, metax)
#endif
#ifdef ENABLE_ASCEND_API
DESTROY(INFINI_DEVICE_ASCEND, ascend)
#endif #endif
#ifdef ENABLE_CAMBRICON_MLU #ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: { case DevCambriconMlu: {
...@@ -171,14 +159,6 @@ __C infiniStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxD ...@@ -171,14 +159,6 @@ __C infiniStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxD
// return cnnlDestroyCausalSoftmaxDescriptor((CausalSoftmaxCnnlDescriptor_t) desc); // return cnnlDestroyCausalSoftmaxDescriptor((CausalSoftmaxCnnlDescriptor_t) desc);
} }
#endif #endif
#ifdef ENABLE_ASCEND_API
DESTROY(INFINI_DEVICE_ASCEND, ascend)
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaDestroyCausalSoftmaxDescriptor((CausalSoftmaxMacaDescriptor_t)desc);
}
#endif
#ifdef ENABLE_MTHREADS_GPU #ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: case DevMthreadsGpu:
return musaDestroyCausalSoftmaxDescriptor((CausalSoftmaxMusaDescriptor_t)desc); return musaDestroyCausalSoftmaxDescriptor((CausalSoftmaxMusaDescriptor_t)desc);
......
...@@ -2,6 +2,9 @@ ...@@ -2,6 +2,9 @@
#define __RMS_NORM_CUDA_KERNEL_H__ #define __RMS_NORM_CUDA_KERNEL_H__
#include "../../../devices/cuda/cuda_kernel_common.cuh" #include "../../../devices/cuda/cuda_kernel_common.cuh"
#include <cub/block/block_reduce.cuh>
#include "../../../reduce/cuda/reduce.cuh" #include "../../../reduce/cuda/reduce.cuh"
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tweight, typename Tcompute> template <unsigned int BLOCK_SIZE, typename Tdata, typename Tweight, typename Tcompute>
......
#include "../../../devices/maca/common_maca.h" #include "../../../devices/maca/common_maca.h"
#include "../cuda/rms_norm_kernel.cuh" #include "../cuda/rms_norm_kernel.cuh"
#include "rms_norm_maca.cuh" #include "rms_norm_metax.cuh"
namespace op::rms_norm::maca { namespace op::rms_norm::maca {
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
#include "ascend/rms_norm_aclnn.h" #include "ascend/rms_norm_aclnn.h"
#endif #endif
#ifdef ENABLE_METAX_API #ifdef ENABLE_METAX_API
#include "maca/rms_norm_maca.cuh" #include "metax/rms_norm_metax.cuh"
#endif #endif
#ifdef ENABLE_MOORE_API #ifdef ENABLE_MOORE_API
#include "musa/rms_norm_musa.cuh" #include "musa/rms_norm_musa.cuh"
......
#ifndef __INFINIOP_REDUCE_CUDA_H__ #ifndef __INFINIOP_REDUCE_CUDA_H__
#define __INFINIOP_REDUCE_CUDA_H__ #define __INFINIOP_REDUCE_CUDA_H__
#include <cub/block/block_reduce.cuh>
/* /*
* Device functions for reduction operations on CUDA. * Device functions for reduction operations on CUDA.
* *
......
#ifndef __INFINIOP_REDUCE_MACA_H__
#define __INFINIOP_REDUCE_MACA_H__

#include <hccub/block/block_reduce.cuh>

/*
 * Block-level reduction helpers for MACA devices.
 *
 * Note: only thread 0 of the block is guaranteed to hold the correct
 * reduction result; broadcast it manually (e.g. through shared memory)
 * if the other threads need it.
 */

namespace op::common_maca::reduce_op {

// Sum of squares over `count` contiguous elements.
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
__device__ __forceinline__ Tcompute sumSquared(const Tdata *data_ptr, size_t count) {
    using BlockReduce = cub::BlockReduce<Tcompute, BLOCK_SIZE>;
    __shared__ typename BlockReduce::TempStorage workspace;

    // Per-thread partial accumulation via a block-stride loop.
    Tcompute partial = 0;
    for (size_t idx = threadIdx.x; idx < count; idx += BLOCK_SIZE) {
        const Tcompute v = Tcompute(data_ptr[idx]);
        partial += v * v;
    }
    // Combine the partials with a CUB block-level reduction.
    return BlockReduce(workspace).Sum(partial);
}

// Sum over `count` contiguous elements.
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
__device__ __forceinline__ Tcompute sum(const Tdata *data_ptr, size_t count) {
    using BlockReduce = cub::BlockReduce<Tcompute, BLOCK_SIZE>;
    __shared__ typename BlockReduce::TempStorage workspace;

    Tcompute partial = 0;
    for (size_t idx = threadIdx.x; idx < count; idx += BLOCK_SIZE) {
        partial += Tcompute(data_ptr[idx]);
    }
    return BlockReduce(workspace).Sum(partial);
}

// Maximum over `count` contiguous elements.
template <unsigned int BLOCK_SIZE, typename Tdata>
__device__ __forceinline__ Tdata max(const Tdata *data_ptr, size_t count) {
    using BlockReduce = cub::BlockReduce<Tdata, BLOCK_SIZE>;
    __shared__ typename BlockReduce::TempStorage workspace;

    // Seed every thread with element 0 so lanes that own no slots
    // (threadIdx.x >= count) still contribute a valid value.
    Tdata running = data_ptr[0];
    for (size_t idx = threadIdx.x; idx < count; idx += BLOCK_SIZE) {
        running = cub::Max()(running, data_ptr[idx]);
    }
    return BlockReduce(workspace).Reduce(running, cub::Max(), BLOCK_SIZE);
}

} // namespace op::common_maca::reduce_op

#endif
...@@ -108,7 +108,7 @@ option_end() ...@@ -108,7 +108,7 @@ option_end()
if has_config("metax-gpu") then if has_config("metax-gpu") then
add_defines("ENABLE_METAX_API") add_defines("ENABLE_METAX_API")
includes("xmake/maca.lua") includes("xmake/metax.lua")
end end
-- 摩尔线程 -- 摩尔线程
......
...@@ -46,7 +46,7 @@ target("infiniop-cuda") ...@@ -46,7 +46,7 @@ target("infiniop-cuda")
add_cuflags("-Xcompiler=-Wno-error=deprecated-declarations") add_cuflags("-Xcompiler=-Wno-error=deprecated-declarations")
set_languages("cxx17") set_languages("cxx17")
add_files("../src/infiniop/devices/cuda/*.cu", "../src/infiniop/ops/*/cuda/*.cu", "../build/ninetoothed/*.c") add_files("../src/infiniop/devices/cuda/*.cu", "../src/infiniop/ops/*/cuda/*.cu", "../src/infiniop/ops/*/nvidia/*.cu", "../build/ninetoothed/*.c")
target_end() target_end()
target("infinirt-cuda") target("infinirt-cuda")
......
...@@ -35,7 +35,7 @@ target("infiniop-metax") ...@@ -35,7 +35,7 @@ target("infiniop-metax")
set_warnings("all", "error") set_warnings("all", "error")
add_cxflags("-lstdc++", "-fPIC", "-Wno-defaulted-function-deleted", "-Wno-strict-aliasing") add_cxflags("-lstdc++", "-fPIC", "-Wno-defaulted-function-deleted", "-Wno-strict-aliasing")
add_files("../src/infiniop/devices/maca/*.cc", "../src/infiniop/ops/*/maca/*.cc") add_files("../src/infiniop/devices/maca/*.cc", "../src/infiniop/ops/*/maca/*.cc")
add_files("../src/infiniop/ops/*/maca/*.maca", {rule = "maca"}) add_files("../src/infiniop/ops/*/maca/*.maca", "../src/infiniop/ops/*/metax/*.maca", {rule = "maca"})
target_end() target_end()
target("infinirt-metax") target("infinirt-metax")
...@@ -61,5 +61,5 @@ target("infiniccl-metax") ...@@ -61,5 +61,5 @@ target("infiniccl-metax")
add_files("../src/infiniccl/maca/*.cc") add_files("../src/infiniccl/maca/*.cc")
end end
set_languages("cxx17") set_languages("cxx17")
target_end() target_end()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment