"git@developer.sourcefind.cn:yangql/googletest.git" did not exist on "f4419791abd9d0962eb5a61003b7f4b80d7e4068"
Unverified Commit 52f0dcf0 authored by thatPepe's avatar thatPepe Committed by GitHub
Browse files

Merge pull request #1019 from InfiniTensor/issue/1008

Issue/1008
parents d0f405ce 68026bd1
......@@ -39,6 +39,9 @@ def run_tests(args):
"topkrouter.py",
"topksoftmax.py",
"zeros.py",
# "paged_attention.py",
# "paged_caching.py",
# "paged_attention_prefill.py"
]:
result = subprocess.run(
f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True
......
......@@ -255,6 +255,8 @@ infiniStatus_t Descriptor::calculate(
CALCULATE_LAYER_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512)
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
CALCULATE_LAYER_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096)
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) {
CALCULATE_LAYER_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_2048)
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
......
......@@ -174,6 +174,9 @@ infiniopDestroyLayerNormDescriptor(infiniopLayerNormDescriptor_t desc) {
#ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_ILUVATAR_API
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......
......@@ -117,6 +117,11 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
y, x, _info.x_dtype, _info.y_dtype, _info.batch_size, _info.probs_size, _info.ndim, _info.seq_len,
_info.y_stride_b, _info.y_stride_p, _info.x_stride_b, _info.x_stride_p,
_info.y_stride_0, _info.y_stride_1, _info.x_stride_0, _info.x_stride_1, stream));
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) {
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_2048>(
y, x, _info.x_dtype, _info.y_dtype, _info.batch_size, _info.probs_size, _info.ndim, _info.seq_len,
_info.y_stride_b, _info.y_stride_p, _info.x_stride_b, _info.x_stride_p,
_info.y_stride_0, _info.y_stride_1, _info.x_stride_0, _info.x_stride_1, stream));
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(
y, x, _info.x_dtype, _info.y_dtype, _info.batch_size, _info.probs_size, _info.ndim, _info.seq_len,
......
......@@ -40,7 +40,7 @@ __C infiniStatus_t infiniopCreateLogSoftmaxDescriptor(
CREATE(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
// CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, nvidia);
......@@ -73,7 +73,7 @@ __C infiniStatus_t infiniopGetLogSoftmaxWorkspaceSize(infiniopLogSoftmaxDescript
GET(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
// GET(INFINI_DEVICE_ILUVATAR, nvidia);
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
GET(INFINI_DEVICE_QY, nvidia);
......@@ -111,7 +111,7 @@ __C infiniStatus_t infiniopLogSoftmax(
CALCULATE(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
// CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
CALCULATE(INFINI_DEVICE_QY, nvidia);
......@@ -144,7 +144,7 @@ __C infiniStatus_t infiniopDestroyLogSoftmaxDescriptor(infiniopLogSoftmaxDescrip
DESTROY(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
// DESTROY(INFINI_DEVICE_ILUVATAR, nvidia);
DESTROY(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
DESTROY(INFINI_DEVICE_QY, nvidia);
......
......@@ -155,6 +155,8 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
CALCULATE_LP_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024)
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) {
CALCULATE_LP_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512)
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) {
CALCULATE_LP_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_2048)
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
CALCULATE_LP_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096)
} else {
......
......@@ -16,17 +16,66 @@ struct OnlineSoftmaxState {
}
};
// Sum-reduce `x` across the 32 lanes of the calling (logical) warp.
__device__ __forceinline__ float warpReduceSum(float x) {
#if defined(ENABLE_ILUVATAR_API)
    // Iluvatar may use warp size 64; __shfl_sync(0xffffffff) only covers 32 threads.
    // Use shared-memory tree reduce for portability across warp sizes.
    // NOTE(review): this path assumes a 1-D thread block with
    // blockDim.x <= kMaxWarps * 32 (= 512); a wider block would write past the
    // end of _reduce_buf — confirm no caller launches larger blocks.
    constexpr int kMaxWarps = 16;
    __shared__ float _reduce_buf[kMaxWarps * 32];
    const int lane = threadIdx.x & 31;    // lane within the 32-wide logical warp
    const int warp_id = threadIdx.x / 32; // which 32-wide group this thread is in
    _reduce_buf[threadIdx.x] = x;
    __syncthreads();
    // Pairwise tree reduction inside each 32-slot segment; after 5 rounds,
    // slot 0 of the segment holds the group's total.
    // NOTE(review): the __syncthreads() below requires every thread of the
    // block to reach this function together (no divergent call sites) — verify.
    for (int offset = 16; offset > 0; offset >>= 1) {
        if (lane < offset) {
            _reduce_buf[warp_id * 32 + lane] += _reduce_buf[warp_id * 32 + lane + offset];
        }
        __syncthreads();
    }
    // Every lane of the group returns the full sum (read from slot 0).
    return _reduce_buf[warp_id * 32];
#else
    // Shuffle-down tree reduce. Unlike the branch above, only lane 0 ends up
    // holding the complete sum (other lanes hold partial sums), so callers
    // needing the value on all lanes must broadcast from lane 0 afterwards.
    for (int offset = 16; offset > 0; offset >>= 1) {
        x += __shfl_down_sync(0xffffffff, x, offset);
    }
    return x;
#endif
}
// Broadcast the value held by lane `src_lane` of the calling 32-wide logical
// warp to every lane of that warp; returns the broadcast value.
__device__ __forceinline__ float warpBroadcast(float x, int src_lane) {
#if defined(ENABLE_ILUVATAR_API)
    // Shared-memory fallback for warp-size-64 hardware: one slot per 32-wide
    // group, so this supports at most 16 groups (blockDim.x <= 512).
    // NOTE(review): __syncthreads() requires all threads of the block to call
    // this function together — confirm no divergent call sites.
    __shared__ float _bcast_buf[16];
    const int warp_id = threadIdx.x / 32;
    if ((threadIdx.x & 31) == src_lane) {
        _bcast_buf[warp_id] = x;
    }
    __syncthreads();
    // NOTE(review): there is no barrier after this read, so a back-to-back
    // second call can overwrite the slot while lanes of the same group are
    // still reading the previous value — this relies on lockstep execution
    // within the 32-wide group; verify that holds on the target hardware.
    return _bcast_buf[warp_id];
#else
    return __shfl_sync(0xffffffff, x, src_lane);
#endif
}
// Max-reduce `x` across the 32 lanes of the calling (logical) warp.
__device__ __forceinline__ float warpReduceMax(float x) {
#if defined(ENABLE_ILUVATAR_API)
    // Shared-memory tree reduce for warp-size-64 hardware (mirrors
    // warpReduceSum above, with fmaxf as the combiner).
    // NOTE(review): assumes a 1-D block with blockDim.x <= 16 * 32 (= 512)
    // and that all threads of the block reach this function together
    // (the __syncthreads() below is block-wide) — verify both at call sites.
    __shared__ float _reduce_buf[16 * 32];
    const int lane = threadIdx.x & 31;    // lane within the 32-wide logical warp
    const int warp_id = threadIdx.x / 32; // which 32-wide group this thread is in
    _reduce_buf[threadIdx.x] = x;
    __syncthreads();
    // Pairwise max tree inside each 32-slot segment; slot 0 ends with the max.
    for (int offset = 16; offset > 0; offset >>= 1) {
        if (lane < offset) {
            float other = _reduce_buf[warp_id * 32 + lane + offset];
            float cur = _reduce_buf[warp_id * 32 + lane];
            _reduce_buf[warp_id * 32 + lane] = fmaxf(cur, other);
        }
        __syncthreads();
    }
    // Every lane of the group returns the full max (read from slot 0).
    return _reduce_buf[warp_id * 32];
#else
    // Shuffle-down tree reduce; as with warpReduceSum, only lane 0 is
    // guaranteed to hold the complete maximum after the loop.
    for (int offset = 16; offset > 0; offset >>= 1) {
        x = fmaxf(x, __shfl_down_sync(0xffffffff, x, offset));
    }
    return x;
#endif
}
__device__ __forceinline__ unsigned int cvtaToShared(const void *ptr) {
......
#ifndef __PAGED_ATTENTION_PREFILL_KERNEL_V2_CUH__
#define __PAGED_ATTENTION_PREFILL_KERNEL_V2_CUH__
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ALI_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ALI_API) || defined(ENABLE_ILUVATAR_API)
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
......@@ -194,8 +194,8 @@ __device__ void PagedAttentionPrefillWarpKernel(
l = l * alpha + beta;
m = m_new;
}
alpha = __shfl_sync(0xffffffff, alpha, 0);
beta = __shfl_sync(0xffffffff, beta, 0);
alpha = op::paged_attention::cuda::warpBroadcast(alpha, 0);
beta = op::paged_attention::cuda::warpBroadcast(beta, 0);
#if defined(__CUDA_ARCH__)
if constexpr (std::is_same_v<Tdata, half>) {
......@@ -233,7 +233,7 @@ __device__ void PagedAttentionPrefillWarpKernel(
if (lane == 0) {
inv_l = 1.0f / (l + 1e-6f);
}
inv_l = __shfl_sync(0xffffffff, inv_l, 0);
inv_l = op::paged_attention::cuda::warpBroadcast(inv_l, 0);
#pragma unroll
for (int i = 0; i < DIMS_PER_THREAD; ++i) {
......@@ -411,8 +411,8 @@ __global__ void PagedAttentionPrefillWarpGlobalKernel(
l = l * alpha + beta;
m = m_new;
}
alpha = __shfl_sync(0xffffffff, alpha, 0);
beta = __shfl_sync(0xffffffff, beta, 0);
alpha = op::paged_attention::cuda::warpBroadcast(alpha, 0);
beta = op::paged_attention::cuda::warpBroadcast(beta, 0);
#if defined(__CUDA_ARCH__)
if constexpr (std::is_same_v<Tdata, half>) {
......@@ -450,7 +450,11 @@ __global__ void PagedAttentionPrefillWarpGlobalKernel(
if (lane == 0) {
inv_l = 1.0f / (l + 1e-6f);
}
#ifdef ENABLE_ILUVATAR_API
inv_l = op::paged_attention::cuda::warpBroadcast(inv_l, 0);
#else
inv_l = __shfl_sync(0xffffffff, inv_l, 0);
#endif
#pragma unroll
for (int i = 0; i < DIMS_PER_THREAD; ++i) {
......@@ -785,8 +789,8 @@ __device__ void PagedAttentionPrefillWarpCtaKernel(
l = l * alpha + beta;
m = m_new;
}
alpha = __shfl_sync(0xffffffff, alpha, 0);
beta = __shfl_sync(0xffffffff, beta, 0);
alpha = op::paged_attention::cuda::warpBroadcast(alpha, 0);
beta = op::paged_attention::cuda::warpBroadcast(beta, 0);
#if defined(__CUDA_ARCH__)
if constexpr (std::is_same_v<Tdata, half>) {
......@@ -826,7 +830,7 @@ __device__ void PagedAttentionPrefillWarpCtaKernel(
if (lane == 0) {
inv_l = 1.0f / (l + 1e-6f);
}
inv_l = __shfl_sync(0xffffffff, inv_l, 0);
inv_l = op::paged_attention::cuda::warpBroadcast(inv_l, 0);
#pragma unroll
for (int i = 0; i < DIMS_PER_THREAD; ++i) {
......@@ -1270,7 +1274,7 @@ __device__ void PagedAttentionPrefillWarpCtaKernelPipelined(
if (lane == 0) {
inv_l = 1.0f / (l + 1e-6f);
}
inv_l = __shfl_sync(0xffffffff, inv_l, 0);
inv_l = op::paged_attention::cuda::warpBroadcast(inv_l, 0);
#pragma unroll
for (int i = 0; i < DIMS_PER_THREAD; ++i) {
......@@ -1961,8 +1965,8 @@ __device__ void PagedAttentionPrefillWarpCtaKernelKOnly(
l = l * alpha + beta;
m = m_new;
}
alpha = __shfl_sync(0xffffffff, alpha, 0);
beta = __shfl_sync(0xffffffff, beta, 0);
alpha = op::paged_attention::cuda::warpBroadcast(alpha, 0);
beta = op::paged_attention::cuda::warpBroadcast(beta, 0);
#if defined(__CUDA_ARCH__)
if constexpr (std::is_same_v<Tdata, half>) {
......@@ -2002,7 +2006,7 @@ __device__ void PagedAttentionPrefillWarpCtaKernelKOnly(
if (lane == 0) {
inv_l = 1.0f / (l + 1e-6f);
}
inv_l = __shfl_sync(0xffffffff, inv_l, 0);
inv_l = op::paged_attention::cuda::warpBroadcast(inv_l, 0);
#pragma unroll
for (int i = 0; i < DIMS_PER_THREAD; ++i) {
......@@ -2131,7 +2135,7 @@ __device__ __forceinline__ void PagedAttentionPrefillMmaScoreWriteRow(
if (lane == 0) {
inv_l = 1.0f / (l + 1e-6f);
}
inv_l = __shfl_sync(0xffffffff, inv_l, 0);
inv_l = op::paged_attention::cuda::warpBroadcast(inv_l, 0);
const int64_t q_token = q_start + static_cast<int64_t>(q_token_local);
half *out_ptr = out_ + q_token * o_stride + static_cast<int64_t>(head_idx) * o_head_stride;
......
......@@ -21,6 +21,11 @@ constexpr size_t ceilDiv(size_t a, size_t b) {
}
inline const char *default_prefill_kernel(const PagedAttentionPrefillInfo &info) {
// Iluvatar: use warp (stable). Users can override via INFINIOP_FLASH_PREFILL_KERNEL.
#ifdef ENABLE_ILUVATAR_API
(void)info;
return "warp";
#endif
// Heuristic auto-dispatch (v0.4):
// - Prefer the pipelined + tile-wise softmax kernel on FA2-compatible block_size=256.
// - Keep a conservative fallback for other shapes / older GPUs (cp.async is a no-op below SM80).
......
......@@ -8,8 +8,8 @@
#define ARRAY_TYPE_SIZE size_t
// 与 DEFINE_KERNELS_BY_CONSTRAINT 耦合,需要同时修改
#define MAX_BLOCK_ARRAY_SIZE 5
#define MAX_GRID_ARRAY_SIZE 5
#define MAX_BLOCK_ARRAY_SIZE 6
#define MAX_GRID_ARRAY_SIZE 6
template <int ArrSize, typename ArrayType>
struct ArrayStruct {
......@@ -185,32 +185,43 @@ struct Constraint {
DEFINE_REARRANGE_KERNEL(double4, constraint_num, block_array_size, grid_array_size)
// 与 MAX_BLOCK_ARRAY_SIZE 和 MAX_GRID_ARRAY_SIZE 耦合,需要同时修改
// 为1-5和1-5的所有组合生成内核
// 为1-6和1-6的所有组合生成内核
DEFINE_KERNELS_BY_CONSTRAINT(1, 1)
DEFINE_KERNELS_BY_CONSTRAINT(1, 2)
DEFINE_KERNELS_BY_CONSTRAINT(1, 3)
DEFINE_KERNELS_BY_CONSTRAINT(1, 4)
DEFINE_KERNELS_BY_CONSTRAINT(1, 5)
DEFINE_KERNELS_BY_CONSTRAINT(1, 6)
DEFINE_KERNELS_BY_CONSTRAINT(2, 1)
DEFINE_KERNELS_BY_CONSTRAINT(2, 2)
DEFINE_KERNELS_BY_CONSTRAINT(2, 3)
DEFINE_KERNELS_BY_CONSTRAINT(2, 4)
DEFINE_KERNELS_BY_CONSTRAINT(2, 5)
DEFINE_KERNELS_BY_CONSTRAINT(2, 6)
DEFINE_KERNELS_BY_CONSTRAINT(3, 1)
DEFINE_KERNELS_BY_CONSTRAINT(3, 2)
DEFINE_KERNELS_BY_CONSTRAINT(3, 3)
DEFINE_KERNELS_BY_CONSTRAINT(3, 4)
DEFINE_KERNELS_BY_CONSTRAINT(3, 5)
DEFINE_KERNELS_BY_CONSTRAINT(3, 6)
DEFINE_KERNELS_BY_CONSTRAINT(4, 1)
DEFINE_KERNELS_BY_CONSTRAINT(4, 2)
DEFINE_KERNELS_BY_CONSTRAINT(4, 3)
DEFINE_KERNELS_BY_CONSTRAINT(4, 4)
DEFINE_KERNELS_BY_CONSTRAINT(4, 5)
DEFINE_KERNELS_BY_CONSTRAINT(4, 6)
DEFINE_KERNELS_BY_CONSTRAINT(5, 1)
DEFINE_KERNELS_BY_CONSTRAINT(5, 2)
DEFINE_KERNELS_BY_CONSTRAINT(5, 3)
DEFINE_KERNELS_BY_CONSTRAINT(5, 4)
DEFINE_KERNELS_BY_CONSTRAINT(5, 5)
DEFINE_KERNELS_BY_CONSTRAINT(5, 6)
DEFINE_KERNELS_BY_CONSTRAINT(6, 1)
DEFINE_KERNELS_BY_CONSTRAINT(6, 2)
DEFINE_KERNELS_BY_CONSTRAINT(6, 3)
DEFINE_KERNELS_BY_CONSTRAINT(6, 4)
DEFINE_KERNELS_BY_CONSTRAINT(6, 5)
DEFINE_KERNELS_BY_CONSTRAINT(6, 6)
// 准备参数结构体
struct RearrangeParams {
......@@ -294,6 +305,9 @@ utils::Result<void *> getRearrangeKernel(const RearrangeParams &params) {
case 5: \
GET_REARRANGE_KERNEL_BY_CONSTRAINT(block_array_size, 5); \
break; \
case 6: \
GET_REARRANGE_KERNEL_BY_CONSTRAINT(block_array_size, 6); \
break; \
}
#define GET_REARRANGE_KERNEL_BY_BLOCK_NUM \
......@@ -313,6 +327,9 @@ utils::Result<void *> getRearrangeKernel(const RearrangeParams &params) {
case 5: \
GET_REARRANGE_KERNEL_BY_GRID_NUM(5); \
break; \
case 6: \
GET_REARRANGE_KERNEL_BY_GRID_NUM(6); \
break; \
}
GET_REARRANGE_KERNEL_BY_BLOCK_NUM
......
......@@ -64,6 +64,7 @@ infiniStatus_t Descriptor::create(
return INFINI_STATUS_SUCCESS;
}
#ifdef ENABLE_QY_API
template <unsigned int BLOCK_SIZE, typename Tdata>
infiniStatus_t Descriptor::launchKernel(const I8GemmInfo &info, Tdata *y, const Tdata *bias, const int8_t *x_packed, const float *x_scale, const int8_t *w_packed, const float *w_scale, void *stream_, void *workspace) const {
cudaStream_t stream = (cudaStream_t)stream_;
......@@ -112,6 +113,7 @@ infiniStatus_t Descriptor::launchKernel(const I8GemmInfo &info, Tdata *y, const
return INFINI_STATUS_SUCCESS;
}
#endif
infiniStatus_t Descriptor::calculate(
void *workspace,
......
......@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#include "cpu/sigmoid_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API) || defined(ENABLE_ILUVATAR_API)
#include "nvidia/sigmoid_nvidia.cuh"
#endif
......@@ -37,6 +37,9 @@ __C infiniStatus_t infiniopCreateSigmoidDescriptor(
#ifdef ENABLE_ALI_API
CREATE(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -65,7 +68,9 @@ __C infiniStatus_t infiniopGetSigmoidWorkspaceSize(infiniopSigmoidDescriptor_t d
#ifdef ENABLE_ALI_API
GET(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
......@@ -101,7 +106,9 @@ __C infiniStatus_t infiniopSigmoid(
#ifdef ENABLE_ALI_API
CALCULATE(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
......@@ -131,7 +138,9 @@ infiniopDestroySigmoidDescriptor(infiniopSigmoidDescriptor_t desc) {
#ifdef ENABLE_ALI_API
DELETE(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
......
......@@ -128,6 +128,9 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(
y, x, _info.dtype, _info.othersize, _info.dimsize, _info.stride, stream));
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) {
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_2048>(
y, x, _info.dtype, _info.othersize, _info.dimsize, _info.stride, stream));
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
......
......@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#include "cpu/topksoftmax_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API) || defined(ENABLE_ILUVATAR_API)
#include "nvidia/topksoftmax_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
......@@ -36,6 +36,9 @@ __C infiniStatus_t infiniopCreateTopksoftmaxDescriptor(infiniopHandle_t handle,
#endif
#ifdef ENABLE_ALI_API
CREATE(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
}
......@@ -66,6 +69,9 @@ __C infiniStatus_t infiniopGetTopksoftmaxWorkspaceSize(infiniopTopksoftmaxDescri
#endif
#ifdef ENABLE_ALI_API
GET(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
}
......@@ -101,6 +107,9 @@ __C infiniStatus_t infiniopTopksoftmax(infiniopTopksoftmaxDescriptor_t desc, voi
#endif
#ifdef ENABLE_ALI_API
CALCULATE(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
}
......@@ -131,6 +140,9 @@ __C infiniStatus_t infiniopDestroyTopksoftmaxDescriptor(infiniopTopksoftmaxDescr
#endif
#ifdef ENABLE_ALI_API
DESTROY(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
DESTROY(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
}
......
......@@ -3,8 +3,8 @@ import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
import infinicore
import torch
from framework import BaseOperatorTest, TensorSpec, TestCase, GenericTestRunner
# Test cases format: (in_shape, in_strides_or_None)
......
......@@ -3,8 +3,8 @@ import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
import infinicore
import torch
from framework import (
BaseOperatorTest,
TensorSpec,
......
......@@ -3,8 +3,8 @@ import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
import infinicore
import torch
from framework import (
BaseOperatorTest,
TensorSpec,
......
......@@ -3,8 +3,8 @@ import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
import infinicore
import torch
from framework import (
BaseOperatorTest,
TensorSpec,
......
......@@ -3,8 +3,8 @@ import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
import infinicore
import torch
from framework import (
BaseOperatorTest,
TensorSpec,
......
......@@ -3,8 +3,8 @@ import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
import infinicore
import torch
from framework import (
BaseOperatorTest,
TensorSpec,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment