Commit cb7f0b7d authored by wooway777

Revert "Merge pull request #1056 from InfiniTensor/issue/1031"

This reverts commit 7f295448, reversing
changes made to e60985dc.
parent 037140c0
#ifndef __CROSS_ENTROPY_KERNEL_CUH__
#define __CROSS_ENTROPY_KERNEL_CUH__
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../reduce/cuda/reduce.cuh"
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tidx, typename Tcompute = float>
__device__ void crossEntropyKernel(
Tdata *y_,
const Tdata *x_,
const void *target_,
size_t outer_size,
size_t vocab_size,
ptrdiff_t x_stride) {
size_t row_idx = blockIdx.x;
if (row_idx >= outer_size) {
return;
}
const Tdata *x = x_ + row_idx * x_stride;
const Tidx *target = reinterpret_cast<const Tidx *>(target_);
Tidx label = target[row_idx];
Tdata max_val_raw = op::common_cuda::reduce_op::max<BLOCK_SIZE, Tdata>(x, vocab_size);
__shared__ Tcompute max_val_shared;
if (threadIdx.x == 0) {
max_val_shared = static_cast<Tcompute>(max_val_raw);
}
__syncthreads();
Tcompute max_val = max_val_shared;
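// Each thread accumulates exp(x - max) over a strided slice of the row.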
Tcompute thread_sum = 0.0f;
for (size_t col = threadIdx.x; col < vocab_size; col += BLOCK_SIZE) {
Tcompute val = static_cast<Tcompute>(x[col]);
thread_sum += expf(val - max_val);
}
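// Warp-level tree reduction of the partial sums via shuffle-down.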
for (int offset = warpSize / 2; offset > 0; offset /= 2) {
thread_sum += __shfl_down_sync(0xffffffff, thread_sum, offset);
}
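// One partial sum per warp (at most 32 warps for BLOCK_SIZE <= 1024); warp 0 combines them.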
static __shared__ Tcompute shared_sum[32];
int lane = threadIdx.x % warpSize;
int warp = threadIdx.x / warpSize;
if (lane == 0) {
shared_sum[warp] = thread_sum;
}
__syncthreads();
Tcompute block_sum = 0.0f;
if (warp == 0) {
if (lane < (BLOCK_SIZE + warpSize - 1) / warpSize) {
block_sum = shared_sum[lane];
}
for (int offset = warpSize / 2; offset > 0; offset /= 2) {
block_sum += __shfl_down_sync(0xffffffff, block_sum, offset);
}
}
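// Thread 0 writes loss = log(sum_j exp(x_j - max)) + max - x[label];
// out-of-range labels produce a loss of 0.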
if (threadIdx.x == 0) {
Tcompute log_term = logf(block_sum) + max_val;
Tcompute target_logit = 0.0f;
if (label >= 0 && static_cast<size_t>(label) < vocab_size) {
target_logit = static_cast<Tcompute>(x[label]);
} else {
log_term = 0.0f;
}
y_[row_idx] = static_cast<Tdata>(log_term - target_logit);
}
}
#endif
#ifndef CROSS_ENTROPY_INFO_H
#define CROSS_ENTROPY_INFO_H
#include "../../../utils.h"
#include "../../tensor.h"
#include <vector>
#include <cstddef>
struct CrossEntropyInfo {
int dtype;
int target_dtype;
size_t outer_size;
size_t vocab_size;
ptrdiff_t x_stride;
};
#endif
#ifndef __CROSS_ENTROPY_METAX_H__
#define __CROSS_ENTROPY_METAX_H__
#include "../cross_entropy.h"
DESCRIPTOR(metax)
#endif // __CROSS_ENTROPY_METAX_H__
#include "../../../devices/metax/metax_common.h"
#include "cross_entropy_metax.h"
#include "../../../devices/metax/metax_kernel_common.h"
#include <cub/block/block_reduce.cuh>
#include "../../../reduce/cuda/reduce.cuh"
#include <cmath>
namespace {
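// Per-row cross-entropy for the Metax backend; same math as the CUDA kernel,
// with the block-wide sum done by cub::BlockReduce instead of manual shuffles.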
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tidx, typename Tcompute>
__device__ void crossEntropyKernel(
Tdata *y_,
const Tdata *x_,
const void *target_,
size_t outer_size,
size_t vocab_size,
ptrdiff_t x_stride) {
size_t row_idx = blockIdx.x;
if (row_idx >= outer_size) {
return;
}
const Tdata *x = x_ + row_idx * x_stride;
const Tidx *target = reinterpret_cast<const Tidx *>(target_);
Tidx label = target[row_idx];
Tdata max_val_raw = op::common_cuda::reduce_op::max<BLOCK_SIZE, Tdata>(x, vocab_size);
__shared__ Tcompute max_val_shared;
if (threadIdx.x == 0) {
max_val_shared = static_cast<Tcompute>(max_val_raw);
}
__syncthreads();
Tcompute max_val = max_val_shared;
Tcompute thread_sum = Tcompute(0);
for (size_t col = threadIdx.x; col < vocab_size; col += BLOCK_SIZE) {
Tcompute val = static_cast<Tcompute>(x[col]);
thread_sum += expf(val - max_val);
}
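// CUB performs the block-wide reduction of the per-thread partial sums.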
using BlockReduce = cub::BlockReduce<Tcompute, BLOCK_SIZE>;
__shared__ typename BlockReduce::TempStorage temp_storage;
Tcompute block_sum = BlockReduce(temp_storage).Sum(thread_sum);
if (threadIdx.x == 0) {
if (label < 0 || static_cast<size_t>(label) >= vocab_size) {
y_[row_idx] = static_cast<Tdata>(0.0f);
return;
}
Tcompute log_term = logf(block_sum) + max_val;
Tcompute target_logit = static_cast<Tcompute>(x[label]);
y_[row_idx] = static_cast<Tdata>(log_term - target_logit);
}
}
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tidx, typename Tcompute>
INFINIOP_METAX_KERNEL crossEntropy(
Tdata *y, const Tdata *x, const void *target,
size_t outer_size, size_t vocab_size, ptrdiff_t x_stride) {
crossEntropyKernel<BLOCK_SIZE, Tdata, Tidx, Tcompute>(
y, x, target, outer_size, vocab_size, x_stride);
}
} // namespace
namespace op::cross_entropy::metax {
struct Descriptor::Opaque {
std::shared_ptr<device::metax::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t target_desc) {
(void)y_desc;
auto x_dtype = x_desc->dtype();
auto t_dtype = target_desc->dtype();
CHECK_DTYPE(x_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32);
CHECK_DTYPE(t_dtype, INFINI_DTYPE_I32, INFINI_DTYPE_I64);
CrossEntropyInfo info{};
info.dtype = x_dtype;
info.target_dtype = t_dtype;
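// The last axis is the class dimension; rows are assumed contiguous (stride == vocab_size).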
info.vocab_size = x_desc->shape().back();
info.outer_size = target_desc->numel();
info.x_stride = static_cast<ptrdiff_t>(info.vocab_size);
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()},
info, 0, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(void *y, const void *x, const void *target,
const CrossEntropyInfo &info, hcStream_t stream) {
dim3 grid(static_cast<uint32_t>(info.outer_size), 1, 1);
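// One block per row; dispatch on the target index dtype, then on the logits dtype.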
if (info.target_dtype == INFINI_DTYPE_I64) {
if (info.dtype == INFINI_DTYPE_F16) {
crossEntropy<BLOCK_SIZE, half, int64_t, float>
<<<grid, BLOCK_SIZE, 0, stream>>>(
(half *)y, (const half *)x, target,
info.outer_size, info.vocab_size, info.x_stride);
} else if (info.dtype == INFINI_DTYPE_BF16) {
crossEntropy<BLOCK_SIZE, cuda_bfloat16, int64_t, float>
<<<grid, BLOCK_SIZE, 0, stream>>>(
(cuda_bfloat16 *)y, (const cuda_bfloat16 *)x, target,
info.outer_size, info.vocab_size, info.x_stride);
} else if (info.dtype == INFINI_DTYPE_F32) {
crossEntropy<BLOCK_SIZE, float, int64_t, float>
<<<grid, BLOCK_SIZE, 0, stream>>>(
(float *)y, (const float *)x, target,
info.outer_size, info.vocab_size, info.x_stride);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} else if (info.target_dtype == INFINI_DTYPE_I32) {
if (info.dtype == INFINI_DTYPE_F16) {
crossEntropy<BLOCK_SIZE, half, int32_t, float>
<<<grid, BLOCK_SIZE, 0, stream>>>(
(half *)y, (const half *)x, target,
info.outer_size, info.vocab_size, info.x_stride);
} else if (info.dtype == INFINI_DTYPE_BF16) {
crossEntropy<BLOCK_SIZE, cuda_bfloat16, int32_t, float>
<<<grid, BLOCK_SIZE, 0, stream>>>(
(cuda_bfloat16 *)y, (const cuda_bfloat16 *)x, target,
info.outer_size, info.vocab_size, info.x_stride);
} else if (info.dtype == INFINI_DTYPE_F32) {
crossEntropy<BLOCK_SIZE, float, int32_t, float>
<<<grid, BLOCK_SIZE, 0, stream>>>(
(float *)y, (const float *)x, target,
info.outer_size, info.vocab_size, info.x_stride);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *y,
const void *x,
const void *target,
void *stream_) const {
(void)workspace;
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
auto stream = reinterpret_cast<hcStream_t>(stream_);
int max_threads = _opaque->internal->maxThreadsPerBlock();
if (max_threads >= METAX_BLOCK_SIZE_1024) {
CHECK_STATUS(launchKernel<METAX_BLOCK_SIZE_1024>(y, x, target, _info, stream));
} else if (max_threads >= METAX_BLOCK_SIZE_512) {
CHECK_STATUS(launchKernel<METAX_BLOCK_SIZE_512>(y, x, target, _info, stream));
} else {
CHECK_STATUS(launchKernel<256>(y, x, target, _info, stream));
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::cross_entropy::metax
#ifndef __CROSS_ENTROPY_KERNEL_H__
#define __CROSS_ENTROPY_KERNEL_H__
#include <cub/block/block_reduce.cuh>
#include "../../../reduce/cuda/reduce.cuh"
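// Shared per-row cross-entropy device kernel for the Moore backend;
// the math matches the CUDA and Metax kernels above.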
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tidx, typename Tcompute>
__device__ void crossEntropyKernel(
Tdata *y_,
const Tdata *x_,
const void *target_,
size_t outer_size,
size_t vocab_size,
ptrdiff_t x_stride) {
size_t row_idx = blockIdx.x;
if (row_idx >= outer_size) {
return;
}
const Tdata *x = x_ + row_idx * x_stride;
const Tidx *target = reinterpret_cast<const Tidx *>(target_);
Tidx label = target[row_idx];
Tdata max_val_raw = op::common_cuda::reduce_op::max<BLOCK_SIZE, Tdata>(x, vocab_size);
__shared__ Tcompute max_val_shared;
if (threadIdx.x == 0) {
max_val_shared = static_cast<Tcompute>(max_val_raw);
}
__syncthreads();
Tcompute max_val = max_val_shared;
Tcompute thread_sum = Tcompute(0);
for (size_t col = threadIdx.x; col < vocab_size; col += BLOCK_SIZE) {
Tcompute val = static_cast<Tcompute>(x[col]);
thread_sum += expf(val - max_val);
}
using BlockReduce = cub::BlockReduce<Tcompute, BLOCK_SIZE>;
__shared__ typename BlockReduce::TempStorage temp_storage;
Tcompute block_sum = BlockReduce(temp_storage).Sum(thread_sum);
if (threadIdx.x == 0) {
if (label < 0 || static_cast<size_t>(label) >= vocab_size) {
y_[row_idx] = static_cast<Tdata>(0.0f);
return;
}
Tcompute log_term = logf(block_sum) + max_val;
Tcompute target_logit = static_cast<Tcompute>(x[label]);
y_[row_idx] = static_cast<Tdata>(log_term - target_logit);
}
}
#endif
#ifndef __CROSS_ENTROPY_MOORE_H__
#define __CROSS_ENTROPY_MOORE_H__
#include "../cross_entropy.h"
DESCRIPTOR(moore)
#endif
#include "../../../devices/moore/moore_common.h"
#include "cross_entropy_moore.h"
#include <cub/block/block_reduce.cuh>
#include "../../../devices/moore/moore_kernel_common.h"
#include "../../../reduce/cuda/reduce.cuh"
#include "cross_entropy_kernel.h"
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tidx, typename Tcompute>
INFINIOP_MOORE_KERNEL crossEntropy(
Tdata *y, const Tdata *x, const void *target,
size_t outer_size, size_t vocab_size, ptrdiff_t x_stride) {
crossEntropyKernel<BLOCK_SIZE, Tdata, Tidx, Tcompute>(
y, x, target, outer_size, vocab_size, x_stride);
}
namespace op::cross_entropy::moore {
struct Descriptor::Opaque {
std::shared_ptr<device::moore::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t target_desc) {
(void)y_desc;
auto x_dtype = x_desc->dtype();
auto t_dtype = target_desc->dtype();
CHECK_DTYPE(x_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32);
CHECK_DTYPE(t_dtype, INFINI_DTYPE_I32, INFINI_DTYPE_I64);
CrossEntropyInfo info{};
info.dtype = x_dtype;
info.target_dtype = t_dtype;
info.vocab_size = x_desc->shape().back();
info.outer_size = target_desc->numel();
info.x_stride = static_cast<ptrdiff_t>(info.vocab_size);
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::moore::Handle *>(handle)->internal()},
info, 0, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(void *y, const void *x, const void *target,
const CrossEntropyInfo &info, musaStream_t stream) {
dim3 grid(static_cast<uint32_t>(info.outer_size), 1, 1);
if (info.target_dtype == INFINI_DTYPE_I64) {
if (info.dtype == INFINI_DTYPE_F16) {
crossEntropy<BLOCK_SIZE, half, int64_t, float>
<<<grid, BLOCK_SIZE, 0, stream>>>(
(half *)y, (const half *)x, target,
info.outer_size, info.vocab_size, info.x_stride);
} else if (info.dtype == INFINI_DTYPE_BF16) {
crossEntropy<BLOCK_SIZE, __mt_bfloat16, int64_t, float>
<<<grid, BLOCK_SIZE, 0, stream>>>(
(__mt_bfloat16 *)y, (const __mt_bfloat16 *)x, target,
info.outer_size, info.vocab_size, info.x_stride);
} else if (info.dtype == INFINI_DTYPE_F32) {
crossEntropy<BLOCK_SIZE, float, int64_t, float>
<<<grid, BLOCK_SIZE, 0, stream>>>(
(float *)y, (const float *)x, target,
info.outer_size, info.vocab_size, info.x_stride);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} else if (info.target_dtype == INFINI_DTYPE_I32) {
if (info.dtype == INFINI_DTYPE_F16) {
crossEntropy<BLOCK_SIZE, half, int32_t, float>
<<<grid, BLOCK_SIZE, 0, stream>>>(
(half *)y, (const half *)x, target,
info.outer_size, info.vocab_size, info.x_stride);
} else if (info.dtype == INFINI_DTYPE_BF16) {
crossEntropy<BLOCK_SIZE, __mt_bfloat16, int32_t, float>
<<<grid, BLOCK_SIZE, 0, stream>>>(
(__mt_bfloat16 *)y, (const __mt_bfloat16 *)x, target,
info.outer_size, info.vocab_size, info.x_stride);
} else if (info.dtype == INFINI_DTYPE_F32) {
crossEntropy<BLOCK_SIZE, float, int32_t, float>
<<<grid, BLOCK_SIZE, 0, stream>>>(
(float *)y, (const float *)x, target,
info.outer_size, info.vocab_size, info.x_stride);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
void *y,
const void *x,
const void *target,
void *stream_) const {
musaStream_t stream = (musaStream_t)stream_;
(void)workspace;
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
if (_opaque->internal->maxThreadsPerBlock() >= MOORE_BLOCK_SIZE_1024) {
CHECK_STATUS(launchKernel<MOORE_BLOCK_SIZE_1024>(y, x, target, _info, stream));
} else if (_opaque->internal->maxThreadsPerBlock() >= MOORE_BLOCK_SIZE_512) {
CHECK_STATUS(launchKernel<MOORE_BLOCK_SIZE_512>(y, x, target, _info, stream));
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::cross_entropy::moore
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include "../cuda/kernel.cuh"
#include "cross_entropy_nvidia.cuh"
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tidx, typename Tcompute = float>
INFINIOP_CUDA_KERNEL crossEntropy(
Tdata *y, const Tdata *x, const void *target,
size_t outer_size, size_t vocab_size, ptrdiff_t x_stride) {
crossEntropyKernel<BLOCK_SIZE, Tdata, Tidx, Tcompute>(
y, x, target, outer_size, vocab_size, x_stride);
}
namespace op::cross_entropy::nvidia {
struct Descriptor::Opaque {
std::shared_ptr<device::nvidia::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t target_desc) {
(void)y_desc;
auto x_dtype = x_desc->dtype();
auto t_dtype = target_desc->dtype();
CHECK_DTYPE(x_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32);
CHECK_DTYPE(t_dtype, INFINI_DTYPE_I32, INFINI_DTYPE_I64);
CrossEntropyInfo info{};
info.dtype = x_dtype;
info.target_dtype = t_dtype;
info.vocab_size = x_desc->shape().back();
info.outer_size = target_desc->numel();
info.x_stride = static_cast<ptrdiff_t>(info.vocab_size);
auto internal = reinterpret_cast<device::nvidia::Handle *>(handle)->internal();
*desc_ptr = new Descriptor(
new Opaque{internal},
info, 0, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(void *y, const void *x, const void *target,
const CrossEntropyInfo &info, cudaStream_t stream) {
dim3 grid(static_cast<uint32_t>(info.outer_size), 1, 1);
if (info.target_dtype == INFINI_DTYPE_I64) {
if (info.dtype == INFINI_DTYPE_F16) {
crossEntropy<BLOCK_SIZE, half, int64_t>
<<<grid, BLOCK_SIZE, 0, stream>>>((half *)y, (const half *)x, target, info.outer_size, info.vocab_size, info.x_stride);
} else if (info.dtype == INFINI_DTYPE_BF16) {
crossEntropy<BLOCK_SIZE, __nv_bfloat16, int64_t>
<<<grid, BLOCK_SIZE, 0, stream>>>((__nv_bfloat16 *)y, (const __nv_bfloat16 *)x, target, info.outer_size, info.vocab_size, info.x_stride);
} else if (info.dtype == INFINI_DTYPE_F32) {
crossEntropy<BLOCK_SIZE, float, int64_t>
<<<grid, BLOCK_SIZE, 0, stream>>>((float *)y, (const float *)x, target, info.outer_size, info.vocab_size, info.x_stride);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} else if (info.target_dtype == INFINI_DTYPE_I32) {
if (info.dtype == INFINI_DTYPE_F16) {
crossEntropy<BLOCK_SIZE, half, int32_t>
<<<grid, BLOCK_SIZE, 0, stream>>>((half *)y, (const half *)x, target, info.outer_size, info.vocab_size, info.x_stride);
} else if (info.dtype == INFINI_DTYPE_BF16) {
crossEntropy<BLOCK_SIZE, __nv_bfloat16, int32_t>
<<<grid, BLOCK_SIZE, 0, stream>>>((__nv_bfloat16 *)y, (const __nv_bfloat16 *)x, target, info.outer_size, info.vocab_size, info.x_stride);
} else if (info.dtype == INFINI_DTYPE_F32) {
crossEntropy<BLOCK_SIZE, float, int32_t>
<<<grid, BLOCK_SIZE, 0, stream>>>((float *)y, (const float *)x, target, info.outer_size, info.vocab_size, info.x_stride);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
void *y,
const void *x,
const void *target,
void *stream_) const {
cudaStream_t stream = (cudaStream_t)stream_;
(void)workspace;
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
int max_threads = _opaque->internal->maxThreadsPerBlock();
if (max_threads >= 1024) {
CHECK_STATUS(launchKernel<1024>(y, x, target, _info, stream));
} else if (max_threads >= 512) {
CHECK_STATUS(launchKernel<512>(y, x, target, _info, stream));
} else {
CHECK_STATUS(launchKernel<256>(y, x, target, _info, stream));
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::cross_entropy::nvidia
#ifndef __CROSS_ENTROPY_NVIDIA_H__
#define __CROSS_ENTROPY_NVIDIA_H__
#include "../cross_entropy.h"
DESCRIPTOR(nvidia)
#endif
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/cross_entropy.h"
#ifdef ENABLE_CPU_API
#include "cpu/cross_entropy_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
#include "nvidia/cross_entropy_nvidia.cuh"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/cross_entropy_moore.h"
#endif
#ifdef ENABLE_METAX_API
#include "metax/cross_entropy_metax.h"
#endif
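// C API entry points: each dispatches on the device type to the matching
// backend Descriptor via the macro-generated switch cases below.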
__INFINI_C infiniStatus_t infiniopCreateCrossEntropyDescriptor(
infiniopHandle_t handle,
infiniopCrossEntropyDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t target_desc) {
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::cross_entropy::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::cross_entropy::NAMESPACE::Descriptor **>(desc_ptr), \
y_desc, x_desc, target_desc);
switch (handle->device) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia)
#endif
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, nvidia)
#endif
#ifdef ENABLE_HYGON_API
CREATE(INFINI_DEVICE_HYGON, nvidia)
#endif
#ifdef ENABLE_MOORE_API
CREATE(INFINI_DEVICE_MOORE, moore)
#endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, metax)
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CREATE
}
__INFINI_C infiniStatus_t infiniopGetCrossEntropyWorkspaceSize(
infiniopCrossEntropyDescriptor_t desc, size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::cross_entropy::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia)
#endif
#ifdef ENABLE_QY_API
GET(INFINI_DEVICE_QY, nvidia)
#endif
#ifdef ENABLE_HYGON_API
GET(INFINI_DEVICE_HYGON, nvidia)
#endif
#ifdef ENABLE_MOORE_API
GET(INFINI_DEVICE_MOORE, moore)
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, metax)
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef GET
}
__INFINI_C infiniStatus_t infiniopCrossEntropy(
infiniopCrossEntropyDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
const void *x,
const void *target,
void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<op::cross_entropy::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, y, x, target, stream);
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia)
#endif
#ifdef ENABLE_QY_API
CALCULATE(INFINI_DEVICE_QY, nvidia)
#endif
#ifdef ENABLE_HYGON_API
CALCULATE(INFINI_DEVICE_HYGON, nvidia)
#endif
#ifdef ENABLE_MOORE_API
CALCULATE(INFINI_DEVICE_MOORE, moore)
#endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, metax)
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CALCULATE
}
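// Illustrative call sequence (a sketch, not part of this file): create the
// descriptor, query and allocate the workspace, run, then destroy. `handle`,
// the tensor descriptors, the device buffers, and `stream` are hypothetical
// placeholders supplied by the caller.
//
//   infiniopCrossEntropyDescriptor_t desc;
//   infiniopCreateCrossEntropyDescriptor(handle, &desc, y_desc, x_desc, target_desc);
//   size_t workspace_size = 0;
//   infiniopGetCrossEntropyWorkspaceSize(desc, &workspace_size);
//   // ... allocate `workspace` of at least workspace_size bytes ...
//   infiniopCrossEntropy(desc, workspace, workspace_size, y, x, target, stream);
//   infiniopDestroyCrossEntropyDescriptor(desc);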
__INFINI_C infiniStatus_t infiniopDestroyCrossEntropyDescriptor(
infiniopCrossEntropyDescriptor_t desc) {
#define DESTROY(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<op::cross_entropy::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
DESTROY(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_NVIDIA_API
DESTROY(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#ifdef ENABLE_ILUVATAR_API
DESTROY(INFINI_DEVICE_ILUVATAR, nvidia)
#endif
#ifdef ENABLE_QY_API
DESTROY(INFINI_DEVICE_QY, nvidia)
#endif
#ifdef ENABLE_HYGON_API
DESTROY(INFINI_DEVICE_HYGON, nvidia)
#endif
#ifdef ENABLE_MOORE_API
DESTROY(INFINI_DEVICE_MOORE, moore)
#endif
#ifdef ENABLE_METAX_API
DESTROY(INFINI_DEVICE_METAX, metax)
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef DESTROY
}
#include <cstdint>
#include <type_traits>
#include "equal_cpu.h"
namespace op::equal::cpu {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
const auto &a_desc = input_desc_vec.at(0);
const auto &b_desc = input_desc_vec.at(1);
auto compute_dtype = a_desc->dtype();
auto out_dtype = out_desc->dtype();
if (compute_dtype != b_desc->dtype()) {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL);
CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64,
INFINI_DTYPE_BF16, INFINI_DTYPE_I32, INFINI_DTYPE_I64);
const auto &c_shape = out_desc->shape();
const auto &a_shape = a_desc->shape();
const auto &b_shape = b_desc->shape();
CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<EqualOp, bool, fp16_t, fp16_t>(_info, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<EqualOp, bool, float, float>(_info, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<EqualOp, bool, double, double>(_info, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<EqualOp, bool, bf16_t, bf16_t>(_info, output, inputs, stream);
case INFINI_DTYPE_I32:
return _device_info->calculate<EqualOp, bool, int32_t, int32_t>(_info, output, inputs, stream);
case INFINI_DTYPE_I64:
return _device_info->calculate<EqualOp, bool, int64_t, int64_t>(_info, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::equal::cpu
#ifndef __EQUAL_CPU_H__
#define __EQUAL_CPU_H__
#include <type_traits>
#include "../../../elementwise/cpu/elementwise_cpu.h"
ELEMENTWISE_DESCRIPTOR(equal, cpu)
namespace op::equal::cpu {
typedef struct EqualOp {
public:
static constexpr size_t num_inputs = 2;
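// Same-type inputs use operator==; mixed-type instantiations compare unequal by construction.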
template <typename Tout, typename Tin0, typename Tin1>
bool operator()(const Tin0 &a, const Tin1 &b) const {
if constexpr (std::is_same_v<Tin0, Tin1>) {
return a == b;
} else {
return false;
}
}
} EqualOp;
} // namespace op::equal::cpu
#endif
#ifndef __EQUAL_CUDA_H__
#define __EQUAL_CUDA_H__
#if defined(__MACACC__)
#include <maca_bfloat16.h>
#include <maca_fp16.h>
#else
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#endif
#include <type_traits>
namespace op::equal::cuda {
typedef struct EqualOp {
public:
static constexpr size_t num_inputs = 2;
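// half compares via __heq; packed half2 is rejected at compile time;
// mixed input types compare unequal by construction.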
template <typename Tout, typename Tin0, typename Tin1>
__device__ __forceinline__ bool operator()(const Tin0 &a, const Tin1 &b) const {
if constexpr (std::is_same_v<Tin0, Tin1>) {
if constexpr (std::is_same_v<Tin0, half2>) {
static_assert(!std::is_same_v<Tin0, half2>, "packed half2 inputs are not supported by EqualOp");
} else if constexpr (std::is_same_v<Tin0, half>) {
return static_cast<Tout>(__heq(a, b));
} else {
return static_cast<Tout>(a == b);
}
} else {
return false;
}
}
} EqualOp;
} // namespace op::equal::cuda
#endif
#ifndef __EQUAL_METAX_API_H__
#define __EQUAL_METAX_API_H__
#include "../../../elementwise/metax/elementwise_metax_api.h"
ELEMENTWISE_DESCRIPTOR(equal, metax)
#endif // __EQUAL_METAX_API_H__
#include "equal_metax.h"
#include "../../../elementwise/metax/elementwise_metax.h"
#include "../cuda/kernel.cuh"
namespace op::equal::metax {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
const auto &a_desc = input_desc_vec.at(0);
auto compute_dtype = a_desc->dtype();
auto out_dtype = out_desc->dtype();
const auto &b_desc = input_desc_vec.at(1);
const auto &c_shape = out_desc->shape();
const auto &a_shape = a_desc->shape();
const auto &b_shape = b_desc->shape();
CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16,
INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64);
CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL);
CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, cuda::EqualOp, bool, half, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::EqualOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, cuda::EqualOp, bool, float, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_I32:
return _device_info->calculate<256, cuda::EqualOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_I64:
return _device_info->calculate<256, cuda::EqualOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, cuda::EqualOp, bool, double, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
} // namespace op::equal::metax
#ifndef __EQUAL_MOORE_API_H__
#define __EQUAL_MOORE_API_H__
#include "../../../elementwise/moore/elementwise_moore_api.h"
ELEMENTWISE_DESCRIPTOR(equal, moore)
#endif // __EQUAL_MOORE_API_H__
#include "equal_moore.h"
#include "../../../elementwise/moore/elementwise_moore.h"
#include "equal_moore_kernel.h"
namespace op::equal::moore {
namespace {
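// Fast path: when the output and both inputs are contiguous and nothing is
// broadcast, a flat grid-stride kernel avoids the generic strided elementwise machinery.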
inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) {
if (!info.isOutputContiguous()) {
return false;
}
const bool *input_contiguous = info.getInputContiguous();
const bool *input_broadcasted = info.getInputBroadcasted();
for (size_t i = 0; i < 2; ++i) {
if (!input_contiguous[i] || input_broadcasted[i]) {
return false;
}
}
return true;
}
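// Grid-stride elementwise compare over `numel` contiguous elements.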
template <typename Tout, typename Tin>
INFINIOP_MOORE_KERNEL equal_contiguous_kernel(size_t numel, Tout *output, const Tin *a, const Tin *b) {
const auto op = op::equal::moore::EqualOp{};
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
size_t stride = blockDim.x * gridDim.x;
for (; idx < numel; idx += stride) {
output[idx] = op.template operator()<Tout, Tin>(a[idx], b[idx]);
}
}
template <typename Tout, typename Tin>
infiniStatus_t launch_fast_path(size_t numel,
void *output,
const std::vector<const void *> &inputs,
void *stream) {
if (numel == 0) {
return INFINI_STATUS_SUCCESS;
}
constexpr int kBlockSize = 256;
int grid = static_cast<int>((numel + kBlockSize - 1) / kBlockSize);
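// Cap the grid size; the grid-stride loop in the kernel covers any remaining elements.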
if (grid > 65535) {
grid = 65535;
}
auto musa_stream = reinterpret_cast<musaStream_t>(stream);
equal_contiguous_kernel<Tout, Tin><<<grid, kBlockSize, 0, musa_stream>>>(
numel,
reinterpret_cast<Tout *>(output),
reinterpret_cast<const Tin *>(inputs[0]),
reinterpret_cast<const Tin *>(inputs[1]));
return INFINI_STATUS_SUCCESS;
}
} // namespace
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
const auto &a_desc = input_desc_vec.at(0);
auto compute_dtype = a_desc->dtype();
auto out_dtype = out_desc->dtype();
const auto &b_desc = input_desc_vec.at(1);
const auto &c_shape = out_desc->shape();
const auto &a_shape = a_desc->shape();
const auto &b_shape = b_desc->shape();
CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16,
INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64);
CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL);
CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
// create MOORE elementwise descriptor
CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (can_use_contiguous_fast_path(_info)) {
size_t numel = _info.getOutputSize();
switch (_dtype) {
case INFINI_DTYPE_F16:
return launch_fast_path<bool, half>(numel, output, inputs, stream);
case INFINI_DTYPE_BF16:
return launch_fast_path<bool, cuda_bfloat16>(numel, output, inputs, stream);
case INFINI_DTYPE_F32:
return launch_fast_path<bool, float>(numel, output, inputs, stream);
case INFINI_DTYPE_I32:
return launch_fast_path<bool, int32_t>(numel, output, inputs, stream);
case INFINI_DTYPE_I64:
return launch_fast_path<bool, int64_t>(numel, output, inputs, stream);
case INFINI_DTYPE_F64:
return launch_fast_path<bool, double>(numel, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, moore::EqualOp, bool, half, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, moore::EqualOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, moore::EqualOp, bool, float, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_I32:
return _device_info->calculate<256, moore::EqualOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_I64:
return _device_info->calculate<256, moore::EqualOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, moore::EqualOp, bool, double, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
} // namespace op::equal::moore
#ifndef __EQUAL_MOORE_KERNEL_H__
#define __EQUAL_MOORE_KERNEL_H__
#include <type_traits>
namespace op::equal::moore {
typedef struct EqualOp {
public:
static constexpr size_t num_inputs = 2;
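// Reduced-precision types (half, bfloat16) are widened to float before comparing.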
template <typename Tout, typename Tin0, typename Tin1>
__device__ __forceinline__ bool operator()(const Tin0 &a, const Tin1 &b) const {
if constexpr (std::is_same_v<Tin0, Tin1>) {
if constexpr (std::is_same_v<Tin0, half>) {
return __half2float(a) == __half2float(b);
} else if constexpr (std::is_same_v<Tin0, cuda_bfloat16>) {
return __bfloat162float(a) == __bfloat162float(b);
} else {
return a == b;
}
} else {
return false;
}
}
} EqualOp;
} // namespace op::equal::moore
#endif // __EQUAL_MOORE_KERNEL_H__
#include <algorithm>
#include <cstdint>
#include <type_traits>
#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
#include "../cuda/kernel.cuh"
#include "equal_nvidia.cuh"
namespace {
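// Contiguous fast path: a flat grid-stride kernel for non-broadcast inputs,
// bypassing the generic strided elementwise path.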
template <typename Tout, typename Tin>
INFINIOP_CUDA_KERNEL FastEqualKernel(size_t n, Tout *output, const Tin *a, const Tin *b) {
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
size_t stride = blockDim.x * gridDim.x;
op::equal::cuda::EqualOp op{};
for (; idx < n; idx += stride) {
output[idx] = op.template operator()<Tout, Tin>(a[idx], b[idx]);
}
}
template <typename Tout, typename Tin>
infiniStatus_t launchFastEqualKernel(size_t numel,
void *output,
const std::vector<const void *> &inputs,
void *stream) {
if (numel == 0) {
return INFINI_STATUS_SUCCESS;
}
constexpr int block = 256;
int grid = static_cast<int>((numel + block - 1) / block);
grid = std::min(grid, 65535);
auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
FastEqualKernel<Tout, Tin><<<grid, block, 0, cuda_stream>>>(
numel,
reinterpret_cast<Tout *>(output),
reinterpret_cast<const Tin *>(inputs[0]),
reinterpret_cast<const Tin *>(inputs[1]));
auto err = cudaGetLastError();
return err == cudaSuccess ? INFINI_STATUS_SUCCESS : INFINI_STATUS_INTERNAL_ERROR;
}
} // namespace
namespace op::equal::nvidia {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
const auto &a_desc = input_desc_vec.at(0);
auto compute_dtype = a_desc->dtype();
auto out_dtype = out_desc->dtype();
const auto &b_desc = input_desc_vec.at(1);
const auto &c_shape = out_desc->shape();
const auto &a_shape = a_desc->shape();
const auto &b_shape = b_desc->shape();
CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16,
INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64);
CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_U8, INFINI_DTYPE_I8);
CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
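// Take the flat kernel when output and inputs are contiguous and non-broadcast;
// otherwise fall back to the generic elementwise path, which needs the workspace.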
bool fast_path = _info.isOutputContiguous();
if (fast_path) {
const bool *input_contiguous = _info.getInputContiguous();
const bool *input_broadcasted = _info.getInputBroadcasted();
for (size_t i = 0; i < 2; ++i) {
fast_path &= input_contiguous[i] && !input_broadcasted[i];
}
}
if (fast_path) {
size_t numel = _info.getOutputSize();
switch (_dtype) {
case INFINI_DTYPE_F16:
return launchFastEqualKernel<bool, half>(numel, output, inputs, stream);
case INFINI_DTYPE_BF16:
return launchFastEqualKernel<bool, cuda_bfloat16>(numel, output, inputs, stream);
case INFINI_DTYPE_F32:
return launchFastEqualKernel<bool, float>(numel, output, inputs, stream);
case INFINI_DTYPE_I32:
return launchFastEqualKernel<bool, int32_t>(numel, output, inputs, stream);
case INFINI_DTYPE_I64:
return launchFastEqualKernel<bool, int64_t>(numel, output, inputs, stream);
case INFINI_DTYPE_F64:
return launchFastEqualKernel<bool, double>(numel, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, cuda::EqualOp, bool, half, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::EqualOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, cuda::EqualOp, bool, float, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_I32:
return _device_info->calculate<256, cuda::EqualOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_I64:
return _device_info->calculate<256, cuda::EqualOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, cuda::EqualOp, bool, double, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::equal::nvidia
#ifndef __EQUAL_CUDA_API_H__
#define __EQUAL_CUDA_API_H__
#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
ELEMENTWISE_DESCRIPTOR(equal, nvidia)
#endif