Commit feb19535 authored by zhangyue's avatar zhangyue
Browse files

issue/349 elementwise-clip

parent c94db20d
......@@ -227,9 +227,6 @@ private:
CHECK_KUNLUN(xpu_memcpy_async(workspace, h_inputs_arr, input_arr_size, XPU_HOST_TO_DEVICE, stream));
CHECK_KUNLUN(xpu_memcpy_async((void *)d_meta_start, info_meta_start, info.getMetaMemSize(), XPU_HOST_TO_DEVICE, stream));
xpu_wait(stream);
// xpu_wait(stream);
// offset/assign the pointers
d_inputs_arr = reinterpret_cast<__global_ptr__ const void **>(workspace);
d_output_shape = reinterpret_cast<__global_ptr__ const size_t *>(d_meta_start);
......
#ifndef __CLIP_KUNLUN_API_H__
#define __CLIP_KUNLUN_API_H__
#include "../../../elementwise/kunlun/elementwise_kunlun_api.h"
ELEMENTWISE_DESCRIPTOR(clip, kunlun)
#endif // __CLIP_KUNLUN_API_H__
#include "../../../elementwise/kunlun/elementwise_kunlun.h"
#include "clip_kunlun.h"
#include "kernel.h"
namespace op::elementwise::kunlun {
using ClipOp = op::clip::kunlun::ClipOp;
INSTANTIATE_ELEMENTWISE_KERNEL(ClipOp::num_inputs, ClipOp, float);
INSTANTIATE_ELEMENTWISE_KERNEL(ClipOp::num_inputs, ClipOp, half);
INSTANTIATE_ELEMENTWISE_KERNEL(ClipOp::num_inputs, ClipOp, bfloat16_t);
} // namespace op::elementwise::kunlun
namespace op::clip::kunlun {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::kunlun::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &in_desc = input_desc_vec.at(0);
const auto &min_desc = input_desc_vec.at(1);
const auto &max_desc = input_desc_vec.at(2);
const auto &out_shape = out_desc->shape();
const auto &in_shape = in_desc->shape();
const auto &min_shape = min_desc->shape();
const auto &max_shape = max_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);
CHECK_SAME_SHAPE(out_shape, in_shape);
CHECK_SAME_SHAPE(out_shape, min_shape);
CHECK_SAME_SHAPE(out_shape, max_shape);
CREATE_ELEMENTWISE_KUNLUN_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<8, ClipOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<8, ClipOp, bfloat16_t>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<8, ClipOp, float>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::clip::kunlun
#ifndef __CLIP_KUNLUN_KERNEL_H__
#define __CLIP_KUNLUN_KERNEL_H__
#include <xpu/kernel/xtdk_io.h>
namespace op::clip::kunlun {
typedef struct ClipOp {
public:
static constexpr int num_inputs = 3;
template <typename T>
inline __device__ T operator()(const T *inputs) const {
T x = inputs[0];
T min_val = inputs[1];
T max_val = inputs[2];
// printf("x: %f, min_val: %f, max_val: %f, result: %f\n", x, min_val, max_val, fmax(fmin(x, max_val), min_val));
return fmax(fmin(x, max_val), min_val);
}
// bfloat16 特化版本(使用 float 计算精度)
inline __device__ bfloat16_t operator()(const bfloat16_t *inputs) const {
float x_f = __bfloat162float(inputs[0]);
float min_val_f = __bfloat162float(inputs[1]);
float max_val_f = __bfloat162float(inputs[2]);
float result_f = fmax(fmin(x_f, max_val_f), min_val_f);
return __float2bfloat16(result_f);
}
} ClipOp;
} // namespace op::clip::kunlun
#endif // __CLIP_KUNLUN_KERNEL_H__
......@@ -11,6 +11,9 @@
#ifdef ENABLE_METAX_API
#include "metax/clip_metax.h"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/clip_kunlun.h"
#endif
__C infiniStatus_t infiniopCreateClipDescriptor(
infiniopHandle_t handle,
......@@ -42,6 +45,9 @@ __C infiniStatus_t infiniopCreateClipDescriptor(
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -69,6 +75,9 @@ __C infiniStatus_t infiniopGetClipWorkspaceSize(infiniopClipDescriptor_t desc, s
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, metax)
#endif
#ifdef ENABLE_KUNLUN_API
GET(INFINI_DEVICE_KUNLUN, kunlun)
#endif
}
......@@ -106,6 +115,9 @@ __C infiniStatus_t infiniopClip(
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -136,6 +148,9 @@ infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc) {
#ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
DELETE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......
#ifndef __ADD_KUNLUN_KERNEL_H__
#define __ADD_KUNLUN_KERNEL_H__
#ifndef __MUL_KUNLUN_KERNEL_H__
#define __MUL_KUNLUN_KERNEL_H__
namespace op::mul::kunlun {
......@@ -22,4 +22,4 @@ public:
} // namespace op::mul::kunlun
#endif // __ADD_KUNLUN_KERNEL_H__
#endif // __MUL_KUNLUN_KERNEL_H__
#ifndef __ADD_KUNLUN_KERNEL_H__
#define __ADD_KUNLUN_KERNEL_H__
#ifndef __SUB_KUNLUN_KERNEL_H__
#define __SUB_KUNLUN_KERNEL_H__
namespace op::sub::kunlun {
......@@ -22,4 +22,4 @@ public:
} // namespace op::sub::kunlun
#endif // __ADD_KUNLUN_KERNEL_H__
#endif // __SUB_KUNLUN_KERNEL_H__
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment