Commit b5ccf30f authored by YdrMaster's avatar YdrMaster
Browse files

issue/50/fix: 添加 random sample/cpu 并修改测例


Signed-off-by: default avatarYdrMaster <ydrml@hotmail.com>
parent bd8ae651
...@@ -47,3 +47,4 @@ jobs: ...@@ -47,3 +47,4 @@ jobs:
pip install torch pip install torch
LD_LIBRARY_PATH=$HOME/.infini/lib python test/infiniop/matmul.py --cpu LD_LIBRARY_PATH=$HOME/.infini/lib python test/infiniop/matmul.py --cpu
LD_LIBRARY_PATH=$HOME/.infini/lib python test/infiniop/rms_norm.py --cpu LD_LIBRARY_PATH=$HOME/.infini/lib python test/infiniop/rms_norm.py --cpu
LD_LIBRARY_PATH=$HOME/.infini/lib python test/infiniop/random_sample.py --cpu
...@@ -5,21 +5,29 @@ ...@@ -5,21 +5,29 @@
typedef InfiniopDescriptor *infiniopRandomSampleDescriptor_t; typedef InfiniopDescriptor *infiniopRandomSampleDescriptor_t;
__C __export infiniStatus_t infiniopCreateRandomSampleDescriptor(infiniopHandle_t handle, infiniopRandomSampleDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, infiniopTensorDescriptor_t probs); __C __export infiniStatus_t infiniopCreateRandomSampleDescriptor(
infiniopHandle_t handle,
infiniopRandomSampleDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t result,
infiniopTensorDescriptor_t probs);
__C __export infiniStatus_t infiniopGetRandomSampleWorkspaceSize(infiniopRandomSampleDescriptor_t desc, size_t *size); __C __export infiniStatus_t infiniopGetRandomSampleWorkspaceSize(
infiniopRandomSampleDescriptor_t desc,
size_t *size);
__C __export infiniStatus_t infiniopRandomSample(infiniopRandomSampleDescriptor_t desc, __C __export infiniStatus_t infiniopRandomSample(
void *workspace, infiniopRandomSampleDescriptor_t desc,
size_t workspace_size, void *workspace,
void *result, size_t workspace_size,
void const *probs, void *result,
float random_val, const void *probs,
float topp, float random_val,
int topk, float topp,
float temperature, int topk,
void *stream); float temperature,
void *stream);
__C __export infiniStatus_t infiniopDestroyRandomSampleDescriptor(infiniopRandomSampleDescriptor_t desc); __C __export infiniStatus_t infiniopDestroyRandomSampleDescriptor(
infiniopRandomSampleDescriptor_t desc);
#endif #endif
#include "random_sample_cpu.h"
#include "../../../devices/cpu/common_cpu.h"
#include "../../../devices/cpu/cpu_handle.h"
#include "../../../tensor.h"
#include <algorithm>
namespace op::random_sample::cpu {
// Default destructor: the CPU descriptor owns no opaque backend state
// (create() passes nullptr) and no workspace, so there is nothing to free.
Descriptor::~Descriptor() = default;
// Validates the tensor descriptors and allocates a CPU random-sample
// descriptor.
//
// Requirements enforced here:
//   - result_desc: 0-d (scalar) tensor with an integral dtype — receives the
//     sampled index;
//   - probs_desc:  1-d contiguous tensor (stride(0) == 1) with an
//     f16/f32/f64 dtype — the candidate distribution.
// On success, *desc_ptr points to a new Descriptor; the CPU path needs no
// workspace and no opaque state.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t result_desc,
    infiniopTensorDescriptor_t probs_desc) {
    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    auto dt_i = result_desc->dtype();
    auto dt_p = probs_desc->dtype();
    // The CHECK_* macros return the corresponding error status from this
    // function when a condition fails.
    CHECK_DTYPE(dt_i,
                INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64,
                INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64);
    CHECK_DTYPE(dt_p, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
    CHECK_API_OR(result_desc->ndim(), 0,
                 return INFINI_STATUS_BAD_TENSOR_SHAPE);
    CHECK_API_OR(probs_desc->ndim(), 1,
                 return INFINI_STATUS_BAD_TENSOR_SHAPE);
    CHECK_API_OR(probs_desc->stride(0), 1,
                 return INFINI_STATUS_BAD_TENSOR_STRIDES);
    *desc_ptr = new Descriptor(
        dt_i,
        dt_p,
        probs_desc->dim(0), // n: number of candidates (vocabulary size)
        0,                  // workspace_size: none needed on CPU
        nullptr,            // opaque: no backend-private state on CPU
        handle->device,
        handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
// Maps a storage dtype to the type used for arithmetic. By default,
// computation happens in the storage type itself.
template <typename DT>
struct ComputeType {
    using type = DT;
};
// fp16_t has no native C++ arithmetic operators here, so promote to float.
template <>
struct ComputeType<fp16_t> {
    using type = float;
};
// Sampling kernels for one (index dtype, value dtype) combination.
//
// Tidx: integral dtype of the scalar result index.
// Tval: storage dtype of the probability/logit buffer; fp16 values are
//       promoted to float for arithmetic (see ComputeType).
template <class Tidx, class Tval>
struct Scheme {
    using Tcompute = typename ComputeType<Tval>::type;

    // Loads probs[i] and converts it to the compute type.
    static Tcompute get(void const *ptr, size_t i) {
        return utils::cast<Tcompute, Tval>(reinterpret_cast<Tval const *>(ptr)[i]);
    }

    // Greedy decoding: writes the index of the largest of the n values.
    static void argmax(
        void *result, void const *probs, size_t n) {
        auto idx = reinterpret_cast<Tidx *>(result);
        *idx = 0;
        auto max_val = get(probs, 0);
        for (size_t i = 0; i < n; i++) {
            if (auto val = get(probs, i); val > max_val) {
                max_val = val;
                *idx = static_cast<Tidx>(i);
            }
        }
    }

    // Top-k / top-p (nucleus) sampling with temperature, driven by the
    // caller-provided uniform random value in [0, 1).
    static void random(
        void *result, void const *probs, size_t n,
        float random_val, float topp, int topk, float temperature) {
        struct KVPair {
            Tidx idx;
            Tcompute val;
            // Sorts in descending order by value.
            bool operator<(const KVPair &other) const {
                return val > other.val;
            }
        };
        auto idx = reinterpret_cast<Tidx *>(result);
        // Pair every value with its original index and sort descending.
        std::vector<KVPair> pairs(n);
        for (size_t i = 0; i < n; i++) {
            pairs[i] = {static_cast<Tidx>(i), get(probs, i)};
        }
        std::sort(pairs.begin(), pairs.end());
        // Replace values in place with prefix sums of exp((v - max) / T).
        // The sums stay unnormalized: every threshold below scales by the
        // same total, so normalization would cancel out.
        auto const max_val = pairs[0].val;
        pairs[0].val = 1; // exp((max - max) / temperature) == 1
        for (size_t i = 1; i < n; i++) {
            pairs[i].val = pairs[i - 1].val + std::exp((pairs[i].val - max_val) / temperature);
        }
        // topk <= 0 would underflow the index below; treat it as "no top-k
        // filtering", i.e. keep all n candidates.
        auto const k = topk > 0 ? std::min(static_cast<size_t>(topk), n) : n;
        auto const pk = pairs[k - 1].val,      // cumulative mass of the top-k set
            pp = pairs[n - 1].val * topp,      // top-p fraction of the total mass
            plimit = random_val * std::min(pk, pp);
        // First prefix sum >= plimit wins. Pre-write the last candidate as a
        // fallback so *idx is always defined, even if float rounding (or a
        // random_val > 1) pushes plimit past every prefix sum.
        *idx = pairs[n - 1].idx;
        for (size_t i = 0; i < n; i++) {
            if (plimit <= pairs[i].val) {
                *idx = pairs[i].idx;
                break;
            }
        }
    }
};
template <class Tidx, class Tval>
void switch_f(
size_t n,
void *result, const void *probs,
float random_val, float topp, int topk, float temperature) {
if (random_val == 0 || topp == 0 || topk == 1 || temperature == 0) {
Scheme<Tidx, Tval>::argmax(result, probs, n);
} else {
Scheme<Tidx, Tval>::random(result, probs, n, random_val, topp, topk, temperature);
}
}
// Dispatches on the probability dtype and forwards to the fully-typed kernel.
template <class Tidx>
void switch_val(
    infiniDtype_t dt_p, size_t n,
    void *result, void const *probs,
    float random_val, float topp, int topk, float temperature) {
    if (dt_p == INFINI_DTYPE_F16) {
        switch_f<Tidx, fp16_t>(n, result, probs, random_val, topp, topk, temperature);
    } else if (dt_p == INFINI_DTYPE_F32) {
        switch_f<Tidx, float>(n, result, probs, random_val, topp, topk, temperature);
    } else if (dt_p == INFINI_DTYPE_F64) {
        switch_f<Tidx, double>(n, result, probs, random_val, topp, topk, temperature);
    } else {
        // Descriptor::create already validated the dtype — unreachable.
        std::abort();
    }
}
// Dispatches on the result (index) dtype, then hands off to switch_val for
// the probability-dtype dispatch.
void switch_idx(
    infiniDtype_t dt_i, infiniDtype_t dt_p, size_t n,
    void *result, void const *probs,
    float random_val, float topp, int topk, float temperature) {
    switch (dt_i) {
    case INFINI_DTYPE_I8:
        switch_val<int8_t>(dt_p, n, result, probs, random_val, topp, topk, temperature);
        break;
    case INFINI_DTYPE_I16:
        switch_val<int16_t>(dt_p, n, result, probs, random_val, topp, topk, temperature);
        break;
    case INFINI_DTYPE_I32:
        switch_val<int32_t>(dt_p, n, result, probs, random_val, topp, topk, temperature);
        break;
    case INFINI_DTYPE_I64:
        switch_val<int64_t>(dt_p, n, result, probs, random_val, topp, topk, temperature);
        break;
    case INFINI_DTYPE_U8:
        switch_val<uint8_t>(dt_p, n, result, probs, random_val, topp, topk, temperature);
        break;
    case INFINI_DTYPE_U16:
        switch_val<uint16_t>(dt_p, n, result, probs, random_val, topp, topk, temperature);
        break;
    case INFINI_DTYPE_U32:
        switch_val<uint32_t>(dt_p, n, result, probs, random_val, topp, topk, temperature);
        break;
    case INFINI_DTYPE_U64:
        switch_val<uint64_t>(dt_p, n, result, probs, random_val, topp, topk, temperature);
        break;
    default:
        // Descriptor::create already validated the dtype — unreachable.
        std::abort();
    }
}
// Runs the sampling on the host thread. workspace / workspace_size are
// unused here (create() reports a workspace size of 0), and stream is
// ignored because CPU execution is synchronous.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *result,
    const void *probs,
    float random_val,
    float topp,
    int topk,
    float temperature,
    void *stream) const {
    switch_idx(_dt_i, _dt_p, _n, result, probs, random_val, topp, topk, temperature);
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::random_sample::cpu
#ifndef __RANDOM_SAMPLE_CPU_H__
#define __RANDOM_SAMPLE_CPU_H__
#include "../random_sample.h"
// Declares op::random_sample::cpu::Descriptor via the shared DESCRIPTOR
// macro; the member definitions live in random_sample_cpu.cc.
DESCRIPTOR(cpu)
#endif // __RANDOM_SAMPLE_CPU_H__
...@@ -2,152 +2,110 @@ ...@@ -2,152 +2,110 @@
#include "../../handle.h" #include "../../handle.h"
#include "infiniop/ops/random_sample.h" #include "infiniop/ops/random_sample.h"
__C infiniStatus_t infiniopCreateRandomSampleDescriptor(infiniopHandle_t handle, infiniopRandomSampleDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, infiniopTensorDescriptor_t probs) { #ifdef ENABLE_CPU_API
switch (handle->device) { #include "cpu/random_sample_cpu.h"
#ifdef ENABLE_CPU
case DevCpu:
return cpuCreateRandomSampleDescriptor(handle, (RandomSampleCpuDescriptor_t *)desc_ptr, result, probs);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu:
return cudaCreateRandomSampleDescriptor((CudaHandle_t)handle, (RandomSampleCudaDescriptor_t *)desc_ptr, result, probs);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangCreateRandomSampleDescriptor((BangHandle_t)handle,
(RandomSampleBangDescriptor_t *)desc_ptr, result,
probs);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return ascendCreateRandomSampleDescriptor((AscendHandle_t)handle,
(RandomSampleAscendDescriptor_t *)desc_ptr, result, probs);
}
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaCreateRandomSampleDescriptor((MacaHandle_t)handle,
(RandomSampleMacaDescriptor_t *)desc_ptr, result,
probs);
}
#endif #endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: __C infiniStatus_t infiniopCreateRandomSampleDescriptor(
return musaCreateRandomSampleDescriptor((MusaHandle_t)handle, (RandomSampleMusaDescriptor_t *)desc_ptr, result, probs); infiniopHandle_t handle,
infiniopRandomSampleDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t result,
infiniopTensorDescriptor_t probs) {
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::random_sample::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::random_sample::NAMESPACE::Descriptor **>(desc_ptr), \
result, \
probs)
switch (handle->device) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif #endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
} }
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
#undef CREATE
}; };
__C infiniStatus_t infiniopGetRandomSampleWorkspaceSize(infiniopRandomSampleDescriptor_t desc, size_t *size) { __C infiniStatus_t infiniopGetRandomSampleWorkspaceSize(
infiniopRandomSampleDescriptor_t desc,
size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<const op::random_sample::NAMESPACE::Descriptor *>(desc)->workspace_size; \
return INFINI_STATUS_SUCCESS
switch (desc->device_type) { switch (desc->device_type) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuGetRandomSampleWorkspaceSize((RandomSampleCpuDescriptor_t)desc, size);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu: {
return cudaGetRandomSampleWorkspaceSize((RandomSampleCudaDescriptor_t)desc, size);
}
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu);
#endif #endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: { default:
return bangGetRandomSampleWorkspaceSize((RandomSampleBangDescriptor_t)desc, size); return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
// return cnnlGetRandomSampleWorkspaceSize((RandomSampleCnnlDescriptor_t) desc, size);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return ascendGetRandomSampleWorkspaceSize((RandomSampleAscendDescriptor_t)desc, size);
}
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaGetRandomSampleWorkspaceSize((RandomSampleMacaDescriptor_t)desc, size);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: {
return musaGetRandomSampleWorkspaceSize((RandomSampleMusaDescriptor_t)desc, size);
}
#endif
} }
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
#undef GET
} }
__C infiniStatus_t infiniopRandomSample(infiniopRandomSampleDescriptor_t desc, __C infiniStatus_t infiniopRandomSample(
void *workspace, infiniopRandomSampleDescriptor_t desc,
size_t workspace_size, void *workspace,
void *result, size_t workspace_size,
const void *probs, void *result,
float random_val, const void *probs,
float topp, float random_val,
int topk, float topp,
float temperature, int topk,
void *stream) { float temperature,
void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<const op::random_sample::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, \
result, probs, \
random_val, \
topp, topk, temperature, \
stream)
switch (desc->device_type) { switch (desc->device_type) {
#ifdef ENABLE_CPU
case DevCpu: #ifdef ENABLE_CPU_API
return cpuRandomSample((RandomSampleCpuDescriptor_t)desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream); CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu:
return cudaRandomSample((RandomSampleCudaDescriptor_t)desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangRandomSample((RandomSampleBangDescriptor_t)desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return ascendRandomSample((RandomSampleAscendDescriptor_t)desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
}
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaRandomSample((RandomSampleMacaDescriptor_t)desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu:
return musaRandomSample((RandomSampleMusaDescriptor_t)desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
#endif #endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
} }
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
#undef CALCULATE
} }
__C infiniStatus_t infiniopDestroyRandomSampleDescriptor(infiniopRandomSampleDescriptor_t desc) { __C infiniStatus_t infiniopDestroyRandomSampleDescriptor(
infiniopRandomSampleDescriptor_t desc) {
#define DELETE(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<const op::random_sample::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) { switch (desc->device_type) {
#ifdef ENABLE_CPU
case DevCpu: #ifdef ENABLE_CPU_API
return cpuDestroyRandomSampleDescriptor((RandomSampleCpuDescriptor_t)desc); DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu:
return cudaDestroyRandomSampleDescriptor((RandomSampleCudaDescriptor_t)desc);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangDestroyRandomSampleDescriptor((RandomSampleBangDescriptor_t)desc);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return ascendDestroyRandomSampleDescriptor((RandomSampleAscendDescriptor_t)desc);
}
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaDestroyRandomSampleDescriptor((RandomSampleMacaDescriptor_t)desc);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu:
return musaDestroyRandomSampleDescriptor((RandomSampleMusaDescriptor_t)desc);
#endif #endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
} }
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
#undef DELETE
} }
#ifndef __RANDOM_SAMPLE_H__
#define __RANDOM_SAMPLE_H__
#include "../../../utils.h"
#include "../../operator.h"
// Generates the per-backend Descriptor class for the random_sample operator
// inside namespace op::random_sample::<NAMESPACE>.
//
// The generated class carries:
//   _dt_i / _dt_p  — dtypes of the result index and the probability buffer;
//   _n             — number of candidates (length of probs);
//   _opaque        — backend-private state (may be nullptr, e.g. on CPU);
//   workspace_size — device workspace bytes required by calculate().
// Each backend supplies Opaque, the destructor, create() and calculate() in
// its own translation unit.
// NOTE: comments are kept outside the macro body — a // comment before a
// trailing backslash would splice the next line into the comment.
#define DESCRIPTOR(NAMESPACE)                                 \
                                                              \
    namespace op::random_sample::NAMESPACE {                  \
    class Descriptor final : public InfiniopDescriptor {      \
        struct Opaque;                                        \
        Opaque *_opaque;                                      \
                                                              \
        infiniDtype_t _dt_i, _dt_p;                           \
        size_t _n;                                            \
                                                              \
        Descriptor(                                           \
            infiniDtype_t dt_i,                               \
            infiniDtype_t dt_p,                               \
            size_t n,                                         \
            size_t workspace_size_,                           \
            Opaque *opaque,                                   \
            infiniDevice_t device_type,                       \
            int device_id)                                    \
            : InfiniopDescriptor{device_type, device_id},     \
              _opaque(opaque),                                \
              _dt_i(dt_i),                                    \
              _dt_p(dt_p),                                    \
              _n(n),                                          \
              workspace_size(workspace_size_) {}              \
                                                              \
    public:                                                   \
        size_t workspace_size;                                \
                                                              \
        ~Descriptor();                                        \
                                                              \
        static infiniStatus_t create(                         \
            infiniopHandle_t handle,                          \
            Descriptor **desc_ptr,                            \
            infiniopTensorDescriptor_t result_desc,           \
            infiniopTensorDescriptor_t probs_desc);           \
                                                              \
        infiniStatus_t calculate(                             \
            void *workspace,                                  \
            size_t workspace_size,                            \
            void *result,                                     \
            const void *probs,                                \
            float random_val,                                 \
            float topp,                                       \
            int topk,                                         \
            float temperature,                                \
            void *stream) const;                              \
    };                                                        \
    }
#endif // __RANDOM_SAMPLE_H__
...@@ -17,4 +17,17 @@ ...@@ -17,4 +17,17 @@
#define CHECK_STATUS(API) CHECK_API_OR(API, INFINI_STATUS_SUCCESS, return api_result_) #define CHECK_STATUS(API) CHECK_API_OR(API, INFINI_STATUS_SUCCESS, return api_result_)
// Checks that the dtype expression DT is one of the dtypes listed in
// __VA_ARGS__; on mismatch, executes `return INFINI_STATUS_BAD_TENSOR_DTYPE`
// in the calling function (via CHECK_API_OR). Wrapped in do/while(0) so it
// behaves as a single statement. DT is evaluated exactly once, so arguments
// with side effects are safe; locals use trailing underscores to avoid
// shadowing caller variables.
#define CHECK_DTYPE(DT, ...)                              \
    do {                                                  \
        const auto dt_checked_ = (DT);                    \
        auto found_supported_dtype_ = false;              \
        for (auto dt_ : {__VA_ARGS__}) {                  \
            if (dt_ == dt_checked_) {                     \
                found_supported_dtype_ = true;            \
                break;                                    \
            }                                             \
        }                                                 \
        CHECK_API_OR(found_supported_dtype_, true,        \
                     return INFINI_STATUS_BAD_TENSOR_DTYPE); \
    } while (0)
#endif // INFINIUTILS_CHECK_H #endif // INFINIUTILS_CHECK_H
...@@ -82,25 +82,14 @@ def random_sample(data, random_val, topp, topk, voc, temperature): ...@@ -82,25 +82,14 @@ def random_sample(data, random_val, topp, topk, voc, temperature):
globalM = dataNp[0] globalM = dataNp[0]
dataNp = (dataNp - globalM) / temperature dataNp = (dataNp - globalM) / temperature
dataNp = torch.softmax(dataNp.float(), dim=0) dataNp = torch.softmax(dataNp.float(), dim=0)
sum_s = 0 for i in range(1, voc):
for end in range(topk): dataNp[i] += dataNp[i - 1]
sum_s += dataNp[end] limit_k = dataNp[min(topk, voc) - 1]
if sum_s >= topp: limit_p = dataNp[voc - 1] * topp
break limit = min(limit_k, limit_p) * random_val
if end < topk - 1:
end += 1 for i in range(voc):
else: if limit < dataNp[i]:
end = topk
sum_s = 0
for i in range(end):
sum_s += dataNp[i]
random_val *= sum_s
sum_s = 0
for i in range(end):
sum_s += dataNp[i]
if random_val < sum_s:
return indices[i] return indices[i]
else: else:
return torch.argmax(data) return torch.argmax(data)
...@@ -129,7 +118,7 @@ def test( ...@@ -129,7 +118,7 @@ def test(
data, random_val, topp, topk, voc, temperature data, random_val, topp, topk, voc, temperature
) # 这个函数在device速度可能会很慢,可以通过data.to("cpu")方式加快计算过程 ) # 这个函数在device速度可能会很慢,可以通过data.to("cpu")方式加快计算过程
indices = torch.zeros([1], dtype=torch.int64).to(torch_device) indices = torch.zeros([], dtype=torch.int64).to(torch_device)
x_tensor, indices_tensor = [to_tensor(tensor, lib) for tensor in [data, indices]] x_tensor, indices_tensor = [to_tensor(tensor, lib) for tensor in [data, indices]]
...@@ -147,7 +136,7 @@ def test( ...@@ -147,7 +136,7 @@ def test(
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [x_tensor, indices_tensor]: for tensor in [x_tensor, indices_tensor]:
tensor.descriptor.contents.invalidate() tensor.destroyDesc(lib)
workspace_size = c_uint64(0) workspace_size = c_uint64(0)
check_error( check_error(
...@@ -181,13 +170,13 @@ def test( ...@@ -181,13 +170,13 @@ def test(
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG: if DEBUG:
debug_all( debug_all(
(indices[0].type(ans.dtype), data[indices[0]]), (indices.type(ans.dtype), data[indices]),
(ans, data[ans]), (ans, data[ans]),
"or", "or",
atol=atol, atol=atol,
rtol=rtol, rtol=rtol,
) )
assert indices[0].type(ans.dtype) == ans or data[ans] == data[indices[0]] assert indices.type(ans.dtype) == ans or data[ans] == data[indices]
# Profiling workflow # Profiling workflow
if PROFILE: if PROFILE:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment