Commit d854dbee authored by zhangyue's avatar zhangyue
Browse files

issue/111: Merge branch 'main' of github.com:PanZezhong1725/InfiniCore into...

issue/111: Merge branch 'main' of github.com:PanZezhong1725/InfiniCore into issue/111-rmsnorm-kunlun
parents 23ddc20b a474a6f5
...@@ -20,35 +20,23 @@ jobs: ...@@ -20,35 +20,23 @@ jobs:
- name: checkout code - name: checkout code
uses: actions/checkout@v4 uses: actions/checkout@v4
- name: install black - name: Check Format
run: pip install black run: |
pip install black
- name: check format python3 scripts/format.py --path src --check
run: python3 scripts/format.py --path src --check
- name: install xmake - name: install xmake
uses: xmake-io/github-action-setup-xmake@v1 uses: xmake-io/github-action-setup-xmake@v1
with: with:
xmake-version: latest xmake-version: latest
- name: configure xmake - name: Build & Install
run: xmake f --omp=y -cv run: python scripts/install.py --omp=y
- name: build with xmake
run: xmake build
- name: install to INFINI_ROOT - name: install python packages
if: matrix.os != 'windows-latest'
run: xmake install
- name: build infiniop-test
if: matrix.os != 'windows-latest'
run: xmake build infiniop-test
- name: python test
if: matrix.os != 'windows-latest'
run: | run: |
pip install numpy
pip install torch pip install torch
LD_LIBRARY_PATH=$HOME/.infini/lib python test/infiniop/gemm.py --cpu
LD_LIBRARY_PATH=$HOME/.infini/lib python test/infiniop/rms_norm.py --cpu - name: Python Test
LD_LIBRARY_PATH=$HOME/.infini/lib python test/infiniop/random_sample.py --cpu run: python scripts/python_test.py --cpu
The MIT License (MIT)
Copyright © 2025 InfiniTensor
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
...@@ -15,6 +15,34 @@ InfiniCore 是一个跨平台统一编程工具集,为不同芯片平台的功 ...@@ -15,6 +15,34 @@ InfiniCore 是一个跨平台统一编程工具集,为不同芯片平台的功
## 配置和使用 ## 配置和使用
### 软件依赖
- XMake编译器
XMake配置选项(`XMAKE_CONFIG_FLAGS`)以及含义
- `--omp=[y|n]` 是否使用OpenMP,默认开启
- `--cpu=[y|n]` 是否编译CPU接口实现,默认开启
- `--nv-gpu=[y|n]` 是否编译英伟达GPU接口实现
- `--ascend-npu=[y|n]` 是否编译昇腾NPU接口实现
- `--cambricon-mlu=[y|n]` 是否编译寒武纪MLU接口实现
- `--metax-gpu=[y|n]` 是否编译沐曦GPU接口实现
- `--moore-gpu=[y|n]` 是否编译摩尔线程GPU接口实现
- `--sugon-dcu=[y|n]` 是否编译曙光DCU接口实现
- `--kunlun-xpu=[y|n]` 是否编译昆仑XPU接口实现
### 一键安装
`scripts/` 目录中提供了 `install.py` 安装脚本。使用方式如下:
```shell
cd InfiniCore
python scripts/install.py [XMAKE_CONFIG_FLAGS]
```
### 手动安装
1. 项目配置 1. 项目配置
- 查看当前配置 - 查看当前配置
...@@ -55,11 +83,23 @@ InfiniCore 是一个跨平台统一编程工具集,为不同芯片平台的功 ...@@ -55,11 +83,23 @@ InfiniCore 是一个跨平台统一编程工具集,为不同芯片平台的功
按输出提示设置 `INFINI_ROOT``LD_LIBRARY_PATH` 环境变量。 按输出提示设置 `INFINI_ROOT``LD_LIBRARY_PATH` 环境变量。
4. 运行算子测试 ### 运行测试
```shell #### 运行Python算子测试
python test/infiniop/[operator].py [--cpu | --nvidia | --cambricon | --ascend]
``` ```shell
python test/infiniop/[operator].py [--cpu | --nvidia | --cambricon | --ascend]
```
#### 一键运行所有Python算子测试
```shell
python scripts/python_test.py [--cpu | --nvidia | --cambricon | --ascend]
```
#### 算子测试框架
详见 `test/infiniop-test` 目录
## 开发指南 ## 开发指南
......
...@@ -65,10 +65,10 @@ typedef enum { ...@@ -65,10 +65,10 @@ typedef enum {
INFINI_DTYPE_F16 = 12, INFINI_DTYPE_F16 = 12,
INFINI_DTYPE_F32 = 13, INFINI_DTYPE_F32 = 13,
INFINI_DTYPE_F64 = 14, INFINI_DTYPE_F64 = 14,
INFINI_DTYPE_C8 = 15, INFINI_DTYPE_C16 = 15,
INFINI_DTYPE_C16 = 16, INFINI_DTYPE_C32 = 16,
INFINI_DTYPE_C32 = 17, INFINI_DTYPE_C64 = 17,
INFINI_DTYPE_C64 = 18, INFINI_DTYPE_C128 = 18,
INFINI_DTYPE_BF16 = 19, INFINI_DTYPE_BF16 = 19,
} infiniDtype_t; } infiniDtype_t;
......
import os
import subprocess
import platform
import sys
from set_env import set_env
# All xmake commands below assume the repository root as the working
# directory, regardless of where this script was invoked from.
PROJECT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
os.chdir(PROJECT_DIR)
def run_cmd(cmd):
    """Run `cmd` through the shell, raising CalledProcessError on failure."""
    subprocess.run(
        cmd,
        shell=True,
        check=True,
        text=True,
        encoding="utf-8",
    )
def install(xmake_config_flags=""):
    """Configure, build, and install InfiniCore and the infiniop-test tool.

    Args:
        xmake_config_flags: extra flags forwarded to `xmake f` (e.g. "--omp=y").
    """
    steps = [
        f"xmake f {xmake_config_flags} -cv",
        "xmake",
        "xmake install",
        "xmake build infiniop-test",
        "xmake install infiniop-test",
    ]
    for step in steps:
        run_cmd(step)
if __name__ == "__main__":
    # Prepare INFINI_ROOT-related environment, then build & install with any
    # user-supplied xmake config flags.
    set_env()
    flags = " ".join(sys.argv[1:])
    install(flags)
import os
import subprocess
from set_env import set_env
import sys
# Operator test scripts are run from test/infiniop so that their relative
# paths (data files, sibling imports) resolve correctly.
PROJECT_DIR = os.path.abspath(
    os.path.join(os.path.dirname(__file__), "..", "test", "infiniop")
)
os.chdir(PROJECT_DIR)
def run_tests(args):
    """Run every infiniop operator test script with the given flag string.

    Args:
        args: extra command-line flags (e.g. "--cpu") appended to each
            test invocation as-is.

    Returns:
        A list of the script names that exited with a non-zero status.
    """
    failed = []
    for test in [
        "gemm.py",
        "rms_norm.py",
        "causal_softmax.py",
        "swiglu.py",
        "random_sample.py",
    ]:
        # Use the interpreter running this script (quoted, in case the path
        # contains spaces) instead of a bare `python`, which may not exist on
        # PATH (many Linux distributions only ship `python3`).
        result = subprocess.run(
            f'"{sys.executable}" {test} {args}', text=True, encoding="utf-8", shell=True
        )
        if result.returncode != 0:
            failed.append(test)
    return failed
if __name__ == "__main__":
    # Set up the INFINI_ROOT environment, run every operator test, and report
    # results in color (green = all passed, red = failures listed).
    set_env()
    failures = run_tests(" ".join(sys.argv[1:]))
    if failures:
        print("\033[91mThe following tests failed:\033[0m")
        for name in failures:
            print(f"\033[91m - {name}\033[0m")
    else:
        print("\033[92mAll tests passed!\033[0m")
    # The process exit code mirrors the number of failing test scripts.
    exit(len(failures))
import os
import platform
def set_env():
    """Ensure INFINI_ROOT is set and its bin/lib dirs are on the search paths.

    Defaults INFINI_ROOT to ~/.infini when unset, then prepends
    $INFINI_ROOT/bin to PATH (Windows and Linux) and $INFINI_ROOT/lib to
    LD_LIBRARY_PATH (Linux only).

    Raises:
        RuntimeError: on any platform other than Windows or Linux.
    """
    if os.environ.get("INFINI_ROOT") is None:
        os.environ["INFINI_ROOT"] = os.path.expanduser("~/.infini")
    root = os.environ["INFINI_ROOT"]
    system = platform.system()
    if system == "Windows":
        _prepend_env("PATH", os.path.expanduser(root + "/bin"), ";")
    elif system == "Linux":
        _prepend_env("PATH", os.path.expanduser(root + "/bin"), ":")
        _prepend_env("LD_LIBRARY_PATH", os.path.expanduser(root + "/lib"), ":")
    else:
        raise RuntimeError("Unsupported platform.")


def _prepend_env(var, path, sep):
    # Prepend `path` to os.environ[var] unless it already appears there.
    if path not in os.environ.get(var, ""):
        os.environ[var] = f"{path}{sep}{os.environ.get(var, '')}"
...@@ -8,6 +8,10 @@ std::vector<int64_t> inferStorageShape(std::vector<int64_t> shape, std::vector<i ...@@ -8,6 +8,10 @@ std::vector<int64_t> inferStorageShape(std::vector<int64_t> shape, std::vector<i
return storageShape; return storageShape;
} }
// Total number of elements described by `shape` (product of all dims;
// 1 for an empty shape, matching the scalar convention).
size_t aclnnTensorDescriptor::numel() const {
    size_t count = 1;
    for (const auto dim : shape) {
        count *= static_cast<size_t>(dim);
    }
    return count;
}
aclnnTensorDescriptor::aclnnTensorDescriptor(infiniopTensorDescriptor_t desc, void *data) { aclnnTensorDescriptor::aclnnTensorDescriptor(infiniopTensorDescriptor_t desc, void *data) {
this->ndim = desc->ndim(); this->ndim = desc->ndim();
this->shape = std::vector<int64_t>(ndim); this->shape = std::vector<int64_t>(ndim);
......
...@@ -34,10 +34,10 @@ struct aclnnTensorDescriptor { ...@@ -34,10 +34,10 @@ struct aclnnTensorDescriptor {
int64_t storageNdim = 1; int64_t storageNdim = 1;
aclTensor *tensor; aclTensor *tensor;
// aclnnGemmGetWorkspaceSize only support 2D matrix multiply, so we need to convert 3D tensor to 2D tensor
aclnnTensorDescriptor(aclDataType dtype, const std::vector<int64_t> &shape, const std::vector<int64_t> &strides, void *data = nullptr); aclnnTensorDescriptor(aclDataType dtype, const std::vector<int64_t> &shape, const std::vector<int64_t> &strides, void *data = nullptr);
aclnnTensorDescriptor(infiniopTensorDescriptor_t y_desc, void *data = nullptr); aclnnTensorDescriptor(infiniopTensorDescriptor_t y_desc, void *data = nullptr);
~aclnnTensorDescriptor(); ~aclnnTensorDescriptor();
size_t numel() const;
std::string toString(); std::string toString();
}; };
......
...@@ -14,6 +14,9 @@ ...@@ -14,6 +14,9 @@
#ifdef ENABLE_ASCEND_API #ifdef ENABLE_ASCEND_API
#include "ascend/ascend_handle.h" #include "ascend/ascend_handle.h"
#endif #endif
#ifdef ENABLE_MOORE_API
#include "musa/musa_handle.h"
#endif
#ifdef ENABLE_KUNLUN_API #ifdef ENABLE_KUNLUN_API
#include "kunlun/kunlun_handle.h" #include "kunlun/kunlun_handle.h"
#endif #endif
...@@ -47,6 +50,9 @@ __C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr) { ...@@ -47,6 +50,9 @@ __C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr) {
#ifdef ENABLE_ASCEND_API #ifdef ENABLE_ASCEND_API
CREATE(INFINI_DEVICE_ASCEND, ascend); CREATE(INFINI_DEVICE_ASCEND, ascend);
#endif #endif
#ifdef ENABLE_MOORE_API
CREATE(INFINI_DEVICE_MOORE, musa);
#endif
#ifdef ENABLE_KUNLUN_API #ifdef ENABLE_KUNLUN_API
CREATE(INFINI_DEVICE_KUNLUN, kunlun); CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif #endif
...@@ -81,6 +87,9 @@ __C infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle) { ...@@ -81,6 +87,9 @@ __C infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle) {
#ifdef ENABLE_ASCEND_API #ifdef ENABLE_ASCEND_API
DELETE(INFINI_DEVICE_ASCEND, ascend); DELETE(INFINI_DEVICE_ASCEND, ascend);
#endif #endif
#ifdef ENABLE_MOORE_API
DELETE(INFINI_DEVICE_MOORE, musa);
#endif
#ifdef ENABLE_KUNLUN_API #ifdef ENABLE_KUNLUN_API
DELETE(INFINI_DEVICE_KUNLUN, kunlun); DELETE(INFINI_DEVICE_KUNLUN, kunlun);
#endif #endif
......
#include "../../../utils.h"
#include "../pool.h"
#include "musa_handle.h"
#include <mublas.h>
#include <mudnn.h>
#include <musa.h>
#include <musa_fp16_mtgpu.h>
#include <musa_runtime_api.h>
#define CHECK_MUBLAS(API) CHECK_INTERNAL(API, MUBLAS_STATUS_SUCCESS)
#define CHECK_MUDNN(API) CHECK_INTERNAL((int)API, (int)::musa::dnn::Status::SUCCESS)
namespace device::musa {
// Backend-private state shared by all copies of a Moore (MUSA) Handle:
// pools of mublas / mudnn library handles so repeated use avoids re-creation.
class Handle::Internal {
    Pool<std::unique_ptr<mublasHandle_t>> mublas_handles;
    Pool<std::unique_ptr<::musa::dnn::Handle>> mudnn_handles;

    // Callback type invoked with a ready-to-use library handle.
    template <typename T>
    using Fn = std::function<infiniStatus_t(T)>;

public:
    // Borrow a mublas handle bound to `stream`, run `f`, return it to the pool.
    infiniStatus_t useMublas(musaStream_t stream, const Fn<mublasHandle_t> &f) const;
    // Borrow a mudnn handle bound to `stream`, run `f`, return it to the pool.
    infiniStatus_t useMudnn(musaStream_t stream, const Fn<::musa::dnn::Handle &> &f) const;
};
} // namespace device::musa
#include "common_musa.h"
namespace device::musa {
// Construct a handle for the given device type and ordinal, creating the
// shared Internal state that caches mublas/mudnn library handles.
Handle::Handle(infiniDevice_t device, int device_id)
    : InfiniopHandle{device, device_id},
      _internal(std::make_shared<Handle::Internal>()) {}
// Convenience constructor: this backend only serves Moore (MUSA) devices.
Handle::Handle(int device_id) : Handle(INFINI_DEVICE_MOORE, device_id) {}
auto Handle::internal() const -> const std::shared_ptr<Internal> & {
return _internal;
}
// Run `f` with a mublas handle bound to `stream`. Handles are cached in a
// pool so repeated calls avoid the cost of mublasCreate; on success the
// handle is returned to the pool for reuse.
// NOTE(review): if mublasSetStream or `f` fails, the CHECK_* early return
// skips the push() below, so the handle is dropped without mublasDestroy —
// presumably acceptable on error paths, but worth confirming.
infiniStatus_t Handle::Internal::useMublas(musaStream_t stream, const Fn<mublasHandle_t> &f) const {
    std::unique_ptr<mublasHandle_t> handle;
    auto opt_handle = mublas_handles.pop();
    if (opt_handle.has_value()) {
        // Reuse a previously created handle from the pool.
        handle = std::move(*opt_handle);
    } else {
        // Pool empty: create a fresh mublas handle.
        handle = std::make_unique<mublasHandle_t>();
        CHECK_MUBLAS(mublasCreate(&(*handle)));
    }
    CHECK_MUBLAS(mublasSetStream(*handle, stream));
    CHECK_STATUS(f(*handle));
    mublas_handles.push(std::move(handle));
    return INFINI_STATUS_SUCCESS;
}
// Run `f` with a mudnn Handle bound to `stream`, reusing pooled handles.
// Unlike useMublas, a new ::musa::dnn::Handle needs no separate create call.
// NOTE(review): an early return from SetStream or `f` drops the handle
// instead of returning it to the pool — confirm this is intended.
infiniStatus_t Handle::Internal::useMudnn(musaStream_t stream, const Fn<::musa::dnn::Handle &> &f) const {
    std::unique_ptr<::musa::dnn::Handle> handle;
    auto opt_handle = mudnn_handles.pop();
    if (opt_handle.has_value()) {
        // Reuse a previously created handle from the pool.
        handle = std::move(*opt_handle);
    } else {
        handle = std::make_unique<::musa::dnn::Handle>();
    }
    CHECK_MUDNN(handle->SetStream(stream));
    CHECK_STATUS(f(*handle));
    mudnn_handles.push(std::move(handle));
    return INFINI_STATUS_SUCCESS;
}
// Factory used by the generic infiniop handle-creation dispatch; the caller
// takes ownership of *handle_ptr and releases it via the matching destroy path.
infiniStatus_t Handle::create(InfiniopHandle **handle_ptr, int device_id) {
    *handle_ptr = new Handle(INFINI_DEVICE_MOORE, device_id);
    return INFINI_STATUS_SUCCESS;
}
} // namespace device::musa
#ifndef __INFINIOP_MUSA_HANDLE_H__
#define __INFINIOP_MUSA_HANDLE_H__
#include "../../handle.h"
#include <memory>
namespace device::musa {
// infiniop handle for Moore Threads (MUSA) devices.
struct Handle : public InfiniopHandle {
    Handle(int device_id);
    // Backend-private state (mublas/mudnn handle pools), defined in the .cc/.h
    // implementation files.
    class Internal;
    // Accessor for the shared backend-private state.
    auto internal() const -> const std::shared_ptr<Internal> &;

public:
    // Allocate a new Handle for `device_id` and store it in *handle_ptr.
    static infiniStatus_t create(InfiniopHandle **handle_ptr, int device_id);

protected:
    Handle(infiniDevice_t device, int device_id);

private:
    std::shared_ptr<Internal> _internal;
};
} // namespace device::musa
#endif // __INFINIOP_MUSA_HANDLE_H__
...@@ -41,7 +41,7 @@ private: ...@@ -41,7 +41,7 @@ private:
struct Node { struct Node {
U data; U data;
Node<U> *next; Node<U> *next;
Node(U &&data) : data(data), next(nullptr) {} Node(U &&data) : data(std::move(data)), next(nullptr) {}
}; };
mutable std::atomic<Node<T> *> _head; mutable std::atomic<Node<T> *> _head;
......
...@@ -2,57 +2,10 @@ ...@@ -2,57 +2,10 @@
#define CAUSAL_SOFTMAX_H #define CAUSAL_SOFTMAX_H
#include "../../operator.h" #include "../../operator.h"
#include "../../tensor.h" #include "info.h"
#include <iostream>
#include <vector>
struct CausalSoftmaxInfo {
infiniDtype_t dtype;
size_t batch_size;
ptrdiff_t stride_b;
size_t seq_len;
ptrdiff_t stride_i;
size_t total_seq_len;
ptrdiff_t stride_j;
};
inline infiniStatus_t createCausalSoftmaxInfo(CausalSoftmaxInfo *info, infiniopTensorDescriptor_t y_desc) {
auto dtype = y_desc->dtype();
if (y_desc->dtype() != INFINI_DTYPE_F16 && y_desc->dtype() != INFINI_DTYPE_F32) {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
info->dtype = dtype;
if (y_desc->ndim() != 2 && y_desc->ndim() != 3) {
return INFINI_STATUS_BAD_TENSOR_SHAPE;
}
if (y_desc->shape()[y_desc->ndim() - 1] < y_desc->shape()[y_desc->ndim() - 2]) {
return INFINI_STATUS_BAD_TENSOR_SHAPE;
}
size_t batch_size = 1;
ptrdiff_t stride_b = 0;
size_t seq_len = y_desc->shape()[y_desc->ndim() - 2];
ptrdiff_t stride_i = y_desc->strides()[y_desc->ndim() - 2];
size_t total_seq_len = y_desc->shape()[y_desc->ndim() - 1];
ptrdiff_t stride_j = y_desc->strides()[y_desc->ndim() - 1];
if (y_desc->ndim() == 3) {
stride_b = y_desc->strides()[0];
batch_size = y_desc->shape()[0];
}
info->batch_size = batch_size;
info->stride_b = stride_b;
info->seq_len = seq_len;
info->stride_i = stride_i;
info->total_seq_len = total_seq_len;
info->stride_j = stride_j;
return INFINI_STATUS_SUCCESS;
}
#define DESCRIPTOR(NAMESPACE) \ #define DESCRIPTOR(NAMESPACE) \
\
namespace op::causal_softmax::NAMESPACE { \ namespace op::causal_softmax::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \ class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \ struct Opaque; \
...@@ -65,20 +18,26 @@ inline infiniStatus_t createCausalSoftmaxInfo(CausalSoftmaxInfo *info, infiniopT ...@@ -65,20 +18,26 @@ inline infiniStatus_t createCausalSoftmaxInfo(CausalSoftmaxInfo *info, infiniopT
CausalSoftmaxInfo info, \ CausalSoftmaxInfo info, \
size_t workspace_size, \ size_t workspace_size, \
infiniDevice_t device_type, \ infiniDevice_t device_type, \
int device_id) : InfiniopDescriptor{device_type, device_id}, \ int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \ _opaque(opaque), \
_info(info), \ _info(info), \
_workspace_size(workspace_size) {} \ _workspace_size(workspace_size) {} \
\ \
public: \ public: \
~Descriptor(); \ ~Descriptor(); \
\
size_t workspaceSize() const { return _workspace_size; } \ size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \ static infiniStatus_t create( \
infiniopHandle_t handle, \ infiniopHandle_t handle, \
Descriptor **desc_ptr, \ Descriptor **desc_ptr, \
infiniopTensorDescriptor_t y_desc); \ infiniopTensorDescriptor_t y_desc); \
infiniStatus_t calculate(void *workspace, size_t workspace_size, \ \
void *data, void *stream); \ infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \
void *data, \
void *stream) const; \
}; \ }; \
} }
......
...@@ -3,15 +3,16 @@ ...@@ -3,15 +3,16 @@
#include "../../../reduce/cpu/reduce.h" #include "../../../reduce/cpu/reduce.h"
namespace op::causal_softmax::cpu { namespace op::causal_softmax::cpu {
Descriptor::~Descriptor() {} Descriptor::~Descriptor() {}
infiniStatus_t Descriptor::create( infiniStatus_t Descriptor::create(
infiniopHandle_t handle, infiniopHandle_t handle,
Descriptor **desc_ptr, Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc) { infiniopTensorDescriptor_t y_desc) {
CausalSoftmaxInfo info; auto result = CausalSoftmaxInfo::create(y_desc);
CHECK_STATUS(createCausalSoftmaxInfo(&info, y_desc)); CHECK_RESULT(result);
*desc_ptr = new Descriptor(nullptr, info, 0, handle->device, handle->device_id); *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
...@@ -53,9 +54,11 @@ infiniStatus_t causal_softmax(const CausalSoftmaxInfo *info, T *data) { ...@@ -53,9 +54,11 @@ infiniStatus_t causal_softmax(const CausalSoftmaxInfo *info, T *data) {
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, infiniStatus_t Descriptor::calculate(
void *workspace, size_t workspace_size,
void *data, void *data,
void *stream) { void *stream) const {
if (_info.dtype == INFINI_DTYPE_F16) { if (_info.dtype == INFINI_DTYPE_F16) {
CHECK_STATUS(causal_softmax<fp16_t>(&_info, (fp16_t *)data)); CHECK_STATUS(causal_softmax<fp16_t>(&_info, (fp16_t *)data));
} else if (_info.dtype == INFINI_DTYPE_F32) { } else if (_info.dtype == INFINI_DTYPE_F32) {
......
#ifndef __CAUSAL_SOFTMAX_INFO_H__
#define __CAUSAL_SOFTMAX_INFO_H__
#include "../../../utils.h"
#include "../../tensor.h"
#include <vector>
namespace op::causal_softmax {
// Validated shape/stride description of a causal-softmax operand.
class CausalSoftmaxInfo {
    CausalSoftmaxInfo() = default;

public:
    infiniDtype_t dtype;
    size_t batch_size;
    ptrdiff_t stride_b;
    size_t seq_len;
    ptrdiff_t stride_i;
    size_t total_seq_len;
    ptrdiff_t stride_j;

    // Validate y_desc (F16/F32, 2-D or 3-D, total_seq_len >= seq_len) and
    // build the info; returns a bad-status Result on any violation.
    static utils::Result<CausalSoftmaxInfo> create(infiniopTensorDescriptor_t y_desc) {
        const auto dt = y_desc->dtype();
        if (dt != INFINI_DTYPE_F16 && dt != INFINI_DTYPE_F32) {
            return INFINI_STATUS_BAD_TENSOR_DTYPE;
        }
        const auto ndim = y_desc->ndim();
        if (ndim != 2 && ndim != 3) {
            return INFINI_STATUS_BAD_TENSOR_SHAPE;
        }
        const auto shape = y_desc->shape();
        const auto strides = y_desc->strides();
        // The attended (last) axis must be at least as long as the query axis.
        if (shape[ndim - 1] < shape[ndim - 2]) {
            return INFINI_STATUS_BAD_TENSOR_SHAPE;
        }
        CausalSoftmaxInfo info;
        info.dtype = dt;
        info.seq_len = shape[ndim - 2];
        info.stride_i = strides[ndim - 2];
        info.total_seq_len = shape[ndim - 1];
        info.stride_j = strides[ndim - 1];
        if (ndim == 3) {
            info.batch_size = shape[0];
            info.stride_b = strides[0];
        } else {
            // 2-D input: a single implicit batch with no batch stride.
            info.batch_size = 1;
            info.stride_b = 0;
        }
        return utils::Result<CausalSoftmaxInfo>(info);
    }
};
} // namespace op::causal_softmax
#endif // __CAUSAL_SOFTMAX_INFO_H__
...@@ -38,14 +38,12 @@ infiniStatus_t Descriptor::create( ...@@ -38,14 +38,12 @@ infiniStatus_t Descriptor::create(
return INFINI_STATUS_BAD_TENSOR_DTYPE; return INFINI_STATUS_BAD_TENSOR_DTYPE;
} }
infiniStatus_t status; auto result = MatmulInfo::create(c_desc, a_desc, b_desc, MatrixLayout::ROW_MAJOR);
auto info = MatmulInfo(c_desc, a_desc, b_desc, &status, MatrixLayout::ROW_MAJOR); CHECK_RESULT(result);
if (status != INFINI_STATUS_SUCCESS) { auto info = result.take();
return status;
}
auto c = new aclnnTensorDescriptor(toAclDataType(c_desc->dtype()), auto c = new aclnnTensorDescriptor(toAclDataType(c_desc->dtype()),
{static_cast<int64_t>(info.c_matrix.rows), static_cast<int64_t>(info.c_matrix.cols)}, {static_cast<int64_t>(info.m), static_cast<int64_t>(info.n)},
{info.c_matrix.row_stride, info.c_matrix.col_stride}); {info.c_matrix.row_stride, info.c_matrix.col_stride});
auto a = new aclnnTensorDescriptor(toAclDataType(a_desc->dtype()), auto a = new aclnnTensorDescriptor(toAclDataType(a_desc->dtype()),
{static_cast<int64_t>(info.a_matrix.rows), static_cast<int64_t>(info.a_matrix.cols)}, {static_cast<int64_t>(info.a_matrix.rows), static_cast<int64_t>(info.a_matrix.cols)},
......
...@@ -71,11 +71,9 @@ infiniStatus_t Descriptor::create( ...@@ -71,11 +71,9 @@ infiniStatus_t Descriptor::create(
return INFINI_STATUS_BAD_TENSOR_DTYPE; return INFINI_STATUS_BAD_TENSOR_DTYPE;
} }
infiniStatus_t status; auto result = MatmulInfo::create(c_desc, a_desc, b_desc, MatrixLayout::ROW_MAJOR);
auto info = MatmulInfo(c_desc, a_desc, b_desc, &status, MatrixLayout::ROW_MAJOR); CHECK_RESULT(result);
if (status != INFINI_STATUS_SUCCESS) { auto info = result.take();
return status;
}
cnnlTensorDescriptor_t a, b, c; cnnlTensorDescriptor_t a, b, c;
CHECK_BANG(cnnlCreateTensorDescriptor(&a)); CHECK_BANG(cnnlCreateTensorDescriptor(&a));
......
#ifndef __BLAS_H__
#define __BLAS_H__
#include "../../operator.h"
#include "../../tensor.h"
#include <algorithm>
namespace op::gemm {
// Describes one operand of a (possibly batched) GEMM as a 2-D matrix view.
struct BlasMatrix {
    size_t ndim;
    size_t batch;
    ptrdiff_t stride;
    size_t rows;
    size_t cols;
    ptrdiff_t row_stride;
    ptrdiff_t col_stride;

    BlasMatrix() = default;

    // Build the view from a 2-D ([rows, cols]) or 3-D ([batch, rows, cols])
    // tensor descriptor; any failure is reported through *status.
    BlasMatrix(infiniopTensorDescriptor_t layout, infiniStatus_t *status) {
        switch (layout->ndim()) {
        case 2:
            ndim = 2;
            batch = 1;
            stride = 0;
            rows = layout->dim(0);
            cols = layout->dim(1);
            row_stride = layout->stride(0);
            col_stride = layout->stride(1);
            break;
        case 3:
            ndim = 3;
            batch = layout->dim(0);
            stride = (batch == 1) ? 0 : layout->stride(0);
            rows = layout->dim(1);
            cols = layout->dim(2);
            row_stride = layout->stride(1);
            col_stride = layout->stride(2);
            break;
        default:
            *status = INFINI_STATUS_BAD_TENSOR_SHAPE;
            return;
        }
        // BLAS routines require one of the two axes to be contiguous.
        if (row_stride != 1 && col_stride != 1) {
            *status = INFINI_STATUS_BAD_TENSOR_STRIDES;
            return;
        }
        *status = INFINI_STATUS_SUCCESS;
    }

    // A batch of 1 broadcasts against any requested batch size.
    bool match_batch(size_t _batch) const {
        return batch == _batch || batch == 1;
    }

    // Swap the roles of rows and columns in place.
    void transpose() {
        std::swap(rows, cols);
        std::swap(row_stride, col_stride);
    }

    // Leading dimension: the stride of the non-contiguous axis.
    ptrdiff_t ld() const {
        if (row_stride == 1) {
            return col_stride;
        }
        return row_stride;
    }
};
// Memory order the backend BLAS routine expects for the output matrix.
enum class MatrixLayout : char {
    COL_MAJOR,
    ROW_MAJOR,
};
// Validated description of a (batched) matrix multiply C[m,n] = A[m,k] * B[k,n].
struct MatmulInfo {
    BlasMatrix a_matrix;
    BlasMatrix b_matrix;
    BlasMatrix c_matrix;

    // Problem sizes derived from the (possibly transposed) matrix views.
    size_t m, n, k, batch;
    // True when all three operands were transposed and A/B swapped (see below).
    bool is_transed = false;

    // Build and validate the GEMM description; any failure is reported
    // through *status and leaves the remaining fields unset.
    MatmulInfo(infiniopTensorDescriptor_t c_desc,
               infiniopTensorDescriptor_t a_desc,
               infiniopTensorDescriptor_t b_desc,
               infiniStatus_t *status,
               MatrixLayout layout) {
        a_matrix = BlasMatrix(a_desc, status);
        if (*status != INFINI_STATUS_SUCCESS) {
            return;
        }
        b_matrix = BlasMatrix(b_desc, status);
        if (*status != INFINI_STATUS_SUCCESS) {
            return;
        }
        c_matrix = BlasMatrix(c_desc, status);
        if (*status != INFINI_STATUS_SUCCESS) {
            return;
        }

        // Dimension compatibility: C is (a.rows x b.cols) and a.cols == b.rows.
        if (c_matrix.rows != a_matrix.rows || c_matrix.cols != b_matrix.cols || a_matrix.cols != b_matrix.rows) {
            *status = INFINI_STATUS_BAD_TENSOR_SHAPE;
            return;
        }

        // A or B may have batch 1 (broadcast); otherwise batches must match C's.
        batch = c_matrix.batch;
        if (!a_matrix.match_batch(batch) || !b_matrix.match_batch(batch)) {
            *status = INFINI_STATUS_BAD_TENSOR_SHAPE;
            return;
        }

        // If C's actual layout disagrees with the requested one, rewrite the
        // product using (A*B)^T = B^T * A^T: transpose all three views and
        // swap A with B so that C ends up in the requested layout.
        if ((layout == MatrixLayout::COL_MAJOR && c_matrix.col_stride == 1)
            || (layout == MatrixLayout::ROW_MAJOR && c_matrix.row_stride == 1)) {
            c_matrix.transpose();
            b_matrix.transpose();
            a_matrix.transpose();
            std::swap(a_matrix, b_matrix);
            is_transed = true;
        }

        // Sizes are taken after any transposition above.
        m = c_matrix.rows;
        n = c_matrix.cols;
        k = a_matrix.cols;
    }
};
} // namespace op::gemm
#endif // __BLAS_H__
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment