Unverified Commit 45d94235 authored by PanZezhong1725's avatar PanZezhong1725 Committed by GitHub
Browse files

Merge branch 'main' into issue/150

parents 12d75974 da0cef14
...@@ -92,29 +92,32 @@ void *testAllReduceThread(void *arg) { ...@@ -92,29 +92,32 @@ void *testAllReduceThread(void *arg) {
ThreadArgs *args = (ThreadArgs *)arg; ThreadArgs *args = (ThreadArgs *)arg;
*(args->result) = 1; *(args->result) = 1;
TEST_INFINI_THREAD(infinirtSetDevice(args->device_type, args->device_id)); TEST_INFINI_THREAD(infinirtSetDevice(args->device_type, args->device_id));
infinirtStream_t stream;
TEST_INFINI_THREAD(infinirtStreamCreate(&stream));
void *output = std::malloc(args->count * infiniSizeOf(args->dtype)); void *output = std::malloc(args->count * infiniSizeOf(args->dtype));
std::memset(output, 0, args->count * infiniSizeOf(args->dtype)); std::memset(output, 0, args->count * infiniSizeOf(args->dtype));
void *buf; void *buf;
TEST_INFINI_THREAD(infinirtMalloc(&buf, args->count * infiniSizeOf(args->dtype))); TEST_INFINI_THREAD(infinirtMalloc(&buf, args->count * infiniSizeOf(args->dtype)));
TEST_INFINI_THREAD(infinirtMemcpy(buf, args->data, args->count * infiniSizeOf(args->dtype), INFINIRT_MEMCPY_H2D)); TEST_INFINI_THREAD(infinirtMemcpy(buf, args->data, args->count * infiniSizeOf(args->dtype), INFINIRT_MEMCPY_H2D));
TEST_INFINI_THREAD(infinicclAllReduce(buf, buf, args->count, args->dtype, INFINICCL_SUM, args->comm, NULL)); TEST_INFINI_THREAD(infinicclAllReduce(buf, buf, args->count, args->dtype, INFINICCL_SUM, args->comm, stream));
TEST_INFINI_THREAD(infinirtDeviceSynchronize()); TEST_INFINI_THREAD(infinirtDeviceSynchronize());
TEST_INFINI_THREAD(infinirtMemcpy(output, buf, args->count * infiniSizeOf(args->dtype), INFINIRT_MEMCPY_D2H)); TEST_INFINI_THREAD(infinirtMemcpy(output, buf, args->count * infiniSizeOf(args->dtype), INFINIRT_MEMCPY_D2H));
if (checkData(output, args->ans, args->dtype, args->count) != 0) { if (checkData(output, args->ans, args->dtype, args->count) != 0) {
std::free(output); std::free(output);
infinirtFree(buf); infinirtFree(buf);
infinirtStreamDestroy(stream);
return nullptr; return nullptr;
} }
for (size_t i = 0; i < WARM_UPS; i++) { for (size_t i = 0; i < WARM_UPS; i++) {
TEST_INFINI_THREAD(infinicclAllReduce(buf, buf, args->count, args->dtype, INFINICCL_SUM, args->comm, NULL)); TEST_INFINI_THREAD(infinicclAllReduce(buf, buf, args->count, args->dtype, INFINICCL_SUM, args->comm, stream));
} }
TEST_INFINI_THREAD(infinirtDeviceSynchronize()); TEST_INFINI_THREAD(infinirtDeviceSynchronize());
// measure time // measure time
auto start = std::chrono::high_resolution_clock::now(); auto start = std::chrono::high_resolution_clock::now();
for (size_t i = 0; i < ITERATIONS; i++) { for (size_t i = 0; i < ITERATIONS; i++) {
TEST_INFINI_THREAD(infinicclAllReduce(buf, buf, args->count, args->dtype, INFINICCL_SUM, args->comm, NULL)); TEST_INFINI_THREAD(infinicclAllReduce(buf, buf, args->count, args->dtype, INFINICCL_SUM, args->comm, stream));
} }
TEST_INFINI_THREAD(infinirtDeviceSynchronize()); TEST_INFINI_THREAD(infinirtDeviceSynchronize());
auto end = std::chrono::high_resolution_clock::now(); auto end = std::chrono::high_resolution_clock::now();
...@@ -125,6 +128,7 @@ void *testAllReduceThread(void *arg) { ...@@ -125,6 +128,7 @@ void *testAllReduceThread(void *arg) {
std::free(output); std::free(output);
infinirtFree(buf); infinirtFree(buf);
infinirtStreamDestroy(stream);
return nullptr; return nullptr;
} }
......
...@@ -60,6 +60,7 @@ ParsedArgs parseArgs(int argc, char *argv[]) { ...@@ -60,6 +60,7 @@ ParsedArgs parseArgs(int argc, char *argv[]) {
int main(int argc, char *argv[]) { int main(int argc, char *argv[]) {
ParsedArgs args = parseArgs(argc, argv); ParsedArgs args = parseArgs(argc, argv);
int ndevice = 0; int ndevice = 0;
infinirtInit();
if (infinirtGetDeviceCount(args.device_type, &ndevice) != INFINI_STATUS_SUCCESS) { if (infinirtGetDeviceCount(args.device_type, &ndevice) != INFINI_STATUS_SUCCESS) {
std::cout << "Failed to get device count" << std::endl; std::cout << "Failed to get device count" << std::endl;
return -1; return -1;
......
#include "infiniccl_ascend.h"
#include "../../utils.h"
#include <acl/acl.h>
#include <hccl.h>
#include <iostream>
#include <vector>
#define CHECK_HCCL(API__) CHECK_INTERNAL(API__, HCCL_SUCCESS)
// Convert an infinirt stream handle to the ACL stream type.
// A null infinirt stream maps to the ACL default stream (null).
inline aclrtStream getAscendStream(infinirtStream_t stream) {
    if (stream == nullptr) {
        // Use nullptr (not 0) for the pointer-typed default stream.
        return nullptr;
    }
    return static_cast<aclrtStream>(stream);
}
// Unwrap the raw HCCL communicator stored inside the generic comm wrapper.
inline HcclComm getHcclComm(infinicclComm_t comm) {
    auto raw = comm->comm;
    return static_cast<HcclComm>(raw);
}
// Map an infini dtype to the corresponding HCCL data type.
// Only FP32 and FP16 are supported; any other dtype aborts the process.
inline HcclDataType getAscendDtype(infiniDtype_t datatype) {
    if (datatype == INFINI_DTYPE_F32) {
        return HCCL_DATA_TYPE_FP32;
    }
    if (datatype == INFINI_DTYPE_F16) {
        return HCCL_DATA_TYPE_FP16;
    }
    std::cerr << "Unsupported data type: " << datatype << std::endl;
    std::abort();
    return HCCL_DATA_TYPE_FP16; // unreachable; keeps the compiler quiet
}
// Map an infiniccl reduction op to the corresponding HCCL reduction op.
// Unknown ops abort the process.
inline HcclReduceOp getHcclRedOp(infinicclReduceOp_t op) {
    if (op == INFINICCL_SUM) {
        return HCCL_REDUCE_SUM;
    }
    if (op == INFINICCL_PROD) {
        return HCCL_REDUCE_PROD;
    }
    if (op == INFINICCL_MAX) {
        return HCCL_REDUCE_MAX;
    }
    if (op == INFINICCL_MIN) {
        return HCCL_REDUCE_MIN;
    }
    std::abort();
    return HCCL_REDUCE_SUM; // unreachable; keeps the compiler quiet
}
namespace infiniccl::ascend {
// Create one communicator per device for a single-process multi-device group.
//
// @param comms       out-array receiving `ndevice` newly-allocated wrappers
// @param ndevice     number of participating devices
// @param device_ids  physical device ids, one per communicator
infiniStatus_t commInitAll(
    infinicclComm_t *comms,
    int ndevice,
    const int *device_ids) {
    // Ascend requires all devices to be initialized before calling HcclCommInitAll.
    // Iterate backwards so device_ids[0] remains the current device afterwards.
    for (int i = ndevice - 1; i >= 0; i--) {
        // The original ignored this return value; propagate failures instead.
        CHECK_INTERNAL(aclrtSetDevice(device_ids[i]), ACL_SUCCESS);
    }
    std::vector<HcclComm> hccl_comms(ndevice);
    CHECK_HCCL(HcclCommInitAll(ndevice, (int32_t *)device_ids, hccl_comms.data()));
    for (int i = 0; i < ndevice; i++) {
        comms[i] = new InfinicclComm{INFINI_DEVICE_ASCEND, device_ids[i], (void *)(hccl_comms[i])};
    }
    return INFINI_STATUS_SUCCESS;
}
// Destroy the underlying HCCL communicator and release the wrapper object.
infiniStatus_t commDestroy(infinicclComm_t comm) {
    // NOTE(review): if HcclCommDestroy fails, CHECK_HCCL returns early and the
    // wrapper `comm` is leaked — consider deleting it on both paths.
    CHECK_HCCL(HcclCommDestroy(getHcclComm(comm)));
    delete comm;
    return INFINI_STATUS_SUCCESS;
}
// All-reduce `count` elements across every rank of `comm` via HCCL.
//
// @param sendbuf   device buffer holding this rank's input (may equal recvbuf)
// @param recvbuf   device buffer receiving the reduced result
// @param count     number of elements (not bytes)
// @param datatype  only INFINI_DTYPE_F32 / INFINI_DTYPE_F16 are accepted;
//                  rejected up front so getAscendDtype never hits its abort path
// @param op        reduction operator (sum/prod/max/min)
// @param stream    null enqueues on the ACL default stream
infiniStatus_t allReduce(
    void *sendbuf,
    void *recvbuf,
    size_t count,
    infiniDtype_t datatype,
    infinicclReduceOp_t op,
    infinicclComm_t comm,
    infinirtStream_t stream) {
    if (datatype != INFINI_DTYPE_F32 && datatype != INFINI_DTYPE_F16) {
        return INFINI_STATUS_BAD_PARAM;
    }
    CHECK_HCCL(HcclAllReduce(sendbuf, recvbuf, (uint64_t)count,
                             getAscendDtype(datatype), getHcclRedOp(op),
                             getHcclComm(comm), getAscendStream(stream)));
    return INFINI_STATUS_SUCCESS;
}
} // namespace infiniccl::ascend
#ifndef INFINICCL_ASCEND_H_
#define INFINICCL_ASCEND_H_
#include "../infiniccl_impl.h"
// Expose the real Ascend/HCCL implementation only when both the Ascend
// backend and CCL support are compiled in; otherwise emit no-op stubs so
// the device dispatcher still links.
#if defined(ENABLE_ASCEND_API) && defined(ENABLE_CCL)
INFINICCL_DEVICE_API_IMPL(ascend)
#else
INFINICCL_DEVICE_API_NOOP(ascend)
#endif
#endif /* INFINICCL_ASCEND_H_ */
#include "infiniccl.h" #include "infiniccl.h"
#include "./ascend/infiniccl_ascend.h"
#include "./cuda/infiniccl_cuda.h" #include "./cuda/infiniccl_cuda.h"
__C infiniStatus_t infinicclCommInitAll( __C infiniStatus_t infinicclCommInitAll(
...@@ -14,6 +15,7 @@ __C infiniStatus_t infinicclCommInitAll( ...@@ -14,6 +15,7 @@ __C infiniStatus_t infinicclCommInitAll(
switch (device_type) { switch (device_type) {
COMM_INIT_ALL(INFINI_DEVICE_NVIDIA, cuda) COMM_INIT_ALL(INFINI_DEVICE_NVIDIA, cuda)
COMM_INIT_ALL(INFINI_DEVICE_ASCEND, ascend)
default: default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
} }
...@@ -32,6 +34,7 @@ __C infiniStatus_t infinicclCommDestroy(infinicclComm_t comm) { ...@@ -32,6 +34,7 @@ __C infiniStatus_t infinicclCommDestroy(infinicclComm_t comm) {
switch (comm->device_type) { switch (comm->device_type) {
COMM_DESTROY(INFINI_DEVICE_NVIDIA, cuda) COMM_DESTROY(INFINI_DEVICE_NVIDIA, cuda)
COMM_DESTROY(INFINI_DEVICE_ASCEND, ascend)
default: default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
} }
...@@ -57,6 +60,7 @@ __C infiniStatus_t infinicclAllReduce( ...@@ -57,6 +60,7 @@ __C infiniStatus_t infinicclAllReduce(
switch (comm->device_type) { switch (comm->device_type) {
ALL_REDUCE(INFINI_DEVICE_NVIDIA, cuda) ALL_REDUCE(INFINI_DEVICE_NVIDIA, cuda)
ALL_REDUCE(INFINI_DEVICE_ASCEND, ascend)
default: default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
} }
......
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
*/ */
DECLARE_INFINIOP_TEST(gemm) DECLARE_INFINIOP_TEST(gemm)
DECLARE_INFINIOP_TEST(random_sample) DECLARE_INFINIOP_TEST(random_sample)
DECLARE_INFINIOP_TEST(rms_norm)
DECLARE_INFINIOP_TEST(mul) DECLARE_INFINIOP_TEST(mul)
DECLARE_INFINIOP_TEST(rope) DECLARE_INFINIOP_TEST(rope)
DECLARE_INFINIOP_TEST(clip) DECLARE_INFINIOP_TEST(clip)
...@@ -35,6 +36,7 @@ DECLARE_INFINIOP_TEST(add) ...@@ -35,6 +36,7 @@ DECLARE_INFINIOP_TEST(add)
REGISTER_INFINIOP_TEST(clip) \ REGISTER_INFINIOP_TEST(clip) \
REGISTER_INFINIOP_TEST(swiglu) \ REGISTER_INFINIOP_TEST(swiglu) \
REGISTER_INFINIOP_TEST(rope) \ REGISTER_INFINIOP_TEST(rope) \
REGISTER_INFINIOP_TEST(rms_norm) \
} }
namespace infiniop_test { namespace infiniop_test {
......
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace infiniop_test::rms_norm {
// Inputs, reference output, and hyper-parameters for one RMSNorm test case.
struct Test::Attributes {
    float epsilon;               // numerical-stability constant added under the sqrt
    std::shared_ptr<Tensor> x;   // input activations
    std::shared_ptr<Tensor> w;   // per-channel scale weights
    std::shared_ptr<Tensor> ans; // precomputed reference output
    std::shared_ptr<Tensor> y;   // output buffer written by the op
};
// Assemble a Test from parsed GGUF attributes/tensors, validating that every
// required entry ("epsilon", "x", "w", "ans", "y") is present.
// Throws std::runtime_error when anything is missing.
std::shared_ptr<Test> Test::build(
    std::unordered_map<std::string, std::vector<uint8_t>> attributes,
    std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
    double rtol, double atol) {
    const bool missing = attributes.find("epsilon") == attributes.end()
                      || tensors.find("x") == tensors.end()
                      || tensors.find("w") == tensors.end()
                      || tensors.find("ans") == tensors.end()
                      || tensors.find("y") == tensors.end();
    if (missing) {
        throw std::runtime_error("Invalid Test: Missing attributes or tensors");
    }
    auto test = std::shared_ptr<Test>(new Test(rtol, atol));
    test->_attributes = new Attributes();
    test->_attributes->epsilon = *reinterpret_cast<float *>(attributes["epsilon"].data());
    test->_attributes->x = tensors["x"];
    test->_attributes->w = tensors["w"];
    test->_attributes->ans = tensors["ans"];
    test->_attributes->y = tensors["y"];
    return test;
}
// Create the RMSNorm descriptor, run the op once on `device` to validate the
// result against the reference answer, then benchmark it.
// Fix over the original: the device workspace is now freed on the execution-
// failure and result-mismatch paths too (it was leaked there before).
// NOTE(review): op_desc is never destroyed on any path — confirm whether a
// matching infiniopDestroy... API exists and should be called here.
std::shared_ptr<infiniop_test::Result> Test::run(
    infiniopHandle_t handle, infiniDevice_t device, int device_id,
    size_t warm_ups, size_t iterations) {
    infiniopRMSNormDescriptor_t op_desc;
    CHECK_OR(infiniopCreateRMSNormDescriptor(handle, &op_desc,
                                             _attributes->y->desc(),
                                             _attributes->x->desc(),
                                             _attributes->w->desc(),
                                             _attributes->epsilon),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create RMSNorm descriptor"));
    auto x = _attributes->x->to(device, device_id);
    auto w = _attributes->w->to(device, device_id);
    auto y = _attributes->y->to(device, device_id);
    size_t workspace_size;
    CHECK_OR(infiniopGetRMSNormWorkspaceSize(op_desc, &workspace_size),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size"));
    void *workspace = nullptr;
    if (workspace_size > 0) {
        CHECK_OR(infinirtMalloc(&workspace, workspace_size),
                 return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace"));
    }
    CHECK_OR(infiniopRMSNorm(op_desc,
                             workspace, workspace_size,
                             y->data(),
                             x->data(),
                             w->data(),
                             nullptr),
             {
                 if (workspace != nullptr) {
                     infinirtFree(workspace);
                 }
                 return TEST_FAILED(OP_EXECUTION_FAILED, "RMSNorm execution failed");
             });
    try {
        allClose(y, _attributes->ans, _rtol, _atol);
    } catch (const std::exception &e) {
        if (workspace != nullptr) {
            infinirtFree(workspace);
        }
        return TEST_FAILED(RESULT_INCORRECT, e.what());
    }
    double elapsed_time = benchmark(
        [=]() {
            infiniopRMSNorm(op_desc,
                            workspace, workspace_size,
                            y->data(),
                            x->data(),
                            w->data(),
                            nullptr);
        },
        warm_ups, iterations);
    if (workspace != nullptr) {
        infinirtFree(workspace);
    }
    return TEST_PASSED(elapsed_time);
}
// Names of scalar attributes this test reads from the GGUF file.
std::vector<std::string> Test::attribute_names() {
    std::vector<std::string> names;
    names.emplace_back("epsilon");
    return names;
}
// Names of tensors this test reads from the GGUF file.
std::vector<std::string> Test::tensor_names() {
    return std::vector<std::string>{"x", "w", "ans", "y"};
}
// Names of tensors the op writes.
std::vector<std::string> Test::output_names() {
    return std::vector<std::string>{"y"};
}
// Human-readable summary of this test case for logs.
std::string Test::toString() const {
    std::ostringstream out;
    out << op_name() << std::endl
        << "- epsilon=" << _attributes->epsilon << std::endl
        << "- x: " << _attributes->x->info() << std::endl
        << "- w: " << _attributes->w->info() << std::endl
        << "- y: " << _attributes->y->info() << std::endl;
    // Tolerances are printed in scientific notation with two decimals.
    out << std::scientific << std::setprecision(2)
        << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
    return out.str();
}
// Release the heap-allocated attribute bundle created in build().
Test::~Test() {
    delete _attributes;
}
} // namespace infiniop_test::rms_norm
import numpy as np
from typing import List
from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides
def random_tensor(shape: tuple, dtype: np.dtype) -> np.ndarray:
return np.random.uniform(-1.0, 1.0, shape).astype(dtype) * 0.001
def rms_norm(x: np.ndarray, w: np.ndarray, epsilon: float) -> np.ndarray:
    """
    Reference RMSNorm computed with numpy.

    Args:
        x: input tensor of shape [..., hidden_size]
        w: scale weights of shape [hidden_size]
        epsilon: small constant guarding against division by zero
    Returns:
        Tensor with the same shape as x: (x / rms(x)) * w.
    """
    mean_square = np.mean(x ** 2, axis=-1, keepdims=True)
    rms = np.sqrt(mean_square + epsilon)
    return (x / rms) * w
class RMSNormTestCase(InfiniopTestCase):
    """One rms_norm GGUF test case: inputs x/w, output placeholder y, strides, and epsilon."""
    def __init__(
        self,
        x: np.ndarray,
        w: np.ndarray,
        y: np.ndarray,
        shape: List[int] | None,
        x_strides: List[int] | None,
        y_strides: List[int] | None,
        epsilon: float = 1e-5,
    ):
        super().__init__("rms_norm")
        self.x = x
        self.w = w
        self.y = y
        self.shape = shape
        self.epsilon = epsilon
        # Strides are in elements; None means contiguous layout.
        self.x_strides=x_strides
        self.y_strides=y_strides
    def write_test(self, test_writer: "InfiniopTestWriter"):
        """Serialize this case (epsilon, shapes, strides, tensors, float64 reference answer)."""
        super().write_test(test_writer)
        test_writer.add_float32(test_writer.gguf_key("epsilon"), self.epsilon)
        if self.shape is not None:
            # x and y share the same logical shape.
            test_writer.add_array(test_writer.gguf_key("x.shape"), self.shape)
            test_writer.add_array(test_writer.gguf_key("y.shape"), self.shape)
        if self.x_strides is not None:
            test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.x_strides))
        # NOTE(review): if shape is None and y_strides is None this falls through to
        # contiguous_gguf_strides(None) — confirm callers always pass a shape.
        test_writer.add_array(
            test_writer.gguf_key("y.strides"),
            gguf_strides(*self.y_strides if self.y_strides is not None else contiguous_gguf_strides(self.shape))
        )
        test_writer.add_tensor(
            test_writer.gguf_key("x"),
            self.x,
            raw_dtype=np_dtype_to_ggml(self.x.dtype),
        )
        test_writer.add_tensor(
            test_writer.gguf_key("w"),
            self.w,
            raw_dtype=np_dtype_to_ggml(self.w.dtype),
        )
        test_writer.add_tensor(
            test_writer.gguf_key("y"),
            self.y,
            raw_dtype=np_dtype_to_ggml(self.y.dtype),
        )
        # The reference answer is always computed and stored in float64 for accuracy,
        # regardless of the input dtype.
        ans = rms_norm(self.x.astype(np.float64), self.w.astype(np.float64), self.epsilon)
        test_writer.add_tensor(
            test_writer.gguf_key("ans"),
            ans,
            raw_dtype=np_dtype_to_ggml(np.float64),
        )
if __name__ == "__main__":
test_writer = InfiniopTestWriter("rms_norm.gguf")
test_cases = []
_TEST_CASES_ = [
# shape, x_strides, y_strides
((2, 256), None, None),
((4, 512), None, None),
((8, 1024), None, None),
((1, 768), None, None),
((8, 256), None, None),
((500, 4096), None, None),
((4, 512), (1024, 1), None),
((4, 512), (512, 1), None),
((500, 4096), (9192, 1), None),
((500, 4096), (4096, 1), None),
((4, 512), None, (1024, 1)),
((500, 4096), None, (8192, 1)),
((4, 512), (1024, 1), (512, 1)),
((4, 512), None, (2048, 1)),
]
_TENSOR_DTYPES_ = [np.float32, np.float16]
for dtype in _TENSOR_DTYPES_:
for shape, x_strides, y_strides in _TEST_CASES_:
w = np.random.rand(shape[-1]).astype(dtype)
x = np.random.rand(*shape).astype(dtype)
y = np.empty(tuple(0 for _ in shape), dtype=dtype)
epsilon = 1e-5
test_case = RMSNormTestCase(
x=x,
w=w,
y=y,
shape=shape,
x_strides=x_strides,
y_strides=y_strides,
epsilon=epsilon
)
test_cases.append(test_case)
test_writer.add_tests(test_cases)
test_writer.save()
...@@ -242,6 +242,9 @@ target("infiniccl") ...@@ -242,6 +242,9 @@ target("infiniccl")
if has_config("nv-gpu") then if has_config("nv-gpu") then
add_deps("infiniccl-cuda") add_deps("infiniccl-cuda")
end end
if has_config("ascend-npu") then
add_deps("infiniccl-ascend")
end
set_languages("cxx17") set_languages("cxx17")
......
...@@ -63,3 +63,18 @@ target("infinirt-ascend") ...@@ -63,3 +63,18 @@ target("infinirt-ascend")
add_files("$(projectdir)/src/infinirt/ascend/*.cc") add_files("$(projectdir)/src/infinirt/ascend/*.cc")
add_cxflags("-lstdc++ -Wall -Werror -fPIC") add_cxflags("-lstdc++ -Wall -Werror -fPIC")
target_end() target_end()
target("infiniccl-ascend")
set_kind("static")
add_deps("infinirt")
add_deps("infini-utils")
set_warnings("all", "error")
set_languages("cxx17")
on_install(function (target) end)
if has_config("ccl") then
add_includedirs(ASCEND_HOME .. "/include/hccl")
add_links("libhccl.so")
add_files("../src/infiniccl/ascend/*.cc")
add_cxflags("-lstdc++ -fPIC")
end
target_end()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment