Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
c2e87202
Commit
c2e87202
authored
Jun 04, 2025
by
Catheriany
Browse files
Merge remote-tracking branch 'origin/main' into issue/142
parents
41818f84
c203635b
Changes
175
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
806 additions
and
61 deletions
+806
-61
src/infiniccl/infiniccl.cc
src/infiniccl/infiniccl.cc
+65
-0
src/infiniccl/infiniccl_impl.h
src/infiniccl/infiniccl_impl.h
+37
-0
src/infiniop-test/include/ops.hpp
src/infiniop-test/include/ops.hpp
+9
-0
src/infiniop-test/include/tensor.hpp
src/infiniop-test/include/tensor.hpp
+3
-1
src/infiniop-test/include/test.hpp
src/infiniop-test/include/test.hpp
+2
-0
src/infiniop-test/src/ops/add.cpp
src/infiniop-test/src/ops/add.cpp
+109
-0
src/infiniop-test/src/ops/clip.cpp
src/infiniop-test/src/ops/clip.cpp
+120
-0
src/infiniop-test/src/ops/gemm.cpp
src/infiniop-test/src/ops/gemm.cpp
+4
-0
src/infiniop-test/src/ops/mul.cpp
src/infiniop-test/src/ops/mul.cpp
+109
-0
src/infiniop-test/src/ops/random_sample.cpp
src/infiniop-test/src/ops/random_sample.cpp
+4
-0
src/infiniop-test/src/ops/swiglu.cpp
src/infiniop-test/src/ops/swiglu.cpp
+104
-0
src/infiniop-test/src/tensor.cpp
src/infiniop-test/src/tensor.cpp
+57
-6
src/infiniop-test/src/test.cpp
src/infiniop-test/src/test.cpp
+6
-1
src/infiniop/devices/ascend/CMakeLists.txt
src/infiniop/devices/ascend/CMakeLists.txt
+7
-4
src/infiniop/devices/ascend/ascend_kernel_common.h
src/infiniop/devices/ascend/ascend_kernel_common.h
+20
-0
src/infiniop/devices/ascend/common_ascend.cc
src/infiniop/devices/ascend/common_ascend.cc
+17
-6
src/infiniop/devices/cuda/cuda_common.cuh
src/infiniop/devices/cuda/cuda_common.cuh
+0
-42
src/infiniop/devices/cuda/cuda_kernel_common.cuh
src/infiniop/devices/cuda/cuda_kernel_common.cuh
+68
-0
src/infiniop/devices/kunlun/kunlun_handle.h
src/infiniop/devices/kunlun/kunlun_handle.h
+1
-1
src/infiniop/devices/kunlun/kunlun_kernel_common.h
src/infiniop/devices/kunlun/kunlun_kernel_common.h
+64
-0
No files found.
src/infiniccl/infiniccl.cc
0 → 100644
View file @
c2e87202
#include "infiniccl.h"
#include "./cuda/infiniccl_cuda.h"
__C
infiniStatus_t
infinicclCommInitAll
(
infiniDevice_t
device_type
,
infinicclComm_t
*
comms
,
int
ndevice
,
const
int
*
device_ids
)
{
#define COMM_INIT_ALL(CASE_, NAMESPACE_) \
case CASE_: \
return infiniccl::NAMESPACE_::commInitAll(comms, ndevice, device_ids);
switch
(
device_type
)
{
COMM_INIT_ALL
(
INFINI_DEVICE_NVIDIA
,
cuda
)
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
#undef COMM_INIT_ALL
}
__C
infiniStatus_t
infinicclCommDestroy
(
infinicclComm_t
comm
)
{
if
(
comm
==
nullptr
)
{
return
INFINI_STATUS_SUCCESS
;
}
#define COMM_DESTROY(CASE_, NAMESPACE_) \
case CASE_: \
return infiniccl::NAMESPACE_::commDestroy(comm);
switch
(
comm
->
device_type
)
{
COMM_DESTROY
(
INFINI_DEVICE_NVIDIA
,
cuda
)
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
#undef COMM_DESTROY
}
__C
infiniStatus_t
infinicclAllReduce
(
void
*
sendbuf
,
void
*
recvbuf
,
size_t
count
,
infiniDtype_t
dataype
,
infinicclReduceOp_t
op
,
infinicclComm_t
comm
,
infinirtStream_t
stream
)
{
if
(
comm
==
nullptr
)
{
return
INFINI_STATUS_NULL_POINTER
;
}
#define ALL_REDUCE(CASE_, NAMESPACE_) \
case CASE_: \
return infiniccl::NAMESPACE_::allReduce(sendbuf, recvbuf, count, dataype, op, comm, stream);
switch
(
comm
->
device_type
)
{
ALL_REDUCE
(
INFINI_DEVICE_NVIDIA
,
cuda
)
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
#undef ALL_REDUCE
}
src/infiniccl/infiniccl_impl.h
0 → 100644
View file @
c2e87202
#ifndef INFINICCL_IMPL_H
#define INFINICCL_IMPL_H
#include "infiniccl.h"
struct
InfinicclComm
{
infiniDevice_t
device_type
;
int
device_id
;
// the actual device ID, not rank number
void
*
comm
;
// the actual communicator
};
#define INFINICCL_DEVICE_API(NAMSPACE, IMPL) \
namespace infiniccl::NAMSPACE { \
infiniStatus_t commInitAll( \
infinicclComm_t *comms, \
int ndevice, \
const int *device_ids) IMPL; \
\
infiniStatus_t commDestroy(infinicclComm_t comm) IMPL; \
\
infiniStatus_t allReduce( \
void *sendbuf, \
void *recvbuf, \
size_t count, \
infiniDtype_t datatype, \
infinicclReduceOp_t op, \
infinicclComm_t comm, \
infinirtStream_t stream) IMPL; \
};
#define INFINICCL_DEVICE_API_IMPL(NAMSPACE) \
INFINICCL_DEVICE_API(NAMSPACE, )
#define INFINICCL_DEVICE_API_NOOP(NAMSPACE) \
INFINICCL_DEVICE_API(NAMSPACE, { return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; })
#endif // INFINICCL_IMPL_H
src/infiniop-test/include/ops.hpp
View file @
c2e87202
...
...
@@ -8,6 +8,10 @@
DECLARE_INFINIOP_TEST
(
gemm
)
DECLARE_INFINIOP_TEST
(
random_sample
)
DECLARE_INFINIOP_TEST
(
rms_norm
)
DECLARE_INFINIOP_TEST
(
mul
)
DECLARE_INFINIOP_TEST
(
clip
)
DECLARE_INFINIOP_TEST
(
swiglu
)
DECLARE_INFINIOP_TEST
(
add
)
#define REGISTER_INFINIOP_TEST(name) \
{ \
...
...
@@ -16,6 +20,7 @@ DECLARE_INFINIOP_TEST(rms_norm)
infiniop_test::name::Test::build, \
infiniop_test::name::Test::attribute_names(), \
infiniop_test::name::Test::tensor_names(), \
infiniop_test::name::Test::output_names(), \
}},
/*
...
...
@@ -25,6 +30,10 @@ DECLARE_INFINIOP_TEST(rms_norm)
{ \
REGISTER_INFINIOP_TEST(gemm) \
REGISTER_INFINIOP_TEST(random_sample) \
REGISTER_INFINIOP_TEST(add) \
REGISTER_INFINIOP_TEST(mul) \
REGISTER_INFINIOP_TEST(clip) \
REGISTER_INFINIOP_TEST(swiglu) \
REGISTER_INFINIOP_TEST(rms_norm) \
}
...
...
src/infiniop-test/include/tensor.hpp
View file @
c2e87202
...
...
@@ -58,7 +58,9 @@ private:
public:
Tensor
(
const
GGUFTensorInfo
*
info
,
const
void
*
ggml_ptr
,
const
GGUFKeyValue
*
strides_meta
=
nullptr
);
const
GGUFKeyValue
*
shape_meta
=
nullptr
,
const
GGUFKeyValue
*
strides_meta
=
nullptr
,
bool
isOutput
=
false
);
Tensor
(
std
::
shared_ptr
<
Memory
>
memory
,
size_t
offset
,
const
std
::
vector
<
size_t
>
&
shape
,
const
std
::
vector
<
ptrdiff_t
>
&
strides
,
...
...
src/infiniop-test/include/test.hpp
View file @
c2e87202
...
...
@@ -92,6 +92,7 @@ public:
\
static std::vector<std::string> attribute_names(); \
static std::vector<std::string> tensor_names(); \
static std::vector<std::string> output_names(); \
\
std::shared_ptr<infiniop_test::Result> run( \
infiniopHandle_t handle, infiniDevice_t device, int device_id, \
...
...
@@ -121,6 +122,7 @@ struct TestBuilder {
BuilderFunc
build
;
std
::
vector
<
std
::
string
>
attribute_names
;
std
::
vector
<
std
::
string
>
tensor_names
;
std
::
vector
<
std
::
string
>
output_names
;
};
}
// namespace infiniop_test
...
...
src/infiniop-test/src/ops/add.cpp
0 → 100644
View file @
c2e87202
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace
infiniop_test
::
add
{
struct
Test
::
Attributes
{
std
::
shared_ptr
<
Tensor
>
a
;
std
::
shared_ptr
<
Tensor
>
b
;
std
::
shared_ptr
<
Tensor
>
c
;
std
::
shared_ptr
<
Tensor
>
ans
;
};
std
::
shared_ptr
<
Test
>
Test
::
build
(
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
uint8_t
>>
attributes
,
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
Tensor
>>
tensors
,
double
rtol
,
double
atol
)
{
auto
test
=
std
::
shared_ptr
<
Test
>
(
new
Test
(
rtol
,
atol
));
test
->
_attributes
=
new
Attributes
();
if
(
tensors
.
find
(
"a"
)
==
tensors
.
end
()
||
tensors
.
find
(
"b"
)
==
tensors
.
end
()
||
tensors
.
find
(
"c"
)
==
tensors
.
end
()
||
tensors
.
find
(
"ans"
)
==
tensors
.
end
())
{
throw
std
::
runtime_error
(
"Invalid Test"
);
}
test
->
_attributes
->
a
=
tensors
[
"a"
];
test
->
_attributes
->
b
=
tensors
[
"b"
];
test
->
_attributes
->
c
=
tensors
[
"c"
];
test
->
_attributes
->
ans
=
tensors
[
"ans"
];
return
test
;
}
std
::
shared_ptr
<
infiniop_test
::
Result
>
Test
::
run
(
infiniopHandle_t
handle
,
infiniDevice_t
device
,
int
device_id
,
size_t
warm_ups
,
size_t
iterations
)
{
infiniopAddDescriptor_t
op_desc
;
auto
a
=
_attributes
->
a
->
to
(
device
,
device_id
);
auto
b
=
_attributes
->
b
->
to
(
device
,
device_id
);
auto
c
=
_attributes
->
c
->
to
(
device
,
device_id
);
CHECK_OR
(
infiniopCreateAddDescriptor
(
handle
,
&
op_desc
,
c
->
desc
(),
a
->
desc
(),
b
->
desc
()),
return
TEST_FAILED
(
OP_CREATION_FAILED
,
"Failed to create op descriptor."
));
size_t
workspace_size
;
CHECK_OR
(
infiniopGetAddWorkspaceSize
(
op_desc
,
&
workspace_size
),
return
TEST_FAILED
(
OP_CREATION_FAILED
,
"Failed to get workspace size."
));
void
*
workspace
;
CHECK_OR
(
infinirtMalloc
(
&
workspace
,
workspace_size
),
return
TEST_FAILED
(
OP_CREATION_FAILED
,
"Failed to allocate workspace."
));
CHECK_OR
(
infiniopAdd
(
op_desc
,
workspace
,
workspace_size
,
c
->
data
(),
a
->
data
(),
b
->
data
(),
nullptr
),
return
TEST_FAILED
(
OP_EXECUTION_FAILED
,
"Failed during execution."
));
try
{
allClose
(
c
,
_attributes
->
ans
,
_rtol
,
_atol
);
}
catch
(
const
std
::
exception
&
e
)
{
return
TEST_FAILED
(
RESULT_INCORRECT
,
e
.
what
());
}
double
elapsed_time
=
0.
;
elapsed_time
=
benchmark
(
[
=
]()
{
infiniopAdd
(
op_desc
,
workspace
,
workspace_size
,
c
->
data
(),
a
->
data
(),
b
->
data
(),
nullptr
);
},
warm_ups
,
iterations
);
return
TEST_PASSED
(
elapsed_time
);
}
std
::
vector
<
std
::
string
>
Test
::
attribute_names
()
{
return
{};
}
std
::
vector
<
std
::
string
>
Test
::
tensor_names
()
{
return
{
"a"
,
"b"
,
"c"
,
"ans"
};
}
std
::
vector
<
std
::
string
>
Test
::
output_names
()
{
return
{
"c"
};
}
std
::
string
Test
::
toString
()
const
{
std
::
ostringstream
oss
;
oss
<<
op_name
()
<<
std
::
endl
;
oss
<<
"- a: "
<<
_attributes
->
a
->
info
()
<<
std
::
endl
;
oss
<<
"- b: "
<<
_attributes
->
b
->
info
()
<<
std
::
endl
;
oss
<<
"- c: "
<<
_attributes
->
c
->
info
()
<<
std
::
endl
;
oss
<<
std
::
scientific
<<
std
::
setprecision
(
2
);
oss
<<
"- rtol="
<<
_rtol
<<
", atol="
<<
_atol
<<
std
::
endl
;
return
oss
.
str
();
}
Test
::~
Test
()
{
delete
_attributes
;
}
}
// namespace infiniop_test::add
src/infiniop-test/src/ops/clip.cpp
0 → 100644
View file @
c2e87202
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace
infiniop_test
::
clip
{
struct
Test
::
Attributes
{
std
::
shared_ptr
<
Tensor
>
x
;
std
::
shared_ptr
<
Tensor
>
min_val
;
std
::
shared_ptr
<
Tensor
>
max_val
;
std
::
shared_ptr
<
Tensor
>
y
;
std
::
shared_ptr
<
Tensor
>
ans
;
};
std
::
shared_ptr
<
Test
>
Test
::
build
(
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
uint8_t
>>
attributes
,
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
Tensor
>>
tensors
,
double
rtol
,
double
atol
)
{
auto
test
=
std
::
shared_ptr
<
Test
>
(
new
Test
(
rtol
,
atol
));
test
->
_attributes
=
new
Attributes
();
if
(
tensors
.
find
(
"x"
)
==
tensors
.
end
()
||
tensors
.
find
(
"min_val"
)
==
tensors
.
end
()
||
tensors
.
find
(
"max_val"
)
==
tensors
.
end
()
||
tensors
.
find
(
"y"
)
==
tensors
.
end
()
||
tensors
.
find
(
"ans"
)
==
tensors
.
end
())
{
throw
std
::
runtime_error
(
"Invalid Test"
);
}
test
->
_attributes
->
x
=
tensors
[
"x"
];
test
->
_attributes
->
min_val
=
tensors
[
"min_val"
];
test
->
_attributes
->
max_val
=
tensors
[
"max_val"
];
test
->
_attributes
->
y
=
tensors
[
"y"
];
test
->
_attributes
->
ans
=
tensors
[
"ans"
];
return
test
;
}
std
::
shared_ptr
<
infiniop_test
::
Result
>
Test
::
run
(
infiniopHandle_t
handle
,
infiniDevice_t
device
,
int
device_id
,
size_t
warm_ups
,
size_t
iterations
)
{
infiniopClipDescriptor_t
op_desc
;
auto
x
=
_attributes
->
x
->
to
(
device
,
device_id
);
auto
min_val
=
_attributes
->
min_val
->
to
(
device
,
device_id
);
auto
max_val
=
_attributes
->
max_val
->
to
(
device
,
device_id
);
auto
y
=
_attributes
->
y
->
to
(
device
,
device_id
);
CHECK_OR
(
infiniopCreateClipDescriptor
(
handle
,
&
op_desc
,
y
->
desc
(),
x
->
desc
(),
min_val
->
desc
(),
max_val
->
desc
()),
return
TEST_FAILED
(
OP_CREATION_FAILED
,
"Failed to create clip descriptor."
));
size_t
workspace_size
;
CHECK_OR
(
infiniopGetClipWorkspaceSize
(
op_desc
,
&
workspace_size
),
return
TEST_FAILED
(
OP_CREATION_FAILED
,
"Failed to get workspace size."
));
void
*
workspace
;
CHECK_OR
(
infinirtMalloc
(
&
workspace
,
workspace_size
),
return
TEST_FAILED
(
OP_CREATION_FAILED
,
"Failed to allocate workspace."
));
CHECK_OR
(
infiniopClip
(
op_desc
,
workspace
,
workspace_size
,
y
->
data
(),
x
->
data
(),
min_val
->
data
(),
max_val
->
data
(),
nullptr
),
return
TEST_FAILED
(
OP_EXECUTION_FAILED
,
"Failed during execution."
));
try
{
allClose
(
y
,
_attributes
->
ans
,
_rtol
,
_atol
);
}
catch
(
const
std
::
exception
&
e
)
{
return
TEST_FAILED
(
RESULT_INCORRECT
,
e
.
what
());
}
double
elapsed_time
=
0.
;
elapsed_time
=
benchmark
(
[
=
]()
{
infiniopClip
(
op_desc
,
workspace
,
workspace_size
,
y
->
data
(),
x
->
data
(),
min_val
->
data
(),
max_val
->
data
(),
nullptr
);
},
warm_ups
,
iterations
);
infiniopDestroyClipDescriptor
(
op_desc
);
infinirtFree
(
workspace
);
return
TEST_PASSED
(
elapsed_time
);
}
std
::
vector
<
std
::
string
>
Test
::
attribute_names
()
{
return
{};
}
std
::
vector
<
std
::
string
>
Test
::
tensor_names
()
{
return
{
"x"
,
"min_val"
,
"max_val"
,
"y"
,
"ans"
};
}
std
::
vector
<
std
::
string
>
Test
::
output_names
()
{
return
{
"y"
};
}
std
::
string
Test
::
toString
()
const
{
std
::
ostringstream
oss
;
oss
<<
op_name
()
<<
std
::
endl
;
oss
<<
"- x: "
<<
_attributes
->
x
->
info
()
<<
std
::
endl
;
oss
<<
"- min_val: "
<<
_attributes
->
min_val
->
info
()
<<
std
::
endl
;
oss
<<
"- max_val: "
<<
_attributes
->
max_val
->
info
()
<<
std
::
endl
;
oss
<<
"- y: "
<<
_attributes
->
y
->
info
()
<<
std
::
endl
;
oss
<<
std
::
scientific
<<
std
::
setprecision
(
2
);
oss
<<
"- rtol="
<<
_rtol
<<
", atol="
<<
_atol
<<
std
::
endl
;
return
oss
.
str
();
}
Test
::~
Test
()
{
delete
_attributes
;
}
}
// namespace infiniop_test::clip
src/infiniop-test/src/ops/gemm.cpp
View file @
c2e87202
...
...
@@ -113,6 +113,10 @@ std::vector<std::string> Test::tensor_names() {
return
{
"a"
,
"b"
,
"c"
,
"ans"
};
}
std
::
vector
<
std
::
string
>
Test
::
output_names
()
{
return
{};
}
std
::
string
Test
::
toString
()
const
{
std
::
ostringstream
oss
;
oss
<<
op_name
()
<<
std
::
endl
;
...
...
src/infiniop-test/src/ops/mul.cpp
0 → 100644
View file @
c2e87202
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace
infiniop_test
::
mul
{
struct
Test
::
Attributes
{
std
::
shared_ptr
<
Tensor
>
a
;
std
::
shared_ptr
<
Tensor
>
b
;
std
::
shared_ptr
<
Tensor
>
c
;
std
::
shared_ptr
<
Tensor
>
ans
;
};
std
::
shared_ptr
<
Test
>
Test
::
build
(
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
uint8_t
>>
attributes
,
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
Tensor
>>
tensors
,
double
rtol
,
double
atol
)
{
auto
test
=
std
::
shared_ptr
<
Test
>
(
new
Test
(
rtol
,
atol
));
test
->
_attributes
=
new
Attributes
();
if
(
tensors
.
find
(
"a"
)
==
tensors
.
end
()
||
tensors
.
find
(
"b"
)
==
tensors
.
end
()
||
tensors
.
find
(
"c"
)
==
tensors
.
end
()
||
tensors
.
find
(
"ans"
)
==
tensors
.
end
())
{
throw
std
::
runtime_error
(
"Invalid Test"
);
}
test
->
_attributes
->
a
=
tensors
[
"a"
];
test
->
_attributes
->
b
=
tensors
[
"b"
];
test
->
_attributes
->
c
=
tensors
[
"c"
];
test
->
_attributes
->
ans
=
tensors
[
"ans"
];
return
test
;
}
std
::
shared_ptr
<
infiniop_test
::
Result
>
Test
::
run
(
infiniopHandle_t
handle
,
infiniDevice_t
device
,
int
device_id
,
size_t
warm_ups
,
size_t
iterations
)
{
infiniopMulDescriptor_t
op_desc
;
auto
a
=
_attributes
->
a
->
to
(
device
,
device_id
);
auto
b
=
_attributes
->
b
->
to
(
device
,
device_id
);
auto
c
=
_attributes
->
c
->
to
(
device
,
device_id
);
CHECK_OR
(
infiniopCreateMulDescriptor
(
handle
,
&
op_desc
,
c
->
desc
(),
a
->
desc
(),
b
->
desc
()),
return
TEST_FAILED
(
OP_CREATION_FAILED
,
"Failed to create op descriptor."
));
size_t
workspace_size
;
CHECK_OR
(
infiniopGetMulWorkspaceSize
(
op_desc
,
&
workspace_size
),
return
TEST_FAILED
(
OP_CREATION_FAILED
,
"Failed to get workspace size."
));
void
*
workspace
;
CHECK_OR
(
infinirtMalloc
(
&
workspace
,
workspace_size
),
return
TEST_FAILED
(
OP_CREATION_FAILED
,
"Failed to allocate workspace."
));
CHECK_OR
(
infiniopMul
(
op_desc
,
workspace
,
workspace_size
,
c
->
data
(),
a
->
data
(),
b
->
data
(),
nullptr
),
return
TEST_FAILED
(
OP_EXECUTION_FAILED
,
"Failed during execution."
));
try
{
allClose
(
c
,
_attributes
->
ans
,
_rtol
,
_atol
);
}
catch
(
const
std
::
exception
&
e
)
{
return
TEST_FAILED
(
RESULT_INCORRECT
,
e
.
what
());
}
double
elapsed_time
=
0.
;
elapsed_time
=
benchmark
(
[
=
]()
{
infiniopMul
(
op_desc
,
workspace
,
workspace_size
,
c
->
data
(),
a
->
data
(),
b
->
data
(),
nullptr
);
},
warm_ups
,
iterations
);
return
TEST_PASSED
(
elapsed_time
);
}
std
::
vector
<
std
::
string
>
Test
::
attribute_names
()
{
return
{};
}
std
::
vector
<
std
::
string
>
Test
::
tensor_names
()
{
return
{
"a"
,
"b"
,
"c"
,
"ans"
};
}
std
::
vector
<
std
::
string
>
Test
::
output_names
()
{
return
{
"c"
};
}
std
::
string
Test
::
toString
()
const
{
std
::
ostringstream
oss
;
oss
<<
op_name
()
<<
std
::
endl
;
oss
<<
"- a: "
<<
_attributes
->
a
->
info
()
<<
std
::
endl
;
oss
<<
"- b: "
<<
_attributes
->
b
->
info
()
<<
std
::
endl
;
oss
<<
"- c: "
<<
_attributes
->
c
->
info
()
<<
std
::
endl
;
oss
<<
std
::
scientific
<<
std
::
setprecision
(
2
);
oss
<<
"- rtol="
<<
_rtol
<<
", atol="
<<
_atol
<<
std
::
endl
;
return
oss
.
str
();
}
Test
::~
Test
()
{
delete
_attributes
;
}
}
// namespace infiniop_test::mul
src/infiniop-test/src/ops/random_sample.cpp
View file @
c2e87202
...
...
@@ -109,6 +109,10 @@ std::vector<std::string> Test::tensor_names() {
return
{
"data"
,
"ans"
,
"result"
};
}
std
::
vector
<
std
::
string
>
Test
::
output_names
()
{
return
{
"result"
};
}
std
::
string
Test
::
toString
()
const
{
std
::
ostringstream
oss
;
oss
<<
op_name
()
<<
std
::
endl
;
...
...
src/infiniop-test/src/ops/swiglu.cpp
0 → 100644
View file @
c2e87202
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace
infiniop_test
::
swiglu
{
struct
Test
::
Attributes
{
std
::
shared_ptr
<
Tensor
>
a
;
std
::
shared_ptr
<
Tensor
>
b
;
std
::
shared_ptr
<
Tensor
>
ans
;
std
::
shared_ptr
<
Tensor
>
c
;
};
std
::
shared_ptr
<
Test
>
Test
::
build
(
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
uint8_t
>>
attributes
,
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
Tensor
>>
tensors
,
double
rtol
,
double
atol
)
{
auto
test
=
std
::
shared_ptr
<
Test
>
(
new
Test
(
rtol
,
atol
));
test
->
_attributes
=
new
Attributes
();
if
(
tensors
.
find
(
"a"
)
==
tensors
.
end
()
||
tensors
.
find
(
"b"
)
==
tensors
.
end
()
||
tensors
.
find
(
"c"
)
==
tensors
.
end
()
||
tensors
.
find
(
"ans"
)
==
tensors
.
end
())
{
throw
std
::
runtime_error
(
"Invalid Test"
);
}
test
->
_attributes
->
a
=
tensors
[
"a"
];
test
->
_attributes
->
b
=
tensors
[
"b"
];
test
->
_attributes
->
c
=
tensors
[
"c"
];
test
->
_attributes
->
ans
=
tensors
[
"ans"
];
return
test
;
}
std
::
shared_ptr
<
infiniop_test
::
Result
>
Test
::
run
(
infiniopHandle_t
handle
,
infiniDevice_t
device
,
int
device_id
,
size_t
warm_ups
,
size_t
iterations
)
{
infiniopSwiGLUDescriptor_t
op_desc
;
auto
a
=
_attributes
->
a
->
to
(
device
,
device_id
);
auto
b
=
_attributes
->
b
->
to
(
device
,
device_id
);
auto
c
=
_attributes
->
c
->
to
(
device
,
device_id
);
CHECK_OR
(
infiniopCreateSwiGLUDescriptor
(
handle
,
&
op_desc
,
c
->
desc
(),
a
->
desc
(),
b
->
desc
()),
return
TEST_FAILED
(
OP_CREATION_FAILED
,
"Failed to create op descriptor."
));
size_t
workspace_size
;
CHECK_OR
(
infiniopGetSwiGLUWorkspaceSize
(
op_desc
,
&
workspace_size
),
return
TEST_FAILED
(
OP_CREATION_FAILED
,
"Failed to get workspace size."
));
void
*
workspace
;
CHECK_OR
(
infinirtMalloc
(
&
workspace
,
workspace_size
),
return
TEST_FAILED
(
OP_CREATION_FAILED
,
"Failed to allocate workspace."
));
CHECK_OR
(
infiniopSwiGLU
(
op_desc
,
workspace
,
workspace_size
,
c
->
data
(),
a
->
data
(),
b
->
data
(),
nullptr
),
return
TEST_FAILED
(
OP_CREATION_FAILED
,
"Failed during execution."
));
try
{
allClose
(
c
,
_attributes
->
ans
,
_rtol
,
_atol
);
}
catch
(
const
std
::
exception
&
e
)
{
return
TEST_FAILED
(
RESULT_INCORRECT
,
e
.
what
());
}
double
elapsed_time
=
0.
;
elapsed_time
=
benchmark
(
[
=
]()
{
infiniopSwiGLU
(
op_desc
,
workspace
,
workspace_size
,
c
->
data
(),
a
->
data
(),
b
->
data
(),
nullptr
);
},
warm_ups
,
iterations
);
return
TEST_PASSED
(
elapsed_time
);
}
std
::
vector
<
std
::
string
>
Test
::
attribute_names
()
{
return
{};
}
std
::
vector
<
std
::
string
>
Test
::
tensor_names
()
{
return
{
"a"
,
"b"
,
"c"
,
"ans"
};
}
std
::
vector
<
std
::
string
>
Test
::
output_names
()
{
return
{
"c"
};
}
std
::
string
Test
::
toString
()
const
{
std
::
ostringstream
oss
;
oss
<<
op_name
()
<<
std
::
endl
;
oss
<<
"- a: "
<<
_attributes
->
a
->
info
()
<<
std
::
endl
;
oss
<<
"- b: "
<<
_attributes
->
b
->
info
()
<<
std
::
endl
;
oss
<<
"- c: "
<<
_attributes
->
c
->
info
()
<<
std
::
endl
;
oss
<<
std
::
scientific
<<
std
::
setprecision
(
2
);
oss
<<
"- rtol="
<<
_rtol
<<
", atol="
<<
_atol
<<
std
::
endl
;
return
oss
.
str
();
}
Test
::~
Test
()
{
delete
_attributes
;
}
}
// namespace infiniop_test::swiglu
src/infiniop-test/src/tensor.cpp
View file @
c2e87202
...
...
@@ -98,20 +98,28 @@ void *Tensor::data() const {
Tensor
::
Tensor
(
const
GGUFTensorInfo
*
info
,
const
void
*
ggml_ptr
,
const
GGUFKeyValue
*
strides_meta
)
{
const
GGUFKeyValue
*
shape_meta
,
const
GGUFKeyValue
*
strides_meta
,
bool
isOutput
)
{
_ggml_type
=
info
->
ggml_type
;
_offset
=
0
;
size_t
ndim
=
static_cast
<
size_t
>
(
info
->
ndim
);
// `_shape`存储真实的tensor形状(来自shape_meta),`temp_shape`存储用于rearrange和计算内存的tensor形状
_shape
=
std
::
vector
<
size_t
>
(
ndim
);
std
::
vector
<
size_t
>
temp_shape
(
ndim
);
_strides
=
std
::
vector
<
ptrdiff_t
>
(
ndim
);
std
::
vector
<
ptrdiff_t
>
contiguous_strides
(
ndim
);
for
(
size_t
i
=
0
;
i
<
ndim
;
i
++
)
{
_shape
[
i
]
=
static_cast
<
size_t
>
(
info
->
shape
[
ndim
-
1
-
i
]);
temp
_shape
[
i
]
=
static_cast
<
size_t
>
(
info
->
shape
[
ndim
-
1
-
i
]);
if
(
i
==
0
)
{
contiguous_strides
[
ndim
-
1
]
=
(
ptrdiff_t
)
1
;
}
else
{
contiguous_strides
[
ndim
-
1
-
i
]
=
(
ptrdiff_t
)
info
->
shape
[
i
-
1
]
*
contiguous_strides
[
ndim
-
i
];
}
if
(
isOutput
)
{
contiguous_strides
[
i
]
=
(
ptrdiff_t
)
0
;
}
}
if
(
strides_meta
==
nullptr
)
{
...
...
@@ -120,7 +128,6 @@ Tensor::Tensor(const GGUFTensorInfo *info,
}
}
else
{
for
(
size_t
i
=
0
;
i
<
ndim
;
i
++
)
{
_shape
[
i
]
=
static_cast
<
size_t
>
(
info
->
shape
[
ndim
-
1
-
i
]);
if
(
strides_meta
->
gguf_type
==
GGUF_TYPE_INT64
)
{
_strides
[
i
]
=
(
ptrdiff_t
)(
reinterpret_cast
<
const
int64_t
*>
(
strides_meta
->
value
.
data
())[
ndim
-
1
-
i
]);
...
...
@@ -133,18 +140,62 @@ Tensor::Tensor(const GGUFTensorInfo *info,
}
}
infiniopCreateTensorDescriptor
(
&
_desc
,
ndim
,
_shape
.
data
(),
_strides
.
data
(),
ggmlTypeToInfiniType
(
_ggml_type
));
if
(
isOutput
)
{
if
(
shape_meta
==
nullptr
)
{
throw
std
::
runtime_error
(
"Error Creating Tensor: shape_meta cannot be null for output tensor"
);
}
for
(
size_t
i
=
0
;
i
<
ndim
;
i
++
)
{
if
(
shape_meta
->
gguf_type
==
GGUF_TYPE_INT64
)
{
int64_t
val
=
reinterpret_cast
<
const
int64_t
*>
(
shape_meta
->
value
.
data
())[
i
];
if
(
val
<
0
)
{
throw
std
::
runtime_error
(
"Shape must be non-negative"
);
}
temp_shape
[
i
]
=
static_cast
<
size_t
>
(
val
);
}
else
if
(
shape_meta
->
gguf_type
==
GGUF_TYPE_INT32
)
{
int32_t
val
=
reinterpret_cast
<
const
int32_t
*>
(
shape_meta
->
value
.
data
())[
i
];
if
(
val
<
0
)
{
throw
std
::
runtime_error
(
"Shape must be non-negative"
);
}
temp_shape
[
i
]
=
static_cast
<
size_t
>
(
val
);
}
else
{
throw
std
::
runtime_error
(
"Error Creating Tensor: Unsupported shape type"
);
}
}
}
infiniopCreateTensorDescriptor
(
&
_desc
,
ndim
,
temp_shape
.
data
(),
_strides
.
data
(),
ggmlTypeToInfiniType
(
_ggml_type
));
size_t
size
;
calculateTensorMemory
(
size
,
_offset
,
_shape
,
_strides
,
ggmlTypeSize
(
_ggml_type
));
calculateTensorMemory
(
size
,
_offset
,
temp
_shape
,
_strides
,
ggmlTypeSize
(
_ggml_type
));
_memory
=
std
::
make_shared
<
Memory
>
(
size
,
INFINI_DEVICE_CPU
,
0
);
utils
::
rearrange
(
(
char
*
)
_memory
->
ptr
()
+
_offset
,
(
char
*
)
ggml_ptr
+
info
->
data_offset
,
_shape
.
data
(),
temp
_shape
.
data
(),
_strides
.
data
(),
contiguous_strides
.
data
(),
ndim
,
ggmlTypeSize
(
_ggml_type
));
if
(
shape_meta
==
nullptr
)
{
_shape
=
temp_shape
;
}
else
{
for
(
size_t
i
=
0
;
i
<
ndim
;
i
++
)
{
if
(
shape_meta
->
gguf_type
==
GGUF_TYPE_INT64
)
{
int64_t
val
=
reinterpret_cast
<
const
int64_t
*>
(
shape_meta
->
value
.
data
())[
i
];
if
(
val
<
0
)
{
throw
std
::
runtime_error
(
"Shape must be non-negative"
);
}
_shape
[
i
]
=
static_cast
<
size_t
>
(
val
);
}
else
if
(
shape_meta
->
gguf_type
==
GGUF_TYPE_INT32
)
{
int32_t
val
=
reinterpret_cast
<
const
int32_t
*>
(
shape_meta
->
value
.
data
())[
i
];
if
(
val
<
0
)
{
throw
std
::
runtime_error
(
"Shape must be non-negative"
);
}
_shape
[
i
]
=
static_cast
<
size_t
>
(
val
);
}
else
{
throw
std
::
runtime_error
(
"Error Creating Tensor: Unsupported shape type"
);
}
}
}
}
Tensor
::
Tensor
(
std
::
shared_ptr
<
Memory
>
memory
,
size_t
offset
,
...
...
src/infiniop-test/src/test.cpp
View file @
c2e87202
...
...
@@ -90,14 +90,19 @@ std::shared_ptr<Result> runTest(const GGUFFileReader &gguf_reader,
attrs
[
attr_name
]
=
attr
->
second
->
value
;
}
}
for
(
auto
tensor_name
:
builder
.
tensor_names
)
{
auto
info
=
tensor_info
.
find
(
"test."
+
std
::
to_string
(
test_id
)
+
"."
+
tensor_name
);
if
(
info
!=
tensor_info
.
end
())
{
auto
shape
=
meta
.
find
(
"test."
+
std
::
to_string
(
test_id
)
+
"."
+
tensor_name
+
".shape"
);
auto
strides
=
meta
.
find
(
"test."
+
std
::
to_string
(
test_id
)
+
"."
+
tensor_name
+
".strides"
);
bool
is_output
=
std
::
find
(
builder
.
output_names
.
begin
(),
builder
.
output_names
.
end
(),
tensor_name
)
!=
builder
.
output_names
.
end
();
tensors
[
tensor_name
]
=
std
::
make_shared
<
Tensor
>
(
info
->
second
.
get
(),
gguf_reader
.
getGgmlStart
(),
strides
!=
meta
.
end
()
?
strides
->
second
.
get
()
:
nullptr
);
shape
!=
meta
.
end
()
?
shape
->
second
.
get
()
:
nullptr
,
strides
!=
meta
.
end
()
?
strides
->
second
.
get
()
:
nullptr
,
is_output
);
}
}
std
::
shared_ptr
<
infiniop_test
::
base
::
Test
>
test
;
...
...
src/infiniop/devices/ascend/CMakeLists.txt
View file @
c2e87202
...
...
@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.16.0)
# project information
project
(
Ascend_C
)
set
(
SOC_VERSION
"Ascend910B3"
CACHE STRING
"system on chip type"
)
set
(
ASCEND_CANN_PACKAGE_PATH $ENV{ASCEND_HOME} CACHE PATH
"ASCEND CANN package installation directory"
)
set
(
ASCEND_CANN_PACKAGE_PATH $ENV{ASCEND_
TOOLKIT_
HOME} CACHE PATH
"ASCEND CANN package installation directory"
)
set
(
RUN_MODE
"npu"
CACHE STRING
"run mode: npu"
)
set
(
CMAKE_BUILD_TYPE
"Release"
CACHE STRING
"Build type Release/Debug (default Debug)"
FORCE
)
set
(
CMAKE_INSTALL_PREFIX
"
${
CMAKE_CURRENT_LIST_DIR
}
/out"
CACHE STRING
"path for install()"
FORCE
)
...
...
@@ -19,10 +19,13 @@ else()
endif
()
include
(
${
ASCENDC_CMAKE_DIR
}
/ascendc.cmake
)
include_directories
(
${
CMAKE_SOURCE_DIR
}
/../../../../include/infiniop/
)
ascendc_library
(
ascend_kernels STATIC
../../ops/swiglu/ascend/swiglu_kernel.cpp
../../ops/ro
tary_embedding/ascend/rotary_embedding
_kernel.cpp
../../ops/random_sample/ascend/random_sample_kernel.cpp
../../ops/swiglu/ascend/swiglu_
ascend_
kernel.cpp
../../ops/ro
pe/ascend/rope_ascend
_kernel.cpp
#
../../ops/random_sample/ascend/random_sample_kernel.cpp
)
src/infiniop/devices/ascend/ascend_kernel_common.h
0 → 100644
View file @
c2e87202
#ifndef __INFINIOP_ASCEND_KERNEL_COMMON_H__
#define __INFINIOP_ASCEND_KERNEL_COMMON_H__
#include "../../../../include/infinicore.h"
#include "kernel_operator.h"
constexpr
size_t
BLOCK_NUM
=
8
;
constexpr
size_t
BUFFER_NUM
=
2
;
constexpr
size_t
BYTE_ALIGN
=
32
;
template
<
typename
T
>
__aicore__
inline
size_t
alignTileLen
(
size_t
tile_len
,
size_t
byte_align
)
{
size_t
bytes
=
tile_len
*
sizeof
(
T
);
size_t
aligned_bytes
=
(
bytes
%
byte_align
==
0
)
?
bytes
:
(
bytes
+
(
byte_align
-
bytes
%
byte_align
));
return
aligned_bytes
/
sizeof
(
T
);
}
#endif
src/infiniop/devices/ascend/common_ascend.cc
View file @
c2e87202
#include "common_ascend.h"
std
::
vector
<
int64_t
>
inferStorageShape
(
std
::
vector
<
int64_t
>
shape
,
std
::
vector
<
int64_t
>
strides
)
{
auto
index
=
std
::
max_element
(
strides
.
begin
(),
strides
.
end
());
uint64_t
max_stride_index
=
std
::
distance
(
strides
.
begin
(),
index
);
auto
storageShape
=
std
::
vector
<
int64_t
>
({
shape
[
max_stride_index
]
*
strides
[
max_stride_index
]});
if
(
shape
.
size
()
!=
strides
.
size
())
{
throw
std
::
invalid_argument
(
"Shape and strides must have the same length."
);
}
int64_t
max_offset
=
0
;
for
(
size_t
i
=
0
;
i
<
shape
.
size
();
++
i
)
{
max_offset
+=
(
shape
[
i
]
-
1
)
*
strides
[
i
];
}
return
storageShape
;
// storage shape is 1D buffer that must cover all accessed elements
return
{
max_offset
+
1
};
}
size_t
aclnnTensorDescriptor
::
numel
()
const
{
...
...
@@ -18,7 +24,7 @@ aclnnTensorDescriptor::aclnnTensorDescriptor(infiniopTensorDescriptor_t desc, vo
this
->
strides
=
std
::
vector
<
int64_t
>
(
ndim
);
for
(
uint64_t
i
=
0
;
i
<
ndim
;
++
i
)
{
this
->
shape
[
i
]
=
static_cast
<
int64_t
>
(
desc
->
dim
(
i
));
this
->
strides
[
i
]
=
desc
->
stride
(
i
);
this
->
strides
[
i
]
=
static_cast
<
int64_t
>
(
desc
->
stride
(
i
)
)
;
}
this
->
storageShape
=
inferStorageShape
(
this
->
shape
,
this
->
strides
);
this
->
dataType
=
toAclDataType
(
desc
->
dtype
());
...
...
@@ -41,7 +47,12 @@ aclnnTensorDescriptor::aclnnTensorDescriptor(aclDataType dtype, const std::vecto
this
->
strides
=
strides
;
this
->
dataType
=
dtype
;
this
->
format
=
aclFormat
::
ACL_FORMAT_ND
;
this
->
storageShape
=
inferStorageShape
(
this
->
shape
,
this
->
strides
);
if
(
this
->
ndim
!=
0
)
{
this
->
storageShape
=
inferStorageShape
(
this
->
shape
,
this
->
strides
);
}
else
{
this
->
storageShape
=
shape
;
this
->
storageNdim
=
0
;
}
this
->
tensor
=
aclCreateTensor
(
this
->
shape
.
data
(),
this
->
ndim
,
this
->
dataType
,
...
...
src/infiniop/devices/cuda/cuda_common.cuh
View file @
c2e87202
#ifndef __INFINIOP_CUDA_COMMON_CUH__
#define __INFINIOP_CUDA_COMMON_CUH__
#include "../../reduce/cuda/reduce.cuh"
#include "cuda_handle.cuh"
#include "infinicore.h"
#ifdef ENABLE_SUGON_CUDA_API
#define INFINIOP_CUDA_KERNEL __launch_bounds__(512) __global__ void
#else
#define INFINIOP_CUDA_KERNEL __global__ void
#endif
// Posible maximum number of threads per block for CUDA architectures
// Used for picking correct kernel launch configuration
#define CUDA_BLOCK_SIZE_1024 1024
#define CUDA_BLOCK_SIZE_512 512
namespace
device
::
cuda
{
cudnnDataType_t
getCudnnDtype
(
infiniDtype_t
dt
);
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
__forceinline__
__device__
__host__
size_t
indexToReducedOffset
(
size_t
flat_index
,
size_t
ndim
,
const
ptrdiff_t
*
broadcasted_strides
,
const
ptrdiff_t
*
target_strides
)
{
size_t
res
=
0
;
for
(
size_t
i
=
0
;
i
<
ndim
;
++
i
)
{
res
+=
flat_index
/
broadcasted_strides
[
i
]
*
target_strides
[
i
];
flat_index
%=
broadcasted_strides
[
i
];
}
return
res
;
}
// get the memory offset of the given element in a tensor given its flat index
__forceinline__
__device__
__host__
size_t
indexToOffset
(
size_t
flat_index
,
size_t
ndim
,
const
size_t
*
shape
,
const
ptrdiff_t
*
strides
)
{
size_t
res
=
0
;
for
(
size_t
i
=
ndim
;
i
--
>
0
;)
{
res
+=
(
flat_index
%
shape
[
i
])
*
strides
[
i
];
flat_index
/=
shape
[
i
];
}
return
res
;
}
}
// namespace device::cuda
#endif // __INFINIOP_CUDA_COMMON_CUH__
src/infiniop/devices/cuda/cuda_kernel_common.cuh
0 → 100644
View file @
c2e87202
#ifdef ENABLE_SUGON_CUDA_API
#define INFINIOP_CUDA_KERNEL __launch_bounds__(512) __global__ void
#else
#define INFINIOP_CUDA_KERNEL __global__ void
#endif
// Possible maximum number of threads per block for CUDA architectures
// Used for picking correct kernel launch configuration
#define CUDA_BLOCK_SIZE_1024 1024
#define CUDA_BLOCK_SIZE_512 512
#define CHECK_CUDA(API) CHECK_INTERNAL(API, cudaSuccess)
namespace
device
::
cuda
{
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
__forceinline__
__device__
__host__
size_t
indexToReducedOffset
(
size_t
flat_index
,
size_t
ndim
,
const
ptrdiff_t
*
broadcasted_strides
,
const
ptrdiff_t
*
target_strides
)
{
size_t
res
=
0
;
for
(
size_t
i
=
0
;
i
<
ndim
;
++
i
)
{
res
+=
flat_index
/
broadcasted_strides
[
i
]
*
target_strides
[
i
];
flat_index
%=
broadcasted_strides
[
i
];
}
return
res
;
}
// get the memory offset of the given element in a tensor given its flat index
__forceinline__
__device__
__host__
size_t
indexToOffset
(
size_t
flat_index
,
size_t
ndim
,
const
size_t
*
shape
,
const
ptrdiff_t
*
strides
)
{
size_t
res
=
0
;
for
(
size_t
i
=
ndim
;
i
--
>
0
;)
{
res
+=
(
flat_index
%
shape
[
i
])
*
strides
[
i
];
flat_index
/=
shape
[
i
];
}
return
res
;
}
}
// namespace device::cuda
#ifdef ENABLE_CUDA_API
#include <cuda_fp16.h>
__forceinline__
__device__
float
exp_
(
const
float
val
)
{
return
expf
(
val
);
}
__forceinline__
__device__
long
double
exp_
(
const
long
double
val
)
{
return
expl
(
val
);
}
__forceinline__
__device__
double
exp_
(
const
double
val
)
{
return
exp
(
val
);
}
__forceinline__
__device__
__half
exp_
(
const
__half
x
)
{
return
hexp
(
x
);
}
#endif
src/infiniop/devices/kunlun/kunlun_handle.h
View file @
c2e87202
...
...
@@ -16,7 +16,7 @@ typedef XPUStream kunlunStream_t;
typedef
XPUEvent
kunlunEvent_t
;
typedef
xdnn
::
Context
*
xdnnHandle_t
;
#define CHECK_
XDN
N(API) CHECK_INTERNAL(API, XPU_SUCCESS)
#define CHECK_
KUNLU
N(API) CHECK_INTERNAL(API, XPU_SUCCESS)
namespace
device
::
kunlun
{
...
...
src/infiniop/devices/kunlun/kunlun_kernel_common.h
0 → 100644
View file @
c2e87202
#ifndef __INFINIOP_KUNLUN_KERNEL_COMMON_H__
#define __INFINIOP_KUNLUN_KERNEL_COMMON_H__
// This header file will only be included by .xpu files
#include "kunlun_kernel_dtype.h"
#include "xpu/kernel/xtdk.h"
#include "xpu/kernel/xtdk_math.h"
#include "xpu/kernel/xtdk_simd.h"
#include "xpu/runtime.h"
namespace
device
::
kunlun
::
kernel
{
// Get a mask for Kunlun XPU 512-bit register calculations.
// If the data does not fill 512 bits, it is zero-padded and the
// mask identifies which lanes hold real data.
// Bits 0..i are set to 1, all other bits are 0.
inline
__device__
float
lowerBitMask
(
int
i
)
{
return
(
1
<<
(
i
+
1
))
-
1
;
}
// Atomic add for reduce
inline
__device__
void
atomicAddF32
(
__shared_ptr__
float
*
ptr
,
float
value
)
{
int
success
=
1
;
while
(
success
)
{
// SM2REG read 32bit data to register
float
a
=
SM2REG_atomic
(
ptr
);
a
=
a
+
value
;
success
=
REG2SM_atomic
(
ptr
,
a
);
}
}
inline
__device__
size_t
indexToReducedOffset
(
size_t
flat_index
,
size_t
ndim
,
const
_ptrdiff_t
*
broadcasted_strides
,
const
_ptrdiff_t
*
target_strides
)
{
size_t
res
=
0
;
for
(
size_t
i
=
0
;
i
<
ndim
;
++
i
)
{
res
+=
flat_index
/
broadcasted_strides
[
i
].
value
*
target_strides
[
i
].
value
;
flat_index
%=
broadcasted_strides
[
i
].
value
;
mfence
();
}
return
res
;
}
inline
__device__
size_t
indexToOffset
(
size_t
flat_index
,
size_t
ndim
,
const
_size_t
*
shape
,
const
_ptrdiff_t
*
strides
)
{
size_t
res
=
0
;
for
(
size_t
i
=
ndim
;
i
--
>
0
;)
{
res
+=
(
flat_index
%
shape
[
i
].
value
)
*
strides
[
i
].
value
;
flat_index
/=
shape
[
i
].
value
;
mfence
();
}
return
res
;
}
}
// namespace device::kunlun::kernel
// TODO: atomicAddF16
// TODO: atomicAddI8
#endif
Prev
1
2
3
4
5
6
…
9
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment