jerrrrry/infinicore, commit 012df56c (unverified)
Authored Feb 11, 2026 by thatPepe; committed by GitHub on Feb 11, 2026
Parents: f1b8ab64, aac54e1f

Merge pull request #963 from InfiniTensor/issue/523-020

issue/523 - switched to cambricon mlu 1.22 interface
Showing 10 changed files with 34 additions and 34 deletions (+34, -34):
src/infiniccl/cambricon/infiniccl_cambricon.cc                +1 -1
src/infiniop/elementwise/bang/elementwise_bang.h              +2 -2
src/infiniop/elementwise/bang/elementwise_bang_kernel.h       +2 -2
src/infiniop/ops/causal_softmax/bang/causal_softmax_bang.mlu  +1 -1
src/infiniop/ops/gemm/bang/gemm_bang.cc                       +7 -7
src/infiniop/ops/random_sample/bang/random_sample_kernel.mlu  +9 -9
src/infiniop/ops/rearrange/bang/rearrange_bang.mlu            +1 -1
src/infiniop/ops/rms_norm/bang/rms_norm_bang.mlu              +4 -4
src/infiniop/ops/rope/bang/rope_bang.mlu                      +1 -1
src/infiniop/reduce/bang/reduce_bang.h                        +6 -6
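All ten files make the same kind of change: legacy CNRT/CNNL identifiers are replaced by their MLU 1.22 equivalents, with no behavioral change intended. Read from the hunks below, the mapping is:

    CNRT_RET_SUCCESS                   -> cnrtSuccess
    CNRT_MEM_TRANS_DIR_HOST2DEV        -> cnrtMemcpyHostToDev
    CNRT_FUNC_TYPE_BLOCK               -> cnrtFuncTypeBlock
    CNRT_FUNC_TYPE_UNION1              -> cnrtFuncTypeUnion1
    __bang_write_zero(buf, n)          -> __bang_write_value(buf, n, 0)
    cnnlMatMulDescCreate / Destroy     -> cnnlCreateMatMulDescriptor / cnnlDestroyMatMulDescriptor
    cnnlMatMulAlgoCreate / Destroy     -> cnnlCreateMatMulAlgo / cnnlDestroyMatMulAlgo
    cnnlGetBatchMatMulAlgoHeuristic    -> cnnlGetBatchMatMulExAlgoHeuristic
    cnnlGetBatchMatMulHeuristicResult  -> cnnlGetBatchMatMulExHeuristicResult
    cnnlBatchMatMulBCast_v2            -> cnnlBatchMatMulEx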
src/infiniccl/cambricon/infiniccl_cambricon.cc
@@ -62,7 +62,7 @@ infiniStatus_t commInitAll(
     for (int i = 0; i < ndevice; i++) {
         rank_list[i] = i;
-        CHECK_INTERNAL(cnrtSetDevice(device_ids[i]), CNRT_RET_SUCCESS);
+        CHECK_INTERNAL(cnrtSetDevice(device_ids[i]), cnrtSuccess);
     }
     CHECK_CNCL(cnclInitComms(cncl_comms.data(), ndevice,
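The only change here is the status-code enumerator: the legacy CNRT_RET_SUCCESS becomes cnrtSuccess. A minimal standalone sketch of the new-style check (CHECK_INTERNAL above is this repository's macro, so plain control flow stands in for it; the cnrtRet_t return type is assumed from current CNRT headers):

    #include <cnrt.h>
    #include <cstdio>

    int main() {
        // Select MLU device 0 and verify the call against the renamed enumerator.
        cnrtRet_t ret = cnrtSetDevice(0);
        if (ret != cnrtSuccess) { // formerly CNRT_RET_SUCCESS
            std::printf("cnrtSetDevice failed with status %d\n", static_cast<int>(ret));
            return 1;
        }
        return 0;
    }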
src/infiniop/elementwise/bang/elementwise_bang.h
@@ -127,8 +127,8 @@ private:
     const int8_t *d_meta_start = reinterpret_cast<int8_t *>(workspace) + input_arr_size;
     // Copy input pointer array and metadata to device
-    CNRT_CHECK(cnrtMemcpy(workspace, (void *)h_inputs_arr, input_arr_size, CNRT_MEM_TRANS_DIR_HOST2DEV));
-    CNRT_CHECK(cnrtMemcpy((void *)d_meta_start, (void *)info_meta_start, info.getMetaMemSize(), CNRT_MEM_TRANS_DIR_HOST2DEV));
+    CNRT_CHECK(cnrtMemcpy(workspace, (void *)h_inputs_arr, input_arr_size, cnrtMemcpyHostToDev));
+    CNRT_CHECK(cnrtMemcpy((void *)d_meta_start, (void *)info_meta_start, info.getMetaMemSize(), cnrtMemcpyHostToDev));
     // Setup pointers to device memory regions
     d_inputs_arr = reinterpret_cast<const void **>(workspace);
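The host-to-device transfer direction is renamed the same way: CNRT_MEM_TRANS_DIR_HOST2DEV becomes cnrtMemcpyHostToDev, with cnrtMemcpy itself unchanged. A hedged sketch of the pattern, reusing the CNRT_CHECK macro from this file; the helper name and the allocation around it are illustrative:

    #include <cnrt.h>
    #include <cstddef>

    // Allocate device memory and copy `bytes` bytes from a host buffer into it.
    void *copyToDevice(void *host_buf, size_t bytes) {
        void *dev_buf = nullptr;
        CNRT_CHECK(cnrtMalloc(&dev_buf, bytes));
        CNRT_CHECK(cnrtMemcpy(dev_buf, host_buf, bytes,
                              cnrtMemcpyHostToDev)); // formerly CNRT_MEM_TRANS_DIR_HOST2DEV
        return dev_buf;
    }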
src/infiniop/elementwise/bang/elementwise_bang_kernel.h
@@ -248,10 +248,10 @@ void launchElementwiseKernelWrapper(
     dim.z = 1;
     // Choose kernel type based on problem characteristics
-    cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_BLOCK;
+    cnrtFunctionType_t func_type = cnrtFuncTypeBlock;
     if (output_size > 1024 * 1024 && output_contiguous) {
         // For large contiguous operations, use UNION type
-        func_type = CNRT_FUNC_TYPE_UNION1;
+        func_type = cnrtFuncTypeUnion1;
     }
     // Launch the kernel with optimal configuration
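This file switches the kernel task-type enumerators from the CNRT_FUNC_TYPE_* macros to the cnrtFuncType* spellings; the selection logic itself is untouched. (On MLU hardware a block task is understood to occupy a single core while a Union1 task dispatches a whole cluster, which is why the large contiguous case upgrades, but that reading is an assumption, not stated in the diff.) The same rename repeats in the .mlu kernels below.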
src/infiniop/ops/causal_softmax/bang/causal_softmax_bang.mlu
@@ -131,7 +131,7 @@ void causalSoftmaxUnion(void *workspace, int core_per_cluster, int cluster_count
     kernel_dim.x = core_per_cluster;
     kernel_dim.y = cluster_count;
     kernel_dim.z = 1;
-    kernel_type = CNRT_FUNC_TYPE_UNION1;
+    kernel_type = cnrtFuncTypeUnion1;
     // Launch kernel
     causalSoftmax<T><<<kernel_dim, kernel_type, queue>>>(
src/infiniop/ops/gemm/bang/gemm_bang.cc
@@ -15,8 +15,8 @@ struct Descriptor::Opaque {
         cnnlDestroyTensorDescriptor(a);
         cnnlDestroyTensorDescriptor(b);
         cnnlDestroyTensorDescriptor(c);
-        cnnlMatMulDescDestroy(op);
-        cnnlMatMulAlgoDestroy(algo);
+        cnnlDestroyMatMulDescriptor(op);
+        cnnlDestroyMatMulAlgo(algo);
         cnnlDestroyMatMulHeuristicResult(algoResult);
     }
 };
@@ -85,8 +85,8 @@ infiniStatus_t Descriptor::create(
     cnnlMatMulDescriptor_t op;
     cnnlMatMulAlgo_t algo;
     cnnlMatMulHeuristicResult_t algoResult;
-    CHECK_BANG(cnnlMatMulDescCreate(&op));
-    CHECK_BANG(cnnlMatMulAlgoCreate(&algo));
+    CHECK_BANG(cnnlCreateMatMulDescriptor(&op));
+    CHECK_BANG(cnnlCreateMatMulAlgo(&algo));
     CHECK_BANG(cnnlCreateMatMulHeuristicResult(&algoResult));
     int32_t use_stride = true;
     CHECK_BANG(cnnlSetMatMulDescAttr(
@@ -101,7 +101,7 @@ infiniStatus_t Descriptor::create(
         (cnrtQueue_t) nullptr,
         [&](cnnlHandle_t _handle) {
-            CHECK_BANG(cnnlGetBatchMatMulAlgoHeuristic(
+            CHECK_BANG(cnnlGetBatchMatMulExAlgoHeuristic(
                 _handle, op, a, b, c, NULL, 1, &algoResult, &count));
@@ -109,7 +109,7 @@ infiniStatus_t Descriptor::create(
     }));
     size_t workspace_size;
-    CHECK_BANG(cnnlGetBatchMatMulHeuristicResult(algoResult, algo, &workspace_size));
+    CHECK_BANG(cnnlGetBatchMatMulExHeuristicResult(algoResult, algo, &workspace_size));
     *desc_ptr = new Descriptor(dtype, info, workspace_size,
@@ -135,7 +135,7 @@ infiniStatus_t Descriptor::calculate(
     CHECK_STATUS(_opaque->internal->useCnnl(
         (cnrtQueue_t)stream,
         [&](cnnlHandle_t handle) {
-            CHECK_BANG(cnnlBatchMatMulBCast_v2(
+            CHECK_BANG(cnnlBatchMatMulEx(
                 handle,
                 _opaque->op,
                 _opaque->algo,
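Besides the enum renames, the CNNL batched-matmul API itself moves: the create/destroy helpers take verb-first names and the heuristic and execution entry points gain an Ex suffix, with cnnlBatchMatMulBCast_v2 replaced by cnnlBatchMatMulEx. A sketch of the renamed descriptor lifecycle, mirroring the hunks above and using this file's CHECK_BANG macro; the elided middle keeps the argument lists shown in the diff:

    cnnlMatMulDescriptor_t op;
    cnnlMatMulAlgo_t algo;
    cnnlMatMulHeuristicResult_t algoResult;
    CHECK_BANG(cnnlCreateMatMulDescriptor(&op));       // was cnnlMatMulDescCreate
    CHECK_BANG(cnnlCreateMatMulAlgo(&algo));           // was cnnlMatMulAlgoCreate
    CHECK_BANG(cnnlCreateMatMulHeuristicResult(&algoResult));
    // ... select an algorithm with cnnlGetBatchMatMulExAlgoHeuristic, size the
    // workspace with cnnlGetBatchMatMulExHeuristicResult, execute with
    // cnnlBatchMatMulEx, then tear down in reverse order:
    CHECK_BANG(cnnlDestroyMatMulHeuristicResult(algoResult));
    CHECK_BANG(cnnlDestroyMatMulAlgo(algo));           // was cnnlMatMulAlgoDestroy
    CHECK_BANG(cnnlDestroyMatMulDescriptor(op));       // was cnnlMatMulDescDestroy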
src/infiniop/ops/random_sample/bang/random_sample_kernel.mlu
@@ -534,13 +534,13 @@ struct Algo {
     if constexpr (std::is_same<Tval_, float>::value) {
         auto logits = reinterpret_cast<const float *>(probs);
-        argMax<<<dim, CNRT_FUNC_TYPE_BLOCK, queue>>>(logits, result, gdram_indices, voc);
+        argMax<<<dim, cnrtFuncTypeBlock, queue>>>(logits, result, gdram_indices, voc);
     } else if constexpr (std::is_same<Tval_, CustomFloat16>::value) {
         auto logits = reinterpret_cast<const half *>(probs);
-        argMax<<<dim, CNRT_FUNC_TYPE_BLOCK, queue>>>(logits, result, gdram_indices, voc);
+        argMax<<<dim, cnrtFuncTypeBlock, queue>>>(logits, result, gdram_indices, voc);
     } else if constexpr (std::is_same<Tval_, CustomBFloat16>::value) {
         auto logits = reinterpret_cast<const bfloat16_t *>(probs);
-        argMax<<<dim, CNRT_FUNC_TYPE_BLOCK, queue>>>(logits, result, gdram_indices, voc);
+        argMax<<<dim, cnrtFuncTypeBlock, queue>>>(logits, result, gdram_indices, voc);
     } else {
         return INFINI_STATUS_BAD_TENSOR_DTYPE;
     }
@@ -575,10 +575,10 @@ struct Algo {
     const int max_num = SRC_MAX_SIZE / sizeof(float);
     if (voc >= task_num * max_num) {
-        randomSampleKernelLarge<<<dim, CNRT_FUNC_TYPE_UNION1, queue>>>(
+        randomSampleKernelLarge<<<dim, cnrtFuncTypeUnion1, queue>>>(
             logits, result, gdram_indices, global_top_k, global_sum, voc, random_val, topp, topk, temperature);
     } else {
-        randomSampleKernel<<<dim, CNRT_FUNC_TYPE_UNION1, queue>>>(
+        randomSampleKernel<<<dim, cnrtFuncTypeUnion1, queue>>>(
             logits, result, gdram_indices, global_top_k, global_sum, voc, random_val, topp, topk, temperature);
     }
 } else if constexpr (std::is_same<Tval_, CustomFloat16>::value) {
@@ -592,10 +592,10 @@ struct Algo {
     const int max_num = SRC_MAX_SIZE / sizeof(half);
     if (voc >= task_num * max_num) {
-        randomSampleKernelLarge<<<dim, CNRT_FUNC_TYPE_UNION1, queue>>>(
+        randomSampleKernelLarge<<<dim, cnrtFuncTypeUnion1, queue>>>(
             logits, result, gdram_indices, global_top_k, global_sum, voc, random_val, topp, topk, temperature);
     } else {
-        randomSampleKernel<<<dim, CNRT_FUNC_TYPE_UNION1, queue>>>(
+        randomSampleKernel<<<dim, cnrtFuncTypeUnion1, queue>>>(
             logits, result, gdram_indices, global_top_k, global_sum, voc, random_val, topp, topk, temperature);
     }
 } else if constexpr (std::is_same<Tval_, CustomBFloat16>::value) {
@@ -609,10 +609,10 @@ struct Algo {
     const int max_num = SRC_MAX_SIZE / sizeof(bfloat16_t);
     if (voc >= task_num * max_num) {
-        randomSampleKernelLarge<<<dim, CNRT_FUNC_TYPE_UNION1, queue>>>(
+        randomSampleKernelLarge<<<dim, cnrtFuncTypeUnion1, queue>>>(
             logits, result, gdram_indices, global_top_k, global_sum, voc, random_val, topp, topk, temperature);
     } else {
-        randomSampleKernel<<<dim, CNRT_FUNC_TYPE_UNION1, queue>>>(
+        randomSampleKernel<<<dim, cnrtFuncTypeUnion1, queue>>>(
             logits, result, gdram_indices, global_top_k, global_sum, voc, random_val, topp, topk, temperature);
     }
 } else {
src/infiniop/ops/rearrange/bang/rearrange_bang.mlu
@@ -267,7 +267,7 @@ infiniStatus_t Descriptor::calculate(
     dim.x = 4; // Using 4 clusters
     dim.y = 10;
     dim.z = 1;
-    func_type = CNRT_FUNC_TYPE_UNION1;
+    func_type = cnrtFuncTypeUnion1;
     if (_opaque->use_2d_copy) {
         // Use optimized 2D copy kernel
src/infiniop/ops/rms_norm/bang/rms_norm_bang.mlu
@@ -82,7 +82,7 @@ __mlu_global__ void rmsnorm(T *output, const T *input, const Tw *weight,
         }
     } else {
         // Large vector processing with chunking
-        __bang_write_zero(reduction_buffer, reduce_buffer_size);
+        __bang_write_value(reduction_buffer, reduce_buffer_size, 0);
         size_t processed_elements = 0;
         while (processed_elements < vector_size) {
@@ -223,7 +223,7 @@ void rmsnormUnion(void *workspace, int core_per_cluster, int cluster_count, cnrt
     kernel_dim.x = core_per_cluster;
     kernel_dim.y = cluster_count;
     kernel_dim.z = 1;
-    kernel_type = CNRT_FUNC_TYPE_UNION1; // Can choose others, but must adapt kernel_type accordingly
+    kernel_type = cnrtFuncTypeUnion1; // Can choose others, but must adapt kernel_type accordingly
     int dimsize = shape[ndim - 1]; // Length of operation dimension
     int dim_s; // dim_s is the next power of 2 greater than dimsize
     float mi = log2(dimsize);
src/infiniop/ops/rope/bang/rope_bang.mlu
@@ -52,7 +52,7 @@ infiniStatus_t calculateRoPE(const RoPEInfo &info,
     k_dim.x = 4;
     k_dim.y = 1;
     k_dim.z = 1;
-    k_type = CNRT_FUNC_TYPE_UNION1;
+    k_type = cnrtFuncTypeUnion1;
     // Launch kernel with batch dimension
     ropeKernel<<<k_dim, k_type, queue>>>(
src/infiniop/reduce/bang/reduce_bang.h
@@ -50,7 +50,7 @@ __mlu_func__ float sum(const T *source, T *src, float *dst, int num_elements, in
     size_t curr_batch = std::min<size_t>(max_batch, num_elements - processed);
     if (curr_batch < max_batch) {
-        __bang_write_zero(src, max_batch + offset);
+        __bang_write_value(src, max_batch + offset, 0);
     }
     __memcpy(src + offset, source + processed, curr_batch * sizeof(T), GDRAM2NRAM);
@@ -81,7 +81,7 @@ __mlu_func__ float sumBatched(const T *source, T *src, float *dst, int num_eleme
     size_t remainder = curr_batch % batch_size;
     // Ensure NRAM buffer is zeroed
-    __bang_write_zero(src, max_batch + offset);
+    __bang_write_value(src, max_batch + offset, 0);
     // Copy data to NRAM
     __memcpy(src + offset, source + processed, curr_batch * sizeof(T), GDRAM2NRAM);
@@ -120,7 +120,7 @@ __mlu_func__ float sumSquared(const T *source, T *src, float *dst, int num_eleme
     size_t curr_batch = std::min<size_t>(max_batch, num_elements - processed);
     if (curr_batch < max_batch) {
-        __bang_write_zero(src, max_batch + offset);
+        __bang_write_value(src, max_batch + offset, 0);
     }
     __memcpy(src + offset, source + processed, curr_batch * sizeof(T), GDRAM2NRAM);
@@ -165,7 +165,7 @@ __mlu_func__ float sumSquaredBatched(const T *source, T *src, float *dst, int nu
     size_t remainder = curr_batch % batch_size;
     // Ensure NRAM buffer is zeroed
-    __bang_write_zero(src, max_batch + offset);
+    __bang_write_value(src, max_batch + offset, 0);
     // Copy data to NRAM
     __memcpy(src + offset, source + processed, curr_batch * sizeof(T), GDRAM2NRAM);
@@ -235,7 +235,7 @@ __mlu_func__ float max(const T *source, T *src, float *dst, int num_elements, in
     size_t curr_batch = std::min<size_t>(max_batch, num_elements - processed);
     if (curr_batch < max_batch) {
-        __bang_write_zero(src, max_batch + offset);
+        __bang_write_value(src, max_batch + offset, 0);
     }
     __memcpy(src + offset, source + processed, curr_batch * sizeof(T), GDRAM2NRAM);
@@ -264,7 +264,7 @@ __mlu_func__ float maxBatched(const T *source, T *src, float *dst, int num_eleme
     size_t curr_batch = std::min<size_t>(max_batch, num_elements - processed);
     if (curr_batch < max_batch) {
-        __bang_write_zero(src, max_batch + offset);
+        __bang_write_value(src, max_batch + offset, 0);
     }
     __memcpy(src + offset, source + processed, curr_batch * sizeof(T), GDRAM2NRAM);
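The rms_norm_bang.mlu and reduce_bang.h edits are all one pattern: the dedicated zero-fill intrinsic __bang_write_zero is dropped in favor of the general fill intrinsic with an explicit zero value. A kernel-side fragment of the replacement (the NRAM buffer name and size are illustrative):

    __nram__ float buf[256];
    __bang_write_value(buf, 256, 0); // formerly __bang_write_zero(buf, 256)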