gaoqiong / composable_kernel_ROCM / Commits
"...resnet50_tensorflow.git" did not exist on "458ea726e4889277d3a444c4815c3a75d17ba7dc"
Commit c0439e64
Authored Dec 19, 2024 by ThomasNing
Parent 5dd5b531

    Finished the receiving kernel for the message receive coding.
Showing 7 changed files with 103 additions and 32 deletions (+103 -32).
Changed files:

  example/ck_tile/15_cross_gpu_reduce/cross_gpu_reduce.cpp                            +17 -2
  example/ck_tile/15_cross_gpu_reduce/cross_gpu_reduce.hpp                            +3  -0
  include/ck_tile/ops/cross_gpu_reduce/kernel/cross_gpu_connect.hpp                   +9  -2
  include/ck_tile/ops/cross_gpu_reduce/kernel/cross_gpu_receive_kernel.hpp            +39 -15
  include/ck_tile/ops/cross_gpu_reduce/kernel/cross_gpu_reduce_tile_partitioner.hpp   +1  -1
  include/ck_tile/ops/cross_gpu_reduce/kernel/cross_gpu_send_kernel.hpp               +2  -2
  include/ck_tile/ops/cross_gpu_reduce/pipeline/reduce_receive_pipeline_scale_up.hpp  +32 -10
example/ck_tile/15_cross_gpu_reduce/cross_gpu_reduce.cpp

```diff
@@ -19,6 +19,7 @@ struct AllocateAndTransferFunctor
 {
     // Invoke the memory transfer between GPUs based on whether it is host gpu or slave gpu.
     float invoke_transfer(ck_tile::DeviceMem& transfer_buf,
+                          std::vector<ck_tile::DeviceMem>& receive_mem_bufs,
                           ck_tile::index_t host_gpu,
                           int device_id,
                           const ck_tile::ArgParser& arg_parser,
@@ -86,8 +87,11 @@ struct AllocateAndTransferFunctor
         if(static_cast<ck_tile::index_t>(device_id) == host_gpu)
         {
             // initialize the receive data buffer and global memory location.
-            ck_tile::HostTensor<InputType> receive_host({M, N});
-            ck_tile::DeviceMem receive_buf(receive_host.get_element_space_size_in_bytes());
+            std::array<const void*, MaxSendGPUNum> p_receive_list;
+            for(size_t i = 0; i < receive_mem_bufs.size(); ++i) {
+                p_receive_list[i] = receive_mem_bufs[i].GetDeviceBuffer();
+            }
+            args_receive.p_receive_list = p_receive_list;
             // initialize the output data buffer.
             std::string output_type = arg_parser.get_str("output_type");
             if(output_type.compare("float") == 0)
@@ -96,6 +100,7 @@ struct AllocateAndTransferFunctor
             ck_tile::DeviceMem output_buf(output_host.get_element_space_size_in_bytes());
             args_receive.p_output = output_buf.GetDeviceBuffer();
             auto kargs_slave = SlaveKernel::MakeKargs(args_receive.p_reduce,
+                                                      args_receive.p_receive_list,
                                                       args_receive.p_output,
                                                       args_receive.M,
                                                       args_receive.N);
@@ -132,6 +137,7 @@ struct AllocateAndTransferFunctor
     void operator()(int device_id,
                     ck_tile::HostTensor<InputType>& host_tensor,
                     ck_tile::DeviceMem& device_mem,
+                    std::vector<ck_tile::DeviceMem>& receive_mem,
                     ck_tile::index_t host_gpu,
                     const ck_tile::ArgParser& arg_parser)
     {
@@ -142,6 +148,11 @@ struct AllocateAndTransferFunctor
                       << hipGetErrorString(hip_err_set_device) << std::endl;
             return;
         }
+        if(device_id == host_gpu){
+            for(size_t i = 0; i < receive_mem.size(); ++i) {
+                receive_mem[i].Realloc(host_tensor.get_element_space_size_in_bytes());
+            }
+        }
         // Allocate device memory
         device_mem.Realloc(host_tensor.get_element_space_size_in_bytes());
         // Transfer data to device
@@ -152,12 +163,14 @@ struct AllocateAndTransferFunctor
                         static_cast<int>(host_gpu),
                         static_cast<int>(worldSize),
                         device_mem.GetDeviceBuffer(),
+                        receive_mem,
                         host_tensor.get_element_space_size_in_bytes());
         int n_warmup = arg_parser.get_int("warmup");
         int n_repeat = arg_parser.get_int("repeat");
         invoke_transfer(device_mem,
+                        receive_mem,
                         host_gpu,
                         device_id,
                         arg_parser,
@@ -221,6 +234,7 @@ bool run_cross_gpu_reduce(ck_tile::ArgParser arg_parser)
     std::vector<ck_tile::HostTensor<InputType>> transfer_tensor_host_list;
     transfer_tensor_host_list.reserve(gpu_nums);
     std::vector<ck_tile::DeviceMem> transfer_bufs(gpu_nums);
+    std::vector<ck_tile::DeviceMem> slave_receive_bufs(gpu_nums - 1);
     std::vector<std::thread> threads;
     AllocateAndTransferFunctor<InputType, OutputType> allocateAndTransfer;
@@ -306,6 +320,7 @@ bool run_cross_gpu_reduce(ck_tile::ArgParser arg_parser)
                              device_list[i],
                              std::ref(transfer_tensor_host_list[i]),
                              std::ref(transfer_bufs[i]),
+                             std::ref(slave_receive_bufs),
                              host_gpu,
                              arg_parser);
     }
```
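Taken together, the host-side changes route one receive buffer per sending GPU into the kernel arguments: the buffers are Realloc'd up front on the host GPU, their raw pointers are packed into a fixed-capacity std::array (kernel arguments are copied by value at launch, so a std::vector cannot cross the launch boundary), and the array is forwarded through MakeKargs. Below is a minimal stand-alone sketch of that packing pattern; DeviceBuf, ReceiveArgs, and kMaxSendGpus are hypothetical stand-ins for ck_tile::DeviceMem, transfer_receive_basic_args, and MaxSendGPUNum.

```cpp
#include <array>
#include <cstddef>
#include <vector>

// Hypothetical stand-in for ck_tile::DeviceMem: owns a device allocation and
// exposes its raw pointer, mirroring GetDeviceBuffer() in the diff above.
struct DeviceBuf
{
    void* ptr = nullptr;
    void* GetDeviceBuffer() const { return ptr; }
};

constexpr std::size_t kMaxSendGpus = 7; // stand-in for MaxSendGPUNum

// Kernel arguments travel by value, so the pointer list must be a
// fixed-size array rather than a std::vector.
struct ReceiveArgs
{
    std::array<const void*, kMaxSendGpus> p_receive_list{};
};

// Gather one device pointer per sending GPU into the kernel-args array.
ReceiveArgs make_receive_args(const std::vector<DeviceBuf>& receive_bufs)
{
    ReceiveArgs args{};
    for(std::size_t i = 0; i < receive_bufs.size() && i < kMaxSendGpus; ++i)
        args.p_receive_list[i] = receive_bufs[i].GetDeviceBuffer();
    return args;
}

int main()
{
    std::vector<DeviceBuf> bufs(3); // e.g. three peer GPUs
    ReceiveArgs args = make_receive_args(bufs);
    return args.p_receive_list[0] == nullptr ? 0 : 1;
}
```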
example/ck_tile/15_cross_gpu_reduce/cross_gpu_reduce.hpp

```diff
@@ -5,9 +5,12 @@
 #include "ck_tile/host.hpp"
 
+constexpr int MaxSendGPUNum = 7;
+
 struct transfer_receive_basic_args
 {
     const void* p_reduce;
+    std::array<const void*, MaxSendGPUNum> p_receive_list;
     const void* p_output;
     ck_tile::index_t host_gpu;
     ck_tile::index_t device_id;
```
include/ck_tile/ops/cross_gpu_reduce/kernel/cross_gpu_connect.hpp

```diff
@@ -5,6 +5,8 @@
 #include "ck_tile/core.hpp"
 #include "ck_tile/ops/common.hpp"
+#include <vector>
+
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wsuggest-destructor-override"
 #pragma clang diagnostic ignored "-Wold-style-cast"
@@ -26,7 +28,12 @@ extern __constant__ DeviceHandle<mscclpp::SmChannel> constSlaveSmChannels[8]; //
 extern __constant__ DeviceHandle<mscclpp::SmChannel> constMasterSmChannel;
 
-void setupConnection(int rank, int slaveRank, int worldSize, void* dst_data, size_t dataSize)
+void setupConnection(int rank,
+                     int slaveRank,
+                     int worldSize,
+                     void* dst_data,
+                     std::vector<ck_tile::DeviceMem>& receive_mem_vector,
+                     size_t dataSize)
 {
     // Initialize MSCCL++ Communicator
     auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(rank, worldSize);
@@ -87,7 +94,7 @@ void setupConnection(int rank, int slaveRank, int worldSize, void* dst_data, siz
         SmChannels.push_back(mscclpp::deviceHandle(
             mscclpp::SmChannel(slaveSemaphores[i],
                                remoteMemories[i], // Remote buffer from the sender
-                               dst_data           // Local buffer (this slave's buffer)
+                               receive_mem_vector[i].GetDeviceBuffer() // Local buffer (this slave's buffer)
                                )));
     }
     hipError_t error_slave =
```
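The second hunk is the substantive change: each SmChannel now pairs the memory registered by sender i with this rank's dedicated receive buffer for sender i, where previously every channel pointed at the same dst_data. A hedged sketch of that per-peer pairing follows, with stub types in place of mscclpp::SmChannel and ck_tile::DeviceMem; only the (semaphore, remote memory, local buffer) constructor shape visible in the diff is assumed, everything else is illustrative.

```cpp
#include <cstddef>
#include <vector>

// Stub types standing in for the mscclpp/ck_tile types used in the diff.
struct Semaphore {};
struct RegisteredMemory {};

struct SmChannel
{
    SmChannel(Semaphore /*sem*/, RegisteredMemory remote, void* local)
        : remote_(remote), local_(local) {}
    RegisteredMemory remote_;
    void* local_;
};

struct DeviceMem
{
    void* ptr = nullptr;
    void* GetDeviceBuffer() const { return ptr; }
};

// One channel per sender: channel i pairs remoteMemories[i] with this rank's
// own receive buffer for sender i instead of a single shared destination.
std::vector<SmChannel> buildChannels(const std::vector<Semaphore>& slaveSemaphores,
                                     const std::vector<RegisteredMemory>& remoteMemories,
                                     std::vector<DeviceMem>& receive_mem_vector)
{
    std::vector<SmChannel> channels;
    for(std::size_t i = 0; i < slaveSemaphores.size(); ++i)
        channels.emplace_back(slaveSemaphores[i],
                              remoteMemories[i],
                              receive_mem_vector[i].GetDeviceBuffer());
    return channels;
}

int main()
{
    std::vector<Semaphore> sems(3);
    std::vector<RegisteredMemory> remotes(3);
    std::vector<DeviceMem> recvs(3);
    return buildChannels(sems, remotes, recvs).size() == 3 ? 0 : 1;
}
```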
include/ck_tile/ops/cross_gpu_reduce/kernel/cross_gpu_receive_kernel.hpp

```diff
@@ -20,17 +20,20 @@ struct ReduceReceiveKernel
     struct ReduceReceiveKargs
     {
         const void* reduce_ptr;
+        std::array<const void*, MaxSendGPUNum> receive_ptr_list;
         const void* output_ptr;
         index_t M;
         index_t N;
     };
 
-    CK_TILE_HOST static constexpr ReduceReceiveKargs MakeKargs(const void* reduce_ptr,
-                                                               const void* output_ptr,
-                                                               index_t M,
-                                                               index_t N)
+    CK_TILE_HOST static constexpr ReduceReceiveKargs
+    MakeKargs(const void* reduce_ptr,
+              std::array<const void*, MaxSendGPUNum> receive_ptr_list,
+              const void* output_ptr,
+              index_t M,
+              index_t N)
     {
-        return ReduceReceiveKargs{reduce_ptr, output_ptr, M, N};
+        return ReduceReceiveKargs{reduce_ptr, receive_ptr_list, output_ptr, M, N};
     }
 
     CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
@@ -45,10 +48,10 @@ struct ReduceReceiveKernel
     CK_TILE_DEVICE void operator()(ReduceReceiveKargs kargs) const
     {
-        auto channel = *constSlaveSmChannels[0];
+        auto channel = constSlaveSmChannels[0];
         const auto [i_m, i_n] = CrossReducePartitioner{}();
-        const DataType* reduce_start = static_cast<const DataType*>(reduce_ptr);
+        const DataType* reduce_start = static_cast<const DataType*>(kargs.reduce_ptr);
         auto transfer_tensor_view = [&]() {
             return make_naive_tensor_view<address_space_enum::global>(
                 reduce_start,
                 make_tuple(kargs.M, kargs.N),
@@ -61,12 +64,32 @@ struct ReduceReceiveKernel
                              make_tuple(number<ReduceReceivePipeline::Block_M>{},
                                         number<ReduceReceivePipeline::Block_N>{}),
                              {i_m, i_n});
-        uint32_t numThreads = static_cast<uint32_t>(CrossReducePartitioner::NumThreads(kargs.M, kargs.N));
+        uint32_t numThreads =
+            static_cast<uint32_t>(CrossReducePartitioner::NumThreads(kargs.M, kargs.N));
         uint32_t threadId = static_cast<uint32_t>(
             i_m + i_n * (kargs.M + ReduceReceivePipeline::Block_M - 1) / ReduceReceivePipeline::Block_M);
         uint64_t totalBytes = static_cast<uint64_t>(
             ReduceReceivePipeline::Block_M * ReduceReceivePipeline::Block_N * sizeof(DataType));
         channel.get(0, totalBytes, threadId, numThreads);
+        // After the channel get, start the memory block preparation for the receiving window
+        const DataType* receive_start = static_cast<const DataType*>(kargs.receive_ptr_list[0]);
+        auto receive_tensor_view = [&]() {
+            return make_naive_tensor_view<address_space_enum::global>(
+                receive_start,
+                make_tuple(kargs.M, kargs.N),
+                make_tuple(kargs.N, 1),
+                number<ReduceReceivePipeline::Vector_N>{},
+                number<1>{});
+        }();
+        auto receive_block_window =
+            make_tile_window(receive_tensor_view,
+                             make_tuple(number<ReduceReceivePipeline::Block_M>{},
+                                        number<ReduceReceivePipeline::Block_N>{}),
+                             {i_m, i_n});
         const ODataType* output_start = static_cast<const ODataType*>(kargs.output_ptr);
         auto output_tensor_view = [&]() {
@@ -85,7 +108,8 @@ struct ReduceReceiveKernel
         __shared__ char smem_ptr[ReduceReceivePipeline::GetSmemSize()];
-        ReduceReceivePipeline{}(transfer_block_window, output_block_window, smem_ptr);
+        ReduceReceivePipeline{}(
+            transfer_block_window, receive_block_window, output_block_window, smem_ptr);
         return;
     }
 };
```
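Both tensor views in this kernel describe row-major M×N storage: strides of make_tuple(kargs.N, 1) place element (m, n) at offset m*N + n, and the tile window then reads a Block_M by Block_N patch starting at origin {i_m, i_n}. A plain-C++ sketch of that addressing follows, with illustrative sizes and no ck_tile dependencies.

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

// Row-major "tensor view": element (m, n) of an M x N matrix lives at
// m * N + n, matching the make_tuple(kargs.N, 1) strides in the diff.
struct View2D
{
    const float* data;
    std::size_t M, N;
    float at(std::size_t m, std::size_t n) const { return data[m * N + n]; }
};

int main()
{
    // Illustrative sizes only: a 4 x 8 matrix and a 2 x 4 tile at origin (2, 4).
    constexpr std::size_t M = 4, N = 8, Block_M = 2, Block_N = 4;
    std::vector<float> buf(M * N);
    for(std::size_t i = 0; i < buf.size(); ++i)
        buf[i] = static_cast<float>(i);

    View2D view{buf.data(), M, N};
    const std::size_t i_m = 2, i_n = 4; // window origin, as in {i_m, i_n}
    for(std::size_t m = 0; m < Block_M; ++m)
    {
        for(std::size_t n = 0; n < Block_N; ++n)
            std::cout << view.at(i_m + m, i_n + n) << ' ';
        std::cout << '\n';
    }
    return 0;
}
```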
include/ck_tile/ops/cross_gpu_reduce/kernel/cross_gpu_reduce_tile_partitioner.hpp

```diff
@@ -14,7 +14,7 @@ struct CrossReducePartitioner
     static constexpr index_t kM = CrossReduceShape::Block_M;
     static constexpr index_t kN = CrossReduceShape::Block_N;
 
-    CK_TILE_HOST static constexpr auto NumThreads(index_t M, index_t N){
+    CK_TILE_HOST_DEVICE static constexpr auto NumThreads(index_t M, index_t N){
         index_t GridDimX = (M + kM - 1) / kM;
         index_t GridDimY = (N + kN - 1) / kN;
         return GridDimX * GridDimY;
```
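NumThreads is simply the tile count of the launch grid, ceil(M/kM) * ceil(N/kN); the one-word change from CK_TILE_HOST to CK_TILE_HOST_DEVICE is what allows the receive kernel above to evaluate it from device code. The arithmetic, checked stand-alone with illustrative sizes:

```cpp
#include <cassert>

// Same computation as CrossReducePartitioner::NumThreads in the diff:
// number of Block_M x Block_N tiles covering an M x N problem.
constexpr int num_tiles(int M, int N, int kM, int kN)
{
    const int grid_x = (M + kM - 1) / kM; // ceil(M / kM)
    const int grid_y = (N + kN - 1) / kN; // ceil(N / kN)
    return grid_x * grid_y;
}

int main()
{
    // Illustrative: a 130 x 100 problem with 64 x 32 tiles -> 3 * 4 = 12 tiles.
    static_assert(num_tiles(130, 100, 64, 32) == 12);
    assert(num_tiles(64, 32, 64, 32) == 1); // exact fit -> a single tile
    return 0;
}
```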
include/ck_tile/ops/cross_gpu_reduce/kernel/cross_gpu_send_kernel.hpp

```diff
@@ -40,7 +40,7 @@ struct ReduceSendKernel
     CK_TILE_DEVICE void operator()(ReduceSendKargs kargs) const
     {
-        const auto i_M = CrossReducePartitioner{}();
+        const auto [i_m, i_n] = CrossReducePartitioner{}();
         const DataType* reduce_start = static_cast<const DataType*>(kargs.reduce_ptr);
         auto transfer_tensor_view = [&]() {
             return make_naive_tensor_view<address_space_enum::global>(
@@ -54,7 +54,7 @@ struct ReduceSendKernel
             make_tile_window(transfer_tensor_view,
                              make_tuple(number<ReduceSendPipeline::Block_M>{},
                                         number<ReduceSendPipeline::Block_N>{}),
-                             {i_M, 0});
+                             {i_m, i_n});
         __shared__ char smem_ptr[ReduceSendPipeline::GetSmemSize()];
```
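The send kernel now destructures both tile coordinates and positions its window at {i_m, i_n}, matching the 2-D partitioning on the receive side rather than the old one-dimensional {i_M, 0}. A small sketch of the structured-binding pattern with a hypothetical partitioner (the real coordinate computation lives in CrossReducePartitioner and is not shown in this diff):

```cpp
#include <iostream>
#include <tuple>

// Hypothetical partitioner: maps a linear block id to 2-D tile coordinates.
// The real CrossReducePartitioner derives these from the launch geometry.
struct Partitioner
{
    int block_id, grid_m; // grid_m = number of tiles along M
    std::tuple<int, int> operator()() const
    {
        return {block_id % grid_m, block_id / grid_m};
    }
};

int main()
{
    // Illustrative: block 5 in a grid with 4 tiles along M -> (i_m, i_n) = (1, 1).
    const auto [i_m, i_n] = Partitioner{5, 4}();
    std::cout << "i_m=" << i_m << " i_n=" << i_n << '\n';
    return 0;
}
```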
include/ck_tile/ops/cross_gpu_reduce/pipeline/reduce_receive_pipeline_scale_up.hpp

```diff
@@ -40,10 +40,14 @@ struct CrossReduceReceivePipelineScaleUp
         return Policy::template GetSmemSize<DataType, ReduceShape>();
     }
 
-    template <typename InDramBlockWindowTmp, typename OutDramBlockWindowTmp>
-    CK_TILE_HOST_DEVICE auto operator()(const InDramBlockWindowTmp& input_dram_block_window_tmp,
-                                        const OutDramBlockWindowTmp& output_dram_block_window_tmp,
-                                        void* p_smem) const
+    template <typename InDramBlockWindowTmp,
+              typename ReceiveDramBlockWindowTmp,
+              typename OutDramBlockWindowTmp>
+    CK_TILE_HOST_DEVICE auto operator()(const InDramBlockWindowTmp& input_dram_block_window_tmp,
+                                        const ReceiveDramBlockWindowTmp& receive_dram_block_window_tmp,
+                                        const OutDramBlockWindowTmp& output_dram_block_window_tmp,
+                                        void* p_smem) const
     {
         DataType* p_lds = static_cast<DataType*>(p_smem);
         constexpr auto lds_block_desc = Policy::template MakeLdsBlockDescriptor<ReduceShape>();
@@ -52,9 +56,6 @@ struct CrossReduceReceivePipelineScaleUp
             integer_divide_ceil(sizeof(DataType) * lds_block_desc.get_element_space_size(), 16) *
             16;
-        DataType* p_receive_lds = static_cast<DataType*>(
-            static_cast<void*>(static_cast<char*>(p_smem) + lds_block_space_size_aligned));
-
         // DRAM tile window for load
         auto copy_dram_window =
             make_tile_window(input_dram_block_window_tmp.get_bottom_tensor_view(),
@@ -69,10 +70,31 @@ struct CrossReduceReceivePipelineScaleUp
         auto host_block_tile = load_tile(copy_dram_window);
 
-        const auto block_tile_tmp =
-            tile_elementwise_in([](const DataType& a) { return a; }, host_block_tile);
-        store_tile(copy_lds_window, block_tile_tmp);
+        // Receive tile window initialization
+        DataType* p_receive_lds = static_cast<DataType*>(
+            static_cast<void*>(static_cast<char*>(p_smem) + lds_block_space_size_aligned));
+        auto receive_dram_window =
+            make_tile_window(receive_dram_block_window_tmp.get_bottom_tensor_view(),
+                             make_tuple(number<Block_M>{}, number<Block_N>{}),
+                             receive_dram_block_window_tmp.get_window_origin(),
+                             Policy::template MakeDramTileDistribution<ReduceShape>());
+        auto receive_lds_block =
+            make_tensor_view<address_space_enum::lds>(p_receive_lds, lds_block_desc);
+        auto receive_lds_window =
+            make_tile_window(receive_lds_block,
+                             make_tuple(number<Block_M>{}, number<Block_N>{}),
+                             {0, 0},
+                             receive_dram_window.get_tile_distribution());
+        auto receive_block_tile = load_tile(receive_dram_window);
+        const auto host_block_tile_tmp =
+            tile_elementwise_in([](const DataType& a) { return a; }, host_block_tile);
+        store_tile(copy_lds_window, host_block_tile_tmp);
+        move_tile_window(copy_lds_window, {0, Block_N});
+        const auto receive_block_tile_tmp =
+            tile_elementwise_in([](const DataType& a) { return a; }, receive_block_tile);
+        store_tile(receive_lds_window, receive_block_tile_tmp);
         __syncthreads();
     }
```
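The pipeline now stages two tiles in LDS: the host tile at offset 0, and the received tile in a second region that begins at the next 16-byte boundary after the first (lds_block_space_size_aligned above). A stand-alone check of that carve-out arithmetic, with an illustrative tile size:

```cpp
#include <cstddef>
#include <iostream>

// Round a byte count up to a multiple of `align`, the same arithmetic as
// integer_divide_ceil(sizeof(DataType) * elements, 16) * 16 in the diff.
constexpr std::size_t align_up(std::size_t bytes, std::size_t align)
{
    return (bytes + align - 1) / align * align;
}

int main()
{
    // Illustrative tile only: 33 x 33 floats = 4356 bytes, padded up to 4368.
    constexpr std::size_t host_tile_bytes = 33 * 33 * sizeof(float);
    constexpr std::size_t receive_offset  = align_up(host_tile_bytes, 16);
    static_assert(receive_offset == 4368);

    // In the pipeline the receive LDS region starts at this offset:
    //   p_receive_lds = (DataType*)((char*)p_smem + receive_offset)
    std::cout << "host tile: " << host_tile_bytes
              << " bytes, receive region offset: " << receive_offset << '\n';
    return 0;
}
```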