Unverified Commit 8030a448 authored by Tim Moon's avatar Tim Moon Committed by GitHub
Browse files

[PyTorch] Debug UB conversion function from NVTEShape to std::vector (#1742)



Debug UB conversion function from NVTEShape to std::vector
Signed-off-by: default avatarTim Moon <tmoon@nvidia.com>
parent d3352524
......@@ -21,12 +21,18 @@
#define HALF_BYTES 2
#define UB_MAX_SM 32
#define AS_VECTOR(shape) std::vector<size_t>(shape.data, shape.data + shape.ndim)
using namespace std::placeholders;
namespace transformer_engine {
namespace {
std::vector<size_t> shape_to_vector(const NVTEShape &shape) {
return std::vector<size_t>(shape.data, shape.data + shape.ndim);
}
} // namespace
/***************************************************************************************************
* Comm+GEMM Overlap Common Core
**************************************************************************************************/
......@@ -147,7 +153,7 @@ TensorWrapper CommOverlapCore::get_tensor_chunk(const TensorWrapper &source, siz
auto param = source.get_parameter(param_type);
auto param_dptr = reinterpret_cast<char *>(param.data_ptr);
auto param_dtype = static_cast<DType>(param.dtype);
auto param_shape = AS_VECTOR(param.shape);
auto param_shape = shape_to_vector(param.shape);
if (param_dptr != nullptr) {
if (param_type == NVTETensorParam::kNVTERowwiseData ||
......@@ -617,7 +623,7 @@ CommOverlapP2PBase::~CommOverlapP2PBase() {
TensorWrapper CommOverlapP2PBase::get_buffer_chunk_by_id(const TensorWrapper &source,
size_t chunk_id) {
// Start with a chunk of the source tensor
auto chunk = get_tensor_chunk(source, 0, AS_VECTOR(_ubufs[chunk_id].shape()));
auto chunk = get_tensor_chunk(source, 0, shape_to_vector(_ubufs[chunk_id].shape()));
// Update chunk with offset data pointers from the communication buffer
if (chunk.dptr() != nullptr) {
......@@ -667,7 +673,7 @@ void CommOverlapP2PBase::atomic_gemm_overlap_ag(
NVTE_CHECK_CUDA(cudaStreamWaitEvent(_stream_send[0], _start_compute, 0));
NVTE_CHECK_CUDA(cudaStreamWaitEvent(_stream_recv, _start_compute, 0));
auto input_b = get_buffer_chunk_like(B, 0, AS_VECTOR(B.shape()));
auto input_b = get_buffer_chunk_like(B, 0, shape_to_vector(B.shape()));
size_t workspace_size_chunk = workspace.numel() / _stream_compute.size();
auto workspace_chunk = get_tensor_chunk(workspace, 0, {workspace_size_chunk});
......@@ -910,7 +916,7 @@ void CommOverlapP2PBase::atomic_gemm_overlap_rs(
// Atomic GEMM
// Process GEMM chunks in the order that AG+GEMM places the output chunks.
auto output_d = get_buffer_chunk_like(D, 0, AS_VECTOR(D.shape()));
auto output_d = get_buffer_chunk_like(D, 0, shape_to_vector(D.shape()));
nvte_cublas_atomic_gemm(A.data(), B.data(), output_d.data(), bias.data(), pre_gelu_out.data(),
transa, transb, grad, workspace.data(), accumulate, use_split_accumulator,
_math_sms, 0, _tp_size, true, _counter.data(), stream_main);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment