"git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "e4b056fe652536ac89ff2c98e36b2d3685cbccd2"
Unverified Commit ae8dbe6d authored by nv-dlasalle's avatar nv-dlasalle Committed by GitHub
Browse files

[Feature][Performance] Implement NCCL wrapper for communicating NodeEmbeddings...


[Feature][Performance] Implement NCCL wrapper for communicating NodeEmbeddings and sparse gradients. (#2825)

* Split NCCL wrapper from sparse optimizer and sparse embedding

* Add more unit tests for single node nccl

* Fix unit test for tf

* Switch to device histogram

* Fix histogram issues

* Finish migration to histogram

* Handle cases with zero send/receive data

* Start on partition object

* Get compiling

* Updates

* Add unit tests

* Switch to partition object

* Fix linting issues

* Rename partition file

* Add python doc

* Fix python assert and finish doxygen comments

* Remove stubs for range based partition to satisfy pylint

* Wrap unit test in GPU only

* Wrap explicit cuda call in ifdef

* Merge with partition.py

* update docstrings

* Cleanup partition_op

* Add Workspace object

* Switch to using workspace object

* Move last remainder based function out of nccl_api

* Add error messages

* Update docs with examples

* Fix linting errors
Co-authored-by: xiang song(charlie.song) <classicxsong@gmail.com>
parent 0e9259b4
#include <gtest/gtest.h>
#include "../../src/partition/ndarray_partition.h"
using namespace dgl;
using namespace dgl::partition;
template<DLDeviceType XPU, typename IdType>
void _TestRemainder()
{
const int64_t size = 160000;
const int num_parts = 7;
NDArrayPartitionRef part = CreatePartitionRemainderBased(
size, num_parts);
IdArray idxs = aten::Range(0, size/10, sizeof(IdType)*8,
DGLContext{XPU, 0});
std::pair<IdArray, IdArray> result = part->GeneratePermutation(idxs);
// first part of result should be the permutation
IdArray perm = result.first.CopyTo(DGLContext{kDLCPU, 0});
ASSERT_TRUE(perm.Ptr<IdType>() != nullptr);
ASSERT_EQ(perm->shape[0], idxs->shape[0]);
const IdType * const perm_cpu = static_cast<const IdType*>(perm->data);
// second part of result should be the counts
IdArray counts = result.second.CopyTo(DGLContext{kDLCPU, 0});
ASSERT_TRUE(counts.Ptr<int64_t>() != nullptr);
ASSERT_EQ(counts->shape[0], num_parts);
const int64_t * const counts_cpu = static_cast<const int64_t*>(counts->data);
std::vector<int64_t> prefix(num_parts+1, 0);
for (int p = 0; p < num_parts; ++p) {
prefix[p+1] = prefix[p] + counts_cpu[p];
}
ASSERT_EQ(prefix.back(), idxs->shape[0]);
// copy original indexes to cpu
idxs = idxs.CopyTo(DGLContext{kDLCPU, 0});
const IdType * const idxs_cpu = static_cast<const IdType*>(idxs->data);
for (int p = 0; p < num_parts; ++p) {
for (int64_t i = prefix[p]; i < prefix[p+1]; ++i) {
EXPECT_EQ(idxs_cpu[perm_cpu[i]] % num_parts, p);
}
}
}
// Runs the remainder-based partition checks for both supported index widths.
// Only the GPU variants are exercised; without DGL_USE_CUDA the test body is
// empty because the CPU path is not implemented.
TEST(PartitionTest, TestRemainderPartition) {
#if defined(DGL_USE_CUDA)
  // 32-bit indices
  _TestRemainder<kDLGPU, int32_t>();
  // 64-bit indices
  _TestRemainder<kDLGPU, int64_t>();
#endif  // defined(DGL_USE_CUDA)
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment