Merge pull request #1340 from NVIDIA/peer_memory

Peer memory halo exchange

Merge pull request #1340 from NVIDIA/peer_memory
Peer memory halo exchange
fed20d2a · Thor Johnsen · GitHub · d89f5e66 · 5698eeeb · fed20d2a
Unverified Commit fed20d2a authored Apr 20, 2022 by Thor Johnsen Committed by GitHub Apr 20, 2022
16 changed files
--- a/apex/contrib/bottleneck/__init__.py
+++ b/apex/contrib/bottleneck/__init__.py
 from .bottleneck import Bottleneck, SpatialBottleneck
+from .halo_exchangers import HaloExchangerNoComm, HaloExchangerAllGather, HaloExchangerSendRecv, HaloExchangerPeer
--- a/apex/contrib/bottleneck/bottleneck.py
+++ b/apex/contrib/bottleneck/bottleneck.py
--- a/apex/contrib/bottleneck/bottleneck_module_test.py
+++ b/apex/contrib/bottleneck/bottleneck_module_test.py
--- a/apex/contrib/bottleneck/halo_exchangers.py
+++ b/apex/contrib/bottleneck/halo_exchangers.py
+import torch
+import torch.distributed as dist
+from torch import nn
+import nccl_p2p_cuda as inc
+import peer_memory_cuda as pm
+# Halo exchangers. HaloExchanger is the shared base class; the warnings below apply to HaloExchangerNoComm.
+# NB! HaloExchangerNoComm is communication free: it does not exchange halos with neighbors as it should, it merely swaps the inputs
+# NB! It is only useful for performance testing.
+# NB! Do not use it for actual production runs
+class HaloExchanger(object):
+    def __init__(self, spatial_group_size, rank):
+        self.stream1 = torch.cuda.Stream()
+        self.stream2 = torch.cuda.Stream()
+        self.stream3 = torch.cuda.Stream()
+        spatial_rank = rank % spatial_group_size
+        self.left_zero = True if spatial_rank == 0 else False
+        self.right_zero = True if spatial_rank == spatial_group_size - 1 else False
+class HaloExchangerNoComm(HaloExchanger):
+    def __init__(self, world_size, spatial_group_size, rank, comm):
+        super(HaloExchangerNoComm, self).__init__(spatial_group_size, rank)
+    def left_right_halo_exchange(self, left_output_halo, right_output_halo, left_input_halo=None, right_input_halo=None):
+        if left_input_halo is None:
+            return right_output_halo, left_output_halo
+        else:
+            left_input_halo.copy_(right_output_halo)
+            right_input_halo.copy_(left_output_halo)
+class HaloExchangerAllGather(HaloExchanger):
+    def __init__(self, world_size, spatial_group_size, rank, comm):
+        super(HaloExchangerAllGather, self).__init__(spatial_group_size, rank)
+        self.spatial_group_size = spatial_group_size
+        self.local_rank = rank % spatial_group_size
+        self.comm = comm
+    def left_right_halo_exchange(self, left_output_halo, right_output_halo, left_input_halo=None, right_input_halo=None):
+        N,Hh,W,C = list(left_output_halo.shape)
+        send_halos = torch.empty((N,2*Hh,W,C),dtype=left_output_halo.dtype,device=left_output_halo.device)
+        send_halos[:,:Hh,:,:].copy_(left_output_halo)
+        send_halos[:,Hh:,:,:].copy_(right_output_halo)
+        all_halos = torch.empty((N,2*Hh*self.spatial_group_size,W,C),dtype=left_output_halo.dtype,device=left_output_halo.device)
+        all_halos = [all_halos[:,i*2*Hh:(i+1)*2*Hh,:,:] for i in range(self.spatial_group_size)]
+        torch.distributed.all_gather(all_halos,send_halos,group=self.comm,no_copy=True)
+        ag_left_input_halo = all_halos[(self.spatial_group_size+self.local_rank-1)%self.spatial_group_size][:,Hh:,:,:]
+        ag_right_input_halo = all_halos[(self.local_rank+1)%self.spatial_group_size][:,:Hh,:,:]
+        if left_input_halo is None:
+            if self.left_zero:
+                ag_left_input_halo.zero_()
+            if self.right_zero:
+                ag_right_input_halo.zero_()
+            return ag_left_input_halo, ag_right_input_halo
+        else:
+            if self.left_zero:
+                left_input_halo.zero_()
+            else:
+                left_input_halo.copy_(ag_left_input_halo)
+            if self.right_zero:
+                right_input_halo.zero_()
+            else:
+                right_input_halo.copy_(ag_right_input_halo)
+class HaloExchangerSendRecv(HaloExchanger):
+    def __init__(self, world_size, spatial_group_size, rank, comm):
+        super(HaloExchangerSendRecv, self).__init__(spatial_group_size, rank)
+        self.world_size = world_size
+        self.spatial_group_size = spatial_group_size
+        nccl_id = inc.get_unique_nccl_id(1).cuda()
+        torch.distributed.broadcast(nccl_id, 0)
+        nccl_id = nccl_id.cpu()
+        self.handle = inc.init_nccl_comm(nccl_id, rank, world_size)
+    def left_right_halo_exchange(self, left_output_halo, right_output_halo, left_input_halo=None, right_input_halo=None):
+        if left_input_halo is None:
+            left_input_halo, right_input_halo = inc.left_right_halo_exchange(self.handle, self.left_zero, self.right_zero, left_output_halo, right_output_halo, self.spatial_group_size)
+            return left_input_halo, right_input_halo
+        else:
+            inc.left_right_halo_exchange_inplace(self.handle, self.left_zero, self.right_zero, left_output_halo, right_output_halo, left_input_halo, right_input_halo, self.spatial_group_size)
+class HaloExchangerPeer(HaloExchanger):
+    def __init__(self, world_size, spatial_group_size, rank, comm, peer_pool, explicit_nhwc, numSM=1):
+        super(HaloExchangerPeer, self).__init__(spatial_group_size, rank)
+        self.diagnostics = False
+        self.spatial_group_size = spatial_group_size
+        self.peer_rank = rank % spatial_group_size
+        self.left_neighbor = (self.peer_rank + self.spatial_group_size - 1) % self.spatial_group_size
+        self.right_neighbor = (self.peer_rank + 1) % self.spatial_group_size
+        self.peer_pool = peer_pool
+        self.signals = peer_pool.allocate_peer_tensors([2,4], torch.int32, False, False)
+        self.signals[self.peer_rank].zero_()
+        self.explicit_nhwc = explicit_nhwc
+        self.numSM = numSM
+    def left_right_halo_exchange(self, left_output_halo, right_output_halo, left_input_halo=None, right_input_halo=None):
+        inplace = False if left_input_halo is None and right_input_halo is None else True
+        if not inplace:
+            left_input_halo = torch.empty_like(right_output_halo)
+            right_input_halo = torch.empty_like(left_output_halo)
+        channels_last = left_output_halo.is_contiguous(memory_format=torch.channels_last) and not self.explicit_nhwc
+        left_tx = self.peer_pool.allocate_peer_tensors(list(left_output_halo.shape), left_output_halo.dtype, channels_last, True)
+        right_tx = self.peer_pool.allocate_peer_tensors(list(right_output_halo.shape), right_output_halo.dtype, channels_last, True)
+        pm.push_pull_halos_1d(
+                self.diagnostics, self.explicit_nhwc, self.numSM,
+                left_output_halo,  left_tx[self.peer_rank],  right_tx[self.left_neighbor], left_input_halo,
+                right_output_halo, right_tx[self.peer_rank], left_tx[self.right_neighbor],  right_input_halo,
+                self.signals[self.left_neighbor], self.signals[self.right_neighbor], self.signals[self.peer_rank]
+                )
+        # TODO: Add to push_pull_halos_1d kernel
+        if self.left_zero:
+            left_input_halo.zero_()
+        if self.right_zero:
+            right_input_halo.zero_()
+        if not inplace:
+            return left_input_halo, right_input_halo
+# Class that combines input volume with halos from neighbors (1d).
+class HaloPadder:
+    def __init__(self, halo_ex):
+        self.halo_ex = halo_ex
+        self.stream1 = torch.cuda.Stream()
+        self.stream2 = torch.cuda.Stream()
+    def __call__(self, y, half_halo, explicit_nhwc, H_split):
+        channels_last = not explicit_nhwc and y.is_contiguous(memory_format=torch.channels_last)
+        if explicit_nhwc:
+            N,H,W,C = list(y.shape)
+            if H_split:
+                padded_shape = [N,H+2*half_halo,W,C]
+                ypad = torch.empty(shape=padded_shape, dtype=y.dtype, device=y.device, memory_format=torch.contiguous_format)
+                yleft = ypad[:,:half_halo,:,:]
+                ymid = ypad[:,half_halo:H+half_halo,:,:]
+                yright = ypad[:,H+half_halo:H+2*half_halo,:,:]
+                oleft = y[:,:half_halo,:,:]
+                oright = y[:,H-half_halo:,:,:]
+            else:
+                padded_shape = [N,H,W+2*half_halo,C]
+                ypad = torch.empty(shape=padded_shape, dtype=y.dtype, device=y.device, memory_format=torch.contiguous_format)
+                yleft = ypad[:,:,:half_halo,:]
+                ymid = ypad[:,:,half_halo:W+half_halo,:]
+                yright = ypad[:,:,W+half_halo:W+2*half_halo,:]
+                oleft = y[:,:,:half_halo,:]
+                oright = y[:,:,W-half_halo:,:]
+        else:
+            N,C,H,W = list(y.shape)
+            if H_split:
+                padded_shape = [N,C,H+2*half_halo,W]
+                ypad = torch.empty(shape=padded_shape, dtype=y.dtype, device=y.device, memory_format=torch.channels_last)
+                yleft = ypad[:,:,:half_halo,:]
+                ymid = ypad[:,:,half_halo:H+half_halo,:]
+                yright = ypad[:,:,H+half_halo:H+2*half_halo,:]
+                oleft = y[:,:,:half_halo,:]
+                oright = y[:,:,H-half_halo:,:]
+            else:
+                padded_shape = [N,C,H,W+2*half_halo]
+                ypad = torch.empty(shape=padded_shape, dtype=y.dtype, device=y.device, memory_format=torch.channels_last)
+                yleft = ypad[:,:,:,:half_halo]
+                ymid = ypad[:,:,:,half_halo:W+half_halo]
+                yright = ypad[:,:,:,W+half_halo:W+2*half_halo]
+                oleft = y[:,:,:,:half_halo]
+                oright = y[:,:,:,W-half_halo:]
+        with torch.cuda.stream(self.stream1):
+            self.halo_ex(oleft, oright, yleft, yright)
+        with torch.cuda.stream(self.stream2):
+            ymid.copy_(y)
+        return ypad
+    def wait(self):
+        current_stream = torch.cuda.current_stream()
+        current_stream.wait_stream(self.stream1)
+        current_stream.wait_stream(self.stream2)
--- a/apex/contrib/csrc/bottleneck/bottleneck.cpp
+++ b/apex/contrib/csrc/bottleneck/bottleneck.cpp
--- a/apex/contrib/csrc/nccl_p2p/nccl_p2p.cpp
+++ b/apex/contrib/csrc/nccl_p2p/nccl_p2p.cpp
+/**
+ * Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "nccl_p2p_cuda.cuh"
+// Python bindings for the functions declared in nccl_p2p_cuda.cuh.
+// Each function is exported under the same name it has in apex::contrib::nccl_p2p.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("get_unique_nccl_id", &apex::contrib::nccl_p2p::get_unique_nccl_id, "get_unique_nccl_id");
+  m.def("init_nccl_comm", &apex::contrib::nccl_p2p::init_nccl_comm, "init_nccl_comm");
+  m.def("nccl_send", &apex::contrib::nccl_p2p::nccl_send, "nccl_send");
+  m.def("nccl_recv", &apex::contrib::nccl_p2p::nccl_recv, "nccl_recv");
+  m.def("left_right_halo_exchange_inplace", &apex::contrib::nccl_p2p::left_right_halo_exchange_inplace, "left_right_halo_exchange_inplace");
+  m.def("left_right_halo_exchange", &apex::contrib::nccl_p2p::left_right_halo_exchange, "left_right_halo_exchange");
+  m.def("add_delay", &apex::contrib::nccl_p2p::add_delay, "add_delay");
+}
--- a/apex/contrib/csrc/nccl_p2p/nccl_p2p_cuda.cu
+++ b/apex/contrib/csrc/nccl_p2p/nccl_p2p_cuda.cu
+#include <torch/extension.h>
+#include <c10/cuda/CUDACachingAllocator.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <cassert>
+#include <cstdio>
+#include <cstring>
+#include <ctime>
+#include <list>
+#include <memory>
+#include <type_traits>
+#include <vector>
+#include "nccl.h"
+/*
+ * This file implements a crude but effective mechanism for copying data between tensors owned by different ranks
+ * on the same machine using cudaMemcpyAsync peer-to-peer transfers.
+ */
+namespace {
+// Busy-wait kernel used to insert an artificial delay on a stream.
+// Only thread 0 of block 0 does any work: it spins until the computed elapsed
+// value reaches `delay`, then stores the number of loop iterations in *counter.
+// NOTE(review): elapsed is scaled by 1e9 / CLOCKS_PER_SEC, so `delay` is presumably
+// meant to be in nanoseconds; whether device-side clock() ticks match CLOCKS_PER_SEC
+// is not established here -- confirm before relying on the absolute duration.
+__global__ void AddDelay_kernel(const int delay, int* counter) {
+    if (blockIdx.x == 0 && threadIdx.x == 0) {
+        // waste time while doing something compiler can't predict, thus preventing it from optimizing away this code.
+        int new_counter = 0;
+        double elapsed = 0;
+        clock_t start = clock();
+        do {
+            clock_t now = clock();
+            elapsed = (double)(now - start)*1e9 / CLOCKS_PER_SEC;
+            ++new_counter;
+        } while (elapsed < (double)delay);
+        // Writing the iteration count gives the loop an observable result.
+        *counter = new_counter;
+    }
+}
+// Owns one NCCL communicator and implements the point-to-point operations exposed by this extension.
+// Copies of a wrapper share the same underlying communicator, which is destroyed exactly once,
+// when the last copy goes away. All operations are enqueued on the current CUDA stream.
+// NOTE(review): tensors are passed to NCCL as data_ptr() + numel(), which assumes they are
+// densely packed in memory -- this is not checked here.
+class NcclCommWrapper
+{
+    private:
+        // Shared ownership keeps wrapper copies (e.g. inside std::vector) from destroying a live communicator.
+        std::shared_ptr<std::remove_pointer<ncclComm_t>::type> comm;
+        int rank, world_size;
+        // Throws when a NCCL call did not return ncclSuccess.
+        static void check_nccl(ncclResult_t result, const char* what)
+        {
+            TORCH_CHECK(result == ncclSuccess, what, " failed: ", ncclGetErrorString(result));
+        }
+        // Maps the tensor dtype to the matching NCCL data type; throws for unsupported dtypes.
+        ncclDataType_t get_nccl_type(at::Tensor input)
+        {
+            switch (input.scalar_type())
+            {
+                case at::ScalarType::Half:
+                    return ncclFloat16;
+                case at::ScalarType::Float:
+                    return ncclFloat32;
+                case at::ScalarType::Double:
+                    return ncclFloat64;
+                case at::ScalarType::Byte:
+                    return ncclUint8;
+                case at::ScalarType::Bool:
+                    // Bool tensors store one byte per element; they are only copied, never reduced.
+                    return ncclUint8;
+                case at::ScalarType::Char:
+                    return ncclInt8;
+                case at::ScalarType::Int:
+                    return ncclInt32;
+                case at::ScalarType::Long:
+                    return ncclInt64;
+                case at::ScalarType::BFloat16:
+                    return ncclBfloat16;
+                default:
+                    break;
+            }
+            TORCH_CHECK(false, "nccl_p2p: unsupported tensor dtype ", input.scalar_type());
+            return ncclUint8;  // not reached
+        }
+    public:
+        // Creates a wrapper without a communicator.
+        NcclCommWrapper()
+            : rank(0), world_size(0)
+        {
+        }
+        // Joins the communicator identified by id as rank my_rank out of num_ranks.
+        NcclCommWrapper(ncclUniqueId id, int my_rank, int num_ranks)
+            : rank(my_rank), world_size(num_ranks)
+        {
+            ncclComm_t raw_comm = nullptr;
+            check_nccl(ncclCommInitRank(&raw_comm, num_ranks, id, my_rank), "ncclCommInitRank");
+            comm.reset(raw_comm, [](ncclComm_t c) { if (c != nullptr) ncclCommDestroy(c); });
+        }
+        // Sends every element of input to rank destination.
+        void send(at::Tensor input, int destination)
+        {
+            ncclDataType_t ncclType = get_nccl_type(input);
+            // NCCL counts are numbers of elements of ncclType, not bytes.
+            size_t count = input.numel();
+            check_nccl(ncclSend(input.data_ptr(), count, ncclType, destination, comm.get(), at::cuda::getCurrentCUDAStream()), "ncclSend");
+        }
+        // Receives input.numel() elements from rank sender into input.
+        void recv(at::Tensor input, int sender)
+        {
+            ncclDataType_t ncclType = get_nccl_type(input);
+            // NCCL counts are numbers of elements of ncclType, not bytes.
+            size_t count = input.numel();
+            check_nccl(ncclRecv(input.data_ptr(), count, ncclType, sender, comm.get(), at::cuda::getCurrentCUDAStream()), "ncclRecv");
+        }
+        // Exchanges halos with the previous and next rank of this rank's group of group_size ranks,
+        // writing the received data into left_input_halo / right_input_halo.
+        // The first rank of a group does not exchange to its left and the last rank does not exchange
+        // to its right. left_zero / right_zero request zeroing of the corresponding input halo afterwards.
+        void left_right_halo_exchange_inplace(bool left_zero, bool right_zero, at::Tensor left_output_halo, at::Tensor right_output_halo, at::Tensor left_input_halo, at::Tensor right_input_halo, int group_size)
+        {
+            auto stream = at::cuda::getCurrentCUDAStream();
+            // Everything that can throw is done before the NCCL group is opened.
+            ncclDataType_t ncclType = get_nccl_type(left_output_halo);
+            auto dtype = left_output_halo.scalar_type();
+            TORCH_CHECK(right_output_halo.scalar_type() == dtype
+                        && left_input_halo.scalar_type() == dtype
+                        && right_input_halo.scalar_type() == dtype,
+                        "left_right_halo_exchange: all halo tensors must have the same dtype");
+            int group_rank = rank % group_size;
+            int group_index = rank / group_size;
+            int prev_rank = (group_rank + group_size - 1) % group_size;
+            int next_rank = (group_rank + 1) % group_size;
+            prev_rank = prev_rank + group_index * group_size;
+            next_rank = next_rank + group_index * group_size;
+            size_t left_n = torch::numel(left_output_halo);
+            size_t right_n = torch::numel(right_output_halo);
+            // Remember the first failure but always reach ncclGroupEnd so the group is never left open.
+            ncclResult_t first_error = ncclSuccess;
+            auto track = [&first_error](ncclResult_t result) {
+                if (result != ncclSuccess && first_error == ncclSuccess) first_error = result;
+            };
+            track(ncclGroupStart());
+            if (group_rank > 0) {
+                // send left (to my_rank - 1)
+                track(ncclSend(left_output_halo.data_ptr(), left_n, ncclType, prev_rank, comm.get(), stream));
+                // receive left (from my_rank - 1); the neighbor sends its right output halo
+                track(ncclRecv(left_input_halo.data_ptr(), right_n, ncclType, prev_rank, comm.get(), stream));
+            }
+            if (group_rank < group_size-1) {
+                // send right (to my_rank + 1)
+                track(ncclSend(right_output_halo.data_ptr(), right_n, ncclType, next_rank, comm.get(), stream));
+                // receive right (from my_rank + 1); the neighbor sends its left output halo
+                track(ncclRecv(right_input_halo.data_ptr(), left_n, ncclType, next_rank, comm.get(), stream));
+            }
+            track(ncclGroupEnd());
+            check_nccl(first_error, "left_right_halo_exchange");
+            if (left_zero) left_input_halo.zero_();
+            if (right_zero) right_input_halo.zero_();
+        }
+        // Allocating variant of left_right_halo_exchange_inplace; returns {left_input_halo, right_input_halo}.
+        std::vector<at::Tensor> left_right_halo_exchange(bool left_zero, bool right_zero, at::Tensor left_output_halo, at::Tensor right_output_halo, int group_size)
+        {
+            // after halo exchange:
+            // left_output_halo of rank+1 ends up in right_input_halo of rank
+            // right_output_halo of rank-1 ends up in left_input_halo of rank
+            auto right_input_halo = torch::empty_like(left_output_halo);
+            auto left_input_halo = torch::empty_like(right_output_halo);
+            left_right_halo_exchange_inplace(left_zero, right_zero, left_output_halo, right_output_halo, left_input_halo, right_input_halo, group_size);
+            return {left_input_halo, right_input_halo};
+        }
+};
+// Communicators created by init_nccl_comm(); the integer handle it returns is the index into this vector.
+std::vector<NcclCommWrapper> nccl_comms;
+} // end anonymous namespace
+namespace apex { namespace contrib { namespace nccl_p2p {
+// Generates n NCCL unique ids and returns them packed back to back in a CPU uint8 tensor
+// of n*sizeof(ncclUniqueId) bytes.
+at::Tensor get_unique_nccl_id(int n)
+{
+    TORCH_CHECK(n >= 0, "get_unique_nccl_id: n must be non-negative, got ", n);
+    auto id_tensor = torch::empty({n*(int)sizeof(ncclUniqueId)}, torch::dtype(torch::kUInt8).device(torch::kCPU).requires_grad(false));
+    auto id_ptr = id_tensor.data_ptr<uint8_t>();
+    size_t offset = 0;
+    for (int i = 0;  i < n;  ++i)
+    {
+        ncclUniqueId id;
+        ncclResult_t result = ncclGetUniqueId(&id);
+        TORCH_CHECK(result == ncclSuccess, "ncclGetUniqueId failed: ", ncclGetErrorString(result));
+        memcpy(id_ptr+offset, &id, sizeof(ncclUniqueId));
+        offset += sizeof(ncclUniqueId);
+    }
+    return id_tensor;
+}
+// Creates a NCCL communicator from the first unique id stored in unique_nccl_id and
+// registers it in nccl_comms. Returns the handle to pass to the other functions of this module.
+// unique_nccl_id must be a contiguous CPU uint8 tensor holding at least sizeof(ncclUniqueId) bytes.
+int init_nccl_comm(at::Tensor unique_nccl_id, int my_rank, int num_ranks)
+{
+    TORCH_CHECK(unique_nccl_id.device().is_cpu(), "init_nccl_comm: unique_nccl_id must be a CPU tensor");
+    TORCH_CHECK(unique_nccl_id.is_contiguous(), "init_nccl_comm: unique_nccl_id must be contiguous");
+    TORCH_CHECK(unique_nccl_id.numel() >= (int64_t)sizeof(ncclUniqueId), "init_nccl_comm: unique_nccl_id is too small to hold a ncclUniqueId");
+    ncclUniqueId id;
+    // data_ptr<uint8_t>() also verifies the tensor dtype.
+    auto unique_nccl_id_ptr = unique_nccl_id.data_ptr<uint8_t>();
+    memcpy(&id, unique_nccl_id_ptr, sizeof(ncclUniqueId));
+    int handle = static_cast<int>(nccl_comms.size());
+    // Construct the wrapper directly inside the registry instead of leaking a heap-allocated temporary.
+    nccl_comms.emplace_back(id, my_rank, num_ranks);
+    return handle;
+}
+// Sends input to rank destination using the communicator identified by handle.
+void nccl_send(int handle, at::Tensor input, int destination)
+{
+    TORCH_CHECK(handle >= 0 && static_cast<size_t>(handle) < nccl_comms.size(), "nccl_send: invalid communicator handle ", handle);
+    // Use the registered wrapper by reference; a by-value copy would run the wrapper's destructor on return.
+    NcclCommWrapper& communicator = nccl_comms[handle];
+    communicator.send(input, destination);
+}
+// Receives into input from rank sender using the communicator identified by handle.
+void nccl_recv(int handle, at::Tensor input, int sender)
+{
+    TORCH_CHECK(handle >= 0 && static_cast<size_t>(handle) < nccl_comms.size(), "nccl_recv: invalid communicator handle ", handle);
+    // Use the registered wrapper by reference; a by-value copy would run the wrapper's destructor on return.
+    NcclCommWrapper& communicator = nccl_comms[handle];
+    communicator.recv(input, sender);
+}
+// Exchanges halos with the neighboring ranks using the communicator identified by handle,
+// writing the received halos into left_input_halo and right_input_halo.
+void left_right_halo_exchange_inplace(int handle, bool left_zero, bool right_zero, at::Tensor left_output_halo, at::Tensor right_output_halo, at::Tensor left_input_halo, at::Tensor right_input_halo, int group_size)
+{
+    // Checked in every build type; a plain assert disappears when NDEBUG is defined.
+    TORCH_CHECK(handle >= 0 && static_cast<size_t>(handle) < nccl_comms.size(), "left_right_halo_exchange_inplace: invalid communicator handle ", handle);
+    NcclCommWrapper& communicator = nccl_comms[handle];
+    communicator.left_right_halo_exchange_inplace(left_zero, right_zero, left_output_halo, right_output_halo, left_input_halo, right_input_halo, group_size);
+}
+// Exchanges halos with the neighboring ranks using the communicator identified by handle.
+// Returns {left_input_halo, right_input_halo}, both newly allocated.
+std::vector<at::Tensor> left_right_halo_exchange(int handle, bool left_zero, bool right_zero, at::Tensor left_output_halo, at::Tensor right_output_halo, int group_size)
+{
+    // Checked in every build type; a plain assert disappears when NDEBUG is defined.
+    TORCH_CHECK(handle >= 0 && static_cast<size_t>(handle) < nccl_comms.size(), "left_right_halo_exchange: invalid communicator handle ", handle);
+    NcclCommWrapper& communicator = nccl_comms[handle];
+    return communicator.left_right_halo_exchange(left_zero, right_zero, left_output_halo, right_output_halo, group_size);
+}
+// Launches AddDelay_kernel with a single thread on the current CUDA stream.
+// The kernel busy-waits based on `delay`, holding up later work queued on that stream.
+void add_delay(int delay)
+{
+    auto stream = at::cuda::getCurrentCUDAStream();
+    // One-element scratch tensor that receives the kernel's loop counter; its value is not read back.
+    auto t = torch::empty({1}, torch::dtype(torch::kInt32).device(torch::kCUDA));
+    AddDelay_kernel<<<1,1,0,stream>>>(delay, t.data_ptr<int>());
+}
+}}}
--- a/apex/contrib/csrc/nccl_p2p/nccl_p2p_cuda.cuh
+++ b/apex/contrib/csrc/nccl_p2p/nccl_p2p_cuda.cuh
+/**
+ * Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include <torch/extension.h>
+#ifndef _nccl_p2p_h_
+#define _nccl_p2p_h_
+// Entry points of the NCCL point-to-point extension.
+// Communicators are referred to by the integer handle returned from init_nccl_comm().
+namespace apex { namespace contrib { namespace nccl_p2p {
+// Returns a CPU uint8 tensor holding n NCCL unique ids packed back to back.
+at::Tensor get_unique_nccl_id(int n);
+// Creates a communicator from the unique id stored in unique_nccl_id and returns its handle.
+int init_nccl_comm(
+        at::Tensor unique_nccl_id, 
+        int my_rank, 
+        int num_ranks
+        );
+// Sends input to rank destination.
+void nccl_send(
+        int handle, 
+        at::Tensor input, 
+        int destination
+        );
+// Receives into input from rank sender.
+void nccl_recv(
+        int handle, 
+        at::Tensor input, 
+        int sender
+        );
+// Exchanges halos with the neighboring ranks inside a group of group_size ranks,
+// writing the received data into left_input_halo and right_input_halo.
+// left_zero / right_zero request that the corresponding input halo is zeroed afterwards.
+void left_right_halo_exchange_inplace(
+        int handle,
+	bool left_zero,
+	bool right_zero,
+	at::Tensor left_output_halo,
+	at::Tensor right_output_halo,
+	at::Tensor left_input_halo,
+	at::Tensor right_input_halo,
+	int group_size);
+// Same exchange, but the input halos are allocated by the call and
+// returned as {left_input_halo, right_input_halo}.
+std::vector<at::Tensor> left_right_halo_exchange(
+        int handle,
+	bool left_zero,
+	bool right_zero,
+        at::Tensor left_output_halo, 
+        at::Tensor right_output_halo,
+        int group_size
+        );
+// Enqueues a busy-wait kernel on the current CUDA stream.
+void add_delay(int delay);
+}}}
+#endif
--- a/apex/contrib/csrc/peer_memory/peer_memory.cpp
+++ b/apex/contrib/csrc/peer_memory/peer_memory.cpp
+/**
+ * Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "peer_memory_cuda.cuh"
+// Python bindings for the functions declared in peer_memory_cuda.cuh.
+// Each function is exported under the same name it has in apex::contrib::peer_memory.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+    m.def("allocate_raw", &apex::contrib::peer_memory::allocate_raw, "allocate_raw");
+    m.def("free_raw", &apex::contrib::peer_memory::free_raw, "free_raw");
+    m.def("zero", &apex::contrib::peer_memory::zero, "zero");
+    m.def("get_raw_ipc_address", &apex::contrib::peer_memory::get_raw_ipc_address, "get_raw_ipc_address");
+    m.def("get_raw_peers", &apex::contrib::peer_memory::get_raw_peers, "get_raw_peers");
+    m.def("blob_view_half", &apex::contrib::peer_memory::blob_view_half, "blob_view_half");
+    m.def("blob_view_float", &apex::contrib::peer_memory::blob_view_float, "blob_view_float");
+    m.def("blob_view_int", &apex::contrib::peer_memory::blob_view_int, "blob_view_int");
+    m.def("push_pull_halos_1d", &apex::contrib::peer_memory::push_pull_halos_1d, "push_pull_halos_1d");
+}
--- a/apex/contrib/csrc/peer_memory/peer_memory_cuda.cu
+++ b/apex/contrib/csrc/peer_memory/peer_memory_cuda.cu
--- a/apex/contrib/csrc/peer_memory/peer_memory_cuda.cuh
+++ b/apex/contrib/csrc/peer_memory/peer_memory_cuda.cuh
+/**
+ * Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include <torch/extension.h>
+#ifndef _peer_memory_h_
+#define _peer_memory_h_ 
+// Entry points of the peer memory extension.
+// Raw allocations are passed around as int64_t values.
+// NOTE(review): the implementations are not visible alongside this header; the per-function
+// notes below are inferred from the signatures only -- confirm against peer_memory_cuda.cu.
+namespace apex { namespace contrib { namespace peer_memory {
+    // Raw allocation management (presumably `raw` is a device pointer stored as an integer).
+    int64_t allocate_raw(int64_t size);
+    void free_raw(int64_t raw);
+    void zero(int64_t raw, int64_t size);
+    // Sharing of raw allocations between ranks (presumably via CUDA IPC handles).
+    at::Tensor get_raw_ipc_address(int64_t raw);
+    std::vector<int64_t> get_raw_peers(at::Tensor ipc_addresses, int peer_rank, int64_t raw);
+    // Tensor views of the given shape over a raw allocation, one function per element type.
+    at::Tensor blob_view_half(int64_t raw, std::vector<int64_t> shape, bool channels_last);
+    at::Tensor blob_view_float(int64_t raw, std::vector<int64_t> shape, bool channels_last);
+    at::Tensor blob_view_int(int64_t raw, std::vector<int64_t> shape, bool channels_last);
+    // 1D halo exchange through peer memory transfer buffers, synchronized with signal tensors.
+    void push_pull_halos_1d(
+        bool diagnostics,
+        bool explicit_nhwc,
+        int numSM,                      // number of SMs to use
+        at::Tensor top_out_halo,        // top output halo in sender device memory
+        at::Tensor top_out_tx,          // top output transfer buffer in sender peer pool memory
+	at::Tensor top_inp_tx,		// top input transfer buffer in top neighbor peer pool memory
+        at::Tensor top_inp_halo,        // top input halo in receiver device memory
+        at::Tensor btm_out_halo,        // btm output halo in sender device memory
+        at::Tensor btm_out_tx,          // btm output transfer buffer in sender peer pool memory
+	at::Tensor btm_inp_tx,		// btm input transfer buffer in btm neighbor peer pool memory
+        at::Tensor btm_inp_halo,        // btm input halo in receiver device memory
+        at::Tensor top_signal,          // top input signal in receiver device memory
+        at::Tensor btm_signal,          // btm input signal in receiver device memory
+        at::Tensor waits                // top and btm signals for this rank
+        );
+} } }
+#endif
--- a/apex/contrib/peer_memory/__init__.py
+++ b/apex/contrib/peer_memory/__init__.py
+from .peer_memory import PeerMemoryPool
+from .peer_halo_exchanger_1d import PeerHaloExchanger1d
--- a/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py
+++ b/apex/contrib/peer_memory/peer_halo_exchange_module_tests.py
+import torch
+from apex.contrib.peer_memory import PeerMemoryPool, PeerHaloExchanger1d
+import peer_memory_cuda as pm
+# How to run:
+# torchrun --nproc_per_node <num-GPU> <this-python-prog>
+# <num-GPU> must be a power of 2 greater than 1.
+# Output of this function is used as ground truth in module tests.
+def nccl_halo_ex(peer_rank, peer_group_size, y, half_halo, explicit_nhwc, H_split):
+    if explicit_nhwc:
+        if H_split:
+            _, Hp, _, _ = list(y.shape)
+            H = Hp - 2*half_halo
+            top_out_halo = y[:,half_halo:2*half_halo,:,:]
+            top_inp_halo = y[:,:half_halo,:,:]
+            btm_out_halo = y[:,H:H+half_halo,:,:]
+            btm_inp_halo = y[:,H+half_halo:H+2*half_halo,:,:]
+        else:
+            _, _, Wp, _ = list(y.shape)
+            W = Wp - 2*half_halo
+            top_out_halo = y[:,:,half_halo:2*half_halo,:]
+            top_inp_halo = y[:,:,:half_halo,:]
+            btm_out_halo = y[:,:,W:W+half_halo,:]
+            btm_inp_halo = y[:,:,W+half_halo:W+2*half_halo,:]
+    else:
+        if H_split:
+            _, _, Hp, _ = list(y.shape)
+            H = Hp - 2*half_halo
+            top_out_halo = y[:,:,half_halo:2*half_halo,:]
+            top_inp_halo = y[:,:,:half_halo,:]
+            btm_out_halo = y[:,:,H:H+half_halo,:]
+            btm_inp_halo = y[:,:,H+half_halo:H+2*half_halo,:]
+        else:
+            _, _, _, Wp = list(y.shape)
+            W = Wp - 2*half_halo
+            top_out_halo = y[:,:,:,half_halo:2*half_halo]
+            top_inp_halo = y[:,:,:,:half_halo]
+            btm_out_halo = y[:,:,:,W:W+half_halo]
+            btm_inp_halo = y[:,:,:,W+half_halo:W+2*half_halo]
+    top_out_halo = top_out_halo.clone(memory_format=torch.preserve_format)
+    btm_out_halo = btm_out_halo.clone(memory_format=torch.preserve_format)
+    top_inp_halos = [torch.empty_like(top_out_halo) for _ in range(peer_group_size)]
+    torch.distributed.all_gather(top_inp_halos, top_out_halo)
+    btm_inp_halos = [torch.empty_like(btm_out_halo) for _ in range(peer_group_size)]
+    torch.distributed.all_gather(btm_inp_halos, btm_out_halo)
+    top_rank = (peer_rank + peer_group_size - 1) % peer_group_size
+    btm_rank = (peer_rank + 1) % peer_group_size
+    top_inp_halo.copy_(btm_inp_halos[top_rank])
+    btm_inp_halo.copy_(top_inp_halos[btm_rank])
+def single_test(peer_rank, peer_group_size, halo_ex, C, H, W, half_halo, dtype, memory_format, H_split, num_steps, numSM=1):
+    """Check the peer-memory halo exchange against the reference exchange.
+    A random tensor, padded by ``half_halo`` on both sides of the split
+    dimension, is pushed through ``halo_ex`` and through ``nccl_halo_ex``
+    (defined elsewhere in this file) ``num_steps`` times each, always
+    starting from the same data.  Every step must match bit for bit.
+    The mismatch count is summed over all ranks, so a failure on any rank
+    is reported by the rank-0 printout instead of being silently dropped.
+    Args:
+        peer_rank: rank of this process inside its peer group.
+        peer_group_size: number of ranks in the peer group.
+        halo_ex: callable exchanger under test; must expose ``peer_pool``.
+        C, H, W: channel count and unpadded spatial extent of the tensor.
+        half_halo: halo width added on each side of the split dimension.
+        dtype: torch dtype of the test tensor.
+        memory_format: 1 = explicit nhwc, 2 = native nhwc (channels_last), 3 = nchw.
+        H_split: True to split along H, False to split along W.
+        num_steps: number of repetitions of each exchange.
+        numSM: forwarded unchanged to ``halo_ex``.
+    """
+    explicit_nhwc = memory_format == 1
+    pad_h = 2*half_halo if H_split else 0
+    pad_w = 0 if H_split else 2*half_halo
+    if explicit_nhwc:
+        shape = [1, H+pad_h, W+pad_w, C]
+    else:
+        shape = [1, C, H+pad_h, W+pad_w]
+    y = torch.randn(shape, dtype=dtype, device='cuda')
+    if memory_format == 2:
+        y = y.to(memory_format=torch.channels_last)
+    # pristine copy used to restore the input after every step
+    y3 = y.clone()
+    list_y = []
+    for _ in range(num_steps):
+        halo_ex(y, H_split, explicit_nhwc, numSM)
+        list_y.append(y.clone())
+        y.copy_(y3)
+        # give the dynamic peer buffers back so the next step can reuse them
+        halo_ex.peer_pool.reset()
+        torch.distributed.barrier()
+    y2 = y3.clone()
+    list_y2 = []
+    for _ in range(num_steps):
+        nccl_halo_ex(peer_rank, peer_group_size, y2, half_halo, explicit_nhwc, H_split)
+        list_y2.append(y2.clone())
+        y2.copy_(y3)
+    # count local mismatches, then sum over every rank so that the verdict is global
+    num_bad = sum(1 for yy, yy2 in zip(list_y, list_y2) if not torch.equal(yy, yy2))
+    bad = torch.tensor([num_bad], dtype=torch.int32, device='cuda')
+    torch.distributed.all_reduce(bad)
+    is_equal = bad.item() == 0
+    if peer_rank == 0:
+        memory_format_str = {1: "explicit_nhwc", 2: "native nhwc", 3: "nchw"}.get(memory_format, "???")
+        verdict = "SUCCESS" if is_equal else "FAILURE"
+        print("%s : N,C,H,W = 1,%d,%d,%d, half_halo=%d, %s, %s, %s" % (verdict,C,H,W,half_halo,str(dtype),memory_format_str,"H-split" if H_split else "W-split"))
+    # peer memory flag sync relies on there being at least one barrier per step
+    torch.distributed.barrier()
+def H_split_tests(N, C, H, W, half_halo, rank, world_size, halo_ex, num_steps):
+    """Run the H-split halo exchange checks over a sweep of tensor shapes.
+    The per-rank height is ceil(H / (8*world_size)) * 8.  For each divisor in
+    1, 2, 4, 8 the channel count is multiplied and both spatial sizes are
+    divided by it, and all three memory formats (1, 2, 3) are tested in
+    float16.  ``N`` is accepted but not used.
+    """
+    rows_per_rank = 8 * ((H - 1) // (8*world_size) + 1)
+    for div in (1, 2, 4, 8):
+        for memory_format in (1, 2, 3):
+            single_test(rank, world_size, halo_ex, C*div, rows_per_rank//div, W//div, half_halo, torch.float16, memory_format, True, num_steps)
+def W_split_tests(N, C, H, W, half_halo, rank, world_size, halo_ex, num_steps):
+    """Run the W-split halo exchange checks over a sweep of tensor shapes.
+    The per-rank width is ceil(W / (8*world_size)) * 8.  For each divisor in
+    1, 2, 4, 8 the channel count is multiplied and both spatial sizes are
+    divided by it, and all three memory formats (1, 2, 3) are tested in
+    float16.  ``N`` is accepted but not used.
+    """
+    cols_per_rank = 8 * ((W - 1) // (8*world_size) + 1)
+    for div in (1, 2, 4, 8):
+        for memory_format in (1, 2, 3):
+            single_test(rank, world_size, halo_ex, C*div, H//div, cols_per_rank//div, half_halo, torch.float16, memory_format, False, num_steps)
+def main():
+    """Set up NCCL, a peer memory pool and a 1d halo exchanger, then run both test sweeps."""
+    # for this trivial example peer_rank == rank and peer_group_size == world_size
+    torch.distributed.init_process_group("nccl")
+    world_size = torch.distributed.get_world_size()
+    rank = torch.distributed.get_rank()
+    torch.cuda.set_device(rank)
+    half_halo = 1
+    num_steps = 100
+    # pool sizes in bytes: static region, then dynamic region
+    static_bytes = 64*1024
+    dynamic_bytes = 2*1024*1024
+    pool = PeerMemoryPool(rank, world_size, world_size, static_bytes, dynamic_bytes)
+    halo_ex = PeerHaloExchanger1d(rank, world_size, pool, half_halo)
+    N, C = 1, 64
+    H_split_tests(N, C, 336, 200, half_halo, rank, world_size, halo_ex, num_steps)
+    W_split_tests(N, C, 200, 336, half_halo, rank, world_size, halo_ex, num_steps)
+if __name__ == "__main__":
+    main()
--- a/apex/contrib/peer_memory/peer_halo_exchanger_1d.py
+++ b/apex/contrib/peer_memory/peer_halo_exchanger_1d.py
+import torch
+from apex.contrib.peer_memory import PeerMemoryPool
+import peer_memory_cuda as pm
+class PeerHaloExchanger1d:
+    """Callable that exchanges halos along one dimension through peer memory.
+    The constructor reserves a [2,4] int32 signal tensor per peer from the
+    static part of ``peer_pool`` and zeroes this rank's copy.  Each call
+    reserves two transfer buffers from the dynamic part of the pool and
+    passes them, together with the halo slices of ``y``, to
+    ``pm.push_pull_halos_1d``.
+    """
+    def __init__(self, rank, peer_group_size, peer_pool, half_halo):
+        self.peer_group_size = peer_group_size
+        self.peer_rank = rank % peer_group_size
+        self.peer_pool = peer_pool
+        self.half_halo = half_halo
+        # one signal tensor per peer, taken from the static region (dynamic=False)
+        self.signals = peer_pool.allocate_peer_tensors([2,4], torch.int32, False, False)
+        self.signals[self.peer_rank].zero_()
+    def __call__(self, y, H_split=True, explicit_nhwc=False, numSM=1, diagnostics=False):
+        """Exchange the halos of the 4d tensor ``y`` with both neighbors.
+        Args:
+            y: tensor padded by ``half_halo`` on both sides of the split dimension.
+            H_split: True if the split dimension is H, False if it is W.
+            explicit_nhwc: True if the dimensions of ``y`` are ordered N,H,W,C.
+            numSM, diagnostics: forwarded unchanged to ``pm.push_pull_halos_1d``.
+        """
+        hh = self.half_halo
+        # always False for explicit nhwc, matching the layout flag given to the pool
+        channels_last = y.is_contiguous(memory_format=torch.channels_last) and not explicit_nhwc
+        # position of the split dimension: H is dim 1 (nhwc) or 2, W is dim 2 (nhwc) or 3
+        if explicit_nhwc:
+            split_dim = 1 if H_split else 2
+        else:
+            split_dim = 2 if H_split else 3
+        # unpacking into four names keeps the requirement that y is 4d
+        d0, d1, d2, d3 = list(y.shape)
+        interior = (d0, d1, d2, d3)[split_dim] - 2*hh
+        def halo(start):
+            # view of y covering [start, start+hh) along the split dimension
+            index = [slice(None)] * 4
+            index[split_dim] = slice(start, start + hh)
+            return y[tuple(index)]
+        pool = self.peer_pool
+        # the pool is stateful, so the top buffer must be reserved before the bottom one
+        top_out_halo = halo(hh)
+        top_tx = pool.allocate_peer_tensors(list(top_out_halo.shape), top_out_halo.dtype, channels_last, True)
+        top_inp_halo = halo(0)
+        btm_out_halo = halo(interior)
+        btm_tx = pool.allocate_peer_tensors(list(btm_out_halo.shape), btm_out_halo.dtype, channels_last, True)
+        btm_inp_halo = halo(interior + hh)
+        # neighbors wrap around the peer group
+        me = self.peer_rank
+        top_neighbor = (me + self.peer_group_size - 1) % self.peer_group_size
+        btm_neighbor = (me + 1) % self.peer_group_size
+        pm.push_pull_halos_1d(
+                diagnostics, explicit_nhwc, numSM,
+                top_out_halo, top_tx[me], btm_tx[top_neighbor], top_inp_halo,
+                btm_out_halo, btm_tx[me], top_tx[btm_neighbor], btm_inp_halo,
+                self.signals[top_neighbor], self.signals[btm_neighbor], self.signals[me]
+                )
--- a/apex/contrib/peer_memory/peer_memory.py
+++ b/apex/contrib/peer_memory/peer_memory.py
+import torch
+import numpy as np
+import peer_memory_cuda as pm
+class PeerMemoryPool(object):
+    """Block of device memory shared with the other ranks of a peer group.
+    The block is ``static_size + dynamic_size`` bytes (each rounded up to the
+    256 byte alignment).  The static region comes first and is never rewound;
+    the dynamic region follows it and is rewound by ``reset()``.
+    Args:
+        rank: global rank of this process.
+        world_size: number of processes taking part in the all_gather.
+        peer_group_size: number of consecutive ranks forming one peer group.
+        static_size: requested size in bytes of the static region.
+        dynamic_size: requested size in bytes of the dynamic region.
+    """
+    def __init__(self, rank, world_size, peer_group_size, static_size, dynamic_size):
+        self.peer_group = rank // peer_group_size
+        self.peer_rank = rank % peer_group_size
+        self.peer_group_size = peer_group_size
+        self.alignment = 256
+        self.static_size = self._align_up(static_size)
+        self.dynamic_size = self._align_up(dynamic_size)
+        # allocate giant pool of device memory
+        self.raw = pm.allocate_raw(self.static_size+self.dynamic_size)
+        # exchange peer pointers with nccl
+        raw_ipc = pm.get_raw_ipc_address(self.raw).cuda()
+        peer_raw_ipcs = [torch.empty_like(raw_ipc) for _ in range(world_size)]
+        torch.distributed.all_gather(peer_raw_ipcs, raw_ipc)
+        peer_raw_ipcs = torch.stack(peer_raw_ipcs).cpu()
+        # Keep only the handles of this rank's peer group: peer_rank is an index
+        # inside the group, so handing over the handles of the whole world would
+        # pick the wrong rows whenever peer_group_size < world_size.
+        # NOTE(review): assumes pm.get_raw_peers expects one handle per group
+        # member, indexed by peer_rank - confirm against the extension.
+        group_base = self.peer_group * peer_group_size
+        group_ipcs = peer_raw_ipcs[group_base:group_base + peer_group_size]
+        self.peer_raw = pm.get_raw_peers(group_ipcs, self.peer_rank, self.raw)
+        self.static_offset = 0
+        self.dynamic_offset = 0
+    def __del__(self):
+        # raw is missing when __init__ failed before the allocation succeeded
+        raw = getattr(self, "raw", None)
+        if raw is not None:
+            pm.free_raw(raw)
+    def _align_up(self, nbytes):
+        # round nbytes up to the next multiple of self.alignment
+        return ((nbytes + self.alignment - 1) // self.alignment) * self.alignment
+    def reset(self):
+        """Rewind the dynamic region; the static region is left untouched."""
+        self.dynamic_offset = 0
+    def allocate_peer_tensors(self, shape, dtype, channels_last, dynamic):
+        """Reserve one tensor in the pool and return a view of it for every peer.
+        Args:
+            shape: list with the tensor dimensions.
+            dtype: torch.float16, torch.float32 or torch.int32.
+            channels_last: layout flag forwarded to the blob view function.
+            dynamic: True to take memory from the dynamic region, False for static.
+        Returns:
+            A list with one entry per element of ``self.peer_raw``.
+        Raises:
+            AssertionError: unsupported dtype, or the selected region is full.
+                Raised explicitly so the checks also run under ``python -O``.
+        """
+        views = {
+            torch.float16: (2, pm.blob_view_half),
+            torch.float32: (4, pm.blob_view_float),
+            torch.int32: (4, pm.blob_view_int),
+        }
+        if dtype not in views:
+            raise AssertionError("dtype %s not supported" % (str(dtype)))
+        elem_size, blob_view = views[dtype]
+        # plain int keeps the offsets and pointer arithmetic free of numpy scalars
+        nbytes = int(np.prod(shape)) * elem_size
+        if dynamic:
+            start = self._align_up(self.dynamic_offset)
+            end = start + nbytes
+            # an allocation ending exactly at the region end still fits
+            if end > self.dynamic_size:
+                raise AssertionError("Dynamic peer memory pool exhausted")
+            # commit only after the check so a failed request leaves the pool usable
+            self.dynamic_offset = end
+            base = self.static_size + start
+        else:
+            start = self._align_up(self.static_offset)
+            end = start + nbytes
+            if end > self.static_size:
+                raise AssertionError("Static peer memory pool exhausted")
+            self.static_offset = end
+            base = start
+        return [blob_view(pr + base, shape, channels_last) for pr in self.peer_raw]
--- a/setup.py
+++ b/setup.py
@@ -652,6 +652,34 @@ if "--fast_bottleneck" in sys.argv:
        )
    )
+if "--peer_memory" in sys.argv:
+    # Opt-in extension: only registered when --peer_memory is on the command line.
+    # The flag is removed from sys.argv before the extension is appended.
+    sys.argv.remove("--peer_memory")
+    raise_if_cuda_home_none("--peer_memory")
+    peer_memory_sources = [
+        "apex/contrib/csrc/peer_memory/peer_memory_cuda.cu",
+        "apex/contrib/csrc/peer_memory/peer_memory.cpp",
+    ]
+    peer_memory_flags = {"cxx": ["-O3"] + version_dependent_macros + generator_flag}
+    ext_modules.append(
+        CUDAExtension(name="peer_memory_cuda", sources=peer_memory_sources, extra_compile_args=peer_memory_flags)
+    )
+if "--nccl_p2p" in sys.argv:
+    # Opt-in extension: only registered when --nccl_p2p is on the command line.
+    # The flag is removed from sys.argv before the extension is appended.
+    sys.argv.remove("--nccl_p2p")
+    raise_if_cuda_home_none("--nccl_p2p")
+    nccl_p2p_sources = [
+        "apex/contrib/csrc/nccl_p2p/nccl_p2p_cuda.cu",
+        "apex/contrib/csrc/nccl_p2p/nccl_p2p.cpp",
+    ]
+    nccl_p2p_flags = {"cxx": ["-O3"] + version_dependent_macros + generator_flag}
+    ext_modules.append(
+        CUDAExtension(name="nccl_p2p_cuda", sources=nccl_p2p_sources, extra_compile_args=nccl_p2p_flags)
+    )
 if "--fused_conv_bias_relu" in sys.argv:
    sys.argv.remove("--fused_conv_bias_relu")