Commit 27af1828 authored by Jiezhong Qiu's avatar Jiezhong Qiu
Browse files

make cudaStreamManager thread local

parent f5cc759c
#include <cassert> #include <cassert>
#include <thread>
#include "cuda_stream_manager.h" #include "cuda_stream_manager.h"
CudaStreamManager* smgr = NULL; thread_local CudaStreamManager* smgr = NULL;
CudaStreamManager* getCudaStreamManager(const size_t num_expert, const int device) { CudaStreamManager* getCudaStreamManager(const size_t num_expert, const int device) {
if (!smgr) { if (!smgr) {
......
...@@ -11,8 +11,16 @@ ...@@ -11,8 +11,16 @@
class CudaStreamManager { class CudaStreamManager {
public: public:
CudaStreamManager(const size_t num_expert_, const int device_) : num_expert(num_expert_), device(device_) { CudaStreamManager(const size_t num_expert_, const int device_) : num_expert(num_expert_), device(device_) {
/*
Actually, we will see current_device == device,
which means pytorch always sets the correct device for us.
But for safety, we still manually set device to the desired one.
*/
int current_device;
checkCudaErrors(cudaGetDevice(&current_device));
printf("CudaStreamManager construnctor called, get device %d, set device %d\n", current_device, device);
checkCudaErrors(cudaSetDevice(device)); checkCudaErrors(cudaSetDevice(device));
printf("set device %d\n", device);
streams = new cudaStream_t[num_expert]; streams = new cudaStream_t[num_expert];
checkCudaErrors(cublasCreate(&handle)); checkCudaErrors(cublasCreate(&handle));
for (size_t i=0; i<num_expert; ++i) { for (size_t i=0; i<num_expert; ++i) {
......
...@@ -115,7 +115,7 @@ def test(): ...@@ -115,7 +115,7 @@ def test():
def test_dp(): def test_dp():
torch.manual_seed(42) torch.manual_seed(42)
torch.cuda.manual_seed(42) torch.cuda.manual_seed(42)
batch_size = 4 batch_size = 6
num_expert = 4 num_expert = 4
in_feat = 2 in_feat = 2
out_feat = 3 out_feat = 3
...@@ -125,14 +125,16 @@ def test_dp(): ...@@ -125,14 +125,16 @@ def test_dp():
print("data parallel of a nn.Linear model") print("data parallel of a nn.Linear model")
linear = nn.Linear(in_feat, in_feat).cuda() linear = nn.Linear(in_feat, in_feat).cuda()
moe_linear = torch.nn.DataParallel(linear, device_ids=[0, 1]) linear_dp = torch.nn.DataParallel(linear, device_ids=[0,1,2])
output = moe_linear(inp) output = linear_dp(inp)
print("successful!") print("successful!")
print("data parallel of our MoE model") print("data parallel of our MoE model")
moe = MOELayer(num_expert, in_feat, out_feat).cuda() moe = MOELayer(num_expert, in_feat, out_feat).cuda()
moe_dp = torch.nn.DataParallel(moe, device_ids=[0, 1]) moe_dp = torch.nn.DataParallel(moe, device_ids=[0,1,2])
output = moe_dp(inp, gate) for i in range(5):
print(i, "forward")
output = moe_dp(inp, gate)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment