multi_stream.cpp 3.59 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
/*************************************************************************
 * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 * See LICENSE for license information.
 ************************************************************************/

#ifndef TRANSFORMER_ENGINE_UTIL_MULTI_STREAM_H_
#define TRANSFORMER_ENGINE_UTIL_MULTI_STREAM_H_

#include "multi_stream.h"

#include <transformer_engine/multi_stream.h>
yuguo's avatar
yuguo committed
13
#include <transformer_engine/gemm.h>
14
15
16
17
18
19
20

#include <climits>
#include <cstdlib>
#include <mutex>
#include <vector>

#include "cuda_runtime.h"
#include "logging.h"

yuguo's avatar
yuguo committed
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
/*
 * Read an integer setting from the environment.
 *
 * name   : environment variable to look up.
 * defval : returned as-is when the variable is unset or empty
 *          (it is NOT clamped against minval).
 * minval : lower bound applied to any value that was read from the variable.
 *
 * The text is parsed as a base-10 integer with std::strtol, so leading
 * whitespace and trailing garbage are tolerated and text without any digits
 * parses as 0. Values outside the range of int saturate to INT_MIN / INT_MAX
 * instead of invoking the undefined behaviour atoi has on overflow.
 */
static inline int getIntEnv(const char *name, int defval, int minval)
{
  const char *env = std::getenv(name);
  if (env == nullptr || env[0] == '\0')
  {
    return defval;
  }

  // strtol itself saturates at LONG_MIN/LONG_MAX, so narrowing with an
  // explicit clamp is enough to make every input well defined.
  const long parsed = std::strtol(env, nullptr, 10);
  int val;
  if (parsed > static_cast<long>(INT_MAX))
  {
    val = INT_MAX;
  }
  else if (parsed < static_cast<long>(INT_MIN))
  {
    val = INT_MIN;
  }
  else
  {
    val = static_cast<int>(parsed);
  }

  return (val < minval) ? minval : val;
}

36
37
38
39
40
41
42
43
44
namespace transformer_engine::detail {

/*
 * Return the compute stream with index `idx`.
 *
 * The streams are created on the first call, guarded by std::call_once, and
 * the same handles are returned on every later call. `idx` must lie in
 * [0, nvte_get_num_compute_streams()); otherwise NVTE_CHECK fails.
 *
 * HIP builds only: when the environment variable TORCH_COMM_CU_NUMS is set to
 * a non-empty value, the streams are created with hipExtStreamCreateWithCUMask
 * using a 4-word CU mask whose first word has its low 4/8/16/32 bits cleared
 * (presumably keeping that many compute units free for communication kernels
 * -- confirm against the HIP CU-mask documentation). Any other value fails
 * NVTE_CHECK. In every other case the streams are non-blocking streams created
 * with priority -1.
 */
cudaStream_t get_compute_stream(int idx) {
  const size_t num_streams = nvte_get_num_compute_streams();
  NVTE_CHECK(0 <= idx && idx < num_streams, "Invalid compute stream (requested idx ", idx,
             ", but there are ", num_streams, " streams)");
  static std::vector<cudaStream_t> streams(num_streams);
  static std::once_flag stream_init_flag;
  auto init = [&]() {
#ifdef __HIP_PLATFORM_AMD__
    // The CU mask is only meaningful on HIP, so the environment variable is
    // parsed and validated here rather than unconditionally; a non-HIP build
    // must not fail because of a setting it never uses.
    const char *cu_env = std::getenv("TORCH_COMM_CU_NUMS");
    const bool use_cu_mask = (cu_env != nullptr && cu_env[0] != '\0');
    unsigned int cuMask[4] = {0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff};
    unsigned int cuMaskSize = 4;
    if (use_cu_mask) {
      const int comm_cu_nums = getIntEnv("TORCH_COMM_CU_NUMS", 8, 4);
      // Only the first mask word differs between the supported settings.
      switch (comm_cu_nums) {
        case 4:
          cuMask[0] = 0xfffffff0;
          break;
        case 8:
          cuMask[0] = 0xffffff00;
          break;
        case 16:
          cuMask[0] = 0xffff0000;
          break;
        case 32:
          cuMask[0] = 0x00000000;
          break;
        default:
          NVTE_CHECK(false, "comm_cu_nums must be 4,8,16,32");
      }
    }
#endif
    for (size_t i = 0; i < num_streams; i++) {
#ifdef __HIP_PLATFORM_AMD__
      if (use_cu_mask) {
        NVTE_CHECK_CUDA(hipExtStreamCreateWithCUMask(&streams[i], cuMaskSize, cuMask));
        continue;
      }
#endif
      NVTE_CHECK_CUDA(cudaStreamCreateWithPriority(&streams[i], cudaStreamNonBlocking, -1));
    }
  };
  std::call_once(stream_init_flag, init);
  return streams[idx];
}

/*
 * Return the event associated with compute stream index `idx`.
 *
 * One event per compute stream is created with cudaEventCreate on the first
 * call (guarded by std::call_once); later calls return the stored handles.
 * `idx` must lie in [0, nvte_get_num_compute_streams()); otherwise NVTE_CHECK
 * fails.
 */
cudaEvent_t get_compute_stream_event(int idx) {
  const size_t num_streams = nvte_get_num_compute_streams();
  NVTE_CHECK(0 <= idx && idx < num_streams, "Invalid compute stream (requested idx ", idx,
             ", but there are ", num_streams, " streams)");

  // Event table sized on first use, plus the flag that guards its population.
  static std::vector<cudaEvent_t> events(num_streams);
  static std::once_flag events_created;

  std::call_once(events_created, [num_streams]() {
    size_t i = 0;
    while (i < num_streams) {
      NVTE_CHECK_CUDA(cudaEventCreate(&events[i]));
      ++i;
    }
  });

  return events[idx];
}

/*
 * Number of compute streams handed out by this module.
 *
 * HIP builds use the externally defined `compute_num_streams`; all other
 * builds use 4.
 */
int get_num_compute_streams() {
#ifdef __HIP_PLATFORM_AMD__
  return compute_num_streams;
#else
  return 4;
#endif
}

}  // namespace transformer_engine::detail

int nvte_get_num_compute_streams() { return transformer_engine::detail::get_num_compute_streams(); }

#endif  // TRANSFORMER_ENGINE_UTIL_MULTI_STREAM_H_