Unverified commit 6b6823a1, authored by Kaixi Hou, committed by GitHub

Restrict TF tests to one GPU (#264)



* Only use one GPU for TensorFlow tests
Signed-off-by: kaixih <kaixih@nvidia.com>

* Simplify the change
Signed-off-by: kaixih <kaixih@nvidia.com>

* Final fix
Signed-off-by: kaixih <kaixih@nvidia.com>

---------
Signed-off-by: kaixih <kaixih@nvidia.com>
Co-authored-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
parent 39b2ef10
@@ -75,6 +75,10 @@ def get_adjusted_layernorm_dx(x, ln_dy, init):
 class LayersTest(test.TestCase):
+  def setUp(self):
+    super().setUp()
+    tf.keras.mixed_precision.set_global_policy('mixed_float16')
+
   @test_util.run_gpu_only
   def testDenseFwd(self):
     B, M, K, N = 4, 8, 16, 32
@@ -578,5 +582,4 @@ class LayersTest(test.TestCase):
 if __name__ == '__main__':
-  tf.keras.mixed_precision.set_global_policy('mixed_float16')
   test.main()
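Moving set_global_policy('mixed_float16') into setUp() applies the policy before every test method, no matter which subset of the suite runs or how it is launched; the old call in the __main__ block only took effect when the file was executed directly. A minimal sketch of the pattern (the class name and the tearDown reset are illustrative additions, not part of this change):

```python
import tensorflow as tf
from tensorflow.python.platform import test


class PolicyScopedTest(test.TestCase):
  def setUp(self):
    super().setUp()
    # Runs before every test method, even when a single test is selected.
    tf.keras.mixed_precision.set_global_policy('mixed_float16')

  def tearDown(self):
    # Restore the default so the policy does not leak into other suites
    # sharing the process (illustrative; the change above omits this).
    tf.keras.mixed_precision.set_global_policy('float32')
    super().tearDown()

  def testPolicyActive(self):
    self.assertEqual(
        tf.keras.mixed_precision.global_policy().name, 'mixed_float16')


if __name__ == '__main__':
  test.main()
```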
@@ -126,6 +126,10 @@ class MultiHeadAttentionKeras(tf.keras.Model):
 class MHATest(test.TestCase):
+  def setUp(self):
+    super().setUp()
+    tf.keras.mixed_precision.set_global_policy('mixed_float16')
+
   @test_util.run_gpu_only
   def testMHAForward(self):
     use_fp8 = tf.test.is_gpu_available(True, (9, 0))
@@ -252,5 +256,4 @@ class MHATest(test.TestCase):
 if __name__ == '__main__':
-  tf.keras.mixed_precision.set_global_policy('mixed_float16')
   test.main()
@@ -38,6 +38,10 @@ def train_step(dy, x, x_mask, x_dec, x_dec_mask, model, use_fp8=False,
 class TransformerLayerTest(test.TestCase):
+  def setUp(self):
+    super().setUp()
+    tf.keras.mixed_precision.set_global_policy('mixed_float16')
+
   @test_util.run_gpu_only
   def testTransformerSanity(self):
     use_fp8 = tf.test.is_gpu_available(True, (9, 0))
@@ -115,5 +119,4 @@ class TransformerLayerTest(test.TestCase):
 if __name__ == '__main__':
-  tf.keras.mixed_precision.set_global_policy('mixed_float16')
   test.main()
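The unchanged context lines show how these suites gate FP8: tf.test.is_gpu_available(True, (9, 0)) returns True only when a CUDA GPU with compute capability 9.0 (Hopper) or newer is present. A sketch of the same check with the keyword arguments spelled out (the helper is deprecated in recent TF releases, but it is what these tests use):

```python
import tensorflow as tf

# FP8 kernels need compute capability 9.0+ (Hopper); on older GPUs the
# tests exercise the non-FP8 code path instead.
use_fp8 = tf.test.is_gpu_available(
    cuda_only=True, min_cuda_compute_capability=(9, 0))
```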
@@ -15,6 +15,7 @@
 #include "common/include/transformer_engine/transformer_engine.h"
 #include "common/include/transformer_engine/transpose.h"
 #include "tensorflow/c/eager/c_api_experimental.h"
+#include "tensorflow/c/eager/c_api_internal.h"
 #include "tensorflow/c/eager/immediate_execution_tensor_handle.h"
 #include "tensorflow/c/eager/tfe_tensorhandle_internal.h"
 #include "tensorflow/c/tf_status_internal.h"
@@ -200,6 +201,17 @@ TFE_Context* GetContext(TF_Status* status) {
   static TFE_Context* context = nullptr;
   if (context == nullptr) {
     TFE_ContextOptions* opts = TFE_NewContextOptions();
+    // TF-TE currently supports only a single GPU, so manually cap the GPU
+    // count at 1 in multi-GPU environments. Otherwise TF traverses all
+    // valid GPUs (to query their stream priority ranges) and ends up
+    // calling cudaSetDevice on the last one (see
+    // BaseGPUDeviceFactory::CreateDevices), which would dispatch the other
+    // pybind functions onto other GPUs and produce bad results.
+    auto* device_count =
+        opts->session_options.options.config.mutable_device_count();
+    device_count->insert({"GPU", 1});
     context = TFE_NewContext(opts, status);
   }
   return context;
...
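Setting device_count to {"GPU", 1} in the embedded context's ConfigProto makes TF create only one GPU device, so every pybind entry point that uses this context lands on the same device; reaching opts->session_options is what requires the newly added internal header. For comparison only (this is not what the extension does), a Python-level script can impose the same single-GPU restriction through the public API before the runtime initializes:

```python
import tensorflow as tf

gpus = tf.config.list_physical_devices('GPU')
if len(gpus) > 1:
  # Expose only the first GPU so every op runs on one device. This must
  # happen before TensorFlow initializes its GPU devices.
  tf.config.set_visible_devices(gpus[0], 'GPU')

print(tf.config.list_logical_devices('GPU'))  # one logical GPU
```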