Commit c0dad530 authored by wangkaixiong

init
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "common.h"
void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
  // The root sends nranks equal slices; integer division truncates so the
  // send buffer is an exact multiple of the per-rank receive count.
  *sendcount = (count/nranks)*nranks;
  *recvcount = count/nranks;
  *sendInplaceOffset = 0;
  *recvInplaceOffset = count/nranks;
  *paramcount = count/nranks;
}
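
To make the truncation behavior concrete, the sketch below exercises the splitting logic with illustrative numbers (the main driver is hypothetical and assumes linking against the definition above):

#include <cstdio>
#include <cstddef>

// Defined in the scatter test above.
void ScatterGetCollByteCount(size_t*, size_t*, size_t*, size_t*, size_t*, size_t, int);

int main() {
  size_t send, recv, param, sendOff, recvOff;
  // 1001 elements over 8 ranks: integer division truncates, so each rank
  // receives 125 elements and the root sends 125*8 = 1000 in total.
  ScatterGetCollByteCount(&send, &recv, &param, &sendOff, &recvOff, 1001, 8);
  printf("sendcount=%zu recvcount=%zu paramcount=%zu\n", send, recv, param);
  return 0;
}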
testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
  size_t sendcount = args->sendBytes / wordSize(type);
  size_t recvcount = args->expectedBytes / wordSize(type);
  int k=0;
  for (int i=0; i<args->nGpus; i++) {
    HIPCHECK(hipSetDevice(args->gpus[i]));
    for (int l=0; l<args->nRanks; l++) {
      // Global rank of this (process, thread, GPU, rank) tuple.
      int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
      HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
      void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
      // Only the root holds send data; every rank expects its own slice.
      if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0));
      TESTCHECK(InitData(args->expected[k], recvcount, rank*recvcount, type, ncclSum, rep, 1, 0));
      k++;
    }
    HIPCHECK(hipDeviceSynchronize());
  }
  return testSuccess;
}
void ScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
  double baseBw = (double)(count * nranks * typesize) / 1.0E9 / sec;
  *algBw = baseBw;
  // The root keeps 1/nranks of the data locally, so only (nranks-1)/nranks
  // of the algorithm bandwidth actually crosses the bus.
  double factor = ((double)(nranks-1))/((double)(nranks));
  *busBw = baseBw * factor;
}
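
For intuition: algbw counts every byte the root pushes, while busbw discounts the root's own slice, which never crosses a link; hence the (nranks-1)/nranks factor. A minimal sketch with illustrative values:

#include <cstdio>

int main() {
  double count = 2.5e7;  // per-rank element count (illustrative)
  int typesize = 4;      // bytes per float
  double sec = 0.01;     // measured iteration time (illustrative)
  int nranks = 8;
  double algBw = count * nranks * typesize / 1.0E9 / sec;  // 80 GB/s
  double busBw = algBw * (nranks - 1) / (double)nranks;    // 70 GB/s
  printf("algbw=%.1f GB/s busbw=%.1f GB/s\n", algBw, busBw);
  return 0;
}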
testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
  int nRanks;
  NCCLCHECK(ncclCommCount(comm, &nRanks));
  int rank;
  NCCLCHECK(ncclCommUserRank(comm, &rank));
  size_t rankOffset = count * wordSize(type);
  if (count == 0) return testSuccess;
  // NCCL/RCCL has no native scatter; emulate it with grouped point-to-point
  // calls so all sends and receives are fused into a single operation
  // (grouping is required to avoid deadlock when a rank, here the root,
  // both sends and receives).
  NCCLCHECK(ncclGroupStart());
  if (rank == root) {
    for (int r=0; r<nRanks; r++) {
      NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, count, type, r, comm, stream));
    }
  }
  NCCLCHECK(ncclRecv(recvbuff, count, type, root, comm, stream));
  NCCLCHECK(ncclGroupEnd());
  return testSuccess;
}
struct testColl scatterTest = {
  "Scatter",
  ScatterGetCollByteCount,
  ScatterInitData,
  ScatterGetBw,
  ScatterRunColl
};

void ScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
  ScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}
testResult_t ScatterRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
  args->collTest = &scatterTest;
  ncclDataType_t *run_types;
  const char **run_typenames;
  int type_count;
  int begin_root, end_root;

  if ((int)type != -1) {
    type_count = 1;
    run_types = &type;
    run_typenames = &typeName;
  } else {
    type_count = test_typenum;
    run_types = test_types;
    run_typenames = test_typenames;
  }

  // With root == -1, sweep every rank as the scatter root.
  if (root != -1) {
    begin_root = end_root = root;
  } else {
    begin_root = 0;
    end_root = args->nProcs*args->nThreads*args->nGpus-1;
  }

  for (int i=0; i<type_count; i++) {
    for (int j=begin_root; j<=end_root; j++) {
      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", j));
    }
  }
  return testSuccess;
}
struct testEngine ncclTestEngine = {
  ScatterGetBuffSize,
  ScatterRunTest
};
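
With the test engine wired up, the build produces a standalone scatter benchmark. A typical invocation, following the usual nccl-tests/rccl-tests flag conventions (-b/-e byte range, -f step factor, -g GPUs per thread; the binary path depends on the local build):

./build/scatter_perf -b 8 -e 128M -f 2 -g 8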
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "common.h"
void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
  *sendcount = count;
  *recvcount = count;
  *sendInplaceOffset = 0;
  *recvInplaceOffset = 0;
  *paramcount = *sendcount;
}
testResult_t SendRecvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
  size_t sendcount = args->sendBytes / wordSize(type);
  size_t recvcount = args->expectedBytes / wordSize(type);
  int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
  int k=0;
  for (int i=0; i<args->nGpus; i++) {
    HIPCHECK(hipSetDevice(args->gpus[i]));
    for (int l=0; l<args->nRanks; l++) {
      int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
      HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
      void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
      TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0));
      // Each rank expects the data of its left neighbor in the ring.
      int peer = (rank-1+nranks)%nranks;
      TESTCHECK(InitData(args->expected[k], recvcount, peer*recvcount, type, ncclSum, rep, 1, 0));
      k++;
    }
    HIPCHECK(hipDeviceSynchronize());
  }
  // We don't support in-place sendrecv
  args->reportErrors = in_place ? 0 : 1;
  return testSuccess;
}
void SendRecvGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
  double baseBw = (double)(count * typesize) / 1.0E9 / sec;
  *algBw = baseBw;
  double factor = 1;
  *busBw = baseBw * factor;
}
testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
  int nRanks;
  NCCLCHECK(ncclCommCount(comm, &nRanks));
  int rank;
  NCCLCHECK(ncclCommUserRank(comm, &rank));
  // Ring pattern: send to the right neighbor, receive from the left.
  int recvPeer = (rank-1+nRanks) % nRanks;
  int sendPeer = (rank+1) % nRanks;
  NCCLCHECK(ncclGroupStart());
  NCCLCHECK(ncclSend(sendbuff, count, type, sendPeer, comm, stream));
  NCCLCHECK(ncclRecv(recvbuff, count, type, recvPeer, comm, stream));
  NCCLCHECK(ncclGroupEnd());
  return testSuccess;
}
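
The peers above form a ring: every rank sends one buffer to its right neighbor and receives one from its left neighbor, so each rank moves exactly count elements in each direction, which is why the bus-bandwidth factor is 1. The standalone sketch below prints the same peer mapping (values illustrative):

#include <cstdio>

int main() {
  int nRanks = 4;
  for (int rank = 0; rank < nRanks; rank++) {
    // Same ring arithmetic as SendRecvRunColl above.
    int sendPeer = (rank + 1) % nRanks;
    int recvPeer = (rank - 1 + nRanks) % nRanks;
    printf("rank %d sends to %d, receives from %d\n", rank, sendPeer, recvPeer);
  }
  return 0;
}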
struct testColl sendRecvTest = {
  "SendRecv",
  SendRecvGetCollByteCount,
  SendRecvInitData,
  SendRecvGetBw,
  SendRecvRunColl
};

void SendRecvGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
  SendRecvGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}
testResult_t SendRecvRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
  args->collTest = &sendRecvTest;
  ncclDataType_t *run_types;
  ncclRedOp_t *run_ops;
  const char **run_typenames, **run_opnames;
  int type_count, op_count;

  if ((int)type != -1) {
    type_count = 1;
    run_types = &type;
    run_typenames = &typeName;
  } else {
    type_count = test_typenum;
    run_types = test_types;
    run_typenames = test_typenames;
  }

  if ((int)op != -1) {
    op_count = 1;
    run_ops = &op;
    run_opnames = &opName;
  } else {
    op_count = test_opnum;
    run_ops = test_ops;
    run_opnames = test_opnames;
  }

  for (int i=0; i<type_count; i++) {
    for (int j=0; j<op_count; j++) {
      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], -1));
    }
  }
  return testSuccess;
}
struct testEngine ncclTestEngine = {
  SendRecvGetBuffSize,
  SendRecvRunTest
};
#include "timer.h"
// Make sure to compile this translation unit with the host compiler and not
// nvcc, lest you hit an internal compiler error (ICE) with GCC 10.3.0
#include <chrono>
namespace {
std::uint64_t now() {
using clock = std::chrono::steady_clock;
return std::chrono::duration_cast<std::chrono::nanoseconds>(clock::now().time_since_epoch()).count();
}
}
timer::timer() {
t0 = now();
}
double timer::elapsed() const {
std::uint64_t t1 = now();
return 1.e-9*(t1 - t0);
}
double timer::reset() {
std::uint64_t t1 = now();
double ans = 1.e-9*(t1 - t0);
t0 = t1;
return ans;
}
#ifndef _408319ecdd5b47b28bf8f511c4fdf816
#define _408319ecdd5b47b28bf8f511c4fdf816

#include <cstdint>

// Can't include <chrono> because of bug with gcc 10.3.0
class timer {
  std::uint64_t t0;
public:
  timer();
  double elapsed() const;
  double reset();
};

#endif
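
A minimal usage sketch of the timer class above (the workload loop is a placeholder):

#include <cstdio>
#include "timer.h"

int main() {
  timer t;                   // construction records the start time
  volatile double acc = 0;
  for (int i = 0; i < 1000000; i++) acc += i;  // placeholder workload
  double first = t.reset();  // seconds since construction; restarts the clock
  for (int i = 0; i < 1000000; i++) acc += i;
  printf("first pass %.6f s, second pass %.6f s\n", first, t.elapsed());
  return 0;
}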
Launch params (512, 1, 1) are larger than launch bounds (256) for kernel _ZN12_GLOBAL__N_113prepareInput2IfNS_9ReduceNilEEEvPT_lT0_iiml please add __launch_bounds__ to kernel define or use --gpu-max-threads-per-block recompile program !
# nThreads: 1 nGpus: 1 nRanks: 1 minBytes: 7618 maxBytes: 1073741824 step: 2(factor) warmupIters: 5 iters: 20 agg iters: 1 validation: 1 graph: 0
#
# Using devices
# Rank 0 Pid 781 on master device 0 [0000:9f:00.0] BW200
master:781:781 [0] NCCL INFO Bootstrap : Using ibs66f0:10.10.10.1<0>
master:781:781 [0] NCCL INFO NET/Plugin : Plugin load (librccl-net.so) returned 2 : librccl-net.so: cannot open shared object file: No such file or directory
master:781:781 [0] NCCL INFO NET/Plugin : No plugin found, using internal implementation
master:781:781 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:121 NCCL WARN NUMA auto balancing enabled which can lead to variability in the RCCL performance! Disable by "sudo sysctl kernel.numa_balancing=0"
master:781:781 [0] NCCL INFO Kernel version: 4.19.90-89.11.v2401.ky10.x86_64
master:781:781 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:142 NCCL WARN Missing "iommu=pt" from kernel command line which can lead to system instablity or hang!
master:781:781 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:147 NCCL WARN Missing "HSA_FORCE_FINE_GRAIN_PCIE=1" from environment which can lead to low RCCL performance, system instablity or hang!
master:781:781 [0] NCCL INFO ROCr version 1.1
master:781:781 [0] NCCL INFO Dmabuf feature disabled without NCCL_ENABLE_DMABUF_SUPPORT=1
RCCL version 2.18.3+hip6.1 HEAD:037e9b3
master:781:814 [0] NCCL INFO NET/IB : Using [0]mlx5_1:1/IB [1]mlx5_2:1/IB [2]mlx5_3:1/IB [3]mlx5_4:1/IB [4]mlx5_6:1/RoCE [5]mlx5_7:1/IB [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB ibs66f0:10.10.10.1<0>
master:781:814 [0] NCCL INFO Using network IB
master:781:814 [0] NCCL INFO comm 0xf33c60 rank 0 nranks 1 cudaDev 0 busId 9f000 commId 0xb95015ec1d0d080e - Init START
master:781:814 [0] NCCL INFO /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/transport/net_ib.cc:323 -> 2
master:781:814 [0] NCCL INFO NCCL_TOPO_FILE set by environment to /data1/sunzhq/rccl-tests-develop/topo-0507-115-real.xml
master:781:814 [0] NCCL INFO rocm_smi_lib: version 2.8.0.0
master:781:814 [0] NCCL INFO NCCL_NET_GDR_READ set by environment to 1.
master:781:814 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:1299 NCCL WARN -hcugon- rank 0 localRanks 1 nRanks 1 invalid rank num
master:781:814 [0] NCCL INFO Channel 00/32 : 0
master:781:814 [0] NCCL INFO Channel 01/32 : 0
master:781:814 [0] NCCL INFO Channel 02/32 : 0
master:781:814 [0] NCCL INFO Channel 03/32 : 0
master:781:814 [0] NCCL INFO Channel 04/32 : 0
master:781:814 [0] NCCL INFO Channel 05/32 : 0
master:781:814 [0] NCCL INFO Channel 06/32 : 0
master:781:814 [0] NCCL INFO Channel 07/32 : 0
master:781:814 [0] NCCL INFO Channel 08/32 : 0
master:781:814 [0] NCCL INFO Channel 09/32 : 0
master:781:814 [0] NCCL INFO Channel 10/32 : 0
master:781:814 [0] NCCL INFO Channel 11/32 : 0
master:781:814 [0] NCCL INFO Channel 12/32 : 0
master:781:814 [0] NCCL INFO Channel 13/32 : 0
master:781:814 [0] NCCL INFO Channel 14/32 : 0
master:781:814 [0] NCCL INFO Channel 15/32 : 0
master:781:814 [0] NCCL INFO Channel 16/32 : 0
master:781:814 [0] NCCL INFO Channel 17/32 : 0
master:781:814 [0] NCCL INFO Channel 18/32 : 0
master:781:814 [0] NCCL INFO Channel 19/32 : 0
master:781:814 [0] NCCL INFO Channel 20/32 : 0
master:781:814 [0] NCCL INFO Channel 21/32 : 0
master:781:814 [0] NCCL INFO Channel 22/32 : 0
master:781:814 [0] NCCL INFO Channel 23/32 : 0
master:781:814 [0] NCCL INFO Channel 24/32 : 0
master:781:814 [0] NCCL INFO Channel 25/32 : 0
master:781:814 [0] NCCL INFO Channel 26/32 : 0
master:781:814 [0] NCCL INFO Channel 27/32 : 0
master:781:814 [0] NCCL INFO Channel 28/32 : 0
master:781:814 [0] NCCL INFO Channel 29/32 : 0
master:781:814 [0] NCCL INFO Channel 30/32 : 0
master:781:814 [0] NCCL INFO Channel 31/32 : 0
master:781:814 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 comm 0xf33c60 nRanks 01 busId 9f000
master:781:814 [0] NCCL INFO -hcugon- create sdma group queue rank:0 localRanks:1 nRanks:1 sdma copy is not enabled
master:781:814 [0] NCCL INFO P2P Chunksize set to 131072
master:781:814 [0] NCCL INFO Connected all rings comm 0xf33c60 nRanks 01 busId 9f000
master:781:814 [0] NCCL INFO Connected all trees
master:781:814 [0] NCCL INFO 32 coll channels, 0 nvls channels, 32 p2p channels, 4 p2p channels per peer
master:781:814 [0] NCCL INFO Init config for nccl_context_test: 0
master:781:814 [0] NCCL INFO Maximum number of GPUs in any NUMA node: 2
master:781:814 [0] NCCL INFO comm 0xf33c60 rank 0 nranks 1 cudaDev 0 busId 9f000 commId 0xb95015ec1d0d080e localSize 464 used 67142608 bytes - Init COMPLETE
#
# out-of-place in-place
# size count type redop root time algbw busbw #wrong time algbw busbw #wrong
# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s)
7616 1904 float sum -1 318.2 0.02 0.00 0 0.17 44.67 0.00 0
15236 3809 float sum -1 148.8 0.10 0.00 0 0.17 91.51 0.00 0
30472 7618 float sum -1 3.72 8.19 0.00 0 0.17 183.02 0.00 0
60944 15236 float sum -1 5.67 10.75 0.00 0 0.17 366.03 0.00 0
121888 30472 float sum -1 3.93 31.04 0.00 0 0.17 727.69 0.00 0
243776 60944 float sum -1 498.9 0.49 0.00 0 0.17 1421.43 0.00 0
487552 121888 float sum -1 82.86 5.88 0.00 0 0.17 2937.06 0.00 0
975104 243776 float sum -1 10.57 92.27 0.00 0 0.17 5891.87 0.00 0
1950208 487552 float sum -1 1273.7 1.53 0.00 0 0.17 11677.89 0.00 0
3900416 975104 float sum -1 2760.9 1.41 0.00 0 0.17 23425.92 0.00 0
7800832 1950208 float sum -1 2038.5 3.83 0.00 0 0.17 46851.84 0.00 0
15601664 3900416 float sum -1 4799.2 3.25 0.00 0 0.17 93985.93 0.00 0
31203328 7800832 float sum -1 9049.6 3.45 0.00 0 0.16 190264.20 0.00 0
62406656 15601664 float sum -1 10579 5.90 0.00 0 0.17 372577.05 0.00 0
124813312 31203328 float sum -1 5672.0 22.01 0.00 0 0.17 749629.50 0.00 0
249626624 62406656 float sum -1 7586.1 32.91 0.00 0 0.17 1490308.20 0.00 0
499253248 124813312 float sum -1 11629 42.93 0.00 0 0.17 2980616.41 0.00 0
998506496 249626624 float sum -1 16813 59.39 0.00 0 0.17 5961232.81 0.00 0
master:781:781 [0] NCCL INFO -hcugon- commCleanup rank:0 sdmaCountEnable:0
master:781:781 [0] NCCL INFO comm 0xf33c60 rank 0 nranks 1 cudaDev 0 busId 9f000 - Destroy COMPLETE
# Errors with asterisks indicate errors that have exceeded the maximum threshold.
# Out of bounds values : 0 OK
# Avg bus bandwidth : 0
#
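
Regarding the "Launch params ... are larger than launch bounds" warning at the top of the log: the hipified prepareInput2 kernel was compiled with a 256-thread __launch_bounds__ cap but launched with 512 threads per block. One fix is to raise the per-kernel bound, as in the illustrative sketch below (the kernel signature and body are hypothetical, not the actual rccl-tests source); alternatively, recompile the whole program with hipcc's --gpu-max-threads-per-block=512, as the message suggests.

#include <hip/hip_runtime.h>

// __launch_bounds__(512) tells the compiler to allow (and optimize for)
// up to 512 threads per block, matching the (512, 1, 1) launch config.
template <typename T>
__global__ __launch_bounds__(512) void prepareInput2Sketch(T* buf, long n) {
  long i = (long)blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) buf[i] = T(i);  // placeholder initialization
}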