/*
Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/**
 * @file rccl_prim_test.cpp
 *
 * test performance of individual rccl primitives
 */
#include <cstdio>    //fprintf
#include <iostream>  //cerr
#include <unistd.h>  //usleep
#include <cstdlib>
#include <cstring>
#include <string>
#include <algorithm>
#include <chrono>
#include <cmath>
#include <hip/hip_runtime.h>
#include "copy_kernel.h"

#define MAX_GPU 16
#define MAX_WORKGROUPS 32
#define THREADS 256
#define NGPUS 2

#define COPY_UNROLL 4
#define REDUCE_UNROLL 2
#define DOUBLECOPY_UNROLL 2
#define DOUBLECOPYLOCAL_UNROLL 2
#define REDUCECOPY_UNROLL 2

#define PRINT_GPU0_ONLY 1

#define RST  "\x1B[0m"
#define KBLU "\x1B[34m"
#define FBLU(x) KBLU x RST
#define BOLD(x) "\x1B[1m" x RST

struct transfer_data_t {
  float *dest0[MAX_WORKGROUPS]; //remote fine grain
  float *src0[MAX_WORKGROUPS];  //local fine grain
  float *dest1[MAX_WORKGROUPS]; //local coarse grain
  float *dest2[MAX_WORKGROUPS]; //local fine grain
  float *src1[MAX_WORKGROUPS];  //local coarse grain
  int N;
  int gpu;
  int ngpu;
  uint64_t *remOpCount;
};

struct profiling_data_t {
  uint64_t write_cycles[MAX_WORKGROUPS];
  uint64_t bytes_transferred[MAX_WORKGROUPS];
};

#define LOAD(VAR)       __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
#define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)

void print_table_header(void) {
  fprintf(stderr, "%120s", "=================================================================================================================================\n");
  fprintf(stderr, "%-20s %-13s %-13s %-13s %-13s %-20s %-20s %-10s\n", "[Originating GPU]", "[Directions]", "[WorkGroup]", "[linktype]", "[time(ms)]", "[bytes_transferred]", "[throughput(GB/s)]", "[StdDev]");
  fprintf(stderr, "%120s", "=================================================================================================================================\n");
}

void print_table_summary_line(void) {
  fprintf(stderr, "%120s", "---------------------------------------------------------------------------------------------------------------------------------\n");
}

enum Ops {
  OP_COPY,
  OP_LOCALCOPY,
  OP_DOUBLECOPY,
  OP_DOUBLECOPYLOCAL,
  OP_REDUCE,
  OP_REDUCECOPY,
  OP_READ,
  NUM_OPS,
};
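/*
 * The kernels below rely on ReduceOrCopyMulti from copy_kernel.h, which is
 * not reproduced in this file. A minimal sketch of the interface they assume
 * (unroll factor, element-wise functor such as FuncPassA/FuncSum, element
 * type, then min/max source and destination counts) looks like:
 *
 *   template<int UNROLL, class FUNC, typename T,
 *            int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
 *   __device__ void ReduceOrCopyMulti(int tid, int nthreads,
 *                                     int nsrcs, const T* srcs[MAXSRCS],
 *                                     int ndsts, T* dsts[MAXDSTS], int N);
 */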
template <int op, bool sync>
__global__ void flag_sync_kernel(struct transfer_data_t* transfer_data,
                                 struct profiling_data_t* profiling_data,
                                 uint64_t opCount) {
  size_t tid = threadIdx.x;
  uint64_t curr_time;
  int bid = blockIdx.x;
  int n = transfer_data->N;
  const float *srcs[NGPUS];
  float *dsts[NGPUS];

  // signal self ready and wait until all GPUs are ready: thread 0 of every
  // block bumps this GPU's counter once, so a launch with W workgroups grows
  // each counter by W; the host advances opCount by W per launch, so the
  // spin below waits until every GPU has reached the current launch
  if (tid == 0) {
    __atomic_fetch_add(&transfer_data->remOpCount[transfer_data->gpu], 1, __ATOMIC_SEQ_CST);
    if (sync) {
      for (int i = 0; i < transfer_data->ngpu; i++) {
        while (LOAD(&transfer_data->remOpCount[i]) < opCount) {};
      }
    }
  }
  __syncthreads();

  if (tid == 0) curr_time = wall_clock64();

  if (op == OP_COPY) {
    srcs[0] = transfer_data->src0[bid];
    dsts[0] = transfer_data->dest0[bid];
    ReduceOrCopyMulti<COPY_UNROLL, FuncPassA<float>, float, 1, 1, 1, 1>(threadIdx.x, THREADS, 1, srcs, 1, dsts, n);
  }

  if (op == OP_LOCALCOPY) {
    srcs[0] = transfer_data->src0[bid];
    dsts[0] = transfer_data->dest1[bid];
    ReduceOrCopyMulti<COPY_UNROLL, FuncPassA<float>, float, 1, 1, 1, 1>(threadIdx.x, THREADS, 1, srcs, 1, dsts, n);
  }

  if (op == OP_DOUBLECOPY) {
    srcs[0] = transfer_data->src0[bid];
    dsts[0] = transfer_data->dest0[bid];
    dsts[1] = transfer_data->dest1[bid];
    ReduceOrCopyMulti<DOUBLECOPY_UNROLL, FuncPassA<float>, float, 1, 1, 1, 2>(threadIdx.x, THREADS, 1, srcs, 2, dsts, n);
  }

  if (op == OP_DOUBLECOPYLOCAL) {
    srcs[0] = transfer_data->src0[bid];
    dsts[0] = transfer_data->dest1[bid];
    dsts[1] = transfer_data->dest2[bid];
    ReduceOrCopyMulti<DOUBLECOPYLOCAL_UNROLL, FuncPassA<float>, float, 1, 1, 1, 2>(threadIdx.x, THREADS, 1, srcs, 2, dsts, n);
  }

  if (op == OP_REDUCE) {
    srcs[0] = transfer_data->src0[bid];
    srcs[1] = transfer_data->src1[bid];
    dsts[0] = transfer_data->dest0[bid];
    ReduceOrCopyMulti<REDUCE_UNROLL, FuncSum<float>, float, 1, 2, 1, 1>(threadIdx.x, THREADS, 2, srcs, 1, dsts, n);
  }

  if (op == OP_REDUCECOPY) {
    srcs[0] = transfer_data->src0[bid];
    srcs[1] = transfer_data->src1[bid];
    dsts[0] = transfer_data->dest0[bid];
    dsts[1] = transfer_data->dest1[bid];
    ReduceOrCopyMulti<REDUCECOPY_UNROLL, FuncSum<float>, float, 1, 2, 1, 2>(threadIdx.x, THREADS, 2, srcs, 2, dsts, n);
  }

  if (op == OP_READ) {
    // dest0 and src0 are swapped relative to the copy kernel so the same
    // code path acts as a read kernel: the fetch happens from
    // transfer_data->dest0[bid] and the store goes to transfer_data->src0[bid]
    srcs[0] = transfer_data->dest0[bid];
    dsts[0] = transfer_data->src0[bid];
    ReduceOrCopyMulti<COPY_UNROLL, FuncPassA<float>, float, 1, 1, 1, 1>(threadIdx.x, THREADS, 1, srcs, 1, dsts, n);
  }

  __syncthreads();
  if (tid == 0) {
    // __builtin_amdgcn_s_memrealtime() reads the same realtime counter that
    // wall_clock64() sampled above
    __atomic_fetch_add(&(profiling_data->write_cycles[bid]), __builtin_amdgcn_s_memrealtime() - curr_time, __ATOMIC_SEQ_CST);
    __atomic_fetch_add(&(profiling_data->bytes_transferred[bid]), n * sizeof(float), __ATOMIC_SEQ_CST);
  }
}

typedef void(*flag_sync_kernel_t)(struct transfer_data_t* transfer_data,
                                  struct profiling_data_t* profiling_data,
                                  uint64_t opCount);

// kernel table indexed by op*2 + sync
static flag_sync_kernel_t const flagSyncKerns[NUM_OPS*2] = {
  flag_sync_kernel<OP_COPY, false>,
  flag_sync_kernel<OP_COPY, true>,
  flag_sync_kernel<OP_LOCALCOPY, false>,
  flag_sync_kernel<OP_LOCALCOPY, true>,
  flag_sync_kernel<OP_DOUBLECOPY, false>,
  flag_sync_kernel<OP_DOUBLECOPY, true>,
  flag_sync_kernel<OP_DOUBLECOPYLOCAL, false>,
  flag_sync_kernel<OP_DOUBLECOPYLOCAL, true>,
  flag_sync_kernel<OP_REDUCE, false>,
  flag_sync_kernel<OP_REDUCE, true>,
  flag_sync_kernel<OP_REDUCECOPY, false>,
  flag_sync_kernel<OP_REDUCECOPY, true>,
  flag_sync_kernel<OP_READ, false>,
  flag_sync_kernel<OP_READ, true>,
};

__global__ void initTestDataKernel(float* data, const size_t N, const int gpu) {
  size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
  while (tid < N) {
    data[tid] = 1.0/(float)(gpu*17 + tid%77);
    tid += blockDim.x * gridDim.x;
  }
}

#define HIPCHECK(cmd)                                                     \
do {                                                                      \
  hipError_t error = (cmd);                                               \
  if (error != hipSuccess) {                                              \
    std::cerr << "Encountered HIP error (" << error << ") at line "       \
              << __LINE__ << " in file " << __FILE__ << "\n";             \
    exit(-1);                                                             \
  }                                                                       \
} while (0)

static void setupPeers(uint32_t *info, bool* is_xgmi) {
  int deviceCnt, dev;

  // is_xgmi reports whether every link is one-hop XGMI
  *is_xgmi = 1;
  HIPCHECK(hipGetDeviceCount(&deviceCnt));
  HIPCHECK(hipGetDevice(&dev));
  //! If gpus are not peer enabled, enable them
  for (int i = 0; i < deviceCnt; i++) {
    HIPCHECK(hipSetDevice(i));
    for (int j = 0; j < deviceCnt; j++) {
      if (i != j) {
        int p2p;
        HIPCHECK(hipDeviceCanAccessPeer(&p2p, i, j));
        if (!p2p) {
          printf("Cannot enable peer access between device %d and %d. "
                 "You may use HIP_VISIBLE_DEVICES to limit GPUs.\n", i, j);
          exit(-1);
        }
        HIPCHECK(hipDeviceEnablePeerAccess(j, 0));
        uint32_t linktype = 0;
        hipError_t error = hipExtGetLinkTypeAndHopCount(i, j, &linktype, &info[i*deviceCnt+j]);
        if (error != hipSuccess)
          *is_xgmi = 0;
        // link type 4 is XGMI (see link_type_name below); any other link
        // type, or more than one hop, disqualifies
        if (linktype != 4 || info[i*deviceCnt+j] != 1)
          *is_xgmi = 0;
      } else
        info[i*deviceCnt+j] = 0;
    }
  }
  HIPCHECK(hipSetDevice(dev));
}
static void parseChordalRing(char **str) {
  static const char *ringBase = "0 6 7 4 5 3 2 1|0 5 6 3 7 1 4 2|0 4 6 2 7 5 1 3|0 1 2 3 5 4 7 6|0 2 4 1 7 3 6 5|0 3 1 5 7 2 6 4";
  static char ringRemap[256];
  int id[8], dist[8];
  int i;

  int ngpus;
  HIPCHECK(hipGetDeviceCount(&ngpus));
  // single node CR8G only
  if (ngpus != 8)
    return;

  // validate chordal ring and calculate distance: in CR8G every GPU has a
  // one-hop XGMI link to all peers but one, so dist[i] ends up holding the
  // index of the single peer that GPU i is not directly connected to
  for (i = 0; i < ngpus; i++) {
    int sum = 0;
    for (int j = 0; j < ngpus; j++) {
      if (i == j) continue;
      uint32_t linktype, hops;
      if (hipExtGetLinkTypeAndHopCount(i, j, &linktype, &hops) != hipSuccess)
        return;
      if (linktype != 4 || hops != 1)
        sum += j;
    }
    if (sum > ngpus-1) {
      return;
    }
    dist[i] = sum;
  }

  // remap GPU ids: in the canonical numbering used by ringBase, nodes c and
  // 7-c are the unconnected pair, so assign each detected pair (i, dist[i])
  // to a pair of canonical slots (c, 7-c)
  bool used[8] = {};
  int c = 0;
  for (i = 0; i < ngpus; i++) {
    if (used[i]) continue;
    id[c] = i;
    id[(ngpus-1)-c] = dist[i];
    used[i] = used[dist[i]] = true;
    c++;
  }
  for (i = 0; i < (int)strlen(ringBase); i++) {
    if (ringBase[i] >= '0' && ringBase[i] <= '9')
      ringRemap[i] = id[ringBase[i]-'0']+'0';
    else
      ringRemap[i] = ringBase[i];
  }
  ringRemap[i] = 0;
  *str = ringRemap;
  return;
}

static void printRing(int id, int *ring, int deviceCnt) {
  printf("Ring %d: ", id);
  for (int i = 0; i < deviceCnt; i++)
    printf("%1d ", ring[i]);
  printf("\n");
}

static void findConnect(uint32_t *info, int *ring, int deviceCnt) {
  int n = 0, curr = 0, best = 0;
  uint32_t temp[MAX_GPU*MAX_GPU];
  for (int i = 0; i < deviceCnt*deviceCnt; i++)
    temp[i] = 0;
  for (int i = 0; i < deviceCnt; i++) {
    // mark curr visited so it cannot be chosen again
    for (int j = 0; j < deviceCnt; j++)
      temp[j*deviceCnt+curr] = 1;
    ring[n] = curr;
    n++;
    // greedily pick the unvisited peer with the fewest hops from curr
    int hops = 99;
    for (int j = 0; j < deviceCnt; j++) {
      if (temp[curr*deviceCnt+j]) continue;
      if ((int)info[curr*deviceCnt+j] < hops) {
        best = j;
        hops = info[curr*deviceCnt+j];
      }
    }
    curr = best;
  }
}

static int findNextGpu(int *ring, int gpu, int deviceCnt) {
  int i;
  for (i = 0; i < deviceCnt; i++)
    if (ring[i] == gpu) break;
  return ring[(i+1)%deviceCnt];
}

static void setupRings(uint32_t *info, int *ring_0, int *ring_1) {
  int deviceCnt;
  HIPCHECK(hipGetDeviceCount(&deviceCnt));
  printf("Connection matrix:\n");
  for (int i = 0; i < deviceCnt; i++) {
    for (int j = 0; j < deviceCnt; j++)
      printf("%2d ", info[i*deviceCnt+j]);
    printf("\n");
  }
  findConnect(info, ring_0, deviceCnt);
  // the second ring is the first ring traversed in the opposite direction
  ring_1[0] = 0;
  for (int i = 1; i < deviceCnt; i++)
    ring_1[i] = ring_0[deviceCnt-i];
}

char* getCmdOption(char **begin, char **end, const std::string &option) {
  char **itr = std::find(begin, end, option);
  if (itr != end && ++itr != end) {
    return *itr;
  }
  return 0;
}

bool cmdOptionExists(char **begin, char **end, const std::string &option) {
  return std::find(begin, end, option) != end;
}

static const char* link_type_name[] = {"HT", "QPI", "PCIE", "IB", "XGMI"};

int main(int argc, char* argv[]) {
  if (cmdOptionExists(argv, argv + argc, "-h")) {
    printf("./rccl_prim_test -w num_workgroups -p copy|localcopy|doublecopy|doublecopylocal|reduce|reducecopy|all -i iterations -n bytes -r \"0 1 2 3|3 2 1 0\"\n");
    exit(0);
  }

  int workgroups = 0;
  char *wg = getCmdOption(argv, argv + argc, "-w");
  if (wg)
    workgroups = atol(wg);
  printf("Benchmarking using %d workgroups\n", workgroups);

  int iters = 10;
  char *it = getCmdOption(argv, argv + argc, "-i");
  if (it)
    iters = atol(it);
  printf("Benchmarking using %d iterations\n", iters);

  uint64_t nBytes = 2097152;
  char *nb = getCmdOption(argv, argv + argc, "-n");
  if (nb)
    nBytes = atol(nb);
  printf("Benchmarking using %ld bytes\n", nBytes);
  uint64_t N = nBytes/sizeof(float);

  int sync = 0;
  char *s = getCmdOption(argv, argv + argc, "-s");
  if (s)
    sync = atol(s);
  if (sync)
    printf("Sync all GPUs before operation\n");

  char *r = getCmdOption(argv, argv + argc, "-r");
  if (r)
    printf("User specified ring topology: %s\n", r);

  const char *ops[] = {"copy", "localcopy", "doublecopy", "doublecopylocal", "reduce", "reducecopy", "read", "all"};
  char *prim = getCmdOption(argv, argv + argc, "-p");
  int op = NUM_OPS, begin_op, end_op;
  if (prim) {
    for (op = 0; op < (int)(sizeof(ops)/sizeof(ops[0])); op++)
      if (!strcmp(prim, ops[op]))
        break;
  }
  if (op < NUM_OPS) {
    begin_op = op;
    end_op = op + 1;
  } else {
    begin_op = 0;
    end_op = NUM_OPS;
    printf("Benchmarking all ops\n");
  }

  int nGpu = 1;
  HIPCHECK(hipGetDeviceCount(&nGpu));

  uint32_t connection_info[MAX_GPU*MAX_GPU];
  // Enable peer access
  bool is_xgmi;
  char *cr8g = 0;
  static const char *ring_4p3l = "0 1 2 3|0 1 3 2|0 2 1 3|0 2 3 1|0 3 1 2|0 3 2 1";
  static const char *ring_8p1h = "0 1 3 2 4 5 7 6|6 7 5 4 2 3 1 0|0 1 5 4 6 7 3 2|2 3 7 6 4 5 1 0";
  static const char *ring_16p1h = "0 1 3 2 6 7 15 14 10 11 9 8 12 13 5 4|0 1 2 3 7 6 13 12 8 9 10 11 15 14 5 4|0 2 3 7 6 14 15 11 10 8 9 13 12 4 5 1|4 5 13 12 8 9 11 10 14 15 7 6 2 3 1 0|4 5 14 15 11 10 9 8 12 13 6 7 3 2 1 0|1 5 4 12 13 9 8 10 11 15 14 6 7 3 2 0";
  static const char *ring_gfx940_8p = "0 1 2 3 4 5 6 7|0 1 2 3 4 5 7 6|0 2 4 1 3 6 5 7|0 2 4 6 1 7 3 5|0 3 1 5 2 7 4 6|0 3 5 1 6 2 7 4|0 4 1 7 3 6 2 5|7 6 5 4 3 2 1 0|6 7 5 4 3 2 1 0|7 5 6 3 1 4 2 0|5 3 7 1 6 4 2 0|6 4 7 2 5 1 3 0|4 7 2 6 1 5 3 0|5 2 6 3 7 1 4 0";
  setupPeers(connection_info, &is_xgmi);
  if (!r) {
    parseChordalRing(&cr8g);
    if (nGpu == 4 && is_xgmi)
      r = (char *)ring_4p3l;
    if (nGpu == 8 && cr8g)
      r = (char *)cr8g;
    if (nGpu == 8 && !cr8g) {
      hipDeviceProp_t prop;
      HIPCHECK(hipGetDeviceProperties(&prop, 0));
      if (prop.gcnArch/10 == 94) {
        r = (char *)ring_gfx940_8p;
        if (!workgroups) workgroups = 28;
      } else {
        r = (char *)ring_8p1h;
        if (!workgroups) workgroups = 16;
      }
    }
    if (nGpu == 16) {
      r = (char *)ring_16p1h;
      if (!workgroups) workgroups = 24;
    }
  }
  if (!workgroups) workgroups = 1;

  // clockwise and counter clockwise rings
  int ring[MAX_WORKGROUPS][MAX_GPU];
  for (int i = 0; i < MAX_WORKGROUPS; i++)
    for (int j = 0; j < MAX_GPU; j++)
      ring[i][j] = 0;
  int num_rings = 0;
  if (r) {
    // parse a ring string such as "0 1 2 3|3 2 1 0"
    int n = 0, j = 0, state = 0;
    do {
      int digit = r[n] - '0';
      if (digit >= 0 && digit <= 9) {
        if (state)
          ring[num_rings][j] = ring[num_rings][j]*10 + digit;
        else {
          ring[num_rings][j] = digit;
          state = 1;
        }
      } else {
        state = 0;
        j++;
        if (r[n] == ' ') continue;
        if (r[n] == '|') {
          num_rings++;
          j = 0;
          continue;
        }
      }
    } while (r[n++] != 0x0);
    num_rings++;
  } else {
    setupRings(connection_info, ring[0], ring[1]);
    num_rings = 2;
  }

  // duplicate rings so every workgroup has one to follow
  for (int i = num_rings; i < MAX_WORKGROUPS; i++) {
    for (int j = 0; j < MAX_GPU; j++)
      ring[i][j] = ring[i%num_rings][j];
  }
  for (int i = 0; i < num_rings; i++)
    printRing(i, ring[i], nGpu);

  // per-GPU streams, test buffers and profiling buffers; buff and buff_fine
  // are fine-grained so a remote GPU can write them directly, buff_coarse is
  // ordinary (coarse-grained) device memory; each buffer holds 2*N floats:
  // the first N are the local source, the last N the write destination
  float *buff[MAX_GPU*MAX_WORKGROUPS];
  float *buff_coarse[MAX_GPU*MAX_WORKGROUPS];
  float *buff_fine[MAX_GPU*MAX_WORKGROUPS];
  hipStream_t stream[MAX_GPU];
  struct transfer_data_t h_transfer_data[MAX_GPU];
  struct transfer_data_t *transfer_data[MAX_GPU];
  struct profiling_data_t *profiling_data[MAX_GPU];
  struct profiling_data_t *d_profiling_data[MAX_GPU];

  // remOpCount lives in coherent pinned host memory so every GPU can
  // atomically update it and poll the other GPUs' counters
  uint64_t *remOpCount, *d_remOpCount;
  HIPCHECK(hipHostMalloc((void**)&remOpCount, sizeof(uint64_t)*MAX_GPU, hipHostMallocCoherent));
  HIPCHECK(hipHostGetDevicePointer((void**)&d_remOpCount, remOpCount, 0));
  for (int i = 0; i < MAX_GPU; i++)
    remOpCount[i] = 0;

  for (int i = 0; i < nGpu; i++) {
    HIPCHECK(hipSetDevice(i));
    HIPCHECK(hipStreamCreate(&stream[i]));
    HIPCHECK(hipMalloc((void**)&transfer_data[i], sizeof(struct transfer_data_t)));
    // at least 4 entries so the warm-up launches below stay in bounds
    int prof_entries = iters > 4 ? iters : 4;
    HIPCHECK(hipMalloc((void**)&d_profiling_data[i], sizeof(struct profiling_data_t)*prof_entries));
    profiling_data[i] = (struct profiling_data_t *)calloc(prof_entries, sizeof(struct profiling_data_t));
    for (int j = 0; j < workgroups; j++) {
      HIPCHECK(hipExtMallocWithFlags((void**)&buff[i*MAX_WORKGROUPS+j], 2*N*sizeof(float), hipDeviceMallocFinegrained));
      HIPCHECK(hipExtMallocWithFlags((void**)&buff_fine[i*MAX_WORKGROUPS+j], 2*N*sizeof(float), hipDeviceMallocFinegrained));
      HIPCHECK(hipMalloc((void**)&buff_coarse[i*MAX_WORKGROUPS+j], 2*N*sizeof(float)));
      // grid size is arbitrary here: initTestDataKernel uses a grid-stride loop
      hipLaunchKernelGGL(initTestDataKernel, dim3(32, 1, 1), dim3(THREADS, 1, 1), 0, stream[i], buff[i*MAX_WORKGROUPS+j], 2*N, i);
      hipLaunchKernelGGL(initTestDataKernel, dim3(32, 1, 1), dim3(THREADS, 1, 1), 0, stream[i], buff_fine[i*MAX_WORKGROUPS+j], 2*N, i);
      hipLaunchKernelGGL(initTestDataKernel, dim3(32, 1, 1), dim3(THREADS, 1, 1), 0, stream[i], buff_coarse[i*MAX_WORKGROUPS+j], 2*N, i);
    }
    HIPCHECK(hipStreamSynchronize(stream[i]));
  }

  for (int i = 0; i < nGpu; i++) {
    for (int j = 0; j < workgroups; j++) {
      int next_gpu = findNextGpu(ring[j], i, nGpu);
      printf("GPU %d Ring %d -> Next GPU %d\n", i, j, next_gpu);
      h_transfer_data[i].dest0[j] = buff[next_gpu*MAX_WORKGROUPS+j] + N;
      h_transfer_data[i].dest1[j] = buff_coarse[i*MAX_WORKGROUPS+j] + N;
      h_transfer_data[i].dest2[j] = buff_fine[i*MAX_WORKGROUPS+j]; // additional local fine grain
      h_transfer_data[i].src0[j] = buff[i*MAX_WORKGROUPS+j];
      h_transfer_data[i].src1[j] = buff_coarse[i*MAX_WORKGROUPS+j];
    }
    h_transfer_data[i].N = N;
    h_transfer_data[i].gpu = i;
    h_transfer_data[i].ngpu = nGpu;
    h_transfer_data[i].remOpCount = d_remOpCount;
  }

  for (int i = 0; i < nGpu; i++) {
    HIPCHECK(hipSetDevice(i));
    HIPCHECK(hipMemcpyAsync(transfer_data[i], &h_transfer_data[i], sizeof(struct transfer_data_t), hipMemcpyHostToDevice, stream[i]));
    HIPCHECK(hipStreamSynchronize(stream[i]));
  }

  void *args[MAX_GPU*3];
  hipLaunchParams *launchParamsList = reinterpret_cast<hipLaunchParams *>(malloc(sizeof(hipLaunchParams)*MAX_GPU));

  uint64_t opCount = workgroups;
  for (int op = begin_op; op < end_op; op++) {
    const char *OpsName[] = {"Copy", "Local Copy", "Double Copy", "Double Copy Local", "Reduce", "ReduceCopy", "Read"};
    printf("\n[Testing %s]: \n", OpsName[op]);

    // 4 warm up cycles
    for (int j = 0; j < 4; j++) {
      for (int i = 0; i < nGpu; i++) {
#if 0
        args[i*3]   = &transfer_data[i];
        args[i*3+1] = &d_profiling_data[i];
        args[i*3+2] = &opCount;
        launchParamsList[i].func      = reinterpret_cast<void *>(flagSyncKerns[op*2 + sync]);
        launchParamsList[i].gridDim   = dim3(workgroups, 1, 1);
        launchParamsList[i].blockDim  = dim3(THREADS, 1, 1);
        launchParamsList[i].sharedMem = 0;
        launchParamsList[i].stream    = stream[i];
        launchParamsList[i].args      = args + i*3;
      }
      hipExtLaunchMultiKernelMultiDevice(launchParamsList, nGpu,
          hipCooperativeLaunchMultiDeviceNoPreSync|hipCooperativeLaunchMultiDeviceNoPostSync);
#else
        HIPCHECK(hipSetDevice(i));
        //launch the kernel
        hipLaunchKernelGGL(flagSyncKerns[op*2 + sync],
                           /*grid dim x,y,z*/ dim3(workgroups, 1, 1),
                           /*block dim x,y,z*/ dim3(THREADS, 1, 1),
                           /*dynamic shared mem*/ 0,
                           /*stream*/ stream[i],
                           /*kernel args*/ transfer_data[i], d_profiling_data[i]+j, opCount);
      }
#endif
      opCount += workgroups;
    }

    for (int i = 0; i < nGpu; i++) {
      HIPCHECK(hipSetDevice(i));
      HIPCHECK(hipMemsetAsync(d_profiling_data[i], 0, sizeof(struct profiling_data_t)*iters, stream[i]));
      HIPCHECK(hipStreamSynchronize(stream[i]));
    }

    auto start = std::chrono::high_resolution_clock::now();
    for (int j = 0; j < iters; j++) {
      for (int i = 0; i < nGpu; i++) {
#if 0
        args[i*3]   = &transfer_data[i];
        args[i*3+1] = &d_profiling_data[i];
        args[i*3+2] = &opCount;
        launchParamsList[i].func      = reinterpret_cast<void *>(flagSyncKerns[op*2 + sync]);
        launchParamsList[i].gridDim   = dim3(workgroups, 1, 1);
        launchParamsList[i].blockDim  = dim3(THREADS, 1, 1);
        launchParamsList[i].sharedMem = 0;
        launchParamsList[i].stream    = stream[i];
        launchParamsList[i].args      = args + i*3;
      }
      hipExtLaunchMultiKernelMultiDevice(launchParamsList, nGpu,
          hipCooperativeLaunchMultiDeviceNoPreSync|hipCooperativeLaunchMultiDeviceNoPostSync);
#else
        HIPCHECK(hipSetDevice(i));
        //launch the kernel
        hipLaunchKernelGGL(flagSyncKerns[op*2 + sync],
                           /*grid dim x,y,z*/ dim3(workgroups, 1, 1),
                           /*block dim x,y,z*/ dim3(THREADS, 1, 1),
                           /*dynamic shared mem*/ 0,
                           /*stream*/ stream[i],
                           /*kernel args*/ transfer_data[i], d_profiling_data[i]+j, opCount);
      }
#endif
      opCount += workgroups;
    }
    for (int i = 0; i < nGpu; i++) {
      HIPCHECK(hipSetDevice(i));
      HIPCHECK(hipStreamSynchronize(stream[i]));
    }
    auto delta = std::chrono::high_resolution_clock::now() - start;
    double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count();

    std::cout << BOLD(FBLU("[GPU to GPU Transfer Profiling Data]")) << std::endl;
    print_table_header();
    // the device realtime counter ticks at 25 MHz on Vega-class GPUs
    double vega_gpu_rtc_freq = 2.5E7;
    for (int i = 0; i < nGpu; i++) {
      HIPCHECK(hipSetDevice(i));
      HIPCHECK(hipMemcpy(profiling_data[i], d_profiling_data[i], sizeof(struct profiling_data_t)*iters, hipMemcpyDeviceToHost));
      uint64_t bytes_transferred = 0;
      uint64_t max_write_cycle = 0;
      double mean_write_cycle = 0;
      double bw_std_dev = 0;
      for (int j = 0; j < workgroups; j++) {
        int next_gpu = findNextGpu(ring[j], i, nGpu);
        uint32_t linktype, hops;
        HIPCHECK(hipExtGetLinkTypeAndHopCount(i, next_gpu, &linktype, &hops));
        double iter_total_write_cycle = 0;
        uint64_t iter_bytes_transferred = 0;
        double iter_bw_std_dev = 0;
        for (int k = 0; k < iters; k++) {
          iter_total_write_cycle = iter_total_write_cycle + (profiling_data[i]+k)->write_cycles[j];
          iter_bytes_transferred = iter_bytes_transferred + (profiling_data[i]+k)->bytes_transferred[j];
        }
        bytes_transferred += iter_bytes_transferred;
        double t1 = iter_total_write_cycle/vega_gpu_rtc_freq;
        max_write_cycle = std::max(max_write_cycle, (uint64_t)iter_total_write_cycle);
        mean_write_cycle = mean_write_cycle + iter_total_write_cycle;
        // per-workgroup standard deviation of throughput across iterations
        for (int k = 0; k < iters; k++) {
          double t0 = (double)(profiling_data[i]+k)->write_cycles[j]/vega_gpu_rtc_freq;
          iter_bw_std_dev += std::pow((double)(profiling_data[i]+k)->bytes_transferred[j]/(t0*1.0E9)
                                      - (double)(profiling_data[i]+k)->bytes_transferred[j]*iters/(iter_total_write_cycle*1.0E9/vega_gpu_rtc_freq), 2);
        }
        iter_bw_std_dev = std::sqrt(iter_bw_std_dev/iters);
        //store bytes_transferred and write_cycles from all iters into the first iter entry
        profiling_data[i]->write_cycles[j] = (uint64_t)iter_total_write_cycle;
        profiling_data[i]->bytes_transferred[j] = iter_bytes_transferred;
        fprintf(stderr, "%-20d %-d->%-10d %-13d %-13s %-13.3f %-20lu %-8.2f %.3f\n", i, i,
                next_gpu, j, link_type_name[linktype], t1*1000, iter_bytes_transferred,
                (double)(iter_bytes_transferred)/(t1*1.0E9), iter_bw_std_dev);
      }

      //calculate stddev of throughput across workgroups (rings)
      mean_write_cycle /= workgroups;
      for (int j = 0; j < workgroups; j++) {
        double t0 = (double)profiling_data[i]->write_cycles[j]/vega_gpu_rtc_freq;
        bw_std_dev += std::pow((double)profiling_data[i]->bytes_transferred[j]/(t0*1.0E9)
                               - (double)profiling_data[i]->bytes_transferred[j]/(mean_write_cycle*1.0E9/vega_gpu_rtc_freq), 2);
      }
      bw_std_dev = std::sqrt(bw_std_dev/workgroups);

      print_table_summary_line();
      double total = (double)max_write_cycle/vega_gpu_rtc_freq;
      fprintf(stderr, " Workgroups throughput standard deviation %-20.3f %-13.3f %-20lu %-.2f\n",
              bw_std_dev, total*1000, bytes_transferred, (double)bytes_transferred/(total*1.0E9));
      print_table_summary_line();
#ifdef PRINT_GPU0_ONLY
      break;
#endif
    }

    std::cout << BOLD(FBLU("[Application Level Transfer Profiling Data]")) << std::endl;
    print_table_header();
    // every GPU drives `workgroups` rings moving the same amount of data, and
    // entry 0 now holds the per-workgroup total across all iterations
    uint64_t total_bytes_transferred = (uint64_t)nGpu * profiling_data[0]->bytes_transferred[0] * workgroups;
    print_table_summary_line();
    fprintf(stderr, " %-61s %-13.3f %-20lu %-.2f\n", "Total",
            deltaSec*1000, total_bytes_transferred, (double)total_bytes_transferred/(deltaSec*1.0E9));
    print_table_summary_line();
  }

  for (int i = 0; i < nGpu; i++) {
    HIPCHECK(hipStreamDestroy(stream[i]));
    HIPCHECK(hipFree((void*) transfer_data[i]));
    for (int j = 0; j < workgroups; j++) {
      HIPCHECK(hipFree((void*) buff[i*MAX_WORKGROUPS+j]));
      HIPCHECK(hipFree((void*) buff_coarse[i*MAX_WORKGROUPS+j]));
      HIPCHECK(hipFree((void*) buff_fine[i*MAX_WORKGROUPS+j]));
    }
    HIPCHECK(hipFree((void*) d_profiling_data[i]));
    free(profiling_data[i]);
  }
  free(launchParamsList);

  printf("opCount: ");
  for (int i = 0; i < nGpu; i++)
    printf("%ld ", remOpCount[i]);
  printf("\n");
  HIPCHECK(hipHostFree((void*)remOpCount));

  return 0;
}
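
/*
 * Usage sketch (flag values below are illustrative, derived from the -h text
 * and option parsing above, not a prescribed configuration):
 *
 *   ./rccl_prim_test -p copy -w 16 -i 100 -n 2097152 -s 1
 *
 * benchmarks the remote-copy primitive with 16 workgroups per GPU, 100 timed
 * iterations and 2 MB transferred per workgroup per iteration, synchronizing
 * all GPUs before each operation. Omit -p (or pass "all") to run every
 * primitive; pass -r "0 1 2 3|3 2 1 0" to override the detected ring topology.
 */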