Unverified commit 484e6b99, authored by peastman and committed by GitHub
Browse files

Merge pull request #2553 from peastman/index

Fixed incorrect indexing in kernel
parents f37bdaa2 c3895fc5
...@@ -23,7 +23,7 @@ DEVICE int reduceMax(int val, LOCAL_ARG int* temp) { ...@@ -23,7 +23,7 @@ DEVICE int reduceMax(int val, LOCAL_ARG int* temp) {
temp[LOCAL_ID] = val; temp[LOCAL_ID] = val;
SYNC_WARPS; SYNC_WARPS;
for (int offset = 16; offset > 0; offset /= 2) { for (int offset = 16; offset > 0; offset /= 2) {
if (offset < indexInWarp) if (indexInWarp < offset)
temp[LOCAL_ID] = max(temp[LOCAL_ID], temp[LOCAL_ID+offset]); temp[LOCAL_ID] = max(temp[LOCAL_ID], temp[LOCAL_ID+offset]);
SYNC_WARPS; SYNC_WARPS;
} }
......
...@@ -258,7 +258,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking ...@@ -258,7 +258,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
int multiprocessors; int multiprocessors;
CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device)); CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
numThreadBlocks = numThreadBlocksPerComputeUnit*multiprocessors; numThreadBlocks = numThreadBlocksPerComputeUnit*multiprocessors;
if (computeCapability >= 7.0) { if (cudaDriverVersion >= 9000) {
compilationDefines["SYNC_WARPS"] = "__syncwarp();"; compilationDefines["SYNC_WARPS"] = "__syncwarp();";
compilationDefines["SHFL(var, srcLane)"] = "__shfl_sync(0xffffffff, var, srcLane);"; compilationDefines["SHFL(var, srcLane)"] = "__shfl_sync(0xffffffff, var, srcLane);";
compilationDefines["BALLOT(var)"] = "__ballot_sync(0xffffffff, var);"; compilationDefines["BALLOT(var)"] = "__ballot_sync(0xffffffff, var);";
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment