Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
82069311
Commit
82069311
authored
Jun 02, 2015
by
Peter Eastman
Browse files
Cleaned up CUDA FFT code
parent
d3e91b15
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
67 additions
and
89 deletions
+67
-89
platforms/cuda/include/CudaKernels.h
platforms/cuda/include/CudaKernels.h
+1
-1
platforms/cuda/src/CudaFFT3D.cpp
platforms/cuda/src/CudaFFT3D.cpp
+18
-39
platforms/cuda/src/CudaKernels.cpp
platforms/cuda/src/CudaKernels.cpp
+44
-34
platforms/cuda/src/kernels/fft.cu
platforms/cuda/src/kernels/fft.cu
+4
-15
No files found.
platforms/cuda/include/CudaKernels.h
View file @
82069311
...
...
@@ -665,7 +665,7 @@ private:
std
::
vector
<
std
::
pair
<
int
,
int
>
>
exceptionAtoms
;
double
ewaldSelfEnergy
,
dispersionCoefficient
,
alpha
;
int
interpolateForceThreads
;
bool
hasCoulomb
,
hasLJ
,
usePmeStream
;
bool
hasCoulomb
,
hasLJ
,
usePmeStream
,
useCudaFFT
;
static
const
int
PmeOrder
=
5
;
};
...
...
platforms/cuda/src/CudaFFT3D.cpp
View file @
82069311
...
...
@@ -168,14 +168,12 @@ static int getSmallestRadix(int size) {
}
CUfunction
CudaFFT3D
::
createKernel
(
int
xsize
,
int
ysize
,
int
zsize
,
int
&
threads
,
int
axis
,
bool
forward
,
bool
inputIsReal
)
{
int
maxThreads
=
256
;
//std::min(256, (int) context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
int
maxThreads
=
256
;
// while (maxThreads > 128 && maxThreads-64 >= zsize)
// maxThreads -= 64;
int
threadsPerBlock
=
zsize
/
getSmallestRadix
(
zsize
);
bool
isCPU
=
false
;
//context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU;
bool
loopRequired
=
(
threadsPerBlock
>
maxThreads
||
isCPU
);
stringstream
source
;
int
blocksPerGroup
=
(
loopRequired
?
1
:
max
(
1
,
maxThreads
/
threadsPerBlock
)
)
;
int
blocksPerGroup
=
max
(
1
,
maxThreads
/
threadsPerBlock
);
int
stage
=
0
;
int
L
=
zsize
;
int
m
=
1
;
...
...
@@ -201,11 +199,6 @@ CUfunction CudaFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads
source
<<
"{
\n
"
;
L
=
L
/
radix
;
source
<<
"// Pass "
<<
(
stage
+
1
)
<<
" (radix "
<<
radix
<<
")
\n
"
;
if
(
loopRequired
)
{
source
<<
"for (int i = threadIdx.x; i < "
<<
(
L
*
m
)
<<
"; i += blockDim.x) {
\n
"
;
source
<<
"int base = i;
\n
"
;
}
else
{
if
(
L
*
m
<
threadsPerBlock
)
source
<<
"if (threadIdx.x < "
<<
(
blocksPerGroup
*
L
*
m
)
<<
") {
\n
"
;
else
...
...
@@ -213,7 +206,6 @@ CUfunction CudaFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads
source
<<
"int block = threadIdx.x/"
<<
(
L
*
m
)
<<
";
\n
"
;
source
<<
"int i = threadIdx.x-block*"
<<
(
L
*
m
)
<<
";
\n
"
;
source
<<
"int base = i+block*"
<<
zsize
<<
";
\n
"
;
}
source
<<
"int j = i/"
<<
m
<<
";
\n
"
;
if
(
radix
==
7
)
{
source
<<
"real2 c0 = data"
<<
input
<<
"[base];
\n
"
;
...
...
@@ -328,7 +320,6 @@ CUfunction CudaFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads
bool
outputIsReal
=
(
inputIsReal
&&
axis
==
2
&&
!
forward
);
bool
outputIsPacked
=
(
inputIsReal
&&
axis
==
2
&&
forward
);
string
outputSuffix
=
(
outputIsReal
?
".x"
:
""
);
if
(
loopRequired
||
true
)
{
if
(
outputIsPacked
)
source
<<
"if (index < XSIZE*YSIZE && x < XSIZE/2+1)
\n
"
;
else
...
...
@@ -338,17 +329,6 @@ CUfunction CudaFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads
source
<<
"out[y*(ZSIZE*(XSIZE/2+1))+i*(XSIZE/2+1)+x] = data"
<<
(
stage
%
2
)
<<
"[i+block*ZSIZE]"
<<
outputSuffix
<<
";
\n
"
;
else
source
<<
"out[y*(ZSIZE*XSIZE)+i*XSIZE+x] = data"
<<
(
stage
%
2
)
<<
"[i+block*ZSIZE]"
<<
outputSuffix
<<
";
\n
"
;
}
else
{
if
(
outputIsPacked
)
{
source
<<
"if (index < XSIZE*YSIZE && x < XSIZE/2+1)
\n
"
;
source
<<
"out[y*(ZSIZE*(XSIZE/2+1))+(threadIdx.x%ZSIZE)*(XSIZE/2+1)+x] = data"
<<
(
stage
%
2
)
<<
"[threadIdx.x]"
<<
outputSuffix
<<
";
\n
"
;
}
else
{
source
<<
"if (index < XSIZE*YSIZE)
\n
"
;
source
<<
"out[y*(ZSIZE*XSIZE)+(threadIdx.x%ZSIZE)*XSIZE+x] = data"
<<
(
stage
%
2
)
<<
"[threadIdx.x]"
<<
outputSuffix
<<
";
\n
"
;
}
}
map
<
string
,
string
>
replacements
;
replacements
[
"XSIZE"
]
=
context
.
intToString
(
xsize
);
replacements
[
"YSIZE"
]
=
context
.
intToString
(
ysize
);
...
...
@@ -357,7 +337,6 @@ CUfunction CudaFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads
replacements
[
"THREADS_PER_BLOCK"
]
=
context
.
intToString
(
threadsPerBlock
);
replacements
[
"M_PI"
]
=
context
.
doubleToString
(
M_PI
);
replacements
[
"COMPUTE_FFT"
]
=
source
.
str
();
replacements
[
"LOOP_REQUIRED"
]
=
(
loopRequired
?
"1"
:
"0"
);
replacements
[
"SIGN"
]
=
(
forward
?
"1"
:
"-1"
);
replacements
[
"INPUT_TYPE"
]
=
(
inputIsReal
&&
axis
==
0
&&
forward
?
"real"
:
"real2"
);
replacements
[
"OUTPUT_TYPE"
]
=
(
outputIsReal
?
"real"
:
"real2"
);
...
...
@@ -366,6 +345,6 @@ CUfunction CudaFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads
replacements
[
"OUTPUT_IS_PACKED"
]
=
(
outputIsPacked
?
"1"
:
"0"
);
CUmodule
module
=
context
.
createModule
(
CudaKernelSources
::
vectorOps
+
context
.
replaceStrings
(
CudaKernelSources
::
fft
,
replacements
));
CUfunction
kernel
=
context
.
getKernel
(
module
,
"execFFT"
);
threads
=
(
isCPU
?
1
:
blocksPerGroup
*
threadsPerBlock
)
;
threads
=
blocksPerGroup
*
threadsPerBlock
;
return
kernel
;
}
platforms/cuda/src/CudaKernels.cpp
View file @
82069311
...
...
@@ -1502,8 +1502,10 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
if (pmeio != NULL)
delete pmeio;
if (hasInitializedFFT) {
if (useCudaFFT) {
cufftDestroy(fftForward);
cufftDestroy(fftBackward);
}
if (usePmeStream) {
cuStreamDestroy(pmeStream);
cuEventDestroy(pmeSyncEvent);
...
...
@@ -1694,39 +1696,40 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
// Create required data structures.
int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
directPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*gridSizeZ, cu.getComputeCapability() >= 2.0 ? elementSize : sizeof(long long), "originalPmeGrid");
reciprocalPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*(gridSizeZ/2+1), 2*elementSize, "reciprocalPmeGrid");
directPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*gridSizeZ, cu.getComputeCapability() >= 2.0 ? 2*elementSize : 2*sizeof(long long), "originalPmeGrid");
reciprocalPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "reciprocalPmeGrid");
cu.addAutoclearBuffer(*directPmeGrid);
pmeBsplineModuliX = new CudaArray(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
pmeBsplineModuliY = new CudaArray(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
pmeAtomGridIndex = CudaArray::create<int2>(cu, numParticles, "pmeAtomGridIndex");
sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
fft = new CudaFFT3D(cu, gridSizeX, gridSizeY, gridSizeZ, true);
useCudaFFT = false; // We might switch back in the future, once Nvidia has all their bugs worked out
if (useCudaFFT) {
cufftResult result = cufftPlan3d(&fftForward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_D2Z : CUFFT_R2C);
if (result != CUFFT_SUCCESS)
throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
result = cufftPlan3d(&fftBackward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_Z2D : CUFFT_C2R);
if (result != CUFFT_SUCCESS)
throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
cufftSetCompatibilityMode(fftForward, CUFFT_COMPATIBILITY_NATIVE);
cufftSetCompatibilityMode(fftBackward, CUFFT_COMPATIBILITY_NATIVE);
}
else
fft = new CudaFFT3D(cu, gridSizeX, gridSizeY, gridSizeZ, true);
// Prepare for doing PME on its own stream.
int cufftVersion;
cufftGetVersion(&cufftVersion);
usePmeStream = true;//(cu.getComputeCapability() < 5.0 && numParticles < 130000 && cufftVersion >= 6000 && cufftVersion != 7000); // Workarounds for various CUDA bugs
char deviceName[100];
cuDeviceGetName(deviceName, 100, cu.getDevice());
usePmeStream = (string(deviceName) != "GeForce GTX 980"); // Using a separate stream is slower on GTX 980
if (usePmeStream) {
cuStreamCreate(&pmeStream, CU_STREAM_NON_BLOCKING);
if (useCudaFFT) {
cufftSetStream(fftForward, pmeStream);
cufftSetStream(fftBackward, pmeStream);
}
CHECK_RESULT(cuEventCreate(&pmeSyncEvent, CU_EVENT_DISABLE_TIMING), "Error creating event for NonbondedForce");
int recipForceGroup = force.getReciprocalSpaceForceGroup();
if (recipForceGroup < 0)
...
...
@@ -1896,11 +1899,15 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, directPmeGrid->getSize());
}
// if (cu.getUseDoublePrecision())
// cufftExecD2Z(fftForward, (double*) directPmeGrid->getDevicePointer(), (double2*) reciprocalPmeGrid->getDevicePointer());
// else
// cufftExecR2C(fftForward, (float*) directPmeGrid->getDevicePointer(), (float2*) reciprocalPmeGrid->getDevicePointer());
if (useCudaFFT) {
if (cu.getUseDoublePrecision())
cufftExecD2Z(fftForward, (double*) directPmeGrid->getDevicePointer(), (double2*) reciprocalPmeGrid->getDevicePointer());
else
cufftExecR2C(fftForward, (float*) directPmeGrid->getDevicePointer(), (float2*) reciprocalPmeGrid->getDevicePointer());
}
else {
fft->execFFT(*directPmeGrid, *reciprocalPmeGrid, true);
}
if (includeEnergy) {
void* computeEnergyArgs[] = {&reciprocalPmeGrid->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
...
...
@@ -1914,12 +1921,15 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeConvolutionKernel, convolutionArgs, cu.getNumAtoms());
// if (cu.getUseDoublePrecision())
// cufftExecZ2D(fftBackward, (double2*) reciprocalPmeGrid->getDevicePointer(), (double*) directPmeGrid->getDevicePointer());
// else
// cufftExecC2R(fftBackward, (float2*) reciprocalPmeGrid->getDevicePointer(), (float*) directPmeGrid->getDevicePointer());
if (useCudaFFT) {
if (cu.getUseDoublePrecision())
cufftExecZ2D(fftBackward, (double2*) reciprocalPmeGrid->getDevicePointer(), (double*) directPmeGrid->getDevicePointer());
else
cufftExecC2R(fftBackward, (float2*) reciprocalPmeGrid->getDevicePointer(), (float*) directPmeGrid->getDevicePointer());
}
else {
fft->execFFT(*reciprocalPmeGrid, *directPmeGrid, false);
}
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &directPmeGrid->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex->getDevicePointer()};
...
...
platforms/cuda/src/kernels/fft.cu
View file @
82069311
...
...
@@ -35,7 +35,6 @@ extern "C" __global__ void execFFT(const INPUT_TYPE* __restrict__ in, OUTPUT_TYP
#if OUTPUT_IS_PACKED
if
(
x
<
XSIZE
/
2
+
1
)
{
#endif
//#if LOOP_REQUIRED
if
(
index
<
XSIZE
*
YSIZE
)
for
(
int
i
=
threadIdx
.
x
-
block
*
THREADS_PER_BLOCK
;
i
<
ZSIZE
;
i
+=
THREADS_PER_BLOCK
)
#if INPUT_IS_REAL
...
...
@@ -45,16 +44,6 @@ extern "C" __global__ void execFFT(const INPUT_TYPE* __restrict__ in, OUTPUT_TYP
#else
data0
[
i
+
block
*
ZSIZE
]
=
in
[
x
*
(
YSIZE
*
ZSIZE
)
+
y
*
ZSIZE
+
i
];
#endif
//#else
// if (index < XSIZE*YSIZE && (threadIdx.x%BLOCK_SIZE) < ZSIZE)
// #if INPUT_IS_REAL
// data0[threadIdx.x] = make_real2(in[x*(YSIZE*ZSIZE)+y*ZSIZE+threadIdx.x%BLOCK_SIZE], 0);
// #elif INPUT_IS_PACKED
// data0[threadIdx.x] = loadComplexValue(in, x, y, threadIdx.x%BLOCK_SIZE);
// #else
// data0[threadIdx.x] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+threadIdx.x%BLOCK_SIZE];
// #endif
//#endif
#if OUTPUT_IS_PACKED
}
#endif
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment