/* -------------------------------------------------------------------------- * * OpenMM * * -------------------------------------------------------------------------- * * This is part of the OpenMM molecular simulation toolkit. * * See https://openmm.org/development. * * * * Portions copyright (c) 2009-2025 Stanford University and the Authors. * * Authors: Peter Eastman * * Contributors: * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU Lesser General Public License as published * * by the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU Lesser General Public License for more details. * * * * You should have received a copy of the GNU Lesser General Public License * * along with this program. If not, see <http://www.gnu.org/licenses/>. 
* * -------------------------------------------------------------------------- */

#include "OpenCLFFT3D.h"
#include "OpenCLContext.h"
#include "OpenCLExpressionUtilities.h"
#include "OpenCLKernelSources.h"
#include "SimTKOpenMMRealType.h"
// NOTE(review): the four directives below have lost their header names
// (probably angle-bracket standard headers removed by a text-extraction step).
// The code in this file uses map, string, stringstream and min/max, so
// <map>, <string>, <sstream> and <algorithm> are likely candidates - confirm
// against the upstream file before building.
#include
#include
#include
#include

using namespace OpenMM;
using namespace std;

#ifdef USE_VKFFT

/**
 * Construct an FFT object backed by VkFFT.
 *
 * Fills in a VkFFTConfiguration for a 3-dimensional transform and initializes
 * the VkFFT application stored in the member "app".  Note that the sizes are
 * handed to VkFFT in reverse order: size[0] is zsize and size[2] is xsize.
 *
 * @param context        the OpenCL context supplying the device, the cl context,
 *                       and the precision setting
 * @param xsize          grid size along x
 * @param ysize          grid size along y
 * @param zsize          grid size along z
 * @param realToComplex  forwarded to VkFFT as config.performR2C
 * @throws OpenMMException if initializeVkFFT() does not return VKFFT_SUCCESS
 */
OpenCLFFT3D::OpenCLFFT3D(OpenCLContext& context, int xsize, int ysize, int zsize, bool realToComplex) : context(context), xsize(xsize), ysize(ysize), zsize(zsize) {
    app = {};
    VkFFTConfiguration config = {};
    config.FFTdim = 3;
    config.size[0] = zsize;
    config.size[1] = ysize;
    config.size[2] = xsize;
    config.performR2C = realToComplex;
    config.doublePrecision = context.getUseDoublePrecision();
    config.device = &context.getDevice()();
    config.context = &context.getContext()();
    config.inverseReturnToInputBuffer = true;
    // The input buffer is described explicitly: strides are given in elements
    // for a dense zsize x ysize x xsize layout.
    config.isInputFormatted = 1;
    config.inputBufferStride[0] = zsize;
    config.inputBufferStride[1] = ysize*zsize;
    config.inputBufferStride[2] = xsize*ysize*zsize;
    // NOTE(review): both getInfo() calls below appear to have lost their template
    // arguments.  Judging by how the results are used (constructing a cl::Platform,
    // then a vendor string) these were presumably CL_DEVICE_PLATFORM and
    // CL_PLATFORM_VENDOR - confirm.
    cl::Platform platform(context.getDevice().getInfo());
    string platformVendor = platform.getInfo();
    if (platformVendor.size() >= 5 && platformVendor.substr(0, 5) == "Intel") {
        // Intel's OpenCL uses low accuracy trig functions, so tell VkFFT to use lookup tables instead.
        config.useLUT = 1;
    }
    VkFFTResult result = initializeVkFFT(&app, config);
    if (result != VKFFT_SUCCESS)
        throw OpenMMException("Error initializing VkFFT: "+context.intToString(result));
}

/**
 * Release the VkFFT application created by the constructor.
 */
OpenCLFFT3D::~OpenCLFFT3D() {
    deleteVkFFT(&app);
}

/**
 * Run the transform through VkFFT on the context's command queue.
 *
 * For a forward transform "in" is bound as VkFFT's inputBuffer and "out" as its
 * buffer.  For an inverse transform the two roles are swapped: "out" becomes
 * inputBuffer and "in" becomes buffer.
 *
 * @param in       array wrapping an OpenCL buffer
 * @param out      array wrapping an OpenCL buffer
 * @param forward  true passes direction -1 to VkFFTAppend, false passes 1
 * @throws OpenMMException if VkFFTAppend() does not return VKFFT_SUCCESS
 */
void OpenCLFFT3D::execFFT(ArrayInterface& in, ArrayInterface& out, bool forward) {
    VkFFTLaunchParams params = {};
    if (forward) {
        params.inputBuffer = &context.unwrap(in).getDeviceBuffer()();
        params.buffer = &context.unwrap(out).getDeviceBuffer()();
    }
    else {
        params.inputBuffer = &context.unwrap(out).getDeviceBuffer()();
        params.buffer = &context.unwrap(in).getDeviceBuffer()();
    }
    params.commandQueue = &context.getQueue()();
    // NOTE(review): the last argument below looks like a mis-encoded "&params"
    // (the "&para" prefix turned into a pilcrow character) - confirm and restore.
    VkFFTResult result = VkFFTAppend(&app, forward ? -1 : 1, ¶ms);
    if (result != VKFFT_SUCCESS)
        throw OpenMMException("Error executing VkFFT: "+context.intToString(result));
}

#else

/**
 * Construct an FFT object that uses kernels generated by createKernel().
 *
 * When realToComplex is set and at least one axis has an even size, the real
 * data is packed into a complex grid with that axis halved (x is tried first,
 * then y, then z), and four pack/unpack kernels are built from
 * OpenCLKernelSources::fftR2C.  If no axis is even, packing is disabled and the
 * FFT kernels are instead created with inputIsReal set.
 *
 * Six FFT kernels are always created: one per axis for each direction.
 *
 * @param context        the OpenCL context used to compile and run kernels
 * @param xsize          grid size along x
 * @param ysize          grid size along y
 * @param zsize          grid size along z
 * @param realToComplex  whether the transform is real-to-complex
 */
OpenCLFFT3D::OpenCLFFT3D(OpenCLContext& context, int xsize, int ysize, int zsize, bool realToComplex) : context(context), xsize(xsize), ysize(ysize), zsize(zsize) {
    packRealAsComplex = false;
    int packedXSize = xsize;
    int packedYSize = ysize;
    int packedZSize = zsize;
    if (realToComplex) {
        // If any axis size is even, we can pack the real values into a complex grid that is only half as large.
        // Look for an appropriate axis.
        packRealAsComplex = true;
        // packedAxis and bufferSize are only assigned when an even axis is found;
        // they are only read inside the packRealAsComplex branch below.
        int packedAxis, bufferSize;
        if (xsize%2 == 0) {
            packedAxis = 0;
            packedXSize /= 2;
            bufferSize = packedXSize;
        }
        else if (ysize%2 == 0) {
            packedAxis = 1;
            packedYSize /= 2;
            bufferSize = packedYSize;
        }
        else if (zsize%2 == 0) {
            packedAxis = 2;
            packedZSize /= 2;
            bufferSize = packedZSize;
        }
        else
            packRealAsComplex = false;
        if (packRealAsComplex) {
            // Build the kernels for packing and unpacking the data.
            // NOTE(review): "map defines" appears to have lost its template arguments;
            // the usage (string keys, string values) suggests map<string, string> - confirm.
            map defines;
            defines["XSIZE"] = context.intToString(xsize);
            defines["YSIZE"] = context.intToString(ysize);
            defines["ZSIZE"] = context.intToString(zsize);
            defines["PACKED_AXIS"] = context.intToString(packedAxis);
            defines["PACKED_XSIZE"] = context.intToString(packedXSize);
            defines["PACKED_YSIZE"] = context.intToString(packedYSize);
            defines["PACKED_ZSIZE"] = context.intToString(packedZSize);
            defines["M_PI"] = context.doubleToString(M_PI);
            cl::Program program = context.createProgram(OpenCLKernelSources::fftR2C, defines);
            packForwardKernel = cl::Kernel(program, "packForwardData");
            unpackForwardKernel = cl::Kernel(program, "unpackForwardData");
            // Argument 2 is given as a size with no host pointer: one complex value
            // (double2 or float2, depending on precision) per element of the packed axis.
            unpackForwardKernel.setArg(2, bufferSize*(context.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2)), NULL);
            packBackwardKernel = cl::Kernel(program, "packBackwardData");
            packBackwardKernel.setArg(2, bufferSize*(context.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2)), NULL);
            unpackBackwardKernel = cl::Kernel(program, "unpackBackwardData");
        }
    }
    // The FFT kernels only see real input when packing was requested but impossible.
    bool inputIsReal = (realToComplex && !packRealAsComplex);
    // Each kernel transforms along the size passed as createKernel's third argument,
    // so the three sizes are rotated for each axis.
    zkernel = createKernel(packedXSize, packedYSize, packedZSize, zthreads, 0, true, inputIsReal);
    xkernel = createKernel(packedYSize, packedZSize, packedXSize, xthreads, 1, true, inputIsReal);
    ykernel = createKernel(packedZSize, packedXSize, packedYSize, ythreads, 2, true, inputIsReal);
    invzkernel = createKernel(packedXSize, packedYSize, packedZSize, zthreads, 0, false, inputIsReal);
    invxkernel = createKernel(packedYSize, packedZSize, packedXSize, xthreads, 1, false, inputIsReal);
    invykernel = createKernel(packedZSize, packedXSize, packedYSize, ythreads, 2, false, inputIsReal);
}

/**
 * Run the transform with the generated kernels.
 *
 * Three kernels are executed in the order z, x, y (or their inverse
 * counterparts), alternating which buffer is passed as argument 0 and which as
 * argument 1.  When the data is packed, a pack kernel runs first and an unpack
 * kernel last, and every launch uses half the grid size.
 *
 * NOTE(review): in both paths the "in" buffer is passed as argument 1 of at
 * least one kernel.  Presumably argument 1 is the kernel's output, which would
 * mean the contents of "in" are overwritten - confirm against the kernel source.
 *
 * @param in       array wrapping the source OpenCL buffer
 * @param out      array wrapping the destination OpenCL buffer
 * @param forward  selects the forward kernels when true, the inverse ones otherwise
 */
void OpenCLFFT3D::execFFT(ArrayInterface& in, ArrayInterface& out, bool forward) {
    OpenCLArray& in2 = context.unwrap(in);
    OpenCLArray& out2 = context.unwrap(out);
    cl::Kernel kernel1 = (forward ? zkernel : invzkernel);
    cl::Kernel kernel2 = (forward ? xkernel : invxkernel);
    cl::Kernel kernel3 = (forward ? ykernel : invykernel);
    if (packRealAsComplex) {
        cl::Kernel packKernel = (forward ? packForwardKernel : packBackwardKernel);
        cl::Kernel unpackKernel = (forward ? unpackForwardKernel : unpackBackwardKernel);
        int gridSize = xsize*ysize*zsize/2;

        // Pack the data into a half sized grid.
        packKernel.setArg(0, in2.getDeviceBuffer());
        packKernel.setArg(1, out2.getDeviceBuffer());
        context.executeKernel(packKernel, gridSize);

        // Perform the FFT.
        kernel1.setArg(0, out2.getDeviceBuffer());
        kernel1.setArg(1, in2.getDeviceBuffer());
        context.executeKernel(kernel1, gridSize, zthreads);
        kernel2.setArg(0, in2.getDeviceBuffer());
        kernel2.setArg(1, out2.getDeviceBuffer());
        context.executeKernel(kernel2, gridSize, xthreads);
        kernel3.setArg(0, out2.getDeviceBuffer());
        kernel3.setArg(1, in2.getDeviceBuffer());
        context.executeKernel(kernel3, gridSize, ythreads);

        // Unpack the data.
        unpackKernel.setArg(0, in2.getDeviceBuffer());
        unpackKernel.setArg(1, out2.getDeviceBuffer());
        context.executeKernel(unpackKernel, gridSize);
    }
    else {
        // Unpacked path: in -> out -> in -> out over the full grid.
        kernel1.setArg(0, in2.getDeviceBuffer());
        kernel1.setArg(1, out2.getDeviceBuffer());
        context.executeKernel(kernel1, xsize*ysize*zsize, zthreads);
        kernel2.setArg(0, out2.getDeviceBuffer());
        kernel2.setArg(1, in2.getDeviceBuffer());
        context.executeKernel(kernel2, xsize*ysize*zsize, xthreads);
        kernel3.setArg(0, in2.getDeviceBuffer());
        kernel3.setArg(1, out2.getDeviceBuffer());
        context.executeKernel(kernel3, xsize*ysize*zsize, ythreads);
    }
}

/**
 * Generate and compile the "execFFT" kernel for one axis and one direction.
 *
 * The transform length is zsize (this function's third argument).  It is
 * factored into radix 7, 5, 4, 3 and 2 passes, with source code emitted for each
 * pass and substituted into OpenCLKernelSources::fft.  If the compiled kernel
 * cannot run with the chosen work-group size, the limit is lowered to the
 * kernel's own maximum and the kernel is generated again.
 *
 * NOTE(review): part of this function is missing from the text below (see the
 * marker inside the factoring loop), so this description covers only what is
 * visible.
 *
 * @param xsize        substituted into the kernel source as XSIZE
 * @param ysize        substituted into the kernel source as YSIZE
 * @param zsize        the length being transformed; substituted as ZSIZE
 * @param threads      on return, the work-group size to launch with: 1 on a CPU
 *                     device, otherwise blocksPerGroup*zsize
 * @param axis         0, 1 or 2; only axis 0 is treated as having real or packed input
 * @param forward      true for the forward transform, false for the inverse
 * @param inputIsReal  whether the unpacked real-to-complex path is in use
 * @return the compiled kernel, with arguments 2-4 already set
 * @throws OpenMMException if zsize has a factor other than 2, 3, 5 or 7
 */
cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads, int axis, bool forward, bool inputIsReal) {
    // NOTE(review): the getInfo() calls in this function appear to have lost their
    // template arguments; from the usage they were presumably
    // CL_DEVICE_MAX_WORK_GROUP_SIZE and CL_DEVICE_TYPE - confirm.
    int maxThreads = min(256, (int) context.getDevice().getInfo());
    // Shrink the limit in steps of 64, but not below 128, for as long as it still
    // covers at least zsize threads.
    while (maxThreads > 128 && maxThreads-64 >= zsize)
        maxThreads -= 64;
    bool isCPU = context.getDevice().getInfo() == CL_DEVICE_TYPE_CPU;
    while (true) {
        // A loop inside the kernel is needed when one work-group cannot assign a
        // thread per element, and is always used on CPU devices.
        bool loopRequired = (zsize > maxThreads || isCPU);
        stringstream source;
        int blocksPerGroup = (loopRequired ? 1 : max(1, maxThreads/zsize));
        int stage = 0;
        int L = zsize;
        int m = 1;

        // Factor zsize, generating an appropriate block of code for each factor.
        while (L > 1) {
            // Passes alternate between two buffers, selected by the parity of the stage.
            int input = stage%2;
            int output = 1-input;
            int radix;
            if (L%7 == 0)
                radix = 7;
            else if (L%5 == 0)
                radix = 5;
            else if (L%4 == 0)
                radix = 4;
            else if (L%3 == 0)
                radix = 3;
            else if (L%2 == 0)
                radix = 2;
            else
                throw OpenMMException("Illegal size for FFT: "+context.intToString(zsize));
            source<<"{\n";
            L = L/radix;
            // NOTE(review): the statement below is damaged.  Everything between
            // "(radix " and "replacements;" is missing from this copy of the file:
            // the rest of the per-radix code generation, the closing brace of the
            // factoring loop, the definitions of outputIsReal and outputIsPacked,
            // and the declaration type of "replacements".  Restore from upstream.
            source<<"// Pass "<<(stage+1)<<" (radix "< replacements;
        replacements["XSIZE"] = context.intToString(xsize);
        replacements["YSIZE"] = context.intToString(ysize);
        replacements["ZSIZE"] = context.intToString(zsize);
        replacements["BLOCKS_PER_GROUP"] = context.intToString(blocksPerGroup);
        replacements["M_PI"] = context.doubleToString(M_PI);
        replacements["COMPUTE_FFT"] = source.str();
        replacements["LOOP_REQUIRED"] = (loopRequired ? "1" : "0");
        replacements["SIGN"] = (forward ? "1" : "-1");
        // Real input is only read by the first (axis 0) forward pass; packed input
        // is only read by the axis 0 inverse pass.
        replacements["INPUT_TYPE"] = (inputIsReal && axis == 0 && forward ? "real" : "real2");
        replacements["OUTPUT_TYPE"] = (outputIsReal ? "real" : "real2");
        replacements["INPUT_IS_REAL"] = (inputIsReal && axis == 0 && forward ? "1" : "0");
        replacements["INPUT_IS_PACKED"] = (inputIsReal && axis == 0 && !forward ? "1" : "0");
        replacements["OUTPUT_IS_PACKED"] = (outputIsPacked ? "1" : "0");
        cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::fft, replacements));
        cl::Kernel kernel(program, "execFFT");
        threads = (isCPU ? 1 : blocksPerGroup*zsize);
        // NOTE(review): getWorkGroupInfo() appears to have lost its template argument,
        // presumably CL_KERNEL_WORK_GROUP_SIZE - confirm.
        int kernelMaxThreads = kernel.getWorkGroupInfo(context.getDevice());
        if (threads > kernelMaxThreads) {
            // The device can't handle this block size, so reduce it.
            maxThreads = kernelMaxThreads;
            continue;
        }
        // Arguments 2-4 are given as sizes with no host pointer: one complex value
        // (double2 or float2, depending on precision) per element handled by a group.
        int bufferSize = blocksPerGroup*zsize*(context.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2));
        kernel.setArg(2, bufferSize, NULL);
        kernel.setArg(3, bufferSize, NULL);
        kernel.setArg(4, bufferSize, NULL);
        return kernel;
    }
}

#endif

/**
 * Find the smallest grid size that is at least "minimum" and that the FFT
 * implementation can handle, i.e. one with no prime factor larger than
 * maxFactor (13 when built with VkFFT, 7 otherwise).
 *
 * @param minimum  the smallest acceptable size
 * @return 1 if minimum is less than 1, otherwise the first value >= minimum
 *         that factors completely into integers from 2 to maxFactor
 */
int OpenCLFFT3D::findLegalDimension(int minimum) {
    if (minimum < 1)
        return 1;
#ifdef USE_VKFFT
    const int maxFactor = 13;
#else
    const int maxFactor = 7;
#endif
    while (true) {
        // Attempt to factor the current value.
        int unfactored = minimum;
        // Composite values of "factor" never divide what is left, because their
        // prime factors were already divided out by earlier iterations.
        for (int factor = 2; factor <= maxFactor; factor++) {
            while (unfactored > 1 && unfactored%factor == 0)
                unfactored /= factor;
        }
        if (unfactored == 1)
            return minimum;
        minimum++;
    }
}