/* -------------------------------------------------------------------------- * * OpenMM * * -------------------------------------------------------------------------- * * This is part of the OpenMM molecular simulation toolkit originating from * * Simbios, the NIH National Center for Physics-Based Simulation of * * Biological Structures at Stanford, funded under the NIH Roadmap for * * Medical Research, grant U54 GM072970. See https://simtk.org. * * * * Portions copyright (c) 2009-2021 Stanford University and the Authors. * * Authors: Peter Eastman * * Contributors: * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU Lesser General Public License as published * * by the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU Lesser General Public License for more details. * * * * You should have received a copy of the GNU Lesser General Public License * * along with this program. If not, see . * * -------------------------------------------------------------------------- */ #ifdef WIN32 #define _USE_MATH_DEFINES // Needed to get M_PI #endif #include #include "CudaContext.h" #include "CudaArray.h" #include "CudaBondedUtilities.h" #include "CudaEvent.h" #include "CudaIntegrationUtilities.h" #include "CudaKernels.h" #include "CudaKernelSources.h" #include "CudaNonbondedUtilities.h" #include "CudaProgram.h" #include "openmm/common/ComputeArray.h" #include "openmm/common/ContextSelector.h" #include "SHA1.h" #include "openmm/Platform.h" #include "openmm/System.h" #include "openmm/VirtualSite.h" #include "CudaExpressionUtilities.h" #include "openmm/internal/ContextImpl.h" #include #include #include #include #include #include #include #include #include #include #ifndef WIN32 #include #endif #define CHECK_RESULT(result) CHECK_RESULT2(result, errorMessage); #define CHECK_RESULT2(result, prefix) \ if (result != CUDA_SUCCESS) { \ std::stringstream m; \ m< static int executeInWindows(const string &command) { // COMSPEC is an env variable pointing to full dir of cmd.exe // it always defined on pretty much all Windows OSes string fullcommand = getenv("COMSPEC") + string(" /C ") + command; STARTUPINFO si; PROCESS_INFORMATION pi; ZeroMemory( &si, sizeof(si) ); si.cb = sizeof(si); ZeroMemory( &pi, sizeof(pi) ); vector args(std::max(1000, (int) fullcommand.size()+1)); strcpy(&args[0], fullcommand.c_str()); si.dwFlags = STARTF_USESHOWWINDOW; si.wShowWindow = SW_HIDE; if (!CreateProcess(NULL, &args[0], NULL, NULL, FALSE, 0, NULL, NULL, &si, &pi)) { return -1; } WaitForSingleObject(pi.hProcess, INFINITE); DWORD exitCode = -1; if(!GetExitCodeProcess(pi.hProcess, &exitCode)) { throw(OpenMMException("Could not get nvcc.exe's exit code\n")); } else { if(exitCode == 0) return 0; else return -1; } } #endif CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler, const string& tempDir, const std::string& hostCompiler, bool allowRuntimeCompiler, CudaPlatform::PlatformData& platformData, CudaContext* originalContext) : ComputeContext(system), currentStream(0), platformData(platformData), contextIsValid(false), hasAssignedPosqCharges(false), hasCompilerKernel(false), isNvccAvailable(false), pinnedBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL) { // Determine what compiler to use. this->compiler = "\""+compiler+"\""; if (allowRuntimeCompiler && platformData.context != NULL) { try { compilerKernel = platformData.context->getPlatform().createKernel(CudaCompilerKernel::Name(), *platformData.context); hasCompilerKernel = true; } catch (...) { // The runtime compiler plugin isn't available. } } #ifdef WIN32 string testCompilerCommand = this->compiler+" --version > nul 2> nul"; int res = executeInWindows(testCompilerCommand.c_str()); #else string testCompilerCommand = this->compiler+" --version > /dev/null 2> /dev/null"; int res = std::system(testCompilerCommand.c_str()); #endif struct stat info; isNvccAvailable = (res == 0 && stat(tempDir.c_str(), &info) == 0); int cudaDriverVersion; cuDriverGetVersion(&cudaDriverVersion); if (hostCompiler.size() > 0) this->compiler = compiler+" --compiler-bindir "+hostCompiler; if (!hasInitializedCuda) { CHECK_RESULT2(cuInit(0), "Error initializing CUDA"); hasInitializedCuda = true; } if (precision == "single") { useDoublePrecision = false; useMixedPrecision = false; } else if (precision == "mixed") { useDoublePrecision = false; useMixedPrecision = true; } else if (precision == "double") { useDoublePrecision = true; useMixedPrecision = false; } else throw OpenMMException("Illegal value for Precision: "+precision); char* cacheVariable = getenv("OPENMM_CACHE_DIR"); cacheDir = (cacheVariable == NULL ? tempDir : string(cacheVariable)); #ifdef WIN32 this->tempDir = tempDir+"\\"; cacheDir = cacheDir+"\\"; #else this->tempDir = tempDir+"/"; cacheDir = cacheDir+"/"; #endif contextIndex = platformData.contexts.size(); string errorMessage = "Error initializing Context"; if (originalContext == NULL) { isLinkedContext = false; int numDevices; CHECK_RESULT(cuDeviceGetCount(&numDevices)); if (deviceIndex < -1 || deviceIndex >= numDevices) throw OpenMMException("Illegal value for DeviceIndex: "+intToString(deviceIndex)); vector devicePrecedence; if (deviceIndex == -1) { devicePrecedence = getDevicePrecedence(); } else { devicePrecedence.push_back(deviceIndex); } this->deviceIndex = -1; for (int i = 0; i < static_cast(devicePrecedence.size()); i++) { int trialDeviceIndex = devicePrecedence[i]; CHECK_RESULT(cuDeviceGet(&device, trialDeviceIndex)); defaultOptimizationOptions = "--use_fast_math"; unsigned int flags = CU_CTX_MAP_HOST; if (useBlockingSync) flags += CU_CTX_SCHED_BLOCKING_SYNC; else flags += CU_CTX_SCHED_SPIN; if (cuCtxCreate(&context, flags, device) == CUDA_SUCCESS) { this->deviceIndex = trialDeviceIndex; CUcontext popped; cuCtxPopCurrent(&popped); break; } } if (this->deviceIndex == -1) { if (deviceIndex != -1) throw OpenMMException("The requested CUDA device could not be loaded"); else throw OpenMMException("No compatible CUDA device is available"); } } else { isLinkedContext = true; context = originalContext->context; this->deviceIndex = originalContext->deviceIndex; this->device = originalContext->device; } int major, minor; CHECK_RESULT(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device)); CHECK_RESULT(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device)); int numThreadBlocksPerComputeUnit = (major == 6 ? 4 : 6); if (cudaDriverVersion < 7000) { // This is a workaround to support GTX 980 with CUDA 6.5. It reports // its compute capability as 5.2, but the compiler doesn't support // anything beyond 5.0. if (major == 5) minor = 0; } if (cudaDriverVersion < 8000) { // This is a workaround to support Pascal with CUDA 7.5. It reports // its compute capability as 6.x, but the compiler doesn't support // anything beyond 5.3. if (major == 6) { major = 5; minor = 3; } } gpuArchitecture = 10*major+minor; computeCapability = major+0.1*minor; contextIsValid = true; ContextSelector selector(*this); CHECK_RESULT(cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_SHARED)); if (contextIndex > 0) { int canAccess; cuDeviceCanAccessPeer(&canAccess, getDevice(), platformData.contexts[0]->getDevice()); if (canAccess) { { ContextSelector selector2(*platformData.contexts[0]); CHECK_RESULT(cuCtxEnablePeerAccess(getContext(), 0)); } CHECK_RESULT(cuCtxEnablePeerAccess(platformData.contexts[0]->getContext(), 0)); } } numAtoms = system.getNumParticles(); paddedNumAtoms = TileSize*((numAtoms+TileSize-1)/TileSize); numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize; int multiprocessors; CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device)); numThreadBlocks = numThreadBlocksPerComputeUnit*multiprocessors; if (cudaDriverVersion >= 9000) { compilationDefines["SYNC_WARPS"] = "__syncwarp();"; compilationDefines["SHFL(var, srcLane)"] = "__shfl_sync(0xffffffff, var, srcLane);"; compilationDefines["BALLOT(var)"] = "__ballot_sync(0xffffffff, var);"; } else { compilationDefines["SYNC_WARPS"] = ""; compilationDefines["SHFL(var, srcLane)"] = "__shfl(var, srcLane);"; compilationDefines["BALLOT(var)"] = "__ballot(var);"; } if (useDoublePrecision) { posq.initialize(*this, paddedNumAtoms, "posq"); velm.initialize(*this, paddedNumAtoms, "velm"); compilationDefines["USE_DOUBLE_PRECISION"] = "1"; compilationDefines["make_real2"] = "make_double2"; compilationDefines["make_real3"] = "make_double3"; compilationDefines["make_real4"] = "make_double4"; compilationDefines["make_mixed2"] = "make_double2"; compilationDefines["make_mixed3"] = "make_double3"; compilationDefines["make_mixed4"] = "make_double4"; } else if (useMixedPrecision) { posq.initialize(*this, paddedNumAtoms, "posq"); posqCorrection.initialize(*this, paddedNumAtoms, "posqCorrection"); velm.initialize(*this, paddedNumAtoms, "velm"); compilationDefines["USE_MIXED_PRECISION"] = "1"; compilationDefines["make_real2"] = "make_float2"; compilationDefines["make_real3"] = "make_float3"; compilationDefines["make_real4"] = "make_float4"; compilationDefines["make_mixed2"] = "make_double2"; compilationDefines["make_mixed3"] = "make_double3"; compilationDefines["make_mixed4"] = "make_double4"; } else { posq.initialize(*this, paddedNumAtoms, "posq"); velm.initialize(*this, paddedNumAtoms, "velm"); compilationDefines["make_real2"] = "make_float2"; compilationDefines["make_real3"] = "make_float3"; compilationDefines["make_real4"] = "make_float4"; compilationDefines["make_mixed2"] = "make_float2"; compilationDefines["make_mixed3"] = "make_float3"; compilationDefines["make_mixed4"] = "make_float4"; } force.initialize(*this, paddedNumAtoms*3, "force"); posCellOffsets.resize(paddedNumAtoms, mm_int4(0, 0, 0, 0)); atomIndexDevice.initialize(*this, paddedNumAtoms, "atomIndex"); atomIndex.resize(paddedNumAtoms); for (int i = 0; i < paddedNumAtoms; ++i) atomIndex[i] = i; atomIndexDevice.upload(atomIndex); // Create utility kernels that are used in multiple places. CUmodule utilities = createModule(CudaKernelSources::vectorOps+CudaKernelSources::utilities); clearBufferKernel = getKernel(utilities, "clearBuffer"); clearTwoBuffersKernel = getKernel(utilities, "clearTwoBuffers"); clearThreeBuffersKernel = getKernel(utilities, "clearThreeBuffers"); clearFourBuffersKernel = getKernel(utilities, "clearFourBuffers"); clearFiveBuffersKernel = getKernel(utilities, "clearFiveBuffers"); clearSixBuffersKernel = getKernel(utilities, "clearSixBuffers"); reduceEnergyKernel = getKernel(utilities, "reduceEnergy"); setChargesKernel = getKernel(utilities, "setCharges"); // Set defines based on the requested precision. compilationDefines["SQRT"] = useDoublePrecision ? "sqrt" : "sqrtf"; compilationDefines["RSQRT"] = useDoublePrecision ? "rsqrt" : "rsqrtf"; compilationDefines["RECIP"] = useDoublePrecision ? "1.0/" : "1.0f/"; compilationDefines["EXP"] = useDoublePrecision ? "exp" : "expf"; compilationDefines["LOG"] = useDoublePrecision ? "log" : "logf"; compilationDefines["POW"] = useDoublePrecision ? "pow" : "powf"; compilationDefines["COS"] = useDoublePrecision ? "cos" : "cosf"; compilationDefines["SIN"] = useDoublePrecision ? "sin" : "sinf"; compilationDefines["TAN"] = useDoublePrecision ? "tan" : "tanf"; compilationDefines["ACOS"] = useDoublePrecision ? "acos" : "acosf"; compilationDefines["ASIN"] = useDoublePrecision ? "asin" : "asinf"; compilationDefines["ATAN"] = useDoublePrecision ? "atan" : "atanf"; compilationDefines["ERF"] = useDoublePrecision ? "erf" : "erff"; compilationDefines["ERFC"] = useDoublePrecision ? "erfc" : "erfcf"; // Set defines for applying periodic boundary conditions. Vec3 boxVectors[3]; system.getDefaultPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]); boxIsTriclinic = (boxVectors[0][1] != 0.0 || boxVectors[0][2] != 0.0 || boxVectors[1][0] != 0.0 || boxVectors[1][2] != 0.0 || boxVectors[2][0] != 0.0 || boxVectors[2][1] != 0.0); if (boxIsTriclinic) { compilationDefines["APPLY_PERIODIC_TO_DELTA(delta)"] = "{" "real scale3 = floor(delta.z*invPeriodicBoxSize.z+0.5f); \\\n" "delta.x -= scale3*periodicBoxVecZ.x; \\\n" "delta.y -= scale3*periodicBoxVecZ.y; \\\n" "delta.z -= scale3*periodicBoxVecZ.z; \\\n" "real scale2 = floor(delta.y*invPeriodicBoxSize.y+0.5f); \\\n" "delta.x -= scale2*periodicBoxVecY.x; \\\n" "delta.y -= scale2*periodicBoxVecY.y; \\\n" "real scale1 = floor(delta.x*invPeriodicBoxSize.x+0.5f); \\\n" "delta.x -= scale1*periodicBoxVecX.x;}"; compilationDefines["APPLY_PERIODIC_TO_POS(pos)"] = "{" "real scale3 = floor(pos.z*invPeriodicBoxSize.z); \\\n" "pos.x -= scale3*periodicBoxVecZ.x; \\\n" "pos.y -= scale3*periodicBoxVecZ.y; \\\n" "pos.z -= scale3*periodicBoxVecZ.z; \\\n" "real scale2 = floor(pos.y*invPeriodicBoxSize.y); \\\n" "pos.x -= scale2*periodicBoxVecY.x; \\\n" "pos.y -= scale2*periodicBoxVecY.y; \\\n" "real scale1 = floor(pos.x*invPeriodicBoxSize.x); \\\n" "pos.x -= scale1*periodicBoxVecX.x;}"; compilationDefines["APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)"] = "{" "real scale3 = floor((pos.z-center.z)*invPeriodicBoxSize.z+0.5f); \\\n" "pos.x -= scale3*periodicBoxVecZ.x; \\\n" "pos.y -= scale3*periodicBoxVecZ.y; \\\n" "pos.z -= scale3*periodicBoxVecZ.z; \\\n" "real scale2 = floor((pos.y-center.y)*invPeriodicBoxSize.y+0.5f); \\\n" "pos.x -= scale2*periodicBoxVecY.x; \\\n" "pos.y -= scale2*periodicBoxVecY.y; \\\n" "real scale1 = floor((pos.x-center.x)*invPeriodicBoxSize.x+0.5f); \\\n" "pos.x -= scale1*periodicBoxVecX.x;}"; } else { compilationDefines["APPLY_PERIODIC_TO_DELTA(delta)"] = "{" "delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; \\\n" "delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; \\\n" "delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;}"; compilationDefines["APPLY_PERIODIC_TO_POS(pos)"] = "{" "pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x; \\\n" "pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y; \\\n" "pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;}"; compilationDefines["APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)"] = "{" "pos.x -= floor((pos.x-center.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; \\\n" "pos.y -= floor((pos.y-center.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; \\\n" "pos.z -= floor((pos.z-center.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;}"; } // Create utilities objects. bonded = new CudaBondedUtilities(*this); nonbonded = new CudaNonbondedUtilities(*this); integration = new CudaIntegrationUtilities(*this, system); expression = new CudaExpressionUtilities(*this); } CudaContext::~CudaContext() { pushAsCurrent(); for (auto force : forces) delete force; for (auto listener : reorderListeners) delete listener; for (auto computation : preComputations) delete computation; for (auto computation : postComputations) delete computation; if (pinnedBuffer != NULL) cuMemFreeHost(pinnedBuffer); if (integration != NULL) delete integration; if (expression != NULL) delete expression; if (bonded != NULL) delete bonded; if (nonbonded != NULL) delete nonbonded; popAsCurrent(); string errorMessage = "Error deleting Context"; if (contextIsValid && !isLinkedContext) { cuProfilerStop(); cuCtxDestroy(context); } contextIsValid = false; } void CudaContext::initialize() { ContextSelector selector(*this); string errorMessage = "Error initializing Context"; int numEnergyBuffers = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()); if (useDoublePrecision) { energyBuffer.initialize(*this, numEnergyBuffers, "energyBuffer"); energySum.initialize(*this, 1, "energySum"); int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers); CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0)); } else if (useMixedPrecision) { energyBuffer.initialize(*this, numEnergyBuffers, "energyBuffer"); energySum.initialize(*this, 1, "energySum"); int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers); CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0)); } else { energyBuffer.initialize(*this, numEnergyBuffers, "energyBuffer"); energySum.initialize(*this, 1, "energySum"); int pinnedBufferSize = max(paddedNumAtoms*6, numEnergyBuffers); CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(float), 0)); } for (int i = 0; i < numAtoms; i++) { double mass = system.getParticleMass(i); if (useDoublePrecision || useMixedPrecision) ((double4*) pinnedBuffer)[i] = make_double4(0.0, 0.0, 0.0, mass == 0.0 ? 0.0 : 1.0/mass); else ((float4*) pinnedBuffer)[i] = make_float4(0.0f, 0.0f, 0.0f, mass == 0.0 ? 0.0f : (float) (1.0/mass)); } velm.upload(pinnedBuffer); bonded->initialize(system); addAutoclearBuffer(force.getDevicePointer(), force.getSize()*force.getElementSize()); addAutoclearBuffer(energyBuffer.getDevicePointer(), energyBuffer.getSize()*energyBuffer.getElementSize()); int numEnergyParamDerivs = energyParamDerivNames.size(); if (numEnergyParamDerivs > 0) { if (useDoublePrecision || useMixedPrecision) energyParamDerivBuffer.initialize(*this, numEnergyParamDerivs*numEnergyBuffers, "energyParamDerivBuffer"); else energyParamDerivBuffer.initialize(*this, numEnergyParamDerivs*numEnergyBuffers, "energyParamDerivBuffer"); addAutoclearBuffer(energyParamDerivBuffer); } findMoleculeGroups(); nonbonded->initialize(system); } void CudaContext::initializeContexts() { getPlatformData().initializeContexts(system); } void CudaContext::setAsCurrent() { if (contextIsValid) cuCtxSetCurrent(context); } void CudaContext::pushAsCurrent() { if (contextIsValid) cuCtxPushCurrent(context); } void CudaContext::popAsCurrent() { CUcontext popped; if (contextIsValid) cuCtxPopCurrent(&popped); } CUmodule CudaContext::createModule(const string source, const char* optimizationFlags) { return createModule(source, map(), optimizationFlags); } CUmodule CudaContext::createModule(const string source, const map& defines, const char* optimizationFlags) { string bits = intToString(8*sizeof(void*)); string options = (optimizationFlags == NULL ? defaultOptimizationOptions : string(optimizationFlags)); stringstream src; if (!options.empty()) src << "// Compilation Options: " << options << endl << endl; for (auto& pair : compilationDefines) { // Query defines to avoid duplicate variables if (defines.find(pair.first) == defines.end()) { src << "#define " << pair.first; if (!pair.second.empty()) src << " " << pair.second; src << endl; } } if (!compilationDefines.empty()) src << endl; if (useDoublePrecision) { src << "typedef double real;\n"; src << "typedef double2 real2;\n"; src << "typedef double3 real3;\n"; src << "typedef double4 real4;\n"; } else { src << "typedef float real;\n"; src << "typedef float2 real2;\n"; src << "typedef float3 real3;\n"; src << "typedef float4 real4;\n"; } if (useDoublePrecision || useMixedPrecision) { src << "typedef double mixed;\n"; src << "typedef double2 mixed2;\n"; src << "typedef double3 mixed3;\n"; src << "typedef double4 mixed4;\n"; } else { src << "typedef float mixed;\n"; src << "typedef float2 mixed2;\n"; src << "typedef float3 mixed3;\n"; src << "typedef float4 mixed4;\n"; } src << "typedef unsigned int tileflags;\n"; src << CudaKernelSources::common << endl; for (auto& pair : defines) { src << "#define " << pair.first; if (!pair.second.empty()) src << " " << pair.second; src << endl; } if (!defines.empty()) src << endl; src << source << endl; // Determine what architecture to compile for. string compileArchitecture; if (hasCompilerKernel) { int maxCompilerArchitecture = compilerKernel.getAs().getMaxSupportedArchitecture(); compileArchitecture = intToString(min(gpuArchitecture, maxCompilerArchitecture)); } else compileArchitecture = intToString(gpuArchitecture); // See whether we already have PTX for this kernel cached. CSHA1 sha1; sha1.Update((const UINT_8*) src.str().c_str(), src.str().size()); sha1.Final(); UINT_8 hash[20]; sha1.GetHash(hash); stringstream cacheFile; cacheFile << cacheDir; cacheFile.flags(ios::hex); for (int i = 0; i < 20; i++) cacheFile << setw(2) << setfill('0') << (int) hash[i]; cacheFile << '_' << compileArchitecture << '_' << bits; CUmodule module; if (cuModuleLoad(&module, cacheFile.str().c_str()) == CUDA_SUCCESS) return module; // Select names for the various temporary files. stringstream tempFileName; tempFileName << "openmmTempKernel" << this; // Include a pointer to this context as part of the filename to avoid collisions. #ifdef WIN32 tempFileName << "_" << GetCurrentProcessId(); #else tempFileName << "_" << getpid(); #endif string inputFile = (tempDir+tempFileName.str()+".cu"); string outputFile = (tempDir+tempFileName.str()+".ptx"); string logFile = (tempDir+tempFileName.str()+".log"); int res = 0; // If the runtime compiler plugin is available, use it. if (hasCompilerKernel) { string ptx = compilerKernel.getAs().createModule(src.str(), "-arch=compute_"+compileArchitecture+" "+options, *this); // If possible, write the PTX out to a temporary file so we can cache it for later use. bool wroteCache = false; try { ofstream out(outputFile.c_str()); out << ptx; out.close(); if (!out.fail()) wroteCache = true; } catch (...) { // Ignore. } if (!wroteCache) { // An error occurred. Possibly we don't have permission to write to the temp directory. Just try to load the module directly. CHECK_RESULT2(cuModuleLoadDataEx(&module, &ptx[0], 0, NULL, NULL), "Error loading CUDA module"); return module; } } else { // Write out the source to a temporary file. ofstream out(inputFile.c_str()); out << src.str(); out.close(); #ifdef WIN32 #ifdef _DEBUG string command = compiler+" --ptx -G -g --machine "+bits+" -arch=sm_"+compileArchitecture+" -o "+outputFile+" "+options+" "+inputFile+" 2> "+logFile; #else string command = compiler+" --ptx -lineinfo --machine "+bits+" -arch=sm_"+compileArchitecture+" -o "+outputFile+" "+options+" "+inputFile+" 2> "+logFile; #endif res = executeInWindows(command); #else string command = compiler+" --ptx --machine "+bits+" -arch=sm_"+compileArchitecture+" -o \""+outputFile+"\" "+options+" \""+inputFile+"\" 2> \""+logFile+"\""; res = std::system(command.c_str()); #endif } try { if (res != 0) { // Load the error log. stringstream error; error << "Error launching CUDA compiler: " << res; ifstream log(logFile.c_str()); if (log.is_open()) { string line; while (!log.eof()) { getline(log, line); error << '\n' << line; } log.close(); } throw OpenMMException(error.str()); } CUresult result = cuModuleLoad(&module, outputFile.c_str()); if (result != CUDA_SUCCESS) { std::stringstream m; m<<"Error loading CUDA module: "<(new CudaEvent(*this)); } ComputeProgram CudaContext::compileProgram(const std::string source, const std::map& defines) { CUmodule module = createModule(CudaKernelSources::vectorOps+source, defines); return shared_ptr(new CudaProgram(*this, module)); } CudaArray& CudaContext::unwrap(ArrayInterface& array) const { CudaArray* cuarray; ComputeArray* wrapper = dynamic_cast(&array); if (wrapper != NULL) cuarray = dynamic_cast(&wrapper->getArray()); else cuarray = dynamic_cast(&array); if (cuarray == NULL) throw OpenMMException("Array argument is not an CudaArray"); return *cuarray; } std::string CudaContext::getErrorString(CUresult result) { const char* message; if (cuGetErrorName(result, &message) == CUDA_SUCCESS) return string(message); return "CUDA error"; } void CudaContext::executeKernel(CUfunction kernel, void** arguments, int threads, int blockSize, unsigned int sharedSize) { if (blockSize == -1) blockSize = ThreadBlockSize; int gridSize = std::min((threads+blockSize-1)/blockSize, numThreadBlocks); CUresult result = cuLaunchKernel(kernel, gridSize, 1, 1, blockSize, 1, 1, sharedSize, currentStream, arguments, NULL); if (result != CUDA_SUCCESS) { stringstream str; str<<"Error invoking kernel: "<= 6) { void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base], &autoclearBuffers[base+1], &autoclearBufferSizes[base+1], &autoclearBuffers[base+2], &autoclearBufferSizes[base+2], &autoclearBuffers[base+3], &autoclearBufferSizes[base+3], &autoclearBuffers[base+4], &autoclearBufferSizes[base+4], &autoclearBuffers[base+5], &autoclearBufferSizes[base+5]}; executeKernel(clearSixBuffersKernel, args, max(max(max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), autoclearBufferSizes[base+4]), autoclearBufferSizes[base+5]), 128); base += 6; } if (total-base == 5) { void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base], &autoclearBuffers[base+1], &autoclearBufferSizes[base+1], &autoclearBuffers[base+2], &autoclearBufferSizes[base+2], &autoclearBuffers[base+3], &autoclearBufferSizes[base+3], &autoclearBuffers[base+4], &autoclearBufferSizes[base+4]}; executeKernel(clearFiveBuffersKernel, args, max(max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), autoclearBufferSizes[base+4]), 128); } else if (total-base == 4) { void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base], &autoclearBuffers[base+1], &autoclearBufferSizes[base+1], &autoclearBuffers[base+2], &autoclearBufferSizes[base+2], &autoclearBuffers[base+3], &autoclearBufferSizes[base+3]}; executeKernel(clearFourBuffersKernel, args, max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), 128); } else if (total-base == 3) { void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base], &autoclearBuffers[base+1], &autoclearBufferSizes[base+1], &autoclearBuffers[base+2], &autoclearBufferSizes[base+2]}; executeKernel(clearThreeBuffersKernel, args, max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), 128); } else if (total-base == 2) { void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base], &autoclearBuffers[base+1], &autoclearBufferSizes[base+1]}; executeKernel(clearTwoBuffersKernel, args, max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), 128); } else if (total-base == 1) { clearBuffer(autoclearBuffers[base], autoclearBufferSizes[base]*4); } } double CudaContext::reduceEnergy() { int bufferSize = energyBuffer.getSize(); int workGroupSize = 512; void* args[] = {&energyBuffer.getDevicePointer(), &energySum.getDevicePointer(), &bufferSize, &workGroupSize}; executeKernel(reduceEnergyKernel, args, workGroupSize, workGroupSize, workGroupSize*energyBuffer.getElementSize()); energySum.download(pinnedBuffer); if (getUseDoublePrecision() || getUseMixedPrecision()) return *((double*) pinnedBuffer); else return *((float*) pinnedBuffer); } void CudaContext::setCharges(const vector& charges) { if (!chargeBuffer.isInitialized()) chargeBuffer.initialize(*this, numAtoms, useDoublePrecision ? sizeof(double) : sizeof(float), "chargeBuffer"); vector c(numAtoms); for (int i = 0; i < numAtoms; i++) c[i] = charges[i]; chargeBuffer.upload(c, true); void* args[] = {&chargeBuffer.getDevicePointer(), &posq.getDevicePointer(), &atomIndexDevice.getDevicePointer(), &numAtoms}; executeKernel(setChargesKernel, args, numAtoms); } bool CudaContext::requestPosqCharges() { bool allow = !hasAssignedPosqCharges; hasAssignedPosqCharges = true; return allow; } void CudaContext::addEnergyParameterDerivative(const string& param) { // See if this parameter has already been registered. for (int i = 0; i < energyParamDerivNames.size(); i++) if (param == energyParamDerivNames[i]) return; energyParamDerivNames.push_back(param); } void CudaContext::flushQueue() { cuStreamSynchronize(getCurrentStream()); } vector CudaContext::getDevicePrecedence() { int numDevices; CUdevice thisDevice; string errorMessage = "Error initializing Context"; vector, int> > devices; CHECK_RESULT(cuDeviceGetCount(&numDevices)); for (int i = 0; i < numDevices; i++) { CHECK_RESULT(cuDeviceGet(&thisDevice, i)); int major, minor, clock, multiprocessors, speed; CHECK_RESULT(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, thisDevice)); CHECK_RESULT(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, thisDevice)); if (major == 1 && minor < 2) continue; if ((useDoublePrecision || useMixedPrecision) && (major+0.1*minor < 1.3)) continue; CHECK_RESULT(cuDeviceGetAttribute(&clock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, thisDevice)); CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, thisDevice)); speed = clock*multiprocessors; pair deviceProperties = std::make_pair(major, speed); devices.push_back(std::make_pair(deviceProperties, -i)); } // sort first by compute capability (higher is better), then speed // (higher is better), and finally device index (lower is better) std::sort(devices.begin(), devices.end()); std::reverse(devices.begin(), devices.end()); vector precedence; for (int i = 0; i < static_cast(devices.size()); i++) { precedence.push_back(-devices[i].second); } return precedence; }